risp/src/paragraph/mod.rs

// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program.  If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.

//! Deals with getting the content of a single paragraph.

mod parser;

use regex::Regex;
use std::{
    fs,
    hash::{DefaultHasher, Hash, Hasher},
    path::Path,
};
use tracing::{debug, info};

use crate::{
    law,
    misc::{fetch_with_retries, get_cache_dir, Error},
};

use self::parser::Risdok;

/// Pre-processes and parses the XML of a single paragraph.
///
/// The configured removals and replacements are applied to the raw XML (in that order) before
/// the document is handed to the XML parser.
#[derive(Debug)]
pub struct Parser {
    /// Strings that are deleted from the XML before parsing.
    remove: Vec<String>,
    /// `(search, replacement)` pairs applied to the XML after the removals.
    replace: Vec<(String, String)>,
    /// Whether paragraph headers are moved into the paragraph content before parsing.
    move_para_headers_into_content: bool,
    /// Sign that introduces a paragraph (`§` by default).
    pub(crate) par_sign: String,
}

/// The default parser uses `§` as its paragraph sign.
impl Default for Parser {
    fn default() -> Self {
        let par_sign = "§".to_owned();
        Self::new(par_sign)
    }
}

impl Parser {
    /// Creates a parser that uses `par_sign` as the paragraph sign.
    ///
    /// The new parser has no removals or replacements registered and does not move paragraph
    /// headers into the content.
    #[must_use]
    pub fn new(par_sign: String) -> Self {
        Self {
            par_sign,
            move_para_headers_into_content: false,
            replace: vec![],
            remove: vec![],
        }
    }

    /// Enables moving paragraph headers into the paragraph content.
    ///
    /// Once enabled, the XML is rewritten before parsing: each matching `<ueberschrift>`
    /// element whose text starts with the paragraph sign is removed, and its text is
    /// re-inserted as a `<gldsym>` element right after the next `<absatz>` opening tag.
    pub fn move_para_headers_into_content(&mut self) {
        self.move_para_headers_into_content = true;
    }

    /// Registers `data` as a string that is deleted from the XML before it is parsed.
    ///
    /// Removals are applied in the order in which they were registered.
    pub fn add_string_to_remove(&mut self, data: &str) {
        let needle = String::from(data);
        self.remove.push(needle);
    }

    /// Registers a substitution: every occurrence of `search` in the XML is replaced by
    /// `replace` before the document is parsed.
    ///
    /// Substitutions run after all removals, in the order in which they were registered.
    pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
        let rule = (search.to_owned(), replace.to_owned());
        self.replace.push(rule);
    }

    /// Fetches the document at `url`, pre-processes it and feeds it into `builder`.
    ///
    /// The document is read from the local cache if it has been downloaded before; otherwise it
    /// is downloaded and written to the cache, so the same resource is not fetched twice.
    /// Every non-breaking space (U+00A0) is then replaced by a regular space, the configured
    /// removals and replacements are applied and, if enabled, paragraph headers are moved into
    /// the content. The resulting XML is finally parsed into `builder`.
    ///
    /// # Parameters
    ///
    /// - `url`: The URL from which to fetch the law text.
    /// - `builder`: A mutable reference to the `law::Builder` instance that is filled with the
    ///   law structure found in the parsed content.
    ///
    /// # Returns
    ///
    /// - `Ok(bool)`: The flag returned by the XML parser.
    ///   NOTE(review): the test in this module stops feeding further paragraphs as soon as
    ///   `false` is returned; confirm the exact meaning against `Risdok::from_str`.
    /// - `Err(Error)`: An error occurred during the fetching or parsing process.
    ///
    /// # Errors
    ///
    /// An error is returned if
    /// - the cache directory cannot be determined,
    /// - the content cannot be downloaded from the given `url`, or
    /// - the pre-processed content cannot be parsed.
    ///
    /// # Panics
    ///
    /// Panics if a freshly downloaded document cannot be stored in the cache, i.e. if the cache
    /// directory cannot be created or the cache file cannot be written.
    ///
    /// # Examples
    ///
    /// ```
    /// use risp::{Config, law::{Law, Heading, Content, Section, HeadingContent}};
    /// use std::path::Path;
    ///
    /// let (_, _, _, mut builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap();
    /// let result = parser.parse("https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12017691/NOR12017691.xml", &mut builder).unwrap();
    ///
    /// let law: Law = builder.into();
    /// assert_eq!(
    ///     law,
    ///     Law {
    ///         name: "ABGB".into(),
    ///         header: vec![Heading {
    ///             name: "Nullter Theil. Einleitung".into(),
    ///             desc: Some("Von den bürgerlichen Gesetzen überhaupt.".into()),
    ///             content: vec![HeadingContent::Paragraph(Section {
    ///                 symb: "§ 1.".into(),
    ///                 par_header: Some("Begriff des bürgerlichen Rechtes.".into()),
    ///                 par_note: None,
    ///                 content: Content::Text("Der Inbegriff der Gesetze, wodurch die Privat-Rechte und Pflichten der Einwohner des Staates unter sich bestimmt werden, macht das bürgerliche Recht in demselben aus.".into())
    ///             })]
    ///         }]
    ///     }
    /// );
    /// ```
    pub fn parse(&self, url: &str, builder: &mut law::Builder) -> Result<bool, Error> {
        info!("Parsing {url}");

        // Non-breaking spaces are normalised to plain spaces before any further processing.
        let normalised = fetch(url)?.replace('\u{a0}', " ");
        self.parse_from_str(&normalised, builder)
    }

    /// Applies the configured pre-processing to `xml` and parses the result into `builder`.
    ///
    /// Pre-processing happens in a fixed order: first all removals, then all replacements and
    /// finally (only if enabled) moving the paragraph headers into the content.
    fn parse_from_str(&self, xml: &str, builder: &mut law::Builder) -> Result<bool, Error> {
        // Delete every registered string.
        let stripped = self
            .remove
            .iter()
            .fold(xml.to_owned(), |acc, needle| acc.replace(needle, ""));

        // Apply every registered substitution.
        let substituted = self
            .replace
            .iter()
            .fold(stripped, |acc, (from, to)| acc.replace(from, to));

        let prepared = if self.move_para_headers_into_content {
            self.do_move_para_headers_into_content(&substituted)
        } else {
            substituted
        };

        Risdok::from_str(&prepared, builder)
    }

    /// Moves paragraph headers into the content of the paragraph that follows them.
    ///
    /// Every `<ueberschrift typ=".." ct=".." halign="..">` element whose text starts with
    /// `par_sign` is removed from `xml`. If an `<absatz typ=".." ct=".." halign="..">` opening
    /// tag follows the header, the header text is re-inserted directly after that tag, wrapped
    /// in a `<gldsym>` element; if no such tag follows, the header is only removed.
    ///
    /// Returns the rewritten XML; `xml` itself is not modified.
    ///
    /// # Panics
    ///
    /// Panics if interpolating `par_sign` into the header pattern yields an invalid regular
    /// expression.
    fn do_move_para_headers_into_content(&self, xml: &str) -> String {
        let mut result = String::from(xml);
        // NOTE(review): `par_sign` is interpolated into the pattern unescaped, so regex
        // metacharacters in it are interpreted as such. Confirm whether this is intended.
        let ueberschrift_regex = Regex::new(&format!(
            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">({}.*?)</ueberschrift>",
            self.par_sign
        ))
        .unwrap();
        let absatz_regex =
            Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();

        // The match offsets refer to the untouched `xml`. Every edit below only changes `result`
        // at or after the start of the header being processed, so walking the headers from last
        // to first keeps the offsets of all headers that are still pending valid.
        let headers: Vec<_> = ueberschrift_regex.captures_iter(xml).collect();
        for cap in headers.iter().rev() {
            let header = cap.get(0).expect("group 0 is always the whole match");
            let ueberschrift_content = &cap[1];

            // Check if there's an <absatz> following the <ueberschrift>
            if let Some(absatz_match) = absatz_regex.find(&result[header.end()..]) {
                // The match is relative to the slice, so shift it by the slice start to get the
                // position right behind the opening <absatz> tag.
                let insert_point = header.end() + absatz_match.end();
                // Insert the <gldsym> tag with the ueberschrift content into the result string
                result.insert_str(
                    insert_point,
                    &format!("<gldsym>{ueberschrift_content}</gldsym>"),
                );
            }

            // Remove the <ueberschrift> tag; it lies before the insertion point, so its range
            // is unaffected by the insertion above.
            result.replace_range(header.range(), "");
        }

        debug!("{result:#?}");
        result
    }
}

/// Returns the content behind `url`, using a file in the cache directory as a cache.
///
/// The cache file is named `par-<hash>`, where `<hash>` is the hexadecimal `DefaultHasher` hash
/// of `url`. If that file can be read, its content is returned without any network access.
/// Otherwise the content is downloaded via `fetch_with_retries`, written to the cache file and
/// returned.
///
/// # Errors
///
/// Returns an error if the cache directory cannot be determined or the download fails.
///
/// # Panics
///
/// Panics if the cache directory cannot be created or the cache file cannot be written.
fn fetch(url: &str) -> Result<String, Error> {
    let hash = {
        let mut state = DefaultHasher::new();
        url.hash(&mut state);
        format!("{:x}", state.finish())
    };
    let cache_file = format!("{}par-{hash}", get_cache_dir()?);

    // Cache hit: any read failure (including a missing file) falls through to a download.
    if let Ok(cached) = fs::read_to_string(&cache_file) {
        return Ok(cached);
    }

    let body = fetch_with_retries(url)?;
    if let Some(dir) = Path::new(&cache_file).parent() {
        // Try to create the directory (and any necessary parent directories)
        fs::create_dir_all(dir).expect("Unable to create directory");
    }
    fs::write(&cache_file, &body).expect("Unable to write file");
    Ok(body)
}

#[cfg(test)]
mod tests {
    use std::fs;

    use crate::config::Config;
    use pretty_assertions::assert_eq;

    /// Snapshot test over every config in `./data/configs`.
    ///
    /// For each config, all paragraph URLs listed in `./data/expected/overview/<law_id>` are
    /// parsed and the builder's history is compared with `./data/expected/par/<law_id>`. If
    /// that snapshot file cannot be read, it is created from the current output instead.
    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            let path = format!("{}", config.unwrap().path().display());
            println!("Testing {path}");

            let (_, law_id, _, mut builder, parser) = Config::load(&path).unwrap();

            let paragraph_path = format!("./data/expected/overview/{law_id}");
            let expected_path = format!("./data/expected/par/{law_id}");

            // `expect` takes a plain string and does not interpolate `{..}` placeholders, so
            // the message is built with `panic!` to actually name the offending file.
            let pars = fs::read_to_string(&paragraph_path)
                .unwrap_or_else(|err| panic!("Could not read file {paragraph_path}: {err}"));
            let pars = pars.trim().split('\n').collect::<Vec<&str>>();

            for par in pars {
                let cont = parser.parse(par, &mut builder).unwrap();
                // The parser signals via `false` that no further paragraphs should be fed in.
                if !cont {
                    break;
                }
            }

            let actual = &builder.history;

            match fs::read_to_string(&expected_path) {
                Ok(expected) => {
                    let e = expected.trim().split('\n').collect::<Vec<&str>>();
                    assert_eq!(actual, &e);
                }
                Err(_) => {
                    // No snapshot yet: record the current output as the expected one.
                    let to_write = actual.join("\n");
                    fs::write(expected_path, to_write).expect("Unable to write file");
                }
            }
        }
    }
}