risp/src/paragraph/mod.rs

// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program.  If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.

//! Deals with getting all paragraphs for a given law text

mod parser;

use regex::Regex;
use std::{
    fs,
    hash::{DefaultHasher, Hash, Hasher},
    path::Path,
};

use log::info;

use crate::{
    law::LawBuilder,
    misc::{fetch_with_retries, get_cache_dir, Error},
};

use self::parser::Risdok;

pub struct Parser {
    remove: Vec<String>,
    replace: Vec<(String, String)>,
    move_para_headers_into_content: bool,
}

impl Default for Parser {
    fn default() -> Self {
        Self::new()
    }
}

impl Parser {
    pub fn new() -> Self {
        Self {
            remove: Vec::new(),
            replace: Vec::new(),
            move_para_headers_into_content: false,
        }
    }

    pub fn move_para_headers_into_content(&mut self) {
        self.move_para_headers_into_content = true;
    }

    pub fn add_string_to_remove(&mut self, data: &str) {
        self.remove.push(data.into());
    }

    pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
        self.replace.push((search.into(), replace.into()));
    }

    /// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`.
    pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        info!("Parsing {url}");
        let xml = fetch(url)?;

        let xml = xml.replace('\u{a0}', " ");

        self.parse_from_str(&xml, builder)
    }

    fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        let mut xml = String::from(xml);
        for r in &self.remove {
            xml = xml.replace(r, "");
        }
        for (search, replace) in &self.replace {
            xml = xml.replace(search, replace);
        }

        let xml = if self.move_para_headers_into_content {
            Self::do_move_para_headers_into_content(xml)
        } else {
            xml
        };

        Risdok::from_str(&xml, builder)
    }

    fn do_move_para_headers_into_content(xml: String) -> String {
        let mut result = String::from(&xml);
        let ueberschrift_regex = Regex::new(
            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
        )
        .unwrap();
        let absatz_regex =
            Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();

        // Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
        for cap in ueberschrift_regex.captures_iter(&xml) {
            let ueberschrift_content = &cap[1];

            // Check if there's an <absatz> following the <ueberschrift>
            if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
                // Calculate the insertion point for the <gldsym> tag
                let insert_point =
                    cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
                // Insert the <gldsym> tag with the ueberschrift content into the result string
                result.insert_str(
                    insert_point,
                    &format!("<gldsym>{}</gldsym>", ueberschrift_content),
                );
            }

            // Remove the <ueberschrift> tag from the result string
            result.replace_range(cap.get(0).unwrap().range(), "");
        }
        result
    }
}

fn fetch(url: &str) -> Result<String, Error> {
    let mut hasher = DefaultHasher::new();
    url.hash(&mut hasher);
    let hash = format!("{:x}", hasher.finish());

    let expected_filename = format!("{}par-{hash}", get_cache_dir()?);

    match fs::read_to_string(&expected_filename) {
        Ok(data) => Ok(data),
        Err(_) => {
            info!("Not finding url {url} in the cache, downloading...");
            let data = fetch_with_retries(url)?;
            let path = Path::new(&expected_filename);
            if let Some(parent) = path.parent() {
                // Try to create the directory (and any necessary parent directories)
                fs::create_dir_all(parent).expect("Unable to create directory");
            }
            fs::write(expected_filename, &data).expect("Unable to write file");
            Ok(data)
        }
    }
}

#[cfg(test)]
mod tests {
    use std::fs;

    use crate::config::Config;
    use pretty_assertions::assert_eq;

    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            let path = format!("{}", config.unwrap().path().display());

            let (law_id, mut builder, parser) = Config::load(&path).unwrap();

            let paragraph_path = format!("./data/expected/overview/{law_id}");
            let expected_path = format!("./data/expected/par/{law_id}");

            let pars =
                fs::read_to_string(paragraph_path).expect("Could not read file {paragraph_path}.");
            let pars = pars.trim().split('\n').collect::<Vec<&str>>();

            for par in pars {
                let cont = parser.parse(par, &mut builder).unwrap();
                if !cont {
                    break;
                }
            }

            let actual = &builder.history;

            match fs::read_to_string(&expected_path) {
                Ok(expected) => {
                    let e = expected.trim().split('\n').collect::<Vec<&str>>();
                    assert_eq!(actual, &e);
                }
                Err(_) => {
                    let to_write = actual.join("\n");
                    fs::write(expected_path, to_write).expect("Unable to write file");
                }
            }
        }
    }
}