enable function to move paragraphs into absaetze

2024-02-15 15:07:36 +01:00
parent 567994a47e
commit 6c31131cdd
5 changed files with 61 additions and 0 deletions
@@ -583,6 +583,7 @@ dependencies = [
 "env_logger",
 "log",
 "pretty_assertions",
 "regex",
 "roxmltree",
 "serde",
 "serde_json",
@@ -17,6 +17,7 @@ tqdm = "0.6"
 toml = "0.8"
 clap = { version = "4.5.0", features = ["derive"] }
 directories = "5.0"
 regex = "1.10"
 [dev-dependencies]
 pretty_assertions = "1.4"
@@ -82,6 +82,10 @@ impl Config {
            parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
        }
        if config.parser.move_para_headers_into_content {
            parser.move_para_headers_into_content();
        }
        Ok((config.law.id, builder, parser))
    }
 }
@@ -101,6 +105,16 @@ struct Classifier {
 #[derive(Debug, Deserialize, Default)]
 struct ParserConfig {
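    /// Moves `§` paragraph headers into the content of the following `<absatz>`.
    ///
    /// E.g. used in EheG to transform
    /// `<ueberschrift typ="para" ct="text" halign="c">§ 6</ueberschrift>`
    /// followed by an `<absatz>` into
    /// ```text
    /// <absatz typ="abs" ct="text" halign="j">
    /// <gldsym>§ 6</gldsym>
    /// text...
    /// </absatz>
    /// ```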
    #[serde(default)] //okay to not have this part in the config
    move_para_headers_into_content: bool,
    #[serde(default)] //okay to not have this part in the config
    remove_strings: Vec<String>,
    #[serde(default)] //okay to not have this part in the config
@@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
 /// Returns `true` when `instance_name` begins with an ASCII uppercase letter other than `I`
 /// that is immediately followed by `.` or `)` (e.g. `A.` or `B)`).
 ///
 /// The classifier name is not consulted.
 pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    // Walk the characters once instead of re-scanning from the start for every check.
    let mut letters = instance_name.chars();
    let first = letters.next();
    let second = letters.next();

    // `I` is excluded explicitly even though it is an ASCII uppercase letter.
    let leading_ok = matches!(first, Some(c) if c.is_ascii_uppercase() && c != 'I');
    let delimiter_ok = matches!(second, Some('.') | Some(')'));

    leading_ok && delimiter_ok
 }
@@ -17,6 +17,7 @@
 //! Deals with getting all paragraphs for a given law text
 mod parser;
 use regex::Regex;
 use std::{
    fs,
    hash::{DefaultHasher, Hash, Hasher},
@@ -35,6 +36,7 @@ use self::parser::Risdok;
 pub struct Parser {
    /// Strings collected through `add_string_to_remove`.
    remove: Vec<String>,
    /// Pairs of strings; presumably `(search, replacement)` applied to the raw XML before
    /// parsing — TODO confirm against `add_string_to_replace`.
    replace: Vec<(String, String)>,
    /// When `true`, the XML is passed through `do_move_para_headers_into_content` before it is
    /// handed to `Risdok::from_str`. Starts out as `false` in `new`.
    move_para_headers_into_content: bool,
 }
 impl Default for Parser {
@@ -48,9 +50,14 @@ impl Parser {
        Self {
            remove: Vec::new(),
            replace: Vec::new(),
            move_para_headers_into_content: false,
        }
    }
    /// Switches on the rewriting of `§` paragraph headers into the following `<absatz>`.
    ///
    /// This only sets the `move_para_headers_into_content` flag; the rewrite itself happens
    /// later, when the XML is parsed and the flag is checked.
    pub fn move_para_headers_into_content(&mut self) {
        self.move_para_headers_into_content = true;
    }
    /// Appends an owned copy of `data` to this parser's `remove` list.
    pub fn add_string_to_remove(&mut self, data: &str) {
        let entry = data.to_owned();
        self.remove.push(entry);
    }
@@ -78,8 +85,45 @@ impl Parser {
            xml = xml.replace(search, replace);
        }
        let xml = if self.move_para_headers_into_content {
            Self::do_move_para_headers_into_content(xml)
        } else {
            xml
        };
        Risdok::from_str(&xml, builder)
    }
    /// Moves every `§…` paragraph header into the `<absatz>` element that follows it.
    ///
    /// Each `<ueberschrift typ=".." ct=".." halign="..">§…</ueberschrift>` element is removed
    /// from `xml`, and its text is re-inserted as `<gldsym>§…</gldsym>` directly after the
    /// opening tag of the next `<absatz typ=".." ct=".." halign="..">`. A header that has no
    /// `<absatz>` after it is removed without replacement.
    ///
    /// # Panics
    ///
    /// Only if one of the hard-coded patterns fails to compile, which would be a bug.
    fn do_move_para_headers_into_content(xml: String) -> String {
        let ueberschrift_regex = Regex::new(
            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
        )
        .expect("hard-coded ueberschrift pattern is a valid regex");
        let absatz_regex = Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">")
            .expect("hard-coded absatz pattern is a valid regex");

        // Record the byte range and the header text of every match up front, so that the
        // borrow of `xml` ends before the string is edited in place.
        let headers: Vec<_> = ueberschrift_regex
            .captures_iter(&xml)
            .map(|cap| {
                let whole = cap.get(0).expect("capture group 0 always exists");
                (whole.range(), cap[1].to_owned())
            })
            .collect();

        let mut result = xml;
        // Apply the edits back to front: every edit only touches bytes at or after the start
        // of the header being processed, so the recorded offsets of all earlier headers stay
        // valid. Editing front to back would shift them and corrupt the output (or panic on a
        // non-char-boundary index).
        for (header_range, header_text) in headers.into_iter().rev() {
            // Look for the next <absatz> opening tag after the header in the current text.
            if let Some(absatz_match) = absatz_regex.find(&result[header_range.end..]) {
                // `absatz_match` is relative to the slice, so shift it by the slice start.
                let insert_point = header_range.end + absatz_match.end();
                result.insert_str(insert_point, &format!("<gldsym>{}</gldsym>", header_text));
            }
            // The insertion above happened after the header, so its range is still accurate.
            result.replace_range(header_range, "");
        }
        result
    }
 }
 fn fetch(url: &str) -> Result<String, Error> {