enable function to move paragraphs into absaetze

2024-02-15 15:07:36 +01:00
parent 567994a47e
commit 6c31131cdd
5 changed files with 61 additions and 0 deletions
@@ -583,6 +583,7 @@ dependencies = [
 "env_logger",
 "log",
 "pretty_assertions",
+ "regex",
 "roxmltree",
 "serde",
 "serde_json",
@@ -17,6 +17,7 @@ tqdm = "0.6"
 toml = "0.8"
 clap = { version = "4.5.0", features = ["derive"] }
 directories = "5.0"
+regex = "1.10"

 [dev-dependencies]
 pretty_assertions = "1.4"
@@ -82,6 +82,10 @@ impl Config {
            parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
        }

+        if config.parser.move_para_headers_into_content {
+            parser.move_para_headers_into_content();
+        }
+
        Ok((config.law.id, builder, parser))
    }
 }
@@ -101,6 +105,16 @@ struct Classifier {

 #[derive(Debug, Deserialize, Default)]
 struct ParserConfig {
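+    /// Moves `§` paragraph headers into the `<absatz>` element that follows them.
+    ///
+    /// E.g. used in EheG to transform
+    /// `<ueberschrift typ="para" ct="text" halign="c">§ 6</ueberschrift>`
+    /// followed by `<absatz typ="abs" ct="text" halign="j">text...</absatz>` into
+    /// ```xml
+    /// <absatz typ="abs" ct="text" halign="j">
+    /// <gldsym>§ 6</gldsym>
+    /// text...
+    /// </absatz>
+    /// ```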
+    #[serde(default)] //okay to not have this part in the config
+    move_para_headers_into_content: bool,
    #[serde(default)] //okay to not have this part in the config
    remove_strings: Vec<String>,
    #[serde(default)] //okay to not have this part in the config
@@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {

 /// Returns `true` when `instance_name` starts with an ASCII uppercase letter other than `I`
 /// that is immediately followed by `.` or `)`, e.g. `A.` or `B)`.
 ///
 /// `I` is rejected explicitly (presumably so that the Roman numeral `I.` is not classified as
 /// a letter enumeration -- TODO confirm). `_classifier_name` is not used.
 pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    // Walk the string once instead of re-creating a `chars()` iterator for every check.
    let mut chars = instance_name.chars();
    let first_is_letter = matches!(chars.next(), Some(c) if c.is_ascii_uppercase() && c != 'I');
    first_is_letter && matches!(chars.next(), Some('.' | ')'))
 }

@@ -17,6 +17,7 @@
 //! Deals with getting all paragraphs for a given law text
 mod parser;

+use regex::Regex;
 use std::{
    fs,
    hash::{DefaultHasher, Hash, Hasher},
@@ -35,6 +36,7 @@ use self::parser::Risdok;
 pub struct Parser {
    /// Strings registered through `add_string_to_remove`.
    remove: Vec<String>,
    /// Pairs of strings; presumably `(search, replacement)` pairs that are applied to the raw
    /// XML before it is parsed -- verify against `add_string_to_replace`.
    replace: Vec<(String, String)>,
+    /// When `true`, `§` headers are moved into the following `<absatz>` before the XML is
+    /// parsed. Initialised to `false` and switched on via `move_para_headers_into_content()`.
+    move_para_headers_into_content: bool,
 }

 impl Default for Parser {
@@ -48,9 +50,14 @@ impl Parser {
        Self {
            remove: Vec::new(),
            replace: Vec::new(),
+            move_para_headers_into_content: false,
        }
    }

+    /// Enables moving `§` paragraph headers into the `<absatz>` element that follows them.
+    ///
+    /// Only sets the flag; the rewrite itself happens on the raw XML right before it is parsed.
+    /// There is no counterpart to switch the behaviour off again.
+    pub fn move_para_headers_into_content(&mut self) {
+        self.move_para_headers_into_content = true;
+    }
+
    /// Registers `data` as a string to remove by appending an owned copy to the `remove` list.
    pub fn add_string_to_remove(&mut self, data: &str) {
        let owned = data.to_owned();
        self.remove.push(owned);
    }
@@ -78,8 +85,45 @@ impl Parser {
            xml = xml.replace(search, replace);
        }

+        let xml = if self.move_para_headers_into_content {
+            Self::do_move_para_headers_into_content(xml)
+        } else {
+            xml
+        };
+
        Risdok::from_str(&xml, builder)
    }
+
+    fn do_move_para_headers_into_content(xml: String) -> String {
+        let mut result = String::from(&xml);
+        let ueberschrift_regex = Regex::new(
+            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
+        )
+        .unwrap();
+        let absatz_regex =
+            Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
+
+        // Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
+        for cap in ueberschrift_regex.captures_iter(&xml) {
+            let ueberschrift_content = &cap[1];
+
+            // Check if there's an <absatz> following the <ueberschrift>
+            if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
+                // Calculate the insertion point for the <gldsym> tag
+                let insert_point =
+                    cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
+                // Insert the <gldsym> tag with the ueberschrift content into the result string
+                result.insert_str(
+                    insert_point,
+                    &format!("<gldsym>{}</gldsym>", ueberschrift_content),
+                );
+            }
+
+            // Remove the <ueberschrift> tag from the result string
+            result.replace_range(cap.get(0).unwrap().range(), "");
+        }
+        result
+    }
 }

 fn fetch(url: &str) -> Result<String, Error> {