enable function to move paragraphs into absaetze

2024-02-15 15:07:36 +01:00
parent 567994a47e
commit 6c31131cdd
5 changed files with 61 additions and 0 deletions
@@ -17,6 +17,7 @@
 //! Deals with getting all paragraphs for a given law text
 mod parser;

+use regex::Regex;
 use std::{
    fs,
    hash::{DefaultHasher, Hash, Hasher},
@@ -35,6 +36,7 @@ use self::parser::Risdok;
 pub struct Parser {
     /// Strings collected through `add_string_to_remove`.
     /// NOTE(review): presumably stripped from the XML before parsing — the code that
     /// consumes this list is not visible here; confirm.
     remove: Vec<String>,
     /// `(search, replacement)` string pairs.
     /// NOTE(review): presumably the pairs fed to `xml.replace(search, replace)` while
     /// parsing — the loop header is not visible here; confirm.
     replace: Vec<(String, String)>,
     /// When `true`, the XML is run through `do_move_para_headers_into_content` before it
     /// is handed to `Risdok::from_str`. Starts out `false`.
+    move_para_headers_into_content: bool,
 }

 impl Default for Parser {
@@ -48,9 +50,14 @@ impl Parser {
        Self {
            remove: Vec::new(),
            replace: Vec::new(),
+            move_para_headers_into_content: false,
        }
    }

+    /// Turns on the "move paragraph headers into content" step.
+    ///
+    /// Once enabled, the XML is rewritten by `do_move_para_headers_into_content` before it
+    /// is parsed. The flag starts out `false` and this method only ever sets it to `true`.
+    pub fn move_para_headers_into_content(&mut self) {
+        let enabled = true;
+        self.move_para_headers_into_content = enabled;
+    }
+
    pub fn add_string_to_remove(&mut self, data: &str) {
        self.remove.push(data.into());
    }
@@ -78,8 +85,45 @@ impl Parser {
            xml = xml.replace(search, replace);
        }

+        let xml = if self.move_para_headers_into_content {
+            Self::do_move_para_headers_into_content(xml)
+        } else {
+            xml
+        };
+
        Risdok::from_str(&xml, builder)
    }
+
+    /// Moves paragraph headings into the content of the paragraph that follows them.
+    ///
+    /// Every `<ueberschrift typ=".." ct=".." halign="..">§…</ueberschrift>` element is
+    /// removed from `xml`. If an `<absatz typ=".." ct=".." halign="..">` opening tag occurs
+    /// anywhere after the heading, the heading text is re-inserted directly behind the
+    /// first such tag as `<gldsym>§…</gldsym>`; otherwise the heading is dropped.
+    ///
+    /// Returns the rewritten XML.
+    fn do_move_para_headers_into_content(xml: String) -> String {
+        let ueberschrift_regex = Regex::new(
+            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
+        )
+        .expect("hard-coded ueberschrift pattern is valid");
+        let absatz_regex =
+            Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">")
+                .expect("hard-coded absatz pattern is valid");
+
+        // Record every heading (byte range + text) up front, so the input can be edited
+        // in place afterwards without keeping a second full copy of the document.
+        let headers: Vec<_> = ueberschrift_regex
+            .captures_iter(&xml)
+            .map(|cap| {
+                let whole = cap.get(0).expect("group 0 is always the whole match");
+                (whole.range(), cap[1].to_owned())
+            })
+            .collect();
+
+        let mut result = xml;
+        // Walk the headings back to front: every edit happens at or behind the heading
+        // being processed, so the recorded byte ranges of all earlier headings stay valid.
+        // Going front to back would shift them and corrupt (or panic on) later edits.
+        for (range, heading) in headers.into_iter().rev() {
+            // Look for the first <absatz> opening tag behind this heading.
+            if let Some(absatz_match) = absatz_regex.find(&result[range.end..]) {
+                let insert_point = range.end + absatz_match.end();
+                result.insert_str(insert_point, &format!("<gldsym>{heading}</gldsym>"));
+            }
+
+            // The insertion lies behind the heading, so `range` still addresses it.
+            result.replace_range(range, "");
+        }
+        result
+    }
 }

 fn fetch(url: &str) -> Result<String, Error> {