diff --git a/Cargo.lock b/Cargo.lock index fcf1bf2..1deae59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -583,6 +583,7 @@ dependencies = [ "env_logger", "log", "pretty_assertions", + "regex", "roxmltree", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index fca36a0..3040264 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,7 @@ tqdm = "0.6" toml = "0.8" clap = { version = "4.5.0", features = ["derive"] } directories = "5.0" +regex = "1.10" [dev-dependencies] pretty_assertions = "1.4" diff --git a/src/config.rs b/src/config.rs index d6da168..a508b8c 100644 --- a/src/config.rs +++ b/src/config.rs @@ -82,6 +82,10 @@ impl Config { parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with); } + if config.parser.move_para_headers_into_content { + parser.move_para_headers_into_content(); + } + Ok((config.law.id, builder, parser)) } } @@ -101,6 +105,16 @@ struct Classifier { #[derive(Debug, Deserialize, Default)] struct ParserConfig { + /// e.g. used in EheG to transform `§ + /// 6` into + /// ` + /// + /// § 1. + /// text... + /// + ///` + #[serde(default)] //okay to not have this part in the config + move_para_headers_into_content: bool, #[serde(default)] //okay to not have this part in the config remove_strings: Vec, #[serde(default)] //okay to not have this part in the config diff --git a/src/law/responsible.rs b/src/law/responsible.rs index 13f28ef..e0fa480 100644 --- a/src/law/responsible.rs +++ b/src/law/responsible.rs @@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool { pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool { instance_name.starts_with(|c: char| c.is_ascii_uppercase()) + && instance_name.chars().nth(0) != Some('I') && (instance_name.chars().nth(1) == Some('.') || instance_name.chars().nth(1) == Some(')')) } diff --git a/src/paragraph/mod.rs b/src/paragraph/mod.rs index f1e031c..0b73f07 100644 --- a/src/paragraph/mod.rs +++ b/src/paragraph/mod.rs @@ -17,6 +17,7 @@ //! Deals with getting all paragraphs for a given law text mod parser; +use regex::Regex; use std::{ fs, hash::{DefaultHasher, Hash, Hasher}, @@ -35,6 +36,7 @@ use self::parser::Risdok; pub struct Parser { remove: Vec, replace: Vec<(String, String)>, + move_para_headers_into_content: bool, } impl Default for Parser { @@ -48,9 +50,14 @@ impl Parser { Self { remove: Vec::new(), replace: Vec::new(), + move_para_headers_into_content: false, } } + pub fn move_para_headers_into_content(&mut self) { + self.move_para_headers_into_content = true; + } + pub fn add_string_to_remove(&mut self, data: &str) { self.remove.push(data.into()); } @@ -78,8 +85,45 @@ impl Parser { xml = xml.replace(search, replace); } + let xml = if self.move_para_headers_into_content { + Self::do_move_para_headers_into_content(xml) + } else { + xml + }; + Risdok::from_str(&xml, builder) } + + fn do_move_para_headers_into_content(xml: String) -> String { + let mut result = String::from(&xml); + let ueberschrift_regex = Regex::new( + "(§.*?)", + ) + .unwrap(); + let absatz_regex = + Regex::new("").unwrap(); + + // Find all matches for tags and iterate over them in reverse to avoid messing up the indices + for cap in ueberschrift_regex.captures_iter(&xml) { + let ueberschrift_content = &cap[1]; + + // Check if there's an following the + if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) { + // Calculate the insertion point for the tag + let insert_point = + cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len(); + // Insert the tag with the ueberschrift content into the result string + result.insert_str( + insert_point, + &format!("{}", ueberschrift_content), + ); + } + + // Remove the tag from the result string + result.replace_range(cap.get(0).unwrap().range(), ""); + } + result + } } fn fetch(url: &str) -> Result {