diff --git a/Cargo.lock b/Cargo.lock
index fcf1bf2..1deae59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -583,6 +583,7 @@ dependencies = [
"env_logger",
"log",
"pretty_assertions",
+ "regex",
"roxmltree",
"serde",
"serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index fca36a0..3040264 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -17,6 +17,7 @@ tqdm = "0.6"
toml = "0.8"
clap = { version = "4.5.0", features = ["derive"] }
directories = "5.0"
+regex = "1.10"
[dev-dependencies]
pretty_assertions = "1.4"
diff --git a/src/config.rs b/src/config.rs
index d6da168..a508b8c 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -82,6 +82,10 @@ impl Config {
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
}
+ if config.parser.move_para_headers_into_content {
+ parser.move_para_headers_into_content();
+ }
+
Ok((config.law.id, builder, parser))
}
}
@@ -101,6 +105,16 @@ struct Classifier {
#[derive(Debug, Deserialize, Default)]
struct ParserConfig {
+ /// e.g. used in EheG to transform `§
+ /// 6` into
+ /// `
+ ///
+ /// § 1.
+ /// text...
+ ///
+ ///`
+ #[serde(default)] //okay to not have this part in the config
+ move_para_headers_into_content: bool,
#[serde(default)] //okay to not have this part in the config
remove_strings: Vec,
#[serde(default)] //okay to not have this part in the config
diff --git a/src/law/responsible.rs b/src/law/responsible.rs
index 13f28ef..e0fa480 100644
--- a/src/law/responsible.rs
+++ b/src/law/responsible.rs
@@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
+ /// Returns `true` when `instance_name` begins with an ASCII uppercase letter other than `I`
+ /// whose next character is `.` or `)`. `_classifier_name` is not used.
pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
instance_name.starts_with(|c: char| c.is_ascii_uppercase())
+ // NOTE(review): presumably rejects "I." / "I)" because they read as the Roman numeral one — confirm.
+ && !instance_name.starts_with('I')
&& (instance_name.chars().nth(1) == Some('.') || instance_name.chars().nth(1) == Some(')'))
}
diff --git a/src/paragraph/mod.rs b/src/paragraph/mod.rs
index f1e031c..0b73f07 100644
--- a/src/paragraph/mod.rs
+++ b/src/paragraph/mod.rs
@@ -17,6 +17,7 @@
//! Deals with getting all paragraphs for a given law text
mod parser;
+use regex::Regex;
use std::{
fs,
hash::{DefaultHasher, Hash, Hasher},
@@ -35,6 +36,7 @@ use self::parser::Risdok;
pub struct Parser {
remove: Vec,
replace: Vec<(String, String)>,
+ move_para_headers_into_content: bool, // when true, the XML is run through `do_move_para_headers_into_content` before `Risdok::from_str`
}
impl Default for Parser {
@@ -48,9 +50,14 @@ impl Parser {
Self {
remove: Vec::new(),
replace: Vec::new(),
+ move_para_headers_into_content: false,
}
}
+    /// Switches on the paragraph-header relocation step.
+    ///
+    /// Once set, the XML is passed through `do_move_para_headers_into_content` before it is
+    /// handed to `Risdok::from_str`. This method only ever sets the flag to `true`.
+    pub fn move_para_headers_into_content(&mut self) {
+        self.move_para_headers_into_content = true;
+    }
+
pub fn add_string_to_remove(&mut self, data: &str) {
self.remove.push(data.into());
}
@@ -78,8 +85,45 @@ impl Parser {
xml = xml.replace(search, replace);
}
+ let xml = if self.move_para_headers_into_content {
+ Self::do_move_para_headers_into_content(xml)
+ } else {
+ xml
+ };
+
Risdok::from_str(&xml, builder)
}
+
+    /// Moves every heading matched by `ueberschrift_regex` into the text that follows it.
+    ///
+    /// For each heading match, the captured heading text is inserted directly behind the
+    /// first `absatz_regex` match found after the heading, and the heading itself is then
+    /// deleted. A heading with no `absatz_regex` match after it is deleted without being
+    /// re-inserted anywhere.
+    ///
+    /// # Panics
+    ///
+    /// Panics if one of the hard-coded patterns fails to compile.
+    fn do_move_para_headers_into_content(xml: String) -> String {
+        let ueberschrift_regex = Regex::new(
+            "(§.*?)",
+        )
+        .unwrap();
+        let absatz_regex =
+            Regex::new("").unwrap();
+
+        // Record every heading (byte range of the whole match plus the captured text) while
+        // `xml` is still untouched, so the recorded offsets are known to be valid.
+        let headings: Vec<_> = ueberschrift_regex
+            .captures_iter(&xml)
+            .map(|cap| (cap.get(0).unwrap().range(), cap[1].to_owned()))
+            .collect();
+
+        // Apply the edits back to front: each edit only touches bytes at or after the start of
+        // its own heading, so the recorded offsets of all earlier headings stay correct.
+        // Editing front to back would shift them, making later slices hit the wrong bytes or
+        // panic on a non-char-boundary (`§` is multi-byte in UTF-8).
+        let mut result = xml;
+        for (heading_range, ueberschrift_content) in headings.into_iter().rev() {
+            // Look for the first absatz match after the heading.
+            if let Some(absatz_match) = absatz_regex.find(&result[heading_range.end..]) {
+                // `find` ran on the tail slice, so shift its offset back into `result`.
+                let insert_point = heading_range.end + absatz_match.end();
+                result.insert_str(
+                    insert_point,
+                    &format!("{}", ueberschrift_content),
+                );
+            }
+
+            // Remove the heading itself; it lies before `insert_point`, so the insertion above
+            // did not move it.
+            result.replace_range(heading_range, "");
+        }
+        result
+    }
}
fn fetch(url: &str) -> Result {