enable function to move paragraphs into absaetze
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s

This commit is contained in:
philipp 2024-02-15 15:07:36 +01:00
parent 567994a47e
commit 6c31131cdd
5 changed files with 61 additions and 0 deletions

1
Cargo.lock generated
View File

@ -583,6 +583,7 @@ dependencies = [
"env_logger", "env_logger",
"log", "log",
"pretty_assertions", "pretty_assertions",
"regex",
"roxmltree", "roxmltree",
"serde", "serde",
"serde_json", "serde_json",

View File

@ -17,6 +17,7 @@ tqdm = "0.6"
toml = "0.8" toml = "0.8"
clap = { version = "4.5.0", features = ["derive"] } clap = { version = "4.5.0", features = ["derive"] }
directories = "5.0" directories = "5.0"
regex = "1.10"
[dev-dependencies] [dev-dependencies]
pretty_assertions = "1.4" pretty_assertions = "1.4"

View File

@ -82,6 +82,10 @@ impl Config {
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with); parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
} }
if config.parser.move_para_headers_into_content {
parser.move_para_headers_into_content();
}
Ok((config.law.id, builder, parser)) Ok((config.law.id, builder, parser))
} }
} }
@ -101,6 +105,16 @@ struct Classifier {
#[derive(Debug, Deserialize, Default)] #[derive(Debug, Deserialize, Default)]
struct ParserConfig { struct ParserConfig {
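/// When set, the text of a `§` paragraph header is moved into the `<absatz>` that follows it
/// and the header element itself is removed.
/// e.g. used in EheG to transform `<ueberschrift typ="para" ct="text" halign="c">§
/// 6</ueberschrift>` followed by `<absatz typ="abs" ct="text" halign="j">text...</absatz>` into
/// `
/// <absatz typ="abs" ct="text" halign="j">
/// <gldsym>§ 6</gldsym>
/// text...
/// </absatz>
///`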
#[serde(default)] //okay to not have this part in the config
move_para_headers_into_content: bool,
#[serde(default)] //okay to not have this part in the config #[serde(default)] //okay to not have this part in the config
remove_strings: Vec<String>, remove_strings: Vec<String>,
#[serde(default)] //okay to not have this part in the config #[serde(default)] //okay to not have this part in the config

View File

@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when `instance_name` looks like an upper-case letter enumeration such as
/// `A.` or `B)`.
///
/// The name must start with an ASCII upper-case letter other than `I`, immediately followed
/// by `.` or `)`. `_classifier_name` is not used.
///
/// NOTE(review): `I` is presumably excluded so that the roman numeral `I.` is not treated as
/// a letter — confirm against the roman-numeral classifier.
pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    // Walk the string once instead of re-scanning it with `chars().nth(..)` per check.
    let mut chars = instance_name.chars();
    let letter_ok = matches!(chars.next(), Some(c) if c.is_ascii_uppercase() && c != 'I');
    letter_ok && matches!(chars.next(), Some('.' | ')'))
}

View File

@ -17,6 +17,7 @@
//! Deals with getting all paragraphs for a given law text //! Deals with getting all paragraphs for a given law text
mod parser; mod parser;
use regex::Regex;
use std::{ use std::{
fs, fs,
hash::{DefaultHasher, Hash, Hasher}, hash::{DefaultHasher, Hash, Hasher},
@ -35,6 +36,7 @@ use self::parser::Risdok;
pub struct Parser { pub struct Parser {
remove: Vec<String>, remove: Vec<String>,
replace: Vec<(String, String)>, replace: Vec<(String, String)>,
move_para_headers_into_content: bool,
} }
impl Default for Parser { impl Default for Parser {
@ -48,9 +50,14 @@ impl Parser {
Self { Self {
remove: Vec::new(), remove: Vec::new(),
replace: Vec::new(), replace: Vec::new(),
move_para_headers_into_content: false,
} }
} }
/// Switches on the "move paragraph headers into content" rewrite for this parser.
///
/// Once set, the XML is passed through `do_move_para_headers_into_content` before it is
/// handed to `Risdok::from_str`.
pub fn move_para_headers_into_content(&mut self) {
self.move_para_headers_into_content = true;
}
pub fn add_string_to_remove(&mut self, data: &str) { pub fn add_string_to_remove(&mut self, data: &str) {
self.remove.push(data.into()); self.remove.push(data.into());
} }
@ -78,8 +85,45 @@ impl Parser {
xml = xml.replace(search, replace); xml = xml.replace(search, replace);
} }
let xml = if self.move_para_headers_into_content {
Self::do_move_para_headers_into_content(xml)
} else {
xml
};
Risdok::from_str(&xml, builder) Risdok::from_str(&xml, builder)
} }
/// Moves every `§ …` header into the `<absatz>` element that follows it.
///
/// Each `<ueberschrift typ=".." ct=".." halign="..">§ …</ueberschrift>` element (attributes in
/// exactly this order, content on a single line) is removed from `xml`. If an
/// `<absatz typ=".." ct=".." halign="..">` opening tag occurs anywhere after the header, the
/// header text is inserted directly behind that tag as `<gldsym>§ …</gldsym>`; if there is
/// none, the header is dropped without replacement.
///
/// Returns the rewritten document.
fn do_move_para_headers_into_content(xml: String) -> String {
    let ueberschrift_regex = Regex::new(
        "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
    )
    .expect("hard-coded ueberschrift pattern is a valid regex");
    let absatz_regex =
        Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">")
            .expect("hard-coded absatz pattern is a valid regex");

    // Record the byte range and the `§ …` text of every header up front, so that the borrow
    // of `xml` ends before the string is modified.
    let headers: Vec<_> = ueberschrift_regex
        .captures_iter(&xml)
        .map(|cap| {
            let whole = cap.get(0).expect("capture group 0 is always present");
            (whole.range(), cap[1].to_owned())
        })
        .collect();

    let mut result = xml;
    // Process the headers back to front: every edit then happens at or behind the current
    // header, so the byte offsets of all headers still to be processed stay valid. Going
    // front to back would shift them after the first edit and corrupt the document.
    for (header_range, header_text) in headers.into_iter().rev() {
        // Look for the next <absatz> opening tag behind this header in the current text.
        if let Some(absatz_match) = absatz_regex.find(&result[header_range.end..]) {
            // `find` reports offsets relative to the searched slice.
            let insert_point = header_range.end + absatz_match.end();
            result.insert_str(
                insert_point,
                &format!("<gldsym>{}</gldsym>", header_text),
            );
        }
        // The insertion (if any) was behind the header, so its range is still accurate.
        result.replace_range(header_range, "");
    }

    result
}
} }
fn fetch(url: &str) -> Result<String, Error> { fn fetch(url: &str) -> Result<String, Error> {