enable function to move paragraphs into absaetze
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
This commit is contained in:
parent
567994a47e
commit
6c31131cdd
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -583,6 +583,7 @@ dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"pretty_assertions",
|
||||
"regex",
|
||||
"roxmltree",
|
||||
"serde",
|
||||
"serde_json",
|
||||
|
@ -17,6 +17,7 @@ tqdm = "0.6"
|
||||
toml = "0.8"
|
||||
clap = { version = "4.5.0", features = ["derive"] }
|
||||
directories = "5.0"
|
||||
regex = "1.10"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
|
@ -82,6 +82,10 @@ impl Config {
|
||||
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
||||
}
|
||||
|
||||
if config.parser.move_para_headers_into_content {
|
||||
parser.move_para_headers_into_content();
|
||||
}
|
||||
|
||||
Ok((config.law.id, builder, parser))
|
||||
}
|
||||
}
|
||||
@ -101,6 +105,16 @@ struct Classifier {
|
||||
|
||||
#[derive(Debug, Deserialize, Default)]
|
||||
struct ParserConfig {
|
||||
/// e.g. used in EheG to transform `<ueberschrift typ="para" ct="text" halign="c">§
|
||||
/// 6</ueberschrift>` into
|
||||
/// `
|
||||
/// <absatz typ="abs" ct="text" halign="j">
|
||||
/// <gldsym>§ 6</gldsym>
|
||||
/// text...
|
||||
/// </absatz>
|
||||
///`
|
||||
#[serde(default)] //okay to not have this part in the config
|
||||
move_para_headers_into_content: bool,
|
||||
#[serde(default)] //okay to not have this part in the config
|
||||
remove_strings: Vec<String>,
|
||||
#[serde(default)] //okay to not have this part in the config
|
||||
|
@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
|
||||
|
||||
/// Returns `true` when `instance_name` begins with an upper-case ASCII letter
/// other than `I` that is immediately followed by `.` or `)` (e.g. `A.` or `B)`).
///
/// `_classifier_name` is not used by this check.
///
/// NOTE(review): `I` is presumably excluded so that Roman-numeral items such as
/// `I.` are not mistaken for letter items — confirm against the other classifiers.
pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    // Walk the string once instead of re-scanning it with `chars().nth(..)` for
    // every individual condition.
    let mut chars = instance_name.chars();
    match (chars.next(), chars.next()) {
        (Some(first), Some(second)) => {
            first.is_ascii_uppercase() && first != 'I' && matches!(second, '.' | ')')
        }
        // Fewer than two characters can never form "<letter><delimiter>".
        _ => false,
    }
}
|
||||
|
||||
|
@ -17,6 +17,7 @@
|
||||
//! Deals with getting all paragraphs for a given law text
|
||||
mod parser;
|
||||
|
||||
use regex::Regex;
|
||||
use std::{
|
||||
fs,
|
||||
hash::{DefaultHasher, Hash, Hasher},
|
||||
@ -35,6 +36,7 @@ use self::parser::Risdok;
|
||||
/// Settings that are applied to a law's XML text before it is handed to
/// `Risdok::from_str`.
pub struct Parser {
|
||||
/// Strings registered through `add_string_to_remove`.
/// NOTE(review): presumably stripped from the XML before parsing — the code
/// that consumes this list is not visible here.
remove: Vec<String>,
|
||||
/// NOTE(review): presumably `(search, replacement)` pairs applied with
/// `str::replace` — confirm against the parse method.
replace: Vec<(String, String)>,
|
||||
/// When `true`, the XML is passed through
/// `do_move_para_headers_into_content` before parsing.
move_para_headers_into_content: bool,
|
||||
}
|
||||
|
||||
impl Default for Parser {
|
||||
@ -48,9 +50,14 @@ impl Parser {
|
||||
Self {
|
||||
remove: Vec::new(),
|
||||
replace: Vec::new(),
|
||||
move_para_headers_into_content: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn move_para_headers_into_content(&mut self) {
|
||||
self.move_para_headers_into_content = true;
|
||||
}
|
||||
|
||||
pub fn add_string_to_remove(&mut self, data: &str) {
|
||||
self.remove.push(data.into());
|
||||
}
|
||||
@ -78,8 +85,45 @@ impl Parser {
|
||||
xml = xml.replace(search, replace);
|
||||
}
|
||||
|
||||
let xml = if self.move_para_headers_into_content {
|
||||
Self::do_move_para_headers_into_content(xml)
|
||||
} else {
|
||||
xml
|
||||
};
|
||||
|
||||
Risdok::from_str(&xml, builder)
|
||||
}
|
||||
|
||||
fn do_move_para_headers_into_content(xml: String) -> String {
|
||||
let mut result = String::from(&xml);
|
||||
let ueberschrift_regex = Regex::new(
|
||||
"<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
|
||||
)
|
||||
.unwrap();
|
||||
let absatz_regex =
|
||||
Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
|
||||
|
||||
// Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
|
||||
for cap in ueberschrift_regex.captures_iter(&xml) {
|
||||
let ueberschrift_content = &cap[1];
|
||||
|
||||
// Check if there's an <absatz> following the <ueberschrift>
|
||||
if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
|
||||
// Calculate the insertion point for the <gldsym> tag
|
||||
let insert_point =
|
||||
cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
|
||||
// Insert the <gldsym> tag with the ueberschrift content into the result string
|
||||
result.insert_str(
|
||||
insert_point,
|
||||
&format!("<gldsym>{}</gldsym>", ueberschrift_content),
|
||||
);
|
||||
}
|
||||
|
||||
// Remove the <ueberschrift> tag from the result string
|
||||
result.replace_range(cap.get(0).unwrap().range(), "");
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch(url: &str) -> Result<String, Error> {
|
||||
|
Loading…
Reference in New Issue
Block a user