enable function to move paragraphs into absaetze
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s

This commit is contained in:
2024-02-15 15:07:36 +01:00
parent 567994a47e
commit 6c31131cdd
5 changed files with 61 additions and 0 deletions

View File

@ -17,6 +17,7 @@
//! Deals with getting all paragraphs for a given law text
mod parser;
use regex::Regex;
use std::{
fs,
hash::{DefaultHasher, Hash, Hasher},
@ -35,6 +36,7 @@ use self::parser::Risdok;
/// Options controlling how the XML text is pre-processed before it is handed
/// to `Risdok::from_str`.
pub struct Parser {
/// Strings collected through `add_string_to_remove`.
/// NOTE(review): presumably these are stripped from the XML before parsing;
/// the code consuming this list is not visible here — confirm.
remove: Vec<String>,
/// Pairs of strings.
/// NOTE(review): presumably `(search, replacement)` pairs applied with
/// `str::replace` before parsing — confirm against the parsing method.
replace: Vec<(String, String)>,
/// When `true`, the XML is passed through
/// `do_move_para_headers_into_content` before being parsed. Initialised to
/// `false` and switched on by the method of the same name.
move_para_headers_into_content: bool,
}
impl Default for Parser {
@ -48,9 +50,14 @@ impl Parser {
Self {
remove: Vec::new(),
replace: Vec::new(),
move_para_headers_into_content: false,
}
}
/// Turns on the optional pre-processing step that relocates paragraph
/// headings into the content that follows them.
///
/// The switch is one-way: this method only ever sets the flag to `true`.
pub fn move_para_headers_into_content(&mut self) {
    self.move_para_headers_into_content = true;
}
/// Appends an owned copy of `data` to the list of strings to remove.
pub fn add_string_to_remove(&mut self, data: &str) {
    let owned = String::from(data);
    self.remove.push(owned);
}
@ -78,8 +85,45 @@ impl Parser {
xml = xml.replace(search, replace);
}
let xml = if self.move_para_headers_into_content {
Self::do_move_para_headers_into_content(xml)
} else {
xml
};
Risdok::from_str(&xml, builder)
}
/// Moves every paragraph heading (`<ueberschrift …>§…</ueberschrift>`) into
/// the first `<absatz …>` element that follows it.
///
/// For each heading whose text starts with `§`:
/// * the heading text is wrapped in `<gldsym>…</gldsym>` and inserted directly
///   after the opening tag of the next `<absatz …>`, if there is one;
/// * the `<ueberschrift>` element itself is removed, whether or not a
///   following `<absatz>` was found.
///
/// Returns the rewritten XML. Input without matching headings is returned
/// unchanged.
fn do_move_para_headers_into_content(xml: String) -> String {
    let ueberschrift_regex = Regex::new(
        "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
    )
    .expect("hard-coded ueberschrift pattern is a valid regex");
    let absatz_regex =
        Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">")
            .expect("hard-coded absatz pattern is a valid regex");

    // Snapshot the byte range and text of every heading up front, so the
    // string can be edited afterwards without holding a borrow on it.
    let headers: Vec<_> = ueberschrift_regex
        .captures_iter(&xml)
        .map(|cap| {
            let whole = cap.get(0).expect("group 0 is always part of a match");
            (whole.range(), cap[1].to_owned())
        })
        .collect();

    let mut result = xml;

    // Apply the edits back to front: every edit only touches bytes at or
    // after the heading's start, so the recorded offsets of all earlier
    // headings remain valid.
    for (header_range, header_text) in headers.into_iter().rev() {
        // End offset (relative to the end of the heading) of the next
        // <absatz …> opening tag, if any.
        let absatz_end = absatz_regex
            .find(&result[header_range.end..])
            .map(|m| m.end());

        // Insert first: the insertion point lies behind the heading, so the
        // heading's own range is still correct for the removal below.
        if let Some(offset) = absatz_end {
            result.insert_str(
                header_range.end + offset,
                &format!("<gldsym>{}</gldsym>", header_text),
            );
        }

        // Drop the <ueberschrift> element itself.
        result.replace_range(header_range, "");
    }

    result
}
}
fn fetch(url: &str) -> Result<String, Error> {