Enable function to move paragraph headers (§ …) into the following Absätze
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
This commit is contained in:
@ -17,6 +17,7 @@
|
||||
//! Deals with getting all paragraphs for a given law text
|
||||
mod parser;
|
||||
|
||||
use regex::Regex;
|
||||
use std::{
|
||||
fs,
|
||||
hash::{DefaultHasher, Hash, Hasher},
|
||||
@ -35,6 +36,7 @@ use self::parser::Risdok;
|
||||
/// Configuration for pre-processing the raw XML text before it is parsed.
pub struct Parser {
    /// Strings registered through `add_string_to_remove`.
    remove: Vec<String>,
    /// Pairs of strings; presumably `(search, replacement)` — confirm against
    /// the code that consumes this field.
    replace: Vec<(String, String)>,
    /// When `true`, paragraph headers are moved into the content of the
    /// following `<absatz>` element before parsing.
    move_para_headers_into_content: bool,
}
|
||||
|
||||
impl Default for Parser {
|
||||
@ -48,9 +50,14 @@ impl Parser {
|
||||
Self {
|
||||
remove: Vec::new(),
|
||||
replace: Vec::new(),
|
||||
move_para_headers_into_content: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn move_para_headers_into_content(&mut self) {
|
||||
self.move_para_headers_into_content = true;
|
||||
}
|
||||
|
||||
pub fn add_string_to_remove(&mut self, data: &str) {
|
||||
self.remove.push(data.into());
|
||||
}
|
||||
@ -78,8 +85,45 @@ impl Parser {
|
||||
xml = xml.replace(search, replace);
|
||||
}
|
||||
|
||||
let xml = if self.move_para_headers_into_content {
|
||||
Self::do_move_para_headers_into_content(xml)
|
||||
} else {
|
||||
xml
|
||||
};
|
||||
|
||||
Risdok::from_str(&xml, builder)
|
||||
}
|
||||
|
||||
fn do_move_para_headers_into_content(xml: String) -> String {
|
||||
let mut result = String::from(&xml);
|
||||
let ueberschrift_regex = Regex::new(
|
||||
"<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
|
||||
)
|
||||
.unwrap();
|
||||
let absatz_regex =
|
||||
Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
|
||||
|
||||
// Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
|
||||
for cap in ueberschrift_regex.captures_iter(&xml) {
|
||||
let ueberschrift_content = &cap[1];
|
||||
|
||||
// Check if there's an <absatz> following the <ueberschrift>
|
||||
if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
|
||||
// Calculate the insertion point for the <gldsym> tag
|
||||
let insert_point =
|
||||
cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
|
||||
// Insert the <gldsym> tag with the ueberschrift content into the result string
|
||||
result.insert_str(
|
||||
insert_point,
|
||||
&format!("<gldsym>{}</gldsym>", ueberschrift_content),
|
||||
);
|
||||
}
|
||||
|
||||
// Remove the <ueberschrift> tag from the result string
|
||||
result.replace_range(cap.get(0).unwrap().range(), "");
|
||||
}
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch(url: &str) -> Result<String, Error> {
|
||||
|
Reference in New Issue
Block a user