enable function to move paragraphs into absaetze
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m48s
This commit is contained in:
parent
567994a47e
commit
6c31131cdd
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -583,6 +583,7 @@ dependencies = [
|
|||||||
"env_logger",
|
"env_logger",
|
||||||
"log",
|
"log",
|
||||||
"pretty_assertions",
|
"pretty_assertions",
|
||||||
|
"regex",
|
||||||
"roxmltree",
|
"roxmltree",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
@ -17,6 +17,7 @@ tqdm = "0.6"
|
|||||||
toml = "0.8"
|
toml = "0.8"
|
||||||
clap = { version = "4.5.0", features = ["derive"] }
|
clap = { version = "4.5.0", features = ["derive"] }
|
||||||
directories = "5.0"
|
directories = "5.0"
|
||||||
|
regex = "1.10"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
pretty_assertions = "1.4"
|
pretty_assertions = "1.4"
|
||||||
|
@ -82,6 +82,10 @@ impl Config {
|
|||||||
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if config.parser.move_para_headers_into_content {
|
||||||
|
parser.move_para_headers_into_content();
|
||||||
|
}
|
||||||
|
|
||||||
Ok((config.law.id, builder, parser))
|
Ok((config.law.id, builder, parser))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -101,6 +105,16 @@ struct Classifier {
|
|||||||
|
|
||||||
#[derive(Debug, Deserialize, Default)]
|
#[derive(Debug, Deserialize, Default)]
|
||||||
struct ParserConfig {
|
struct ParserConfig {
|
||||||
|
/// e.g. used in EheG to transform `<ueberschrift typ="para" ct="text" halign="c">§
|
||||||
|
/// 6</ueberschrift>` into
|
||||||
|
/// `
|
||||||
|
/// <absatz typ="abs" ct="text" halign="j">
|
||||||
|
/// <gldsym>§ 6</gldsym>
|
||||||
|
/// text...
|
||||||
|
/// </absatz>
|
||||||
|
///`
|
||||||
|
#[serde(default)] //okay to not have this part in the config
|
||||||
|
move_para_headers_into_content: bool,
|
||||||
#[serde(default)] //okay to not have this part in the config
|
#[serde(default)] //okay to not have this part in the config
|
||||||
remove_strings: Vec<String>,
|
remove_strings: Vec<String>,
|
||||||
#[serde(default)] //okay to not have this part in the config
|
#[serde(default)] //okay to not have this part in the config
|
||||||
|
@ -63,6 +63,7 @@ pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
|
|||||||
|
|
||||||
/// Returns `true` when `instance_name` begins with a single ASCII uppercase letter other than
/// `I`, immediately followed by `.` or `)` (for example `"A."` or `"B)"`).
///
/// `_classifier_name` is accepted but not used.
pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    let mut letters = instance_name.chars();
    match (letters.next(), letters.next()) {
        (Some(first), Some(second)) => {
            first.is_ascii_uppercase() && first != 'I' && matches!(second, '.' | ')')
        }
        // Fewer than two characters can never form "<letter><delimiter>".
        _ => false,
    }
}
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@
|
|||||||
//! Deals with getting all paragraphs for a given law text
|
//! Deals with getting all paragraphs for a given law text
|
||||||
mod parser;
|
mod parser;
|
||||||
|
|
||||||
|
use regex::Regex;
|
||||||
use std::{
|
use std::{
|
||||||
fs,
|
fs,
|
||||||
hash::{DefaultHasher, Hash, Hasher},
|
hash::{DefaultHasher, Hash, Hasher},
|
||||||
@ -35,6 +36,7 @@ use self::parser::Risdok;
|
|||||||
pub struct Parser {
|
pub struct Parser {
|
||||||
remove: Vec<String>,
|
remove: Vec<String>,
|
||||||
replace: Vec<(String, String)>,
|
replace: Vec<(String, String)>,
|
||||||
|
move_para_headers_into_content: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for Parser {
|
impl Default for Parser {
|
||||||
@ -48,9 +50,14 @@ impl Parser {
|
|||||||
Self {
|
Self {
|
||||||
remove: Vec::new(),
|
remove: Vec::new(),
|
||||||
replace: Vec::new(),
|
replace: Vec::new(),
|
||||||
|
move_para_headers_into_content: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn move_para_headers_into_content(&mut self) {
|
||||||
|
self.move_para_headers_into_content = true;
|
||||||
|
}
|
||||||
|
|
||||||
pub fn add_string_to_remove(&mut self, data: &str) {
|
pub fn add_string_to_remove(&mut self, data: &str) {
|
||||||
self.remove.push(data.into());
|
self.remove.push(data.into());
|
||||||
}
|
}
|
||||||
@ -78,8 +85,45 @@ impl Parser {
|
|||||||
xml = xml.replace(search, replace);
|
xml = xml.replace(search, replace);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let xml = if self.move_para_headers_into_content {
|
||||||
|
Self::do_move_para_headers_into_content(xml)
|
||||||
|
} else {
|
||||||
|
xml
|
||||||
|
};
|
||||||
|
|
||||||
Risdok::from_str(&xml, builder)
|
Risdok::from_str(&xml, builder)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn do_move_para_headers_into_content(xml: String) -> String {
|
||||||
|
let mut result = String::from(&xml);
|
||||||
|
let ueberschrift_regex = Regex::new(
|
||||||
|
"<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let absatz_regex =
|
||||||
|
Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
|
||||||
|
|
||||||
|
// Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
|
||||||
|
for cap in ueberschrift_regex.captures_iter(&xml) {
|
||||||
|
let ueberschrift_content = &cap[1];
|
||||||
|
|
||||||
|
// Check if there's an <absatz> following the <ueberschrift>
|
||||||
|
if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
|
||||||
|
// Calculate the insertion point for the <gldsym> tag
|
||||||
|
let insert_point =
|
||||||
|
cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
|
||||||
|
// Insert the <gldsym> tag with the ueberschrift content into the result string
|
||||||
|
result.insert_str(
|
||||||
|
insert_point,
|
||||||
|
&format!("<gldsym>{}</gldsym>", ueberschrift_content),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove the <ueberschrift> tag from the result string
|
||||||
|
result.replace_range(cap.get(0).unwrap().range(), "");
|
||||||
|
}
|
||||||
|
result
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn fetch(url: &str) -> Result<String, Error> {
|
fn fetch(url: &str) -> Result<String, Error> {
|
||||||
|
Loading…
Reference in New Issue
Block a user