philipp 350b1e5ebb
Some checks failed
CI/CD Pipeline / test (push) Failing after 32s
add paragraph parser to lib, add test for teg
2024-02-05 14:28:57 +01:00

127 lines
4.9 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Deals with getting all paragraphs for a given law text
mod parser;
use log::info;
use crate::{law::LawBuilder, misc::Error};
use self::parser::Risdok;
/// Fetches and parses the paragraph documents of a law text.
///
/// Holds a list of literal strings that are deleted from every document
/// before it is handed to the XML parser.
#[derive(Debug, Clone, Default)]
pub struct Parser {
    /// Literal substrings to delete from the raw XML prior to parsing.
    remove: Vec<String>,
}
impl Parser {
pub fn new() -> Self {
Self { remove: Vec::new() }
}
pub fn add_string_to_remove(&mut self, data: &str) {
self.remove.push(data.into());
}
/// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`.
pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
info!("Parsing {url}");
let xml = fetch(url)?;
self.parse_from_str(&xml, builder)
}
fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
let mut xml = String::from(xml);
for r in &self.remove {
xml = xml.replace(r, "");
}
Risdok::from_str(&xml, builder)
}
}
/// Downloads the document at `url` and returns its body as a string.
///
/// Only compiled outside of test builds.
///
/// # Errors
/// Propagates any failure from issuing the request or from reading the response body.
#[cfg(not(test))]
fn fetch(url: &str) -> Result<String, Error> {
    let response = ureq::get(url).call()?;
    let body = response.into_string()?;
    Ok(body)
}
/// Test-build variant of `fetch` that keeps downloaded documents on disk.
///
/// The cache file lives under `./data/cache/` and is named after a hash of `url`. If that file
/// can be read, its content is returned; otherwise the URL is downloaded, the body is written to
/// that file, and the body is returned.
///
/// # Errors
/// Propagates any failure from issuing the request or from reading the response body.
///
/// # Panics
/// Panics if the freshly downloaded document cannot be written to the cache file.
#[cfg(test)]
fn fetch(url: &str) -> Result<String, Error> {
    use std::{
        fs,
        hash::{DefaultHasher, Hash, Hasher},
    };

    let cache_path = {
        let mut state = DefaultHasher::new();
        url.hash(&mut state);
        format!("./data/cache/par-{:x}", state.finish())
    };

    // Any read failure (not only a missing file) counts as a cache miss.
    if let Ok(cached) = fs::read_to_string(&cache_path) {
        return Ok(cached);
    }

    info!("Not finding url {url} in the cache, downloading...");
    let body = ureq::get(url).call()?.into_string()?;
    fs::write(&cache_path, &body).expect("Unable to write file");
    Ok(body)
}
#[cfg(test)]
mod tests {
    use std::{fs, sync::Arc};

    use crate::{
        law::{responsible::contains, Classifier, LawBuilder},
        risparser::paragraph::Parser,
    };
    use pretty_assertions::assert_eq;

    /// Ids of the laws whose paragraph output is currently checked.
    //TODO: Remove this allow-list once all law texts pass
    const CHECKED_LAWS: [&str; 1] = ["10001905"];

    /// For every checked law, feeds each paragraph URL listed in its overview file through the
    /// parser and compares the builder's recorded history with the expected file, line by line.
    #[test]
    fn paragraph() {
        let laws = fs::read_dir("./data/expected/overview/")
            .expect("No folder with expected overview files");

        for law in laws {
            let entry = law.unwrap();
            let law_path = entry.path();
            let law_id = entry.file_name().into_string().unwrap();
            if !CHECKED_LAWS.contains(&law_id.as_str()) {
                continue;
            }

            let expected_path = format!("./data/expected/par/{law_id}");
            // `expect` does not interpolate its message, so build the panic text explicitly.
            let pars = fs::read_to_string(&law_path)
                .unwrap_or_else(|e| panic!("Could not read file {}: {e}", law_path.display()));
            let pars = pars.trim().split('\n').collect::<Vec<&str>>();

            let mut builder = LawBuilder::new("law");
            builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root());

            let mut parser = Parser::new();
            // These literals are removed by exact substring match, so they have to stay
            // byte-for-byte identical to the markup in the fetched documents.
            parser.add_string_to_remove(r#"<absatz typ="abs" ct="text" halign="j"><i>(§§ 1, 2, 3, 4,6, § 8 Abs. 1 und 2 des Gesetzes vom 16. Februar 1883, RGBl. Nr. 20, betreffend das Verfahren zum Zwecke der Todeserklärung und der Beweisführung des Todes, in der Fassung des Gesetzes vom 31. März 1918, RGBl. Nr. 129:)</i></absatz>"#);
            parser.add_string_to_remove(r#"<absatz typ="abs" ct="text" halign="j"><i>(§§ 1 bis 11 des Gesetzes über die Verschollenheit, die Todeserklärung und die Feststellung der Todeszeit vom 4. Juli 1939, Deutsches RGBl. I S. 1186:)</i></absatz>"#);
            parser.add_string_to_remove(r#"<absatz typ="abs" ct="text" halign="j"><i>(§ 56 Abs. 3 des Gesetzes vom 4. Juli 1939, Deutsches RGBl. I S. 1186:)</i></absatz>"#);
            parser.add_string_to_remove(r#"<absatz typ="abs" ct="text" halign="j"><i>(§ 10 des Gesetzes vom 16. Februar 1883, RGBl. Nr. 20, betreffend das Verfahren zum Zwecke der Todeserklärung und der Beweisführung des Todes, in der Fassung des Gesetzes vom 31. März 1918, RGBl. Nr. 129:)</i></absatz>"#);
            parser.add_string_to_remove(r#"<absatz typ="abs" ct="text" halign="j"><i>(§§ 10a, 10b und 10c des Gesetzes vom 16. Februar 1883, RGBl. Nr. 20, betreffend das Verfahren zum Zwecke der Todeserklärung und der Beweisführung des Todes, in der Fassung des Gesetzes vom 31. März 1918, RGBl. Nr. 129:)</i></absatz>"#);
            parser.add_string_to_remove("<i>");
            parser.add_string_to_remove("</i>");

            for par in pars {
                // Stop feeding further paragraphs as soon as the parser returns `false`.
                let cont = parser.parse(par, &mut builder).unwrap();
                if !cont {
                    break;
                }
            }

            let actual = builder.history;
            // Build the panic message lazily instead of formatting it on the success path.
            let expected = fs::read_to_string(&expected_path)
                .unwrap_or_else(|e| panic!("Could not read file {expected_path}: {e}"));
            let expected = expected.trim().split('\n').collect::<Vec<&str>>();
            assert_eq!(actual, expected);
        }
    }
}