philipp c92d72c580
All checks were successful
CI/CD Pipeline / test (push) Successful in 2m23s
add configs for laws
2024-02-06 11:18:40 +01:00

160 lines
4.5 KiB
Rust

//! Deals with getting all paragraphs for a given law text
mod parser;
use log::info;
use crate::{law::LawBuilder, misc::Error};
use self::parser::Risdok;
/// Text clean-up rules that are applied to a document before it is parsed.
///
/// `Default` produces the same empty rule set as `Parser::new`.
#[derive(Debug, Default)]
pub struct Parser {
    /// Substrings that are deleted from the document text.
    remove: Vec<String>,
    /// `(search, replacement)` pairs that are substituted in the document text.
    replace: Vec<(String, String)>,
}
impl Parser {
    /// Creates a parser that has no removal or replacement rules yet.
    pub fn new() -> Self {
        Self {
            remove: vec![],
            replace: vec![],
        }
    }

    /// Registers `data` as a substring to delete from every document before parsing.
    pub fn add_string_to_remove(&mut self, data: &str) {
        self.remove.push(data.to_owned());
    }

    /// Registers a substitution rule: each occurrence of `search` becomes `replace`.
    pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
        let rule = (search.to_owned(), replace.to_owned());
        self.replace.push(rule);
    }

    /// Fetches the document at `url`, cleans it up and feeds it to `builder`.
    ///
    /// Every U+00A0 character in the fetched text is turned into a plain space
    /// before the registered rules run. The returned value is whatever
    /// `Risdok::from_str` yields for the cleaned text.
    ///
    /// # Errors
    ///
    /// Propagates any error from fetching the document or from `Risdok::from_str`.
    pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        info!("Parsing {url}");
        let raw = fetch(url)?;
        let normalized = raw.replace("\u{a0}", " ");
        self.parse_from_str(&normalized, builder)
    }

    /// Applies all removal rules, then all replacement rules (each in insertion
    /// order), and hands the resulting text to `Risdok::from_str`.
    fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        let stripped = self
            .remove
            .iter()
            .fold(String::from(xml), |text, needle| text.replace(needle, ""));
        let rewritten = self
            .replace
            .iter()
            .fold(stripped, |text, (search, replacement)| {
                text.replace(search, replacement)
            });
        Risdok::from_str(&rewritten, builder)
    }
}
/// Downloads the body of `url` as text (non-test builds only).
///
/// # Errors
///
/// Propagates a failed request or a failure to read the response body as a string.
#[cfg(not(test))]
fn fetch(url: &str) -> Result<String, Error> {
    let response = ureq::get(url).call()?;
    let body = response.into_string()?;
    Ok(body)
}
/// Test-build variant of `fetch` that serves `url` from an on-disk cache.
///
/// The cache file is `./data/cache/par-<hash>`, where `<hash>` is the lowercase
/// hexadecimal `DefaultHasher` digest of `url`. If that file cannot be read, the
/// document is downloaded and stored under that name for later runs.
///
/// # Errors
///
/// Propagates a failed download as well as a failure to create the cache
/// directory or to write the cache file (the latter used to panic).
#[cfg(test)]
fn fetch(url: &str) -> Result<String, Error> {
    use std::{
        collections::hash_map::DefaultHasher,
        fs,
        hash::{Hash, Hasher},
    };

    const CACHE_DIR: &str = "./data/cache";

    // Keep the hashing scheme untouched so previously cached files keep matching.
    let mut hasher = DefaultHasher::new();
    url.hash(&mut hasher);
    let hash = format!("{:x}", hasher.finish());
    let expected_filename = format!("{CACHE_DIR}/par-{hash}");

    // Any read failure is treated as a cache miss, exactly as before.
    if let Ok(data) = fs::read_to_string(&expected_filename) {
        return Ok(data);
    }

    info!("Not finding url {url} in the cache, downloading...");
    let data = ureq::get(url).call()?.into_string()?;

    // A fresh checkout may not have the cache directory yet; create it instead of
    // failing on the write below.
    fs::create_dir_all(CACHE_DIR)?;
    fs::write(&expected_filename, &data)?;
    Ok(data)
}
/// Fixture-driven tests: every config in `./data/configs` is run through the
/// parser and the builder's recorded history is compared with the expected file.
#[cfg(test)]
mod tests {
    use std::fs;

    use crate::{config::Config, law::LawBuilder, risparser::paragraph::Parser};
    use pretty_assertions::assert_eq;

    /// Reads a fixture file, panicking with the actual path and I/O error on failure.
    fn read_fixture(path: &str) -> String {
        fs::read_to_string(path).unwrap_or_else(|err| panic!("Could not read file {path}: {err}"))
    }

    /// Parses every URL listed in `./data/expected/overview/<law_id>` and asserts
    /// that `builder.history` equals the lines of `./data/expected/par/<law_id>`.
    ///
    /// Parsing stops early as soon as `Parser::parse` returns `false`.
    fn check_law(law_id: &str, builder: &mut LawBuilder, parser: &Parser) {
        let paragraph_path = format!("./data/expected/overview/{law_id}");
        let expected_path = format!("./data/expected/par/{law_id}");

        let pars = read_fixture(&paragraph_path);
        for par in pars.trim().split('\n') {
            println!("{par}");
            let cont = parser.parse(par, builder).unwrap();
            if !cont {
                break;
            }
        }

        let actual = &builder.history;
        let expected = read_fixture(&expected_path);
        let expected = expected.trim().split('\n').collect::<Vec<&str>>();
        // Name the law in the failure message because one test covers all configs.
        assert_eq!(actual, &expected, "unexpected history for law {}", law_id);
    }

    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");
        for config in configs {
            let path = format!("{}", config.unwrap().path().display());
            let (law_id, mut builder, parser) = Config::load(&path).unwrap();
            check_law(&law_id.to_string(), &mut builder, &parser);
        }
    }
}