//! Deals with getting all paragraphs for a given law text.
|
|
mod parser;
|
|
|
|
use log::info;
|
|
|
|
use crate::{law::LawBuilder, misc::Error};
|
|
|
|
use self::parser::Risdok;
|
|
|
|
/// Holds the text clean-up rules that are applied to a paragraph document
/// before it is parsed.
///
/// Rules are plain substring operations. They are applied in insertion order:
/// first every removal, then every replacement.
#[derive(Debug, Default)]
pub struct Parser {
    /// Substrings that are deleted from the document text before parsing.
    remove: Vec<String>,
    /// `(search, replacement)` pairs applied to the document text before parsing.
    replace: Vec<(String, String)>,
}
|
|
|
|
impl Parser {
|
|
pub fn new() -> Self {
|
|
Self {
|
|
remove: Vec::new(),
|
|
replace: Vec::new(),
|
|
}
|
|
}
|
|
|
|
pub fn add_string_to_remove(&mut self, data: &str) {
|
|
self.remove.push(data.into());
|
|
}
|
|
|
|
pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
|
|
self.replace.push((search.into(), replace.into()));
|
|
}
|
|
|
|
/// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`.
|
|
pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
|
|
info!("Parsing {url}");
|
|
let xml = fetch(url)?;
|
|
|
|
let xml = xml.replace("\u{a0}", " ");
|
|
|
|
self.parse_from_str(&xml, builder)
|
|
}
|
|
|
|
fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
|
|
let mut xml = String::from(xml);
|
|
for r in &self.remove {
|
|
xml = xml.replace(r, "");
|
|
}
|
|
for (search, replace) in &self.replace {
|
|
xml = xml.replace(search, replace);
|
|
}
|
|
|
|
Risdok::from_str(&xml, builder)
|
|
}
|
|
}
|
|
|
|
#[cfg(not(test))]
|
|
fn fetch(url: &str) -> Result<String, Error> {
|
|
Ok(ureq::get(url).call()?.into_string()?)
|
|
}
|
|
|
|
/// Test-build variant of `fetch` that serves `url` from an on-disk cache.
///
/// The cache entry lives at `./data/cache/par-<hash>`, where `<hash>` is the
/// hexadecimal `DefaultHasher` digest of `url`. If the entry cannot be read,
/// the document is downloaded via `ureq`, written to the cache and returned.
///
/// NOTE(review): the standard library does not guarantee that `DefaultHasher`
/// produces the same hashes across Rust releases, so cache file names may
/// change after a toolchain update.
///
/// # Errors
///
/// Propagates download failures.
///
/// # Panics
///
/// Panics if the cache directory cannot be created or the cache file cannot
/// be written.
#[cfg(test)]
fn fetch(url: &str) -> Result<String, Error> {
    use std::{
        collections::hash_map::DefaultHasher,
        fs,
        hash::{Hash, Hasher},
    };

    let cache_dir = "./data/cache";

    let mut hasher = DefaultHasher::new();
    url.hash(&mut hasher);
    let hash = format!("{:x}", hasher.finish());

    let expected_filename = format!("{cache_dir}/par-{hash}");

    // Any read failure (missing file, unreadable file, ...) counts as a cache miss.
    if let Ok(data) = fs::read_to_string(&expected_filename) {
        return Ok(data);
    }

    info!("Not finding url {url} in the cache, downloading...");
    let data = ureq::get(url).call()?.into_string()?;

    // The cache directory may not exist yet (e.g. on a fresh checkout); without
    // this the write below would panic even though the download succeeded.
    fs::create_dir_all(cache_dir).expect("Unable to create cache directory");
    fs::write(&expected_filename, &data).expect("Unable to write file");

    Ok(data)
}
|
|
|
|
/// Regression tests comparing the parser output against the expected files
/// stored under `./data/expected`.
#[cfg(test)]
mod tests {

    // NOTE(review): `Arc`, `contains`, `starts_with_number` and `Classifier`
    // are not referenced by the code in this module; the imports are left
    // untouched on purpose.
    use std::{fs, sync::Arc};

    use crate::{
        config::Config,
        law::{
            responsible::{contains, starts_with_number},
            Classifier, LawBuilder,
        },
        risparser::paragraph::Parser,
    };
    use pretty_assertions::assert_eq;

    /// Reads the fixture at `path`, panicking with the actual path and the I/O
    /// error when it cannot be read.
    fn read_fixture(path: &str) -> String {
        // The message is built lazily and really interpolates the path;
        // `expect("... {path}")` would print the braces verbatim.
        fs::read_to_string(path)
            .unwrap_or_else(|err| panic!("Could not read file {path}: {err}"))
    }

    /// Parses every paragraph URL listed in `./data/expected/overview/<law_id>`
    /// with `parser` and asserts that `builder.history` equals the lines of
    /// `./data/expected/par/<law_id>`.
    ///
    /// Parsing stops early as soon as `Parser::parse` returns `false`.
    fn test(law_id: impl std::fmt::Display, builder: &mut LawBuilder, parser: Parser) {
        let paragraph_path = format!("./data/expected/overview/{law_id}");
        let expected_path = format!("./data/expected/par/{law_id}");

        // One paragraph URL per line.
        let pars = read_fixture(&paragraph_path);
        for par in pars.trim().split('\n') {
            println!("{par}");
            let cont = parser.parse(par, builder).unwrap();
            if !cont {
                break;
            }
        }

        let actual = &builder.history;

        let expected = read_fixture(&expected_path);
        let expected = expected.trim().split('\n').collect::<Vec<&str>>();

        assert_eq!(actual, &expected);
    }

    /// Runs the comparison for every configuration file in `./data/configs`.
    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            let path = config.unwrap().path().display().to_string();

            let (law_id, mut builder, parser) = Config::load(&path).unwrap();

            test(&law_id, &mut builder, parser);
        }
    }
}
|