//! Deals with getting all paragraphs for a given law text mod parser; use log::info; use crate::{law::LawBuilder, misc::Error}; use self::parser::Risdok; pub struct Parser { remove: Vec<String>, replace: Vec<(String, String)>, } impl Parser { pub fn new() -> Self { Self { remove: Vec::new(), replace: Vec::new(), } } pub fn add_string_to_remove(&mut self, data: &str) { self.remove.push(data.into()); } pub fn add_string_to_replace(&mut self, search: &str, replace: &str) { self.replace.push((search.into(), replace.into())); } /// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`. pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> { info!("Parsing {url}"); let xml = fetch(url)?; let xml = xml.replace("\u{a0}", " "); self.parse_from_str(&xml, builder) } fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> { let mut xml = String::from(xml); for r in &self.remove { xml = xml.replace(r, ""); } for (search, replace) in &self.replace { xml = xml.replace(search, replace); } Risdok::from_str(&xml, builder) } } #[cfg(not(test))] fn fetch(url: &str) -> Result<String, Error> { Ok(ureq::get(url).call()?.into_string()?) } #[cfg(test)] fn fetch(url: &str) -> Result<String, Error> { use std::{ collections::hash_map::DefaultHasher, fs, hash::{Hash, Hasher}, }; let mut hasher = DefaultHasher::new(); url.hash(&mut hasher); let hash = format!("{:x}", hasher.finish()); let expected_filename = format!("./data/cache/par-{hash}"); match fs::read_to_string(&expected_filename) { Ok(data) => Ok(data), Err(_) => { info!("Not finding url {url} in the cache, downloading..."); let data = ureq::get(url).call()?.into_string()?; fs::write(expected_filename, &data).expect("Unable to write file"); Ok(data) } } } #[cfg(test)] mod tests { use std::{fs, sync::Arc}; use crate::{ config::Config, law::{ responsible::{contains, starts_with_number}, Classifier, LawBuilder, }, risparser::paragraph::Parser, }; use pretty_assertions::assert_eq; fn test(law_id: &str, builder: &mut LawBuilder, parser: Parser) { let paragraph_path = format!("./data/expected/overview/{law_id}"); let expected_path = format!("./data/expected/par/{law_id}"); let pars = fs::read_to_string(paragraph_path).expect("Could not read file {paragraph_path}."); let pars = pars.trim().split('\n').collect::<Vec<&str>>(); for par in pars { println!("{par}"); let cont = parser.parse(par, builder).unwrap(); if !cont { break; } } let actual = &builder.history; println!("{actual:?}"); let expected = fs::read_to_string(&expected_path) .expect(&format!("Could not read file {expected_path}.")); let expected = expected.trim().split('\n').collect::<Vec<&str>>(); assert_eq!(actual, &expected); } #[test] fn all_configs_produce_expected_output() { let configs = fs::read_dir("./data/configs").expect("No folder with config files"); for config in configs { let path = format!("{}", config.unwrap().path().display()); let (law_id, mut builder, parser) = Config::load(&path).unwrap(); let paragraph_path = format!("./data/expected/overview/{law_id}"); let expected_path = format!("./data/expected/par/{law_id}"); let pars = fs::read_to_string(paragraph_path).expect("Could not read file {paragraph_path}."); let pars = pars.trim().split('\n').collect::<Vec<&str>>(); for par in pars { println!("{par}"); let cont = parser.parse(par, &mut builder).unwrap(); if !cont { break; } } let actual = &builder.history; let expected = fs::read_to_string(&expected_path) .expect(&format!("Could not read file {expected_path}.")); let expected = expected.trim().split('\n').collect::<Vec<&str>>(); assert_eq!(actual, &expected); } } }