// Copyright (C) 2024 Philipp Hofer // // Licensed under the EUPL, Version 1.2 or - as soon they will be approved by // the European Commission - subsequent versions of the EUPL (the "Licence"). // You may not use this work except in compliance with the Licence. // // You should have received a copy of the European Union Public License along // with this program. If not, you may obtain a copy of the Licence at: // // // Unless required by applicable law or agreed to in writing, software // distributed under the Licence is distributed on an "AS IS" basis, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the Licence for the specific language governing permissions and // limitations under the Licence. //! Deals with getting all paragraphs for a given law text mod parser; use regex::Regex; use std::{ fs, hash::{DefaultHasher, Hash, Hasher}, path::Path, }; use log::info; use crate::{ law::LawBuilder, misc::{fetch_with_retries, get_cache_dir, Error}, }; use self::parser::Risdok; pub struct Parser { remove: Vec, replace: Vec<(String, String)>, move_para_headers_into_content: bool, } impl Default for Parser { fn default() -> Self { Self::new() } } impl Parser { pub fn new() -> Self { Self { remove: Vec::new(), replace: Vec::new(), move_para_headers_into_content: false, } } pub fn move_para_headers_into_content(&mut self) { self.move_para_headers_into_content = true; } pub fn add_string_to_remove(&mut self, data: &str) { self.remove.push(data.into()); } pub fn add_string_to_replace(&mut self, search: &str, replace: &str) { self.replace.push((search.into(), replace.into())); } /// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`. pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result { info!("Parsing {url}"); let xml = fetch(url)?; let xml = xml.replace('\u{a0}', " "); self.parse_from_str(&xml, builder) } fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result { let mut xml = String::from(xml); for r in &self.remove { xml = xml.replace(r, ""); } for (search, replace) in &self.replace { xml = xml.replace(search, replace); } let xml = if self.move_para_headers_into_content { Self::do_move_para_headers_into_content(xml) } else { xml }; Risdok::from_str(&xml, builder) } fn do_move_para_headers_into_content(xml: String) -> String { let mut result = String::from(&xml); let ueberschrift_regex = Regex::new( "(ยง.*?)", ) .unwrap(); let absatz_regex = Regex::new("").unwrap(); // Find all matches for tags and iterate over them in reverse to avoid messing up the indices for cap in ueberschrift_regex.captures_iter(&xml) { let ueberschrift_content = &cap[1]; // Check if there's an following the if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) { // Calculate the insertion point for the tag let insert_point = cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len(); // Insert the tag with the ueberschrift content into the result string result.insert_str( insert_point, &format!("{}", ueberschrift_content), ); } // Remove the tag from the result string result.replace_range(cap.get(0).unwrap().range(), ""); } result } } fn fetch(url: &str) -> Result { let mut hasher = DefaultHasher::new(); url.hash(&mut hasher); let hash = format!("{:x}", hasher.finish()); let expected_filename = format!("{}par-{hash}", get_cache_dir()?); match fs::read_to_string(&expected_filename) { Ok(data) => Ok(data), Err(_) => { info!("Not finding url {url} in the cache, downloading..."); let data = fetch_with_retries(url)?; let path = Path::new(&expected_filename); if let Some(parent) = path.parent() { // Try to create the directory (and any necessary parent directories) fs::create_dir_all(parent).expect("Unable to create directory"); } fs::write(expected_filename, &data).expect("Unable to write file"); Ok(data) } } } #[cfg(test)] mod tests { use std::fs; use crate::config::Config; use pretty_assertions::assert_eq; #[test] fn all_configs_produce_expected_output() { let configs = fs::read_dir("./data/configs").expect("No folder with config files"); for config in configs { let path = format!("{}", config.unwrap().path().display()); let (law_id, mut builder, parser) = Config::load(&path).unwrap(); let paragraph_path = format!("./data/expected/overview/{law_id}"); let expected_path = format!("./data/expected/par/{law_id}"); let pars = fs::read_to_string(paragraph_path).expect("Could not read file {paragraph_path}."); let pars = pars.trim().split('\n').collect::>(); for par in pars { let cont = parser.parse(par, &mut builder).unwrap(); if !cont { break; } } let actual = &builder.history; match fs::read_to_string(&expected_path) { Ok(expected) => { let e = expected.trim().split('\n').collect::>(); assert_eq!(actual, &e); } Err(_) => { let to_write = actual.join("\n"); fs::write(expected_path, to_write).expect("Unable to write file"); } } } } }