// Copyright (C) 2024 Philipp Hofer // // Licensed under the EUPL, Version 1.2 or - as soon they will be approved by // the European Commission - subsequent versions of the EUPL (the "Licence"). // You may not use this work except in compliance with the Licence. // // You should have received a copy of the European Union Public License along // with this program. If not, you may obtain a copy of the Licence at: // // // Unless required by applicable law or agreed to in writing, software // distributed under the Licence is distributed on an "AS IS" basis, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the Licence for the specific language governing permissions and // limitations under the Licence. //! Deals with getting the content of a single paragraph. mod parser; use regex::Regex; use std::{ fs, hash::{DefaultHasher, Hash, Hasher}, path::Path, }; use tracing::{debug, info}; use crate::{ law, misc::{fetch_with_retries, get_cache_dir, Error}, }; use self::parser::Risdok; pub struct Parser { remove: Vec, replace: Vec<(String, String)>, move_para_headers_into_content: bool, pub(crate) par_sign: String, } impl Default for Parser { fn default() -> Self { Self::new(String::from("§")) } } impl Parser { #[must_use] pub fn new(par_sign: String) -> Self { Self { remove: Vec::new(), replace: Vec::new(), move_para_headers_into_content: false, par_sign, } } pub fn move_para_headers_into_content(&mut self) { self.move_para_headers_into_content = true; } pub fn add_string_to_remove(&mut self, data: &str) { self.remove.push(data.into()); } pub fn add_string_to_replace(&mut self, search: &str, replace: &str) { self.replace.push((search.into(), replace.into())); } /// Parses the content available at the specified `url` and processes it using the provided /// `law::Builder`. /// /// This function is responsible for downloading the content from the given `url`, /// pre-processing the text (such as removing unwanted characters and performing specified /// replacements), and then parsing the pre-processed XML content. /// /// This method also handles caching of the downloaded content to avoid repeated downloads of /// the same resource, improving efficiency and reducing network load. /// /// # Parameters /// /// - `url`: The URL from which to fetch the law text. - `builder`: A mutable reference to a /// `law::Builder` instance, which is used to construct the law structure based on the parsed /// content. /// /// # Returns /// /// - `Ok(bool)`: Returns `true` if parsing was successful, `false` otherwise. /// - `Err(Error)`: An error occurred during the fetching or parsing process. /// /// # Errors /// /// Errors can occur due to several reasons: /// - Network issues preventing the download of content from the given `url`. /// - I/O errors related to file system operations, such as problems with reading from or /// writing to the cache. /// - Parsing errors if the downloaded content cannot be properly processed or interpreted as /// expected. /// /// # Example Usage /// /// ``` /// use risp::{Config, law::{Law, Heading, Content, Section, HeadingContent}}; /// use std::path::Path; /// /// let (_, _, _, mut builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap(); /// let result = parser.parse("https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12017691/NOR12017691.xml", &mut builder).unwrap(); /// /// let law: Law = builder.into(); /// assert_eq!( /// law, /// Law { /// name: "ABGB".into(), /// header: vec![Heading { /// name: "Nullter Theil. Einleitung".into(), /// desc: Some("Von den bürgerlichen Gesetzen überhaupt.".into()), /// content: vec![HeadingContent::Paragraph(Section { /// symb: "§ 1.".into(), /// par_header: Some("Begriff des bürgerlichen Rechtes.".into()), /// par_note: None, /// content: Content::Text("Der Inbegriff der Gesetze, wodurch die Privat-Rechte und Pflichten der Einwohner des Staates unter sich bestimmt werden, macht das bürgerliche Recht in demselben aus.".into()) /// })] /// }] /// } /// ); /// ``` pub fn parse(&self, url: &str, builder: &mut law::Builder) -> Result { info!("Parsing {url}"); let xml = fetch(url)?; let xml = xml.replace('\u{a0}', " "); self.parse_from_str(&xml, builder) } fn parse_from_str(&self, xml: &str, builder: &mut law::Builder) -> Result { let mut xml = String::from(xml); for r in &self.remove { xml = xml.replace(r, ""); } for (search, replace) in &self.replace { xml = xml.replace(search, replace); } let xml = if self.move_para_headers_into_content { self.do_move_para_headers_into_content(&xml) } else { xml }; Risdok::from_str(&xml, builder) } fn do_move_para_headers_into_content(&self, xml: &str) -> String { let mut result = String::from(xml); let ueberschrift_regex = Regex::new(&format!( "({}.*?)", self.par_sign )) .unwrap(); let absatz_regex = Regex::new("").unwrap(); // Find all matches for tags and iterate over them in reverse to avoid messing up the indices for cap in ueberschrift_regex.captures_iter(xml) { let ueberschrift_content = &cap[1]; // Check if there's an following the if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) { // Calculate the insertion point for the tag let insert_point = cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len(); // Insert the tag with the ueberschrift content into the result string result.insert_str( insert_point, &format!("{ueberschrift_content}"), ); } // Remove the tag from the result string result.replace_range(cap.get(0).unwrap().range(), ""); } debug!("{result:#?}"); result } } fn fetch(url: &str) -> Result { let mut hasher = DefaultHasher::new(); url.hash(&mut hasher); let hash = format!("{:x}", hasher.finish()); let expected_filename = format!("{}par-{hash}", get_cache_dir()?); if let Ok(data) = fs::read_to_string(&expected_filename) { Ok(data) } else { let data = fetch_with_retries(url)?; let path = Path::new(&expected_filename); if let Some(parent) = path.parent() { // Try to create the directory (and any necessary parent directories) fs::create_dir_all(parent).expect("Unable to create directory"); } fs::write(expected_filename, &data).expect("Unable to write file"); Ok(data) } } #[cfg(test)] mod tests { use std::fs; use crate::config::Config; use pretty_assertions::assert_eq; #[test] fn all_configs_produce_expected_output() { let configs = fs::read_dir("./data/configs").expect("No folder with config files"); for config in configs { let path = format!("{}", config.unwrap().path().display()); println!("Testing {path}"); let (_, law_id, _, mut builder, parser) = Config::load(&path).unwrap(); let paragraph_path = format!("./data/expected/overview/{law_id}"); let expected_path = format!("./data/expected/par/{law_id}"); let pars = fs::read_to_string(paragraph_path).expect("Could not read file {paragraph_path}."); let pars = pars.trim().split('\n').collect::>(); for par in pars { let cont = parser.parse(par, &mut builder).unwrap(); if !cont { break; } } let actual = &builder.history; match fs::read_to_string(&expected_path) { Ok(expected) => { let e = expected.trim().split('\n').collect::>(); assert_eq!(actual, &e); } Err(_) => { let to_write = actual.join("\n"); fs::write(expected_path, to_write).expect("Unable to write file"); } } } } }