risp/src/paragraph/mod.rs
philipp 13c734019d
Some checks failed
CI/CD Pipeline / test (push) Failing after 1m43s
use retry mechanism for fetching data
2024-02-18 17:00:15 +01:00

198 lines
6.4 KiB
Rust

// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program. If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.
//! Deals with getting all paragraphs for a given law text
mod parser;
use regex::Regex;
use std::{
fs,
hash::{DefaultHasher, Hash, Hasher},
path::Path,
};
use log::info;
use crate::{
law::LawBuilder,
misc::{fetch_with_retries, get_cache_dir, Error},
};
use self::parser::Risdok;
/// Configurable parser for RIS law-text XML documents.
///
/// Configure with [`Parser::add_string_to_remove`],
/// [`Parser::add_string_to_replace`] and
/// [`Parser::move_para_headers_into_content`], then call [`Parser::parse`].
pub struct Parser {
    // Literal strings stripped from the fetched XML before parsing.
    remove: Vec<String>,
    // (search, replace) pairs applied to the fetched XML before parsing.
    replace: Vec<(String, String)>,
    // When true, `§` headers in <ueberschrift> tags are folded into the
    // following <absatz> content before parsing.
    move_para_headers_into_content: bool,
}
impl Default for Parser {
fn default() -> Self {
Self::new()
}
}
impl Parser {
    /// Creates a parser with no removals, no replacements and header
    /// folding disabled.
    pub fn new() -> Self {
        Self {
            remove: Vec::new(),
            replace: Vec::new(),
            move_para_headers_into_content: false,
        }
    }

    /// Enables folding of `§` headers (`<ueberschrift>`) into the content of
    /// the following `<absatz>` element during parsing.
    pub fn move_para_headers_into_content(&mut self) {
        self.move_para_headers_into_content = true;
    }

    /// Registers a literal string to be stripped from the XML before parsing.
    pub fn add_string_to_remove(&mut self, data: &str) {
        self.remove.push(data.into());
    }

    /// Registers a literal (search, replace) substitution applied to the XML
    /// before parsing.
    pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
        self.replace.push((search.into(), replace.into()));
    }

    /// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`.
    ///
    /// The boolean flag is propagated from [`Risdok::from_str`]; callers use
    /// it to decide whether to continue with further documents.
    ///
    /// # Errors
    /// Returns an error if fetching the document or parsing it fails.
    pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        info!("Parsing {url}");
        let xml = fetch(url)?;
        // Normalize non-breaking spaces (U+00A0) to regular spaces.
        let xml = xml.replace('\u{a0}', " ");
        self.parse_from_str(&xml, builder)
    }

    /// Applies the configured removals, replacements and (optionally) header
    /// folding to `xml`, then hands the result to the XML parser.
    fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        let mut xml = String::from(xml);
        for r in &self.remove {
            xml = xml.replace(r, "");
        }
        for (search, replace) in &self.replace {
            xml = xml.replace(search, replace);
        }
        let xml = if self.move_para_headers_into_content {
            Self::do_move_para_headers_into_content(xml)
        } else {
            xml
        };
        Risdok::from_str(&xml, builder)
    }

    /// Moves each `§` header out of its `<ueberschrift>` element and into the
    /// following `<absatz>` element as a leading `<gldsym>` tag, removing the
    /// original `<ueberschrift>` element.
    fn do_move_para_headers_into_content(xml: String) -> String {
        let mut result = String::from(&xml);
        let ueberschrift_regex = Regex::new(
            "<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">(§.*?)</ueberschrift>",
        )
        .unwrap();
        let absatz_regex =
            Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
        // Collect all <ueberschrift> matches up front and process them in
        // REVERSE order: every edit happens at or after the position of the
        // match being processed, so the byte offsets of matches that are still
        // pending (which lie earlier in the string) remain valid. Iterating
        // forward — as the previous version did — applied offsets captured
        // against the unmodified `xml` to the already-mutated `result`,
        // corrupting every edit after the first.
        let captures: Vec<_> = ueberschrift_regex.captures_iter(&xml).collect();
        for cap in captures.iter().rev() {
            let whole = cap.get(0).unwrap();
            let ueberschrift_content = &cap[1];
            // Look for the next <absatz> opening tag after this header.
            if let Some(absatz_match) = absatz_regex.find(&result[whole.end()..]) {
                // Insertion point: immediately after the <absatz> opening tag.
                let insert_point = whole.end() + absatz_match.end();
                // Carry the header text into the paragraph as a <gldsym> tag.
                result.insert_str(
                    insert_point,
                    &format!("<gldsym>{}</gldsym>", ueberschrift_content),
                );
            }
            // Remove the original <ueberschrift> element. Safe to do after the
            // insertion above because the insertion point lies past this range.
            result.replace_range(whole.range(), "");
        }
        result
    }
}
/// Returns the document at `url`, serving it from the on-disk cache when
/// possible and downloading (then caching) it otherwise.
///
/// The cache file name is derived from a hash of the URL, prefixed with
/// `par-`, inside the cache directory.
///
/// # Errors
/// Fails if the cache directory cannot be determined or the download fails.
fn fetch(url: &str) -> Result<String, Error> {
    // Derive a stable cache key from the URL.
    let mut hasher = DefaultHasher::new();
    url.hash(&mut hasher);
    let cache_file = format!("{}par-{:x}", get_cache_dir()?, hasher.finish());

    // Cache hit: return the stored copy without touching the network.
    if let Ok(cached) = fs::read_to_string(&cache_file) {
        return Ok(cached);
    }

    info!("Not finding url {url} in the cache, downloading...");
    let data = fetch_with_retries(url)?;

    // Make sure the cache directory exists before writing.
    if let Some(parent) = Path::new(&cache_file).parent() {
        // Try to create the directory (and any necessary parent directories)
        fs::create_dir_all(parent).expect("Unable to create directory");
    }
    fs::write(&cache_file, &data).expect("Unable to write file");
    Ok(data)
}
#[cfg(test)]
mod tests {
    use std::fs;
    use crate::config::Config;
    use pretty_assertions::assert_eq;

    /// Golden-file test: parses every config in `./data/configs` and compares
    /// the resulting builder history against the snapshot in
    /// `./data/expected/par/<law_id>`. When the snapshot is missing, the
    /// current output is written as the new baseline instead of failing.
    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");
        for config in configs {
            let path = format!("{}", config.unwrap().path().display());
            let (law_id, mut builder, parser) = Config::load(&path).unwrap();
            let paragraph_path = format!("./data/expected/overview/{law_id}");
            let expected_path = format!("./data/expected/par/{law_id}");
            // NOTE: `expect` takes a plain &str and performs no interpolation,
            // so the previous message printed the literal "{paragraph_path}";
            // panic! interpolates the actual path.
            let pars = fs::read_to_string(&paragraph_path)
                .unwrap_or_else(|_| panic!("Could not read file {paragraph_path}."));
            let pars = pars.trim().split('\n').collect::<Vec<&str>>();
            for par in pars {
                // Stop once the parser reports that no further documents
                // should be processed.
                let cont = parser.parse(par, &mut builder).unwrap();
                if !cont {
                    break;
                }
            }
            let actual = &builder.history;
            match fs::read_to_string(&expected_path) {
                Ok(expected) => {
                    let e = expected.trim().split('\n').collect::<Vec<&str>>();
                    assert_eq!(actual, &e);
                }
                Err(_) => {
                    // No snapshot yet: bootstrap it from the current output.
                    let to_write = actual.join("\n");
                    fs::write(expected_path, to_write).expect("Unable to write file");
                }
            }
        }
    }
}