257 lines
9.0 KiB
Rust
257 lines
9.0 KiB
Rust
// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program. If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.
|
|
|
|
//! Deals with getting the content of a single paragraph.
|
|
|
|
mod parser;
|
|
|
|
use regex::Regex;
|
|
use std::{
|
|
fs,
|
|
hash::{DefaultHasher, Hash, Hasher},
|
|
path::Path,
|
|
};
|
|
use tracing::{debug, info};
|
|
|
|
use crate::{
|
|
law,
|
|
misc::{fetch_with_retries, get_cache_dir, Error},
|
|
};
|
|
|
|
use self::parser::Risdok;
|
|
|
|
/// Fetches, pre-processes and parses the XML of a single paragraph.
///
/// The pre-processing steps are configured through the `add_string_to_*` and
/// `move_para_headers_into_content` methods before calling `parse`.
#[derive(Debug)]
pub struct Parser {
    /// Substrings that are deleted from the XML before it is parsed.
    remove: Vec<String>,
    /// `(search, replacement)` pairs applied to the XML after the removals.
    replace: Vec<(String, String)>,
    /// When `true`, `<ueberschrift>` elements whose text starts with `par_sign` are turned into
    /// a `<gldsym>` element inside the following `<absatz>` before parsing.
    move_para_headers_into_content: bool,
    /// Sign that introduces a paragraph; `§` when the parser is built via `Default`.
    pub(crate) par_sign: String,
}
|
|
|
|
impl Default for Parser {
|
|
fn default() -> Self {
|
|
Self::new(String::from("§"))
|
|
}
|
|
}
|
|
|
|
impl Parser {
|
|
#[must_use]
|
|
pub fn new(par_sign: String) -> Self {
|
|
Self {
|
|
remove: Vec::new(),
|
|
replace: Vec::new(),
|
|
move_para_headers_into_content: false,
|
|
par_sign,
|
|
}
|
|
}
|
|
|
|
pub fn move_para_headers_into_content(&mut self) {
|
|
self.move_para_headers_into_content = true;
|
|
}
|
|
|
|
pub fn add_string_to_remove(&mut self, data: &str) {
|
|
self.remove.push(data.into());
|
|
}
|
|
|
|
pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
|
|
self.replace.push((search.into(), replace.into()));
|
|
}
|
|
|
|
/// Parses the content available at the specified `url` and processes it using the provided
|
|
/// `law::Builder`.
|
|
///
|
|
/// This function is responsible for downloading the content from the given `url`,
|
|
/// pre-processing the text (such as removing unwanted characters and performing specified
|
|
/// replacements), and then parsing the pre-processed XML content.
|
|
///
|
|
/// This method also handles caching of the downloaded content to avoid repeated downloads of
|
|
/// the same resource, improving efficiency and reducing network load.
|
|
///
|
|
/// # Parameters
|
|
///
|
|
/// - `url`: The URL from which to fetch the law text. - `builder`: A mutable reference to a
|
|
/// `law::Builder` instance, which is used to construct the law structure based on the parsed
|
|
/// content.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// - `Ok(bool)`: Returns `true` if parsing was successful, `false` otherwise.
|
|
/// - `Err(Error)`: An error occurred during the fetching or parsing process.
|
|
///
|
|
/// # Errors
|
|
///
|
|
/// Errors can occur due to several reasons:
|
|
/// - Network issues preventing the download of content from the given `url`.
|
|
/// - I/O errors related to file system operations, such as problems with reading from or
|
|
/// writing to the cache.
|
|
/// - Parsing errors if the downloaded content cannot be properly processed or interpreted as
|
|
/// expected.
|
|
///
|
|
/// # Example Usage
|
|
///
|
|
/// ```
|
|
/// use risp::{Config, law::{Law, Heading, Content, Section, HeadingContent}};
|
|
/// use std::path::Path;
|
|
///
|
|
/// let (_, _, _, mut builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap();
|
|
/// let result = parser.parse("https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12017691/NOR12017691.xml", &mut builder).unwrap();
|
|
///
|
|
/// let law: Law = builder.into();
|
|
/// assert_eq!(
|
|
/// law,
|
|
/// Law {
|
|
/// name: "ABGB".into(),
|
|
/// header: vec![Heading {
|
|
/// name: "Nullter Theil. Einleitung".into(),
|
|
/// desc: Some("Von den bürgerlichen Gesetzen überhaupt.".into()),
|
|
/// content: vec![HeadingContent::Paragraph(Section {
|
|
/// symb: "§ 1.".into(),
|
|
/// par_header: Some("Begriff des bürgerlichen Rechtes.".into()),
|
|
/// par_note: None,
|
|
/// content: Content::Text("Der Inbegriff der Gesetze, wodurch die Privat-Rechte und Pflichten der Einwohner des Staates unter sich bestimmt werden, macht das bürgerliche Recht in demselben aus.".into())
|
|
/// })]
|
|
/// }]
|
|
/// }
|
|
/// );
|
|
/// ```
|
|
pub fn parse(&self, url: &str, builder: &mut law::Builder) -> Result<bool, Error> {
|
|
info!("Parsing {url}");
|
|
let xml = fetch(url)?;
|
|
|
|
let xml = xml.replace('\u{a0}', " ");
|
|
|
|
self.parse_from_str(&xml, builder)
|
|
}
|
|
|
|
fn parse_from_str(&self, xml: &str, builder: &mut law::Builder) -> Result<bool, Error> {
|
|
let mut xml = String::from(xml);
|
|
for r in &self.remove {
|
|
xml = xml.replace(r, "");
|
|
}
|
|
for (search, replace) in &self.replace {
|
|
xml = xml.replace(search, replace);
|
|
}
|
|
|
|
let xml = if self.move_para_headers_into_content {
|
|
self.do_move_para_headers_into_content(&xml)
|
|
} else {
|
|
xml
|
|
};
|
|
|
|
Risdok::from_str(&xml, builder)
|
|
}
|
|
|
|
fn do_move_para_headers_into_content(&self, xml: &str) -> String {
|
|
let mut result = String::from(xml);
|
|
let ueberschrift_regex = Regex::new(&format!(
|
|
"<ueberschrift typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">({}.*?)</ueberschrift>",
|
|
self.par_sign
|
|
))
|
|
.unwrap();
|
|
let absatz_regex =
|
|
Regex::new("<absatz typ=\"[^\"]*\" ct=\"[^\"]*\" halign=\"[^\"]*\">").unwrap();
|
|
|
|
// Find all matches for <ueberschrift> tags and iterate over them in reverse to avoid messing up the indices
|
|
for cap in ueberschrift_regex.captures_iter(xml) {
|
|
let ueberschrift_content = &cap[1];
|
|
|
|
// Check if there's an <absatz> following the <ueberschrift>
|
|
if let Some(absatz_match) = absatz_regex.find(&result[cap.get(0).unwrap().end()..]) {
|
|
// Calculate the insertion point for the <gldsym> tag
|
|
let insert_point =
|
|
cap.get(0).unwrap().end() + absatz_match.start() + absatz_match.as_str().len();
|
|
// Insert the <gldsym> tag with the ueberschrift content into the result string
|
|
result.insert_str(
|
|
insert_point,
|
|
&format!("<gldsym>{ueberschrift_content}</gldsym>"),
|
|
);
|
|
}
|
|
|
|
// Remove the <ueberschrift> tag from the result string
|
|
result.replace_range(cap.get(0).unwrap().range(), "");
|
|
}
|
|
|
|
debug!("{result:#?}");
|
|
result
|
|
}
|
|
}
|
|
|
|
fn fetch(url: &str) -> Result<String, Error> {
|
|
let mut hasher = DefaultHasher::new();
|
|
url.hash(&mut hasher);
|
|
let hash = format!("{:x}", hasher.finish());
|
|
|
|
let expected_filename = format!("{}par-{hash}", get_cache_dir()?);
|
|
if let Ok(data) = fs::read_to_string(&expected_filename) {
|
|
Ok(data)
|
|
} else {
|
|
let data = fetch_with_retries(url)?;
|
|
let path = Path::new(&expected_filename);
|
|
if let Some(parent) = path.parent() {
|
|
// Try to create the directory (and any necessary parent directories)
|
|
fs::create_dir_all(parent).expect("Unable to create directory");
|
|
}
|
|
fs::write(expected_filename, &data).expect("Unable to write file");
|
|
Ok(data)
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use std::fs;

    use crate::config::Config;
    use pretty_assertions::assert_eq;

    /// Runs every config in `./data/configs` against its list of paragraph URLs and compares the
    /// builder history with the stored expectation in `./data/expected/par/<law_id>`.
    ///
    /// If no expectation file exists yet, the current output is written as the new expectation
    /// instead of failing.
    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            let path = config.unwrap().path().display().to_string();
            println!("Testing {path}");

            let (_, law_id, _, mut builder, parser) = Config::load(&path).unwrap();

            let paragraph_path = format!("./data/expected/overview/{law_id}");
            let expected_path = format!("./data/expected/par/{law_id}");

            // `expect` takes a plain string and does not interpolate, so build the message with
            // `panic!` to actually report the offending path and the I/O error.
            let pars = fs::read_to_string(&paragraph_path)
                .unwrap_or_else(|e| panic!("Could not read file {paragraph_path}: {e}"));

            for par in pars.trim().split('\n') {
                let cont = parser.parse(par, &mut builder).unwrap();
                if !cont {
                    break;
                }
            }

            let actual = &builder.history;

            match fs::read_to_string(&expected_path) {
                Ok(expected) => {
                    let e = expected.trim().split('\n').collect::<Vec<&str>>();
                    assert_eq!(actual, &e);
                }
                Err(_) => {
                    // No stored expectation yet: record the current output as the baseline.
                    let to_write = actual.join("\n");
                    fs::write(expected_path, to_write).expect("Unable to write file");
                }
            }
        }
    }
}
|