2024-02-15 13:50:58 +01:00
|
|
|
// Copyright (C) 2024 Philipp Hofer
|
|
|
|
//
|
2024-02-15 16:12:14 +01:00
|
|
|
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
|
2024-02-15 13:50:58 +01:00
|
|
|
// the European Commission - subsequent versions of the EUPL (the "Licence").
|
|
|
|
// You may not use this work except in compliance with the Licence.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the European Union Public License along
|
|
|
|
// with this program. If not, you may obtain a copy of the Licence at:
|
|
|
|
// <https://joinup.ec.europa.eu/software/page/eupl>
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the Licence is distributed on an "AS IS" basis,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the Licence for the specific language governing permissions and
|
|
|
|
// limitations under the Licence.
|
|
|
|
|
2024-02-06 14:18:03 +01:00
|
|
|
#[allow(clippy::wildcard_imports)] // I use *-operator on purpose: I want to receive a compiler
|
|
|
|
// warning if I've not updated my `create_classifier` function
|
2024-02-06 10:17:14 +01:00
|
|
|
use crate::law::{self, responsible::*};
|
|
|
|
use crate::misc::Error;
|
2024-02-06 11:45:44 +01:00
|
|
|
use crate::paragraph::Parser;
|
2024-08-22 10:46:59 +02:00
|
|
|
use crate::{default_type, Typ};
|
2024-05-26 12:20:19 +02:00
|
|
|
use crate::{law::ClassifierApplicable, misc};
|
2024-02-06 14:18:03 +01:00
|
|
|
use serde::Deserialize;
|
|
|
|
use std::fs;
|
|
|
|
use std::path::Path;
|
|
|
|
use std::sync::Arc;
|
2024-05-26 12:20:19 +02:00
|
|
|
use tracing::{event, info, instrument, Level};
|
2024-02-06 10:17:14 +01:00
|
|
|
|
|
|
|
// TODO: more generic
|
|
|
|
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
|
|
|
|
let func: ClassifierApplicable = match match_function {
|
|
|
|
"contains" => Arc::new(contains),
|
2024-02-19 16:33:50 +01:00
|
|
|
"contains_case_sensitive" => Arc::new(contains_case_sensitive),
|
2024-02-06 10:17:14 +01:00
|
|
|
"starts_with_roman_number" => Arc::new(starts_with_roman_number),
|
|
|
|
"contains_at_start" => Arc::new(contains_at_start),
|
|
|
|
"starts_with_number" => Arc::new(starts_with_number),
|
|
|
|
"starts_with_letter" => Arc::new(starts_with_letter),
|
2024-02-06 13:10:59 +01:00
|
|
|
"starts_with_dash" => Arc::new(starts_with_dash),
|
|
|
|
"starts_with_uppercaseletter" => Arc::new(starts_with_uppercaseletter),
|
2024-02-06 10:17:14 +01:00
|
|
|
"contains_without_unter" => Arc::new(contains_without_unter),
|
|
|
|
_ => {
|
|
|
|
return Err(Error::new(&format!(
|
2024-02-06 14:18:03 +01:00
|
|
|
"Unknown match function: {match_function}"
|
2024-02-06 10:17:14 +01:00
|
|
|
)))
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
Ok(func)
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
pub struct Config {
|
|
|
|
law: Law,
|
2024-02-06 11:18:40 +01:00
|
|
|
#[serde(default)]
|
2024-02-06 10:17:14 +01:00
|
|
|
parser: ParserConfig,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Config {
|
2024-02-27 16:03:24 +01:00
|
|
|
#[instrument(level = "trace", skip(path))]
|
2024-02-27 08:30:42 +01:00
|
|
|
/// Loads a configuration from a specified path and constructs a `LawBuilder` and `Parser` based on it.
|
|
|
|
///
|
|
|
|
/// This function reads a configuration file from the given path, expecting it to be in TOML format. It then
|
|
|
|
/// parses the configuration to set up a `LawBuilder` with specified classifiers and a `Parser` with specific
|
|
|
|
/// string manipulation rules (like removing or replacing strings). Additionally, it processes parser settings
|
|
|
|
/// for moving paragraph headers into content if specified.
|
|
|
|
///
|
|
|
|
/// # Parameters
|
|
|
|
///
|
|
|
|
/// - `path`: A path to the configuration file. This can be any type that implements the `AsRef<Path>` trait,
|
|
|
|
/// allowing for flexible path specifications (e.g., `&str`, `String`, `Path`, or `PathBuf`).
|
|
|
|
///
|
|
|
|
/// # Returns
|
|
|
|
///
|
|
|
|
/// Returns a `Result` containing a tuple of the law ID (`usize`), the constructed `LawBuilder`, and the `Parser`
|
|
|
|
/// upon successful operation. If any error occurs during the process (e.g., file reading, TOML parsing, classifier
|
|
|
|
/// creation), it returns an `Error`.
|
|
|
|
///
|
|
|
|
/// # Errors
|
|
|
|
///
|
|
|
|
/// This function can return an `Error` in several cases:
|
|
|
|
///
|
|
|
|
/// - If the specified path does not exist or cannot be read.
|
|
|
|
/// - If the configuration file content is not valid TOML or does not conform to the expected structure.
|
|
|
|
/// - If there's an issue creating any of the classifiers specified in the configuration (e.g., if the `match_function`
|
|
|
|
/// for a classifier fails).
|
|
|
|
///
|
|
|
|
/// # Examples
|
|
|
|
///
|
|
|
|
/// ```
|
2024-02-27 10:33:07 +01:00
|
|
|
/// use risp::Config;
|
2024-02-27 08:30:42 +01:00
|
|
|
/// use std::path::Path;
|
|
|
|
///
|
2024-08-22 10:46:59 +02:00
|
|
|
/// let (_, law_id, builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap();
|
2024-02-27 08:30:42 +01:00
|
|
|
///
|
|
|
|
/// assert_eq!(law_id, 10001622);
|
|
|
|
/// ```
|
2024-02-27 16:03:24 +01:00
|
|
|
pub fn load<P: AsRef<Path> + std::fmt::Debug>(
|
|
|
|
path: P,
|
2024-08-22 10:46:59 +02:00
|
|
|
) -> Result<(Typ, usize, law::Builder, Parser), Error> {
|
2024-05-26 12:20:19 +02:00
|
|
|
info!("Using cache dir: {}", misc::get_cache_dir().unwrap());
|
|
|
|
|
2024-02-06 10:17:14 +01:00
|
|
|
let config_str = fs::read_to_string(path)?;
|
|
|
|
let config: Config = toml::from_str(&config_str)?;
|
|
|
|
|
2024-05-26 13:01:23 +02:00
|
|
|
let mut builder = law::Builder::new(config.law.name, config.law.par_sign);
|
2024-03-21 10:08:03 +01:00
|
|
|
if let Some(classifiers) = config.law.classifiers {
|
|
|
|
for classifier in &classifiers {
|
|
|
|
let to_add = law::Classifier::new(
|
|
|
|
&classifier.name,
|
|
|
|
create_classifier(&classifier.match_function)?,
|
|
|
|
);
|
|
|
|
if classifier.is_root {
|
|
|
|
builder.add_classifier(to_add.root());
|
|
|
|
} else {
|
|
|
|
builder.add_classifier(to_add);
|
|
|
|
}
|
2024-02-06 10:17:14 +01:00
|
|
|
}
|
2024-03-21 10:08:03 +01:00
|
|
|
event!(
|
|
|
|
Level::INFO,
|
|
|
|
"Added {} classifiers from config",
|
|
|
|
&classifiers.len()
|
|
|
|
);
|
|
|
|
} else {
|
|
|
|
builder.no_headers();
|
|
|
|
event!(Level::INFO, "Assuming law text does not contain headers");
|
2024-02-06 10:17:14 +01:00
|
|
|
}
|
|
|
|
|
2024-05-26 13:01:23 +02:00
|
|
|
let mut parser = Parser::new(builder.par_sign.clone());
|
2024-02-06 10:17:14 +01:00
|
|
|
|
2024-02-27 16:03:24 +01:00
|
|
|
for to_remove in &config.parser.remove_strings {
|
|
|
|
parser.add_string_to_remove(to_remove);
|
2024-02-06 10:17:14 +01:00
|
|
|
}
|
2024-02-27 16:03:24 +01:00
|
|
|
event!(
|
|
|
|
Level::INFO,
|
|
|
|
"Added {} strings to remove",
|
|
|
|
&config.parser.remove_strings.len()
|
|
|
|
);
|
2024-02-06 10:17:14 +01:00
|
|
|
|
2024-02-27 16:03:24 +01:00
|
|
|
for to_replace in &config.parser.replace_rules {
|
2024-02-06 10:17:14 +01:00
|
|
|
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
|
|
|
}
|
2024-02-27 16:03:24 +01:00
|
|
|
event!(
|
|
|
|
Level::INFO,
|
|
|
|
"Added {} strings to replace",
|
|
|
|
&config.parser.replace_rules.len()
|
|
|
|
);
|
2024-02-06 10:17:14 +01:00
|
|
|
|
2024-02-15 15:07:36 +01:00
|
|
|
if config.parser.move_para_headers_into_content {
|
2024-02-27 16:03:24 +01:00
|
|
|
event!(
|
|
|
|
Level::WARN,
|
|
|
|
"Move para headers into content. Make sure you know what you do!"
|
|
|
|
);
|
2024-02-15 15:07:36 +01:00
|
|
|
parser.move_para_headers_into_content();
|
|
|
|
}
|
2024-08-22 10:46:59 +02:00
|
|
|
Ok((config.law.typ, config.law.id, builder, parser))
|
2024-02-06 10:17:14 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
struct Law {
|
|
|
|
id: usize,
|
2024-02-17 10:40:00 +01:00
|
|
|
name: String,
|
2024-08-22 10:46:59 +02:00
|
|
|
#[serde(default = "default_type")]
|
|
|
|
typ: Typ,
|
2024-05-26 13:01:23 +02:00
|
|
|
par_sign: Option<String>,
|
2024-03-21 10:08:03 +01:00
|
|
|
classifiers: Option<Vec<Classifier>>,
|
2024-02-06 10:17:14 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
struct Classifier {
|
|
|
|
name: String,
|
|
|
|
is_root: bool,
|
|
|
|
match_function: String,
|
|
|
|
}
|
|
|
|
|
2024-02-06 11:18:40 +01:00
|
|
|
#[derive(Debug, Deserialize, Default)]
|
2024-02-06 10:17:14 +01:00
|
|
|
struct ParserConfig {
|
2024-02-15 15:07:36 +01:00
|
|
|
/// e.g. used in EheG to transform `<ueberschrift typ="para" ct="text" halign="c">§
|
|
|
|
/// 6</ueberschrift>` into
|
|
|
|
/// `
|
|
|
|
/// <absatz typ="abs" ct="text" halign="j">
|
|
|
|
/// <gldsym>§ 1.</gldsym>
|
|
|
|
/// text...
|
|
|
|
/// </absatz>
|
|
|
|
///`
|
|
|
|
#[serde(default)] //okay to not have this part in the config
|
|
|
|
move_para_headers_into_content: bool,
|
2024-02-06 11:18:40 +01:00
|
|
|
#[serde(default)] //okay to not have this part in the config
|
2024-02-06 10:17:14 +01:00
|
|
|
remove_strings: Vec<String>,
|
2024-02-06 11:18:40 +01:00
|
|
|
#[serde(default)] //okay to not have this part in the config
|
2024-02-06 10:17:14 +01:00
|
|
|
replace_rules: Vec<ReplaceRule>,
|
|
|
|
}
|
|
|
|
|
|
|
|
#[derive(Debug, Deserialize)]
|
|
|
|
struct ReplaceRule {
|
|
|
|
find: String,
|
|
|
|
replace_with: String,
|
|
|
|
}
|
2024-02-06 11:18:40 +01:00
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use std::fs;

    use super::Config;

    /// Every entry in `./data/configs` must be loadable via `Config::load`.
    #[test]
    fn all_configs_are_deserializable() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            // Pass the `PathBuf` on as-is: rendering it through `display()` into a `String` is
            // lossy for non-UTF-8 file names.
            let path = config
                .expect("Could not read entry of config folder")
                .path();

            // Name the offending file so a failure is attributable to one config.
            if let Err(e) = Config::load(&path) {
                panic!("Failed to load config {}: {e:?}", path.display());
            }
        }
    }
}
|