risp/src/config.rs

// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program.  If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.

#[allow(clippy::wildcard_imports)] // I use *-operator on purpose: I want to receive a compiler
// warning if I've not updated my `create_classifier` function
use crate::law::{self, responsible::*};
use crate::misc::Error;
use crate::paragraph::Parser;
use crate::{default_type, Typ};
use crate::{law::ClassifierApplicable, misc};
use serde::Deserialize;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use tracing::{event, info, instrument, Level};

// TODO: more generic
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
    let func: ClassifierApplicable = match match_function {
        "contains" => Arc::new(contains),
        "contains_case_sensitive" => Arc::new(contains_case_sensitive),
        "starts_with_roman_number" => Arc::new(starts_with_roman_number),
        "contains_at_start" => Arc::new(contains_at_start),
        "starts_with_number" => Arc::new(starts_with_number),
        "starts_with_letter" => Arc::new(starts_with_letter),
        "starts_with_dash" => Arc::new(starts_with_dash),
        "starts_with_uppercaseletter" => Arc::new(starts_with_uppercaseletter),
        "contains_without_unter" => Arc::new(contains_without_unter),
        _ => {
            return Err(Error::new(&format!(
                "Unknown match function: {match_function}"
            )))
        }
    };

    Ok(func)
}

/// Top-level layout of a law configuration file, deserialized from TOML by [`Config::load`].
#[derive(Debug, Deserialize)]
pub struct Config {
    /// The `law` section. It has no serde default, so deserialization fails if it is missing.
    law: Law,
    /// The `parser` section; falls back to `ParserConfig::default()` if it is missing.
    #[serde(default)]
    parser: ParserConfig,
}

impl Config {
    /// Loads a law configuration from `path` and builds a `law::Builder` and a `Parser` from it.
    ///
    /// The file is read into a string and deserialized as TOML into a [`Config`]. Based on it:
    ///
    /// - A `law::Builder` is created from the law's `name` and `par_sign`. Every configured
    ///   classifier is added to it; classifiers with `is_root` set are added as root
    ///   classifiers. If the config has no `classifiers` key at all, `no_headers()` is called on
    ///   the builder instead.
    /// - A `Parser` is created from a clone of the builder's `par_sign`. It receives all
    ///   configured strings to remove, all replace rules and, if
    ///   `move_para_headers_into_content` is set, the instruction to move paragraph headers
    ///   into the content.
    ///
    /// # Parameters
    ///
    /// - `path`: Path to the configuration file. Any type implementing `AsRef<Path>` (and
    ///   `Debug`) is accepted, e.g. `&str`, `String`, `&Path` or `PathBuf`.
    ///
    /// # Returns
    ///
    /// On success, a tuple consisting of
    ///
    /// 1. the law's `Typ` (supplied by `default_type` if the config does not specify `typ`),
    /// 2. the law ID (`usize`),
    /// 3. the configured `law::Builder`,
    /// 4. the configured `Parser`.
    ///
    /// # Errors
    ///
    /// This function returns an `Error`
    ///
    /// - if the file at `path` cannot be read,
    /// - if the file content is not valid TOML or does not match the expected structure,
    /// - if a classifier names a `match_function` that `create_classifier` does not know.
    ///
    /// # Panics
    ///
    /// Panics if `misc::get_cache_dir()` does not yield a value, because its result is unwrapped
    /// for the initial log message.
    ///
    /// # Examples
    ///
    /// ```
    /// use risp::Config;
    /// use std::path::Path;
    ///
    /// let (_, law_id, builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap();
    ///
    /// assert_eq!(law_id, 10001622);
    /// ```
    #[instrument(level = "trace", skip(path))]
    pub fn load<P: AsRef<Path> + std::fmt::Debug>(
        path: P,
    ) -> Result<(Typ, usize, law::Builder, Parser), Error> {
        // NOTE(review): this `unwrap` panics inside a function that otherwise reports failures
        // through `Result`. Consider propagating the failure (or only logging it) once it is
        // confirmed how `get_cache_dir`'s return type converts into `Error`.
        info!("Using cache dir: {}", misc::get_cache_dir().unwrap());

        let config_str = fs::read_to_string(path)?;
        let config: Config = toml::from_str(&config_str)?;

        let mut builder = law::Builder::new(config.law.name, config.law.par_sign);
        if let Some(classifiers) = config.law.classifiers {
            for classifier in &classifiers {
                // An unknown `match_function` aborts loading via `?`.
                let to_add = law::Classifier::new(
                    &classifier.name,
                    create_classifier(&classifier.match_function)?,
                );
                if classifier.is_root {
                    builder.add_classifier(to_add.root());
                } else {
                    builder.add_classifier(to_add);
                }
            }
            event!(
                Level::INFO,
                "Added {} classifiers from config",
                classifiers.len()
            );
        } else {
            // No `classifiers` key at all (as opposed to an empty list).
            builder.no_headers();
            event!(Level::INFO, "Assuming law text does not contain headers");
        }

        let mut parser = Parser::new(builder.par_sign.clone());

        for to_remove in &config.parser.remove_strings {
            parser.add_string_to_remove(to_remove);
        }
        event!(
            Level::INFO,
            "Added {} strings to remove",
            config.parser.remove_strings.len()
        );

        for to_replace in &config.parser.replace_rules {
            parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
        }
        event!(
            Level::INFO,
            "Added {} strings to replace",
            config.parser.replace_rules.len()
        );

        if config.parser.move_para_headers_into_content {
            event!(
                Level::WARN,
                "Move para headers into content. Make sure you know what you do!"
            );
            parser.move_para_headers_into_content();
        }
        Ok((config.law.typ, config.law.id, builder, parser))
    }
}

/// The `law` section of a config file: identifies the law and lists its classifiers.
#[derive(Debug, Deserialize)]
struct Law {
    /// ID of the law; `Config::load` returns it as the second element of its result tuple.
    id: usize,
    /// Name of the law; passed to `law::Builder::new`.
    name: String,
    /// Type of the law; `default_type` supplies the value when the key is absent.
    #[serde(default = "default_type")]
    typ: Typ,
    /// Optional value passed to `law::Builder::new` (presumably the paragraph sign used in the
    /// law text; confirm against `law::Builder`).
    par_sign: Option<String>,
    /// Classifiers to add to the builder. If the key is absent (`None`), `Config::load` calls
    /// `no_headers()` on the builder instead.
    classifiers: Option<Vec<Classifier>>,
}

/// One entry of the `classifiers` list in the `law` section of a config file.
#[derive(Debug, Deserialize)]
struct Classifier {
    /// Name passed to `law::Classifier::new`.
    name: String,
    /// If `true`, `Config::load` calls `.root()` on the classifier before adding it to the
    /// builder.
    is_root: bool,
    /// Name of the match function; must be one of the names known to `create_classifier`.
    match_function: String,
}

/// The `parser` section of a config file. Every field may be omitted and then takes its default
/// value (`false` or an empty list).
#[derive(Debug, Deserialize, Default)]
struct ParserConfig {
    /// If `true`, `Config::load` calls `move_para_headers_into_content()` on the parser.
    ///
    /// E.g. used in EheG to transform
    ///
    /// ```text
    /// <ueberschrift typ="para" ct="text" halign="c">§ 6</ueberschrift>
    /// ```
    ///
    /// into
    ///
    /// ```text
    /// <absatz typ="abs" ct="text" halign="j">
    /// <gldsym>§ 1.</gldsym>
    /// text...
    /// </absatz>
    /// ```
    #[serde(default)] //okay to not have this part in the config
    move_para_headers_into_content: bool,
    /// Strings that `Config::load` registers on the parser via `add_string_to_remove`.
    #[serde(default)] //okay to not have this part in the config
    remove_strings: Vec<String>,
    /// Rules that `Config::load` registers on the parser via `add_string_to_replace`.
    #[serde(default)] //okay to not have this part in the config
    replace_rules: Vec<ReplaceRule>,
}

/// One entry of the `replace_rules` list in the `parser` section of a config file.
#[derive(Debug, Deserialize)]
struct ReplaceRule {
    /// First argument passed to the parser's `add_string_to_replace` (going by the name, the
    /// text to search for).
    find: String,
    /// Second argument passed to the parser's `add_string_to_replace` (going by the name, the
    /// replacement text).
    replace_with: String,
}

#[cfg(test)]
mod tests {
    use std::fs;

    use super::Config;

    /// Every entry in `./data/configs` must be loadable through `Config::load`.
    #[test]
    fn all_configs_are_deserializable() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for config in configs {
            // Hand the `PathBuf` to `load` directly instead of round-tripping it through a
            // (lossy) display string.
            let path = config.expect("Could not read directory entry").path();
            if let Err(err) = Config::load(&path) {
                // Name the offending file so a failure is actionable.
                panic!("Could not load config {}: {err:?}", path.display());
            }
        }
    }
}
add license 2024-02-15 13:50:58 +01:00			`// Copyright (C) 2024 Philipp Hofer`
			`//`
use ascii ' in license text 2024-02-15 16:12:14 +01:00			`// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by`
add license 2024-02-15 13:50:58 +01:00			`// the European Commission - subsequent versions of the EUPL (the "Licence").`
			`// You may not use this work except in compliance with the Licence.`
			`//`
			`// You should have received a copy of the European Union Public License along`
			`// with this program. If not, you may obtain a copy of the Licence at:`
			`// <https://joinup.ec.europa.eu/software/page/eupl>`
			`//`
			`// Unless required by applicable law or agreed to in writing, software`
			`// distributed under the Licence is distributed on an "AS IS" basis,`
			`// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`// See the Licence for the specific language governing permissions and`
			`// limitations under the Licence.`

cleaner code 2024-02-06 14:18:03 +01:00			`#[allow(clippy::wildcard_imports)] // I use *-operator on purpose: I want to receive a compiler`
			// warning if I've not updated my `create_classifier` function
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`use crate::law::{self, responsible::*};`
			`use crate::misc::Error;`
restructure project 2024-02-06 11:45:44 +01:00			`use crate::paragraph::Parser;`
allow specifying applications, preparation to also add 'landesgesetze' 2024-08-22 10:46:59 +02:00			`use crate::{default_type, Typ};`
print used cache dir 2024-05-26 12:20:19 +02:00			`use crate::{law::ClassifierApplicable, misc};`
cleaner code 2024-02-06 14:18:03 +01:00			`use serde::Deserialize;`
			`use std::fs;`
			`use std::path::Path;`
			`use std::sync::Arc;`
print used cache dir 2024-05-26 12:20:19 +02:00			`use tracing::{event, info, instrument, Level};`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00
			`// TODO: more generic`
			`fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {`
			`let func: ClassifierApplicable = match match_function {`
			`"contains" => Arc::new(contains),`
add medieng 2024-02-19 16:33:50 +01:00			`"contains_case_sensitive" => Arc::new(contains_case_sensitive),`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`"starts_with_roman_number" => Arc::new(starts_with_roman_number),`
			`"contains_at_start" => Arc::new(contains_at_start),`
			`"starts_with_number" => Arc::new(starts_with_number),`
			`"starts_with_letter" => Arc::new(starts_with_letter),`
improve abgb 2024-02-06 13:10:59 +01:00			`"starts_with_dash" => Arc::new(starts_with_dash),`
			`"starts_with_uppercaseletter" => Arc::new(starts_with_uppercaseletter),`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`"contains_without_unter" => Arc::new(contains_without_unter),`
			`_ => {`
			`return Err(Error::new(&format!(`
cleaner code 2024-02-06 14:18:03 +01:00			`"Unknown match function: {match_function}"`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`)))`
			`}`
			`};`

			`Ok(func)`
			`}`

			`#[derive(Debug, Deserialize)]`
			`pub struct Config {`
			`law: Law,`
add configs for laws 2024-02-06 11:18:40 +01:00			`#[serde(default)]`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`parser: ParserConfig,`
			`}`

			`impl Config {`
start including tracing 2024-02-27 16:03:24 +01:00			`#[instrument(level = "trace", skip(path))]`
add docs for Config::load() 2024-02-27 08:30:42 +01:00			/// Loads a configuration from a specified path and constructs a `LawBuilder` and `Parser` based on it.
			`///`
			`/// This function reads a configuration file from the given path, expecting it to be in TOML format. It then`
			/// parses the configuration to set up a `LawBuilder` with specified classifiers and a `Parser` with specific
			`/// string manipulation rules (like removing or replacing strings). Additionally, it processes parser settings`
			`/// for moving paragraph headers into content if specified.`
			`///`
			`/// # Parameters`
			`///`
			/// - `path`: A path to the configuration file. This can be any type that implements the `AsRef<Path>` trait,
			/// allowing for flexible path specifications (e.g., `&str`, `String`, `Path`, or `PathBuf`).
			`///`
			`/// # Returns`
			`///`
			/// Returns a `Result` containing a tuple of the law ID (`usize`), the constructed `LawBuilder`, and the `Parser`
			`/// upon successful operation. If any error occurs during the process (e.g., file reading, TOML parsing, classifier`
			/// creation), it returns an `Error`.
			`///`
			`/// # Errors`
			`///`
			/// This function can return an `Error` in several cases:
			`///`
			`/// - If the specified path does not exist or cannot be read.`
			`/// - If the configuration file content is not valid TOML or does not conform to the expected structure.`
			/// - If there's an issue creating any of the classifiers specified in the configuration (e.g., if the `match_function`
			`/// for a classifier fails).`
			`///`
			`/// # Examples`
			`///`
			/// ```
restructre lib 2024-02-27 10:33:07 +01:00			`/// use risp::Config;`
add docs for Config::load() 2024-02-27 08:30:42 +01:00			`/// use std::path::Path;`
			`///`
allow specifying applications, preparation to also add 'landesgesetze' 2024-08-22 10:46:59 +02:00			`/// let (_, law_id, builder, parser) = Config::load(Path::new("data/configs/abgb.toml")).unwrap();`
add docs for Config::load() 2024-02-27 08:30:42 +01:00			`///`
			`/// assert_eq!(law_id, 10001622);`
			/// ```
start including tracing 2024-02-27 16:03:24 +01:00			`pub fn load<P: AsRef<Path> + std::fmt::Debug>(`
			`path: P,`
allow specifying applications, preparation to also add 'landesgesetze' 2024-08-22 10:46:59 +02:00			`) -> Result<(Typ, usize, law::Builder, Parser), Error> {`
print used cache dir 2024-05-26 12:20:19 +02:00			`info!("Using cache dir: {}", misc::get_cache_dir().unwrap());`

add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`let config_str = fs::read_to_string(path)?;`
			`let config: Config = toml::from_str(&config_str)?;`

add aeuv 2024-05-26 13:01:23 +02:00			`let mut builder = law::Builder::new(config.law.name, config.law.par_sign);`
allow laws with no headers 2024-03-21 10:08:03 +01:00			`if let Some(classifiers) = config.law.classifiers {`
			`for classifier in &classifiers {`
			`let to_add = law::Classifier::new(`
			`&classifier.name,`
			`create_classifier(&classifier.match_function)?,`
			`);`
			`if classifier.is_root {`
			`builder.add_classifier(to_add.root());`
			`} else {`
			`builder.add_classifier(to_add);`
			`}`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`}`
allow laws with no headers 2024-03-21 10:08:03 +01:00			`event!(`
			`Level::INFO,`
			`"Added {} classifiers from config",`
			`&classifiers.len()`
			`);`
			`} else {`
			`builder.no_headers();`
			`event!(Level::INFO, "Assuming law text does not contain headers");`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`}`

add aeuv 2024-05-26 13:01:23 +02:00			`let mut parser = Parser::new(builder.par_sign.clone());`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00
start including tracing 2024-02-27 16:03:24 +01:00			`for to_remove in &config.parser.remove_strings {`
			`parser.add_string_to_remove(to_remove);`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`}`
start including tracing 2024-02-27 16:03:24 +01:00			`event!(`
			`Level::INFO,`
			`"Added {} strings to remove",`
			`&config.parser.remove_strings.len()`
			`);`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00
start including tracing 2024-02-27 16:03:24 +01:00			`for to_replace in &config.parser.replace_rules {`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);`
			`}`
start including tracing 2024-02-27 16:03:24 +01:00			`event!(`
			`Level::INFO,`
			`"Added {} strings to replace",`
			`&config.parser.replace_rules.len()`
			`);`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00
enable function to move paragraphs into absaetze 2024-02-15 15:07:36 +01:00			`if config.parser.move_para_headers_into_content {`
start including tracing 2024-02-27 16:03:24 +01:00			`event!(`
			`Level::WARN,`
			`"Move para headers into content. Make sure you know what you do!"`
			`);`
enable function to move paragraphs into absaetze 2024-02-15 15:07:36 +01:00			`parser.move_para_headers_into_content();`
			`}`
allow specifying applications, preparation to also add 'landesgesetze' 2024-08-22 10:46:59 +02:00			`Ok((config.law.typ, config.law.id, builder, parser))`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`}`
			`}`

			`#[derive(Debug, Deserialize)]`
			`struct Law {`
			`id: usize,`
add name in law config 2024-02-17 10:40:00 +01:00			`name: String,`
allow specifying applications, preparation to also add 'landesgesetze' 2024-08-22 10:46:59 +02:00			`#[serde(default = "default_type")]`
			`typ: Typ,`
add aeuv 2024-05-26 13:01:23 +02:00			`par_sign: Option<String>,`
allow laws with no headers 2024-03-21 10:08:03 +01:00			`classifiers: Option<Vec<Classifier>>,`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`}`

			`#[derive(Debug, Deserialize)]`
			`struct Classifier {`
			`name: String,`
			`is_root: bool,`
			`match_function: String,`
			`}`

add configs for laws 2024-02-06 11:18:40 +01:00			`#[derive(Debug, Deserialize, Default)]`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`struct ParserConfig {`
enable function to move paragraphs into absaetze 2024-02-15 15:07:36 +01:00			/// e.g. used in EheG to transform `<ueberschrift typ="para" ct="text" halign="c">§
			/// 6</ueberschrift>` into
			/// `
			`/// <absatz typ="abs" ct="text" halign="j">`
			`/// <gldsym>§ 1.</gldsym>`
			`/// text...`
			`/// </absatz>`
			///`
			`#[serde(default)] //okay to not have this part in the config`
			`move_para_headers_into_content: bool,`
add configs for laws 2024-02-06 11:18:40 +01:00			`#[serde(default)] //okay to not have this part in the config`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`remove_strings: Vec<String>,`
add configs for laws 2024-02-06 11:18:40 +01:00			`#[serde(default)] //okay to not have this part in the config`
add config for law texts, fixes #3 2024-02-06 10:17:14 +01:00			`replace_rules: Vec<ReplaceRule>,`
			`}`

			`#[derive(Debug, Deserialize)]`
			`struct ReplaceRule {`
			`find: String,`
			`replace_with: String,`
			`}`
add configs for laws 2024-02-06 11:18:40 +01:00
			`#[cfg(test)]`
			`mod tests {`
			`use std::fs;`

			`use super::Config;`

			`#[test]`
			`fn all_configs_are_deserializable() {`
			`let configs = fs::read_dir("./data/configs").expect("No folder with config files");`

			`for config in configs {`
			`let path = format!("{}", config.unwrap().path().display());`
			`Config::load(&path).unwrap();`
			`}`
			`}`
			`}`