From 15e0e485c43627604ef80a351f9b357e9133182a Mon Sep 17 00:00:00 2001 From: philipp Date: Tue, 6 Feb 2024 10:17:14 +0100 Subject: [PATCH] add config for law texts, fixes #3 --- Cargo.lock | 10 ++++ Cargo.toml | 2 + data/configs/mschg.toml | 34 +++++++++++++ src/config.rs | 93 ++++++++++++++++++++++++++++++++++ src/law/mod.rs | 35 +++++++------ src/lib.rs | 1 + src/main.rs | 7 ++- src/misc.rs | 14 +++++ src/risparser/paragraph/mod.rs | 12 ++--- 9 files changed, 185 insertions(+), 23 deletions(-) create mode 100644 data/configs/mschg.toml create mode 100644 src/config.rs diff --git a/Cargo.lock b/Cargo.lock index 92cf46f..b34b955 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -396,6 +396,7 @@ dependencies = [ "serde", "serde_json", "time", + "toml", "tqdm", "ureq", ] @@ -603,6 +604,15 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + [[package]] name = "tqdm" version = "0.6.0" diff --git a/Cargo.toml b/Cargo.toml index 4db3db1..e0e355a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,8 @@ roxmltree = "0.18" env_logger = "0.10" log = "0.4" tqdm = "0.6" +toml = "0.5" + [dev-dependencies] pretty_assertions = "1.4" diff --git a/data/configs/mschg.toml b/data/configs/mschg.toml new file mode 100644 index 0000000..db3f0f0 --- /dev/null +++ b/data/configs/mschg.toml @@ -0,0 +1,34 @@ +[law] +id = "10002180" + +[[law.classifiers]] +name = "Abschnitt" +match_function = "contains" + +[[law.classifiers]] +name = "Number" +match_function = "starts_with_number" + +[parser] +remove_strings = ["", ""] + +[[parser.replace_rules]] +find = "\\u{a0}" +replace_with = " " + +[[parser.replace_rules]] +find = "bis" +replace_with = "bis" + +[[parser.replace_rules]] +find = "ter" +replace_with = "ter" + +[[parser.replace_rules]] +find = "" +replace_with = "-" + +[[parser.replace_rules]] +find = "(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)" +replace_with = "§ 69.(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)" + diff --git a/src/config.rs b/src/config.rs new file mode 100644 index 0000000..a0dc3d4 --- /dev/null +++ b/src/config.rs @@ -0,0 +1,93 @@ +use serde::Deserialize; +use std::fs; +use std::path::Path; +use std::sync::Arc; + +use crate::law::{self, responsible::*}; +use crate::law::{ClassifierApplicable, LawBuilder}; +use crate::misc::Error; +use crate::risparser::paragraph::Parser; + +// TODO: more generic +fn create_classifier(match_function: &str) -> Result { + let func: ClassifierApplicable = match match_function { + "contains" => Arc::new(contains), + "starts_with_roman_number" => Arc::new(starts_with_roman_number), + "contains_at_start" => Arc::new(contains_at_start), + "starts_with_number" => Arc::new(starts_with_number), + "starts_with_letter" => Arc::new(starts_with_letter), + "starts_with_uppercaseletter" => Arc::new(starts_with_letter), + "contains_without_unter" => Arc::new(contains_without_unter), + _ => { + return Err(Error::new(&format!( + "Unknown match function: {}", + match_function + ))) + } + }; + + Ok(func) +} + +#[derive(Debug, Deserialize)] +pub struct Config { + law: Law, + parser: ParserConfig, +} + +impl Config { + pub fn load>(path: P) -> Result<(usize, LawBuilder, Parser), Error> { + let config_str = fs::read_to_string(path)?; + let config: Config = toml::from_str(&config_str)?; + + let mut builder = LawBuilder::new(); + for classifier in config.law.classifiers { + let to_add = law::Classifier::new( + &classifier.name, + create_classifier(&classifier.match_function)?, + ); + if classifier.is_root { + builder.add_classifier(to_add.root()); + } else { + builder.add_classifier(to_add); + } + } + + let mut parser = Parser::new(); + + for to_remove in config.parser.remove_strings { + parser.add_string_to_remove(&to_remove); + } + + for to_replace in config.parser.replace_rules { + parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with); + } + + Ok((config.law.id, builder, parser)) + } +} + +#[derive(Debug, Deserialize)] +struct Law { + id: usize, + classifiers: Vec, +} + +#[derive(Debug, Deserialize)] +struct Classifier { + name: String, + is_root: bool, + match_function: String, +} + +#[derive(Debug, Deserialize)] +struct ParserConfig { + remove_strings: Vec, + replace_rules: Vec, +} + +#[derive(Debug, Deserialize)] +struct ReplaceRule { + find: String, + replace_with: String, +} diff --git a/src/law/mod.rs b/src/law/mod.rs index e9147d0..4fe41fd 100644 --- a/src/law/mod.rs +++ b/src/law/mod.rs @@ -7,20 +7,33 @@ use std::{ sync::Arc, }; +use crate::{config::Config, misc::Error, risparser::overview::parse}; + pub mod responsible; /// That's our struct, holding all the information of the law text. #[derive(Debug, Serialize, Deserialize, PartialEq)] pub struct Law { - pub name: String, //ABGB, UrhG pub header: Vec, } impl Law { + pub fn from_config(path: &str) -> Result { + let (law_id, mut builder, parser) = Config::load(path)?; + let pars = parse(law_id).unwrap(); + + for par in pars { + let cont = parser.parse(&par, &mut builder).unwrap(); + if !cont { + break; + } + } + + Ok(builder.into()) + } + //TODO: add test pub fn to_md(&self) { - println!("# {}", self.name); - for header in &self.header { Self::print_md(header, 2); } @@ -55,10 +68,7 @@ impl From for Law { }); } - Self { - name: builder.name, - header: ret, - } + Self { header: ret } } } @@ -107,9 +117,6 @@ impl From for HeadingContent { /// Is used to generate a law struct. It's organized mainly by classifier. #[derive(Debug)] pub struct LawBuilder { - /// Name of the law - name: String, //ABGB, UrhG - /// Structure of the law text classifiers: Vec, @@ -127,8 +134,7 @@ pub struct LawBuilder { impl PartialEq for LawBuilder { fn eq(&self, other: &Self) -> bool { - self.name == other.name - && self.classifiers == other.classifiers + self.classifiers == other.classifiers && self.header == other.header && self.next_para_header == other.next_para_header } @@ -136,9 +142,8 @@ impl PartialEq for LawBuilder { impl LawBuilder { /// Creates a new law builder. Adds classifier for known law texts. - pub fn new(name: &str) -> Self { + pub fn new() -> Self { Self { - name: name.into(), classifiers: Vec::new(), header: Vec::new(), next_para_header: None, @@ -357,7 +362,7 @@ impl From<&str> for ClassifierInstance { } } -type ClassifierApplicable = Arc bool>; +pub(crate) type ClassifierApplicable = Arc bool>; #[derive(Clone)] pub struct Classifier { diff --git a/src/lib.rs b/src/lib.rs index 8de8ebe..4479647 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod config; pub mod law; pub mod misc; pub mod risparser; diff --git a/src/main.rs b/src/main.rs index bf7f331..caa8f05 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,11 @@ -use risp::law::{Law, LawBuilder}; +use risp::law::Law; fn main() { env_logger::init(); - let law: Law = LawBuilder::new("StGB").into(); + let config_path = "./data/config/mschg.toml"; + + let law = Law::from_config(config_path).unwrap(); + law.to_md(); } diff --git a/src/misc.rs b/src/misc.rs index da10d14..c4d0734 100644 --- a/src/misc.rs +++ b/src/misc.rs @@ -8,6 +8,12 @@ pub struct Error { msg: String, } +impl Error { + pub fn new(msg: &str) -> Self { + Self { msg: msg.into() } + } +} + impl From for Error { fn from(value: ureq::Error) -> Self { Self { @@ -29,6 +35,14 @@ impl From for Error { } } } +impl From for Error { + fn from(value: toml::de::Error) -> Self { + Self { + msg: value.to_string(), + } + } +} + impl From for Error { fn from(value: roxmltree::Error) -> Self { Self { diff --git a/src/risparser/paragraph/mod.rs b/src/risparser/paragraph/mod.rs index 517b5b2..8560a6f 100644 --- a/src/risparser/paragraph/mod.rs +++ b/src/risparser/paragraph/mod.rs @@ -172,7 +172,7 @@ mod tests { #[test] fn teg() { let law_id = "10001905"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); let mut parser = Parser::new(); @@ -190,7 +190,7 @@ mod tests { #[test] fn mschg() { let law_id = "10002180"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); @@ -213,7 +213,7 @@ mod tests { #[test] fn stgb() { let law_id = "10002296"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Teil", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); @@ -259,7 +259,7 @@ mod tests { #[test] fn kschg() { let law_id = "10002462"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); @@ -273,7 +273,7 @@ mod tests { #[test] fn vvg() { let law_id = "20011654"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); let parser = Parser::new(); @@ -283,7 +283,7 @@ mod tests { #[test] fn urhg() { let law_id = "10001848"; - let mut builder = LawBuilder::new("law"); + let mut builder = LawBuilder::new(); builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number)));