add config for law texts, fixes #3
All checks were successful
CI/CD Pipeline / test (push) Successful in 9m17s

This commit is contained in:
philipp 2024-02-06 10:17:14 +01:00
parent 54a8c7a0fe
commit 15e0e485c4
9 changed files with 185 additions and 23 deletions

10
Cargo.lock generated
View File

@ -396,6 +396,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"time", "time",
"toml",
"tqdm", "tqdm",
"ureq", "ureq",
] ]
@ -603,6 +604,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "toml"
version = "0.5.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "tqdm" name = "tqdm"
version = "0.6.0" version = "0.6.0"

View File

@ -14,6 +14,8 @@ roxmltree = "0.18"
env_logger = "0.10" env_logger = "0.10"
log = "0.4" log = "0.4"
tqdm = "0.6" tqdm = "0.6"
toml = "0.5"
[dev-dependencies] [dev-dependencies]
pretty_assertions = "1.4" pretty_assertions = "1.4"

34
data/configs/mschg.toml Normal file
View File

@ -0,0 +1,34 @@
# Configuration for the law with id 10002180 (the law built by the `mschg` parser test).

[law]
# Must be a TOML integer: the loader deserializes this field into `usize`.
id = 10002180

# Classifiers are added to the law builder in the order listed here.
[[law.classifiers]]
name = "Abschnitt"
# "Abschnitt" is the root classifier, matching the `mschg` parser test.
is_root = true
match_function = "contains"

[[law.classifiers]]
name = "Number"
is_root = false
match_function = "starts_with_number"

[parser]
remove_strings = ["<i>", "</i>"]

[[parser.replace_rules]]
# Non-breaking space (U+00A0). TOML spells Unicode escapes as \uXXXX;
# "\\u{a0}" would be the literal six characters `\u{a0}`.
find = "\u00A0"
replace_with = " "

[[parser.replace_rules]]
find = "<super>bis</super>"
replace_with = "bis"

[[parser.replace_rules]]
find = "<super>ter</super>"
replace_with = "ter"

[[parser.replace_rules]]
find = "<gdash />"
replace_with = "-"

[[parser.replace_rules]]
# Prefixes the repeal note with a `<gldsym>` marker for § 69.
find = "(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"
replace_with = "<gldsym>§ 69.</gldsym>(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"

93
src/config.rs Normal file
View File

@ -0,0 +1,93 @@
use serde::Deserialize;
use std::fs;
use std::path::Path;
use std::sync::Arc;
use crate::law::{self, responsible::*};
use crate::law::{ClassifierApplicable, LawBuilder};
use crate::misc::Error;
use crate::risparser::paragraph::Parser;
// TODO: more generic
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
let func: ClassifierApplicable = match match_function {
"contains" => Arc::new(contains),
"starts_with_roman_number" => Arc::new(starts_with_roman_number),
"contains_at_start" => Arc::new(contains_at_start),
"starts_with_number" => Arc::new(starts_with_number),
"starts_with_letter" => Arc::new(starts_with_letter),
"starts_with_uppercaseletter" => Arc::new(starts_with_letter),
"contains_without_unter" => Arc::new(contains_without_unter),
_ => {
return Err(Error::new(&format!(
"Unknown match function: {}",
match_function
)))
}
};
Ok(func)
}
/// Top-level layout of a law configuration file, deserialized with serde.
///
/// Consists of a `law` section and a `parser` section; `Config::load` turns
/// them into a law builder and a parser.
#[derive(Debug, Deserialize)]
pub struct Config {
/// Law id and the classifiers that `Config::load` adds to the `LawBuilder`.
law: Law,
/// Remove/replace settings that `Config::load` applies to the `Parser`.
parser: ParserConfig,
}
impl Config {
pub fn load<P: AsRef<Path>>(path: P) -> Result<(usize, LawBuilder, Parser), Error> {
let config_str = fs::read_to_string(path)?;
let config: Config = toml::from_str(&config_str)?;
let mut builder = LawBuilder::new();
for classifier in config.law.classifiers {
let to_add = law::Classifier::new(
&classifier.name,
create_classifier(&classifier.match_function)?,
);
if classifier.is_root {
builder.add_classifier(to_add.root());
} else {
builder.add_classifier(to_add);
}
}
let mut parser = Parser::new();
for to_remove in config.parser.remove_strings {
parser.add_string_to_remove(&to_remove);
}
for to_replace in config.parser.replace_rules {
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
}
Ok((config.law.id, builder, parser))
}
}
/// The `law` section of the configuration file.
#[derive(Debug, Deserialize)]
struct Law {
/// Identifier of the law; returned as the first tuple element by `Config::load`.
id: usize,
/// Classifier definitions; `Config::load` adds them to the builder in this order.
classifiers: Vec<Classifier>,
}
#[derive(Debug, Deserialize)]
struct Classifier {
name: String,
is_root: bool,
match_function: String,
}
/// The `parser` section of the configuration file.
#[derive(Debug, Deserialize)]
struct ParserConfig {
/// Strings that `Config::load` registers via `Parser::add_string_to_remove`.
remove_strings: Vec<String>,
/// Find/replace pairs that `Config::load` registers via `Parser::add_string_to_replace`.
replace_rules: Vec<ReplaceRule>,
}
/// A single find/replace pair from the `parser.replace_rules` list.
#[derive(Debug, Deserialize)]
struct ReplaceRule {
/// Text to look for; first argument to `Parser::add_string_to_replace`.
find: String,
/// Text to substitute; second argument to `Parser::add_string_to_replace`.
replace_with: String,
}

View File

@ -7,20 +7,33 @@ use std::{
sync::Arc, sync::Arc,
}; };
use crate::{config::Config, misc::Error, risparser::overview::parse};
pub mod responsible; pub mod responsible;
/// That's our struct, holding all the information of the law text. /// That's our struct, holding all the information of the law text.
#[derive(Debug, Serialize, Deserialize, PartialEq)] #[derive(Debug, Serialize, Deserialize, PartialEq)]
pub struct Law { pub struct Law {
pub name: String, //ABGB, UrhG
pub header: Vec<Heading>, pub header: Vec<Heading>,
} }
impl Law { impl Law {
/// Builds a [`Law`] from the configuration file at `path`.
///
/// Loads the law id, builder and parser through `Config::load`, obtains the
/// entries for that law id from `parse`, and feeds them one by one to the
/// parser until it reports `false` or the entries run out. The filled builder
/// is then converted into the law.
///
/// # Errors
/// Returns an error when the configuration cannot be loaded or when either
/// parsing step fails. Failures are propagated instead of panicking.
pub fn from_config(path: &str) -> Result<Law, Error> {
    let (law_id, mut builder, parser) = Config::load(path)?;

    // NOTE(review): `?` assumes both parse calls fail with `Error` (or a type
    // convertible into it) — confirm against `risparser`.
    for par in parse(law_id)? {
        // The parser returns `false` to signal that processing should stop.
        if !parser.parse(&par, &mut builder)? {
            break;
        }
    }

    Ok(builder.into())
}
//TODO: add test //TODO: add test
pub fn to_md(&self) { pub fn to_md(&self) {
println!("# {}", self.name);
for header in &self.header { for header in &self.header {
Self::print_md(header, 2); Self::print_md(header, 2);
} }
@ -55,10 +68,7 @@ impl From<LawBuilder> for Law {
}); });
} }
Self { Self { header: ret }
name: builder.name,
header: ret,
}
} }
} }
@ -107,9 +117,6 @@ impl From<ClassifierInstance> for HeadingContent {
/// Is used to generate a law struct. It's organized mainly by classifier. /// Is used to generate a law struct. It's organized mainly by classifier.
#[derive(Debug)] #[derive(Debug)]
pub struct LawBuilder { pub struct LawBuilder {
/// Name of the law
name: String, //ABGB, UrhG
/// Structure of the law text /// Structure of the law text
classifiers: Vec<Classifier>, classifiers: Vec<Classifier>,
@ -127,8 +134,7 @@ pub struct LawBuilder {
impl PartialEq for LawBuilder { impl PartialEq for LawBuilder {
fn eq(&self, other: &Self) -> bool { fn eq(&self, other: &Self) -> bool {
self.name == other.name self.classifiers == other.classifiers
&& self.classifiers == other.classifiers
&& self.header == other.header && self.header == other.header
&& self.next_para_header == other.next_para_header && self.next_para_header == other.next_para_header
} }
@ -136,9 +142,8 @@ impl PartialEq for LawBuilder {
impl LawBuilder { impl LawBuilder {
/// Creates a new law builder. Adds classifier for known law texts. /// Creates a new law builder. Adds classifier for known law texts.
pub fn new(name: &str) -> Self { pub fn new() -> Self {
Self { Self {
name: name.into(),
classifiers: Vec::new(), classifiers: Vec::new(),
header: Vec::new(), header: Vec::new(),
next_para_header: None, next_para_header: None,
@ -357,7 +362,7 @@ impl From<&str> for ClassifierInstance {
} }
} }
type ClassifierApplicable = Arc<dyn Fn(&str, &str) -> bool>; pub(crate) type ClassifierApplicable = Arc<dyn Fn(&str, &str) -> bool>;
#[derive(Clone)] #[derive(Clone)]
pub struct Classifier { pub struct Classifier {

View File

@ -1,3 +1,4 @@
pub mod config;
pub mod law; pub mod law;
pub mod misc; pub mod misc;
pub mod risparser; pub mod risparser;

View File

@ -1,8 +1,11 @@
use risp::law::{Law, LawBuilder}; use risp::law::Law;
fn main() { fn main() {
env_logger::init(); env_logger::init();
let law: Law = LawBuilder::new("StGB").into(); let config_path = "./data/config/mschg.toml";
let law = Law::from_config(config_path).unwrap();
law.to_md(); law.to_md();
} }

View File

@ -8,6 +8,12 @@ pub struct Error {
msg: String, msg: String,
} }
impl Error {
    /// Creates an error carrying an owned copy of `msg` as its message.
    pub fn new(msg: &str) -> Self {
        let msg = String::from(msg);
        Self { msg }
    }
}
impl From<ureq::Error> for Error { impl From<ureq::Error> for Error {
fn from(value: ureq::Error) -> Self { fn from(value: ureq::Error) -> Self {
Self { Self {
@ -29,6 +35,14 @@ impl From<serde_json::Error> for Error {
} }
} }
} }
/// Lets `?` convert TOML deserialization failures into the crate's [`Error`];
/// the message is the TOML error's display text.
impl From<toml::de::Error> for Error {
    fn from(value: toml::de::Error) -> Self {
        let msg = value.to_string();
        Self { msg }
    }
}
impl From<roxmltree::Error> for Error { impl From<roxmltree::Error> for Error {
fn from(value: roxmltree::Error) -> Self { fn from(value: roxmltree::Error) -> Self {
Self { Self {

View File

@ -172,7 +172,7 @@ mod tests {
#[test] #[test]
fn teg() { fn teg() {
let law_id = "10001905"; let law_id = "10001905";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root());
let mut parser = Parser::new(); let mut parser = Parser::new();
@ -190,7 +190,7 @@ mod tests {
#[test] #[test]
fn mschg() { fn mschg() {
let law_id = "10002180"; let law_id = "10002180";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root());
builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number)));
@ -213,7 +213,7 @@ mod tests {
#[test] #[test]
fn stgb() { fn stgb() {
let law_id = "10002296"; let law_id = "10002296";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Teil", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Teil", Arc::new(&contains)).root());
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)));
@ -259,7 +259,7 @@ mod tests {
#[test] #[test]
fn kschg() { fn kschg() {
let law_id = "10002462"; let law_id = "10002462";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root());
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)));
@ -273,7 +273,7 @@ mod tests {
#[test] #[test]
fn vvg() { fn vvg() {
let law_id = "20011654"; let law_id = "20011654";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root());
let parser = Parser::new(); let parser = Parser::new();
@ -283,7 +283,7 @@ mod tests {
#[test] #[test]
fn urhg() { fn urhg() {
let law_id = "10001848"; let law_id = "10001848";
let mut builder = LawBuilder::new("law"); let mut builder = LawBuilder::new();
builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root());
builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)));
builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number)));