add config for law texts, fixes #3
	
		
			
	
		
	
	
		
	
		
			All checks were successful
		
		
	
	
		
			
				
	
				CI/CD Pipeline / test (push) Successful in 9m17s
				
			
		
		
	
	
				
					
				
			
		
			All checks were successful
		
		
	
	CI/CD Pipeline / test (push) Successful in 9m17s
				
			This commit is contained in:
		
							
								
								
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										10
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -396,6 +396,7 @@ dependencies = [ | |||||||
|  "serde", |  "serde", | ||||||
|  "serde_json", |  "serde_json", | ||||||
|  "time", |  "time", | ||||||
|  |  "toml", | ||||||
|  "tqdm", |  "tqdm", | ||||||
|  "ureq", |  "ureq", | ||||||
| ] | ] | ||||||
| @@ -603,6 +604,15 @@ version = "0.1.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" | checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "toml" | ||||||
|  | version = "0.5.11" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" | ||||||
|  | dependencies = [ | ||||||
|  |  "serde", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "tqdm" | name = "tqdm" | ||||||
| version = "0.6.0" | version = "0.6.0" | ||||||
|   | |||||||
| @@ -14,6 +14,8 @@ roxmltree = "0.18" | |||||||
| env_logger = "0.10" | env_logger = "0.10" | ||||||
| log = "0.4" | log = "0.4" | ||||||
| tqdm = "0.6" | tqdm = "0.6" | ||||||
|  | toml = "0.5" | ||||||
|  |  | ||||||
|  |  | ||||||
| [dev-dependencies] | [dev-dependencies] | ||||||
| pretty_assertions = "1.4" | pretty_assertions = "1.4" | ||||||
|   | |||||||
							
								
								
									
										34
									
								
								data/configs/mschg.toml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										34
									
								
								data/configs/mschg.toml
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,34 @@ | |||||||
# Law configuration consumed by `Config::load` (src/config.rs).

[law]
# Must be a TOML integer: the value is deserialized into a `usize`.
id = 10002180

# Classifiers are registered with the law builder in the order listed here.
[[law.classifiers]]
name = "Abschnitt"
is_root = true
match_function = "contains"

[[law.classifiers]]
name = "Number"
is_root = false
match_function = "starts_with_number"

[parser]
remove_strings = ["<i>", "</i>"]

# Non-breaking space -> regular space. TOML spells the escape as \u00A0;
# "\\u{a0}" would be the six literal characters \u{a0}.
[[parser.replace_rules]]
find = "\u00A0"
replace_with = " "

[[parser.replace_rules]]
find = "<super>bis</super>"
replace_with = "bis"

[[parser.replace_rules]]
find = "<super>ter</super>"
replace_with = "ter"

[[parser.replace_rules]]
find = "<gdash />"
replace_with = "-"

# Prefixes the repeal note for § 69 with a `<gldsym>` paragraph marker.
[[parser.replace_rules]]
find = "(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"
replace_with = "<gldsym>§ 69.</gldsym>(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"
|  |  | ||||||
							
								
								
									
										93
									
								
								src/config.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										93
									
								
								src/config.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,93 @@ | |||||||
|  | use serde::Deserialize; | ||||||
|  | use std::fs; | ||||||
|  | use std::path::Path; | ||||||
|  | use std::sync::Arc; | ||||||
|  |  | ||||||
|  | use crate::law::{self, responsible::*}; | ||||||
|  | use crate::law::{ClassifierApplicable, LawBuilder}; | ||||||
|  | use crate::misc::Error; | ||||||
|  | use crate::risparser::paragraph::Parser; | ||||||
|  |  | ||||||
|  | // TODO: more generic | ||||||
|  | fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> { | ||||||
|  |     let func: ClassifierApplicable = match match_function { | ||||||
|  |         "contains" => Arc::new(contains), | ||||||
|  |         "starts_with_roman_number" => Arc::new(starts_with_roman_number), | ||||||
|  |         "contains_at_start" => Arc::new(contains_at_start), | ||||||
|  |         "starts_with_number" => Arc::new(starts_with_number), | ||||||
|  |         "starts_with_letter" => Arc::new(starts_with_letter), | ||||||
|  |         "starts_with_uppercaseletter" => Arc::new(starts_with_letter), | ||||||
|  |         "contains_without_unter" => Arc::new(contains_without_unter), | ||||||
|  |         _ => { | ||||||
|  |             return Err(Error::new(&format!( | ||||||
|  |                 "Unknown match function: {}", | ||||||
|  |                 match_function | ||||||
|  |             ))) | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     Ok(func) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Deserialize)] | ||||||
|  | pub struct Config { | ||||||
|  |     law: Law, | ||||||
|  |     parser: ParserConfig, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Config { | ||||||
|  |     pub fn load<P: AsRef<Path>>(path: P) -> Result<(usize, LawBuilder, Parser), Error> { | ||||||
|  |         let config_str = fs::read_to_string(path)?; | ||||||
|  |         let config: Config = toml::from_str(&config_str)?; | ||||||
|  |  | ||||||
|  |         let mut builder = LawBuilder::new(); | ||||||
|  |         for classifier in config.law.classifiers { | ||||||
|  |             let to_add = law::Classifier::new( | ||||||
|  |                 &classifier.name, | ||||||
|  |                 create_classifier(&classifier.match_function)?, | ||||||
|  |             ); | ||||||
|  |             if classifier.is_root { | ||||||
|  |                 builder.add_classifier(to_add.root()); | ||||||
|  |             } else { | ||||||
|  |                 builder.add_classifier(to_add); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let mut parser = Parser::new(); | ||||||
|  |  | ||||||
|  |         for to_remove in config.parser.remove_strings { | ||||||
|  |             parser.add_string_to_remove(&to_remove); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         for to_replace in config.parser.replace_rules { | ||||||
|  |             parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok((config.law.id, builder, parser)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Deserialize)] | ||||||
|  | struct Law { | ||||||
|  |     id: usize, | ||||||
|  |     classifiers: Vec<Classifier>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Deserialize)] | ||||||
|  | struct Classifier { | ||||||
|  |     name: String, | ||||||
|  |     is_root: bool, | ||||||
|  |     match_function: String, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Deserialize)] | ||||||
|  | struct ParserConfig { | ||||||
|  |     remove_strings: Vec<String>, | ||||||
|  |     replace_rules: Vec<ReplaceRule>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Deserialize)] | ||||||
|  | struct ReplaceRule { | ||||||
|  |     find: String, | ||||||
|  |     replace_with: String, | ||||||
|  | } | ||||||
| @@ -7,20 +7,33 @@ use std::{ | |||||||
|     sync::Arc, |     sync::Arc, | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | use crate::{config::Config, misc::Error, risparser::overview::parse}; | ||||||
|  |  | ||||||
| pub mod responsible; | pub mod responsible; | ||||||
|  |  | ||||||
| /// That's our struct, holding all the information of the law text. | /// That's our struct, holding all the information of the law text. | ||||||
| #[derive(Debug, Serialize, Deserialize, PartialEq)] | #[derive(Debug, Serialize, Deserialize, PartialEq)] | ||||||
| pub struct Law { | pub struct Law { | ||||||
|     pub name: String, //ABGB, UrhG |  | ||||||
|     pub header: Vec<Heading>, |     pub header: Vec<Heading>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Law { | impl Law { | ||||||
|  |     pub fn from_config(path: &str) -> Result<Law, Error> { | ||||||
|  |         let (law_id, mut builder, parser) = Config::load(path)?; | ||||||
|  |         let pars = parse(law_id).unwrap(); | ||||||
|  |  | ||||||
|  |         for par in pars { | ||||||
|  |             let cont = parser.parse(&par, &mut builder).unwrap(); | ||||||
|  |             if !cont { | ||||||
|  |                 break; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(builder.into()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     //TODO: add test |     //TODO: add test | ||||||
|     pub fn to_md(&self) { |     pub fn to_md(&self) { | ||||||
|         println!("# {}", self.name); |  | ||||||
|  |  | ||||||
|         for header in &self.header { |         for header in &self.header { | ||||||
|             Self::print_md(header, 2); |             Self::print_md(header, 2); | ||||||
|         } |         } | ||||||
| @@ -55,10 +68,7 @@ impl From<LawBuilder> for Law { | |||||||
|             }); |             }); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Self { |         Self { header: ret } | ||||||
|             name: builder.name, |  | ||||||
|             header: ret, |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -107,9 +117,6 @@ impl From<ClassifierInstance> for HeadingContent { | |||||||
| /// Is used to generate a law struct. It's organized mainly by classifier. | /// Is used to generate a law struct. It's organized mainly by classifier. | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub struct LawBuilder { | pub struct LawBuilder { | ||||||
|     /// Name of the law |  | ||||||
|     name: String, //ABGB, UrhG |  | ||||||
|  |  | ||||||
|     /// Structure of the law text |     /// Structure of the law text | ||||||
|     classifiers: Vec<Classifier>, |     classifiers: Vec<Classifier>, | ||||||
|  |  | ||||||
| @@ -127,8 +134,7 @@ pub struct LawBuilder { | |||||||
|  |  | ||||||
| impl PartialEq for LawBuilder { | impl PartialEq for LawBuilder { | ||||||
|     fn eq(&self, other: &Self) -> bool { |     fn eq(&self, other: &Self) -> bool { | ||||||
|         self.name == other.name |         self.classifiers == other.classifiers | ||||||
|             && self.classifiers == other.classifiers |  | ||||||
|             && self.header == other.header |             && self.header == other.header | ||||||
|             && self.next_para_header == other.next_para_header |             && self.next_para_header == other.next_para_header | ||||||
|     } |     } | ||||||
| @@ -136,9 +142,8 @@ impl PartialEq for LawBuilder { | |||||||
|  |  | ||||||
| impl LawBuilder { | impl LawBuilder { | ||||||
|     /// Creates a new law builder. Adds classifier for known law texts. |     /// Creates a new law builder. Adds classifier for known law texts. | ||||||
|     pub fn new(name: &str) -> Self { |     pub fn new() -> Self { | ||||||
|         Self { |         Self { | ||||||
|             name: name.into(), |  | ||||||
|             classifiers: Vec::new(), |             classifiers: Vec::new(), | ||||||
|             header: Vec::new(), |             header: Vec::new(), | ||||||
|             next_para_header: None, |             next_para_header: None, | ||||||
| @@ -357,7 +362,7 @@ impl From<&str> for ClassifierInstance { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
/// Shared predicate over two string slices, handed to `Classifier::new`.
// NOTE(review): the meaning of the two `&str` arguments is not visible here -
// confirm against the predicates in `responsible`.
pub(crate) type ClassifierApplicable = Arc<dyn Fn(&str, &str) -> bool>;
|  |  | ||||||
| #[derive(Clone)] | #[derive(Clone)] | ||||||
| pub struct Classifier { | pub struct Classifier { | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | pub mod config; | ||||||
| pub mod law; | pub mod law; | ||||||
| pub mod misc; | pub mod misc; | ||||||
| pub mod risparser; | pub mod risparser; | ||||||
|   | |||||||
| @@ -1,8 +1,11 @@ | |||||||
| use risp::law::{Law, LawBuilder}; | use risp::law::Law; | ||||||
|  |  | ||||||
| fn main() { | fn main() { | ||||||
|     env_logger::init(); |     env_logger::init(); | ||||||
|  |  | ||||||
|     let law: Law = LawBuilder::new("StGB").into(); |     let config_path = "./data/config/mschg.toml"; | ||||||
|  |  | ||||||
|  |     let law = Law::from_config(config_path).unwrap(); | ||||||
|  |  | ||||||
|     law.to_md(); |     law.to_md(); | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										14
									
								
								src/misc.rs
									
									
									
									
									
								
							
							
						
						
									
										14
									
								
								src/misc.rs
									
									
									
									
									
								
							| @@ -8,6 +8,12 @@ pub struct Error { | |||||||
|     msg: String, |     msg: String, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl Error { | ||||||
|  |     pub fn new(msg: &str) -> Self { | ||||||
|  |         Self { msg: msg.into() } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl From<ureq::Error> for Error { | impl From<ureq::Error> for Error { | ||||||
|     fn from(value: ureq::Error) -> Self { |     fn from(value: ureq::Error) -> Self { | ||||||
|         Self { |         Self { | ||||||
| @@ -29,6 +35,14 @@ impl From<serde_json::Error> for Error { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | impl From<toml::de::Error> for Error { | ||||||
|  |     fn from(value: toml::de::Error) -> Self { | ||||||
|  |         Self { | ||||||
|  |             msg: value.to_string(), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| impl From<roxmltree::Error> for Error { | impl From<roxmltree::Error> for Error { | ||||||
|     fn from(value: roxmltree::Error) -> Self { |     fn from(value: roxmltree::Error) -> Self { | ||||||
|         Self { |         Self { | ||||||
|   | |||||||
| @@ -172,7 +172,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn teg() { |     fn teg() { | ||||||
|         let law_id = "10001905"; |         let law_id = "10001905"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); | ||||||
|  |  | ||||||
|         let mut parser = Parser::new(); |         let mut parser = Parser::new(); | ||||||
| @@ -190,7 +190,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn mschg() { |     fn mschg() { | ||||||
|         let law_id = "10002180"; |         let law_id = "10002180"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); | ||||||
|         builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); |         builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); | ||||||
|  |  | ||||||
| @@ -213,7 +213,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn stgb() { |     fn stgb() { | ||||||
|         let law_id = "10002296"; |         let law_id = "10002296"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Teil", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Teil", Arc::new(&contains)).root()); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); | ||||||
|  |  | ||||||
| @@ -259,7 +259,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn kschg() { |     fn kschg() { | ||||||
|         let law_id = "10002462"; |         let law_id = "10002462"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); | ||||||
|  |  | ||||||
| @@ -273,7 +273,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn vvg() { |     fn vvg() { | ||||||
|         let law_id = "20011654"; |         let law_id = "20011654"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains)).root()); | ||||||
|  |  | ||||||
|         let parser = Parser::new(); |         let parser = Parser::new(); | ||||||
| @@ -283,7 +283,7 @@ mod tests { | |||||||
|     #[test] |     #[test] | ||||||
|     fn urhg() { |     fn urhg() { | ||||||
|         let law_id = "10001848"; |         let law_id = "10001848"; | ||||||
|         let mut builder = LawBuilder::new("law"); |         let mut builder = LawBuilder::new(); | ||||||
|         builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); |         builder.add_classifier(Classifier::new("Hauptstück", Arc::new(&contains)).root()); | ||||||
|         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); |         builder.add_classifier(Classifier::new("Abschnitt", Arc::new(&contains))); | ||||||
|         builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); |         builder.add_classifier(Classifier::new("Number", Arc::new(&starts_with_number))); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user