From f5d9d35dd76e7153e1d6d95ab24a42c020fed207 Mon Sep 17 00:00:00 2001 From: philipp Date: Tue, 7 Nov 2023 09:51:08 +0100 Subject: [PATCH] add hacks for abgb --- Cargo.lock | 159 +++++++++++++++++++++++++++++++++++++++++++++- Cargo.toml | 1 + src/law.rs | 38 ++++++++++- src/main.rs | 2 +- src/par/mod.rs | 41 ++++++++++++ src/par/parser.rs | 7 ++ 6 files changed, 245 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a2d88ef..f2c7c56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,30 @@ dependencies = [ "memchr", ] +[[package]] +name = "anyhow" +version = "1.0.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6" + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + [[package]] name = "base64" version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + [[package]] name = "bitflags" version = "2.4.1" @@ -53,6 +71,31 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "crossterm" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67" +dependencies = [ + "bitflags 1.3.2", + "crossterm_winapi", + "libc", + "mio", + "parking_lot", + "signal-hook", + "signal-hook-mio", + "winapi", +] + +[[package]] +name = "crossterm_winapi" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b" +dependencies = [ + "winapi", +] + [[package]] name = "deranged" version = "0.3.9" @@ -160,6 +203,12 @@ version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + [[package]] name = "libc" version = "0.2.149" @@ -172,6 +221,16 @@ version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + [[package]] name = "log" version = "0.4.20" @@ -193,12 +252,47 @@ dependencies = [ "adler", ] +[[package]] +name = "mio" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys", +] + [[package]] name = "once_cell" version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + [[package]] name = "percent-encoding" version = "2.3.0" @@ -239,6 +333,15 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "regex" version = "1.10.2" @@ -293,6 +396,7 @@ dependencies = [ "serde", "serde_json", "time", + "tqdm", "ureq", ] @@ -311,7 +415,7 @@ version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ - "bitflags", + "bitflags 2.4.1", "errno", "libc", "linux-raw-sys", @@ -346,6 +450,12 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "sct" version = "0.7.1" @@ -387,6 +497,42 @@ dependencies = [ "serde", ] +[[package]] +name = "signal-hook" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801" +dependencies = [ + "libc", + "signal-hook-registry", +] + +[[package]] +name = "signal-hook-mio" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af" +dependencies = [ + "libc", + "mio", + "signal-hook", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "smallvec" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" + [[package]] name = "spin" version = "0.9.8" @@ -457,6 +603,17 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" +[[package]] +name = "tqdm" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c9e4aea46830eb68bbb272f637baa91318b94aee47d172c68e4c43495fd521f" +dependencies = [ + "anyhow", + "crossterm", + "lazy_static", +] + [[package]] name = "unicode-bidi" version = "0.3.13" diff --git a/Cargo.toml b/Cargo.toml index 90acbb2..4db3db1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,6 +13,7 @@ serde_json = "1.0" roxmltree = "0.18" env_logger = "0.10" log = "0.4" +tqdm = "0.6" [dev-dependencies] pretty_assertions = "1.4" diff --git a/src/law.rs b/src/law.rs index 551fd55..3c19271 100644 --- a/src/law.rs +++ b/src/law.rs @@ -108,6 +108,23 @@ pub(crate) fn contains(classifier_name: &str, instance_name: &str) -> bool { .contains(&classifier_name.to_lowercase()) } +fn starts_with_roman_number(_: &str, s: &str) -> bool { + // Define the prefixes for Roman numerals. + let roman_prefixes = [ + "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV", + "XV", "XVI", "XVII", "XVIII", "XIX", "XX", + ]; + + // Check if the string starts with one of the Roman numeral prefixes followed by a period. + roman_prefixes + .iter() + .any(|&prefix| s.starts_with(&(prefix.to_string() + "."))) +} + +fn contains_at_start(_classifier_name: &str, instance_name: &str) -> bool { + !instance_name.is_empty() && instance_name.starts_with('@') +} + fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool { matches!(instance_name.trim().as_bytes().first(), Some(c) if c.is_ascii_digit()) } @@ -195,6 +212,20 @@ impl LawBuilder { classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)).root()); classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number))); + } else if name == "ABGB" { + law_id = Some(10001622); + + classifiers.push(Classifier::new("Einleitung", Arc::new(&contains)).root()); + classifiers.push(Classifier::new("Theil", Arc::new(&contains)).root()); + + classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains))); + classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains))); + classifiers.push(Classifier::new("Abtheilung", Arc::new(&contains))); + + classifiers.push(Classifier::new("heading", Arc::new(&contains_at_start))); + classifiers.push(Classifier::new("letter", Arc::new(&starts_with_letter))); + classifiers.push(Classifier::new("num", Arc::new(&starts_with_number))); + classifiers.push(Classifier::new("rom", Arc::new(&starts_with_roman_number))); } let mut builder = Self { @@ -209,7 +240,7 @@ impl LawBuilder { let paragraphs = overview::parse(law_id.unwrap()).unwrap(); - for paragraph in paragraphs { + for paragraph in tqdm::tqdm(paragraphs.into_iter()) { let cont = par::parse(¶graph, &mut builder).unwrap(); if !cont { break; @@ -332,6 +363,11 @@ impl LawBuilder { pub(crate) fn new_next_para_header(&mut self, header: &str) { #[cfg(test)] self.history.push(format!("New_new_para_header: {header}")); + + if let Some(next_para_header) = &self.next_para_header { + self.new_header(&next_para_header.clone()); // promote to bigger header :-) + } + debug!("new_next_para_header={header}"); self.next_para_header = Some(header.trim().into()); } diff --git a/src/main.rs b/src/main.rs index 2543c25..42535ad 100644 --- a/src/main.rs +++ b/src/main.rs @@ -42,7 +42,7 @@ impl From for Error { fn main() { env_logger::init(); - let law = LawBuilder::new("MSchG"); + let law = LawBuilder::new("ABGB"); law.to_md(); } diff --git a/src/par/mod.rs b/src/par/mod.rs index 30976b1..69c057b 100644 --- a/src/par/mod.rs +++ b/src/par/mod.rs @@ -28,11 +28,50 @@ pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result1. Verwertungsrechte."#, ); // 1. Verwertungsrechte. before § 14 + let xml = xml.replace( + r#"Medizinische Behandlung"#, + r#"Medizinische Behandlung"#, + ); // 1. Verwertungsrechte. before § 14 + let xml = xml.replace( r#"4b. Presseveröffentlichungen."#, r#"4b. Presseveröffentlichungen."#, ); // § 99d UrhG, Titel kein Para.... //TODO: not working + let xml = xml.replace( + r#"Erfordernisse zur Ersitzung:"#, + r#"Erfordernisse zur Ersitzung:"#, + ); + + let xml = xml.replace( + r#""#, + r#"@"#, + ); + + let xml = xml.replace( + r#"Zweyter Theildesbürgerlichen Gesetzbuches.Von dem Sachenrechte."#, + r#"Zweyter Theil des bürgerlichen Gesetzbuches. Von dem Sachenrechte."#, + ); + + let xml = xml.replace( + r#"Dritter Theildesbürgerlichen Gesetzbuches."#, + r#"Dritter Theil des bürgerlichen Gesetzbuches."#, + ); + let xml = xml.replace( + r#"Von den gemeinschaftlichen Bestimmungen der Personen- und Sachenrechte."#, + r#"Von den gemeinschaftlichen Bestimmungen der Personen- und Sachenrechte."#, + ); + + let xml = xml.replace( + r#"Erste Abtheilungdes Sachenrechtes."#, + r#"Erste Abtheilung des Sachenrechtes."#, + ); + + let xml = xml.replace( + r#"Von den dinglichen Rechten."#, + r#"Von den dinglichen Rechten."#, + ); + let xml = xml.replace("", ""); // e.g. § 23a in MSchG let xml = xml.replace("", ""); // e.g. § 23a in MSchG let xml = xml.replace("", ""); // § 69 in MSchG @@ -44,6 +83,8 @@ pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result", ""); // § 14 in FSG let xml = xml.replace("", ""); + let xml = xml.replace("", ""); + let xml = xml.replace( r#"(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#, r#"§ 69.(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#, diff --git a/src/par/parser.rs b/src/par/parser.rs index a3d7622..9923140 100644 --- a/src/par/parser.rs +++ b/src/par/parser.rs @@ -106,6 +106,13 @@ impl Abschnitt { } } + if let Some(child) = c.peek() { + if Ueberschrift::test(child, "para") { + builder + .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); + } + } + // e.g. § 405 abgb has two para (of diseased paragraph) if let Some(child) = c.peek() { if Ueberschrift::test(child, "para") { builder