add hacks for abgb

This commit is contained in:
philipp 2023-11-07 09:51:08 +01:00
parent 27a1c2c2d6
commit f5d9d35dd7
6 changed files with 245 additions and 3 deletions

159
Cargo.lock generated
View File

@ -17,12 +17,30 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "anyhow"
version = "1.0.75"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.21.5" version = "0.21.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]] [[package]]
name = "bitflags" name = "bitflags"
version = "2.4.1" version = "2.4.1"
@ -53,6 +71,31 @@ dependencies = [
"cfg-if", "cfg-if",
] ]
[[package]]
name = "crossterm"
version = "0.25.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e64e6c0fbe2c17357405f7c758c1ef960fce08bdfb2c03d88d2a18d7e09c4b67"
dependencies = [
"bitflags 1.3.2",
"crossterm_winapi",
"libc",
"mio",
"parking_lot",
"signal-hook",
"signal-hook-mio",
"winapi",
]
[[package]]
name = "crossterm_winapi"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acdd7c62a3665c7f6830a51635d9ac9b23ed385797f70a83bb8bafe9c572ab2b"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "deranged" name = "deranged"
version = "0.3.9" version = "0.3.9"
@ -160,6 +203,12 @@ version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38"
[[package]]
name = "lazy_static"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
[[package]] [[package]]
name = "libc" name = "libc"
version = "0.2.149" version = "0.2.149"
@ -172,6 +221,16 @@ version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]]
name = "lock_api"
version = "0.4.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.20" version = "0.4.20"
@ -193,12 +252,47 @@ dependencies = [
"adler", "adler",
] ]
[[package]]
name = "mio"
version = "0.8.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0"
dependencies = [
"libc",
"log",
"wasi",
"windows-sys",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.18.0" version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d"
[[package]]
name = "parking_lot"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-targets",
]
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.0" version = "2.3.0"
@ -239,6 +333,15 @@ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]] [[package]]
name = "regex" name = "regex"
version = "1.10.2" version = "1.10.2"
@ -293,6 +396,7 @@ dependencies = [
"serde", "serde",
"serde_json", "serde_json",
"time", "time",
"tqdm",
"ureq", "ureq",
] ]
@ -311,7 +415,7 @@ version = "0.38.21"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
dependencies = [ dependencies = [
"bitflags", "bitflags 2.4.1",
"errno", "errno",
"libc", "libc",
"linux-raw-sys", "linux-raw-sys",
@ -346,6 +450,12 @@ version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741"
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]] [[package]]
name = "sct" name = "sct"
version = "0.7.1" version = "0.7.1"
@ -387,6 +497,42 @@ dependencies = [
"serde", "serde",
] ]
[[package]]
name = "signal-hook"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801"
dependencies = [
"libc",
"signal-hook-registry",
]
[[package]]
name = "signal-hook-mio"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "29ad2e15f37ec9a6cc544097b78a1ec90001e9f71b81338ca39f430adaca99af"
dependencies = [
"libc",
"mio",
"signal-hook",
]
[[package]]
name = "signal-hook-registry"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1"
dependencies = [
"libc",
]
[[package]]
name = "smallvec"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a"
[[package]] [[package]]
name = "spin" name = "spin"
version = "0.9.8" version = "0.9.8"
@ -457,6 +603,17 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tqdm"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c9e4aea46830eb68bbb272f637baa91318b94aee47d172c68e4c43495fd521f"
dependencies = [
"anyhow",
"crossterm",
"lazy_static",
]
[[package]] [[package]]
name = "unicode-bidi" name = "unicode-bidi"
version = "0.3.13" version = "0.3.13"

View File

@ -13,6 +13,7 @@ serde_json = "1.0"
roxmltree = "0.18" roxmltree = "0.18"
env_logger = "0.10" env_logger = "0.10"
log = "0.4" log = "0.4"
tqdm = "0.6"
[dev-dependencies] [dev-dependencies]
pretty_assertions = "1.4" pretty_assertions = "1.4"

View File

@ -108,6 +108,23 @@ pub(crate) fn contains(classifier_name: &str, instance_name: &str) -> bool {
.contains(&classifier_name.to_lowercase()) .contains(&classifier_name.to_lowercase())
} }
fn starts_with_roman_number(_: &str, s: &str) -> bool {
// Define the prefixes for Roman numerals.
let roman_prefixes = [
"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV",
"XV", "XVI", "XVII", "XVIII", "XIX", "XX",
];
// Check if the string starts with one of the Roman numeral prefixes followed by a period.
roman_prefixes
.iter()
.any(|&prefix| s.starts_with(&(prefix.to_string() + ".")))
}
fn contains_at_start(_classifier_name: &str, instance_name: &str) -> bool {
!instance_name.is_empty() && instance_name.starts_with('@')
}
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool { fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
matches!(instance_name.trim().as_bytes().first(), Some(c) if c.is_ascii_digit()) matches!(instance_name.trim().as_bytes().first(), Some(c) if c.is_ascii_digit())
} }
@ -195,6 +212,20 @@ impl LawBuilder {
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)).root()); classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number))); classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number)));
} else if name == "ABGB" {
law_id = Some(10001622);
classifiers.push(Classifier::new("Einleitung", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Theil", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)));
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
classifiers.push(Classifier::new("Abtheilung", Arc::new(&contains)));
classifiers.push(Classifier::new("heading", Arc::new(&contains_at_start)));
classifiers.push(Classifier::new("letter", Arc::new(&starts_with_letter)));
classifiers.push(Classifier::new("num", Arc::new(&starts_with_number)));
classifiers.push(Classifier::new("rom", Arc::new(&starts_with_roman_number)));
} }
let mut builder = Self { let mut builder = Self {
@ -209,7 +240,7 @@ impl LawBuilder {
let paragraphs = overview::parse(law_id.unwrap()).unwrap(); let paragraphs = overview::parse(law_id.unwrap()).unwrap();
for paragraph in paragraphs { for paragraph in tqdm::tqdm(paragraphs.into_iter()) {
let cont = par::parse(&paragraph, &mut builder).unwrap(); let cont = par::parse(&paragraph, &mut builder).unwrap();
if !cont { if !cont {
break; break;
@ -332,6 +363,11 @@ impl LawBuilder {
pub(crate) fn new_next_para_header(&mut self, header: &str) { pub(crate) fn new_next_para_header(&mut self, header: &str) {
#[cfg(test)] #[cfg(test)]
self.history.push(format!("New_new_para_header: {header}")); self.history.push(format!("New_new_para_header: {header}"));
if let Some(next_para_header) = &self.next_para_header {
self.new_header(&next_para_header.clone()); // promote to bigger header :-)
}
debug!("new_next_para_header={header}"); debug!("new_next_para_header={header}");
self.next_para_header = Some(header.trim().into()); self.next_para_header = Some(header.trim().into());
} }

View File

@ -42,7 +42,7 @@ impl From<roxmltree::Error> for Error {
fn main() { fn main() {
env_logger::init(); env_logger::init();
let law = LawBuilder::new("MSchG"); let law = LawBuilder::new("ABGB");
law.to_md(); law.to_md();
} }

View File

@ -28,11 +28,50 @@ pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result<bool
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#, r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
); // 1. Verwertungsrechte. before § 14 ); // 1. Verwertungsrechte. before § 14
let xml = xml.replace(
r#"<ueberschrift typ="g1" ct="text" halign="c">Medizinische Behandlung</ueberschrift>"#,
r#"<ueberschrift typ="art" ct="text" halign="c">Medizinische Behandlung</ueberschrift>"#,
); // 1. Verwertungsrechte. before § 14
let xml = xml.replace( let xml = xml.replace(
r#"<ueberschrift typ="para" ct="text" halign="c">4b. Presseveröffentlichungen.</ueberschrift>"#, r#"<ueberschrift typ="para" ct="text" halign="c">4b. Presseveröffentlichungen.</ueberschrift>"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">4b. Presseveröffentlichungen.</ueberschrift>"#, r#"<ueberschrift typ="g1" ct="text" halign="c">4b. Presseveröffentlichungen.</ueberschrift>"#,
); // § 99d UrhG, Titel kein Para.... //TODO: not working ); // § 99d UrhG, Titel kein Para.... //TODO: not working
let xml = xml.replace(
r#"<ueberschrift typ="para" ct="text" halign="c">Erfordernisse zur Ersitzung:</ueberschrift>"#,
r#"<ueberschrift typ="art" ct="text" halign="c">Erfordernisse zur Ersitzung:</ueberschrift>"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="art" ct="text" halign="c">"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">@"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="g1" ct="text" halign="c">Zweyter Theil</ueberschrift><ueberschrift typ="g1" ct="text" halign="c">des</ueberschrift><ueberschrift typ="g1" ct="text" halign="c">bürgerlichen Gesetzbuches.</ueberschrift><ueberschrift typ="g1" ct="text" halign="c">Von dem Sachenrechte.</ueberschrift>"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">Zweyter Theil des bürgerlichen Gesetzbuches. Von dem Sachenrechte.</ueberschrift>"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="g1" ct="text" halign="c">Dritter Theil</ueberschrift><ueberschrift typ="g1min" ct="text" halign="c">des</ueberschrift><ueberschrift typ="g1min" ct="text" halign="c">bürgerlichen Gesetzbuches.</ueberschrift>"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">Dritter Theil des bürgerlichen Gesetzbuches.</ueberschrift>"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="g1" ct="text" halign="c">Von den gemeinschaftlichen Bestimmungen der Personen- und Sachenrechte.</ueberschrift>"#,
r#"<ueberschrift typ="g2" ct="text" halign="c">Von den gemeinschaftlichen Bestimmungen der Personen- und Sachenrechte.</ueberschrift>"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="g1" ct="text" halign="c">Erste Abtheilung</ueberschrift><ueberschrift typ="g2" ct="text" halign="c">des Sachenrechtes.</ueberschrift>"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">Erste Abtheilung des Sachenrechtes.</ueberschrift>"#,
);
let xml = xml.replace(
r#"<ueberschrift typ="g1min" ct="text" halign="c">Von den dinglichen Rechten.</ueberschrift>"#,
r#"<ueberschrift typ="g2" ct="text" halign="c">Von den dinglichen Rechten.</ueberschrift>"#,
);
let xml = xml.replace("<super>", ""); // e.g. § 23a in MSchG let xml = xml.replace("<super>", ""); // e.g. § 23a in MSchG
let xml = xml.replace("</super>", ""); // e.g. § 23a in MSchG let xml = xml.replace("</super>", ""); // e.g. § 23a in MSchG
let xml = xml.replace("<i>", ""); // § 69 in MSchG let xml = xml.replace("<i>", ""); // § 69 in MSchG
@ -44,6 +83,8 @@ pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result<bool
let xml = xml.replace("<b>", ""); // § 14 in FSG let xml = xml.replace("<b>", ""); // § 14 in FSG
let xml = xml.replace("</b>", ""); let xml = xml.replace("</b>", "");
let xml = xml.replace("<tab />", "");
let xml = xml.replace( let xml = xml.replace(
r#"(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#, r#"(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#,
r#"<gldsym>§ 69.</gldsym>(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#, r#"<gldsym>§ 69.</gldsym>(Anm.: § 69 aufgehoben durch Art. 1 Z 12, BGBl. I Nr. 124/2017)"#,

View File

@ -106,6 +106,13 @@ impl Abschnitt {
} }
} }
if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") {
builder
.new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
}
}
// e.g. § 405 abgb has two para (of diseased paragraph)
if let Some(child) = c.peek() { if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") { if Ueberschrift::test(child, "para") {
builder builder