From f97dd7bde20dfee1d6cd71da672ec5689460e97e Mon Sep 17 00:00:00 2001 From: philipp Date: Thu, 15 Feb 2024 18:37:08 +0100 Subject: [PATCH] start draft of assert struct (with nice debug msgs) + more structure @ par parser --- src/paragraph/mod.rs | 1 + src/paragraph/parser/abschnitt.rs | 254 +++++++++++++++++++++ src/paragraph/{parser.rs => parser/mod.rs} | 240 +++---------------- 3 files changed, 284 insertions(+), 211 deletions(-) create mode 100644 src/paragraph/parser/abschnitt.rs rename src/paragraph/{parser.rs => parser/mod.rs} (55%) diff --git a/src/paragraph/mod.rs b/src/paragraph/mod.rs index 4b3b117..e47b4ad 100644 --- a/src/paragraph/mod.rs +++ b/src/paragraph/mod.rs @@ -15,6 +15,7 @@ // limitations under the Licence. //! Deals with getting all paragraphs for a given law text + mod parser; use regex::Regex; diff --git a/src/paragraph/parser/abschnitt.rs b/src/paragraph/parser/abschnitt.rs new file mode 100644 index 0000000..6691797 --- /dev/null +++ b/src/paragraph/parser/abschnitt.rs @@ -0,0 +1,254 @@ +use std::collections::HashMap; +use std::iter::Peekable; + +use roxmltree::{Children, Node}; + +use crate::law::LawBuilder; +use crate::paragraph::parser::{ + Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift, +}; + +#[derive(Debug, PartialEq)] +pub(crate) struct Abschnitt { + metadata: HashMap, + pub(crate) cont: bool, +} + +impl Default for Abschnitt { + fn default() -> Self { + Self { + metadata: HashMap::new(), + cont: false, + } + } +} + +impl Abschnitt { + pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt { + assert!(n.tag_name().name() == "abschnitt"); + + let mut ret = Abschnitt::default(); + let mut c = n.children().peekable(); + + Self::skip_static_fields(&mut c); + + if !ret.handle_headers(&mut c, builder) { + return ret; + } + + while let Some(child) = c.peek() { + // Shciffahrtsgesetz: stop @ anlagen (for now) + if Ueberschrift::test(child, "anlage") { + return ret; + } + if Ueberschrift::test(child, "g1") { + let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1"); + if ueberschrift.content.trim().starts_with("Artikel") { + return ret; + } + builder.new_header(&ueberschrift.content); + } else if Ueberschrift::test(child, "g2") { + let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2"); + builder.new_desc(&ueberschrift.content); + } else if Ueberschrift::test(child, "g1min") { + let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min"); + builder.new_header(&ueberschrift.content); + } else if Ueberschrift::test(child, "art") { + let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art"); + if ueberschrift.content.trim().starts_with("Artikel") { + return ret; + } + } else { + break; + } + } + + if let Some(child) = c.peek() { + if Ueberschrift::test(child, "para") { + builder + .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); + } + } + // e.g. § 405 abgb has two para (of diseased paragraph) + if let Some(child) = c.peek() { + if Ueberschrift::test(child, "para") { + builder + .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); + } + } + + // We have 2 tasks + // 1) Get paragraph id + // 2) Get content + + let mut absatze = Vec::new(); + let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'")); + let par_id = absatz + .gldsym + .clone() + .expect("First 'Absatz' needs to have § id"); + + // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" + if let Some(child) = c.peek() { + if Liste::test(child) { + let liste = Liste::parse(c.next().unwrap()); + let mut to_add = vec![Content::Text(absatz.content), liste.get_content()]; + if let Some(subchild) = c.peek() { + if Absatz::test_with_typ(subchild, "satz") { + // After a 'liste' there can be a ', builder: &mut LawBuilder) -> bool { + while let Some(child) = &c.peek() { + if Ueberschrift::test(child, "titel") { + let key = Ueberschrift::parse(c.next().unwrap(), "titel").content; + + // We are done with meta-data parsing + if key == "Text" { + break; + } + + let absatz = Absatz::parse( + c.next() + .expect("Expected absatz after title in par headers"), + ); + if &absatz.typ != "erltext" { + panic!( + "Expected erlext absatz after title in par headers, got '{}'", + absatz.typ + ); + } + let value = absatz.content; + + // We want ot use this information in our markdown output. + // TODO: Use all metadata, instead of this specific call + if key == "Beachte" { + builder.add_next_para_note(value.clone()); + } + + self.metadata.insert(key, value); + continue; + } + + panic!("Something unforeseen happened") + } + true + } + + // At the beginning of each 'Abschnitt' there are 4 static fields. Since they don't provide any + // value, we skip them with this function. + // If they are not there, we panic (unwrap), as we should take a look why they changed that. + fn skip_static_fields(node: &mut Peekable) { + Kzinhalt::parse(node.next().unwrap()); // "Bundesrecht konsolidiert" + Kzinhalt::parse(node.next().unwrap()); // "Bundesrecht konsolidiert" + Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" + Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" + } +} diff --git a/src/paragraph/parser.rs b/src/paragraph/parser/mod.rs similarity index 55% rename from src/paragraph/parser.rs rename to src/paragraph/parser/mod.rs index bdfcda2..01b27de 100644 --- a/src/paragraph/parser.rs +++ b/src/paragraph/parser/mod.rs @@ -14,6 +14,9 @@ // See the Licence for the specific language governing permissions and // limitations under the Licence. +mod abschnitt; + +use abschnitt::Abschnitt; use roxmltree::Node; use crate::{ @@ -21,6 +24,29 @@ use crate::{ misc::Error, }; +struct Expect<'a> { + node: &'a Node<'a, 'a>, +} + +impl<'a> From<&'a Node<'a, 'a>> for Expect<'a> { + fn from(node: &'a Node<'a, 'a>) -> Self { + Expect { node } + } +} + +impl<'a> Expect<'a> { + fn tag(&self, value: &str) { + if self.node.tag_name().name() != value { + panic!( + "Expected tag '{value}', got {} (tag: {}, content: {:?})", + self.node.tag_name().name(), + self.node.tag_name().name(), + self.node.text(), + ); + } + } +} + #[derive(Debug, PartialEq)] pub(crate) struct Risdok {} @@ -74,212 +100,7 @@ impl Nutzdaten { assert_eq!(c.next(), None); - ret - } -} - -#[derive(Debug, PartialEq)] -pub(crate) struct Abschnitt; -impl Abschnitt { - pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool { - assert!(n.tag_name().name() == "abschnitt"); - - let mut c = n.children().peekable(); - - Kzinhalt::parse(c.next().unwrap()); - Kzinhalt::parse(c.next().unwrap()); - Fzinhalt::parse(c.next().unwrap()); - Fzinhalt::parse(c.next().unwrap()); - - // Skip all UeberschriftTitle and Absatz - while let Some(child) = &c.peek() { - if Ueberschrift::test_with_typ_and_content(child, "titel", "Beachte") { - c.next(); - let absatz = Absatz::parse( - c.next() - .expect("After a 'Beachte' title, we need an Absatz"), - ); - if absatz.typ != *"erltext" { - panic!("Expected erltext absatz after 'Beachte'"); - } - - builder.add_next_para_note(absatz.content); - - continue; - } - - // Stop parsing if we reached "Anlagen" (e.g. Schifffahrtsgesetz) - if Ueberschrift::test(child, "anlage") { - return false; - } - if Ueberschrift::test(child, "titel") { - c.next(); - continue; - } - if Absatz::test_with_typ(child, "erltext") { - c.next(); - continue; - } - break; - } - - while let Some(child) = c.peek() { - if Ueberschrift::test(child, "g1") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1"); - if ueberschrift.content.trim().starts_with("Artikel") { - return false; - } - builder.new_header(&ueberschrift.content); - } else if Ueberschrift::test(child, "g2") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2"); - builder.new_desc(&ueberschrift.content); - } else if Ueberschrift::test(child, "g1min") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min"); - builder.new_header(&ueberschrift.content); - } else if Ueberschrift::test(child, "art") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art"); - if ueberschrift.content.trim().starts_with("Artikel") { - return false; - } - } else { - break; - } - } - - if let Some(child) = c.peek() { - if Ueberschrift::test(child, "para") { - builder - .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); - } - } - // e.g. § 405 abgb has two para (of diseased paragraph) - if let Some(child) = c.peek() { - if Ueberschrift::test(child, "para") { - builder - .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); - } - } - - // We have 2 tasks - // 1) Get paragraph id - // 2) Get content - - let mut absatze = Vec::new(); - let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'")); - let par_id = absatz - .gldsym - .clone() - .expect("First 'Absatz' needs to have § id"); - - // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" - if let Some(child) = c.peek() { - if Liste::test(child) { - let liste = Liste::parse(c.next().unwrap()); - let mut to_add = vec![Content::Text(absatz.content), liste.get_content()]; - if let Some(subchild) = c.peek() { - if Absatz::test_with_typ(subchild, "satz") { - // After a 'liste' there can be a ' bool { - n.tag_name().name() == "absatz" - } pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool { - Self::test(n) && n.attribute("typ") == Some(typ) + n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ) } pub(crate) fn parse(n: Node) -> Self { - assert!(Self::test(&n)); + Expect::from(&n).tag("absatz"); if let Some(text) = n.text() { Self {