start draft of assert struct (with nice debug msgs) + more structure @ par parser

2024-02-15 18:37:08 +01:00
parent 3edbe11200
commit f97dd7bde2
3 changed files with 284 additions and 211 deletions
@@ -15,6 +15,7 @@
 // limitations under the Licence.

 //! Deals with getting all paragraphs for a given law text
+
 mod parser;

 use regex::Regex;
@@ -0,0 +1,254 @@
+use std::collections::HashMap;
+use std::iter::Peekable;
+
+use roxmltree::{Children, Node};
+
+use crate::law::LawBuilder;
+use crate::paragraph::parser::{
+    Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift,
+};
+
+/// Result of parsing one `<abschnitt>` node.
+///
+/// The derived `Default` yields an empty metadata map and `cont == false`.
+#[derive(Debug, Default, PartialEq)]
+pub(crate) struct Abschnitt {
+    /// Key/value metadata collected from the title/`erltext` pairs at the top of the section.
+    metadata: HashMap<String, String>,
+    /// Set to `true` only when [`Abschnitt::parse`] ran to completion; every early return
+    /// leaves it `false`.
+    pub(crate) cont: bool,
+}
+
+impl Abschnitt {
+    /// Parses a single `<abschnitt>` node and feeds its headers and paragraph into `builder`.
+    ///
+    /// Returns the parsed section. Its `cont` flag is `true` only if the whole section was
+    /// consumed; it stays `false` when parsing stops early (header handling asked to stop, an
+    /// "anlage" heading was reached, or a "g1"/"art" heading starts with "Artikel").
+    ///
+    /// # Panics
+    /// Panics if `n` is not an `abschnitt` element, if the first 'Absatz' is missing or has no
+    /// § id, or if unexpected children remain after the paragraph.
+    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt {
+        assert!(n.tag_name().name() == "abschnitt");
+
+        let mut ret = Abschnitt::default();
+        let mut c = n.children().peekable();
+
+        Self::skip_static_fields(&mut c);
+
+        if !ret.handle_headers(&mut c, builder) {
+            return ret;
+        }
+
+        while let Some(child) = c.peek() {
+            // Schifffahrtsgesetz: stop at "Anlagen" (for now)
+            if Ueberschrift::test(child, "anlage") {
+                return ret;
+            }
+            if Ueberschrift::test(child, "g1") {
+                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
+                if ueberschrift.content.trim().starts_with("Artikel") {
+                    return ret;
+                }
+                builder.new_header(&ueberschrift.content);
+            } else if Ueberschrift::test(child, "g2") {
+                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
+                builder.new_desc(&ueberschrift.content);
+            } else if Ueberschrift::test(child, "g1min") {
+                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
+                builder.new_header(&ueberschrift.content);
+            } else if Ueberschrift::test(child, "art") {
+                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
+                if ueberschrift.content.trim().starts_with("Artikel") {
+                    return ret;
+                }
+            } else {
+                break;
+            }
+        }
+
+        if let Some(child) = c.peek() {
+            if Ueberschrift::test(child, "para") {
+                builder
+                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
+            }
+        }
+        // There can be a second "para" heading, e.g. § 405 ABGB has two
+        // (one apparently belonging to a repealed paragraph -- TODO confirm)
+        if let Some(child) = c.peek() {
+            if Ueberschrift::test(child, "para") {
+                builder
+                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
+            }
+        }
+
+        // We have 2 tasks
+        // 1) Get paragraph id
+        // 2) Get content
+
+        let mut absatze = Vec::new();
+        let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
+        let par_id = absatz
+            .gldsym
+            .clone()
+            .expect("First 'Absatz' needs to have § id");
+
+        // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz"
+        if let Some(child) = c.peek() {
+            if Liste::test(child) {
+                let liste = Liste::parse(c.next().unwrap());
+                let mut to_add = vec![Content::Text(absatz.content), liste.get_content()];
+                if let Some(subchild) = c.peek() {
+                    if Absatz::test_with_typ(subchild, "satz") {
+                        // After a 'liste' there can be a '<absatz typ="satz"' which should be
+                        // part of the list (e.g. 85 StGB)
+                        to_add.push(Content::Text(Absatz::parse(c.next().unwrap()).content));
+                    }
+                }
+                absatze.push(Content::List(to_add));
+            } else if Table::test(child) {
+                // If there's a "table" after an "absatz", the "table" should be part of the
+                // "absatz"
+                let table = Table::parse(c.next().unwrap());
+
+                // An optional 'erltext' absatz right after the table belongs to it as well
+                let mut after_table = None;
+                if let Some(next) = c.peek() {
+                    if Absatz::test_with_typ(next, "erltext") {
+                        after_table = Some(Absatz::parse(c.next().unwrap()).content);
+                    }
+                }
+
+                // Always record absatz + table, also when the table is the very last child
+                // (previously nothing was pushed in that case and the paragraph ended up empty)
+                let mut to_add = vec![
+                    Content::Text(absatz.content),
+                    Content::List(table.get_list()),
+                ];
+                if let Some(text) = after_table {
+                    to_add.push(Content::Text(text));
+                }
+                absatze.push(Content::List(to_add));
+            } else if Absatz::test_with_typ(child, "satz") {
+                // A '<absatz typ="satz"' directly after the first 'absatz' is grouped with it
+                // (e.g. 1209 ABGB)
+                absatze.push(Content::List(vec![
+                    Content::Text(absatz.content),
+                    Content::Text(Absatz::parse(c.next().unwrap()).content),
+                ]));
+            } else {
+                absatze.push(Content::Text(absatz.content));
+            }
+        } else {
+            absatze.push(Content::Text(absatz.content));
+        }
+
+        // There can be as many 'Absätze' as our lovely lawsetter wants
+        while let Some(child) = c.peek() {
+            if AbsatzAbs::test(child) {
+                let abs = AbsatzAbs::parse(c.next().unwrap());
+
+                // If there's a "liste" after an "absatz", the "liste" should be part of the
+                // "absatz"
+                if let Some(child) = c.peek() {
+                    if Liste::test(child) {
+                        let liste = Liste::parse(c.next().unwrap());
+                        let mut to_add = vec![Content::Text(abs.content), liste.get_content()];
+                        if let Some(subchild) = c.peek() {
+                            if Absatz::test_with_typ(subchild, "satz") {
+                                // After a 'liste' there can be a '<absatz typ="satz"' which
+                                // should be part of the list (e.g. 85 StGB)
+                                to_add
+                                    .push(Content::Text(Absatz::parse(c.next().unwrap()).content));
+                            }
+                        }
+                        absatze.push(Content::List(to_add));
+                    } else {
+                        // Trailing 'erltext' absätze are appended to the text of this absatz
+                        let mut content = abs.content;
+                        while let Some(subchild) = c.peek() {
+                            if Absatz::test_with_typ(subchild, "erltext") {
+                                content += &Absatz::parse(c.next().unwrap()).content;
+                            } else {
+                                break;
+                            }
+                        }
+                        absatze.push(Content::Text(content));
+                    }
+                } else {
+                    absatze.push(Content::Text(abs.content));
+                }
+                continue;
+            }
+            break;
+        }
+
+        // A single absatz is stored as-is, several are wrapped into one item. The collected
+        // contents are moved into the builder, no copies needed.
+        let content = match absatze.len() {
+            1 => absatze.remove(0),
+            _ => Content::Item(absatze),
+        };
+        builder.new_par(par_id, content);
+
+        // Skip all trailing "titel" headings and 'erltext' absätze
+        while let Some(child) = c.peek() {
+            if Ueberschrift::test(child, "titel") {
+                c.next();
+                continue;
+            }
+            if Absatz::test_with_typ(child, "erltext") {
+                c.next();
+                continue;
+            }
+            break;
+        }
+
+        // Anything left over is a structure we do not understand yet
+        assert_eq!(c.next(), None);
+
+        ret.cont = true;
+        ret
+    }
+
+    /// Consumes the paragraph-specific metadata found at the top of each XML file.
+    ///
+    /// Every "titel" heading is expected to be followed by an `erltext` absatz; the pair is
+    /// stored as key/value in `self.metadata`. A "Beachte" entry is additionally handed to
+    /// `builder` as a note for the next paragraph. The title "Text" marks the start of the real
+    /// content and ends metadata parsing.
+    ///
+    /// # Returns
+    /// `false` if the parsing should be stopped. As written, every non-panicking path returns
+    /// `true`.
+    ///
+    /// # Panics
+    /// Panics if a child that is not a "titel" heading shows up, or if a title is not followed
+    /// by an `erltext` absatz.
+    fn handle_headers(&mut self, c: &mut Peekable<Children>, builder: &mut LawBuilder) -> bool {
+        loop {
+            match c.peek() {
+                // No children left
+                None => return true,
+                Some(child) if Ueberschrift::test(child, "titel") => {}
+                Some(_) => panic!("Something unforeseen happened"),
+            }
+
+            let key = Ueberschrift::parse(c.next().unwrap(), "titel").content;
+
+            // "Text" introduces the real content: we are done with meta-data parsing
+            if key == "Text" {
+                return true;
+            }
+
+            let following = c
+                .next()
+                .expect("Expected absatz after title in par headers");
+            let absatz = Absatz::parse(following);
+            if &absatz.typ != "erltext" {
+                panic!(
+                    "Expected erlext absatz after title in par headers, got '{}'",
+                    absatz.typ
+                );
+            }
+            let value = absatz.content;
+
+            // We want to use this information in our markdown output.
+            // TODO: Use all metadata, instead of this specific call
+            if key == "Beachte" {
+                builder.add_next_para_note(value.clone());
+            }
+
+            self.metadata.insert(key, value);
+        }
+    }
+
+    /// Skips the four static fields at the beginning of each 'Abschnitt', since they don't
+    /// provide any value: two `Kzinhalt` entries ("Bundesrecht konsolidiert") followed by two
+    /// `Fzinhalt` entries ("www.ris.bka.gv.at" and "Seite X von Y").
+    ///
+    /// # Panics
+    /// Panics (via `unwrap`) if one of them is missing, as we should then take a look at why
+    /// the format changed.
+    fn skip_static_fields(node: &mut Peekable<Children>) {
+        for _ in 0..2 {
+            Kzinhalt::parse(node.next().unwrap());
+        }
+        for _ in 0..2 {
+            Fzinhalt::parse(node.next().unwrap());
+        }
+    }
+}
@@ -14,6 +14,9 @@
 // See the Licence for the specific language governing permissions and
 // limitations under the Licence.

+mod abschnitt;
+
+use abschnitt::Abschnitt;
 use roxmltree::Node;

 use crate::{
@@ -21,6 +24,29 @@ use crate::{
    misc::Error,
 };

+/// Assertion helper around an XML node that panics with a descriptive message when an
+/// expectation about the node does not hold.
+struct Expect<'a> {
+    node: &'a Node<'a, 'a>,
+}
+
+impl<'a> From<&'a Node<'a, 'a>> for Expect<'a> {
+    fn from(node: &'a Node<'a, 'a>) -> Self {
+        Expect { node }
+    }
+}
+
+impl<'a> Expect<'a> {
+    /// Asserts that the tag name of the wrapped node equals `value`.
+    ///
+    /// # Panics
+    /// Panics if the tag names differ; the message contains the expected tag, the actual tag
+    /// and the node's text.
+    fn tag(&self, value: &str) {
+        let actual = self.node.tag_name().name();
+        if actual != value {
+            // The actual tag is reported once (it used to be printed twice)
+            panic!(
+                "Expected tag '{value}', got '{actual}' (content: {:?})",
+                self.node.text(),
+            );
+        }
+    }
+}
+
 #[derive(Debug, PartialEq)]
 pub(crate) struct Risdok {}

@@ -74,212 +100,7 @@ impl Nutzdaten {

        assert_eq!(c.next(), None);

-        ret
-    }
-}
-
-#[derive(Debug, PartialEq)]
-pub(crate) struct Abschnitt;
-impl Abschnitt {
-    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool {
-        assert!(n.tag_name().name() == "abschnitt");
-
-        let mut c = n.children().peekable();
-
-        Kzinhalt::parse(c.next().unwrap());
-        Kzinhalt::parse(c.next().unwrap());
-        Fzinhalt::parse(c.next().unwrap());
-        Fzinhalt::parse(c.next().unwrap());
-
-        // Skip all UeberschriftTitle and Absatz
-        while let Some(child) = &c.peek() {
-            if Ueberschrift::test_with_typ_and_content(child, "titel", "Beachte") {
-                c.next();
-                let absatz = Absatz::parse(
-                    c.next()
-                        .expect("After a 'Beachte' title, we need an Absatz"),
-                );
-                if absatz.typ != *"erltext" {
-                    panic!("Expected erltext absatz after 'Beachte'");
-                }
-
-                builder.add_next_para_note(absatz.content);
-
-                continue;
-            }
-
-            // Stop parsing if we reached "Anlagen" (e.g. Schifffahrtsgesetz)
-            if Ueberschrift::test(child, "anlage") {
-                return false;
-            }
-            if Ueberschrift::test(child, "titel") {
-                c.next();
-                continue;
-            }
-            if Absatz::test_with_typ(child, "erltext") {
-                c.next();
-                continue;
-            }
-            break;
-        }
-
-        while let Some(child) = c.peek() {
-            if Ueberschrift::test(child, "g1") {
-                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
-                if ueberschrift.content.trim().starts_with("Artikel") {
-                    return false;
-                }
-                builder.new_header(&ueberschrift.content);
-            } else if Ueberschrift::test(child, "g2") {
-                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
-                builder.new_desc(&ueberschrift.content);
-            } else if Ueberschrift::test(child, "g1min") {
-                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
-                builder.new_header(&ueberschrift.content);
-            } else if Ueberschrift::test(child, "art") {
-                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
-                if ueberschrift.content.trim().starts_with("Artikel") {
-                    return false;
-                }
-            } else {
-                break;
-            }
-        }
-
-        if let Some(child) = c.peek() {
-            if Ueberschrift::test(child, "para") {
-                builder
-                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
-            }
-        }
-        // e.g. § 405 abgb has two para (of diseased paragraph)
-        if let Some(child) = c.peek() {
-            if Ueberschrift::test(child, "para") {
-                builder
-                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
-            }
-        }
-
-        // We have 2 tasks
-        // 1) Get paragraph id
-        // 2) Get content
-
-        let mut absatze = Vec::new();
-        let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
-        let par_id = absatz
-            .gldsym
-            .clone()
-            .expect("First 'Absatz' needs to have § id");
-
-        // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
-        if let Some(child) = c.peek() {
-            if Liste::test(child) {
-                let liste = Liste::parse(c.next().unwrap());
-                let mut to_add = vec![Content::Text(absatz.content), liste.get_content()];
-                if let Some(subchild) = c.peek() {
-                    if Absatz::test_with_typ(subchild, "satz") {
-                        // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
-                        // (e.g. 85 StGB)
-                        to_add.push(Content::Text(Absatz::parse(c.next().unwrap()).content));
-                    }
-                }
-                absatze.push(Content::List(to_add));
-            } else if Table::test(child) {
-                // If there's a  "table" after an "absatz", the "table" should be part of the "absatz"
-                let table = Table::parse(c.next().unwrap());
-                if let Some(child) = c.peek() {
-                    if Absatz::test_with_typ(child, "erltext") {
-                        let after_absatz = Absatz::parse(c.next().unwrap());
-                        absatze.push(Content::List(vec![
-                            Content::Text(absatz.content),
-                            Content::List(table.get_list()),
-                            Content::Text(after_absatz.content),
-                        ]));
-                    } else {
-                        absatze.push(Content::List(vec![
-                            Content::Text(absatz.content),
-                            Content::List(table.get_list()),
-                        ]));
-                    }
-                }
-            } else if Absatz::test_with_typ(child, "satz") {
-                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
-                // (e.g. 1209 ABGB)
-                absatze.push(Content::List(vec![
-                    Content::Text(absatz.content.clone()),
-                    Content::Text(Absatz::parse(c.next().unwrap()).content),
-                ]));
-            } else {
-                absatze.push(Content::Text(absatz.content.clone()));
-            }
-        } else {
-            absatze.push(Content::Text(absatz.content.clone()));
-        }
-
-        //There can be as many 'Absätze' as our lovely lawsetter wants
-        while let Some(child) = c.peek() {
-            if AbsatzAbs::test(child) {
-                let abs = AbsatzAbs::parse(c.next().unwrap());
-
-                // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
-                if let Some(child) = c.peek() {
-                    if Liste::test(child) {
-                        let liste = Liste::parse(c.next().unwrap());
-                        let mut to_add = vec![Content::Text(abs.content), liste.get_content()];
-                        if let Some(subchild) = c.peek() {
-                            if Absatz::test_with_typ(subchild, "satz") {
-                                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
-                                // (e.g. 85 StGB)
-                                to_add
-                                    .push(Content::Text(Absatz::parse(c.next().unwrap()).content));
-                            }
-                        }
-                        absatze.push(Content::List(to_add));
-                    } else {
-                        let mut content = abs.content;
-                        while let Some(subchild) = c.peek() {
-                            if Absatz::test_with_typ(subchild, "erltext") {
-                                content += &Absatz::parse(c.next().unwrap()).content;
-                            } else {
-                                break;
-                            }
-                        }
-                        absatze.push(Content::Text(content));
-                    }
-                } else {
-                    absatze.push(Content::Text(abs.content));
-                }
-                continue;
-            }
-            break;
-        }
-
-        if absatze.len() == 1 {
-            builder.new_par(par_id, absatze[0].clone());
-        } else {
-            let mut contents = Vec::new();
-            for a in &absatze {
-                contents.push(a.clone());
-            }
-            builder.new_par(par_id, Content::Item(contents));
-        }
-
-        // Skip all UeberschriftTitle and Absatz
-        while let Some(child) = c.peek() {
-            if Ueberschrift::test(child, "titel") {
-                c.next();
-                continue;
-            }
-            if Absatz::test_with_typ(child, "erltext") {
-                c.next();
-                continue;
-            }
-            break;
-        }
-
-        assert_eq!(c.next(), None);
-
-        true
+        ret.cont
    }
 }

@@ -555,15 +376,12 @@ pub(crate) struct Absatz {
    typ: String,
 }
 impl Absatz {
-    pub(crate) fn test(n: &Node) -> bool {
-        n.tag_name().name() == "absatz"
-    }
    pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool {
-        Self::test(n) && n.attribute("typ") == Some(typ)
+        n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ)
    }

    pub(crate) fn parse(n: Node) -> Self {
-        assert!(Self::test(&n));
+        Expect::from(&n).tag("absatz");

        if let Some(text) = n.text() {
            Self {