start draft of assert struct (with nice debug msgs) + more structure @ par parser

2024-02-15 18:37:08 +01:00 · 2024-02-15 18:37:08 +01:00 · f97dd7bde2
commit f97dd7bde2
parent 3edbe11200
3 changed files with 284 additions and 211 deletions
--- a/src/paragraph/mod.rs
+++ b/src/paragraph/mod.rs
@ -15,6 +15,7 @@
 // limitations under the Licence.
 //! Deals with getting all paragraphs for a given law text
 mod parser;
 use regex::Regex;
--- a/src/paragraph/parser/abschnitt.rs
+++ b/src/paragraph/parser/abschnitt.rs
@ -0,0 +1,254 @@
 use std::collections::HashMap;
 use std::iter::Peekable;
 use roxmltree::{Children, Node};
 use crate::law::LawBuilder;
 use crate::paragraph::parser::{
    Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift,
 };
 /// State collected while parsing a single `abschnitt` XML element.
 ///
 /// `Default` yields an empty metadata map and `cont == false`.
 #[derive(Debug, Default, PartialEq)]
 pub(crate) struct Abschnitt {
    /// Metadata pairs found at the top of the element: the text of each
    /// `titel` heading mapped to the text of the `Absatz` following it.
    metadata: HashMap<String, String>,
    /// `true` once `parse` has consumed the whole element; stays `false` when
    /// parsing returned early.
    pub(crate) cont: bool,
 }
 impl Abschnitt {
    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt {
        assert!(n.tag_name().name() == "abschnitt");
        let mut ret = Abschnitt::default();
        let mut c = n.children().peekable();
        Self::skip_static_fields(&mut c);
        if !ret.handle_headers(&mut c, builder) {
            return ret;
        }
        while let Some(child) = c.peek() {
            // Shciffahrtsgesetz: stop @ anlagen (for now)
            if Ueberschrift::test(child, "anlage") {
                return ret;
            }
            if Ueberschrift::test(child, "g1") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
                if ueberschrift.content.trim().starts_with("Artikel") {
                    return ret;
                }
                builder.new_header(&ueberschrift.content);
            } else if Ueberschrift::test(child, "g2") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
                builder.new_desc(&ueberschrift.content);
            } else if Ueberschrift::test(child, "g1min") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
                builder.new_header(&ueberschrift.content);
            } else if Ueberschrift::test(child, "art") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
                if ueberschrift.content.trim().starts_with("Artikel") {
                    return ret;
                }
            } else {
                break;
            }
        }
        if let Some(child) = c.peek() {
            if Ueberschrift::test(child, "para") {
                builder
                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
            }
        }
        // e.g. § 405 abgb has two para (of diseased paragraph)
        if let Some(child) = c.peek() {
            if Ueberschrift::test(child, "para") {
                builder
                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
            }
        }
        // We have 2 tasks
        // 1) Get paragraph id
        // 2) Get content
        let mut absatze = Vec::new();
        let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
        let par_id = absatz
            .gldsym
            .clone()
            .expect("First 'Absatz' needs to have § id");
        // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
        if let Some(child) = c.peek() {
            if Liste::test(child) {
                let liste = Liste::parse(c.next().unwrap());
                let mut to_add = vec![Content::Text(absatz.content), liste.get_content()];
                if let Some(subchild) = c.peek() {
                    if Absatz::test_with_typ(subchild, "satz") {
                        // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                        // (e.g. 85 StGB)
                        to_add.push(Content::Text(Absatz::parse(c.next().unwrap()).content));
                    }
                }
                absatze.push(Content::List(to_add));
            } else if Table::test(child) {
                // If there's a  "table" after an "absatz", the "table" should be part of the "absatz"
                let table = Table::parse(c.next().unwrap());
                if let Some(child) = c.peek() {
                    if Absatz::test_with_typ(child, "erltext") {
                        let after_absatz = Absatz::parse(c.next().unwrap());
                        absatze.push(Content::List(vec![
                            Content::Text(absatz.content),
                            Content::List(table.get_list()),
                            Content::Text(after_absatz.content),
                        ]));
                    } else {
                        absatze.push(Content::List(vec![
                            Content::Text(absatz.content),
                            Content::List(table.get_list()),
                        ]));
                    }
                }
            } else if Absatz::test_with_typ(child, "satz") {
                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                // (e.g. 1209 ABGB)
                absatze.push(Content::List(vec![
                    Content::Text(absatz.content.clone()),
                    Content::Text(Absatz::parse(c.next().unwrap()).content),
                ]));
            } else {
                absatze.push(Content::Text(absatz.content.clone()));
            }
        } else {
            absatze.push(Content::Text(absatz.content.clone()));
        }
        //There can be as many 'Absätze' as our lovely lawsetter wants
        while let Some(child) = c.peek() {
            if AbsatzAbs::test(child) {
                let abs = AbsatzAbs::parse(c.next().unwrap());
                // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
                if let Some(child) = c.peek() {
                    if Liste::test(child) {
                        let liste = Liste::parse(c.next().unwrap());
                        let mut to_add = vec![Content::Text(abs.content), liste.get_content()];
                        if let Some(subchild) = c.peek() {
                            if Absatz::test_with_typ(subchild, "satz") {
                                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                                // (e.g. 85 StGB)
                                to_add
                                    .push(Content::Text(Absatz::parse(c.next().unwrap()).content));
                            }
                        }
                        absatze.push(Content::List(to_add));
                    } else {
                        let mut content = abs.content;
                        while let Some(subchild) = c.peek() {
                            if Absatz::test_with_typ(subchild, "erltext") {
                                content += &Absatz::parse(c.next().unwrap()).content;
                            } else {
                                break;
                            }
                        }
                        absatze.push(Content::Text(content));
                    }
                } else {
                    absatze.push(Content::Text(abs.content));
                }
                continue;
            }
            break;
        }
        if absatze.len() == 1 {
            builder.new_par(par_id, absatze[0].clone());
        } else {
            let mut contents = Vec::new();
            for a in &absatze {
                contents.push(a.clone());
            }
            builder.new_par(par_id, Content::Item(contents));
        }
        // Skip all UeberschriftTitle and Absatz
        while let Some(child) = c.peek() {
            if Ueberschrift::test(child, "titel") {
                c.next();
                continue;
            }
            if Absatz::test_with_typ(child, "erltext") {
                c.next();
                continue;
            }
            break;
        }
        assert_eq!(c.next(), None);
        ret.cont = true;
        ret
    }
    /// Parses the paragraph-specific metadata at the top of each xml file.
    ///
    /// The metadata is a sequence of `titel` headings, each followed by an
    /// `erltext` 'Absatz' holding the value. Every pair is stored in
    /// `self.metadata`; the value of "Beachte" is additionally handed to
    /// `builder` as a note for the next paragraph. The title "Text" marks the
    /// start of the real content and ends metadata parsing.
    ///
    /// # Returns
    /// `false` if the parsing should be stopped. No such condition is detected
    /// yet, so this currently always returns `true`.
    ///
    /// # Panics
    /// Panics if a node other than a `titel` heading is encountered, if a title
    /// is not followed by another node, or if that node is not an `erltext`
    /// 'Absatz'.
    fn handle_headers(&mut self, c: &mut Peekable<Children>, builder: &mut LawBuilder) -> bool {
        while let Some(child) = c.peek() {
            if !Ueberschrift::test(child, "titel") {
                panic!(
                    "Unexpected node in par headers (tag: '{}'), expected a 'titel' heading",
                    child.tag_name().name()
                );
            }
            let key = Ueberschrift::parse(c.next().unwrap(), "titel").content;
            // We are done with meta-data parsing
            if key == "Text" {
                break;
            }
            let absatz = Absatz::parse(
                c.next()
                    .expect("Expected absatz after title in par headers"),
            );
            if absatz.typ != "erltext" {
                panic!(
                    "Expected erltext absatz after title in par headers, got '{}'",
                    absatz.typ
                );
            }
            let value = absatz.content;
            // We want to use this information in our markdown output.
            // TODO: Use all metadata, instead of this specific call
            if key == "Beachte" {
                builder.add_next_para_note(value.clone());
            }
            self.metadata.insert(key, value);
        }
        true
    }
    /// Skips the 4 static fields found at the beginning of each 'Abschnitt'.
    ///
    /// They don't provide any value, so the parsed results are thrown away.
    /// A missing field makes us panic (unwrap) on purpose: if the layout
    /// changed, we should take a look why.
    fn skip_static_fields(node: &mut Peekable<Children>) {
        // Twice "Bundesrecht konsolidiert"
        for _ in 0..2 {
            Kzinhalt::parse(node.next().unwrap());
        }
        // Then "www.ris.bka.gv.at" and "Seite X von Y"
        for _ in 0..2 {
            Fzinhalt::parse(node.next().unwrap());
        }
    }
 }
--- a/src/paragraph/parser/mod.rs
+++ b/src/paragraph/parser/mod.rs
@ -14,6 +14,9 @@
 // See the Licence for the specific language governing permissions and
 // limitations under the Licence.
 mod abschnitt;
 use abschnitt::Abschnitt;
 use roxmltree::Node;
 use crate::{
@ -21,6 +24,29 @@ use crate::{
    misc::Error,
 };
 /// Assertion helper around an XML node: checks an expectation about the node
 /// and panics with a descriptive message when it does not hold.
 struct Expect<'a> {
    /// The node the expectations are checked against.
    node: &'a Node<'a, 'a>,
 }
 impl<'a> From<&'a Node<'a, 'a>> for Expect<'a> {
    /// Wraps `node` so that expectations can be checked on it.
    fn from(node: &'a Node<'a, 'a>) -> Self {
        Self { node }
    }
 }
 impl<'a> Expect<'a> {
    /// Asserts that the wrapped node's tag name equals `value`.
    ///
    /// # Panics
    /// Panics if the tag name differs; the message contains the expected tag,
    /// the actual tag and the node's text. Thanks to `#[track_caller]` the
    /// reported location is the caller's, not this helper's.
    #[track_caller]
    fn tag(&self, value: &str) {
        let actual = self.node.tag_name().name();
        if actual != value {
            panic!(
                "Expected tag '{value}', got '{actual}' (content: {:?})",
                self.node.text(),
            );
        }
    }
 }
 /// Field-less marker type.
 // NOTE(review): presumably the parser entry for the `risdok` XML element —
 // inferred from the name only, its `impl` is not visible here; confirm.
 #[derive(Debug, PartialEq)]
 pub(crate) struct Risdok {}
@ -74,212 +100,7 @@ impl Nutzdaten {
        assert_eq!(c.next(), None);
-        ret
+        ret.cont
    }
 }
 #[derive(Debug, PartialEq)]
 pub(crate) struct Abschnitt;
 impl Abschnitt {
    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool {
        assert!(n.tag_name().name() == "abschnitt");
        let mut c = n.children().peekable();
        Kzinhalt::parse(c.next().unwrap());
        Kzinhalt::parse(c.next().unwrap());
        Fzinhalt::parse(c.next().unwrap());
        Fzinhalt::parse(c.next().unwrap());
        // Skip all UeberschriftTitle and Absatz
        while let Some(child) = &c.peek() {
            if Ueberschrift::test_with_typ_and_content(child, "titel", "Beachte") {
                c.next();
                let absatz = Absatz::parse(
                    c.next()
                        .expect("After a 'Beachte' title, we need an Absatz"),
                );
                if absatz.typ != *"erltext" {
                    panic!("Expected erltext absatz after 'Beachte'");
                }
                builder.add_next_para_note(absatz.content);
                continue;
            }
            // Stop parsing if we reached "Anlagen" (e.g. Schifffahrtsgesetz)
            if Ueberschrift::test(child, "anlage") {
                return false;
            }
            if Ueberschrift::test(child, "titel") {
                c.next();
                continue;
            }
            if Absatz::test_with_typ(child, "erltext") {
                c.next();
                continue;
            }
            break;
        }
        while let Some(child) = c.peek() {
            if Ueberschrift::test(child, "g1") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
                if ueberschrift.content.trim().starts_with("Artikel") {
                    return false;
                }
                builder.new_header(&ueberschrift.content);
            } else if Ueberschrift::test(child, "g2") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
                builder.new_desc(&ueberschrift.content);
            } else if Ueberschrift::test(child, "g1min") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
                builder.new_header(&ueberschrift.content);
            } else if Ueberschrift::test(child, "art") {
                let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
                if ueberschrift.content.trim().starts_with("Artikel") {
                    return false;
                }
            } else {
                break;
            }
        }
        if let Some(child) = c.peek() {
            if Ueberschrift::test(child, "para") {
                builder
                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
            }
        }
        // e.g. § 405 abgb has two para (of diseased paragraph)
        if let Some(child) = c.peek() {
            if Ueberschrift::test(child, "para") {
                builder
                    .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
            }
        }
        // We have 2 tasks
        // 1) Get paragraph id
        // 2) Get content
        let mut absatze = Vec::new();
        let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
        let par_id = absatz
            .gldsym
            .clone()
            .expect("First 'Absatz' needs to have § id");
        // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
        if let Some(child) = c.peek() {
            if Liste::test(child) {
                let liste = Liste::parse(c.next().unwrap());
                let mut to_add = vec![Content::Text(absatz.content), liste.get_content()];
                if let Some(subchild) = c.peek() {
                    if Absatz::test_with_typ(subchild, "satz") {
                        // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                        // (e.g. 85 StGB)
                        to_add.push(Content::Text(Absatz::parse(c.next().unwrap()).content));
                    }
                }
                absatze.push(Content::List(to_add));
            } else if Table::test(child) {
                // If there's a  "table" after an "absatz", the "table" should be part of the "absatz"
                let table = Table::parse(c.next().unwrap());
                if let Some(child) = c.peek() {
                    if Absatz::test_with_typ(child, "erltext") {
                        let after_absatz = Absatz::parse(c.next().unwrap());
                        absatze.push(Content::List(vec![
                            Content::Text(absatz.content),
                            Content::List(table.get_list()),
                            Content::Text(after_absatz.content),
                        ]));
                    } else {
                        absatze.push(Content::List(vec![
                            Content::Text(absatz.content),
                            Content::List(table.get_list()),
                        ]));
                    }
                }
            } else if Absatz::test_with_typ(child, "satz") {
                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                // (e.g. 1209 ABGB)
                absatze.push(Content::List(vec![
                    Content::Text(absatz.content.clone()),
                    Content::Text(Absatz::parse(c.next().unwrap()).content),
                ]));
            } else {
                absatze.push(Content::Text(absatz.content.clone()));
            }
        } else {
            absatze.push(Content::Text(absatz.content.clone()));
        }
        //There can be as many 'Absätze' as our lovely lawsetter wants
        while let Some(child) = c.peek() {
            if AbsatzAbs::test(child) {
                let abs = AbsatzAbs::parse(c.next().unwrap());
                // If there's a  "liste" after an "absatz", the "liste" should be part of the "absatz"
                if let Some(child) = c.peek() {
                    if Liste::test(child) {
                        let liste = Liste::parse(c.next().unwrap());
                        let mut to_add = vec![Content::Text(abs.content), liste.get_content()];
                        if let Some(subchild) = c.peek() {
                            if Absatz::test_with_typ(subchild, "satz") {
                                // After a 'liste' there can be a '<absatz typ="satz"' which should be part of the list
                                // (e.g. 85 StGB)
                                to_add
                                    .push(Content::Text(Absatz::parse(c.next().unwrap()).content));
                            }
                        }
                        absatze.push(Content::List(to_add));
                    } else {
                        let mut content = abs.content;
                        while let Some(subchild) = c.peek() {
                            if Absatz::test_with_typ(subchild, "erltext") {
                                content += &Absatz::parse(c.next().unwrap()).content;
                            } else {
                                break;
                            }
                        }
                        absatze.push(Content::Text(content));
                    }
                } else {
                    absatze.push(Content::Text(abs.content));
                }
                continue;
            }
            break;
        }
        if absatze.len() == 1 {
            builder.new_par(par_id, absatze[0].clone());
        } else {
            let mut contents = Vec::new();
            for a in &absatze {
                contents.push(a.clone());
            }
            builder.new_par(par_id, Content::Item(contents));
        }
        // Skip all UeberschriftTitle and Absatz
        while let Some(child) = c.peek() {
            if Ueberschrift::test(child, "titel") {
                c.next();
                continue;
            }
            if Absatz::test_with_typ(child, "erltext") {
                c.next();
                continue;
            }
            break;
        }
        assert_eq!(c.next(), None);
        true
    }
 }
@ -555,15 +376,12 @@ pub(crate) struct Absatz {
    typ: String,
 }
 impl Absatz {
    pub(crate) fn test(n: &Node) -> bool {
        n.tag_name().name() == "absatz"
    }
    pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool {
-        Self::test(n) && n.attribute("typ") == Some(typ)
+        n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ)
    }
    pub(crate) fn parse(n: Node) -> Self {
-        assert!(Self::test(&n));
+        Expect::from(&n).tag("absatz");
        if let Some(text) = n.text() {
            Self {