From ac20dfcb485e791b6c5db80f254099449ec199fc Mon Sep 17 00:00:00 2001 From: philipp Date: Thu, 15 Feb 2024 19:05:56 +0100 Subject: [PATCH] more cleaning --- src/paragraph/parser/abschnitt.rs | 97 +++++++++++++++---------------- 1 file changed, 48 insertions(+), 49 deletions(-) diff --git a/src/paragraph/parser/abschnitt.rs b/src/paragraph/parser/abschnitt.rs index 32bc4a6..65ed42f 100644 --- a/src/paragraph/parser/abschnitt.rs +++ b/src/paragraph/parser/abschnitt.rs @@ -8,15 +8,12 @@ use crate::paragraph::parser::{ Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift, }; -#[derive(Debug, PartialEq)] -#[derive(Default)] +#[derive(Debug, PartialEq, Default)] pub(crate) struct Abschnitt { metadata: HashMap, pub(crate) cont: bool, } - - impl Abschnitt { pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt { assert!(n.tag_name().name() == "abschnitt"); @@ -26,53 +23,12 @@ impl Abschnitt { Self::skip_static_fields(&mut c); - ret.handle_headers(&mut c, builder); + ret.handle_metadata(&mut c, builder); - while let Some(child) = c.peek() { - // Schiffahrtsgesetz: stop @ anlagen (for now) - if Ueberschrift::test(child, "anlage") { - return ret; - } - if Ueberschrift::test(child, "g1") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1"); - if ueberschrift.content.trim().starts_with("Artikel") { - return ret; - } - builder.new_header(&ueberschrift.content); - } else if Ueberschrift::test(child, "g2") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2"); - builder.new_desc(&ueberschrift.content); - } else if Ueberschrift::test(child, "g1min") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min"); - builder.new_header(&ueberschrift.content); - } else if Ueberschrift::test(child, "art") { - let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art"); - if ueberschrift.content.trim().starts_with("Artikel") { - return ret; - } - } else { - break; - } + if !ret.handle_headers(&mut c, builder) 
{ + return ret; } - if let Some(child) = c.peek() { - if Ueberschrift::test(child, "para") { - builder - .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); - } - } - // e.g. § 405 abgb has two para (of diseased paragraph) - if let Some(child) = c.peek() { - if Ueberschrift::test(child, "para") { - builder - .new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content); - } - } - - // We have 2 tasks - // 1) Get paragraph id - // 2) Get content - let mut absatze = Vec::new(); let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'")); let par_id = absatz @@ -194,7 +150,7 @@ impl Abschnitt { // There are paragraph-specific meta-data at the top of each xml file. We parse those. When we // encounter the title "Text" the real content starts, we stop parsing meta data. - fn handle_headers(&mut self, c: &mut Peekable, builder: &mut LawBuilder) { + fn handle_metadata(&mut self, c: &mut Peekable, builder: &mut LawBuilder) { loop { let key = Ueberschrift::parse(c.next().unwrap(), "titel").content; @@ -230,4 +186,47 @@ impl Abschnitt { Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" } + + // We optionally have headers, such as "Einleitung", "Von den bürgerlichen Gesetzen überhaupt", + // etc. If we encounter a header which indicates that we are done and want to stop parsing + // ("anlage", or a header starting with "Artikel"), we indicate this wish by returning false. 
+    /// Consumes the optional heading elements at the front of `c` and forwards them to
+    /// `builder`.
+    ///
+    /// Returns `false` when a heading signals that parsing should stop: an "anlage" heading
+    /// (left unconsumed), or a "g1"/"art" heading whose trimmed text starts with "Artikel"
+    /// (already consumed). Returns `true` otherwise, once every leading structural heading and
+    /// every directly following "para" heading has been consumed.
+    fn handle_headers(&self, c: &mut Peekable, builder: &mut LawBuilder) -> bool {
+        // Structural headings, probed in this order; the first matching tag wins.
+        const HEADING_TAGS: [&str; 4] = ["g1", "g2", "g1min", "art"];
+
+        while let Some(node) = c.peek() {
+            // Schiffahrtsgesetz: stop @ anlagen (for now)
+            if Ueberschrift::test(node, "anlage") {
+                return false;
+            }
+            let tag = match HEADING_TAGS
+                .iter()
+                .copied()
+                .find(|tag| Ueberschrift::test(node, *tag))
+            {
+                Some(tag) => tag,
+                // Not a structural heading: leave the element in the iterator.
+                None => break,
+            };
+            let heading = Ueberschrift::parse(c.next().unwrap(), tag);
+            match tag {
+                "g1" => {
+                    if heading.content.trim().starts_with("Artikel") {
+                        return false;
+                    }
+                    builder.new_header(&heading.content);
+                }
+                "g2" => {
+                    builder.new_desc(&heading.content);
+                }
+                "g1min" => {
+                    builder.new_header(&heading.content);
+                }
+                // "art": consumed only for the stop check; nothing is handed to the builder.
+                _ => {
+                    if heading.content.trim().starts_with("Artikel") {
+                        return false;
+                    }
+                }
+            }
+        }
+
+        // Several "para" headings may follow each other (e.g. § 405 abgb), so keep consuming
+        // them for as long as the next element is one.
+        while let Some(node) = c.next_if(|n| Ueberschrift::test(n, "para")) {
+            builder.new_next_para_header(&Ueberschrift::parse(node, "para").content);
+        }
+        true
+    }
 }