From 33e10185c01e388f616ae3c42b7e8662c0c4bf01 Mon Sep 17 00:00:00 2001 From: philipp Date: Fri, 16 Feb 2024 10:29:30 +0100 Subject: [PATCH] clean code, parse footer metadata --- src/paragraph/parser/abschnitt.rs | 36 +++++++++++++++++++------------ src/paragraph/parser/liste.rs | 25 --------------------- src/paragraph/parser/mod.rs | 18 ++++++++-------- 3 files changed, 31 insertions(+), 48 deletions(-) diff --git a/src/paragraph/parser/abschnitt.rs b/src/paragraph/parser/abschnitt.rs index f270a56..2a63cd5 100644 --- a/src/paragraph/parser/abschnitt.rs +++ b/src/paragraph/parser/abschnitt.rs @@ -54,12 +54,12 @@ impl Abschnitt { builder.new_par(par_id, absatze[0].clone()); } else { let mut contents = Vec::new(); - for a in &absatze { - contents.push(a.clone()); - } + contents.extend(absatze.iter().cloned()); builder.new_par(par_id, Content::Item(contents)); } + ret.handle_metadata(&mut c, builder); + // Skip all UeberschriftTitle and Absatz while let Some(child) = c.peek() { if Ueberschrift::test(child, "titel") { @@ -73,29 +73,38 @@ impl Abschnitt { break; } - // assert_eq!(c.next(), None); + assert_eq!(c.next(), None); ret.cont = true; ret } - // There are paragraph-specific meta-data at the top of each xml file. We parse those. When we - // encounter the title "Text" the real content starts, we stop parsing meta data. + // Each XML file carries paragraph-specific metadata at its top and bottom, which we parse + // here. The title "Text" marks the start of the real content, so metadata parsing stops + // once that title is encountered.
fn handle_metadata(&mut self, c: &mut Peekable, builder: &mut LawBuilder) { - loop { + while c.peek().is_some() { let key = Ueberschrift::parse(c.next().unwrap(), "titel").content; + println!("{key}"); // We are done with meta-data parsing if key == "Text" { break; } - let absatz = Absatz::parse( - c.next() - .expect("Expected absatz after title in par headers"), - ); - - let value = absatz.content; + let mut value = String::new(); + while let Some(child) = c.peek() { + if Absatz::test_with_typ(child, "erltext") { + let absatz = Absatz::parse(c.next().unwrap()); + value.push_str(&format!("{}\n", absatz.content)); + } else { + break; + } + } + value = value.trim().into(); + if value == "" { + panic!("Expected at least on erltext-absatz after title meta-data"); + } // We want ot use this information in our markdown output. // TODO: Use all metadata, instead of this specific call @@ -104,7 +113,6 @@ impl Abschnitt { } self.metadata.insert(key, value); - continue; } } diff --git a/src/paragraph/parser/liste.rs b/src/paragraph/parser/liste.rs index e8b632f..412a61d 100644 --- a/src/paragraph/parser/liste.rs +++ b/src/paragraph/parser/liste.rs @@ -46,31 +46,6 @@ impl Liste { Self { content } } - pub(crate) fn parse(n: Node) -> Self { - Expect::from(&n).tag("liste"); - - let mut content = Vec::new(); - - let mut c = n.children().peekable(); - - while let Some(child) = c.peek() { - if Ziffernliste::test(child) { - content.push(Ziffernliste::parse(c.next().unwrap()).get_content()); - } else if Schlussteil::test(child) { - // 162 Schifffahrtsgesetz show use that a 'schlussteil' can be at the start of a list - content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content)); - } else if Absatz::test_with_typ(child, "satz") { - content.push(Content::Text(Absatz::parse(c.next().unwrap()).content)); - } else { - break; - } - } - - assert_eq!(c.next(), None); - - Self { content } - } - pub(crate) fn get_content(&self) -> Content { Content::List(self.content.clone()) } 
diff --git a/src/paragraph/parser/mod.rs b/src/paragraph/parser/mod.rs index a7869af..806c474 100644 --- a/src/paragraph/parser/mod.rs +++ b/src/paragraph/parser/mod.rs @@ -303,17 +303,17 @@ impl Absatz { pub(crate) fn parse(n: Node) -> Self { Expect::from(&n).tag("absatz"); - if let Some(text) = n.text() { - Self { - content: text.into(), - typ: n.attribute("typ").unwrap().into(), - } - } else { - Self { - content: String::new(), - typ: n.attribute("typ").unwrap().into(), + let typ = n.attribute("typ").unwrap().into(); + + let mut content = String::new(); + // Append whatever `text()` yields for each direct child node + for c in n.children() { + if let Some(text) = c.text() { + content.push_str(text); } } + + Self { content, typ } } }