From 76ddf9796bdd8a30e526cf7a2bba55860f4c1e05 Mon Sep 17 00:00:00 2001 From: philipp Date: Fri, 16 Feb 2024 10:37:17 +0100 Subject: [PATCH] extract absatz --- src/paragraph/parser/absatz.rs | 72 +++++++++++++++++++++++++++++++ src/paragraph/parser/abschnitt.rs | 48 ++------------------- src/paragraph/parser/mod.rs | 30 +------------ 3 files changed, 78 insertions(+), 72 deletions(-) create mode 100644 src/paragraph/parser/absatz.rs diff --git a/src/paragraph/parser/absatz.rs b/src/paragraph/parser/absatz.rs new file mode 100644 index 0000000..eed151c --- /dev/null +++ b/src/paragraph/parser/absatz.rs @@ -0,0 +1,72 @@ +use std::iter::Peekable; + +use roxmltree::{Children, Node}; + +use crate::law::Content; + +use super::{liste::Liste, table::Table, AbsatzAbs, Expect}; + +#[derive(Debug, PartialEq)] +pub(crate) struct Absatz { + pub(crate) content: String, + pub(crate) typ: String, +} +impl Absatz { + pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool { + n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ) + } + + // Parses one logical 'Absatz'. If there's a List or Table after the Absatz, RIS assumes this + // one to be included in the paragraph + // + // # Returns + // - String: (optional) paragraph id + // - Content: content of the paragraph + pub(crate) fn parse_full(c: &mut Peekable) -> (Option, Content) { + let absatz = AbsatzAbs::parse(c.next().unwrap()); + let par_id = absatz.gldsym; + + let mut content = Vec::new(); + content.push(Content::Text(absatz.content)); + + // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" + while let Some(child) = c.peek() { + if Liste::test(child) { + content.push(Liste::parse_full(c).get_content()) + } else if Table::test(child) { + // If there's a "table" after an "absatz", the "table" should be part of the "absatz" + let table = Table::parse_full(c); + content.extend(table.iter().cloned()); + } else if Absatz::test_with_typ(child, "satz") + || Absatz::test_with_typ(child, "erltext") + { + // After a 'absatz' there can be a ' Self { + Expect::from(&n).tag("absatz"); + + let typ = n.attribute("typ").unwrap().into(); + + let mut content = String::new(); + // Get text from this element + all direct childs + for c in n.children() { + if let Some(text) = c.text() { + content.push_str(text); + } + } + + Self { content, typ } + } +} diff --git a/src/paragraph/parser/abschnitt.rs b/src/paragraph/parser/abschnitt.rs index 97e09e3..eca8166 100644 --- a/src/paragraph/parser/abschnitt.rs +++ b/src/paragraph/parser/abschnitt.rs @@ -4,10 +4,8 @@ use std::iter::Peekable; use roxmltree::{Children, Node}; use crate::law::LawBuilder; -use crate::paragraph::parser::liste::Liste; -use crate::paragraph::parser::{Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Ueberschrift}; - -use super::table::Table; +use crate::paragraph::parser::absatz::Absatz; +use crate::paragraph::parser::{AbsatzAbs, Content, Fzinhalt, Kzinhalt, Ueberschrift}; #[derive(Debug, PartialEq, Default)] pub(crate) struct Abschnitt { @@ -33,7 +31,7 @@ impl Abschnitt { let mut absatze = Vec::new(); // Special handling of first paragraph (needs id)... - let (par_id, first_abs) = ret.parse_absatz(&mut c); + let (par_id, first_abs) = Absatz::parse_full(&mut c); let par_id = match par_id { Some(par_id) => par_id, None => panic!("First paragraph needs to have an id, not found"), @@ -43,7 +41,7 @@ impl Abschnitt { // ... and then there can be as many 'Absätze' as our law-setter wants while let Some(child) = c.peek() { if AbsatzAbs::test(child) { - let (_, absatz) = ret.parse_absatz(&mut c); + let (_, absatz) = Absatz::parse_full(&mut c); absatze.push(absatz); } else { break; @@ -168,42 +166,4 @@ impl Abschnitt { } true } - - // Parses one logical 'Absatz'. If there's a List or Table after the Absatz, RIS assumes this - // one to be included in the paragraph - // - // # Returns - // - String: (optional) paragraph id - // - Content: content of the paragraph - fn parse_absatz(&self, c: &mut Peekable) -> (Option, Content) { - let absatz = AbsatzAbs::parse(c.next().unwrap()); - let par_id = absatz.gldsym; - - let mut content = Vec::new(); - content.push(Content::Text(absatz.content)); - - // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" - while let Some(child) = c.peek() { - if Liste::test(child) { - content.push(Liste::parse_full(c).get_content()) - } else if Table::test(child) { - // If there's a "table" after an "absatz", the "table" should be part of the "absatz" - let table = Table::parse_full(c); - content.extend(table.iter().cloned()); - } else if Absatz::test_with_typ(child, "satz") - || Absatz::test_with_typ(child, "erltext") - { - // After a 'absatz' there can be a ' { @@ -289,34 +291,6 @@ impl Leaf { n.text().unwrap().into() } } - -#[derive(Debug, PartialEq)] -pub(crate) struct Absatz { - content: String, - typ: String, -} -impl Absatz { - pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool { - n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ) - } - - pub(crate) fn parse(n: Node) -> Self { - Expect::from(&n).tag("absatz"); - - let typ = n.attribute("typ").unwrap().into(); - - let mut content = String::new(); - // Get text from this element + all direct childs - for c in n.children() { - if let Some(text) = c.text() { - content.push_str(text); - } - } - - Self { content, typ } - } -} - #[derive(Debug, PartialEq)] pub(crate) struct Ueberschrift { typ: String,