From eb6f3e8abaca76d073e95c2ee5c91e673bc4bae8 Mon Sep 17 00:00:00 2001 From: philipp Date: Sat, 4 Nov 2023 20:39:17 +0100 Subject: [PATCH] push --- README.md | 2 +- src/law.rs | 3 +- src/par/mod.rs | 14 ++++ src/par/parser.rs | 197 +++++++++++++++++++++++++++++++--------------- 4 files changed, 152 insertions(+), 64 deletions(-) diff --git a/README.md b/README.md index 079112e..b1eb1c1 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ RISolve # Next step -- [ ] call law struct fn with paragraph content +- [ ] UrhG § 17 Abs. 1 not parsed # Naming diff --git a/src/law.rs b/src/law.rs index 6838c6c..8398123 100644 --- a/src/law.rs +++ b/src/law.rs @@ -223,7 +223,8 @@ pub(crate) enum Content { Text(String), //This is my direct law text TextWithList(String, Vec>), Item(Vec>), //(1) This is general law. (2) This is more specific law - List(Vec>), //1. my first item + List(Vec>), + TextWithListAndText(String, Vec>, String), //1. my first item } #[cfg(test)] diff --git a/src/par/mod.rs b/src/par/mod.rs index 9b3a3b6..d18c86a 100644 --- a/src/par/mod.rs +++ b/src/par/mod.rs @@ -10,6 +10,20 @@ pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> { println!("{url}"); let xml = fetch_page(url)?; let xml = xml.replace("", "-"); // used e.g. in §11 Abs. 3 UrhG + // + // + let xml = xml.replace( + // e.g. in § 17 (2) TODO: check that this onyl happens here + r#""#, + r#""#, + ); + let xml = xml.replace( + // e.g. in § 17 (2) TODO: check that this onyl happens here + r#""#, + "", + ); + println!("{xml}"); + let risdok = Risdok::from_str(&xml, builder)?; println!("{builder:#?}"); diff --git a/src/par/parser.rs b/src/par/parser.rs index 4e62335..16ae53b 100644 --- a/src/par/parser.rs +++ b/src/par/parser.rs @@ -1,5 +1,3 @@ -use std::fmt::Display; - use roxmltree::Node; use crate::{ @@ -39,20 +37,6 @@ impl Risdok { } } -impl Display for Risdok { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - for abs in &self.nutzdaten.abschnitt.absatze { - let mut w = String::new(); - if let Some(symb) = &abs.gldsym { - w.push_str(&format!("\n{symb} ")); - } - w.push_str(&format!("{}\n", abs.content)); - f.write_str(&w)?; - } - Ok(()) - } -} - #[derive(Debug, PartialEq)] pub(crate) struct Metadaten; impl Metadaten { @@ -66,30 +50,25 @@ impl Metadaten { } #[derive(Debug, PartialEq)] -pub(crate) struct Nutzdaten { - abschnitt: Abschnitt, -} +pub(crate) struct Nutzdaten {} impl Nutzdaten { pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Self { assert!(n.tag_name().name() == "nutzdaten"); let mut c = n.children(); - let ret = Self { - abschnitt: Abschnitt::parse(c.next().unwrap(), builder), - }; + + Abschnitt::parse(c.next().unwrap(), builder); assert_eq!(c.next(), None); - ret + Self {} } } #[derive(Debug, PartialEq)] -pub(crate) struct Abschnitt { - absatze: Vec, -} +pub(crate) struct Abschnitt; impl Abschnitt { - pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Self { + pub(crate) fn parse(n: Node, builder: &mut LawBuilder) { assert!(n.tag_name().name() == "abschnitt"); let mut c = n.children().peekable(); @@ -156,27 +135,55 @@ impl Abschnitt { .gldsym .clone() .expect("First 'Absatz' needs to have § id"); - absatze.push(absatz); + // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" if let Some(child) = c.peek() { if Liste::test(child) { let liste = Liste::parse(c.next().unwrap()); - //TODO do something with list + absatze.push(Content::TextWithList( + absatz.content.clone(), + liste.get_list(), + )) + } else if Table::test(child) { + // If there's a "table" after an "absatz", the "table" should be part of the "absatz" + let table = Table::parse(c.next().unwrap()); + if let Some(child) = c.peek() { + if Absatz::test_with_typ(child, "erltext") { + let after_absatz = Absatz::parse(c.next().unwrap()); + absatze.push(Content::TextWithListAndText( + absatz.content, + table.get_list(), + after_absatz.content, + )) + } else { + absatze.push(Content::TextWithList(absatz.content, table.get_list())) + } + } } + } else { + absatze.push(Content::Text(absatz.content.clone())); } + //TODO: Continue here, (2) and (3) is somehow skipped + //There can be as many 'Absätze' as our lovely lawsetter wants loop { match c.peek() { Some(child) => { if AbsatzAbs::test(child) { - absatze.push(AbsatzAbs::parse(c.next().unwrap())); + let abs = AbsatzAbs::parse(c.next().unwrap()); + // If there's a "liste" after an "absatz", the "liste" should be part of the "absatz" if let Some(child) = c.peek() { - if Liste::test(child) { + if Liste::test(&child) { let liste = Liste::parse(c.next().unwrap()); //TODO do something with list + absatze.push(Content::TextWithList(abs.content, liste.get_list())) + } else { + absatze.push(Content::Text(abs.content)); } + } else { + absatze.push(Content::Text(abs.content)); } continue; } @@ -187,38 +194,15 @@ impl Abschnitt { } if absatze.len() == 1 { - builder.new_par(par_id, Content::Text(absatze[0].content.clone())); + builder.new_par(par_id, absatze[0].clone()); } else { let mut contents = Vec::new(); for a in &absatze { - contents.push(Box::new(Content::Text(a.content.clone()))); + contents.push(Box::new(a.clone())); } builder.new_par(par_id, Content::Item(contents)); } - //if absatze.len() == 1 { - // builder.new_par(Content::Text(format!( - // "{} {}", - // absatze[0].gldsym.clone().unwrap(), - // absatze[0].content - // ))); - //} else { - // let mut content = Vec::new(); - // for a in &absatze { - // let mut txt = String::new(); - // if let Some(sym) = &a.gldsym { - // if symb.is_some() { - // panic!("Two (or more) § symbols in single paragraph ?!?"); - // } else { - // symb = Some(sym); - // } - // } - // txt.push_str(&a.content); - // content.push(Box::new(Content::Text(txt))); - // } - // builder.new_par(Content::Item(content)); - //} - // Skip all UeberschriftTitle and Absatz loop { match c.peek() { @@ -237,11 +221,7 @@ impl Abschnitt { } } - println!("===="); - println!("{c:#?}"); assert_eq!(c.next(), None); - - Self { absatze } } } @@ -319,6 +299,78 @@ impl Ziffernliste { } } +#[derive(Debug, PartialEq)] +pub(crate) struct Td { + absatz: Absatz, +} +impl Td { + pub(crate) fn parse(n: &Node) -> Self { + assert!(n.tag_name().name() == "td"); + + let mut c = n.children(); + let absatz = Absatz::parse(c.next().unwrap()); + + assert_eq!(c.next(), None); + + Self { absatz } + } +} + +#[derive(Debug, PartialEq)] +pub(crate) struct Tr { + tds: Vec, +} +impl Tr { + pub(crate) fn parse(n: &Node) -> Self { + assert!(n.tag_name().name() == "tr"); + + let mut tds = Vec::new(); + + let mut c = n.children(); + for child in c { + tds.push(Td::parse(&child)); + } + + Self { tds } + } +} +#[derive(Debug, PartialEq)] +pub(crate) struct Table { + trs: Vec, +} +impl Table { + pub(crate) fn test(n: &Node) -> bool { + n.tag_name().name() == "table" + } + + pub(crate) fn parse(n: Node) -> Self { + assert!(Self::test(&n)); + let mut trs = Vec::new(); + + let mut c = n.children(); + for child in c { + trs.push(Tr::parse(&child)); + } + + Self { trs } + } + + pub(crate) fn get_list(&self) -> Vec> { + let mut ret = Vec::new(); + + for tr in &self.trs { + let mut txt = String::new(); + for td in &tr.tds { + txt.push_str(&format!("{} ", td.absatz.content)); + } + + ret.push(Box::new(Content::Text(format!("- {txt}",)))); + } + + ret + } +} + #[derive(Debug, PartialEq)] pub(crate) struct Liste { ziffernliste: Ziffernliste, @@ -341,6 +393,20 @@ impl Liste { Self { ziffernliste } } + + pub(crate) fn get_list(&self) -> Vec> { + let mut ret = Vec::new(); + + for a in &self.ziffernliste.listelems { + ret.push(Box::new(Content::Text(format!( + "{} {}", + a.symbol.content, + a.text.clone() + )))); + } + + ret + } } #[derive(Debug, PartialEq)] @@ -405,12 +471,19 @@ impl Absatz { pub(crate) fn test(n: &Node) -> bool { n.tag_name().name() == "absatz" } + pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool { + n.tag_name().name() == "absatz" && n.attribute("typ") == Some(typ) + } pub(crate) fn parse(n: Node) -> Self { assert!(n.tag_name().name() == "absatz"); - Self { - content: n.text().unwrap().into(), + if let Some(text) = n.text() { + Self { + content: text.into(), + } + } else { + Self { content: "".into() } } } }