more cleaning
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m51s

This commit is contained in:
philipp 2024-02-15 19:05:56 +01:00
parent 90d9982add
commit ac20dfcb48

View File

@ -8,15 +8,12 @@ use crate::paragraph::parser::{
Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift, Absatz, AbsatzAbs, Content, Fzinhalt, Kzinhalt, Liste, Table, Ueberschrift,
}; };
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Default)]
#[derive(Default)]
pub(crate) struct Abschnitt { pub(crate) struct Abschnitt {
metadata: HashMap<String, String>, metadata: HashMap<String, String>,
pub(crate) cont: bool, pub(crate) cont: bool,
} }
impl Abschnitt { impl Abschnitt {
pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt { pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> Abschnitt {
assert!(n.tag_name().name() == "abschnitt"); assert!(n.tag_name().name() == "abschnitt");
@ -26,53 +23,12 @@ impl Abschnitt {
Self::skip_static_fields(&mut c); Self::skip_static_fields(&mut c);
ret.handle_headers(&mut c, builder); ret.handle_metadata(&mut c, builder);
while let Some(child) = c.peek() { if !ret.handle_headers(&mut c, builder) {
// Schiffahrtsgesetz: stop @ anlagen (for now) return ret;
if Ueberschrift::test(child, "anlage") {
return ret;
}
if Ueberschrift::test(child, "g1") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
if ueberschrift.content.trim().starts_with("Artikel") {
return ret;
}
builder.new_header(&ueberschrift.content);
} else if Ueberschrift::test(child, "g2") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
builder.new_desc(&ueberschrift.content);
} else if Ueberschrift::test(child, "g1min") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
builder.new_header(&ueberschrift.content);
} else if Ueberschrift::test(child, "art") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
if ueberschrift.content.trim().starts_with("Artikel") {
return ret;
}
} else {
break;
}
} }
if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") {
builder
.new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
}
}
// e.g. § 405 abgb has two para (of diseased paragraph)
if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") {
builder
.new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
}
}
// We have 2 tasks
// 1) Get paragraph id
// 2) Get content
let mut absatze = Vec::new(); let mut absatze = Vec::new();
let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'")); let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
let par_id = absatz let par_id = absatz
@ -194,7 +150,7 @@ impl Abschnitt {
// There are paragraph-specific meta-data at the top of each xml file. We parse those. When we // There are paragraph-specific meta-data at the top of each xml file. We parse those. When we
// encounter the title "Text" the real content starts, we stop parsing meta data. // encounter the title "Text" the real content starts, we stop parsing meta data.
fn handle_headers(&mut self, c: &mut Peekable<Children>, builder: &mut LawBuilder) { fn handle_metadata(&mut self, c: &mut Peekable<Children>, builder: &mut LawBuilder) {
loop { loop {
let key = Ueberschrift::parse(c.next().unwrap(), "titel").content; let key = Ueberschrift::parse(c.next().unwrap(), "titel").content;
@ -230,4 +186,47 @@ impl Abschnitt {
Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y"
Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y" Fzinhalt::parse(node.next().unwrap()); // "www.ris.bka.gv.at" and "Seite X von Y"
} }
// Consumes the optional headings that may precede a paragraph, such as "Einleitung" or
// "Von den bürgerlichen Gesetzen überhaupt", and forwards them to `builder`.
//
// Returns `false` when a heading signals that parsing should stop: an "anlage" heading
// (which is left unconsumed), or a "g1"/"art" heading whose trimmed text starts with
// "Artikel" (which has already been consumed). Returns `true` otherwise.
fn handle_headers(&self, c: &mut Peekable<Children>, builder: &mut LawBuilder) -> bool {
    // Heading types that are consumed here, in the order in which they are probed.
    const HEADING_TYPES: [&str; 4] = ["g1", "g2", "g1min", "art"];

    loop {
        let found = match c.peek() {
            // Schiffahrtsgesetz: stop at the annexes (for now).
            Some(node) if Ueberschrift::test(node, "anlage") => return false,
            Some(node) => HEADING_TYPES
                .iter()
                .copied()
                .find(|&typ| Ueberschrift::test(node, typ)),
            None => None,
        };
        let typ = match found {
            Some(typ) => typ,
            // Either no children are left or the next child is not one of these headings.
            None => break,
        };

        // `peek` just yielded this node, so `next` cannot come back empty.
        let heading = Ueberschrift::parse(c.next().unwrap(), typ);
        let is_artikel = heading.content.trim().starts_with("Artikel");

        match typ {
            "g1" => {
                if is_artikel {
                    return false;
                }
                builder.new_header(&heading.content);
            }
            "g2" => {
                builder.new_desc(&heading.content);
            }
            "g1min" => {
                builder.new_header(&heading.content);
            }
            // "art": the heading is consumed but only inspected, never handed to the builder.
            _ => {
                if is_artikel {
                    return false;
                }
            }
        }
    }

    // A loop rather than a single check, because two "para" headings can directly follow
    // each other (e.g. § 405 ABGB).
    while let Some(node) = c.next_if(|candidate| Ueberschrift::test(candidate, "para")) {
        let para_heading = Ueberschrift::parse(node, "para");
        builder.new_next_para_header(&para_heading.content);
    }

    true
}
} }