Compare commits

...

2 Commits

Author SHA1 Message Date
eff5c546c8 Merge branch 'large' of ssh://git.hofer.link:2222/philipp/risp into large
Some checks failed
CI/CD Pipeline / test (push) Failing after 1m38s
2024-02-27 07:04:26 +01:00
d46212ba42 push 2024-02-27 07:02:07 +01:00
3 changed files with 89 additions and 5 deletions

View File

@ -42,6 +42,11 @@ name = "dash"
is_root = false
match_function = "starts_with_dash"
[[law.classifiers]]
name = "Artikel"
is_root = false
match_function = "contains"
[parser]
remove_strings = [
"<tab />",

View File

@ -154,19 +154,19 @@ impl Abschnitt {
return;
}
if Ueberschrift::test(child, "g1") {
let (ueberschrift, para_header) = Ueberschrift::parse(c, "g1");
let (ueberschrift, para_header) = Ueberschrift::parse_full(c, "g1");
builder.new_header(&ueberschrift.content);
if let Some(para_header) = para_header {
builder.new_next_para_header(&para_header.content);
}
} else if Ueberschrift::test(child, "g2") {
let (ueberschrift, para_header) = Ueberschrift::parse(c, "g2");
let (ueberschrift, para_header) = Ueberschrift::parse_full(c, "g2");
builder.new_desc(&ueberschrift.content);
if let Some(para_header) = para_header {
builder.new_next_para_header(&para_header.content);
}
} else if Ueberschrift::test(child, "g1min") {
let (ueberschrift, para_header) = Ueberschrift::parse(c, "g1min");
let (ueberschrift, para_header) = Ueberschrift::parse_full(c, "g1min");
builder.new_header(&ueberschrift.content);
if let Some(para_header) = para_header {
builder.new_next_para_header(&para_header.content);

View File

@ -355,8 +355,7 @@ impl Ueberschrift {
n.tag_name().name() == "ueberschrift" && n.attribute("typ").unwrap() == typ
}
pub(crate) fn parse(c: &mut Peekable<Children>, typ: &str) -> (Self, Option<Self>) {
let n = c.next().unwrap();
pub(crate) fn parse(n: Node, typ: &str) -> Self {
Expect::from(&n).tag("ueberschrift");
assert_eq!(n.attribute("typ").unwrap(), typ);
@ -456,6 +455,86 @@ impl Ueberschrift {
}
}
(art, next_para_header)
}
/// Parses the next `ueberschrift` node from `c` and, where applicable, the
/// heading(s) that directly follow it.
///
/// Returns the parsed heading plus an optional header for the next paragraph:
///
/// * If the heading text is `"Text"`, or no sibling follows, nothing further
///   is consumed and the second element is `None`.
/// * If the following sibling is an `art` heading, it is consumed and
///   returned as the next paragraph header.
/// * Otherwise, if `typ` is `"g1"` and the text starts with `"Artikel "`
///   (an article wrongly tagged as `g1`), and the next two siblings are both
///   `para` headings, both are consumed and joined as `"<first> - <second>"`.
/// * Otherwise, if `typ` is `"art"` and the text does not start with
///   `"Artikel"`, the text is prefixed with `"Artikel "`.
///
/// NOTE(review): the `"Artikel "` prefix is only added when a following
/// sibling exists and the text is not `"Text"` — confirm this is intended.
///
/// # Panics
///
/// Panics if `c` is exhausted, if the node has no `typ` attribute or the
/// attribute differs from `typ`, or if the node has no text. `Expect::tag`
/// presumably also rejects a wrong tag name — verify against its definition.
pub(crate) fn parse_full(c: &mut Peekable<Children>, typ: &str) -> (Self, Option<Self>) {
    let node = c.next().unwrap();
    Expect::from(&node).tag("ueberschrift");
    assert_eq!(node.attribute("typ").unwrap(), typ);

    let mut ret = Self {
        content: node.text().unwrap().into(),
        typ: typ.into(),
    };
    let mut next_para_header = None;

    // `None`        => nothing to look at ("Text" heading or no sibling left)
    // `Some(true)`  => the following sibling is an `art` heading
    // `Some(false)` => some other sibling follows
    let follower_is_art = if ret.content == "Text" {
        None
    } else {
        c.peek().map(|next| Ueberschrift::test(next, "art"))
    };

    match follower_is_art {
        None => {}
        Some(true) => {
            next_para_header = Some(Ueberschrift::parse(c.next().unwrap(), "art"));
        }
        Some(false) if typ == "g1" && ret.content.starts_with("Artikel ") => {
            // Wrongly tagged article.
            // TODO: remove code duplication from further down
            // Look ahead on a clone so `c` is only advanced once we know
            // that two `para` headings really follow.
            let mut lookahead = c.clone();
            if let (Some(first), Some(second)) = (lookahead.next(), lookahead.next()) {
                if Ueberschrift::test(&first, "para") && Ueberschrift::test(&second, "para") {
                    let first_para = Ueberschrift::parse(c.next().unwrap(), "para");
                    let second_para = Ueberschrift::parse(c.next().unwrap(), "para");
                    next_para_header = Some(Ueberschrift {
                        typ: first_para.typ,
                        content: format!("{} - {}", first_para.content, second_para.content),
                    });
                }
            }
        }
        Some(false) if typ == "art" && !ret.content.starts_with("Artikel") => {
            ret.content = format!("Artikel {}", ret.content);
        }
        Some(false) => {}
    }

    debug!("Parsed Ueberschrift {ret:#?}");

    (ret, next_para_header)
}
/// Parses an `art` heading from `n` together with the `para` heading(s)
/// that directly follow it.
///
/// Returns the `art` heading plus an optional header for the next paragraph:
///
/// * If the next two siblings are both `para` headings, both are consumed
///   and joined as `"<first> - <second>"`.
/// * If only the first of the next two siblings is a `para` heading, that
///   one is consumed and returned unchanged.
/// * In every other case nothing further is consumed and `None` is returned.
///
/// NOTE(review): a single `para` heading that is the very last sibling
/// (no second sibling after it) is *not* consumed — confirm this is intended.
///
/// # Panics
///
/// Panics if `n` is exhausted. Any panic raised by `Ueberschrift::parse` or
/// `Ueberschrift::test` on the inspected nodes propagates as well.
fn parse_art(n: &mut Peekable<Children>) -> (Self, Option<Self>) {
    let art = Self::parse(n.next().unwrap(), "art");

    // Inspect the upcoming siblings on a clone so `n` is only advanced for
    // nodes that are actually consumed.
    let mut lookahead = n.clone();
    let next_para_header = match (lookahead.next(), lookahead.next()) {
        (Some(first), Some(second)) if Ueberschrift::test(&first, "para") => {
            let second_is_para = Ueberschrift::test(&second, "para");
            let first_para = Ueberschrift::parse(n.next().unwrap(), "para");
            if second_is_para {
                let second_para = Ueberschrift::parse(n.next().unwrap(), "para");
                Some(Ueberschrift {
                    typ: first_para.typ,
                    content: format!("{} - {}", first_para.content, second_para.content),
                })
            } else {
                // The parsed heading already is the header; no need to
                // rebuild it through `format!("{}", ..)`.
                Some(first_para)
            }
        }
        _ => None,
    };

    (art, next_para_header)
}
}