parse first page of urhg
This commit is contained in:
@ -1,5 +1,7 @@
|
||||
mod parser;
|
||||
|
||||
use log::{debug, info};
|
||||
|
||||
use crate::{law::LawBuilder, par::parser::Risdok, Error};
|
||||
|
||||
fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
}
|
||||
|
||||
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
|
||||
println!("{url}");
|
||||
info!("Parsing {url}");
|
||||
let xml = fetch_page(url)?;
|
||||
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
||||
//
|
||||
//
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">"#,
|
||||
// in § 17 (2)
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
|
||||
);
|
||||
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"</schlussteil></liste>"#,
|
||||
"</absatz>",
|
||||
);
|
||||
println!("{xml}");
|
||||
r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
); // 1. Verwertungsrechte. before § 14
|
||||
|
||||
let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
|
||||
debug!("{xml}");
|
||||
|
||||
let risdok = Risdok::from_str(&xml, builder)?;
|
||||
|
||||
println!("{builder:#?}");
|
||||
//println!("{risdok}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
use log::info;
|
||||
use roxmltree::Node;
|
||||
|
||||
use crate::{
|
||||
@ -86,7 +87,7 @@ impl Abschnitt {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
if Absatz::test(child) {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
@ -140,25 +141,30 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithList(
|
||||
absatz.content.clone(),
|
||||
liste.get_list(),
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else if Table::test(child) {
|
||||
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
|
||||
let table = Table::parse(c.next().unwrap());
|
||||
if let Some(child) = c.peek() {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
let after_absatz = Absatz::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithListAndText(
|
||||
absatz.content,
|
||||
table.get_list(),
|
||||
after_absatz.content,
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
Content::Text(after_absatz.content).into(),
|
||||
]))
|
||||
} else {
|
||||
absatze.push(Content::TextWithList(absatz.content, table.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
@ -177,8 +183,10 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(&child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
//TODO do something with list
|
||||
absatze.push(Content::TextWithList(abs.content, liste.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(abs.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else {
|
||||
absatze.push(Content::Text(abs.content));
|
||||
}
|
||||
@ -225,7 +233,7 @@ impl Abschnitt {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Symbol {
|
||||
stellen: String,
|
||||
content: String,
|
||||
@ -242,7 +250,7 @@ impl Symbol {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Listelem {
|
||||
symbol: Symbol,
|
||||
text: String,
|
||||
@ -267,36 +275,39 @@ impl Listelem {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
/// Parsed list element; `Ziffernliste::test` accepts the tags `ziffernliste`,
/// `aufzaehlung` and `literaliste`.
pub(crate) struct Ziffernliste {
|
||||
/// Value of the element's `ebene` attribute, kept as a string.
ebene: String,
|
||||
/// Entries parsed from the element's children.
listelems: Vec<Listelem>,
|
||||
}
|
||||
impl Ziffernliste {
|
||||
/// Returns `true` if the tag name of `n` is `ziffernliste`, `aufzaehlung` or `literaliste`.
///
/// Only the tag name is inspected; attributes and children are not checked here.
pub(crate) fn test(n: &Node) -> bool {
|
||||
["ziffernliste", "aufzaehlung", "literaliste"].contains(&n.tag_name().name())
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(n.tag_name().name() == "ziffernliste");
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let ebene = n.attribute("ebene").unwrap().into();
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
let mut listelems = Vec::new();
|
||||
loop {
|
||||
match c.peek() {
|
||||
Some(child) => {
|
||||
if Listelem::test(child) {
|
||||
listelems.push(Listelem::parse(c.next().unwrap()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
break;
|
||||
|
||||
for child in n.children() {
|
||||
listelems.push(Listelem::parse(child));
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ebene, listelems }
|
||||
}
|
||||
|
||||
/// Renders the list elements as a single `Content::List`.
///
/// Every entry of `listelems` becomes one `Content::Text` of the form
/// "<symbol content> <text>"; the order of `listelems` is preserved.
pub(crate) fn get_content(&self) -> Content {
|
||||
let mut elems = Vec::new();
|
||||
|
||||
for elem in &self.listelems {
|
||||
// Symbol and text are joined by one space; `.into()` converts the entry to the
// element type of `Content::List` (presumably `Box<Content>` — confirm against `Content`).
elems.push(Content::Text(format!("{} {}", elem.symbol.content, elem.text)).into());
|
||||
}
|
||||
|
||||
Content::List(elems)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@ -372,40 +383,69 @@ impl Table {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) struct Liste {
|
||||
ziffernliste: Ziffernliste,
|
||||
pub(crate) struct Schlussteil {
|
||||
content: String,
|
||||
}
|
||||
impl Liste {
|
||||
impl Schlussteil {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
(n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
|
||||
&& n.children().count() == 1
|
||||
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut c = n.children();
|
||||
let content = n.children().next().unwrap().text().unwrap().into(); //not sure
|
||||
|
||||
let ziffernliste = Ziffernliste::parse(c.next().unwrap());
|
||||
Self { content }
|
||||
}
|
||||
}
|
||||
|
||||
/// Parsed `liste` element.
///
/// `content` holds the items collected by `Liste::parse` in the order they were read:
/// the rendered content of each nested list and the text of each closing part.
#[derive(Debug)]
|
||||
pub(crate) struct Liste {
|
||||
content: Vec<Content>,
|
||||
}
|
||||
impl Liste {
|
||||
/// Returns `true` if `n` is a `liste` element; only the tag name is compared.
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut content = Vec::new();
|
||||
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
|
||||
loop {
|
||||
if let Some(child) = c.peek() {
|
||||
if Ziffernliste::test(child) {
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
} else if Schlussteil::test(child) {
|
||||
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ziffernliste }
|
||||
Self { content }
|
||||
}
|
||||
|
||||
pub(crate) fn get_list(&self) -> Vec<Box<Content>> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
for a in &self.ziffernliste.listelems {
|
||||
ret.push(Box::new(Content::Text(format!(
|
||||
"{} {}",
|
||||
a.symbol.content,
|
||||
a.text.clone()
|
||||
))));
|
||||
}
|
||||
|
||||
ret
|
||||
/// Returns the stored items wrapped in a single `Content::List`.
///
/// The `content` vector is cloned and each item is boxed, so `self` is left unchanged.
pub(crate) fn get_content(&self) -> Content {
|
||||
Content::List(
|
||||
self.content
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|c| Box::new(c))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -546,6 +586,8 @@ impl Layoutdaten {
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use log::error;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
@ -557,7 +599,7 @@ mod tests {
|
||||
let mut builder = LawBuilder::test("no-headers");
|
||||
let risdok = Risdok::from_str(&xml, &mut builder);
|
||||
if risdok.is_err() {
|
||||
println!("{:#?}", risdok.as_ref().err());
|
||||
error!("{:#?}", risdok.as_ref().err());
|
||||
}
|
||||
assert!(risdok.is_ok());
|
||||
|
||||
|
Reference in New Issue
Block a user