parse first page of urhg
This commit is contained in:
98
src/law.rs
98
src/law.rs
@ -1,3 +1,6 @@
|
||||
use log::debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::overview;
|
||||
|
||||
// pub(crate) struct Law {
|
||||
@ -15,7 +18,7 @@ use crate::overview;
|
||||
// }
|
||||
|
||||
/// Is used to generate a law struct. It's organized mainly by classifier.
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LawBuilder {
|
||||
/// Name of the law
|
||||
name: String, //ABGB, UrhG
|
||||
@ -29,19 +32,35 @@ pub(crate) struct LawBuilder {
|
||||
next_para_header: Option<String>,
|
||||
}
|
||||
|
||||
/// Classifier predicate: returns `true` when `instance_name` contains
/// `classifier_name` as a substring.
///
/// An empty `classifier_name` matches every `instance_name`.
fn contains(classifier_name: &str, instance_name: &str) -> bool {
    instance_name.contains(classifier_name)
}
|
||||
|
||||
/// Classifier predicate: returns `true` when `instance_name`, after trimming
/// surrounding whitespace, begins with an ASCII digit.
///
/// `_classifier_name` is ignored; the parameter only exists so this function
/// has the same `(&str, &str) -> bool` shape as the other classifier
/// predicates.
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
    // Only the first byte matters: ASCII digits are single-byte in UTF-8, so
    // a multi-byte leading character can never be mistaken for a digit.
    matches!(
        instance_name.trim().as_bytes().first(),
        Some(byte) if byte.is_ascii_digit()
    )
}
|
||||
|
||||
impl LawBuilder {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn test(name: &str) -> Self {
|
||||
let mut classifiers = Vec::new();
|
||||
if name == "UrhG" {
|
||||
let hauptstueck = Classifier::new("Hauptstück");
|
||||
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||
classifiers.push(hauptstueck.clone());
|
||||
|
||||
let mut abschnitt = Classifier::new("Abschnitt");
|
||||
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||
abschnitt.set_parent(hauptstueck);
|
||||
classifiers.push(abschnitt);
|
||||
|
||||
let mut numbered_header =
|
||||
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||
numbered_header.set_parent(abschnitt);
|
||||
classifiers.push(numbered_header);
|
||||
} else if name == "no-headers" {
|
||||
let mut class = Classifier::new("");
|
||||
let mut class = Classifier::new("", Arc::new(&contains));
|
||||
class.add_instance(ClassifierInstance::new(""));
|
||||
classifiers.push(class);
|
||||
}
|
||||
@ -54,19 +73,24 @@ impl LawBuilder {
|
||||
}
|
||||
|
||||
/// Creates a new law builder. Adds classifier for known law texts.
|
||||
pub(crate) fn new(name: &str) {
|
||||
pub(crate) fn new(name: &str) -> LawBuilder {
|
||||
//TODO: return Law (not LawBuilder)
|
||||
let mut classifiers = Vec::new();
|
||||
|
||||
let mut law_id = None;
|
||||
if name == "UrhG" {
|
||||
law_id = Some(10001848);
|
||||
let hauptstueck = Classifier::new("Hauptstück");
|
||||
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||
classifiers.push(hauptstueck.clone());
|
||||
|
||||
let mut abschnitt = Classifier::new("Abschnitt");
|
||||
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||
abschnitt.set_parent(hauptstueck);
|
||||
classifiers.push(abschnitt);
|
||||
classifiers.push(abschnitt.clone());
|
||||
|
||||
let mut numbered_header =
|
||||
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||
numbered_header.set_parent(abschnitt);
|
||||
classifiers.push(numbered_header);
|
||||
}
|
||||
|
||||
let mut builder = Self {
|
||||
@ -77,11 +101,13 @@ impl LawBuilder {
|
||||
};
|
||||
|
||||
overview::parse(law_id.unwrap(), &mut builder).unwrap();
|
||||
|
||||
builder
|
||||
}
|
||||
|
||||
/// Sets a new header.
|
||||
pub(crate) fn new_header(&mut self, name: &str) {
|
||||
println!("new_header={name}");
|
||||
debug!("new_header={name}");
|
||||
let classifier_index = self
|
||||
.classifiers
|
||||
.iter()
|
||||
@ -98,7 +124,7 @@ impl LawBuilder {
|
||||
|
||||
/// Sets a new description for the last classifier.
|
||||
pub(crate) fn new_desc(&mut self, desc: &str) {
|
||||
println!("new_desc={desc}");
|
||||
debug!("new_desc={desc}");
|
||||
if let Some(index) = self.last_header_index {
|
||||
self.classifiers[index].set_desc(desc);
|
||||
} else {
|
||||
@ -108,10 +134,15 @@ impl LawBuilder {
|
||||
|
||||
/// Adds a new paragraph.
|
||||
pub(crate) fn new_par(&mut self, par: String, content: Content) {
|
||||
println!("new_par=par:{par};content:{content:#?}");
|
||||
if let Some(class) = self.classifiers.last_mut() {
|
||||
let section = Section { symb: par, content };
|
||||
class.add_section(section);
|
||||
debug!("new_par=par:{par};content:{content:#?}");
|
||||
if let Some(index) = self.last_header_index {
|
||||
let section = Section {
|
||||
symb: par,
|
||||
content,
|
||||
par_header: self.next_para_header.clone(),
|
||||
};
|
||||
self.next_para_header = None;
|
||||
self.classifiers[index].add_section(section);
|
||||
} else {
|
||||
panic!("Expected at least one classifier");
|
||||
}
|
||||
@ -119,14 +150,23 @@ impl LawBuilder {
|
||||
|
||||
/// Next paragraph has a header, store its name.
|
||||
pub(crate) fn new_next_para_header(&mut self, header: &str) {
|
||||
println!("new_next_para_header={header}");
|
||||
debug!("new_next_para_header={header}");
|
||||
self.next_para_header = Some(header.into());
|
||||
}
|
||||
|
||||
pub(crate) fn toc(&self) {
|
||||
for class in &self.classifiers {
|
||||
for inst in &class.instances {
|
||||
println!("{}", inst.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct Section {
|
||||
symb: String, // §"1", §"2", ...
|
||||
par_header: Option<String>,
|
||||
content: Content,
|
||||
//header: Option<Header>,
|
||||
}
|
||||
@ -152,7 +192,7 @@ impl Header {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct ClassifierInstance {
|
||||
name: String,
|
||||
desc: Option<String>,
|
||||
@ -177,19 +217,31 @@ impl ClassifierInstance {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Classifier {
|
||||
name: String, // Hauptstück, Theil, Abschnitt, ol
|
||||
parent: Option<Box<Classifier>>,
|
||||
instances: Vec<ClassifierInstance>,
|
||||
used_for_fn: Arc<dyn Fn(&str, &str) -> bool>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Classifier {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Classifier")
|
||||
.field("name", &self.name)
|
||||
.field("parent", &self.parent)
|
||||
.field("instances", &self.instances)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Classifier {
|
||||
fn new(name: &str) -> Self {
|
||||
fn new(name: &str, used_for_fn: Arc<dyn Fn(&str, &str) -> bool>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
parent: None,
|
||||
instances: Vec::new(),
|
||||
used_for_fn,
|
||||
}
|
||||
}
|
||||
|
||||
@ -210,7 +262,7 @@ impl Classifier {
|
||||
}
|
||||
|
||||
fn used_for(&self, name: &str) -> bool {
|
||||
name.contains(&self.name)
|
||||
(self.used_for_fn)(&self.name, name)
|
||||
}
|
||||
|
||||
fn add_section(&mut self, section: Section) {
|
||||
@ -218,13 +270,11 @@ impl Classifier {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum Content {
|
||||
Text(String), //This is my direct law text
|
||||
TextWithList(String, Vec<Box<Content>>),
|
||||
Text(String), //This is my direct law text
|
||||
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
|
||||
List(Vec<Box<Content>>),
|
||||
TextWithListAndText(String, Vec<Box<Content>>, String), //1. my first item
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -41,5 +41,8 @@ impl From<roxmltree::Error> for Error {
|
||||
}
|
||||
|
||||
/// Program entry point: initialises logging, builds the law structure for
/// "UrhG" and prints its table of contents.
fn main() {
    env_logger::init();

    let builder = LawBuilder::new("UrhG");

    // `toc` prints to stdout itself and returns `()`, so it must not be
    // wrapped in another `println!` (that would print a stray `()`).
    builder.toc();
}
|
||||
|
@ -187,6 +187,8 @@ pub(crate) struct ContentUrlItem {
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use log::debug;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@ -204,7 +206,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
@ -219,7 +221,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
@ -234,7 +236,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
|
@ -1,5 +1,7 @@
|
||||
mod parser;
|
||||
|
||||
use log::{debug, info};
|
||||
|
||||
use crate::{law::LawBuilder, par::parser::Risdok, Error};
|
||||
|
||||
fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
}
|
||||
|
||||
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
|
||||
println!("{url}");
|
||||
info!("Parsing {url}");
|
||||
let xml = fetch_page(url)?;
|
||||
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
||||
//
|
||||
//
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">"#,
|
||||
// in § 17 (2)
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
|
||||
);
|
||||
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"</schlussteil></liste>"#,
|
||||
"</absatz>",
|
||||
);
|
||||
println!("{xml}");
|
||||
r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
); // 1. Verwertungsrechte. before § 14
|
||||
|
||||
let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
|
||||
debug!("{xml}");
|
||||
|
||||
let risdok = Risdok::from_str(&xml, builder)?;
|
||||
|
||||
println!("{builder:#?}");
|
||||
//println!("{risdok}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
use log::info;
|
||||
use roxmltree::Node;
|
||||
|
||||
use crate::{
|
||||
@ -86,7 +87,7 @@ impl Abschnitt {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
if Absatz::test(child) {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
@ -140,25 +141,30 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithList(
|
||||
absatz.content.clone(),
|
||||
liste.get_list(),
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else if Table::test(child) {
|
||||
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
|
||||
let table = Table::parse(c.next().unwrap());
|
||||
if let Some(child) = c.peek() {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
let after_absatz = Absatz::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithListAndText(
|
||||
absatz.content,
|
||||
table.get_list(),
|
||||
after_absatz.content,
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
Content::Text(after_absatz.content).into(),
|
||||
]))
|
||||
} else {
|
||||
absatze.push(Content::TextWithList(absatz.content, table.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
@ -177,8 +183,10 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(&child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
//TODO do something with list
|
||||
absatze.push(Content::TextWithList(abs.content, liste.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(abs.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else {
|
||||
absatze.push(Content::Text(abs.content));
|
||||
}
|
||||
@ -225,7 +233,7 @@ impl Abschnitt {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Symbol {
|
||||
stellen: String,
|
||||
content: String,
|
||||
@ -242,7 +250,7 @@ impl Symbol {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Listelem {
|
||||
symbol: Symbol,
|
||||
text: String,
|
||||
@ -267,36 +275,39 @@ impl Listelem {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Ziffernliste {
|
||||
ebene: String,
|
||||
listelems: Vec<Listelem>,
|
||||
}
|
||||
impl Ziffernliste {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
["ziffernliste", "aufzaehlung", "literaliste"].contains(&n.tag_name().name())
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(n.tag_name().name() == "ziffernliste");
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let ebene = n.attribute("ebene").unwrap().into();
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
let mut listelems = Vec::new();
|
||||
loop {
|
||||
match c.peek() {
|
||||
Some(child) => {
|
||||
if Listelem::test(child) {
|
||||
listelems.push(Listelem::parse(c.next().unwrap()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
break;
|
||||
|
||||
for child in n.children() {
|
||||
listelems.push(Listelem::parse(child));
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ebene, listelems }
|
||||
}
|
||||
|
||||
pub(crate) fn get_content(&self) -> Content {
|
||||
let mut elems = Vec::new();
|
||||
|
||||
for elem in &self.listelems {
|
||||
elems.push(Content::Text(format!("{} {}", elem.symbol.content, elem.text)).into());
|
||||
}
|
||||
|
||||
Content::List(elems)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@ -372,40 +383,69 @@ impl Table {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) struct Liste {
|
||||
ziffernliste: Ziffernliste,
|
||||
pub(crate) struct Schlussteil {
|
||||
content: String,
|
||||
}
|
||||
impl Liste {
|
||||
impl Schlussteil {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
(n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
|
||||
&& n.children().count() == 1
|
||||
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut c = n.children();
|
||||
let content = n.children().next().unwrap().text().unwrap().into(); //not sure
|
||||
|
||||
let ziffernliste = Ziffernliste::parse(c.next().unwrap());
|
||||
Self { content }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Liste {
|
||||
content: Vec<Content>,
|
||||
}
|
||||
impl Liste {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut content = Vec::new();
|
||||
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
|
||||
loop {
|
||||
if let Some(child) = c.peek() {
|
||||
if Ziffernliste::test(child) {
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
} else if Schlussteil::test(child) {
|
||||
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ziffernliste }
|
||||
Self { content }
|
||||
}
|
||||
|
||||
pub(crate) fn get_list(&self) -> Vec<Box<Content>> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
for a in &self.ziffernliste.listelems {
|
||||
ret.push(Box::new(Content::Text(format!(
|
||||
"{} {}",
|
||||
a.symbol.content,
|
||||
a.text.clone()
|
||||
))));
|
||||
}
|
||||
|
||||
ret
|
||||
pub(crate) fn get_content(&self) -> Content {
|
||||
Content::List(
|
||||
self.content
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|c| Box::new(c))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -546,6 +586,8 @@ impl Layoutdaten {
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use log::error;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
@ -557,7 +599,7 @@ mod tests {
|
||||
let mut builder = LawBuilder::test("no-headers");
|
||||
let risdok = Risdok::from_str(&xml, &mut builder);
|
||||
if risdok.is_err() {
|
||||
println!("{:#?}", risdok.as_ref().err());
|
||||
error!("{:#?}", risdok.as_ref().err());
|
||||
}
|
||||
assert!(risdok.is_ok());
|
||||
|
||||
|
Reference in New Issue
Block a user