parse first page of urhg

This commit is contained in:
2023-11-05 12:46:04 +01:00
parent eb6f3e8aba
commit 55517758c2
7 changed files with 349 additions and 92 deletions

View File

@ -1,3 +1,6 @@
use log::debug;
use std::sync::Arc;
use crate::overview;
// pub(crate) struct Law {
@ -15,7 +18,7 @@ use crate::overview;
// }
/// Is used to generate a law struct. It's organized mainly by classifier.
#[derive(Debug, PartialEq)]
#[derive(Debug)]
pub(crate) struct LawBuilder {
/// Name of the law
name: String, //ABGB, UrhG
@ -29,19 +32,35 @@ pub(crate) struct LawBuilder {
next_para_header: Option<String>,
}
/// Matching strategy for classifiers whose headers embed the classifier name.
///
/// Returns `true` when `classifier_name` occurs anywhere inside
/// `instance_name` (an empty `classifier_name` therefore matches everything).
fn contains(classifier_name: &str, instance_name: &str) -> bool {
    instance_name.find(classifier_name).is_some()
}
/// Matching strategy for headers that begin with a number.
///
/// Returns `true` when the first byte of `instance_name`, after trimming
/// surrounding whitespace, is an ASCII digit. The classifier name is ignored;
/// the parameter only exists so this function has the same signature as the
/// other matching strategies.
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
    // Looking at the first byte is enough: a multi-byte UTF-8 character never
    // starts with a byte in the ASCII digit range.
    matches!(
        instance_name.trim().as_bytes().first(),
        Some(byte) if byte.is_ascii_digit()
    )
}
impl LawBuilder {
#[cfg(test)]
pub(crate) fn test(name: &str) -> Self {
let mut classifiers = Vec::new();
if name == "UrhG" {
let hauptstueck = Classifier::new("Hauptstück");
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
classifiers.push(hauptstueck.clone());
let mut abschnitt = Classifier::new("Abschnitt");
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
abschnitt.set_parent(hauptstueck);
classifiers.push(abschnitt);
let mut numbered_header =
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
numbered_header.set_parent(abschnitt);
classifiers.push(numbered_header);
} else if name == "no-headers" {
let mut class = Classifier::new("");
let mut class = Classifier::new("", Arc::new(&contains));
class.add_instance(ClassifierInstance::new(""));
classifiers.push(class);
}
@ -54,19 +73,24 @@ impl LawBuilder {
}
/// Creates a new law builder. Adds classifier for known law texts.
pub(crate) fn new(name: &str) {
pub(crate) fn new(name: &str) -> LawBuilder {
//TODO: return Law (not LawBuilder)
let mut classifiers = Vec::new();
let mut law_id = None;
if name == "UrhG" {
law_id = Some(10001848);
let hauptstueck = Classifier::new("Hauptstück");
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
classifiers.push(hauptstueck.clone());
let mut abschnitt = Classifier::new("Abschnitt");
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
abschnitt.set_parent(hauptstueck);
classifiers.push(abschnitt);
classifiers.push(abschnitt.clone());
let mut numbered_header =
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
numbered_header.set_parent(abschnitt);
classifiers.push(numbered_header);
}
let mut builder = Self {
@ -77,11 +101,13 @@ impl LawBuilder {
};
overview::parse(law_id.unwrap(), &mut builder).unwrap();
builder
}
/// Sets a new header.
pub(crate) fn new_header(&mut self, name: &str) {
println!("new_header={name}");
debug!("new_header={name}");
let classifier_index = self
.classifiers
.iter()
@ -98,7 +124,7 @@ impl LawBuilder {
/// Sets a new description for the last classifier.
pub(crate) fn new_desc(&mut self, desc: &str) {
println!("new_desc={desc}");
debug!("new_desc={desc}");
if let Some(index) = self.last_header_index {
self.classifiers[index].set_desc(desc);
} else {
@ -108,10 +134,15 @@ impl LawBuilder {
/// Adds a new paragraph.
pub(crate) fn new_par(&mut self, par: String, content: Content) {
println!("new_par=par:{par};content:{content:#?}");
if let Some(class) = self.classifiers.last_mut() {
let section = Section { symb: par, content };
class.add_section(section);
debug!("new_par=par:{par};content:{content:#?}");
if let Some(index) = self.last_header_index {
let section = Section {
symb: par,
content,
par_header: self.next_para_header.clone(),
};
self.next_para_header = None;
self.classifiers[index].add_section(section);
} else {
panic!("Expected at least one classifier");
}
@ -119,14 +150,23 @@ impl LawBuilder {
/// Next paragraph has a header, store its name.
pub(crate) fn new_next_para_header(&mut self, header: &str) {
println!("new_next_para_header={header}");
debug!("new_next_para_header={header}");
self.next_para_header = Some(header.into());
}
/// Prints the name of every classifier instance to stdout, one per line.
///
/// Classifiers are visited in the order they are stored, and within each
/// classifier its instances are printed in their stored order.
pub(crate) fn toc(&self) {
    self.classifiers
        .iter()
        .flat_map(|classifier| classifier.instances.iter())
        .for_each(|instance| println!("{}", instance.name));
}
}
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, Clone)]
pub(crate) struct Section {
symb: String, // §"1", §"2", ...
par_header: Option<String>,
content: Content,
//header: Option<Header>,
}
@ -152,7 +192,7 @@ impl Header {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub(crate) struct ClassifierInstance {
name: String,
desc: Option<String>,
@ -177,19 +217,31 @@ impl ClassifierInstance {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone)]
pub(crate) struct Classifier {
name: String, // Hauptstück, Theil, Abschnitt, ol
parent: Option<Box<Classifier>>,
instances: Vec<ClassifierInstance>,
used_for_fn: Arc<dyn Fn(&str, &str) -> bool>,
}
/// Hand-written `Debug` output for `Classifier`.
///
/// Only `name`, `parent` and `instances` are shown; the `used_for_fn`
/// callback is left out of the output.
impl std::fmt::Debug for Classifier {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("Classifier");
        out.field("name", &self.name);
        out.field("parent", &self.parent);
        out.field("instances", &self.instances);
        out.finish()
    }
}
impl Classifier {
fn new(name: &str) -> Self {
fn new(name: &str, used_for_fn: Arc<dyn Fn(&str, &str) -> bool>) -> Self {
Self {
name: name.into(),
parent: None,
instances: Vec::new(),
used_for_fn,
}
}
@ -210,7 +262,7 @@ impl Classifier {
}
fn used_for(&self, name: &str) -> bool {
name.contains(&self.name)
(self.used_for_fn)(&self.name, name)
}
fn add_section(&mut self, section: Section) {
@ -218,13 +270,11 @@ impl Classifier {
}
}
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug)]
pub(crate) enum Content {
Text(String), //This is my direct law text
TextWithList(String, Vec<Box<Content>>),
Text(String), //This is my direct law text
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
List(Vec<Box<Content>>),
TextWithListAndText(String, Vec<Box<Content>>, String), //1. my first item
}
#[cfg(test)]

View File

@ -41,5 +41,8 @@ impl From<roxmltree::Error> for Error {
}
fn main() {
LawBuilder::new("UrhG");
env_logger::init();
let builder = LawBuilder::new("UrhG");
println!("{:#?}", builder.toc());
}

View File

@ -187,6 +187,8 @@ pub(crate) struct ContentUrlItem {
mod tests {
use std::{fs::File, io::Read};
use log::debug;
use super::*;
#[derive(Deserialize)]
@ -204,7 +206,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}");
debug!("{dbg:#?}");
}
assert!(wrapper.is_ok());
@ -219,7 +221,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}");
debug!("{dbg:#?}");
}
assert!(wrapper.is_ok());
@ -234,7 +236,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}");
debug!("{dbg:#?}");
}
assert!(wrapper.is_ok());

View File

@ -1,5 +1,7 @@
mod parser;
use log::{debug, info};
use crate::{law::LawBuilder, par::parser::Risdok, Error};
fn fetch_page(url: &str) -> Result<String, Error> {
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
}
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
println!("{url}");
info!("Parsing {url}");
let xml = fetch_page(url)?;
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
//
//
let xml = xml.replace(
// e.g. in § 17 (2) TODO: check that this onyl happens here
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#,
r#"<absatz typ="abs" ct="text" halign="j">"#,
// in § 17 (2)
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
);
let xml = xml.replace(
// e.g. in § 17 (2) TODO: check that this onyl happens here
r#"</schlussteil></liste>"#,
"</absatz>",
);
println!("{xml}");
r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
); // 1. Verwertungsrechte. before § 14
let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
debug!("{xml}");
let risdok = Risdok::from_str(&xml, builder)?;
println!("{builder:#?}");
//println!("{risdok}");
Ok(())
}

View File

@ -1,3 +1,4 @@
use log::info;
use roxmltree::Node;
use crate::{
@ -86,7 +87,7 @@ impl Abschnitt {
c.next();
continue;
}
if Absatz::test(child) {
if Absatz::test_with_typ(child, "erltext") {
c.next();
continue;
}
@ -140,25 +141,30 @@ impl Abschnitt {
if let Some(child) = c.peek() {
if Liste::test(child) {
let liste = Liste::parse(c.next().unwrap());
absatze.push(Content::TextWithList(
absatz.content.clone(),
liste.get_list(),
))
absatze.push(Content::List(vec![
Content::Text(absatz.content).into(),
liste.get_content().into(),
]));
} else if Table::test(child) {
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
let table = Table::parse(c.next().unwrap());
if let Some(child) = c.peek() {
if Absatz::test_with_typ(child, "erltext") {
let after_absatz = Absatz::parse(c.next().unwrap());
absatze.push(Content::TextWithListAndText(
absatz.content,
table.get_list(),
after_absatz.content,
))
absatze.push(Content::List(vec![
Content::Text(absatz.content).into(),
Content::List(table.get_list()).into(),
Content::Text(after_absatz.content).into(),
]))
} else {
absatze.push(Content::TextWithList(absatz.content, table.get_list()))
absatze.push(Content::List(vec![
Content::Text(absatz.content).into(),
Content::List(table.get_list()).into(),
]));
}
}
} else {
absatze.push(Content::Text(absatz.content.clone()));
}
} else {
absatze.push(Content::Text(absatz.content.clone()));
@ -177,8 +183,10 @@ impl Abschnitt {
if let Some(child) = c.peek() {
if Liste::test(&child) {
let liste = Liste::parse(c.next().unwrap());
//TODO do something with list
absatze.push(Content::TextWithList(abs.content, liste.get_list()))
absatze.push(Content::List(vec![
Content::Text(abs.content).into(),
liste.get_content().into(),
]));
} else {
absatze.push(Content::Text(abs.content));
}
@ -225,7 +233,7 @@ impl Abschnitt {
}
}
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Symbol {
stellen: String,
content: String,
@ -242,7 +250,7 @@ impl Symbol {
}
}
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Listelem {
symbol: Symbol,
text: String,
@ -267,36 +275,39 @@ impl Listelem {
}
}
#[derive(Debug, PartialEq)]
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Ziffernliste {
ebene: String,
listelems: Vec<Listelem>,
}
impl Ziffernliste {
/// Returns `true` if the node's tag name is one of the list kinds handled
/// here: `ziffernliste`, `aufzaehlung` or `literaliste`.
pub(crate) fn test(n: &Node) -> bool {
    matches!(
        n.tag_name().name(),
        "ziffernliste" | "aufzaehlung" | "literaliste"
    )
}
pub(crate) fn parse(n: Node) -> Self {
assert!(n.tag_name().name() == "ziffernliste");
assert!(Self::test(&n));
let ebene = n.attribute("ebene").unwrap().into();
let mut c = n.children().peekable();
let mut listelems = Vec::new();
loop {
match c.peek() {
Some(child) => {
if Listelem::test(child) {
listelems.push(Listelem::parse(c.next().unwrap()));
continue;
}
}
None => break,
}
break;
for child in n.children() {
listelems.push(Listelem::parse(child));
}
assert_eq!(c.next(), None);
Self { ebene, listelems }
}
/// Converts this list into a `Content::List`.
///
/// Every list element becomes one `Content::Text` entry of the form
/// `"<symbol content> <element text>"`, in the original element order.
pub(crate) fn get_content(&self) -> Content {
    Content::List(
        self.listelems
            .iter()
            .map(|item| {
                Box::new(Content::Text(format!(
                    "{} {}",
                    item.symbol.content, item.text
                )))
            })
            .collect(),
    )
}
}
#[derive(Debug, PartialEq)]
@ -372,40 +383,69 @@ impl Table {
}
#[derive(Debug, PartialEq)]
pub(crate) struct Liste {
ziffernliste: Ziffernliste,
pub(crate) struct Schlussteil {
content: String,
}
impl Liste {
impl Schlussteil {
pub(crate) fn test(n: &Node) -> bool {
n.tag_name().name() == "liste"
(n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
&& n.children().count() == 1
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
}
pub(crate) fn parse(n: Node) -> Self {
assert!(Self::test(&n));
let mut c = n.children();
let content = n.children().next().unwrap().text().unwrap().into(); //not sure
let ziffernliste = Ziffernliste::parse(c.next().unwrap());
Self { content }
}
}
#[derive(Debug)]
pub(crate) struct Liste {
content: Vec<Content>,
}
impl Liste {
/// Returns `true` if the node's tag name is `liste`.
pub(crate) fn test(n: &Node) -> bool {
    let tag = n.tag_name().name();
    tag == "liste"
}
pub(crate) fn parse(n: Node) -> Self {
assert!(Self::test(&n));
let mut content = Vec::new();
let mut c = n.children().peekable();
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
loop {
if let Some(child) = c.peek() {
if Ziffernliste::test(child) {
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
} else if Schlussteil::test(child) {
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
} else {
break;
}
} else {
break;
}
}
assert_eq!(c.next(), None);
Self { ziffernliste }
Self { content }
}
pub(crate) fn get_list(&self) -> Vec<Box<Content>> {
let mut ret = Vec::new();
for a in &self.ziffernliste.listelems {
ret.push(Box::new(Content::Text(format!(
"{} {}",
a.symbol.content,
a.text.clone()
))));
}
ret
/// Returns the parsed entries of this list wrapped in a `Content::List`.
///
/// Each stored entry is cloned and boxed; `self` is left untouched and the
/// original order is preserved.
pub(crate) fn get_content(&self) -> Content {
    // Clone element by element while boxing instead of cloning the whole
    // vector first and then wrapping every entry through a closure.
    Content::List(self.content.iter().cloned().map(Box::new).collect())
}
}
@ -546,6 +586,8 @@ impl Layoutdaten {
mod tests {
use std::{fs::File, io::Read};
use log::error;
use super::*;
#[test]
@ -557,7 +599,7 @@ mod tests {
let mut builder = LawBuilder::test("no-headers");
let risdok = Risdok::from_str(&xml, &mut builder);
if risdok.is_err() {
println!("{:#?}", risdok.as_ref().err());
error!("{:#?}", risdok.as_ref().err());
}
assert!(risdok.is_ok());