parse first page of urhg

This commit is contained in:
philipp 2023-11-05 12:46:04 +01:00
parent eb6f3e8aba
commit 55517758c2
7 changed files with 349 additions and 92 deletions

157
Cargo.lock generated
View File

@ -8,12 +8,27 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
[[package]]
name = "aho-corasick"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "base64" name = "base64"
version = "0.21.5" version = "0.21.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
[[package]]
name = "bitflags"
version = "2.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]] [[package]]
name = "cc" name = "cc"
version = "1.0.83" version = "1.0.83"
@ -53,6 +68,29 @@ version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
[[package]]
name = "env_logger"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
dependencies = [
"humantime",
"is-terminal",
"log",
"regex",
"termcolor",
]
[[package]]
name = "errno"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
dependencies = [
"libc",
"windows-sys",
]
[[package]] [[package]]
name = "flate2" name = "flate2"
version = "1.0.28" version = "1.0.28"
@ -83,6 +121,18 @@ dependencies = [
"wasi", "wasi",
] ]
[[package]]
name = "hermit-abi"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
[[package]]
name = "humantime"
version = "2.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
[[package]] [[package]]
name = "idna" name = "idna"
version = "0.4.0" version = "0.4.0"
@ -93,6 +143,17 @@ dependencies = [
"unicode-normalization", "unicode-normalization",
] ]
[[package]]
name = "is-terminal"
version = "0.4.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
dependencies = [
"hermit-abi",
"rustix",
"windows-sys",
]
[[package]] [[package]]
name = "itoa" name = "itoa"
version = "1.0.9" version = "1.0.9"
@ -105,12 +166,24 @@ version = "0.2.149"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
[[package]]
name = "linux-raw-sys"
version = "0.4.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.20" version = "0.4.20"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
[[package]]
name = "memchr"
version = "2.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
[[package]] [[package]]
name = "miniz_oxide" name = "miniz_oxide"
version = "0.7.1" version = "0.7.1"
@ -166,6 +239,35 @@ dependencies = [
"proc-macro2", "proc-macro2",
] ]
[[package]]
name = "regex"
version = "1.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
dependencies = [
"aho-corasick",
"memchr",
"regex-automata",
"regex-syntax",
]
[[package]]
name = "regex-automata"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
dependencies = [
"aho-corasick",
"memchr",
"regex-syntax",
]
[[package]]
name = "regex-syntax"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
[[package]] [[package]]
name = "ring" name = "ring"
version = "0.17.5" version = "0.17.5"
@ -184,6 +286,8 @@ dependencies = [
name = "risp" name = "risp"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"env_logger",
"log",
"pretty_assertions", "pretty_assertions",
"roxmltree", "roxmltree",
"serde", "serde",
@ -201,6 +305,19 @@ dependencies = [
"xmlparser", "xmlparser",
] ]
[[package]]
name = "rustix"
version = "0.38.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys",
]
[[package]] [[package]]
name = "rustls" name = "rustls"
version = "0.21.8" version = "0.21.8"
@ -287,6 +404,15 @@ dependencies = [
"unicode-ident", "unicode-ident",
] ]
[[package]]
name = "termcolor"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6093bad37da69aab9d123a8091e4be0aa4a03e4d601ec641c327398315f62b64"
dependencies = [
"winapi-util",
]
[[package]] [[package]]
name = "time" name = "time"
version = "0.3.30" version = "0.3.30"
@ -397,6 +523,37 @@ version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc" checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.48.0" version = "0.48.0"

View File

@ -11,6 +11,8 @@ time = { version = "0.3", features = [ "formatting" ] }
serde = { version = "1.0", features = [ "derive" ] } serde = { version = "1.0", features = [ "derive" ] }
serde_json = "1.0" serde_json = "1.0"
roxmltree = "0.18" roxmltree = "0.18"
env_logger = "0.10"
log = "0.4"
[dev-dependencies] [dev-dependencies]
pretty_assertions = "1.4" pretty_assertions = "1.4"

View File

@ -1,3 +1,6 @@
use log::debug;
use std::sync::Arc;
use crate::overview; use crate::overview;
// pub(crate) struct Law { // pub(crate) struct Law {
@ -15,7 +18,7 @@ use crate::overview;
// } // }
/// Is used to generate a law struct. It's organized mainly by classifier. /// Is used to generate a law struct. It's organized mainly by classifier.
#[derive(Debug, PartialEq)] #[derive(Debug)]
pub(crate) struct LawBuilder { pub(crate) struct LawBuilder {
/// Name of the law /// Name of the law
name: String, //ABGB, UrhG name: String, //ABGB, UrhG
@ -29,19 +32,35 @@ pub(crate) struct LawBuilder {
next_para_header: Option<String>, next_para_header: Option<String>,
} }
/// Classifier predicate: returns `true` when `instance_name` contains
/// `classifier_name` as a substring.
///
/// An empty `classifier_name` matches every `instance_name`.
fn contains(classifier_name: &str, instance_name: &str) -> bool {
    let (haystack, needle) = (instance_name, classifier_name);
    haystack.contains(needle)
}
/// Classifier predicate: returns `true` when `instance_name`, ignoring leading
/// whitespace, begins with an ASCII digit (`0`-`9`).
///
/// `_classifier_name` is not consulted; the parameter is kept so this function has
/// the same `(&str, &str) -> bool` shape as the other classifier predicates.
/// Empty and whitespace-only names yield `false`.
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
    instance_name
        .trim_start()
        .starts_with(|c: char| c.is_ascii_digit())
}
impl LawBuilder { impl LawBuilder {
#[cfg(test)] #[cfg(test)]
pub(crate) fn test(name: &str) -> Self { pub(crate) fn test(name: &str) -> Self {
let mut classifiers = Vec::new(); let mut classifiers = Vec::new();
if name == "UrhG" { if name == "UrhG" {
let hauptstueck = Classifier::new("Hauptstück"); let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
classifiers.push(hauptstueck.clone()); classifiers.push(hauptstueck.clone());
let mut abschnitt = Classifier::new("Abschnitt"); let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
abschnitt.set_parent(hauptstueck); abschnitt.set_parent(hauptstueck);
classifiers.push(abschnitt); classifiers.push(abschnitt);
let mut numbered_header =
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
numbered_header.set_parent(abschnitt);
classifiers.push(numbered_header);
} else if name == "no-headers" { } else if name == "no-headers" {
let mut class = Classifier::new(""); let mut class = Classifier::new("", Arc::new(&contains));
class.add_instance(ClassifierInstance::new("")); class.add_instance(ClassifierInstance::new(""));
classifiers.push(class); classifiers.push(class);
} }
@ -54,19 +73,24 @@ impl LawBuilder {
} }
/// Creates a new law builder. Adds classifier for known law texts. /// Creates a new law builder. Adds classifier for known law texts.
pub(crate) fn new(name: &str) { pub(crate) fn new(name: &str) -> LawBuilder {
//TODO: return Law (not LawBuilder) //TODO: return Law (not LawBuilder)
let mut classifiers = Vec::new(); let mut classifiers = Vec::new();
let mut law_id = None; let mut law_id = None;
if name == "UrhG" { if name == "UrhG" {
law_id = Some(10001848); law_id = Some(10001848);
let hauptstueck = Classifier::new("Hauptstück"); let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
classifiers.push(hauptstueck.clone()); classifiers.push(hauptstueck.clone());
let mut abschnitt = Classifier::new("Abschnitt"); let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
abschnitt.set_parent(hauptstueck); abschnitt.set_parent(hauptstueck);
classifiers.push(abschnitt); classifiers.push(abschnitt.clone());
let mut numbered_header =
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
numbered_header.set_parent(abschnitt);
classifiers.push(numbered_header);
} }
let mut builder = Self { let mut builder = Self {
@ -77,11 +101,13 @@ impl LawBuilder {
}; };
overview::parse(law_id.unwrap(), &mut builder).unwrap(); overview::parse(law_id.unwrap(), &mut builder).unwrap();
builder
} }
/// Sets a new header. /// Sets a new header.
pub(crate) fn new_header(&mut self, name: &str) { pub(crate) fn new_header(&mut self, name: &str) {
println!("new_header={name}"); debug!("new_header={name}");
let classifier_index = self let classifier_index = self
.classifiers .classifiers
.iter() .iter()
@ -98,7 +124,7 @@ impl LawBuilder {
/// Sets a new description for the last classifier. /// Sets a new description for the last classifier.
pub(crate) fn new_desc(&mut self, desc: &str) { pub(crate) fn new_desc(&mut self, desc: &str) {
println!("new_desc={desc}"); debug!("new_desc={desc}");
if let Some(index) = self.last_header_index { if let Some(index) = self.last_header_index {
self.classifiers[index].set_desc(desc); self.classifiers[index].set_desc(desc);
} else { } else {
@ -108,10 +134,15 @@ impl LawBuilder {
/// Adds a new paragraph. /// Adds a new paragraph.
pub(crate) fn new_par(&mut self, par: String, content: Content) { pub(crate) fn new_par(&mut self, par: String, content: Content) {
println!("new_par=par:{par};content:{content:#?}"); debug!("new_par=par:{par};content:{content:#?}");
if let Some(class) = self.classifiers.last_mut() { if let Some(index) = self.last_header_index {
let section = Section { symb: par, content }; let section = Section {
class.add_section(section); symb: par,
content,
par_header: self.next_para_header.clone(),
};
self.next_para_header = None;
self.classifiers[index].add_section(section);
} else { } else {
panic!("Expected at least one classifier"); panic!("Expected at least one classifier");
} }
@ -119,14 +150,23 @@ impl LawBuilder {
/// Next paragraph has a header, store its name. /// Next paragraph has a header, store its name.
pub(crate) fn new_next_para_header(&mut self, header: &str) { pub(crate) fn new_next_para_header(&mut self, header: &str) {
println!("new_next_para_header={header}"); debug!("new_next_para_header={header}");
self.next_para_header = Some(header.into()); self.next_para_header = Some(header.into());
} }
/// Prints the name of every classifier instance to stdout, one per line,
/// visiting the classifiers and their instances in stored order.
pub(crate) fn toc(&self) {
    self.classifiers
        .iter()
        .flat_map(|class| class.instances.iter())
        .for_each(|inst| println!("{}", inst.name));
}
} }
#[derive(Debug, PartialEq, Clone)] #[derive(Debug, Clone)]
pub(crate) struct Section { pub(crate) struct Section {
symb: String, // §"1", §"2", ... symb: String, // §"1", §"2", ...
par_header: Option<String>,
content: Content, content: Content,
//header: Option<Header>, //header: Option<Header>,
} }
@ -152,7 +192,7 @@ impl Header {
} }
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug)]
pub(crate) struct ClassifierInstance { pub(crate) struct ClassifierInstance {
name: String, name: String,
desc: Option<String>, desc: Option<String>,
@ -177,19 +217,31 @@ impl ClassifierInstance {
} }
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone)]
pub(crate) struct Classifier { pub(crate) struct Classifier {
name: String, // Hauptstück, Theil, Abschnitt, ol name: String, // Hauptstück, Theil, Abschnitt, ol
parent: Option<Box<Classifier>>, parent: Option<Box<Classifier>>,
instances: Vec<ClassifierInstance>, instances: Vec<ClassifierInstance>,
used_for_fn: Arc<dyn Fn(&str, &str) -> bool>,
}
impl std::fmt::Debug for Classifier {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Classifier")
.field("name", &self.name)
.field("parent", &self.parent)
.field("instances", &self.instances)
.finish()
}
} }
impl Classifier { impl Classifier {
fn new(name: &str) -> Self { fn new(name: &str, used_for_fn: Arc<dyn Fn(&str, &str) -> bool>) -> Self {
Self { Self {
name: name.into(), name: name.into(),
parent: None, parent: None,
instances: Vec::new(), instances: Vec::new(),
used_for_fn,
} }
} }
@ -210,7 +262,7 @@ impl Classifier {
} }
fn used_for(&self, name: &str) -> bool { fn used_for(&self, name: &str) -> bool {
name.contains(&self.name) (self.used_for_fn)(&self.name, name)
} }
fn add_section(&mut self, section: Section) { fn add_section(&mut self, section: Section) {
@ -218,13 +270,11 @@ impl Classifier {
} }
} }
#[derive(Clone, Debug, PartialEq)] #[derive(Clone, Debug)]
pub(crate) enum Content { pub(crate) enum Content {
Text(String), //This is my direct law text Text(String), //This is my direct law text
TextWithList(String, Vec<Box<Content>>),
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
List(Vec<Box<Content>>), List(Vec<Box<Content>>),
TextWithListAndText(String, Vec<Box<Content>>, String), //1. my first item
} }
#[cfg(test)] #[cfg(test)]

View File

@ -41,5 +41,8 @@ impl From<roxmltree::Error> for Error {
} }
fn main() { fn main() {
LawBuilder::new("UrhG"); env_logger::init();
let builder = LawBuilder::new("UrhG");
println!("{:#?}", builder.toc());
} }

View File

@ -187,6 +187,8 @@ pub(crate) struct ContentUrlItem {
mod tests { mod tests {
use std::{fs::File, io::Read}; use std::{fs::File, io::Read};
use log::debug;
use super::*; use super::*;
#[derive(Deserialize)] #[derive(Deserialize)]
@ -204,7 +206,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json); let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() { if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap(); let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}"); debug!("{dbg:#?}");
} }
assert!(wrapper.is_ok()); assert!(wrapper.is_ok());
@ -219,7 +221,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json); let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() { if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap(); let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}"); debug!("{dbg:#?}");
} }
assert!(wrapper.is_ok()); assert!(wrapper.is_ok());
@ -234,7 +236,7 @@ mod tests {
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json); let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
if wrapper.is_err() { if wrapper.is_err() {
let dbg = wrapper.as_ref().err().unwrap(); let dbg = wrapper.as_ref().err().unwrap();
println!("{dbg:#?}"); debug!("{dbg:#?}");
} }
assert!(wrapper.is_ok()); assert!(wrapper.is_ok());

View File

@ -1,5 +1,7 @@
mod parser; mod parser;
use log::{debug, info};
use crate::{law::LawBuilder, par::parser::Risdok, Error}; use crate::{law::LawBuilder, par::parser::Risdok, Error};
fn fetch_page(url: &str) -> Result<String, Error> { fn fetch_page(url: &str) -> Result<String, Error> {
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
} }
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> { pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
println!("{url}"); info!("Parsing {url}");
let xml = fetch_page(url)?; let xml = fetch_page(url)?;
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
// //
// //
let xml = xml.replace( let xml = xml.replace(
// e.g. in § 17 (2) TODO: check that this onyl happens here // in § 17 (2)
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#, r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
r#"<absatz typ="abs" ct="text" halign="j">"#, r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
); );
let xml = xml.replace( let xml = xml.replace(
// e.g. in § 17 (2) TODO: check that this onyl happens here r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
r#"</schlussteil></liste>"#, r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
"</absatz>", ); // 1. Verwertungsrechte. before § 14
);
println!("{xml}"); let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
debug!("{xml}");
let risdok = Risdok::from_str(&xml, builder)?; let risdok = Risdok::from_str(&xml, builder)?;
println!("{builder:#?}");
//println!("{risdok}");
Ok(()) Ok(())
} }

View File

@ -1,3 +1,4 @@
use log::info;
use roxmltree::Node; use roxmltree::Node;
use crate::{ use crate::{
@ -86,7 +87,7 @@ impl Abschnitt {
c.next(); c.next();
continue; continue;
} }
if Absatz::test(child) { if Absatz::test_with_typ(child, "erltext") {
c.next(); c.next();
continue; continue;
} }
@ -140,25 +141,30 @@ impl Abschnitt {
if let Some(child) = c.peek() { if let Some(child) = c.peek() {
if Liste::test(child) { if Liste::test(child) {
let liste = Liste::parse(c.next().unwrap()); let liste = Liste::parse(c.next().unwrap());
absatze.push(Content::TextWithList( absatze.push(Content::List(vec![
absatz.content.clone(), Content::Text(absatz.content).into(),
liste.get_list(), liste.get_content().into(),
)) ]));
} else if Table::test(child) { } else if Table::test(child) {
// If there's a "table" after an "absatz", the "table" should be part of the "absatz" // If there's a "table" after an "absatz", the "table" should be part of the "absatz"
let table = Table::parse(c.next().unwrap()); let table = Table::parse(c.next().unwrap());
if let Some(child) = c.peek() { if let Some(child) = c.peek() {
if Absatz::test_with_typ(child, "erltext") { if Absatz::test_with_typ(child, "erltext") {
let after_absatz = Absatz::parse(c.next().unwrap()); let after_absatz = Absatz::parse(c.next().unwrap());
absatze.push(Content::TextWithListAndText( absatze.push(Content::List(vec![
absatz.content, Content::Text(absatz.content).into(),
table.get_list(), Content::List(table.get_list()).into(),
after_absatz.content, Content::Text(after_absatz.content).into(),
)) ]))
} else { } else {
absatze.push(Content::TextWithList(absatz.content, table.get_list())) absatze.push(Content::List(vec![
Content::Text(absatz.content).into(),
Content::List(table.get_list()).into(),
]));
} }
} }
} else {
absatze.push(Content::Text(absatz.content.clone()));
} }
} else { } else {
absatze.push(Content::Text(absatz.content.clone())); absatze.push(Content::Text(absatz.content.clone()));
@ -177,8 +183,10 @@ impl Abschnitt {
if let Some(child) = c.peek() { if let Some(child) = c.peek() {
if Liste::test(&child) { if Liste::test(&child) {
let liste = Liste::parse(c.next().unwrap()); let liste = Liste::parse(c.next().unwrap());
//TODO do something with list absatze.push(Content::List(vec![
absatze.push(Content::TextWithList(abs.content, liste.get_list())) Content::Text(abs.content).into(),
liste.get_content().into(),
]));
} else { } else {
absatze.push(Content::Text(abs.content)); absatze.push(Content::Text(abs.content));
} }
@ -225,7 +233,7 @@ impl Abschnitt {
} }
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Clone)]
pub(crate) struct Symbol { pub(crate) struct Symbol {
stellen: String, stellen: String,
content: String, content: String,
@ -242,7 +250,7 @@ impl Symbol {
} }
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Clone)]
pub(crate) struct Listelem { pub(crate) struct Listelem {
symbol: Symbol, symbol: Symbol,
text: String, text: String,
@ -267,36 +275,39 @@ impl Listelem {
} }
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq, Clone)]
pub(crate) struct Ziffernliste { pub(crate) struct Ziffernliste {
ebene: String, ebene: String,
listelems: Vec<Listelem>, listelems: Vec<Listelem>,
} }
impl Ziffernliste { impl Ziffernliste {
/// Returns `true` when the node's tag name is one of `ziffernliste`,
/// `aufzaehlung` or `literaliste`.
pub(crate) fn test(n: &Node) -> bool {
    matches!(
        n.tag_name().name(),
        "ziffernliste" | "aufzaehlung" | "literaliste"
    )
}
pub(crate) fn parse(n: Node) -> Self { pub(crate) fn parse(n: Node) -> Self {
assert!(n.tag_name().name() == "ziffernliste"); assert!(Self::test(&n));
let ebene = n.attribute("ebene").unwrap().into(); let ebene = n.attribute("ebene").unwrap().into();
let mut c = n.children().peekable();
let mut listelems = Vec::new(); let mut listelems = Vec::new();
loop {
match c.peek() {
Some(child) => {
if Listelem::test(child) {
listelems.push(Listelem::parse(c.next().unwrap()));
continue;
}
}
None => break,
}
break;
}
assert_eq!(c.next(), None); for child in n.children() {
listelems.push(Listelem::parse(child));
}
Self { ebene, listelems } Self { ebene, listelems }
} }
/// Builds a `Content::List` holding one `Content::Text` per list element, each
/// formatted as the element's symbol content, a single space, then the element's text.
pub(crate) fn get_content(&self) -> Content {
    let items = self
        .listelems
        .iter()
        .map(|item| {
            Box::new(Content::Text(format!(
                "{} {}",
                item.symbol.content, item.text
            )))
        })
        .collect();
    Content::List(items)
}
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
@ -372,40 +383,69 @@ impl Table {
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub(crate) struct Liste { pub(crate) struct Schlussteil {
ziffernliste: Ziffernliste, content: String,
} }
impl Liste { impl Schlussteil {
pub(crate) fn test(n: &Node) -> bool { pub(crate) fn test(n: &Node) -> bool {
n.tag_name().name() == "liste" (n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
&& n.children().count() == 1 && n.children().count() == 1
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
} }
pub(crate) fn parse(n: Node) -> Self { pub(crate) fn parse(n: Node) -> Self {
assert!(Self::test(&n)); assert!(Self::test(&n));
let mut c = n.children(); let content = n.children().next().unwrap().text().unwrap().into(); //not sure
let ziffernliste = Ziffernliste::parse(c.next().unwrap()); Self { content }
}
}
#[derive(Debug)]
pub(crate) struct Liste {
content: Vec<Content>,
}
impl Liste {
pub(crate) fn test(n: &Node) -> bool {
n.tag_name().name() == "liste"
}
pub(crate) fn parse(n: Node) -> Self {
assert!(Self::test(&n));
let mut content = Vec::new();
let mut c = n.children().peekable();
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
loop {
if let Some(child) = c.peek() {
if Ziffernliste::test(child) {
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
} else if Schlussteil::test(child) {
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
} else {
break;
}
} else {
break;
}
}
assert_eq!(c.next(), None); assert_eq!(c.next(), None);
Self { ziffernliste } Self { content }
} }
pub(crate) fn get_list(&self) -> Vec<Box<Content>> { pub(crate) fn get_content(&self) -> Content {
let mut ret = Vec::new(); Content::List(
self.content
for a in &self.ziffernliste.listelems { .clone()
ret.push(Box::new(Content::Text(format!( .into_iter()
"{} {}", .map(|c| Box::new(c))
a.symbol.content, .collect(),
a.text.clone() )
))));
}
ret
} }
} }
@ -546,6 +586,8 @@ impl Layoutdaten {
mod tests { mod tests {
use std::{fs::File, io::Read}; use std::{fs::File, io::Read};
use log::error;
use super::*; use super::*;
#[test] #[test]
@ -557,7 +599,7 @@ mod tests {
let mut builder = LawBuilder::test("no-headers"); let mut builder = LawBuilder::test("no-headers");
let risdok = Risdok::from_str(&xml, &mut builder); let risdok = Risdok::from_str(&xml, &mut builder);
if risdok.is_err() { if risdok.is_err() {
println!("{:#?}", risdok.as_ref().err()); error!("{:#?}", risdok.as_ref().err());
} }
assert!(risdok.is_ok()); assert!(risdok.is_ok());