parse first page of urhg
This commit is contained in:
parent
eb6f3e8aba
commit
55517758c2
157
Cargo.lock
generated
157
Cargo.lock
generated
@ -8,12 +8,27 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "base64"
|
||||
version = "0.21.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.0.83"
|
||||
@ -53,6 +68,29 @@ version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
|
||||
|
||||
[[package]]
|
||||
name = "env_logger"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||
dependencies = [
|
||||
"humantime",
|
||||
"is-terminal",
|
||||
"log",
|
||||
"regex",
|
||||
"termcolor",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.0.28"
|
||||
@ -83,6 +121,18 @@ dependencies = [
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
||||
|
||||
[[package]]
|
||||
name = "humantime"
|
||||
version = "2.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "0.4.0"
|
||||
@ -93,6 +143,17 @@ dependencies = [
|
||||
"unicode-normalization",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
|
||||
dependencies = [
|
||||
"hermit-abi",
|
||||
"rustix",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.9"
|
||||
@ -105,12 +166,24 @@ version = "0.2.149"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.4.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.7.1"
|
||||
@ -166,6 +239,35 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex"
|
||||
version = "1.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.5"
|
||||
@ -184,6 +286,8 @@ dependencies = [
|
||||
name = "risp"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"env_logger",
|
||||
"log",
|
||||
"pretty_assertions",
|
||||
"roxmltree",
|
||||
"serde",
|
||||
@ -201,6 +305,19 @@ dependencies = [
|
||||
"xmlparser",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "0.38.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.21.8"
|
||||
@ -287,6 +404,15 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6093bad37da69aab9d123a8091e4be0aa4a03e4d601ec641c327398315f62b64"
|
||||
dependencies = [
|
||||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.30"
|
||||
@ -397,6 +523,37 @@ version = "0.25.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||
dependencies = [
|
||||
"winapi-i686-pc-windows-gnu",
|
||||
"winapi-x86_64-pc-windows-gnu",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-i686-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||
|
||||
[[package]]
|
||||
name = "winapi-util"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
|
||||
dependencies = [
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi-x86_64-pc-windows-gnu"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.48.0"
|
||||
|
@ -11,6 +11,8 @@ time = { version = "0.3", features = [ "formatting" ] }
|
||||
serde = { version = "1.0", features = [ "derive" ] }
|
||||
serde_json = "1.0"
|
||||
roxmltree = "0.18"
|
||||
env_logger = "0.10"
|
||||
log = "0.4"
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
|
98
src/law.rs
98
src/law.rs
@ -1,3 +1,6 @@
|
||||
use log::debug;
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::overview;
|
||||
|
||||
// pub(crate) struct Law {
|
||||
@ -15,7 +18,7 @@ use crate::overview;
|
||||
// }
|
||||
|
||||
/// Is used to generate a law struct. It's organized mainly by classifier.
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct LawBuilder {
|
||||
/// Name of the law
|
||||
name: String, //ABGB, UrhG
|
||||
@ -29,19 +32,35 @@ pub(crate) struct LawBuilder {
|
||||
next_para_header: Option<String>,
|
||||
}
|
||||
|
||||
/// Returns `true` when `instance_name` contains `classifier_name` as a substring.
fn contains(classifier_name: &str, instance_name: &str) -> bool {
    instance_name.contains(classifier_name)
}

/// Returns `true` when `instance_name`, after trimming surrounding whitespace,
/// begins with an ASCII digit.
///
/// The classifier name is ignored; the parameter is kept so that the signature
/// is the same as that of `contains`.
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
    // Only the first byte matters: ASCII digits are single-byte in UTF-8, so a
    // multi-byte leading character can never be mistaken for a digit.
    matches!(
        instance_name.trim().as_bytes().first(),
        Some(byte) if byte.is_ascii_digit()
    )
}
|
||||
|
||||
impl LawBuilder {
|
||||
#[cfg(test)]
|
||||
pub(crate) fn test(name: &str) -> Self {
|
||||
let mut classifiers = Vec::new();
|
||||
if name == "UrhG" {
|
||||
let hauptstueck = Classifier::new("Hauptstück");
|
||||
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||
classifiers.push(hauptstueck.clone());
|
||||
|
||||
let mut abschnitt = Classifier::new("Abschnitt");
|
||||
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||
abschnitt.set_parent(hauptstueck);
|
||||
classifiers.push(abschnitt);
|
||||
|
||||
let mut numbered_header =
|
||||
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||
numbered_header.set_parent(abschnitt);
|
||||
classifiers.push(numbered_header);
|
||||
} else if name == "no-headers" {
|
||||
let mut class = Classifier::new("");
|
||||
let mut class = Classifier::new("", Arc::new(&contains));
|
||||
class.add_instance(ClassifierInstance::new(""));
|
||||
classifiers.push(class);
|
||||
}
|
||||
@ -54,19 +73,24 @@ impl LawBuilder {
|
||||
}
|
||||
|
||||
/// Creates a new law builder. Adds classifier for known law texts.
|
||||
pub(crate) fn new(name: &str) {
|
||||
pub(crate) fn new(name: &str) -> LawBuilder {
|
||||
//TODO: return Law (not LawBuilder)
|
||||
let mut classifiers = Vec::new();
|
||||
|
||||
let mut law_id = None;
|
||||
if name == "UrhG" {
|
||||
law_id = Some(10001848);
|
||||
let hauptstueck = Classifier::new("Hauptstück");
|
||||
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||
classifiers.push(hauptstueck.clone());
|
||||
|
||||
let mut abschnitt = Classifier::new("Abschnitt");
|
||||
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||
abschnitt.set_parent(hauptstueck);
|
||||
classifiers.push(abschnitt);
|
||||
classifiers.push(abschnitt.clone());
|
||||
|
||||
let mut numbered_header =
|
||||
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||
numbered_header.set_parent(abschnitt);
|
||||
classifiers.push(numbered_header);
|
||||
}
|
||||
|
||||
let mut builder = Self {
|
||||
@ -77,11 +101,13 @@ impl LawBuilder {
|
||||
};
|
||||
|
||||
overview::parse(law_id.unwrap(), &mut builder).unwrap();
|
||||
|
||||
builder
|
||||
}
|
||||
|
||||
/// Sets a new header.
|
||||
pub(crate) fn new_header(&mut self, name: &str) {
|
||||
println!("new_header={name}");
|
||||
debug!("new_header={name}");
|
||||
let classifier_index = self
|
||||
.classifiers
|
||||
.iter()
|
||||
@ -98,7 +124,7 @@ impl LawBuilder {
|
||||
|
||||
/// Sets a new description for the last classifier.
|
||||
pub(crate) fn new_desc(&mut self, desc: &str) {
|
||||
println!("new_desc={desc}");
|
||||
debug!("new_desc={desc}");
|
||||
if let Some(index) = self.last_header_index {
|
||||
self.classifiers[index].set_desc(desc);
|
||||
} else {
|
||||
@ -108,10 +134,15 @@ impl LawBuilder {
|
||||
|
||||
/// Adds a new paragraph.
|
||||
pub(crate) fn new_par(&mut self, par: String, content: Content) {
|
||||
println!("new_par=par:{par};content:{content:#?}");
|
||||
if let Some(class) = self.classifiers.last_mut() {
|
||||
let section = Section { symb: par, content };
|
||||
class.add_section(section);
|
||||
debug!("new_par=par:{par};content:{content:#?}");
|
||||
if let Some(index) = self.last_header_index {
|
||||
let section = Section {
|
||||
symb: par,
|
||||
content,
|
||||
par_header: self.next_para_header.clone(),
|
||||
};
|
||||
self.next_para_header = None;
|
||||
self.classifiers[index].add_section(section);
|
||||
} else {
|
||||
panic!("Expected at least one classifier");
|
||||
}
|
||||
@ -119,14 +150,23 @@ impl LawBuilder {
|
||||
|
||||
/// Next paragraph has a header, store its name.
|
||||
pub(crate) fn new_next_para_header(&mut self, header: &str) {
|
||||
println!("new_next_para_header={header}");
|
||||
debug!("new_next_para_header={header}");
|
||||
self.next_para_header = Some(header.into());
|
||||
}
|
||||
|
||||
pub(crate) fn toc(&self) {
|
||||
for class in &self.classifiers {
|
||||
for inst in &class.instances {
|
||||
println!("{}", inst.name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
#[derive(Debug, Clone)]
|
||||
pub(crate) struct Section {
|
||||
symb: String, // §"1", §"2", ...
|
||||
par_header: Option<String>,
|
||||
content: Content,
|
||||
//header: Option<Header>,
|
||||
}
|
||||
@ -152,7 +192,7 @@ impl Header {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct ClassifierInstance {
|
||||
name: String,
|
||||
desc: Option<String>,
|
||||
@ -177,19 +217,31 @@ impl ClassifierInstance {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Classifier {
|
||||
name: String, // Hauptstück, Theil, Abschnitt, ol
|
||||
parent: Option<Box<Classifier>>,
|
||||
instances: Vec<ClassifierInstance>,
|
||||
used_for_fn: Arc<dyn Fn(&str, &str) -> bool>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Classifier {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("Classifier")
|
||||
.field("name", &self.name)
|
||||
.field("parent", &self.parent)
|
||||
.field("instances", &self.instances)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl Classifier {
|
||||
fn new(name: &str) -> Self {
|
||||
fn new(name: &str, used_for_fn: Arc<dyn Fn(&str, &str) -> bool>) -> Self {
|
||||
Self {
|
||||
name: name.into(),
|
||||
parent: None,
|
||||
instances: Vec::new(),
|
||||
used_for_fn,
|
||||
}
|
||||
}
|
||||
|
||||
@ -210,7 +262,7 @@ impl Classifier {
|
||||
}
|
||||
|
||||
fn used_for(&self, name: &str) -> bool {
|
||||
name.contains(&self.name)
|
||||
(self.used_for_fn)(&self.name, name)
|
||||
}
|
||||
|
||||
fn add_section(&mut self, section: Section) {
|
||||
@ -218,13 +270,11 @@ impl Classifier {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum Content {
|
||||
Text(String), //This is my direct law text
|
||||
TextWithList(String, Vec<Box<Content>>),
|
||||
Text(String), //This is my direct law text
|
||||
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
|
||||
List(Vec<Box<Content>>),
|
||||
TextWithListAndText(String, Vec<Box<Content>>, String), //1. my first item
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -41,5 +41,8 @@ impl From<roxmltree::Error> for Error {
|
||||
}
|
||||
|
||||
fn main() {
|
||||
LawBuilder::new("UrhG");
|
||||
env_logger::init();
|
||||
let builder = LawBuilder::new("UrhG");
|
||||
|
||||
println!("{:#?}", builder.toc());
|
||||
}
|
||||
|
@ -187,6 +187,8 @@ pub(crate) struct ContentUrlItem {
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use log::debug;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@ -204,7 +206,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
@ -219,7 +221,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
@ -234,7 +236,7 @@ mod tests {
|
||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||
if wrapper.is_err() {
|
||||
let dbg = wrapper.as_ref().err().unwrap();
|
||||
println!("{dbg:#?}");
|
||||
debug!("{dbg:#?}");
|
||||
}
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
|
@ -1,5 +1,7 @@
|
||||
mod parser;
|
||||
|
||||
use log::{debug, info};
|
||||
|
||||
use crate::{law::LawBuilder, par::parser::Risdok, Error};
|
||||
|
||||
fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
}
|
||||
|
||||
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
|
||||
println!("{url}");
|
||||
info!("Parsing {url}");
|
||||
let xml = fetch_page(url)?;
|
||||
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
||||
//
|
||||
//
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">"#,
|
||||
// in § 17 (2)
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
|
||||
r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
|
||||
);
|
||||
|
||||
let xml = xml.replace(
|
||||
// e.g. in § 17 (2) TODO: check that this only happens here
|
||||
r#"</schlussteil></liste>"#,
|
||||
"</absatz>",
|
||||
);
|
||||
println!("{xml}");
|
||||
r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||
); // 1. Verwertungsrechte. before § 14
|
||||
|
||||
let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
|
||||
debug!("{xml}");
|
||||
|
||||
let risdok = Risdok::from_str(&xml, builder)?;
|
||||
|
||||
println!("{builder:#?}");
|
||||
//println!("{risdok}");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
use log::info;
|
||||
use roxmltree::Node;
|
||||
|
||||
use crate::{
|
||||
@ -86,7 +87,7 @@ impl Abschnitt {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
if Absatz::test(child) {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
c.next();
|
||||
continue;
|
||||
}
|
||||
@ -140,25 +141,30 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithList(
|
||||
absatz.content.clone(),
|
||||
liste.get_list(),
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else if Table::test(child) {
|
||||
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
|
||||
let table = Table::parse(c.next().unwrap());
|
||||
if let Some(child) = c.peek() {
|
||||
if Absatz::test_with_typ(child, "erltext") {
|
||||
let after_absatz = Absatz::parse(c.next().unwrap());
|
||||
absatze.push(Content::TextWithListAndText(
|
||||
absatz.content,
|
||||
table.get_list(),
|
||||
after_absatz.content,
|
||||
))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
Content::Text(after_absatz.content).into(),
|
||||
]))
|
||||
} else {
|
||||
absatze.push(Content::TextWithList(absatz.content, table.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(absatz.content).into(),
|
||||
Content::List(table.get_list()).into(),
|
||||
]));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
}
|
||||
} else {
|
||||
absatze.push(Content::Text(absatz.content.clone()));
|
||||
@ -177,8 +183,10 @@ impl Abschnitt {
|
||||
if let Some(child) = c.peek() {
|
||||
if Liste::test(&child) {
|
||||
let liste = Liste::parse(c.next().unwrap());
|
||||
//TODO do something with list
|
||||
absatze.push(Content::TextWithList(abs.content, liste.get_list()))
|
||||
absatze.push(Content::List(vec![
|
||||
Content::Text(abs.content).into(),
|
||||
liste.get_content().into(),
|
||||
]));
|
||||
} else {
|
||||
absatze.push(Content::Text(abs.content));
|
||||
}
|
||||
@ -225,7 +233,7 @@ impl Abschnitt {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Symbol {
|
||||
stellen: String,
|
||||
content: String,
|
||||
@ -242,7 +250,7 @@ impl Symbol {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Listelem {
|
||||
symbol: Symbol,
|
||||
text: String,
|
||||
@ -267,36 +275,39 @@ impl Listelem {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Clone)]
|
||||
pub(crate) struct Ziffernliste {
|
||||
ebene: String,
|
||||
listelems: Vec<Listelem>,
|
||||
}
|
||||
impl Ziffernliste {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
["ziffernliste", "aufzaehlung", "literaliste"].contains(&n.tag_name().name())
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(n.tag_name().name() == "ziffernliste");
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let ebene = n.attribute("ebene").unwrap().into();
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
let mut listelems = Vec::new();
|
||||
loop {
|
||||
match c.peek() {
|
||||
Some(child) => {
|
||||
if Listelem::test(child) {
|
||||
listelems.push(Listelem::parse(c.next().unwrap()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
None => break,
|
||||
}
|
||||
break;
|
||||
|
||||
for child in n.children() {
|
||||
listelems.push(Listelem::parse(child));
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ebene, listelems }
|
||||
}
|
||||
|
||||
pub(crate) fn get_content(&self) -> Content {
|
||||
let mut elems = Vec::new();
|
||||
|
||||
for elem in &self.listelems {
|
||||
elems.push(Content::Text(format!("{} {}", elem.symbol.content, elem.text)).into());
|
||||
}
|
||||
|
||||
Content::List(elems)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@ -372,40 +383,69 @@ impl Table {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub(crate) struct Liste {
|
||||
ziffernliste: Ziffernliste,
|
||||
pub(crate) struct Schlussteil {
|
||||
content: String,
|
||||
}
|
||||
impl Liste {
|
||||
impl Schlussteil {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
(n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
|
||||
&& n.children().count() == 1
|
||||
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut c = n.children();
|
||||
let content = n.children().next().unwrap().text().unwrap().into(); //not sure
|
||||
|
||||
let ziffernliste = Ziffernliste::parse(c.next().unwrap());
|
||||
Self { content }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Liste {
|
||||
content: Vec<Content>,
|
||||
}
|
||||
impl Liste {
|
||||
pub(crate) fn test(n: &Node) -> bool {
|
||||
n.tag_name().name() == "liste"
|
||||
}
|
||||
|
||||
pub(crate) fn parse(n: Node) -> Self {
|
||||
assert!(Self::test(&n));
|
||||
|
||||
let mut content = Vec::new();
|
||||
|
||||
let mut c = n.children().peekable();
|
||||
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
|
||||
loop {
|
||||
if let Some(child) = c.peek() {
|
||||
if Ziffernliste::test(child) {
|
||||
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||
} else if Schlussteil::test(child) {
|
||||
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert_eq!(c.next(), None);
|
||||
|
||||
Self { ziffernliste }
|
||||
Self { content }
|
||||
}
|
||||
|
||||
pub(crate) fn get_list(&self) -> Vec<Box<Content>> {
|
||||
let mut ret = Vec::new();
|
||||
|
||||
for a in &self.ziffernliste.listelems {
|
||||
ret.push(Box::new(Content::Text(format!(
|
||||
"{} {}",
|
||||
a.symbol.content,
|
||||
a.text.clone()
|
||||
))));
|
||||
}
|
||||
|
||||
ret
|
||||
pub(crate) fn get_content(&self) -> Content {
|
||||
Content::List(
|
||||
self.content
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(|c| Box::new(c))
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
@ -546,6 +586,8 @@ impl Layoutdaten {
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
|
||||
use log::error;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
@ -557,7 +599,7 @@ mod tests {
|
||||
let mut builder = LawBuilder::test("no-headers");
|
||||
let risdok = Risdok::from_str(&xml, &mut builder);
|
||||
if risdok.is_err() {
|
||||
println!("{:#?}", risdok.as_ref().err());
|
||||
error!("{:#?}", risdok.as_ref().err());
|
||||
}
|
||||
assert!(risdok.is_ok());
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user