parse first page of urhg
This commit is contained in:
parent
eb6f3e8aba
commit
55517758c2
157
Cargo.lock
generated
157
Cargo.lock
generated
@ -8,12 +8,27 @@ version = "1.0.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "aho-corasick"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "base64"
|
name = "base64"
|
||||||
version = "0.21.5"
|
version = "0.21.5"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
|
checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "bitflags"
|
||||||
|
version = "2.4.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cc"
|
name = "cc"
|
||||||
version = "1.0.83"
|
version = "1.0.83"
|
||||||
@ -53,6 +68,29 @@ version = "0.1.13"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
|
checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "env_logger"
|
||||||
|
version = "0.10.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "85cdab6a89accf66733ad5a1693a4dcced6aeff64602b634530dd73c1f3ee9f0"
|
||||||
|
dependencies = [
|
||||||
|
"humantime",
|
||||||
|
"is-terminal",
|
||||||
|
"log",
|
||||||
|
"regex",
|
||||||
|
"termcolor",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "errno"
|
||||||
|
version = "0.3.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860"
|
||||||
|
dependencies = [
|
||||||
|
"libc",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "flate2"
|
name = "flate2"
|
||||||
version = "1.0.28"
|
version = "1.0.28"
|
||||||
@ -83,6 +121,18 @@ dependencies = [
|
|||||||
"wasi",
|
"wasi",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "hermit-abi"
|
||||||
|
version = "0.3.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "humantime"
|
||||||
|
version = "2.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "idna"
|
name = "idna"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
@ -93,6 +143,17 @@ dependencies = [
|
|||||||
"unicode-normalization",
|
"unicode-normalization",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is-terminal"
|
||||||
|
version = "0.4.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
|
||||||
|
dependencies = [
|
||||||
|
"hermit-abi",
|
||||||
|
"rustix",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "1.0.9"
|
version = "1.0.9"
|
||||||
@ -105,12 +166,24 @@ version = "0.2.149"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
|
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "linux-raw-sys"
|
||||||
|
version = "0.4.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "log"
|
name = "log"
|
||||||
version = "0.4.20"
|
version = "0.4.20"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "memchr"
|
||||||
|
version = "2.6.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "miniz_oxide"
|
name = "miniz_oxide"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@ -166,6 +239,35 @@ dependencies = [
|
|||||||
"proc-macro2",
|
"proc-macro2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex"
|
||||||
|
version = "1.10.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-automata",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.4.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f"
|
||||||
|
dependencies = [
|
||||||
|
"aho-corasick",
|
||||||
|
"memchr",
|
||||||
|
"regex-syntax",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ring"
|
name = "ring"
|
||||||
version = "0.17.5"
|
version = "0.17.5"
|
||||||
@ -184,6 +286,8 @@ dependencies = [
|
|||||||
name = "risp"
|
name = "risp"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"env_logger",
|
||||||
|
"log",
|
||||||
"pretty_assertions",
|
"pretty_assertions",
|
||||||
"roxmltree",
|
"roxmltree",
|
||||||
"serde",
|
"serde",
|
||||||
@ -201,6 +305,19 @@ dependencies = [
|
|||||||
"xmlparser",
|
"xmlparser",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rustix"
|
||||||
|
version = "0.38.21"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3"
|
||||||
|
dependencies = [
|
||||||
|
"bitflags",
|
||||||
|
"errno",
|
||||||
|
"libc",
|
||||||
|
"linux-raw-sys",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rustls"
|
name = "rustls"
|
||||||
version = "0.21.8"
|
version = "0.21.8"
|
||||||
@ -287,6 +404,15 @@ dependencies = [
|
|||||||
"unicode-ident",
|
"unicode-ident",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "termcolor"
|
||||||
|
version = "1.3.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6093bad37da69aab9d123a8091e4be0aa4a03e4d601ec641c327398315f62b64"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-util",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time"
|
name = "time"
|
||||||
version = "0.3.30"
|
version = "0.3.30"
|
||||||
@ -397,6 +523,37 @@ version = "0.25.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
|
checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi"
|
||||||
|
version = "0.3.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
|
||||||
|
dependencies = [
|
||||||
|
"winapi-i686-pc-windows-gnu",
|
||||||
|
"winapi-x86_64-pc-windows-gnu",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-i686-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-util"
|
||||||
|
version = "0.1.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "winapi-x86_64-pc-windows-gnu"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "windows-sys"
|
name = "windows-sys"
|
||||||
version = "0.48.0"
|
version = "0.48.0"
|
||||||
|
@ -11,6 +11,8 @@ time = { version = "0.3", features = [ "formatting" ] }
|
|||||||
serde = { version = "1.0", features = [ "derive" ] }
|
serde = { version = "1.0", features = [ "derive" ] }
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
roxmltree = "0.18"
|
roxmltree = "0.18"
|
||||||
|
env_logger = "0.10"
|
||||||
|
log = "0.4"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
pretty_assertions = "1.4"
|
pretty_assertions = "1.4"
|
||||||
|
98
src/law.rs
98
src/law.rs
@ -1,3 +1,6 @@
|
|||||||
|
use log::debug;
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
use crate::overview;
|
use crate::overview;
|
||||||
|
|
||||||
// pub(crate) struct Law {
|
// pub(crate) struct Law {
|
||||||
@ -15,7 +18,7 @@ use crate::overview;
|
|||||||
// }
|
// }
|
||||||
|
|
||||||
/// Is used to generate a law struct. It's organized mainly by classifier.
|
/// Is used to generate a law struct. It's organized mainly by classifier.
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug)]
|
||||||
pub(crate) struct LawBuilder {
|
pub(crate) struct LawBuilder {
|
||||||
/// Name of the law
|
/// Name of the law
|
||||||
name: String, //ABGB, UrhG
|
name: String, //ABGB, UrhG
|
||||||
@ -29,19 +32,35 @@ pub(crate) struct LawBuilder {
|
|||||||
next_para_header: Option<String>,
|
next_para_header: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn contains(classifier_name: &str, instance_name: &str) -> bool {
|
||||||
|
instance_name.contains(classifier_name)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
|
||||||
|
match instance_name.trim().as_bytes().get(0) {
|
||||||
|
Some(c) if c.is_ascii_digit() => true,
|
||||||
|
_ => false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl LawBuilder {
|
impl LawBuilder {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub(crate) fn test(name: &str) -> Self {
|
pub(crate) fn test(name: &str) -> Self {
|
||||||
let mut classifiers = Vec::new();
|
let mut classifiers = Vec::new();
|
||||||
if name == "UrhG" {
|
if name == "UrhG" {
|
||||||
let hauptstueck = Classifier::new("Hauptstück");
|
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||||
classifiers.push(hauptstueck.clone());
|
classifiers.push(hauptstueck.clone());
|
||||||
|
|
||||||
let mut abschnitt = Classifier::new("Abschnitt");
|
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||||
abschnitt.set_parent(hauptstueck);
|
abschnitt.set_parent(hauptstueck);
|
||||||
classifiers.push(abschnitt);
|
classifiers.push(abschnitt);
|
||||||
|
|
||||||
|
let mut numbered_header =
|
||||||
|
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||||
|
numbered_header.set_parent(abschnitt);
|
||||||
|
classifiers.push(numbered_header);
|
||||||
} else if name == "no-headers" {
|
} else if name == "no-headers" {
|
||||||
let mut class = Classifier::new("");
|
let mut class = Classifier::new("", Arc::new(&contains));
|
||||||
class.add_instance(ClassifierInstance::new(""));
|
class.add_instance(ClassifierInstance::new(""));
|
||||||
classifiers.push(class);
|
classifiers.push(class);
|
||||||
}
|
}
|
||||||
@ -54,19 +73,24 @@ impl LawBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a new law builder. Adds classifier for known law texts.
|
/// Creates a new law builder. Adds classifier for known law texts.
|
||||||
pub(crate) fn new(name: &str) {
|
pub(crate) fn new(name: &str) -> LawBuilder {
|
||||||
//TODO: return Law (not LawBuilder)
|
//TODO: return Law (not LawBuilder)
|
||||||
let mut classifiers = Vec::new();
|
let mut classifiers = Vec::new();
|
||||||
|
|
||||||
let mut law_id = None;
|
let mut law_id = None;
|
||||||
if name == "UrhG" {
|
if name == "UrhG" {
|
||||||
law_id = Some(10001848);
|
law_id = Some(10001848);
|
||||||
let hauptstueck = Classifier::new("Hauptstück");
|
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||||
classifiers.push(hauptstueck.clone());
|
classifiers.push(hauptstueck.clone());
|
||||||
|
|
||||||
let mut abschnitt = Classifier::new("Abschnitt");
|
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||||
abschnitt.set_parent(hauptstueck);
|
abschnitt.set_parent(hauptstueck);
|
||||||
classifiers.push(abschnitt);
|
classifiers.push(abschnitt.clone());
|
||||||
|
|
||||||
|
let mut numbered_header =
|
||||||
|
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||||
|
numbered_header.set_parent(abschnitt);
|
||||||
|
classifiers.push(numbered_header);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut builder = Self {
|
let mut builder = Self {
|
||||||
@ -77,11 +101,13 @@ impl LawBuilder {
|
|||||||
};
|
};
|
||||||
|
|
||||||
overview::parse(law_id.unwrap(), &mut builder).unwrap();
|
overview::parse(law_id.unwrap(), &mut builder).unwrap();
|
||||||
|
|
||||||
|
builder
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Sets a new header.
|
/// Sets a new header.
|
||||||
pub(crate) fn new_header(&mut self, name: &str) {
|
pub(crate) fn new_header(&mut self, name: &str) {
|
||||||
println!("new_header={name}");
|
debug!("new_header={name}");
|
||||||
let classifier_index = self
|
let classifier_index = self
|
||||||
.classifiers
|
.classifiers
|
||||||
.iter()
|
.iter()
|
||||||
@ -98,7 +124,7 @@ impl LawBuilder {
|
|||||||
|
|
||||||
/// Sets a new description for the last classifier.
|
/// Sets a new description for the last classifier.
|
||||||
pub(crate) fn new_desc(&mut self, desc: &str) {
|
pub(crate) fn new_desc(&mut self, desc: &str) {
|
||||||
println!("new_desc={desc}");
|
debug!("new_desc={desc}");
|
||||||
if let Some(index) = self.last_header_index {
|
if let Some(index) = self.last_header_index {
|
||||||
self.classifiers[index].set_desc(desc);
|
self.classifiers[index].set_desc(desc);
|
||||||
} else {
|
} else {
|
||||||
@ -108,10 +134,15 @@ impl LawBuilder {
|
|||||||
|
|
||||||
/// Adds a new paragraph.
|
/// Adds a new paragraph.
|
||||||
pub(crate) fn new_par(&mut self, par: String, content: Content) {
|
pub(crate) fn new_par(&mut self, par: String, content: Content) {
|
||||||
println!("new_par=par:{par};content:{content:#?}");
|
debug!("new_par=par:{par};content:{content:#?}");
|
||||||
if let Some(class) = self.classifiers.last_mut() {
|
if let Some(index) = self.last_header_index {
|
||||||
let section = Section { symb: par, content };
|
let section = Section {
|
||||||
class.add_section(section);
|
symb: par,
|
||||||
|
content,
|
||||||
|
par_header: self.next_para_header.clone(),
|
||||||
|
};
|
||||||
|
self.next_para_header = None;
|
||||||
|
self.classifiers[index].add_section(section);
|
||||||
} else {
|
} else {
|
||||||
panic!("Expected at least one classifier");
|
panic!("Expected at least one classifier");
|
||||||
}
|
}
|
||||||
@ -119,14 +150,23 @@ impl LawBuilder {
|
|||||||
|
|
||||||
/// Next paragraph has a header, store its name.
|
/// Next paragraph has a header, store its name.
|
||||||
pub(crate) fn new_next_para_header(&mut self, header: &str) {
|
pub(crate) fn new_next_para_header(&mut self, header: &str) {
|
||||||
println!("new_next_para_header={header}");
|
debug!("new_next_para_header={header}");
|
||||||
self.next_para_header = Some(header.into());
|
self.next_para_header = Some(header.into());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn toc(&self) {
|
||||||
|
for class in &self.classifiers {
|
||||||
|
for inst in &class.instances {
|
||||||
|
println!("{}", inst.name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq, Clone)]
|
#[derive(Debug, Clone)]
|
||||||
pub(crate) struct Section {
|
pub(crate) struct Section {
|
||||||
symb: String, // §"1", §"2", ...
|
symb: String, // §"1", §"2", ...
|
||||||
|
par_header: Option<String>,
|
||||||
content: Content,
|
content: Content,
|
||||||
//header: Option<Header>,
|
//header: Option<Header>,
|
||||||
}
|
}
|
||||||
@ -152,7 +192,7 @@ impl Header {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone, Debug)]
|
||||||
pub(crate) struct ClassifierInstance {
|
pub(crate) struct ClassifierInstance {
|
||||||
name: String,
|
name: String,
|
||||||
desc: Option<String>,
|
desc: Option<String>,
|
||||||
@ -177,19 +217,31 @@ impl ClassifierInstance {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone)]
|
||||||
pub(crate) struct Classifier {
|
pub(crate) struct Classifier {
|
||||||
name: String, // Hauptstück, Theil, Abschnitt, ol
|
name: String, // Hauptstück, Theil, Abschnitt, ol
|
||||||
parent: Option<Box<Classifier>>,
|
parent: Option<Box<Classifier>>,
|
||||||
instances: Vec<ClassifierInstance>,
|
instances: Vec<ClassifierInstance>,
|
||||||
|
used_for_fn: Arc<dyn Fn(&str, &str) -> bool>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl std::fmt::Debug for Classifier {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
f.debug_struct("Classifier")
|
||||||
|
.field("name", &self.name)
|
||||||
|
.field("parent", &self.parent)
|
||||||
|
.field("instances", &self.instances)
|
||||||
|
.finish()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Classifier {
|
impl Classifier {
|
||||||
fn new(name: &str) -> Self {
|
fn new(name: &str, used_for_fn: Arc<dyn Fn(&str, &str) -> bool>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
name: name.into(),
|
name: name.into(),
|
||||||
parent: None,
|
parent: None,
|
||||||
instances: Vec::new(),
|
instances: Vec::new(),
|
||||||
|
used_for_fn,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -210,7 +262,7 @@ impl Classifier {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn used_for(&self, name: &str) -> bool {
|
fn used_for(&self, name: &str) -> bool {
|
||||||
name.contains(&self.name)
|
(self.used_for_fn)(&self.name, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn add_section(&mut self, section: Section) {
|
fn add_section(&mut self, section: Section) {
|
||||||
@ -218,13 +270,11 @@ impl Classifier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, PartialEq)]
|
#[derive(Clone, Debug)]
|
||||||
pub(crate) enum Content {
|
pub(crate) enum Content {
|
||||||
Text(String), //This is my direct law text
|
Text(String), //This is my direct law text
|
||||||
TextWithList(String, Vec<Box<Content>>),
|
|
||||||
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
|
Item(Vec<Box<Content>>), //(1) This is general law. (2) This is more specific law
|
||||||
List(Vec<Box<Content>>),
|
List(Vec<Box<Content>>),
|
||||||
TextWithListAndText(String, Vec<Box<Content>>, String), //1. my first item
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -41,5 +41,8 @@ impl From<roxmltree::Error> for Error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
LawBuilder::new("UrhG");
|
env_logger::init();
|
||||||
|
let builder = LawBuilder::new("UrhG");
|
||||||
|
|
||||||
|
println!("{:#?}", builder.toc());
|
||||||
}
|
}
|
||||||
|
@ -187,6 +187,8 @@ pub(crate) struct ContentUrlItem {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::{fs::File, io::Read};
|
use std::{fs::File, io::Read};
|
||||||
|
|
||||||
|
use log::debug;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
@ -204,7 +206,7 @@ mod tests {
|
|||||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||||
if wrapper.is_err() {
|
if wrapper.is_err() {
|
||||||
let dbg = wrapper.as_ref().err().unwrap();
|
let dbg = wrapper.as_ref().err().unwrap();
|
||||||
println!("{dbg:#?}");
|
debug!("{dbg:#?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
assert!(wrapper.is_ok());
|
assert!(wrapper.is_ok());
|
||||||
@ -219,7 +221,7 @@ mod tests {
|
|||||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||||
if wrapper.is_err() {
|
if wrapper.is_err() {
|
||||||
let dbg = wrapper.as_ref().err().unwrap();
|
let dbg = wrapper.as_ref().err().unwrap();
|
||||||
println!("{dbg:#?}");
|
debug!("{dbg:#?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
assert!(wrapper.is_ok());
|
assert!(wrapper.is_ok());
|
||||||
@ -234,7 +236,7 @@ mod tests {
|
|||||||
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
let wrapper: serde_json::Result<Wrapper> = serde_json::from_str(&json);
|
||||||
if wrapper.is_err() {
|
if wrapper.is_err() {
|
||||||
let dbg = wrapper.as_ref().err().unwrap();
|
let dbg = wrapper.as_ref().err().unwrap();
|
||||||
println!("{dbg:#?}");
|
debug!("{dbg:#?}");
|
||||||
}
|
}
|
||||||
|
|
||||||
assert!(wrapper.is_ok());
|
assert!(wrapper.is_ok());
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
mod parser;
|
mod parser;
|
||||||
|
|
||||||
|
use log::{debug, info};
|
||||||
|
|
||||||
use crate::{law::LawBuilder, par::parser::Risdok, Error};
|
use crate::{law::LawBuilder, par::parser::Risdok, Error};
|
||||||
|
|
||||||
fn fetch_page(url: &str) -> Result<String, Error> {
|
fn fetch_page(url: &str) -> Result<String, Error> {
|
||||||
@ -7,27 +9,26 @@ fn fetch_page(url: &str) -> Result<String, Error> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
|
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<(), Error> {
|
||||||
println!("{url}");
|
info!("Parsing {url}");
|
||||||
let xml = fetch_page(url)?;
|
let xml = fetch_page(url)?;
|
||||||
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
||||||
//
|
//
|
||||||
//
|
//
|
||||||
let xml = xml.replace(
|
let xml = xml.replace(
|
||||||
// e.g. in § 17 (2) TODO: check that this onyl happens here
|
// in § 17 (2)
|
||||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">"#,
|
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
|
||||||
r#"<absatz typ="abs" ct="text" halign="j">"#,
|
r#"<absatz typ="abs" ct="text" halign="j">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</absatz>"#,
|
||||||
);
|
);
|
||||||
|
|
||||||
let xml = xml.replace(
|
let xml = xml.replace(
|
||||||
// e.g. in § 17 (2) TODO: check that this onyl happens here
|
r#"<ueberschrift typ="para" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||||
r#"</schlussteil></liste>"#,
|
r#"<ueberschrift typ="g1" ct="text" halign="c">1. Verwertungsrechte.</ueberschrift>"#,
|
||||||
"</absatz>",
|
); // 1. Verwertungsrechte. before § 14
|
||||||
);
|
|
||||||
println!("{xml}");
|
let xml = xml.replace("<i>.</i>", "."); // e.g. § 37d Abs. 4 (last point)...
|
||||||
|
debug!("{xml}");
|
||||||
|
|
||||||
let risdok = Risdok::from_str(&xml, builder)?;
|
let risdok = Risdok::from_str(&xml, builder)?;
|
||||||
|
|
||||||
println!("{builder:#?}");
|
|
||||||
//println!("{risdok}");
|
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
use log::info;
|
||||||
use roxmltree::Node;
|
use roxmltree::Node;
|
||||||
|
|
||||||
use crate::{
|
use crate::{
|
||||||
@ -86,7 +87,7 @@ impl Abschnitt {
|
|||||||
c.next();
|
c.next();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if Absatz::test(child) {
|
if Absatz::test_with_typ(child, "erltext") {
|
||||||
c.next();
|
c.next();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -140,25 +141,30 @@ impl Abschnitt {
|
|||||||
if let Some(child) = c.peek() {
|
if let Some(child) = c.peek() {
|
||||||
if Liste::test(child) {
|
if Liste::test(child) {
|
||||||
let liste = Liste::parse(c.next().unwrap());
|
let liste = Liste::parse(c.next().unwrap());
|
||||||
absatze.push(Content::TextWithList(
|
absatze.push(Content::List(vec![
|
||||||
absatz.content.clone(),
|
Content::Text(absatz.content).into(),
|
||||||
liste.get_list(),
|
liste.get_content().into(),
|
||||||
))
|
]));
|
||||||
} else if Table::test(child) {
|
} else if Table::test(child) {
|
||||||
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
|
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
|
||||||
let table = Table::parse(c.next().unwrap());
|
let table = Table::parse(c.next().unwrap());
|
||||||
if let Some(child) = c.peek() {
|
if let Some(child) = c.peek() {
|
||||||
if Absatz::test_with_typ(child, "erltext") {
|
if Absatz::test_with_typ(child, "erltext") {
|
||||||
let after_absatz = Absatz::parse(c.next().unwrap());
|
let after_absatz = Absatz::parse(c.next().unwrap());
|
||||||
absatze.push(Content::TextWithListAndText(
|
absatze.push(Content::List(vec![
|
||||||
absatz.content,
|
Content::Text(absatz.content).into(),
|
||||||
table.get_list(),
|
Content::List(table.get_list()).into(),
|
||||||
after_absatz.content,
|
Content::Text(after_absatz.content).into(),
|
||||||
))
|
]))
|
||||||
} else {
|
} else {
|
||||||
absatze.push(Content::TextWithList(absatz.content, table.get_list()))
|
absatze.push(Content::List(vec![
|
||||||
|
Content::Text(absatz.content).into(),
|
||||||
|
Content::List(table.get_list()).into(),
|
||||||
|
]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
absatze.push(Content::Text(absatz.content.clone()));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
absatze.push(Content::Text(absatz.content.clone()));
|
absatze.push(Content::Text(absatz.content.clone()));
|
||||||
@ -177,8 +183,10 @@ impl Abschnitt {
|
|||||||
if let Some(child) = c.peek() {
|
if let Some(child) = c.peek() {
|
||||||
if Liste::test(&child) {
|
if Liste::test(&child) {
|
||||||
let liste = Liste::parse(c.next().unwrap());
|
let liste = Liste::parse(c.next().unwrap());
|
||||||
//TODO do something with list
|
absatze.push(Content::List(vec![
|
||||||
absatze.push(Content::TextWithList(abs.content, liste.get_list()))
|
Content::Text(abs.content).into(),
|
||||||
|
liste.get_content().into(),
|
||||||
|
]));
|
||||||
} else {
|
} else {
|
||||||
absatze.push(Content::Text(abs.content));
|
absatze.push(Content::Text(abs.content));
|
||||||
}
|
}
|
||||||
@ -225,7 +233,7 @@ impl Abschnitt {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq, Clone)]
|
||||||
pub(crate) struct Symbol {
|
pub(crate) struct Symbol {
|
||||||
stellen: String,
|
stellen: String,
|
||||||
content: String,
|
content: String,
|
||||||
@ -242,7 +250,7 @@ impl Symbol {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq, Clone)]
|
||||||
pub(crate) struct Listelem {
|
pub(crate) struct Listelem {
|
||||||
symbol: Symbol,
|
symbol: Symbol,
|
||||||
text: String,
|
text: String,
|
||||||
@ -267,36 +275,39 @@ impl Listelem {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq, Clone)]
|
||||||
pub(crate) struct Ziffernliste {
|
pub(crate) struct Ziffernliste {
|
||||||
ebene: String,
|
ebene: String,
|
||||||
listelems: Vec<Listelem>,
|
listelems: Vec<Listelem>,
|
||||||
}
|
}
|
||||||
impl Ziffernliste {
|
impl Ziffernliste {
|
||||||
|
pub(crate) fn test(n: &Node) -> bool {
|
||||||
|
["ziffernliste", "aufzaehlung", "literaliste"].contains(&n.tag_name().name())
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) fn parse(n: Node) -> Self {
|
pub(crate) fn parse(n: Node) -> Self {
|
||||||
assert!(n.tag_name().name() == "ziffernliste");
|
assert!(Self::test(&n));
|
||||||
|
|
||||||
let ebene = n.attribute("ebene").unwrap().into();
|
let ebene = n.attribute("ebene").unwrap().into();
|
||||||
let mut c = n.children().peekable();
|
|
||||||
|
|
||||||
let mut listelems = Vec::new();
|
let mut listelems = Vec::new();
|
||||||
loop {
|
|
||||||
match c.peek() {
|
for child in n.children() {
|
||||||
Some(child) => {
|
listelems.push(Listelem::parse(child));
|
||||||
if Listelem::test(child) {
|
|
||||||
listelems.push(Listelem::parse(c.next().unwrap()));
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
None => break,
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
assert_eq!(c.next(), None);
|
|
||||||
|
|
||||||
Self { ebene, listelems }
|
Self { ebene, listelems }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub(crate) fn get_content(&self) -> Content {
|
||||||
|
let mut elems = Vec::new();
|
||||||
|
|
||||||
|
for elem in &self.listelems {
|
||||||
|
elems.push(Content::Text(format!("{} {}", elem.symbol.content, elem.text)).into());
|
||||||
|
}
|
||||||
|
|
||||||
|
Content::List(elems)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
@ -372,40 +383,69 @@ impl Table {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, PartialEq)]
|
#[derive(Debug, PartialEq)]
|
||||||
pub(crate) struct Liste {
|
pub(crate) struct Schlussteil {
|
||||||
ziffernliste: Ziffernliste,
|
content: String,
|
||||||
}
|
}
|
||||||
impl Liste {
|
impl Schlussteil {
|
||||||
pub(crate) fn test(n: &Node) -> bool {
|
pub(crate) fn test(n: &Node) -> bool {
|
||||||
n.tag_name().name() == "liste"
|
(n.tag_name().name() == "schlussteil" || n.tag_name().name() == "schluss")
|
||||||
&& n.children().count() == 1
|
&& n.children().count() == 1
|
||||||
&& n.children().next().unwrap().tag_name().name() == "ziffernliste"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn parse(n: Node) -> Self {
|
pub(crate) fn parse(n: Node) -> Self {
|
||||||
assert!(Self::test(&n));
|
assert!(Self::test(&n));
|
||||||
|
|
||||||
let mut c = n.children();
|
let content = n.children().next().unwrap().text().unwrap().into(); //not sure
|
||||||
|
|
||||||
let ziffernliste = Ziffernliste::parse(c.next().unwrap());
|
Self { content }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub(crate) struct Liste {
|
||||||
|
content: Vec<Content>,
|
||||||
|
}
|
||||||
|
impl Liste {
|
||||||
|
pub(crate) fn test(n: &Node) -> bool {
|
||||||
|
n.tag_name().name() == "liste"
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn parse(n: Node) -> Self {
|
||||||
|
assert!(Self::test(&n));
|
||||||
|
|
||||||
|
let mut content = Vec::new();
|
||||||
|
|
||||||
|
let mut c = n.children().peekable();
|
||||||
|
|
||||||
|
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||||
|
|
||||||
|
loop {
|
||||||
|
if let Some(child) = c.peek() {
|
||||||
|
if Ziffernliste::test(child) {
|
||||||
|
content.push(Ziffernliste::parse(c.next().unwrap()).get_content().into());
|
||||||
|
} else if Schlussteil::test(child) {
|
||||||
|
content.push(Content::Text(Schlussteil::parse(c.next().unwrap()).content));
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
assert_eq!(c.next(), None);
|
assert_eq!(c.next(), None);
|
||||||
|
|
||||||
Self { ziffernliste }
|
Self { content }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn get_list(&self) -> Vec<Box<Content>> {
|
pub(crate) fn get_content(&self) -> Content {
|
||||||
let mut ret = Vec::new();
|
Content::List(
|
||||||
|
self.content
|
||||||
for a in &self.ziffernliste.listelems {
|
.clone()
|
||||||
ret.push(Box::new(Content::Text(format!(
|
.into_iter()
|
||||||
"{} {}",
|
.map(|c| Box::new(c))
|
||||||
a.symbol.content,
|
.collect(),
|
||||||
a.text.clone()
|
)
|
||||||
))));
|
|
||||||
}
|
|
||||||
|
|
||||||
ret
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -546,6 +586,8 @@ impl Layoutdaten {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use std::{fs::File, io::Read};
|
use std::{fs::File, io::Read};
|
||||||
|
|
||||||
|
use log::error;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@ -557,7 +599,7 @@ mod tests {
|
|||||||
let mut builder = LawBuilder::test("no-headers");
|
let mut builder = LawBuilder::test("no-headers");
|
||||||
let risdok = Risdok::from_str(&xml, &mut builder);
|
let risdok = Risdok::from_str(&xml, &mut builder);
|
||||||
if risdok.is_err() {
|
if risdok.is_err() {
|
||||||
println!("{:#?}", risdok.as_ref().err());
|
error!("{:#?}", risdok.as_ref().err());
|
||||||
}
|
}
|
||||||
assert!(risdok.is_ok());
|
assert!(risdok.is_ok());
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user