restructure project
All checks were successful
CI/CD Pipeline / test (push) Successful in 1m35s

This commit is contained in:
2024-02-06 11:45:44 +01:00
parent 359177a4c4
commit 7fa62ef1f9
8 changed files with 8 additions and 10 deletions

133
src/paragraph/mod.rs Normal file
View File

@ -0,0 +1,133 @@
//! Deals with getting all paragraphs for a given law text
mod parser;
use log::info;
use crate::{law::LawBuilder, misc::Error};
use self::parser::Risdok;
/// Downloads single law paragraphs and feeds them into a [`LawBuilder`],
/// after applying configurable textual clean-ups to the raw document.
pub struct Parser {
    /// Substrings that are deleted from the document before it is parsed.
    remove: Vec<String>,
    /// `(search, replacement)` pairs applied to the document before it is parsed.
    replace: Vec<(String, String)>,
}

impl Default for Parser {
    /// Same as [`Parser::new`].
    fn default() -> Self {
        Self::new()
    }
}

impl Parser {
    /// Creates a parser with no removals and no replacements configured.
    pub fn new() -> Self {
        Self {
            remove: vec![],
            replace: vec![],
        }
    }

    /// Registers `data` as a substring to delete from every document before parsing.
    pub fn add_string_to_remove(&mut self, data: &str) {
        self.remove.push(data.to_owned());
    }

    /// Registers a substitution: every occurrence of `search` becomes `replace` before parsing.
    pub fn add_string_to_replace(&mut self, search: &str, replace: &str) {
        self.replace.push((search.to_owned(), replace.to_owned()));
    }

    /// Parses the content available in `url`. Calls appropriate functions in supplied `LawBuilder`.
    ///
    /// Non-breaking spaces (U+00A0) in the downloaded text are turned into plain spaces
    /// before the configured removals and replacements run.
    ///
    /// # Errors
    /// Propagates any error from `fetch` or from `Risdok::from_str`.
    ///
    /// # Returns
    /// The boolean produced by `Risdok::from_str`, forwarded unchanged.
    pub fn parse(&self, url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        info!("Parsing {url}");
        let document = fetch(url)?.replace('\u{a0}', " ");
        self.parse_from_str(&document, builder)
    }

    /// Applies all removals, then all replacements (each in registration order),
    /// and hands the cleaned text to `Risdok::from_str`.
    fn parse_from_str(&self, xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        let stripped = self
            .remove
            .iter()
            .fold(xml.to_owned(), |text, needle| text.replace(needle.as_str(), ""));
        let cleaned = self
            .replace
            .iter()
            .fold(stripped, |text, (search, replacement)| {
                text.replace(search.as_str(), replacement)
            });
        Risdok::from_str(&cleaned, builder)
    }
}
/// Downloads `url` with `ureq` and returns the response body as a string.
///
/// # Errors
/// Propagates the request error or the error raised while reading the body.
#[cfg(not(test))]
fn fetch(url: &str) -> Result<String, Error> {
    let response = ureq::get(url).call()?;
    let body = response.into_string()?;
    Ok(body)
}
/// Test-build variant of `fetch` backed by an on-disk cache.
///
/// The cache file is `./data/cache/par-<hash>`, where `<hash>` is the hex
/// `DefaultHasher` digest of `url`. If that file can be read its content is
/// returned; otherwise the URL is downloaded and the body is written to the file.
///
/// # Errors
/// Propagates download errors.
///
/// # Panics
/// Panics with "Unable to write file" if the cache file cannot be written.
#[cfg(test)]
fn fetch(url: &str) -> Result<String, Error> {
    use std::{
        collections::hash_map::DefaultHasher,
        fs,
        hash::{Hash, Hasher},
    };

    let mut state = DefaultHasher::new();
    url.hash(&mut state);
    let cache_file = format!("./data/cache/par-{:x}", state.finish());

    // Any read failure is treated as a cache miss.
    if let Ok(cached) = fs::read_to_string(&cache_file) {
        return Ok(cached);
    }

    info!("Not finding url {url} in the cache, downloading...");
    let body = ureq::get(url).call()?.into_string()?;
    fs::write(cache_file, &body).expect("Unable to write file");
    Ok(body)
}
#[cfg(test)]
mod tests {
    use std::fs;

    use crate::config::Config;
    use pretty_assertions::assert_eq;

    /// For every config file in `./data/configs`, parses each URL listed in
    /// `./data/expected/overview/<law_id>` and compares the builder's history
    /// with the lines stored in `./data/expected/par/<law_id>`.
    #[test]
    fn all_configs_produce_expected_output() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");
        for config in configs {
            let path = format!("{}", config.unwrap().path().display());
            let (law_id, mut builder, parser) = Config::load(&path).unwrap();

            let paragraph_path = format!("./data/expected/overview/{law_id}");
            let expected_path = format!("./data/expected/par/{law_id}");

            // `expect` prints its message verbatim (no `{}` interpolation), so the
            // path is formatted through `panic!` instead.
            let pars = fs::read_to_string(&paragraph_path)
                .unwrap_or_else(|_| panic!("Could not read file {paragraph_path}."));
            let pars = pars.trim().split('\n').collect::<Vec<&str>>();

            for par in pars {
                println!("{par}");
                let cont = parser.parse(par, &mut builder).unwrap();
                // The parser returns `false` to stop processing further URLs.
                if !cont {
                    break;
                }
            }

            let actual = &builder.history;
            let expected = fs::read_to_string(&expected_path)
                .unwrap_or_else(|_| panic!("Could not read file {expected_path}."));
            let expected = expected.trim().split('\n').collect::<Vec<&str>>();
            assert_eq!(actual, &expected);
        }
    }
}

565
src/paragraph/parser.rs Normal file
View File

@ -0,0 +1,565 @@
use roxmltree::Node;
use crate::{
law::{Content, LawBuilder},
misc::Error,
};
/// Entry point for parsing a `risdok` document.
#[derive(Debug, PartialEq)]
pub(crate) struct Risdok {}

impl Risdok {
    /// Walks a `risdok` element whose children must be, in order,
    /// `metadaten`, `nutzdaten` and `layoutdaten`.
    ///
    /// Returns the result of `Nutzdaten::parse`. When that result is `false`
    /// the function returns immediately and the remaining children are not inspected.
    ///
    /// # Panics
    /// Panics if the element name or the child layout differs from the above.
    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool {
        assert!(n.tag_name().name() == "risdok");
        let mut children = n.children();

        Metadaten::parse(children.next().unwrap());

        let keep_going = Nutzdaten::parse(children.next().unwrap(), builder);
        if keep_going {
            Layoutdaten::parse(children.next().unwrap());
            assert_eq!(children.next(), None);
        }
        keep_going
    }

    /// Parses `xml` as a document and processes its single top-level node with [`Risdok::parse`].
    ///
    /// # Errors
    /// Returns the XML parsing error if `xml` is not well-formed.
    ///
    /// # Panics
    /// Panics if the document root does not have exactly one child.
    pub(crate) fn from_str(xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
        let document = roxmltree::Document::parse(xml)?;
        let top_level = document.root();
        assert_eq!(top_level.children().count(), 1);
        let risdok = top_level.children().next().unwrap();
        Ok(Self::parse(risdok, builder))
    }
}
/// Marker for the `metadaten` element, which is expected to be empty.
#[derive(Debug, PartialEq)]
pub(crate) struct Metadaten;

impl Metadaten {
    /// Validates that `n` is a `metadaten` element without any children.
    ///
    /// # Panics
    /// Panics if the tag name differs or the element has a child.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "metadaten");
        assert_eq!(n.first_child(), None);
        Self
    }
}
/// Handler for the `nutzdaten` element, which wraps exactly one `abschnitt`.
#[derive(Debug, PartialEq)]
pub(crate) struct Nutzdaten {}

impl Nutzdaten {
    /// Parses the single child of `n` with `Abschnitt::parse` and returns its result.
    ///
    /// # Panics
    /// Panics if `n` is not a `nutzdaten` element or does not have exactly one child.
    pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool {
        assert!(n.tag_name().name() == "nutzdaten");
        let mut sections = n.children();
        let abschnitt = sections.next().unwrap();
        let keep_going = Abschnitt::parse(abschnitt, builder);
        assert_eq!(sections.next(), None);
        keep_going
    }
}
#[derive(Debug, PartialEq)]
pub(crate) struct Abschnitt;
impl Abschnitt {
pub(crate) fn parse(n: Node, builder: &mut LawBuilder) -> bool {
assert!(n.tag_name().name() == "abschnitt");
let mut c = n.children().peekable();
Kzinhalt::parse(c.next().unwrap());
Kzinhalt::parse(c.next().unwrap());
Fzinhalt::parse(c.next().unwrap());
Fzinhalt::parse(c.next().unwrap());
// Skip all UeberschriftTitle and Absatz
while let Some(child) = c.peek() {
if Ueberschrift::test(child, "titel") {
c.next();
continue;
}
if Absatz::test_with_typ(child, "erltext") {
c.next();
continue;
}
break;
}
while let Some(child) = c.peek() {
if Ueberschrift::test(child, "g1") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1");
if ueberschrift.content.trim().starts_with("Artikel") {
return false;
}
builder.new_header(&ueberschrift.content);
} else if Ueberschrift::test(child, "g2") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g2");
builder.new_desc(&ueberschrift.content);
} else if Ueberschrift::test(child, "g1min") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "g1min");
builder.new_header(&ueberschrift.content);
} else if Ueberschrift::test(child, "art") {
let ueberschrift = Ueberschrift::parse(c.next().unwrap(), "art");
if ueberschrift.content.trim().starts_with("Artikel") {
return false;
}
} else {
break;
}
}
if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") {
builder
.new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
}
}
// e.g. § 405 abgb has two para (of diseased paragraph)
if let Some(child) = c.peek() {
if Ueberschrift::test(child, "para") {
builder
.new_next_para_header(&Ueberschrift::parse(c.next().unwrap(), "para").content);
}
}
// We have 2 tasks
// 1) Get paragraph id
// 2) Get content
let mut absatze = Vec::new();
let absatz = AbsatzAbs::parse(c.next().expect("We need at least one 'Absatz'"));
let par_id = absatz
.gldsym
.clone()
.expect("First 'Absatz' needs to have § id");
// If there's a "liste" after an "absatz", the "liste" should be part of the "absatz"
if let Some(child) = c.peek() {
if Liste::test(child) {
let liste = Liste::parse(c.next().unwrap());
absatze.push(Content::List(vec![
Content::Text(absatz.content.replace('\u{a0}', " ")),
liste.get_content(),
]));
} else if Table::test(child) {
// If there's a "table" after an "absatz", the "table" should be part of the "absatz"
let table = Table::parse(c.next().unwrap());
if let Some(child) = c.peek() {
if Absatz::test_with_typ(child, "erltext") {
let after_absatz = Absatz::parse(c.next().unwrap());
absatze.push(Content::List(vec![
Content::Text(absatz.content.replace('\u{a0}', " ")),
Content::List(table.get_list()),
Content::Text(after_absatz.content),
]));
} else {
absatze.push(Content::List(vec![
Content::Text(absatz.content.replace('\u{a0}', " ")),
Content::List(table.get_list()),
]));
}
}
} else {
absatze.push(Content::Text(absatz.content.replace('\u{a0}', " ").clone()));
}
} else {
absatze.push(Content::Text(absatz.content.replace('\u{a0}', " ").clone()));
}
//There can be as many 'Absätze' as our lovely lawsetter wants
while let Some(child) = c.peek() {
if AbsatzAbs::test(child) {
let abs = AbsatzAbs::parse(c.next().unwrap());
// If there's a "liste" after an "absatz", the "liste" should be part of the "absatz"
if let Some(child) = c.peek() {
if Liste::test(child) {
let liste = Liste::parse(c.next().unwrap());
absatze.push(Content::List(vec![
Content::Text(abs.content.replace('\u{a0}', " ")),
liste.get_content(),
]));
} else {
absatze.push(Content::Text(abs.content.replace('\u{a0}', " ")));
}
} else {
absatze.push(Content::Text(abs.content.replace('\u{a0}', " ")));
}
continue;
}
break;
}
if absatze.len() == 1 {
builder.new_par(par_id, absatze[0].clone());
} else {
let mut contents = Vec::new();
for a in &absatze {
contents.push(a.clone());
}
builder.new_par(par_id, Content::Item(contents));
}
// Skip all UeberschriftTitle and Absatz
while let Some(child) = c.peek() {
if Ueberschrift::test(child, "titel") {
c.next();
continue;
}
if Absatz::test(child) {
c.next();
continue;
}
break;
}
assert_eq!(c.next(), None);
true
}
}
/// A `symbol` element: the marker in front of a list entry.
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Symbol {
    /// Value of the `stellen` attribute.
    stellen: String,
    /// Text of the element.
    content: String,
}

impl Symbol {
    /// Reads the `stellen` attribute and the text of a `symbol` element.
    ///
    /// # Panics
    /// Panics if `n` is not a `symbol` element with exactly one child,
    /// or if the attribute or the text is missing.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "symbol");
        assert_eq!(n.children().count(), 1);
        Self {
            stellen: n.attribute("stellen").unwrap().to_owned(),
            content: n.text().unwrap().to_owned(),
        }
    }
}
/// A `listelem` element: one list entry made of a symbol and its text.
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Listelem {
    /// The marker parsed from the first child.
    symbol: Symbol,
    /// The text taken from the second child.
    text: String,
}

impl Listelem {
    /// Parses a `listelem` element with exactly two children: a `symbol`
    /// followed by a node that provides the entry text.
    ///
    /// # Panics
    /// Panics if the tag name, the number of children or their content is unexpected.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "listelem");
        let mut parts = n.children();
        let symbol = Symbol::parse(parts.next().unwrap());
        let body = parts.next().unwrap();
        let text = body.text().unwrap().to_owned();
        assert_eq!(parts.next(), None);
        Self { symbol, text }
    }
}
/// One of the list-like elements (`ziffernliste`, `aufzaehlung`, `literaliste`, `strichliste`).
#[derive(Debug, PartialEq, Clone)]
pub(crate) struct Ziffernliste {
    /// Value of the `ebene` attribute.
    ebene: String,
    /// The entries of the list in document order.
    listelems: Vec<Listelem>,
}

impl Ziffernliste {
    /// Returns `true` if `n` is one of the supported list elements.
    pub(crate) fn test(n: &Node) -> bool {
        // strichliste -> § 194b FSG
        matches!(
            n.tag_name().name(),
            "ziffernliste" | "aufzaehlung" | "literaliste" | "strichliste"
        )
    }

    /// Parses the list element `n`, treating every child as a `listelem`.
    ///
    /// # Panics
    /// Panics if `n` fails [`Ziffernliste::test`], lacks the `ebene` attribute,
    /// or contains a child that `Listelem::parse` rejects.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(Self::test(&n));
        let ebene = n.attribute("ebene").unwrap().to_owned();
        let listelems = n.children().map(Listelem::parse).collect();
        Self { ebene, listelems }
    }

    /// Renders each entry as "`<symbol> <text>`" (non-breaking spaces replaced
    /// by plain spaces) and wraps the entries in a `Content::List`.
    pub(crate) fn get_content(&self) -> Content {
        let entries = self
            .listelems
            .iter()
            .map(|elem| {
                let line = format!("{} {}", elem.symbol.content, elem.text);
                Content::Text(line.replace('\u{a0}', " "))
            })
            .collect();
        Content::List(entries)
    }
}
/// A `td` table cell, which holds exactly one paragraph.
#[derive(Debug, PartialEq)]
pub(crate) struct Td {
    /// The paragraph contained in the cell.
    absatz: Absatz,
}

impl Td {
    /// Parses a `td` element whose only child is an `absatz`.
    ///
    /// # Panics
    /// Panics if `n` is not a `td` element or does not have exactly one `absatz` child.
    pub(crate) fn parse(n: &Node) -> Self {
        assert!(n.tag_name().name() == "td");
        let mut cell_children = n.children();
        let only_child = cell_children.next().unwrap();
        let absatz = Absatz::parse(only_child);
        assert_eq!(cell_children.next(), None);
        Self { absatz }
    }
}
/// A `tr` table row.
#[derive(Debug, PartialEq)]
pub(crate) struct Tr {
    /// The cells of the row in document order.
    tds: Vec<Td>,
}

impl Tr {
    /// Parses a `tr` element, treating every child as a `td`.
    ///
    /// # Panics
    /// Panics if `n` is not a `tr` element or a child is rejected by `Td::parse`.
    pub(crate) fn parse(n: &Node) -> Self {
        assert!(n.tag_name().name() == "tr");
        let tds = n.children().map(|cell| Td::parse(&cell)).collect();
        Self { tds }
    }
}
/// A `table` element.
#[derive(Debug, PartialEq)]
pub(crate) struct Table {
    /// The rows of the table in document order.
    trs: Vec<Tr>,
}

impl Table {
    /// Returns `true` if `n` is a `table` element.
    pub(crate) fn test(n: &Node) -> bool {
        let tag = n.tag_name().name();
        tag == "table"
    }

    /// Parses a `table` element, treating every child as a `tr`.
    ///
    /// # Panics
    /// Panics if `n` is not a `table` element or a child is rejected by `Tr::parse`.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(Self::test(&n));
        let trs = n.children().map(|row| Tr::parse(&row)).collect();
        Self { trs }
    }

    /// Renders every row as one `Content::Text` of the form "`- <cell> <cell> `":
    /// each cell text is followed by a single space, and non-breaking spaces are
    /// replaced by plain spaces.
    pub(crate) fn get_list(&self) -> Vec<Content> {
        self.trs
            .iter()
            .map(|tr| {
                let cells: String = tr
                    .tds
                    .iter()
                    .map(|td| format!("{} ", td.absatz.content))
                    .collect();
                Content::Text(format!("- {cells}").replace('\u{a0}', " "))
            })
            .collect()
    }
}
/// A `schlussteil` / `schluss` element inside a list.
#[derive(Debug, PartialEq)]
pub(crate) struct Schlussteil {
    /// Text taken from the element's only child.
    content: String,
}

impl Schlussteil {
    /// Returns `true` if `n` is a `schlussteil` or `schluss` element with exactly one child.
    pub(crate) fn test(n: &Node) -> bool {
        let is_closing_tag = matches!(n.tag_name().name(), "schlussteil" | "schluss");
        is_closing_tag && n.children().count() == 1
    }

    /// Extracts the text of the single child of `n`.
    ///
    /// NOTE(review): the original author marked this extraction as "not sure".
    ///
    /// # Panics
    /// Panics if `n` fails [`Schlussteil::test`] or the child carries no text.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(Self::test(&n));
        let only_child = n.children().next().unwrap();
        Self {
            content: only_child.text().unwrap().to_owned(),
        }
    }
}
/// A `liste` element: one or more list blocks, optionally mixed with closing text parts.
#[derive(Debug)]
pub(crate) struct Liste {
    /// Rendered content of every child in document order.
    content: Vec<Content>,
}

impl Liste {
    /// Returns `true` if `n` is a `liste` element.
    pub(crate) fn test(n: &Node) -> bool {
        let tag = n.tag_name().name();
        tag == "liste"
    }

    /// Parses a `liste` element.
    ///
    /// The first child must be a list accepted by `Ziffernliste::parse`; every further
    /// child must be either such a list or a `Schlussteil`.
    ///
    /// # Panics
    /// Panics if `n` is not a `liste` element, has no children, or contains
    /// a child of another kind.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(Self::test(&n));
        let mut children = n.children().peekable();
        let leading = Ziffernliste::parse(children.next().unwrap());
        let mut content = vec![leading.get_content()];

        while let Some(upcoming) = children.peek() {
            let entry = if Ziffernliste::test(upcoming) {
                Ziffernliste::parse(children.next().unwrap()).get_content()
            } else if Schlussteil::test(upcoming) {
                let tail = Schlussteil::parse(children.next().unwrap());
                Content::Text(tail.content.replace('\u{a0}', " "))
            } else {
                break;
            };
            content.push(entry);
        }

        // Anything left over is an unsupported child.
        assert_eq!(children.next(), None);
        Self { content }
    }

    /// Returns a copy of the collected entries wrapped in a `Content::List`.
    pub(crate) fn get_content(&self) -> Content {
        let entries = self.content.clone();
        Content::List(entries)
    }
}
/// An `absatz` element of type `abs`: a paragraph, optionally introduced by a `gldsym`.
#[derive(Debug, PartialEq)]
pub(crate) struct AbsatzAbs {
    /// Text of the leading `gldsym` child, if there is one (non-breaking spaces replaced).
    gldsym: Option<String>,
    /// Trimmed text of the child following the optional `gldsym`.
    content: String,
}

impl AbsatzAbs {
    /// Returns `true` if `n` is an `absatz` element whose `typ` attribute is `abs`.
    ///
    /// An `absatz` without a `typ` attribute yields `false` instead of panicking,
    /// in line with `Absatz::test_with_typ`.
    pub(crate) fn test(n: &Node) -> bool {
        n.tag_name().name() == "absatz" && n.attribute("typ") == Some("abs")
    }

    /// Parses an `absatz` element of type `abs`.
    ///
    /// The element may start with a `gldsym` leaf; the next child provides the
    /// paragraph text, and no further children are allowed.
    ///
    /// # Panics
    /// Panics if `n` is not an `absatz` with `typ="abs"`, if the text child is
    /// missing or has no text, or if there are additional children.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "absatz");
        assert_eq!(n.attribute("typ"), Some("abs"));
        let mut c = n.children().peekable();

        // Consume the first child only if it is a `gldsym` leaf.
        let gldsym = c
            .next_if(|child| Leaf::test(child, "gldsym"))
            .map(|node| Leaf::parse(node, "gldsym").replace('\u{a0}', " "));

        let content = c.next().unwrap().text().unwrap().trim().to_owned();
        assert_eq!(c.next(), None);
        Self { gldsym, content }
    }
}
/// Helper for elements that consist of a single child carrying text.
#[derive(Debug, PartialEq)]
pub(crate) struct Leaf {
    content: String,
}

impl Leaf {
    /// Returns `true` if `n` is an element called `name` with exactly one child.
    pub(crate) fn test(n: &Node, name: &str) -> bool {
        if n.tag_name().name() != name {
            return false;
        }
        n.children().count() == 1
    }

    /// Returns the text of the element `n`.
    ///
    /// # Panics
    /// Panics if `n` is not called `name`, does not have exactly one child,
    /// or has no text.
    pub(crate) fn parse(n: Node, name: &str) -> String {
        assert!(n.tag_name().name() == name);
        assert_eq!(n.children().count(), 1);
        let text = n.text().unwrap();
        text.to_owned()
    }
}
/// An `absatz` element of any type.
#[derive(Debug, PartialEq)]
pub(crate) struct Absatz {
    /// Text of the element; empty when the element has no text.
    content: String,
}

impl Absatz {
    /// Returns `true` if `n` is an `absatz` element.
    pub(crate) fn test(n: &Node) -> bool {
        let tag = n.tag_name().name();
        tag == "absatz"
    }

    /// Returns `true` if `n` is an `absatz` element whose `typ` attribute equals `typ`.
    pub(crate) fn test_with_typ(n: &Node, typ: &str) -> bool {
        if !Self::test(n) {
            return false;
        }
        n.attribute("typ") == Some(typ)
    }

    /// Reads the text of an `absatz` element, falling back to an empty string.
    ///
    /// # Panics
    /// Panics if `n` is not an `absatz` element.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(Self::test(&n));
        let content = n.text().map(String::from).unwrap_or_default();
        Self { content }
    }
}
/// An `ueberschrift` (heading) element of a specific type.
#[derive(Debug, PartialEq)]
pub(crate) struct Ueberschrift {
    /// The `typ` the heading was parsed as.
    typ: String,
    /// Text of the heading.
    content: String,
}

impl Ueberschrift {
    /// Returns `true` if `n` is an `ueberschrift` element whose `typ` attribute equals `typ`.
    ///
    /// A heading without a `typ` attribute yields `false` instead of panicking,
    /// in line with `Absatz::test_with_typ`.
    fn test(n: &Node, typ: &str) -> bool {
        n.tag_name().name() == "ueberschrift" && n.attribute("typ") == Some(typ)
    }

    /// Parses an `ueberschrift` element of the given `typ`.
    ///
    /// # Panics
    /// Panics if `n` is not an `ueberschrift` element, if its `typ` attribute
    /// is missing or different from `typ`, or if it has no text.
    pub(crate) fn parse(n: Node, typ: &str) -> Self {
        assert!(n.tag_name().name() == "ueberschrift");
        assert_eq!(n.attribute("typ"), Some(typ));
        Self {
            content: n.text().unwrap().into(),
            typ: typ.into(),
        }
    }
}
/// Marker for the `kzinhalt` element; its content is currently ignored.
#[derive(Debug, PartialEq)]
pub(crate) struct Kzinhalt;

impl Kzinhalt {
    /// Checks that `n` is a `kzinhalt` element. Children are not inspected.
    ///
    /// # Panics
    /// Panics if the tag name differs.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "kzinhalt");
        // TODO: parse the content if it is ever needed
        Self
    }
}
/// Marker for the `fzinhalt` element; its content is currently ignored.
#[derive(Debug, PartialEq)]
pub(crate) struct Fzinhalt;

impl Fzinhalt {
    /// Checks that `n` is a `fzinhalt` element. Children are not inspected.
    ///
    /// # Panics
    /// Panics if the tag name differs.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "fzinhalt");
        // TODO: parse the content if it is ever needed
        Self
    }
}
/// Marker for the `layoutdaten` element, which is expected to be empty.
#[derive(Debug, PartialEq)]
pub(crate) struct Layoutdaten;

impl Layoutdaten {
    /// Validates that `n` is a `layoutdaten` element without any children.
    ///
    /// # Panics
    /// Panics if the tag name differs or the element has a child.
    pub(crate) fn parse(n: Node) -> Self {
        assert!(n.tag_name().name() == "layoutdaten");
        assert_eq!(n.first_child(), None);
        Self
    }
}