create test for full UrhG overview parsing

2023-11-06 12:00:20 +01:00
parent 385ab4e044
commit 1c4f798788
200 changed files with 13941 additions and 26 deletions

View File

@@ -104,7 +104,19 @@ impl LawBuilder {
pub(crate) fn test(name: &str) -> Self {
let mut last_header_index = None;
let mut classifiers = Vec::new();
if name == "test" {
if name == "UrhG" {
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
classifiers.push(hauptstueck.clone());
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
abschnitt.set_parent(0);
classifiers.push(abschnitt);
let mut numbered_header =
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
numbered_header.set_parent(9999);
classifiers.push(numbered_header);
} else if name == "test" {
let h1 = Classifier::new("h1", Arc::new(&contains));
classifiers.push(h1);
@@ -167,7 +179,7 @@ impl LawBuilder {
last_header_index: None,
};
overview::parse(law_id.unwrap(), &mut builder).unwrap();
overview::parse(law_id.unwrap()).unwrap();
builder.into()
}
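Not part of the diff: a minimal sketch of how the references returned by the reworked overview::parse would presumably be consumed here. The loop is an assumption (the hunk above still discards the return value); the par::parse signature is taken from the last file in this commit.
// Assumption, not in this commit: feed the returned references into the
// paragraph parser instead of letting overview::parse drive the builder.
let references = overview::parse(law_id.unwrap()).unwrap();
for reference in &references {
    crate::par::parse(reference, &mut builder).unwrap();
}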

View File

@@ -2,6 +2,7 @@
mod parser;
use log::info;
use roxmltree::Node;
use serde::Deserialize;
use time::{format_description, OffsetDateTime};
@@ -38,35 +39,49 @@ pub(crate) struct Wrapper {
ogd_search_result: OgdSearchResult,
}
pub(crate) fn parse(overview_id: usize, builder: &mut LawBuilder) -> Result<(), Error> {
pub(crate) fn parse(overview_id: usize) -> Result<Vec<String>, Error> {
let mut page = 1;
let mut skip = true;
let mut ret = Vec::new();
loop {
info!("=== Fetching overview page #{page} ===");
let json = fetch_page(overview_id, page)?;
let wrapper: Wrapper = serde_json::from_str(&json)?;
let mut iter = wrapper.ogd_search_result.get_par().into_iter();
let mut boxed_iter: Box<dyn Iterator<Item = String>> = if skip {
skip = false;
Box::new(iter.skip(1)) // You must specify how many items to skip with `n`
} else {
Box::new(iter)
};
for par in boxed_iter {
// skip bc. first one is never relevant for me :-)
if !crate::par::parse(&par, builder).unwrap() {
break;
}
let (cont, nodes) = parse_from_str(&json, skip)?;
for n in nodes {
ret.push(n.clone());
}
page += 1;
if !wrapper.ogd_search_result.has_next_page() {
if !cont {
break;
}
skip = false;
page += 1;
}
Ok(())
Ok(ret)
}
pub(crate) fn parse_from_str(
content: &str,
skip_first: bool,
) -> Result<(bool, Vec<String>), Error> {
let mut ret = Vec::new();
let wrapper: Wrapper = serde_json::from_str(content)?;
let iter = wrapper.ogd_search_result.get_par().into_iter();
let boxed_iter: Box<dyn Iterator<Item = String>> = if skip_first {
Box::new(iter.skip(1))
} else {
Box::new(iter)
};
for par in boxed_iter {
ret.push(par);
//if !crate::par::parse(&par).unwrap() {
// return Ok(false);
//}
}
if !wrapper.ogd_search_result.has_next_page() {
return Ok((false, ret));
}
return Ok((true, ret));
}
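For orientation, a minimal sketch of the new parse_from_str contract when a single stored page is parsed in isolation; the fixture path is hypothetical, the function names are from the hunk above.
// Sketch: parse one saved overview page. `has_next` tells the caller whether a
// further page exists; `entries` holds the collected paragraph references, with
// the very first entry of the run skipped when `skip_first` is true.
let json = std::fs::read_to_string("data/urhg/overview/001.json").unwrap(); // hypothetical fixture name
let (has_next, entries) = parse_from_str(&json, true).unwrap();
if has_next {
    // fetch or load the next page and call parse_from_str(&json, false)
}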

View File

@@ -192,12 +192,27 @@ pub(crate) struct ContentUrlItem {
#[cfg(test)]
mod tests {
use std::{fs::File, io::Read};
use std::{
fs::{self, File},
io::{self, BufRead, Read},
path::Path,
};
use log::debug;
use crate::{law::LawBuilder, overview::parse_from_str};
use super::*;
fn read_lines<P>(filename: P) -> io::Result<Vec<String>>
where
P: AsRef<Path>,
{
let file = File::open(filename)?;
let buf_reader = io::BufReader::new(file);
buf_reader.lines().collect()
}
#[derive(Deserialize)]
#[serde(rename_all = "PascalCase")]
pub(crate) struct Wrapper {
@@ -248,4 +263,37 @@ mod tests {
assert!(wrapper.is_ok());
}
#[test]
fn test_overview_full_urhg() {
let mut files = Vec::new();
let path = Path::new("./data/urhg/overview");
let mut entries: Vec<_> = fs::read_dir(path)
.unwrap()
.filter_map(|entry| entry.ok())
.collect();
entries.sort_by_key(|entry| entry.file_name());
let last_index = fs::read_dir(path).unwrap().count() - 1;
let mut skip = true;
for (idx, entry) in entries.into_iter().enumerate() {
let mut file = File::open(path.join(entry.file_name())).unwrap();
let mut json = String::new();
file.read_to_string(&mut json).unwrap();
let expected_continue = !(idx == last_index);
let (cont, cur_files) = parse_from_str(&json, skip).unwrap();
assert_eq!(cont, expected_continue);
for file in cur_files {
files.push(file);
}
skip = false;
}
let expected = read_lines(path.join("../overview.result")).unwrap();
assert_eq!(files, expected);
}
}
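A hedged companion sketch (not part of the commit): the overview.result fixture the test compares against could be regenerated by dumping a live parse run, one reference per line, which is the shape read_lines expects.
// Hypothetical helper, not in this commit: rewrite the fixture next to the
// page dumps so the full-UrhG test above stays in sync with the live data.
fn dump_overview_fixture(overview_id: usize) {
    let entries = crate::overview::parse(overview_id).expect("overview parse failed");
    std::fs::write("./data/urhg/overview.result", entries.join("\n"))
        .expect("could not write fixture");
}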

View File

@@ -11,9 +11,12 @@ fn fetch_page(url: &str) -> Result<String, Error> {
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
info!("Parsing {url}");
let xml = fetch_page(url)?;
parse_from_str(&xml, builder)
}
pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
//
//
let xml = xml.replace(
// in § 17 (2)
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,