Create a test that parses the full UrhG overview
This commit is contained in:
16
src/law.rs
16
src/law.rs
@ -104,7 +104,19 @@ impl LawBuilder {
|
||||
pub(crate) fn test(name: &str) -> Self {
|
||||
let mut last_header_index = None;
|
||||
let mut classifiers = Vec::new();
|
||||
if name == "test" {
|
||||
if name == "UrhG" {
|
||||
let hauptstueck = Classifier::new("Hauptstück", Arc::new(&contains));
|
||||
classifiers.push(hauptstueck.clone());
|
||||
|
||||
let mut abschnitt = Classifier::new("Abschnitt", Arc::new(&contains));
|
||||
abschnitt.set_parent(0);
|
||||
classifiers.push(abschnitt);
|
||||
|
||||
let mut numbered_header =
|
||||
Classifier::new("Numbered Header", Arc::new(&starts_with_number));
|
||||
numbered_header.set_parent(9999);
|
||||
classifiers.push(numbered_header);
|
||||
} else if name == "test" {
|
||||
let h1 = Classifier::new("h1", Arc::new(&contains));
|
||||
classifiers.push(h1);
|
||||
|
||||
@ -167,7 +179,7 @@ impl LawBuilder {
|
||||
last_header_index: None,
|
||||
};
|
||||
|
||||
overview::parse(law_id.unwrap(), &mut builder).unwrap();
|
||||
overview::parse(law_id.unwrap()).unwrap();
|
||||
|
||||
builder.into()
|
||||
}
|
||||
|
@ -2,6 +2,7 @@
|
||||
mod parser;
|
||||
|
||||
use log::info;
|
||||
use roxmltree::Node;
|
||||
use serde::Deserialize;
|
||||
use time::{format_description, OffsetDateTime};
|
||||
|
||||
@ -38,35 +39,49 @@ pub(crate) struct Wrapper {
|
||||
ogd_search_result: OgdSearchResult,
|
||||
}
|
||||
|
||||
pub(crate) fn parse(overview_id: usize, builder: &mut LawBuilder) -> Result<(), Error> {
|
||||
pub(crate) fn parse(overview_id: usize) -> Result<Vec<String>, Error> {
|
||||
let mut page = 1;
|
||||
let mut skip = true;
|
||||
let mut ret = Vec::new();
|
||||
loop {
|
||||
info!("=== Fetching overview page #{page} ===");
|
||||
let json = fetch_page(overview_id, page)?;
|
||||
|
||||
let wrapper: Wrapper = serde_json::from_str(&json)?;
|
||||
|
||||
let mut iter = wrapper.ogd_search_result.get_par().into_iter();
|
||||
let mut boxed_iter: Box<dyn Iterator<Item = String>> = if skip {
|
||||
skip = false;
|
||||
Box::new(iter.skip(1)) // You must specify how many items to skip with `n`
|
||||
} else {
|
||||
Box::new(iter)
|
||||
};
|
||||
for par in boxed_iter {
|
||||
// skip bc. first one is never relevant for me :-)
|
||||
if !crate::par::parse(&par, builder).unwrap() {
|
||||
break;
|
||||
}
|
||||
let (cont, nodes) = parse_from_str(&json, skip)?;
|
||||
for n in nodes {
|
||||
ret.push(n.clone());
|
||||
}
|
||||
|
||||
page += 1;
|
||||
|
||||
if !wrapper.ogd_search_result.has_next_page() {
|
||||
if !cont {
|
||||
break;
|
||||
}
|
||||
skip = false;
|
||||
page += 1;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(ret)
|
||||
}
|
||||
|
||||
pub(crate) fn parse_from_str(
|
||||
content: &str,
|
||||
skip_first: bool,
|
||||
) -> Result<(bool, Vec<String>), Error> {
|
||||
let mut ret = Vec::new();
|
||||
let wrapper: Wrapper = serde_json::from_str(content)?;
|
||||
|
||||
let iter = wrapper.ogd_search_result.get_par().into_iter();
|
||||
let boxed_iter: Box<dyn Iterator<Item = String>> = if skip_first {
|
||||
Box::new(iter.skip(1))
|
||||
} else {
|
||||
Box::new(iter)
|
||||
};
|
||||
for par in boxed_iter {
|
||||
ret.push(par);
|
||||
//if !crate::par::parse(&par).unwrap() {
|
||||
// return Ok(false);
|
||||
//}
|
||||
}
|
||||
|
||||
if !wrapper.ogd_search_result.has_next_page() {
|
||||
return Ok((false, ret));
|
||||
}
|
||||
return Ok((true, ret));
|
||||
}
|
||||
|
@ -192,12 +192,27 @@ pub(crate) struct ContentUrlItem {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{fs::File, io::Read};
|
||||
use std::{
|
||||
fs::{self, File},
|
||||
io::{self, BufRead, Read},
|
||||
path::Path,
|
||||
};
|
||||
|
||||
use log::debug;
|
||||
|
||||
use crate::{law::LawBuilder, overview::parse_from_str};
|
||||
|
||||
use super::*;
|
||||
|
||||
/// Reads the file at `filename` and returns its lines in order.
///
/// Line terminators are not part of the returned strings.
///
/// # Errors
///
/// Fails if the file cannot be opened or if reading any line fails.
fn read_lines<P>(filename: P) -> io::Result<Vec<String>>
where
    P: AsRef<Path>,
{
    let reader = io::BufReader::new(File::open(filename)?);

    let mut lines = Vec::new();
    for line in reader.lines() {
        // Stop at the first line that cannot be read.
        lines.push(line?);
    }
    Ok(lines)
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
#[serde(rename_all = "PascalCase")]
|
||||
pub(crate) struct Wrapper {
|
||||
@ -248,4 +263,37 @@ mod tests {
|
||||
|
||||
assert!(wrapper.is_ok());
|
||||
}
|
||||
|
||||
/// Feeds every stored overview page under `./data/urhg/overview` through
/// `parse_from_str` in file-name order and compares the concatenated
/// entries with the lines of `overview.result` one directory up.
///
/// Every page except the last one is expected to report a following page.
#[test]
fn test_overview_full_urhg() {
    let path = Path::new("./data/urhg/overview");

    let mut entries: Vec<_> = fs::read_dir(path)
        .unwrap()
        .filter_map(|entry| entry.ok())
        .collect();
    entries.sort_by_key(|entry| entry.file_name());

    // Derive the last index from the list that is actually iterated, so the
    // two cannot disagree, and fail with a clear message instead of an
    // arithmetic underflow when the fixture directory is empty.
    assert!(
        !entries.is_empty(),
        "no overview fixtures found in {}",
        path.display()
    );
    let last_index = entries.len() - 1;

    let mut files = Vec::new();
    for (idx, entry) in entries.into_iter().enumerate() {
        let json = fs::read_to_string(entry.path()).unwrap();

        // Only the very first page has its leading entry skipped.
        let (cont, cur_files) = parse_from_str(&json, idx == 0).unwrap();
        assert_eq!(cont, idx != last_index);

        files.extend(cur_files);
    }

    let expected = read_lines(path.join("../overview.result")).unwrap();
    assert_eq!(files, expected);
}
|
||||
}
|
||||
|
@ -11,9 +11,12 @@ fn fetch_page(url: &str) -> Result<String, Error> {
|
||||
pub(crate) fn parse(url: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
|
||||
info!("Parsing {url}");
|
||||
let xml = fetch_page(url)?;
|
||||
parse_from_str(&xml, builder)
|
||||
}
|
||||
|
||||
pub(crate) fn parse_from_str(xml: &str, builder: &mut LawBuilder) -> Result<bool, Error> {
|
||||
let xml = xml.replace("<gdash />", "-"); // used e.g. in §11 Abs. 3 UrhG
|
||||
//
|
||||
//
|
||||
|
||||
let xml = xml.replace(
|
||||
// in § 17 (2)
|
||||
r#"<liste><schlussteil ebene="0" art="normal" ct="text">(2) Einer Rundfunksendung steht es gleich, wenn ein Werk von einer im In- oder im Ausland gelegenen Stelle aus der Öffentlichkeit im Inland, ähnlich wie durch Rundfunk, aber mit Hilfe von Leitungen wahrnehmbar gemacht wird.</schlussteil></liste>"#,
|
||||
|
Reference in New Issue
Block a user