// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program. If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.
|
|
|
|
//! Deals with getting all paragraphs for a given law text
|
|
|
|
mod ris_structure;
|
|
|
|
use std::path::Path;
|
|
|
|
use log::info;
|
|
use serde::Deserialize;
|
|
|
|
use crate::misc::{current_date, get_cache_dir, Error};
|
|
|
|
use ris_structure::OgdSearchResult;
|
|
|
|
/// Parses a law text from the Austrian RIS (Rechtsinformationssystem) based on the given `law_id`.
|
|
///
|
|
/// This function iterates over all pages of the law text, with each page containing a maximum of 100
|
|
/// paragraphs in XML format. It extracts and returns the links to each paragraph as XML URLs.
|
|
///
|
|
/// The first section of the first page is skipped (`skip` is set to true) because it always
|
|
/// contains the table of contents. The function continues processing subsequent pages until there
|
|
/// are no more pages left to fetch.
|
|
///
|
|
/// # Parameters
|
|
///
|
|
/// - `law_id`: The unique identifier of the law in the RIS system.
|
|
///
|
|
/// # Returns
|
|
///
|
|
/// - `Ok(Vec<String>)`: A vector of XML file links representing paragraphs from the given law text.
|
|
/// - `Err(Error)`: An error if there was an issue fetching or parsing the law text.
|
|
///
|
|
/// # Example
|
|
/// ```
|
|
/// use risp::overview::parse;
|
|
///
|
|
/// let list_with_xml_links_to_paragraphs = parse(10001905).unwrap();
|
|
/// assert_eq!(list_with_xml_links_to_paragraphs.len(), 31); // TEG has 31 paragraphs
|
|
/// assert_eq!(list_with_xml_links_to_paragraphs[0], "https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12025190/NOR12025190.xml"); // Link to first paragraph
|
|
/// ```
|
|
pub fn parse(law_id: usize) -> Result<Vec<String>, Error> {
|
|
let mut page = 1;
|
|
let mut skip = true;
|
|
let mut ret = Vec::new();
|
|
loop {
|
|
info!("=== Fetching overview page #{page} ===");
|
|
let json = fetch_page(law_id, page)?;
|
|
let (cont, nodes) = parse_from_str(&json, skip)?;
|
|
for n in nodes {
|
|
ret.push(n.clone());
|
|
}
|
|
if !cont {
|
|
break;
|
|
}
|
|
skip = false;
|
|
page += 1;
|
|
}
|
|
|
|
Ok(ret)
|
|
}
|
|
|
|
#[derive(Deserialize)]
|
|
#[serde(rename_all = "PascalCase")]
|
|
struct Overview {
|
|
ogd_search_result: OgdSearchResult,
|
|
}
|
|
|
|
fn parse_from_str(content: &str, skip_first: bool) -> Result<(bool, Vec<String>), Error> {
|
|
let mut ret = Vec::new();
|
|
let wrapper: Overview = serde_json::from_str(content)?;
|
|
|
|
let iter = wrapper.ogd_search_result.get_par().into_iter();
|
|
let boxed_iter: Box<dyn Iterator<Item = String>> = if skip_first {
|
|
Box::new(iter.skip(1))
|
|
} else {
|
|
Box::new(iter)
|
|
};
|
|
for par in boxed_iter {
|
|
ret.push(par);
|
|
}
|
|
|
|
if !wrapper.ogd_search_result.has_next_page() {
|
|
return Ok((false, ret));
|
|
}
|
|
Ok((true, ret))
|
|
}
|
|
|
|
fn fetch_page(overview_id: usize, page: usize) -> Result<String, Error> {
|
|
use std::fs;
|
|
|
|
let expected_filename = format!("{}law-{overview_id}-{page}", get_cache_dir()?);
|
|
|
|
match fs::read_to_string(&expected_filename) {
|
|
Ok(data) => Ok(data),
|
|
Err(_) => {
|
|
info!("Not finding law_id {overview_id} (page {page}) in the cache, downloading...");
|
|
let data = ureq::post("https://data.bka.gv.at/ris/api/v2.6/Bundesrecht")
|
|
.send_form(&[
|
|
("Applikation", "BrKons"),
|
|
("Gesetzesnummer", &format!("{overview_id}")),
|
|
("DokumenteProSeite", "OneHundred"),
|
|
("Seitennummer", &format!("{page}")),
|
|
("Fassung.FassungVom", ¤t_date()),
|
|
])?
|
|
.into_string()?;
|
|
let path = Path::new(&expected_filename);
|
|
if let Some(parent) = path.parent() {
|
|
// Try to create the directory (and any necessary parent directories)
|
|
fs::create_dir_all(parent).expect("Unable to create directory");
|
|
}
|
|
fs::write(expected_filename, &data).expect("Unable to write file");
|
|
Ok(data)
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use crate::{config::Config, overview::parse};
    use pretty_assertions::assert_eq;
    use std::fs;

    /// Runs `parse` for every config file in `./data/configs` and compares the result with the
    /// stored expectation in `./data/expected/overview/{law_id}`.
    ///
    /// If no expectation file can be read, the current result is written there instead.
    #[test]
    fn overview() {
        let configs = fs::read_dir("./data/configs").expect("No folder with config files");

        for entry in configs {
            let config_path = entry.unwrap().path().display().to_string();
            let (law_id, _, _) = Config::load(&config_path).unwrap();

            let actual = parse(law_id).unwrap();
            let expected_path = format!("./data/expected/overview/{law_id}");

            if let Ok(contents) = fs::read_to_string(&expected_path) {
                // One link per line; surrounding whitespace of the file is ignored.
                let expected: Vec<&str> = contents.trim().split('\n').collect();

                assert_eq!(actual, expected);
            } else {
                // No readable expectation yet: record the current output as the baseline.
                let to_write = actual.join("\n");
                fs::write(expected_path, to_write).expect("Unable to write file");
            }
        }
    }
}
|