risp/src/overview/mod.rs

// Copyright (C) 2024 Philipp Hofer
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
//
// You should have received a copy of the European Union Public License along
// with this program.  If not, you may obtain a copy of the Licence at:
// <https://joinup.ec.europa.eu/software/page/eupl>
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the Licence is distributed on an "AS IS" basis,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the Licence for the specific language governing permissions and
// limitations under the Licence.

//! Deals with getting all paragraphs for a given law text

mod ris_structure;

use std::path::Path;

use log::info;
use serde::Deserialize;

use crate::misc::{current_date, get_cache_dir, Error};

use ris_structure::OgdSearchResult;

/// Collects the XML links of all paragraphs of a law text from the Austrian RIS
/// (Rechtsinformationssystem), identified by `law_id`.
///
/// The overview of a law is paginated; every page is requested with at most 100 paragraphs.
/// This function walks through the pages one after another and gathers the link to the XML
/// file of every paragraph, in the order in which the pages list them.
///
/// The first entry of the first page is skipped because it always contains the table of
/// contents. Fetching stops as soon as a page reports that no further page follows.
///
/// # Parameters
///
/// - `law_id`: The unique identifier of the law in the RIS system.
///
/// # Returns
///
/// - `Ok(Vec<String>)`: A vector of XML file links representing paragraphs from the given law text.
/// - `Err(Error)`: An error if there was an issue fetching or parsing the law text.
///
/// # Errors
///
/// The first error raised while fetching a page or parsing its JSON is returned unchanged; links
/// collected from earlier pages are discarded in that case.
///
/// # Example
/// ```
/// use risp::overview::parse;
///
/// let list_with_xml_links_to_paragraphs = parse(10001905).unwrap();
/// assert_eq!(list_with_xml_links_to_paragraphs.len(), 31); // TEG has 31 paragraphs
/// assert_eq!(list_with_xml_links_to_paragraphs[0], "https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12025190/NOR12025190.xml"); // Link to first paragraph
/// ```
pub fn parse(law_id: usize) -> Result<Vec<String>, Error> {
    let mut ret = Vec::new();

    // Pages are numbered starting at 1.
    for page in 1.. {
        info!("=== Fetching overview page #{page} ===");
        let json = fetch_page(law_id, page)?;

        // Only the very first page starts with the table of contents, so only there the
        // leading entry has to be dropped.
        let (has_next_page, links) = parse_from_str(&json, page == 1)?;

        // The links are owned already: move them instead of cloning each one.
        ret.extend(links);

        if !has_next_page {
            break;
        }
    }

    Ok(ret)
}

/// Top-level object of the JSON document describing one overview page.
///
/// Because of `rename_all = "PascalCase"`, the field below is read from the JSON key
/// `OgdSearchResult`.
#[derive(Deserialize)]
#[serde(rename_all = "PascalCase")]
struct Overview {
    /// The search result of the page; queried for the paragraph links (`get_par`) and for
    /// whether another page follows (`has_next_page`).
    ogd_search_result: OgdSearchResult,
}

/// Extracts the paragraph links from the JSON text of a single overview page.
///
/// # Parameters
///
/// - `content`: JSON text of one overview page; it is deserialized into an `Overview`.
/// - `skip_first`: If `true`, the first entry of the page is dropped from the result.
///
/// # Returns
///
/// A tuple `(has_next_page, links)`: `has_next_page` tells whether the search result reports a
/// following page, `links` holds the entries returned by `get_par()` (minus the skipped one).
///
/// # Errors
///
/// Returns an error if `content` cannot be deserialized.
fn parse_from_str(content: &str, skip_first: bool) -> Result<(bool, Vec<String>), Error> {
    let wrapper: Overview = serde_json::from_str(content)?;

    // `usize::from(bool)` yields 0 or 1, so the same iterator chain serves both cases and no
    // boxed trait object is needed to unify two different iterator types.
    let ret: Vec<String> = wrapper
        .ogd_search_result
        .get_par()
        .into_iter()
        .skip(usize::from(skip_first))
        .collect();

    Ok((wrapper.ogd_search_result.has_next_page(), ret))
}

/// Returns the raw JSON text of one overview page, using the on-disk cache when possible.
///
/// The cache file is named `law-<overview_id>-<page>` and is located under the prefix returned
/// by `get_cache_dir()`. If that file can be read, its content is returned without any network
/// access. Otherwise the page is requested from the RIS API, written to the cache file and
/// then returned.
///
/// # Parameters
///
/// - `overview_id`: Sent to the API as `Gesetzesnummer`.
/// - `page`: Sent to the API as `Seitennummer`.
///
/// # Errors
///
/// Returns an error if the cache directory cannot be determined, if the request fails, if the
/// response body cannot be read, or if the downloaded page cannot be written to the cache
/// (creating the directory or writing the file).
fn fetch_page(overview_id: usize, page: usize) -> Result<String, Error> {
    use std::fs;

    let expected_filename = format!("{}law-{overview_id}-{page}", get_cache_dir()?);

    // Any read failure (not only a missing file) is treated as a cache miss.
    if let Ok(data) = fs::read_to_string(&expected_filename) {
        return Ok(data);
    }

    info!("Not finding law_id {overview_id} (page {page}) in the cache, downloading...");
    let data = ureq::post("https://data.bka.gv.at/ris/api/v2.6/Bundesrecht")
        .send_form(&[
            ("Applikation", "BrKons"),
            ("Gesetzesnummer", &overview_id.to_string()),
            ("DokumenteProSeite", "OneHundred"),
            ("Seitennummer", &page.to_string()),
            ("Fassung.FassungVom", &current_date()),
        ])?
        .into_string()?;

    // Create the directory (and any necessary parent directories) before writing. I/O failures
    // are reported to the caller instead of aborting the whole process with a panic.
    if let Some(parent) = Path::new(&expected_filename).parent() {
        fs::create_dir_all(parent)?;
    }
    fs::write(&expected_filename, &data)?;

    Ok(data)
}

#[cfg(test)]
mod tests {
    use crate::{config::Config, overview::parse};
    use pretty_assertions::assert_eq;
    use std::fs;

    /// Snapshot test over every config file in `./data/configs`.
    ///
    /// For each config the law is parsed and the resulting links are compared line by line
    /// with `./data/expected/overview/<law_id>`. If that file cannot be read, it is created
    /// from the current result instead of failing.
    #[test]
    fn overview() {
        let entries = fs::read_dir("./data/configs").expect("No folder with config files");

        for entry in entries {
            let config_path = entry.unwrap().path().display().to_string();
            let (law_id, _, _) = Config::load(&config_path).unwrap();

            let links = parse(law_id).unwrap();
            let snapshot_path = format!("./data/expected/overview/{law_id}");

            if let Ok(snapshot) = fs::read_to_string(&snapshot_path) {
                // One link per line; surrounding whitespace of the file is ignored.
                let wanted: Vec<&str> = snapshot.trim().split('\n').collect();

                assert_eq!(links, wanted);
            } else {
                // No readable snapshot yet: record the current result.
                fs::write(snapshot_path, links.join("\n")).expect("Unable to write file");
            }
        }
    }
}