This commit is contained in:
parent
2f077a447c
commit
cb55a074d7
102
Cargo.lock
generated
102
Cargo.lock
generated
@ -336,6 +336,15 @@ version = "0.4.20"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "matchers"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
|
||||||
|
dependencies = [
|
||||||
|
"regex-automata 0.1.10",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "memchr"
|
name = "memchr"
|
||||||
version = "2.7.1"
|
version = "2.7.1"
|
||||||
@ -363,6 +372,16 @@ dependencies = [
|
|||||||
"windows-sys 0.48.0",
|
"windows-sys 0.48.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "nu-ansi-term"
|
||||||
|
version = "0.46.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||||
|
dependencies = [
|
||||||
|
"overload",
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "num-conv"
|
name = "num-conv"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
@ -381,6 +400,12 @@ version = "0.2.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "overload"
|
||||||
|
version = "0.1.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking_lot"
|
name = "parking_lot"
|
||||||
version = "0.12.1"
|
version = "0.12.1"
|
||||||
@ -478,8 +503,17 @@ checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"memchr",
|
"memchr",
|
||||||
"regex-automata",
|
"regex-automata 0.4.5",
|
||||||
"regex-syntax",
|
"regex-syntax 0.8.2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-automata"
|
||||||
|
version = "0.1.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||||
|
dependencies = [
|
||||||
|
"regex-syntax 0.6.29",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -490,9 +524,15 @@ checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"memchr",
|
"memchr",
|
||||||
"regex-syntax",
|
"regex-syntax 0.8.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.6.29"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "regex-syntax"
|
name = "regex-syntax"
|
||||||
version = "0.8.2"
|
version = "0.8.2"
|
||||||
@ -529,6 +569,7 @@ dependencies = [
|
|||||||
"toml",
|
"toml",
|
||||||
"tqdm",
|
"tqdm",
|
||||||
"tracing",
|
"tracing",
|
||||||
|
"tracing-subscriber",
|
||||||
"ureq",
|
"ureq",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -621,6 +662,15 @@ dependencies = [
|
|||||||
"serde",
|
"serde",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "sharded-slab"
|
||||||
|
version = "0.1.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
||||||
|
dependencies = [
|
||||||
|
"lazy_static",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "signal-hook"
|
name = "signal-hook"
|
||||||
version = "0.3.17"
|
version = "0.3.17"
|
||||||
@ -706,6 +756,16 @@ dependencies = [
|
|||||||
"syn",
|
"syn",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "thread_local"
|
||||||
|
version = "1.1.8"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"once_cell",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "time"
|
name = "time"
|
||||||
version = "0.3.34"
|
version = "0.3.34"
|
||||||
@ -826,6 +886,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"once_cell",
|
"once_cell",
|
||||||
|
"valuable",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tracing-log"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
||||||
|
dependencies = [
|
||||||
|
"log",
|
||||||
|
"once_cell",
|
||||||
|
"tracing-core",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tracing-subscriber"
|
||||||
|
version = "0.3.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
||||||
|
dependencies = [
|
||||||
|
"matchers",
|
||||||
|
"nu-ansi-term",
|
||||||
|
"once_cell",
|
||||||
|
"regex",
|
||||||
|
"sharded-slab",
|
||||||
|
"smallvec",
|
||||||
|
"thread_local",
|
||||||
|
"tracing",
|
||||||
|
"tracing-core",
|
||||||
|
"tracing-log",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -889,6 +979,12 @@ version = "0.2.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "valuable"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wasi"
|
name = "wasi"
|
||||||
version = "0.11.0+wasi-snapshot-preview1"
|
version = "0.11.0+wasi-snapshot-preview1"
|
||||||
|
@ -17,6 +17,7 @@ clap = { version = "4.5.0", features = ["derive"] }
|
|||||||
directories = "5.0"
|
directories = "5.0"
|
||||||
regex = "1.10"
|
regex = "1.10"
|
||||||
tracing = "0.1"
|
tracing = "0.1"
|
||||||
|
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
pretty_assertions = "1.4"
|
pretty_assertions = "1.4"
|
||||||
|
@ -24,6 +24,7 @@ use serde::Deserialize;
|
|||||||
use std::fs;
|
use std::fs;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use tracing::{event, instrument, Level};
|
||||||
|
|
||||||
// TODO: more generic
|
// TODO: more generic
|
||||||
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
|
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
|
||||||
@ -55,6 +56,7 @@ pub struct Config {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Config {
|
impl Config {
|
||||||
|
#[instrument(level = "trace", skip(path))]
|
||||||
/// Loads a configuration from a specified path and constructs a `LawBuilder` and `Parser` based on it.
|
/// Loads a configuration from a specified path and constructs a `LawBuilder` and `Parser` based on it.
|
||||||
///
|
///
|
||||||
/// This function reads a configuration file from the given path, expecting it to be in TOML format. It then
|
/// This function reads a configuration file from the given path, expecting it to be in TOML format. It then
|
||||||
@ -92,12 +94,14 @@ impl Config {
|
|||||||
///
|
///
|
||||||
/// assert_eq!(law_id, 10001622);
|
/// assert_eq!(law_id, 10001622);
|
||||||
/// ```
|
/// ```
|
||||||
pub fn load<P: AsRef<Path>>(path: P) -> Result<(usize, law::Builder, Parser), Error> {
|
pub fn load<P: AsRef<Path> + std::fmt::Debug>(
|
||||||
|
path: P,
|
||||||
|
) -> Result<(usize, law::Builder, Parser), Error> {
|
||||||
let config_str = fs::read_to_string(path)?;
|
let config_str = fs::read_to_string(path)?;
|
||||||
let config: Config = toml::from_str(&config_str)?;
|
let config: Config = toml::from_str(&config_str)?;
|
||||||
|
|
||||||
let mut builder = law::Builder::new(config.law.name);
|
let mut builder = law::Builder::new(config.law.name);
|
||||||
for classifier in config.law.classifiers {
|
for classifier in &config.law.classifiers {
|
||||||
let to_add = law::Classifier::new(
|
let to_add = law::Classifier::new(
|
||||||
&classifier.name,
|
&classifier.name,
|
||||||
create_classifier(&classifier.match_function)?,
|
create_classifier(&classifier.match_function)?,
|
||||||
@ -108,18 +112,37 @@ impl Config {
|
|||||||
builder.add_classifier(to_add);
|
builder.add_classifier(to_add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
event!(
|
||||||
|
Level::INFO,
|
||||||
|
"Added {} classifiers from config",
|
||||||
|
&config.law.classifiers.len()
|
||||||
|
);
|
||||||
|
|
||||||
let mut parser = Parser::new();
|
let mut parser = Parser::new();
|
||||||
|
|
||||||
for to_remove in config.parser.remove_strings {
|
for to_remove in &config.parser.remove_strings {
|
||||||
parser.add_string_to_remove(&to_remove);
|
parser.add_string_to_remove(to_remove);
|
||||||
}
|
}
|
||||||
|
event!(
|
||||||
|
Level::INFO,
|
||||||
|
"Added {} strings to remove",
|
||||||
|
&config.parser.remove_strings.len()
|
||||||
|
);
|
||||||
|
|
||||||
for to_replace in config.parser.replace_rules {
|
for to_replace in &config.parser.replace_rules {
|
||||||
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
||||||
}
|
}
|
||||||
|
event!(
|
||||||
|
Level::INFO,
|
||||||
|
"Added {} strings to replace",
|
||||||
|
&config.parser.replace_rules.len()
|
||||||
|
);
|
||||||
|
|
||||||
if config.parser.move_para_headers_into_content {
|
if config.parser.move_para_headers_into_content {
|
||||||
|
event!(
|
||||||
|
Level::WARN,
|
||||||
|
"Move para headers into content. Make sure you know what you do!"
|
||||||
|
);
|
||||||
parser.move_para_headers_into_content();
|
parser.move_para_headers_into_content();
|
||||||
}
|
}
|
||||||
Ok((config.law.id, builder, parser))
|
Ok((config.law.id, builder, parser))
|
||||||
|
@ -24,6 +24,7 @@ use std::{
|
|||||||
rc::Rc,
|
rc::Rc,
|
||||||
sync::Arc,
|
sync::Arc,
|
||||||
};
|
};
|
||||||
|
use tracing::instrument;
|
||||||
|
|
||||||
use crate::{config::Config, misc::Error, overview::parse};
|
use crate::{config::Config, misc::Error, overview::parse};
|
||||||
|
|
||||||
@ -37,6 +38,7 @@ pub struct Law {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Law {
|
impl Law {
|
||||||
|
#[instrument]
|
||||||
/// Creates a `Law` instance from a configuration file.
|
/// Creates a `Law` instance from a configuration file.
|
||||||
///
|
///
|
||||||
/// This function initializes the law processing pipeline by loading configurations from the
|
/// This function initializes the law processing pipeline by loading configurations from the
|
||||||
@ -71,7 +73,9 @@ impl Law {
|
|||||||
///
|
///
|
||||||
/// let law = Law::from_config("./data/configs/abgb.toml").unwrap();
|
/// let law = Law::from_config("./data/configs/abgb.toml").unwrap();
|
||||||
/// ```
|
/// ```
|
||||||
pub fn from_config<P: AsRef<Path>>(path: P) -> Result<Law, Error> {
|
pub fn from_config<P: AsRef<Path> + tracing::Value + std::fmt::Debug>(
|
||||||
|
path: P,
|
||||||
|
) -> Result<Law, Error> {
|
||||||
let (law_id, mut builder, parser) = Config::load(path)?;
|
let (law_id, mut builder, parser) = Config::load(path)?;
|
||||||
let pars = parse(law_id)?;
|
let pars = parse(law_id)?;
|
||||||
|
|
||||||
|
@ -22,6 +22,8 @@ use risp::{
|
|||||||
law::{responsible::always_true, Classifier, Law},
|
law::{responsible::always_true, Classifier, Law},
|
||||||
Config,
|
Config,
|
||||||
};
|
};
|
||||||
|
use tracing_subscriber::filter::EnvFilter;
|
||||||
|
use tracing_subscriber::fmt;
|
||||||
|
|
||||||
#[derive(Parser, Debug)]
|
#[derive(Parser, Debug)]
|
||||||
#[command(version, about, long_about = None)]
|
#[command(version, about, long_about = None)]
|
||||||
@ -39,8 +41,12 @@ struct Args {
|
|||||||
#[arg(long)]
|
#[arg(long)]
|
||||||
clear_cache: bool,
|
clear_cache: bool,
|
||||||
}
|
}
|
||||||
|
use tracing_subscriber::prelude::*;
|
||||||
fn main() {
|
fn main() {
|
||||||
|
tracing_subscriber::registry()
|
||||||
|
.with(fmt::layer())
|
||||||
|
.with(EnvFilter::from_default_env())
|
||||||
|
.init();
|
||||||
let args = Args::parse();
|
let args = Args::parse();
|
||||||
|
|
||||||
if args.clear_cache {
|
if args.clear_cache {
|
||||||
|
@ -21,11 +21,13 @@ mod ris_structure;
|
|||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
use tracing::{event, instrument, Level};
|
||||||
|
|
||||||
use crate::misc::{current_date, get_cache_dir, Error};
|
use crate::misc::{current_date, get_cache_dir, Error};
|
||||||
|
|
||||||
use ris_structure::OgdSearchResult;
|
use ris_structure::OgdSearchResult;
|
||||||
|
|
||||||
|
#[instrument(level = "trace")]
|
||||||
/// Parses a law text from the Austrian RIS (Rechtsinformationssystem) based on the given `law_id`.
|
/// Parses a law text from the Austrian RIS (Rechtsinformationssystem) based on the given `law_id`.
|
||||||
///
|
///
|
||||||
/// This function iterates over all pages of the law text, with each page containing a maximum of 100
|
/// This function iterates over all pages of the law text, with each page containing a maximum of 100
|
||||||
@ -67,6 +69,7 @@ pub fn parse(law_id: usize) -> Result<Vec<String>, Error> {
|
|||||||
let mut ret = Vec::new();
|
let mut ret = Vec::new();
|
||||||
loop {
|
loop {
|
||||||
//info!("=== Fetching overview page #{page} ===");
|
//info!("=== Fetching overview page #{page} ===");
|
||||||
|
event!(Level::INFO, "Fetching over page #{page}");
|
||||||
let json = fetch_page(law_id, page)?;
|
let json = fetch_page(law_id, page)?;
|
||||||
let (cont, nodes) = parse_from_str(&json, skip)?;
|
let (cont, nodes) = parse_from_str(&json, skip)?;
|
||||||
for n in nodes {
|
for n in nodes {
|
||||||
@ -113,9 +116,16 @@ fn fetch_page(overview_id: usize, page: usize) -> Result<String, Error> {
|
|||||||
|
|
||||||
let expected_filename = format!("{}law-{overview_id}-{page}", get_cache_dir()?);
|
let expected_filename = format!("{}law-{overview_id}-{page}", get_cache_dir()?);
|
||||||
if let Ok(data) = fs::read_to_string(&expected_filename) {
|
if let Ok(data) = fs::read_to_string(&expected_filename) {
|
||||||
|
event!(
|
||||||
|
Level::DEBUG,
|
||||||
|
"Using cached version of law_id {overview_id} (page {page})"
|
||||||
|
);
|
||||||
Ok(data)
|
Ok(data)
|
||||||
} else {
|
} else {
|
||||||
//info!("Not finding law_id {overview_id} (page {page}) in the cache, downloading...");
|
event!(
|
||||||
|
Level::INFO,
|
||||||
|
"Not finding law_id {overview_id} (page {page}) in the cache, downloading..."
|
||||||
|
);
|
||||||
let data = ureq::post("https://data.bka.gv.at/ris/api/v2.6/Bundesrecht")
|
let data = ureq::post("https://data.bka.gv.at/ris/api/v2.6/Bundesrecht")
|
||||||
.send_form(&[
|
.send_form(&[
|
||||||
("Applikation", "BrKons"),
|
("Applikation", "BrKons"),
|
||||||
|
@ -76,7 +76,7 @@ impl<'a> Expect<'a> {
|
|||||||
fn empty(next: Option<Node<'_, '_>>) {
|
fn empty(next: Option<Node<'_, '_>>) {
|
||||||
if let Some(n) = next {
|
if let Some(n) = next {
|
||||||
let expect = Expect::from(&n);
|
let expect = Expect::from(&n);
|
||||||
assert!(false, "Expected no more elements, got {expect}");
|
panic!("Expected no more elements, got {expect}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user