This commit is contained in:
parent
2f077a447c
commit
cb55a074d7
102
Cargo.lock
generated
102
Cargo.lock
generated
@ -336,6 +336,15 @@ version = "0.4.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f"
|
||||
|
||||
[[package]]
|
||||
name = "matchers"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
|
||||
dependencies = [
|
||||
"regex-automata 0.1.10",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.1"
|
||||
@ -363,6 +372,16 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nu-ansi-term"
|
||||
version = "0.46.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
|
||||
dependencies = [
|
||||
"overload",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
@ -381,6 +400,12 @@ version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d"
|
||||
|
||||
[[package]]
|
||||
name = "overload"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.1"
|
||||
@ -478,8 +503,17 @@ checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-automata",
|
||||
"regex-syntax",
|
||||
"regex-automata 0.4.5",
|
||||
"regex-syntax 0.8.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-automata"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
|
||||
dependencies = [
|
||||
"regex-syntax 0.6.29",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -490,9 +524,15 @@ checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"memchr",
|
||||
"regex-syntax",
|
||||
"regex-syntax 0.8.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.6.29"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
||||
|
||||
[[package]]
|
||||
name = "regex-syntax"
|
||||
version = "0.8.2"
|
||||
@ -529,6 +569,7 @@ dependencies = [
|
||||
"toml",
|
||||
"tqdm",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"ureq",
|
||||
]
|
||||
|
||||
@ -621,6 +662,15 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "signal-hook"
|
||||
version = "0.3.17"
|
||||
@ -706,6 +756,16 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thread_local"
|
||||
version = "1.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.34"
|
||||
@ -826,6 +886,36 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"valuable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-log"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
|
||||
dependencies = [
|
||||
"log",
|
||||
"once_cell",
|
||||
"tracing-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tracing-subscriber"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b"
|
||||
dependencies = [
|
||||
"matchers",
|
||||
"nu-ansi-term",
|
||||
"once_cell",
|
||||
"regex",
|
||||
"sharded-slab",
|
||||
"smallvec",
|
||||
"thread_local",
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"tracing-log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -889,6 +979,12 @@ version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"
|
||||
|
||||
[[package]]
|
||||
name = "valuable"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
|
@ -17,6 +17,7 @@ clap = { version = "4.5.0", features = ["derive"] }
|
||||
directories = "5.0"
|
||||
regex = "1.10"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
||||
|
||||
[dev-dependencies]
|
||||
pretty_assertions = "1.4"
|
||||
|
@ -24,6 +24,7 @@ use serde::Deserialize;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use tracing::{event, instrument, Level};
|
||||
|
||||
// TODO: more generic
|
||||
fn create_classifier(match_function: &str) -> Result<ClassifierApplicable, Error> {
|
||||
@ -55,6 +56,7 @@ pub struct Config {
|
||||
}
|
||||
|
||||
impl Config {
|
||||
#[instrument(level = "trace", skip(path))]
|
||||
/// Loads a configuration from a specified path and constructs a `LawBuilder` and `Parser` based on it.
|
||||
///
|
||||
/// This function reads a configuration file from the given path, expecting it to be in TOML format. It then
|
||||
@ -92,12 +94,14 @@ impl Config {
|
||||
///
|
||||
/// assert_eq!(law_id, 10001622);
|
||||
/// ```
|
||||
pub fn load<P: AsRef<Path>>(path: P) -> Result<(usize, law::Builder, Parser), Error> {
|
||||
pub fn load<P: AsRef<Path> + std::fmt::Debug>(
|
||||
path: P,
|
||||
) -> Result<(usize, law::Builder, Parser), Error> {
|
||||
let config_str = fs::read_to_string(path)?;
|
||||
let config: Config = toml::from_str(&config_str)?;
|
||||
|
||||
let mut builder = law::Builder::new(config.law.name);
|
||||
for classifier in config.law.classifiers {
|
||||
for classifier in &config.law.classifiers {
|
||||
let to_add = law::Classifier::new(
|
||||
&classifier.name,
|
||||
create_classifier(&classifier.match_function)?,
|
||||
@ -108,18 +112,37 @@ impl Config {
|
||||
builder.add_classifier(to_add);
|
||||
}
|
||||
}
|
||||
event!(
|
||||
Level::INFO,
|
||||
"Added {} classifiers from config",
|
||||
&config.law.classifiers.len()
|
||||
);
|
||||
|
||||
let mut parser = Parser::new();
|
||||
|
||||
for to_remove in config.parser.remove_strings {
|
||||
parser.add_string_to_remove(&to_remove);
|
||||
for to_remove in &config.parser.remove_strings {
|
||||
parser.add_string_to_remove(to_remove);
|
||||
}
|
||||
event!(
|
||||
Level::INFO,
|
||||
"Added {} strings to remove",
|
||||
&config.parser.remove_strings.len()
|
||||
);
|
||||
|
||||
for to_replace in config.parser.replace_rules {
|
||||
for to_replace in &config.parser.replace_rules {
|
||||
parser.add_string_to_replace(&to_replace.find, &to_replace.replace_with);
|
||||
}
|
||||
event!(
|
||||
Level::INFO,
|
||||
"Added {} strings to replace",
|
||||
&config.parser.replace_rules.len()
|
||||
);
|
||||
|
||||
if config.parser.move_para_headers_into_content {
|
||||
event!(
|
||||
Level::WARN,
|
||||
"Move para headers into content. Make sure you know what you do!"
|
||||
);
|
||||
parser.move_para_headers_into_content();
|
||||
}
|
||||
Ok((config.law.id, builder, parser))
|
||||
|
@ -24,6 +24,7 @@ use std::{
|
||||
rc::Rc,
|
||||
sync::Arc,
|
||||
};
|
||||
use tracing::instrument;
|
||||
|
||||
use crate::{config::Config, misc::Error, overview::parse};
|
||||
|
||||
@ -37,6 +38,7 @@ pub struct Law {
|
||||
}
|
||||
|
||||
impl Law {
|
||||
#[instrument]
|
||||
/// Creates a `Law` instance from a configuration file.
|
||||
///
|
||||
/// This function initializes the law processing pipeline by loading configurations from the
|
||||
@ -71,7 +73,9 @@ impl Law {
|
||||
///
|
||||
/// let law = Law::from_config("./data/configs/abgb.toml").unwrap();
|
||||
/// ```
|
||||
pub fn from_config<P: AsRef<Path>>(path: P) -> Result<Law, Error> {
|
||||
pub fn from_config<P: AsRef<Path> + tracing::Value + std::fmt::Debug>(
|
||||
path: P,
|
||||
) -> Result<Law, Error> {
|
||||
let (law_id, mut builder, parser) = Config::load(path)?;
|
||||
let pars = parse(law_id)?;
|
||||
|
||||
|
@ -22,6 +22,8 @@ use risp::{
|
||||
law::{responsible::always_true, Classifier, Law},
|
||||
Config,
|
||||
};
|
||||
use tracing_subscriber::filter::EnvFilter;
|
||||
use tracing_subscriber::fmt;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(version, about, long_about = None)]
|
||||
@ -39,8 +41,12 @@ struct Args {
|
||||
#[arg(long)]
|
||||
clear_cache: bool,
|
||||
}
|
||||
|
||||
use tracing_subscriber::prelude::*;
|
||||
fn main() {
|
||||
tracing_subscriber::registry()
|
||||
.with(fmt::layer())
|
||||
.with(EnvFilter::from_default_env())
|
||||
.init();
|
||||
let args = Args::parse();
|
||||
|
||||
if args.clear_cache {
|
||||
|
@ -21,11 +21,13 @@ mod ris_structure;
|
||||
use std::path::Path;
|
||||
|
||||
use serde::Deserialize;
|
||||
use tracing::{event, instrument, Level};
|
||||
|
||||
use crate::misc::{current_date, get_cache_dir, Error};
|
||||
|
||||
use ris_structure::OgdSearchResult;
|
||||
|
||||
#[instrument(level = "trace")]
|
||||
/// Parses a law text from the Austrian RIS (Rechtsinformationssystem) based on the given `law_id`.
|
||||
///
|
||||
/// This function iterates over all pages of the law text, with each page containing a maximum of 100
|
||||
@ -67,6 +69,7 @@ pub fn parse(law_id: usize) -> Result<Vec<String>, Error> {
|
||||
let mut ret = Vec::new();
|
||||
loop {
|
||||
//info!("=== Fetching overview page #{page} ===");
|
||||
event!(Level::INFO, "Fetching over page #{page}");
|
||||
let json = fetch_page(law_id, page)?;
|
||||
let (cont, nodes) = parse_from_str(&json, skip)?;
|
||||
for n in nodes {
|
||||
@ -113,9 +116,16 @@ fn fetch_page(overview_id: usize, page: usize) -> Result<String, Error> {
|
||||
|
||||
let expected_filename = format!("{}law-{overview_id}-{page}", get_cache_dir()?);
|
||||
if let Ok(data) = fs::read_to_string(&expected_filename) {
|
||||
event!(
|
||||
Level::DEBUG,
|
||||
"Using cached version of law_id {overview_id} (page {page})"
|
||||
);
|
||||
Ok(data)
|
||||
} else {
|
||||
//info!("Not finding law_id {overview_id} (page {page}) in the cache, downloading...");
|
||||
event!(
|
||||
Level::INFO,
|
||||
"Not finding law_id {overview_id} (page {page}) in the cache, downloading..."
|
||||
);
|
||||
let data = ureq::post("https://data.bka.gv.at/ris/api/v2.6/Bundesrecht")
|
||||
.send_form(&[
|
||||
("Applikation", "BrKons"),
|
||||
|
@ -76,7 +76,7 @@ impl<'a> Expect<'a> {
|
||||
fn empty(next: Option<Node<'_, '_>>) {
|
||||
if let Some(n) = next {
|
||||
let expect = Expect::from(&n);
|
||||
assert!(false, "Expected no more elements, got {expect}");
|
||||
panic!("Expected no more elements, got {expect}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user