add paragraph parser to lib, add test for teg
Some checks failed
CI/CD Pipeline / test (push) Failing after 32s

This commit is contained in:
2024-02-05 14:28:57 +01:00
parent 727916cb3f
commit 350b1e5ebb
57 changed files with 403 additions and 450 deletions

View File

@ -1,5 +1,4 @@
use log::{debug, info};
use risp::risparser::overview::parse;
use serde::{Deserialize, Serialize};
use std::{
cell::RefCell,
@ -8,23 +7,18 @@ use std::{
sync::Arc,
};
use crate::par;
use self::responsible::{
contains, contains_at_start, contains_without_unter, starts_with_letter, starts_with_number,
starts_with_roman_number, starts_with_uppercaseletter,
};
mod responsible;
pub mod responsible;
/// That's our struct, holding all the information of the law text.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
pub(crate) struct Law {
name: String, //ABGB, UrhG
header: Vec<Heading>,
/// A complete law text: its short name plus the outermost headings it is organised into.
pub struct Law {
/// Short name (abbreviation) of the law.
pub name: String, //ABGB, UrhG
/// Outermost headings of the law; each heading carries its own content.
pub header: Vec<Heading>,
}
impl Law {
pub(crate) fn to_md(&self) {
//TODO: add test
pub fn to_md(&self) {
println!("# {}", self.name);
for header in &self.header {
@ -69,10 +63,10 @@ impl From<LawBuilder> for Law {
}
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct Heading {
name: String, //1. Hauptstück; 3. Theil; ...
desc: Option<String>,
content: HeadingContent, // 1. Theil; 1. Subtheil; ...
/// One heading of the law text together with everything filed beneath it.
pub struct Heading {
/// Label of the heading.
pub name: String, //1. Hauptstück; 3. Theil; ...
/// Optional description attached to the heading.
pub desc: Option<String>,
/// What is filed under this heading (see `HeadingContent`).
pub content: HeadingContent, // 1. Theil; 1. Subtheil; ...
}
impl Display for Heading {
@ -86,7 +80,7 @@ impl Display for Heading {
}
#[derive(Debug, Serialize, Deserialize, PartialEq)]
enum HeadingContent {
/// Body of a `Heading`: either a list of sections or a list of nested headings.
pub enum HeadingContent {
/// The heading directly holds sections.
Paragraph(Vec<Section>),
/// The heading holds further headings.
Heading(Vec<Heading>),
}
@ -112,23 +106,23 @@ impl From<ClassifierInstance> for HeadingContent {
/// Is used to generate a law struct. It's organized mainly by classifier.
#[derive(Debug)]
pub(crate) struct LawBuilder {
pub struct LawBuilder {
/// Name of the law
pub(crate) name: String, //ABGB, UrhG
name: String, //ABGB, UrhG
/// Structure of the law text
pub(crate) classifiers: Vec<Classifier>,
classifiers: Vec<Classifier>,
/// Instances
pub(crate) header: Vec<Rc<RefCell<ClassifierInstance>>>,
header: Vec<Rc<RefCell<ClassifierInstance>>>,
last_instance: Option<Rc<RefCell<ClassifierInstance>>>,
/// Stores the header of the next paragraph
pub(crate) next_para_header: Option<String>,
next_para_header: Option<String>,
#[cfg(test)]
pub(crate) history: Vec<String>,
pub history: Vec<String>,
}
impl PartialEq for LawBuilder {
@ -141,115 +135,56 @@ impl PartialEq for LawBuilder {
}
impl LawBuilder {
#[cfg(test)]
pub(crate) fn test(name: &str) -> Self {
let mut classifiers = Vec::new();
//#[cfg(test)]
//pub fn test(name: &str) -> Self {
// let mut classifiers = Vec::new();
if name == "new" {
classifiers.push(Classifier::new("a", Arc::new(&contains)).root());
classifiers.push(Classifier::new("b", Arc::new(&contains)));
classifiers.push(Classifier::new("c", Arc::new(&contains)));
classifiers.push(Classifier::new("d", Arc::new(&contains)));
} else if name == "UrhG" {
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number)));
}
// if name == "new" {
// classifiers.push(Classifier::new("a", Arc::new(&contains)).root());
// classifiers.push(Classifier::new("b", Arc::new(&contains)));
// classifiers.push(Classifier::new("c", Arc::new(&contains)));
// classifiers.push(Classifier::new("d", Arc::new(&contains)));
// } else if name == "UrhG" {
// classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)).root());
// classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
// classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number)));
// }
Self {
name: name.into(),
classifiers,
header: Vec::new(),
next_para_header: None,
last_instance: None,
#[cfg(test)]
history: Vec::new(),
}
}
// Self {
// name: name.into(),
// classifiers,
// header: Vec::new(),
// next_para_header: None,
// last_instance: None,
// #[cfg(test)]
// history: Vec::new(),
// }
//}
/// Creates a new law builder. Adds classifier for known law texts.
pub(crate) fn new(name: &str) -> Self {
let mut classifiers = Vec::new();
let mut law_id = None;
if name == "UrhG" {
law_id = Some(10_001_848);
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number)));
} else if name == "MSchG" {
law_id = Some(10_002_180);
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Number", Arc::new(&starts_with_number)));
} else if name == "ABGB" {
law_id = Some(10_001_622);
classifiers.push(Classifier::new("Einleitung", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Theil", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)));
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
classifiers.push(Classifier::new("Abtheilung", Arc::new(&contains)));
classifiers.push(Classifier::new("heading", Arc::new(&contains_at_start)));
classifiers.push(Classifier::new("letter", Arc::new(&starts_with_letter)));
classifiers.push(Classifier::new("num", Arc::new(&starts_with_number)));
classifiers.push(Classifier::new("rom", Arc::new(&starts_with_roman_number)));
} else if name == "FSG" {
law_id = Some(10_003_898);
classifiers.push(Classifier::new("Artikel", Arc::new(&contains)).root());
classifiers.push(Classifier::new(
"Abschnitt",
Arc::new(&contains_without_unter),
));
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)));
classifiers.push(Classifier::new("Unterabschnitt", Arc::new(&contains)));
classifiers.push(Classifier::new(
"uppercase letter",
Arc::new(&starts_with_uppercaseletter),
));
classifiers.push(Classifier::new("num", Arc::new(&starts_with_number)));
} else if name == "VVG" {
law_id = Some(20_004_425);
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)).root());
} else if name == "KSchG" {
law_id = Some(10_002_462);
classifiers.push(Classifier::new("Hauptstück", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
} else if name == "StGB" {
law_id = Some(10_002_296);
classifiers.push(Classifier::new("Teil", Arc::new(&contains)).root());
classifiers.push(Classifier::new("Abschnitt", Arc::new(&contains)));
}
let mut builder = Self {
pub fn new(name: &str) -> Self {
Self {
name: name.into(),
classifiers,
classifiers: Vec::new(),
header: Vec::new(),
next_para_header: None,
last_instance: None,
#[cfg(test)]
history: Vec::new(),
};
let paragraphs = parse(law_id.unwrap()).unwrap();
for paragraph in tqdm::tqdm(paragraphs.into_iter()) {
let cont = par::parse(&paragraph, &mut builder).unwrap();
if !cont {
break;
}
}
builder
//let paragraphs = overview::parse(law_id.unwrap()).unwrap();
//for paragraph in tqdm::tqdm(paragraphs.into_iter()) {
// let cont = paragraph::parse(&paragraph, &mut builder).unwrap();
// if !cont {
// break;
// }
//}
}
/// Appends `classifier` to the end of this builder's list of classifiers.
pub fn add_classifier(&mut self, classifier: Classifier) {
    let registered = &mut self.classifiers;
    registered.push(classifier);
}
fn responsible_classifier(&self, name: &str) -> Option<&Classifier> {
@ -279,7 +214,7 @@ impl LawBuilder {
}
/// Sets a new header.
pub(crate) fn new_header(&mut self, name: &str) {
pub fn new_header(&mut self, name: &str) {
let name = name.trim();
#[cfg(test)]
self.history.push(format!("New_header: {name}"));
@ -324,7 +259,7 @@ impl LawBuilder {
}
/// Sets a new description for the last classifier.
pub(crate) fn new_desc(&mut self, desc: &str) {
pub fn new_desc(&mut self, desc: &str) {
let desc = desc.trim();
#[cfg(test)]
self.history.push(format!("New desc: {desc}"));
@ -338,7 +273,7 @@ impl LawBuilder {
}
/// Adds a new paragraph.
pub(crate) fn new_par(&mut self, par: String, content: Content) {
pub fn new_par(&mut self, par: String, content: Content) {
#[cfg(test)]
self.history.push(format!(
"New_par: {par};{}",
@ -362,7 +297,7 @@ impl LawBuilder {
}
/// Next paragraph has a header, store its name.
pub(crate) fn new_next_para_header(&mut self, header: &str) {
pub fn new_next_para_header(&mut self, header: &str) {
#[cfg(test)]
self.history.push(format!("New_new_para_header: {header}"));
@ -376,10 +311,10 @@ impl LawBuilder {
}
#[derive(Clone, PartialEq, Serialize, Deserialize)]
pub(crate) struct Section {
pub(crate) symb: String, // §"1", §"2", ...
pub(crate) par_header: Option<String>,
pub(crate) content: Content,
/// A single section of a law: its symbol, an optional header and its content.
pub struct Section {
/// Symbol identifying the section.
pub symb: String, // §"1", §"2", ...
/// Optional header belonging to this section.
pub par_header: Option<String>,
/// The content of the section.
pub content: Content,
}
impl fmt::Debug for Section {
@ -400,12 +335,12 @@ impl fmt::Display for Section {
}
#[derive(Clone, PartialEq)]
pub(crate) struct ClassifierInstance {
pub(crate) name: String, //e.g. 1 Theilstück
pub(crate) desc: Option<String>,
pub(crate) sections: Vec<Section>,
pub(crate) children: Vec<Rc<RefCell<ClassifierInstance>>>,
pub(crate) parent: Option<Rc<RefCell<ClassifierInstance>>>,
/// A node in a tree of named instances: it owns sections, links to child
/// instances and optionally links back to a parent instance.
///
/// NOTE(review): `children` and `parent` are both strong `Rc` handles. If a node
/// is stored in its parent's `children` while also holding that parent in
/// `parent`, the two keep each other alive (reference cycle) — confirm whether
/// `parent` should be a `Weak` instead.
struct ClassifierInstance {
// Label of this instance.
name: String, //e.g. 1 Theilstück
// Optional description of this instance.
desc: Option<String>,
// Sections held directly by this instance.
sections: Vec<Section>,
// Shared, mutable handles to the instances below this one.
children: Vec<Rc<RefCell<ClassifierInstance>>>,
// Shared, mutable handle to the instance above this one, if any.
parent: Option<Rc<RefCell<ClassifierInstance>>>,
}
impl ClassifierInstance {
@ -460,12 +395,12 @@ impl From<&str> for ClassifierInstance {
type ClassifierApplicable = Arc<dyn Fn(&str, &str) -> bool>;
#[derive(Clone)]
pub(crate) struct Classifier {
pub(crate) name: String, // Hauptstück, Theil, Abschnitt, ol
pub struct Classifier {
pub name: String, // Hauptstück, Theil, Abschnitt, ol
pub(crate) used_for_fn: ClassifierApplicable,
pub(crate) instances: Vec<ClassifierInstance>,
pub(crate) child: Vec<Rc<RefCell<Classifier>>>,
pub(crate) root: bool,
instances: Vec<ClassifierInstance>,
pub child: Vec<Rc<RefCell<Classifier>>>,
pub root: bool,
}
impl PartialEq for Classifier {
@ -475,7 +410,7 @@ impl PartialEq for Classifier {
}
impl Classifier {
fn new(name: &str, used_for_fn: ClassifierApplicable) -> Self {
pub fn new(name: &str, used_for_fn: ClassifierApplicable) -> Self {
Self {
name: name.into(),
used_for_fn,
@ -485,7 +420,7 @@ impl Classifier {
}
}
fn root(self) -> Self {
/// Consumes the classifier and returns it with its `root` flag set to `true`.
///
/// All other fields are carried over unchanged, so the call can be chained
/// directly onto a constructor.
pub fn root(mut self) -> Self {
    self.root = true;
    self
}
@ -505,7 +440,7 @@ impl std::fmt::Debug for Classifier {
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub(crate) enum Content {
pub enum Content {
Text(String), //This is my direct law text
Item(Vec<Content>), //(1) This is general law. (2) This is more specific law
List(Vec<Content>),
@ -526,94 +461,94 @@ impl Display for Content {
}
}
#[cfg(test)]
mod tests {
use pretty_assertions::assert_eq;
use std::{
fs::File,
io::{self, BufRead, Read},
path::Path,
};
use super::*;
fn read_lines<P>(filename: P) -> io::Result<Vec<String>>
where
P: AsRef<Path>,
{
let file = File::open(filename)?;
let buf_reader = io::BufReader::new(file);
buf_reader.lines().collect()
}
#[ignore]
#[test]
fn test_with_live_data() {
let law: Law = LawBuilder::new("UrhG").into();
let path = Path::new("./data/urhg/builder.result");
let mut file = File::open(path).unwrap();
let mut json = String::new();
file.read_to_string(&mut json).unwrap();
let expected: Law = serde_json::from_str(&json).unwrap();
assert_eq!(law, expected);
}
#[ignore]
#[test]
fn test_stgb_with_live_data() {
let law: Law = LawBuilder::new("StGB").into();
let path = Path::new("./data/stgb/builder.result");
let mut file = File::open(path).unwrap();
let mut json = String::new();
file.read_to_string(&mut json).unwrap();
let expected: Law = serde_json::from_str(&json).unwrap();
//println!("{}", serde_json::to_string(&law).unwrap());
assert_eq!(law, expected);
}
#[test]
fn test_builder_full_urhg() {
let mut builder = LawBuilder::test("UrhG");
let path = Path::new("./data/urhg/par");
let input = read_lines(path.join("../par.result")).unwrap();
for i in input {
let (command, content) = i.split_once(":").unwrap();
match command {
"New_header" => builder.new_header(content),
"New desc" => builder.new_desc(content),
"New_new_para_header" => builder.new_next_para_header(content),
"New_par" => {
let (par, real_content) = i.split_once(";").unwrap();
let (_, real_par) = par.split_once(":").unwrap();
let real_content: Content = serde_json::from_str(real_content).unwrap();
builder.new_par(real_par.trim().into(), real_content);
}
_ => {
panic!("Don't know command '{command}'");
}
}
}
let actual: Law = builder.into();
//println!("{}", serde_json::to_string(&law).unwrap());
let mut file = File::open(path.join("../builder.result")).unwrap();
let mut json = String::new();
file.read_to_string(&mut json).unwrap();
let expected = serde_json::from_str(&json).unwrap();
assert_eq!(actual, expected);
}
}
//#[cfg(test)]
//mod tests {
// use pretty_assertions::assert_eq;
// use std::{
// fs::File,
// io::{self, BufRead, Read},
// path::Path,
// };
//
// use super::*;
//
// fn read_lines<P>(filename: P) -> io::Result<Vec<String>>
// where
// P: AsRef<Path>,
// {
// let file = File::open(filename)?;
// let buf_reader = io::BufReader::new(file);
// buf_reader.lines().collect()
// }
//
// #[ignore]
// #[test]
// fn test_with_live_data() {
// let law: Law = LawBuilder::new("UrhG").into();
//
// let path = Path::new("./data/urhg/builder.result");
// let mut file = File::open(path).unwrap();
// let mut json = String::new();
// file.read_to_string(&mut json).unwrap();
//
// let expected: Law = serde_json::from_str(&json).unwrap();
//
// assert_eq!(law, expected);
// }
//
// #[ignore]
// #[test]
// fn test_stgb_with_live_data() {
// let law: Law = LawBuilder::new("StGB").into();
//
// let path = Path::new("./data/stgb/builder.result");
// let mut file = File::open(path).unwrap();
// let mut json = String::new();
// file.read_to_string(&mut json).unwrap();
//
// let expected: Law = serde_json::from_str(&json).unwrap();
//
// //println!("{}", serde_json::to_string(&law).unwrap());
//
// assert_eq!(law, expected);
// }
//
// #[test]
// fn test_builder_full_urhg() {
// let mut builder = LawBuilder::test("UrhG");
//
// let path = Path::new("./data/urhg/par");
// let input = read_lines(path.join("../par.result")).unwrap();
//
// for i in input {
// let (command, content) = i.split_once(":").unwrap();
//
// match command {
// "New_header" => builder.new_header(content),
// "New desc" => builder.new_desc(content),
// "New_new_para_header" => builder.new_next_para_header(content),
// "New_par" => {
// let (par, real_content) = i.split_once(";").unwrap();
// let (_, real_par) = par.split_once(":").unwrap();
// let real_content: Content = serde_json::from_str(real_content).unwrap();
// builder.new_par(real_par.trim().into(), real_content);
// }
// _ => {
// panic!("Don't know command '{command}'");
// }
// }
// }
//
// let actual: Law = builder.into();
//
// //println!("{}", serde_json::to_string(&law).unwrap());
//
// let mut file = File::open(path.join("../builder.result")).unwrap();
// let mut json = String::new();
// file.read_to_string(&mut json).unwrap();
//
// let expected = serde_json::from_str(&json).unwrap();
//
// assert_eq!(actual, expected);
// }
//}

View File

@ -1,17 +1,17 @@
pub(crate) fn contains_without_unter(classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when `instance_name` contains `classifier_name` (compared
/// case-insensitively) and does not contain the substring `"unter"` in any casing.
///
/// Note that a `classifier_name` which itself contains "unter" can never match,
/// because every instance name containing it is rejected by the second check.
pub fn contains_without_unter(classifier_name: &str, instance_name: &str) -> bool {
    // Lowercase the instance name once; both checks below need it.
    let instance = instance_name.to_lowercase();
    instance.contains(&classifier_name.to_lowercase()) && !instance.contains("unter")
}
pub(crate) fn contains(classifier_name: &str, instance_name: &str) -> bool {
/// Case-insensitive substring test: does `instance_name` contain `classifier_name`?
///
/// Both strings are lowercased before the comparison.
pub fn contains(classifier_name: &str, instance_name: &str) -> bool {
    let needle = classifier_name.to_lowercase();
    let haystack = instance_name.to_lowercase();
    haystack.contains(needle.as_str())
}
pub(crate) fn starts_with_roman_number(_: &str, s: &str) -> bool {
pub fn starts_with_roman_number(_: &str, s: &str) -> bool {
// Define the prefixes for Roman numerals.
let roman_prefixes = [
"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X", "XI", "XII", "XIII", "XIV",
@ -24,20 +24,20 @@ pub(crate) fn starts_with_roman_number(_: &str, s: &str) -> bool {
.any(|&prefix| s.starts_with(&(prefix.to_string() + ".")))
}
pub(crate) fn contains_at_start(_classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when the first character of `instance_name` is `'@'`.
///
/// The classifier name is ignored. An empty `instance_name` yields `false`.
pub fn contains_at_start(_classifier_name: &str, instance_name: &str) -> bool {
    matches!(instance_name.chars().next(), Some('@'))
}
pub(crate) fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when `instance_name`, with surrounding whitespace trimmed,
/// begins with an ASCII digit (`0`-`9`).
///
/// The classifier name is ignored. Empty or whitespace-only input yields `false`.
pub fn starts_with_number(_classifier_name: &str, instance_name: &str) -> bool {
    instance_name
        .trim()
        .starts_with(|first: char| first.is_ascii_digit())
}
pub(crate) fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when `instance_name` starts with one ASCII lowercase letter
/// immediately followed by `'.'` or `')'`, e.g. `"a)"` or `"b."`.
///
/// The classifier name is ignored.
pub fn starts_with_letter(_classifier_name: &str, instance_name: &str) -> bool {
    let mut chars = instance_name.chars();
    let first = chars.next();
    let second = chars.next();
    matches!(first, Some(c) if c.is_ascii_lowercase()) && matches!(second, Some('.' | ')'))
}
pub(crate) fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
/// Returns `true` when `instance_name` starts with one ASCII uppercase letter
/// immediately followed by `'.'` or `')'`, e.g. `"A."` or `"B)"`.
///
/// The classifier name is ignored.
pub fn starts_with_uppercaseletter(_classifier_name: &str, instance_name: &str) -> bool {
    let mut leading = instance_name.chars();
    match (leading.next(), leading.next()) {
        (Some(letter), Some('.' | ')')) => letter.is_ascii_uppercase(),
        _ => false,
    }
}