parse stvo
All checks were successful
CI/CD Pipeline / test (push) Successful in 2m59s

This commit is contained in:
2024-09-12 23:18:51 +02:00
parent 6e654e84e8
commit 34ce30a789
155 changed files with 665 additions and 12 deletions

View File

@ -701,6 +701,7 @@ impl std::fmt::Debug for Classifier {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum Content {
Image(String),
Text(String), //This is my direct law text
List(Vec<Content>), //(1) This is general law. (2) This is more specific law
Multi(Vec<Content>),
@ -709,7 +710,7 @@ pub enum Content {
impl Display for Content {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
Self::Text(a) => f.write_str(&format!("{a}\n")),
Self::Text(a) | Self::Image(a) => f.write_str(&format!("{a}\n")),
Self::List(a) | Self::Multi(a) => {
let mut ret = String::new();
for aa in a {

View File

@ -96,6 +96,7 @@ struct Overview {
fn parse_from_str(content: &str, skip_first: bool) -> Result<(bool, Vec<String>), Error> {
let mut ret = Vec::new();
let wrapper: Overview = serde_json::from_str(content)?;
let iter = wrapper.ogd_search_result.get_par().into_iter();

View File

@ -16,7 +16,7 @@
use std::collections::HashMap;
use serde::Deserialize;
use serde::{Deserialize, Deserializer};
fn deserialize_string_to_usize<'de, D>(deserializer: D) -> Result<usize, D::Error>
where
@ -48,15 +48,11 @@ impl OgdSearchResult {
pub(crate) fn get_par(&self) -> Vec<String> {
let mut ret = Vec::new();
for doc_ref in &self.ogd_document_results.ogd_document_reference {
for urls in &doc_ref
.data
.document_list
.content_reference
.urls
.content_url
{
if urls.data_type == "Xml" {
ret.push(urls.url.clone());
for con_refs in &doc_ref.data.document_list.content_reference {
for urls in &con_refs.urls.content_url {
if urls.data_type == "Xml" {
ret.push(urls.url.clone());
}
}
}
}
@ -214,7 +210,26 @@ pub(crate) struct BrKons {
#[allow(dead_code)]
#[serde(rename_all = "PascalCase")]
pub(crate) struct DocumentList {
content_reference: ContentReference,
#[serde(deserialize_with = "deserialize_content_reference")]
content_reference: Vec<ContentReference>,
}
/// Deserializes a field that may hold either one `ContentReference` or a sequence of
/// them, always yielding a `Vec`.
///
/// A lone object becomes a one-element vector; a sequence is returned as-is.
///
/// # Errors
///
/// Returns the deserializer's error when the input matches neither shape.
fn deserialize_content_reference<'de, D>(deserializer: D) -> Result<Vec<ContentReference>, D::Error>
where
    D: Deserializer<'de>,
{
    // Untagged: the single-object shape is tried first, then the sequence shape.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum ContentReferenceField {
        One(ContentReference),
        Many(Vec<ContentReference>),
    }

    let references = match ContentReferenceField::deserialize(deserializer)? {
        ContentReferenceField::One(only) => vec![only],
        ContentReferenceField::Many(all) => all,
    };
    Ok(references)
}
#[derive(Deserialize)]
@ -230,9 +245,28 @@ pub(crate) struct ContentReference {
#[allow(dead_code)]
#[serde(rename_all = "PascalCase")]
/// Container for the URL entries of a content reference (reached through its `urls` field).
pub(crate) struct ContentUrl {
/// URL entries; `deserialize_content_url` accepts either a single item or a list of
/// items for this field and always stores them as a `Vec`.
#[serde(deserialize_with = "deserialize_content_url")]
content_url: Vec<ContentUrlItem>,
}
/// Deserializes a field that may hold either one `ContentUrlItem` or a sequence of them,
/// always yielding a `Vec`.
///
/// A lone object becomes a one-element vector; a sequence is returned as-is.
///
/// # Errors
///
/// Returns the deserializer's error when the input matches neither shape.
fn deserialize_content_url<'de, D>(deserializer: D) -> Result<Vec<ContentUrlItem>, D::Error>
where
    D: Deserializer<'de>,
{
    // Untagged: the single-object shape is tried first, then the sequence shape.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum ContentUrlField {
        One(ContentUrlItem),
        Many(Vec<ContentUrlItem>),
    }

    ContentUrlField::deserialize(deserializer).map(|parsed| match parsed {
        ContentUrlField::One(only) => vec![only],
        ContentUrlField::Many(all) => all,
    })
}
#[derive(Deserialize)]
#[allow(dead_code)]
#[serde(rename_all = "PascalCase")]

View File

@ -60,6 +60,10 @@ impl Absatz {
// After an 'absatz' there can be a '<absatz typ="[satz|erltext]"' which should be part of the first absatz
// (e.g. 1209 ABGB)
content.push(Content::Text(Absatz::parse(c.next().unwrap()).content));
} else if child.tag_name().name() == "absatz"
&& child.attribute("typ") == Some("abbobj")
{
content.push(Self::parse_abbobj(c.next().unwrap()));
} else {
break;
}
@ -71,6 +75,39 @@ impl Absatz {
}
}
/// Extracts the image references from an `<absatz typ="abbobj">` node.
///
/// Every `binary` child contributes one `Content::Image` whose value is the text of that
/// child's first child node. All other children (e.g. `tab`) are ignored.
///
/// Returns the image itself when exactly one was found, otherwise a `Content::Multi`
/// holding all of them (possibly none).
///
/// # Panics
///
/// Panics if a `binary` child has no children, or if its first child carries no text.
pub(crate) fn parse_abbobj(n: Node) -> Content {
    Expect::from(&n).tag("absatz").typ("abbobj");

    // TODO: filtering on `binary` should not be necessary once only `tab`s need skipping...
    let mut images: Vec<Content> = n
        .children()
        .filter(|child| child.tag_name().name() == "binary")
        .map(|binary| {
            let src = binary.children().next().unwrap();
            Content::Image(src.text().unwrap().into())
        })
        .collect();

    match images.len() {
        // A single image is returned bare instead of being wrapped.
        1 => images.swap_remove(0),
        _ => Content::Multi(images),
    }
}
pub(crate) fn parse(n: Node) -> Self {
Expect::from(&n).tag("absatz");

View File

@ -58,6 +58,8 @@ impl Abschnitt {
if AbsatzAbs::test(child) {
let (_, absatz) = Absatz::parse_full(&mut c);
absatze.push(absatz);
} else if Ueberschrift::test(child, "erll") {
absatze.push(Ueberschrift::parse_full_erll(&mut c));
} else {
break;
}

View File

@ -1,5 +1,6 @@
// Copyright (C) 2024 Philipp Hofer
//
//
// Licensed under the EUPL, Version 1.2 or - as soon they will be approved by
// the European Commission - subsequent versions of the EUPL (the "Licence").
// You may not use this work except in compliance with the Licence.
@ -22,6 +23,7 @@ mod table;
use std::{fmt::Display, iter::Peekable};
use abschnitt::Abschnitt;
use liste::Liste;
use roxmltree::{Children, Node};
use crate::{
@ -418,6 +420,36 @@ impl Ueberschrift {
typ: typ.into(),
}
}
/// Parses a run of consecutive `erll` headings together with the content following each.
///
/// The iterator must be positioned on an `erll` heading. Each heading opens a new group
/// that starts with the heading text and collects every directly following `abbobj`
/// paragraph, list and `abs` paragraph. Parsing stops at the first sibling that is none of
/// these (or another `erll` heading, which starts the next group); that sibling is left
/// unconsumed.
///
/// Returns a `Content::Multi` with one inner `Content::Multi` per heading group, in
/// document order.
///
/// # Panics
///
/// Panics if the iterator is already exhausted on entry.
fn parse_full_erll(n: &mut Peekable<Children>) -> Content {
    let mut ret = Vec::new();

    // We need at least 1 erll
    let first = n
        .next()
        .expect("parse_full_erll requires at least one `erll` heading");
    let mut curr = vec![Content::Text(Self::parse(first, "erll").content)];

    while let Some(child) = &mut n.peek() {
        if Absatz::test_with_typ(child, "abbobj") {
            curr.push(Absatz::parse_abbobj(n.next().unwrap()));
        } else if Liste::test(child) {
            curr.push(Content::List(Liste::parse_full(n).content));
        } else if Absatz::test_with_typ(child, "abs") {
            let (_, absatz) = Absatz::parse_full(n);
            curr.push(absatz);
        } else if Ueberschrift::test(child, "erll") {
            // The next heading closes the current group and opens a new one.
            ret.push(Content::Multi(curr));
            curr = vec![Content::Text(
                Self::parse(n.next().unwrap(), "erll").content,
            )];
        } else {
            break;
        }
    }

    // Flush the group that was still open when the loop ended; without this the last
    // heading and everything below it would be silently dropped.
    ret.push(Content::Multi(curr));

    Content::Multi(ret)
}
}
#[derive(Debug, PartialEq)]