start parsing urhg

This commit is contained in:
philipp 2023-11-04 11:25:17 +01:00
parent 816b234112
commit f462936790
3 changed files with 38 additions and 29 deletions

View File

@ -38,6 +38,7 @@ impl From<roxmltree::Error> for Error {
} }
fn main() { fn main() {
law::parse(10001899).unwrap(); //law::parse(10001899).unwrap(); //TEG
//par::parse("https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12025172/NOR12025172.xml"); law::parse(10001848).unwrap(); //UrhG
//par::parse("https://www.ris.bka.gv.at/Dokumente/Bundesnormen/NOR12025172/NOR12025172.xml");
} }

View File

@ -7,6 +7,7 @@ fn fetch_page(url: &str) -> Result<String, Error> {
} }
pub(crate) fn parse(url: &str) -> Result<(), Error> { pub(crate) fn parse(url: &str) -> Result<(), Error> {
println!("{url}");
let xml = fetch_page(url)?; let xml = fetch_page(url)?;
let risdok = Risdok::from_str(&xml)?; let risdok = Risdok::from_str(&xml)?;

View File

@ -86,7 +86,8 @@ impl Nutzdaten {
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub(crate) struct Abschnitt { pub(crate) struct Abschnitt {
ueberschriftPara: Option<UeberschriftPara>, ueberschrifts: Vec<Ueberschrift>,
ueberschriftPara: Option<Ueberschrift>,
absatze: Vec<AbsatzAbs>, absatze: Vec<AbsatzAbs>,
} }
impl Abschnitt { impl Abschnitt {
@ -104,7 +105,7 @@ impl Abschnitt {
loop { loop {
match c.peek() { match c.peek() {
Some(child) => { Some(child) => {
if UeberschriftTitle::test(child) { if Ueberschrift::test(child, "titel") {
c.next(); c.next();
continue; continue;
} }
@ -118,10 +119,28 @@ impl Abschnitt {
} }
} }
let mut ueberschrifts = Vec::new();
loop {
match &c.peek() {
Some(child) => {
if Ueberschrift::test(&child, "g1") {
ueberschrifts.push(Ueberschrift::parse(c.next().unwrap(), "g1"));
} else if Ueberschrift::test(&child, "g2") {
ueberschrifts.push(Ueberschrift::parse(c.next().unwrap(), "g2"));
} else if Ueberschrift::test(&child, "g1min") {
ueberschrifts.push(Ueberschrift::parse(c.next().unwrap(), "g1min"));
} else {
break;
}
}
None => break,
}
}
let mut ueberschriftPara = None; let mut ueberschriftPara = None;
if let Some(child) = c.peek() { if let Some(child) = c.peek() {
if UeberschriftPara::test(child) { if Ueberschrift::test(child, "para") {
ueberschriftPara = Some(UeberschriftPara::parse(c.next().unwrap())) ueberschriftPara = Some(Ueberschrift::parse(c.next().unwrap(), "para"))
} }
} }
@ -143,7 +162,7 @@ impl Abschnitt {
loop { loop {
match c.peek() { match c.peek() {
Some(child) => { Some(child) => {
if UeberschriftTitle::test(child) { if Ueberschrift::test(child, "titel") {
c.next(); c.next();
continue; continue;
} }
@ -162,6 +181,7 @@ impl Abschnitt {
Self { Self {
ueberschriftPara, ueberschriftPara,
absatze, absatze,
ueberschrifts,
} }
} }
} }
@ -239,40 +259,27 @@ impl Absatz {
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub(crate) struct UeberschriftPara { pub(crate) struct Ueberschrift {
typ: String,
content: String, content: String,
} }
impl UeberschriftPara { impl Ueberschrift {
pub(crate) fn test(n: &Node) -> bool { fn test(n: &Node, typ: &str) -> bool {
n.tag_name().name() == "ueberschrift" && n.attribute("typ").unwrap() == "para" n.tag_name().name() == "ueberschrift" && n.attribute("typ").unwrap() == typ
} }
pub(crate) fn parse(n: Node) -> Self { pub(crate) fn parse(n: Node, typ: &str) -> Self {
assert!(n.tag_name().name() == "ueberschrift"); assert!(n.tag_name().name() == "ueberschrift");
assert_eq!(n.attribute("typ").unwrap(), "para"); assert_eq!(n.attribute("typ").unwrap(), typ);
Self { Self {
content: n.text().unwrap().into(), content: n.text().unwrap().into(),
typ: typ.into(),
} }
} }
} }
#[derive(Debug, PartialEq)]
pub(crate) struct UeberschriftTitle;
impl UeberschriftTitle {
fn test(n: &Node) -> bool {
n.tag_name().name() == "ueberschrift" && n.attribute("typ").unwrap() == "titel"
}
pub(crate) fn parse(n: Node) -> Self {
assert!(n.tag_name().name() == "ueberschrift");
assert_eq!(n.attribute("typ").unwrap(), "titel");
//TODO parse if necessary
Self {}
}
}
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
pub(crate) struct Kzinhalt; pub(crate) struct Kzinhalt;
impl Kzinhalt { impl Kzinhalt {