use std::io::BufRead; use bytes::Bytes; use countio::Counter; use quick_xml::{events, Reader}; use time::format_description::well_known::Iso8601; use time::OffsetDateTime; use url::Url; use crate::{ attribute as attr, attribute::{Frequency, Priority}, parse::Parser, record::{EntryRecord, IndexRecord, BYTE_LIMIT, RECORD_LIMIT}, Error, }; /// Sitemap parser for the versatile XML file with an optional support of extensions. /// /// For example: /// /// ```xml /// /// /// /// https://www.example.com/foo.html /// 2022-06-04 /// /// /// ``` /// Enforces [total written/read bytes](BYTE_LIMIT) and [total records](RECORD_LIMIT) limits. /// See [Error]. /// /// ```rust /// use sitemapo::{ /// parse::{Parser, XmlParser}, /// record::EntryRecord, /// Error, /// }; /// /// fn main() -> Result<(), Error> { /// let buf = // "...".as_bytes(); /// # r#" /// # /// # https://www.example.com/file1.html /// # 2022-09-08T10:43:13.000-04:00 /// # daily /// # 0.6 /// # /// # /// # "#.as_bytes(); /// /// let mut parser = XmlParser::new(buf)?; /// let _rec: Option = parser.read()?; /// let _buf = parser.close()?; /// Ok(()) /// } /// ``` pub struct XmlParser { record: Option, pub(crate) reader: Reader>, pub(crate) records: usize, path: Vec, } impl XmlParser { /// Creates a new instance with a provided reader. pub(crate) fn from_reader(reader: R) -> Self { Self { record: None, reader: Reader::from_reader(Counter::new(reader)), records: 0, path: Vec::default(), } } /// Creates a new instance with a provided reader. pub(crate) fn from_wrapper(wrapped: Reader>, path: &str) -> Self { let bytes = Bytes::from(path.as_bytes().to_vec()); Self { record: None, reader: wrapped, records: 0, path: Vec::from([bytes]), } } /// Returns a reference to the underlying reader. pub fn get_ref(&self) -> &R { self.reader.get_ref().get_ref() } /// Returns a mutable reference to the underlying reader. pub fn get_mut(&mut self) -> &mut R { self.reader.get_mut().get_mut() } /// Returns an underlying reader. pub fn into_inner(self) -> R { self.reader.into_inner().into_inner() } pub(crate) fn try_if_readable(&mut self) -> Result<(), Error> { if self.records + 1 > RECORD_LIMIT { return Err(Error::EntryLimit { over: 1 }); } if self.reader.get_ref().reader_bytes() > BYTE_LIMIT { let over = self.reader.get_ref().reader_bytes() - BYTE_LIMIT; return Err(Error::ByteLimit { over }); } Ok(()) } pub(crate) fn write_event( &mut self, event: events::Event, tag: &[u8], create: impl FnOnce() -> D, handle: impl FnOnce(&mut Self, &str), check: impl FnOnce(Option) -> Option, ) -> Result>, Error> { match event { events::Event::Start(bytes) => { let name = bytes.name().into_inner(); if name.eq_ignore_ascii_case(tag) { self.records += 1; let instance = create(); self.record.replace(instance); } self.path.push(name.to_vec().into()); } events::Event::Text(bytes) => { let text = bytes.unescape()?; handle(self, &text); } events::Event::End(bytes) => { let name = bytes.name().into_inner().to_vec(); if self.path.pop() != Some(name.clone().into()) { // TODO: Skip til next start tag. } if name.eq_ignore_ascii_case(tag) { let rec = self.record.take(); return Ok(Some(check(rec))); } } events::Event::Eof => { let rec = self.record.take(); return Ok(Some(check(rec))); } _ => {} // ignore } Ok(None) } } impl XmlParser { fn handle_entry_text(&mut self, text: &str) { static LOC: [&str; 3] = [attr::URL_SET, attr::URL, attr::LOCATION]; static MOD: [&str; 3] = [attr::URL_SET, attr::URL, attr::LAST_MODIFIED]; static FRQ: [&str; 3] = [attr::URL_SET, attr::URL, attr::CHANGE_FREQUENCY]; static PRI: [&str; 3] = [attr::URL_SET, attr::URL, attr::PRIORITY]; if let Some(rec) = &mut self.record { match self.path.as_slice() { x if x == LOC => rec.location = Url::parse(text).ok(), x if x == MOD => rec.modified = OffsetDateTime::parse(text, &Iso8601::PARSING).ok(), x if x == FRQ => rec.frequency = Frequency::parse(text).ok(), x if x == PRI => rec.priority = Priority::parse(text).ok(), _ => {} } } } fn is_entry_good(mut record: Option) -> Option { if record.as_ref().is_some_and(|u| u.location.is_none()) { let _ = record.take(); } record } pub(crate) fn write_entry_event( &mut self, event: events::Event, ) -> Result>, Error> { self.write_event( event, attr::URL.as_bytes(), EntryRecord::clean, Self::handle_entry_text, Self::is_entry_good, ) } } impl XmlParser { fn handle_index_text(&mut self, text: &str) { static LOC: [&str; 3] = [attr::SITEMAP_INDEX, attr::SITEMAP, attr::LOCATION]; static MOD: [&str; 3] = [attr::SITEMAP_INDEX, attr::SITEMAP, attr::LAST_MODIFIED]; if let Some(rec) = &mut self.record { match self.path.as_slice() { x if x == LOC => rec.location = Url::parse(text).ok(), x if x == MOD => rec.modified = OffsetDateTime::parse(text, &Iso8601::PARSING).ok(), _ => {} } } } fn is_index_good(mut record: Option) -> Option { if record.as_ref().is_some_and(|u| u.location.is_none()) { let _ = record.take(); } record } pub(crate) fn write_index_event( &mut self, event: events::Event, ) -> Result>, Error> { self.write_event( event, attr::SITEMAP.as_bytes(), IndexRecord::clean, Self::handle_index_text, Self::is_index_good, ) } } impl Parser for XmlParser { type Error = Error; fn new(reader: R) -> Result { // TODO: events::Event::Decl. Ok(Self::from_reader(reader)) } fn read(&mut self) -> Result, Self::Error> { let mut buf = Vec::new(); loop { self.try_if_readable()?; let event = self.reader.read_event_into(&mut buf)?; if let Some(record) = self.write_entry_event(event)? { return Ok(record); } } } fn close(self) -> Result { // TODO: events::Event::Decl. Ok(self.into_inner()) } } impl Parser for XmlParser { type Error = Error; fn new(reader: R) -> Result { // TODO: events::Event::Decl. Ok(Self::from_reader(reader)) } fn read(&mut self) -> Result, Self::Error> { let mut buf = Vec::new(); loop { self.try_if_readable()?; let event = self.reader.read_event_into(&mut buf)?; if let Some(record) = self.write_index_event(event)? { return Ok(record); } } } fn close(self) -> Result { // TODO: events::Event::Decl. Ok(self.into_inner()) } } impl std::fmt::Debug for XmlParser { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("XmlParser") .field("bytes", &self.reader.get_ref().reader_bytes()) .field("records", &self.records) .finish() } } #[cfg(feature = "tokio")] #[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] mod tokio { use async_trait::async_trait; use tokio::io::AsyncBufRead; use crate::{ parse::{AsyncParser, XmlParser}, record::{EntryRecord, IndexRecord}, Error, }; #[async_trait] impl AsyncParser for XmlParser { type Error = Error; async fn new(reader: R) -> Result { // TODO: events::Event::Decl. Ok(Self::from_reader(reader)) } async fn read(&mut self) -> Result, Self::Error> { let mut buf = Vec::new(); loop { self.try_if_readable()?; let event = self.reader.read_event_into_async(&mut buf).await?; if let Some(record) = self.write_entry_event(event)? { return Ok(record); } } } async fn close(self) -> Result { // TODO: events::Event::Decl. Ok(self.into_inner()) } } #[async_trait] impl AsyncParser for XmlParser { type Error = Error; async fn new(reader: R) -> Result { // TODO: events::Event::Decl. Ok(Self::from_reader(reader)) } async fn read(&mut self) -> Result, Self::Error> { let mut buf = Vec::new(); loop { self.try_if_readable()?; let event = self.reader.read_event_into_async(&mut buf).await?; if let Some(record) = self.write_index_event(event)? { return Ok(record); } } } async fn close(self) -> Result { // TODO: events::Event::Decl. Ok(self.into_inner()) } } } #[cfg(test)] mod test { use url::Url; use crate::Error; use crate::{parse::XmlParser, record::EntryRecord}; const EXAMPLE: &'static str = r#" https://www.example.com/file1.html 2022-09-08T10:43:13.000-04:00 daily 0.6 "#; #[test] fn synk() -> Result<(), Error> { use crate::parse::Parser; let buf = EXAMPLE.as_bytes(); let mut parser = XmlParser::new(buf)?; let record: EntryRecord = parser.read()?.unwrap(); parser.close()?; let exp = Url::parse("https://www.example.com/file1.html"); assert_eq!(record.location(), &exp.unwrap()); Ok(()) } #[cfg(feature = "tokio")] #[tokio::test] async fn asynk() -> Result<(), Error> { use crate::parse::AsyncParser; let buf = EXAMPLE.as_bytes(); let mut parser = XmlParser::new(buf).await?; let record: EntryRecord = parser.read().await?.unwrap(); parser.close().await?; let exp = Url::parse("https://www.example.com/file1.html"); assert_eq!(record.location(), &exp.unwrap()); Ok(()) } }