use std::marker::PhantomData;

use countio::Counter;
use quick_xml::{events, Reader};
use url::Url;

use crate::{
    attribute as attr,
    parse::XmlParser,
    record::{EntryRecord, IndexRecord},
    Error,
};

// TODO: Limits.

pub(crate) enum SitemapDetector<R> {
    // Plain(TxtParser<R>),
    Entry(XmlParser<R, EntryRecord>),
    Index(XmlParser<R, IndexRecord>),
}

impl<R> SitemapDetector<R> {
    /// Returns `Some(true)` for a `sitemapindex` start tag, `Some(false)` for a
    /// `urlset` start tag, and `None` for any other event.
    fn is_index(event: events::Event) -> Result<Option<bool>, Error> {
        if let events::Event::Start(bytes) = event {
            let name = bytes.name().into_inner();
            if name.eq_ignore_ascii_case(attr::SITEMAP_INDEX.as_bytes()) {
                return Ok(Some(true));
            } else if name.eq_ignore_ascii_case(attr::URL_SET.as_bytes()) {
                return Ok(Some(false));
            }
        }

        Ok(None)
    }

    fn create(is_index: bool, reader: Reader<Counter<R>>) -> Self {
        if is_index {
            Self::Index(XmlParser::from_wrapper(reader, attr::SITEMAP_INDEX))
        } else {
            Self::Entry(XmlParser::from_wrapper(reader, attr::URL_SET))
        }
    }
}

impl<R: std::io::BufRead> SitemapDetector<R> {
    /// Reads events until the root tag is found, then builds the matching parser.
    pub fn from_sync(reader: R) -> Result<Self, Error> {
        let mut reader = Reader::from_reader(Counter::new(reader));
        let mut buf = Vec::new();

        loop {
            let event = reader.read_event_into(&mut buf)?;
            if let Some(is_index) = Self::is_index(event)? {
                return Ok(Self::create(is_index, reader));
            }
        }
    }
}

#[cfg(feature = "tokio")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))]
impl<R: tokio::io::AsyncBufRead + Unpin + Send> SitemapDetector<R> {
    /// Reads events until the root tag is found, then builds the matching parser.
    pub async fn from_async(reader: R) -> Result<Self, Error> {
        let mut reader = Reader::from_reader(Counter::new(reader));
        let mut buf = Vec::new();

        loop {
            let event = reader.read_event_into_async(&mut buf).await?;
            if let Some(is_index) = Self::is_index(event)? {
                return Ok(Self::create(is_index, reader));
            }
        }
    }
}

/// Automatic sitemap record resolver.
///
/// ```rust
/// #[derive(Debug, thiserror::Error)]
/// enum CustomError {
///     // ..
///     #[error("sitemap error: {0}")]
///     Sitemap(#[from] sitemapo::Error),
///     // ..
/// }
///
/// fn main() -> Result<(), CustomError> {
///     type SyncReader = std::io::BufReader<std::io::Cursor<Vec<u8>>>;
///     fn fetch(_: url::Url) -> Result<SyncReader, CustomError> {
///         // ..
///         unreachable!()
///     }
///
///     let sitemaps = Vec::default(); // Sitemaps listed in the robots.txt file.
///     let mut parser = sitemapo::AutoParser::new_sync(&sitemaps, fetch);
///     while let Some(_record) = parser.read_sync()? {
///         // ..
///     }
///
///     Ok(())
/// }
/// ```
pub struct AutoParser<R, E, O, F: Fn(Url) -> O> {
    sitemaps: Vec<Url>,
    // plain: Option<TxtParser<R>>,
    index: Option<XmlParser<R, IndexRecord>>,
    entry: Option<XmlParser<R, EntryRecord>>,
    error_type: PhantomData<(E, O)>,
    fetcher: F,
}

impl<R, E: From<Error>, O, F: Fn(Url) -> O> AutoParser<R, E, O, F> {
    fn new(sitemaps: &[Url], fetcher: F) -> Self {
        Self {
            sitemaps: sitemaps.to_vec(),
            // plain: None,
            index: None,
            entry: None,
            error_type: PhantomData,
            fetcher,
        }
    }

    /// Stores the detected parser as the current index or entry parser.
    fn save_parser(&mut self, detector: SitemapDetector<R>) {
        match detector {
            // SitemapDetector::Plain(parser) => self.plain = Some(parser),
            SitemapDetector::Entry(parser) => self.entry = Some(parser),
            SitemapDetector::Index(parser) => self.index = Some(parser),
        }
    }
}

impl<R: std::io::BufRead, E: std::error::Error + From<Error>, F: Fn(Url) -> Result<R, E>>
    AutoParser<R, E, Result<R, E>, F>
{
    /// Creates a new instance from the list of root sitemaps (or sitemap indexes)
    /// provided by the `robots.txt` file and the callback used to fetch them.
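    ///
    /// The fetcher callback is invoked once per sitemap address and returns a
    /// buffered reader over the fetched document. A minimal sketch with a
    /// file-backed fetcher; the `fetch` function, the path mapping, and the
    /// `CustomError` type below are illustrative, not part of the crate API:
    ///
    /// ```rust
    /// #[derive(Debug, thiserror::Error)]
    /// enum CustomError {
    ///     #[error("io error: {0}")]
    ///     Io(#[from] std::io::Error),
    ///     #[error("sitemap error: {0}")]
    ///     Sitemap(#[from] sitemapo::Error),
    /// }
    ///
    /// // Illustrative only: serve sitemaps from the local filesystem.
    /// fn fetch(url: url::Url) -> Result<std::io::BufReader<std::fs::File>, CustomError> {
    ///     let path = url.path().trim_start_matches('/').to_owned();
    ///     Ok(std::io::BufReader::new(std::fs::File::open(path)?))
    /// }
    ///
    /// let _parser = sitemapo::AutoParser::new_sync(&[], fetch);
    /// ```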
    pub fn new_sync(sitemaps: &[Url], fetcher: F) -> Self {
        Self::new(sitemaps, fetcher)
    }

    /// Reads the next [`EntryRecord`], transparently resolving sitemap indexes
    /// and fetching the sitemaps they reference.
    pub fn read_sync(&mut self) -> Result<Option<EntryRecord>, E> {
        while !self.sitemaps.is_empty() || self.index.is_some() || self.entry.is_some() {
            if self.index.is_none() && self.entry.is_none() {
                let sitemap = self.sitemaps.pop().expect("not empty");
                let reader = (self.fetcher)(sitemap)?;
                let detector = SitemapDetector::from_sync(reader)?;
                self.save_parser(detector);
            }

            if let Some(parser) = &mut self.entry {
                use crate::parse::Parser;
                let r = parser.read();
                if let Ok(Some(record)) = r {
                    return Ok(Some(record));
                }

                // Exhausted or failed: drop the entry parser.
                self.entry.take();
                if let Err(e) = r {
                    return Err(e.into());
                }
            }

            if let Some(parser) = &mut self.index {
                use crate::parse::Parser;
                let record = parser.read();
                if let Ok(Some(r)) = &record {
                    let reader = (self.fetcher)(r.location().clone())?;
                    match SitemapDetector::from_sync(reader)? {
                        SitemapDetector::Index(_) => { /* Ignore nested sitemap index. */ }
                        sitemap => self.save_parser(sitemap),
                    }
                } else {
                    // Exhausted or failed: drop the index parser so the loop can
                    // move on to the next root sitemap.
                    self.index.take();
                }

                if let Err(e) = record {
                    return Err(e.into());
                }
            }
        }

        Ok(None)
    }
}

impl<R: std::io::BufRead, E: std::error::Error + From<Error>, F: Fn(Url) -> Result<R, E>> Iterator
    for AutoParser<R, E, Result<R, E>, F>
{
    type Item = EntryRecord;

    fn next(&mut self) -> Option<Self::Item> {
        self.read_sync().ok().flatten()
    }
}

// TODO: AsyncIterator/Stream.
// https://tokio.rs/tokio/tutorial/streams

#[cfg(feature = "tokio")]
#[cfg_attr(docsrs, doc(cfg(feature = "tokio")))]
impl<
        R: tokio::io::AsyncBufRead + Unpin + Send,
        E: std::error::Error + From<Error>,
        A: std::future::Future<Output = Result<R, E>>,
        F: Fn(Url) -> A,
    > AutoParser<R, E, A, F>
{
    /// Creates a new instance from the list of root sitemaps (or sitemap indexes)
    /// provided by the `robots.txt` file and the callback used to fetch them.
    pub fn new_async(sitemaps: &[Url], fetcher: F) -> Self {
        Self::new(sitemaps, fetcher)
    }

    /// Reads the next [`EntryRecord`], transparently resolving sitemap indexes
    /// and fetching the sitemaps they reference.
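    ///
    /// A minimal usage sketch mirroring the synchronous example above; the
    /// `fetch` callback, the `AsyncReader` alias, and the `CustomError` type are
    /// illustrative stand-ins for whatever the caller uses to download sitemaps:
    ///
    /// ```rust
    /// #[derive(Debug, thiserror::Error)]
    /// enum CustomError {
    ///     #[error("sitemap error: {0}")]
    ///     Sitemap(#[from] sitemapo::Error),
    /// }
    ///
    /// type AsyncReader = tokio::io::BufReader<std::io::Cursor<Vec<u8>>>;
    /// async fn fetch(_: url::Url) -> Result<AsyncReader, CustomError> {
    ///     // ..
    ///     unreachable!()
    /// }
    ///
    /// async fn crawl(sitemaps: &[url::Url]) -> Result<(), CustomError> {
    ///     let mut parser = sitemapo::AutoParser::new_async(sitemaps, fetch);
    ///     while let Some(_record) = parser.read_async().await? {
    ///         // ..
    ///     }
    ///     Ok(())
    /// }
    /// ```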
    pub async fn read_async(&mut self) -> Result<Option<EntryRecord>, E> {
        while !self.sitemaps.is_empty() || self.index.is_some() || self.entry.is_some() {
            if self.index.is_none() && self.entry.is_none() {
                let sitemap = self.sitemaps.pop().expect("not empty");
                let reader = (self.fetcher)(sitemap).await?;
                let detector = SitemapDetector::from_async(reader).await?;
                self.save_parser(detector);
            }

            if let Some(parser) = &mut self.entry {
                use crate::parse::AsyncParser;
                let r = parser.read().await;
                if let Ok(Some(record)) = r {
                    return Ok(Some(record));
                }

                // Exhausted or failed: drop the entry parser.
                self.entry.take();
                if let Err(e) = r {
                    return Err(e.into());
                }
            }

            if let Some(parser) = &mut self.index {
                use crate::parse::AsyncParser;
                let record = parser.read().await;
                if let Ok(Some(r)) = &record {
                    let reader = (self.fetcher)(r.location().clone()).await?;
                    match SitemapDetector::from_async(reader).await? {
                        SitemapDetector::Index(_) => { /* Ignore nested sitemap index. */ }
                        sitemap => self.save_parser(sitemap),
                    }
                } else {
                    // Exhausted or failed: drop the index parser so the loop can
                    // move on to the next root sitemap.
                    self.index.take();
                }

                if let Err(e) = record {
                    return Err(e.into());
                }
            }
        }

        Ok(None)
    }
}

impl<R, E: From<Error>, O, F: Fn(Url) -> O> std::fmt::Debug for AutoParser<R, E, O, F> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Sitemap")
            .field("pool", &self.sitemaps.len())
            .field("index", &self.index.is_some())
            .field("entry", &self.entry.is_some())
            .finish()
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[derive(Debug, thiserror::Error)]
    enum CustomError {
        #[error("sitemap error: {0}")]
        Sitemap(#[from] Error),
    }

    #[test]
    fn synk() -> Result<(), Error> {
        type SyncReader = std::io::BufReader<std::io::Cursor<Vec<u8>>>;
        fn sync_fetcher(_: Url) -> Result<SyncReader, CustomError> {
            unreachable!()
        }

        let _ = AutoParser::new_sync(&[], sync_fetcher);
        Ok(())
    }

    #[cfg(feature = "tokio")]
    #[tokio::test]
    async fn asynk() -> Result<(), Error> {
        type AsyncReader = tokio::io::BufReader<std::io::Cursor<Vec<u8>>>;
        async fn async_fetcher(_: Url) -> Result<AsyncReader, CustomError> {
            unreachable!()
        }

        let _ = AutoParser::new_async(&[], async_fetcher);
        Ok(())
    }
}