use std::io::BufRead; use std::marker::PhantomData; use countio::Counter; use url::Url; use crate::{ parse::Parser, record::{BYTE_LIMIT, RECORD_LIMIT}, Error, }; /// Sitemap parser for the simple TXT file that contains one URL per line. /// /// For example: /// /// ```txt /// https://www.example.com/file1.html /// https://www.example.com/file2.html /// ``` /// /// Enforces [total written/read bytes](BYTE_LIMIT) and [total url](RECORD_LIMIT) limits. /// See [Error]. /// /// ```rust /// use sitemapo::{ /// parse::{Parser, TxtParser}, /// Error, /// }; /// /// fn main() -> Result<(), Error> { /// let buf = "https://example.com/file1.html".as_bytes(); /// /// let mut parser = TxtParser::new(buf).unwrap(); /// let _rec = parser.read()?; /// let _buf = parser.close()?; /// Ok(()) /// } /// ``` pub struct TxtParser { record_type: PhantomData, pub(crate) reader: Counter, pub(crate) records: usize, } impl TxtParser { /// Creates a new instance with a provided reader. pub(crate) fn from_reader(reader: R) -> Self { Self { record_type: PhantomData, reader: Counter::new(reader), records: 0, } } /// Returns a reference to the underlying reader. pub fn get_ref(&self) -> &R { self.reader.get_ref() } /// Returns a mutable reference to the underlying reader. pub fn get_mut(&mut self) -> &mut R { self.reader.get_mut() } /// Returns an underlying reader. pub fn into_inner(self) -> R { self.reader.into_inner() } pub(crate) fn try_if_readable(&mut self) -> Result<(), Error> { if self.records + 1 > RECORD_LIMIT { return Err(Error::EntryLimit { over: 1 }); } if self.reader.reader_bytes() > BYTE_LIMIT { let over = self.reader.reader_bytes() - BYTE_LIMIT; return Err(Error::ByteLimit { over }); } Ok(()) } } impl Parser for TxtParser { type Error = Error; fn new(reader: R) -> Result { Ok(Self::from_reader(reader)) } fn read(&mut self) -> Result, Self::Error> { loop { self.try_if_readable()?; let mut buf = String::new(); if self.reader.read_line(&mut buf)? == 0 { return Ok(None); } self.records += 1; match Url::parse(buf.as_str()) { Ok(address) => return Ok(Some(address)), Err(_) => continue, } } } fn close(self) -> Result { Ok(self.into_inner()) } } impl std::fmt::Debug for TxtParser { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("TxtParser") .field("bytes", &self.reader.reader_bytes()) .field("records", &self.records) .finish() } } #[cfg(feature = "tokio")] #[cfg_attr(docsrs, doc(cfg(feature = "tokio")))] mod tokio { use async_trait::async_trait; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; use url::Url; use crate::{ parse::{AsyncParser, TxtParser}, Error, }; #[async_trait] impl AsyncParser for TxtParser { type Error = Error; async fn new(reader: R) -> Result { Ok(Self::from_reader(reader)) } async fn read(&mut self) -> Result, Self::Error> { loop { self.try_if_readable()?; let mut buf = String::new(); if self.reader.read_line(&mut buf).await? == 0 { return Ok(None); } self.records += 1; match Url::parse(buf.as_str()) { Ok(address) => return Ok(Some(address)), Err(_) => continue, } } } async fn close(self) -> Result { Ok(self.into_inner()) } } } #[cfg(test)] mod test { use url::Url; use crate::{parse::TxtParser, Error}; #[test] fn synk() -> Result<(), Error> { use crate::parse::Parser; let buf = r#"https://www.example.com/file1.html https://www.example.com/file2.html"# .as_bytes(); let mut parser = TxtParser::new(buf)?; let url = parser.read()?; parser.close()?; let exp = Url::parse("https://www.example.com/file1.html"); assert_eq!(url, exp.ok()); Ok(()) } #[cfg(feature = "tokio")] #[tokio::test] async fn asynk() -> Result<(), Error> { use crate::parse::AsyncParser; let buf = r#"https://www.example.com/file1.html https://www.example.com/file2.html"# .as_bytes(); let mut parser = TxtParser::new(buf).await?; let url = parser.read().await?; parser.close().await?; let exp = Url::parse("https://www.example.com/file1.html"); assert_eq!(url, exp.ok()); Ok(()) } }