use std::fmt; use bstr::ByteSlice; use nom::branch::{alt, Alt}; use nom::bytes::complete::{tag, tag_no_case, take_while}; use nom::character::complete::{space0, space1}; use nom::combinator::{eof, opt}; use nom::error::{Error as NomError, ParseError as NomParseError}; use nom::multi::many_till; use nom::sequence::preceded; use nom::{Err as NomErr, IResult as NomResult}; /// The `Directive` enum represents every supported `robots.txt` directive. // TODO: Attach position. #[derive(Clone, Copy, PartialEq, Eq)] pub enum Directive<'a> { UserAgent(&'a [u8]), Allow(&'a [u8]), Disallow(&'a [u8]), CrawlDelay(&'a [u8]), Sitemap(&'a [u8]), Unknown(&'a [u8]), } impl fmt::Debug for Directive<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let (label, slice) = match self { Self::UserAgent(x) => ("User-Agent", x), Self::Allow(x) => ("Allow", x), Self::Disallow(x) => ("Disallow", x), Self::CrawlDelay(x) => ("Crawl-Delay", x), Self::Sitemap(x) => ("Sitemap", x), Self::Unknown(x) => ("Unknown", x), }; f.debug_tuple(label).field(&slice.as_bstr()).finish() } } const CARRIAGE: u8 = b'\r'; const NEWLINE: u8 = b'\n'; const COMMENT: u8 = b'#'; /// Returns true if the character code is neither a newline nor a carriage return. fn not_line_ending(c: u8) -> bool { c != NEWLINE && c != CARRIAGE } /// Returns true if the character code is neither a newline, a carriage return, /// nor a comment character. fn not_line_ending_or_comment(c: u8) -> bool { c != NEWLINE && c != CARRIAGE && c != COMMENT } /// Consumes every character until a newline. fn consume_newline(input: &[u8]) -> NomResult<&[u8], Option<&[u8]>> { let (input, _) = take_while(|i| i == CARRIAGE)(input)?; let (input, output) = opt(tag(b"\n"))(input)?; Ok((input, output)) } #[derive(Debug)] pub struct Lexer; impl Lexer { /// Parses the input slice into the list of directives. /// /// # Safety /// /// Discards the possibility of any error as [`unknown`] consumes anything. pub fn parse_tokens(input: &[u8]) -> Vec { match Self::lex(input) { Ok((_, directives)) => directives, Err(_) => unreachable!(), // Vec::default() } } /// Parses the input slice into the list of directives. fn lex(input: &[u8]) -> NomResult<&[u8], Vec> { // Removes the byte order mark (BOM). let (input, _) = opt(tag(b"\xef"))(input)?; let (input, _) = opt(tag(b"\xbb"))(input)?; let (input, _) = opt(tag(b"\xbf"))(input)?; // Creates and runs the matcher. let matcher = alt(( Self::user_agent, Self::allow, Self::disallow, Self::crawl_delay, Self::sitemap, Self::unknown, )); let (input, (directives, _)) = many_till(matcher, eof)(input)?; Ok((input, directives)) } /// Attempts to parse the `User-Agent` directive. fn user_agent(input: &[u8]) -> NomResult<&[u8], Directive> { let spellings = ( tag_no_case("user-agent"), tag_no_case("user agent"), tag_no_case("useragent"), ); let (input, agent) = Self::builder(input, spellings)?; Ok((input, Directive::UserAgent(agent))) } /// Attempts to parse the `Allow` directive. fn allow(input: &[u8]) -> NomResult<&[u8], Directive> { let spellings = ( tag_no_case("allow"), tag_no_case("alow"), tag_no_case("allaw"), ); let (input, rule) = Self::builder(input, spellings)?; Ok((input, Directive::Allow(rule))) } /// Attempts to parse the `Disallow` directive. fn disallow(input: &[u8]) -> NomResult<&[u8], Directive> { let spellings = ( tag_no_case("disallow"), tag_no_case("dissallow"), tag_no_case("dissalow"), tag_no_case("disalow"), tag_no_case("diasllow"), tag_no_case("disallaw"), ); // Empty disallow is equivalent to allow all. // https://moz.com/learn/seo/robotstxt let (input, rule) = Self::builder(input, spellings)?; if rule.is_empty() { Ok((input, Directive::Allow(b"/"))) } else { Ok((input, Directive::Disallow(rule))) } } /// Attempts to parse the `Crawl-Delay` directive. fn crawl_delay(input: &[u8]) -> NomResult<&[u8], Directive> { let spellings = ( tag_no_case("crawl-delay"), tag_no_case("crawl delay"), tag_no_case("crawldelay"), ); let (input, delay) = Self::builder(input, spellings)?; Ok((input, Directive::CrawlDelay(delay))) } /// Attempts to parse the `Sitemap` directive. fn sitemap(input: &[u8]) -> NomResult<&[u8], Directive> { let spellings = ( tag_no_case("sitemap"), tag_no_case("site-map"), tag_no_case("site map"), ); let (input, sitemap) = Self::builder(input, spellings)?; Ok((input, Directive::Sitemap(sitemap))) } /// Consumes the rest of the line as no directives were found here. fn unknown(input: &[u8]) -> NomResult<&[u8], Directive> { let (input, unknown) = take_while(not_line_ending)(input)?; let (input, _) = consume_newline(input)?; Ok((input, Directive::Unknown(unknown))) } /// Attempts to match `spellings` to the `input` slice. /// Used to simplify individual directive parsers. fn builder<'a, O, E: NomParseError<&'a [u8]>>( input: &'a [u8], spellings: impl Alt<&'a [u8], O, E>, ) -> NomResult<&'a [u8], &'a [u8]> where NomErr>: From>, { // Tries to match to the spelling list. let (input, _) = preceded(space0, alt(spellings))(input)?; // Tries to match the separator (colon or spaces). let (input, _) = alt((preceded(space0, tag(b":")), space1))(input)?; // Tries to retrieve the value of the kv pair. let (input, line) = take_while(not_line_ending_or_comment)(input)?; // Skips the rest. let (input, _) = opt(preceded(tag(b"#"), take_while(not_line_ending)))(input)?; let (input, _) = consume_newline(input)?; let line = line.trim(); Ok((input, line)) } } #[cfg(test)] mod lexing { use super::*; #[test] fn single() { let r = b"user-agent: robotxt"; let r = Lexer::parse_tokens(r); let ua = b"robotxt"; let ua = Directive::UserAgent(ua); assert_eq!(r, vec![ua]); } #[test] fn empty() { let r = b" user-agent: robotxt\n user-agent: robotxt"; let r = Lexer::parse_tokens(r); let ua = b"robotxt"; let ua = Directive::UserAgent(ua); let em = Directive::Unknown(b""); assert_eq!(r, vec![em, ua, em, ua]); } }