use std::cmp::min; use std::time::Duration; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use url::Url; use crate::parse::lexer::Lexer; use crate::parse::parser::Parser; use crate::parse::rule::Rule; use crate::paths::normalize_path; use crate::BYTE_LIMIT; /// The [`Rules`] enum determines if the [RobotsInner::is_allowed] results /// from the set of [`Rule`]s or the single provided global rule. #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) enum Rules { Rules(Vec), Always(bool), } /// The [`RobotsInner`] struct provides convenient and efficient storage for /// the data associated with certain user-agent for further matching. #[derive(Debug, Clone, PartialEq, Eq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RobotsInner { user_agent: String, #[cfg_attr(feature = "serde", serde(flatten))] rules: Rules, crawl_delay: Option, sitemaps: Vec, } impl RobotsInner { /// Creates a new [`RobotsInner`] from the byte slice. pub fn from_bytes(robots: &[u8], user_agent: &str) -> Self { // Limits the input to 500 kibibytes. let limit = min(robots.len(), BYTE_LIMIT); let robots = &robots[0..limit]; // Replaces '\x00' with '\n'. let robots: Vec<_> = robots .iter() .map(|u| match u { b'\x00' => b'\n', v => *v, }) .collect(); let directives = Lexer::parse_tokens(&robots); let state = Parser::parse_rules(&directives, user_agent); Self { user_agent: state.longest_match, rules: Self::optimize(state.rules), crawl_delay: state.crawl_delay, sitemaps: state.sitemaps, } } // Applies optimizations if enabled. fn optimize(rules: Vec) -> Rules { // TODO: Remove overlapping rules. #[cfg(feature = "optimal")] if rules.is_empty() || rules.iter().all(|r| r.is_allowed()) { // Empty or all allow. return Rules::Always(true); } else if rules.iter().all(|r| !r.is_allowed()) && rules.iter().rev().any(|r| r.is_universal()) { // All disallow + universal disallow. // Universal rule should be one of the smallest, so reverse the iter. return Rules::Always(false); } Rules::Rules(rules) } /// Creates a new [`RobotsInner`] from the global rule. pub fn from_always(always: bool, crawl_delay: Option, user_agent: &str) -> Self { Self { user_agent: user_agent.to_string(), rules: Rules::Always(always), crawl_delay, sitemaps: Vec::default(), } } /// Returns `Some(true)` if there is an explicit `allow` or the global rule. /// NOTE: Expects relative path. pub fn try_is_allowed(&self, path: &str) -> Option { match self.rules { Rules::Always(always) => Some(always), Rules::Rules(ref rules) => match normalize_path(path).as_str() { "/robots.txt" => Some(true), path => rules .iter() .find(|r| r.is_match(path)) .map(|rule| rule.is_allowed()), }, } } /// Returns true if the relative path is allowed for this set of rules. /// NOTE: Expects relative path. pub fn is_allowed(&self, path: &str) -> bool { // Returns true is there is no rule matching the path. self.try_is_allowed(path).unwrap_or(true) } /// Returns `Some(_)` if the rules fully allow or disallow. pub fn is_always(&self) -> Option { match &self.rules { Rules::Rules(_) => None, Rules::Always(always) => Some(*always), } } /// Returns the longest matching user-agent. pub fn user_agent(&self) -> &str { self.user_agent.as_ref() } /// Returns the specified crawl-delay. pub fn crawl_delay(&self) -> Option { self.crawl_delay } /// Returns all collected sitemaps. pub fn sitemaps(&self) -> &[Url] { self.sitemaps.as_slice() } /// Returns the total amount of applied rules unless constructed /// with (or optimized to) the global rule. pub fn len(&self) -> Option { match &self.rules { Rules::Rules(vec) => Some(vec.len()), Rules::Always(_) => None, } } /// Returns true if there are no applied rules i.e. it is constructed /// with (or optimized to) the global rule. pub fn is_empty(&self) -> Option { self.len().map(|len| len == 0) } } #[cfg(test)] #[cfg(feature = "optimal")] mod optimal_output { use super::*; use crate::ALL_UAS; #[test] fn from() { let r = RobotsInner::from_always(true, None, "foo"); assert_eq!(r.is_always(), Some(true)); let r = RobotsInner::from_always(false, None, "foo"); assert_eq!(r.is_always(), Some(false)); } #[test] fn empty() { let r = RobotsInner::from_bytes(b"", ALL_UAS); assert_eq!(r.is_always(), Some(true)); } #[test] fn allow() { let t = b"Allow: / \n Allow: /foo"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert_eq!(r.is_always(), Some(true)); } #[test] fn disallow_all() { let t = b"Disallow: /* \n Disallow: /foo"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert_eq!(r.is_always(), Some(false)); } #[test] fn disallow_exc() { let t = b"Disallow: /* \n Allow: /foo"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert_eq!(r.is_always(), None); } } #[cfg(test)] mod precedence_rules { use super::*; use crate::ALL_UAS; #[test] fn simple() { let t = b"Allow: /p \n Disallow: /"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(r.is_allowed("/page")); } #[test] fn restrictive() { let t = b"Allow: /folder \n Disallow: /folder"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(r.is_allowed("/folder/page")); } #[test] fn restrictive2() { let t = b"Allow: /page \n Disallow: /*.ph"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(r.is_allowed("/page.php5")); } #[test] fn longer() { let t = b"Allow: /page \n Disallow: /*.htm"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(!r.is_allowed("/page.htm")); } #[test] fn specific() { let t = b"Allow: /$ \n Disallow: /"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(r.is_allowed("/")); } #[test] fn specific2() { let t = b"Allow: /$ \n Disallow: /"; let r = RobotsInner::from_bytes(t, ALL_UAS); assert!(!r.is_allowed("/page.htm")); } } #[cfg(test)] mod precedence_agents { use super::*; static TXT: &[u8] = br#""" User-Agent: bot-robotxt Allow: /1 Disallow: / User-Agent: * Allow: /2 Disallow: / User-Agent: bot Allow: /3 Disallow: / """#; #[test] fn specific() { let r = RobotsInner::from_bytes(TXT, "bot-robotxt"); // Matches: assert!(r.is_allowed("/1")); // Doesn't match: assert!(!r.is_allowed("/2")); assert!(!r.is_allowed("/3")); } #[test] fn strict() { let r = RobotsInner::from_bytes(TXT, "bot"); // Matches: assert!(r.is_allowed("/3")); // Doesn't match: assert!(!r.is_allowed("/1")); assert!(!r.is_allowed("/2")); } #[test] fn missing() { let r = RobotsInner::from_bytes(TXT, "super-bot"); // Matches: assert!(r.is_allowed("/2")); // Doesn't match: assert!(!r.is_allowed("/1")); assert!(!r.is_allowed("/3")); } #[test] fn partial() { let r = RobotsInner::from_bytes(TXT, "bot-super"); // Matches: assert!(r.is_allowed("/3")); // Doesn't match: assert!(!r.is_allowed("/1")); assert!(!r.is_allowed("/2")); } }