/*! Parses command line arguments into a structured and typed representation. */ use std::{borrow::Cow, collections::BTreeSet, ffi::OsString}; use anyhow::Context; use crate::flags::{ defs::FLAGS, hiargs::HiArgs, lowargs::{LoggingMode, LowArgs, SpecialMode}, Flag, FlagValue, }; /// The result of parsing CLI arguments. /// /// This is basically a `anyhow::Result`, but with one extra variant that is /// inhabited whenever ripgrep should execute a "special" mode. That is, when a /// user provides the `-h/--help` or `-V/--version` flags. /// /// This special variant exists to allow CLI parsing to short circuit as /// quickly as is reasonable. For example, it lets CLI parsing avoid reading /// ripgrep's configuration and converting low level arguments into a higher /// level representation. #[derive(Debug)] pub(crate) enum ParseResult { Special(SpecialMode), Ok(T), Err(anyhow::Error), } impl ParseResult { /// If this result is `Ok`, then apply `then` to it. Otherwise, return this /// result unchanged. fn and_then( self, mut then: impl FnMut(T) -> ParseResult, ) -> ParseResult { match self { ParseResult::Special(mode) => ParseResult::Special(mode), ParseResult::Ok(t) => then(t), ParseResult::Err(err) => ParseResult::Err(err), } } } /// Parse CLI arguments and convert then to their high level representation. pub(crate) fn parse() -> ParseResult { parse_low().and_then(|low| match HiArgs::from_low_args(low) { Ok(hi) => ParseResult::Ok(hi), Err(err) => ParseResult::Err(err), }) } /// Parse CLI arguments only into their low level representation. /// /// This takes configuration into account. That is, it will try to read /// `RIPGREP_CONFIG_PATH` and prepend any arguments found there to the /// arguments passed to this process. /// /// This will also set one-time global state flags, such as the log level and /// whether messages should be printed. fn parse_low() -> ParseResult { if let Err(err) = crate::logger::Logger::init() { let err = anyhow::anyhow!("failed to initialize logger: {err}"); return ParseResult::Err(err); } let parser = Parser::new(); let mut low = LowArgs::default(); if let Err(err) = parser.parse(std::env::args_os().skip(1), &mut low) { return ParseResult::Err(err); } // Even though we haven't parsed the config file yet (assuming it exists), // we can still use the arguments given on the CLI to setup ripgrep's // logging preferences. Even if the config file changes them in some way, // it's really the best we can do. This way, for example, folks can pass // `--trace` and see any messages logged during config file parsing. set_log_levels(&low); // Before we try to take configuration into account, we can bail early // if a special mode was enabled. This is basically only for version and // help output which shouldn't be impacted by extra configuration. if let Some(special) = low.special.take() { return ParseResult::Special(special); } // If the end user says no config, then respect it. if low.no_config { log::debug!("not reading config files because --no-config is present"); return ParseResult::Ok(low); } // Look for arguments from a config file. If we got nothing (whether the // file is empty or RIPGREP_CONFIG_PATH wasn't set), then we don't need // to re-parse. let config_args = crate::flags::config::args(); if config_args.is_empty() { log::debug!("no extra arguments found from configuration file"); return ParseResult::Ok(low); } // The final arguments are just the arguments from the CLI appending to // the end of the config arguments. let mut final_args = config_args; final_args.extend(std::env::args_os().skip(1)); // Now do the CLI parsing dance again. let mut low = LowArgs::default(); if let Err(err) = parser.parse(final_args.into_iter(), &mut low) { return ParseResult::Err(err); } // Reset the message and logging levels, since they could have changed. set_log_levels(&low); ParseResult::Ok(low) } /// Sets global state flags that control logging based on low-level arguments. fn set_log_levels(low: &LowArgs) { crate::messages::set_messages(!low.no_messages); crate::messages::set_ignore_messages(!low.no_ignore_messages); match low.logging { Some(LoggingMode::Trace) => { log::set_max_level(log::LevelFilter::Trace) } Some(LoggingMode::Debug) => { log::set_max_level(log::LevelFilter::Debug) } None => log::set_max_level(log::LevelFilter::Warn), } } /// Parse the sequence of CLI arguments given a low level typed set of /// arguments. /// /// This is exposed for testing that the correct low-level arguments are parsed /// from a CLI. It just runs the parser once over the CLI arguments. It doesn't /// setup logging or read from a config file. /// /// This assumes the iterator given does *not* begin with the binary name. #[cfg(test)] pub(crate) fn parse_low_raw( rawargs: impl IntoIterator>, ) -> anyhow::Result { let mut args = LowArgs::default(); Parser::new().parse(rawargs, &mut args)?; Ok(args) } /// Return the metadata for the flag of the given name. pub(super) fn lookup(name: &str) -> Option<&'static dyn Flag> { // N.B. Creating a new parser might look expensive, but it only builds // the lookup trie exactly once. That is, we get a `&'static Parser` from // `Parser::new()`. match Parser::new().find_long(name) { FlagLookup::Match(&FlagInfo { flag, .. }) => Some(flag), _ => None, } } /// A parser for turning a sequence of command line arguments into a more /// strictly typed set of arguments. #[derive(Debug)] struct Parser { /// A single map that contains all possible flag names. This includes /// short and long names, aliases and negations. This maps those names to /// indices into `info`. map: FlagMap, /// A map from IDs returned by the `map` to the corresponding flag /// information. info: Vec, } impl Parser { /// Create a new parser. /// /// This always creates the same parser and only does it once. Callers may /// call this repeatedly, and the parser will only be built once. fn new() -> &'static Parser { use std::sync::OnceLock; // Since a parser's state is immutable and completely determined by // FLAGS, and since FLAGS is a constant, we can initialize it exactly // once. static P: OnceLock = OnceLock::new(); P.get_or_init(|| { let mut infos = vec![]; for &flag in FLAGS.iter() { infos.push(FlagInfo { flag, name: Ok(flag.name_long()), kind: FlagInfoKind::Standard, }); for alias in flag.aliases() { infos.push(FlagInfo { flag, name: Ok(alias), kind: FlagInfoKind::Alias, }); } if let Some(byte) = flag.name_short() { infos.push(FlagInfo { flag, name: Err(byte), kind: FlagInfoKind::Standard, }); } if let Some(name) = flag.name_negated() { infos.push(FlagInfo { flag, name: Ok(name), kind: FlagInfoKind::Negated, }); } } let map = FlagMap::new(&infos); Parser { map, info: infos } }) } /// Parse the given CLI arguments into a low level representation. /// /// The iterator given should *not* start with the binary name. fn parse(&self, rawargs: I, args: &mut LowArgs) -> anyhow::Result<()> where I: IntoIterator, O: Into, { let mut p = lexopt::Parser::from_args(rawargs); while let Some(arg) = p.next().context("invalid CLI arguments")? { let lookup = match arg { lexopt::Arg::Value(value) => { args.positional.push(value); continue; } lexopt::Arg::Short(ch) if ch == 'h' => { // Special case -h/--help since behavior is different // based on whether short or long flag is given. args.special = Some(SpecialMode::HelpShort); continue; } lexopt::Arg::Short(ch) if ch == 'V' => { // Special case -V/--version since behavior is different // based on whether short or long flag is given. args.special = Some(SpecialMode::VersionShort); continue; } lexopt::Arg::Short(ch) => self.find_short(ch), lexopt::Arg::Long(name) if name == "help" => { // Special case -h/--help since behavior is different // based on whether short or long flag is given. args.special = Some(SpecialMode::HelpLong); continue; } lexopt::Arg::Long(name) if name == "version" => { // Special case -V/--version since behavior is different // based on whether short or long flag is given. args.special = Some(SpecialMode::VersionLong); continue; } lexopt::Arg::Long(name) => self.find_long(name), }; let mat = match lookup { FlagLookup::Match(mat) => mat, FlagLookup::UnrecognizedShort(name) => { anyhow::bail!("unrecognized flag -{name}") } FlagLookup::UnrecognizedLong(name) => { let mut msg = format!("unrecognized flag --{name}"); if let Some(suggest_msg) = suggest(&name) { msg = format!("{msg}\n\n{suggest_msg}"); } anyhow::bail!("{msg}") } }; let value = if matches!(mat.kind, FlagInfoKind::Negated) { // Negated flags are always switches, even if the non-negated // flag is not. For example, --context-separator accepts a // value, but --no-context-separator does not. FlagValue::Switch(false) } else if mat.flag.is_switch() { FlagValue::Switch(true) } else { FlagValue::Value(p.value().with_context(|| { format!("missing value for flag {mat}") })?) }; mat.flag .update(value, args) .with_context(|| format!("error parsing flag {mat}"))?; } Ok(()) } /// Look for a flag by its short name. fn find_short(&self, ch: char) -> FlagLookup<'_> { if !ch.is_ascii() { return FlagLookup::UnrecognizedShort(ch); } let byte = u8::try_from(ch).unwrap(); let Some(index) = self.map.find(&[byte]) else { return FlagLookup::UnrecognizedShort(ch); }; FlagLookup::Match(&self.info[index]) } /// Look for a flag by its long name. /// /// This also works for aliases and negated names. fn find_long(&self, name: &str) -> FlagLookup<'_> { let Some(index) = self.map.find(name.as_bytes()) else { return FlagLookup::UnrecognizedLong(name.to_string()); }; FlagLookup::Match(&self.info[index]) } } /// The result of looking up a flag name. #[derive(Debug)] enum FlagLookup<'a> { /// Lookup found a match and the metadata for the flag is attached. Match(&'a FlagInfo), /// The given short name is unrecognized. UnrecognizedShort(char), /// The given long name is unrecognized. UnrecognizedLong(String), } /// The info about a flag associated with a flag's ID in the flag map. #[derive(Debug)] struct FlagInfo { /// The flag object and its associated metadata. flag: &'static dyn Flag, /// The actual name that is stored in the Aho-Corasick automaton. When this /// is a byte, it corresponds to a short single character ASCII flag. The /// actual pattern that's in the Aho-Corasick automaton is just the single /// byte. name: Result<&'static str, u8>, /// The type of flag that is stored for the corresponding Aho-Corasick /// pattern. kind: FlagInfoKind, } /// The kind of flag that is being matched. #[derive(Debug)] enum FlagInfoKind { /// A standard flag, e.g., --passthru. Standard, /// A negation of a standard flag, e.g., --no-multiline. Negated, /// An alias for a standard flag, e.g., --passthrough. Alias, } impl std::fmt::Display for FlagInfo { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self.name { Ok(long) => write!(f, "--{long}"), Err(short) => write!(f, "-{short}", short = char::from(short)), } } } /// A map from flag names (short, long, negated and aliases) to their ID. /// /// Once an ID is known, it can be used to look up a flag's metadata in the /// parser's internal state. #[derive(Debug)] struct FlagMap { map: std::collections::HashMap, usize>, } impl FlagMap { /// Create a new map of flags for the given flag information. /// /// The index of each flag info corresponds to its ID. fn new(infos: &[FlagInfo]) -> FlagMap { let mut map = std::collections::HashMap::with_capacity(infos.len()); for (i, info) in infos.iter().enumerate() { match info.name { Ok(name) => { assert_eq!(None, map.insert(name.as_bytes().to_vec(), i)); } Err(byte) => { assert_eq!(None, map.insert(vec![byte], i)); } } } FlagMap { map } } /// Look for a match of `name` in the given Aho-Corasick automaton. /// /// This only returns a match if the one found has a length equivalent to /// the length of the name given. fn find(&self, name: &[u8]) -> Option { self.map.get(name).copied() } } /// Possibly return a message suggesting flags similar in the name to the one /// given. /// /// The one given should be a flag given by the user (without the leading /// dashes) that was unrecognized. This attempts to find existing flags that /// are similar to the one given. fn suggest(unrecognized: &str) -> Option { let similars = find_similar_names(unrecognized); if similars.is_empty() { return None; } let list = similars .into_iter() .map(|name| format!("--{name}")) .collect::>() .join(", "); Some(format!("similar flags that are available: {list}")) } /// Return a sequence of names similar to the unrecognized name given. fn find_similar_names(unrecognized: &str) -> Vec<&'static str> { // The jaccard similarity threshold at which we consider two flag names // similar enough that it's worth suggesting it to the end user. // // This value was determined by some ad hoc experimentation. It might need // further tweaking. const THRESHOLD: f64 = 0.4; let mut similar = vec![]; let bow_given = ngrams(unrecognized); for &flag in FLAGS.iter() { let name = flag.name_long(); let bow = ngrams(name); if jaccard_index(&bow_given, &bow) >= THRESHOLD { similar.push(name); } if let Some(name) = flag.name_negated() { let bow = ngrams(name); if jaccard_index(&bow_given, &bow) >= THRESHOLD { similar.push(name); } } for name in flag.aliases() { let bow = ngrams(name); if jaccard_index(&bow_given, &bow) >= THRESHOLD { similar.push(name); } } } similar } /// A "bag of words" is a set of ngrams. type BagOfWords<'a> = BTreeSet>; /// Returns the jaccard index (a measure of similarity) between sets of ngrams. fn jaccard_index(ngrams1: &BagOfWords<'_>, ngrams2: &BagOfWords<'_>) -> f64 { let union = u32::try_from(ngrams1.union(ngrams2).count()) .expect("fewer than u32::MAX flags"); let intersection = u32::try_from(ngrams1.intersection(ngrams2).count()) .expect("fewer than u32::MAX flags"); f64::from(intersection) / f64::from(union) } /// Returns all 3-grams in the slice given. /// /// If the slice doesn't contain a 3-gram, then one is artificially created by /// padding it out with a character that will never appear in a flag name. fn ngrams(flag_name: &str) -> BagOfWords<'_> { // We only allow ASCII flag names, so we can just use bytes. let slice = flag_name.as_bytes(); let seq: Vec> = match slice.len() { 0 => vec![Cow::Owned(b"!!!".to_vec())], 1 => vec![Cow::Owned(vec![slice[0], b'!', b'!'])], 2 => vec![Cow::Owned(vec![slice[0], slice[1], b'!'])], _ => slice.windows(3).map(Cow::Borrowed).collect(), }; BTreeSet::from_iter(seq) }