/* * SPDX-FileCopyrightText: Peter Pentchev * SPDX-License-Identifier: BSD-2-Clause */ //! Detect a UTF-8-capable locale. #![allow(clippy::module_name_repetitions)] use std::collections::{HashMap, HashSet}; use std::env; use std::hash::BuildHasher; use std::io::Error as IoError; use std::process::{Command, Stdio}; use anyhow::{anyhow, Error as AnyError}; use encoding::all::ISO_8859_1; use encoding::{DecoderTrap, Encoding}; use once_cell::sync::Lazy; use regex::{Error as RegexError, Regex}; use thiserror::Error; /// An error that occurred while examining the environment or locales. #[derive(Debug, Error)] #[non_exhaustive] pub enum UErr { /// Could not decode the list of locales output by `locale -a`. #[error("Could not decode the obtained list of locales")] DecodeLocaleList(#[source] AnyError), /// Invalid value for an environment variable. #[error("The {0} environment variable's value is not a valid string")] InvalidEnvValue(String), /// Something went really, really wrong... #[error("Internal utf8-locale error: {0}")] Internal(String), /// Could not compile a regular expression. #[error("Internal error: could not compile the {0} regular expression")] Regex(String, #[source] RegexError), /// Could not extract a captured group out of a regular expression. #[error("Internal error: could not extract the '{0}' regex group out of {1}")] RegexCaptures(String, String), /// Could not run a program, e.g. `locale -a`. #[error("Could not run the `{0}` program")] RunProgram(String, #[source] IoError), } /// The variables examined by the [`LanguagesDetect`] class by default. pub const LOCALE_VARIABLES: [&str; 14] = [ "LC_ALL", "LANG", "LC_MESSAGES", "LC_COLLATE", "LC_NAME", "LC_IDENTIFICATION", "LC_CTYPE", "LC_NUMERIC", "LC_TIME", "LC_MONETARY", "LC_PAPER", "LC_ADDRESS", "LC_TELEPHONE", "LC_MEASUREMENT", ]; /// The encodings recognized as UTF-8 for the various locale distributions. 
pub const UTF8_ENCODINGS: [&str; 2] = ["UTF-8", "utf8"];

/// The list of preferred languages used by the [`Utf8Detect`] struct by default.
pub const UTF8_LANGUAGES: [&str; 5] = ["C", "en", "de", "es", "it"];

/// Break a locale name down into components.
///
/// The pattern is written in verbose (`(?x)`) mode, so the whitespace in it
/// is not significant. It matches a whole locale name of the form
/// `lang[_territory][.codeset][@modifier]` and exposes the components as
/// the `lang`, `territory`, `codeset`, and `modifier` named groups; only
/// `lang` is mandatory.
pub const RE_LOCALE_NAME: &str = r"(?x) ^
    (?P<lang> [a-zA-Z0-9]+ )
    (?:
        _
        (?P<territory> [a-zA-Z0-9]+ )
    )?
    (?:
        \.
        (?P<codeset> [a-zA-Z0-9-]+ )
    )?
    (?:
        @
        (?P<modifier> [a-zA-Z0-9]+ )
    )?
$ ";

/// Initialize the language weights array in order.
///
/// Each distinct language gets a weight equal to its position among
/// the distinct languages seen so far (lower is better); repeated names keep
/// the weight of their first occurrence. The second element of the returned
/// pair is the number of distinct languages, i.e. a weight that is worse than
/// that of any listed language.
fn build_weights(langs: &[&str]) -> (HashMap<String, usize>, usize) {
    let mut res: HashMap<String, usize> = HashMap::new();
    for lang in langs {
        // The map only grows when a new language is seen, so its current
        // size is exactly the next free weight.
        let weight = res.len();
        res.entry((*lang).to_owned()).or_insert(weight);
    }
    let unweight = res.len();
    (res, unweight)
}

/// Get the regular expression used to parse a locale name.
///
/// The expression is compiled once, on first use, and then shared.
///
/// # Errors
///
/// [`UErr::Regex`] on failure to compile a built-in regular expression.
fn get_re_name() -> Result<&'static Regex, UErr> {
    /// The regular expression used for parsing a locale name.
    static RE_NAME: Lazy<Result<Regex, RegexError>> = Lazy::new(|| Regex::new(RE_LOCALE_NAME));

    // The compilation result lives in a static, so a failure can only be
    // handed out by value as a clone of the stored error.
    RE_NAME
        .as_ref()
        .map_err(|err| UErr::Regex("locale name".to_owned(), err.clone()))
}

/// Get a locale name that may hopefully be used for UTF-8 output.
///
/// The [`detect_utf8_locale()`] function runs the external `locale` command to
/// obtain a list of the supported locale names, and then picks a suitable one
/// to use so that programs are more likely to output valid UTF-8 characters
/// and language-neutral messages. It prefers the `C` base locale, but if
/// neither `C.UTF-8` nor `C.utf8` is available, it will fall back to a list of
/// other locale names that are likely to be present on the system. Note that
/// the [`Utf8Detect`] struct is the preferred way of doing this.
///
/// The [`UTF8_LANGUAGES`] variable contains a list of default languages in
/// order of preference that the [`Utf8Detect`] struct passes to this function by
/// default.
///
/// # Errors
///
/// [`UErr::RunProgram`] if `locale -a` could not be executed.
/// [`UErr::DecodeLocaleList`] if the output of `locale -a` could not be decoded as
/// ISO-8859-1 text.
/// [`UErr::Regex`] on failure to compile a built-in regular expression.
/// [`UErr::RegexCaptures`] on failure to extract a captured group out of
/// a successful regular expression match.
#[inline]
pub fn detect_utf8_locale(languages: &[&str]) -> Result<String, UErr> {
    let re_name = get_re_name()?;
    let (weights, unweight) = build_weights(languages);

    // Only the standard output is captured; anything `locale` complains
    // about goes straight to our own standard error stream.
    let raw = Command::new("locale")
        .arg("-a")
        .stderr(Stdio::inherit())
        .output()
        .map_err(|err| UErr::RunProgram("locale -a".to_owned(), err))?
        .stdout;
    let text = ISO_8859_1
        .decode(&raw, DecoderTrap::Strict)
        .map_err(|err| UErr::DecodeLocaleList(anyhow!("Could not decode a string: {err}")))?;

    // Start out with the plain "C" locale and a weight that any of
    // the requested languages will beat.
    let mut best_name = "C".to_owned();
    let mut best_weight = unweight;
    for line in text.lines() {
        // Lines that do not look like a locale name are ignored.
        let caps = match re_name.captures(line) {
            Some(caps) => caps,
            None => continue,
        };

        // Only locales that explicitly specify a UTF-8 codeset qualify.
        let is_utf8 = caps
            .name("codeset")
            .map_or(false, |codeset| UTF8_ENCODINGS.contains(&codeset.as_str()));
        if !is_utf8 {
            continue;
        }

        let lang = caps
            .name("lang")
            .ok_or_else(|| UErr::RegexCaptures("lang".to_owned(), format!("{caps:?}")))?
            .as_str();

        // A strict comparison: among locales for the same language,
        // the first one listed by `locale -a` wins.
        if let Some(&weight) = weights.get(lang) {
            if weight < best_weight {
                best_name = line.to_owned();
                best_weight = weight;
            }
        }
    }
    Ok(best_name)
}

/// Prepare the environment variables that need to be changed.
///
/// The [`get_utf8_vars()`] function invokes [`detect_utf8_locale()`] and
/// then returns a hashmap with `LC_ALL` set to the obtained locale name and
/// `LANGUAGE` set to an empty string so that recent versions of the gettext
/// library do not choose a different language to output messages in.
///
/// # Errors
///
/// Propagates errors returned by [`detect_utf8_locale()`].
#[inline] pub fn get_utf8_vars(languages: &[&str]) -> Result, UErr> { let loc = detect_utf8_locale(languages)?; let arr = [ ("LC_ALL".to_owned(), loc), ("LANGUAGE".to_owned(), String::new()), ]; Ok(arr.into_iter().collect()) } /// Prepare the environment to run subprocesses in. /// /// The [`get_utf8_env()`] function invokes [`detect_utf8_locale()`] and then /// returns a hashmap based on [`std::env::vars()`], but with `LC_ALL` set to /// the obtained locale name and `LANGUAGE` set to an empty string so that /// recent versions of the gettext library do not choose a different language /// to output messages in. Note that the [`Utf8Detect`] class is the preferred /// way of doing this. /// /// # Errors /// /// Propagates errors returned by [`get_utf8_vars()`]. #[inline] pub fn get_utf8_env(languages: &[&str]) -> Result, UErr> { Ok(env::vars().chain(get_utf8_vars(languages)?).collect()) } /// Determine preferred languages as per the current locale settings. /// /// The [`get_preferred_languages()`] function examines the specified /// hashmap of environment variables and returns a list of /// the languages specified in the locale variables (`LC_ALL`, `LANG`, /// `LC_MESSAGES`, etc) in order of preference as defined by either /// the `names` parameter. Note that the [`LanguagesDetect`] class is /// the preferred way of doing this. /// /// Note that "C" is always appended to the end of the list if it is not /// already present. /// /// # Errors /// /// [`UErr::Regex`] on failure to compile a built-in regular expression. 
#[inline] pub fn get_preferred_languages( env: &HashMap, names: &[&str], ) -> Result, UErr> { let re_name = get_re_name()?; let mut res: Vec = Vec::new(); for name in names { if let Some(value) = env.get(&(*name).to_owned()) { if let Some(caps) = re_name.captures(value) { let cap = |group| { caps.name(group) .ok_or_else(|| UErr::RegexCaptures(group.to_owned(), format!("{caps:?}"))) }; if UTF8_ENCODINGS.contains(&cap("codeset")?.as_str()) { let lang = cap("lang")?.as_str().to_owned(); if !res.contains(&lang) { res.push(lang); } } } } } /* Make sure "C" is always in the list. */ if !res.contains(&"C".to_owned()) { res.push("C".to_owned()); } Ok(res) } /// Determine preferred languages as per the current locale settings. /// /// This class is used to invoke the [`get_preferred_languages()`] function /// with reasonable default values: the current process environment and /// the default [`LOCALE_VARIABLES`] list of variable names, with the option /// of overriding either. #[derive(Debug, Default)] #[non_exhaustive] pub struct LanguagesDetect<'names> { /// The environment variables to examine instead of [`mod@std::env`]. pub env: Option>, /// The names of locale variables to use instead of the defaults. pub names: Option<&'names [&'names str]>, } impl<'names> LanguagesDetect<'names> { /// Prepare to detect languages in the default manner. #[inline] #[must_use] pub fn new() -> Self { Self::default() } /// Detect the preferred languages according to the specified settings. /// /// # Errors /// /// Returns an error if one of the required environment variables has /// a value that is not a valid UTF-8 string. 
#[inline] pub fn detect(self) -> Result, UErr> { let qnames = self.names.unwrap_or(&LOCALE_VARIABLES); let qenv = self.env.map_or_else( || { let vars: HashSet = LOCALE_VARIABLES .iter() .map(|name| (*name).to_owned()) .collect(); env::vars_os() .filter_map(|(os_name, os_value)| { os_name.to_str().and_then(|name| { vars.contains(name).then(|| { os_value.to_str().map_or_else( || Err(UErr::InvalidEnvValue(name.to_owned())), |value| Ok((name.to_owned(), value.to_owned())), ) }) }) }) .collect::>() }, Ok, )?; get_preferred_languages(&qenv, qnames) } /// Specify the environment variables to examine instead of [`mod@std::env`]. #[allow(clippy::missing_const_for_fn)] #[inline] #[must_use] pub fn with_env(self, env: HashMap) -> Self { Self { env: Some(env), ..self } } /// Specify the names of the environment variables to look at instead of /// [`LOCALE_VARIABLES`]. #[allow(clippy::missing_const_for_fn)] #[inline] #[must_use] pub fn with_names(self, names: &'names [&'names str]) -> Self { Self { names: Some(names), ..self } } } /// Information about an available UTF-8 environment. #[derive(Debug)] #[non_exhaustive] pub struct Utf8Environment { /// The environment to run a child process in. pub env: HashMap, /// The environment variables that need to be updated. pub env_vars: HashMap, /// The name of the UTF-8 locale. pub locale: String, } /// Determine a UTF-8 locale to use and prepare the environment variables. /// /// This class holds an optional list of preferred languages (if none is /// specified, the [`Utf8Detect::detect()`] method uses the ones in /// the [`UTF8_LANGUAGES`] variable by default), and an optional map of /// environment variables to augment (if none is specified, the current process /// environment is used by default). #[derive(Debug, Default)] #[non_exhaustive] pub struct Utf8Detect { /// The environment variables to use instead of the current process's ones. pub env: Option>, /// The languages to look for in order of preference. 
pub languages: Option>, } impl Utf8Detect { /// Prepare to detect a locale in the default manner. #[inline] #[must_use] pub fn new() -> Self { Self::default() } /// Detect a UTF-8 locale, prepare the environment. /// This method invokes the (mostly internal) [`get_utf8_vars()`] function /// which, in turn, invokes the (again mostly internal) [`detect_utf8_locale()`] /// one, which uses the external `locale` utility to obtain the list of /// available locales and then picks a UTF-8-capable one according to /// the list of preferred languages. /// /// # Errors /// /// Propagate errors returned by [`get_utf8_vars()`]. #[inline] pub fn detect(self) -> Result { let env_vars = match self.languages { None => get_utf8_vars(&UTF8_LANGUAGES)?, Some(langs) => { let lvec: Vec<&str> = langs.iter().map(|lang| &**lang).collect(); get_utf8_vars(&lvec)? } }; let renv = self.env.unwrap_or_else(|| env::vars().collect()); let locale = env_vars .get("LC_ALL") .ok_or_else(|| { UErr::Internal(format!( "Internal error: no 'LC_ALL' after successful detection: {env_vars:?}" )) })? .to_string(); Ok(Utf8Environment { env: renv .into_iter() .chain( env_vars .iter() .map(|(name, value)| (name.to_string(), value.to_string())), ) .collect(), env_vars, locale, }) } /// Specify the environment variables to record. #[allow(clippy::missing_const_for_fn)] #[inline] #[must_use] pub fn with_env(self, env: HashMap) -> Self { Self { env: Some(env), ..self } } /// Specify the preferred languages to look for among the locales. #[inline] #[must_use] pub fn with_languages(self, langs: Vec) -> Self { Self { languages: Some(langs), ..self } } }