use std::borrow::Cow; use std::path::{Path, PathBuf}; use std::ffi::{OsStr, OsString}; /// Converts the Path `P` to a UTF-8 string which can be safely written to a file /// irrespective of whether the original Path contains unprintable characters /// or is an invalid UTF-8 string. If the Path is a valid UTF-8 string and /// contains no control characters such as `\t` it is returned as-is, otherwise /// it is encoded as a Base-64 string and given a special prefix which means /// the resultant string can be unambiguously detected as an encoded path rather /// than an actual path. This conversion can be reversed using the `decode_path` /// function. pub fn encode_path

(p: &P) -> Cow where P: AsRef { let p = p.as_ref(); if let Some(s) = p.to_str() { if !should_be_encoded(s) { return Cow::Borrowed(s); } } Cow::Owned(encode_os(p.as_os_str())) } /// Reverses the encoding of a Path performed by `encode_path`. This function /// should always be used to reverse the encoding, as it will correctly detect /// whether the string 'S' is an actual path or one that was Base-64 encoded. /// The function will only return an error if the Path was the Base-64 encoded /// form and the encoding has been tampered with. pub fn decode_path(encoded_path_string: &str) -> Result { if encoded_path_string.starts_with(PREFIX) { let bytes = decode_bytes(encoded_path_string)?; let os_str = decode_os(bytes); Ok(PathBuf::from(os_str)) } else { Ok(PathBuf::from(encoded_path_string)) } } /// Drive letters must be A-Z, single character only. Therefore this /// always represents an invalid path (note also that ':' is illegal anywhere /// in Windows paths). #[cfg(windows)] const PREFIX: &str = "::\\_"; /// On Unix (which also means BSD, Android, OSX...), filenames can contain any byte /// except '\0' and '/', which makes formulating an impossible filename very difficult /// (since we can't use a zero-byte in a printable string and '/' is the usual /// directory separator). You can even use filenames such as '/../../b64' in the shell /// and File::create() and they work ok because the '..' file in the root directory /// is a link back to the root directory making it impossible to 'escape' the /// filesystem (very clever, Unix guys). /// However, you cannot have a file under '/dev/null' because it is defined as a file /// in POSIX! http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap10.html /// Therefore any path beginning with '/dev/null' will be an invalid path. /// Baldrick levels of cunning going on here. #[cfg(not(windows))] const PREFIX: &str = "/dev/null/b64_"; /// Even if a Path can be converted to a valid UTF-8 string we still might want /// to encode it: it's difficult to write filenames with newlines or '\b' in a sensible /// manner, for example. fn should_be_encoded(s: &str) -> bool { s.chars().any(|c| c.is_control()) } #[cfg(windows)] fn encode_os(s: &OsStr) -> String { use std::os::windows::ffi::OsStrExt; let wide_chars = s.encode_wide().collect::>(); let bytes = u16_slice_to_byte_array(&wide_chars); encode_bytes(&bytes) } #[cfg(not(windows))] fn encode_os(s: &OsStr) -> String { use std::os::unix::ffi::OsStrExt; let bytes = s.as_bytes(); encode_bytes(bytes) } /// A small wrapper around the 'encode' call to the base64 library to ensure /// we do it the same way every time. fn encode_bytes(bytes: &[u8]) -> String { let mut b64 = PREFIX.to_string(); base64::encode_config_buf(bytes, base64::STANDARD, &mut b64); b64 } /// A small wrapper around the 'decode' call to the base64 library to ensure /// we do it the same way every time. The decode will not fail unless the /// previously encoded string is messed with in some way, but that is a /// distinct possibility in human-editable files, either by malice or misfortune. fn decode_bytes(encoded_str: &str) -> Result, base64::DecodeError> { let encoded_bytes = &encoded_str[PREFIX.len()..]; base64::decode_config(encoded_bytes, base64::STANDARD) } #[cfg(not(windows))] pub(crate) fn decode_os(bytes: Vec) -> OsString { use std::os::unix::ffi::OsStringExt; OsString::from_vec(bytes) } #[cfg(windows)] pub(crate) fn decode_os(bytes: Vec) -> OsString { use std::os::windows::ffi::OsStringExt; let mut wide_chars = Vec::with_capacity(bytes.len() / 2); let mut i = 0; while i < bytes.len() - 1 { let wide = bytes_to_u16(bytes[i], bytes[i + 1]); wide_chars.push(wide); i += 2; } OsString::from_wide(&wide_chars) } #[cfg(windows)] #[inline] fn bytes_to_u16(b1: u8, b2: u8) -> u16 { let result = ((b1 as u16) << 8) + b2 as u16; result } #[cfg(windows)] #[inline] fn u16_to_bytes(value: u16) -> [u8; 2] { let b1: u8 = ((value >> 8) & 0xff) as u8; let b2: u8 = (value & 0xff) as u8; return [b1, b2] } #[cfg(windows)] fn u16_slice_to_byte_array(wides: &[u16]) -> Vec { let mut bytes = Vec::with_capacity(wides.len() * 2); for &wc in wides { let a = u16_to_bytes(wc); bytes.push(a[0]); bytes.push(a[1]); } bytes } #[cfg(test)] mod tests { use std::path::PathBuf; use super::*; // On Unix, only the '\0' and '/' are invalid in filenames but any // other byte sequence is valid. // // For UTF-8 these bytes are forbidden *anywhere* in the byte sequence // (see https://en.wikipedia.org/wiki/UTF-8#Codepage_layout): // // 0xc0 (192), 0xc1 (193) // 0xf5 (245) to 0xff (255) inclusive // // Therefore sequence including such bytes will be valid paths but not a valid Rust String. // This is "Hello" followed by an invalid byte. #[cfg(unix)] const INVALID_UTF8_BYTE_SEQUENCE: [u8; 6] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xc0]; // On Windows, the following characters are invalid in filenames according to // https://docs.microsoft.com/en-us/windows/desktop/fileio/naming-a-file // // < (less than) // > (greater than) // : (colon - sometimes works, but is actually NTFS Alternate Data Streams) // " (double quote) // / (forward slash) // \ (backslash) // | (vertical bar or pipe) // ? (question mark) // * (asterisk) // // However, note that these are all printable characters. // Windows also bans bytes 0..31 (the ASCII control characters) - so no // tabs, bells or newlines in filenames. // // On Windows, paths are UTF-16-le, not UTF-8. So we need to make a UTF-16 // string that is not a valid UTF-8 string. // This is an invalid byte sequence according to http://unicode.org/faq/utf_bom.html#utf16-7 // path.display() works, and prints "Hello\u{d800}H", but path.to_str() will return None. // Windows will accept this as a valid path, but it is not a valid Rust String. #[cfg(windows)] const INVALID_UTF16_BYTE_SEQUENCE: [u16; 7] = [0x48, 0x65, 0x6c, 0x6c, 0x6f, 0xd800, 0x48]; // "Hello\u{d800}H" #[test] fn for_utf8_which_does_not_need_encoding() { let pb = PathBuf::new(); let s = encode_path(&pb); assert_eq!(s, "", "Empty paths should be empty strings."); let pb2 = decode_path(&s).unwrap(); assert_eq!(pb2, pb, "Empty paths should be round-trippable."); let pb = PathBuf::from("hello"); let s = encode_path(&pb); assert_eq!(s, "hello", "Valid UTF-8 paths without control chars should be encoded as-is."); let pb2 = decode_path(&s).unwrap(); assert_eq!(pb2, pb, "Valid UTF-8 paths without control chars should be round-trippable."); } #[cfg(unix)] #[test] fn for_valid_utf8_needing_unix_encoding() { // There are separate Unix and Windows tests because on Windows a valid UTF-8 string // will still be treated as UTF-16 wide chars by the time it is encoded. let pb = PathBuf::from("hello\tworld"); let s = encode_path(&pb); assert_eq!(s, format!("{}aGVsbG8Jd29ybGQ=", PREFIX), "Paths with control characters in them should be base-64 encoded."); let pb2 = decode_path(&s).unwrap(); assert_eq!(pb2, pb, "Paths with control characters in them should be round-trippable."); } #[cfg(windows)] #[test] fn for_valid_utf8_needing_windows_encoding() { // There are separate Unix and Windows tests because on Windows a valid UTF-8 string // will still be treated as UTF-16 wide chars by the time it is encoded. let pb = PathBuf::from("hello\tworld"); let s = path_to_path_string(&pb); assert_eq!(s, format!("{}AGgAZQBsAGwAbwAJAHcAbwByAGwAZA==", PREFIX), "Paths with control characters in them should be base-64 encoded."); let pb2 = path_string_to_path_buf(&s); assert_eq!(pb2, pb, "Paths with control characters in them should be round-trippable."); } #[cfg(unix)] #[test] fn for_invalid_utf8() { let os = decode_os(INVALID_UTF8_BYTE_SEQUENCE.to_vec()); let pb = PathBuf::from(os); let s = encode_path(&pb); assert_eq!(s, format!("{}SGVsbG/A", PREFIX), "Invalid UTF-8 byte sequences should be base-64 encoded."); let pb2 = decode_path(&s).unwrap(); assert_eq!(pb2, pb, "Invalid UTF-8 byte sequences should be round-trippable."); } #[cfg(windows)] #[test] fn for_invalid_utf16() { let bytes = u16_slice_to_byte_array(&INVALID_UTF16_BYTE_SEQUENCE); let os = decode_os(bytes); let pb = PathBuf::from(os); let s = encode_path(&pb); assert_eq!(s, format!("{}AEgAZQBsAGwAb9gAAEg=", PREFIX), "Invalid UTF-16 byte sequences should be base-64 encoded."); let pb2 = decode_path(&s); assert_eq!(pb2, pb, "Invalid UTF-16 byte sequences should be round-trippable."); } #[cfg(unix)] #[test] fn decode_for_mangled_base64_returns_err() { // Create a path that will get Base-64 encoded. // \x11 is just a random control character. let mut s = encode_path(&"Hello\x11world").into_owned(); // Mangle the encoded string, as if a user manually edited it. s.push('\t'); let decode_attempt = decode_path(&s); assert!(decode_attempt.is_err(), "Tabs are not valid in Base-64 encoded strings, so we should get an error when decoding it."); } }