// SPDX-License-Identifier: MPL-2.0
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, PatternID};
use imara_diff::{
    intern::{Interner, Token},
    Algorithm,
};
use memchr::memmem;

/// Constant-evaluable string equality: `==` on `&str` is not usable in
/// `const` contexts, so compare the byte slices manually.
#[allow(dead_code)] // it IS used in `split_into_tokens_corasick`
const fn const_str_equals(a: &str, b: &str) -> bool {
    let mut i = 0;
    // walk both strings in lockstep until the shorter one is exhausted
    while i < a.len() && i < b.len() {
        if a.as_bytes()[i] != b.as_bytes()[i] {
            return false;
        }
        i += 1;
    }
    // equal only if both strings ended at the same index (same length)
    i == a.len() && i == b.len()
}

/// Replace all occurrences of `from` with `to` in `input`.
///
/// This function is optimized for the case where no replacements are made.
///
/// # Arguments
///
/// * `input` - The input string to search for replacements.
/// * `from` - The `Finder` to search for. Must be created from valid UTF-8.
/// * `to` - The string to replace `from` with.
/// * `scratch_buffer` - A buffer to store the result in. Is expected to be empty.
///
/// # Returns
///
/// A tuple containing the modified `input` and the `clear`ed `scratch_buffer`.
///
/// # Panics
///
/// Might panic if `from` is not valid UTF-8.
fn str_replace_opt(
    input: String,
    from: &memmem::Finder,
    to: &str,
    scratch_buffer: String,
) -> (String, String) {
    // caller does not care whether anything was replaced; discard the flag
    let mut _ignored = false;
    str_replace_opt_ext(input, from, to, scratch_buffer, &mut _ignored)
}

/// Like [`str_replace_opt`], but additionally reports through `did_replace`
/// whether at least one occurrence of `from` was found and replaced.
///
/// Returns `(result, empty buffer)`: when no match is found the original
/// `input` is handed back untouched and the scratch buffer stays empty, so
/// the two `String`s can be swapped back and forth between repeated calls
/// without new allocations.
fn str_replace_opt_ext(
    mut input: String,
    from: &memmem::Finder,
    to: &str,
    scratch_buffer: String,
    did_replace: &mut bool,
) -> (String, String) {
    let mut result = scratch_buffer;
    let mut last_end = 0;
    for m in from.find_iter(input.as_bytes()) {
        let start = m;
        let end = start + from.needle().len();
        // string indexing could panic if the Finder is not valid UTF-8
        result.push_str(&input[last_end..start]);
        result.push_str(to);
        last_end = end;
    }
    if last_end == 0 {
        // no replacements were made
        *did_replace = false;
        // no need to clear the scratch buffer, since it's already empty
        (input, result)
    } else {
        *did_replace = true;
        // copy the remaining text
        result.push_str(&input[last_end..]);
        input.clear();
        (result, input)
    }
}

/// Yields a `&'static` lazily-initialized [`memmem::Finder`] for a needle
/// expression, so each needle is preprocessed only once per process.
macro_rules! finder {
    ($needle:expr) => {{
        static FINDER: LazyLock = LazyLock::new(|| memmem::Finder::new($needle.as_bytes()));
        &FINDER
    }};
}

/// Find all `regex` matches in `input` and replace them with the result of `replacement`.
///
/// This function is optimized for the case where no replacements are made and intended for `replacement`s
/// that have capture groups. For `replacement`s that don't have capture groups, further optimization is possible.
///
/// # Arguments
///
/// * `input` - The input string to search for replacements.
/// * `regex` - The regex to search for.
/// * `replacement` - The replacer to use for replacements.
/// * `scratch_buffer` - A buffer to store the result in. Is expected to be empty.
///
/// # Returns
///
/// A tuple containing the modified `input` and the `clear`ed `scratch_buffer`.
fn regex_replace_opt(
    mut input: String,
    regex: &Regex,
    mut replacement: R,
    scratch_buffer: String,
) -> (String, String) {
    // peek first so the common no-match case touches neither buffer
    let mut capt_iter = regex.captures_iter(&input).peekable();
    if capt_iter.peek().is_none() {
        // no matches found, return early
        // no need to clear the scratch buffer, since it's already empty
        (input, scratch_buffer)
    } else {
        let mut result = scratch_buffer;
        let mut last_end = 0;
        for cap in capt_iter {
            // group 0 (the whole match) always exists
            let m = cap.get(0).unwrap();
            let start = m.start();
            let end = m.end();
            result.push_str(&input[last_end..start]);
            replacement.replace_append(&cap, &mut result);
            last_end = end;
        }
        // copy the remaining text
        result.push_str(&input[last_end..]);
        input.clear();
        (result, input)
    }
}

/// Content hash identifying a revision; either hash algorithm may be used.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) enum RevisionHash {
    Sha1(Sha1Hash),
    Blake3(blake3::Hash),
}

/// Split the input text into paragraphs.
///
/// # Arguments
///
/// * `text` - The input text to split.
/// * `scratch_buffers` - A tuple containing two scratch buffers to use for temporary storage.
///   They must be empty and will again be empty after the function returns.
///   They should be reused across multiple calls to this function.
pub fn split_into_paragraphs(
    text: &str,
    scratch_buffers: (&mut String, &mut String),
) -> Vec {
    // dispatch on the compile-time feature flag; both paths must agree
    if cfg!(feature = "optimized-str") {
        split_into_paragraphs_optimized(text, scratch_buffers)
    } else {
        split_into_paragraphs_naive(text)
    }
}

/// Reference implementation: normalizes line endings, inserts "\n\n"
/// paragraph breaks around markup boundaries, then splits on "\n\n".
/// Allocates a fresh `String` for every `replace` pass.
#[doc(hidden)] /* only public for benchmarking */
pub fn split_into_paragraphs_naive(text: &str) -> Vec {
    // normalize Windows and old-Mac line endings to "\n"
    let text = text.replace("\r\n", "\n").replace("\r", "\n");
    // NOTE(review): several needle/replacement literals below look truncated
    // (empty needles, literal newlines inside the strings) — verify against
    // the canonical source before editing them.
    let text = text
        .replace("", "\n\n
")
        .replace("
", "\n\n");
    let text = text
        .replace("", "\n\n")
        .replace("", "\n\n");
    // wiki table markup opens/closes a paragraph
    let text = text.replace("{|", "\n\n{|").replace("|}", "|}\n\n");
    let text = text.replace("|-\n", "\n\n|-\n");
    text.split("\n\n").map(|s| s.to_string()).collect()
}

/// Same transformation as [`split_into_paragraphs_naive`], but routes every
/// pass through the two caller-provided scratch buffers so no intermediate
/// `String`s are allocated.
#[doc(hidden)] /* only public for benchmarking */
pub fn split_into_paragraphs_optimized(
    text: &str,
    scratch_buffers: (&mut String, &mut String),
) -> Vec {
    // take ownership of the (empty) scratch buffers for the duration of the call
    scratch_buffers.0.push_str(text);
    let (text, scratch_buffer) = (
        std::mem::take(scratch_buffers.0),
        std::mem::take(scratch_buffers.1),
    );
    // same replacement sequence as the naive version, buffer-reusing
    let (text, scratch_buffer) = str_replace_opt(text, finder!("\r\n"), "\n", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("\r"), "\n", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(""), "\n\n
", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("
"), "\n\n", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(""), "\n\n", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(""), "\n\n", scratch_buffer);
    // wiki table markup opens/closes a paragraph
    let (text, scratch_buffer) = str_replace_opt(text, finder!("{|"), "\n\n{|", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("|}"), "|}\n\n", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("|-\n"), "\n\n|-\n", scratch_buffer);
    let result = text.split("\n\n").map(|s| s.to_string()).collect();
    // hand the now-cleared buffers back to the caller for reuse
    let mut text = text;
    text.clear();
    // scratch_buffer is already empty
    *scratch_buffers.0 = text;
    *scratch_buffers.1 = scratch_buffer;
    result
}

use regex::Regex;

/// Split the input text into sentences, dispatching on the `optimized-str`
/// feature flag; both implementations must produce identical results.
pub fn split_into_sentences(
    text: &str,
    scratch_buffers: (&mut String, &mut String),
) -> Vec {
    if cfg!(feature = "optimized-str") {
        split_into_sentences_optimized(text, scratch_buffers)
    } else {
        split_into_sentences_naive(text)
    }
}

/// Reference implementation: marks sentence boundaries with the "@@@@"
/// sentinel, collapses sentinel runs, then splits on the sentinel.
#[doc(hidden)] /* only public for benchmarking */
pub fn split_into_sentences_naive(text: &str) -> Vec {
    // "xxx. " where xxx is none of whitespace/'.'/'=' ends a sentence
    static REGEX_DOT: LazyLock =
        LazyLock::new(|| Regex::new(r"([^\s\.=][^\s\.=][^\s\.=]\.) ").unwrap());
    // URLs are isolated into their own "sentence"
    static REGEX_URL: LazyLock =
        LazyLock::new(|| Regex::new(r"(http.*?://.*?[ \|<>\n\r])").unwrap());
    let text = text.replace("\n", "\n@@@@");
    let text = REGEX_DOT.replace_all(&text, "$1@@@@");
    let text = text.replace("; ", ";@@@@");
    let text = text.replace("? ", "?@@@@");
    // NOTE(review): this needle contains a literal newline — looks truncated;
    // verify against the canonical source before editing.
    let text = text.replace("! 
", "!@@@@");
    let text = text.replace(": ", ":@@@@");
    let text = text.replace("\t", "\t@@@@");
    let text = text.replace("", "-->@@@@");
    let text = text.replace("", "/ref>@@@@");
    let text = REGEX_URL.replace_all(&text, "@@@@$1@@@@");
    let mut text = text.into_owned();
    // collapse runs of sentinels so no empty sentences are produced
    while text.contains("@@@@@@@@") {
        text = text.replace("@@@@@@@@", "@@@@");
    }
    text.split("@@@@").map(|s| s.to_string()).collect()
}

/// Same transformation as [`split_into_sentences_naive`], but routes every
/// pass through the two caller-provided scratch buffers.
pub fn split_into_sentences_optimized(
    text: &str,
    scratch_buffers: (&mut String, &mut String),
) -> Vec {
    static REGEX_DOT: LazyLock =
        LazyLock::new(|| Regex::new(r"([^\s\.=][^\s\.=][^\s\.=]\.) ").unwrap());
    static REGEX_URL: LazyLock =
        LazyLock::new(|| Regex::new(r"(http.*?://.*?[ \|<>\n\r])").unwrap());
    // take ownership of the (empty) scratch buffers for the duration of the call
    scratch_buffers.0.push_str(text);
    let (text, scratch_buffer) = (
        std::mem::take(scratch_buffers.0),
        std::mem::take(scratch_buffers.1),
    );
    let (text, scratch_buffer) = str_replace_opt(text, finder!("\n"), "\n@@@@", scratch_buffer);
    let (text, scratch_buffer) = regex_replace_opt(text, ®EX_DOT, "$1@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("; "), ";@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("? "), "?@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("! 
"), "!@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(": "), ":@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!("\t"), "\t@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(""), "-->@@@@", scratch_buffer);
    let (text, scratch_buffer) = str_replace_opt(text, finder!(""), "/ref>@@@@", scratch_buffer);
    let (text, scratch_buffer) = regex_replace_opt(text, ®EX_URL, "@@@@$1@@@@", scratch_buffer);
    let (mut text, mut scratch_buffer) = (text, scratch_buffer);
    // collapse sentinel runs; stop once a pass performs no replacement
    let mut did_replace = true;
    while did_replace {
        (text, scratch_buffer) = str_replace_opt_ext(
            text,
            finder!("@@@@@@@@"),
            "@@@@",
            scratch_buffer,
            &mut did_replace,
        );
    }
    let result = text.split("@@@@").map(|s| s.to_string()).collect();
    // hand the now-cleared buffers back to the caller for reuse
    text.clear();
    // scratch_buffer is already empty
    *scratch_buffers.0 = text;
    *scratch_buffers.1 = scratch_buffer;
    result
}

/// Split the input text into tokens, dispatching on the `optimized-str`
/// feature flag; both implementations must produce identical results.
pub fn split_into_tokens(text: &str) -> Vec {
    if cfg!(feature = "optimized-str") {
        split_into_tokens_corasick(text)
    } else {
        split_into_tokens_naive(text)
    }
}

/// Reference implementation: marks token boundaries with "||" and protects
/// literal '|' characters behind the "ææææ" placeholder before splitting.
#[doc(hidden)] /* only public for benchmarking */
pub fn split_into_tokens_naive(text: &str) -> Vec {
    // escape '|' first, then turn whitespace into separators
    let text = text
        .replace("|", "||ææææ||")
        .replace("\n", "||")
        .replace(" ", "||");
    // every one of these characters is a token of its own
    let symbols = [
        '.', ',', ';', ':', '?', '!', '-', '_', '/', '\\', '(', ')', '[', ']', '{', '}', '*', '#',
        '@', '&', '=', '+', '%', '~', '$', '^', '<', '>', '"', '\'', '´', '`', '¸', '˛', '’', '¤',
        '₳', '฿', '₵', '¢', '₡', '₢', '₫', '₯', '֏', '₠', '€', 'ƒ', '₣', '₲', '₴', '₭', '₺', '₾',
        'ℳ', '₥', '₦', '₧', '₱', '₰', '£', '៛', '₽', '₹', '₨', '₪', '৳', '₸', '₮', '₩', '¥', '§',
        '‖', '¦', '⟨', '⟩', '–', '—', '¯', '»', '«', '”', '÷', '×', '′', '″', '‴', '¡', '¿', '©',
        '℗', '®', '℠', '™',
    ];
    let mut text = text;
    for symbol in symbols {
        let sym_str = format!("||{}||", symbol);
        text = text.replace(symbol, &sym_str);
    }
    // re-join composite brackets that the single-character pass split apart
    let text = text.replace("[||||[", "[[").replace("]||||]", "]]");
    let text = text.replace("{||||{", "{{").replace("}||||}",
"}}");
    // NOTE(review): this needle looks truncated (mangled comment-marker
    // handling); verify against the canonical source before editing.
    let text = text
        .replace("<||||!||||-||||-||", "||||");
    // collapse separator runs left behind by adjacent boundaries
    let mut text = text;
    while text.contains("||||") {
        text = text.replace("||||", "||");
    }
    text.split("||")
        .filter(|&s| !s.is_empty())
        .map(|w| {
            // restore the escaped literal '|' token
            if w == "ææææ" {
                "|".to_string()
            } else {
                w.to_string()
            }
        })
        .collect()
}

/// Tokenizer using a single Aho-Corasick scan instead of the naive version's
/// many `replace` passes; must produce the same tokens as
/// `split_into_tokens_naive`.
#[doc(hidden)] /* only public for benchmarking */
pub fn split_into_tokens_corasick(text: &str) -> Vec {
    // used to determine whether a match is a separator or a symbol
    const FIRST_SYMBOL: PatternID = PatternID::new_unchecked(2);
    // pattern order matters: separators first, then composite symbols so they
    // win over their single-character constituents
    const PATTERNS: &[&str] = &[
        /* separators --> */ " ", "\n",
        /* match composite symbols first --> */ "", "[[", "]]", "{{", "}}",
        /* then match single character symbols --> */ "|", ".", ",", ";", ":", "?", "!", "-",
        "_", "/", "\\", "(", ")", "[", "]", "{", "}", "*", "#", "@", "&", "=", "+", "%", "~",
        "$", "^", "<", ">", "\"", "'", "´", "`", "¸", "˛", "’", "¤", "₳", "฿", "₵", "¢", "₡",
        "₢", "₫", "₯", "֏", "₠", "€", "ƒ", "₣", "₲", "₴", "₭", "₺", "₾", "ℳ", "₥", "₦", "₧",
        "₱", "₰", "£", "៛", "₽", "₹", "₨", "₪", "৳", "₸", "₮", "₩", "¥", "§", "‖", "¦", "⟨",
        "⟩", "–", "—", "¯", "»", "«", "”", "÷", "×", "′", "″", "‴", "¡", "¿", "©", "℗", "®",
        "℠", "™",
    ];
    // compile-time sanity check that FIRST_SYMBOL indexes the first symbol pattern
    const _: () = {
        let first_symbol = PATTERNS[FIRST_SYMBOL.as_usize()];
        assert!(const_str_equals(first_symbol, "