/* These are "legacy" tests (borrowing Nuspell's terminology) which are ported originally from Hunspell's codebase. Each case has a `.dic` and `.aff` file which set up an example dictionary and each case might have any of... * `.good`: a listing of words for which `Dictionary::check` should return `true`. * `.wrong`: a listing of words for which `Dictionary::check` should return `false`. * `.sug`: a listing of words which should be suggested for the given wrong word on the corresponding line in `.wrong`, separated by commas and whitespace. We use a simple declarative macro to create a `#[test]` for each case. The advantage of a `#[test]` for each case is that a single test can fail but the others will run. We could use a single test which globs for `.dic` files instead: that would make it very easy to add a case. But these cases are meant just to ensure we have parity with the Hunspell/Nuspell test beds, so cases should not be added often. The test runs faster without the glob (and requires no glob dependency). */ use std::{ fs::{self, File}, io::{self, Read}, path::{Path, PathBuf}, }; use spellbook::Dictionary; macro_rules! check { ($case:ident) => { #[allow(non_snake_case)] #[test] fn $case() { let case = stringify!($case).strip_prefix("check_").unwrap(); do_check_case(case); } }; } fn do_check_case(case: &str) { let manifest_path = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); let path = manifest_path.join("tests/legacy").join(case); let aff = read_to_string(path.with_extension("aff")).unwrap(); let dic = read_to_string(path.with_extension("dic")).unwrap(); let dict = Dictionary::new(&aff, &dic).unwrap(); for good_word in fs::read_to_string(path.with_extension("good")) .iter() .flat_map(|text| text.lines()) { let word = good_word.trim(); assert!( dict.check(word), "expected {word:?} to be correct but it was incorrect" ); } for wrong_word in fs::read_to_string(path.with_extension("wrong")) .iter() .flat_map(|text| text.lines()) { let word = wrong_word.trim(); assert!( !dict.check(word), "expected {word:?} to be incorrect but it was correct" ); } } check!(check_1463589); check!(check_1463589_utf); check!(check_1592880); check!(check_1695964); check!(check_1706659); check!(check_1975530); check!(check_2970240); check!(check_2970242); check!(check_2999225); check!(check_affixes); check!(check_alias2); check!(check_alias3); check!(check_alias); check!(check_allcaps2); check!(check_allcaps3); check!(check_allcaps); check!(check_allcaps_utf); check!(check_arabic); check!(check_base); check!(check_base_utf); check!(check_breakdefault); check!(check_break); check!(check_breakoff); check!(check_checkcompoundcase2); check!(check_checkcompoundcase); check!(check_checkcompoundcaseutf); check!(check_checkcompounddup); // Use CHECKCOMPOUNDPATTERN replacements which aren't implemented yet: // check!(check_checkcompoundpattern2); // check!(check_checkcompoundpattern3); // check!(check_checkcompoundpattern4); check!(check_checkcompoundpattern); check!(check_checkcompoundrep); check!(check_checkcompoundtriple); check!(check_checksharps); check!(check_checksharpsutf); check!(check_circumfix); check!(check_colons_in_words); check!(check_complexprefixes2); check!(check_complexprefixes); check!(check_complexprefixesutf); check!(check_compoundaffix2); check!(check_compoundaffix3); check!(check_compoundaffix); check!(check_compoundflag); check!(check_compoundrule2); check!(check_compoundrule3); check!(check_compoundrule4); check!(check_compoundrule5); check!(check_compoundrule6); check!(check_compoundrule7); check!(check_compoundrule8); check!(check_compoundrule); check!(check_conditionalprefix); // Fails due to weird encoding of the aff/dic: // check!(check_condition); check!(check_condition_utf); check!(check_digits_in_words); check!(check_dotless_i); // Fails due to weird encoding of the aff/dic: // check!(check_encoding); check!(check_flag); check!(check_flaglong); check!(check_flagnum); check!(check_flagutf8); check!(check_fogemorpheme); check!(check_forbiddenword); check!(check_forceucase); check!(check_fullstrip); check!(check_germancompounding); check!(check_germancompoundingold); check!(check_hu); check!(check_i35725); check!(check_i53643); check!(check_i54633); // Fails due to weird encoding of the aff/dic: // check!(check_i54980); check!(check_i58202); check!(check_i68568); check!(check_i68568utf); check!(check_iconv2); check!(check_iconv); check!(check_ignore); check!(check_ignoreutf); check!(check_IJ); check!(check_keepcase); check!(check_korean); check!(check_map); check!(check_maputf); // Presumably needs morphology support? // check!(check_morph); check!(check_needaffix2); check!(check_needaffix3); check!(check_needaffix4); check!(check_needaffix5); check!(check_needaffix); check!(check_nepali); check!(check_ngram_utf_fix); check!(check_nosuggest); check!(check_oconv); check!(check_onlyincompound2); check!(check_onlyincompound); check!(check_opentaal_cpdpat2); check!(check_opentaal_cpdpat); check!(check_opentaal_forbiddenword1); check!(check_opentaal_forbiddenword2); check!(check_opentaal_keepcase); check!(check_phone); check!(check_rep); check!(check_reputf); check!(check_simplifiedtriple); check!(check_slash); check!(check_sug); check!(check_sugutf); check!(check_utf8_bom2); check!(check_utf8_bom); check!(check_utf8); check!(check_utf8_nonbmp); check!(check_utfcompound); check!(check_warn); check!(check_zeroaffix); macro_rules! suggest { ($case:ident) => { #[allow(non_snake_case)] #[test] fn $case() { let case = stringify!($case).strip_prefix("suggest_").unwrap(); do_suggest_case(case); } }; } fn do_suggest_case(case: &str) { let manifest_path = PathBuf::from(std::env::var_os("CARGO_MANIFEST_DIR").unwrap()); let path = manifest_path.join("tests/legacy").join(case); let aff = read_to_string(path.with_extension("aff")).unwrap(); let dic = read_to_string(path.with_extension("dic")).unwrap(); let dict = Dictionary::new(&aff, &dic).unwrap(); let mut list_sugs = Vec::new(); let mut sugs = Vec::new(); for word in fs::read_to_string(path.with_extension("wrong")) .iter() .flat_map(|text| text.lines()) .filter(|line| !line.is_empty()) { assert!( !dict.check(word), "word {word:?} was in the .wrong file but was actually correct" ); dict.suggest(word, &mut sugs); if !sugs.is_empty() { list_sugs.push(std::mem::take(&mut sugs)); } } let mut expected_list_sugs = Vec::new(); for line in fs::read_to_string(path.with_extension("sug")) .iter() .flat_map(|text| text.lines()) .filter(|line| !line.is_empty()) { let sugs: Vec<_> = line.split(", ").map(ToOwned::to_owned).collect(); if !sugs.is_empty() { expected_list_sugs.push(sugs); } } assert_eq!( expected_list_sugs, list_sugs, "(left: expected, right: actual)" ); } suggest!(suggest_1463589); suggest!(suggest_1463589_utf); suggest!(suggest_1695964); suggest!(suggest_IJ); suggest!(suggest_allcaps); suggest!(suggest_allcaps_utf); suggest!(suggest_allcaps2); suggest!(suggest_base); suggest!(suggest_base_utf); suggest!(suggest_breakdefault); suggest!(suggest_forceucase); suggest!(suggest_i35725); suggest!(suggest_i54633); suggest!(suggest_i58202); suggest!(suggest_keepcase); suggest!(suggest_map); suggest!(suggest_maputf); suggest!(suggest_ngram_utf_fix); suggest!(suggest_oconv); suggest!(suggest_onlyincompound); suggest!(suggest_opentaal_forbiddenword1); suggest!(suggest_opentaal_forbiddenword2); suggest!(suggest_opentaal_keepcase); suggest!(suggest_rep); suggest!(suggest_reputf); suggest!(suggest_sug); suggest!(suggest_sugutf); // These are marked as failing in Nuspell: // suggest!(suggest_checksharps); // suggest!(suggest_checksharpsutf); // suggest!(suggest_nosuggest); // suggest!(suggest_phone); // suggest!(suggest_utf8_nonbmp); /// Reads the contents of a file into a String, handling detecting and decoding of non-UTF-8 /// contents. fn read_to_string>(path: P) -> io::Result { // Adapted from Helix's document opening function: // const BUF_SIZE: usize = 8 * 1024; let mut buf = [0u8; BUF_SIZE]; let mut reader = File::open(path).unwrap(); let read = reader.read(&mut buf).unwrap(); assert_ne!(read, 0); // Guess encoding. let mut detector = chardetng::EncodingDetector::new(); detector.feed(&buf, read < BUF_SIZE); let encoding = detector.guess(None, true); let mut decoder = encoding.new_decoder(); let mut output = String::new(); let mut slice = &buf[..read]; let mut is_empty = read == 0; let mut total_written = 0usize; // Zero-initialized bytes are a valid str. let mut buf_out = [0u8; BUF_SIZE]; let buf_str = unsafe { std::str::from_utf8_unchecked_mut(&mut buf_out) }; loop { let mut total_read = 0usize; // An inner loop is necessary as it is possible that the input buffer // may not be completely decoded on the first `decode_to_str()` call // which would happen in cases where the output buffer is filled to // capacity. loop { let (result, read, written, ..) = decoder.decode_to_str( &slice[total_read..], &mut buf_str[total_written..], is_empty, ); // These variables act as the read and write cursors of `buf` and `buf_str` respectively. // They are necessary in case the output buffer fills before decoding of the entire input // loop is complete. Otherwise, the loop would endlessly iterate over the same `buf` and // the data inside the output buffer would be overwritten. total_read += read; total_written += written; match result { encoding_rs::CoderResult::InputEmpty => { assert_eq!(slice.len(), total_read); break; } encoding_rs::CoderResult::OutputFull => { assert!(slice.len() > total_read); output.push_str(&buf_str[..total_written]); total_written = 0; } } } // Once the end of the stream is reached, the output buffer is // flushed and the loop terminates. if is_empty { assert_eq!(reader.read(&mut buf)?, 0); output.push_str(&buf_str[..total_written]); break; } // Once the previous input has been processed and decoded, the next set of // data is fetched from the reader. The end of the reader is determined to // be when exactly `0` bytes were read from the reader, as per the invariants // of the `Read` trait. let read = reader.read(&mut buf)?; slice = &buf[..read]; is_empty = read == 0; } Ok(output) }