// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use icu_casemap::CaseMapper; use icu_locid::langid; #[test] fn test_simple_mappings() { let case_mapping = CaseMapper::new(); // Basic case mapping assert_eq!(case_mapping.simple_uppercase('a'), 'A'); assert_eq!(case_mapping.simple_lowercase('a'), 'a'); assert_eq!(case_mapping.simple_titlecase('a'), 'A'); assert_eq!(case_mapping.simple_fold('a'), 'a'); assert_eq!(case_mapping.simple_uppercase('A'), 'A'); assert_eq!(case_mapping.simple_lowercase('A'), 'a'); assert_eq!(case_mapping.simple_titlecase('A'), 'A'); assert_eq!(case_mapping.simple_fold('A'), 'a'); // Case mapping of titlecase character assert_eq!(case_mapping.simple_uppercase('\u{1c4}'), '\u{1c4}'); assert_eq!(case_mapping.simple_titlecase('\u{1c4}'), '\u{1c5}'); assert_eq!(case_mapping.simple_lowercase('\u{1c4}'), '\u{1c6}'); assert_eq!(case_mapping.simple_uppercase('\u{1c5}'), '\u{1c4}'); assert_eq!(case_mapping.simple_titlecase('\u{1c5}'), '\u{1c5}'); assert_eq!(case_mapping.simple_lowercase('\u{1c5}'), '\u{1c6}'); assert_eq!(case_mapping.simple_uppercase('\u{1c6}'), '\u{1c4}'); assert_eq!(case_mapping.simple_titlecase('\u{1c6}'), '\u{1c5}'); assert_eq!(case_mapping.simple_lowercase('\u{1c6}'), '\u{1c6}'); // Turkic case folding assert_eq!(case_mapping.simple_fold('I'), 'i'); assert_eq!(case_mapping.simple_fold_turkic('I'), 'ı'); assert_eq!(case_mapping.simple_fold('İ'), 'İ'); assert_eq!(case_mapping.simple_fold_turkic('İ'), 'i'); // Supplementary code points (Deseret) assert_eq!(case_mapping.simple_uppercase('\u{1043c}'), '\u{10414}'); assert_eq!(case_mapping.simple_lowercase('\u{1043c}'), '\u{1043c}'); assert_eq!(case_mapping.simple_titlecase('\u{1043c}'), '\u{10414}'); assert_eq!(case_mapping.simple_fold('\u{1043c}'), '\u{1043c}'); assert_eq!(case_mapping.simple_uppercase('\u{10414}'), '\u{10414}'); assert_eq!(case_mapping.simple_lowercase('\u{10414}'), '\u{1043c}'); assert_eq!(case_mapping.simple_titlecase('\u{10414}'), '\u{10414}'); assert_eq!(case_mapping.simple_fold('\u{10414}'), '\u{1043c}'); } // These and the below tests are taken from StringCaseTest::TestCaseConversion in ICU4C. #[test] fn test_full_mappings() { let case_mapping = CaseMapper::new(); let root = langid!("und"); let tr = langid!("tr"); let lt = langid!("lt"); let uppercase_greek = "ΙΕΣΥΣ ΧΡΙΣΤΟΣ"; // "IESUS CHRISTOS" let lowercase_greek = "ιεσυς χριστος"; // "IESUS CHRISTOS" assert_eq!( case_mapping.uppercase_to_string(lowercase_greek, &root), uppercase_greek ); assert_eq!( case_mapping.lowercase_to_string(uppercase_greek, &root), lowercase_greek ); assert_eq!( case_mapping.fold_string(uppercase_greek), case_mapping.fold_string(lowercase_greek) ); let lowercase_turkish_1 = "istanbul, not constantınople"; let uppercase_turkish_1 = "İSTANBUL, NOT CONSTANTINOPLE"; assert_eq!( case_mapping.lowercase_to_string(uppercase_turkish_1, &root), "i\u{307}stanbul, not constantinople" ); assert_eq!( case_mapping.lowercase_to_string(uppercase_turkish_1, &tr), lowercase_turkish_1 ); let lowercase_turkish_2 = "topkapı palace, istanbul"; let uppercase_turkish_2 = "TOPKAPI PALACE, İSTANBUL"; assert_eq!( case_mapping.uppercase_to_string(lowercase_turkish_2, &root), "TOPKAPI PALACE, ISTANBUL" ); assert_eq!( case_mapping.uppercase_to_string(lowercase_turkish_2, &tr), uppercase_turkish_2 ); let initial_german = "Süßmayrstraße"; let uppercase_german = "SÜSSMAYRSTRASSE"; assert_eq!( case_mapping.uppercase_to_string(initial_german, &root), uppercase_german ); let before = "aBIΣßΣ/\u{5ffff}"; let after = "abiσßς/\u{5ffff}"; let after_turkish = "abıσßς/\u{5ffff}"; assert_eq!(case_mapping.lowercase_to_string(before, &root), after); assert_eq!(case_mapping.lowercase_to_string(before, &tr), after_turkish); let before = "aBiςßσ/\u{fb03}\u{fb03}\u{fb03}\u{5ffff}"; let after = "ABIΣSSΣ/FFIFFIFFI\u{5ffff}"; let after_turkish = "ABİΣSSΣ/FFIFFIFFI\u{5ffff}"; assert_eq!(case_mapping.uppercase_to_string(before, &root), after); assert_eq!(case_mapping.uppercase_to_string(before, &tr), after_turkish); let before = "ßa"; let after = "SSA"; assert_eq!(case_mapping.uppercase_to_string(before, &root), after); let initial_deseret = "\u{1043c}\u{10414}"; let upper_deseret = "\u{10414}\u{10414}"; let lower_deseret = "\u{1043c}\u{1043c}"; assert_eq!( case_mapping.uppercase_to_string(initial_deseret, &root), upper_deseret ); assert_eq!( case_mapping.lowercase_to_string(initial_deseret, &root), lower_deseret ); // lj ligature let initial_ligature = "\u{1c7}\u{1c8}\u{1c9}"; let lower_ligature = "\u{1c9}\u{1c9}\u{1c9}"; let upper_ligature = "\u{1c7}\u{1c7}\u{1c7}"; assert_eq!( case_mapping.uppercase_to_string(initial_ligature, &root), upper_ligature ); assert_eq!( case_mapping.lowercase_to_string(initial_ligature, &root), lower_ligature ); // Sigmas preceded and/or followed by cased letters let initial_sigmas = "i\u{307}\u{3a3}\u{308}j \u{307}\u{3a3}\u{308}j i\u{ad}\u{3a3}\u{308} \u{307}\u{3a3}\u{308}"; let lower_sigmas = "i\u{307}\u{3c3}\u{308}j \u{307}\u{3c3}\u{308}j i\u{ad}\u{3c2}\u{308} \u{307}\u{3c3}\u{308}"; let upper_sigmas = "I\u{307}\u{3a3}\u{308}J \u{307}\u{3a3}\u{308}J I\u{ad}\u{3a3}\u{308} \u{307}\u{3a3}\u{308}"; assert_eq!( case_mapping.uppercase_to_string(initial_sigmas, &root), upper_sigmas ); assert_eq!( case_mapping.lowercase_to_string(initial_sigmas, &root), lower_sigmas ); // Turkish & Azerbaijani dotless i & dotted I: // Remove dot above if there was a capital I before and there are no more accents above. let initial_dots = "I İ I\u{307} I\u{327}\u{307} I\u{301}\u{307} I\u{327}\u{307}\u{301}"; let after = "i i\u{307} i\u{307} i\u{327}\u{307} i\u{301}\u{307} i\u{327}\u{307}\u{301}"; let after_turkish = "ı i i i\u{327} ı\u{301}\u{307} i\u{327}\u{301}"; assert_eq!(case_mapping.lowercase_to_string(initial_dots, &root), after); assert_eq!( case_mapping.lowercase_to_string(initial_dots, &tr), after_turkish ); // Lithuanian dot above in uppercasing let initial_dots = "a\u{307} \u{307} i\u{307} j\u{327}\u{307} j\u{301}\u{307}"; let after = "A\u{307} \u{307} I\u{307} J\u{327}\u{307} J\u{301}\u{307}"; let after_lithuanian = "A\u{307} \u{307} I J\u{327} J\u{301}\u{307}"; assert_eq!(case_mapping.uppercase_to_string(initial_dots, &root), after); assert_eq!( case_mapping.uppercase_to_string(initial_dots, <), after_lithuanian ); // Lithuanian adds dot above to i in lowercasing if there are more above accents let initial_dots = "I I\u{301} J J\u{301} \u{12e} \u{12e}\u{301} \u{cc}\u{cd}\u{128}"; let after = "i i\u{301} j j\u{301} \u{12f} \u{12f}\u{301} \u{ec}\u{ed}\u{129}"; let after_lithuanian = "i i\u{307}\u{301} j j\u{307}\u{301} \u{12f} \u{12f}\u{307}\u{301} i\u{307}\u{300}i\u{307}\u{301}i\u{307}\u{303}"; assert_eq!(case_mapping.lowercase_to_string(initial_dots, &root), after); assert_eq!( case_mapping.lowercase_to_string(initial_dots, <), after_lithuanian ); // Test case folding let initial = "Aßµ\u{fb03}\u{1040c}İı"; let simple = "assμffi\u{10434}i\u{307}ı"; let turkic = "assμffi\u{10434}iı"; assert_eq!(case_mapping.fold_string(initial), simple); assert_eq!(case_mapping.fold_turkic_string(initial), turkic); } #[test] fn test_armenian() { let cm = CaseMapper::new(); let root = langid!("und"); let east = langid!("hy"); let west = langid!("hyw"); let default_options = Default::default(); let s = "և Երևանի"; assert_eq!(cm.uppercase_to_string(s, &root), "ԵՒ ԵՐԵՒԱՆԻ"); assert_eq!(cm.uppercase_to_string(s, &east), "ԵՎ ԵՐԵՎԱՆԻ"); assert_eq!(cm.uppercase_to_string(s, &west), "ԵՒ ԵՐԵՒԱՆԻ"); let ew = "և"; let yerevan = "Երևանի"; assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(ew, &root, default_options), "Եւ" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(yerevan, &root, default_options), "Երևանի" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(ew, &east, default_options), "Եվ" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(yerevan, &east, default_options), "Երևանի" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(ew, &west, default_options), "Եւ" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string(yerevan, &west, default_options), "Երևանի" ); } #[test] fn test_dutch() { let cm = CaseMapper::new(); let nl = langid!("nl"); let default_options = Default::default(); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ijssel", &nl, default_options), "IJssel" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("igloo", &nl, default_options), "Igloo" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("IJMUIDEN", &nl, default_options), "IJmuiden" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ij", &nl, default_options), "IJ" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("IJ", &nl, default_options), "IJ" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj́", &nl, default_options), "ÍJ́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ÍJ́", &nl, default_options), "ÍJ́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íJ́", &nl, default_options), "ÍJ́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("Ij́", &nl, default_options), "Ij́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ij́", &nl, default_options), "Ij́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ïj́", &nl, default_options), "Ïj́" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj\u{0308}", &nl, default_options), "Íj\u{0308}" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj́\u{1D16E}", &nl, default_options), "Íj́\u{1D16E}" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj\u{1ABE}", &nl, default_options), "Íj\u{1ABE}" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ijabc", &nl, default_options), "IJabc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("IJabc", &nl, default_options), "IJabc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj́abc", &nl, default_options), "ÍJ́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ÍJ́abc", &nl, default_options), "ÍJ́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íJ́abc", &nl, default_options), "ÍJ́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("Ij́abc", &nl, default_options), "Ij́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ij́abc", &nl, default_options), "Ij́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("ïj́abc", &nl, default_options), "Ïj́abc" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íjabc\u{0308}", &nl, default_options), "Íjabc\u{0308}" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íj́abc\u{1D16E}", &nl, default_options), "ÍJ́abc\u{1D16E}" ); assert_eq!( cm.titlecase_segment_with_only_case_data_to_string("íjabc\u{1ABE}", &nl, default_options), "Íjabc\u{1ABE}" ); } #[test] fn test_greek_upper() { let nfc = icu_normalizer::ComposingNormalizer::new_nfc(); let nfd = icu_normalizer::DecomposingNormalizer::new_nfd(); let cm = CaseMapper::new(); let modern_greek = &langid!("el"); let assert_greek_uppercase = |input: &str, expected: &str| { assert_eq!( cm.uppercase_to_string(nfc.normalize(input).as_str(), modern_greek), nfc.normalize(expected) ); assert_eq!( cm.uppercase_to_string(nfd.normalize(input).as_str(), modern_greek), nfd.normalize(expected) ); }; // https://unicode-org.atlassian.net/browse/ICU-5456 assert_greek_uppercase("άδικος, κείμενο, ίριδα", "ΑΔΙΚΟΣ, ΚΕΙΜΕΝΟ, ΙΡΙΔΑ"); // https://bugzilla.mozilla.org/show_bug.cgi?id=307039 // https://bug307039.bmoattachments.org/attachment.cgi?id=194893 assert_greek_uppercase("Πατάτα", "ΠΑΤΑΤΑ"); assert_greek_uppercase("Αέρας, Μυστήριο, Ωραίο", "ΑΕΡΑΣ, ΜΥΣΤΗΡΙΟ, ΩΡΑΙΟ"); assert_greek_uppercase("Μαΐου, Πόρος, Ρύθμιση", "ΜΑΪΟΥ, ΠΟΡΟΣ, ΡΥΘΜΙΣΗ"); assert_greek_uppercase("ΰ, Τηρώ, Μάιος", "Ϋ, ΤΗΡΩ, ΜΑΪΟΣ"); assert_greek_uppercase("άυλος", "ΑΫΛΟΣ"); assert_greek_uppercase("ΑΫΛΟΣ", "ΑΫΛΟΣ"); assert_greek_uppercase( "Άκλιτα ρήματα ή άκλιτες μετοχές", "ΑΚΛΙΤΑ ΡΗΜΑΤΑ Ή ΑΚΛΙΤΕΣ ΜΕΤΟΧΕΣ", ); // http://www.unicode.org/udhr/d/udhr_ell_monotonic.html assert_greek_uppercase( "Επειδή η αναγνώριση της αξιοπρέπειας", "ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ ΤΗΣ ΑΞΙΟΠΡΕΠΕΙΑΣ", ); assert_greek_uppercase("νομικού ή διεθνούς", "ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ"); // http://unicode.org/udhr/d/udhr_ell_polytonic.html assert_greek_uppercase("Ἐπειδὴ ἡ ἀναγνώριση", "ΕΠΕΙΔΗ Η ΑΝΑΓΝΩΡΙΣΗ"); assert_greek_uppercase("νομικοῦ ἢ διεθνοῦς", "ΝΟΜΙΚΟΥ Ή ΔΙΕΘΝΟΥΣ"); // From Google bug report assert_greek_uppercase("Νέο, Δημιουργία", "ΝΕΟ, ΔΗΜΙΟΥΡΓΙΑ"); // http://crbug.com/234797 assert_greek_uppercase( "Ελάτε να φάτε τα καλύτερα παϊδάκια!", "ΕΛΑΤΕ ΝΑ ΦΑΤΕ ΤΑ ΚΑΛΥΤΕΡΑ ΠΑΪΔΑΚΙΑ!", ); assert_greek_uppercase("Μαΐου, τρόλεϊ", "ΜΑΪΟΥ, ΤΡΟΛΕΪ"); assert_greek_uppercase("Το ένα ή το άλλο.", "ΤΟ ΕΝΑ Ή ΤΟ ΑΛΛΟ."); // http://multilingualtypesetting.co.uk/blog/greek-typesetting-tips/ assert_greek_uppercase("ρωμέικα", "ΡΩΜΕΪΚΑ"); assert_greek_uppercase("ή.", "Ή."); // The ὑπογεγραμμέναι become Ι as in default case conversion, but they are // specially handled by the implementation. assert_greek_uppercase("ᾠδή, -ήν, -ῆς, -ῇ", "ΩΙΔΗ, -ΗΝ, -ΗΣ, -ΗΙ"); assert_greek_uppercase("ᾍδης", "ΑΙΔΗΣ"); // Handle breathing marks on rho assert_greek_uppercase("ῥήματα ῤήματα", "ΡΗΜΑΤΑ ΡΗΜΑΤΑ"); }