// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use icu_casemap::greek_to_me::{ self, GreekDiacritics, GreekPrecomposedLetterData, GreekVowel, PackedGreekPrecomposedLetterData, }; use icu_casemap::CaseMapper; use icu_normalizer::DecomposingNormalizer; use icu_properties::{maps, GeneralCategoryGroup, Script}; use std::collections::BTreeMap; use std::fmt::Write; fn main() { let decomposer = DecomposingNormalizer::new_nfd(); let script = maps::script(); let gc = maps::general_category(); let cm = CaseMapper::new(); let mut vec_370 = vec![0; 0x400 - 0x370]; let mut vec_1f00 = vec![0; 0x100]; let mut extras = BTreeMap::new(); // Loop over all codepoints for range in script.iter_ranges_for_value(Script::Greek) { for ch in range { if let Ok(ch) = char::try_from(ch) { if !GeneralCategoryGroup::Letter.contains(gc.get(ch)) { continue; } let nfd = decomposer.normalize_utf8(ch.encode_utf8(&mut [0; 4]).as_bytes()); let mut data: Option = None; for nfd_ch in nfd.chars() { match nfd_ch { // accented: [:toNFD=/[\u0300\u0301\u0342\u0302\u0303\u0311]/:]&[:Grek:]&[:L:] (from the JSPs: toNFD is an extension). greek_to_me::diacritics!(ACCENTS) => { if let Some(GreekPrecomposedLetterData::Vowel(_, ref mut diacritics)) = data { diacritics.accented = true; } else { panic!("Found character {ch} that has diacritics but is not a Greek vowel"); } } // dialytika: [:toNFD=/[\u0308]/:]&[:Grek:]&[:L:] (from the JSPs: toNFD is an extension). greek_to_me::diacritics!(DIALYTIKA) => { if let Some(GreekPrecomposedLetterData::Vowel(_, ref mut diacritics)) = data { diacritics.dialytika = true; } else { panic!("Found character {ch} that has diacritics but is not a Greek vowel"); } } // precomposed_ypogegrammeni [:toNFD=/[\u0345]/:]&[:Grek:]&[:L:] (from the JSPs: toNFD is an extension). greek_to_me::diacritics!(YPOGEGRAMMENI) => { if let Some(GreekPrecomposedLetterData::Vowel(_, ref mut diacritics)) = data { diacritics.ypogegrammeni = true; } else { panic!("Found character {ch} that has diacritics but is not a Greek vowel"); } } greek_to_me::diacritics!(BREATHING_AND_LENGTH) | greek_to_me::diacritics!(ACCENTS) => { // Rho takes breathing marks but other consonants should not if let Some(GreekPrecomposedLetterData::Consonant(false)) = data { panic!("Found character {ch} that has diacritics but is not a Greek vowel"); } } // Ignore all small letters '\u{1D00}'..='\u{1DBF}' | '\u{AB65}' => (), // caps: [[:Grek:]&[:L:]-[\u1D00-\u1DBF\uAB65]] . NFD, remove non-letters, uppercase letter if GeneralCategoryGroup::Letter.contains(gc.get(letter)) => { let uppercased = cm.uppercase_to_string( letter.encode_utf8(&mut [0; 4]), &Default::default(), ); let mut iter = uppercased.chars(); let uppercased = iter.next().unwrap(); assert!( iter.next().is_none(), "{letter} Should uppercase to a single letter char, instead uppercased to {uppercased:?}" ); if let Ok(vowel) = GreekVowel::try_from(uppercased) { data = Some(GreekPrecomposedLetterData::Vowel( vowel, GreekDiacritics::default(), )) } else { let is_rho = uppercased == greek_to_me::CAPITAL_RHO; data = Some(GreekPrecomposedLetterData::Consonant(is_rho)) }; } _ => (), } } if let Some(data) = data { let packed = PackedGreekPrecomposedLetterData::from(data); assert_eq!( Ok(data), packed.try_into(), "packed data for {ch} ({packed:?}) must roundtrip!" ); let ch_i = ch as usize; if (0x370..=0x3FF).contains(&ch_i) { vec_370[ch_i - 0x370] = packed.0; } else if (0x1f00..0x1fff).contains(&ch_i) { vec_1f00[ch_i - 0x1f00] = packed.0; } else { extras.insert(ch, packed.0); } } } } } vec_370.truncate( vec_370 .iter() .rposition(|x| *x != 0) .expect("must have some greek data") + 1, ); vec_1f00.truncate( vec_1f00 .iter() .rposition(|x| *x != 0) .expect("must have some greek data") + 1, ); let mut others = String::new(); for (ch, data) in extras { writeln!(&mut others, " '{ch}' => {data},").unwrap(); } let output = format!( r#"// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). // This file is generated by running `cargo test --test gen_greek_to_me --features compiled_data,datagen // // Do not edit manually // All u8s in this file are PackedGreekPrecomposedLetterDatas, see parent module /// Data for characters in U+370-U+3FF pub(crate) const DATA_370: [u8; 0x{:x}] = {vec_370:?}; /// Data for characters in U+1F00-U+1FFF pub(crate) const DATA_1F00: [u8; 0x{:x}] = {vec_1f00:?}; /// Characters like the ohm sign that do not belong in the two blocks above pub(crate) fn match_extras(ch: char) -> Option {{ Some(match ch {{ {others} _ => return None }}) }} "#, vec_370.len(), vec_1f00.len(), ); let local = include_str!("../src/greek_to_me/data.rs"); // We cannot just check if the two are unequal because on Windows core.autocrlf // may have messed with the line endings on the file, or it may have not (either // due to a changed setting, or due to the code being in a cargo cache/vendor. We also // cannot fix this by passing `--config newline_style=unix` to rustfmt. We must // perform a `\r`-agnostic comparison. // // (technically this should only catch `\r\n` and not just `\r` but for a golden // test on rustfmt output it does not matter) if local .trim() .chars() .filter(|&x| x != '\r') .ne(output.trim().chars().filter(|&x| x != '\r')) { println!( r#"Please copy the following file to src/greek_to_me/data.rs: ======================================================== {output} ========================================================"# ); panic!("Found mismatch between generated Greek specialcasing data and checked-in data. Please check in the updated file shown above."); } }