#!/usr/bin/env python3
#
# Copyright 2011-2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode UCD data:
# - emoji/emoji-data.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the tables.rs file into git.

import fileinput
import operator
import os
import re
import subprocess
import sys

preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly

#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
'''

UNICODE_VERSION = (16, 0, 0)

UNICODE_VERSION_NUMBER = "%s.%s.%s" % UNICODE_VERSION

# Highest Unicode scalar value; the emoji status table covers 0..=this.
_LAST_CODE_POINT = 0x10FFFF

# Two-letter General_Category abbreviation -> Rust enum variant path.
_GC_VARIANTS = {
    "Lu": "GeneralCategory::UppercaseLetter",
    "Ll": "GeneralCategory::LowercaseLetter",
    "Lt": "GeneralCategory::TitlecaseLetter",
    "Lm": "GeneralCategory::ModifierLetter",
    "Lo": "GeneralCategory::OtherLetter",
    "Mn": "GeneralCategory::NonspacingMark",
    "Mc": "GeneralCategory::SpacingMark",
    "Me": "GeneralCategory::EnclosingMark",
    "Nd": "GeneralCategory::DecimalNumber",
    "Nl": "GeneralCategory::LetterNumber",
    "No": "GeneralCategory::OtherNumber",
    "Pc": "GeneralCategory::ConnectorPunctuation",
    "Pd": "GeneralCategory::DashPunctuation",
    "Ps": "GeneralCategory::OpenPunctuation",
    "Pe": "GeneralCategory::ClosePunctuation",
    "Pi": "GeneralCategory::InitialPunctuation",
    "Pf": "GeneralCategory::FinalPunctuation",
    "Po": "GeneralCategory::OtherPunctuation",
    "Sm": "GeneralCategory::MathSymbol",
    "Sc": "GeneralCategory::CurrencySymbol",
    "Sk": "GeneralCategory::ModifierSymbol",
    "So": "GeneralCategory::OtherSymbol",
    "Zs": "GeneralCategory::SpaceSeparator",
    "Zl": "GeneralCategory::LineSeparator",
    "Zp": "GeneralCategory::ParagraphSeparator",
    "Cc": "GeneralCategory::Control",
    "Cf": "GeneralCategory::Format",
    "Cs": "GeneralCategory::Surrogate",
    "Co": "GeneralCategory::PrivateUse",
    "Cn": "GeneralCategory::Unassigned",
}

# ";"-joined set of emoji properties covering a range -> Rust enum variant.
# "Surrogate" maps to "" so that those ranges are dropped from the table.
_EMOJI_STATUS_VARIANTS = {
    "Surrogate": "",
    "": "EmojiStatus::NonEmoji",
    "Emoji_Component": "EmojiStatus::NonEmojiButEmojiComponent",
    "Emoji;Emoji_Presentation": "EmojiStatus::EmojiPresentation",
    "Emoji;Emoji_Presentation;Emoji_Modifier_Base":
        "EmojiStatus::EmojiPresentationAndModifierBase",
    "Emoji;Emoji_Modifier_Base": "EmojiStatus::EmojiModifierBase",
    "Emoji": "EmojiStatus::EmojiOther",
    "Emoji;Emoji_Presentation;Emoji_Component":
        "EmojiStatus::EmojiPresentationAndEmojiComponent",
    "Emoji;Emoji_Presentation;Emoji_Modifier;Emoji_Component":
        "EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent",
    "Emoji;Emoji_Component": "EmojiStatus::EmojiOtherAndEmojiComponent",
}


# Download a UCD table file
def fetch_unidata(f):
    """Make sure the UCD file `f` is present in the current directory.

    `f` is a path relative to the versioned `ucd/` directory on unicode.org
    (for example "emoji/emoji-data.txt"); the local copy is stored under its
    base name. If the file is missing it is downloaded with `curl`. When it
    still cannot be found afterwards, an error is written to stderr and the
    process exits with status 1.
    """
    local_name = os.path.basename(f)
    if not os.path.exists(local_name):
        url = "https://www.unicode.org/Public/%s/ucd/%s" % (UNICODE_VERSION_NUMBER, f)
        try:
            # argv list instead of a shell string; --fail keeps curl from
            # saving an HTTP error page that would later be parsed as data.
            subprocess.run(["curl", "--fail", "-O", url], check=False)
        except OSError:
            # curl itself is unavailable; reported by the check below.
            pass
    if not os.path.exists(local_name):
        sys.stderr.write("cannot load %s\n" % f)
        sys.exit(1)


# Loads code point data from emoji-data.txt
# Implementation from unicode-segmentation
def load_emoji_properties(f):
    """Parse an emoji-data.txt style file into per-property range lists.

    Returns a dict mapping each property name to a list of inclusive
    `(lo, hi)` code point tuples, in file order. Lines that match neither
    the single code point form nor the `lo..hi` range form are skipped.
    """
    fetch_unidata(f)
    kinds = {}
    single_re = re.compile(r"^ *([0-9A-F]+) *; *(\w+)")
    range_re = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+) *#")

    with fileinput.input(os.path.basename(f),
                         openhook=fileinput.hook_encoded("utf-8")) as lines:
        for line in lines:
            m = single_re.match(line)
            if m:
                d_lo = d_hi = m.group(1)
                kind = m.group(2).strip()
            else:
                m = range_re.match(line)
                if not m:
                    continue
                d_lo = m.group(1)
                d_hi = m.group(2)
                kind = m.group(3).strip()
            kinds.setdefault(kind, []).append((int(d_lo, 16), int(d_hi, 16)))
    return kinds


def load_general_category_properties(f):
    """Parse a UnicodeData.txt style file into `(lo, hi, gc)` tuples.

    Ordinary entries and single `<...>` entries yield one tuple with
    `lo == hi`. A `<name, First>` / `<name, Last>` pair yields one tuple
    spanning the whole range.

    Raises:
        ValueError: if a `Last` entry does not agree with the preceding
            `First` entry in name or general category.
    """
    fetch_unidata(f)
    general_category_list = []
    entry_re = re.compile(r"^([0-9A-F]+);([^;]+);([A-Za-z]+);.*$")
    first_re = re.compile(r"^<(.*), First>$")
    last_re = re.compile(r"^<(.*), Last>$")
    special_re = re.compile(r"^<(.*)>$")

    # State carried from a "<..., First>" line to its "<..., Last>" line.
    special_group_lo = 0
    special_group_text = ''
    special_group_gc = ''

    with fileinput.input(os.path.basename(f),
                         openhook=fileinput.hook_encoded("utf-8")) as lines:
        for line in lines:
            m = entry_re.match(line)
            if not m:
                continue
            code_point = int(m.group(1), 16)
            d_name = m.group(2).strip()
            d_gc = m.group(3).strip()

            if not d_name.startswith('<'):
                general_category_list.append((code_point, code_point, d_gc))
                continue

            m_first = first_re.match(d_name)
            if m_first:
                special_group_lo = code_point
                special_group_text = m_first.group(1)
                special_group_gc = d_gc
                continue

            m_last = last_re.match(d_name)
            if m_last:
                # Explicit checks rather than `assert`, which `python -O` strips.
                if special_group_text != m_last.group(1):
                    raise ValueError(
                        "range end %r does not match range start %r"
                        % (m_last.group(1), special_group_text))
                if special_group_gc != d_gc:
                    raise ValueError(
                        "general category %r at end of range %r does not match %r"
                        % (d_gc, special_group_text, special_group_gc))
                general_category_list.append((special_group_lo, code_point, d_gc))
                continue

            if special_re.match(d_name):
                general_category_list.append((code_point, code_point, d_gc))
                continue

            raise ValueError("unreachable")
    return general_category_list


def format_table_content(f, content, indent):
    """Write the comma-separated `content` to `f`, wrapped and indented.

    `content` is split on every comma and the pieces are re-joined with
    ", ". A new line (prefixed with `indent` spaces) is started once the
    current line would reach 98 characters. No trailing newline is written.
    """
    line = " " * indent
    first = True
    for chunk in content.split(","):
        # `first` guard: an over-long first chunk must not emit an empty element.
        if first or len(line) + len(chunk) < 98:
            if first:
                line += chunk
            else:
                line += ", " + chunk
            first = False
        else:
            f.write(line + ",\n")
            line = " " * indent + chunk
    f.write(line)


def escape_char(c):
    """Return the Rust literal for code point `c`.

    The sentinel string 'multi' maps to an empty Rust string literal; any
    other value is formatted as a Rust `char` unicode escape in lowercase hex.
    """
    if c == 'multi':
        return "\"\""
    return "'\\u{%x}'" % c


def escape_char_list(l):
    """Return a bracketed, ", "-separated list of `escape_char` literals."""
    return "[" + ", ".join(escape_char(c) for c in l) + "]"


def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
        pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Write a Rust slice item called `name` containing `t_data` to `f`.

    Args:
        f: writable text stream.
        name: Rust identifier of the table.
        t_data: iterable of rows.
        t_type: Rust type annotation of the table.
        is_pub: prefix the declaration with `pub`.
        pfun: formats one row as Rust source.
        is_const: declare with `const` (otherwise `let`).
    """
    pub_string = "const" if is_const else "let"
    if is_pub:
        pub_string = "pub " + pub_string
    f.write(" %s %s: %s = &[\n" % (pub_string, name, t_type))
    format_table_content(f, ",".join(pfun(dat) for dat in t_data), 8)
    f.write("\n ];\n\n")


def _merge_general_category_runs(char_table):
    """Merge adjacent entries with the same category, dropping surrogates.

    Surrogates (`Cs`) are skipped; the emitted table stores `char` bounds.
    """
    merged = []
    for lo, hi, gc in char_table:
        if gc == "Cs":
            continue
        if merged and merged[-1][1] + 1 == lo and merged[-1][2] == gc:
            merged[-1] = (merged[-1][0], hi, gc)
        else:
            merged.append((lo, hi, gc))
    return merged


def emit_general_category_module(f):
    """Write the feature-gated Rust `general_category` module to `f`.

    The module holds the `GeneralCategory` and `GeneralCategoryGroup` enums,
    their helper functions, and a `GENERAL_CATEGORY` range table built from
    UnicodeData.txt.
    """
    f.write("""#[cfg(feature = \"general-category\")]
pub mod general_category {""")
    f.write("""
    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    /// The most general classification of a character.
    pub enum GeneralCategory {
        /// `Lu`, an uppercase letter
        UppercaseLetter,
        /// `Ll`, a lowercase letter
        LowercaseLetter,
        /// `Lt`, a digraphic character, with first part uppercase
        TitlecaseLetter,
        /// `Lm`, a modifier letter
        ModifierLetter,
        /// `Lo`, other letters, including syllables and ideographs
        OtherLetter,
        /// `Mn`, a nonspacing combining mark (zero advance width)
        NonspacingMark,
        /// `Mc`, a spacing combining mark (positive advance width)
        SpacingMark,
        /// `Me`, an enclosing combining mark
        EnclosingMark,
        /// `Nd`, a decimal digit
        DecimalNumber,
        /// `Nl`, a letterlike numeric character
        LetterNumber,
        /// `No`, a numeric character of other type
        OtherNumber,
        /// `Pc`, a connecting punctuation mark, like a tie
        ConnectorPunctuation,
        /// `Pd`, a dash or hyphen punctuation mark
        DashPunctuation,
        /// `Ps`, an opening punctuation mark (of a pair)
        OpenPunctuation,
        /// `Pe`, a closing punctuation mark (of a pair)
        ClosePunctuation,
        /// `Pi`, an initial quotation mark
        InitialPunctuation,
        /// `Pf`, a final quotation mark
        FinalPunctuation,
        /// `Po`, a punctuation mark of other type
        OtherPunctuation,
        /// `Sm`, a symbol of mathematical use
        MathSymbol,
        /// `Sc`, a currency sign
        CurrencySymbol,
        /// `Sk`, a non-letterlike modifier symbol
        ModifierSymbol,
        /// `So`, a symbol of other type
        OtherSymbol,
        /// `Zs`, a space character (of various non-zero widths)
        SpaceSeparator,
        /// `Zl`, U+2028 LINE SEPARATOR only
        LineSeparator,
        /// `Zp`, U+2029 PARAGRAPH SEPARATOR only
        ParagraphSeparator,
        /// `Cc`, a C0 or C1 control code
        Control,
        /// `Cf`, a format control character
        Format,
        /// `Cs`, a surrogate code point
        Surrogate,
        /// `Co`, a private-use character
        PrivateUse,
        /// `Cn`, a reserved unassigned code point or a noncharacter
        Unassigned,
    }

    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    /// Groupings of the most general classification of a character.
    pub enum GeneralCategoryGroup {
        /// Lu | Ll | Lt | Lm | Lo
        Letter,
        /// Mn | Mc | Me
        Mark,
        /// Nd | Nl | No
        Number,
        /// Pc | Pd | Ps | Pe | Pi | Pf | Po
        Punctuation,
        /// Sm | Sc | Sk | So
        Symbol,
        /// Zs | Zl | Zp
        Separator,
        /// Cc | Cf | Cs | Co | Cn
        Other,
    }

    #[inline]
    pub(crate) fn general_category_of_char(c: char) -> GeneralCategory {
        super::util::bsearch_range_value_table(c, GENERAL_CATEGORY).unwrap_or(GeneralCategory::Unassigned)
    }

    #[inline]
    pub(crate) fn general_category_is_letter_cased(gc: GeneralCategory) -> bool {
        matches!(gc, GeneralCategory::UppercaseLetter | GeneralCategory::LowercaseLetter | GeneralCategory::TitlecaseLetter)
    }

    #[inline]
    pub(crate) fn general_category_group(gc: GeneralCategory) -> GeneralCategoryGroup {
        match gc {
            GeneralCategory::UppercaseLetter |
            GeneralCategory::LowercaseLetter |
            GeneralCategory::TitlecaseLetter |
            GeneralCategory::ModifierLetter |
            GeneralCategory::OtherLetter => GeneralCategoryGroup::Letter,
            GeneralCategory::NonspacingMark |
            GeneralCategory::SpacingMark |
            GeneralCategory::EnclosingMark => GeneralCategoryGroup::Mark,
            GeneralCategory::DecimalNumber |
            GeneralCategory::LetterNumber |
            GeneralCategory::OtherNumber => GeneralCategoryGroup::Number,
            GeneralCategory::ConnectorPunctuation |
            GeneralCategory::DashPunctuation |
            GeneralCategory::OpenPunctuation |
            GeneralCategory::ClosePunctuation |
            GeneralCategory::InitialPunctuation |
            GeneralCategory::FinalPunctuation |
            GeneralCategory::OtherPunctuation => GeneralCategoryGroup::Punctuation,
            GeneralCategory::MathSymbol |
            GeneralCategory::CurrencySymbol |
            GeneralCategory::ModifierSymbol |
            GeneralCategory::OtherSymbol => GeneralCategoryGroup::Symbol,
            GeneralCategory::SpaceSeparator |
            GeneralCategory::LineSeparator |
            GeneralCategory::ParagraphSeparator => GeneralCategoryGroup::Separator,
            GeneralCategory::Control |
            GeneralCategory::Format |
            GeneralCategory::Surrogate |
            GeneralCategory::PrivateUse |
            GeneralCategory::Unassigned => GeneralCategoryGroup::Other,
        }
    }

""")
    f.write(" // General category table:\n")
    general_category_char_table = load_general_category_properties("UnicodeData.txt")
    general_category_group_table = _merge_general_category_runs(general_category_char_table)
    emit_table(f, "GENERAL_CATEGORY", general_category_group_table, "&[(char, char, GeneralCategory)]", is_pub=False,
            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), _GC_VARIANTS[x[2]]))
    f.write("}\n\n")


def _emoji_status_variant(props_key):
    """Map a ";"-joined property combination to Rust source for its variant.

    An unknown combination yields a `NewCombination` expression that is not
    a variant of the emitted enum.
    """
    if props_key in _EMOJI_STATUS_VARIANTS:
        return _EMOJI_STATUS_VARIANTS[props_key]
    return "EmojiStatus::NewCombination(\"" + props_key + "\")"


def _sweep_emoji_status_ranges(ranges_by_prop, prop_names):
    """Split 0..=0x10FFFF into maximal runs covered by one set of properties.

    Each property in `prop_names` has a list of inclusive `(lo, hi)` ranges
    in `ranges_by_prop`. The sweep is only correct if every list is in
    ascending order and non-overlapping; this is not checked.
    Returns `(first, last, variant)` tuples; runs whose variant text is
    empty are omitted.
    """
    # Per property, index of the first range not yet fully consumed.
    cursors = [0] * len(prop_names)
    table = []
    group_first = 0
    while group_first <= _LAST_CODE_POINT:
        group_last = _LAST_CODE_POINT
        active = []  # indices of properties whose current range covers group_first
        for idx, prop in enumerate(prop_names):
            ranges = ranges_by_prop[prop]
            if cursors[idx] >= len(ranges):
                continue
            lo, hi = ranges[cursors[idx]]
            if lo > group_first:
                # The run must stop before this property starts applying.
                group_last = min(group_last, lo - 1)
            else:
                active.append(idx)
                group_last = min(group_last, hi)

        variant = _emoji_status_variant(";".join(prop_names[idx] for idx in active))
        if variant != "":
            table.append((group_first, group_last, variant))

        # Move past every range that ends exactly where this run ends.
        for idx in active:
            if ranges_by_prop[prop_names[idx]][cursors[idx]][1] == group_last:
                cursors[idx] += 1
        group_first = group_last + 1
    return table


def emit_emoji_module(f):
    """Write the feature-gated Rust `emoji` module to `f`.

    The module holds the `EmojiStatus` enum, its helper predicates, and an
    `EMOJI_STATUS` range table built from emoji/emoji-data.txt.
    """
    f.write("""#[cfg(feature = \"emoji\")]
pub mod emoji {""")
    f.write("""
    #[derive(Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd, Debug)]
    #[non_exhaustive]
    /// The emoji character properties of a character.
    pub enum EmojiStatus {
        /// `Emoji=NO`, `Emoji_Component=NO`
        NonEmoji,
        /// `Emoji=NO`, `Emoji_Component=YES`
        NonEmojiButEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`
        EmojiPresentation,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Modifier_Base=YES`
        EmojiModifierBase,
        /// `Emoji=YES`, `Emoji_Component=NO`;`Emoji_Presentation=YES`, `Emoji_Modifier_Base=YES`
        EmojiPresentationAndModifierBase,
        /// `Emoji=YES`, `Emoji_Component=NO`
        EmojiOther,
        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`
        EmojiPresentationAndEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=YES`;`Emoji_Presentation=YES`, `Emoji_Modifier=YES`
        EmojiPresentationAndModifierAndEmojiComponent,
        /// `Emoji=YES`, `Emoji_Component=YES`
        EmojiOtherAndEmojiComponent,
    }
    #[inline]
    pub(crate) fn emoji_status(c: char) -> EmojiStatus {
        // FIXME: do we want to special case ASCII here?
        super::util::bsearch_range_value_table(c, EMOJI_STATUS).unwrap()
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_char_or_emoji_component(s: EmojiStatus) -> bool {
        !matches!(s, EmojiStatus::NonEmoji)
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_char(s: EmojiStatus) -> bool {
        !matches!(s, EmojiStatus::NonEmoji | EmojiStatus::NonEmojiButEmojiComponent)
    }
    #[inline]
    pub(crate) fn is_emoji_status_for_emoji_component(s: EmojiStatus) -> bool {
        matches!(s, EmojiStatus::EmojiPresentationAndEmojiComponent |
            EmojiStatus::EmojiPresentationAndModifierAndEmojiComponent |
            EmojiStatus::EmojiOtherAndEmojiComponent)
    }
""")

    f.write(" // Emoji status table:\n")
    emoji_status_table = load_emoji_properties("emoji/emoji-data.txt")
    # we combine things together here.
    # `Extended_Pictographic` is only for future proof usages, so it is
    # deliberately left out of the property list below.
    emoji_prop_list = ["Emoji", "Emoji_Presentation", "Emoji_Modifier", "Emoji_Modifier_Base", "Emoji_Component"]
    # need to skip surrogates because they're not representable by rust `char`s
    emoji_status_table["Surrogate"] = [(0xD800, 0xDFFF)]
    emoji_prop_list.append("Surrogate")

    emoji_table = _sweep_emoji_status_ranges(emoji_status_table, emoji_prop_list)

    emit_table(f, "EMOJI_STATUS", emoji_table, "&[(char, char, EmojiStatus)]", is_pub=False,
            pfun=lambda x: "(%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]))
    f.write("}\n\n")


def emit_util_mod(f):
    """Write the Rust `util` module with the range-table binary search to `f`."""
    # `T: Copy` is required: the emitted function copies the value out of a
    # shared slice (`let (_, _, cat) = r[idx];`).
    f.write("""
#[allow(dead_code)]
pub mod util {
    use core::result::Result::{Ok, Err};

    pub fn bsearch_range_value_table<T: Copy>(c: char, r: &'static [(char, char, T)]) -> Option<T> {
        use core::cmp::Ordering::{Equal, Less, Greater};
        match r.binary_search_by(|&(lo, hi, _)| {
            if lo <= c && c <= hi { Equal }
            else if hi < c { Less }
            else { Greater }
        }) {
            Ok(idx) => {
                let (_, _, cat) = r[idx];
                Some(cat)
            }
            Err(_) => None
        }
    }
}

""")


def main():
    """Regenerate tables.rs in the current directory."""
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w", encoding="utf-8") as rf:
        # write the file's preamble
        rf.write(preamble)
        # NOTE(review): the emitted doc comment names "unicode-security";
        # confirm this is the crate the tables are generated for.
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-security is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % UNICODE_VERSION)

        emit_util_mod(rf)
        ### general category module
        emit_general_category_module(rf)
        ### emoji module
        emit_emoji_module(rf)


if __name__ == "__main__":
    main()