#!/usr/bin/env python3

"""
This script generates the data tables from Unicode Consortium case folding data.

See ``README.md`` for instructions on how to use this script.
"""

import re
import sys


def parse_version(lines):
    """Parse a triple containing the Unicode standard version.

    Consumes exactly one line from ``lines`` (which must be an iterator) and
    expects it to look like ``# CaseFolding-X.Y.Z.txt``.

    Returns:
        A tuple of three strings ``(major, minor, patch)``.

    Raises:
        ValueError: if the input is empty or the first line is not a
            recognizable ``CaseFolding`` header.
    """
    line = next(lines, None)
    if line is None:
        raise ValueError('input is empty -- expected a "# CaseFolding-X.Y.Z.txt" header')
    match = re.match(r'# CaseFolding-(\d+)\.(\d+)\.(\d+)\.txt', line)
    if match is None:
        raise ValueError('unrecognized header line: {!r}'.format(line))
    return match.groups()


def strip_comments(line):
    """Strip anything after a '#' character."""
    return line.partition('#')[0]


def parse_tables(lines):
    """Parse the main bulk of the data.

    Each non-blank, non-comment line has the form ``code; status; mapping;``
    where ``code`` and ``mapping`` are hexadecimal code points.

    Returns:
        A dict keyed by status (``'C'``, ``'F'``, ``'S'``, ``'T'``). Each value
        maps a source code point to its folded form: a list of code points for
        status ``'F'``, a single code point for every other status.

    Raises:
        ValueError: if a line does not have exactly four ``;``-separated
            fields, a number is not valid hexadecimal, or the Turkic (``'T'``)
            entries differ from the two mappings checked for below.
        KeyError: if a line carries a status other than C, F, S or T.
    """
    tables = {'C': {}, 'F': {}, 'S': {}, 'T': {}}
    for line in lines:
        line = strip_comments(line).strip()
        if not line:
            continue
        # The trailing ';' on every data line yields an empty fourth field.
        code, status, mapping, _empty = line.split(';')
        code = int(code, 16)
        status = status.strip()
        if status == 'F':
            # Only status F (full) mappings can expand to multiple chars
            mapping = [int(s, 16) for s in mapping.split()]
        else:
            mapping = int(mapping, 16)
        tables[status][code] = mapping
    # The Turkic table is only validated here; main() never emits it, so a
    # change in the upstream data must be noticed rather than silently ignored.
    if tables['T'] != {0x0049: 0x0131, 0x0130: 0x0069}:
        raise ValueError('Turkic tables have changed -- please update code')
    return tables


def render(codepoint):
    """Render a codepoint as a Rust hex escape."""
    return r"'\u{{{:x}}}'".format(codepoint)


def _print_char_table(name, doc, table):
    """Print a Rust ``(char, char)`` table named ``name``, sorted by code point."""
    print("""/// {}""".format(doc))
    print("""pub static {}: &'static [(char, char)] = &[""".format(name))
    for code, mapping in sorted(table.items()):
        print("""    ({}, {}),""".format(render(code), render(mapping)))
    print("""];""")


def _render_buffer(code, mapping):
    """Render the chars after the first of a full mapping as a Rust ``Buffer``."""
    if len(mapping) == 1:
        return 'Buffer::Zero'
    if len(mapping) == 2:
        return 'Buffer::One({})'.format(render(mapping[1]))
    if len(mapping) == 3:
        return 'Buffer::Two({}, {})'.format(render(mapping[1]), render(mapping[2]))
    # Buffer has no variant for zero or for more than three code points.
    raise ValueError('code {} maps to a string of length {}'.format(code, len(mapping)))


def main(lines):
    """Read case folding data from ``lines`` and print Rust source to stdout.

    ``lines`` may be any iterable of text lines (an open file, ``sys.stdin``,
    a list, ...). The first line must be the ``# CaseFolding-X.Y.Z.txt``
    header; the rest is the data parsed by ``parse_tables``.

    Raises:
        ValueError: if the header or the data is malformed, or a full mapping
            has a length that ``Buffer`` cannot represent.
    """
    # Use one shared iterator so the header line is consumed exactly once,
    # even when the caller passes a re-iterable such as a list.
    lines = iter(lines)

    print('// NOTE: the following code was generated by `scripts/generate.py`; do not edit directly')

    version = parse_version(lines)
    print("""
/// The version of [Unicode](http://www.unicode.org/) that this version
/// of `unicode-casefold` is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = ({}, {}, {});
""".format(*version))

    print("""
#[derive(Copy, Clone, Debug)]
pub enum Buffer {
    Zero,
    One(char),
    Two(char, char),
}
""")

    tables = parse_tables(lines)

    _print_char_table(
        'COMMON_TABLE',
        'Common mappings shared by both the full and simple mappings.',
        tables['C'],
    )
    print()

    print("""/// Full mappings, which cause strings to grow in length.""")
    print("""pub static FULL_TABLE: &'static [(char, (char, Buffer))] = &[""")
    for code, mapping in sorted(tables['F'].items()):
        # Validate the length first so an empty mapping reports the intended
        # error instead of failing on mapping[0].
        variant = _render_buffer(code, mapping)
        print("""    ({}, ({}, {})),""".format(render(code), render(mapping[0]), variant))
    print("""];""")
    print()

    _print_char_table(
        'SIMPLE_TABLE',
        'Simple mappings, which differ from those in the `FULL_TABLE`.',
        tables['S'],
    )


if __name__ == '__main__':
    main(sys.stdin)