#!/usr/bin/env python3
# Trivet
# Copyright (c) 2023 by Stacy Prowell.  All rights reserved.
# https://gitlab.com/binary-tools/trivet

"""
Read the XML Unicode database file and construct a Rust version for use
by Trivet.

This converts the XML into a literal data structure in Rust.  The result
is a constant array of (&str, char) pairs.  The strings come from the
names (na attribute of each char element) and the aliases (alias
attribute of contained name-alias elements).

Obtain the Unicode database from https://unicode.org/ucd/.  The desired
file is the flat complete set, and is named ucd.all.flat.xml.

This only needs to be run if the Unicode database needs to be updated.
Run with:

    $ python3 build_unicode_db.py > ucd.rs

Check the output to make sure it looks reasonable, then move it into
place in the src/parse/strings folder.

    $ mv ucd.rs ../src/parse/strings/ucd.rs
"""

import xml.etree.ElementTree as ET

# Namespace prefix (in ElementTree's "{uri}tag" form) used for every tag
# looked up in the database.
_UCD_NS = '{http://www.unicode.org/ns/2003/ucd/1.0}'


def _collect_names(root):
    """Return a dict mapping upper-cased names and aliases to Rust escapes.

    Each value is the body of a Rust character literal, such as
    ``\\u{0041}``.  Insertion order follows document order; when the same
    key occurs more than once the last occurrence wins.
    """
    table = {}
    for character in root.iter(_UCD_NS + 'char'):
        # Elements without a single code point (for example ranges given
        # with first-cp / last-cp) cannot be mapped to one char.  Skip
        # them instead of reusing the code point of the previous element.
        code_point = character.attrib.get('cp')
        if not code_point:
            continue
        literal = f'\\u{{{code_point}}}'

        # In the XML database a '#' inside a name stands for the code
        # point of the character, so expand it to get the real name.
        name = character.attrib.get('na', '')
        if name:
            table[name.replace('#', code_point).upper()] = literal

        # Aliases live on nested name-alias elements.
        for alias_element in character.iter(_UCD_NS + 'name-alias'):
            alias = alias_element.attrib.get('alias', '')
            if alias:
                table[alias.upper()] = literal
    return table


def _render_rust(ucd):
    """Return the text of the Rust source file for the given name table."""
    lines = [
        "// trivet",
        "// copyright",
        "",
        "//! Provide the Unicode database.",
        "",
        "/// The Unicode database.",
        "///",
        "/// This is generated from the complete database, which can be obtained from",
        "/// [unicode.org](https://unicode.org/ucd/).  It is an array of pairs, with",
        "/// each pair consisting of a Unicode name or alias and the corresponding code",
        "/// point.",
        "",
        "pub const UCD: &[(&str, char)] = &[",
    ]
    lines.extend(
        f"    (\"{name}\", '{code_point}'),"
        for name, code_point in ucd.items()
    )
    lines.append("];")
    return "\n".join(lines)


def main(path="ucd.all.flat.xml"):
    """Entry point when run from the prompt.

    Parse the Unicode XML database at ``path`` and write the generated
    Rust source to standard output.  Errors raised by the XML parser (a
    missing or malformed file) are not caught.
    """
    # Open and read the Unicode XML data file.
    root = ET.parse(path).getroot()

    # Extract the names and aliases, then emit the Rust file.
    print(_render_rust(_collect_names(root)))


if __name__ == '__main__':
    main()