#! /usr/bin/python

# Copyright (C) 2016 Apple Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# This tool processes the Unicode Character Database file CaseFolding.txt to create
# canonicalization table as described in ECMAScript 6 standard in section
# "21.2.2.8.2 Runtime Semantics: Canonicalize()", step 2.

import optparse
import re
import sys

# The "sets" module only exists in Python 2; fall back to the builtin set type
# so the name Set stays usable on interpreters that no longer ship it.
try:
    from sets import Set
except ImportError:
    Set = set

# Boilerplate written at the top of the generated C++ file.
# NOTE(review): the line breaks and blank lines inside this literal were
# reconstructed; everything before the closing "*/" is a C comment, but
# confirm the exact layout against the upstream script.
header = """/*
 * Copyright (C) 2016 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// DO NO EDIT! - This file was generated by generateYarrCanonicalizeUnicode

#include "config.h"
#include "YarrCanonicalize.h"

namespace JSC { namespace Yarr {

"""

# Boilerplate written at the end of the generated C++ file; closes the two
# namespaces opened by the header.
footer = """} } // JSC::Yarr
"""

# Highest code point covered by the generated tables.
MaxUnicode = 0x10ffff

# Matches a CaseFolding.txt entry whose status field is C or S, i.e.
# "<code>; C; <mapping>" or "<code>; S; <mapping>".  The groups are named
# because the parser reads them back as group('code') and group('mapping').
commonAndSimpleLinesRE = re.compile(r"(?P<code>[0-9A-F]+)\s*;\s*[CS]\s*;\s*(?P<mapping>[0-9A-F]+)", re.IGNORECASE)

def openOrExit(path, mode):
    """Open *path* with *mode*, or report the failure and terminate.

    Returns the open file object on success.  If opening raises IOError, a
    one-line diagnostic with the path, errno and error text is written to
    stderr and the process exits with status 1 (SystemExit), so callers never
    see the IOError itself.
    """
    try:
        return open(path, mode)
    except IOError as e:
        # Write through sys.stderr rather than a print statement: it behaves
        # the same on Python 2 and 3 and keeps diagnostics off stdout.
        sys.stderr.write("I/O error opening {0}, ({1}): {2}\n".format(path, e.errno, e.strerror))
        # sys.exit is always available; the bare exit() helper is only
        # installed by the site module.
        sys.exit(1)

class Canonicalize:
    """Builds case-insensitive canonicalization tables from case-folding data.

    ``canonicalGroups`` maps a fold target (the code point that characters
    canonicalize to) to the list of code points that canonicalize to it.
    readCaseFolding() fills the groups; createTables() turns them into C++
    table definitions.
    """

    def __init__(self):
        self.canonicalGroups = {}

    def addMapping(self, code, mapping):
        """Record that code point *code* canonicalizes to *mapping*."""
        self.canonicalGroups.setdefault(mapping, []).append(code)

    def readCaseFolding(self, file):
        """Populate the groups from an iterable of CaseFolding.txt lines.

        Anything after a '#' is ignored, as are lines that do not match
        commonAndSimpleLinesRE.  Every code point in 0..MaxUnicode that never
        appeared as the source of a matched line is then mapped to itself, so
        each code point ends up in exactly one group.
        """
        # The builtin set replaces sets.Set, which newer Pythons do not ship.
        codesSeen = set()
        for line in file:
            # Strip the trailing comment, then trailing whitespace/newline.
            line = line.split('#', 1)[0].rstrip()
            if not line:
                continue

            fields = commonAndSimpleLinesRE.match(line)
            if not fields:
                continue

            code = int(fields.group('code'), 16)
            mapping = int(fields.group('mapping'), 16)

            codesSeen.add(code)
            self.addMapping(code, mapping)

        # Code points without an explicit folding canonicalize to themselves.
        for codePoint in range(MaxUnicode + 1):
            if codePoint not in codesSeen:
                self.addMapping(codePoint, codePoint)

    def createTables(self, file):
        """Write the character-set and range tables to *file* as C++ source.

        Each code point gets a "Type:value" tag chosen from its group:

        * one member: ``CanonicalizeUnique`` with value 0;
        * more than two members: ``CanonicalizeSet`` with the index of the
          emitted character-set array;
        * two adjacent members: ``CanonicalizeAlternatingAligned`` when the
          lower one is even, ``CanonicalizeAlternatingUnaligned`` when odd;
        * two members further apart: ``CanonicalizeRangeLo`` on the lower and
          ``CanonicalizeRangeHi`` on the higher, valued by their distance.

        Runs of consecutive code points with identical tags are merged into a
        single range entry.  The groups must cover every code point up to
        MaxUnicode (as readCaseFolding() guarantees); an untagged code point
        makes the range output fail.  Multi-member groups are sorted in
        place as a side effect.
        """
        typeInfo = [""] * (MaxUnicode + 1)
        characterSets = []

        for mapping in sorted(self.canonicalGroups):
            characters = self.canonicalGroups[mapping]
            if len(characters) == 1:
                typeInfo[characters[0]] = "CanonicalizeUnique:0"
                continue

            characters.sort()
            if len(characters) > 2:
                # The value is the index this group will have in characterSets.
                setType = "CanonicalizeSet:%d" % len(characterSets)
                for ch in characters:
                    typeInfo[ch] = setType
                characterSets.append(characters)
                continue

            low = characters[0]
            high = characters[1]
            delta = high - low
            if delta == 1:
                if low & 1:
                    pairType = "CanonicalizeAlternatingUnaligned:0"
                else:
                    pairType = "CanonicalizeAlternatingAligned:0"
                typeInfo[low] = pairType
                typeInfo[high] = pairType
            else:
                typeInfo[low] = "CanonicalizeRangeLo:%d" % delta
                typeInfo[high] = "CanonicalizeRangeHi:%d" % delta

        # Collapse runs of identically tagged code points into ranges.
        rangeInfo = []
        end = 0
        while end <= MaxUnicode:
            begin = end
            rangeType = typeInfo[end]
            while end < MaxUnicode and typeInfo[end + 1] == rangeType:
                end = end + 1
            rangeInfo.append({"begin": begin, "end": end, "type": rangeType})
            end = end + 1

        # One array per multi-member group, each terminated by a 0 entry.
        for index, characterSet in enumerate(characterSets):
            characters = "".join("0x{character:04x}, ".format(character=ch) for ch in characterSet)
            file.write("const UChar32 unicodeCharacterSet{index:d}[] = {{ {characters}0 }};\n".format(index=index, characters=characters))

        file.write("\n")
        file.write("static const size_t UNICODE_CANONICALIZATION_SETS = {setCount:d};\n".format(setCount=len(characterSets)))
        file.write("const UChar32* const unicodeCharacterSetInfo[UNICODE_CANONICALIZATION_SETS] = {\n")

        for index in range(len(characterSets)):
            file.write("    unicodeCharacterSet{setNumber:d},\n".format(setNumber=index))

        file.write("};\n")
        file.write("\n")
        file.write("const size_t UNICODE_CANONICALIZATION_RANGES = {rangeCount:d};\n".format(rangeCount=len(rangeInfo)))
        file.write("const CanonicalizationRange unicodeRangeInfo[UNICODE_CANONICALIZATION_RANGES] = {\n")

        for info in rangeInfo:
            # Tags have the form "TypeName:value".
            typeAndValue = info["type"].split(":")
            file.write("    {{ 0x{begin:04x}, 0x{end:04x}, 0x{value:04x}, {type} }},\n".format(begin=info["begin"], end=info["end"], value=int(typeAndValue[1]), type=typeAndValue[0]))

        file.write("};\n")
        file.write("\n")

        
if __name__ == "__main__":
    # Command-line entry point: parse a case-folding file and write the
    # generated tables, wrapped in the header/footer boilerplate.
    parser = optparse.OptionParser(usage="usage: %prog <CaseFolding.txt> <output-file>")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("expected exactly two arguments: <CaseFolding.txt> <output-file>")

    caseFoldingTxtPath = args[0]
    canonicalizeHPath = args[1]

    canonicalize = Canonicalize()

    # Parse the input completely before touching the output path, so a missing
    # or unreadable input does not leave a freshly truncated output file.
    with openOrExit(caseFoldingTxtPath, "r") as caseFoldingTxtFile:
        canonicalize.readCaseFolding(caseFoldingTxtFile)

    # NOTE(review): binary mode accepts the str writes below only on Python 2;
    # Python 3 would need a text mode here.
    with openOrExit(canonicalizeHPath, "wb") as canonicalizeHFile:
        canonicalizeHFile.write(header)
        canonicalize.createTables(canonicalizeHFile)
        canonicalizeHFile.write(footer)

    sys.exit(0)