from typing import Optional, List, Dict, Tuple from io import TextIOBase import re import requests from sys import stdout, argv from pathlib import Path from datetime import datetime, timezone from abc import ABCMeta, abstractmethod import json class ITableGenerator(metaclass=ABCMeta): @abstractmethod def get_codepage_table(self, codepage: int) -> List[Optional[int]]: pass class UnicodeOrgTableGenerator(ITableGenerator): def get_codepage_definition_url(self, codepage: int) -> str: return f"https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/PC/CP{codepage}.TXT" def get_codepage_table(self, codepage: int) -> List[Optional[int]]: url = self.get_codepage_definition_url(codepage) req = requests.get(url) req.raise_for_status() entries = [ l.split("\t") for l in req.text.split("\n") if len(l) > 2 and l[:2] == "0x" ] dic: List[Optional[int]] = [None] * 256 for e in entries: index = int(e[0][2:], 16) if index | 255 != 255: continue dic[index] = ( int(e[1][2:], 16) if len(e[1]) > 2 and e[1][:2] == "0x" else None ) return dic class GitHubICUTableGenerator(ITableGenerator): from re import Pattern LIST_SEARCH_REGEX = re.compile(r"(?<=UCM_SOURCE_FILES = )(:?[\w\-\.]|\\\n)+") LIST_SPLITTING_REGEX = re.compile(r"(?: |\\\n)+") LIST_SEARCH_REGEX = re.compile(r"(?<=UCM_SOURCE_FILES = )(:?[\w\-\.]|\\\n)+") LIST_SPLITTING_REGEX = re.compile(r"(?: |\\\n)+") UCM_SEARCH_REGEX = re.compile( r"(?<=CHARMAP\n)( \\x[0-9A-F]+ \|\d\n)+(?=END CHARMAP)" ) # ones other than 0 don't make sense UCM_PARSING_REGEX = re.compile(r" \\x([0-9A-F]+) \|0") def search_ucm_file(self, search: str) -> str: file_list_data_request = requests.get( "https://raw.githubusercontent.com/unicode-org/icu/master/icu4c/source/data/mappings/ucmfiles.mk" ) file_list_data_request.raise_for_status() file_list_data_all = file_list_data_request.text file_list_data_search_result = GitHubICUTableGenerator.LIST_SEARCH_REGEX.search( file_list_data_all ) if file_list_data_request is None: raise FileNotFoundError("Couldn't extract code page file information") file_list_data: str = file_list_data_search_result[0] file_list: List[str] = GitHubICUTableGenerator.LIST_SPLITTING_REGEX.split( file_list_data ) ucm_file_name = next(iter((l for l in file_list if l.startswith(search)))) return f"https://raw.githubusercontent.com/unicode-org/icu/master/icu4c/source/data/mappings/{ucm_file_name}" def get_codepage_definition_url(self, codepage: int) -> str: return self.search_ucm_file(f"{self.prefix}-{codepage}") def get_codepage_table(self, codepage: int) -> List[Optional[int]]: url = self.get_codepage_definition_url(codepage) req = requests.get(url) req.raise_for_status() ucm_search_result = GitHubICUTableGenerator.UCM_SEARCH_REGEX.search(req.text) if ucm_search_result is None: raise RuntimeError( f"Syntax of UCM file is different from what is expected. (cp{codepage})" ) entries = list( map( lambda r: (int(r[1], 16), int(r[2], 16)), filter( lambda r: r is not None, map( lambda l: GitHubICUTableGenerator.UCM_PARSING_REGEX.search(l), ucm_search_result[0].rstrip("\n").split("\n"), ), ), ) ) dic: List[Optional[int]] = [None] * 256 for unicode_code, index in entries: if index | 255 != 255: continue dic[index] = unicode_code return dic def __init__(self, prefix="ibm"): self.prefix = prefix class CombinedGenerator(ITableGenerator): def __init__(self, table: List[Tuple[slice, ITableGenerator]]): self.table: List[Tuple[slice, ITableGenerator]] = table def get_codepage_table(self, codepage: int) -> List[Optional[int]]: dic: List[Optional[int]] = [None] * 256 for _slice, subgenerator in self.table: subdic = subgenerator.get_codepage_table(codepage) dic[_slice] = subdic[_slice] return dic target_codepages = { "unicode.org": [ 437, 737, 775, 850, 852, 855, 857, 860, 861, 862, 863, 864, 865, 866, 869, ], "icu.ibm": [720, 858], # Windows defines private use code points "combine.cp874": [874], } def main(): unicode_org = UnicodeOrgTableGenerator() icu_ibm = GitHubICUTableGenerator("ibm") icu_win = GitHubICUTableGenerator("windows") cp874 = CombinedGenerator( [(slice(None), unicode_org), (slice(0x80, 0xA0), icu_win)] ) dic = {} for cp_list, generator in { "unicode.org": unicode_org, "icu.ibm": icu_ibm, "combine.cp874": cp874, }.items(): for codepage in target_codepages[cp_list]: dic[codepage] = generator.get_codepage_table(codepage) timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds") assets_path = Path(argv[0]).parent / "assets" assets_path.mkdir(exist_ok=True) with (assets_path / "code_tables.json").open( "w", encoding="UTF-8", newline="\n" ) as f: json.dump({"created": timestamp, "tables": dic}, f) if __name__ == "__main__": main()