import re
from html.parser import HTMLParser
import requests
from sys import stdout, argv
import json
from collections import deque
from datetime import datetime, timezone
from pathlib import Path
class FetchingDecompositionHTMLParser(HTMLParser):
def __init__(self):
super().__init__()
self.in_td = False
self.in_p = False
# {"(Unicode char)": "(decomposed chars)"}
self.encoding_dic = {}
# {"(element)": {"currnet": "(composed char)", "next": {(sub dictionary)}}}
self.decoding_dic = {}
self.overall_regex = re.compile(r"0x([0-9A-F]+)(?: 0x([0-9A-F]+))*")
self.one_regex = re.compile(r"0x([0-9A-F]+)")
self.char_to_be_composed = ""
def handle_starttag(self, tag, attrs):
"""
Check the beginning of td & p tags
"""
if tag.lower() == "td":
self.in_td = True
if tag.lower() == "p" and self.in_td:
self.in_p = True
def handle_endtag(self, tag):
"""
Check the end of td & p tags
"""
if tag.lower() == "td" and self.in_td:
self.in_td = False
if tag.lower() == "p" and self.in_p:
self.in_p = False
def handle_data(self, data):
if self.in_p and self.in_td:
overall_match = self.overall_regex.match(data)
if overall_match is not None:
codepoints = [
chr(int(m[1], 16))
for m in (
self.one_regex.match(codepoint_str)
for codepoint_str in data.split(" ")
)
if m is not None
]
# decomposition definition
if len(codepoints) >= 2:
self.encoding_dic[self.char_to_be_composed] = "".join(codepoints)
self.decoding_dic.setdefault(
codepoints[0], {"current": None, "next": {}}
)
d = self.decoding_dic[codepoints[0]]
for c in codepoints[1:]:
# `"current": None` may be overwritten later
d["next"].setdefault(c, {"current": None, "next": {}})
d = d["next"][c]
d["current"] = self.char_to_be_composed
self.char_to_be_composed = ""
# character to be decomposed
else:
self.char_to_be_composed = codepoints[0]
if __name__ == "__main__":
parser = FetchingDecompositionHTMLParser()
with requests.get(
"https://developer.apple.com/library/archive/technotes/tn/tn1150table.html"
) as req:
parser.feed(req.text)
timestamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
assets_dir = Path(argv[0]).parent / "assets"
assets_dir.mkdir(exist_ok=True)
with (assets_dir / "hfs_table.json").open("w", encoding="UTF-8", newline="\n") as f:
json.dump(
{
"created": timestamp,
"encoding": parser.encoding_dic,
"decoding": parser.decoding_dic,
},
f,
)