"""Extract the rows of the first ``<tbody>`` in ``nls_info.html`` as JSON.

Run as a script, this reads ``nls_info.html`` from the current directory,
converts every data row with :func:`convert_row` and writes the resulting
list to ``nls_info.json``.  Importing the module has no side effects.
"""

from html.parser import HTMLParser
import json


class TableMiningParser(HTMLParser):
    """Collect the cell texts of the first ``<tbody>`` fed to the parser.

    Every ``<tr>`` directly inside the first ``<tbody>`` becomes one list of
    strings in ``content_list``.  Rows containing a ``<th>`` are treated as
    header rows and dropped.  Anything inside a nested ``<tbody>`` and
    everything after the first ``<tbody>`` has been closed is ignored.

    The optional end tags ``</td>`` and ``</tr>`` may be omitted in the
    input: a cell is closed by the next ``<td>``/``<th>``, and a row is
    closed by the next ``<tr>`` or by the closing ``</tbody>``.
    """

    def __init__(self):
        super().__init__()
        self.n_tbody = 0  # current <tbody> nesting depth
        self.content_list = []  # completed data rows (lists of str)
        self.content_row = []  # cell texts of the row being read
        self.in_header_row = False  # current row contained a <th>
        self.in_tr = False
        self.in_td = False
        self.analyze_finished = False  # first <tbody> has been closed
        self.in_link_in_td = False  # inside an <a> within the current <td>

    def _close_cell(self):
        """Leave the current ``<td>`` and any link that was open inside it."""
        self.in_td = False
        self.in_link_in_td = False

    def _close_row(self):
        """Finish the current row, storing it unless it was a header row."""
        self._close_cell()
        self.in_tr = False
        if not self.in_header_row:
            self.content_list.append(self.content_row)
        self.content_row = []
        self.in_header_row = False

    def handle_starttag(self, tag, attrs):
        """Track entry into ``tbody``/``tr``/``th``/``td``/``a`` elements."""
        if self.analyze_finished:
            return
        if tag == "tbody":
            self.n_tbody += 1
        if self.n_tbody != 1:
            return
        if tag == "tr":
            # A new row implicitly ends a previous row whose </tr> was
            # omitted.
            if self.in_tr:
                self._close_row()
            self.in_tr = True
        elif not self.in_tr:
            return
        elif tag == "th":
            self._close_cell()
            self.in_header_row = True
        elif tag == "td":
            # Also ends a previous cell whose </td> was omitted.
            self._close_cell()
            self.in_td = True
        elif tag == "a" and self.in_td:
            self.in_link_in_td = True

    def handle_endtag(self, tag):
        """Track exit from elements and store each completed row."""
        if self.analyze_finished:
            return
        if tag == "tbody":
            self.n_tbody -= 1
            if self.n_tbody == 0:
                # The last row may lack its </tr>; do not lose it.
                if self.in_tr:
                    self._close_row()
                self.analyze_finished = True
            return
        if self.n_tbody != 1 or not self.in_tr:
            return
        if tag == "tr":
            # Close the row even if the last cell's </td> was omitted.
            self._close_row()
        elif tag == "td":
            self._close_cell()
        elif tag == "a":
            self.in_link_in_td = False

    def handle_data(self, data):
        """Append the cleaned text of a ``<td>`` to the current row."""
        if not (self.n_tbody == 1 and self.in_tr and self.in_td):
            return
        # Remove surrounding whitespace, then surrounding left-to-right /
        # right-to-left marks (U+200E / U+200F).
        cleaned_data = data.strip().strip("\u200E\u200F")
        # Blank text is dropped, except inside a link, where it is kept as
        # an empty string so that the cell still occupies a column.
        if self.in_link_in_td or cleaned_data != "":
            self.content_row.append(cleaned_data)


def convert_row(row):
    """Convert one nine-column table row into a dictionary.

    Args:
        row: Iterable of nine strings, in order: locale id (hexadecimal),
            locale name, English full locale name, English language name,
            local locale name, ``acp`` (decimal), ``oemcp`` (decimal),
            country abbreviation, language abbreviation.

    Returns:
        A dict with the keys ``locale``, ``locale_id``, ``language``,
        ``locale_name_english``, ``locale_name_local``, ``acp``, ``oemcp``,
        ``country_abbrev`` and ``language_abbrev``; ``locale_id``, ``acp``
        and ``oemcp`` are integers.

    Raises:
        ValueError: If *row* does not have exactly nine items or one of the
            numeric columns cannot be parsed.
    """
    row = tuple(row)
    if len(row) != 9:
        raise ValueError(f"expected 9 columns, got {len(row)}: {row!r}")
    (
        locale_id_str,
        locale_name,
        locale_english_full_name,
        language_english_name,
        locale_local_name,
        acp_str,
        oemcp_str,
        country_abbrev,
        language_abbrev,
    ) = row
    return {
        "locale": locale_name,
        "locale_id": int(locale_id_str, 16),
        "language": language_english_name,
        "locale_name_english": locale_english_full_name,
        "locale_name_local": locale_local_name,
        "acp": int(acp_str),
        "oemcp": int(oemcp_str),
        "country_abbrev": country_abbrev,
        "language_abbrev": language_abbrev,
    }


def main():
    """Convert ``nls_info.html`` to ``nls_info.json`` in the working directory."""
    parser = TableMiningParser()
    with open("nls_info.html", encoding="UTF-8") as f:
        parser.feed(f.read())
    content_list = [convert_row(r) for r in parser.content_list]
    with open("nls_info.json", "w", encoding="UTF-8") as f:
        json.dump(content_list, f)


if __name__ == "__main__":
    main()