#!/usr/bin/env python3
"""Code generation helper for the MARC relator list.

Two sub-commands are provided:

* ``parse-html`` reads the cached ``data/marc-relators.html`` page and writes
  the relators it finds to ``data/marc-relators.csv``.
* ``codegen`` reads that CSV and renders the ``templates/inner.rs`` Jinja
  template into ``src/inner.rs``.
"""
import csv
import re
import sys
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass
from itertools import takewhile
from pathlib import Path
from typing import List, Optional

from bs4 import BeautifulSoup, NavigableString
from jinja2 import Environment, FileSystemLoader, StrictUndefined

ROOT = Path(__file__).parent
DATA_DIR = ROOT / "data"
INPUT_HTML = DATA_DIR / "marc-relators.html"
CSV_FILE = DATA_DIR / "marc-relators.csv"
OUTPUT_RS = ROOT / "src" / "inner.rs"


@dataclass(frozen=True)
class MarcRelator:
    """One row of the relator CSV, plus helpers used when rendering it."""

    name: str
    code: str
    note: str
    # Explicit identifier to use instead of the one derived from ``name``.
    name_override: Optional[str]

    @property
    def variant(self) -> str:
        """Return the identifier for this relator.

        The override wins when it is set (non-empty); otherwise the name is
        split on spaces and hyphens and each piece is capitalized and joined,
        e.g. ``"Book designer"`` -> ``"BookDesigner"``.
        """
        if self.name_override:
            return self.name_override.strip()
        words = [word for word in re.split(r"[ -]", self.name) if word]
        return "".join(word.capitalize() for word in words).strip()

    @property
    def doc_str(self) -> str:
        """Return the note with square brackets backslash-escaped.

        NOTE(review): presumably so the text is safe inside a Rust doc
        comment (brackets would otherwise look like links) - confirm against
        the template.
        """
        return self.note.replace("[", "\\[").replace("]", "\\]")

    @property
    def note_str(self) -> str:
        """Return the note with backslashes and double quotes escaped.

        NOTE(review): presumably for embedding in a Rust string literal -
        confirm against the template.
        """
        return self.note.replace("\\", "\\\\").replace('"', '\\"')


def main() -> None:
    """Parse the command line and dispatch to the selected sub-command."""
    args = arg_parser().parse_args()
    args.func(args)


def arg_parser() -> ArgumentParser:
    """Build the argument parser with the two sub-commands.

    Each sub-parser stores its handler in ``func`` so ``main`` can call it.
    """
    parser = ArgumentParser(Path(__file__).name, description="Codegen help")
    subs = parser.add_subparsers(help="Commands to run", required=True)

    html = subs.add_parser("parse-html", help="Parse HTML to CSV")
    html.set_defaults(func=parse_html)

    code = subs.add_parser("codegen", help="Turn the CSV into Rust code")
    code.set_defaults(func=codegen)

    return parser


def _load_relators() -> List[MarcRelator]:
    """Read every row of ``CSV_FILE`` into a ``MarcRelator``."""
    # The csv module needs newline="" to handle embedded newlines correctly;
    # the encoding is pinned so the result does not depend on the locale.
    with open(CSV_FILE, newline="", encoding="utf-8") as f:
        return [
            MarcRelator(row["name"], row["code"], row["note"], row.get("name_override"))
            for row in csv.DictReader(f)
        ]


def codegen(args: Namespace) -> None:
    """Render ``templates/inner.rs`` with the CSV relators into ``OUTPUT_RS``.

    Args:
        args: Parsed command-line arguments (unused; required by the
            sub-command dispatch in ``main``).
    """
    relators = _load_relators()

    # The delimiters are all prefixed with `//`, i.e. they start like Rust
    # line comments in the template.
    env = Environment(
        loader=FileSystemLoader(ROOT / "templates"),
        block_start_string="//{%",
        block_end_string="%}//",
        variable_start_string="//{{",
        variable_end_string="}}//",
        comment_start_string="//#",
        comment_end_string="#//",
        autoescape=False,
        trim_blocks=False,
        lstrip_blocks=False,
        keep_trailing_newline=True,
        undefined=StrictUndefined,
    )
    template = env.get_template("inner.rs")
    data = template.render(relators=relators)

    # trim the data because `cargo fmt` seems to be upset at the very long lines
    stripped_lines = (line.strip() for line in data.split("\n"))
    trimmed_data = [line for line in stripped_lines if line]

    # Rust source files must be UTF-8, so do not rely on the locale default.
    with open(OUTPUT_RS, "w", encoding="utf-8") as f:
        f.write("\n".join(trimmed_data))


def _note_text(node) -> str:
    """Return the stripped note text of a ``use-note`` element.

    When the element has a single string child that string is used;
    otherwise the element's first child is used, which must be a text node.

    Raises:
        ValueError: If the element does not start with a text node.
    """
    if node.string:
        return node.string.strip()
    leading = next(
        takewhile(lambda child: isinstance(child, NavigableString), node.children),
        None,
    )
    if leading is None:
        raise ValueError(f"use-note element does not start with text: {node}")
    return leading.strip()


def parse_html(args: Namespace) -> None:
    """Extract (name, code, note) rows from ``INPUT_HTML`` into ``CSV_FILE``.

    The last of the page's four ``<dl>`` elements is walked as alternating
    ``<dt>``/following-element pairs. A ``<dt>`` without a
    ``span.authorized`` is skipped together with the element after it.

    Args:
        args: Parsed command-line arguments (unused; required by the
            sub-command dispatch in ``main``).

    Raises:
        ValueError: If the page does not have the expected overall shape or
            a use-note has no leading text.
        Exception: If the element following an authorized ``<dt>`` is not a
            ``use-note``.
    """
    # You'd think we could use `requests` for this, but it chokes on the encoding, so no
    print("Using cached output HTML")
    with open(INPUT_HTML, "r", encoding="utf-8") as f:
        html_text = f.read()

    soup = BeautifulSoup(html_text, features="html5lib")
    dl_nodes = soup.find_all("dl")
    # there are currently four of these, and we want the last one
    # (explicit raises rather than `assert`, which is stripped under -O)
    if len(dl_nodes) != 4:
        raise ValueError(f"Expected 4 <dl> elements, found {len(dl_nodes)}")
    parent = dl_nodes[3]
    # there's a bunch of stuff, make sure we grabbed (probably) the right one
    if len(parent.contents) <= 1000:
        raise ValueError(
            f"Expected more than 1000 child nodes in the last <dl>, "
            f"found {len(parent.contents)}"
        )

    print("Starting parsing")
    output = []
    skip_next_dd = False
    expecting_dt = True
    name = None
    code = None
    for node in parent.contents:
        # Bare text between elements carries no data.
        if isinstance(node, NavigableString):
            continue

        if expecting_dt:
            if node.name != "dt":
                continue
            authorized = node.select("span.authorized")
            if not authorized:
                # No authorized name: ignore this term and the element after it.
                expecting_dt = False
                skip_next_dd = True
                continue
            name = authorized[0].string.strip()
            raw_code = node.select("span.relator-code")[0].string
            code = raw_code.replace("[", "").replace("]", "").strip()
            expecting_dt = False
        elif not skip_next_dd:
            css_class = node.get("class")
            if not (css_class and "use-note" in css_class):
                raise Exception("Unknown CSS classes: " + str(css_class))
            output.append((name, code, _note_text(node)))
            expecting_dt = True
        else:
            skip_next_dd = False
            expecting_dt = True

    print("Writing output")
    # newline="" is required by the csv module (otherwise rows end in \r\r\n
    # on Windows); UTF-8 matches the encoding the HTML was read with.
    with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "code", "note", "name_override"])
        # Rows carry three values only; the name_override column is left
        # unset here.
        writer.writerows(output)

    print("Done")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("")
        sys.exit(1)