"""Scraper for parameter and attribute metadata from the Gurobi 11.0 reference manual."""
from pathlib import Path
import urllib.parse
import re

import aiohttp
from bs4 import BeautifulSoup

CACHE_DIR = Path("cache")
GUROBI_REF_MAN_URL = urllib.parse.urlparse('https://www.gurobi.com/documentation/11.0/refman/')

# Boilerplate paragraphs and navigation crumbs stripped from scraped documentation text.
DOC_REMOVE = [
    r"For examples of how to query or modify parameter values from our different APIs, refer to our Parameter Examples.",
    r"For examples of how to query or modify attributes, refer to our Attribute Examples.",
    r"Next:.+Up:.+Previous.+",
    r"One important note about integer-valued parameters: while the maximum value that can be stored in a signed integer is.+",
    r"Please refer to this section for more information on SOS constraints.",
    "next",
    "up",
    "previous",
]
DOC_REMOVE = [re.compile(r) for r in DOC_REMOVE]


def remove_newlines(s: str) -> str:
    """Collapse a multi-line string into a single space-separated line."""
    return ' '.join(filter(bool, s.splitlines()))


def get_url(path: str) -> str:
    """Resolve `path` relative to the reference manual root."""
    return GUROBI_REF_MAN_URL._replace(path=GUROBI_REF_MAN_URL.path + path).geturl()


async def fetch_html(session: aiohttp.ClientSession, path: str) -> str:
    """Fetch a manual page, serving from and populating the on-disk cache."""
    cache_path = CACHE_DIR / path
    if cache_path.exists():
        with open(cache_path, 'r') as fp:
            return fp.read()
    url = get_url(path)
    async with session.get(url) as res:
        print("GET", url, res.status)
        if res.status != 200:
            raise Exception(f"bad request: GET {url} returned {res.status}")
        text = await res.text()
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, 'w') as fp:
        fp.write(text)
    return text


def _clean_documentation(s: str) -> list:
    """Split scraped text into paragraphs and drop boilerplate (see DOC_REMOVE)."""
    paragraphs = map(lambda x: remove_newlines(x.strip()), filter(bool, s.split('\n\n')))
    return [p for p in paragraphs if not any(r.fullmatch(p) for r in DOC_REMOVE)]


# Manual type name -> short dtype tag used in the emitted data.
_DTYPES = {
    "double": "dbl",
    "string": "str",
    "int": "int",
    "char": "chr",
}


async def fetch_parameter_data(session: aiohttp.ClientSession, name: str, path: str) -> dict:
    """Scrape one parameter page: type, default, range, and documentation text."""
    doc = await fetch_html(session, path)
    soup = BeautifulSoup(doc, 'html.parser')
    replace_images_with_alt(soup)
    table = soup.find("div", class_="documentation__content").find("table")
    data = {"name": name, "url": get_url(path)}
    ty = table.find(string="Type:", recursive=True).parent.parent.find_next_sibling('td').text
    data['ty'] = ty
    data['dtype'] = _DTYPES[ty]
    data['default'] = table.find(string="Default value:", recursive=True).parent.parent.find_next_sibling('td').text
    if ty == 'int' or ty == "double":
        data['min'] = table.find(string="Minimum value:", recursive=True).parent.parent.find_next_sibling('td').text
        data['max'] = table.find(string="Maximum value:", recursive=True).parent.parent.find_next_sibling('td').text
    documentation = _clean_documentation(table.find_next_sibling('p').text)
    # Command-line-only parameters carry this exact paragraph in their docs.
    data['cl_only'] = "Note: Command-line only" in documentation
    data['doc'] = documentation
    return data


# "Up:" navigation link text -> object-type tag for attributes.
_OTYPES = {
    "Model": "model",
    "Multi-objective": "model",
    "Multi-Scenario": "model",
    "Quality": "model",
    "Linear Constraint": "constr",
    "Quadratic Constraint": "qconstr",
    "SOS": "sos",
    "Variable": "var",
    "General Constraint": "gconstr",
    "Batch": "batch",
}
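# The table parsing in fetch_parameter_data above and fetch_attribute_data
# below assumes a summary table shaped roughly like this (a sketch inferred
# from the parsing logic, not copied from the manual):
#
#   <table>
#     <tr><td><b>Type:</b></td><td>int</td></tr>
#     <tr><td><b>Default value:</b></td><td>0</td></tr>
#     ...
#   </table>
#
# find(string="Type:") lands on the text node, .parent.parent climbs to the
# label <td>, and find_next_sibling('td') reads the value cell.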
async def fetch_attribute_data(session: aiohttp.ClientSession, name: str, path: str) -> dict:
    """Scrape one attribute page: type, modifiability, owning object, and docs."""
    doc = await fetch_html(session, path)
    soup = BeautifulSoup(doc, 'html.parser')
    replace_images_with_alt(soup)
    table = soup.find("div", class_="documentation__content").find("table")
    data = {"name": name, "url": get_url(path)}
    ty = table.find(string="Type:", recursive=True).parent.parent.find_next_sibling('td').text
    mod = table.find(string="Modifiable:", recursive=True).parent.parent.find_next_sibling('td').text
    # The owning object type comes from the "Up:" breadcrumb, e.g. "Variable Attributes".
    object_ty = (
        table.parent
        .find(class_='navigation')
        .find('b', string=lambda x: x and "Up:" in x)
        .find_next_sibling("a")
        .text.replace(" Attributes", "")
    )
    data['ty'] = ty
    data['dtype'] = _DTYPES[ty]
    data['otype'] = _OTYPES[object_ty]
    data['modifiable'] = mod.lower() == "yes"
    documentation = _clean_documentation(table.find_next_sibling('p').text)
    data['cl_only'] = "Note: Command-line only" in documentation
    data['doc'] = documentation
    return data


async def fetch_parameter_list(session: aiohttp.ClientSession) -> dict:
    """Map parameter names to their page hrefs from the parameter index."""
    html_doc = await fetch_html(session, "parameters.html")
    soup = BeautifulSoup(html_doc, 'html.parser')
    desc = soup.find("a", string="Parameter Descriptions")
    items = desc.find_next_sibling('ul').find_all('a')
    return {i.text: i.attrs['href'] for i in items}


async def fetch_attribute_list(session: aiohttp.ClientSession) -> dict:
    """Map attribute names to their page hrefs, skipping section and example links."""
    html_doc = await fetch_html(session, "attributes.html")
    soup = BeautifulSoup(html_doc, 'html.parser')
    attrlist = {}
    for i in soup.find('ul', class_="ChildLinks").find_all('a'):
        if 'Attributes' in i.text or 'Examples' in i.text:
            continue
        attrlist[i.text] = i.attrs['href']
    return attrlist


def http_session() -> aiohttp.ClientSession:
    """Session with a browser user-agent and a cap on concurrent connections."""
    http_headers = {'user-agent': "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0"}
    return aiohttp.ClientSession(connector=aiohttp.TCPConnector(limit=20), headers=http_headers)


def replace_images_with_alt(soup):
    """Swap each <img> for its alt text (parsed as HTML) so it survives .text extraction."""
    for img in soup.find_all("img"):
        try:
            alt = img.attrs["alt"]
        except KeyError:
            continue
        img.replace_with(BeautifulSoup(alt, "html.parser"))
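# --- Usage sketch ----------------------------------------------------------
# A minimal driver showing how the pieces above compose; this entry point and
# the printed fields are illustrative assumptions, not part of the original
# module. It fetches the parameter index, then scrapes each parameter page
# concurrently, reusing the on-disk cache on later runs.
import asyncio


async def _demo() -> None:
    async with http_session() as session:
        params = await fetch_parameter_list(session)  # name -> href
        results = await asyncio.gather(*(
            fetch_parameter_data(session, name, href)
            for name, href in params.items()
        ))
    for data in results:
        print(data['name'], data['dtype'], data['default'])


if __name__ == "__main__":
    asyncio.run(_demo())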