#!/usr/bin/env python3
"""Scrape Unicode block tables from Wikipedia and print a Rust source file.

For every entry in ``categories`` the listed Wikipedia pages are downloaded,
the single-character table cells that carry a ``title`` attribute are
collected, and one ``pub const NAME : &'static [char]`` array per category is
written to stdout. A Rust test module asserting the expected array lengths is
printed last.

The script does its work at module level, so it runs when executed (or
imported) and needs network access.
"""
import requests
from bs4 import BeautifulSoup
import bs4

"""
Languages are messy and I am too lazy to support them :p

lang = [#"https://en.wikipedia.org/wiki/NKo_(Unicode_block)",
        #"https://en.wikipedia.org/wiki/Samaritan_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Syriac_(Unicode_block)",
        #"https://en.wikipedia.org/wiki/Thaana_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Tifinagh_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Devanagari_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Bengali_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)",
        "https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)"
        ]
lang_bali = ["https://en.wikipedia.org/wiki/Balinese_(Unicode_block)"]
lang_bugi = ["https://en.wikipedia.org/wiki/Buginese_(Unicode_block)"]
lang_cher = ["https://en.wikipedia.org/wiki/Cherokee_(Unicode_block)"]
# Missing "Greek and Coptic" block
lang_copt = ["https://en.wikipedia.org/wiki/Coptic_(Unicode_block)",
             "https://en.wikipedia.org/wiki/Coptic_Epact_Numbers"]
# 256, missing a lot
lang_cyrl = ["https://en.wikipedia.org/wiki/Cyrillic_(Unicode_block)",
             "https://en.wikipedia.org/wiki/Cyrillic_Supplement"]
lang_geor = ["https://en.wikipedia.org/wiki/Georgian_(Unicode_block)"]
# 94 + 38
lang_glag = ["https://en.wikipedia.org/wiki/Glagolitic_(Unicode_block)",
             "https://en.wikipedia.org/wiki/Glagolitic_Supplement"]
# Missing "Greek and Coptic" block
# also missing "Combining and letter-free diacritics"
lang_grek = ["https://en.wikipedia.org/wiki/Greek_Extended"]
lang_mand = ["https://en.wikipedia.org/wiki/Mandaic_(Unicode_block)"]
"""

# --- Wikipedia pages to scrape, grouped by output category -----------------

symbols = ["https://en.wikipedia.org/wiki/Currency_Symbols_(Unicode_block)",
           "https://en.wikipedia.org/wiki/General_Punctuation",
           "https://en.wikipedia.org/wiki/Letterlike_Symbols",
           "https://en.wikipedia.org/wiki/Number_Forms",
           "https://en.wikipedia.org/wiki/Miscellaneous_Symbols",
           ]
phonetic = ["https://en.wikipedia.org/wiki/IPA_Extensions",
            "https://en.wikipedia.org/wiki/Spacing_Modifier_Letters",
            "https://en.wikipedia.org/wiki/Phonetic_Extensions",
            "https://en.wikipedia.org/wiki/Phonetic_Extensions_Supplement",
            "https://en.wikipedia.org/wiki/Modifier_Tone_Letters",
            "https://en.wikipedia.org/wiki/Superscripts_and_Subscripts",
            ]
enclosed = ["https://en.wikipedia.org/wiki/Enclosed_alphanumerics"]
enclosed_supplement = ["https://en.wikipedia.org/wiki/Enclosed_Alphanumeric_Supplement",
                       "https://en.wikipedia.org/wiki/Enclosed_Ideographic_Supplement",
                       ]
arrows = ["https://en.wikipedia.org/wiki/Arrows_(Unicode_block)"]
arrows_supplement = ["https://en.wikipedia.org/wiki/Supplemental_Arrows-A",
                     "https://en.wikipedia.org/wiki/Supplemental_Arrows-B",
                     "https://en.wikipedia.org/wiki/Supplemental_Arrows-C",
                     ]
dingbat = ["https://en.wikipedia.org/wiki/Dingbat"]
math = ["https://en.wikipedia.org/wiki/Mathematical_Operators",
        "https://en.wikipedia.org/wiki/Supplemental_Mathematical_Operators",
        "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-A",
        "https://en.wikipedia.org/wiki/Miscellaneous_Mathematical_Symbols-B",
        ]
tech = ["https://en.wikipedia.org/wiki/Miscellaneous_Technical"]
games = ["https://en.wikipedia.org/wiki/Mahjong_Tiles_(Unicode_block)",
         "https://en.wikipedia.org/wiki/Domino_Tiles",
         "https://en.wikipedia.org/wiki/Unicode_Playing_Card_Block",
         ]
box = ["https://en.wikipedia.org/wiki/Box_Drawing",
       "https://en.wikipedia.org/wiki/Block_Elements",
       "https://en.wikipedia.org/wiki/Geometric_Shapes",
       ]
box_supplement = ["https://en.wikipedia.org/wiki/Geometric_Shapes_Extended"]

# category name -> (list of page URLs, doc comment for the generated constant).
# The upper-cased category name becomes the name of the Rust constant, and the
# insertion order here is the order of the constants in the output.
categories = {
    "symbols": (symbols, "Some symbols"),
    "enclosed": (enclosed, "Enclosed numbers and letters"),
    "enclosed_supplement": (enclosed_supplement, "More enclosed numbers and letters"),
    "arrows": (arrows, "Arrows"),
    "arrows_supplement": (arrows_supplement, "More arrows"),
    "dingbat": (dingbat, "Dingbat symbols"),
    "math": (math, "Mathematical operators etc."),
    "games": (games, "Mahjong tiles, dominos and cards"),
    "box": (box, "Box drawing characters"),
    "box_supplement": (box_supplement, "More geometric shapes"),
    "tech": (tech, "Miscellaneous technical symbols"),
    "phonetic": (phonetic, "Representation of the sounds of spoken language")
}

# Seconds to wait on a page before giving up, so one stalled request cannot
# hang the whole run.
REQUEST_TIMEOUT = 30

# Rust test module appended to the generated file; the expected lengths are
# the sizes of the scraped Unicode blocks.
RUST_TESTS = """
/// Assert that we got all characters from Wikipedia.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn ascii() {
        // letters + numbers + symbols
        assert_eq!(ASCII.len(), 26 + 26 + 10 + 32);
        assert_eq!(ASCII_SPACE.len(), 4);
    }
    #[test]
    fn arrows() {
        assert_eq!(ARROWS.len(), 112)
    }
    #[test]
    fn arrows_supplement() {
        assert_eq!(ARROWS_SUPPLEMENT.len(), 16 + 128 + 148)
    }
    #[test]
    fn box_() {
        // box is keyword
        assert_eq!(BOX.len(), 128 + 32 + 96)
    }
    #[test]
    fn box_supplement() {
        assert_eq!(BOX_SUPPLEMENT.len(), 85)
    }
    #[test]
    fn dingbat() {
        assert_eq!(DINGBAT.len(), 48 + 12 * 16)
    }
    #[test]
    fn enclosed() {
        assert_eq!(ENCLOSED.len(), 160)
    }
    #[test]
    fn enclosed_supplement() {
        // - regional indicators
        assert_eq!(ENCLOSED_SUPPLEMENT.len(), 191 - 26 + 64)
    }
    #[test]
    fn games() {
        assert_eq!(GAMES.len(), 44 + 100 + 82)
    }
    #[test]
    fn math() {
        assert_eq!(MATH.len(), 256 + 256 + 48 + 128)
    }
    #[test]
    fn symbols() {
        // - spaces
        assert_eq!(SYMBOLS.len(), 32 + 111 - 41 + 80 + 60 + 256)
    }
    #[test]
    fn tech() {
        assert_eq!(TECH.len(), 256)
    }
    #[test]
    fn phonetic() {
        assert_eq!(PHONETIC.len(), 96 + 80 + 128 + 64 + 32 + 42)
    }
}
"""


def rust_char_literal(ch):
    """Return ``ch`` as a Rust ``char`` literal, escaping ``'`` and ``\\``."""
    if ch in ("'", "\\"):
        ch = "\\" + ch
    return "'" + ch + "'"


def rust_array_items(characters):
    """Return the comma-terminated Rust ``char`` literals for ``characters``."""
    return "".join(rust_char_literal(ch) + "," for ch in characters)


def extract_characters(html):
    """Return the characters found in the ``<td>`` cells of ``html``.

    A cell contributes its text when it has exactly one child (a plain text
    node or an ``<a>`` tag), carries a ``title`` attribute, and that text is
    exactly one character long. Results are returned in document order.
    """
    found = []
    for cell in BeautifulSoup(html, 'html.parser').find_all('td'):
        # Count the children via ``contents`` (a real list) rather than poking
        # at ``children.__length_hint__()``, which only exists on some
        # iterator types and is merely a hint.
        if len(cell.contents) != 1:
            continue
        child = cell.contents[0]
        # Exact type checks on purpose: NavigableString subclasses such as
        # comments must not be treated as cell text.
        if type(child) is bs4.element.NavigableString:
            text = cell.get_text()
        elif type(child) is bs4.element.Tag and child.name == "a":
            text = child.get_text()
        else:
            continue
        if len(text) == 1 and cell.has_attr("title"):  # and text.isprintable()
            found.append(text)
    return found


# --- Scrape -----------------------------------------------------------------

# category name -> (page URLs, scraped characters, doc comment)
chars = {}
session = requests.Session()  # speed stuff up
for category, (urls, desc) in categories.items():
    valid = []
    for url in urls:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        # Fail loudly instead of silently emitting a truncated constant when
        # a page cannot be fetched.
        response.raise_for_status()
        valid.extend(extract_characters(response.text))
    chars[category] = (urls, valid, desc)

# --- Emit the Rust source ---------------------------------------------------

print("#![no_std]")
print("/// All ASCII characters except spacing")
# 0x21..=0x7E is every printable ASCII character except the space.
ascii_chars = "".join(chr(code) for code in range(0x21, 0x7F))
print("pub const ASCII: &'static [char] = &[" + rust_array_items(ascii_chars) + "];")
print("/// ASCII spacing characters")
print("pub const ASCII_SPACE: &'static [char] = &[' ', '\\t', '\\n', '\\r'];")

for category, (urls, valid, desc) in chars.items():
    print("///", desc)
    print("///")
    print("/// Scraped from these wikipedia pages:")
    print("///")
    for url in urls:
        print("/// -", url)
    print("///")
    print("pub const", category.upper(), ": &'static [char] = &[", end="")
    print(rust_array_items(valid), end="")
    print("];")

print(RUST_TESTS)