# coding=utf-8 # Utility script to parse the profanity word list and output several Rust files containing the word list # in different formats. # Usage: python parse_word_list.py import csv import sys total_lengths = set() def load_csv(filename): """ Open a CSV file and return a tuple of the header row and the data rows. """ with open(filename, 'r') as f: reader = csv.reader(f) header = next(reader) data = [row for row in reader] return header, data # csv header # text,canonical_form_1,canonical_form_2,canonical_form_3,category_1,category_2,category_3,severity_rating,severity_description def group_by_severity(data): """ Group the data by severity. """ groups = {} for row in data: severity = row[8] if severity not in groups: groups[severity] = [] severity_group = groups[severity] severity_group.append(row[0]) for word in row[1:4]: if word == "": continue word = word.lower() if word not in severity_group: severity_group.append(word) if "" in groups: del groups[""] return groups def group_by_category(data): """ Group the data by category. Note: a single word can have multiple categories. """ groups = {} for row in data: for category in row[4:7]: if category not in groups: groups[category] = [] category_group = groups[category] category_group.append(row[0]) for word in row[1:4]: if word == "": continue word = word.lower() if word not in category_group: category_group.append(word) if "" in groups: del groups[""] return groups def group_by_word(data): """ Group the data by the canonical form of the word. As with categories, a single word can have multiple canonical forms. """ groups = {} for row in data: for word in row[1:4]: if word not in groups: groups[word] = [] word_group = groups[word] word_group.append(row) if "" in groups: del groups[""] return groups def rustify_string(s): """ Convert a string to a valid Rust identifier. This converts the string to uppercase, and removes non-alphanumeric characters (except for spaces). Then trims any duplicated spaces, and finally replaces spaces with underscores. """ return '_'.join([x for x in s.upper().replace("/", "").replace("-", "_").replace(' ', '_').split('_') if x != '']) def write_rust_file(filename, kind, data): """ Write a Rust file containing the data. Data is a list of tuples, where the first element is the variable name and the second is the actual word list. """ with open(filename, 'w') as f: f.write('//! {} list generated by parse_word_list.py\n'.format(kind)) for name, words in data: total_lengths.add(len(words)) f.write(f"pub const {rustify_string(name)}: [&str; {len(words)}] = [\n") for word in words: f.write(f' "{word}",\n') f.write("];\n") def main(): header, data = load_csv(sys.argv[1]) output_dir = sys.argv[2] severity_group = group_by_severity(data) write_rust_file(f'{output_dir}/severity.rs', 'severity', [(severity, rows) for severity, rows in severity_group.items()]) category_groups = group_by_category(data) write_rust_file(f'{output_dir}/category.rs', 'category', [(category, rows) for category, rows in category_groups.items()]) word_groups = group_by_word(data) write_rust_file(f'{output_dir}/word.rs', 'word', [(word, [row[0] for row in rows]) for word, rows in word_groups.items()]) print(f"Total lengths: {sorted(total_lengths)}") if __name__ == '__main__': main()