"""Print the URL host and a word count for every row of a gzipped JSON file."""

import argparse
from urllib.parse import urlparse

from uniseg.wordbreak import words

from wimbd.utils.utils import read_json_gz_file


def _count_words(text):
    """Return the number of segments ``words`` yields for *text*, skipping ' '.

    Only tokens exactly equal to a single space are excluded; every other
    segment produced by ``uniseg.wordbreak.words`` is counted.
    NOTE(review): that presumably includes punctuation and other whitespace
    such as newlines -- confirm this is the intended definition of "word".
    """
    # Stream the tokens instead of building intermediate lists just to
    # take their length.
    return sum(1 for token in words(text) if token != ' ')


def main():
    """Parse ``--in_file`` and print ``<netloc> <word_count>`` for each row.

    Each row returned by ``read_json_gz_file`` is indexed with the keys
    ``'text'`` and ``'url'``; one line per row is written to standard output.
    """
    # Leave ``prog`` unset so argparse derives it from ``sys.argv[0]``;
    # passing "" positionally would blank the program name in usage output.
    parser = argparse.ArgumentParser(
        description="Print the URL host and word count of every row in a "
                    "gzipped JSON file."
    )
    # Required: without it ``None`` would be handed to ``read_json_gz_file``
    # instead of producing a clear usage error.
    parser.add_argument("--in_file", type=str, required=True)
    args = parser.parse_args()

    rows = read_json_gz_file(args.in_file)

    for row in rows:
        # NOTE(review): an earlier variant read the URL from
        # row['metadata']['url']; presumably some inputs nest it there --
        # verify against the data being processed.
        host = urlparse(row['url']).netloc
        print(host, _count_words(row['text']))


if __name__ == "__main__":
    main()