"""Benchmark keyword lookup speed across several string-matching libraries.

For an increasing number of keywords the script times BlitzText, a compiled
``re`` alternation, FlashText, ``str.find``, spaCy's ``PhraseMatcher`` and
StringZilla, once on a single text and once on a batch of texts, then writes
two plots and a markdown report.
"""

import multiprocessing
import random
import re
import string
import time

import matplotlib.pyplot as plt
import spacy
import stringzilla as sz
from blitztext import KeywordProcessor as BlitzTextProcessor
from flashtext import KeywordProcessor
from spacy.matcher import PhraseMatcher

# Constants for benchmark configuration
WORDS_PER_TEXT = 5000
NUM_TEXTS_PARALLEL = 1000
MAX_KEYWORDS = 20001  # exclusive upper bound of range(); largest count is MAX_KEYWORDS - 1
STEP_SIZE = 1000

# Size of the random vocabulary and the word lengths it is drawn from.
_VOCABULARY_SIZE = 100000
_WORD_LENGTHS = (3, 4, 5, 6, 7, 8)

# Result-dict keys of the timed methods, in report/plot column order.
_METHODS = ("flashtext", "regex", "blitztext", "python_find", "spacy", "stringzilla")

# Pattern that can never match; used when there are no keywords, because an
# empty alternation ('') would match the empty string at every position.
_NEVER_MATCH = r"(?!)"

# Load the spaCy model with the pipeline components this benchmark does not
# need disabled.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer"])


def get_word_of_length(str_length):
    """Return a random string of ``str_length`` lowercase ASCII letters."""
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(str_length))


def find_keywords(text, keywords):
    """Return ``text.find(kw)`` for every keyword (``-1`` where absent)."""
    return [text.find(kw) for kw in keywords]


def spacy_match(doc, matcher):
    """Apply a spaCy matcher to ``doc`` and return its matches."""
    return matcher(doc)


def stringzilla_match(text, keywords):
    """Return ``sz.find(text, kw)`` for every keyword."""
    return [sz.find(text, kw) for kw in keywords]


def _sample_unique_keywords(all_words, keywords_length):
    """Sample ``keywords_length`` words and drop duplicates.

    ``all_words`` may contain repeated words, so the returned list can be
    shorter than ``keywords_length``.
    """
    return list(set(random.sample(all_words, keywords_length)))


def _compile_keyword_regex(keywords):
    """Compile a word-bounded alternation over ``keywords``.

    Keywords are escaped so regex metacharacters are matched literally. With
    no keywords a never-matching pattern is returned instead of ``''``.
    """
    if not keywords:
        return re.compile(_NEVER_MATCH)
    return re.compile('|'.join(r'\b' + re.escape(keyword) + r'\b' for keyword in keywords))


def _build_matchers(keywords):
    """Build every matcher under test for ``keywords``.

    Returns:
        Tuple ``(compiled_re, flashtext_processor, blitztext_processor,
        spacy_matcher)``. Construction happens outside the timed regions.
    """
    compiled_re = _compile_keyword_regex(keywords)

    flashtext_processor = KeywordProcessor()
    flashtext_processor.add_keywords_from_list(keywords)

    blitztext_processor = BlitzTextProcessor()
    for keyword in keywords:
        blitztext_processor.add_keyword(keyword, None)

    # Prepare Spacy PhraseMatcher
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    patterns = [nlp.make_doc(text) for text in keywords]
    matcher.add("KeywordList", patterns)

    return compiled_re, flashtext_processor, blitztext_processor, matcher


def benchmark_keyword_extraction(keywords_length, all_words):
    """Time each method on one random text.

    Args:
        keywords_length: Number of words sampled as keywords (before
            de-duplication).
        all_words: Vocabulary to sample the text and the keywords from.

    Returns:
        Dict with ``"keywords"`` (the requested count) and the elapsed seconds
        for each of ``blitztext``, ``regex``, ``flashtext``, ``python_find``,
        ``spacy`` and ``stringzilla``.
    """
    sample_text_words = random.sample(all_words, WORDS_PER_TEXT)
    sample_text = ' '.join(sample_text_words)
    unique_keywords_sublist = _sample_unique_keywords(all_words, keywords_length)

    compiled_re, flashtext_processor, blitztext_processor, matcher = _build_matchers(
        unique_keywords_sublist
    )

    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    start = time.perf_counter()
    _ = blitztext_processor.extract_keywords(sample_text, None)
    mid1 = time.perf_counter()
    _ = compiled_re.findall(sample_text)
    mid2 = time.perf_counter()
    _ = flashtext_processor.extract_keywords(sample_text)
    mid3 = time.perf_counter()
    _ = find_keywords(sample_text, unique_keywords_sublist)
    mid4 = time.perf_counter()
    # The spaCy timing includes running the pipeline on the text.
    doc = nlp(sample_text)
    _ = spacy_match(doc, matcher)
    mid5 = time.perf_counter()
    _ = stringzilla_match(sample_text, unique_keywords_sublist)
    end = time.perf_counter()

    return {
        "keywords": keywords_length,
        "blitztext": mid1 - start,
        "regex": mid2 - mid1,
        "flashtext": mid3 - mid2,
        "python_find": mid4 - mid3,
        "spacy": mid5 - mid4,
        "stringzilla": end - mid5,
    }


def benchmark_keyword_extraction_parallel(keywords_length, all_words):
    """Time each method on ``NUM_TEXTS_PARALLEL`` random texts.

    FlashText, regex, ``str.find`` and StringZilla are fanned out over a
    ``multiprocessing.Pool`` (pool start-up is part of the measured time),
    BlitzText uses its own ``parallel_extract_keywords_from_texts`` and spaCy
    uses ``nlp.pipe``.

    Args:
        keywords_length: Number of words sampled as keywords (before
            de-duplication).
        all_words: Vocabulary to sample the texts and the keywords from.

    Returns:
        Dict with ``"keywords"`` (the requested count) and the elapsed seconds
        for each of ``flashtext``, ``regex``, ``blitztext``, ``python_find``,
        ``spacy`` and ``stringzilla``.
    """
    sample_texts = [
        ' '.join(random.sample(all_words, WORDS_PER_TEXT)) for _ in range(NUM_TEXTS_PARALLEL)
    ]
    unique_keywords_sublist = _sample_unique_keywords(all_words, keywords_length)

    compiled_re, flashtext_processor, blitztext_processor, matcher = _build_matchers(
        unique_keywords_sublist
    )

    start = time.perf_counter()
    with multiprocessing.Pool() as pool:
        _ = pool.map(flashtext_processor.extract_keywords, sample_texts)
    mid1 = time.perf_counter()
    with multiprocessing.Pool() as pool:
        _ = pool.map(compiled_re.findall, sample_texts)
    mid2 = time.perf_counter()
    _ = blitztext_processor.parallel_extract_keywords_from_texts(sample_texts, None)
    mid3 = time.perf_counter()
    with multiprocessing.Pool() as pool:
        _ = pool.starmap(find_keywords, [(text, unique_keywords_sublist) for text in sample_texts])
    mid4 = time.perf_counter()
    docs = list(nlp.pipe(sample_texts))
    _ = [spacy_match(doc, matcher) for doc in docs]
    mid5 = time.perf_counter()
    with multiprocessing.Pool() as pool:
        _ = pool.starmap(
            stringzilla_match, [(text, unique_keywords_sublist) for text in sample_texts]
        )
    end = time.perf_counter()

    return {
        "keywords": keywords_length,
        "flashtext": mid1 - start,
        "regex": mid2 - mid1,
        "blitztext": mid3 - mid2,
        "python_find": mid4 - mid3,
        "spacy": mid5 - mid4,
        "stringzilla": end - mid5,
    }


def _run_benchmark(benchmark_func):
    """Generate a random vocabulary and run ``benchmark_func`` per keyword count.

    Keyword counts go from 0 up to (but excluding) ``MAX_KEYWORDS`` in steps
    of ``STEP_SIZE``. Returns the list of result dicts in that order.
    """
    all_words = [
        get_word_of_length(random.choice(_WORD_LENGTHS)) for _ in range(_VOCABULARY_SIZE)
    ]
    return [
        benchmark_func(keywords_length, all_words)
        for keywords_length in range(0, MAX_KEYWORDS, STEP_SIZE)
    ]


def run_benchmark_single_threaded(benchmark_func):
    """Run ``benchmark_func`` over every keyword count; return the results."""
    return _run_benchmark(benchmark_func)


def run_benchmark_multi_threaded(benchmark_func):
    """Run ``benchmark_func`` over every keyword count; return the results.

    The sweep itself is identical to ``run_benchmark_single_threaded``; any
    parallelism lives inside ``benchmark_func``.
    """
    return _run_benchmark(benchmark_func)


def plot_results(results, filename, title, num_texts=1):
    """Plot elapsed time per method against keyword count and save it.

    Args:
        results: List of result dicts as returned by the benchmark functions.
        filename: Output image path.
        title: Plot title.
        num_texts: Number of texts processed per measurement, shown in the
            details box.
    """
    fig = plt.figure(figsize=(12, 6))

    keyword_counts = [r["keywords"] for r in results]
    for method in _METHODS:
        plt.plot(keyword_counts, [r[method] for r in results],
                 label=method.capitalize(), marker='o')

    plt.xlabel('Number of Keywords')
    plt.ylabel('Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.7)

    # Add benchmark details to the plot. MAX_KEYWORDS is an exclusive range
    # bound, so the largest benchmarked count is MAX_KEYWORDS - 1 (the same
    # figure the markdown report prints).
    details = "\n".join([
        "Benchmark Details:",
        f"Words per text: {WORDS_PER_TEXT}",
        f"Number of texts processed: {num_texts}",
        f"Max keywords: {MAX_KEYWORDS - 1}",
        f"Step size: {STEP_SIZE}",
    ])
    plt.text(0.05, 0.95, details, transform=plt.gca().transAxes, fontsize=8,
             verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    # Release the figure; pyplot otherwise keeps it alive between calls.
    plt.close(fig)
    print(f"Plot saved as {filename}")


def _write_results_table(f, results):
    """Write the markdown table header and one row per result dict to ``f``."""
    f.write("| Keywords | FlashText | Regex | BlitzText | Python find() | Spacy | StringZilla |\n")
    f.write("|----------|-----------|-------|-----------|---------------|-------|-------------|\n")
    for result in results:
        timings = " | ".join(f"{result[method]:.5f}" for method in _METHODS)
        f.write(f"| {result['keywords']} | {timings} |\n")


def write_results_to_markdown(single_extraction, multi_extraction, filename='benchmark_results.md'):
    """Write the benchmark configuration and both result tables as markdown.

    Args:
        single_extraction: Result dicts of the single-text benchmark.
        multi_extraction: Result dicts of the multi-text benchmark.
        filename: Output markdown path (overwritten).
    """
    # Explicit encoding so the report does not depend on the platform default.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write("# Benchmark Results\n\n")
        f.write("## Benchmark Configuration\n\n")
        f.write(f"- Words per text: {WORDS_PER_TEXT}\n")
        f.write(f"- Number of texts for parallel: {NUM_TEXTS_PARALLEL}\n")
        f.write(f"- Maximum number of keywords: {MAX_KEYWORDS - 1}\n")
        f.write(f"- Step size for keyword count: {STEP_SIZE}\n\n")

        f.write("## Keyword Extraction (Single-threaded)\n\n")
        _write_results_table(f, single_extraction)

        f.write("\n## Keyword Extraction (Multi-threaded)\n\n")
        _write_results_table(f, multi_extraction)

    print(f"Results written to {filename}")


def main():
    """Run both benchmarks, then write the plots and the markdown report."""
    print("Running single-threaded extraction benchmark...")
    single_extraction = run_benchmark_single_threaded(benchmark_keyword_extraction)

    print("Running multi-threaded extraction benchmark...")
    multi_extraction = run_benchmark_multi_threaded(benchmark_keyword_extraction_parallel)

    plot_results(single_extraction, 'benchmark_results_single.png',
                 'Single-threaded Performance Comparison')
    plot_results(multi_extraction, 'benchmark_results_multi.png',
                 'Multi-threaded Performance Comparison', NUM_TEXTS_PARALLEL)

    write_results_to_markdown(single_extraction, multi_extraction)


if __name__ == '__main__':
    main()