{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# String-similarity cross-check\n",
    "\n",
    "Prints Jaro, Jaro-Winkler and Monge-Elkan scores from `rltk`, `py_stringmatching` and `jaro` next to the scores from `grams.core.strsim` for one `(key, query)` pair, so the implementations can be compared by eye.\n",
    "\n",
    "Run with **Restart Kernel → Run All**: every cell below depends on the imports and on the `key` / `query` cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jaro\n",
    "import rltk.similarity as sim\n",
    "import py_stringmatching.similarity_measure.jaro_winkler\n",
    "import py_stringmatching.similarity_measure.jaro\n",
    "# import rapidfuzz.distance\n",
    "import py_stringmatching.similarity_measure.monge_elkan\n",
    "from grams.core.strsim import jaro_winkler_similarity, monge_elkan_similarity, CharacterTokenizer, jaro_similarity\n",
    "\n",
    "# Tokenizer used to turn a string into the token sequence the grams functions are called with.\n",
    "tok = CharacterTokenizer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inputs\n",
    "\n",
    "The last entry of `CANDIDATE_PAIRS` is the pair under test; reorder or (un)comment entries to switch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# (key, query) pairs tried so far; only the last one is active.\n",
    "CANDIDATE_PAIRS = [\n",
    "    (\"United Kingdom\", \"Sengenia (United Kingdom)\"),\n",
    "    (\"United Kingdom\", \"Embecosm (United Kingdom)\"),\n",
    "    (\"United Kingdom\", \"Landcatch Natural Selection\"),\n",
    "    (\"Dance of the Dwarfs\", \"Dance of Death: The Abbot\"),\n",
    "    (\"United Kingdom\", \"UK\"),\n",
    "    # (\"United\", \"UK\"),\n",
    "    (\"United Kingdom\", \"distrito electoral de la Cámara de los Comunes\"),\n",
    "]\n",
    "key, query = CANDIDATE_PAIRS[-1]\n",
    "\n",
    "# Words obtained by splitting on single spaces; shared by the Monge-Elkan and per-word cells.\n",
    "key_tokens = key.split(\" \")\n",
    "query_tokens = query.split(\" \")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Jaro-Winkler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `jw` is reused by the Monge-Elkan and per-word cells further down.\n",
    "jw = py_stringmatching.similarity_measure.jaro_winkler.JaroWinkler()\n",
    "\n",
    "print(sim.jaro_winkler_similarity(key, query, threshold=0.7, scaling_factor=0.1, prefix_len=4))  # rltk\n",
    "print(jw.get_raw_score(key, query))  # py_stringmatching\n",
    "print(jaro.jaro_winkler_metric(key, query))  # jaro\n",
    "# print(jaro_winkler_similarity(tok.tokenize(key), tok.tokenize(query), threshold=0.7, scaling_factor=0.1, prefix_len=4))\n",
    "# print(rapidfuzz.distance.JaroWinkler.similarity(key, query, score_cutoff=0.7, prefix_weight=0.1))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Jaro"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(sim.jaro_distance(key, query))  # rltk\n",
    "print(py_stringmatching.similarity_measure.jaro.Jaro().get_raw_score(key, query))  # py_stringmatching\n",
    "print(jaro.jaro_metric(key, query))  # jaro\n",
    "print(jaro_similarity(tok.tokenize(key), tok.tokenize(query)))  # grams\n",
    "# print(rapidfuzz.distance.Jaro.similarity(key, query))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Monge-Elkan\n",
    "\n",
    "Monge-Elkan from `rltk` and `py_stringmatching`, using either the `py_stringmatching` Jaro-Winkler (`jw.get_sim_score`) or the `grams` Jaro-Winkler (`oursim`) as the inner similarity.\n",
    "\n",
    "**FIXME:** a previously saved run of the second `print` below (`rltk` Monge-Elkan with `oursim`) raised `PanicException: index out of bounds: the len is 0 but the index is 0` at `src/strsim/jaro.rs:31:20`. The `(key, query)` pair that triggered it was not recorded in the notebook; reproduce it before relying on these scores."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def oursim(x, y):\n",
    "    \"\"\"Return the grams Jaro-Winkler score of `x` and `y`, each tokenized with `tok`.\"\"\"\n",
    "    return jaro_winkler_similarity(tok.tokenize(x), tok.tokenize(y))\n",
    "\n",
    "\n",
    "print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=jw.get_sim_score))\n",
    "print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=oursim))\n",
    "print(py_stringmatching.similarity_measure.monge_elkan.MongeElkan(sim_func=oursim).get_raw_score(key_tokens, query_tokens))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Per-word Jaro-Winkler\n",
    "\n",
    "For every (key word, query word) combination: the `grams` score followed by the `py_stringmatching` score."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for x in key_tokens:\n",
    "    for y in query_tokens:\n",
    "        # print(x, y, jaro.jaro_winkler_metric(x, y), jw.get_raw_score(x, y))\n",
    "        # print(x, y, jaro.jaro_metric(x, y), py_stringmatching.similarity_measure.jaro.Jaro().get_raw_score(x, y))\n",
    "        print(x, y, jaro_winkler_similarity(tok.tokenize(x), tok.tokenize(y)), jw.get_raw_score(x, y))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}