{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import jaro\n",
"import rltk.similarity as sim\n",
"import py_stringmatching.similarity_measure.jaro_winkler\n",
"import py_stringmatching.similarity_measure.jaro\n",
"# import rapidfuzz.distance\n",
"import py_stringmatching.similarity_measure.monge_elkan\n",
"from grams.core.strsim import jaro_winkler_similarity, monge_elkan_similarity, CharacterTokenizer, jaro_similarity\n",
"\n",
"tok = CharacterTokenizer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Candidate (key, query) pairs collected while comparing the similarity libraries.\n",
"# Only the pair selected on the last line is bound to `key` / `query`, which the\n",
"# cells below read; change the index to exercise a different pair.\n",
"CANDIDATE_PAIRS = [\n",
"    (\"United Kingdom\", \"Sengenia (United Kingdom)\"),\n",
"    (\"United Kingdom\", \"Embecosm (United Kingdom)\"),\n",
"    (\"United Kingdom\", \"Landcatch Natural Selection\"),\n",
"    (\"Dance of the Dwarfs\", \"Dance of Death: The Abbot\"),\n",
"    (\"United Kingdom\", \"UK\"),\n",
"    (\"United\", \"UK\"),\n",
"    (\"United Kingdom\", \"distrito electoral de la Cámara de los Comunes\"),\n",
"]\n",
"key, query = CANDIDATE_PAIRS[-1]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Jaro-Winkler similarity"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'key' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/rook/workspace/sm-dev/grams/tests/rust/strsim/strsim_check.ipynb Cell 4\u001b[0m in \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m \u001b[39mprint\u001b[39m(sim\u001b[39m.\u001b[39mjaro_winkler_similarity(key, query, threshold\u001b[39m=\u001b[39m\u001b[39m0.7\u001b[39m, scaling_factor\u001b[39m=\u001b[39m\u001b[39m0.1\u001b[39m, prefix_len\u001b[39m=\u001b[39m\u001b[39m4\u001b[39m))\n\u001b[1;32m 2\u001b[0m jw \u001b[39m=\u001b[39m py_stringmatching\u001b[39m.\u001b[39msimilarity_measure\u001b[39m.\u001b[39mjaro_winkler\u001b[39m.\u001b[39mJaroWinkler(); \u001b[39mprint\u001b[39m(jw\u001b[39m.\u001b[39mget_raw_score(key, query))\n\u001b[1;32m 3\u001b[0m \u001b[39mprint\u001b[39m(jaro\u001b[39m.\u001b[39mjaro_winkler_metric(key, query))\n",
"\u001b[0;31mNameError\u001b[0m: name 'key' is not defined"
]
}
],
"source": [
"# Score the current (key, query) pair with each Jaro-Winkler implementation,\n",
"# printing one result per library so the values can be compared line by line.\n",
"print(\n",
"    sim.jaro_winkler_similarity(\n",
"        key, query, threshold=0.7, scaling_factor=0.1, prefix_len=4\n",
"    )\n",
")\n",
"\n",
"# `jw` is deliberately a notebook-level name: later cells reuse this scorer.\n",
"jw = py_stringmatching.similarity_measure.jaro_winkler.JaroWinkler()\n",
"print(jw.get_raw_score(key, query))\n",
"\n",
"print(jaro.jaro_winkler_metric(key, query))\n",
"\n",
"# Disabled comparisons, kept for reference:\n",
"# print(jaro_winkler_similarity(tok.tokenize(key), tok.tokenize(query), threshold=0.7, scaling_factor=0.1, prefix_len=4))\n",
"# print(rapidfuzz.distance.JaroWinkler.similarity(key, query, score_cutoff=0.7, prefix_weight=0.1))"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Jaro similarity"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5555555555555555\n",
"0.5555555820465088\n",
"0.5555555555555555\n",
"0.5555555555555555\n"
]
}
],
"source": [
"# Plain Jaro score of the current (key, query) pair, one line per implementation.\n",
"\n",
"# rltk\n",
"print(sim.jaro_distance(key, query))\n",
"\n",
"# py_stringmatching\n",
"print(\n",
"    py_stringmatching.similarity_measure.jaro.Jaro().get_raw_score(key, query)\n",
")\n",
"\n",
"# jaro package\n",
"print(jaro.jaro_metric(key, query))\n",
"\n",
"# grams.core.strsim, which takes tokenized input rather than raw strings\n",
"print(\n",
"    jaro_similarity(tok.tokenize(key), tok.tokenize(query))\n",
")\n",
"\n",
"# Disabled comparison, kept for reference:\n",
"# print(rapidfuzz.distance.Jaro.similarity(key, query))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.5198412835597992\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"thread '' panicked at 'index out of bounds: the len is 0 but the index is 0', src/strsim/jaro.rs:31:20\n"
]
},
{
"ename": "PanicException",
"evalue": "index out of bounds: the len is 0 but the index is 0",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mPanicException\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/rook/workspace/sm-dev/grams/tests/rust/strsim/strsim_check.ipynb Cell 7\u001b[0m in \u001b[0;36m3\n\u001b[1;32m 1\u001b[0m oursim \u001b[39m=\u001b[39m \u001b[39mlambda\u001b[39;00m x, y: jaro_winkler_similarity(tok\u001b[39m.\u001b[39mtokenize(x), tok\u001b[39m.\u001b[39mtokenize(y))\n\u001b[1;32m 2\u001b[0m \u001b[39mprint\u001b[39m(sim\u001b[39m.\u001b[39mmonge_elkan_similarity(key\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), query\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), function\u001b[39m=\u001b[39mjw\u001b[39m.\u001b[39mget_sim_score))\n\u001b[0;32m----> 3\u001b[0m \u001b[39mprint\u001b[39m(sim\u001b[39m.\u001b[39;49mmonge_elkan_similarity(key\u001b[39m.\u001b[39;49msplit(\u001b[39m\"\u001b[39;49m\u001b[39m \u001b[39;49m\u001b[39m\"\u001b[39;49m), query\u001b[39m.\u001b[39;49msplit(\u001b[39m\"\u001b[39;49m\u001b[39m \u001b[39;49m\u001b[39m\"\u001b[39;49m), function\u001b[39m=\u001b[39;49moursim))\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(py_stringmatching\u001b[39m.\u001b[39msimilarity_measure\u001b[39m.\u001b[39mmonge_elkan\u001b[39m.\u001b[39mMongeElkan(sim_func\u001b[39m=\u001b[39moursim)\u001b[39m.\u001b[39mget_raw_score(key\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), query\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m)))\n",
"File \u001b[0;32m~/workspace/sm-dev/.venv/lib/python3.9/site-packages/rltk/similarity/hybrid.py:106\u001b[0m, in \u001b[0;36mmonge_elkan_similarity\u001b[0;34m(bag1, bag2, function, parameters, lower_bound)\u001b[0m\n\u001b[1;32m 104\u001b[0m max_score \u001b[39m=\u001b[39m utils\u001b[39m.\u001b[39mMIN_FLOAT\n\u001b[1;32m 105\u001b[0m \u001b[39mfor\u001b[39;00m ele2 \u001b[39min\u001b[39;00m bag2:\n\u001b[0;32m--> 106\u001b[0m max_score \u001b[39m=\u001b[39m \u001b[39mmax\u001b[39m(max_score, function(ele1, ele2, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mparameters))\n\u001b[1;32m 107\u001b[0m score_sum \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m max_score\n\u001b[1;32m 109\u001b[0m \u001b[39m# if it satisfies early exit condition\u001b[39;00m\n",
"\u001b[1;32m/Users/rook/workspace/sm-dev/grams/tests/rust/strsim/strsim_check.ipynb Cell 7\u001b[0m in \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m oursim \u001b[39m=\u001b[39m \u001b[39mlambda\u001b[39;00m x, y: jaro_winkler_similarity(tok\u001b[39m.\u001b[39;49mtokenize(x), tok\u001b[39m.\u001b[39;49mtokenize(y))\n\u001b[1;32m 2\u001b[0m \u001b[39mprint\u001b[39m(sim\u001b[39m.\u001b[39mmonge_elkan_similarity(key\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), query\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), function\u001b[39m=\u001b[39mjw\u001b[39m.\u001b[39mget_sim_score))\n\u001b[1;32m 3\u001b[0m \u001b[39mprint\u001b[39m(sim\u001b[39m.\u001b[39mmonge_elkan_similarity(key\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), query\u001b[39m.\u001b[39msplit(\u001b[39m\"\u001b[39m\u001b[39m \u001b[39m\u001b[39m\"\u001b[39m), function\u001b[39m=\u001b[39moursim))\n",
"\u001b[0;31mPanicException\u001b[0m: index out of bounds: the len is 0 but the index is 0"
]
}
],
"source": [
"def oursim(x, y):\n",
"    \"\"\"Jaro-Winkler score of two strings via grams' `jaro_winkler_similarity`.\n",
"\n",
"    Both strings are tokenized with the notebook-level `tok` before scoring.\n",
"    Returns 0.0 when either string is empty instead of calling into the Rust\n",
"    code: an earlier run of this cell panicked there with an index-out-of-bounds\n",
"    error on a zero-length input, presumably an empty token (TODO confirm).\n",
"    \"\"\"\n",
"    if not x or not y:\n",
"        return 0.0\n",
"    return jaro_winkler_similarity(tok.tokenize(x), tok.tokenize(y))\n",
"\n",
"\n",
"# `str.split()` with no argument never yields empty tokens, unlike `split(\" \")`\n",
"# on text with leading, trailing, or repeated spaces.\n",
"key_tokens = key.split()\n",
"query_tokens = query.split()\n",
"\n",
"# Monge-Elkan over the word tokens, first with py_stringmatching's Jaro-Winkler\n",
"# (`jw`, created in the Jaro-Winkler cell), then with `oursim` in both libraries.\n",
"print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=jw.get_sim_score))\n",
"print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=oursim))\n",
"print(py_stringmatching.similarity_measure.monge_elkan.MongeElkan(sim_func=oursim).get_raw_score(key_tokens, query_tokens))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"United UK 0.5555555555555555 0.6000000238418579\n",
"Kingdom UK 0.5476190476190476 0.5476190447807312\n"
]
}
],
"source": [
"# Compare grams' Jaro-Winkler with py_stringmatching's on every\n",
"# (key word, query word) combination; one output line per combination.\n",
"for key_word in key.split(\" \"):\n",
"    for query_word in query.split(\" \"):\n",
"        grams_score = jaro_winkler_similarity(\n",
"            tok.tokenize(key_word), tok.tokenize(query_word)\n",
"        )\n",
"        reference_score = jw.get_raw_score(key_word, query_word)\n",
"        print(key_word, query_word, grams_score, reference_score)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}