{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import jaro\n",
    "import rltk.similarity as sim\n",
    "import py_stringmatching.similarity_measure.jaro_winkler\n",
    "import py_stringmatching.similarity_measure.jaro\n",
    "# import rapidfuzz.distance\n",
    "import py_stringmatching.similarity_measure.monge_elkan\n",
    "from grams.core.strsim import jaro_winkler_similarity, monge_elkan_similarity, CharacterTokenizer, jaro_similarity\n",
    "\n",
    "tok = CharacterTokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate (key, query) pairs for comparing the string-similarity implementations.\n",
    "# Exactly one pair is bound to `key` / `query`; change the index below to switch inputs.\n",
    "EXAMPLE_PAIRS = [\n",
    "    (\"United Kingdom\", \"Sengenia (United Kingdom)\"),\n",
    "    (\"United Kingdom\", \"Embecosm (United Kingdom)\"),\n",
    "    (\"United Kingdom\", \"Landcatch Natural Selection\"),\n",
    "    (\"Dance of the Dwarfs\", \"Dance of Death:  The Abbot\"),\n",
    "    (\"United Kingdom\", \"UK\"),\n",
    "    (\"United\", \"UK\"),\n",
    "    # Note the double space: splitting this query on a single \" \" yields an empty token.\n",
    "    (\"United Kingdom\", \"distrito electoral  de la Cámara de los Comunes\"),\n",
    "]\n",
    "\n",
    "# Active pair: the last entry.\n",
    "key, query = EXAMPLE_PAIRS[-1]"
   ]
  },
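  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Jaro-Winkler similarity\n",
    "\n",
    "Score `key` against `query` with the Jaro-Winkler implementations from rltk, py_stringmatching and jaro."
   ]
  },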
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Jaro-Winkler score of (key, query) from each reference implementation, one per line.\n",
    "print(sim.jaro_winkler_similarity(key, query, threshold=0.7, scaling_factor=0.1, prefix_len=4))\n",
    "\n",
    "# `jw` is kept at notebook scope because later cells call jw.get_sim_score / jw.get_raw_score.\n",
    "jw = py_stringmatching.similarity_measure.jaro_winkler.JaroWinkler()\n",
    "print(jw.get_raw_score(key, query))\n",
    "\n",
    "print(jaro.jaro_winkler_metric(key, query))\n",
    "\n",
    "# Disabled comparisons, kept for reference:\n",
    "# print(jaro_winkler_similarity(tok.tokenize(key), tok.tokenize(query), threshold=0.7, scaling_factor=0.1, prefix_len=4))\n",
    "# print(rapidfuzz.distance.JaroWinkler.similarity(key, query, score_cutoff=0.7, prefix_weight=0.1))"
   ]
  },
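  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Jaro similarity\n",
    "\n",
    "Score `key` against `query` with the Jaro implementations from rltk, py_stringmatching, jaro and grams."
   ]
  },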
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5555555555555555\n",
      "0.5555555820465088\n",
      "0.5555555555555555\n",
      "0.5555555555555555\n"
     ]
    }
   ],
   "source": [
    "# Jaro score of (key, query), one line per implementation.\n",
    "\n",
    "# rltk\n",
    "print(sim.jaro_distance(key, query))\n",
    "\n",
    "# py_stringmatching\n",
    "pysm_jaro = py_stringmatching.similarity_measure.jaro.Jaro()\n",
    "print(pysm_jaro.get_raw_score(key, query))\n",
    "\n",
    "# jaro\n",
    "print(jaro.jaro_metric(key, query))\n",
    "\n",
    "# grams: scores the token sequences produced by `tok` rather than the raw strings\n",
    "key_chars = tok.tokenize(key)\n",
    "query_chars = tok.tokenize(query)\n",
    "print(jaro_similarity(key_chars, query_chars))\n",
    "\n",
    "# print(rapidfuzz.distance.Jaro.similarity(key, query))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def oursim(x, y):\n",
    "    \"\"\"Score two strings with the grams Jaro-Winkler implementation.\n",
    "\n",
    "    Each string is tokenized with `tok` (a CharacterTokenizer) before scoring.\n",
    "    \"\"\"\n",
    "    return jaro_winkler_similarity(tok.tokenize(x), tok.tokenize(y))\n",
    "\n",
    "\n",
    "# str.split() with no argument drops the empty tokens that split(\" \") produces for\n",
    "# consecutive spaces. An empty token previously made the grams Jaro code panic with\n",
    "# \"index out of bounds: the len is 0 but the index is 0\".\n",
    "key_tokens = key.split()\n",
    "query_tokens = query.split()\n",
    "\n",
    "# Monge-Elkan over word tokens: rltk with py_stringmatching's Jaro-Winkler as the inner\n",
    "# measure, rltk with the grams measure, then py_stringmatching with the grams measure.\n",
    "print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=jw.get_sim_score))\n",
    "print(sim.monge_elkan_similarity(key_tokens, query_tokens, function=oursim))\n",
    "print(py_stringmatching.similarity_measure.monge_elkan.MongeElkan(sim_func=oursim).get_raw_score(key_tokens, query_tokens))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "United UK 0.5555555555555555 0.6000000238418579\n",
      "Kingdom UK 0.5476190476190476 0.5476190447807312\n"
     ]
    }
   ],
   "source": [
    "# Print, for every (key token, query token) pair, the grams Jaro-Winkler score next to\n",
    "# the py_stringmatching one.\n",
    "# split() without an argument skips the empty tokens that split(\" \") yields for\n",
    "# consecutive spaces, so no empty string is passed to the similarity functions.\n",
    "for x in key.split():\n",
    "    for y in query.split():\n",
    "        # print(x, y, jaro.jaro_winkler_metric(x, y), jw.get_raw_score(x, y))\n",
    "        # print(x, y, jaro.jaro_metric(x, y), py_stringmatching.similarity_measure.jaro.Jaro().get_raw_score(x, y))\n",
    "        ours = jaro_winkler_similarity(tok.tokenize(x), tok.tokenize(y))\n",
    "        print(x, y, ours, jw.get_raw_score(x, y))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}