{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install dpss\n", "import dpss\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "a = pd.DataFrame({'ID': {0: 'pXwTG',\n", " 1: 'puvjO',\n", " 2: '',\n", " 3: 'TvBHC',\n", " 4: 'TBb0g',\n", " 5: '',\n", " 6: 'cQ8pO',\n", " 7: 'dHji0',\n", " 8: 'vS0Er',\n", " 9: 'a1'},\n", " 'value': {0: 1000,\n", " 1: 200,\n", " 2: 947,\n", " 3: 40,\n", " 4: 80,\n", " 5: 200,\n", " 6: 349,\n", " 7: 98,\n", " 8: 100,\n", " 9: 15},\n", " 'matched_group': {0: '',\n", " 1: '',\n", " 2: '',\n", " 3: '',\n", " 4: '',\n", " 5: '',\n", " 6: '',\n", " 7: '',\n", " 8: '',\n", " 9: ''}})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "b = pd.DataFrame({'ID': {0: 'pXwTG',\n", " 1: 'puvjO',\n", " 2: '',\n", " 3: 'TvBHC',\n", " 4: 'TvBHC',\n", " 5: 'TvBHC',\n", " 6: '',\n", " 7: '',\n", " 8: 'vS0Er',\n", " 9: 'a'},\n", " 'value': {0: 1000,\n", " 1: 200,\n", " 2: 1147,\n", " 3: 90,\n", " 4: 15,\n", " 5: 15,\n", " 6: 300,\n", " 7: 49,\n", " 8: 198,\n", " 9: 15},\n", " 'matched_group': {0: '',\n", " 1: '',\n", " 2: '',\n", " 3: '',\n", " 4: '',\n", " 5: '',\n", " 6: '',\n", " 7: '',\n", " 8: '',\n", " 9: ''}})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "a" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "b" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "merged = a.merge(b, how=\"outer\", on=\"ID\", indicator=True)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "merged[\"result\"] = \"\"\n", "merged.loc[merged[\"value_x\"] == merged[\"value_y\"], \"result\"] = \"one to one\"\n", "merged" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "matched_ids = merged[\"ID\"][merged[\"result\"] == \"one to one\"].tolist()\n", "a_remained = a[~a[\"ID\"].isin(matched_ids)]\n", "b_remained = b[~b[\"ID\"].isin(matched_ids)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "a_remained" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "b_remained" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dpss_match = dpss.sequence_matcher(a_remained[\"value\"], b_remained[\"value\"], 5, 5, 1000, True, True)\n", "dpss_match" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "dpss_match[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "for i in range(len(dpss_match[0])):\n", " for a_elem_idx in range(len(dpss_match[0][i][0])):\n", " idx = a_remained.loc[\n", " (a_remained[\"value\"] == dpss_match[0][i][0][a_elem_idx]) & (a_remained[\"matched_group\"] == \"\")].index[0]\n", " a_remained.loc[idx, \"matched_group\"] = i\n", " for b_elem_idx in range(len(dpss_match[0][i][1])):\n", " idx = b_remained.loc[\n", " (b_remained[\"value\"] == dpss_match[0][i][1][b_elem_idx]) & (b_remained[\"matched_group\"] == \"\")].index[0]\n", " b_remained.loc[idx, \"matched_group\"] = i" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "a_remained" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "b_remained" ] } ], "metadata": { "interpreter": { "hash": "f4f645914a7754ea497a6f401fbe5616a5c589c7602116aebc1781809c601840" }, "kernelspec": { "display_name": "Python 3.9.5 ('env': venv)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }