{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "1c23a3e3-332b-4712-a2b8-1788cd14087e", "metadata": {}, "outputs": [], "source": [ "%reset -f" ] }, { "cell_type": "code", "execution_count": 2, "id": "207f38ab-618d-42fa-accb-f400dfc8be34", "metadata": {}, "outputs": [], "source": [ "user = \"HHegde\"\n", "db = f\"/Users/{user}/.data/oaklib/phenio.db\"" ] }, { "cell_type": "code", "execution_count": 3, "id": "2e2ce839-b8cc-4f1a-a931-67b85ba0df4d", "metadata": {}, "outputs": [], "source": [ "%reload_ext sql\n", "%sql sqlite:///{db}" ] }, { "cell_type": "code", "execution_count": 4, "id": "87025967-554f-4c9d-9967-97ce1e40acf7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite:////Users/HHegde/.data/oaklib/phenio.db\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
" ], "text/plain": [ "[('uuid:70269c5a-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011097', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n", " ('uuid:70269c5b-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0002187', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations')]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * FROM term_association LIMIT 2;" ] }, { "cell_type": "code", "execution_count": 5, "id": "e639def1-00e1-4113-9da6-fb2f32701952", "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import pandas as pd\n", "pd.set_option('display.max_rows', None)\n", "from semsimian import Semsimian\n", "from collections import Counter" ] }, { "cell_type": "code", "execution_count": 6, "id": "bb1e79e6-2b18-466f-95fb-d219f8c64431", "metadata": {}, "outputs": [], "source": [ "conn = sqlite3.connect(db)\n", "res = conn.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n", "# tables = res.fetchall()\n", "\n", "# tables" ] }, { "cell_type": "code", "execution_count": 7, "id": "e406c2fa-43e1-4f57-b8be-52334edfdbd6", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
0uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
1uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
2uuid:70269c5c-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0001518ECO:0000269PMID:31675180infores:hpo-annotations
3uuid:70269c5d-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0032792ECO:0000269PMID:31675180infores:hpo-annotations
4uuid:70269c5e-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011451ECO:0000269PMID:31675180infores:hpo-annotations
\n", "
" ], "text/plain": [ " id subject \\\n", "0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "\n", " predicate object evidence_type publication \\\n", "0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n", "1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n", "2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n", "3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n", "4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n", "\n", " source \n", "0 infores:hpo-annotations \n", "1 infores:hpo-annotations \n", "2 infores:hpo-annotations \n", "3 infores:hpo-annotations \n", "4 infores:hpo-annotations " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_term_association = pd.read_sql_query(\"SELECT * FROM term_association\", conn)\n", "df_term_association.head()" ] }, { "cell_type": "code", "execution_count": 8, "id": "76a44b5a-2532-44a4-8226-e74322e289c9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['MONDO', 'HGNC', 'WB', 'MGI', 'RGD', 'Xenbase', 'ZFIN'],\n", " dtype=object)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_term_association['subject'].str.split(\":\").str[0].unique()" ] }, { "cell_type": "code", "execution_count": 9, "id": "23cd279e-77fd-472c-bd13-d21753d3cf9f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['HP', 'WBPhenotype', 'MP', 'XPO', 'ZP'], dtype=object)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_term_association['object'].str.split(\":\").str[0].unique()" ] }, { "cell_type": "code", "execution_count": 10, "id": 
"f8af957d-612c-4922-998e-760553f171ad", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectpredicateobject
808522HGNC:10031biolink:has_phenotypeHP:0002652
956399HGNC:13780biolink:has_phenotypeHP:0001270
710713HGNC:2218biolink:has_phenotypeHP:0000007
220697MONDO:0007949biolink:has_phenotypeHP:0002684
727526HGNC:3534biolink:has_phenotypeHP:0001892
185712MONDO:0007316biolink:has_phenotypeHP:0001293
892081HGNC:20858biolink:has_phenotypeHP:0002315
751827HGNC:5136biolink:has_phenotypeHP:0001156
718733HGNC:2972biolink:has_phenotypeHP:0002355
890804HGNC:16700biolink:has_phenotypeHP:0004209
861364HGNC:2652biolink:has_phenotypeHP:0011985
143160MONDO:0018866biolink:has_phenotypeHP:0001357
868056HGNC:11957biolink:has_phenotypeHP:0002608
239787MONDO:0010184biolink:has_phenotypeHP:0001789
107556MONDO:0013276biolink:has_phenotypeHP:0030873
191045MONDO:0008318biolink:has_phenotypeHP:0001597
836978HGNC:12428biolink:has_phenotypeHP:0003577
191789MONDO:0014339biolink:has_phenotypeHP:0002346
863045HGNC:4661biolink:has_phenotypeHP:0010669
188703MONDO:0019633biolink:has_phenotypeHP:0003326
867924HGNC:467biolink:has_phenotypeHP:0000407
817610HGNC:10889biolink:has_phenotypeHP:0002418
911676HGNC:14889biolink:has_phenotypeHP:0000006
955238HGNC:19711biolink:has_phenotypeHP:0001254
707513HGNC:2092biolink:has_phenotypeHP:0001257
863162HGNC:4431biolink:has_phenotypeHP:0025116
149207MONDO:0017314biolink:has_phenotypeHP:0000767
837087HGNC:12428biolink:has_phenotypeHP:0000494
134709MONDO:0020527biolink:has_phenotypeHP:0000708
201130MONDO:0012271biolink:has_phenotypeHP:0009701
780863HGNC:7707biolink:has_phenotypeHP:0001508
191580MONDO:0021055biolink:has_phenotypeHP:0006725
942774HGNC:25812biolink:has_phenotypeHP:0001976
12356MONDO:0010802biolink:has_phenotypeHP:0001643
857720HGNC:8768biolink:has_phenotypeHP:0003390
685158HGNC:20biolink:has_phenotypeHP:0000973
82514MONDO:0008854biolink:has_phenotypeHP:0001769
768967HGNC:6904biolink:has_phenotypeHP:0001677
234118MONDO:0018631biolink:has_phenotypeHP:0100840
12881MONDO:0010515biolink:has_phenotypeHP:0001166
810622HGNC:10389biolink:has_phenotypeHP:0006758
116022MONDO:0020769biolink:has_phenotypeHP:0000750
44850MONDO:0008965biolink:has_phenotypeHP:0002139
848690HGNC:4790biolink:has_phenotypeHP:0002212
854532HGNC:12771biolink:has_phenotypeHP:0004603
207826MONDO:0014024biolink:has_phenotypeHP:0006380
881431HGNC:30074biolink:has_phenotypeHP:0000358
954759HGNC:25902biolink:has_phenotypeHP:0001249
775560HGNC:7481biolink:has_phenotypeHP:0007302
63788MONDO:0044208biolink:has_phenotypeHP:0001873
\n", "
" ], "text/plain": [ " subject predicate object\n", "808522 HGNC:10031 biolink:has_phenotype HP:0002652\n", "956399 HGNC:13780 biolink:has_phenotype HP:0001270\n", "710713 HGNC:2218 biolink:has_phenotype HP:0000007\n", "220697 MONDO:0007949 biolink:has_phenotype HP:0002684\n", "727526 HGNC:3534 biolink:has_phenotype HP:0001892\n", "185712 MONDO:0007316 biolink:has_phenotype HP:0001293\n", "892081 HGNC:20858 biolink:has_phenotype HP:0002315\n", "751827 HGNC:5136 biolink:has_phenotype HP:0001156\n", "718733 HGNC:2972 biolink:has_phenotype HP:0002355\n", "890804 HGNC:16700 biolink:has_phenotype HP:0004209\n", "861364 HGNC:2652 biolink:has_phenotype HP:0011985\n", "143160 MONDO:0018866 biolink:has_phenotype HP:0001357\n", "868056 HGNC:11957 biolink:has_phenotype HP:0002608\n", "239787 MONDO:0010184 biolink:has_phenotype HP:0001789\n", "107556 MONDO:0013276 biolink:has_phenotype HP:0030873\n", "191045 MONDO:0008318 biolink:has_phenotype HP:0001597\n", "836978 HGNC:12428 biolink:has_phenotype HP:0003577\n", "191789 MONDO:0014339 biolink:has_phenotype HP:0002346\n", "863045 HGNC:4661 biolink:has_phenotype HP:0010669\n", "188703 MONDO:0019633 biolink:has_phenotype HP:0003326\n", "867924 HGNC:467 biolink:has_phenotype HP:0000407\n", "817610 HGNC:10889 biolink:has_phenotype HP:0002418\n", "911676 HGNC:14889 biolink:has_phenotype HP:0000006\n", "955238 HGNC:19711 biolink:has_phenotype HP:0001254\n", "707513 HGNC:2092 biolink:has_phenotype HP:0001257\n", "863162 HGNC:4431 biolink:has_phenotype HP:0025116\n", "149207 MONDO:0017314 biolink:has_phenotype HP:0000767\n", "837087 HGNC:12428 biolink:has_phenotype HP:0000494\n", "134709 MONDO:0020527 biolink:has_phenotype HP:0000708\n", "201130 MONDO:0012271 biolink:has_phenotype HP:0009701\n", "780863 HGNC:7707 biolink:has_phenotype HP:0001508\n", "191580 MONDO:0021055 biolink:has_phenotype HP:0006725\n", "942774 HGNC:25812 biolink:has_phenotype HP:0001976\n", "12356 MONDO:0010802 biolink:has_phenotype HP:0001643\n", "857720 
HGNC:8768 biolink:has_phenotype HP:0003390\n", "685158 HGNC:20 biolink:has_phenotype HP:0000973\n", "82514 MONDO:0008854 biolink:has_phenotype HP:0001769\n", "768967 HGNC:6904 biolink:has_phenotype HP:0001677\n", "234118 MONDO:0018631 biolink:has_phenotype HP:0100840\n", "12881 MONDO:0010515 biolink:has_phenotype HP:0001166\n", "810622 HGNC:10389 biolink:has_phenotype HP:0006758\n", "116022 MONDO:0020769 biolink:has_phenotype HP:0000750\n", "44850 MONDO:0008965 biolink:has_phenotype HP:0002139\n", "848690 HGNC:4790 biolink:has_phenotype HP:0002212\n", "854532 HGNC:12771 biolink:has_phenotype HP:0004603\n", "207826 MONDO:0014024 biolink:has_phenotype HP:0006380\n", "881431 HGNC:30074 biolink:has_phenotype HP:0000358\n", "954759 HGNC:25902 biolink:has_phenotype HP:0001249\n", "775560 HGNC:7481 biolink:has_phenotype HP:0007302\n", "63788 MONDO:0044208 biolink:has_phenotype HP:0001873" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_100 = df_term_association[df_term_association['object'].str.startswith(\"HP:\")].sample(n=50)#, random_state=1)\n", "df_100 = df_100[['subject', 'predicate', 'object']]\n", "df_100" ] }, { "cell_type": "code", "execution_count": 20, "id": "68078a59-7ac9-419a-af3b-9b4e089c3e4f", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'HP:0000023',\n", " 'HP:0000098',\n", " 'HP:0000175',\n", " 'HP:0000189',\n", " 'HP:0000218',\n", " 'HP:0000268',\n", " 'HP:0000272',\n", " 'HP:0000275',\n", " 'HP:0000276',\n", " 'HP:0000278',\n", " 'HP:0000347',\n", " 'HP:0000486',\n", " 'HP:0000490',\n", " 'HP:0000494',\n", " 'HP:0000501',\n", " 'HP:0000505',\n", " 'HP:0000518',\n", " 'HP:0000541',\n", " 'HP:0000545',\n", " 'HP:0000565',\n", " 'HP:0000577',\n", " 'HP:0000678',\n", " 'HP:0000767',\n", " 'HP:0000768',\n", " 'HP:0000938',\n", " 'HP:0000939',\n", " 'HP:0001065',\n", " 'HP:0001083',\n", " 'HP:0001132',\n", " 'HP:0001166',\n", " 'HP:0001252',\n", " 'HP:0001371',\n", " 
'HP:0001382',\n", " 'HP:0001519',\n", " 'HP:0001533',\n", " 'HP:0001634',\n", " 'HP:0001635',\n", " 'HP:0001653',\n", " 'HP:0001659',\n", " 'HP:0001704',\n", " 'HP:0001761',\n", " 'HP:0001763',\n", " 'HP:0001765',\n", " 'HP:0002097',\n", " 'HP:0002105',\n", " 'HP:0002107',\n", " 'HP:0002108',\n", " 'HP:0002360',\n", " 'HP:0002435',\n", " 'HP:0002616',\n", " 'HP:0002636',\n", " 'HP:0002647',\n", " 'HP:0002650',\n", " 'HP:0002705',\n", " 'HP:0002751',\n", " 'HP:0002808',\n", " 'HP:0002816',\n", " 'HP:0002996',\n", " 'HP:0003088',\n", " 'HP:0003179',\n", " 'HP:0003199',\n", " 'HP:0003202',\n", " 'HP:0003302',\n", " 'HP:0003326',\n", " 'HP:0003758',\n", " 'HP:0004298',\n", " 'HP:0004326',\n", " 'HP:0004382',\n", " 'HP:0004872',\n", " 'HP:0004927',\n", " 'HP:0004933',\n", " 'HP:0004970',\n", " 'HP:0005059',\n", " 'HP:0005136',\n", " 'HP:0005294',\n", " 'HP:0006687',\n", " 'HP:0007018',\n", " 'HP:0007676',\n", " 'HP:0007720',\n", " 'HP:0007800',\n", " 'HP:0008132',\n", " 'HP:0010807',\n", " 'HP:0012019',\n", " 'HP:0012369',\n", " 'HP:0012432',\n", " 'HP:0012499',\n", " 'HP:0025586',\n", " 'HP:0032934',\n", " 'HP:0100775'}" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# object_terms = set(df_100['object'])\n", "# Test Marfan syndrome\n", "object_terms = {\n", " \"HP:0100775\",\n", " \"HP:0003179\",\n", " \"HP:0001083\",\n", " \"HP:0000501\",\n", " \"HP:0002705\",\n", " \"HP:0004382\",\n", " \"HP:0004326\",\n", " \"HP:0002816\",\n", " \"HP:0004298\",\n", " \"HP:0002996\",\n", " \"HP:0002808\",\n", " \"HP:0002751\",\n", " \"HP:0002647\",\n", " \"HP:0002636\",\n", " \"HP:0002616\",\n", " \"HP:0002435\",\n", " \"HP:0002360\",\n", " \"HP:0007800\",\n", " \"HP:0032934\",\n", " \"HP:0012432\",\n", " \"HP:0007720\",\n", " \"HP:0002107\",\n", " \"HP:0002105\",\n", " \"HP:0007676\",\n", " \"HP:0000939\",\n", " \"HP:0000938\",\n", " \"HP:0002097\",\n", " \"HP:0012369\",\n", " \"HP:0000767\",\n", " \"HP:0000678\",\n", " 
\"HP:0012019\",\n", " \"HP:0010807\",\n", " \"HP:0000577\",\n", " \"HP:0000565\",\n", " \"HP:0000545\",\n", " \"HP:0000541\",\n", " \"HP:0000494\",\n", " \"HP:0000486\",\n", " \"HP:0006687\",\n", " \"HP:0007018\",\n", " \"HP:0000278\",\n", " \"HP:0000276\",\n", " \"HP:0000275\",\n", " \"HP:0000272\",\n", " \"HP:0000268\",\n", " \"HP:0000218\",\n", " \"HP:0000189\",\n", " \"HP:0000175\",\n", " \"HP:0000098\",\n", " \"HP:0000023\",\n", " \"HP:0001635\",\n", " \"HP:0001763\",\n", " \"HP:0005294\",\n", " \"HP:0003758\",\n", " \"HP:0003326\",\n", " \"HP:0003302\",\n", " \"HP:0003202\",\n", " \"HP:0003199\",\n", " \"HP:0005059\",\n", " \"HP:0003088\",\n", " \"HP:0025586\",\n", " \"HP:0005136\",\n", " \"HP:0001761\",\n", " \"HP:0001704\",\n", " \"HP:0001765\",\n", " \"HP:0001659\",\n", " \"HP:0001653\",\n", " \"HP:0001634\",\n", " \"HP:0001533\",\n", " \"HP:0001519\",\n", " \"HP:0008132\",\n", " \"HP:0001382\",\n", " \"HP:0001371\",\n", " \"HP:0001252\",\n", " \"HP:0001166\",\n", " \"HP:0001132\",\n", " \"HP:0000347\",\n", " \"HP:0001065\",\n", " \"HP:0000490\",\n", " \"HP:0000505\",\n", " \"HP:0000518\",\n", " \"HP:0000768\",\n", " \"HP:0004970\",\n", " \"HP:0004933\",\n", " \"HP:0004927\",\n", " \"HP:0002108\",\n", " \"HP:0004872\",\n", " \"HP:0012499\",\n", " \"HP:0002650\"\n", "}\n", "object_terms" ] }, { "cell_type": "code", "execution_count": 21, "id": "3e1acac2-16e3-4088-a2a2-5a386f276e40", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 27.3 s, sys: 2.55 s, total: 29.8 s\n", "Wall time: 30.3 s\n" ] } ], "source": [ "%%time\n", "\n", "predicates= [\n", " \"rdfs:subClassOf\",\n", " \"BFO:0000050\",\n", " \"UPHENO:0000001\",\n", " ]\n", "semsimian = Semsimian(\n", " spo=None,\n", " predicates=predicates,\n", " pairwise_similarity_attributes=None,\n", " resource_path=db,\n", " )\n" ] }, { "cell_type": "code", "execution_count": 22, "id": "01828784-8ec6-4490-8631-117b1886906d", "metadata": {}, "outputs": [], 
"source": [ "def get_search_results(semsimian, object_terms, search_type):\n", " subject_prefixes = [\"MONDO:\"]\n", " assoc_predicate = {\"biolink:has_phenotype\"}\n", " limit = 100\n", " return semsimian.associations_search(\n", " assoc_predicate,\n", " object_terms,\n", " False,\n", " search_type,\n", " None,\n", " subject_prefixes,\n", " limit,\n", " )\n", "\n", "def calculate_overlap(df, n):\n", " # Ensure n is not greater than the length of the dataframe\n", " n = min(n, len(df))\n", " \n", " # Slice the dataframe to only consider the first n rows\n", " df_sliced = df.iloc[:n]\n", "\n", " # Calculate the number of items in Full_search that also exist in Hybrid_search\n", " num_matches = sum(df_sliced[\"Full_search\"].isin(df_sliced[\"Hybrid_search\"]))\n", " \n", " \n", " # Calculate the percentage overlap\n", " percent_overlap = (num_matches / n) * 100\n", " \n", " return f\"{percent_overlap}%\"\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "1118e317-0a47-4d76-9fd7-8b36edf2755c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3h 40min 26s, sys: 4min, total: 3h 44min 27s\n", "Wall time: 17min 41s\n" ] } ], "source": [ "%%time\n", "\n", "result_full = get_search_results(semsimian, object_terms, \"full\")" ] }, { "cell_type": "code", "execution_count": 24, "id": "ab175771-2365-4091-9a34-3ee8db889bd5", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using cache! 
\"MONDO:biolink:has_phenotypefull\"\n", "CPU times: user 47min 33s, sys: 24 s, total: 47min 57s\n", "Wall time: 3min 47s\n" ] } ], "source": [ "%%time\n", "\n", "result_hybrid = get_search_results(semsimian, object_terms, \"hybrid\")" ] }, { "cell_type": "code", "execution_count": 25, "id": "466f9dcd-977e-4387-af4a-51e41a01fa30", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result_full_curie_score = [[curie, score] for (score, _, curie) in result_full]\n", "result_hybrid_curie_score = [[curie, score] for (score, _, curie) in result_hybrid]\n", "len(result_full_curie_score) == len(result_hybrid_curie_score)" ] }, { "cell_type": "code", "execution_count": 26, "id": "b17dfaaa-bb46-4233-9937-0af2221320a0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Full_searchFull_scoreHybrid_searchHybrid_scoreMATCH
0MONDO:000794716.221826MONDO:000794716.221826True
1MONDO:001389713.967443MONDO:001389713.967443True
2MONDO:001342613.811724MONDO:001342613.811724True
3MONDO:001143113.382852MONDO:001143113.382852True
4MONDO:001242713.208601MONDO:001242713.208601True
5MONDO:001451413.087608MONDO:001451413.087608True
6MONDO:001495013.018393MONDO:001495013.018393True
7MONDO:001217112.959062MONDO:001217112.959062True
8MONDO:003073112.957262MONDO:001426212.820997False
9MONDO:001426212.820997MONDO:001051512.669963False
10MONDO:002452912.720854MONDO:001600212.539704False
11MONDO:001051512.669963MONDO:001962512.517714False
12MONDO:001600212.539704MONDO:000915912.460405False
13MONDO:001962512.517714MONDO:001935412.449903False
14MONDO:000915912.460405MONDO:000881812.360394False
15MONDO:001935412.449903MONDO:001483112.351288False
16MONDO:000881812.360394MONDO:000842612.322358False
17MONDO:001483112.351288MONDO:001895412.258602False
18MONDO:000842612.322358MONDO:003050012.234267False
19MONDO:001895412.258602MONDO:085915112.224425False
20MONDO:003050012.234267MONDO:001956712.207587False
21MONDO:085915112.224425MONDO:001221212.179639False
22MONDO:001956712.207587MONDO:003402412.154007False
23MONDO:001221212.179639MONDO:001201312.123645False
24MONDO:003402412.154007MONDO:001730912.097610False
25MONDO:001201312.123645MONDO:000736312.089953False
26MONDO:001730912.097610MONDO:000951112.071908False
27MONDO:000736312.089953MONDO:005481312.060745False
28MONDO:000951112.071908MONDO:000753712.039003False
29MONDO:005481312.060745MONDO:001375412.003640False
30MONDO:000753712.039003MONDO:085917711.883261False
31MONDO:000737212.017054MONDO:003402111.837237False
32MONDO:001375412.003640MONDO:000716011.786118False
33MONDO:000772011.898544MONDO:001523011.781601False
34MONDO:085917711.883261MONDO:000867811.781164False
35MONDO:003402111.837237MONDO:001901911.739629False
36MONDO:001256911.798970MONDO:001075311.735383False
37MONDO:001191511.798970MONDO:001305111.722570False
38MONDO:000716011.786118MONDO:002746211.721282False
39MONDO:001523011.781601MONDO:000705711.705854False
40MONDO:000867811.781164MONDO:001780611.662459False
41MONDO:001901911.739629MONDO:001413911.641001False
42MONDO:001075311.735383MONDO:000957911.625764False
43MONDO:001305111.722570MONDO:001114711.617690False
44MONDO:002746211.721282MONDO:085919311.614443False
45MONDO:000705711.705854MONDO:002068111.610557False
46MONDO:001780611.662459MONDO:001291411.609362False
47MONDO:001413911.641001MONDO:003270711.587106False
48MONDO:000957911.625764MONDO:000752211.585127False
49MONDO:001114711.617690MONDO:000924211.574202False
50MONDO:085919311.614443MONDO:006053211.572993False
51MONDO:002068111.610557MONDO:001277311.549693False
52MONDO:001291411.609362MONDO:001019411.547988False
53MONDO:003270711.587106MONDO:000752511.546833False
54MONDO:000752211.585127MONDO:000911211.545883False
55MONDO:000924211.574202MONDO:001380011.537000False
56MONDO:006053211.572993MONDO:001731411.535089False
57MONDO:001277311.549693MONDO:000931811.533214False
58MONDO:001019411.547988MONDO:001423611.530959False
59MONDO:000752511.546833MONDO:001020811.526944False
60MONDO:000911211.545883MONDO:002673311.514011False
61MONDO:001380011.537000MONDO:002453511.511612False
62MONDO:001731411.535089MONDO:000971711.485904False
63MONDO:000931811.533214MONDO:001110611.470966False
64MONDO:001423611.530959MONDO:000831011.469897False
65MONDO:001020811.526944MONDO:000936311.464871False
66MONDO:002673311.514011MONDO:001056111.456316False
67MONDO:002453511.511612MONDO:001026111.455513False
68MONDO:000971711.485904MONDO:002745111.449975False
69MONDO:001110611.470966MONDO:001114211.447775False
70MONDO:000831011.469897MONDO:000903311.441172False
71MONDO:000936311.464871MONDO:001065311.425481False
72MONDO:001056111.456316MONDO:085919411.411297False
73MONDO:001026111.455513MONDO:001044111.402327False
74MONDO:002745111.449975MONDO:000794911.401413False
75MONDO:001114211.447775MONDO:001234211.398925False
76MONDO:000903311.441172MONDO:001007511.390286False
77MONDO:001065311.425481MONDO:001031011.381537False
78MONDO:085919411.411297MONDO:001756911.376257False
79MONDO:001044111.402327MONDO:000992611.375838False
80MONDO:000794911.401413MONDO:001892311.357222False
81MONDO:001234211.398925MONDO:000752411.354872False
82MONDO:001007511.390286MONDO:001470011.352468False
83MONDO:001031011.381537MONDO:000956611.350380False
84MONDO:001756911.376257MONDO:001124411.348228False
85MONDO:000992611.375838MONDO:001957111.347518False
86MONDO:001892311.357222MONDO:000899911.346674False
87MONDO:000991011.355538MONDO:001160411.338133False
88MONDO:000752411.354872MONDO:001249611.332304False
89MONDO:001470011.352468MONDO:001057111.330400False
90MONDO:000956611.350380MONDO:000773811.330224False
91MONDO:001124411.348228MONDO:001287311.328784False
92MONDO:001957111.347518MONDO:001059011.325360False
93MONDO:000899911.346674MONDO:000916111.321351False
94MONDO:001160411.338133MONDO:000905211.312873False
95MONDO:001437911.337413MONDO:001065011.307532False
96MONDO:001249611.332304MONDO:001274011.307180False
97MONDO:001057111.330400MONDO:001149311.303764False
98MONDO:000773811.330224MONDO:001285311.302427False
99MONDO:001287311.328784MONDO:000752311.299591False
\n", "
" ], "text/plain": [ " Full_search Full_score Hybrid_search Hybrid_score MATCH\n", "0 MONDO:0007947 16.221826 MONDO:0007947 16.221826 True\n", "1 MONDO:0013897 13.967443 MONDO:0013897 13.967443 True\n", "2 MONDO:0013426 13.811724 MONDO:0013426 13.811724 True\n", "3 MONDO:0011431 13.382852 MONDO:0011431 13.382852 True\n", "4 MONDO:0012427 13.208601 MONDO:0012427 13.208601 True\n", "5 MONDO:0014514 13.087608 MONDO:0014514 13.087608 True\n", "6 MONDO:0014950 13.018393 MONDO:0014950 13.018393 True\n", "7 MONDO:0012171 12.959062 MONDO:0012171 12.959062 True\n", "8 MONDO:0030731 12.957262 MONDO:0014262 12.820997 False\n", "9 MONDO:0014262 12.820997 MONDO:0010515 12.669963 False\n", "10 MONDO:0024529 12.720854 MONDO:0016002 12.539704 False\n", "11 MONDO:0010515 12.669963 MONDO:0019625 12.517714 False\n", "12 MONDO:0016002 12.539704 MONDO:0009159 12.460405 False\n", "13 MONDO:0019625 12.517714 MONDO:0019354 12.449903 False\n", "14 MONDO:0009159 12.460405 MONDO:0008818 12.360394 False\n", "15 MONDO:0019354 12.449903 MONDO:0014831 12.351288 False\n", "16 MONDO:0008818 12.360394 MONDO:0008426 12.322358 False\n", "17 MONDO:0014831 12.351288 MONDO:0018954 12.258602 False\n", "18 MONDO:0008426 12.322358 MONDO:0030500 12.234267 False\n", "19 MONDO:0018954 12.258602 MONDO:0859151 12.224425 False\n", "20 MONDO:0030500 12.234267 MONDO:0019567 12.207587 False\n", "21 MONDO:0859151 12.224425 MONDO:0012212 12.179639 False\n", "22 MONDO:0019567 12.207587 MONDO:0034024 12.154007 False\n", "23 MONDO:0012212 12.179639 MONDO:0012013 12.123645 False\n", "24 MONDO:0034024 12.154007 MONDO:0017309 12.097610 False\n", "25 MONDO:0012013 12.123645 MONDO:0007363 12.089953 False\n", "26 MONDO:0017309 12.097610 MONDO:0009511 12.071908 False\n", "27 MONDO:0007363 12.089953 MONDO:0054813 12.060745 False\n", "28 MONDO:0009511 12.071908 MONDO:0007537 12.039003 False\n", "29 MONDO:0054813 12.060745 MONDO:0013754 12.003640 False\n", "30 MONDO:0007537 12.039003 MONDO:0859177 11.883261 False\n", "31 
MONDO:0007372 12.017054 MONDO:0034021 11.837237 False\n", "32 MONDO:0013754 12.003640 MONDO:0007160 11.786118 False\n", "33 MONDO:0007720 11.898544 MONDO:0015230 11.781601 False\n", "34 MONDO:0859177 11.883261 MONDO:0008678 11.781164 False\n", "35 MONDO:0034021 11.837237 MONDO:0019019 11.739629 False\n", "36 MONDO:0012569 11.798970 MONDO:0010753 11.735383 False\n", "37 MONDO:0011915 11.798970 MONDO:0013051 11.722570 False\n", "38 MONDO:0007160 11.786118 MONDO:0027462 11.721282 False\n", "39 MONDO:0015230 11.781601 MONDO:0007057 11.705854 False\n", "40 MONDO:0008678 11.781164 MONDO:0017806 11.662459 False\n", "41 MONDO:0019019 11.739629 MONDO:0014139 11.641001 False\n", "42 MONDO:0010753 11.735383 MONDO:0009579 11.625764 False\n", "43 MONDO:0013051 11.722570 MONDO:0011147 11.617690 False\n", "44 MONDO:0027462 11.721282 MONDO:0859193 11.614443 False\n", "45 MONDO:0007057 11.705854 MONDO:0020681 11.610557 False\n", "46 MONDO:0017806 11.662459 MONDO:0012914 11.609362 False\n", "47 MONDO:0014139 11.641001 MONDO:0032707 11.587106 False\n", "48 MONDO:0009579 11.625764 MONDO:0007522 11.585127 False\n", "49 MONDO:0011147 11.617690 MONDO:0009242 11.574202 False\n", "50 MONDO:0859193 11.614443 MONDO:0060532 11.572993 False\n", "51 MONDO:0020681 11.610557 MONDO:0012773 11.549693 False\n", "52 MONDO:0012914 11.609362 MONDO:0010194 11.547988 False\n", "53 MONDO:0032707 11.587106 MONDO:0007525 11.546833 False\n", "54 MONDO:0007522 11.585127 MONDO:0009112 11.545883 False\n", "55 MONDO:0009242 11.574202 MONDO:0013800 11.537000 False\n", "56 MONDO:0060532 11.572993 MONDO:0017314 11.535089 False\n", "57 MONDO:0012773 11.549693 MONDO:0009318 11.533214 False\n", "58 MONDO:0010194 11.547988 MONDO:0014236 11.530959 False\n", "59 MONDO:0007525 11.546833 MONDO:0010208 11.526944 False\n", "60 MONDO:0009112 11.545883 MONDO:0026733 11.514011 False\n", "61 MONDO:0013800 11.537000 MONDO:0024535 11.511612 False\n", "62 MONDO:0017314 11.535089 MONDO:0009717 11.485904 False\n", "63 MONDO:0009318 
11.533214 MONDO:0011106 11.470966 False\n", "64 MONDO:0014236 11.530959 MONDO:0008310 11.469897 False\n", "65 MONDO:0010208 11.526944 MONDO:0009363 11.464871 False\n", "66 MONDO:0026733 11.514011 MONDO:0010561 11.456316 False\n", "67 MONDO:0024535 11.511612 MONDO:0010261 11.455513 False\n", "68 MONDO:0009717 11.485904 MONDO:0027451 11.449975 False\n", "69 MONDO:0011106 11.470966 MONDO:0011142 11.447775 False\n", "70 MONDO:0008310 11.469897 MONDO:0009033 11.441172 False\n", "71 MONDO:0009363 11.464871 MONDO:0010653 11.425481 False\n", "72 MONDO:0010561 11.456316 MONDO:0859194 11.411297 False\n", "73 MONDO:0010261 11.455513 MONDO:0010441 11.402327 False\n", "74 MONDO:0027451 11.449975 MONDO:0007949 11.401413 False\n", "75 MONDO:0011142 11.447775 MONDO:0012342 11.398925 False\n", "76 MONDO:0009033 11.441172 MONDO:0010075 11.390286 False\n", "77 MONDO:0010653 11.425481 MONDO:0010310 11.381537 False\n", "78 MONDO:0859194 11.411297 MONDO:0017569 11.376257 False\n", "79 MONDO:0010441 11.402327 MONDO:0009926 11.375838 False\n", "80 MONDO:0007949 11.401413 MONDO:0018923 11.357222 False\n", "81 MONDO:0012342 11.398925 MONDO:0007524 11.354872 False\n", "82 MONDO:0010075 11.390286 MONDO:0014700 11.352468 False\n", "83 MONDO:0010310 11.381537 MONDO:0009566 11.350380 False\n", "84 MONDO:0017569 11.376257 MONDO:0011244 11.348228 False\n", "85 MONDO:0009926 11.375838 MONDO:0019571 11.347518 False\n", "86 MONDO:0018923 11.357222 MONDO:0008999 11.346674 False\n", "87 MONDO:0009910 11.355538 MONDO:0011604 11.338133 False\n", "88 MONDO:0007524 11.354872 MONDO:0012496 11.332304 False\n", "89 MONDO:0014700 11.352468 MONDO:0010571 11.330400 False\n", "90 MONDO:0009566 11.350380 MONDO:0007738 11.330224 False\n", "91 MONDO:0011244 11.348228 MONDO:0012873 11.328784 False\n", "92 MONDO:0019571 11.347518 MONDO:0010590 11.325360 False\n", "93 MONDO:0008999 11.346674 MONDO:0009161 11.321351 False\n", "94 MONDO:0011604 11.338133 MONDO:0009052 11.312873 False\n", "95 MONDO:0014379 11.337413 
MONDO:0010650 11.307532 False\n", "96 MONDO:0012496 11.332304 MONDO:0012740 11.307180 False\n", "97 MONDO:0010571 11.330400 MONDO:0011493 11.303764 False\n", "98 MONDO:0007738 11.330224 MONDO:0012853 11.302427 False\n", "99 MONDO:0012873 11.328784 MONDO:0007523 11.299591 False" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_full = pd.DataFrame(result_full_curie_score, columns=[\"Full_search\", \"Full_score\"])\n", "df_hybrid = pd.DataFrame(result_hybrid_curie_score, columns=[\"Hybrid_search\", \"Hybrid_score\"])\n", "df = pd.concat([df_full, df_hybrid], axis=1)\n", "df[\"MATCH\"] = df[\"Full_search\"] == df[\"Hybrid_search\"]\n", "df\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "6418f222-6ff4-4bc8-b523-8505bdeae547", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Percentage of common terms: 92.0%\n" ] } ], "source": [ "# Extract the first element (the MONDO ID) from each item in result_full_curie_score and result_hybrid_curie_score\n", "search_result_1 = set([item[0] for item in result_full_curie_score])\n", "search_result_2 = set([item[0] for item in result_hybrid_curie_score])\n", "\n", "# Find the intersection of the two sets\n", "common_results = search_result_1.intersection(search_result_2)\n", "\n", "# Calculate the percentage of full-search terms that are also in the hybrid results\n", "percentage_common = (len(common_results) / len(search_result_1)) * 100\n", "\n", "print(f\"Percentage of common terms: {percentage_common}%\")\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "3b075f06-b218-4bfe-aa4d-f8c48ad12324", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'90.0%'" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "calculate_overlap(df, 10)" ] }, { "cell_type": "code", "execution_count": null, "id": "8160ec1f-27f3-468e-9119-c84792e1498c", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 5 }