{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "1c23a3e3-332b-4712-a2b8-1788cd14087e",
"metadata": {},
"outputs": [],
"source": [
"%reset -f"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "207f38ab-618d-42fa-accb-f400dfc8be34",
"metadata": {},
"outputs": [],
"source": [
"user = \"HHegde\"\n",
"db = f\"/Users/{user}/.data/oaklib/phenio.db\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2e2ce839-b8cc-4f1a-a931-67b85ba0df4d",
"metadata": {},
"outputs": [],
"source": [
"%reload_ext sql\n",
"%sql sqlite:///{db}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "87025967-554f-4c9d-9967-97ce1e40acf7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite:////Users/HHegde/.data/oaklib/phenio.db\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"[('uuid:70269c5a-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011097', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n",
" ('uuid:70269c5b-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0002187', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * FROM term_association LIMIT 2;"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e639def1-00e1-4113-9da6-fb2f32701952",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import pandas as pd\n",
"pd.set_option('display.max_rows', None)\n",
"from semsimian import Semsimian\n",
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "bb1e79e6-2b18-466f-95fb-d219f8c64431",
"metadata": {},
"outputs": [],
"source": [
"conn = sqlite3.connect(db)\n",
"res = conn.execute(\"SELECT name FROM sqlite_master WHERE type='table';\")\n",
"# tables = res.fetchall()\n",
"\n",
"# tables"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e406c2fa-43e1-4f57-b8be-52334edfdbd6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 1 | \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 2 | \n",
" uuid:70269c5c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0001518 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 3 | \n",
" uuid:70269c5d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0032792 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 4 | \n",
" uuid:70269c5e-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011451 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id subject \\\n",
"0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"\n",
" predicate object evidence_type publication \\\n",
"0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n",
"1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n",
"2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n",
"3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n",
"4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n",
"\n",
" source \n",
"0 infores:hpo-annotations \n",
"1 infores:hpo-annotations \n",
"2 infores:hpo-annotations \n",
"3 infores:hpo-annotations \n",
"4 infores:hpo-annotations "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_term_association = pd.read_sql_query(\"SELECT * FROM term_association\", conn)\n",
"df_term_association.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "76a44b5a-2532-44a4-8226-e74322e289c9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['MONDO', 'HGNC', 'WB', 'MGI', 'RGD', 'Xenbase', 'ZFIN'],\n",
" dtype=object)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_term_association['subject'].str.split(\":\").str[0].unique()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "23cd279e-77fd-472c-bd13-d21753d3cf9f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['HP', 'WBPhenotype', 'MP', 'XPO', 'ZP'], dtype=object)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_term_association['object'].str.split(\":\").str[0].unique()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f8af957d-612c-4922-998e-760553f171ad",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject | \n",
" predicate | \n",
" object | \n",
"
\n",
" \n",
" \n",
" \n",
" 808522 | \n",
" HGNC:10031 | \n",
" biolink:has_phenotype | \n",
" HP:0002652 | \n",
"
\n",
" \n",
" 956399 | \n",
" HGNC:13780 | \n",
" biolink:has_phenotype | \n",
" HP:0001270 | \n",
"
\n",
" \n",
" 710713 | \n",
" HGNC:2218 | \n",
" biolink:has_phenotype | \n",
" HP:0000007 | \n",
"
\n",
" \n",
" 220697 | \n",
" MONDO:0007949 | \n",
" biolink:has_phenotype | \n",
" HP:0002684 | \n",
"
\n",
" \n",
" 727526 | \n",
" HGNC:3534 | \n",
" biolink:has_phenotype | \n",
" HP:0001892 | \n",
"
\n",
" \n",
" 185712 | \n",
" MONDO:0007316 | \n",
" biolink:has_phenotype | \n",
" HP:0001293 | \n",
"
\n",
" \n",
" 892081 | \n",
" HGNC:20858 | \n",
" biolink:has_phenotype | \n",
" HP:0002315 | \n",
"
\n",
" \n",
" 751827 | \n",
" HGNC:5136 | \n",
" biolink:has_phenotype | \n",
" HP:0001156 | \n",
"
\n",
" \n",
" 718733 | \n",
" HGNC:2972 | \n",
" biolink:has_phenotype | \n",
" HP:0002355 | \n",
"
\n",
" \n",
" 890804 | \n",
" HGNC:16700 | \n",
" biolink:has_phenotype | \n",
" HP:0004209 | \n",
"
\n",
" \n",
" 861364 | \n",
" HGNC:2652 | \n",
" biolink:has_phenotype | \n",
" HP:0011985 | \n",
"
\n",
" \n",
" 143160 | \n",
" MONDO:0018866 | \n",
" biolink:has_phenotype | \n",
" HP:0001357 | \n",
"
\n",
" \n",
" 868056 | \n",
" HGNC:11957 | \n",
" biolink:has_phenotype | \n",
" HP:0002608 | \n",
"
\n",
" \n",
" 239787 | \n",
" MONDO:0010184 | \n",
" biolink:has_phenotype | \n",
" HP:0001789 | \n",
"
\n",
" \n",
" 107556 | \n",
" MONDO:0013276 | \n",
" biolink:has_phenotype | \n",
" HP:0030873 | \n",
"
\n",
" \n",
" 191045 | \n",
" MONDO:0008318 | \n",
" biolink:has_phenotype | \n",
" HP:0001597 | \n",
"
\n",
" \n",
" 836978 | \n",
" HGNC:12428 | \n",
" biolink:has_phenotype | \n",
" HP:0003577 | \n",
"
\n",
" \n",
" 191789 | \n",
" MONDO:0014339 | \n",
" biolink:has_phenotype | \n",
" HP:0002346 | \n",
"
\n",
" \n",
" 863045 | \n",
" HGNC:4661 | \n",
" biolink:has_phenotype | \n",
" HP:0010669 | \n",
"
\n",
" \n",
" 188703 | \n",
" MONDO:0019633 | \n",
" biolink:has_phenotype | \n",
" HP:0003326 | \n",
"
\n",
" \n",
" 867924 | \n",
" HGNC:467 | \n",
" biolink:has_phenotype | \n",
" HP:0000407 | \n",
"
\n",
" \n",
" 817610 | \n",
" HGNC:10889 | \n",
" biolink:has_phenotype | \n",
" HP:0002418 | \n",
"
\n",
" \n",
" 911676 | \n",
" HGNC:14889 | \n",
" biolink:has_phenotype | \n",
" HP:0000006 | \n",
"
\n",
" \n",
" 955238 | \n",
" HGNC:19711 | \n",
" biolink:has_phenotype | \n",
" HP:0001254 | \n",
"
\n",
" \n",
" 707513 | \n",
" HGNC:2092 | \n",
" biolink:has_phenotype | \n",
" HP:0001257 | \n",
"
\n",
" \n",
" 863162 | \n",
" HGNC:4431 | \n",
" biolink:has_phenotype | \n",
" HP:0025116 | \n",
"
\n",
" \n",
" 149207 | \n",
" MONDO:0017314 | \n",
" biolink:has_phenotype | \n",
" HP:0000767 | \n",
"
\n",
" \n",
" 837087 | \n",
" HGNC:12428 | \n",
" biolink:has_phenotype | \n",
" HP:0000494 | \n",
"
\n",
" \n",
" 134709 | \n",
" MONDO:0020527 | \n",
" biolink:has_phenotype | \n",
" HP:0000708 | \n",
"
\n",
" \n",
" 201130 | \n",
" MONDO:0012271 | \n",
" biolink:has_phenotype | \n",
" HP:0009701 | \n",
"
\n",
" \n",
" 780863 | \n",
" HGNC:7707 | \n",
" biolink:has_phenotype | \n",
" HP:0001508 | \n",
"
\n",
" \n",
" 191580 | \n",
" MONDO:0021055 | \n",
" biolink:has_phenotype | \n",
" HP:0006725 | \n",
"
\n",
" \n",
" 942774 | \n",
" HGNC:25812 | \n",
" biolink:has_phenotype | \n",
" HP:0001976 | \n",
"
\n",
" \n",
" 12356 | \n",
" MONDO:0010802 | \n",
" biolink:has_phenotype | \n",
" HP:0001643 | \n",
"
\n",
" \n",
" 857720 | \n",
" HGNC:8768 | \n",
" biolink:has_phenotype | \n",
" HP:0003390 | \n",
"
\n",
" \n",
" 685158 | \n",
" HGNC:20 | \n",
" biolink:has_phenotype | \n",
" HP:0000973 | \n",
"
\n",
" \n",
" 82514 | \n",
" MONDO:0008854 | \n",
" biolink:has_phenotype | \n",
" HP:0001769 | \n",
"
\n",
" \n",
" 768967 | \n",
" HGNC:6904 | \n",
" biolink:has_phenotype | \n",
" HP:0001677 | \n",
"
\n",
" \n",
" 234118 | \n",
" MONDO:0018631 | \n",
" biolink:has_phenotype | \n",
" HP:0100840 | \n",
"
\n",
" \n",
" 12881 | \n",
" MONDO:0010515 | \n",
" biolink:has_phenotype | \n",
" HP:0001166 | \n",
"
\n",
" \n",
" 810622 | \n",
" HGNC:10389 | \n",
" biolink:has_phenotype | \n",
" HP:0006758 | \n",
"
\n",
" \n",
" 116022 | \n",
" MONDO:0020769 | \n",
" biolink:has_phenotype | \n",
" HP:0000750 | \n",
"
\n",
" \n",
" 44850 | \n",
" MONDO:0008965 | \n",
" biolink:has_phenotype | \n",
" HP:0002139 | \n",
"
\n",
" \n",
" 848690 | \n",
" HGNC:4790 | \n",
" biolink:has_phenotype | \n",
" HP:0002212 | \n",
"
\n",
" \n",
" 854532 | \n",
" HGNC:12771 | \n",
" biolink:has_phenotype | \n",
" HP:0004603 | \n",
"
\n",
" \n",
" 207826 | \n",
" MONDO:0014024 | \n",
" biolink:has_phenotype | \n",
" HP:0006380 | \n",
"
\n",
" \n",
" 881431 | \n",
" HGNC:30074 | \n",
" biolink:has_phenotype | \n",
" HP:0000358 | \n",
"
\n",
" \n",
" 954759 | \n",
" HGNC:25902 | \n",
" biolink:has_phenotype | \n",
" HP:0001249 | \n",
"
\n",
" \n",
" 775560 | \n",
" HGNC:7481 | \n",
" biolink:has_phenotype | \n",
" HP:0007302 | \n",
"
\n",
" \n",
" 63788 | \n",
" MONDO:0044208 | \n",
" biolink:has_phenotype | \n",
" HP:0001873 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" subject predicate object\n",
"808522 HGNC:10031 biolink:has_phenotype HP:0002652\n",
"956399 HGNC:13780 biolink:has_phenotype HP:0001270\n",
"710713 HGNC:2218 biolink:has_phenotype HP:0000007\n",
"220697 MONDO:0007949 biolink:has_phenotype HP:0002684\n",
"727526 HGNC:3534 biolink:has_phenotype HP:0001892\n",
"185712 MONDO:0007316 biolink:has_phenotype HP:0001293\n",
"892081 HGNC:20858 biolink:has_phenotype HP:0002315\n",
"751827 HGNC:5136 biolink:has_phenotype HP:0001156\n",
"718733 HGNC:2972 biolink:has_phenotype HP:0002355\n",
"890804 HGNC:16700 biolink:has_phenotype HP:0004209\n",
"861364 HGNC:2652 biolink:has_phenotype HP:0011985\n",
"143160 MONDO:0018866 biolink:has_phenotype HP:0001357\n",
"868056 HGNC:11957 biolink:has_phenotype HP:0002608\n",
"239787 MONDO:0010184 biolink:has_phenotype HP:0001789\n",
"107556 MONDO:0013276 biolink:has_phenotype HP:0030873\n",
"191045 MONDO:0008318 biolink:has_phenotype HP:0001597\n",
"836978 HGNC:12428 biolink:has_phenotype HP:0003577\n",
"191789 MONDO:0014339 biolink:has_phenotype HP:0002346\n",
"863045 HGNC:4661 biolink:has_phenotype HP:0010669\n",
"188703 MONDO:0019633 biolink:has_phenotype HP:0003326\n",
"867924 HGNC:467 biolink:has_phenotype HP:0000407\n",
"817610 HGNC:10889 biolink:has_phenotype HP:0002418\n",
"911676 HGNC:14889 biolink:has_phenotype HP:0000006\n",
"955238 HGNC:19711 biolink:has_phenotype HP:0001254\n",
"707513 HGNC:2092 biolink:has_phenotype HP:0001257\n",
"863162 HGNC:4431 biolink:has_phenotype HP:0025116\n",
"149207 MONDO:0017314 biolink:has_phenotype HP:0000767\n",
"837087 HGNC:12428 biolink:has_phenotype HP:0000494\n",
"134709 MONDO:0020527 biolink:has_phenotype HP:0000708\n",
"201130 MONDO:0012271 biolink:has_phenotype HP:0009701\n",
"780863 HGNC:7707 biolink:has_phenotype HP:0001508\n",
"191580 MONDO:0021055 biolink:has_phenotype HP:0006725\n",
"942774 HGNC:25812 biolink:has_phenotype HP:0001976\n",
"12356 MONDO:0010802 biolink:has_phenotype HP:0001643\n",
"857720 HGNC:8768 biolink:has_phenotype HP:0003390\n",
"685158 HGNC:20 biolink:has_phenotype HP:0000973\n",
"82514 MONDO:0008854 biolink:has_phenotype HP:0001769\n",
"768967 HGNC:6904 biolink:has_phenotype HP:0001677\n",
"234118 MONDO:0018631 biolink:has_phenotype HP:0100840\n",
"12881 MONDO:0010515 biolink:has_phenotype HP:0001166\n",
"810622 HGNC:10389 biolink:has_phenotype HP:0006758\n",
"116022 MONDO:0020769 biolink:has_phenotype HP:0000750\n",
"44850 MONDO:0008965 biolink:has_phenotype HP:0002139\n",
"848690 HGNC:4790 biolink:has_phenotype HP:0002212\n",
"854532 HGNC:12771 biolink:has_phenotype HP:0004603\n",
"207826 MONDO:0014024 biolink:has_phenotype HP:0006380\n",
"881431 HGNC:30074 biolink:has_phenotype HP:0000358\n",
"954759 HGNC:25902 biolink:has_phenotype HP:0001249\n",
"775560 HGNC:7481 biolink:has_phenotype HP:0007302\n",
"63788 MONDO:0044208 biolink:has_phenotype HP:0001873"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_100 = df_term_association[df_term_association['object'].str.startswith(\"HP:\")].sample(n=50)#, random_state=1)\n",
"df_100 = df_100[['subject', 'predicate', 'object']]\n",
"df_100"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "68078a59-7ac9-419a-af3b-9b4e089c3e4f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"{'HP:0000023',\n",
" 'HP:0000098',\n",
" 'HP:0000175',\n",
" 'HP:0000189',\n",
" 'HP:0000218',\n",
" 'HP:0000268',\n",
" 'HP:0000272',\n",
" 'HP:0000275',\n",
" 'HP:0000276',\n",
" 'HP:0000278',\n",
" 'HP:0000347',\n",
" 'HP:0000486',\n",
" 'HP:0000490',\n",
" 'HP:0000494',\n",
" 'HP:0000501',\n",
" 'HP:0000505',\n",
" 'HP:0000518',\n",
" 'HP:0000541',\n",
" 'HP:0000545',\n",
" 'HP:0000565',\n",
" 'HP:0000577',\n",
" 'HP:0000678',\n",
" 'HP:0000767',\n",
" 'HP:0000768',\n",
" 'HP:0000938',\n",
" 'HP:0000939',\n",
" 'HP:0001065',\n",
" 'HP:0001083',\n",
" 'HP:0001132',\n",
" 'HP:0001166',\n",
" 'HP:0001252',\n",
" 'HP:0001371',\n",
" 'HP:0001382',\n",
" 'HP:0001519',\n",
" 'HP:0001533',\n",
" 'HP:0001634',\n",
" 'HP:0001635',\n",
" 'HP:0001653',\n",
" 'HP:0001659',\n",
" 'HP:0001704',\n",
" 'HP:0001761',\n",
" 'HP:0001763',\n",
" 'HP:0001765',\n",
" 'HP:0002097',\n",
" 'HP:0002105',\n",
" 'HP:0002107',\n",
" 'HP:0002108',\n",
" 'HP:0002360',\n",
" 'HP:0002435',\n",
" 'HP:0002616',\n",
" 'HP:0002636',\n",
" 'HP:0002647',\n",
" 'HP:0002650',\n",
" 'HP:0002705',\n",
" 'HP:0002751',\n",
" 'HP:0002808',\n",
" 'HP:0002816',\n",
" 'HP:0002996',\n",
" 'HP:0003088',\n",
" 'HP:0003179',\n",
" 'HP:0003199',\n",
" 'HP:0003202',\n",
" 'HP:0003302',\n",
" 'HP:0003326',\n",
" 'HP:0003758',\n",
" 'HP:0004298',\n",
" 'HP:0004326',\n",
" 'HP:0004382',\n",
" 'HP:0004872',\n",
" 'HP:0004927',\n",
" 'HP:0004933',\n",
" 'HP:0004970',\n",
" 'HP:0005059',\n",
" 'HP:0005136',\n",
" 'HP:0005294',\n",
" 'HP:0006687',\n",
" 'HP:0007018',\n",
" 'HP:0007676',\n",
" 'HP:0007720',\n",
" 'HP:0007800',\n",
" 'HP:0008132',\n",
" 'HP:0010807',\n",
" 'HP:0012019',\n",
" 'HP:0012369',\n",
" 'HP:0012432',\n",
" 'HP:0012499',\n",
" 'HP:0025586',\n",
" 'HP:0032934',\n",
" 'HP:0100775'}"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# object_terms = set(df_100['object'])\n",
"# Test Marfan syndrome\n",
"object_terms = {\n",
" \"HP:0100775\",\n",
" \"HP:0003179\",\n",
" \"HP:0001083\",\n",
" \"HP:0000501\",\n",
" \"HP:0002705\",\n",
" \"HP:0004382\",\n",
" \"HP:0004326\",\n",
" \"HP:0002816\",\n",
" \"HP:0004298\",\n",
" \"HP:0002996\",\n",
" \"HP:0002808\",\n",
" \"HP:0002751\",\n",
" \"HP:0002647\",\n",
" \"HP:0002636\",\n",
" \"HP:0002616\",\n",
" \"HP:0002435\",\n",
" \"HP:0002360\",\n",
" \"HP:0007800\",\n",
" \"HP:0032934\",\n",
" \"HP:0012432\",\n",
" \"HP:0007720\",\n",
" \"HP:0002107\",\n",
" \"HP:0002105\",\n",
" \"HP:0007676\",\n",
" \"HP:0000939\",\n",
" \"HP:0000938\",\n",
" \"HP:0002097\",\n",
" \"HP:0012369\",\n",
" \"HP:0000767\",\n",
" \"HP:0000678\",\n",
" \"HP:0012019\",\n",
" \"HP:0010807\",\n",
" \"HP:0000577\",\n",
" \"HP:0000565\",\n",
" \"HP:0000545\",\n",
" \"HP:0000541\",\n",
" \"HP:0000494\",\n",
" \"HP:0000486\",\n",
" \"HP:0006687\",\n",
" \"HP:0007018\",\n",
" \"HP:0000278\",\n",
" \"HP:0000276\",\n",
" \"HP:0000275\",\n",
" \"HP:0000272\",\n",
" \"HP:0000268\",\n",
" \"HP:0000218\",\n",
" \"HP:0000189\",\n",
" \"HP:0000175\",\n",
" \"HP:0000098\",\n",
" \"HP:0000023\",\n",
" \"HP:0001635\",\n",
" \"HP:0001763\",\n",
" \"HP:0005294\",\n",
" \"HP:0003758\",\n",
" \"HP:0003326\",\n",
" \"HP:0003302\",\n",
" \"HP:0003202\",\n",
" \"HP:0003199\",\n",
" \"HP:0005059\",\n",
" \"HP:0003088\",\n",
" \"HP:0025586\",\n",
" \"HP:0005136\",\n",
" \"HP:0001761\",\n",
" \"HP:0001704\",\n",
" \"HP:0001765\",\n",
" \"HP:0001659\",\n",
" \"HP:0001653\",\n",
" \"HP:0001634\",\n",
" \"HP:0001533\",\n",
" \"HP:0001519\",\n",
" \"HP:0008132\",\n",
" \"HP:0001382\",\n",
" \"HP:0001371\",\n",
" \"HP:0001252\",\n",
" \"HP:0001166\",\n",
" \"HP:0001132\",\n",
" \"HP:0000347\",\n",
" \"HP:0001065\",\n",
" \"HP:0000490\",\n",
" \"HP:0000505\",\n",
" \"HP:0000518\",\n",
" \"HP:0000768\",\n",
" \"HP:0004970\",\n",
" \"HP:0004933\",\n",
" \"HP:0004927\",\n",
" \"HP:0002108\",\n",
" \"HP:0004872\",\n",
" \"HP:0012499\",\n",
" \"HP:0002650\"\n",
"}\n",
"object_terms"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "3e1acac2-16e3-4088-a2a2-5a386f276e40",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 27.3 s, sys: 2.55 s, total: 29.8 s\n",
"Wall time: 30.3 s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"predicates= [\n",
" \"rdfs:subClassOf\",\n",
" \"BFO:0000050\",\n",
" \"UPHENO:0000001\",\n",
" ]\n",
"semsimian = Semsimian(\n",
" spo=None,\n",
" predicates=predicates,\n",
" pairwise_similarity_attributes=None,\n",
" resource_path=db,\n",
" )\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "01828784-8ec6-4490-8631-117b1886906d",
"metadata": {},
"outputs": [],
"source": [
"def get_search_results(semsimian, object_terms, search_type):\n",
" subject_prefixes = [\"MONDO:\"]\n",
" assoc_predicate = {\"biolink:has_phenotype\"}\n",
" limit = 100\n",
" return semsimian.associations_search(\n",
" assoc_predicate,\n",
" object_terms,\n",
" False,\n",
" search_type,\n",
" None,\n",
" subject_prefixes,\n",
" limit,\n",
" )\n",
"\n",
"def calculate_overlap(df, n):\n",
" # Ensure n is not greater than the length of the dataframe\n",
" n = min(n, len(df))\n",
" \n",
" # Slice the dataframe to only consider the first n rows\n",
" df_sliced = df.iloc[:n]\n",
"\n",
" # Calculate the number of items in Full_search that also exist in Hybrid_search\n",
" num_matches = sum(df_sliced[\"Full_search\"].isin(df_sliced[\"Hybrid_search\"]))\n",
" \n",
" \n",
" # Calculate the percentage overlap\n",
" percent_overlap = (num_matches / n) * 100\n",
" \n",
" return f\"{percent_overlap}%\"\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1118e317-0a47-4d76-9fd7-8b36edf2755c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3h 40min 26s, sys: 4min, total: 3h 44min 27s\n",
"Wall time: 17min 41s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"result_full = get_search_results(semsimian, object_terms, \"full\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "ab175771-2365-4091-9a34-3ee8db889bd5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using cache! \"MONDO:biolink:has_phenotypefull\"\n",
"CPU times: user 47min 33s, sys: 24 s, total: 47min 57s\n",
"Wall time: 3min 47s\n"
]
}
],
"source": [
"%%time\n",
"\n",
"result_hybrid = get_search_results(semsimian, object_terms, \"hybrid\")"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "466f9dcd-977e-4387-af4a-51e41a01fa30",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"result_full_curie_score = [[curie, score] for (score, _, curie) in result_full]\n",
"result_hybrid_curie_score = [[curie, score] for (score, _, curie) in result_hybrid]\n",
"len(result_full_curie_score) == len(result_hybrid_curie_score)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "b17dfaaa-bb46-4233-9937-0af2221320a0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Full_search | \n",
" Full_score | \n",
" Hybrid_search | \n",
" Hybrid_score | \n",
" MATCH | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" MONDO:0007947 | \n",
" 16.221826 | \n",
" MONDO:0007947 | \n",
" 16.221826 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" MONDO:0013897 | \n",
" 13.967443 | \n",
" MONDO:0013897 | \n",
" 13.967443 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" MONDO:0013426 | \n",
" 13.811724 | \n",
" MONDO:0013426 | \n",
" 13.811724 | \n",
" True | \n",
"
\n",
" \n",
" 3 | \n",
" MONDO:0011431 | \n",
" 13.382852 | \n",
" MONDO:0011431 | \n",
" 13.382852 | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" MONDO:0012427 | \n",
" 13.208601 | \n",
" MONDO:0012427 | \n",
" 13.208601 | \n",
" True | \n",
"
\n",
" \n",
" 5 | \n",
" MONDO:0014514 | \n",
" 13.087608 | \n",
" MONDO:0014514 | \n",
" 13.087608 | \n",
" True | \n",
"
\n",
" \n",
" 6 | \n",
" MONDO:0014950 | \n",
" 13.018393 | \n",
" MONDO:0014950 | \n",
" 13.018393 | \n",
" True | \n",
"
\n",
" \n",
" 7 | \n",
" MONDO:0012171 | \n",
" 12.959062 | \n",
" MONDO:0012171 | \n",
" 12.959062 | \n",
" True | \n",
"
\n",
" \n",
" 8 | \n",
" MONDO:0030731 | \n",
" 12.957262 | \n",
" MONDO:0014262 | \n",
" 12.820997 | \n",
" False | \n",
"
\n",
" \n",
" 9 | \n",
" MONDO:0014262 | \n",
" 12.820997 | \n",
" MONDO:0010515 | \n",
" 12.669963 | \n",
" False | \n",
"
\n",
" \n",
" 10 | \n",
" MONDO:0024529 | \n",
" 12.720854 | \n",
" MONDO:0016002 | \n",
" 12.539704 | \n",
" False | \n",
"
\n",
" \n",
" 11 | \n",
" MONDO:0010515 | \n",
" 12.669963 | \n",
" MONDO:0019625 | \n",
" 12.517714 | \n",
" False | \n",
"
\n",
" \n",
" 12 | \n",
" MONDO:0016002 | \n",
" 12.539704 | \n",
" MONDO:0009159 | \n",
" 12.460405 | \n",
" False | \n",
"
\n",
" \n",
" 13 | \n",
" MONDO:0019625 | \n",
" 12.517714 | \n",
" MONDO:0019354 | \n",
" 12.449903 | \n",
" False | \n",
"
\n",
" \n",
" 14 | \n",
" MONDO:0009159 | \n",
" 12.460405 | \n",
" MONDO:0008818 | \n",
" 12.360394 | \n",
" False | \n",
"
\n",
" \n",
" 15 | \n",
" MONDO:0019354 | \n",
" 12.449903 | \n",
" MONDO:0014831 | \n",
" 12.351288 | \n",
" False | \n",
"
\n",
" \n",
" 16 | \n",
" MONDO:0008818 | \n",
" 12.360394 | \n",
" MONDO:0008426 | \n",
" 12.322358 | \n",
" False | \n",
"
\n",
" \n",
" 17 | \n",
" MONDO:0014831 | \n",
" 12.351288 | \n",
" MONDO:0018954 | \n",
" 12.258602 | \n",
" False | \n",
"
\n",
" \n",
" 18 | \n",
" MONDO:0008426 | \n",
" 12.322358 | \n",
" MONDO:0030500 | \n",
" 12.234267 | \n",
" False | \n",
"
\n",
" \n",
" 19 | \n",
" MONDO:0018954 | \n",
" 12.258602 | \n",
" MONDO:0859151 | \n",
" 12.224425 | \n",
" False | \n",
"
\n",
" \n",
" 20 | \n",
" MONDO:0030500 | \n",
" 12.234267 | \n",
" MONDO:0019567 | \n",
" 12.207587 | \n",
" False | \n",
"
\n",
" \n",
" 21 | \n",
" MONDO:0859151 | \n",
" 12.224425 | \n",
" MONDO:0012212 | \n",
" 12.179639 | \n",
" False | \n",
"
\n",
" \n",
" 22 | \n",
" MONDO:0019567 | \n",
" 12.207587 | \n",
" MONDO:0034024 | \n",
" 12.154007 | \n",
" False | \n",
"
\n",
" \n",
" 23 | \n",
" MONDO:0012212 | \n",
" 12.179639 | \n",
" MONDO:0012013 | \n",
" 12.123645 | \n",
" False | \n",
"
\n",
" \n",
" 24 | \n",
" MONDO:0034024 | \n",
" 12.154007 | \n",
" MONDO:0017309 | \n",
" 12.097610 | \n",
" False | \n",
"
\n",
" \n",
" 25 | \n",
" MONDO:0012013 | \n",
" 12.123645 | \n",
" MONDO:0007363 | \n",
" 12.089953 | \n",
" False | \n",
"
\n",
" \n",
" 26 | \n",
" MONDO:0017309 | \n",
" 12.097610 | \n",
" MONDO:0009511 | \n",
" 12.071908 | \n",
" False | \n",
"
\n",
" \n",
" 27 | \n",
" MONDO:0007363 | \n",
" 12.089953 | \n",
" MONDO:0054813 | \n",
" 12.060745 | \n",
" False | \n",
"
\n",
" \n",
" 28 | \n",
" MONDO:0009511 | \n",
" 12.071908 | \n",
" MONDO:0007537 | \n",
" 12.039003 | \n",
" False | \n",
"
\n",
" \n",
" 29 | \n",
" MONDO:0054813 | \n",
" 12.060745 | \n",
" MONDO:0013754 | \n",
" 12.003640 | \n",
" False | \n",
"
\n",
" \n",
" 30 | \n",
" MONDO:0007537 | \n",
" 12.039003 | \n",
" MONDO:0859177 | \n",
" 11.883261 | \n",
" False | \n",
"
\n",
" \n",
" 31 | \n",
" MONDO:0007372 | \n",
" 12.017054 | \n",
" MONDO:0034021 | \n",
" 11.837237 | \n",
" False | \n",
"
\n",
" \n",
" 32 | \n",
" MONDO:0013754 | \n",
" 12.003640 | \n",
" MONDO:0007160 | \n",
" 11.786118 | \n",
" False | \n",
"
\n",
" \n",
" 33 | \n",
" MONDO:0007720 | \n",
" 11.898544 | \n",
" MONDO:0015230 | \n",
" 11.781601 | \n",
" False | \n",
"
\n",
" \n",
" 34 | \n",
" MONDO:0859177 | \n",
" 11.883261 | \n",
" MONDO:0008678 | \n",
" 11.781164 | \n",
" False | \n",
"
\n",
" \n",
" 35 | \n",
" MONDO:0034021 | \n",
" 11.837237 | \n",
" MONDO:0019019 | \n",
" 11.739629 | \n",
" False | \n",
"
\n",
" \n",
" 36 | \n",
" MONDO:0012569 | \n",
" 11.798970 | \n",
" MONDO:0010753 | \n",
" 11.735383 | \n",
" False | \n",
"
\n",
" \n",
" 37 | \n",
" MONDO:0011915 | \n",
" 11.798970 | \n",
" MONDO:0013051 | \n",
" 11.722570 | \n",
" False | \n",
"
\n",
" \n",
" 38 | \n",
" MONDO:0007160 | \n",
" 11.786118 | \n",
" MONDO:0027462 | \n",
" 11.721282 | \n",
" False | \n",
"
\n",
" \n",
" 39 | \n",
" MONDO:0015230 | \n",
" 11.781601 | \n",
" MONDO:0007057 | \n",
" 11.705854 | \n",
" False | \n",
"
\n",
" \n",
" 40 | \n",
" MONDO:0008678 | \n",
" 11.781164 | \n",
" MONDO:0017806 | \n",
" 11.662459 | \n",
" False | \n",
"
\n",
" \n",
" 41 | \n",
" MONDO:0019019 | \n",
" 11.739629 | \n",
" MONDO:0014139 | \n",
" 11.641001 | \n",
" False | \n",
"
\n",
" \n",
" 42 | \n",
" MONDO:0010753 | \n",
" 11.735383 | \n",
" MONDO:0009579 | \n",
" 11.625764 | \n",
" False | \n",
"
\n",
" \n",
" 43 | \n",
" MONDO:0013051 | \n",
" 11.722570 | \n",
" MONDO:0011147 | \n",
" 11.617690 | \n",
" False | \n",
"
\n",
" \n",
" 44 | \n",
" MONDO:0027462 | \n",
" 11.721282 | \n",
" MONDO:0859193 | \n",
" 11.614443 | \n",
" False | \n",
"
\n",
" \n",
" 45 | \n",
" MONDO:0007057 | \n",
" 11.705854 | \n",
" MONDO:0020681 | \n",
" 11.610557 | \n",
" False | \n",
"
\n",
" \n",
" 46 | \n",
" MONDO:0017806 | \n",
" 11.662459 | \n",
" MONDO:0012914 | \n",
" 11.609362 | \n",
" False | \n",
"
\n",
" \n",
" 47 | \n",
" MONDO:0014139 | \n",
" 11.641001 | \n",
" MONDO:0032707 | \n",
" 11.587106 | \n",
" False | \n",
"
\n",
" \n",
" 48 | \n",
" MONDO:0009579 | \n",
" 11.625764 | \n",
" MONDO:0007522 | \n",
" 11.585127 | \n",
" False | \n",
"
\n",
" \n",
" 49 | \n",
" MONDO:0011147 | \n",
" 11.617690 | \n",
" MONDO:0009242 | \n",
" 11.574202 | \n",
" False | \n",
"
\n",
" \n",
" 50 | \n",
" MONDO:0859193 | \n",
" 11.614443 | \n",
" MONDO:0060532 | \n",
" 11.572993 | \n",
" False | \n",
"
\n",
" \n",
" 51 | \n",
" MONDO:0020681 | \n",
" 11.610557 | \n",
" MONDO:0012773 | \n",
" 11.549693 | \n",
" False | \n",
"
\n",
" \n",
" 52 | \n",
" MONDO:0012914 | \n",
" 11.609362 | \n",
" MONDO:0010194 | \n",
" 11.547988 | \n",
" False | \n",
"
\n",
" \n",
" 53 | \n",
" MONDO:0032707 | \n",
" 11.587106 | \n",
" MONDO:0007525 | \n",
" 11.546833 | \n",
" False | \n",
"
\n",
" \n",
" 54 | \n",
" MONDO:0007522 | \n",
" 11.585127 | \n",
" MONDO:0009112 | \n",
" 11.545883 | \n",
" False | \n",
"
\n",
" \n",
" 55 | \n",
" MONDO:0009242 | \n",
" 11.574202 | \n",
" MONDO:0013800 | \n",
" 11.537000 | \n",
" False | \n",
"
\n",
" \n",
" 56 | \n",
" MONDO:0060532 | \n",
" 11.572993 | \n",
" MONDO:0017314 | \n",
" 11.535089 | \n",
" False | \n",
"
\n",
" \n",
" 57 | \n",
" MONDO:0012773 | \n",
" 11.549693 | \n",
" MONDO:0009318 | \n",
" 11.533214 | \n",
" False | \n",
"
\n",
" \n",
" 58 | \n",
" MONDO:0010194 | \n",
" 11.547988 | \n",
" MONDO:0014236 | \n",
" 11.530959 | \n",
" False | \n",
"
\n",
" \n",
" 59 | \n",
" MONDO:0007525 | \n",
" 11.546833 | \n",
" MONDO:0010208 | \n",
" 11.526944 | \n",
" False | \n",
"
\n",
" \n",
" 60 | \n",
" MONDO:0009112 | \n",
" 11.545883 | \n",
" MONDO:0026733 | \n",
" 11.514011 | \n",
" False | \n",
"
\n",
" \n",
" 61 | \n",
" MONDO:0013800 | \n",
" 11.537000 | \n",
" MONDO:0024535 | \n",
" 11.511612 | \n",
" False | \n",
"
\n",
" \n",
" 62 | \n",
" MONDO:0017314 | \n",
" 11.535089 | \n",
" MONDO:0009717 | \n",
" 11.485904 | \n",
" False | \n",
"
\n",
" \n",
" 63 | \n",
" MONDO:0009318 | \n",
" 11.533214 | \n",
" MONDO:0011106 | \n",
" 11.470966 | \n",
" False | \n",
"
\n",
" \n",
" 64 | \n",
" MONDO:0014236 | \n",
" 11.530959 | \n",
" MONDO:0008310 | \n",
" 11.469897 | \n",
" False | \n",
"
\n",
" \n",
" 65 | \n",
" MONDO:0010208 | \n",
" 11.526944 | \n",
" MONDO:0009363 | \n",
" 11.464871 | \n",
" False | \n",
"
\n",
" \n",
" 66 | \n",
" MONDO:0026733 | \n",
" 11.514011 | \n",
" MONDO:0010561 | \n",
" 11.456316 | \n",
" False | \n",
"
\n",
" \n",
" 67 | \n",
" MONDO:0024535 | \n",
" 11.511612 | \n",
" MONDO:0010261 | \n",
" 11.455513 | \n",
" False | \n",
"
\n",
" \n",
" 68 | \n",
" MONDO:0009717 | \n",
" 11.485904 | \n",
" MONDO:0027451 | \n",
" 11.449975 | \n",
" False | \n",
"
\n",
" \n",
" 69 | \n",
" MONDO:0011106 | \n",
" 11.470966 | \n",
" MONDO:0011142 | \n",
" 11.447775 | \n",
" False | \n",
"
\n",
" \n",
" 70 | \n",
" MONDO:0008310 | \n",
" 11.469897 | \n",
" MONDO:0009033 | \n",
" 11.441172 | \n",
" False | \n",
"
\n",
" \n",
" 71 | \n",
" MONDO:0009363 | \n",
" 11.464871 | \n",
" MONDO:0010653 | \n",
" 11.425481 | \n",
" False | \n",
"
\n",
" \n",
" 72 | \n",
" MONDO:0010561 | \n",
" 11.456316 | \n",
" MONDO:0859194 | \n",
" 11.411297 | \n",
" False | \n",
"
\n",
" \n",
" 73 | \n",
" MONDO:0010261 | \n",
" 11.455513 | \n",
" MONDO:0010441 | \n",
" 11.402327 | \n",
" False | \n",
"
\n",
" \n",
" 74 | \n",
" MONDO:0027451 | \n",
" 11.449975 | \n",
" MONDO:0007949 | \n",
" 11.401413 | \n",
" False | \n",
"
\n",
" \n",
" 75 | \n",
" MONDO:0011142 | \n",
" 11.447775 | \n",
" MONDO:0012342 | \n",
" 11.398925 | \n",
" False | \n",
"
\n",
" \n",
" 76 | \n",
" MONDO:0009033 | \n",
" 11.441172 | \n",
" MONDO:0010075 | \n",
" 11.390286 | \n",
" False | \n",
"
\n",
" \n",
" 77 | \n",
" MONDO:0010653 | \n",
" 11.425481 | \n",
" MONDO:0010310 | \n",
" 11.381537 | \n",
" False | \n",
"
\n",
" \n",
" 78 | \n",
" MONDO:0859194 | \n",
" 11.411297 | \n",
" MONDO:0017569 | \n",
" 11.376257 | \n",
" False | \n",
"
\n",
" \n",
" 79 | \n",
" MONDO:0010441 | \n",
" 11.402327 | \n",
" MONDO:0009926 | \n",
" 11.375838 | \n",
" False | \n",
"
\n",
" \n",
" 80 | \n",
" MONDO:0007949 | \n",
" 11.401413 | \n",
" MONDO:0018923 | \n",
" 11.357222 | \n",
" False | \n",
"
\n",
" \n",
" 81 | \n",
" MONDO:0012342 | \n",
" 11.398925 | \n",
" MONDO:0007524 | \n",
" 11.354872 | \n",
" False | \n",
"
\n",
" \n",
" 82 | \n",
" MONDO:0010075 | \n",
" 11.390286 | \n",
" MONDO:0014700 | \n",
" 11.352468 | \n",
" False | \n",
"
\n",
" \n",
" 83 | \n",
" MONDO:0010310 | \n",
" 11.381537 | \n",
" MONDO:0009566 | \n",
" 11.350380 | \n",
" False | \n",
"
\n",
" \n",
" 84 | \n",
" MONDO:0017569 | \n",
" 11.376257 | \n",
" MONDO:0011244 | \n",
" 11.348228 | \n",
" False | \n",
"
\n",
" \n",
" 85 | \n",
" MONDO:0009926 | \n",
" 11.375838 | \n",
" MONDO:0019571 | \n",
" 11.347518 | \n",
" False | \n",
"
\n",
" \n",
" 86 | \n",
" MONDO:0018923 | \n",
" 11.357222 | \n",
" MONDO:0008999 | \n",
" 11.346674 | \n",
" False | \n",
"
\n",
" \n",
" 87 | \n",
" MONDO:0009910 | \n",
" 11.355538 | \n",
" MONDO:0011604 | \n",
" 11.338133 | \n",
" False | \n",
"
\n",
" \n",
" 88 | \n",
" MONDO:0007524 | \n",
" 11.354872 | \n",
" MONDO:0012496 | \n",
" 11.332304 | \n",
" False | \n",
"
\n",
" \n",
" 89 | \n",
" MONDO:0014700 | \n",
" 11.352468 | \n",
" MONDO:0010571 | \n",
" 11.330400 | \n",
" False | \n",
"
\n",
" \n",
" 90 | \n",
" MONDO:0009566 | \n",
" 11.350380 | \n",
" MONDO:0007738 | \n",
" 11.330224 | \n",
" False | \n",
"
\n",
" \n",
" 91 | \n",
" MONDO:0011244 | \n",
" 11.348228 | \n",
" MONDO:0012873 | \n",
" 11.328784 | \n",
" False | \n",
"
\n",
" \n",
" 92 | \n",
" MONDO:0019571 | \n",
" 11.347518 | \n",
" MONDO:0010590 | \n",
" 11.325360 | \n",
" False | \n",
"
\n",
" \n",
" 93 | \n",
" MONDO:0008999 | \n",
" 11.346674 | \n",
" MONDO:0009161 | \n",
" 11.321351 | \n",
" False | \n",
"
\n",
" \n",
" 94 | \n",
" MONDO:0011604 | \n",
" 11.338133 | \n",
" MONDO:0009052 | \n",
" 11.312873 | \n",
" False | \n",
"
\n",
" \n",
" 95 | \n",
" MONDO:0014379 | \n",
" 11.337413 | \n",
" MONDO:0010650 | \n",
" 11.307532 | \n",
" False | \n",
"
\n",
" \n",
" 96 | \n",
" MONDO:0012496 | \n",
" 11.332304 | \n",
" MONDO:0012740 | \n",
" 11.307180 | \n",
" False | \n",
"
\n",
" \n",
" 97 | \n",
" MONDO:0010571 | \n",
" 11.330400 | \n",
" MONDO:0011493 | \n",
" 11.303764 | \n",
" False | \n",
"
\n",
" \n",
" 98 | \n",
" MONDO:0007738 | \n",
" 11.330224 | \n",
" MONDO:0012853 | \n",
" 11.302427 | \n",
" False | \n",
"
\n",
" \n",
" 99 | \n",
" MONDO:0012873 | \n",
" 11.328784 | \n",
" MONDO:0007523 | \n",
" 11.299591 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Full_search Full_score Hybrid_search Hybrid_score MATCH\n",
"0 MONDO:0007947 16.221826 MONDO:0007947 16.221826 True\n",
"1 MONDO:0013897 13.967443 MONDO:0013897 13.967443 True\n",
"2 MONDO:0013426 13.811724 MONDO:0013426 13.811724 True\n",
"3 MONDO:0011431 13.382852 MONDO:0011431 13.382852 True\n",
"4 MONDO:0012427 13.208601 MONDO:0012427 13.208601 True\n",
"5 MONDO:0014514 13.087608 MONDO:0014514 13.087608 True\n",
"6 MONDO:0014950 13.018393 MONDO:0014950 13.018393 True\n",
"7 MONDO:0012171 12.959062 MONDO:0012171 12.959062 True\n",
"8 MONDO:0030731 12.957262 MONDO:0014262 12.820997 False\n",
"9 MONDO:0014262 12.820997 MONDO:0010515 12.669963 False\n",
"10 MONDO:0024529 12.720854 MONDO:0016002 12.539704 False\n",
"11 MONDO:0010515 12.669963 MONDO:0019625 12.517714 False\n",
"12 MONDO:0016002 12.539704 MONDO:0009159 12.460405 False\n",
"13 MONDO:0019625 12.517714 MONDO:0019354 12.449903 False\n",
"14 MONDO:0009159 12.460405 MONDO:0008818 12.360394 False\n",
"15 MONDO:0019354 12.449903 MONDO:0014831 12.351288 False\n",
"16 MONDO:0008818 12.360394 MONDO:0008426 12.322358 False\n",
"17 MONDO:0014831 12.351288 MONDO:0018954 12.258602 False\n",
"18 MONDO:0008426 12.322358 MONDO:0030500 12.234267 False\n",
"19 MONDO:0018954 12.258602 MONDO:0859151 12.224425 False\n",
"20 MONDO:0030500 12.234267 MONDO:0019567 12.207587 False\n",
"21 MONDO:0859151 12.224425 MONDO:0012212 12.179639 False\n",
"22 MONDO:0019567 12.207587 MONDO:0034024 12.154007 False\n",
"23 MONDO:0012212 12.179639 MONDO:0012013 12.123645 False\n",
"24 MONDO:0034024 12.154007 MONDO:0017309 12.097610 False\n",
"25 MONDO:0012013 12.123645 MONDO:0007363 12.089953 False\n",
"26 MONDO:0017309 12.097610 MONDO:0009511 12.071908 False\n",
"27 MONDO:0007363 12.089953 MONDO:0054813 12.060745 False\n",
"28 MONDO:0009511 12.071908 MONDO:0007537 12.039003 False\n",
"29 MONDO:0054813 12.060745 MONDO:0013754 12.003640 False\n",
"30 MONDO:0007537 12.039003 MONDO:0859177 11.883261 False\n",
"31 MONDO:0007372 12.017054 MONDO:0034021 11.837237 False\n",
"32 MONDO:0013754 12.003640 MONDO:0007160 11.786118 False\n",
"33 MONDO:0007720 11.898544 MONDO:0015230 11.781601 False\n",
"34 MONDO:0859177 11.883261 MONDO:0008678 11.781164 False\n",
"35 MONDO:0034021 11.837237 MONDO:0019019 11.739629 False\n",
"36 MONDO:0012569 11.798970 MONDO:0010753 11.735383 False\n",
"37 MONDO:0011915 11.798970 MONDO:0013051 11.722570 False\n",
"38 MONDO:0007160 11.786118 MONDO:0027462 11.721282 False\n",
"39 MONDO:0015230 11.781601 MONDO:0007057 11.705854 False\n",
"40 MONDO:0008678 11.781164 MONDO:0017806 11.662459 False\n",
"41 MONDO:0019019 11.739629 MONDO:0014139 11.641001 False\n",
"42 MONDO:0010753 11.735383 MONDO:0009579 11.625764 False\n",
"43 MONDO:0013051 11.722570 MONDO:0011147 11.617690 False\n",
"44 MONDO:0027462 11.721282 MONDO:0859193 11.614443 False\n",
"45 MONDO:0007057 11.705854 MONDO:0020681 11.610557 False\n",
"46 MONDO:0017806 11.662459 MONDO:0012914 11.609362 False\n",
"47 MONDO:0014139 11.641001 MONDO:0032707 11.587106 False\n",
"48 MONDO:0009579 11.625764 MONDO:0007522 11.585127 False\n",
"49 MONDO:0011147 11.617690 MONDO:0009242 11.574202 False\n",
"50 MONDO:0859193 11.614443 MONDO:0060532 11.572993 False\n",
"51 MONDO:0020681 11.610557 MONDO:0012773 11.549693 False\n",
"52 MONDO:0012914 11.609362 MONDO:0010194 11.547988 False\n",
"53 MONDO:0032707 11.587106 MONDO:0007525 11.546833 False\n",
"54 MONDO:0007522 11.585127 MONDO:0009112 11.545883 False\n",
"55 MONDO:0009242 11.574202 MONDO:0013800 11.537000 False\n",
"56 MONDO:0060532 11.572993 MONDO:0017314 11.535089 False\n",
"57 MONDO:0012773 11.549693 MONDO:0009318 11.533214 False\n",
"58 MONDO:0010194 11.547988 MONDO:0014236 11.530959 False\n",
"59 MONDO:0007525 11.546833 MONDO:0010208 11.526944 False\n",
"60 MONDO:0009112 11.545883 MONDO:0026733 11.514011 False\n",
"61 MONDO:0013800 11.537000 MONDO:0024535 11.511612 False\n",
"62 MONDO:0017314 11.535089 MONDO:0009717 11.485904 False\n",
"63 MONDO:0009318 11.533214 MONDO:0011106 11.470966 False\n",
"64 MONDO:0014236 11.530959 MONDO:0008310 11.469897 False\n",
"65 MONDO:0010208 11.526944 MONDO:0009363 11.464871 False\n",
"66 MONDO:0026733 11.514011 MONDO:0010561 11.456316 False\n",
"67 MONDO:0024535 11.511612 MONDO:0010261 11.455513 False\n",
"68 MONDO:0009717 11.485904 MONDO:0027451 11.449975 False\n",
"69 MONDO:0011106 11.470966 MONDO:0011142 11.447775 False\n",
"70 MONDO:0008310 11.469897 MONDO:0009033 11.441172 False\n",
"71 MONDO:0009363 11.464871 MONDO:0010653 11.425481 False\n",
"72 MONDO:0010561 11.456316 MONDO:0859194 11.411297 False\n",
"73 MONDO:0010261 11.455513 MONDO:0010441 11.402327 False\n",
"74 MONDO:0027451 11.449975 MONDO:0007949 11.401413 False\n",
"75 MONDO:0011142 11.447775 MONDO:0012342 11.398925 False\n",
"76 MONDO:0009033 11.441172 MONDO:0010075 11.390286 False\n",
"77 MONDO:0010653 11.425481 MONDO:0010310 11.381537 False\n",
"78 MONDO:0859194 11.411297 MONDO:0017569 11.376257 False\n",
"79 MONDO:0010441 11.402327 MONDO:0009926 11.375838 False\n",
"80 MONDO:0007949 11.401413 MONDO:0018923 11.357222 False\n",
"81 MONDO:0012342 11.398925 MONDO:0007524 11.354872 False\n",
"82 MONDO:0010075 11.390286 MONDO:0014700 11.352468 False\n",
"83 MONDO:0010310 11.381537 MONDO:0009566 11.350380 False\n",
"84 MONDO:0017569 11.376257 MONDO:0011244 11.348228 False\n",
"85 MONDO:0009926 11.375838 MONDO:0019571 11.347518 False\n",
"86 MONDO:0018923 11.357222 MONDO:0008999 11.346674 False\n",
"87 MONDO:0009910 11.355538 MONDO:0011604 11.338133 False\n",
"88 MONDO:0007524 11.354872 MONDO:0012496 11.332304 False\n",
"89 MONDO:0014700 11.352468 MONDO:0010571 11.330400 False\n",
"90 MONDO:0009566 11.350380 MONDO:0007738 11.330224 False\n",
"91 MONDO:0011244 11.348228 MONDO:0012873 11.328784 False\n",
"92 MONDO:0019571 11.347518 MONDO:0010590 11.325360 False\n",
"93 MONDO:0008999 11.346674 MONDO:0009161 11.321351 False\n",
"94 MONDO:0011604 11.338133 MONDO:0009052 11.312873 False\n",
"95 MONDO:0014379 11.337413 MONDO:0010650 11.307532 False\n",
"96 MONDO:0012496 11.332304 MONDO:0012740 11.307180 False\n",
"97 MONDO:0010571 11.330400 MONDO:0011493 11.303764 False\n",
"98 MONDO:0007738 11.330224 MONDO:0012853 11.302427 False\n",
"99 MONDO:0012873 11.328784 MONDO:0007523 11.299591 False"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_full = pd.DataFrame(result_full_curie_score, columns=[\"Full_search\", \"Full_score\"])\n",
"df_hybrid = pd.DataFrame(result_hybrid_curie_score, columns=[\"Hybrid_search\", \"Hybrid_score\"])\n",
"df = pd.concat([df_full, df_hybrid], axis=1)\n",
"df[\"MATCH\"] = df[\"Full_search\"] == df[\"Hybrid_search\"]\n",
"df\n"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "6418f222-6ff4-4bc8-b523-8505bdeae547",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Percentage of common terms: 92.0%\n"
]
}
],
"source": [
"# Extract the first element (MONDO ID) from each sublist in result_1 and result_2\n",
"search_result_1 = set([item[0] for item in result_full_curie_score])\n",
"search_result_2 = set([item[0] for item in result_hybrid_curie_score])\n",
"\n",
"# Find the intersection of the two sets\n",
"common_results = search_result_1.intersection(search_result_2)\n",
"\n",
"# Calculate the percentage of common terms\n",
"percentage_common = (len(common_results) / len(search_result_1)) * 100\n",
"\n",
"print(f\"Percentage of common terms: {percentage_common}%\")\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "3b075f06-b218-4bfe-aa4d-f8c48ad12324",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'90.0%'"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"calculate_overlap(df, 10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8160ec1f-27f3-468e-9119-c84792e1498c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}