{
"cells": [
{
"cell_type": "code",
"execution_count": 8,
"id": "d7a838ab-eccb-48a0-bfda-bb9fecbe1dec",
"metadata": {},
"outputs": [],
"source": [
"%reset -f"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "450a2da2-b889-4fcc-a6f0-90cf71d78c7a",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"# Build the database path from the current user's home directory instead of a\n",
"# hardcoded /Users/<name> prefix, so the notebook also runs on other machines.\n",
"user = Path.home().name\n",
"# Kept as a plain string: it is interpolated into the %sql connection URL and\n",
"# passed to sqlite3.connect() and Semsimian(resource_path=...) in later cells.\n",
"db = str(Path.home() / \".data\" / \"oaklib\" / \"phenio.db\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b955bde6-60c8-4ba7-83ed-4383f3161176",
"metadata": {},
"outputs": [],
"source": [
"%reload_ext sql\n",
"%sql sqlite:///{db}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "96a7a5a6-a3c4-4242-9ed6-2be7c31efd81",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" * sqlite:////Users/jtr4v/.data/oaklib/phenio.db\n",
"Done.\n"
]
},
{
"data": {
"text/html": [
"
\n",
" \n",
" \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" uuid:70269c5c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0001518 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" uuid:70269c5d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0032792 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" uuid:70269c5e-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011451 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
"
],
"text/plain": [
"[('uuid:70269c5a-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011097', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n",
" ('uuid:70269c5b-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0002187', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n",
" ('uuid:70269c5c-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0001518', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n",
" ('uuid:70269c5d-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0032792', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n",
" ('uuid:70269c5e-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011451', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations')]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%sql SELECT * FROM term_association LIMIT 5;"
]
},
{
"cell_type": "markdown",
"id": "dba6b3e5-d637-48fb-a7b1-c29356c1f3ef",
"metadata": {},
"source": [
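"### Python\n",
"\n",
"Query the same database with `sqlite3` and pandas, then score HPO term pairs with Semsimian."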
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "723d28cb-8ac6-4eb2-bea8-a1626677bc51",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"import pandas as pd\n",
"import numpy as np\n",
"# Set the option to None to display all rows\n",
"# pd.set_option('display.max_rows', None)\n",
"\n",
"from semsimian import Semsimian"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ebd327bb-e641-459d-bd37-7d96a9036f67",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('term_association',),\n",
" ('has_oio_synonym_statement',),\n",
" ('anonymous_expression',),\n",
" ('anonymous_class_expression',),\n",
" ('anonymous_property_expression',),\n",
" ('anonymous_individual_expression',),\n",
" ('owl_restriction',),\n",
" ('owl_complex_axiom',),\n",
" ('prefix',),\n",
" ('rdf_list_statement',),\n",
" ('rdf_level_summary_statistic',),\n",
" ('relation_graph_construct',),\n",
" ('subgraph_query',),\n",
" ('entailed_edge',),\n",
" ('repair_action',),\n",
" ('problem',),\n",
" ('lexical_problem',),\n",
" ('statements',)]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Open a direct sqlite3 connection to the database file configured in `db`.\n",
"conn = sqlite3.connect(db)\n",
"\n",
"# sqlite_master is SQLite's schema catalogue; keep only the rows that describe tables.\n",
"table_query = \"SELECT name FROM sqlite_master WHERE type='table';\"\n",
"res = conn.execute(table_query)\n",
"tables = res.fetchall()\n",
"\n",
"tables"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "08aac28f-997f-4864-8bff-6ffcc5ae73c4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 1 | \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 2 | \n",
" uuid:70269c5c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0001518 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 3 | \n",
" uuid:70269c5d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0032792 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 4 | \n",
" uuid:70269c5e-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011451 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id subject \\\n",
"0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"\n",
" predicate object evidence_type publication \\\n",
"0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n",
"1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n",
"2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n",
"3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n",
"4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n",
"\n",
" source \n",
"0 infores:hpo-annotations \n",
"1 infores:hpo-annotations \n",
"2 infores:hpo-annotations \n",
"3 infores:hpo-annotations \n",
"4 infores:hpo-annotations "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the whole association table into pandas once; later cells filter it in memory.\n",
"term_association_query = \"SELECT * FROM term_association\"\n",
"df_term_association = pd.read_sql_query(term_association_query, conn)\n",
"df_term_association.head()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f19aa997-02db-4d3a-9c2b-437e77430a61",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['MONDO', 'HGNC', 'WB', 'MGI', 'RGD', 'Xenbase', 'ZFIN'],\n",
" dtype=object)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct CURIE prefixes (the text before the first ':') among association subjects.\n",
"df_term_association['subject'].str.partition(\":\")[0].unique()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "f0aefb13-1a00-48d2-a68f-d660bb393ec0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['HP', 'WBPhenotype', 'MP', 'XPO', 'ZP'], dtype=object)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct CURIE prefixes (the text before the first ':') among association objects.\n",
"df_term_association['object'].str.partition(\":\")[0].unique()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "e1755b79-d6b9-4ce6-9bb7-1ee2a331e11e",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 1 | \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 2 | \n",
" uuid:70269c5c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0001518 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 3 | \n",
" uuid:70269c5d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0032792 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 4 | \n",
" uuid:70269c5e-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011451 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 240650 | \n",
" uuid:7e6fe369-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0009033 | \n",
" biolink:has_phenotype | \n",
" HP:0001831 | \n",
" ECO:0000304 | \n",
" orphanet:1777 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 240651 | \n",
" uuid:7e6fe36a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0009033 | \n",
" biolink:has_phenotype | \n",
" HP:0002970 | \n",
" ECO:0000304 | \n",
" orphanet:1777 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 240652 | \n",
" uuid:7e6fe36b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0009033 | \n",
" biolink:has_phenotype | \n",
" HP:0004209 | \n",
" ECO:0000304 | \n",
" orphanet:1777 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 240653 | \n",
" uuid:7e6fe36c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0009033 | \n",
" biolink:has_phenotype | \n",
" HP:0005692 | \n",
" ECO:0000304 | \n",
" orphanet:1777 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 240654 | \n",
" uuid:7e6fe36d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0009033 | \n",
" biolink:has_phenotype | \n",
" HP:0007370 | \n",
" ECO:0000304 | \n",
" orphanet:1777 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
\n",
"
240643 rows × 7 columns
\n",
"
"
],
"text/plain": [
" id subject \\\n",
"0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"... ... ... \n",
"240650 uuid:7e6fe369-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n",
"240651 uuid:7e6fe36a-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n",
"240652 uuid:7e6fe36b-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n",
"240653 uuid:7e6fe36c-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n",
"240654 uuid:7e6fe36d-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n",
"\n",
" predicate object evidence_type publication \\\n",
"0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n",
"1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n",
"2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n",
"3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n",
"4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n",
"... ... ... ... ... \n",
"240650 biolink:has_phenotype HP:0001831 ECO:0000304 orphanet:1777 \n",
"240651 biolink:has_phenotype HP:0002970 ECO:0000304 orphanet:1777 \n",
"240652 biolink:has_phenotype HP:0004209 ECO:0000304 orphanet:1777 \n",
"240653 biolink:has_phenotype HP:0005692 ECO:0000304 orphanet:1777 \n",
"240654 biolink:has_phenotype HP:0007370 ECO:0000304 orphanet:1777 \n",
"\n",
" source \n",
"0 infores:hpo-annotations \n",
"1 infores:hpo-annotations \n",
"2 infores:hpo-annotations \n",
"3 infores:hpo-annotations \n",
"4 infores:hpo-annotations \n",
"... ... \n",
"240650 infores:hpo-annotations \n",
"240651 infores:hpo-annotations \n",
"240652 infores:hpo-annotations \n",
"240653 infores:hpo-annotations \n",
"240654 infores:hpo-annotations \n",
"\n",
"[240643 rows x 7 columns]"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows whose subject is a MONDO term; pandas truncates the display of the large result.\n",
"is_mondo_subject = df_term_association['subject'].str.startswith(\"MONDO:\")\n",
"df_term_association.loc[is_mondo_subject]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "81556e70-f889-483f-b496-ddbd51baab08",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" subject | \n",
" predicate | \n",
" object | \n",
" evidence_type | \n",
" publication | \n",
" source | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" uuid:70269c5a-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011097 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 1 | \n",
" uuid:70269c5b-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0002187 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 2 | \n",
" uuid:70269c5c-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0001518 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 3 | \n",
" uuid:70269c5d-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0032792 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
" 4 | \n",
" uuid:70269c5e-42a9-11ee-be37-31ef105c25ea | \n",
" MONDO:0023659 | \n",
" biolink:has_phenotype | \n",
" HP:0011451 | \n",
" ECO:0000269 | \n",
" PMID:31675180 | \n",
" infores:hpo-annotations | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id subject \\\n",
"0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n",
"\n",
" predicate object evidence_type publication \\\n",
"0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n",
"1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n",
"2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n",
"3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n",
"4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n",
"\n",
" source \n",
"0 infores:hpo-annotations \n",
"1 infores:hpo-annotations \n",
"2 infores:hpo-annotations \n",
"3 infores:hpo-annotations \n",
"4 infores:hpo-annotations "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only associations that link a MONDO subject to an HP object.\n",
"is_mondo_subject = df_term_association['subject'].str.startswith(\"MONDO:\")\n",
"is_hp_object = df_term_association['object'].str.startswith(\"HP:\")\n",
"\n",
"df = df_term_association.loc[is_mondo_subject & is_hp_object]\n",
"\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "81d9f3fe-1f76-4bae-af5e-d9c47da0e13d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'HP:0000042',\n",
" 'HP:0002453',\n",
" 'HP:0004233',\n",
" 'HP:0007041',\n",
" 'HP:0007734',\n",
" 'HP:0008523',\n",
" 'HP:0010037',\n",
" 'HP:0011193',\n",
" 'HP:0011787',\n",
" 'HP:0410275'}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Seed NumPy's global RNG so the sampled terms are the same on every run.\n",
"np.random.seed(0)\n",
"\n",
"# Draw ten distinct HP terms (no replacement) from the objects of the filtered frame.\n",
"unique_hp_terms = df['object'].unique()\n",
"random_hp_1 = np.random.choice(unique_hp_terms, size=10, replace=False)\n",
"set(random_hp_1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5f698837-f6c0-4627-b3f1-bf27007c3b61",
"metadata": {},
"outputs": [],
"source": [
"# Disabled alternative: draw a second random sample with a different seed.\n",
"# np.random.seed(10)\n",
"# random_hp_2 = np.random.choice(df['object'].unique(), size = 10, replace=False)\n",
"# set(random_hp_2)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "66f86458-52cc-419b-a629-55615e9b5687",
"metadata": {},
"outputs": [],
"source": [
"# HP term IDs used as the subject side of the Semsimian all-by-all comparison.\n",
"# NOTE(review): the provenance of this list is not recorded in the notebook.\n",
"hp_1 = [\n",
"    \"HP:0003394\",\n",
"    \"HP:0003771\",\n",
"    \"HP:0012378\",\n",
"    \"HP:0012450\",\n",
"    \"HP:0000974\",\n",
"    \"HP:0001027\",\n",
"    \"HP:0001030\",\n",
"    \"HP:0001065\",\n",
"    \"HP:0001073\",\n",
"    \"HP:0001075\",\n",
"    \"HP:0002761\",\n",
"    \"HP:0001386\",\n",
"    \"HP:0001537\",\n",
"    \"HP:0001622\",\n",
"    \"HP:0001760\",\n",
"    \"HP:0001762\",\n",
"    \"HP:0001763\",\n",
"    \"HP:0001788\",\n",
"    \"HP:0002035\",\n",
"    \"HP:0002036\",\n",
"    \"HP:0002616\",\n",
"    \"HP:0002650\",\n",
"    \"HP:0002758\",\n",
"    \"HP:0002827\",\n",
"    \"HP:0002829\",\n",
"    \"HP:0002999\",\n",
"    \"HP:0003010\",\n",
"    \"HP:0003083\",\n",
"    \"HP:0003834\",\n",
"    \"HP:0000938\",\n",
"    \"HP:0001058\",\n",
"    \"HP:0001252\",\n",
"    \"HP:0001324\",\n",
"    \"HP:0002013\",\n",
"    \"HP:0002018\",\n",
"    \"HP:0002020\",\n",
"    \"HP:0000015\",\n",
"    \"HP:0000023\",\n",
"    \"HP:0000139\",\n",
"    \"HP:0000286\",\n",
"    \"HP:0000481\",\n",
"    \"HP:0000978\",\n",
"    \"HP:0000993\",\n",
"    \"HP:0001063\",\n",
"    \"HP:0004872\",\n",
"    \"HP:0004944\",\n",
"    \"HP:0004947\",\n",
"    \"HP:0001270\",\n",
"    \"HP:0005294\",\n",
"    \"HP:0006243\",\n",
"    \"HP:0007495\",\n",
"    \"HP:0009763\",\n",
"    \"HP:0010749\",\n",
"    \"HP:0010750\",\n",
"    \"HP:0010754\",\n",
"    \"HP:0025014\",\n",
"    \"HP:0025019\",\n",
"    \"HP:0025509\",\n",
"    \"HP:0030009\",\n",
"    \"HP:0031364\",\n",
"    \"HP:0031653\",\n",
"    \"HP:0001278\",\n",
"    \"HP:0001634\",\n",
"    \"HP:0001653\",\n",
"    \"HP:0001704\",\n",
"    \"HP:0002315\",\n",
"]\n",
"\n",
"# HP term IDs used as the object side of the same comparison.\n",
"hp_2 = [\n",
"    \"HP:0003645\",\n",
"    \"HP:0005261\",\n",
"    \"HP:0002758\",\n",
"    \"HP:0003125\",\n",
"    \"HP:0001892\",\n",
"    \"HP:0001934\",\n",
"    \"HP:0000967\",\n",
"    \"HP:0000978\",\n",
"    \"HP:0000979\",\n",
"    \"HP:0040242\",\n",
"    \"HP:0007420\",\n",
"    \"HP:0030140\",\n",
"    \"HP:0001386\",\n",
"    \"HP:0002829\",\n",
"    \"HP:0002239\",\n",
"    \"HP:0011889\",\n",
"    \"HP:0001907\",\n",
"    \"HP:0012223\",\n",
"    \"HP:0009811\",\n",
"    \"HP:0012233\",\n",
"    \"HP:0030746\",\n",
"    \"HP:0002170\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "f7cd8388-abc3-4fc9-913e-6c79d8f1d7ad",
"metadata": {},
"outputs": [],
"source": [
"# Distinct subject IDs of the filtered frame; building a set already removes duplicates.\n",
"mondo = set(df['subject'])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "5072b5ad-b7ae-4a31-aa2d-f7c22cd0b7d6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 43.5 s, sys: 1.89 s, total: 45.4 s\n",
"Wall time: 46.4 s\n"
]
}
],
"source": [
"%%time\n",
"# Predicates handed to Semsimian; the scorer reads the graph straight from the SQLite file in `db`.\n",
"predicates = [\"rdfs:subClassOf\", \"BFO:0000050\", \"UPHENO:0000001\"]\n",
"rss = Semsimian(\n",
"    spo=None,\n",
"    predicates=predicates,\n",
"    pairwise_similarity_attributes=None,\n",
"    resource_path=db,\n",
")\n",
"\n",
"# Score every hp_1 term against every hp_2 term; zero thresholds keep all pairs.\n",
"all_x_all = rss.all_by_all_pairwise_similarity(\n",
"    subject_terms=set(hp_1),\n",
"    object_terms=set(hp_2),\n",
"    minimum_jaccard_threshold=0,\n",
"    minimum_resnik_threshold=0,\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "104d7245-044e-4696-8f7b-51c65b1806fa",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1.61 ms, sys: 241 µs, total: 1.85 ms\n",
"Wall time: 2 ms\n"
]
}
],
"source": [
"%%time\n",
"\n",
"rows_list = []\n",
"for term1_key, values in all_x_all.items():\n",
" for term2_key, result in values.items():\n",
" jaccard, resnik, phenodigm, _, _ = result\n",
" row_dict = {\"subject\": term1_key, \"object\": term2_key, \"jaccard\": jaccard, \"aic\": resnik, \"phenodigm\": phenodigm}\n",
" rows_list.append(row_dict)\n",
"\n",
"new_df = pd.DataFrame(rows_list)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "40ef069a-6b31-4b5c-a749-93ebfeb5dc8d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" subject | \n",
" object | \n",
" jaccard | \n",
" aic | \n",
" phenodigm | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" HP:0001386 | \n",
" HP:0001386 | \n",
" 1.000000 | \n",
" 18.008877 | \n",
" 4.243687 | \n",
"
\n",
" \n",
" 1 | \n",
" HP:0000978 | \n",
" HP:0000978 | \n",
" 1.000000 | \n",
" 18.008877 | \n",
" 4.243687 | \n",
"
\n",
" \n",
" 2 | \n",
" HP:0002829 | \n",
" HP:0002829 | \n",
" 1.000000 | \n",
" 17.008877 | \n",
" 4.124182 | \n",
"
\n",
" \n",
" 3 | \n",
" HP:0002758 | \n",
" HP:0002758 | \n",
" 1.000000 | \n",
" 14.838952 | \n",
" 3.852136 | \n",
"
\n",
" \n",
" 4 | \n",
" HP:0031364 | \n",
" HP:0000967 | \n",
" 0.975309 | \n",
" 16.423915 | \n",
" 4.002298 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 1447 | \n",
" HP:0003083 | \n",
" HP:0002239 | \n",
" 0.116751 | \n",
" 4.134184 | \n",
" 0.694745 | \n",
"
\n",
" \n",
" 1448 | \n",
" HP:0006243 | \n",
" HP:0002829 | \n",
" 0.112583 | \n",
" 4.010992 | \n",
" 0.671989 | \n",
"
\n",
" \n",
" 1449 | \n",
" HP:0006243 | \n",
" HP:0003125 | \n",
" 0.111111 | \n",
" 4.134184 | \n",
" 0.677756 | \n",
"
\n",
" \n",
" 1450 | \n",
" HP:0003083 | \n",
" HP:0002829 | \n",
" 0.106250 | \n",
" 4.010992 | \n",
" 0.652815 | \n",
"
\n",
" \n",
" 1451 | \n",
" HP:0003083 | \n",
" HP:0003125 | \n",
" 0.106061 | \n",
" 4.134184 | \n",
" 0.662174 | \n",
"
\n",
" \n",
"
\n",
"
1452 rows × 5 columns
\n",
"
"
],
"text/plain": [
" subject object jaccard aic phenodigm\n",
"0 HP:0001386 HP:0001386 1.000000 18.008877 4.243687\n",
"1 HP:0000978 HP:0000978 1.000000 18.008877 4.243687\n",
"2 HP:0002829 HP:0002829 1.000000 17.008877 4.124182\n",
"3 HP:0002758 HP:0002758 1.000000 14.838952 3.852136\n",
"4 HP:0031364 HP:0000967 0.975309 16.423915 4.002298\n",
"... ... ... ... ... ...\n",
"1447 HP:0003083 HP:0002239 0.116751 4.134184 0.694745\n",
"1448 HP:0006243 HP:0002829 0.112583 4.010992 0.671989\n",
"1449 HP:0006243 HP:0003125 0.111111 4.134184 0.677756\n",
"1450 HP:0003083 HP:0002829 0.106250 4.010992 0.652815\n",
"1451 HP:0003083 HP:0003125 0.106061 4.134184 0.662174\n",
"\n",
"[1452 rows x 5 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank pairs by Jaccard similarity, breaking ties with the 'aic' score (highest first).\n",
"# reset_index(drop=True) renumbers the rows without materialising the old index as a\n",
"# column, so no follow-up in-place drop of an \"index\" column is needed.\n",
"new_df_sorted = (\n",
"    new_df\n",
"    .sort_values(by=[\"jaccard\", \"aic\"], ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"new_df_sorted"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "195afbdc-6c10-4313-9514-573afe5feabc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 18.008877\n",
"2 17.008877\n",
"3 14.838952\n",
"4 16.423915\n",
"5 15.686949\n",
" ... \n",
"1079 5.079804\n",
"1124 4.253051\n",
"1194 4.403398\n",
"1204 4.217714\n",
"1256 6.722319\n",
"Name: aic, Length: 63, dtype: float64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First occurrence of each distinct 'aic' score, keeping its row label from the sorted frame.\n",
"aic_scores = new_df_sorted['aic']\n",
"aic_scores.drop_duplicates()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41bf2bfa-6ea1-4067-b8c4-0274a6ad3ed6",
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Plot only the ten highest-ranked pairs (new_df_sorted is ordered by jaccard, then aic).\n",
"df_subset = new_df_sorted.head(10)\n",
"\n",
"# Draw through the explicit Axes object rather than pyplot's implicit current-axes state.\n",
"fig, ax = plt.subplots()\n",
"\n",
"# Width of each bar; the two series are shifted half a width to either side of a tick.\n",
"bar_width = 0.35\n",
"positions = list(range(len(df_subset)))\n",
"\n",
"# 'jaccard' bars sit left of each tick, 'aic' bars right of it.\n",
"ax.bar([p - bar_width / 2 for p in positions], df_subset['jaccard'], width=bar_width, color='blue', label='jaccard')\n",
"ax.bar([p + bar_width / 2 for p in positions], df_subset['aic'], width=bar_width, color='green', label='aic')\n",
"\n",
"# Label each tick with the pair's subject term.\n",
"ax.set_xticks(positions)\n",
"ax.set_xticklabels(df_subset['subject'], rotation='vertical')\n",
"\n",
"# Write the pair's object term at the tick position, starting at the jaccard bar height.\n",
"for position, jaccard, object_term in zip(positions, df_subset['jaccard'], df_subset['object']):\n",
"    ax.text(x=position, y=jaccard, s=object_term, size=10, ha='center', va='bottom', rotation='vertical')\n",
"\n",
"ax.legend()\n",
"\n",
"ax.set_xlabel('Subject')\n",
"ax.set_ylabel('Values')\n",
"# The trailing semicolon keeps the Text repr returned by set_title out of the cell output.\n",
"ax.set_title('Top 10 Subjects vs Corresponding Objects');"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a08cea9-9c94-42fa-a25f-720c77339c6c",
"metadata": {},
"outputs": [],
"source": [
"df_subset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a215cf0-7895-4d9f-bfff-c4f32d84ffce",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}