{ "cells": [ { "cell_type": "code", "execution_count": 8, "id": "d7a838ab-eccb-48a0-bfda-bb9fecbe1dec", "metadata": {}, "outputs": [], "source": [ "%reset -f" ] }, { "cell_type": "code", "execution_count": 9, "id": "450a2da2-b889-4fcc-a6f0-90cf71d78c7a", "metadata": {}, "outputs": [], "source": [ "user = \"jtr4v\"\n", "db = f\"/Users/{user}/.data/oaklib/phenio.db\"" ] }, { "cell_type": "code", "execution_count": 10, "id": "b955bde6-60c8-4ba7-83ed-4383f3161176", "metadata": {}, "outputs": [], "source": [ "%reload_ext sql\n", "%sql sqlite:///{db}" ] }, { "cell_type": "code", "execution_count": 11, "id": "96a7a5a6-a3c4-4242-9ed6-2be7c31efd81", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " * sqlite:////Users/jtr4v/.data/oaklib/phenio.db\n", "Done.\n" ] }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
uuid:70269c5c-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0001518ECO:0000269PMID:31675180infores:hpo-annotations
uuid:70269c5d-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0032792ECO:0000269PMID:31675180infores:hpo-annotations
uuid:70269c5e-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011451ECO:0000269PMID:31675180infores:hpo-annotations
" ], "text/plain": [ "[('uuid:70269c5a-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011097', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n", " ('uuid:70269c5b-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0002187', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n", " ('uuid:70269c5c-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0001518', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n", " ('uuid:70269c5d-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0032792', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations'),\n", " ('uuid:70269c5e-42a9-11ee-be37-31ef105c25ea', 'MONDO:0023659', 'biolink:has_phenotype', 'HP:0011451', 'ECO:0000269', 'PMID:31675180', 'infores:hpo-annotations')]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%sql SELECT * FROM term_association LIMIT 5;" ] }, { "cell_type": "markdown", "id": "dba6b3e5-d637-48fb-a7b1-c29356c1f3ef", "metadata": {}, "source": [ "### Python" ] }, { "cell_type": "code", "execution_count": 12, "id": "723d28cb-8ac6-4eb2-bea8-a1626677bc51", "metadata": {}, "outputs": [], "source": [ "import sqlite3\n", "import pandas as pd\n", "import numpy as np\n", "# Set the option to None to display all rows\n", "# pd.set_option('display.max_rows', None)\n", "\n", "from semsimian import Semsimian" ] }, { "cell_type": "code", "execution_count": 13, "id": "ebd327bb-e641-459d-bd37-7d96a9036f67", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('term_association',),\n", " ('has_oio_synonym_statement',),\n", " ('anonymous_expression',),\n", " ('anonymous_class_expression',),\n", " ('anonymous_property_expression',),\n", " ('anonymous_individual_expression',),\n", " ('owl_restriction',),\n", " ('owl_complex_axiom',),\n", " ('prefix',),\n", " ('rdf_list_statement',),\n", " ('rdf_level_summary_statistic',),\n", " 
('relation_graph_construct',),\n", " ('subgraph_query',),\n", " ('entailed_edge',),\n", " ('repair_action',),\n", " ('problem',),\n", " ('lexical_problem',),\n", " ('statements',)]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Open the SQLite file at `db` and list the tables recorded in sqlite_master.\n", "conn = sqlite3.connect(db)\n", "table_names_sql = \"SELECT name FROM sqlite_master WHERE type='table';\"\n", "res = conn.execute(table_names_sql)\n", "tables = res.fetchall()\n", "\n", "tables" ] }, { "cell_type": "code", "execution_count": 14, "id": "08aac28f-997f-4864-8bff-6ffcc5ae73c4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
0uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
1uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
2uuid:70269c5c-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0001518ECO:0000269PMID:31675180infores:hpo-annotations
3uuid:70269c5d-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0032792ECO:0000269PMID:31675180infores:hpo-annotations
4uuid:70269c5e-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011451ECO:0000269PMID:31675180infores:hpo-annotations
\n", "
" ], "text/plain": [ " id subject \\\n", "0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "\n", " predicate object evidence_type publication \\\n", "0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n", "1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n", "2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n", "3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n", "4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n", "\n", " source \n", "0 infores:hpo-annotations \n", "1 infores:hpo-annotations \n", "2 infores:hpo-annotations \n", "3 infores:hpo-annotations \n", "4 infores:hpo-annotations " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Read the full term_association table through the open connection.\n", "df_term_association = pd.read_sql_query(sql=\"SELECT * FROM term_association\", con=conn)\n", "df_term_association.head()" ] }, { "cell_type": "code", "execution_count": 15, "id": "f19aa997-02db-4d3a-9c2b-437e77430a61", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['MONDO', 'HGNC', 'WB', 'MGI', 'RGD', 'Xenbase', 'ZFIN'],\n", " dtype=object)" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Distinct identifier prefixes (text before the first ':') among subjects.\n", "df_term_association['subject'].str.split(\":\", n=1).str[0].unique()" ] }, { "cell_type": "code", "execution_count": 16, "id": "f0aefb13-1a00-48d2-a68f-d660bb393ec0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['HP', 'WBPhenotype', 'MP', 'XPO', 'ZP'], dtype=object)" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Distinct identifier prefixes (text before the first ':') among objects.\n", "df_term_association['object'].str.split(\":\", n=1).str[0].unique()" ] }, { "cell_type": "code", "execution_count": 17, "id": 
"e1755b79-d6b9-4ce6-9bb7-1ee2a331e11e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
0uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
1uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
2uuid:70269c5c-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0001518ECO:0000269PMID:31675180infores:hpo-annotations
3uuid:70269c5d-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0032792ECO:0000269PMID:31675180infores:hpo-annotations
4uuid:70269c5e-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011451ECO:0000269PMID:31675180infores:hpo-annotations
........................
240650uuid:7e6fe369-42a9-11ee-be37-31ef105c25eaMONDO:0009033biolink:has_phenotypeHP:0001831ECO:0000304orphanet:1777infores:hpo-annotations
240651uuid:7e6fe36a-42a9-11ee-be37-31ef105c25eaMONDO:0009033biolink:has_phenotypeHP:0002970ECO:0000304orphanet:1777infores:hpo-annotations
240652uuid:7e6fe36b-42a9-11ee-be37-31ef105c25eaMONDO:0009033biolink:has_phenotypeHP:0004209ECO:0000304orphanet:1777infores:hpo-annotations
240653uuid:7e6fe36c-42a9-11ee-be37-31ef105c25eaMONDO:0009033biolink:has_phenotypeHP:0005692ECO:0000304orphanet:1777infores:hpo-annotations
240654uuid:7e6fe36d-42a9-11ee-be37-31ef105c25eaMONDO:0009033biolink:has_phenotypeHP:0007370ECO:0000304orphanet:1777infores:hpo-annotations
\n", "

240643 rows × 7 columns

\n", "
" ], "text/plain": [ " id subject \\\n", "0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "... ... ... \n", "240650 uuid:7e6fe369-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n", "240651 uuid:7e6fe36a-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n", "240652 uuid:7e6fe36b-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n", "240653 uuid:7e6fe36c-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n", "240654 uuid:7e6fe36d-42a9-11ee-be37-31ef105c25ea MONDO:0009033 \n", "\n", " predicate object evidence_type publication \\\n", "0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n", "1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n", "2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n", "3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n", "4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n", "... ... ... ... ... \n", "240650 biolink:has_phenotype HP:0001831 ECO:0000304 orphanet:1777 \n", "240651 biolink:has_phenotype HP:0002970 ECO:0000304 orphanet:1777 \n", "240652 biolink:has_phenotype HP:0004209 ECO:0000304 orphanet:1777 \n", "240653 biolink:has_phenotype HP:0005692 ECO:0000304 orphanet:1777 \n", "240654 biolink:has_phenotype HP:0007370 ECO:0000304 orphanet:1777 \n", "\n", " source \n", "0 infores:hpo-annotations \n", "1 infores:hpo-annotations \n", "2 infores:hpo-annotations \n", "3 infores:hpo-annotations \n", "4 infores:hpo-annotations \n", "... ... 
\n", "240650 infores:hpo-annotations \n", "240651 infores:hpo-annotations \n", "240652 infores:hpo-annotations \n", "240653 infores:hpo-annotations \n", "240654 infores:hpo-annotations \n", "\n", "[240643 rows x 7 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows whose subject identifier starts with the MONDO: prefix.\n", "is_mondo_subject = df_term_association['subject'].str.startswith(\"MONDO:\")\n", "df_term_association.loc[is_mondo_subject]" ] }, { "cell_type": "code", "execution_count": 18, "id": "81556e70-f889-483f-b496-ddbd51baab08", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idsubjectpredicateobjectevidence_typepublicationsource
0uuid:70269c5a-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011097ECO:0000269PMID:31675180infores:hpo-annotations
1uuid:70269c5b-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0002187ECO:0000269PMID:31675180infores:hpo-annotations
2uuid:70269c5c-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0001518ECO:0000269PMID:31675180infores:hpo-annotations
3uuid:70269c5d-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0032792ECO:0000269PMID:31675180infores:hpo-annotations
4uuid:70269c5e-42a9-11ee-be37-31ef105c25eaMONDO:0023659biolink:has_phenotypeHP:0011451ECO:0000269PMID:31675180infores:hpo-annotations
\n", "
" ], "text/plain": [ " id subject \\\n", "0 uuid:70269c5a-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "1 uuid:70269c5b-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "2 uuid:70269c5c-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "3 uuid:70269c5d-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "4 uuid:70269c5e-42a9-11ee-be37-31ef105c25ea MONDO:0023659 \n", "\n", " predicate object evidence_type publication \\\n", "0 biolink:has_phenotype HP:0011097 ECO:0000269 PMID:31675180 \n", "1 biolink:has_phenotype HP:0002187 ECO:0000269 PMID:31675180 \n", "2 biolink:has_phenotype HP:0001518 ECO:0000269 PMID:31675180 \n", "3 biolink:has_phenotype HP:0032792 ECO:0000269 PMID:31675180 \n", "4 biolink:has_phenotype HP:0011451 ECO:0000269 PMID:31675180 \n", "\n", " source \n", "0 infores:hpo-annotations \n", "1 infores:hpo-annotations \n", "2 infores:hpo-annotations \n", "3 infores:hpo-annotations \n", "4 infores:hpo-annotations " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep only associations whose subject starts with MONDO: and whose object starts with HP:.\n", "condition_1 = df_term_association['subject'].str.startswith(\"MONDO:\")\n", "condition_2 = df_term_association['object'].str.startswith(\"HP:\")\n", "\n", "df = df_term_association.loc[condition_1 & condition_2]\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 19, "id": "81d9f3fe-1f76-4bae-af5e-d9c47da0e13d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'HP:0000042',\n", " 'HP:0002453',\n", " 'HP:0004233',\n", " 'HP:0007041',\n", " 'HP:0007734',\n", " 'HP:0008523',\n", " 'HP:0010037',\n", " 'HP:0011193',\n", " 'HP:0011787',\n", " 'HP:0410275'}" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Seed NumPy's global generator so the same 10 terms are drawn on every run.\n", "np.random.seed(0)\n", "distinct_hp_terms = df['object'].unique()\n", "random_hp_1 = np.random.choice(distinct_hp_terms, size=10, replace=False)\n", "set(random_hp_1)" ] }, { "cell_type": "code", "execution_count": 20, "id": "5f698837-f6c0-4627-b3f1-bf27007c3b61", "metadata": {}, "outputs": [], "source": [ "# # 
Set seed for random\n", "# np.random.seed(10)\n", "# random_hp_2 = np.random.choice(df['object'].unique(), size = 10, replace=False)\n", "# set(random_hp_2)" ] }, { "cell_type": "code", "execution_count": 21, "id": "66f86458-52cc-419b-a629-55615e9b5687", "metadata": {}, "outputs": [], "source": [ "hp_1 = [\"HP:0003394\",\n", "\"HP:0003771\",\n", "\"HP:0012378\",\n", "\"HP:0012450\",\n", "\"HP:0000974\",\n", "\"HP:0001027\",\n", "\"HP:0001030\",\n", "\"HP:0001065\",\n", "\"HP:0001073\",\n", "\"HP:0001075\",\n", "\"HP:0002761\",\n", "\"HP:0001386\",\n", "\"HP:0001537\",\n", "\"HP:0001622\",\n", "\"HP:0001760\",\n", "\"HP:0001762\",\n", "\"HP:0001763\",\n", "\"HP:0001788\",\n", "\"HP:0002035\",\n", "\"HP:0002036\",\n", "\"HP:0002616\",\n", "\"HP:0002650\",\n", "\"HP:0002758\",\n", "\"HP:0002827\",\n", "\"HP:0002829\",\n", "\"HP:0002999\",\n", "\"HP:0003010\",\n", "\"HP:0003083\",\n", "\"HP:0003834\",\n", "\"HP:0000938\",\n", "\"HP:0001058\",\n", "\"HP:0001252\",\n", "\"HP:0001324\",\n", "\"HP:0002013\",\n", "\"HP:0002018\",\n", "\"HP:0002020\",\n", "\"HP:0000015\",\n", "\"HP:0000023\",\n", "\"HP:0000139\",\n", "\"HP:0000286\",\n", "\"HP:0000481\",\n", "\"HP:0000978\",\n", "\"HP:0000993\",\n", "\"HP:0001063\",\n", "\"HP:0004872\",\n", "\"HP:0004944\",\n", "\"HP:0004947\",\n", "\"HP:0001270\",\n", "\"HP:0005294\",\n", "\"HP:0006243\",\n", "\"HP:0007495\",\n", "\"HP:0009763\",\n", "\"HP:0010749\",\n", "\"HP:0010750\",\n", "\"HP:0010754\",\n", "\"HP:0025014\",\n", "\"HP:0025019\",\n", "\"HP:0025509\",\n", "\"HP:0030009\",\n", "\"HP:0031364\",\n", "\"HP:0031653\",\n", "\"HP:0001278\",\n", "\"HP:0001634\",\n", "\"HP:0001653\",\n", "\"HP:0001704\",\n", "\"HP:0002315\"]\n", "\n", "hp_2 = [\"HP:0003645\",\n", "\"HP:0005261\",\n", "\"HP:0002758\",\n", "\"HP:0003125\",\n", "\"HP:0001892\",\n", "\"HP:0001934\",\n", "\"HP:0000967\",\n", "\"HP:0000978\",\n", "\"HP:0000979\",\n", "\"HP:0040242\",\n", "\"HP:0007420\",\n", "\"HP:0030140\",\n", "\"HP:0001386\",\n", "\"HP:0002829\",\n", 
"\"HP:0002239\",\n", "\"HP:0011889\",\n", "\"HP:0001907\",\n", "\"HP:0012223\",\n", "\"HP:0009811\",\n", "\"HP:0012233\",\n", "\"HP:0030746\",\n", "\"HP:0002170\"]" ] }, { "cell_type": "code", "execution_count": 22, "id": "f7cd8388-abc3-4fc9-913e-6c79d8f1d7ad", "metadata": {}, "outputs": [], "source": [ "# Distinct subject identifiers in the filtered association frame.\n", "mondo = set(df['subject'].unique())" ] }, { "cell_type": "code", "execution_count": 23, "id": "5072b5ad-b7ae-4a31-aa2d-f7c22cd0b7d6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 43.5 s, sys: 1.89 s, total: 45.4 s\n", "Wall time: 46.4 s\n" ] } ], "source": [ "%%time\n", "# Predicates passed to Semsimian, which reads the same SQLite file as the cells above.\n", "predicates = [\"rdfs:subClassOf\", \"BFO:0000050\", \"UPHENO:0000001\"]\n", "rss = Semsimian(\n", "    spo=None,\n", "    predicates=predicates,\n", "    pairwise_similarity_attributes=None,\n", "    resource_path=db,\n", ")\n", "# Compare every hp_1 term with every hp_2 term; both minimum thresholds are 0.\n", "all_x_all = rss.all_by_all_pairwise_similarity(\n", "    subject_terms=set(hp_1),\n", "    object_terms=set(hp_2),\n", "    minimum_jaccard_threshold=0,\n", "    minimum_resnik_threshold=0,\n", ")\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "104d7245-044e-4696-8f7b-51c65b1806fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.61 ms, sys: 241 µs, total: 1.85 ms\n", "Wall time: 2 ms\n" ] } ], "source": [ "%%time\n", "\n", "# Flatten the nested subject -> object -> scores mapping into one record per pair.\n", "# Each result is unpacked as five values; only the first three are kept.\n", "rows_list = [\n", "    {\"subject\": term1_key, \"object\": term2_key, \"jaccard\": jaccard, \"aic\": resnik, \"phenodigm\": phenodigm}\n", "    for term1_key, values in all_x_all.items()\n", "    for term2_key, (jaccard, resnik, phenodigm, _, _) in values.items()\n", "]\n", "\n", "new_df = pd.DataFrame(rows_list)" ] }, { "cell_type": "code", "execution_count": 25, "id": "40ef069a-6b31-4b5c-a749-93ebfeb5dc8d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
subjectobjectjaccardaicphenodigm
0HP:0001386HP:00013861.00000018.0088774.243687
1HP:0000978HP:00009781.00000018.0088774.243687
2HP:0002829HP:00028291.00000017.0088774.124182
3HP:0002758HP:00027581.00000014.8389523.852136
4HP:0031364HP:00009670.97530916.4239154.002298
..................
1447HP:0003083HP:00022390.1167514.1341840.694745
1448HP:0006243HP:00028290.1125834.0109920.671989
1449HP:0006243HP:00031250.1111114.1341840.677756
1450HP:0003083HP:00028290.1062504.0109920.652815
1451HP:0003083HP:00031250.1060614.1341840.662174
\n", "

1452 rows × 5 columns

\n", "
" ], "text/plain": [ " subject object jaccard aic phenodigm\n", "0 HP:0001386 HP:0001386 1.000000 18.008877 4.243687\n", "1 HP:0000978 HP:0000978 1.000000 18.008877 4.243687\n", "2 HP:0002829 HP:0002829 1.000000 17.008877 4.124182\n", "3 HP:0002758 HP:0002758 1.000000 14.838952 3.852136\n", "4 HP:0031364 HP:0000967 0.975309 16.423915 4.002298\n", "... ... ... ... ... ...\n", "1447 HP:0003083 HP:0002239 0.116751 4.134184 0.694745\n", "1448 HP:0006243 HP:0002829 0.112583 4.010992 0.671989\n", "1449 HP:0006243 HP:0003125 0.111111 4.134184 0.677756\n", "1450 HP:0003083 HP:0002829 0.106250 4.010992 0.652815\n", "1451 HP:0003083 HP:0003125 0.106061 4.134184 0.662174\n", "\n", "[1452 rows x 5 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Order pairs by jaccard, then aic, highest first, and renumber the rows from 0.\n", "new_df_sorted = (\n", "    new_df\n", "    .sort_values(by=[\"jaccard\", \"aic\"], ascending=False)\n", "    .reset_index(drop=True)\n", ")\n", "new_df_sorted" ] }, { "cell_type": "code", "execution_count": 26, "id": "195afbdc-6c10-4313-9514-573afe5feabc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 18.008877\n", "2 17.008877\n", "3 14.838952\n", "4 16.423915\n", "5 15.686949\n", " ... 
\n", "1079 5.079804\n", "1124 4.253051\n", "1194 4.403398\n", "1204 4.217714\n", "1256 6.722319\n", "Name: aic, Length: 63, dtype: float64" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df_sorted['aic'].drop_duplicates()" ] }, { "cell_type": "code", "execution_count": null, "id": "41bf2bfa-6ea1-4067-b8c4-0274a6ad3ed6", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "# First 10 rows of the sorted similarity table.\n", "df_subset = new_df_sorted.head(10)\n", "\n", "# Draw through the Axes returned by plt.subplots() so the cell does not\n", "# depend on pyplot's implicit current-axes state.\n", "fig, ax = plt.subplots()\n", "\n", "bar_width = 0.35\n", "positions = list(range(len(df_subset)))\n", "\n", "# Side-by-side bars: jaccard to the left of each tick, aic to the right.\n", "ax.bar([p - bar_width/2 for p in positions], df_subset['jaccard'], width=bar_width, color='blue', label='jaccard')\n", "ax.bar([p + bar_width/2 for p in positions], df_subset['aic'], width=bar_width, color='green', label='aic')\n", "\n", "# One tick per row, labelled with the subject term.\n", "ax.set_xticks(positions)\n", "ax.set_xticklabels(df_subset['subject'], rotation='vertical')\n", "\n", "# Write each row's object term just above the taller of its two bars.\n", "# Anchoring the text at the jaccard height alone places it over the aic bar,\n", "# because the label is centred on the tick where the two bars meet.\n", "label_heights = df_subset[['jaccard', 'aic']].max(axis=1)\n", "for i, (height, object_term) in enumerate(zip(label_heights, df_subset['object'])):\n", "    ax.text(x=i, y=height, s=object_term, size=10, ha='center', va='bottom', rotation='vertical')\n", "\n", "ax.legend()\n", "\n", "ax.set_xlabel('Subject')\n", "ax.set_ylabel('Values')\n", "# Trailing semicolon keeps the Text repr out of the cell output.\n", "ax.set_title('Top 10 Subjects vs Corresponding Objects');" ] }, { "cell_type": "code", "execution_count": null, "id": "8a08cea9-9c94-42fa-a25f-720c77339c6c", "metadata": {}, "outputs": [], "source": [ "df_subset" ] }, { "cell_type": "code", "execution_count": null, "id": "3a215cf0-7895-4d9f-bfff-c4f32d84ffce", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": 
"python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 5 }