{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Set `ssd_path` to a folder in which a file can be placed. It does not need to be \"ssd\" drive, but faster drives are better." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "metadata": {} }, "outputs": [], "source": [ "import time\n", "from pathlib import Path\n", "import numpy as np\n", "from bed_reader import create_bed, open_bed\n", "\n", "ssd_path = Path(\"m:/deldir/bench2\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "On my system, it takes about 5 minutes to generate the 61 GB file (1/16th of a 1TB drive). The file is filed with a pattern of 0, 1, and 2 values.\n", "\n", "In this case, we are pretending that we can more easily get all the SNP information for one individual at a time, so we write the file in the (less common) \"individual-major\" format." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "iid_count = 250_000\n", "sid_count = 1_000_000\n", "\n", "file_path = ssd_path / f\"{iid_count}x{sid_count}mode0.bed\"\n", "\n", "snp_row = np.array(range(sid_count)) % 3\n", "snp_row = snp_row.astype(np.uint8)\n", "\n", "start_time = time.time()\n", "with create_bed(file_path, iid_count=iid_count, sid_count=sid_count, major=\"individual\") as bed_writer:\n", " for iid_index in range(iid_count):\n", " if iid_index % 1_000 == 0:\n", " current_time = time.time()\n", " elapsed_time = current_time - start_time\n", " if iid_index > 0:\n", " estimated_total_time = elapsed_time / iid_index * iid_count\n", " time_remaining = estimated_total_time - elapsed_time\n", " print(f\"iid_index={iid_index:_}, Time elapsed: {elapsed_time:.2f} s, Estimated total time: {estimated_total_time:.2f} s, Time remaining: {time_remaining:.2f} s\")\n", " else:\n", " print(f\"Starting processing at iid_index={iid_index:_}\")\n", " bed_writer.write(snp_row)\n", " snp_row[iid_index] = (snp_row[iid_index] + 1) % 3\n", "total_time = time.time() - start_time\n", "print(f\"Processing complete. Total time taken: {total_time:.2f} s\") \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We read from the file to check that it contains the expected values." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "metadata": {} }, "outputs": [], "source": [ "with open_bed(file_path) as bed_reader:\n", " val = bed_reader.read(np.s_[:10, :10])\n", "\n", "assert np.all(\n", " val\n", " == [\n", " [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0],\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "metadata": {} }, "outputs": [ { "data": { "text/plain": [ "(250000, 1000000)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bed_reader = open_bed(file_path) # open and keep open\n", "bed_reader.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we use `bed-reader` to convert the file the less common individual-major mode (also known as \"mode 0\") into the usual SNP-major mode (mode 1).\n", "\n", "On my machine, this takes about an hour. The script works my reading the data for 1000 SNPs at a time. This requires more memory than reading data of 1 SNP at a time, but is much, much faster. If you need to use less memory, change `sid_at_a_time` to a smaller number such as 500, 250, 100, etc." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "sid_at_a_time = 1000\n", "file_path1 = ssd_path / f\"{iid_count}x{sid_count}mode1.bed\"\n", "start_time = time.time()\n", "\n", "assert (\n", " bed_reader.major == \"individual\"\n", "), \"No need to transpose if major is already SNP-major\"\n", "\n", "with create_bed(\n", " file_path1,\n", " iid_count=bed_reader.iid_count,\n", " sid_count=bed_reader.sid_count,\n", " properties=bed_reader.properties,\n", " major=\"SNP\",\n", ") as bed_writer:\n", " for sid_index in range(0, bed_reader.sid_count, sid_at_a_time):\n", " if sid_index % 1 == 0:\n", " current_time = time.time()\n", " elapsed_time = current_time - start_time\n", " if sid_index > 0:\n", " estimated_total_time = elapsed_time / sid_index * sid_count\n", " time_remaining = estimated_total_time - elapsed_time\n", " print(\n", " f\"sid_index={sid_index:_}, Time elapsed: {elapsed_time:.2f} s, Estimated total time: {estimated_total_time:.2f} s, Time remaining: {time_remaining:.2f} s\"\n", " )\n", " else:\n", " print(f\"Starting processing at sid_index={sid_index:_}\")\n", " iid_column_by_chunk = bed_reader.read(\n", " np.s_[:, sid_index : sid_index + sid_at_a_time], dtype=np.int8\n", " )\n", " for iid_column in iid_column_by_chunk.T:\n", " bed_writer.write(iid_column)\n", "total_time = time.time() - start_time\n", "print(f\"Processing complete. Total time taken: {total_time:.2f} s\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We confirm that the new copy of the file is the same size and starts with the same values as before." ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "metadata": {} }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(250000, 1000000)\n" ] } ], "source": [ "with open_bed(file_path1) as bed_reader1:\n", " print(bed_reader1.shape)\n", " val1 = bed_reader1.read(np.s_[:10, :10])\n", "assert np.all(\n", " val1 == [\n", " [0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 1.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 2.0, 0.0],\n", " [1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 1.0, 2.0, 0.0, 0.0],\n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 2 }