{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "83d34fb6", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:11:01.731660Z", "start_time": "2023-09-28T15:11:01.293239Z" } }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import sqlite3\n", "import os\n", "import pyzstd" ] }, { "cell_type": "code", "execution_count": 2, "id": "c174a0de", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:24:58.779565Z", "start_time": "2023-09-28T15:24:58.770465Z" } }, "outputs": [], "source": [ "folder_name = \"dia_test.d\"\n", "if not os.path.exists(folder_name):\n", " os.mkdir(folder_name)\n", "tdf_file_name = os.path.join(folder_name, \"analysis.tdf\")\n", "tdf_bin_file_name = os.path.join(folder_name, \"analysis.tdf_bin\")\n", "num_cycles = 2\n", "frames_per_cycle = 3\n", "num_dia_window_groups = 2\n", "scan_groups_per_window_group = 2\n", "num_frames = num_cycles * frames_per_cycle\n", "num_scans = 709\n", "\n", "# Mode 9 is DIAPASEF\n", "scanmode = 9\n", "mz_min = 100.000000\n", "mz_max = 1000.000000\n", "im_min = 0.5\n", "im_max = 1.5\n", "num_tof = num_frames * num_scans\n", "num_tof = (num_tof + 1) * num_tof // 2" ] }, { "cell_type": "code", "execution_count": 3, "id": "b7ef86d4", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:02.177458Z", "start_time": "2023-09-28T15:25:02.166668Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0, 9, 9, 0, 9, 9]\n" ] } ], "source": [ "frame_data = []\n", "count = 0\n", "offset = 0\n", "msms_type = [0 if i % frames_per_cycle == 0 else scanmode for i in range(num_frames)]\n", "print(msms_type)\n", "for frame in range(num_frames):\n", " frame_tofs = []\n", " frame_ints = []\n", " frame_counts = []\n", " for scan in range(num_scans):\n", " count += 1\n", " frame_tofs.append(np.arange(1 + offset, 1 + offset + count))\n", " frame_ints.append(np.arange(1 + offset, 1 + offset + count) * 2)\n", " frame_counts.append(count)\n", " offset += count\n", " frame_data.append(\n", " (\n", " frame_counts,\n", " frame_tofs,\n", " frame_ints,\n", " )\n", " )" ] }, { "cell_type": "code", "execution_count": 4, "id": "56e819a0", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:02.400280Z", "start_time": "2023-09-28T15:25:02.381215Z" } }, "outputs": [], "source": [ "data = []\n", "frame_offsets = []\n", "frame_offset = 0\n", "summed_intensities = []\n", "max_intensities = []\n", "num_peaks = []\n", "for frame in frame_data:\n", " frame_offsets.append(frame_offset)\n", " scans = frame[0]\n", " scan_count = len(scans)\n", " ints = np.concatenate(frame[2])\n", " summed_intensities.append(np.sum(ints))\n", " max_intensities.append(np.max(ints))\n", " num_peaks.append(len(ints))\n", " buffer = np.zeros(scan_count + len(ints) * 2, dtype=np.uint32)\n", " buffer[0] = scan_count\n", " buffer[1:scan_count] = np.array(scans[:-1]) * 2\n", " buffer[scan_count + 1 :: 2] = ints\n", " offset = scan_count\n", " for tofs in frame[1]:\n", " buffer[offset] = tofs[0]\n", " buffer[offset + 2 : offset + 2 * len(tofs) : 2] = np.diff(tofs)\n", " offset += 2 * len(tofs)\n", " buffer = np.frombuffer(buffer, dtype=np.uint8)\n", " buffer = buffer.reshape(-1, 4).T.flatten()\n", " decompressed_bytes = buffer\n", " compressed_data = pyzstd.compress(decompressed_bytes)\n", " compressed_data = np.frombuffer(compressed_data, dtype=np.uint8)\n", " frame_size = len(compressed_data) + 8\n", " data.append(np.frombuffer(np.array([frame_size], dtype=np.uint32), dtype=np.uint8))\n", " data.append(np.frombuffer(np.array([scan_count], dtype=np.uint32), dtype=np.uint8))\n", " data.append(compressed_data)\n", " frame_offset += frame_size\n", "bin_data = np.concatenate(data)" ] }, { "cell_type": "code", "execution_count": 5, "id": "1a7862fe", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:02.572316Z", "start_time": "2023-09-28T15:25:02.565679Z" } }, "outputs": [], "source": [ "if os.path.exists(tdf_bin_file_name):\n", " os.remove(tdf_bin_file_name)\n", "\n", "with open(tdf_bin_file_name, \"wb\") as tdf_bin_file:\n", " tdf_bin_file.write(bin_data.tobytes())" ] }, { "cell_type": "code", "execution_count": 6, "id": "a6f6cf63", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:02.769843Z", "start_time": "2023-09-28T15:25:02.742240Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdTimePolarityScanModeMsMsTypeTimsIdMaxIntensitySummedIntensitiesNumScansNumPeaksAccumulationTimeRampTime
010.1+90050339063350624720709251695100100
120.2+99105222012142948829238392709754376100100
230.3+9922230452625641095707444007091257057100100
340.4+90365228045732110617042693107091759738100100
450.5+995393512570570233213589396887092262419100100
560.6+997258618100770424046638821007092765100100100
\n", "
" ], "text/plain": [ " Id Time Polarity ScanMode MsMsType TimsId MaxIntensity \\\n", "0 1 0.1 + 9 0 0 503390 \n", "1 2 0.2 + 9 9 10522 2012142 \n", "2 3 0.3 + 9 9 22230 4526256 \n", "3 4 0.4 + 9 0 36522 8045732 \n", "4 5 0.5 + 9 9 53935 12570570 \n", "5 6 0.6 + 9 9 72586 18100770 \n", "\n", " SummedIntensities NumScans NumPeaks AccumulationTime RampTime \n", "0 63350624720 709 251695 100 100 \n", "1 948829238392 709 754376 100 100 \n", "2 4109570744400 709 1257057 100 100 \n", "3 11061704269310 709 1759738 100 100 \n", "4 23321358939688 709 2262419 100 100 \n", "5 42404663882100 709 2765100 100 100 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "size = num_frames\n", "peaks = num_scans * (num_scans + 1) // 2\n", "\n", "frames = pd.DataFrame(\n", " {\n", " \"Id\": np.arange(1, size + 1),\n", " \"Time\": np.arange(1, size + 1, dtype=np.float64) / 10,\n", " \"Polarity\": [\"+\"] * size,\n", " \"ScanMode\": [scanmode] * size,\n", " \"MsMsType\": msms_type,\n", " \"TimsId\": frame_offsets,\n", " \"MaxIntensity\": max_intensities,\n", " \"SummedIntensities\": summed_intensities,\n", " \"NumScans\": [num_scans] * size,\n", " \"NumPeaks\": num_peaks,\n", " # 'MzCalibration': [1] * size,\n", " # 'T1': [1] * size,\n", " # 'T2': [1] * size,\n", " # 'TimsCalibration': [1] * size,\n", " # 'PropertyGroup': [1] * size,\n", " \"AccumulationTime\": [100] * size,\n", " \"RampTime\": [100] * size,\n", " # 'Pressure': [2] * size,\n", " }\n", ")\n", "frames" ] }, { "cell_type": "code", "execution_count": 7, "id": "2076aef6", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:03.251715Z", "start_time": "2023-09-28T15:25:03.229805Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
WindowGroupScanNumBeginScanNumEndIsolationMzIsolationWidthCollisionEnergy
01301804005042
112303806005042
22301808005042
3223038010005042
\n", "
" ], "text/plain": [ " WindowGroup ScanNumBegin ScanNumEnd IsolationMz IsolationWidth \\\n", "0 1 30 180 400 50 \n", "1 1 230 380 600 50 \n", "2 2 30 180 800 50 \n", "3 2 230 380 1000 50 \n", "\n", " CollisionEnergy \n", "0 42 \n", "1 42 \n", "2 42 \n", "3 42 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# size -= 1\n", "# precursors = pd.DataFrame(\n", "# {\n", "# \"Id\": np.arange(1, size + 1),\n", "# \"LargestPeakMz\": 500.0 + np.arange(size),\n", "# \"AverageMz\": 500.5 + np.arange(size),\n", "# \"MonoisotopicMz\": 500.0 + np.arange(size),\n", "# \"Charge\": [2 if i % 2 == 0 else 3 for i in range(size)],\n", "# \"ScanNumber\": [1 if i % 2 == 0 else 2 for i in range(size)],\n", "# \"Intensity\": [10] * size,\n", "# \"Parent\": [(i // 2) * 2 + 1 for i in range(size)],\n", "# }\n", "# )\n", "# size += 1\n", "# precursors\n", "\n", "\n", "# fragment_frames = pd.DataFrame(\n", "# {\n", "# 'Frame': [(i // 2 + 1) * 2 for i in range(size)],\n", "# 'ScanNumBegin': [2 if i % 2 == 0 else 1 for i in range(size)],\n", "# 'ScanNumEnd': [3 if i % 2 == 0 else 2 for i in range(size)],\n", "# 'IsolationMz': 500.5 + np.arange(size),\n", "# 'IsolationWidth': [2.0] * size,\n", "# 'CollisionEnergy': [0.0] * size,\n", "# 'Precursor': np.arange(1, size + 1),\n", "# }\n", "# )\n", "# fragment_frames.iloc[-1] = fragment_frames.iloc[-3]\n", "# fragment_frames.Frame.values[-1] = fragment_frames.Frame.values[-2]\n", "# fragment_frames\n", "\n", "\n", "dia_frame_msms_windows = pd.DataFrame(\n", " {\n", " \"WindowGroup\": [\n", " 1 + (x // 2)\n", " for x in range(num_dia_window_groups * scan_groups_per_window_group)\n", " ],\n", " \"ScanNumBegin\": [\n", " 30 + (200 * (x % 2))\n", " for x in range(num_dia_window_groups * scan_groups_per_window_group)\n", " ],\n", " \"ScanNumEnd\": 0,\n", " \"IsolationMz\": [\n", " 200 * (x + scan_groups_per_window_group)\n", " for x in range(num_dia_window_groups * scan_groups_per_window_group)\n", " ],\n", " \"IsolationWidth\": 50,\n", " \"CollisionEnergy\": 42,\n", " }\n", ")\n", "dia_frame_msms_windows[\"ScanNumEnd\"] = 150 + dia_frame_msms_windows[\"ScanNumBegin\"]\n", "dia_frame_msms_windows" ] }, { "cell_type": "code", "execution_count": 8, "id": "ee0e91a2", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
FrameWindowGroup
121
231
452
562
\n", "
" ], "text/plain": [ " Frame WindowGroup\n", "1 2 1\n", "2 3 1\n", "4 5 2\n", "5 6 2" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_frame_msms_info = pd.DataFrame(\n", " {\n", " \"Frame\": frames[\"Id\"][frames[\"MsMsType\"] == 9],\n", " \"WindowGroup\": [\n", " 1 + (x // num_dia_window_groups)\n", " for x in range((frames[\"MsMsType\"] == 9).sum())\n", " ],\n", " }\n", ")\n", "\n", "dia_frame_msms_info" ] }, { "cell_type": "code", "execution_count": 9, "id": "9937ac5f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id
01
12
\n", "
" ], "text/plain": [ " id\n", "0 1\n", "1 2" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dia_frame_msms_window_groups = pd.DataFrame(\n", " {\"id\": [x + 1 for x in range(num_dia_window_groups)]}\n", ")\n", "dia_frame_msms_window_groups" ] }, { "cell_type": "code", "execution_count": 10, "id": "d71b81b9", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:03.842767Z", "start_time": "2023-09-28T15:25:03.822643Z" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KeyValue
0TimsCompressionType2
1MaxNumPeaksPerScan2765100
2DigitizerNumSamples9050385
3MzAcqRangeLower100.0
4MzAcqRangeUpper1000.0
5AcquisitionSoftwaretimsTOF
6SampleNametest
7OneOverK0AcqRangeLower0.5
8OneOverK0AcqRangeUpper1.5
\n", "
" ], "text/plain": [ " Key Value\n", "0 TimsCompressionType 2\n", "1 MaxNumPeaksPerScan 2765100\n", "2 DigitizerNumSamples 9050385\n", "3 MzAcqRangeLower 100.0\n", "4 MzAcqRangeUpper 1000.0\n", "5 AcquisitionSoftware timsTOF\n", "6 SampleName test\n", "7 OneOverK0AcqRangeLower 0.5\n", "8 OneOverK0AcqRangeUpper 1.5" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "global_meta_data = {\n", " # \"SchemaType\": \"TDF\",\n", " # \"SchemaVersionMajor\": 3,\n", " # \"SchemaVersionMinor\": 7,\n", " # \"AcquisitionSoftwareVendor\": \"Bruker\",\n", " # \"InstrumentVendor\": \"Bruker\",\n", " # \"ClosedProperly\": 1,\n", " \"TimsCompressionType\": 2,\n", " \"MaxNumPeaksPerScan\": int(frames.NumPeaks.values[-1]),\n", " # \"AnalysisId\": \"00000000-0000-0000-0000-000000000000\",\n", " \"DigitizerNumSamples\": num_tof,\n", " \"MzAcqRangeLower\": mz_min,\n", " \"MzAcqRangeUpper\": mz_max,\n", " \"AcquisitionSoftware\": \"timsTOF\",\n", " # \"AcquisitionSoftwareVersion\": \"0.0\",\n", " # \"AcquisitionFirmwareVersion\": \"0.1\",\n", " # \"AcquisitionDateTime\": \"2023-05-05T21:20:37.229+02:00\",\n", " # \"InstrumentName\": \"timsTOF SCP\",\n", " # \"InstrumentFamily\": 9,\n", " # \"InstrumentRevision\": 3,\n", " # \"InstrumentSourceType\": 11,\n", " # \"InstrumentSerialNumber\": 0,\n", " # \"OperatorName\": \"Admin\",\n", " # \"Description\": \"\",\n", " \"SampleName\": \"test\",\n", " # \"MethodName\": \"test.m\",\n", " # \"DenoisingEnabled\": 0,\n", " # \"PeakWidthEstimateValue\": 0.000025,\n", " # \"PeakWidthEstimateType\": 1,\n", " # \"PeakListIndexScaleFactor\": 1,\n", " \"OneOverK0AcqRangeLower\": im_min,\n", " \"OneOverK0AcqRangeUpper\": im_max,\n", " # \"DigitizerType\": \"SA248P\",\n", " # \"DigitizerSerialNumber\": \"AQ00074235\",\n", "}\n", "global_meta_data = pd.DataFrame(\n", " {\n", " \"Key\": global_meta_data.keys(),\n", " \"Value\": global_meta_data.values(),\n", " }\n", ")\n", "global_meta_data" ] }, { "cell_type": "code", "execution_count": 11, "id": "649fa28a", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:25:04.436258Z", "start_time": "2023-09-28T15:25:04.389245Z" } }, "outputs": [], "source": [ "if os.path.exists(tdf_file_name):\n", " os.remove(tdf_file_name)\n", "with sqlite3.connect(tdf_file_name) as sql_database_connection:\n", " global_meta_data.to_sql(\"GlobalMetaData\", sql_database_connection, index=False)\n", " frames.to_sql(\"Frames\", sql_database_connection, index=False)\n", " dia_frame_msms_info.to_sql(\"DiaFrameMsMsInfo\", sql_database_connection, index=False)\n", "\n", " dia_frame_msms_window_groups.to_sql(\n", " \"DiaFrameMsMsWindowGroups\", sql_database_connection, index=False\n", " )\n", "\n", " dia_frame_msms_windows.to_sql(\n", " \"DiaFrameMsMsWindows\", sql_database_connection, index=False\n", " )\n", "\n", " # # There is no precursors table in my DIA .tdf\n", " # precursors.to_sql(\n", " # \"Precursors\",\n", " # sql_database_connection,\n", " # index=False\n", " # )\n", " # fragment_frames.to_sql(\n", " # \"PasefFrameMsMsInfo\",\n", " # sql_database_connection,\n", " # index=False\n", " # )" ] }, { "cell_type": "code", "execution_count": 12, "id": "35217cb6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Id Time Polarity ScanMode MsMsType TimsId MaxIntensity \\\n", "0 1 0.1 + 9 0 0 503390 \n", "1 2 0.2 + 9 9 10522 2012142 \n", "2 3 0.3 + 9 9 22230 4526256 \n", "3 4 0.4 + 9 0 36522 8045732 \n", "4 5 0.5 + 9 9 53935 12570570 \n", "5 6 0.6 + 9 9 72586 18100770 \n", "\n", " SummedIntensities NumScans NumPeaks AccumulationTime RampTime \n", "0 63350624720 709 251695 100 100 \n", "1 948829238392 709 754376 100 100 \n", "2 4109570744400 709 1257057 100 100 \n", "3 11061704269310 709 1759738 100 100 \n", "4 23321358939688 709 2262419 100 100 \n", "5 42404663882100 709 2765100 100 100 \n" ] } ], "source": [ "with sqlite3.connect(tdf_file_name) as sql_database_connection:\n", " print(pd.read_sql_query(\"SELECT * FROM Frames\", sql_database_connection))" ] }, { "cell_type": "code", "execution_count": 13, "id": "b10148b5", "metadata": { "ExecuteTime": { "end_time": "2023-09-28T15:11:12.359759Z", "start_time": "2023-09-28T15:11:07.862708Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:root:WARNING: No Bruker libraries are available for this operating system. Mobility and m/z values need to be estimated. While this estimation often returns acceptable results with errors < 0.02 Th, huge errors (e.g. offsets of 6 Th) have already been observed for some samples!\n", "100%|██████████| 6/6 [00:00<00:00, 11.51it/s]\n" ] } ], "source": [ "import alphatims.bruker\n", "\n", "alphatims.bruker.BRUKER_DLL_FILE_NAME = \"\"\n", "data = alphatims.bruker.TimsTOF(folder_name)\n", "# data.tof_indices" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }