In [1]:
import pandas as pd
import numpy as np
import sqlite3
import os
import pyzstd

In [2]:
# --- Output locations --------------------------------------------------------
folder_name = "dia_test.d"
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(folder_name, exist_ok=True)
tdf_file_name = os.path.join(folder_name, "analysis.tdf")
tdf_bin_file_name = os.path.join(folder_name, "analysis.tdf_bin")

# --- Acquisition layout ------------------------------------------------------
num_cycles = 2
frames_per_cycle = 3
num_dia_window_groups = 2
scan_groups_per_window_group = 2
num_frames = num_cycles * frames_per_cycle
num_scans = 709

# Mode 9 is DIAPASEF
scanmode = 9
mz_min = 100.000000
mz_max = 1000.000000
im_min = 0.5
im_max = 1.5

# Triangular number 1 + 2 + ... + N over all N = num_frames * num_scans scans.
# It is written to the metadata as DigitizerNumSamples further down.
total_scans = num_frames * num_scans
num_tof = (total_scans + 1) * total_scans // 2

In [3]:
# The first frame of every cycle is typed 0; all remaining frames of the cycle
# carry the configured scan mode.
msms_type = [
    scanmode if frame_index % frames_per_cycle else 0
    for frame_index in range(num_frames)
]
print(msms_type)

# Running state shared by all frames: every scan holds one more peak than the
# previous one, and TOF indices keep counting up from where the last scan ended.
count = 0
offset = 0
frame_data = []
for _frame_index in range(num_frames):
    counts_in_frame, tofs_in_frame, ints_in_frame = [], [], []
    for _scan_index in range(num_scans):
        count += 1
        first_tof = offset + 1
        scan_tofs = np.arange(first_tof, first_tof + count)
        tofs_in_frame.append(scan_tofs)
        # Each intensity is twice the TOF index of the same peak.
        ints_in_frame.append(scan_tofs * 2)
        counts_in_frame.append(count)
        offset += count
    # Per frame: (peaks per scan, TOF arrays per scan, intensity arrays per scan).
    frame_data.append((counts_in_frame, tofs_in_frame, ints_in_frame))

[0, 9, 9, 0, 9, 9]


In [4]:
# Serialise every frame into the byte stream written to analysis.tdf_bin and
# collect the per-frame statistics that are stored in the Frames table.
data = []
frame_offsets = []
frame_offset = 0
summed_intensities = []
max_intensities = []
num_peaks = []
for frame in frame_data:
    # Byte position of this frame inside the binary stream (used as TimsId).
    frame_offsets.append(frame_offset)
    scans = frame[0]
    scan_count = len(scans)
    ints = np.concatenate(frame[2])
    # Force a 64-bit accumulator: np.sum otherwise accumulates in the platform
    # default integer, which is 32-bit on some platforms (e.g. Windows with
    # NumPy < 2) and would silently overflow for sums in the 1e13 range.
    summed_intensities.append(np.sum(ints, dtype=np.int64))
    max_intensities.append(np.max(ints))
    num_peaks.append(len(ints))
    # Layout in uint32 words: `scan_count` header words followed by two words
    # (TOF, intensity) per peak.
    buffer = np.zeros(scan_count + len(ints) * 2, dtype=np.uint32)
    # Header word 0 is the scan count; words 1..scan_count-1 hold twice the
    # peak count of every scan except the last one.
    buffer[0] = scan_count
    buffer[1:scan_count] = np.array(scans[:-1]) * 2
    # Intensities occupy the odd payload slots.
    buffer[scan_count + 1 :: 2] = ints
    offset = scan_count
    for tofs in frame[1]:
        # Even payload slots: the first TOF of a scan is stored as-is, the
        # remaining ones as differences to their predecessor.
        buffer[offset] = tofs[0]
        buffer[offset + 2 : offset + 2 * len(tofs) : 2] = np.diff(tofs)
        offset += 2 * len(tofs)
    # Reinterpret the words as raw bytes and transpose them so that all first
    # bytes of the words come first, then all second bytes, and so on.
    buffer = np.frombuffer(buffer, dtype=np.uint8)
    buffer = buffer.reshape(-1, 4).T.flatten()
    decompressed_bytes = buffer
    compressed_data = pyzstd.compress(decompressed_bytes)
    compressed_data = np.frombuffer(compressed_data, dtype=np.uint8)
    # Each frame record = 4-byte size + 4-byte scan count + compressed payload;
    # the size field counts those 8 header bytes as well.
    frame_size = len(compressed_data) + 8
    data.append(np.frombuffer(np.array([frame_size], dtype=np.uint32), dtype=np.uint8))
    data.append(np.frombuffer(np.array([scan_count], dtype=np.uint32), dtype=np.uint8))
    data.append(compressed_data)
    frame_offset += frame_size
bin_data = np.concatenate(data)

In [5]:
# Replace any binary file left over from a previous run with the fresh payload.
stale_file_exists = os.path.exists(tdf_bin_file_name)
if stale_file_exists:
    os.remove(tdf_bin_file_name)

payload = bin_data.tobytes()
with open(tdf_bin_file_name, mode="wb") as bin_handle:
    bin_handle.write(payload)

In [6]:
size = num_frames
# NOTE(review): `peaks` is not referenced by any other visible cell of this
# notebook - confirm whether it is still needed.
peaks = num_scans * (num_scans + 1) // 2

# One row per frame; ids are 1-based and the time is a tenth of the id.
frame_ids = np.arange(1, size + 1)
frames = pd.DataFrame(
    {
        "Id": frame_ids,
        "Time": frame_ids / 10,
        "Polarity": ["+" for _ in range(size)],
        # Scalars are broadcast to every row by the DataFrame constructor.
        "ScanMode": scanmode,
        "MsMsType": msms_type,
        "TimsId": frame_offsets,
        "MaxIntensity": max_intensities,
        "SummedIntensities": summed_intensities,
        "NumScans": num_scans,
        "NumPeaks": num_peaks,
        #         'MzCalibration': [1] * size,
        #         'T1': [1] * size,
        #         'T2': [1] * size,
        #         'TimsCalibration': [1] * size,
        #         'PropertyGroup': [1] * size,
        "AccumulationTime": 100,
        "RampTime": 100,
        #         'Pressure': [2] * size,
    }
)
frames

Unnamed: 0,Id,Time,Polarity,ScanMode,MsMsType,TimsId,MaxIntensity,SummedIntensities,NumScans,NumPeaks,AccumulationTime,RampTime
0,1,0.1,+,9,0,0,503390,63350624720,709,251695,100,100
1,2,0.2,+,9,9,10522,2012142,948829238392,709,754376,100,100
2,3,0.3,+,9,9,22230,4526256,4109570744400,709,1257057,100,100
3,4,0.4,+,9,0,36522,8045732,11061704269310,709,1759738,100,100
4,5,0.5,+,9,9,53935,12570570,23321358939688,709,2262419,100,100
5,6,0.6,+,9,9,72586,18100770,42404663882100,709,2765100,100,100


In [7]:
# size -= 1
# precursors = pd.DataFrame(
#     {
#         "Id": np.arange(1, size + 1),
#         "LargestPeakMz": 500.0 + np.arange(size),
#         "AverageMz": 500.5 + np.arange(size),
#         "MonoisotopicMz": 500.0 + np.arange(size),
#         "Charge": [2 if i % 2 == 0 else 3 for i in range(size)],
#         "ScanNumber": [1 if i % 2 == 0 else 2 for i in range(size)],
#         "Intensity": [10] * size,
#         "Parent": [(i // 2) * 2 + 1 for i in range(size)],
#     }
# )
# size += 1
# precursors


# fragment_frames = pd.DataFrame(
#     {
#         'Frame': [(i // 2 + 1) * 2 for i in range(size)],
#         'ScanNumBegin': [2 if i % 2 == 0 else 1 for i in range(size)],
#         'ScanNumEnd': [3 if i % 2 == 0 else 2 for i in range(size)],
#         'IsolationMz': 500.5 + np.arange(size),
#         'IsolationWidth': [2.0] * size,
#         'CollisionEnergy': [0.0] * size,
#         'Precursor': np.arange(1, size + 1),
#     }
# )
# fragment_frames.iloc[-1] = fragment_frames.iloc[-3]
# fragment_frames.Frame.values[-1] = fragment_frames.Frame.values[-2]
# fragment_frames


# One row per (window group, scan group) combination. The group size comes
# from `scan_groups_per_window_group` instead of a hard-coded 2 so the table
# stays consistent when that setting is changed.
window_indices = range(num_dia_window_groups * scan_groups_per_window_group)
# Position of a window inside its group decides where its scan range starts.
scan_num_begin = [
    30 + 200 * (x % scan_groups_per_window_group) for x in window_indices
]
dia_frame_msms_windows = pd.DataFrame(
    {
        "WindowGroup": [
            1 + (x // scan_groups_per_window_group) for x in window_indices
        ],
        "ScanNumBegin": scan_num_begin,
        # Every window spans 150 scans.
        "ScanNumEnd": [150 + begin for begin in scan_num_begin],
        "IsolationMz": [
            200 * (x + scan_groups_per_window_group) for x in window_indices
        ],
        "IsolationWidth": 50,
        "CollisionEnergy": 42,
    }
)
dia_frame_msms_windows

Unnamed: 0,WindowGroup,ScanNumBegin,ScanNumEnd,IsolationMz,IsolationWidth,CollisionEnergy
0,1,30,180,400,50,42
1,1,230,380,600,50,42
2,2,30,180,800,50,42
3,2,230,380,1000,50,42


In [8]:
# Frames whose MsMsType equals the configured scan mode are the DIA MS2 frames;
# compare against `scanmode` rather than repeating the literal 9.
is_msms_frame = frames["MsMsType"] == scanmode
dia_frame_msms_info = pd.DataFrame(
    {
        "Frame": frames["Id"][is_msms_frame],
        "WindowGroup": [
            1 + (x // num_dia_window_groups) for x in range(is_msms_frame.sum())
        ],
    }
)

# Every referenced window group must be one of the ids 1..num_dia_window_groups
# that the window tables define; otherwise the fixture would be inconsistent.
assert (
    dia_frame_msms_info["WindowGroup"].between(1, num_dia_window_groups).all()
), "DiaFrameMsMsInfo references a window group that is not defined"

dia_frame_msms_info

Unnamed: 0,Frame,WindowGroup
1,2,1
2,3,1
4,5,2
5,6,2


In [9]:
# Window group ids are 1-based and contiguous up to num_dia_window_groups.
window_group_ids = list(range(1, num_dia_window_groups + 1))
dia_frame_msms_window_groups = pd.DataFrame({"id": window_group_ids})
dia_frame_msms_window_groups

Unnamed: 0,id
0,1
1,2


In [10]:
# Key/value pairs for the GlobalMetaData table. Commented-out entries are keys
# that are left out of this fixture.
global_meta_data_entries = {
    #     "SchemaType": "TDF",
    #     "SchemaVersionMajor": 3,
    #     "SchemaVersionMinor": 7,
    #     "AcquisitionSoftwareVendor": "Bruker",
    #     "InstrumentVendor": "Bruker",
    #     "ClosedProperly": 1,
    "TimsCompressionType": 2,
    # NOTE(review): this is the peak count of the whole last frame (NumPeaks),
    # not of a single scan - confirm that this over-estimate is intended.
    "MaxNumPeaksPerScan": int(frames["NumPeaks"].iloc[-1]),
    #     "AnalysisId": "00000000-0000-0000-0000-000000000000",
    "DigitizerNumSamples": num_tof,
    "MzAcqRangeLower": mz_min,
    "MzAcqRangeUpper": mz_max,
    "AcquisitionSoftware": "timsTOF",
    #     "AcquisitionSoftwareVersion": "0.0",
    #     "AcquisitionFirmwareVersion": "0.1",
    #     "AcquisitionDateTime": "2023-05-05T21:20:37.229+02:00",
    #     "InstrumentName": "timsTOF SCP",
    #     "InstrumentFamily": 9,
    #     "InstrumentRevision": 3,
    #     "InstrumentSourceType": 11,
    #     "InstrumentSerialNumber": 0,
    #     "OperatorName": "Admin",
    #     "Description": "",
    "SampleName": "test",
    #     "MethodName": "test.m",
    #     "DenoisingEnabled": 0,
    #     "PeakWidthEstimateValue": 0.000025,
    #     "PeakWidthEstimateType": 1,
    #     "PeakListIndexScaleFactor": 1,
    "OneOverK0AcqRangeLower": im_min,
    "OneOverK0AcqRangeUpper": im_max,
    #     "DigitizerType": "SA248P",
    #     "DigitizerSerialNumber": "AQ00074235",
}

# Two-column frame, one row per entry, in insertion order of the dict above.
global_meta_data = pd.DataFrame(
    {
        "Key": list(global_meta_data_entries),
        "Value": list(global_meta_data_entries.values()),
    }
)
global_meta_data

Unnamed: 0,Key,Value
0,TimsCompressionType,2
1,MaxNumPeaksPerScan,2765100
2,DigitizerNumSamples,9050385
3,MzAcqRangeLower,100.0
4,MzAcqRangeUpper,1000.0
5,AcquisitionSoftware,timsTOF
6,SampleName,test
7,OneOverK0AcqRangeLower,0.5
8,OneOverK0AcqRangeUpper,1.5


In [11]:
# Rebuild the SQLite metadata file from scratch so tables of an earlier run
# cannot collide with the ones written here.
if os.path.exists(tdf_file_name):
    os.remove(tdf_file_name)

sql_database_connection = sqlite3.connect(tdf_file_name)
try:
    # `with connection:` only scopes a transaction (commit on success, rollback
    # on error); it does not close the connection, hence the explicit
    # close() in the `finally` clause below.
    with sql_database_connection:
        global_meta_data.to_sql(
            "GlobalMetaData", sql_database_connection, index=False
        )
        frames.to_sql("Frames", sql_database_connection, index=False)
        dia_frame_msms_info.to_sql(
            "DiaFrameMsMsInfo", sql_database_connection, index=False
        )

        dia_frame_msms_window_groups.to_sql(
            "DiaFrameMsMsWindowGroups", sql_database_connection, index=False
        )

        dia_frame_msms_windows.to_sql(
            "DiaFrameMsMsWindows", sql_database_connection, index=False
        )

        # # There is no precursors table in my DIA .tdf
        # precursors.to_sql(
        #     "Precursors",
        #     sql_database_connection,
        #     index=False
        # )
        # fragment_frames.to_sql(
        #     "PasefFrameMsMsInfo",
        #     sql_database_connection,
        #     index=False
        # )
finally:
    # Release the file handle so the file can be removed or reopened later.
    sql_database_connection.close()

In [12]:
# Read the Frames table back as a sanity check of what was written.
# The sqlite3 connection context manager does not close the connection, so
# close it explicitly once the query has been printed.
sql_database_connection = sqlite3.connect(tdf_file_name)
try:
    print(pd.read_sql_query("SELECT * FROM Frames", sql_database_connection))
finally:
    sql_database_connection.close()

   Id  Time Polarity  ScanMode  MsMsType  TimsId  MaxIntensity  \
0   1   0.1        +         9         0       0        503390   
1   2   0.2        +         9         9   10522       2012142   
2   3   0.3        +         9         9   22230       4526256   
3   4   0.4        +         9         0   36522       8045732   
4   5   0.5        +         9         9   53935      12570570   
5   6   0.6        +         9         9   72586      18100770   

   SummedIntensities  NumScans  NumPeaks  AccumulationTime  RampTime  
0        63350624720       709    251695               100       100  
1       948829238392       709    754376               100       100  
2      4109570744400       709   1257057               100       100  
3     11061704269310       709   1759738               100       100  
4     23321358939688       709   2262419               100       100  
5     42404663882100       709   2765100               100       100  


In [13]:
# Round-trip check: open the freshly written folder with alphatims.
import alphatims.bruker

# NOTE(review): presumably an empty DLL file name keeps alphatims from using the
# vendor library while reading - confirm against the alphatims documentation.
alphatims.bruker.BRUKER_DLL_FILE_NAME = ""
# NOTE: this rebinds `data`, which an earlier cell used for the list of binary
# chunks; from here on it refers to the loaded TimsTOF object.
data = alphatims.bruker.TimsTOF(folder_name)
# data.tof_indices

100%|██████████| 6/6 [00:00<00:00, 11.51it/s]
