# Copyright (c) 2021, EleutherAI
# This file is based on code by the authors denoted below and has been modified from its original version.
#
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# copied from gpt-neox/megatron/data/indexed_dataset.py
# Adapted to only include MMapDataset reader
# other slight modifications too

# *************
# **IMPORTANT**
# *************
# This Implementation assumes that the sequences in
# the dataset are always of sequence length 2049

import os
import shutil
import struct
from functools import lru_cache
from itertools import accumulate

import numpy as np
import torch

# Integer code -> NumPy scalar type.
# NOTE(review): presumably these codes identify the element type stored in the
# on-disk index header -- confirm against the reader/writer code.
dtypes = {
    1: np.uint8,
    2: np.int8,
    3: np.int16,
    4: np.int32,
    5: np.int64,
    6: np.float32,
    7: np.float64,
    8: np.uint16,
}


def index_file_path(prefix_path: str) -> str:
    """Return *prefix_path* with the ``.idx`` suffix appended."""
    return prefix_path + ".idx"


def data_file_path(prefix_path: str) -> str:
    """Return *prefix_path* with the ``.bin`` suffix appended."""
    return prefix_path + ".bin"


# NOTE(review): the remainder of the original line is a class definition that
# is cut off in the middle of a statement. It is preserved verbatim below as a
# comment instead of being reconstructed from guesswork; restore it from the
# complete source before relying on MMapIndexedDataset.
# class MMapIndexedDataset(torch.utils.data.Dataset): class Index(object): _HDR_MAGIC = b"MMIDIDX\x00\x00" @classmethod def writer(cls, path, dtype): class _Writer(object): def __enter__(self): self._file = open(path, "wb") # Write Magic string so we can check the file format then opening it again. self._file.write(cls._HDR_MAGIC) # Write version number # Little endian unsigned 64 Bit integer self._file.write(struct.pack("