from struct import unpack, pack from sys import argv from functools import partial from faiss import Kmeans import numpy as np from tqdm import tqdm def default_filter(i, vec): return True def reservoir_sampling(iterator, k: int): """Reservoir sampling from an iterator.""" res = [] while len(res) < k: res.append(next(iterator)) for i, vec in enumerate(iterator, k + 1): j = np.random.randint(0, i) if j < k: res[j] = vec return res def read_vec_yield( filepath: str, vec_type: np.dtype = np.float32, picker=default_filter ): """Read vectors and yield an iterator.""" size = np.dtype(vec_type).itemsize i = 0 with open(filepath, "rb") as f: while True: try: buf = f.read(4) if len(buf) == 0: break dim = unpack("