r""" Script to pre-compute the constants associated to the computation of the cardinalities used in the HyperLogLog algorithm in the version that employes the multiplicities of the registers. The paper that describes the algorithm is the following [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf). Specifically, the two formulas we need to precompute are the following. For a multiplicity x, that can have value between 0 and the number of registers m, which we recall are equal to m=2^P, we need to compute the following constant: $$\tau(x) = \frac{x}{3} - \frac{m}{3} \sum_{k=1}^{\inf} {\left(1 - {\left(1 - \frac{x}{m}\right)}^{2^{-k}}\right)}^{2} 2^{-k}$$ The second constant we need to compute is sigma, also defined for a multiplicity x, that can have value between 0 and the number of registers m, which we recall are equal to m=2^P, is defined as follows: $$\sigma(x) = x + m \sum_{k=1}^{\inf} {\frac{x}{m}}^{2^k} 2^{k-1}$$ Since we cannot compute the infinite sum, we approximate it by computing the terms up until convergence, which is when the value of the term is smaller than the precision of the machine. We write out the constants as two Rust files that can be readily imported in the hyperloglog-rs crate, in the positions "src/tau_multiplicities_constants.rs" and "src/sigma_multiplicities_constants.rs". We compute the values for all precisions we considered, between 4 and 18. The files will header will look like the following, with no more than 3 decimals for each constant. ```rust //! This file is generated by the script multiplities_constants_precomputation.py //! and contains the constants used in the computation of the cardinalities //! in the HyperLogLog algorithm that employs the multiplicities of the registers. //! The paper that describes the algorithm is the following //! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf). //! The constants are computed for all precisions between 4 and 18. pub const TAU: [&[f32]; 15] = [ // precision 4 &[ 0.0, 0.33, 0.654, 0.97, 1.278, 1.577, 1.864, 2.139, 2.399, ``` """ import sys from tqdm.auto import trange def compute_tau(x, m): r""" Compute the constant tau(x) for a given multiplicity x and number of registers m. Parameters ---------- x : int The multiplicity of the register. m : int The number of registers. """ summation = 0 k = 1 x = (1 - x / m) while True: term = (1 - x**(2**(-k)))**2 * 2**(-k) summation += term if term < sys.float_info.epsilon: break k += 1 return m / 3.0 * (1 - x - summation) def compute_sigma(x, m): r""" Compute the constant sigma(x) for a given multiplicity x and number of registers m. Parameters ---------- x : int The multiplicity of the register. m : int The number of registers. """ summation = 0 k = 1 x = x / m try: while True: term = x**(2**k) * 2**(k - 1) summation += term if term < sys.float_info.epsilon: break k += 1 except OverflowError: return float("inf") return m * (x + summation) def write_out_tau(): r""" Compute the constants for all precisions and write them to a Rust file. """ with open("src/tau_multiplicities_constants.rs", "w", encoding="utf8") as file: file.write( r"""//! This file is generated by the script multiplities_constants_precomputation.py //! and contains the constants used in the computation of the cardinalities //! in the HyperLogLog algorithm that employs the multiplicities of the registers. //! The paper that describes the algorithm is the following //! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf). //! The constants are computed for all precisions between 4 and 18. pub const TAU: [&[f32]; 15] = [ """) for precision in trange(4, 19, desc="Precision", leave=False): file.write(f" // precision {precision}\n") file.write(" &[") for multeplicity in range(0, 2**precision + 1): tau = compute_tau(multeplicity, 2**precision) # We round the tau so that it does not have more than # 3 decimals. tau = round(tau, 3) # If tau is higher than 10k, we round it so that it # does not have more than 2 decimals. if tau > 10000: tau = round(tau, 2) file.write( f"\n {tau},") file.write("\n ],\n") file.write("];\n") def write_out_sigma(): r""" Compute the constants for all precisions and write them to a Rust file. """ with open("src/sigma_multiplicities_constants.rs", "w", encoding="utf8") as file: file.write( r"""//! This file is generated by the script multiplities_constants_precomputation.py //! and contains the constants used in the computation of the cardinalities //! in the HyperLogLog algorithm that employs the multiplicities of the registers. //! The paper that describes the algorithm is the following //! [New cardinality estimation algorithms for HyperLogLog sketches](https://oertl.github.io/hyperloglog-sketch-estimation-paper/paper/paper.pdf). //! The constants are computed for all precisions between 4 and 18. pub const SIGMA: [&[f32]; 15] = [ """) for precision in trange(4, 19, desc="Precision", leave=False): file.write(f" // precision {precision}\n") file.write(" &[") for multeplicity in range(0, 2**precision + 1): sigma = compute_sigma(multeplicity, 2**precision) # We round the sigma so that it does not have more than # 3 decimals. sigma = round(sigma, 6) if sigma > 10: sigma = round(sigma, 5) if sigma > 100: sigma = round(sigma, 4) if sigma > 1000: sigma = round(sigma, 3) # If sigma is higher than 10k, we round it so that it # does not have more than 2 decimals. if sigma > 10000: sigma = round(sigma, 2) if sigma == 0.434294 or sigma == 0.4342: sigma = "core::f32::consts::LOG10_E" if sigma == 0.785398: sigma = "core::f32::consts::FRAC_PI_4" if sigma == 0.31831 or sigma == 0.318: sigma = "core::f32::consts::FRAC_1_PI" if sigma == float("inf"): sigma = "core::f32::INFINITY" file.write( f"\n {sigma},") file.write("\n ],\n") file.write("];\n") if __name__ == "__main__": write_out_tau() write_out_sigma()