/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef _HLL_HPP_ #define _HLL_HPP_ #include "common_defs.hpp" #include "HllUtil.hpp" #include #include #include namespace datasketches { /** * This is a high performance implementation of Phillipe Flajolet’s HLL sketch but with * significantly improved error behavior. If the ONLY use case for sketching is counting * uniques and merging, the HLL sketch is a reasonable choice, although the highest performing in terms of accuracy for * storage space consumed is CPC (Compressed Probabilistic Counting). For large enough counts, this HLL version (with HLL_4) can be 2 to * 16 times smaller than the Theta sketch family for the same accuracy. * *

This implementation offers three different types of HLL sketch, each with different * trade-offs with accuracy, space and performance. These types are specified with the * {@link TgtHllType} parameter. * *

In terms of accuracy, all three types, for the same lg_config_k, have the same error * distribution as a function of n, the number of unique values fed to the sketch. * The configuration parameter lg_config_k is the log-base-2 of K, * where K is the number of buckets or slots for the sketch. * *

During warmup, when the sketch has only received a small number of unique items * (up to about 10% of K), this implementation leverages a new class of estimator * algorithms with significantly better accuracy. * *

This sketch also offers the capability of operating off-heap. Given a WritableMemory object * created by the user, the sketch will perform all of its updates and internal phase transitions * in that object, which can actually reside either on-heap or off-heap based on how it is * configured. In large systems that must update and merge many millions of sketches, having the * sketch operate off-heap avoids the serialization and deserialization costs of moving sketches * to and from off-heap memory-mapped files, for example, and eliminates big garbage collection * delays. * * author Jon Malkin * author Lee Rhodes * author Kevin Lang */ /** * Specifies the target type of HLL sketch to be created. It is a target in that the actual * allocation of the HLL array is deferred until sufficient number of items have been received by * the warm-up phases. * *

These three target types are isomorphic representations of the same underlying HLL algorithm. * Thus, given the same value of lg_config_k and the same input, all three HLL target types * will produce identical estimates and have identical error distributions.

* *

The memory (and also the serialization) of the sketch during this early warmup phase starts * out very small (8 bytes, when empty) and then grows in increments of 4 bytes as required * until the full HLL array is allocated. This transition point occurs at about 10% of K for * sketches where lg_config_k is > 8.

* *
    *
  • HLL_8 This uses an 8-bit byte per HLL bucket. It is generally the * fastest in terms of update time, but has the largest storage footprint of about * K bytes.
  • * *
  • HLL_6 This uses a 6-bit field per HLL bucket. It is the generally the next fastest * in terms of update time with a storage footprint of about 3/4 * K bytes.
  • * *
  • HLL_4 This uses a 4-bit field per HLL bucket and for large counts may require * the use of a small internal auxiliary array for storing statistical exceptions, which are rare. * For the values of lg_config_k > 13 (K = 8192), * this additional array adds about 3% to the overall storage. It is generally the slowest in * terms of update time, but has the smallest storage footprint of about * K/2 * 1.03 bytes.
  • *
*/ enum target_hll_type { HLL_4, ///< 4 bits per entry (most compact, size may vary) HLL_6, ///< 6 bits per entry (fixed size) HLL_8 ///< 8 bits per entry (fastest, fixed size) }; template class HllSketchImpl; template class hll_union_alloc; template using AllocU8 = typename std::allocator_traits::template rebind_alloc; template using vector_u8 = std::vector>; template > class hll_sketch_alloc final { public: /** * Constructs a new HLL sketch. * @param lg_config_k Sketch can hold 2^lg_config_k rows * @param tgt_type The HLL mode to use, if/when the sketch reaches that state * @param start_full_size Indicates whether to start in HLL mode, * keeping memory use constant (if HLL_6 or HLL_8) at the cost of * starting out using much more memory */ explicit hll_sketch_alloc(uint8_t lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A()); /** * Copy constructor */ hll_sketch_alloc(const hll_sketch_alloc& that); /** * Copy constructor to a new target type */ hll_sketch_alloc(const hll_sketch_alloc& that, target_hll_type tgt_type); /** * Move constructor */ hll_sketch_alloc(hll_sketch_alloc&& that) noexcept; /** * Reconstructs a sketch from a serialized image on a stream. * @param is An input stream with a binary image of a sketch */ static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A()); /** * Reconstructs a sketch from a serialized image in a byte array. * @param is bytes An input array with a binary image of a sketch * @param len Length of the input array, in bytes */ static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A()); //! Class destructor virtual ~hll_sketch_alloc(); //! Copy assignment operator hll_sketch_alloc operator=(const hll_sketch_alloc& other); //! Move assignment operator hll_sketch_alloc operator=(hll_sketch_alloc&& other); /** * Resets the sketch to an empty state in coupon collection mode. * Does not re-use existing internal objects. */ void reset(); typedef vector_u8 vector_bytes; // alias for users /** * Serializes the sketch to a byte array, compacting data structures * where feasible to eliminate unused storage in the serialized image. * @param header_size_bytes Allows for PostgreSQL integration */ vector_bytes serialize_compact(unsigned header_size_bytes = 0) const; /** * Serializes the sketch to a byte array, retaining all internal * data structures in their current form. */ vector_bytes serialize_updatable() const; /** * Serializes the sketch to an ostream, compacting data structures * where feasible to eliminate unused storage in the serialized image. * @param os std::ostream to use for output. */ void serialize_compact(std::ostream& os) const; /** * Serializes the sketch to an ostream, retaining all internal data * structures in their current form. * @param os std::ostream to use for output. */ void serialize_updatable(std::ostream& os) const; /** * Human readable summary with optional detail * @param summary if true, output the sketch summary * @param detail if true, output the internal data array * @param auxDetail if true, output the internal Aux array, if it exists. * @param all if true, outputs all entries including empty ones * @return human readable string with optional detail. */ string to_string(bool summary = true, bool detail = false, bool aux_detail = false, bool all = false) const; /** * Present the given std::string as a potential unique item. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns. * @param datum The given string. */ void update(const std::string& datum); /** * Present the given unsigned 64-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint64_t datum); /** * Present the given unsigned 32-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint32_t datum); /** * Present the given unsigned 16-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint16_t datum); /** * Present the given unsigned 8-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint8_t datum); /** * Present the given signed 64-bit integer as a potential unique item. * @param datum The given integer. */ void update(int64_t datum); /** * Present the given signed 32-bit integer as a potential unique item. * @param datum The given integer. */ void update(int32_t datum); /** * Present the given signed 16-bit integer as a potential unique item. * @param datum The given integer. */ void update(int16_t datum); /** * Present the given signed 8-bit integer as a potential unique item. * @param datum The given integer. */ void update(int8_t datum); /** * Present the given 64-bit floating point value as a potential unique item. * @param datum The given double. */ void update(double datum); /** * Present the given 32-bit floating point value as a potential unique item. * @param datum The given float. */ void update(float datum); /** * Present the given data array as a potential unique item. * @param data The given array. * @param length_bytes The array length in bytes. */ void update(const void* data, size_t length_bytes); /** * Returns the current cardinality estimate * @return the cardinality estimate */ double get_estimate() const; /** * This is less accurate than the getEstimate() method * and is automatically used when the sketch has gone through * union operations where the more accurate HIP estimator cannot * be used. * * This is made public only for error characterization software * that exists in separate packages and is not intended for normal * use. * @return the composite cardinality estimate */ double get_composite_estimate() const; /** * Returns the approximate lower error bound given the specified * number of standard deviations. * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}. * @return The approximate lower bound. */ double get_lower_bound(uint8_t num_std_dev) const; /** * Returns the approximate upper error bound given the specified * number of standard deviations. * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}. * @return The approximate upper bound. */ double get_upper_bound(uint8_t num_std_dev) const; /** * Returns sketch's configured lg_k value. * @return Configured lg_k value. */ uint8_t get_lg_config_k() const; /** * Returns the sketch's target HLL mode (from #target_hll_type). * @return The sketch's target HLL mode. */ target_hll_type get_target_type() const; /** * Indicates if the sketch is currently stored compacted. * @return True if the sketch is stored in compact form. */ bool is_compact() const; /** * Indicates if the sketch is currently empty. * @return True if the sketch is empty. */ bool is_empty() const; /** * Returns the size of the sketch serialized in compact form. * @return Size of the sketch serialized in compact form, in bytes. */ uint32_t get_compact_serialization_bytes() const; /** * Returns the size of the sketch serialized without compaction. * @return Size of the sketch serialized without compaction, in bytes. */ uint32_t get_updatable_serialization_bytes() const; /** * Returns the maximum size in bytes that this sketch can grow to * given lg_config_k. However, for the HLL_4 sketch type, this * value can be exceeded in extremely rare cases. If exceeded, it * will be larger by only a few percent. * * @param lg_config_k The Log2 of K for the target HLL sketch. This value must be * between 4 and 21 inclusively. * @param tgt_type the desired Hll type * @return the maximum size in bytes that this sketch can grow to. */ static uint32_t get_max_updatable_serialization_bytes(uint8_t lg_k, target_hll_type tgt_type); /** * Gets the current (approximate) Relative Error (RE) asymptotic values given several * parameters. This is used primarily for testing. * @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound. * @param unioned set true if the sketch is the result of a union operation. * @param lg_config_k the configured value for the sketch. * @param num_std_dev the given number of Standard Deviations. This must be an integer between * 1 and 3, inclusive. * @return the current (approximate) RelativeError */ static double get_rel_err(bool upper_bound, bool unioned, uint8_t lg_config_k, uint8_t num_std_dev); private: explicit hll_sketch_alloc(HllSketchImpl* that); void coupon_update(uint32_t coupon); std::string type_as_string() const; std::string mode_as_string() const; hll_mode get_current_mode() const; uint8_t get_serialization_version() const; bool is_out_of_order_flag() const; bool is_estimation_mode() const; typedef typename std::allocator_traits::template rebind_alloc AllocHllSketch; HllSketchImpl* sketch_impl; friend hll_union_alloc; }; /** * This performs union operations for HLL sketches. This union operator is configured with a * lgMaxK instead of the normal lg_config_k. * *

This union operator does permit the unioning of sketches with different values of * lg_config_k. The user should be aware that the resulting accuracy of a sketch returned * at the end of the unioning process will be a function of the smallest of lg_max_k and * lg_config_k that the union operator has seen. * *

This union operator also permits unioning of any of the three different target hll_sketch * types. * *

Although the API for this union operator parallels many of the methods of the * HllSketch, the behavior of the union operator has some fundamental differences. * *

First, the user cannot specify the #tgt_hll_type as an input parameter. * Instead, it is specified for the sketch returned with #get_result(tgt_hll_tyope). * *

Second, the internal effective value of log-base-2 of k for the union operation can * change dynamically based on the smallest lg_config_k that the union operation has seen. * * author Jon Malkin * author Lee Rhodes * author Kevin Lang */ template > class hll_union_alloc { public: /** * Construct an hll_union operator with the given maximum log2 of k. * @param lg_max_k The maximum size, in log2, of k. The value must * be between 7 and 21, inclusive. */ explicit hll_union_alloc(uint8_t lg_max_k, const A& allocator = A()); /** * Returns the current cardinality estimate * @return the cardinality estimate */ double get_estimate() const; /** * This is less accurate than the get_estimate() method * and is automatically used when the union has gone through * union operations where the more accurate HIP estimator cannot * be used. * * This is made public only for error characterization software * that exists in separate packages and is not intended for normal * use. * @return the composite cardinality estimate */ double get_composite_estimate() const; /** * Returns the approximate lower error bound given the specified * number of standard deviations. * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}. * @return The approximate lower bound. */ double get_lower_bound(uint8_t num_std_dev) const; /** * Returns the approximate upper error bound given the specified * number of standard deviations. * @param num_std_dev Number of standard deviations, an integer from the set {1, 2, 3}. * @return The approximate upper bound. */ double get_upper_bound(uint8_t num_std_dev) const; /** * Returns union's configured lg_k value. * @return Configured lg_k value. */ uint8_t get_lg_config_k() const; /** * Returns the union's target HLL mode (from #target_hll_type). * @return The union's target HLL mode. */ target_hll_type get_target_type() const; /** * Indicates if the union is currently empty. * @return True if the union is empty. */ bool is_empty() const; /** * Resets the union to an empty state in coupon collection mode. * Does not re-use existing internal objects. */ void reset(); /** * Returns the result of this union operator with the specified * #tgt_hll_type. * @param The tgt_hll_type enum value of the desired result (Default: HLL_4) * @return The result of this union with the specified tgt_hll_type */ hll_sketch_alloc get_result(target_hll_type tgt_type = HLL_4) const; /** * Update this union operator with the given sketch. * @param The given sketch. */ void update(const hll_sketch_alloc& sketch); /** * Update this union operator with the given temporary sketch. * @param The given sketch. */ void update(hll_sketch_alloc&& sketch); /** * Present the given std::string as a potential unique item. * The string is converted to a byte array using UTF8 encoding. * If the string is null or empty no update attempt is made and the method returns. * @param datum The given string. */ void update(const std::string& datum); /** * Present the given unsigned 64-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint64_t datum); /** * Present the given unsigned 32-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint32_t datum); /** * Present the given unsigned 16-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint16_t datum); /** * Present the given unsigned 8-bit integer as a potential unique item. * @param datum The given integer. */ void update(uint8_t datum); /** * Present the given signed 64-bit integer as a potential unique item. * @param datum The given integer. */ void update(int64_t datum); /** * Present the given signed 32-bit integer as a potential unique item. * @param datum The given integer. */ void update(int32_t datum); /** * Present the given signed 16-bit integer as a potential unique item. * @param datum The given integer. */ void update(int16_t datum); /** * Present the given signed 8-bit integer as a potential unique item. * @param datum The given integer. */ void update(int8_t datum); /** * Present the given 64-bit floating point value as a potential unique item. * @param datum The given double. */ void update(double datum); /** * Present the given 32-bit floating point value as a potential unique item. * @param datum The given float. */ void update(float datum); /** * Present the given data array as a potential unique item. * @param data The given array. * @param length_bytes The array length in bytes. */ void update(const void* data, size_t length_bytes); /** * Gets the current (approximate) Relative Error (RE) asymptotic values given several * parameters. This is used primarily for testing. * @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound. * @param unioned set true if the sketch is the result of a union operation. * @param lg_config_k the configured value for the sketch. * @param num_std_dev the given number of Standard Deviations. This must be an integer between * 1 and 3, inclusive. * @return the current (approximate) RelativeError */ static double get_rel_err(bool upper_bound, bool unioned, uint8_t lg_config_k, uint8_t num_std_dev); private: /** * Union the given source and destination sketches. This method examines the state of * the current internal gadget and the incoming sketch and determines the optimal way to * perform the union. This may involve swapping, down-sampling, transforming, and / or * copying one of the arguments and may completely replace the internals of the union. * * @param incoming_impl the given incoming sketch, which may not be modified. * @param lg_max_k the maximum value of log2 K for this union. */ inline void union_impl(const hll_sketch_alloc& sketch, uint8_t lg_max_k); static HllSketchImpl* copy_or_downsample(const HllSketchImpl* src_impl, uint8_t tgt_lg_k); void coupon_update(uint32_t coupon); hll_mode get_current_mode() const; bool is_out_of_order_flag() const; bool is_estimation_mode() const; // calls couponUpdate on sketch, freeing the old sketch upon changes in hll_mode static HllSketchImpl* leak_free_coupon_update(HllSketchImpl* impl, uint32_t coupon); uint8_t lg_max_k_; hll_sketch_alloc gadget_; }; /// convenience alias for hll_sketch with default allocator typedef hll_sketch_alloc<> hll_sketch; /// convenience alias for hll_union with default allocator typedef hll_union_alloc<> hll_union; } // namespace datasketches #include "hll.private.hpp" #endif // _HLL_HPP_