/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef TUPLE_SKETCH_HPP_ #define TUPLE_SKETCH_HPP_ #include #include "serde.hpp" #include "theta_update_sketch_base.hpp" namespace datasketches { // forward-declarations template class tuple_sketch; template class update_tuple_sketch; template class compact_tuple_sketch; template class theta_sketch_alloc; template struct pair_extract_key { K& operator()(std::pair& entry) const { return entry.first; } const K& operator()(const std::pair& entry) const { return entry.first; } }; template< typename Summary, typename Allocator = std::allocator > class tuple_sketch { public: using Entry = std::pair; using ExtractKey = pair_extract_key; using iterator = theta_iterator; using const_iterator = theta_const_iterator; virtual ~tuple_sketch() = default; /** * @return allocator */ virtual Allocator get_allocator() const = 0; /** * @return true if this sketch represents an empty set (not the same as no retained entries!) */ virtual bool is_empty() const = 0; /** * @return estimate of the distinct count of the input stream */ double get_estimate() const; /** * Returns the approximate lower error bound given a number of standard deviations. * This parameter is similar to the number of standard deviations of the normal distribution * and corresponds to approximately 67%, 95% and 99% confidence intervals. * @param num_std_devs number of Standard Deviations (1, 2 or 3) * @return the lower bound */ double get_lower_bound(uint8_t num_std_devs) const; /** * Returns the approximate upper error bound given a number of standard deviations. * This parameter is similar to the number of standard deviations of the normal distribution * and corresponds to approximately 67%, 95% and 99% confidence intervals. * @param num_std_devs number of Standard Deviations (1, 2 or 3) * @return the upper bound */ double get_upper_bound(uint8_t num_std_devs) const; /** * @return true if the sketch is in estimation mode (as opposed to exact mode) */ bool is_estimation_mode() const; /** * @return theta as a fraction from 0 to 1 (effective sampling rate) */ double get_theta() const; /** * @return theta as a positive integer between 0 and LLONG_MAX */ virtual uint64_t get_theta64() const = 0; /** * @return the number of retained entries in the sketch */ virtual uint32_t get_num_retained() const = 0; /** * @return hash of the seed that was used to hash the input */ virtual uint16_t get_seed_hash() const = 0; /** * @return true if retained entries are ordered */ virtual bool is_ordered() const = 0; /** * Provides a human-readable summary of this sketch as a string * @param print_items if true include the list of items retained by the sketch * @return sketch summary as a string */ string to_string(bool print_items = false) const; /** * Iterator over entries in this sketch. * @return begin iterator */ virtual iterator begin() = 0; /** * Iterator pointing past the valid range. * Not to be incremented or dereferenced. * @return end iterator */ virtual iterator end() = 0; /** * Const iterator over entries in this sketch. * @return begin const iterator */ virtual const_iterator begin() const = 0; /** * Const iterator pointing past the valid range. * Not to be incremented or dereferenced. * @return end const iterator */ virtual const_iterator end() const = 0; protected: using ostrstream = std::basic_ostringstream, AllocChar>; virtual void print_specifics(ostrstream& os) const = 0; static uint16_t get_seed_hash(uint64_t seed); static void check_sketch_type(uint8_t actual, uint8_t expected); static void check_serial_version(uint8_t actual, uint8_t expected); static void check_seed_hash(uint16_t actual, uint16_t expected); }; // update sketch // for types with defined default constructor and + operation template struct default_update_policy { Summary create() const { return Summary(); } void update(Summary& summary, const Update& update) const { summary += update; } }; template< typename Summary, typename Update = Summary, typename Policy = default_update_policy, typename Allocator = std::allocator > class update_tuple_sketch: public tuple_sketch { public: using Base = tuple_sketch; using Entry = typename Base::Entry; using ExtractKey = typename Base::ExtractKey; using iterator = typename Base::iterator; using const_iterator = typename Base::const_iterator; using AllocEntry = typename std::allocator_traits::template rebind_alloc; using tuple_map = theta_update_sketch_base; using resize_factor = typename tuple_map::resize_factor; // No constructor here. Use builder instead. class builder; update_tuple_sketch(const update_tuple_sketch&) = default; update_tuple_sketch(update_tuple_sketch&&) noexcept = default; virtual ~update_tuple_sketch() = default; update_tuple_sketch& operator=(const update_tuple_sketch&) = default; update_tuple_sketch& operator=(update_tuple_sketch&&) = default; virtual Allocator get_allocator() const; virtual bool is_empty() const; virtual bool is_ordered() const; virtual uint64_t get_theta64() const; virtual uint32_t get_num_retained() const; virtual uint16_t get_seed_hash() const; /** * @return configured nominal number of entries in the sketch */ uint8_t get_lg_k() const; /** * @return configured resize factor of the sketch */ resize_factor get_rf() const; /** * Update this sketch with a given string. * @param value string to update the sketch with */ template inline void update(const std::string& key, FwdUpdate&& value); /** * Update this sketch with a given unsigned 64-bit integer. * @param value uint64_t to update the sketch with */ template inline void update(uint64_t key, FwdUpdate&& value); /** * Update this sketch with a given signed 64-bit integer. * @param value int64_t to update the sketch with */ template inline void update(int64_t key, FwdUpdate&& value); /** * Update this sketch with a given unsigned 32-bit integer. * For compatibility with Java implementation. * @param value uint32_t to update the sketch with */ template inline void update(uint32_t key, FwdUpdate&& value); /** * Update this sketch with a given signed 32-bit integer. * For compatibility with Java implementation. * @param value int32_t to update the sketch with */ template inline void update(int32_t key, FwdUpdate&& value); /** * Update this sketch with a given unsigned 16-bit integer. * For compatibility with Java implementation. * @param value uint16_t to update the sketch with */ template inline void update(uint16_t key, FwdUpdate&& value); /** * Update this sketch with a given signed 16-bit integer. * For compatibility with Java implementation. * @param value int16_t to update the sketch with */ template inline void update(int16_t key, FwdUpdate&& value); /** * Update this sketch with a given unsigned 8-bit integer. * For compatibility with Java implementation. * @param value uint8_t to update the sketch with */ template inline void update(uint8_t key, FwdUpdate&& value); /** * Update this sketch with a given signed 8-bit integer. * For compatibility with Java implementation. * @param value int8_t to update the sketch with */ template inline void update(int8_t key, FwdUpdate&& value); /** * Update this sketch with a given double-precision floating point value. * For compatibility with Java implementation. * @param value double to update the sketch with */ template inline void update(double key, FwdUpdate&& value); /** * Update this sketch with a given floating point value. * For compatibility with Java implementation. * @param value float to update the sketch with */ template inline void update(float key, FwdUpdate&& value); /** * Update this sketch with given data of any type. * This is a "universal" update that covers all cases above, * but may produce different hashes. * Be very careful to hash input values consistently using the same approach * both over time and on different platforms * and while passing sketches between C++ environment and Java environment. * Otherwise two sketches that should represent overlapping sets will be disjoint * For instance, for signed 32-bit values call update(int32_t) method above, * which does widening conversion to int64_t, if compatibility with Java is expected * @param data pointer to the data * @param length of the data in bytes */ template void update(const void* key, size_t length, FwdUpdate&& value); /** * Remove retained entries in excess of the nominal size k (if any) */ void trim(); /** * Converts this sketch to a compact sketch (ordered or unordered). * @param ordered optional flag to specify if ordered sketch should be produced * @return compact sketch */ compact_tuple_sketch compact(bool ordered = true) const; virtual iterator begin(); virtual iterator end(); virtual const_iterator begin() const; virtual const_iterator end() const; protected: Policy policy_; tuple_map map_; // for builder update_tuple_sketch(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator); using ostrstream = typename Base::ostrstream; virtual void print_specifics(ostrstream& os) const; }; // compact sketch template< typename Summary, typename Allocator = std::allocator > class compact_tuple_sketch: public tuple_sketch { public: using Base = tuple_sketch; using Entry = typename Base::Entry; using ExtractKey = typename Base::ExtractKey; using iterator = typename Base::iterator; using const_iterator = typename Base::const_iterator; using AllocEntry = typename std::allocator_traits::template rebind_alloc; using AllocU64 = typename std::allocator_traits::template rebind_alloc; using AllocBytes = typename std::allocator_traits::template rebind_alloc; using vector_bytes = std::vector; using comparator = compare_by_key; static const uint8_t SERIAL_VERSION = 1; static const uint8_t SKETCH_FAMILY = 9; static const uint8_t SKETCH_TYPE = 5; enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED }; // Instances of this type can be obtained: // - by compacting an update_tuple_sketch // - as a result of a set operation // - by deserializing a previously serialized compact sketch compact_tuple_sketch(const Base& other, bool ordered); compact_tuple_sketch(const compact_tuple_sketch&) = default; compact_tuple_sketch(compact_tuple_sketch&&) noexcept; virtual ~compact_tuple_sketch() = default; compact_tuple_sketch& operator=(const compact_tuple_sketch&) = default; compact_tuple_sketch& operator=(compact_tuple_sketch&&) = default; compact_tuple_sketch(const theta_sketch_alloc& other, const Summary& summary, bool ordered = true); virtual Allocator get_allocator() const; virtual bool is_empty() const; virtual bool is_ordered() const; virtual uint64_t get_theta64() const; virtual uint32_t get_num_retained() const; virtual uint16_t get_seed_hash() const; template> void serialize(std::ostream& os, const SerDe& sd = SerDe()) const; template> vector_bytes serialize(unsigned header_size_bytes = 0, const SerDe& sd = SerDe()) const; virtual iterator begin(); virtual iterator end(); virtual const_iterator begin() const; virtual const_iterator end() const; /** * This method deserializes a sketch from a given stream. * @param is input stream * @param seed the seed for the hash function that was used to create the sketch * @param instance of a SerDe * @return an instance of a sketch */ template> static compact_tuple_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); /** * This method deserializes a sketch from a given array of bytes. * @param bytes pointer to the array of bytes * @param size the size of the array * @param seed the seed for the hash function that was used to create the sketch * @param instance of a SerDe * @return an instance of the sketch */ template> static compact_tuple_sketch deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const SerDe& sd = SerDe(), const Allocator& allocator = Allocator()); // for internal use compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector&& entries); protected: bool is_empty_; bool is_ordered_; uint16_t seed_hash_; uint64_t theta_; std::vector entries_; /** * Computes size needed to serialize summaries in the sketch. * This version is for fixed-size arithmetic types (integral and floating point). * @return size in bytes needed to serialize summaries in this sketch */ template::value, int>::type = 0> size_t get_serialized_size_summaries_bytes(const SerDe& sd) const; /** * Computes size needed to serialize summaries in the sketch. * This version is for all other types and can be expensive since every item needs to be looked at. * @return size in bytes needed to serialize summaries in this sketch */ template::value, int>::type = 0> size_t get_serialized_size_summaries_bytes(const SerDe& sd) const; // for deserialize class deleter_of_summaries { public: deleter_of_summaries(uint32_t num, bool destroy, const Allocator& allocator): allocator_(allocator), num_(num), destroy_(destroy) {} void set_destroy(bool destroy) { destroy_ = destroy; } void operator() (Summary* ptr) { if (ptr != nullptr) { if (destroy_) { for (uint32_t i = 0; i < num_; ++i) ptr[i].~Summary(); } allocator_.deallocate(ptr, num_); } } private: Allocator allocator_; uint32_t num_; bool destroy_; }; using ostrstream = typename Base::ostrstream; virtual void print_specifics(ostrstream& os) const; }; // builder template class tuple_base_builder: public theta_base_builder { public: tuple_base_builder(const Policy& policy, const Allocator& allocator); protected: Policy policy_; }; template class update_tuple_sketch::builder: public tuple_base_builder { public: /** * Creates and instance of the builder with default parameters. */ builder(const P& policy = P(), const A& allocator = A()); /** * This is to create an instance of the sketch with predefined parameters. * @return an instance of the sketch */ update_tuple_sketch build() const; }; } /* namespace datasketches */ #include "tuple_sketch_impl.hpp" #endif