/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ #ifndef _VAR_OPT_SKETCH_IMPL_HPP_ #define _VAR_OPT_SKETCH_IMPL_HPP_ #include #include #include #include #include #include "var_opt_sketch.hpp" #include "serde.hpp" #include "bounds_binomial_proportions.hpp" #include "count_zeros.hpp" #include "memory_operations.hpp" #include "ceiling_power_of_2.hpp" namespace datasketches { /** * Implementation code for the VarOpt sketch. 
*
 * author Kevin Lang
 * author Jon Malkin
 */

/**
 * Public constructor: builds an empty sketch that is not a gadget.
 * Delegates to the (k, rf, is_gadget, allocator) constructor with is_gadget = false.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(uint32_t k, resize_factor rf, const A& allocator) :
  var_opt_sketch(k, rf, false, allocator) {}

/**
 * Copy constructor.
 * Equivalent to the 3-argument copy below with as_sketch = false (marks are
 * copied when present) and n left unchanged, so it simply delegates to it
 * instead of repeating the same code.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(const var_opt_sketch& other) :
  var_opt_sketch(other, false, other.n_) {}

/**
 * Copy constructor with adjustments.
 * @param other      sketch to copy from
 * @param as_sketch  if true the marks array is NOT copied, even if other has one
 * @param adjusted_n value stored as n_ in the copy (instead of other.n_)
 *
 * Only the H items [0, h_) and the R items [h_ + 1, h_ + r_ + 1) are
 * copy-constructed; the gap slot at index h_ and anything past the R region are
 * left as raw storage, which is why filled_data_ is forced to false afterwards.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(const var_opt_sketch& other, bool as_sketch, uint64_t adjusted_n) :
  k_(other.k_),
  h_(other.h_),
  m_(other.m_),
  r_(other.r_),
  n_(adjusted_n),
  total_wt_r_(other.total_wt_r_),
  rf_(other.rf_),
  curr_items_alloc_(other.curr_items_alloc_),
  filled_data_(other.filled_data_),
  allocator_(other.allocator_),
  data_(nullptr),
  weights_(nullptr),
  num_marks_in_h_(other.num_marks_in_h_),
  marks_(nullptr)
{
  data_ = allocator_.allocate(curr_items_alloc_);
  // skip gap or anything unused at the end
  for (size_t i = 0; i < h_; ++i) {
    new (&data_[i]) T(other.data_[i]);
  }
  for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
    new (&data_[i]) T(other.data_[i]);
  }

  // we skipped the gap
  filled_data_ = false;

  weights_ = AllocDouble(allocator_).allocate(curr_items_alloc_);
  // doubles so can successfully copy regardless of the internal state
  std::copy(other.weights_, other.weights_ + curr_items_alloc_, weights_);

  if (!as_sketch && other.marks_ != nullptr) {
    marks_ = AllocBool(allocator_).allocate(curr_items_alloc_);
    std::copy(other.marks_, other.marks_ + curr_items_alloc_, marks_);
  }
}

/**
 * Builds a sketch around already-populated item and weight arrays.
 * The pointers are stored as-is (no copy); they are later released by the
 * destructor through allocator_, using len as the allocation size.
 * The result has no marks array, m_ = 0, the default resize factor, and
 * filled_data_ set to (n > k).
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(T* data, double* weights, size_t len,
                                        uint32_t k, uint64_t n, uint32_t h_count, uint32_t r_count,
                                        double total_wt_r, const A& allocator) :
  k_(k),
  h_(h_count),
  m_(0),
  r_(r_count),
  n_(n),
  total_wt_r_(total_wt_r),
  rf_(DEFAULT_RESIZE_FACTOR),
  curr_items_alloc_(len),
  filled_data_(n > k),
  allocator_(allocator),
  data_(data),
  weights_(weights),
  num_marks_in_h_(0),
  marks_(nullptr)
{}

/**
 * Move constructor: steals the arrays of other and nulls its pointers so that
 * other's destructor releases nothing.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(var_opt_sketch&& other) noexcept :
  k_(other.k_),
  h_(other.h_),
  m_(other.m_),
  r_(other.r_),
  n_(other.n_),
  total_wt_r_(other.total_wt_r_),
  rf_(other.rf_),
  curr_items_alloc_(other.curr_items_alloc_),
  filled_data_(other.filled_data_),
  allocator_(other.allocator_),
  data_(other.data_),
  weights_(other.weights_),
  num_marks_in_h_(other.num_marks_in_h_),
  marks_(other.marks_)
{
  other.data_ = nullptr;
  other.weights_ = nullptr;
  other.marks_ = nullptr;
}

/**
 * Builds an empty sketch, optionally as a gadget.
 * @throws std::invalid_argument if k is 0 or greater than MAX_K
 *
 * The initial allocation is derived from k and the resize factor; when that
 * initial size equals k one extra slot is reserved for the gap.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(uint32_t k, resize_factor rf, bool is_gadget, const A& allocator) :
  k_(k),
  h_(0),
  m_(0),
  r_(0),
  n_(0),
  total_wt_r_(0.0),
  rf_(rf),
  allocator_(allocator)
{
  if (k == 0 || k_ > MAX_K) {
    throw std::invalid_argument("k must be at least 1 and less than 2^31 - 1");
  }

  uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_));
  uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS);
  curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size);
  if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap
    ++curr_items_alloc_;
  }

  allocate_data_arrays(curr_items_alloc_, is_gadget);
  num_marks_in_h_ = 0;
}

/**
 * Builds a sketch from fully specified state, taking ownership of the three
 * arrays by releasing them from their unique_ptr holders.
 * Used by the deserialize() functions in this file.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::var_opt_sketch(uint32_t k, uint32_t h, uint32_t m, uint32_t r, uint64_t n,
                                        double total_wt_r, resize_factor rf, uint32_t curr_items_alloc,
                                        bool filled_data,
                                        std::unique_ptr<T, items_deleter> items,
                                        std::unique_ptr<double, weights_deleter> weights,
                                        uint32_t num_marks_in_h,
                                        std::unique_ptr<bool, marks_deleter> marks,
                                        const A& allocator) :
  k_(k),
  h_(h),
  m_(m),
  r_(r),
  n_(n),
  total_wt_r_(total_wt_r),
  rf_(rf),
  curr_items_alloc_(curr_items_alloc),
  filled_data_(filled_data),
  allocator_(allocator),
  data_(items.release()),
  weights_(weights.release()),
  num_marks_in_h_(num_marks_in_h),
  marks_(marks.release())
{}

/**
 * Destroys the live items and releases all three arrays.
 * Items are destroyed through std::allocator_traits so that allocators without
 * a destroy() member (std::allocator from C++20 on) still work.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>::~var_opt_sketch() {
  if (data_ != nullptr) {
    if (filled_data_) {
      // destroy everything
      const size_t num_to_destroy = std::min(k_ + 1, curr_items_alloc_);
      for (size_t i = 0; i < num_to_destroy; ++i) {
        std::allocator_traits<A>::destroy(allocator_, data_ + i);
      }
    } else {
      // skip gap or anything unused at the end
      for (size_t i = 0; i < h_; ++i) {
        std::allocator_traits<A>::destroy(allocator_, data_ + i);
      }
      for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) {
        std::allocator_traits<A>::destroy(allocator_, data_ + i);
      }
    }
    allocator_.deallocate(data_, curr_items_alloc_);
  }

  if (weights_ != nullptr) {
    AllocDouble(allocator_).deallocate(weights_, curr_items_alloc_);
  }

  if (marks_ != nullptr) {
    AllocBool(allocator_).deallocate(marks_, curr_items_alloc_);
  }
}

/**
 * Copy assignment (copy-and-swap): the previous state of *this ends up in the
 * local copy and is released when that copy goes out of scope.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>& var_opt_sketch<T, S, A>::operator=(const var_opt_sketch& other) {
  var_opt_sketch sk_copy(other);
  std::swap(k_, sk_copy.k_);
  std::swap(h_, sk_copy.h_);
  std::swap(m_, sk_copy.m_);
  std::swap(r_, sk_copy.r_);
  std::swap(n_, sk_copy.n_);
  std::swap(total_wt_r_, sk_copy.total_wt_r_);
  std::swap(rf_, sk_copy.rf_);
  std::swap(curr_items_alloc_, sk_copy.curr_items_alloc_);
  std::swap(filled_data_, sk_copy.filled_data_);
  std::swap(allocator_, sk_copy.allocator_);
  std::swap(data_, sk_copy.data_);
  std::swap(weights_, sk_copy.weights_);
  std::swap(num_marks_in_h_, sk_copy.num_marks_in_h_);
  std::swap(marks_, sk_copy.marks_);
  return *this;
}

/**
 * Move assignment: exchanges every member with other, so the previous state of
 * *this is released whenever other is destroyed.
 */
template<typename T, typename S, typename A>
var_opt_sketch<T, S, A>& var_opt_sketch<T, S, A>::operator=(var_opt_sketch&& other) {
  std::swap(k_, other.k_);
  std::swap(h_, other.h_);
  std::swap(m_, other.m_);
  std::swap(r_, other.r_);
  std::swap(n_, other.n_);
  std::swap(total_wt_r_, other.total_wt_r_);
  std::swap(rf_, other.rf_);
  std::swap(curr_items_alloc_, other.curr_items_alloc_);
  std::swap(filled_data_, other.filled_data_);
  std::swap(allocator_, other.allocator_);
  std::swap(data_, other.data_);
  std::swap(weights_, other.weights_);
  std::swap(num_marks_in_h_, other.num_marks_in_h_);
  std::swap(marks_, other.marks_);
  return *this;
}

/*
 * An empty sketch requires 8 bytes.
 *
 *

 * Long || Start Byte Adr:
 * Adr:
 *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
 *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|
 *

*
 *
 * A non-empty sketch requires 24 bytes of preamble for an under-full sample; once there are
 * at least k items the sketch uses 32 bytes of preamble.
 *
 * The count of items seen is limited to 48 bits (~281 trillion) even though there are adjacent
 * unused preamble bits. The acceptance probability for an item is a double in the range [0,1),
 * limiting us to 53 bits of randomness due to details of the IEEE floating point format. To
 * ensure meaningful probabilities as the items seen count approaches capacity, we intentionally
 * use slightly fewer bits.
 *
 * Following the header are weights for the heavy items, then marks in the event this is a gadget.
 * The serialized items come last.
 *
 *

 * Long || Start Byte Adr:
 * Adr:
 *      ||       0        |    1   |    2   |    3   |    4   |    5   |    6   |    7   |
 *  0   || Preamble_Longs | SerVer | FamID  |  Flags |---------Max Res. Size (K)---------|
 *
 *      ||       8        |    9   |   10   |   11   |   12   |   13   |   14   |   15   |
 *  1   ||---------------------------Items Seen Count (N)--------------------------------|
 *
 *      ||      16        |   17   |   18   |   19   |   20   |   21   |   22   |   23   |
 *  2   ||-------------Item Count in H---------------|-------Item Count in R-------------|
 *
 *      ||      24        |   25   |   26   |   27   |   28   |   29   |   30   |   31   |
 *  3   ||-------------------------------Total Weight in R-------------------------------|
 *

*/ // implementation for fixed-size arithmetic types (integral and floating point) template template::value, int>::type> size_t var_opt_sketch::get_serialized_size_bytes() const { if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; } size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3; num_bytes += h_ * sizeof(double); // weights if (marks_ != nullptr) { // marks num_bytes += (h_ / 8) + (h_ % 8 > 0); } num_bytes += (h_ + r_) * sizeof(TT); // the actual items return num_bytes; } // implementation for all other types template template::value, int>::type> size_t var_opt_sketch::get_serialized_size_bytes() const { if (is_empty()) { return PREAMBLE_LONGS_EMPTY << 3; } size_t num_bytes = (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL) << 3; num_bytes += h_ * sizeof(double); // weights if (marks_ != nullptr) { // marks num_bytes += (h_ / 8) + (h_ % 8 > 0); } // must iterate over the items for (auto it: *this) num_bytes += S().size_of_item(it.first); return num_bytes; } template std::vector> var_opt_sketch::serialize(unsigned header_size_bytes) const { const size_t size = header_size_bytes + get_serialized_size_bytes(); std::vector> bytes(size, 0, allocator_); uint8_t* ptr = bytes.data() + header_size_bytes; uint8_t* end_ptr = ptr + size; bool empty = is_empty(); uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL)); uint8_t first_byte = (preLongs & 0x3F) | ((static_cast(rf_)) << 6); uint8_t flags = (marks_ != nullptr ? 
GADGET_FLAG_MASK : 0); if (empty) { flags |= EMPTY_FLAG_MASK; } // first prelong uint8_t ser_ver(SER_VER); uint8_t family(FAMILY_ID); ptr += copy_to_mem(first_byte, ptr); ptr += copy_to_mem(ser_ver, ptr); ptr += copy_to_mem(family, ptr); ptr += copy_to_mem(flags, ptr); ptr += copy_to_mem(k_, ptr); if (!empty) { // second and third prelongs ptr += copy_to_mem(n_, ptr); ptr += copy_to_mem(h_, ptr); ptr += copy_to_mem(r_, ptr); // fourth prelong, if needed if (r_ > 0) { ptr += copy_to_mem(total_wt_r_, ptr); } // first h_ weights ptr += copy_to_mem(weights_, ptr, h_ * sizeof(double)); // first h_ marks as packed bytes iff we have a gadget if (marks_ != nullptr) { uint8_t val = 0; for (uint32_t i = 0; i < h_; ++i) { if (marks_[i]) { val |= 0x1 << (i & 0x7); } if ((i & 0x7) == 0x7) { ptr += copy_to_mem(val, ptr); val = 0; } } // write out any remaining values if ((h_ & 0x7) > 0) { ptr += copy_to_mem(val, ptr); } } // write the sample items, skipping the gap. Either h_ or r_ may be 0 ptr += S().serialize(ptr, end_ptr - ptr, data_, h_); ptr += S().serialize(ptr, end_ptr - ptr, &data_[h_ + 1], r_); } size_t bytes_written = ptr - bytes.data(); if (bytes_written != size) { throw std::logic_error("serialized size mismatch: " + std::to_string(bytes_written) + " != " + std::to_string(size)); } return bytes; } template void var_opt_sketch::serialize(std::ostream& os) const { const bool empty = (h_ == 0) && (r_ == 0); const uint8_t preLongs = (empty ? PREAMBLE_LONGS_EMPTY : (r_ == 0 ? PREAMBLE_LONGS_WARMUP : PREAMBLE_LONGS_FULL)); const uint8_t first_byte = (preLongs & 0x3F) | ((static_cast(rf_)) << 6); uint8_t flags = (marks_ != nullptr ? 
GADGET_FLAG_MASK : 0); if (empty) { flags |= EMPTY_FLAG_MASK; } // first prelong const uint8_t ser_ver(SER_VER); const uint8_t family(FAMILY_ID); write(os, first_byte); write(os, ser_ver); write(os, family); write(os, flags); write(os, k_); if (!empty) { // second and third prelongs write(os, n_); write(os, h_); write(os, r_); // fourth prelong, if needed if (r_ > 0) { write(os, total_wt_r_); } // write the first h_ weights write(os, weights_, h_ * sizeof(double)); // write the first h_ marks as packed bytes iff we have a gadget if (marks_ != nullptr) { uint8_t val = 0; for (uint32_t i = 0; i < h_; ++i) { if (marks_[i]) { val |= 0x1 << (i & 0x7); } if ((i & 0x7) == 0x7) { write(os, val); val = 0; } } // write out any remaining values if ((h_ & 0x7) > 0) { write(os, val); } } // write the sample items, skipping the gap. Either h_ or r_ may be 0 S().serialize(os, data_, h_); S().serialize(os, &data_[h_ + 1], r_); } } template var_opt_sketch var_opt_sketch::deserialize(const void* bytes, size_t size, const A& allocator) { ensure_minimum_memory(size, 8); const char* ptr = static_cast(bytes); const char* base = ptr; const char* end_ptr = ptr + size; uint8_t first_byte; ptr += copy_from_mem(ptr, first_byte); uint8_t preamble_longs = first_byte & 0x3f; resize_factor rf = static_cast((first_byte >> 6) & 0x03); uint8_t serial_version; ptr += copy_from_mem(ptr, serial_version); uint8_t family_id; ptr += copy_from_mem(ptr, family_id); uint8_t flags; ptr += copy_from_mem(ptr, flags); uint32_t k; ptr += copy_from_mem(ptr, k); check_preamble_longs(preamble_longs, flags); check_family_and_serialization_version(family_id, serial_version); ensure_minimum_memory(size, preamble_longs << 3); const bool is_empty = flags & EMPTY_FLAG_MASK; const bool is_gadget = flags & GADGET_FLAG_MASK; if (is_empty) { return var_opt_sketch(k, rf, is_gadget, allocator); } // second and third prelongs uint64_t n; uint32_t h, r; ptr += copy_from_mem(ptr, n); ptr += copy_from_mem(ptr, h); ptr += 
copy_from_mem(ptr, r); const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf); // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating double total_wt_r = 0.0; if (preamble_longs == PREAMBLE_LONGS_FULL) { ptr += copy_from_mem(ptr, total_wt_r); if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) { throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. " "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r)); } } else { total_wt_r = 0.0; } // read the first h_ weights, fill in rest of array with -1.0 check_memory_size(ptr - base + (h * sizeof(double)), size); std::unique_ptr weights(AllocDouble(allocator).allocate(array_size), weights_deleter(array_size, allocator)); double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete ptr += copy_from_mem(ptr, wts, h * sizeof(double)); for (size_t i = 0; i < h; ++i) { if (!(wts[i] > 0.0)) { throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i])); } } std::fill(wts + h, wts + array_size, -1.0); // read the first h_ marks as packed bytes iff we have a gadget uint32_t num_marks_in_h = 0; std::unique_ptr marks(nullptr, marks_deleter(array_size, allocator)); if (is_gadget) { uint8_t val = 0; marks = std::unique_ptr(AllocBool(allocator).allocate(array_size), marks_deleter(array_size, allocator)); const size_t size_marks = (h / 8) + (h % 8 > 0 ? 1 : 0); check_memory_size(ptr - base + size_marks, size); for (uint32_t i = 0; i < h; ++i) { if ((i & 0x7) == 0x0) { // should trigger on first iteration ptr += copy_from_mem(ptr, val); } marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1; num_marks_in_h += (marks.get()[i] ? 1 : 0); } } // read the sample items, skipping the gap. 
Either h_ or r_ may be 0 items_deleter deleter(array_size, allocator); std::unique_ptr items(A(allocator).allocate(array_size), deleter); ptr += S().deserialize(ptr, end_ptr - ptr, items.get(), h); items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid ptr += S().deserialize(ptr, end_ptr - ptr, &(items.get()[h + 1]), r); items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid return var_opt_sketch(k, h, (r > 0 ? 1 : 0), r, n, total_wt_r, rf, array_size, false, std::move(items), std::move(weights), num_marks_in_h, std::move(marks), allocator); } template var_opt_sketch var_opt_sketch::deserialize(std::istream& is, const A& allocator) { const auto first_byte = read(is); uint8_t preamble_longs = first_byte & 0x3f; const resize_factor rf = static_cast((first_byte >> 6) & 0x03); const auto serial_version = read(is); const auto family_id = read(is); const auto flags = read(is); const auto k = read(is); check_preamble_longs(preamble_longs, flags); check_family_and_serialization_version(family_id, serial_version); const bool is_empty = flags & EMPTY_FLAG_MASK; const bool is_gadget = flags & GADGET_FLAG_MASK; if (is_empty) { if (!is.good()) throw std::runtime_error("error reading from std::istream"); else return var_opt_sketch(k, rf, is_gadget, allocator); } // second and third prelongs const auto n = read(is); const auto h = read(is); const auto r = read(is); const uint32_t array_size = validate_and_get_target_size(preamble_longs, k, n, h, r, rf); // current_items_alloc_ is set but validate R region weight (4th prelong), if needed, before allocating double total_wt_r = 0.0; if (preamble_longs == PREAMBLE_LONGS_FULL) { total_wt_r = read(is); if (std::isnan(total_wt_r) || r == 0 || total_wt_r <= 0.0) { throw std::invalid_argument("Possible corruption: deserializing in full mode but r = 0 or invalid R weight. 
" "Found r = " + std::to_string(r) + ", R region weight = " + std::to_string(total_wt_r)); } } // read the first h weights, fill remainder with -1.0 std::unique_ptr weights(AllocDouble(allocator).allocate(array_size), weights_deleter(array_size, allocator)); double* wts = weights.get(); // to avoid lots of .get() calls -- do not delete read(is, wts, h * sizeof(double)); for (size_t i = 0; i < h; ++i) { if (!(wts[i] > 0.0)) { throw std::invalid_argument("Possible corruption: Non-positive weight when deserializing: " + std::to_string(wts[i])); } } std::fill(wts + h, wts + array_size, -1.0); // read the first h_ marks as packed bytes iff we have a gadget uint32_t num_marks_in_h = 0; std::unique_ptr marks(nullptr, marks_deleter(array_size, allocator)); if (is_gadget) { marks = std::unique_ptr(AllocBool(allocator).allocate(array_size), marks_deleter(array_size, allocator)); uint8_t val = 0; for (uint32_t i = 0; i < h; ++i) { if ((i & 0x7) == 0x0) { // should trigger on first iteration val = read(is); } marks.get()[i] = ((val >> (i & 0x7)) & 0x1) == 1; num_marks_in_h += (marks.get()[i] ? 1 : 0); } } // read the sample items, skipping the gap. Either h or r may be 0 items_deleter deleter(array_size, allocator); std::unique_ptr items(A(allocator).allocate(array_size), deleter); S().deserialize(is, items.get(), h); // aka &data_[0] items.get_deleter().set_h(h); // serde didn't throw, so the items are now valid S().deserialize(is, &(items.get()[h + 1]), r); items.get_deleter().set_r(r); // serde didn't throw, so the items are now valid if (!is.good()) throw std::runtime_error("error reading from std::istream"); return var_opt_sketch(k, h, (r > 0 ? 
1 : 0), r, n, total_wt_r, rf, array_size, false, std::move(items), std::move(weights), num_marks_in_h, std::move(marks), allocator); } template bool var_opt_sketch::is_empty() const { return (h_ == 0 && r_ == 0); } template void var_opt_sketch::reset() { const uint32_t prev_alloc = curr_items_alloc_; const uint32_t ceiling_lg_k = to_log_2(ceiling_power_of_2(k_)); const uint32_t initial_lg_size = starting_sub_multiple(ceiling_lg_k, rf_, MIN_LG_ARR_ITEMS); curr_items_alloc_ = get_adjusted_size(k_, 1 << initial_lg_size); if (curr_items_alloc_ == k_) { // if full size, need to leave 1 for the gap ++curr_items_alloc_; } if (filled_data_) { // destroy everything const size_t num_to_destroy = std::min(k_ + 1, prev_alloc); for (size_t i = 0; i < num_to_destroy; ++i) allocator_.destroy(data_ + i); } else { // skip gap or anything unused at the end for (size_t i = 0; i < h_; ++i) allocator_.destroy(data_+ i); for (size_t i = h_ + 1; i < h_ + r_ + 1; ++i) allocator_.destroy(data_ + i); } if (curr_items_alloc_ < prev_alloc) { const bool is_gadget = (marks_ != nullptr); allocator_.deallocate(data_, prev_alloc); AllocDouble(allocator_).deallocate(weights_, prev_alloc); if (marks_ != nullptr) AllocBool(allocator_).deallocate(marks_, prev_alloc); allocate_data_arrays(curr_items_alloc_, is_gadget); } n_ = 0; h_ = 0; m_ = 0; r_ = 0; num_marks_in_h_ = 0; total_wt_r_ = 0.0; filled_data_ = false; } template uint64_t var_opt_sketch::get_n() const { return n_; } template uint32_t var_opt_sketch::get_k() const { return k_; } template uint32_t var_opt_sketch::get_num_samples() const { const uint32_t num_in_sketch = h_ + r_; return (num_in_sketch < k_ ? 
num_in_sketch : k_); } template void var_opt_sketch::update(const T& item, double weight) { update(item, weight, false); } template void var_opt_sketch::update(T&& item, double weight) { update(std::move(item), weight, false); } template string var_opt_sketch::to_string() const { std::basic_ostringstream, AllocChar > os; os << "### VarOpt SUMMARY: " << std::endl; os << " k : " << k_ << std::endl; os << " h : " << h_ << std::endl; os << " r : " << r_ << std::endl; os << " weight_r : " << total_wt_r_ << std::endl; os << " Current size : " << curr_items_alloc_ << std::endl; os << " Resize factor: " << (1 << rf_) << std::endl; os << "### END SKETCH SUMMARY" << std::endl; return os.str(); } template string var_opt_sketch::items_to_string() const { std::basic_ostringstream, AllocChar > os; os << "### Sketch Items" << std::endl; int idx = 0; for (auto record : *this) { os << idx << ": " << record.first << "\twt = " << record.second << std::endl; ++idx; } return os.str(); } template string var_opt_sketch::items_to_string(bool print_gap) const { std::basic_ostringstream, AllocChar > os; os << "### Sketch Items" << std::endl; const uint32_t array_length = (n_ < k_ ? n_ : k_ + 1); for (uint32_t i = 0, display_idx = 0; i < array_length; ++i) { if (i == h_ && print_gap) { os << i << ": GAP" << std::endl; ++display_idx; } else { os << i << ": " << data_[i] << "\twt = "; if (weights_[i] == -1.0) { os << get_tau() << "\t(-1.0)" << std::endl; } else { os << weights_[i] << std::endl; } ++display_idx; } } return os.str(); } template template void var_opt_sketch::update(O&& item, double weight, bool mark) { if (weight < 0.0 || std::isnan(weight) || std::isinf(weight)) { throw std::invalid_argument("Item weights must be nonnegative and finite. 
Found: " + std::to_string(weight)); } else if (weight == 0.0) { return; } ++n_; if (r_ == 0) { // exact mode update_warmup_phase(std::forward(item), weight, mark); } else { // sketch is in estimation mode so we can make the following check, // although very conservative to check every time if ((h_ != 0) && (peek_min() < get_tau())) throw std::logic_error("sketch not in valid estimation mode"); // what tau would be if deletion candidates turn out to be R plus the new item // note: (r_ + 1) - 1 is intentional const double hypothetical_tau = (weight + total_wt_r_) / ((r_ + 1) - 1); // is new item's turn to be considered for reservoir? const double condition1 = (h_ == 0) || (weight <= peek_min()); // is new item light enough for reservoir? const double condition2 = weight < hypothetical_tau; if (condition1 && condition2) { update_light(std::forward(item), weight, mark); } else if (r_ == 1) { update_heavy_r_eq1(std::forward(item), weight, mark); } else { update_heavy_general(std::forward(item), weight, mark); } } } template template void var_opt_sketch::update_warmup_phase(O&& item, double weight, bool mark) { // seems overly cautious if (r_ > 0 || m_ != 0 || h_ > k_) throw std::logic_error("invalid sketch state during warmup"); if (h_ >= curr_items_alloc_) { grow_data_arrays(); } // store items as they come in until full new (&data_[h_]) T(std::forward(item)); weights_[h_] = weight; if (marks_ != nullptr) { marks_[h_] = mark; } ++h_; num_marks_in_h_ += mark ? 1 : 0; // check if need to heapify if (h_ > k_) { filled_data_ = true; transition_from_warmup(); } } /* In the "light" case the new item has weight <= old_tau, so would appear to the right of the R items in a hypothetical reverse-sorted list. 
It is easy to prove that it is light enough to be part of this round's downsampling */ template template void var_opt_sketch::update_light(O&& item, double weight, bool mark) { if (r_ == 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during light warmup"); const uint32_t m_slot = h_; // index of the gap, which becomes the M region if (filled_data_) { data_[m_slot] = std::forward(item); } else { new (&data_[m_slot]) T(std::forward(item)); filled_data_ = true; } weights_[m_slot] = weight; if (marks_ != nullptr) { marks_[m_slot] = mark; } ++m_; grow_candidate_set(total_wt_r_ + weight, r_ + 1); } /* In the "heavy" case the new item has weight > old_tau, so would appear to the left of items in R in a hypothetical reverse-sorted list and might or might not be light enough be part of this round's downsampling. [After first splitting off the R=1 case] we greatly simplify the code by putting the new item into the H heap whether it needs to be there or not. In other words, it might go into the heap and then come right back out, but that should be okay because pseudo_heavy items cannot predominate in long streams unless (max wt) / (min wt) > o(exp(N)) */ template template void var_opt_sketch::update_heavy_general(O&& item, double weight, bool mark) { if (r_ < 2 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy general update"); // put into H, although may come back out momentarily push(std::forward(item), weight, mark); grow_candidate_set(total_wt_r_, r_); } /* The analysis of this case is similar to that of the general heavy case. 
The one small technical difference is that since R < 2, we must grab an M item to have a valid starting point for continue_by_growing_candidate_set () */ template template void var_opt_sketch::update_heavy_r_eq1(O&& item, double weight, bool mark) { if (r_ != 1 || m_ != 0 || (r_ + h_) != k_) throw std::logic_error("invalid sketch state during heavy r=1 update"); push(std::forward(item), weight, mark); // new item into H pop_min_to_m_region(); // pop lightest back into M // Any set of two items is downsample-able to one item, // so the two lightest items are a valid starting point for the following const uint32_t m_slot = k_ - 1; // array is k+1, 1 in R, so slot before is M grow_candidate_set(weights_[m_slot] + total_wt_r_, 2); } /** * Decreases sketch's value of k by 1, updating stored values as needed. * *

Subject to certain pre-conditions, decreasing k causes tau to increase. This fact is used by * the unioning algorithm to force "marked" items out of H and into the reservoir region.