// Copyright (c) 2016-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <map>
#include <string>
#include <vector>

#include "db/compaction_iteration_stats.h"
#include "db/dbformat.h"
#include "db/pinned_iterators_manager.h"
#include "db/version_edit.h"
#include "include/rocksdb/comparator.h"
#include "include/rocksdb/types.h"
#include "table/internal_iterator.h"
#include "table/scoped_arena_iterator.h"
#include "table/table_builder.h"
#include "util/kv_map.h"

namespace rocksdb {

// A RangeDelAggregator aggregates range deletion tombstones as they are
// encountered in memtables/SST files. It provides methods that check whether a
// key is covered by range tombstones or write the relevant tombstones to a new
// SST file.
class RangeDelAggregator {
 public:
  // @param snapshots These are used to organize the tombstones into snapshot
  //    stripes, which is the seqnum range between consecutive snapshots,
  //    including the higher snapshot and excluding the lower one. Currently,
  //    this is used by ShouldDelete() to prevent deletion of keys that are
  //    covered by range tombstones in other snapshot stripes. This constructor
  //    is used for writes (flush/compaction). All DB snapshots are provided
  //    such that no keys are removed that are uncovered according to any DB
  //    snapshot.
  // Note this overload does not lazily initialize Rep.
  RangeDelAggregator(const InternalKeyComparator& icmp,
                     const std::vector<SequenceNumber>& snapshots,
                     bool collapse_deletions = true);

  // @param upper_bound Similar to snapshots above, except with a single
  //    snapshot, which allows us to store the snapshot on the stack and defer
  //    initialization of heap-allocating members (in Rep) until the first
  //    range deletion is encountered. This constructor is used for reads
  //    (get/iterator), for which only the user snapshot (upper_bound) is
  //    provided such that the seqnum space is divided into two stripes. Only
  //    the older stripe will be used by ShouldDelete().
  RangeDelAggregator(const InternalKeyComparator& icmp,
                     SequenceNumber upper_bound,
                     bool collapse_deletions = false);

  // We maintain position in the tombstone map across calls to ShouldDelete.
  // The caller may wish to specify a mode to optimize positioning the iterator
  // during the next call to ShouldDelete. The non-kFullScan modes are only
  // available when deletion collapsing is enabled.
  //
  // For example, if we invoke Next() on an iterator, kForwardTraversal should
  // be specified to advance one-by-one through deletions until one is found
  // with its interval containing the key. This will typically be faster than
  // doing a full binary search (kBinarySearch).
  enum RangePositioningMode {
    kFullScan,  // used iff collapse_deletions_ == false
    kForwardTraversal,
    kBackwardTraversal,
    kBinarySearch,
  };

  // Returns whether the key should be deleted, which is the case when it is
  // covered by a range tombstone residing in the same snapshot stripe.
  // @param mode If collapse_deletions_ is true, this dictates how we will find
  //    the deletion whose interval contains this key. Otherwise, its value
  //    must be kFullScan, indicating a linear scan from the beginning.
  bool ShouldDelete(const ParsedInternalKey& parsed,
                    RangePositioningMode mode = kFullScan);
  bool ShouldDelete(const Slice& internal_key,
                    RangePositioningMode mode = kFullScan);

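  // A minimal caller-side sketch (hypothetical names, not part of this API):
  // a flush/compaction-style caller that visits keys in ascending order can
  // pass kForwardTraversal so each lookup resumes from the previously found
  // tombstone instead of re-running a binary search. Here `range_del_agg` is
  // assumed to be an aggregator built with the snapshots constructor
  // (collapse_deletions == true) and `input` is some InternalIterator over the
  // keys being written.
  //
  //   ParsedInternalKey ikey;
  //   for (input->SeekToFirst(); input->Valid(); input->Next()) {
  //     if (!ParseInternalKey(input->key(), &ikey)) {
  //       continue;  // corrupt key; real callers surface this as an error
  //     }
  //     if (range_del_agg.ShouldDelete(
  //             ikey, RangeDelAggregator::kForwardTraversal)) {
  //       continue;  // covered by a tombstone in the same snapshot stripe
  //     }
  //     // ... otherwise emit the key ...
  //   }
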
  // Checks whether range deletions cover any keys between `start` and `end`,
  // inclusive.
  //
  // @param start User key representing beginning of range to check for
  //    overlap.
  // @param end User key representing end of range to check for overlap. This
  //    argument is inclusive, so the existence of a range deletion covering
  //    `end` causes this to return true.
  bool IsRangeOverlapped(const Slice& start, const Slice& end);

  bool ShouldAddTombstones(bool bottommost_level = false);

  // Adds tombstones to the tombstone aggregation structure maintained by this
  // object.
  // @return non-OK status if any of the tombstone keys are corrupted.
  Status AddTombstones(std::unique_ptr<InternalIterator> input);

  // Resets iterators maintained across calls to ShouldDelete(). This may be
  // called when the tombstones change, or the owner may call it explicitly,
  // e.g., if it's an iterator that just seeked to an arbitrary position. The
  // effect of invalidation is that the following call to ShouldDelete() will
  // binary search for its tombstone.
  void InvalidateTombstoneMapPositions();

  // Writes tombstones covering a range to a table builder.
  // @param extend_before_min_key If true, the range of tombstones to be added
  //    to the TableBuilder starts from the beginning of the key-range;
  //    otherwise, it starts from meta->smallest.
  // @param lower_bound/upper_bound Any range deletion with [start_key,
  //    end_key) that overlaps the target range [*lower_bound, *upper_bound)
  //    is added to the builder. If lower_bound is nullptr, the target range
  //    extends infinitely to the left. If upper_bound is nullptr, the target
  //    range extends infinitely to the right. If both are nullptr, the target
  //    range extends infinitely in both directions, i.e., all range deletions
  //    are added to the builder.
  // @param meta The file's metadata. We modify the begin and end keys
  //    according to the range tombstones added to this file such that the
  //    read path does not miss range tombstones that cover gaps
  //    before/after/between files in a level. lower_bound/upper_bound above
  //    constrain how far file boundaries can be extended.
  // @param bottommost_level If true, we will filter out any tombstones
  //    belonging to the oldest snapshot stripe, because all keys potentially
  //    covered by this tombstone are guaranteed to have been deleted by
  //    compaction.
  void AddToBuilder(TableBuilder* builder, const Slice* lower_bound,
                    const Slice* upper_bound, FileMetaData* meta,
                    CompactionIterationStats* range_del_out_stats = nullptr,
                    bool bottommost_level = false);

  bool IsEmpty();

 private:
  // Maps tombstone user start key -> tombstone object
  typedef std::multimap<Slice, RangeTombstone, stl_wrappers::LessOfComparator>
      TombstoneMap;

  // Also maintains position in TombstoneMap last seen by ShouldDelete(). The
  // end iterator indicates invalidation (e.g., if AddTombstones() changes the
  // underlying map). End iterator cannot be invalidated.
  struct PositionalTombstoneMap {
    explicit PositionalTombstoneMap(TombstoneMap _raw_map)
        : raw_map(std::move(_raw_map)), iter(raw_map.end()) {}
    PositionalTombstoneMap(const PositionalTombstoneMap&) = delete;
    PositionalTombstoneMap(PositionalTombstoneMap&& other)
        : raw_map(std::move(other.raw_map)), iter(raw_map.end()) {}

    TombstoneMap raw_map;
    TombstoneMap::const_iterator iter;
  };

  // Maps snapshot seqnum -> map of tombstones that fall in that stripe, i.e.,
  // their seqnums are greater than the next smaller snapshot's seqnum.
  typedef std::map<SequenceNumber, PositionalTombstoneMap> StripeMap;

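  // Illustrative example (made-up seqnums): with DB snapshots at seqnums 5
  // and 10, the stripes are [0, 5], [6, 10], and everything newer than 10. A
  // tombstone with seqnum 7 falls in the [6, 10] stripe; ShouldDelete() lets
  // it cover a key with seqnum 6 from that same stripe, but not a key with
  // seqnum 3, because the snapshot at 5 must continue to see that key.
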
  struct Rep {
    StripeMap stripe_map_;
    PinnedIteratorsManager pinned_iters_mgr_;
  };

  // Initializes rep_ lazily. This aggregator object is constructed for every
  // read, so expensive members should only be created when necessary, i.e.,
  // once the first range deletion is encountered.
  void InitRep(const std::vector<SequenceNumber>& snapshots);

  PositionalTombstoneMap& GetPositionalTombstoneMap(SequenceNumber seq);
  Status AddTombstone(RangeTombstone tombstone);

  SequenceNumber upper_bound_;
  std::unique_ptr<Rep> rep_;
  const InternalKeyComparator& icmp_;
  // collapse range deletions so they're binary searchable
  const bool collapse_deletions_;
};

}  // namespace rocksdb
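
// Illustrative read-path sketch (hypothetical caller code, not part of this
// header): the single-snapshot constructor keeps the snapshot on the stack and
// defers allocating Rep until the first range deletion is encountered, so
// lookups that never meet a tombstone stay cheap. `range_del_iter` stands in
// for any std::unique_ptr<InternalIterator> yielding a memtable's or SST
// file's range tombstones.
//
//   rocksdb::RangeDelAggregator range_del_agg(icmp, read_seqnum);
//   rocksdb::Status s =
//       range_del_agg.AddTombstones(std::move(range_del_iter));
//   if (s.ok() && !range_del_agg.IsEmpty()) {
//     bool covered = range_del_agg.ShouldDelete(lookup_internal_key);
//   }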