// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // // Decodes the blocks generated by block_builder.cc. #include "table/block.h" #include #include #include #include #include "monitoring/perf_context_imp.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/comparator.h" #include "table/block_prefix_index.h" #include "table/format.h" #include "util/coding.h" #include "util/logging.h" namespace rocksdb { // Helper routine: decode the next block entry starting at "p", // storing the number of shared key bytes, non_shared key bytes, // and the length of the value in "*shared", "*non_shared", and // "*value_length", respectively. Will not derefence past "limit". // // If any errors are detected, returns nullptr. Otherwise, returns a // pointer to the key delta (just past the three decoded values). static inline const char* DecodeEntry(const char* p, const char* limit, uint32_t* shared, uint32_t* non_shared, uint32_t* value_length) { if (limit - p < 3) return nullptr; *shared = reinterpret_cast(p)[0]; *non_shared = reinterpret_cast(p)[1]; *value_length = reinterpret_cast(p)[2]; if ((*shared | *non_shared | *value_length) < 128) { // Fast path: all three values are encoded in one byte each p += 3; } else { if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr; if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr; } if (static_cast(limit - p) < (*non_shared + *value_length)) { return nullptr; } return p; } void BlockIter::Next() { assert(Valid()); ParseNextKey(); } void BlockIter::Prev() { assert(Valid()); assert(prev_entries_idx_ == -1 || static_cast(prev_entries_idx_) < prev_entries_.size()); // Check if we can use cached prev_entries_ if (prev_entries_idx_ > 0 && prev_entries_[prev_entries_idx_].offset == current_) { // Read cached CachedPrevEntry prev_entries_idx_--; const CachedPrevEntry& current_prev_entry = prev_entries_[prev_entries_idx_]; const char* key_ptr = nullptr; if (current_prev_entry.key_ptr != nullptr) { // The key is not delta encoded and stored in the data block key_ptr = current_prev_entry.key_ptr; key_pinned_ = true; } else { // The key is delta encoded and stored in prev_entries_keys_buff_ key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset; key_pinned_ = false; } const Slice current_key(key_ptr, current_prev_entry.key_size); current_ = current_prev_entry.offset; key_.SetInternalKey(current_key, false /* copy */); value_ = current_prev_entry.value; return; } // Clear prev entries cache prev_entries_idx_ = -1; prev_entries_.clear(); prev_entries_keys_buff_.clear(); // Scan backwards to a restart point before current_ const uint32_t original = current_; while (GetRestartPoint(restart_index_) >= original) { if (restart_index_ == 0) { // No more entries current_ = restarts_; restart_index_ = num_restarts_; return; } restart_index_--; } SeekToRestartPoint(restart_index_); do { if (!ParseNextKey()) { break; } Slice current_key = key(); if (key_.IsKeyPinned()) { // The key is not delta encoded prev_entries_.emplace_back(current_, current_key.data(), 0, current_key.size(), value()); } else { // The key is delta encoded, cache decoded key in buffer size_t new_key_offset = prev_entries_keys_buff_.size(); prev_entries_keys_buff_.append(current_key.data(), current_key.size()); prev_entries_.emplace_back(current_, nullptr, new_key_offset, current_key.size(), value()); } // Loop until end of current entry hits the start of original entry } while (NextEntryOffset() < original); prev_entries_idx_ = static_cast(prev_entries_.size()) - 1; } void BlockIter::Seek(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool ok = false; if (prefix_index_) { ok = PrefixSeek(target, &index); } else { ok = BinarySeek(target, 0, num_restarts_ - 1, &index); } if (!ok) { return; } SeekToRestartPoint(index); // Linear search (within restart block) for first key >= target while (true) { if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) { return; } } } void BlockIter::SeekForPrev(const Slice& target) { PERF_TIMER_GUARD(block_seek_nanos); if (data_ == nullptr) { // Not init yet return; } uint32_t index = 0; bool ok = false; ok = BinarySeek(target, 0, num_restarts_ - 1, &index); if (!ok) { return; } SeekToRestartPoint(index); // Linear search (within restart block) for first key >= target while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) { } if (!Valid()) { SeekToLast(); } else { while (Valid() && Compare(key_.GetInternalKey(), target) > 0) { Prev(); } } } void BlockIter::SeekToFirst() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(0); ParseNextKey(); } void BlockIter::SeekToLast() { if (data_ == nullptr) { // Not init yet return; } SeekToRestartPoint(num_restarts_ - 1); while (ParseNextKey() && NextEntryOffset() < restarts_) { // Keep skipping } } void BlockIter::CorruptionError() { current_ = restarts_; restart_index_ = num_restarts_; status_ = Status::Corruption("bad entry in block"); key_.Clear(); value_.clear(); } bool BlockIter::ParseNextKey() { current_ = NextEntryOffset(); const char* p = data_ + current_; const char* limit = data_ + restarts_; // Restarts come right after data if (p >= limit) { // No more entries to return. Mark as invalid. current_ = restarts_; restart_index_ = num_restarts_; return false; } // Decode next entry uint32_t shared, non_shared, value_length; p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); if (p == nullptr || key_.Size() < shared) { CorruptionError(); return false; } else { if (shared == 0) { // If this key dont share any bytes with prev key then we dont need // to decode it and can use it's address in the block directly. key_.SetInternalKey(Slice(p, non_shared), false /* copy */); key_pinned_ = true; } else { // This key share `shared` bytes with prev key, we need to decode it key_.TrimAppend(shared, p, non_shared); key_pinned_ = false; } if (global_seqno_ != kDisableGlobalSequenceNumber) { // If we are reading a file with a global sequence number we should // expect that all encoded sequence numbers are zeros and any value // type is kTypeValue, kTypeMerge or kTypeDeletion assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0); ValueType value_type = ExtractValueType(key_.GetInternalKey()); assert(value_type == ValueType::kTypeValue || value_type == ValueType::kTypeMerge || value_type == ValueType::kTypeDeletion); if (key_pinned_) { // TODO(tec): Investigate updating the seqno in the loaded block // directly instead of doing a copy and update. // We cannot use the key address in the block directly because // we have a global_seqno_ that will overwrite the encoded one. key_.OwnKey(); key_pinned_ = false; } key_.UpdateInternalKey(global_seqno_, value_type); } value_ = Slice(p + non_shared, value_length); while (restart_index_ + 1 < num_restarts_ && GetRestartPoint(restart_index_ + 1) < current_) { ++restart_index_; } return true; } } // Binary search in restart array to find the first restart point that // is either the last restart point with a key less than target, // which means the key of next restart point is larger than target, or // the first restart point with a key = target bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right, uint32_t* index) { assert(left <= right); while (left < right) { uint32_t mid = (left + right + 1) / 2; uint32_t region_offset = GetRestartPoint(mid); uint32_t shared, non_shared, value_length; const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, &non_shared, &value_length); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return false; } Slice mid_key(key_ptr, non_shared); int cmp = Compare(mid_key, target); if (cmp < 0) { // Key at "mid" is smaller than "target". Therefore all // blocks before "mid" are uninteresting. left = mid; } else if (cmp > 0) { // Key at "mid" is >= "target". Therefore all blocks at or // after "mid" are uninteresting. right = mid - 1; } else { left = right = mid; } } *index = left; return true; } // Compare target key and the block key of the block of `block_index`. // Return -1 if error. int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) { uint32_t region_offset = GetRestartPoint(block_index); uint32_t shared, non_shared, value_length; const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_, &shared, &non_shared, &value_length); if (key_ptr == nullptr || (shared != 0)) { CorruptionError(); return 1; // Return target is smaller } Slice block_key(key_ptr, non_shared); return Compare(block_key, target); } // Binary search in block_ids to find the first block // with a key >= target bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids, uint32_t left, uint32_t right, uint32_t* index) { assert(left <= right); uint32_t left_bound = left; while (left <= right) { uint32_t mid = (right + left) / 2; int cmp = CompareBlockKey(block_ids[mid], target); if (!status_.ok()) { return false; } if (cmp < 0) { // Key at "target" is larger than "mid". Therefore all // blocks before or at "mid" are uninteresting. left = mid + 1; } else { // Key at "target" is <= "mid". Therefore all blocks // after "mid" are uninteresting. // If there is only one block left, we found it. if (left == right) break; right = mid; } } if (left == right) { // In one of the two following cases: // (1) left is the first one of block_ids // (2) there is a gap of blocks between block of `left` and `left-1`. // we can further distinguish the case of key in the block or key not // existing, by comparing the target key and the key of the previous // block to the left of the block found. if (block_ids[left] > 0 && (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) && CompareBlockKey(block_ids[left] - 1, target) > 0) { current_ = restarts_; return false; } *index = block_ids[left]; return true; } else { assert(left > right); // Mark iterator invalid current_ = restarts_; return false; } } bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) { assert(prefix_index_); uint32_t* block_ids = nullptr; uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids); if (num_blocks == 0) { current_ = restarts_; return false; } else { return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index); } } uint32_t Block::NumRestarts() const { assert(size_ >= 2*sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } Block::Block(BlockContents&& contents, SequenceNumber _global_seqno, size_t read_amp_bytes_per_bit, Statistics* statistics) : contents_(std::move(contents)), data_(contents_.data.data()), size_(contents_.data.size()), global_seqno_(_global_seqno) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { restart_offset_ = static_cast(size_) - (1 + NumRestarts()) * sizeof(uint32_t); if (restart_offset_ > size_ - sizeof(uint32_t)) { // The size is too small for NumRestarts() and therefore // restart_offset_ wrapped around. size_ = 0; } } if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) { read_amp_bitmap_.reset(new BlockReadAmpBitmap( restart_offset_, read_amp_bytes_per_bit, statistics)); } } InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter, bool total_order_seek, Statistics* stats) { if (size_ < 2*sizeof(uint32_t)) { if (iter != nullptr) { iter->SetStatus(Status::Corruption("bad block contents")); return iter; } else { return NewErrorInternalIterator(Status::Corruption("bad block contents")); } } const uint32_t num_restarts = NumRestarts(); if (num_restarts == 0) { if (iter != nullptr) { iter->SetStatus(Status::OK()); return iter; } else { return NewEmptyInternalIterator(); } } else { BlockPrefixIndex* prefix_index_ptr = total_order_seek ? nullptr : prefix_index_.get(); if (iter != nullptr) { iter->Initialize(cmp, data_, restart_offset_, num_restarts, prefix_index_ptr, global_seqno_, read_amp_bitmap_.get()); } else { iter = new BlockIter(cmp, data_, restart_offset_, num_restarts, prefix_index_ptr, global_seqno_, read_amp_bitmap_.get()); } if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ read_amp_bitmap_->SetStatistics(stats); } } } return iter; } void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) { prefix_index_.reset(prefix_index); } size_t Block::ApproximateMemoryUsage() const { size_t usage = usable_size(); if (prefix_index_) { usage += prefix_index_->ApproximateMemoryUsage(); } return usage; } // NOTE vmx 2017-06-08: This code is based on `Block::NewIterator`, it was // a bit simplified. The major difference is that not a `BlockIter()` but a // `RtreeBlockIter` is created. InternalIterator* Block::NewRtreeIterator(const Comparator* cmp, RtreeBlockIter* iter, Statistics* stats, RtreeIteratorContext* context) { if (size_ < 2*sizeof(uint32_t)) { if (iter != nullptr) { iter->SetStatus(Status::Corruption("bad block contents")); return iter; } else { return NewErrorInternalIterator(Status::Corruption("bad block contents")); } } const uint32_t num_restarts = NumRestarts(); if (num_restarts == 0) { if (iter != nullptr) { iter->SetStatus(Status::OK()); return iter; } else { return NewEmptyInternalIterator(); } } else { if (iter != nullptr) { iter->Initialize(cmp, data_, restart_offset_, num_restarts, global_seqno_, read_amp_bitmap_.get(), context->query_mbb); } else { iter = new RtreeBlockIter(cmp, data_, restart_offset_, num_restarts, global_seqno_, read_amp_bitmap_.get(), context->query_mbb); } if (read_amp_bitmap_) { if (read_amp_bitmap_->GetStatistics() != stats) { // DB changed the Statistics pointer, we need to notify read_amp_bitmap_ read_amp_bitmap_->SetStatistics(stats); } } } return iter; } void RtreeBlockIter::Next() { assert(Valid()); ParseNextKey(); } void RtreeBlockIter::SeekToFirst() { if (data_ == nullptr) { // Not init yet return; } BlockIter::SeekToRestartPoint(0); ParseNextKey(); } // Calls the `ParseNextKey()` from `BlockIter`, but skips keys if an Mbb is // given and it doesn't intersect bool RtreeBlockIter::ParseNextKey() { bool ret = false; do { ret = BlockIter::ParseNextKey(); } while (Valid() && !IntersectMbb(key(), query_mbb_)); return ret; } bool RtreeBlockIter::IntersectMbb(const Slice& aa_orig, Mbb bb) { // If the query bounding box is empty, return true, which corresponds to a // full table scan if (bb.empty()) { return true; } // Make a mutable copy of the slice Slice aa = Slice(aa_orig); // The key consists of the Keypath, the Internal Id and two more dimensions Slice ignore = Slice(); GetLengthPrefixedSlice(&aa, &ignore); uint64_t aa_iid = *reinterpret_cast(aa.data()); // If the bounding boxes don't intersect in one dimension, they won't // intersect at all, hence we can return early if (aa_iid > bb.iid.max || bb.iid.min > aa_iid) { return false; } double aa_min; double aa_max; aa_min = *reinterpret_cast(aa.data() + 8); aa_max = *reinterpret_cast(aa.data() + 16); if (aa_min > bb.first.max || bb.first.min > aa_max) { return false; } aa_min = *reinterpret_cast(aa.data() + 24); aa_max = *reinterpret_cast(aa.data() + 32); if (aa_min > bb.second.max || bb.second.min > aa_max) { return false; } return true; } } // namespace rocksdb