// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/column_family.h"

#include <algorithm>
#include <cinttypes>
#include <cmath>
#include <limits>
#include <memory>
#include <sstream>
#include <string>
#include <vector>

#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_source.h"
#include "db/compaction/compaction_picker.h"
#include "db/compaction/compaction_picker_fifo.h"
#include "db/compaction/compaction_picker_level.h"
#include "db/compaction/compaction_picker_universal.h"
#include "db/db_impl/db_impl.h"
#include "db/internal_stats.h"
#include "db/job_context.h"
#include "db/range_del_aggregator.h"
#include "db/table_properties_collector.h"
#include "db/version_set.h"
#include "file/sst_file_manager_impl.h"
#include "logging/logging.h"
#include "monitoring/thread_status_util.h"
#include "options/options_helper.h"
#include "port/port.h"
#include "rocksdb/convenience.h"
#include "rocksdb/table.h"
#include "rocksdb/write_controller.h"
#include "table/merging_iterator.h"
#include "util/autovector.h"
#include "util/cast_util.h"
#include "util/compression.h"

namespace ROCKSDB_NAMESPACE {

ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
    ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
    : cfd_(column_family_data), db_(db), mutex_(mutex) {
  if (cfd_ != nullptr) {
    cfd_->Ref();
  }
}

ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
    for (auto& listener : cfd_->ioptions()->listeners) {
      listener->OnColumnFamilyHandleDeletionStarted(this);
    }
    // Job id == 0 means that this is not our background process, but rather
    // a user thread.
    // Need to hold some shared pointers owned by the initial_cf_options
    // before the final cleanup finishes.
    ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
    JobContext job_context(0);
    mutex_->Lock();
    bool dropped = cfd_->IsDropped();
    if (cfd_->UnrefAndTryDelete()) {
      if (dropped) {
        db_->FindObsoleteFiles(&job_context, false, true);
      }
    }
    mutex_->Unlock();
    if (job_context.HaveSomethingToDelete()) {
      bool defer_purge =
          db_->immutable_db_options().avoid_unnecessary_blocking_io;
      db_->PurgeObsoleteFiles(job_context, defer_purge);
    }
    job_context.Clean();
  }
}

uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }

const std::string& ColumnFamilyHandleImpl::GetName() const {
  return cfd()->GetName();
}

Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
  // accessing mutable cf-options requires db mutex.
  InstrumentedMutexLock l(mutex_);
  *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
  return Status::OK();
}

const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
  return cfd()->user_comparator();
}

void GetIntTblPropCollectorFactory(
    const ImmutableCFOptions& ioptions,
    IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
  assert(int_tbl_prop_collector_factories);

  auto& collector_factories = ioptions.table_properties_collector_factories;
  for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
       ++i) {
    assert(collector_factories[i]);
    int_tbl_prop_collector_factories->emplace_back(
        new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
  }
}

Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
  if (!cf_options.compression_per_level.empty()) {
    for (size_t level = 0; level < cf_options.compression_per_level.size();
         ++level) {
      if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
        return Status::InvalidArgument(
            "Compression type " +
            CompressionTypeToString(cf_options.compression_per_level[level]) +
            " is not linked with the binary.");
      }
    }
  } else {
    if (!CompressionTypeSupported(cf_options.compression)) {
      return Status::InvalidArgument(
          "Compression type " +
          CompressionTypeToString(cf_options.compression) +
          " is not linked with the binary.");
    }
  }
  if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
    if (cf_options.compression_opts.use_zstd_dict_trainer) {
      if (!ZSTD_TrainDictionarySupported()) {
        return Status::InvalidArgument(
            "zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
            "is not linked with the binary.");
      }
    } else if (!ZSTD_FinalizeDictionarySupported()) {
      return Status::InvalidArgument(
          "zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
          "is not linked with the binary.");
    }
    if (cf_options.compression_opts.max_dict_bytes == 0) {
      return Status::InvalidArgument(
          "The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
          "should be nonzero if we're using zstd's dictionary generator.");
    }
  }

  if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
    std::ostringstream oss;
    oss << "The specified blob compression type "
        << CompressionTypeToString(cf_options.blob_compression_type)
        << " is not available.";

    return Status::InvalidArgument(oss.str());
  }

  return Status::OK();
}

Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
  if (cf_options.inplace_update_support) {
    return Status::InvalidArgument(
        "In-place memtable updates (inplace_update_support) are not "
        "compatible with concurrent writes "
        "(allow_concurrent_memtable_write)");
  }
  if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
    return Status::InvalidArgument(
        "Memtable doesn't support concurrent writes "
        "(allow_concurrent_memtable_write)");
  }
  return Status::OK();
}

Status CheckCFPathsSupported(const DBOptions& db_options,
                             const ColumnFamilyOptions& cf_options) {
  // More than one cf_path is supported only in the universal
  // and level compaction styles. This function also checks the case
  // in which cf_paths is not specified, which results in db_paths
  // being used.
  if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
      (cf_options.compaction_style != kCompactionStyleLevel)) {
    if (cf_options.cf_paths.size() > 1) {
      return Status::NotSupported(
          "More than one CF path is only supported in "
          "universal and level compaction styles.");
"); } else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) { return Status::NotSupported( "More than one DB paths are only supported in " "universal and level compaction styles. "); } } return Status::OK(); } namespace { const uint64_t kDefaultTtl = 0xfffffffffffffffe; const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe; } // anonymous namespace ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; size_t clamp_max = std::conditional< sizeof(size_t) == 4, std::integral_constant, std::integral_constant>::type::value; ClipToRange(&result.write_buffer_size, (static_cast(64)) << 10, clamp_max); // if user sets arena_block_size, we trust user to use this value. Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { result.arena_block_size = std::min(size_t{1024 * 1024}, result.write_buffer_size / 8); // Align up to 4k const size_t align = 4 * 1024; result.arena_block_size = ((result.arena_block_size + align - 1) / align) * align; } result.min_write_buffer_number_to_merge = std::min(result.min_write_buffer_number_to_merge, result.max_write_buffer_number - 1); if (result.min_write_buffer_number_to_merge < 1) { result.min_write_buffer_number_to_merge = 1; } if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) { ROCKS_LOG_WARN( db_options.logger, "Currently, if atomic_flush is true, then triggering flush for any " "column family internally (non-manual flush) will trigger flushing " "all column families even if the number of memtables is smaller " "min_write_buffer_number_to_merge. Therefore, configuring " "min_write_buffer_number_to_merge > 1 is not compatible and should " "be satinized to 1. Not doing so will lead to data loss and " "inconsistent state across multiple column families when WAL is " "disabled, which is a common setting for atomic flush"); result.min_write_buffer_number_to_merge = 1; } if (result.num_levels < 1) { result.num_levels = 1; } if (result.compaction_style == kCompactionStyleLevel && result.num_levels < 2) { result.num_levels = 2; } if (result.compaction_style == kCompactionStyleUniversal && db_options.allow_ingest_behind && result.num_levels < 3) { result.num_levels = 3; } if (result.max_write_buffer_number < 2) { result.max_write_buffer_number = 2; } // fall back max_write_buffer_number_to_maintain if // max_write_buffer_size_to_maintain is not set if (result.max_write_buffer_size_to_maintain < 0) { result.max_write_buffer_size_to_maintain = result.max_write_buffer_number * static_cast(result.write_buffer_size); } else if (result.max_write_buffer_size_to_maintain == 0 && result.max_write_buffer_number_to_maintain < 0) { result.max_write_buffer_number_to_maintain = result.max_write_buffer_number; } // bloom filter size shouldn't exceed 1/4 of memtable size. 
  if (result.memtable_prefix_bloom_size_ratio > 0.25) {
    result.memtable_prefix_bloom_size_ratio = 0.25;
  } else if (result.memtable_prefix_bloom_size_ratio < 0) {
    result.memtable_prefix_bloom_size_ratio = 0;
  }

  if (!result.prefix_extractor) {
    assert(result.memtable_factory);
    Slice name = result.memtable_factory->Name();
    if (name.compare("HashSkipListRepFactory") == 0 ||
        name.compare("HashLinkListRepFactory") == 0) {
      result.memtable_factory = std::make_shared<SkipListFactory>();
    }
  }

  if (result.compaction_style == kCompactionStyleFIFO) {
    // since we delete level0 files in FIFO compaction when there are too many
    // of them, these options don't really mean anything
    result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
    result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
  }

  if (result.max_bytes_for_level_multiplier <= 0) {
    result.max_bytes_for_level_multiplier = 1;
  }

  if (result.level0_file_num_compaction_trigger == 0) {
    ROCKS_LOG_WARN(db_options.logger,
                   "level0_file_num_compaction_trigger cannot be 0");
    result.level0_file_num_compaction_trigger = 1;
  }

  if (result.level0_stop_writes_trigger <
          result.level0_slowdown_writes_trigger ||
      result.level0_slowdown_writes_trigger <
          result.level0_file_num_compaction_trigger) {
    ROCKS_LOG_WARN(db_options.logger,
                   "This condition must be satisfied: "
                   "level0_stop_writes_trigger(%d) >= "
                   "level0_slowdown_writes_trigger(%d) >= "
                   "level0_file_num_compaction_trigger(%d)",
                   result.level0_stop_writes_trigger,
                   result.level0_slowdown_writes_trigger,
                   result.level0_file_num_compaction_trigger);
    if (result.level0_slowdown_writes_trigger <
        result.level0_file_num_compaction_trigger) {
      result.level0_slowdown_writes_trigger =
          result.level0_file_num_compaction_trigger;
    }
    if (result.level0_stop_writes_trigger <
        result.level0_slowdown_writes_trigger) {
      result.level0_stop_writes_trigger =
          result.level0_slowdown_writes_trigger;
    }
    ROCKS_LOG_WARN(db_options.logger,
                   "Adjusted the values to "
                   "level0_stop_writes_trigger(%d), "
                   "level0_slowdown_writes_trigger(%d), "
                   "level0_file_num_compaction_trigger(%d)",
                   result.level0_stop_writes_trigger,
                   result.level0_slowdown_writes_trigger,
                   result.level0_file_num_compaction_trigger);
  }

  if (result.soft_pending_compaction_bytes_limit == 0) {
    result.soft_pending_compaction_bytes_limit =
        result.hard_pending_compaction_bytes_limit;
  } else if (result.hard_pending_compaction_bytes_limit > 0 &&
             result.soft_pending_compaction_bytes_limit >
                 result.hard_pending_compaction_bytes_limit) {
    result.soft_pending_compaction_bytes_limit =
        result.hard_pending_compaction_bytes_limit;
  }

  // When the DB is stopped, it's possible that there are some .trash files
  // that were not deleted yet. When we open the DB we will find these .trash
  // files and schedule them to be deleted (or delete immediately if
  // SstFileManager was not used).
  auto sfm =
      static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
  for (size_t i = 0; i < result.cf_paths.size(); i++) {
    DeleteScheduler::CleanupDirectory(db_options.env, sfm,
                                      result.cf_paths[i].path)
        .PermitUncheckedError();
  }

  if (result.cf_paths.empty()) {
    result.cf_paths = db_options.db_paths;
  }

  if (result.level_compaction_dynamic_level_bytes) {
    if (result.compaction_style != kCompactionStyleLevel) {
      ROCKS_LOG_WARN(db_options.info_log.get(),
                     "level_compaction_dynamic_level_bytes only makes sense "
                     "for level-based compaction");
      result.level_compaction_dynamic_level_bytes = false;
    } else if (result.cf_paths.size() > 1U) {
      // we don't yet know how to make this feature and multiple DB paths
      // work together.
ROCKS_LOG_WARN(db_options.info_log.get(), "multiple cf_paths/db_paths and" "level_compaction_dynamic_level_bytes" "can't be used together"); result.level_compaction_dynamic_level_bytes = false; } } if (result.max_compaction_bytes == 0) { result.max_compaction_bytes = result.target_file_size_base * 25; } bool is_block_based_table = (result.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName())); const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60; if (result.ttl == kDefaultTtl) { if (is_block_based_table && result.compaction_style != kCompactionStyleFIFO) { result.ttl = kAdjustedTtl; } else { result.ttl = 0; } } const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60; // Turn on periodic compactions and set them to occur once every 30 days if // compaction filters are used and periodic_compaction_seconds is set to the // default value. if (result.compaction_style != kCompactionStyleFIFO) { if ((result.compaction_filter != nullptr || result.compaction_filter_factory != nullptr) && result.periodic_compaction_seconds == kDefaultPeriodicCompSecs && is_block_based_table) { result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; } } else { // result.compaction_style == kCompactionStyleFIFO if (result.ttl == 0) { if (is_block_based_table) { if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs; } result.ttl = result.periodic_compaction_seconds; } } else if (result.periodic_compaction_seconds != 0) { result.ttl = std::min(result.ttl, result.periodic_compaction_seconds); } } // TTL compactions would work similar to Periodic Compactions in Universal in // most of the cases. So, if ttl is set, execute the periodic compaction // codepath. if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) { if (result.periodic_compaction_seconds != 0) { result.periodic_compaction_seconds = std::min(result.ttl, result.periodic_compaction_seconds); } else { result.periodic_compaction_seconds = result.ttl; } } if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) { result.periodic_compaction_seconds = 0; } return result; } int SuperVersion::dummy = 0; void* const SuperVersion::kSVInUse = &SuperVersion::dummy; void* const SuperVersion::kSVObsolete = nullptr; SuperVersion::~SuperVersion() { for (auto td : to_delete) { delete td; } } SuperVersion* SuperVersion::Ref() { refs.fetch_add(1, std::memory_order_relaxed); return this; } bool SuperVersion::Unref() { // fetch_sub returns the previous value of ref uint32_t previous_refs = refs.fetch_sub(1); assert(previous_refs > 0); return previous_refs == 1; } void SuperVersion::Cleanup() { assert(refs.load(std::memory_order_relaxed) == 0); // Since this SuperVersion object is being deleted, // decrement reference to the immutable MemtableList // this SV object was pointing to. 
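  // Any memtable whose reference count drops to zero here is collected in
  // to_delete and freed later in ~SuperVersion().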
imm->Unref(&to_delete); MemTable* m = mem->Unref(); if (m != nullptr) { auto* memory_usage = current->cfd()->imm()->current_memory_usage(); assert(*memory_usage >= m->ApproximateMemoryUsage()); *memory_usage -= m->ApproximateMemoryUsage(); to_delete.push_back(m); } current->Unref(); cfd->UnrefAndTryDelete(); } void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem, MemTableListVersion* new_imm, Version* new_current) { cfd = new_cfd; mem = new_mem; imm = new_imm; current = new_current; cfd->Ref(); mem->Ref(); imm->Ref(); current->Ref(); refs.store(1, std::memory_order_relaxed); } namespace { void SuperVersionUnrefHandle(void* ptr) { // UnrefHandle is called when a thread exits or a ThreadLocalPtr gets // destroyed. When the former happens, the thread shouldn't see kSVInUse. // When the latter happens, only super_version_ holds a reference // to ColumnFamilyData, so no further queries are possible. SuperVersion* sv = static_cast(ptr); bool was_last_ref __attribute__((__unused__)); was_last_ref = sv->Unref(); // Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_. // This is important because we can't do SuperVersion cleanup here. // That would require locking DB mutex, which would deadlock because // SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex. assert(!was_last_ref); } } // anonymous namespace std::vector ColumnFamilyData::GetDbPaths() const { std::vector paths; paths.reserve(ioptions_.cf_paths.size()); for (const DbPath& db_path : ioptions_.cf_paths) { paths.emplace_back(db_path.path); } return paths; } const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId = std::numeric_limits::max(); ColumnFamilyData::ColumnFamilyData( uint32_t id, const std::string& name, Version* _dummy_versions, Cache* _table_cache, WriteBufferManager* write_buffer_manager, std::shared_ptr write_controller, const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options, const FileOptions* file_options, ColumnFamilySet* column_family_set, BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id) : id_(id), name_(name), dummy_versions_(_dummy_versions), current_(nullptr), refs_(0), initialized_(false), dropped_(false), internal_comparator_(cf_options.comparator), initial_cf_options_(SanitizeOptions(db_options, cf_options)), ioptions_(db_options, initial_cf_options_), mutable_cf_options_(initial_cf_options_), is_delete_range_supported_( cf_options.table_factory->IsDeleteRangeSupported()), write_buffer_manager_(write_buffer_manager), write_controller_(write_controller), mem_(nullptr), imm_(ioptions_.min_write_buffer_number_to_merge, ioptions_.max_write_buffer_number_to_maintain, ioptions_.max_write_buffer_size_to_maintain), super_version_(nullptr), super_version_number_(0), local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)), next_(nullptr), prev_(nullptr), log_number_(0), column_family_set_(column_family_set), queued_for_flush_(false), queued_for_compaction_(false), prev_compaction_needed_bytes_(0), allow_2pc_(db_options.allow_2pc), last_memtable_id_(0), db_paths_registered_(false), mempurge_used_(false), next_epoch_number_(1) { if (id_ != kDummyColumnFamilyDataId) { // TODO(cc): RegisterDbPaths can be expensive, considering moving it // outside of this constructor which might be called with db mutex held. // TODO(cc): considering using ioptions_.fs, currently some tests rely on // EnvWrapper, that's the main reason why we use env here. 
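    // A registration failure is logged below but is not fatal;
    // db_paths_registered_ simply stays false, so the destructor will skip
    // UnregisterDbPaths().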
Status s = ioptions_.env->RegisterDbPaths(GetDbPaths()); if (s.ok()) { db_paths_registered_ = true; } else { ROCKS_LOG_ERROR( ioptions_.logger, "Failed to register data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } } Ref(); // Convert user defined table properties collector factories to internal ones. GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_); // if _dummy_versions is nullptr, then this is a dummy column family. if (_dummy_versions != nullptr) { internal_stats_.reset( new InternalStats(ioptions_.num_levels, ioptions_.clock, this)); table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache, block_cache_tracer, io_tracer, db_session_id)); blob_file_cache_.reset( new BlobFileCache(_table_cache, ioptions(), soptions(), id_, internal_stats_->GetBlobFileReadHist(), io_tracer)); blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id, blob_file_cache_.get())); if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( new UniversalCompactionPicker(ioptions_, &internal_comparator_)); } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { compaction_picker_.reset( new FIFOCompactionPicker(ioptions_, &internal_comparator_)); } else if (ioptions_.compaction_style == kCompactionStyleNone) { compaction_picker_.reset( new NullCompactionPicker(ioptions_, &internal_comparator_)); ROCKS_LOG_WARN(ioptions_.logger, "Column family %s does not use any background compaction. " "Compactions can only be done via CompactFiles\n", GetName().c_str()); } else { ROCKS_LOG_ERROR(ioptions_.logger, "Unable to recognize the specified compaction style %d. 
" "Column family %s will use kCompactionStyleLevel.\n", ioptions_.compaction_style, GetName().c_str()); compaction_picker_.reset( new LevelCompactionPicker(ioptions_, &internal_comparator_)); } if (column_family_set_->NumberOfColumnFamilies() < 10) { ROCKS_LOG_HEADER(ioptions_.logger, "--------------- Options for column family [%s]:\n", name.c_str()); initial_cf_options_.Dump(ioptions_.logger); } else { ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options of [%s])\n", name.c_str()); } } RecalculateWriteStallConditions(mutable_cf_options_); if (cf_options.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName()) && cf_options.table_factory->GetOptions()) { const BlockBasedTableOptions* bbto = cf_options.table_factory->GetOptions(); const auto& options_overrides = bbto->cache_usage_options.options_overrides; const auto file_metadata_charged = options_overrides.at(CacheEntryRole::kFileMetadata).charged; if (bbto->block_cache && file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) { // TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope // responsible for reservation of `ObsoleteFileInfo` so that we can keep // this `file_metadata_cache_res_mgr_` nonconcurrent file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager( std::make_shared< CacheReservationManagerImpl>( bbto->block_cache))); } if (bbto->block_cache && table_cache_) { cache_owner_id_ = bbto->block_cache->GetNextItemOwnerId(); table_cache_->SetBlockCacheOwnerId(cache_owner_id_); } } } // DB mutex held ColumnFamilyData::~ColumnFamilyData() { assert(refs_.load(std::memory_order_relaxed) == 0); ResetCFRate(this); // remove from linked list auto prev = prev_; auto next = next_; prev->next_ = next; next->prev_ = prev; const BlockBasedTableOptions* bbto = ioptions_.table_factory->GetOptions(); if (bbto && bbto->block_cache) { bbto->block_cache->DiscardItemOwnerId(&cache_owner_id_); } if (!dropped_ && column_family_set_ != nullptr) { // If it's dropped, it's already removed from column family set // If column_family_set_ == nullptr, this is dummy CFD and not in // ColumnFamilySet column_family_set_->RemoveColumnFamily(this); } if (current_ != nullptr) { current_->Unref(); } // It would be wrong if this ColumnFamilyData is in flush_queue_ or // compaction_queue_ and we destroyed it assert(!queued_for_flush_); assert(!queued_for_compaction_); assert(super_version_ == nullptr); if (dummy_versions_ != nullptr) { // List must be empty assert(dummy_versions_->Next() == dummy_versions_); bool deleted __attribute__((__unused__)); deleted = dummy_versions_->Unref(); assert(deleted); } if (mem_ != nullptr) { delete mem_->Unref(); } autovector to_delete; imm_.current()->Unref(&to_delete); for (MemTable* m : to_delete) { delete m; } if (db_paths_registered_) { // TODO(cc): considering using ioptions_.fs, currently some tests rely on // EnvWrapper, that's the main reason why we use env here. Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths()); if (!s.ok()) { ROCKS_LOG_ERROR( ioptions_.logger, "Failed to unregister data paths of column family (id: %d, name: %s)", id_, name_.c_str()); } } } bool ColumnFamilyData::UnrefAndTryDelete() { int old_refs = refs_.fetch_sub(1); assert(old_refs > 0); if (old_refs == 1) { assert(super_version_ == nullptr); delete this; return true; } if (old_refs == 2 && super_version_ != nullptr) { // Only the super_version_ holds me SuperVersion* sv = super_version_; super_version_ = nullptr; // Release SuperVersion references kept in ThreadLocalPtr. 
    local_sv_.reset();
    if (sv->Unref()) {
      // Note: sv will delete this ColumnFamilyData during Cleanup()
      assert(sv->cfd == this);
      sv->Cleanup();
      delete sv;
      return true;
    }
  }
  return false;
}

void ColumnFamilyData::SetDropped() {
  // can't drop default CF
  assert(id_ != 0);
  dropped_ = true;
  ResetCFRate(this);
  write_controller_token_.reset();

  // remove from column_family_set
  column_family_set_->RemoveColumnFamily(this);
}

ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
  return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
}

uint64_t ColumnFamilyData::OldestLogToKeep() {
  auto current_log = GetLogNumber();

  if (allow_2pc_) {
    auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
    auto mem_prep_log = mem()->GetMinLogContainingPrepSection();

    if (imm_prep_log > 0 && imm_prep_log < current_log) {
      current_log = imm_prep_log;
    }

    if (mem_prep_log > 0 && mem_prep_log < current_log) {
      current_log = mem_prep_log;
    }
  }

  return current_log;
}

const double kIncSlowdownRatio = 0.8;
const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
const double kNearStopSlowdownRatio = 0.6;
const double kDelayRecoverSlowdownRatio = 1.4;

namespace {
// If penalize_stop is true, we further reduce slowdown rate.
std::unique_ptr<WriteControllerToken> SetupDelay(
    WriteController* write_controller, uint64_t compaction_needed_bytes,
    uint64_t prev_compaction_need_bytes, bool penalize_stop,
    bool auto_compactions_disabled) {
  const uint64_t kMinWriteRate = 16 * 1024u;  // Minimum write rate 16KB/s.

  uint64_t max_write_rate = write_controller->max_delayed_write_rate();
  uint64_t write_rate = write_controller->delayed_write_rate();
  if (auto_compactions_disabled) {
    // When auto compaction is disabled, always use the value user gave.
    write_rate = max_write_rate;
  } else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
    // If user gives rate less than kMinWriteRate, don't adjust it.
    //
    // If already delayed, need to adjust based on previous compaction debt.
    // When two or more column families require delay, we always increase or
    // reduce the write rate based on information for one single column
    // family. That is likely to be OK, but we can improve it if it proves to
    // be a problem.
    // Ignore the compaction_needed_bytes = 0 case because
    // compaction_needed_bytes is only available in level-based compaction.
    //
    // If the compaction debt stays the same as previously, we also further
    // slow down. It usually means a memtable is full. It's mainly for the
    // case where both flush and compaction are much slower than the rate at
    // which we insert into memtables, so we need to actively slow down before
    // we get a feedback signal from compactions and flushes, to avoid a full
    // stop caused by hitting the max write buffer number.
    //
    // If the DB has just fallen into the stop condition, we need to further
    // reduce the write rate to avoid hitting the stop condition again.
    if (penalize_stop) {
      // Penalize the near-stop or stop condition with a more aggressive
      // slowdown. This is to provide a long-term slowdown increase signal.
      // The penalty is more than the reward of recovering to the normal
      // condition.
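      // Worked example: with kNearStopSlowdownRatio = 0.6 and
      // kDelayRecoverSlowdownRatio = 1.4, one penalty step followed by one
      // recovery step leaves the rate at 0.84x of where it started, so
      // repeated near-stops keep ratcheting the write rate down.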
      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
                                         kNearStopSlowdownRatio);
      if (write_rate < kMinWriteRate) {
        write_rate = kMinWriteRate;
      }
    } else if (prev_compaction_need_bytes > 0 &&
               prev_compaction_need_bytes <= compaction_needed_bytes) {
      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
                                         kIncSlowdownRatio);
      if (write_rate < kMinWriteRate) {
        write_rate = kMinWriteRate;
      }
    } else if (prev_compaction_need_bytes > compaction_needed_bytes) {
      // We are speeding up by a ratio of kDecSlowdownRatio when we have paid
      // compaction debt. But we'll never speed up to faster than the write
      // rate given by users.
      write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
                                         kDecSlowdownRatio);
      if (write_rate > max_write_rate) {
        write_rate = max_write_rate;
      }
    }
  }
  return write_controller->GetDelayToken(write_rate);
}

int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
                                    int level0_slowdown_writes_trigger) {
  // SanitizeOptions() ensures it.
  assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);

  if (level0_file_num_compaction_trigger < 0) {
    return std::numeric_limits<int>::max();
  }

  const int64_t twice_level0_trigger =
      static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;

  const int64_t one_fourth_trigger_slowdown =
      static_cast<int64_t>(level0_file_num_compaction_trigger) +
      ((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
       4);

  assert(twice_level0_trigger >= 0);
  assert(one_fourth_trigger_slowdown >= 0);

  // 1/4 of the way between L0 compaction trigger threshold and slowdown
  // condition.
  // Or twice the compaction trigger, if that is smaller.
  int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
  if (res >= std::numeric_limits<int>::max()) {
    return std::numeric_limits<int>::max();
  } else {
    // res fits in int
    return static_cast<int>(res);
  }
}
}  // anonymous namespace

namespace {
const int kMemtablePenalty = 10;
const int kNumPendingSteps = 100;
}  // namespace

double ColumnFamilyData::TEST_CalculateWriteDelayDivider(
    uint64_t compaction_needed_bytes,
    const MutableCFOptions& mutable_cf_options,
    WriteStallCause& write_stall_cause) {
  return CalculateWriteDelayDividerAndMaybeUpdateWriteStallCause(
      compaction_needed_bytes, mutable_cf_options, write_stall_cause);
}

void ColumnFamilyData::DynamicSetupDelay(
    uint64_t max_write_rate, uint64_t compaction_needed_bytes,
    const MutableCFOptions& mutable_cf_options,
    WriteStallCause& write_stall_cause) {
  const double rate_divider =
      CalculateWriteDelayDividerAndMaybeUpdateWriteStallCause(
          compaction_needed_bytes, mutable_cf_options, write_stall_cause);
  assert(rate_divider >= 1);
  auto write_rate = static_cast<uint64_t>(max_write_rate / rate_divider);
  if (write_rate < WriteController::kMinWriteRate) {
    write_rate = WriteController::kMinWriteRate;
  }
  UpdateCFRate(this, write_rate);
}

std::pair<WriteStallCondition, WriteStallCause>
ColumnFamilyData::GetWriteStallConditionAndCause(
    int num_unflushed_memtables, int num_l0_files,
    uint64_t num_compaction_needed_bytes,
    const MutableCFOptions& mutable_cf_options,
    const ImmutableCFOptions& immutable_cf_options) {
  if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
    return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
  } else if (!mutable_cf_options.disable_auto_compactions &&
             num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
    return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
  } else if (!mutable_cf_options.disable_auto_compactions &&
             mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
             num_compaction_needed_bytes >=
mutable_cf_options.hard_pending_compaction_bytes_limit) { return {WriteStallCondition::kStopped, WriteStallCause::kPendingCompactionBytes}; } else if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number - 1 && num_unflushed_memtables - 1 >= immutable_cf_options.min_write_buffer_number_to_merge) { return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.level0_slowdown_writes_trigger >= 0 && num_l0_files >= mutable_cf_options.level0_slowdown_writes_trigger) { return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit}; } else if (!mutable_cf_options.disable_auto_compactions && mutable_cf_options.soft_pending_compaction_bytes_limit > 0 && num_compaction_needed_bytes >= mutable_cf_options.soft_pending_compaction_bytes_limit) { return {WriteStallCondition::kDelayed, WriteStallCause::kPendingCompactionBytes}; } return {WriteStallCondition::kNormal, WriteStallCause::kNone}; } // Delay divider is by how much we divide the users delayed_write_rate. // E.g. divider 10 will result in 10 Mb/s from users 100 Mb/s. // The rate is reduced linearly according to the range from slowdown to stop. double ColumnFamilyData::CalculateWriteDelayDividerAndMaybeUpdateWriteStallCause( uint64_t compaction_needed_bytes, const MutableCFOptions& mutable_cf_options, WriteStallCause& write_stall_cause) { assert(current_ != nullptr); const auto* vstorage = current_->storage_info(); // Memtables // this can only be entered when we're at the last memtable and theres more // than 3. delay by 10X when writing to the last memtable. double memtable_divider = 1; auto num_unflushed_memtables = imm()->NumNotFlushed(); if (mutable_cf_options.max_write_buffer_number > 3 && num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number - 1 && num_unflushed_memtables - 1 >= ioptions_.min_write_buffer_number_to_merge) { memtable_divider = kMemtablePenalty; } // Pending Compaction Bytes double pending_divider = 1; auto soft_limit = mutable_cf_options.soft_pending_compaction_bytes_limit; if (soft_limit > 0 && compaction_needed_bytes > soft_limit) { auto hard_limit = mutable_cf_options.hard_pending_compaction_bytes_limit; // soft_limit != hard_limit here. we're in a kDelayed state here and not // stop. 
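    // Worked example: with kNumPendingSteps = 100, compaction debt halfway
    // between the soft and hard limits lands on roughly step 50, giving a
    // divider of 1 / (1 - 50/100) = 2, i.e. half of the user's
    // delayed_write_rate.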
assert(hard_limit > soft_limit); uint64_t soft_hard_range = hard_limit - soft_limit; uint64_t step_size = ceil(soft_hard_range / kNumPendingSteps); uint64_t extra_bytes = compaction_needed_bytes - soft_limit; uint64_t step_num = extra_bytes / step_size; assert(step_num < kNumPendingSteps); pending_divider = 1 / (1 - (static_cast(step_num) / kNumPendingSteps)); } double biggest_divider = 1; if (memtable_divider > pending_divider) { biggest_divider = memtable_divider; write_stall_cause = WriteStallCause::kMemtableLimit; } else if (pending_divider > 1) { biggest_divider = pending_divider; write_stall_cause = WriteStallCause::kPendingCompactionBytes; } // dont delay based on L0 when the user disables auto compactions if (mutable_cf_options.disable_auto_compactions) { return biggest_divider; } // L0 files double l0_divider = 1; const auto extra_l0_ssts = vstorage->l0_delay_trigger_count() - mutable_cf_options.level0_slowdown_writes_trigger; if (extra_l0_ssts > 0) { const auto num_L0_steps = mutable_cf_options.level0_stop_writes_trigger - mutable_cf_options.level0_slowdown_writes_trigger; assert(num_L0_steps > 0); // since extra_l0_ssts == num_L0_steps then we're in a stop condition. assert(extra_l0_ssts < num_L0_steps); l0_divider = 1 / (1 - (static_cast(extra_l0_ssts) / num_L0_steps)); } if (l0_divider > biggest_divider) { biggest_divider = l0_divider; write_stall_cause = WriteStallCause::kL0FileCountLimit; } return biggest_divider; } WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions( const MutableCFOptions& mutable_cf_options) { auto write_stall_condition = WriteStallCondition::kNormal; if (current_ != nullptr) { auto* vstorage = current_->storage_info(); auto write_controller = write_controller_ptr(); uint64_t compaction_needed_bytes = vstorage->estimated_compaction_needed_bytes(); auto write_stall_condition_and_cause = GetWriteStallConditionAndCause( imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(), vstorage->estimated_compaction_needed_bytes(), mutable_cf_options, *ioptions()); write_stall_condition = write_stall_condition_and_cause.first; auto write_stall_cause = write_stall_condition_and_cause.second; bool was_stopped = write_controller->IsStopped(); bool needed_delay = write_controller->NeedsDelay(); bool dynamic_delay = write_controller->is_dynamic_delay(); // GetWriteStallConditionAndCause returns the first condition met, so its // possible that a later condition will require a harder rate limiting. // calculate all conditions with DynamicSetupDelay and reevaluate the // write_stall_cause. this is only relevant in the kDelayed case. 
if (dynamic_delay) { if (write_stall_condition == WriteStallCondition::kDelayed) { DynamicSetupDelay(write_controller->max_delayed_write_rate(), compaction_needed_bytes, mutable_cf_options, write_stall_cause); write_controller_token_.reset(); } else { write_controller->HandleRemoveDelayReq(this); } } if (write_stall_condition == WriteStallCondition::kStopped && write_stall_cause == WriteStallCause::kMemtableLimit) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stopping writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number); } else if (write_stall_condition == WriteStallCondition::kStopped && write_stall_cause == WriteStallCause::kL0FileCountLimit) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION, 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stopping writes because we have %d level-0 files", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (write_stall_condition == WriteStallCondition::kStopped && write_stall_cause == WriteStallCause::kPendingCompactionBytes) { write_controller_token_ = write_controller->GetStopToken(); internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); } else if (write_stall_condition == WriteStallCondition::kDelayed && write_stall_cause == WriteStallCause::kMemtableLimit) { if (!dynamic_delay) { write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped, mutable_cf_options.disable_auto_compactions); } internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because we have %d immutable memtables " "(waiting for flush), max_write_buffer_number is set to %d " "rate %" PRIu64, name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number, write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && write_stall_cause == WriteStallCause::kL0FileCountLimit) { if (!dynamic_delay) { // L0 is the last two files from stopping. 
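      // e.g. with level0_stop_writes_trigger = 36, near_stop becomes true
      // once L0 reaches 34 files, and SetupDelay() then applies the harsher
      // kNearStopSlowdownRatio penalty.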
bool near_stop = vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_stop_writes_trigger - 2; write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); } internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1); if (compaction_picker_->IsLevel0CompactionInProgress()) { internal_stats_->AddCFStats( InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION, 1); } ROCKS_LOG_WARN(ioptions_.logger, "[%s] Stalling writes because we have %d level-0 files " "rate %" PRIu64, name_.c_str(), vstorage->l0_delay_trigger_count(), write_controller->delayed_write_rate()); } else if (write_stall_condition == WriteStallCondition::kDelayed && write_stall_cause == WriteStallCause::kPendingCompactionBytes) { // If the distance to hard limit is less than 1/4 of the gap between soft // and // hard bytes limit, we think it is near stop and speed up the slowdown. if (!dynamic_delay) { bool near_stop = mutable_cf_options.hard_pending_compaction_bytes_limit > 0 && (compaction_needed_bytes - mutable_cf_options.soft_pending_compaction_bytes_limit) > 3 * (mutable_cf_options.hard_pending_compaction_bytes_limit - mutable_cf_options.soft_pending_compaction_bytes_limit) / 4; write_controller_token_ = SetupDelay(write_controller, compaction_needed_bytes, prev_compaction_needed_bytes_, was_stopped || near_stop, mutable_cf_options.disable_auto_compactions); } internal_stats_->AddCFStats( InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1); ROCKS_LOG_WARN( ioptions_.logger, "[%s] Stalling writes because of estimated pending compaction " "bytes %" PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), write_controller->delayed_write_rate()); } else { assert(write_stall_condition == WriteStallCondition::kNormal); if (vstorage->l0_delay_trigger_count() >= GetL0ThresholdSpeedupCompaction( mutable_cf_options.level0_file_num_compaction_trigger, mutable_cf_options.level0_slowdown_writes_trigger)) { write_controller_token_ = write_controller->GetCompactionPressureToken(); ROCKS_LOG_INFO( ioptions_.logger, "[%s] Increasing compaction threads because we have %d level-0 " "files ", name_.c_str(), vstorage->l0_delay_trigger_count()); } else if (vstorage->estimated_compaction_needed_bytes() >= mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { // Increase compaction threads if bytes needed for compaction exceeds // 1/4 of threshold for slowing down. // If soft pending compaction byte limit is not set, always speed up // compaction. write_controller_token_ = write_controller->GetCompactionPressureToken(); if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { ROCKS_LOG_INFO( ioptions_.logger, "[%s] Increasing compaction threads because of estimated pending " "compaction " "bytes %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes()); } } else { write_controller_token_.reset(); } // If the DB recovers from delay conditions, we reward with reducing // double the slowdown ratio. This is to balance the long term slowdown // increase signal. if (needed_delay && !dynamic_delay) { uint64_t write_rate = write_controller->delayed_write_rate(); write_controller->set_delayed_write_rate(static_cast( static_cast(write_rate) * kDelayRecoverSlowdownRatio)); // Set the low pri limit to be 1/4 the delayed write rate. // Note we don't reset this value even after delay condition is relased. 
// Low-pri rate will continue to apply if there is a compaction // pressure. write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate / 4); } } prev_compaction_needed_bytes_ = compaction_needed_bytes; } return write_stall_condition; } const FileOptions* ColumnFamilyData::soptions() const { return &(column_family_set_->file_options_); } void ColumnFamilyData::SetCurrent(Version* current_version) { current_ = current_version; } uint64_t ColumnFamilyData::GetNumLiveVersions() const { return VersionSet::GetNumLiveVersions(dummy_versions_); } uint64_t ColumnFamilyData::GetTotalSstFilesSize() const { return VersionSet::GetTotalSstFilesSize(dummy_versions_); } uint64_t ColumnFamilyData::GetTotalBlobFileSize() const { return VersionSet::GetTotalBlobFileSize(dummy_versions_); } uint64_t ColumnFamilyData::GetLiveSstFilesSize() const { return current_->GetSstFilesSize(); } MemTable* ColumnFamilyData::ConstructNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { return new MemTable(internal_comparator_, ioptions_, mutable_cf_options, write_buffer_manager_, earliest_seq, id_); } void ColumnFamilyData::CreateNewMemtable( const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) { if (mem_ != nullptr) { delete mem_->Unref(); } SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq)); mem_->Ref(); } bool ColumnFamilyData::NeedsCompaction() const { return !mutable_cf_options_.disable_auto_compactions && compaction_picker_->NeedsCompaction(current_->storage_info()); } Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { auto* result = compaction_picker_->PickCompaction( GetName(), mutable_options, mutable_db_options, current_->storage_info(), log_buffer); if (result != nullptr) { result->SetInputVersion(current_); } return result; } bool ColumnFamilyData::RangeOverlapWithCompaction( const Slice& smallest_user_key, const Slice& largest_user_key, int level) const { return compaction_picker_->RangeOverlapWithCompaction( smallest_user_key, largest_user_key, level); } Status ColumnFamilyData::RangesOverlapWithMemtables( const autovector& ranges, SuperVersion* super_version, bool allow_data_in_errors, bool* overlap) { assert(overlap != nullptr); *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); merge_iter_builder.AddIterator( super_version->mem->NewIterator(read_opts, &arena)); super_version->imm->AddIterators(read_opts, &merge_iter_builder, false /* add_range_tombstone_iter */); ScopedArenaIterator memtable_iter(merge_iter_builder.Finish()); auto read_seq = super_version->current->version_set()->LastSequence(); ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq); auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator( read_opts, read_seq, false /* immutable_memtable */); range_del_agg.AddTombstones( std::unique_ptr(active_range_del_iter)); Status status; status = super_version->imm->AddRangeTombstoneIterators( read_opts, nullptr /* arena */, &range_del_agg); // AddRangeTombstoneIterators always return Status::OK. 
assert(status.ok()); for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) { auto* vstorage = super_version->current->storage_info(); auto* ucmp = vstorage->InternalComparator()->user_comparator(); InternalKey range_start(ranges[i].start, kMaxSequenceNumber, kValueTypeForSeek); memtable_iter->Seek(range_start.Encode()); status = memtable_iter->status(); ParsedInternalKey seek_result; if (status.ok() && memtable_iter->Valid()) { status = ParseInternalKey(memtable_iter->key(), &seek_result, allow_data_in_errors); } if (status.ok()) { if (memtable_iter->Valid() && ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) { *overlap = true; } else if (range_del_agg.IsRangeOverlapped(ranges[i].start, ranges[i].limit)) { *overlap = true; } } } return status; } const int ColumnFamilyData::kCompactAllLevels = -1; const int ColumnFamilyData::kCompactToBaseLevel = -2; Compaction* ColumnFamilyData::CompactRange( const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, int input_level, int output_level, const CompactRangeOptions& compact_range_options, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end, bool* conflict, uint64_t max_file_num_to_ignore, const std::string& trim_ts) { auto* result = compaction_picker_->CompactRange( GetName(), mutable_cf_options, mutable_db_options, current_->storage_info(), input_level, output_level, compact_range_options, begin, end, compaction_end, conflict, max_file_num_to_ignore, trim_ts); if (result != nullptr) { result->SetInputVersion(current_); } TEST_SYNC_POINT("ColumnFamilyData::CompactRange:Return"); return result; } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) { SuperVersion* sv = GetThreadLocalSuperVersion(db); sv->Ref(); if (!ReturnThreadLocalSuperVersion(sv)) { // This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion() // when the thread-local pointer was populated. So, the Ref() earlier in // this function still prevents the returned SuperVersion* from being // deleted out from under the caller. sv->Unref(); } return sv; } SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) { // The SuperVersion is cached in thread local storage to avoid acquiring // mutex when SuperVersion does not change since the last use. When a new // SuperVersion is installed, the compaction or flush thread cleans up // cached SuperVersion in all existing thread local storage. To avoid // acquiring mutex for this operation, we use atomic Swap() on the thread // local pointer to guarantee exclusive access. If the thread local pointer // is being used while a new SuperVersion is installed, the cached // SuperVersion can become stale. In that case, the background thread would // have swapped in kSVObsolete. We re-check the value at when returning // SuperVersion back to thread local, with an atomic compare and swap. // The superversion will need to be released if detected to be stale. void* ptr = local_sv_->Swap(SuperVersion::kSVInUse); // Invariant: // (1) Scrape (always) installs kSVObsolete in ThreadLocal storage // (2) the Swap above (always) installs kSVInUse, ThreadLocal storage // should only keep kSVInUse before ReturnThreadLocalSuperVersion call // (if no Scrape happens). 
assert(ptr != SuperVersion::kSVInUse); SuperVersion* sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv != SuperVersion::kSVObsolete && sv->Unref()) { RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS); db->mutex()->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. sv->Cleanup(); if (db->immutable_db_options().avoid_unnecessary_blocking_io) { db->AddSuperVersionsToFreeQueue(sv); db->SchedulePurge(); } else { sv_to_delete = sv; } } else { db->mutex()->Lock(); } sv = super_version_->Ref(); db->mutex()->Unlock(); delete sv_to_delete; } assert(sv != nullptr); return sv; } bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { assert(sv != nullptr); // Put the SuperVersion back void* expected = SuperVersion::kSVInUse; if (local_sv_->CompareAndSwap(static_cast(sv), expected)) { // When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal // storage has not been altered and no Scrape has happened. The // SuperVersion is still current. return true; } else { // ThreadLocal scrape happened in the process of this GetImpl call (after // thread local Swap() at the beginning and before CompareAndSwap()). // This means the SuperVersion it holds is obsolete. assert(expected == SuperVersion::kSVObsolete); } return false; } void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context, InstrumentedMutex* db_mutex) { db_mutex->AssertHeld(); return InstallSuperVersion(sv_context, mutable_cf_options_); } void ColumnFamilyData::InstallSuperVersion( SuperVersionContext* sv_context, const MutableCFOptions& mutable_cf_options) { SuperVersion* new_superversion = sv_context->new_superversion.release(); new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(this, mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; ++super_version_number_; super_version_->version_number = super_version_number_; if (old_superversion == nullptr || old_superversion->current != current() || old_superversion->mem != mem_ || old_superversion->imm != imm_.current()) { // Should not recalculate slow down condition if nothing has changed, since // currently RecalculateWriteStallConditions() treats it as further slowing // down is needed. super_version_->write_stall_condition = RecalculateWriteStallConditions(mutable_cf_options); } else { super_version_->write_stall_condition = old_superversion->write_stall_condition; } if (old_superversion != nullptr) { // Reset SuperVersions cached in thread local storage. // This should be done before old_superversion->Unref(). That's to ensure // that local_sv_ never holds the last reference to SuperVersion, since // it has no means to safely do SuperVersion cleanup. 
ResetThreadLocalSuperVersions(); if (old_superversion->mutable_cf_options.write_buffer_size != mutable_cf_options.write_buffer_size) { mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size); } if (old_superversion->write_stall_condition != new_superversion->write_stall_condition) { sv_context->PushWriteStallNotification( old_superversion->write_stall_condition, new_superversion->write_stall_condition, GetName(), ioptions()); } if (old_superversion->Unref()) { old_superversion->Cleanup(); sv_context->superversions_to_free.push_back(old_superversion); } } } void ColumnFamilyData::ResetThreadLocalSuperVersions() { autovector sv_ptrs; local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete); for (auto ptr : sv_ptrs) { assert(ptr); if (ptr == SuperVersion::kSVInUse) { continue; } auto sv = static_cast(ptr); bool was_last_ref __attribute__((__unused__)); was_last_ref = sv->Unref(); // sv couldn't have been the last reference because // ResetThreadLocalSuperVersions() is called before // unref'ing super_version_. assert(!was_last_ref); } } Status ColumnFamilyData::ValidateOptions( const DBOptions& db_options, const ColumnFamilyOptions& cf_options) { Status s; s = CheckCompressionSupported(cf_options); if (s.ok() && db_options.allow_concurrent_memtable_write) { s = CheckConcurrentWritesSupported(cf_options); } if (s.ok() && db_options.unordered_write && cf_options.max_successive_merges != 0) { s = Status::InvalidArgument( "max_successive_merges > 0 is incompatible with unordered_write"); } if (s.ok()) { s = CheckCFPathsSupported(db_options, cf_options); } if (!s.ok()) { return s; } if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) { if (!cf_options.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "TTL is only supported in Block-Based Table format. "); } } if (cf_options.periodic_compaction_seconds > 0 && cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) { if (!cf_options.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName())) { return Status::NotSupported( "Periodic Compaction is only supported in " "Block-Based Table format. 
"); } } if (cf_options.enable_blob_garbage_collection) { if (cf_options.blob_garbage_collection_age_cutoff < 0.0 || cf_options.blob_garbage_collection_age_cutoff > 1.0) { return Status::InvalidArgument( "The age cutoff for blob garbage collection should be in the range " "[0.0, 1.0]."); } if (cf_options.blob_garbage_collection_force_threshold < 0.0 || cf_options.blob_garbage_collection_force_threshold > 1.0) { return Status::InvalidArgument( "The garbage ratio threshold for forcing blob garbage collection " "should be in the range [0.0, 1.0]."); } } if (cf_options.compaction_style == kCompactionStyleFIFO && db_options.max_open_files != -1 && cf_options.ttl > 0) { return Status::NotSupported( "FIFO compaction only supported with max_open_files = -1."); } std::vector supported{0, 1, 2, 4, 8}; if (std::find(supported.begin(), supported.end(), cf_options.memtable_protection_bytes_per_key) == supported.end()) { return Status::NotSupported( "Memtable per key-value checksum protection only supports 0, 1, 2, 4 " "or 8 bytes per key."); } return s; } Status ColumnFamilyData::SetOptions( const DBOptions& db_opts, const std::unordered_map& options_map) { ColumnFamilyOptions cf_opts = BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_); ConfigOptions config_opts; config_opts.mutable_options_only = true; Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map, &cf_opts); if (s.ok()) { s = ValidateOptions(db_opts, cf_opts); } if (s.ok()) { mutable_cf_options_ = MutableCFOptions(cf_opts); mutable_cf_options_.RefreshDerivedOptions(ioptions_); } return s; } // REQUIRES: DB mutex held Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) { if (initial_cf_options_.compaction_style != kCompactionStyleLevel) { return Env::WLTH_NOT_SET; } if (level == 0) { return Env::WLTH_MEDIUM; } int base_level = current_->storage_info()->base_level(); // L1: medium, L2: long, ... if (level - base_level >= 2) { return Env::WLTH_EXTREME; } else if (level < base_level) { // There is no restriction which prevents level passed in to be smaller // than base_level. 
return Env::WLTH_MEDIUM; } return static_cast( level - base_level + static_cast(Env::WLTH_MEDIUM)); } Status ColumnFamilyData::AddDirectories( std::map>* created_dirs) { Status s; assert(created_dirs != nullptr); assert(data_dirs_.empty()); for (auto& p : ioptions_.cf_paths) { auto existing_dir = created_dirs->find(p.path); if (existing_dir == created_dirs->end()) { std::unique_ptr path_directory; s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path, &path_directory); if (!s.ok()) { return s; } assert(path_directory != nullptr); data_dirs_.emplace_back(path_directory.release()); (*created_dirs)[p.path] = data_dirs_.back(); } else { data_dirs_.emplace_back(existing_dir->second); } } assert(data_dirs_.size() == ioptions_.cf_paths.size()); return s; } FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { if (data_dirs_.empty()) { return nullptr; } assert(path_id < data_dirs_.size()); return data_dirs_[path_id].get(); } void ColumnFamilyData::RecoverEpochNumbers() { assert(current_); auto* vstorage = current_->storage_info(); assert(vstorage); vstorage->RecoverEpochNumbers(this); } ColumnFamilySet::ColumnFamilySet( const std::string& dbname, const ImmutableDBOptions* db_options, const FileOptions& file_options, Cache* table_cache, WriteBufferManager* _write_buffer_manager, std::shared_ptr _write_controller, BlockCacheTracer* const block_cache_tracer, const std::shared_ptr& io_tracer, const std::string& db_id, const std::string& db_session_id) : max_column_family_(0), file_options_(file_options), dummy_cfd_(new ColumnFamilyData( ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr, nullptr, nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr, block_cache_tracer, io_tracer, db_id, db_session_id)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), table_cache_(table_cache), write_buffer_manager_(_write_buffer_manager), write_controller_(_write_controller), block_cache_tracer_(block_cache_tracer), io_tracer_(io_tracer), db_id_(db_id), db_session_id_(db_session_id) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; write_buffer_manager_->RegisterWriteController(write_controller_); } ColumnFamilySet::~ColumnFamilySet() { write_buffer_manager_->DeregisterWriteController(write_controller_); while (column_family_data_.size() > 0) { // cfd destructor will delete itself from column_family_data_ auto cfd = column_family_data_.begin()->second; bool last_ref __attribute__((__unused__)); last_ref = cfd->UnrefAndTryDelete(); assert(last_ref); } bool dummy_last_ref __attribute__((__unused__)); dummy_last_ref = dummy_cfd_->UnrefAndTryDelete(); assert(dummy_last_ref); } ColumnFamilyData* ColumnFamilySet::GetDefault() const { assert(default_cfd_cache_ != nullptr); return default_cfd_cache_; } ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const { auto cfd_iter = column_family_data_.find(id); if (cfd_iter != column_family_data_.end()) { return cfd_iter->second; } else { return nullptr; } } ColumnFamilyData* ColumnFamilySet::GetColumnFamily( const std::string& name) const { auto cfd_iter = column_families_.find(name); if (cfd_iter != column_families_.end()) { auto cfd = GetColumnFamily(cfd_iter->second); assert(cfd != nullptr); return cfd; } else { return nullptr; } } uint32_t ColumnFamilySet::GetNextColumnFamilyID() { return ++max_column_family_; } uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; } void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t 
new_max_column_family) { max_column_family_ = std::max(new_max_column_family, max_column_family_); } size_t ColumnFamilySet::NumberOfColumnFamilies() const { return column_families_.size(); } // under a DB mutex AND write thread ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = new ColumnFamilyData( id, name, dummy_versions, table_cache_, write_buffer_manager_, write_controller_, options, *db_options_, &file_options_, this, block_cache_tracer_, io_tracer_, db_id_, db_session_id_); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); max_column_family_ = std::max(max_column_family_, id); // add to linked list new_cfd->next_ = dummy_cfd_; auto prev = dummy_cfd_->prev_; new_cfd->prev_ = prev; prev->next_ = new_cfd; dummy_cfd_->prev_ = new_cfd; if (id == 0) { default_cfd_cache_ = new_cfd; } return new_cfd; } void ColumnFamilyData::UpdateCFRate(void* client_id, uint64_t write_rate) { if (write_controller_ && write_controller_->is_dynamic_delay()) { write_controller_->HandleNewDelayReq(client_id, write_rate); } } void ColumnFamilyData::ResetCFRate(void* client_id) { if (write_controller_ && write_controller_->is_dynamic_delay()) { write_controller_->HandleRemoveDelayReq(client_id); } } // under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); assert(cfd_iter != column_family_data_.end()); column_family_data_.erase(cfd_iter); column_families_.erase(cfd->GetName()); } // under a DB mutex OR from a write thread bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { if (column_family_id == 0) { // optimization for common case current_ = column_family_set_->GetDefault(); } else { current_ = column_family_set_->GetColumnFamily(column_family_id); } handle_.SetCFD(current_); return current_ != nullptr; } uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const { assert(current_ != nullptr); return current_->GetLogNumber(); } MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const { assert(current_ != nullptr); return current_->mem(); } ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { assert(current_ != nullptr); return &handle_; } uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { auto cfh = static_cast_with_check(column_family); column_family_id = cfh->GetID(); } return column_family_id; } const Comparator* GetColumnFamilyUserComparator( ColumnFamilyHandle* column_family) { if (column_family != nullptr) { return column_family->GetComparator(); } return nullptr; } } // namespace ROCKSDB_NAMESPACE