// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include #include #include #include #include #include "db/db_test_util.h" #include "db/read_callback.h" #include "db/version_edit.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/experimental.h" #include "rocksdb/iostats_context.h" #include "rocksdb/persistent_cache.h" #include "rocksdb/trace_record.h" #include "rocksdb/trace_record_result.h" #include "rocksdb/utilities/replayer.h" #include "rocksdb/wal_filter.h" #include "test_util/testutil.h" #include "util/defer.h" #include "util/random.h" #include "utilities/fault_injection_env.h" namespace ROCKSDB_NAMESPACE { class DBTest2 : public DBTestBase { public: DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} std::vector GetLevelFileMetadatas(int level, int cf = 0) { VersionSet* const versions = dbfull()->GetVersionSet(); assert(versions); ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetColumnFamily(cf); assert(cfd); Version* const current = cfd->current(); assert(current); VersionStorageInfo* const storage_info = current->storage_info(); assert(storage_info); return storage_info->LevelFiles(level); } }; TEST_F(DBTest2, OpenForReadOnly) { DB* db_ptr = nullptr; std::string dbname = test::PerThreadDBPath("db_readonly"); Options options = CurrentOptions(); options.create_if_missing = true; // OpenForReadOnly should fail but will create in the file system ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); // Since is created, we should be able to delete the dir // We first get the list files under // There should not be any subdirectories -- this is not checked here std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it ASSERT_OK(env_->DeleteDir(dbname)); options.create_if_missing = false; // OpenForReadOnly should fail since was successfully deleted ASSERT_NOK(DB::OpenForReadOnly(options, dbname, &db_ptr)); // With create_if_missing false, there should not be a dir in the file system ASSERT_NOK(env_->FileExists(dbname)); } TEST_F(DBTest2, OpenForReadOnlyWithColumnFamilies) { DB* db_ptr = nullptr; std::string dbname = test::PerThreadDBPath("db_readonly"); Options options = CurrentOptions(); options.create_if_missing = true; ColumnFamilyOptions cf_options(options); std::vector column_families; column_families.emplace_back(kDefaultColumnFamilyName, cf_options); column_families.emplace_back("goku", cf_options); std::vector handles; // OpenForReadOnly should fail but will create in the file system ASSERT_NOK( DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); // Since is created, we should be able to delete the dir // We first get the list files under // There should not be any subdirectories -- this is not checked here std::vector files; ASSERT_OK(env_->GetChildren(dbname, &files)); for (auto& f : files) { ASSERT_OK(env_->DeleteFile(dbname + "/" + f)); } // should be empty now and we should be able to delete it 
ASSERT_OK(env_->DeleteDir(dbname)); options.create_if_missing = false; // OpenForReadOnly should fail since was successfully deleted ASSERT_NOK( DB::OpenForReadOnly(options, dbname, column_families, &handles, &db_ptr)); // With create_if_missing false, there should not be a dir in the file system ASSERT_NOK(env_->FileExists(dbname)); } class PartitionedIndexTestListener : public EventListener { public: void OnFlushCompleted(DB* /*db*/, const FlushJobInfo& info) override { ASSERT_GT(info.table_properties.index_partitions, 1); ASSERT_EQ(info.table_properties.index_key_is_user_key, 0); } }; TEST_F(DBTest2, PartitionedIndexUserToInternalKey) { const int kValueSize = 10500; const int kNumEntriesPerFile = 1000; const int kNumFiles = 3; const int kNumDistinctKeys = 30; BlockBasedTableOptions table_options; Options options = CurrentOptions(); options.disable_auto_compactions = true; table_options.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; PartitionedIndexTestListener* listener = new PartitionedIndexTestListener(); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.listeners.emplace_back(listener); std::vector snapshots; Reopen(options); Random rnd(301); for (int i = 0; i < kNumFiles; i++) { for (int j = 0; j < kNumEntriesPerFile; j++) { int key_id = (i * kNumEntriesPerFile + j) % kNumDistinctKeys; std::string value = rnd.RandomString(kValueSize); ASSERT_OK(Put("keykey_" + std::to_string(key_id), value)); snapshots.push_back(db_->GetSnapshot()); } ASSERT_OK(Flush()); } for (auto s : snapshots) { db_->ReleaseSnapshot(s); } } class PrefixFullBloomWithReverseComparator : public DBTestBase, public ::testing::WithParamInterface { public: PrefixFullBloomWithReverseComparator() : DBTestBase("prefix_bloom_reverse", /*env_do_fsync=*/true) {} void SetUp() override { if_cache_filter_ = GetParam(); } bool if_cache_filter_; }; TEST_P(PrefixFullBloomWithReverseComparator, PrefixFullBloomWithReverseComparator) { Options options = last_options_; options.comparator = ReverseBytewiseComparator(); options.prefix_extractor.reset(NewCappedPrefixTransform(3)); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions bbto; if (if_cache_filter_) { bbto.no_block_cache = false; bbto.cache_index_and_filter_blocks = true; bbto.block_cache = NewLRUCache(1); } bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); ASSERT_OK(dbfull()->Put(WriteOptions(), "bar123", "foo")); ASSERT_OK(dbfull()->Put(WriteOptions(), "bar234", "foo2")); ASSERT_OK(dbfull()->Put(WriteOptions(), "foo123", "foo3")); ASSERT_OK(dbfull()->Flush(FlushOptions())); if (bbto.block_cache) { bbto.block_cache->EraseUnRefEntries(); } std::unique_ptr iter(db_->NewIterator(ReadOptions())); iter->Seek("bar345"); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar234", iter->key().ToString()); ASSERT_EQ("foo2", iter->value().ToString()); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bar123", iter->key().ToString()); ASSERT_EQ("foo", iter->value().ToString()); iter->Seek("foo234"); ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("foo123", iter->key().ToString()); ASSERT_EQ("foo3", iter->value().ToString()); iter->Seek("bar"); ASSERT_OK(iter->status()); ASSERT_TRUE(!iter->Valid()); } INSTANTIATE_TEST_CASE_P(PrefixFullBloomWithReverseComparator, PrefixFullBloomWithReverseComparator, testing::Bool()); TEST_F(DBTest2, 
IteratorPropertyVersionNumber) { ASSERT_OK(Put("", "")); Iterator* iter1 = db_->NewIterator(ReadOptions()); ASSERT_OK(iter1->status()); std::string prop_value; ASSERT_OK( iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number1 = static_cast(std::atoi(prop_value.c_str())); ASSERT_OK(Put("", "")); ASSERT_OK(Flush()); Iterator* iter2 = db_->NewIterator(ReadOptions()); ASSERT_OK(iter2->status()); ASSERT_OK( iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number2 = static_cast(std::atoi(prop_value.c_str())); ASSERT_GT(version_number2, version_number1); ASSERT_OK(Put("", "")); Iterator* iter3 = db_->NewIterator(ReadOptions()); ASSERT_OK(iter3->status()); ASSERT_OK( iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number3 = static_cast(std::atoi(prop_value.c_str())); ASSERT_EQ(version_number2, version_number3); iter1->SeekToFirst(); ASSERT_OK( iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); uint64_t version_number1_new = static_cast(std::atoi(prop_value.c_str())); ASSERT_EQ(version_number1, version_number1_new); delete iter1; delete iter2; delete iter3; } TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "a", "begin")); ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); std::string value; value = Get(1, "a"); } TEST_F(DBTest2, MaxSuccessiveMergesChangeWithDBRecovery) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.max_successive_merges = 3; options.merge_operator = MergeOperators::CreatePutOperator(); options.disable_auto_compactions = true; DestroyAndReopen(options); ASSERT_OK(Put("poi", "Finch")); ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Reese")); ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Shaw")); ASSERT_OK(db_->Merge(WriteOptions(), "poi", "Root")); options.max_successive_merges = 2; Reopen(options); } class DBTestSharedWriteBufferAcrossCFs : public DBTestBase, public testing::WithParamInterface> { public: DBTestSharedWriteBufferAcrossCFs() : DBTestBase("db_test_shared_write_buffer", /*env_do_fsync=*/true) {} void SetUp() override { use_old_interface_ = std::get<0>(GetParam()); cost_cache_ = std::get<1>(GetParam()); } bool use_old_interface_; bool cost_cache_; }; TEST_P(DBTestSharedWriteBufferAcrossCFs, SharedWriteBufferAcrossCFs) { Options options = CurrentOptions(); options.arena_block_size = 4096; auto flush_listener = std::make_shared(); options.listeners.push_back(flush_listener); // Don't trip the listener at shutdown. 
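  // (A flush issued while the DB is closing would report a FlushReason other
  // than the one the listener is told to expect below, so flushing during
  // shutdown is avoided to keep the listener's check from tripping.)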
options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Arena::Arena:0", [&](void* arg) { size_t* block_size = static_cast(arg); *block_size = 1; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Arena::AllocateNewBlock:0", [&](void* arg) { std::pair* pair = static_cast*>(arg); *std::get<0>(*pair) = *std::get<1>(*pair); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // The total soft write buffer size is about 105000 std::shared_ptr cache = NewLRUCache(4 * 1024 * 1024, 2); ASSERT_LT(cache->GetUsage(), 256 * 1024); if (use_old_interface_) { options.db_write_buffer_size = 120000; // this is the real limit } else if (!cost_cache_) { options.write_buffer_manager.reset(new WriteBufferManager(114285)); } else { options.write_buffer_manager.reset(new WriteBufferManager(114285, cache)); } options.write_buffer_size = 500000; // this is never hit CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); WriteOptions wo; wo.disableWAL = true; std::function wait_flush = [&]() { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[3])); // Ensure background work is fully finished including listener callbacks // before accessing listener state. ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); }; // Create some data and flush "default" and "nikitich" so that they // are newer CFs created. flush_listener->expected_flush_reason = FlushReason::kManualFlush; ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Flush(3)); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); ASSERT_OK(Flush(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(3, Key(1), DummyString(30000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024); } wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(60000), wo)); if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); ASSERT_LE(cache->GetUsage(), 2 * 256 * 1024); } wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); // No flush should trigger wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(1)); } // Trigger a flush. Flushing "nikitich". ASSERT_OK(Put(3, Key(2), DummyString(30000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(2)); } // Without hitting the threshold, no flush should trigger. 
ASSERT_OK(Put(2, Key(1), DummyString(30000), wo)); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(2)); } // Hit the write buffer limit again. "default" // will have been flushed. ASSERT_OK(Put(2, Key(2), DummyString(10000), wo)); wait_flush(); ASSERT_OK(Put(3, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(1), wo)); wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(2)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(2)); } // Trigger another flush. This time "dobrynia". "pikachu" should not // be flushed, althrough it was never flushed. ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(80000), wo)); wait_flush(); ASSERT_OK(Put(1, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(2)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), static_cast(2)); } if (cost_cache_) { ASSERT_GE(cache->GetUsage(), 256 * 1024); Close(); options.write_buffer_manager.reset(); last_options_.write_buffer_manager.reset(); ASSERT_LT(cache->GetUsage(), 256 * 1024); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } INSTANTIATE_TEST_CASE_P(DBTestSharedWriteBufferAcrossCFs, DBTestSharedWriteBufferAcrossCFs, ::testing::Values(std::make_tuple(true, false), std::make_tuple(false, false), std::make_tuple(false, true))); TEST_F(DBTest2, SharedWriteBufferLimitAcrossDB) { std::string dbname2 = test::PerThreadDBPath("db_shared_wb_db2"); Options options = CurrentOptions(); options.arena_block_size = 4096; auto flush_listener = std::make_shared(); options.listeners.push_back(flush_listener); // Don't trip the listener at shutdown. options.avoid_flush_during_shutdown = true; // Avoid undeterministic value by malloc_usable_size(); // Force arena block size to 1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Arena::Arena:0", [&](void* arg) { size_t* block_size = static_cast(arg); *block_size = 1; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Arena::AllocateNewBlock:0", [&](void* arg) { std::pair* pair = static_cast*>(arg); *std::get<0>(*pair) = *std::get<1>(*pair); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); options.write_buffer_size = 500000; // this is never hit // Use a write buffer total size so that the soft limit is about // 105000. 
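  // (The WriteBufferManager starts flushing once mutable memtable memory
  // reaches roughly 7/8 of its configured size, so 120000 * 7 / 8 = 105000;
  // the exact ratio is an implementation detail, not a guarantee.)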
options.write_buffer_manager.reset(new WriteBufferManager(120000)); CreateAndReopenWithCF({"cf1", "cf2"}, options); ASSERT_OK(DestroyDB(dbname2, options)); DB* db2 = nullptr; ASSERT_OK(DB::Open(options, dbname2, &db2)); WriteOptions wo; wo.disableWAL = true; std::function wait_flush = [&]() { ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[0])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1])); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[2])); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); // Ensure background work is fully finished including listener callbacks // before accessing listener state. ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); ASSERT_OK( static_cast_with_check(db2)->TEST_WaitForBackgroundWork()); }; // Trigger a flush on cf2 flush_listener->expected_flush_reason = FlushReason::kWriteBufferManager; ASSERT_OK(Put(2, Key(1), DummyString(70000), wo)); wait_flush(); ASSERT_OK(Put(0, Key(1), DummyString(20000), wo)); wait_flush(); // Insert to DB2 ASSERT_OK(db2->Put(wo, Key(2), DummyString(20000))); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default") + GetNumberOfSstFilesForColumnFamily(db_, "cf1") + GetNumberOfSstFilesForColumnFamily(db_, "cf2"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); } // Triggering to flush another CF in DB1 ASSERT_OK(db2->Put(wo, Key(2), DummyString(70000))); wait_flush(); ASSERT_OK(Put(2, Key(1), DummyString(1), wo)); wait_flush(); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(0)); } // Triggering flush in DB2. ASSERT_OK(db2->Put(wo, Key(3), DummyString(40000))); wait_flush(); ASSERT_OK(db2->Put(wo, Key(1), DummyString(1))); wait_flush(); ASSERT_OK(static_cast(db2)->TEST_WaitForFlushMemTable()); { ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf1"), static_cast(0)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "cf2"), static_cast(1)); ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db2, "default"), static_cast(1)); } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestWriteBufferNoLimitWithCache) { Options options = CurrentOptions(); options.arena_block_size = 4096; std::shared_ptr cache = NewLRUCache(LRUCacheOptions( 10000000 /* capacity */, 1 /* num_shard_bits */, false /* strict_capacity_limit */, 0.0 /* high_pri_pool_ratio */, nullptr /* memory_allocator */, kDefaultToAdaptiveMutex, kDontChargeCacheMetadata)); options.write_buffer_size = 50000; // this is never hit // Use a write buffer total size so that the soft limit is about // 105000. options.write_buffer_manager.reset(new WriteBufferManager(0, cache)); Reopen(options); ASSERT_OK(Put("foo", "bar")); // One dummy entry is 256KB. 
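  // (A WriteBufferManager created with size 0 enforces no flush limit, but
  // because a block cache is supplied, memtable memory is still charged to
  // that cache in dummy-entry increments, which the check below observes.)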
  ASSERT_GT(cache->GetUsage(), 128000);
}

namespace {
void ValidateKeyExistence(DB* db, const std::vector<Slice>& keys_must_exist,
                          const std::vector<Slice>& keys_must_not_exist) {
  // Ensure that expected keys exist
  std::vector<std::string> values;
  if (keys_must_exist.size() > 0) {
    std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_exist, &values);
    for (size_t i = 0; i < keys_must_exist.size(); i++) {
      ASSERT_OK(status_list[i]);
    }
  }

  // Ensure that given keys don't exist
  if (keys_must_not_exist.size() > 0) {
    std::vector<Status> status_list =
        db->MultiGet(ReadOptions(), keys_must_not_exist, &values);
    for (size_t i = 0; i < keys_must_not_exist.size(); i++) {
      ASSERT_TRUE(status_list[i].IsNotFound());
    }
  }
}
}  // anonymous namespace

TEST_F(DBTest2, WalFilterTest) {
  class TestWalFilter : public WalFilter {
   private:
    // Processing option that is requested to be applied at the given index
    WalFilter::WalProcessingOption wal_processing_option_;
    // Index at which to apply wal_processing_option_
    // At other indexes default wal_processing_option::kContinueProcessing is
    // returned.
    size_t apply_option_at_record_index_;
    // Current record index, incremented with each record encountered.
    size_t current_record_index_;

   public:
    TestWalFilter(WalFilter::WalProcessingOption wal_processing_option,
                  size_t apply_option_for_record_index)
        : wal_processing_option_(wal_processing_option),
          apply_option_at_record_index_(apply_option_for_record_index),
          current_record_index_(0) {}

    WalProcessingOption LogRecord(const WriteBatch& /*batch*/,
                                  WriteBatch* /*new_batch*/,
                                  bool* /*batch_changed*/) const override {
      WalFilter::WalProcessingOption option_to_return;

      if (current_record_index_ == apply_option_at_record_index_) {
        option_to_return = wal_processing_option_;
      } else {
        option_to_return = WalProcessingOption::kContinueProcessing;
      }

      // Filter is passed as a const object for RocksDB to not modify the
      // object, however we modify it for our own purpose here and hence
      // cast the constness away.
(const_cast(this)->current_record_index_)++; return option_to_return; } const char* Name() const override { return "TestWalFilter"; } }; // Create 3 batches with two keys each std::vector> batch_keys(3); batch_keys[0].push_back("key1"); batch_keys[0].push_back("key2"); batch_keys[1].push_back("key3"); batch_keys[1].push_back("key4"); batch_keys[2].push_back("key5"); batch_keys[2].push_back("key6"); // Test with all WAL processing options for (int option = 0; option < static_cast( WalFilter::WalProcessingOption::kWalProcessingOptionMax); option++) { Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } WalFilter::WalProcessingOption wal_processing_option = static_cast(option); // Create a test filter that would apply wal_processing_option at the first // record size_t apply_option_for_record_index = 1; TestWalFilter test_wal_filter(wal_processing_option, apply_option_for_record_index); // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter; Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options); if (wal_processing_option == WalFilter::WalProcessingOption::kCorruptedRecord) { ASSERT_NOK(status); // In case of corruption we can turn off paranoid_checks to reopen // databse options.paranoid_checks = false; ReopenWithColumnFamilies({"default", "pikachu"}, options); } else { ASSERT_OK(status); } // Compute which keys we expect to be found // and which we expect not to be found after recovery. std::vector keys_must_exist; std::vector keys_must_not_exist; switch (wal_processing_option) { case WalFilter::WalProcessingOption::kCorruptedRecord: case WalFilter::WalProcessingOption::kContinueProcessing: { fprintf(stderr, "Testing with complete WAL processing\n"); // we expect all records to be processed for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { keys_must_exist.emplace_back(batch_keys[i][j]); } } break; } case WalFilter::WalProcessingOption::kIgnoreCurrentRecord: { fprintf(stderr, "Testing with ignoring record %" ROCKSDB_PRIszt " only\n", apply_option_for_record_index); // We expect the record with apply_option_for_record_index to be not // found. for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i == apply_option_for_record_index) { keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { keys_must_exist.emplace_back(batch_keys[i][j]); } } } break; } case WalFilter::WalProcessingOption::kStopReplay: { fprintf(stderr, "Testing with stopping replay from record %" ROCKSDB_PRIszt "\n", apply_option_for_record_index); // We expect records beyond apply_option_for_record_index to be not // found. 
for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i >= apply_option_for_record_index) { keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { keys_must_exist.emplace_back(batch_keys[i][j]); } } } break; } default: FAIL(); // unhandled case } bool checked_after_reopen = false; while (true) { // Ensure that expected keys exists // and not expected keys don't exist after recovery ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); if (checked_after_reopen) { break; } // reopen database again to make sure previous log(s) are not used //(even if they were skipped) // reopn database with option to use WAL filter options = OptionsForLogIterTest(); ReopenWithColumnFamilies({"default", "pikachu"}, options); checked_after_reopen = true; } } } TEST_F(DBTest2, WalFilterTestWithChangeBatch) { class ChangeBatchHandler : public WriteBatch::Handler { private: // Batch to insert keys in WriteBatch* new_write_batch_; // Number of keys to add in the new batch size_t num_keys_to_add_in_new_batch_; // Number of keys added to new batch size_t num_keys_added_; public: ChangeBatchHandler(WriteBatch* new_write_batch, size_t num_keys_to_add_in_new_batch) : new_write_batch_(new_write_batch), num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), num_keys_added_(0) {} void Put(const Slice& key, const Slice& value) override { if (num_keys_added_ < num_keys_to_add_in_new_batch_) { ASSERT_OK(new_write_batch_->Put(key, value)); ++num_keys_added_; } } }; class TestWalFilterWithChangeBatch : public WalFilter { private: // Index at which to start changing records size_t change_records_from_index_; // Number of keys to add in the new batch size_t num_keys_to_add_in_new_batch_; // Current record index, incremented with each record encountered. size_t current_record_index_; public: TestWalFilterWithChangeBatch(size_t change_records_from_index, size_t num_keys_to_add_in_new_batch) : change_records_from_index_(change_records_from_index), num_keys_to_add_in_new_batch_(num_keys_to_add_in_new_batch), current_record_index_(0) {} WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) const override { if (current_record_index_ >= change_records_from_index_) { ChangeBatchHandler handler(new_batch, num_keys_to_add_in_new_batch_); Status s = batch.Iterate(&handler); if (s.ok()) { *batch_changed = true; } else { assert(false); } } // Filter is passed as a const object for RocksDB to not modify the // object, however we modify it for our own purpose here and hence // cast the constness away. 
(const_cast(this) ->current_record_index_)++; return WalProcessingOption::kContinueProcessing; } const char* Name() const override { return "TestWalFilterWithChangeBatch"; } }; std::vector> batch_keys(3); batch_keys[0].push_back("key1"); batch_keys[0].push_back("key2"); batch_keys[1].push_back("key3"); batch_keys[1].push_back("key4"); batch_keys[2].push_back("key5"); batch_keys[2].push_back("key6"); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would apply wal_processing_option at the first // record size_t change_records_from_index = 1; size_t num_keys_to_add_in_new_batch = 1; TestWalFilterWithChangeBatch test_wal_filter_with_change_batch( change_records_from_index, num_keys_to_add_in_new_batch); // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter_with_change_batch; ReopenWithColumnFamilies({"default", "pikachu"}, options); // Ensure that all keys exist before change_records_from_index_ // And after that index only single key exists // as our filter adds only single key for each batch std::vector keys_must_exist; std::vector keys_must_not_exist; for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { if (i >= change_records_from_index && j >= num_keys_to_add_in_new_batch) { keys_must_not_exist.emplace_back(batch_keys[i][j]); } else { keys_must_exist.emplace_back(batch_keys[i][j]); } } } bool checked_after_reopen = false; while (true) { // Ensure that expected keys exists // and not expected keys don't exist after recovery ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); if (checked_after_reopen) { break; } // reopen database again to make sure previous log(s) are not used //(even if they were skipped) // reopn database with option to use WAL filter options = OptionsForLogIterTest(); ReopenWithColumnFamilies({"default", "pikachu"}, options); checked_after_reopen = true; } } TEST_F(DBTest2, WalFilterTestWithChangeBatchExtraKeys) { class TestWalFilterWithChangeBatchAddExtraKeys : public WalFilter { public: WalProcessingOption LogRecord(const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) const override { *new_batch = batch; Status s = new_batch->Put("key_extra", "value_extra"); if (s.ok()) { *batch_changed = true; } else { assert(false); } return WalProcessingOption::kContinueProcessing; } const char* Name() const override { return "WalFilterTestWithChangeBatchExtraKeys"; } }; std::vector> batch_keys(3); batch_keys[0].push_back("key1"); batch_keys[0].push_back("key2"); batch_keys[1].push_back("key3"); batch_keys[1].push_back("key4"); batch_keys[2].push_back("key5"); batch_keys[2].push_back("key6"); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys[i].size(); j++) { ASSERT_OK(batch.Put(handles_[0], batch_keys[i][j], DummyString(1024))); } ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Create a test filter that would add extra keys 
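  // (A WAL filter may drop or trim records, but a rewritten batch that ends up
  // with more entries than the original is rejected during recovery, which is
  // why the reopen below is expected to return NotSupported.)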
TestWalFilterWithChangeBatchAddExtraKeys test_wal_filter_extra_keys; // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter_extra_keys; Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(status.IsNotSupported()); // Reopen without filter, now reopen should succeed - previous // attempt to open must not have altered the db. options = OptionsForLogIterTest(); ReopenWithColumnFamilies({"default", "pikachu"}, options); std::vector keys_must_exist; std::vector keys_must_not_exist; // empty vector for (size_t i = 0; i < batch_keys.size(); i++) { for (size_t j = 0; j < batch_keys[i].size(); j++) { keys_must_exist.emplace_back(batch_keys[i][j]); } } ValidateKeyExistence(db_, keys_must_exist, keys_must_not_exist); } TEST_F(DBTest2, WalFilterTestWithColumnFamilies) { class TestWalFilterWithColumnFamilies : public WalFilter { private: // column_family_id -> log_number map (provided to WALFilter) std::map cf_log_number_map_; // column_family_name -> column_family_id map (provided to WALFilter) std::map cf_name_id_map_; // column_family_name -> keys_found_in_wal map // We store keys that are applicable to the column_family // during recovery (i.e. aren't already flushed to SST file(s)) // for verification against the keys we expect. std::map> cf_wal_keys_; public: void ColumnFamilyLogNumberMap( const std::map& cf_lognumber_map, const std::map& cf_name_id_map) override { cf_log_number_map_ = cf_lognumber_map; cf_name_id_map_ = cf_name_id_map; } WalProcessingOption LogRecordFound(unsigned long long log_number, const std::string& /*log_file_name*/, const WriteBatch& batch, WriteBatch* /*new_batch*/, bool* /*batch_changed*/) override { class LogRecordBatchHandler : public WriteBatch::Handler { private: const std::map& cf_log_number_map_; std::map>& cf_wal_keys_; unsigned long long log_number_; public: LogRecordBatchHandler( unsigned long long current_log_number, const std::map& cf_log_number_map, std::map>& cf_wal_keys) : cf_log_number_map_(cf_log_number_map), cf_wal_keys_(cf_wal_keys), log_number_(current_log_number) {} Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& /*value*/) override { auto it = cf_log_number_map_.find(column_family_id); assert(it != cf_log_number_map_.end()); unsigned long long log_number_for_cf = it->second; // If the current record is applicable for column_family_id // (i.e. isn't flushed to SST file(s) for column_family_id) // add it to the cf_wal_keys_ map for verification. if (log_number_ >= log_number_for_cf) { cf_wal_keys_[column_family_id].push_back( std::string(key.data(), key.size())); } return Status::OK(); } } handler(log_number, cf_log_number_map_, cf_wal_keys_); Status s = batch.Iterate(&handler); if (!s.ok()) { // TODO(AR) is this ok? 
return WalProcessingOption::kCorruptedRecord; } return WalProcessingOption::kContinueProcessing; } const char* Name() const override { return "WalFilterTestWithColumnFamilies"; } const std::map>& GetColumnFamilyKeys() { return cf_wal_keys_; } const std::map& GetColumnFamilyNameIdMap() { return cf_name_id_map_; } }; std::vector> batch_keys_pre_flush(3); batch_keys_pre_flush[0].push_back("key1"); batch_keys_pre_flush[0].push_back("key2"); batch_keys_pre_flush[1].push_back("key3"); batch_keys_pre_flush[1].push_back("key4"); batch_keys_pre_flush[2].push_back("key5"); batch_keys_pre_flush[2].push_back("key6"); Options options = OptionsForLogIterTest(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); // Write given keys in given batches for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { ASSERT_OK(batch.Put(handles_[0], batch_keys_pre_flush[i][j], DummyString(1024))); ASSERT_OK(batch.Put(handles_[1], batch_keys_pre_flush[i][j], DummyString(1024))); } ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // Flush default column-family ASSERT_OK(db_->Flush(FlushOptions(), handles_[0])); // Do some more writes std::vector> batch_keys_post_flush(3); batch_keys_post_flush[0].push_back("key7"); batch_keys_post_flush[0].push_back("key8"); batch_keys_post_flush[1].push_back("key9"); batch_keys_post_flush[1].push_back("key10"); batch_keys_post_flush[2].push_back("key11"); batch_keys_post_flush[2].push_back("key12"); // Write given keys in given batches for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { WriteBatch batch; for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { ASSERT_OK(batch.Put(handles_[0], batch_keys_post_flush[i][j], DummyString(1024))); ASSERT_OK(batch.Put(handles_[1], batch_keys_post_flush[i][j], DummyString(1024))); } ASSERT_OK(dbfull()->Write(WriteOptions(), &batch)); } // On Recovery we should only find the second batch applicable to default CF // But both batches applicable to pikachu CF // Create a test filter that would add extra keys TestWalFilterWithColumnFamilies test_wal_filter_column_families; // Reopen database with option to use WAL filter options = OptionsForLogIterTest(); options.wal_filter = &test_wal_filter_column_families; Status status = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(status.ok()); // verify that handles_[0] only has post_flush keys // while handles_[1] has pre and post flush keys auto cf_wal_keys = test_wal_filter_column_families.GetColumnFamilyKeys(); auto name_id_map = test_wal_filter_column_families.GetColumnFamilyNameIdMap(); size_t index = 0; auto keys_cf = cf_wal_keys[name_id_map[kDefaultColumnFamilyName]]; // default column-family, only post_flush keys are expected for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { for (size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } ASSERT_EQ(index, keys_cf.size()); index = 0; keys_cf = cf_wal_keys[name_id_map["pikachu"]]; // pikachu column-family, all keys are expected for (size_t i = 0; i < batch_keys_pre_flush.size(); i++) { for (size_t j = 0; j < batch_keys_pre_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_pre_flush[i][j]); ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } for (size_t i = 0; i < batch_keys_post_flush.size(); i++) { for 
(size_t j = 0; j < batch_keys_post_flush[i].size(); j++) { Slice key_from_the_log(keys_cf[index++]); Slice batch_key(batch_keys_post_flush[i][j]); ASSERT_EQ(key_from_the_log.compare(batch_key), 0); } } ASSERT_EQ(index, keys_cf.size()); } TEST_F(DBTest2, PresetCompressionDict) { // Verifies that compression ratio improves when dictionary is enabled, and // improves even further when the dictionary is trained by ZSTD. const size_t kBlockSizeBytes = 4 << 10; const size_t kL0FileBytes = 128 << 10; const size_t kApproxPerBlockOverheadBytes = 50; const int kNumL0Files = 5; Options options; // Make sure to use any custom env that the test is configured with. options.env = CurrentOptions().env; options.allow_concurrent_memtable_write = false; options.arena_block_size = kBlockSizeBytes; options.create_if_missing = true; options.disable_auto_compactions = true; options.level0_file_num_compaction_trigger = kNumL0Files; options.memtable_factory.reset( test::NewSpecialSkipListFactory(kL0FileBytes / kBlockSizeBytes)); options.num_levels = 2; options.target_file_size_base = kL0FileBytes; options.target_file_size_multiplier = 2; options.write_buffer_size = kL0FileBytes; BlockBasedTableOptions table_options; table_options.block_size = kBlockSizeBytes; std::vector compression_types; if (Zlib_Supported()) { compression_types.push_back(kZlibCompression); } #if LZ4_VERSION_NUMBER >= 10400 // r124+ compression_types.push_back(kLZ4Compression); compression_types.push_back(kLZ4HCCompression); #endif // LZ4_VERSION_NUMBER >= 10400 if (ZSTD_Supported()) { compression_types.push_back(kZSTD); } enum DictionaryTypes : int { kWithoutDict, kWithDict, kWithZSTDfinalizeDict, kWithZSTDTrainedDict, kDictEnd, }; for (auto compression_type : compression_types) { options.compression = compression_type; size_t bytes_without_dict = 0; size_t bytes_with_dict = 0; size_t bytes_with_zstd_finalize_dict = 0; size_t bytes_with_zstd_trained_dict = 0; for (int i = kWithoutDict; i < kDictEnd; i++) { // First iteration: compress without preset dictionary // Second iteration: compress with preset dictionary // Third iteration (zstd only): compress with zstd-trained dictionary // // To make sure the compression dictionary has the intended effect, we // verify the compressed size is smaller in successive iterations. Also in // the non-first iterations, verify the data we get out is the same data // we put in. 
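      // (The two zstd-specific cases below differ only in
      // CompressionOptions::use_zstd_dict_trainer: false generates the
      // dictionary via zstd's ZDICT_finalizeDictionary path, true runs the
      // full zstd dictionary trainer.)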
      switch (i) {
        case kWithoutDict:
          options.compression_opts.max_dict_bytes = 0;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithDict:
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = 0;
          break;
        case kWithZSTDfinalizeDict:
          if (compression_type != kZSTD ||
              !ZSTD_FinalizeDictionarySupported()) {
            continue;
          }
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
          options.compression_opts.use_zstd_dict_trainer = false;
          break;
        case kWithZSTDTrainedDict:
          if (compression_type != kZSTD || !ZSTD_TrainDictionarySupported()) {
            continue;
          }
          options.compression_opts.max_dict_bytes = kBlockSizeBytes;
          options.compression_opts.zstd_max_train_bytes = kL0FileBytes;
          options.compression_opts.use_zstd_dict_trainer = true;
          break;
        default:
          assert(false);
      }

      options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
      options.table_factory.reset(NewBlockBasedTableFactory(table_options));
      CreateAndReopenWithCF({"pikachu"}, options);
      Random rnd(301);
      std::string seq_datas[10];
      for (int j = 0; j < 10; ++j) {
        seq_datas[j] =
            rnd.RandomString(kBlockSizeBytes - kApproxPerBlockOverheadBytes);
      }

      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      for (int j = 0; j < kNumL0Files; ++j) {
        for (size_t k = 0; k < kL0FileBytes / kBlockSizeBytes + 1; ++k) {
          auto key_num = j * (kL0FileBytes / kBlockSizeBytes) + k;
          ASSERT_OK(Put(1, Key(static_cast<int>(key_num)),
                        seq_datas[(key_num / 10) % 10]));
        }
        ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[1]));
        ASSERT_EQ(j + 1, NumTableFilesAtLevel(0, 1));
      }
      ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1],
                                            true /* disallow_trivial_move */));
      ASSERT_EQ(0, NumTableFilesAtLevel(0, 1));
      ASSERT_GT(NumTableFilesAtLevel(1, 1), 0);

      // Get the live sst files size
      size_t total_sst_bytes = TotalSize(1);
      if (i == kWithoutDict) {
        bytes_without_dict = total_sst_bytes;
      } else if (i == kWithDict) {
        bytes_with_dict = total_sst_bytes;
      } else if (i == kWithZSTDfinalizeDict) {
        bytes_with_zstd_finalize_dict = total_sst_bytes;
      } else if (i == kWithZSTDTrainedDict) {
        bytes_with_zstd_trained_dict = total_sst_bytes;
      }

      for (size_t j = 0; j < kNumL0Files * (kL0FileBytes / kBlockSizeBytes);
           j++) {
        ASSERT_EQ(seq_datas[(j / 10) % 10], Get(1, Key(static_cast<int>(j))));
      }
      if (i == kWithDict) {
        ASSERT_GT(bytes_without_dict, bytes_with_dict);
      } else if (i == kWithZSTDfinalizeDict) {
        // In zstd compression, it is sometimes possible that using a finalized
        // dictionary does not get as good a compression ratio as raw content
        // dictionary. But using a dictionary should always get better
        // compression ratio than not using one.
        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_finalize_dict ||
                    bytes_without_dict > bytes_with_zstd_finalize_dict);
      } else if (i == kWithZSTDTrainedDict) {
        // In zstd compression, it is sometimes possible that using a trained
        // dictionary does not get as good a compression ratio as without
        // training.
        // But using a dictionary (with or without training) should always get
        // better compression ratio than not using one.
        ASSERT_TRUE(bytes_with_dict > bytes_with_zstd_trained_dict ||
                    bytes_without_dict > bytes_with_zstd_trained_dict);
      }

      DestroyAndReopen(options);
    }
  }
}

TEST_F(DBTest2, PresetCompressionDictLocality) {
  if (!ZSTD_Supported()) {
    return;
  }
  // Verifies that compression dictionary is generated from local data. The
  // verification simply checks all output SSTs have different compression
  // dictionaries.
We do not verify effectiveness as that'd likely be flaky in // the future. const int kNumEntriesPerFile = 1 << 10; // 1KB const int kNumBytesPerEntry = 1 << 10; // 1KB const int kNumFiles = 4; Options options = CurrentOptions(); options.compression = kZSTD; options.compression_opts.max_dict_bytes = 1 << 14; // 16KB options.compression_opts.zstd_max_train_bytes = 1 << 18; // 256KB options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.target_file_size_base = kNumEntriesPerFile * kNumBytesPerEntry; BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); Random rnd(301); for (int i = 0; i < kNumFiles; ++i) { for (int j = 0; j < kNumEntriesPerFile; ++j) { ASSERT_OK(Put(Key(i * kNumEntriesPerFile + j), rnd.RandomString(kNumBytesPerEntry))); } ASSERT_OK(Flush()); MoveFilesToLevel(1); ASSERT_EQ(NumTableFilesAtLevel(1), i + 1); } // Store all the dictionaries generated during a full compaction. std::vector compression_dicts; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "BlockBasedTableBuilder::WriteCompressionDictBlock:RawDict", [&](void* arg) { compression_dicts.emplace_back(static_cast(arg)->ToString()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); CompactRangeOptions compact_range_opts; compact_range_opts.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); // Dictionary compression should not be so good as to compress four totally // random files into one. If it does then there's probably something wrong // with the test. ASSERT_GT(NumTableFilesAtLevel(1), 1); // Furthermore, there should be one compression dictionary generated per file. // And they should all be different from each other. ASSERT_EQ(NumTableFilesAtLevel(1), static_cast(compression_dicts.size())); for (size_t i = 1; i < compression_dicts.size(); ++i) { std::string& a = compression_dicts[i - 1]; std::string& b = compression_dicts[i]; size_t alen = a.size(); size_t blen = b.size(); ASSERT_TRUE(alen != blen || memcmp(a.data(), b.data(), alen) != 0); } } class PresetCompressionDictTest : public DBTestBase, public testing::WithParamInterface> { public: PresetCompressionDictTest() : DBTestBase("db_test2", false /* env_do_fsync */), compression_type_(std::get<0>(GetParam())), bottommost_(std::get<1>(GetParam())) {} protected: const CompressionType compression_type_; const bool bottommost_; }; INSTANTIATE_TEST_CASE_P( DBTest2, PresetCompressionDictTest, ::testing::Combine(::testing::ValuesIn(GetSupportedDictCompressions()), ::testing::Bool())); TEST_P(PresetCompressionDictTest, Flush) { // Verifies that dictionary is generated and written during flush only when // `ColumnFamilyOptions::compression` enables dictionary. Also verifies the // size of the dictionary is within expectations according to the limit on // buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
const size_t kValueLen = 256; const size_t kKeysPerFile = 1 << 10; const size_t kDictLen = 16 << 10; const size_t kBlockLen = 4 << 10; Options options = CurrentOptions(); if (bottommost_) { options.bottommost_compression = compression_type_; options.bottommost_compression_opts.enabled = true; options.bottommost_compression_opts.max_dict_bytes = kDictLen; options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; } else { options.compression = compression_type_; options.compression_opts.max_dict_bytes = kDictLen; options.compression_opts.max_dict_buffer_bytes = kBlockLen; } options.memtable_factory.reset(test::NewSpecialSkipListFactory(kKeysPerFile)); options.statistics = CreateDBStatistics(); BlockBasedTableOptions bbto; bbto.block_size = kBlockLen; bbto.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); for (size_t i = 0; i <= kKeysPerFile; ++i) { ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(kValueLen))); } ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a // compression dictionary exists since dictionaries would be preloaded when // the flush finishes. if (bottommost_) { // Flush is never considered bottommost. This should change in the future // since flushed files may have nothing underneath them, like the one in // this test case. ASSERT_EQ( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), 0); } else { ASSERT_GT( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), 0); // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on // number of bytes needs to be adjusted in case the cached block is in // ZSTD's digested dictionary format. if (compression_type_ != kZSTD && compression_type_ != kZSTDNotFinalCompression) { // Although we limited buffering to `kBlockLen`, there may be up to two // blocks of data included in the dictionary since we only check limit // after each block is built. ASSERT_LE(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), 2 * kBlockLen); } } } TEST_P(PresetCompressionDictTest, CompactNonBottommost) { // Verifies that dictionary is generated and written during compaction to // non-bottommost level only when `ColumnFamilyOptions::compression` enables // dictionary. Also verifies the size of the dictionary is within expectations // according to the limit on buffering set by // `CompressionOptions::max_dict_buffer_bytes`. 
const size_t kValueLen = 256; const size_t kKeysPerFile = 1 << 10; const size_t kDictLen = 16 << 10; const size_t kBlockLen = 4 << 10; Options options = CurrentOptions(); if (bottommost_) { options.bottommost_compression = compression_type_; options.bottommost_compression_opts.enabled = true; options.bottommost_compression_opts.max_dict_bytes = kDictLen; options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; } else { options.compression = compression_type_; options.compression_opts.max_dict_bytes = kDictLen; options.compression_opts.max_dict_buffer_bytes = kBlockLen; } options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions bbto; bbto.block_size = kBlockLen; bbto.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); for (size_t j = 0; j <= kKeysPerFile; ++j) { ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); } ASSERT_OK(Flush()); MoveFilesToLevel(2); for (int i = 0; i < 2; ++i) { for (size_t j = 0; j <= kKeysPerFile; ++j) { ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); } ASSERT_OK(Flush()); } ASSERT_EQ("2,0,1", FilesPerLevel(0)); uint64_t prev_compression_dict_bytes_inserted = TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); // This L0->L1 compaction merges the two L0 files into L1. The produced L1 // file is not bottommost due to the existing L2 file covering the same key- // range. ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); ASSERT_EQ("0,1,1", FilesPerLevel(0)); // We can use `BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT` to detect whether a // compression dictionary exists since dictionaries would be preloaded when // the compaction finishes. if (bottommost_) { ASSERT_EQ( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted); } else { ASSERT_GT( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted); // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on // number of bytes needs to be adjusted in case the cached block is in // ZSTD's digested dictionary format. if (compression_type_ != kZSTD && compression_type_ != kZSTDNotFinalCompression) { // Although we limited buffering to `kBlockLen`, there may be up to two // blocks of data included in the dictionary since we only check limit // after each block is built. ASSERT_LE(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted + 2 * kBlockLen); } } } TEST_P(PresetCompressionDictTest, CompactBottommost) { // Verifies that dictionary is generated and written during compaction to // non-bottommost level only when either `ColumnFamilyOptions::compression` or // `ColumnFamilyOptions::bottommost_compression` enables dictionary. Also // verifies the size of the dictionary is within expectations according to the // limit on buffering set by `CompressionOptions::max_dict_buffer_bytes`. 
const size_t kValueLen = 256; const size_t kKeysPerFile = 1 << 10; const size_t kDictLen = 16 << 10; const size_t kBlockLen = 4 << 10; Options options = CurrentOptions(); if (bottommost_) { options.bottommost_compression = compression_type_; options.bottommost_compression_opts.enabled = true; options.bottommost_compression_opts.max_dict_bytes = kDictLen; options.bottommost_compression_opts.max_dict_buffer_bytes = kBlockLen; } else { options.compression = compression_type_; options.compression_opts.max_dict_bytes = kDictLen; options.compression_opts.max_dict_buffer_bytes = kBlockLen; } options.disable_auto_compactions = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions bbto; bbto.block_size = kBlockLen; bbto.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); for (int i = 0; i < 2; ++i) { for (size_t j = 0; j <= kKeysPerFile; ++j) { ASSERT_OK(Put(Key(static_cast(j)), rnd.RandomString(kValueLen))); } ASSERT_OK(Flush()); } ASSERT_EQ("2", FilesPerLevel(0)); uint64_t prev_compression_dict_bytes_inserted = TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT); CompactRangeOptions cro; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel(0)); ASSERT_GT( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted); // TODO(ajkr): fix the below assertion to work with ZSTD. The expectation on // number of bytes needs to be adjusted in case the cached block is in ZSTD's // digested dictionary format. if (compression_type_ != kZSTD && compression_type_ != kZSTDNotFinalCompression) { // Although we limited buffering to `kBlockLen`, there may be up to two // blocks of data included in the dictionary since we only check limit after // each block is built. 
ASSERT_LE( TestGetTickerCount(options, BLOCK_CACHE_COMPRESSION_DICT_BYTES_INSERT), prev_compression_dict_bytes_inserted + 2 * kBlockLen); } } class CompactionCompressionListener : public EventListener { public: explicit CompactionCompressionListener(Options* db_options) : db_options_(db_options) {} void OnCompactionCompleted(DB* db, const CompactionJobInfo& ci) override { // Figure out last level with files int bottommost_level = 0; for (int level = 0; level < db->NumberLevels(); level++) { std::string files_at_level; ASSERT_TRUE( db->GetProperty("rocksdb.num-files-at-level" + std::to_string(level), &files_at_level)); if (files_at_level != "0") { bottommost_level = level; } } if (db_options_->bottommost_compression != kDisableCompressionOption && ci.output_level == bottommost_level) { ASSERT_EQ(ci.compression, db_options_->bottommost_compression); } else if (db_options_->compression_per_level.size() != 0) { ASSERT_EQ(ci.compression, db_options_->compression_per_level[ci.output_level]); } else { ASSERT_EQ(ci.compression, db_options_->compression); } max_level_checked = std::max(max_level_checked, ci.output_level); } int max_level_checked = 0; const Options* db_options_; }; enum CompressionFailureType { kTestCompressionFail, kTestDecompressionFail, kTestDecompressionCorruption }; class CompressionFailuresTest : public DBTest2, public testing::WithParamInterface> { public: CompressionFailuresTest() { std::tie(compression_failure_type_, compression_type_, compression_max_dict_bytes_, compression_parallel_threads_) = GetParam(); } CompressionFailureType compression_failure_type_ = kTestCompressionFail; CompressionType compression_type_ = kNoCompression; uint32_t compression_max_dict_bytes_ = 0; uint32_t compression_parallel_threads_ = 0; }; INSTANTIATE_TEST_CASE_P( DBTest2, CompressionFailuresTest, ::testing::Combine(::testing::Values(kTestCompressionFail, kTestDecompressionFail, kTestDecompressionCorruption), ::testing::ValuesIn(GetSupportedCompressions()), ::testing::Values(0, 10), ::testing::Values(1, 4))); TEST_P(CompressionFailuresTest, CompressionFailures) { if (compression_type_ == kNoCompression) { return; } Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 2; options.max_bytes_for_level_base = 1024; options.max_bytes_for_level_multiplier = 2; options.num_levels = 7; options.max_background_compactions = 1; options.target_file_size_base = 512; BlockBasedTableOptions table_options; table_options.block_size = 512; table_options.verify_compression = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.compression = compression_type_; options.compression_opts.parallel_threads = compression_parallel_threads_; options.compression_opts.max_dict_bytes = compression_max_dict_bytes_; options.bottommost_compression_opts.parallel_threads = compression_parallel_threads_; options.bottommost_compression_opts.max_dict_bytes = compression_max_dict_bytes_; if (compression_failure_type_ == kTestCompressionFail) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompressData:TamperWithReturnValue", [](void* arg) { bool* ret = static_cast(arg); *ret = false; }); } else if (compression_failure_type_ == kTestDecompressionFail) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "UncompressBlockData:TamperWithReturnValue", [](void* arg) { Status* ret = static_cast(arg); ASSERT_OK(*ret); *ret = Status::Corruption("kTestDecompressionFail"); }); } else if (compression_failure_type_ == kTestDecompressionCorruption) { 
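    // (The callback registered below replaces the uncompressed output with
    // differently sized junk, so the mismatch presumably surfaces through the
    // table builder's verify_compression check as the corruption status
    // asserted at the end of the test.)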
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
        "UncompressBlockData:"
        "TamperWithDecompressionOutput",
        [](void* arg) {
          BlockContents* contents = static_cast<BlockContents*>(arg);
          // Ensure uncompressed data != original data
          const size_t len = contents->data.size() + 1;
          std::unique_ptr<char[]> fake_data(new char[len]());
          *contents = BlockContents(std::move(fake_data), len);
        });
  }

  std::map<std::string, std::string> key_value_written;

  const int kKeySize = 5;
  const int kValUnitSize = 16;
  const int kValSize = 256;
  Random rnd(405);

  Status s = Status::OK();

  DestroyAndReopen(options);
  // Write 10 random files
  for (int i = 0; i < 10; i++) {
    for (int j = 0; j < 5; j++) {
      std::string key = rnd.RandomString(kKeySize);
      // Ensure good compression ratio
      std::string valueUnit = rnd.RandomString(kValUnitSize);
      std::string value;
      for (int k = 0; k < kValSize; k += kValUnitSize) {
        value += valueUnit;
      }
      s = Put(key, value);
      if (compression_failure_type_ == kTestCompressionFail) {
        key_value_written[key] = value;
        ASSERT_OK(s);
      }
    }
    s = Flush();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    s = dbfull()->TEST_WaitForCompact();
    if (compression_failure_type_ == kTestCompressionFail) {
      ASSERT_OK(s);
    }
    if (i == 4) {
      // Make compression fail in the middle of table building
      ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
    }
  }
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();

  if (compression_failure_type_ == kTestCompressionFail) {
    // Should be kNoCompression, check content consistency
    std::unique_ptr<Iterator> db_iter(db_->NewIterator(ReadOptions()));
    for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) {
      std::string key = db_iter->key().ToString();
      std::string value = db_iter->value().ToString();
      ASSERT_NE(key_value_written.find(key), key_value_written.end());
      ASSERT_EQ(key_value_written[key], value);
      key_value_written.erase(key);
    }
    ASSERT_OK(db_iter->status());
    ASSERT_EQ(0, key_value_written.size());
  } else if (compression_failure_type_ == kTestDecompressionFail) {
    ASSERT_EQ(std::string(s.getState()),
              "Could not decompress: kTestDecompressionFail");
  } else if (compression_failure_type_ == kTestDecompressionCorruption) {
    ASSERT_EQ(std::string(s.getState()),
              "Decompressed block did not match pre-compression block");
  }
}

TEST_F(DBTest2, CompressionOptions) {
  if (!Zlib_Supported() || !Snappy_Supported()) {
    return;
  }

  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.max_bytes_for_level_base = 100;
  options.max_bytes_for_level_multiplier = 2;
  options.num_levels = 7;
  options.max_background_compactions = 1;

  CompactionCompressionListener* listener =
      new CompactionCompressionListener(&options);
  options.listeners.emplace_back(listener);

  const int kKeySize = 5;
  const int kValSize = 20;
  Random rnd(301);

  std::vector<uint32_t> compression_parallel_threads = {1, 4};

  std::map<std::string, std::string> key_value_written;

  for (int iter = 0; iter <= 2; iter++) {
    listener->max_level_checked = 0;

    if (iter == 0) {
      // Use different compression algorithms for different levels but
      // always use Zlib for bottommost level
      options.compression_per_level = {kNoCompression,     kNoCompression,
                                       kNoCompression,     kSnappyCompression,
                                       kSnappyCompression, kSnappyCompression,
                                       kZlibCompression};
      options.compression = kNoCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 1) {
      // Use Snappy except for the bottommost level, which uses Zlib
      options.compression_per_level = {};
      options.compression = kSnappyCompression;
      options.bottommost_compression = kZlibCompression;
    } else if (iter == 2) {
      // Use Snappy everywhere
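      // kDisableCompressionOption below means the bottommost level simply
      // falls back to options.compression, i.e. Snappy as well.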
options.compression_per_level = {}; options.compression = kSnappyCompression; options.bottommost_compression = kDisableCompressionOption; } for (auto num_threads : compression_parallel_threads) { options.compression_opts.parallel_threads = num_threads; options.bottommost_compression_opts.parallel_threads = num_threads; DestroyAndReopen(options); // Write 10 random files for (int i = 0; i < 10; i++) { for (int j = 0; j < 5; j++) { std::string key = rnd.RandomString(kKeySize); std::string value = rnd.RandomString(kValSize); key_value_written[key] = value; ASSERT_OK(Put(key, value)); } ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } // Make sure that we wrote enough to check all 7 levels ASSERT_EQ(listener->max_level_checked, 6); // Make sure database content is the same as key_value_written std::unique_ptr db_iter(db_->NewIterator(ReadOptions())); for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { std::string key = db_iter->key().ToString(); std::string value = db_iter->value().ToString(); ASSERT_NE(key_value_written.find(key), key_value_written.end()); ASSERT_EQ(key_value_written[key], value); key_value_written.erase(key); } ASSERT_OK(db_iter->status()); ASSERT_EQ(0, key_value_written.size()); } } } class CompactionStallTestListener : public EventListener { public: CompactionStallTestListener() : compacting_files_cnt_(0), compacted_files_cnt_(0) {} void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); compacting_files_cnt_ += ci.input_files.size(); } void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); ASSERT_EQ(ci.compaction_reason, CompactionReason::kLevelL0FilesNum); compacted_files_cnt_ += ci.input_files.size(); } std::atomic compacting_files_cnt_; std::atomic compacted_files_cnt_; }; TEST_F(DBTest2, CompactionStall) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:0"}, {"DBImpl::BGWorkCompaction", "DBTest2::CompactionStall:1"}, {"DBTest2::CompactionStall:2", "DBImpl::NotifyOnCompactionBegin::UnlockMutex"}, {"DBTest2::CompactionStall:3", "DBImpl::NotifyOnCompactionCompleted::UnlockMutex"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = 4; options.max_background_compactions = 40; CompactionStallTestListener* listener = new CompactionStallTestListener(); options.listeners.emplace_back(listener); DestroyAndReopen(options); // make sure all background compaction jobs can be scheduled auto stop_token = dbfull()->TEST_write_controler().GetCompactionPressureToken(); Random rnd(301); // 4 Files in L0 for (int i = 0; i < 4; i++) { for (int j = 0; j < 10; j++) { ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } // Wait for compaction to be triggered TEST_SYNC_POINT("DBTest2::CompactionStall:0"); // Clear "DBImpl::BGWorkCompaction" SYNC_POINT since we want to hold it again // at DBTest2::CompactionStall::1 ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); // Another 6 L0 files to trigger compaction again for (int i = 0; i < 6; i++) { for (int j = 0; j < 10; j++) { ASSERT_OK(Put(rnd.RandomString(10), rnd.RandomString(10))); } ASSERT_OK(Flush()); } // Wait for another compaction to be triggered 
TEST_SYNC_POINT("DBTest2::CompactionStall:1"); // Hold NotifyOnCompactionBegin in the unlock mutex section TEST_SYNC_POINT("DBTest2::CompactionStall:2"); // Hold NotifyOnCompactionCompleted in the unlock mutex section TEST_SYNC_POINT("DBTest2::CompactionStall:3"); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_LT(NumTableFilesAtLevel(0), options.level0_file_num_compaction_trigger); ASSERT_GT(listener->compacted_files_cnt_.load(), 10 - options.level0_file_num_compaction_trigger); ASSERT_EQ(listener->compacting_files_cnt_.load(), listener->compacted_files_cnt_.load()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, FirstSnapshotTest) { Options options; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); // This snapshot will have sequence number 0 what is expected behaviour. const Snapshot* s1 = db_->GetSnapshot(); ASSERT_OK(Put(1, "k1", std::string(100000, 'x'))); // Fill memtable ASSERT_OK(Put(1, "k2", std::string(100000, 'y'))); // Trigger flush db_->ReleaseSnapshot(s1); } TEST_F(DBTest2, DuplicateSnapshot) { Options options; options = CurrentOptions(options); std::vector snapshots; DBImpl* dbi = static_cast_with_check(db_); SequenceNumber oldest_ww_snap, first_ww_snap; ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(db_->GetSnapshot()); ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); first_ww_snap = snapshots.back()->GetSequenceNumber(); ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(dbi->GetSnapshotForWriteConflictBoundary()); snapshots.push_back(db_->GetSnapshot()); ASSERT_OK(Put("k", "v")); // inc seq snapshots.push_back(db_->GetSnapshot()); { InstrumentedMutexLock l(dbi->mutex()); auto seqs = dbi->snapshots().GetAll(&oldest_ww_snap); ASSERT_EQ(seqs.size(), 4); // duplicates are not counted ASSERT_EQ(oldest_ww_snap, first_ww_snap); } for (auto s : snapshots) { db_->ReleaseSnapshot(s); } } class PinL0IndexAndFilterBlocksTest : public DBTestBase, public testing::WithParamInterface> { public: PinL0IndexAndFilterBlocksTest() : DBTestBase("db_pin_l0_index_bloom_test", /*env_do_fsync=*/true) {} void SetUp() override { infinite_max_files_ = std::get<0>(GetParam()); disallow_preload_ = std::get<1>(GetParam()); } void CreateTwoLevels(Options* options, bool close_afterwards) { if (infinite_max_files_) { options->max_open_files = -1; } options->create_if_missing = true; options->statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options->table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, *options); ASSERT_OK(Put(1, "a", "begin")); ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); // move this table to L1 ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1])); ASSERT_EQ(1, NumTableFilesAtLevel(1, 1)); // reset block cache table_options.block_cache = NewLRUCache(64 * 1024); options->table_factory.reset(NewBlockBasedTableFactory(table_options)); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, *options)); // create new table at L0 ASSERT_OK(Put(1, "a2", "begin2")); ASSERT_OK(Put(1, "z2", "end2")); ASSERT_OK(Flush(1)); if 
(close_afterwards) { Close(); // This ensures that there is no ref to block cache entries } table_options.block_cache->EraseUnRefEntries(); } bool infinite_max_files_; bool disallow_preload_; }; TEST_P(PinL0IndexAndFilterBlocksTest, IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) { Options options = CurrentOptions(); if (infinite_max_files_) { options.max_open_files = -1; } options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.cache_index_and_filter_blocks = true; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); // Create a new table. ASSERT_OK(Flush(1)); // index/filter blocks added to block cache right after table creation. ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); // only index/filter were added ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); std::string value; // Miss and hit count should remain the same, they're all pinned. ASSERT_TRUE(db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); // Miss and hit count should remain the same, they're all pinned. 
value = Get(1, "key"); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } TEST_P(PinL0IndexAndFilterBlocksTest, MultiLevelIndexAndFilterBlocksCachedWithPinning) { Options options = CurrentOptions(); PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, false); // get base cache values uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); std::string value; // this should be read from L0 // so cache values don't change value = Get(1, "a2"); ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); // this should be read from L1 // the file is opened, prefetching results in a cache filter miss // the block is loaded and added to the cache, // then the get results in a cache hit for L1 // When we have inifinite max_files, there is still cache miss because we have // reset the block cache value = Get(1, "a"); ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); } TEST_P(PinL0IndexAndFilterBlocksTest, DisablePrefetchingNonL0IndexAndFilter) { Options options = CurrentOptions(); // This ensures that db does not ref anything in the block cache, so // EraseUnRefEntries could clear them up. bool close_afterwards = true; PinL0IndexAndFilterBlocksTest::CreateTwoLevels(&options, close_afterwards); // Get base cache values uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); uint64_t ih = TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); if (disallow_preload_) { // Now we have two files. We narrow the max open files to allow 3 entries // so that preloading SST files won't happen. options.max_open_files = 13; // RocksDB sanitize max open files to at least 20. Modify it back. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { int* max_open_files = static_cast(arg); *max_open_files = 13; }); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Reopen database. If max_open_files is set as -1, table readers will be // preloaded. This will trigger a BlockBasedTable::Open() and prefetch // L0 index and filter. 
Level 1's prefetching is disabled in DB::Open() ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); if (!disallow_preload_) { // After reopen, cache miss are increased by one because we read (and only // read) filter and index on L0 ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { // If max_open_files is not -1, we do not preload table readers, so there is // no change. ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } std::string value; // this should be read from L0 value = Get(1, "a2"); // If max_open_files is -1, we have pinned index and filter in Rep, so there // will not be changes in index and filter misses or hits. If max_open_files // is not -1, Get() will open a TableReader and prefetch index and filter. ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); // this should be read from L1 value = Get(1, "a"); if (!disallow_preload_) { // In infinite max files case, there's a cache miss in executing Get() // because index and filter are not prefetched before. ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { // In this case, cache miss will be increased by one in // BlockBasedTable::Open() because this is not in DB::Open() code path so we // will prefetch L1's index and filter. Cache hit will also be increased by // one because Get() will read index and filter from the block cache // prefetched in previous Open() call. ASSERT_EQ(fm + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Force a full compaction to one single file. There will be a block // cache read for both of index and filter. If prefetch doesn't explicitly // happen, it will happen when verifying the file. 
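  // Compact(1, "a", "zzzzz") covers every key written above, so the result is
  // a single bottom-level file whose index and filter are read while the new
  // file is verified.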
Compact(1, "a", "zzzzz"); ASSERT_OK(dbfull()->TEST_WaitForCompact()); if (!disallow_preload_) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 2, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } // Bloom and index hit will happen when a Get() happens. value = Get(1, "a"); if (!disallow_preload_) { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } else { ASSERT_EQ(fm + 3, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(fh + 2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); ASSERT_EQ(im + 3, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(ih + 4, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); } } INSTANTIATE_TEST_CASE_P(PinL0IndexAndFilterBlocksTest, PinL0IndexAndFilterBlocksTest, ::testing::Values(std::make_tuple(true, false), std::make_tuple(false, false), std::make_tuple(false, true))); TEST_F(DBTest2, MaxCompactionBytesTest) { Options options = CurrentOptions(); options.memtable_factory.reset(test::NewSpecialSkipListFactory( DBTestBase::kNumKeysByGenerateNewRandomFile)); options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 200 << 10; options.arena_block_size = 4 << 10; options.level0_file_num_compaction_trigger = 4; options.num_levels = 4; options.compression = kNoCompression; options.max_bytes_for_level_base = 450 << 10; options.target_file_size_base = 100 << 10; // Infinite for full compaction. options.max_compaction_bytes = options.target_file_size_base * 100; Reopen(options); Random rnd(301); for (int num = 0; num < 8; num++) { GenerateNewRandomFile(&rnd); } CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_EQ("0,0,8", FilesPerLevel(0)); // When compact from Ln -> Ln+1, cut a file if the file overlaps with // more than three files in Ln+1. options.max_compaction_bytes = options.target_file_size_base * 3; Reopen(options); GenerateNewRandomFile(&rnd); // Add three more small files that overlap with the previous file for (int i = 0; i < 3; i++) { ASSERT_OK(Put("a", "z")); ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Output files to L1 are cut to 4 pieces, according to // options.max_compaction_bytes (300K) // There are 8 files on L2 (grandparents level), each one is 100K. The first // file overlaps with a, b which max_compaction_bytes is less than 300K, the // second one overlaps with d, e, which is also less than 300K. Including any // extra grandparent file will make the future compaction larger than 300K. 
// L1: [ 1 ] [ 2 ] [ 3 ] [ 4 ] // L2: [a] [b] [c] [d] [e] [f] [g] [h] ASSERT_EQ("0,4,8", FilesPerLevel(0)); } static void UniqueIdCallback(void* arg) { int* result = static_cast(arg); if (*result == -1) { *result = 0; } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearTrace(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); } class MockPersistentCache : public PersistentCache { public: explicit MockPersistentCache(const bool is_compressed, const size_t max_size) : is_compressed_(is_compressed), max_size_(max_size) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback); } ~MockPersistentCache() override = default; PersistentCache::StatsType Stats() override { return PersistentCache::StatsType(); } uint64_t NewId() override { return last_id_.fetch_add(1, std::memory_order_relaxed); } Status Insert(const Slice& page_key, const char* data, const size_t size) override { MutexLock _(&lock_); if (size_ > max_size_) { size_ -= data_.begin()->second.size(); data_.erase(data_.begin()); } data_.insert(std::make_pair(page_key.ToString(), std::string(data, size))); size_ += size; return Status::OK(); } Status Lookup(const Slice& page_key, std::unique_ptr* data, size_t* size) override { MutexLock _(&lock_); auto it = data_.find(page_key.ToString()); if (it == data_.end()) { return Status::NotFound(); } assert(page_key.ToString() == it->first); data->reset(new char[it->second.size()]); memcpy(data->get(), it->second.c_str(), it->second.size()); *size = it->second.size(); return Status::OK(); } bool IsCompressed() override { return is_compressed_; } std::string GetPrintableOptions() const override { return "MockPersistentCache"; } port::Mutex lock_; std::map data_; const bool is_compressed_ = true; size_t size_ = 0; const size_t max_size_ = 10 * 1024; // 10KiB std::atomic last_id_{1}; }; #ifdef OS_LINUX // Make sure that in CPU time perf context counters, Env::NowCPUNanos() // is used, rather than Env::CPUNanos(); TEST_F(DBTest2, TestPerfContextGetCpuTime) { // force resizing table cache so table handle is not preloaded so that // we can measure find_table_nanos during Get(). dbfull()->TEST_table_cache()->SetCapacity(0); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); env_->now_cpu_count_.store(0); env_->SetMockSleep(); // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); ASSERT_EQ("bar", Get("foo")); ASSERT_EQ(0, get_perf_context()->get_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. 
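  // The mocked sleep fires inside TableCache::FindTable(), so the extra time
  // shows up in find_table_nanos (wall clock) but must not be charged to
  // get_cpu_nanos.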
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); ASSERT_EQ("bar", Get("foo")); ASSERT_GT(env_->now_cpu_count_.load(), 2); ASSERT_LT(get_perf_context()->get_cpu_nanos, kDummyAddonNanos); ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestPerfContextIterCpuTime) { DestroyAndReopen(CurrentOptions()); // force resizing table cache so table handle is not preloaded so that // we can measure find_table_nanos during iteration dbfull()->TEST_table_cache()->SetCapacity(0); const size_t kNumEntries = 10; for (size_t i = 0; i < kNumEntries; ++i) { ASSERT_OK(Put("k" + std::to_string(i), "v" + std::to_string(i))); } ASSERT_OK(Flush()); for (size_t i = 0; i < kNumEntries; ++i) { ASSERT_EQ("v" + std::to_string(i), Get("k" + std::to_string(i))); } std::string last_key = "k" + std::to_string(kNumEntries - 1); std::string last_value = "v" + std::to_string(kNumEntries - 1); env_->now_cpu_count_.store(0); env_->SetMockSleep(); // NOTE: Presumed unnecessary and removed: resetting mock time in env // CPU timing is not enabled with kEnableTimeExceptForMutex SetPerfLevel(PerfLevel::kEnableTimeExceptForMutex); Iterator* iter = db_->NewIterator(ReadOptions()); iter->Seek("k0"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); iter->SeekForPrev(last_key); ASSERT_TRUE(iter->Valid()); iter->SeekToLast(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(last_value, iter->value().ToString()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_EQ(0, get_perf_context()->iter_seek_cpu_nanos); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v1", iter->value().ToString()); ASSERT_EQ(0, get_perf_context()->iter_next_cpu_nanos); iter->Prev(); ASSERT_TRUE(iter->Valid()); ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_EQ(0, get_perf_context()->iter_prev_cpu_nanos); ASSERT_EQ(0, env_->now_cpu_count_.load()); delete iter; constexpr uint64_t kDummyAddonSeconds = uint64_t{1000000}; constexpr uint64_t kDummyAddonNanos = 1000000000U * kDummyAddonSeconds; // Add time to NowNanos() reading. 
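  // Same approach as in TestPerfContextGetCpuTime: inflating wall-clock time
  // inside FindTable() gives kDummyAddonNanos as an upper bound for the
  // iterator CPU-time counters checked below.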
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TableCache::FindTable:0", [&](void* /*arg*/) { env_->MockSleepForSeconds(kDummyAddonSeconds); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); SetPerfLevel(PerfLevel::kEnableTimeAndCPUTimeExceptForMutex); iter = db_->NewIterator(ReadOptions()); iter->Seek("k0"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); iter->SeekForPrev(last_key); ASSERT_TRUE(iter->Valid()); iter->SeekToLast(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ(last_value, iter->value().ToString()); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_seek_cpu_nanos, 0); ASSERT_LT(get_perf_context()->iter_seek_cpu_nanos, kDummyAddonNanos); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("v1", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_next_cpu_nanos, 0); ASSERT_LT(get_perf_context()->iter_next_cpu_nanos, kDummyAddonNanos); iter->Prev(); ASSERT_TRUE(iter->Valid()); ASSERT_OK(iter->status()); ASSERT_EQ("v0", iter->value().ToString()); ASSERT_GT(get_perf_context()->iter_prev_cpu_nanos, 0); ASSERT_LT(get_perf_context()->iter_prev_cpu_nanos, kDummyAddonNanos); ASSERT_GE(env_->now_cpu_count_.load(), 12); ASSERT_GT(get_perf_context()->find_table_nanos, kDummyAddonNanos); SetPerfLevel(PerfLevel::kDisable); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); delete iter; } #endif // OS_LINUX #if !defined OS_SOLARIS TEST_F(DBTest2, PersistentCache) { int num_iter = 80; Options options; options.write_buffer_size = 64 * 1024; // small write buffer options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options = CurrentOptions(options); auto bsizes = {/*no block cache*/ 0, /*1M*/ 1 * 1024 * 1024}; auto types = {/*compressed*/ 1, /*uncompressed*/ 0}; for (auto bsize : bsizes) { for (auto type : types) { BlockBasedTableOptions table_options; table_options.persistent_cache.reset( new MockPersistentCache(type, 10 * 1024)); table_options.no_block_cache = true; table_options.block_cache = bsize ? 
NewLRUCache(bsize) : nullptr; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); // default column family doesn't have block cache Options no_block_cache_opts; no_block_cache_opts.statistics = options.statistics; no_block_cache_opts = CurrentOptions(no_block_cache_opts); BlockBasedTableOptions table_options_no_bc; table_options_no_bc.no_block_cache = true; no_block_cache_opts.table_factory.reset( NewBlockBasedTableFactory(table_options_no_bc)); ReopenWithColumnFamilies( {"default", "pikachu"}, std::vector({no_block_cache_opts, options})); Random rnd(301); // Write 8MB (80 values, each 100K) ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); std::vector values; std::string str; for (int i = 0; i < num_iter; i++) { if (i % 4 == 0) { // high compression ratio str = rnd.RandomString(1000); } values.push_back(str); ASSERT_OK(Put(1, Key(i), values[i])); } // flush all data from memtable so that reads are from block cache ASSERT_OK(Flush(1)); for (int i = 0; i < num_iter; i++) { ASSERT_EQ(Get(1, Key(i)), values[i]); } auto hit = options.statistics->getTickerCount(PERSISTENT_CACHE_HIT); auto miss = options.statistics->getTickerCount(PERSISTENT_CACHE_MISS); ASSERT_GT(hit, 0); ASSERT_GT(miss, 0); } } } #endif // !defined OS_SOLARIS namespace { void CountSyncPoint() { TEST_SYNC_POINT_CALLBACK("DBTest2::MarkedPoint", nullptr /* arg */); } } // anonymous namespace TEST_F(DBTest2, SyncPointMarker) { std::atomic sync_point_called(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBTest2::MarkedPoint", [&](void* /*arg*/) { sync_point_called.fetch_add(1); }); // The first dependency enforces Marker can be loaded before MarkedPoint. // The second checks that thread 1's MarkedPoint should be disabled here. 
// Execution order: // | Thread 1 | Thread 2 | // | | Marker | // | MarkedPoint | | // | Thread1First | | // | | MarkedPoint | ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependencyAndMarkers( {{"DBTest2::SyncPointMarker:Thread1First", "DBTest2::MarkedPoint"}}, {{"DBTest2::SyncPointMarker:Marker", "DBTest2::MarkedPoint"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); std::function func1 = [&]() { CountSyncPoint(); TEST_SYNC_POINT("DBTest2::SyncPointMarker:Thread1First"); }; std::function func2 = [&]() { TEST_SYNC_POINT("DBTest2::SyncPointMarker:Marker"); CountSyncPoint(); }; auto thread1 = port::Thread(func1); auto thread2 = port::Thread(func2); thread1.join(); thread2.join(); // Callback is only executed once ASSERT_EQ(sync_point_called.load(), 1); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } size_t GetEncodedEntrySize(size_t key_size, size_t value_size) { std::string buffer; PutVarint32(&buffer, static_cast(0)); PutVarint32(&buffer, static_cast(key_size)); PutVarint32(&buffer, static_cast(value_size)); return buffer.size() + key_size + value_size; } TEST_F(DBTest2, ReadAmpBitmap) { Options options = CurrentOptions(); BlockBasedTableOptions bbto; uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { // Disable delta encoding to make it easier to calculate read amplification bbto.use_delta_encoding = false; // Huge block cache to make it easier to calculate read amplification bbto.block_cache = NewLRUCache(1024 * 1024 * 1024); bbto.read_amp_bytes_per_bit = bytes_per_bit[k]; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); DestroyAndReopen(options); const size_t kNumEntries = 10000; Random rnd(301); for (size_t i = 0; i < kNumEntries; i++) { ASSERT_OK(Put(Key(static_cast(i)), rnd.RandomString(100))); } ASSERT_OK(Flush()); Close(); Reopen(options); // Read keys/values randomly and verify that reported read amp error // is less than 2% uint64_t total_useful_bytes = 0; std::set read_keys; std::string value; for (size_t i = 0; i < kNumEntries * 5; i++) { int key_idx = rnd.Next() % kNumEntries; std::string key = Key(key_idx); ASSERT_OK(db_->Get(ReadOptions(), key, &value)); if (read_keys.find(key_idx) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); total_useful_bytes += GetEncodedEntrySize(internal_key.size(), value.size()); read_keys.insert(key_idx); } double expected_read_amp = static_cast(total_useful_bytes) / options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES); double read_amp = static_cast(options.statistics->getTickerCount( READ_AMP_ESTIMATE_USEFUL_BYTES)) / options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES); double error_pct = fabs(expected_read_amp - read_amp) * 100; // Error between reported read amp and real read amp should be less than // 2% EXPECT_LE(error_pct, 2); } // Make sure we read every thing in the DB (which is smaller than our cache) Iterator* iter = db_->NewIterator(ReadOptions()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), Get(iter->key().ToString())); } ASSERT_OK(iter->status()); delete iter; // Read amp is on average 100% since we read all what we loaded in memory if (k == 0) { ASSERT_EQ( options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES), options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES)); } else { ASSERT_NEAR( options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES) * 1.0f / 
options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES), 1, .01); } } } #ifndef OS_SOLARIS // GetUniqueIdFromFile is not implemented TEST_F(DBTest2, ReadAmpBitmapLiveInCacheAfterDBClose) { { const int kIdBufLen = 100; char id_buf[kIdBufLen]; Status s = Status::NotSupported(); #ifndef OS_WIN // You can't open a directory on windows using random access file std::unique_ptr file; s = env_->NewRandomAccessFile(dbname_, &file, EnvOptions()); if (s.ok()) { if (file->GetUniqueId(id_buf, kIdBufLen) == 0) { // fs holding db directory doesn't support getting a unique file id, // this means that running this test will fail because lru_cache will // load the blocks again regardless of them being already in the cache return; } } #endif if (!s.ok()) { std::unique_ptr dir; ASSERT_OK(env_->NewDirectory(dbname_, &dir)); if (dir->GetUniqueId(id_buf, kIdBufLen) == 0) { // fs holding db directory doesn't support getting a unique file id, // this means that running this test will fail because lru_cache will // load the blocks again regardless of them being already in the cache return; } } } uint32_t bytes_per_bit[2] = {1, 16}; for (size_t k = 0; k < 2; k++) { std::shared_ptr lru_cache = NewLRUCache(1024 * 1024 * 1024); std::shared_ptr stats = ROCKSDB_NAMESPACE::CreateDBStatistics(); Options options = CurrentOptions(); BlockBasedTableOptions bbto; // Disable delta encoding to make it easier to calculate read amplification bbto.use_delta_encoding = false; // Huge block cache to make it easier to calculate read amplification bbto.block_cache = lru_cache; bbto.read_amp_bytes_per_bit = bytes_per_bit[k]; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); options.statistics = stats; DestroyAndReopen(options); const int kNumEntries = 10000; Random rnd(301); for (int i = 0; i < kNumEntries; i++) { ASSERT_OK(Put(Key(i), rnd.RandomString(100))); } ASSERT_OK(Flush()); Close(); Reopen(options); std::set read_keys; std::string value; // Iter1: Read half the DB, Read even keys // Key(0), Key(2), Key(4), Key(6), Key(8), ... for (int i = 0; i < kNumEntries; i += 2) { std::string key = Key(i); ASSERT_OK(db_->Get(ReadOptions(), key, &value)); if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); read_keys.insert(i); } } size_t total_useful_bytes_iter1 = options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES); size_t total_loaded_bytes_iter1 = options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES); Close(); std::shared_ptr new_statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); // Destroy old statistics obj that the blocks in lru_cache are pointing to options.statistics.reset(); // Use the statistics object that we just created options.statistics = new_statistics; Reopen(options); // Iter2: Read half the DB, Read odd keys // Key(1), Key(3), Key(5), Key(7), Key(9), ... 
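    // The same lru_cache object survives the reopen, so the read-amp bitmaps
    // of the blocks loaded during Iter1 keep accumulating useful bytes for
    // these reads.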
for (int i = 1; i < kNumEntries; i += 2) { std::string key = Key(i); ASSERT_OK(db_->Get(ReadOptions(), key, &value)); if (read_keys.find(i) == read_keys.end()) { auto internal_key = InternalKey(key, 0, ValueType::kTypeValue); read_keys.insert(i); } } size_t total_useful_bytes_iter2 = options.statistics->getTickerCount(READ_AMP_ESTIMATE_USEFUL_BYTES); size_t total_loaded_bytes_iter2 = options.statistics->getTickerCount(READ_AMP_TOTAL_READ_BYTES); // Read amp is on average 100% since we read all what we loaded in memory if (k == 0) { ASSERT_EQ(total_useful_bytes_iter1 + total_useful_bytes_iter2, total_loaded_bytes_iter1 + total_loaded_bytes_iter2); } else { ASSERT_NEAR((total_useful_bytes_iter1 + total_useful_bytes_iter2) * 1.0f / (total_loaded_bytes_iter1 + total_loaded_bytes_iter2), 1, .01); } } } #endif // !OS_SOLARIS TEST_F(DBTest2, AutomaticCompactionOverlapManualCompaction) { Options options = CurrentOptions(); options.num_levels = 3; options.IncreaseParallelism(20); DestroyAndReopen(options); ASSERT_OK(Put(Key(0), "a")); ASSERT_OK(Put(Key(5), "a")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(10), "a")); ASSERT_OK(Put(Key(15), "a")); ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); auto get_stat = [](std::string level_str, LevelStatType type, std::map props) { auto prop_str = "compaction." + level_str + "." + InternalStats::compaction_level_stats.at(type).property_name.c_str(); auto prop_item = props.find(prop_str); return prop_item == props.end() ? 0 : std::stod(prop_item->second); }; // Trivial move 2 files to L2 ASSERT_EQ("0,0,2", FilesPerLevel()); // Also test that the stats GetMapProperty API reporting the same result { std::map prop; ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(0, get_stat("L0", LevelStatType::NUM_FILES, prop)); ASSERT_EQ(0, get_stat("L1", LevelStatType::NUM_FILES, prop)); ASSERT_EQ(2, get_stat("L2", LevelStatType::NUM_FILES, prop)); ASSERT_EQ(2, get_stat("Sum", LevelStatType::NUM_FILES, prop)); } // While the compaction is running, we will create 2 new files that // can fit in L2, these 2 files will be moved to L2 and overlap with // the running compaction and break the LSM consistency. 
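  // The callback below creates that situation on purpose: while the manual
  // compaction is starting, it lowers the compaction trigger and level size so
  // the two freshly flushed files become eligible for automatic compaction
  // into L2.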
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():Start", [&](void* /*arg*/) { ASSERT_OK( dbfull()->SetOptions({{"level0_file_num_compaction_trigger", "2"}, {"max_bytes_for_level_base", "1"}})); ASSERT_OK(Put(Key(6), "a")); ASSERT_OK(Put(Key(7), "a")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(8), "a")); ASSERT_OK(Put(Key(9), "a")); ASSERT_OK(Flush()); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Run a manual compaction that will compact the 2 files in L2 // into 1 file in L2 cro.exclusive_manual_compaction = false; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); // Test that the stats GetMapProperty API reporting 1 file in L2 { std::map prop; ASSERT_TRUE(dbfull()->GetMapProperty("rocksdb.cfstats", &prop)); ASSERT_EQ(1, get_stat("L2", LevelStatType::NUM_FILES, prop)); } } TEST_F(DBTest2, ManualCompactionOverlapManualCompaction) { Options options = CurrentOptions(); options.num_levels = 2; options.IncreaseParallelism(20); options.disable_auto_compactions = true; DestroyAndReopen(options); ASSERT_OK(Put(Key(0), "a")); ASSERT_OK(Put(Key(5), "a")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(10), "a")); ASSERT_OK(Put(Key(15), "a")); ASSERT_OK(Flush()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Trivial move 2 files to L1 ASSERT_EQ("0,2", FilesPerLevel()); std::function bg_manual_compact = [&]() { std::string k1 = Key(6); std::string k2 = Key(9); Slice k1s(k1); Slice k2s(k2); CompactRangeOptions cro; cro.exclusive_manual_compaction = false; ASSERT_OK(db_->CompactRange(cro, &k1s, &k2s)); }; ROCKSDB_NAMESPACE::port::Thread bg_thread; // While the compaction is running, we will create 2 new files that // can fit in L1, these 2 files will be moved to L1 and overlap with // the running compaction and break the LSM consistency. std::atomic flag(false); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():Start", [&](void* /*arg*/) { if (flag.exchange(true)) { // We want to make sure to call this callback only once return; } ASSERT_OK(Put(Key(6), "a")); ASSERT_OK(Put(Key(7), "a")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(8), "a")); ASSERT_OK(Put(Key(9), "a")); ASSERT_OK(Flush()); // Start a non-exclusive manual compaction in a bg thread bg_thread = port::Thread(bg_manual_compact); // This manual compaction conflict with the other manual compaction // so it should wait until the first compaction finish env_->SleepForMicroseconds(1000000); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Run a manual compaction that will compact the 2 files in L1 // into 1 file in L1 CompactRangeOptions cro; cro.exclusive_manual_compaction = false; cro.bottommost_level_compaction = BottommostLevelCompaction::kForceOptimized; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); bg_thread.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, PausingManualCompaction1) { Options options = CurrentOptions(); options.disable_auto_compactions = true; options.num_levels = 7; DestroyAndReopen(options); Random rnd(301); // Generate a file containing 10 keys. 
  for (int i = 0; i < 10; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
  }
  ASSERT_OK(Flush());

  // Generate another file containing same keys
  for (int i = 0; i < 10; i++) {
    ASSERT_OK(Put(Key(i), rnd.RandomString(50)));
  }
  ASSERT_OK(Flush());

  int manual_compactions_paused = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionJob::Run():PausingManualCompaction:1", [&](void* arg) {
        auto canceled = static_cast<std::atomic<bool>*>(arg);
        // CompactRange() triggers the manual compaction; cancel it by setting
        // *canceled to true.
        if (canceled != nullptr) {
          canceled->store(true, std::memory_order_release);
        }
        manual_compactions_paused += 1;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) {
        auto paused = static_cast<std::atomic<int>*>(arg);
        // CompactFiles() relies on manual_compactions_paused to
        // determine if this compaction should be paused or not
        ASSERT_EQ(0, paused->load(std::memory_order_acquire));
        paused->fetch_add(1, std::memory_order_release);
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  std::vector<std::string> files_before_compact, files_after_compact;
  // Remember file names before compaction is triggered
  std::vector<LiveFileMetaData> files_meta;
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (const auto& file : files_meta) {
    files_before_compact.push_back(file.name);
  }

  // OK, now trigger a manual compaction
  ASSERT_TRUE(dbfull()
                  ->CompactRange(CompactRangeOptions(), nullptr, nullptr)
                  .IsManualCompactionPaused());

  // Wait for compactions to get scheduled and stopped
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // Get file names after compaction is stopped
  files_meta.clear();
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (const auto& file : files_meta) {
    files_after_compact.push_back(file.name);
  }

  // Like nothing happened
  ASSERT_EQ(files_before_compact, files_after_compact);
  ASSERT_EQ(manual_compactions_paused, 1);
  manual_compactions_paused = 0;

  // Now make sure CompactFiles() is also not run
  ASSERT_TRUE(dbfull()
                  ->CompactFiles(ROCKSDB_NAMESPACE::CompactionOptions(),
                                 files_before_compact, 0)
                  .IsManualCompactionPaused());
  // Wait for manual compaction to get scheduled and finish
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  files_meta.clear();
  files_after_compact.clear();
  dbfull()->GetLiveFilesMetaData(&files_meta);
  for (const auto& file : files_meta) {
    files_after_compact.push_back(file.name);
  }
  ASSERT_EQ(files_before_compact, files_after_compact);
  // CompactFiles() returns at its entry point, so the pause counter is untouched
  ASSERT_EQ(manual_compactions_paused, 0);
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

// PausingManualCompaction does not affect auto compaction
TEST_F(DBTest2, PausingManualCompaction2) {
  Options options = CurrentOptions();
  options.level0_file_num_compaction_trigger = 2;
  options.disable_auto_compactions = false;
  DestroyAndReopen(options);
  dbfull()->DisableManualCompaction();
  Random rnd(301);
  for (int i = 0; i < 2; i++) {
    // Generate a file containing 100 keys.
for (int j = 0; j < 100; j++) { ASSERT_OK(Put(Key(j), rnd.RandomString(50))); } ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); std::vector files_meta; dbfull()->GetLiveFilesMetaData(&files_meta); ASSERT_EQ(files_meta.size(), 1); } TEST_F(DBTest2, PausingManualCompaction3) { CompactRangeOptions compact_options; Options options = CurrentOptions(); options.disable_auto_compactions = true; options.num_levels = 7; Random rnd(301); auto generate_files = [&]() { for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { MoveFilesToLevel(l); } } }; DestroyAndReopen(options); generate_files(); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:1", [&](void* /*arg*/) { run_manual_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); dbfull()->DisableManualCompaction(); ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // As manual compaction disabled, not even reach sync point ASSERT_EQ(run_manual_compactions, 0); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:1"); dbfull()->EnableManualCompaction(); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, PausingManualCompaction4) { CompactRangeOptions compact_options; Options options = CurrentOptions(); options.disable_auto_compactions = true; options.num_levels = 7; Random rnd(301); auto generate_files = [&]() { for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { MoveFilesToLevel(l); } } }; DestroyAndReopen(options); generate_files(); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:2", [&](void* arg) { auto canceled = static_cast*>(arg); // CompactRange triggers manual compaction and cancel the compaction // by set *canceled as true if (canceled != nullptr) { canceled->store(true, std::memory_order_release); } run_manual_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "TestCompactFiles:PausingManualCompaction:3", [&](void* arg) { auto paused = static_cast*>(arg); // CompactFiles() relies on manual_compactions_paused to // determine if thie compaction should be paused or not ASSERT_EQ(0, paused->load(std::memory_order_acquire)); paused->fetch_add(1, std::memory_order_release); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(run_manual_compactions, 1); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); 
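  // Remove the cancelling callback; the same manual compaction should now run
  // to completion and collapse everything into the bottommost level.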
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:2"); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, CancelManualCompaction1) { CompactRangeOptions compact_options; auto canceledPtr = std::unique_ptr>(new std::atomic{true}); compact_options.canceled = canceledPtr.get(); Options options = CurrentOptions(); options.disable_auto_compactions = true; options.num_levels = 7; Random rnd(301); auto generate_files = [&]() { for (int i = 0; i < options.num_levels; i++) { for (int j = 0; j < options.num_levels - i + 1; j++) { for (int k = 0; k < 1000; k++) { ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50))); } ASSERT_OK(Flush()); } for (int l = 1; l < options.num_levels - i; l++) { MoveFilesToLevel(l); } } }; DestroyAndReopen(options); generate_files(); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); int run_manual_compactions = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():PausingManualCompaction:1", [&](void* /*arg*/) { run_manual_compactions++; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); // Setup a callback to disable compactions after a couple of levels are // compacted int compactions_run = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { ++compactions_run; }); ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); // Since compactions are disabled, we shouldn't start compacting. // E.g. we should call the compaction function exactly one time. ASSERT_EQ(compactions_run, 0); ASSERT_EQ(run_manual_compactions, 0); ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel()); compactions_run = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "DBImpl::RunManualCompaction()::1"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) { ++compactions_run; // After 3 compactions disable if (compactions_run == 3) { compact_options.canceled->store(true, std::memory_order_release); } }); compact_options.canceled->store(false, std::memory_order_release); ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(compactions_run, 3); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "DBImpl::RunManualCompaction()::1"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionJob::Run():PausingManualCompaction:1"); // Compactions should work again if we re-enable them.. 
  compact_options.canceled->store(false, std::memory_order_relaxed);
  ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr));
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBTest2, CancelManualCompaction2) {
  CompactRangeOptions compact_options;
  auto canceledPtr =
      std::unique_ptr<std::atomic<bool>>(new std::atomic<bool>{true});
  compact_options.canceled = canceledPtr.get();
  compact_options.max_subcompactions = 1;

  Options options = CurrentOptions();
  options.disable_auto_compactions = true;
  options.num_levels = 7;
  Random rnd(301);

  auto generate_files = [&]() {
    for (int i = 0; i < options.num_levels; i++) {
      for (int j = 0; j < options.num_levels - i + 1; j++) {
        for (int k = 0; k < 1000; k++) {
          ASSERT_OK(Put(Key(k + j * 1000), rnd.RandomString(50)));
        }
        ASSERT_OK(Flush());
      }
      for (int l = 1; l < options.num_levels - i; l++) {
        MoveFilesToLevel(l);
      }
    }
  };

  DestroyAndReopen(options);
  generate_files();
  ASSERT_EQ("2,3,4,5,6,7,8", FilesPerLevel());

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();

  int compactions_run = 0;
  std::atomic<int> kv_compactions{0};
  int compactions_stopped_at = 0;
  int kv_compactions_stopped_at = 0;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RunManualCompaction()::1", [&](void* /*arg*/) {
        // Count how many manual compaction rounds get started.
        ++compactions_run;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "CompactionIterator:ProcessKV", [&](void* /*arg*/) {
        int kv_compactions_run =
            kv_compactions.fetch_add(1, std::memory_order_release);
        if (kv_compactions_run == 5) {
          compact_options.canceled->store(true, std::memory_order_release);
          kv_compactions_stopped_at = kv_compactions_run;
          compactions_stopped_at = compactions_run;
        }
      });

  compact_options.canceled->store(false, std::memory_order_release);
  ASSERT_TRUE(dbfull()
                  ->CompactRange(compact_options, nullptr, nullptr)
                  .IsManualCompactionPaused());
  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // NOTE: as we set compact_options.max_subcompactions = 1, and store true to
  // the canceled variable from the single compacting thread (via callback),
  // this value is deterministically kv_compactions_stopped_at + 1.
  ASSERT_EQ(kv_compactions, kv_compactions_stopped_at + 1);
  ASSERT_EQ(compactions_run, compactions_stopped_at);

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionIterator:ProcessKV");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "DBImpl::RunManualCompaction()::1");
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack(
      "CompactionJob::Run():PausingManualCompaction:1");

  // Compactions should work again if we re-enable them.
compact_options.canceled->store(false, std::memory_order_relaxed); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } class CancelCompactionListener : public EventListener { public: CancelCompactionListener() : num_compaction_started_(0), num_compaction_ended_(0) {} void OnCompactionBegin(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); num_compaction_started_++; } void OnCompactionCompleted(DB* /*db*/, const CompactionJobInfo& ci) override { ASSERT_EQ(ci.cf_name, "default"); ASSERT_EQ(ci.base_input_level, 0); ASSERT_EQ(ci.status.code(), code_); ASSERT_EQ(ci.status.subcode(), subcode_); num_compaction_ended_++; } std::atomic num_compaction_started_; std::atomic num_compaction_ended_; Status::Code code_; Status::SubCode subcode_; }; TEST_F(DBTest2, CancelManualCompactionWithListener) { CompactRangeOptions compact_options; auto canceledPtr = std::unique_ptr>(new std::atomic{true}); compact_options.canceled = canceledPtr.get(); compact_options.max_subcompactions = 1; Options options = CurrentOptions(); options.disable_auto_compactions = true; CancelCompactionListener* listener = new CancelCompactionListener(); options.listeners.emplace_back(listener); DestroyAndReopen(options); Random rnd(301); for (int i = 0; i < 10; i++) { for (int j = 0; j < 10; j++) { ASSERT_OK(Put(Key(i + j * 10), rnd.RandomString(50))); } ASSERT_OK(Flush()); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionIterator:ProcessKV", [&](void* /*arg*/) { compact_options.canceled->store(true, std::memory_order_release); }); int running_compaction = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::FinishCompactionOutputFile1", [&](void* /*arg*/) { running_compaction++; }); // Case I: 1 Notify begin compaction, 2 Set *canceled as true to disable // manual compaction in the callback function, 3 Compaction not run, // 4 Notify compaction end. listener->code_ = Status::kIncomplete; listener->subcode_ = Status::SubCode::kManualCompactionPaused; compact_options.canceled->store(false, std::memory_order_release); ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); ASSERT_EQ(running_compaction, 0); listener->num_compaction_started_ = 0; listener->num_compaction_ended_ = 0; // Case II: 1 Set *canceled as true in the callback function to disable manual // compaction, 2 Notify begin compaction (return without notifying), 3 Notify // compaction end (return without notifying). ASSERT_TRUE(dbfull() ->CompactRange(compact_options, nullptr, nullptr) .IsManualCompactionPaused()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); ASSERT_EQ(running_compaction, 0); // Case III: 1 Notify begin compaction, 2 Compaction in between // 3. Set *canceled as true in the callback function to disable manual // compaction, 4 Notify compaction end. 
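  // Cancelling only at CompactionJob::Run:BeforeVerify is too late to stop the
  // compaction, so this run is expected to complete with an OK status.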
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearCallBack( "CompactionIterator:ProcessKV"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) { compact_options.canceled->store(true, std::memory_order_release); }); listener->code_ = Status::kOk; listener->subcode_ = Status::SubCode::kNone; compact_options.canceled->store(false, std::memory_order_release); ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); // Compaction job will succeed. ASSERT_GT(running_compaction, 0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, CompactionOnBottomPriorityWithListener) { int num_levels = 3; const int kNumFilesTrigger = 4; Options options = CurrentOptions(); env_->SetBackgroundThreads(0, Env::Priority::HIGH); env_->SetBackgroundThreads(0, Env::Priority::LOW); env_->SetBackgroundThreads(1, Env::Priority::BOTTOM); options.env = env_; options.compaction_style = kCompactionStyleUniversal; options.num_levels = num_levels; options.write_buffer_size = 100 << 10; // 100KB options.target_file_size_base = 32 << 10; // 32KB options.level0_file_num_compaction_trigger = kNumFilesTrigger; // Trigger compaction if size amplification exceeds 110% options.compaction_options_universal.max_size_amplification_percent = 110; CancelCompactionListener* listener = new CancelCompactionListener(); options.listeners.emplace_back(listener); DestroyAndReopen(options); int num_bottom_thread_compaction_scheduled = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::BackgroundCompaction:ForwardToBottomPriPool", [&](void* /*arg*/) { num_bottom_thread_compaction_scheduled++; }); int num_compaction_jobs = 0; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run():End", [&](void* /*arg*/) { num_compaction_jobs++; }); listener->code_ = Status::kOk; listener->subcode_ = Status::SubCode::kNone; Random rnd(301); for (int i = 0; i < 1; ++i) { for (int num = 0; num < kNumFilesTrigger; num++) { int key_idx = 0; GenerateNewFile(&rnd, &key_idx, true /* no_wait */); // use no_wait above because that one waits for flush and compaction. We // don't want to wait for compaction because the full compaction is // intentionally blocked while more files are flushed. 
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); } } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_GT(num_bottom_thread_compaction_scheduled, 0); ASSERT_EQ(num_compaction_jobs, 1); ASSERT_GT(listener->num_compaction_started_, 0); ASSERT_EQ(listener->num_compaction_started_, listener->num_compaction_ended_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, OptimizeForPointLookup) { Options options = CurrentOptions(); Close(); options.OptimizeForPointLookup(2); ASSERT_OK(DB::Open(options, dbname_, &db_)); ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); ASSERT_OK(Flush()); ASSERT_EQ("v1", Get("foo")); } TEST_F(DBTest2, OptimizeForSmallDB) { Options options = CurrentOptions(); Close(); options.OptimizeForSmallDb(); // Find the cache object ASSERT_TRUE(options.table_factory->IsInstanceOf( TableFactory::kBlockBasedTableName())); auto table_options = options.table_factory->GetOptions(); ASSERT_TRUE(table_options != nullptr); std::shared_ptr cache = table_options->block_cache; ASSERT_EQ(0, cache->GetUsage()); ASSERT_OK(DB::Open(options, dbname_, &db_)); ASSERT_OK(Put("foo", "v1")); // memtable size is costed to the block cache ASSERT_NE(0, cache->GetUsage()); ASSERT_EQ("v1", Get("foo")); ASSERT_OK(Flush()); size_t prev_size = cache->GetUsage(); // Remember block cache size, so that we can find that // it is filled after Get(). // Use pinnable slice so that it can ping the block so that // when we check the size it is not evicted. PinnableSlice value; ASSERT_OK(db_->Get(ReadOptions(), db_->DefaultColumnFamily(), "foo", &value)); ASSERT_GT(cache->GetUsage(), prev_size); value.Reset(); } TEST_F(DBTest2, IterRaceFlush1) { ASSERT_OK(Put("foo", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::NewIterator:1", "DBTest2::IterRaceFlush:1"}, {"DBTest2::IterRaceFlush:2", "DBImpl::NewIterator:2"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::IterRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::IterRaceFlush:2"); }); // iterator is created after the first Put(), and its snapshot sequence is // assigned after second Put(), so it must see v2. { std::unique_ptr it(db_->NewIterator(ReadOptions())); it->Seek("foo"); ASSERT_TRUE(it->Valid()); ASSERT_OK(it->status()); ASSERT_EQ("foo", it->key().ToString()); ASSERT_EQ("v2", it->value().ToString()); } t1.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, IterRaceFlush2) { ASSERT_OK(Put("foo", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::NewIterator:3", "DBTest2::IterRaceFlush2:1"}, {"DBTest2::IterRaceFlush2:2", "DBImpl::NewIterator:4"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::IterRaceFlush2:1"); ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::IterRaceFlush2:2"); }); // iterator is created after the first Put(), and its snapshot sequence is // assigned before second Put(), thus it must see v1. 
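  // (Aside: the rule this race test depends on is simply that an iterator
  // reads as of the sequence number captured when it is created. A minimal
  // sketch without any race, assuming an already-open DB* db:
  //
  //   ASSERT_OK(db->Put(WriteOptions(), "k", "v1"));
  //   std::unique_ptr<Iterator> it(db->NewIterator(ReadOptions()));
  //   ASSERT_OK(db->Put(WriteOptions(), "k", "v2"));  // after iterator creation
  //   it->Seek("k");
  //   // it still observes "v1"; the later write is invisible to this iterator.
  // )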
{ std::unique_ptr it(db_->NewIterator(ReadOptions())); it->Seek("foo"); ASSERT_TRUE(it->Valid()); ASSERT_OK(it->status()); ASSERT_EQ("foo", it->key().ToString()); ASSERT_EQ("v1", it->value().ToString()); } t1.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, IterRefreshRaceFlush) { ASSERT_OK(Put("foo", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"ArenaWrappedDBIter::Refresh:1", "DBTest2::IterRefreshRaceFlush:1"}, {"DBTest2::IterRefreshRaceFlush:2", "ArenaWrappedDBIter::Refresh:2"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::IterRefreshRaceFlush:2"); }); // iterator is refreshed after the first Put(), and its sequence number is // assigned after second Put(), thus it must see v2. { std::unique_ptr it(db_->NewIterator(ReadOptions())); ASSERT_OK(it->status()); ASSERT_OK(it->Refresh()); it->Seek("foo"); ASSERT_TRUE(it->Valid()); ASSERT_OK(it->status()); ASSERT_EQ("foo", it->key().ToString()); ASSERT_EQ("v2", it->value().ToString()); } t1.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, GetRaceFlush1) { ASSERT_OK(Put("foo", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::GetImpl:1", "DBTest2::GetRaceFlush:1"}, {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:2"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ROCKSDB_NAMESPACE::port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); // Get() is issued after the first Put(), so it should see either // "v1" or "v2". ASSERT_NE("NOT_FOUND", Get("foo")); t1.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, GetRaceFlush2) { ASSERT_OK(Put("foo", "v1")); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( {{"DBImpl::GetImpl:3", "DBTest2::GetRaceFlush:1"}, {"DBTest2::GetRaceFlush:2", "DBImpl::GetImpl:4"}}); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); port::Thread t1([&] { TEST_SYNC_POINT("DBTest2::GetRaceFlush:1"); ASSERT_OK(Put("foo", "v2")); ASSERT_OK(Flush()); TEST_SYNC_POINT("DBTest2::GetRaceFlush:2"); }); // Get() is issued after the first Put(), so it should see either // "v1" or "v2". 
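  // (Aside: as in the other race tests in this file, the interleaving is
  // forced with the SyncPoint test facility rather than sleeps. A minimal
  // sketch of the pattern, using hypothetical marker names:
  //
  //   SyncPoint::GetInstance()->LoadDependency(
  //       {{"ThreadA:AfterStep1", "ThreadB:BeforeStep2"}});
  //   SyncPoint::GetInstance()->EnableProcessing();
  //   // ThreadB blocks at TEST_SYNC_POINT("ThreadB:BeforeStep2") until
  //   // ThreadA has passed TEST_SYNC_POINT("ThreadA:AfterStep1").
  //   SyncPoint::GetInstance()->DisableProcessing();
  // )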
ASSERT_NE("NOT_FOUND", Get("foo")); t1.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, DirectIO) { if (!IsDirectIOSupported()) { return; } Options options = CurrentOptions(); options.use_direct_reads = options.use_direct_io_for_flush_and_compaction = true; options.allow_mmap_reads = options.allow_mmap_writes = false; DestroyAndReopen(options); ASSERT_OK(Put(Key(0), "a")); ASSERT_OK(Put(Key(5), "a")); ASSERT_OK(Flush()); ASSERT_OK(Put(Key(10), "a")); ASSERT_OK(Put(Key(15), "a")); ASSERT_OK(Flush()); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); Reopen(options); } TEST_F(DBTest2, MemtableOnlyIterator) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "first")); ASSERT_OK(Put(1, "bar", "second")); ReadOptions ropt; ropt.read_tier = kMemtableTier; std::string value; Iterator* it = nullptr; // Before flushing // point lookups ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); ASSERT_EQ("first", value); ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); ASSERT_EQ("second", value); // Memtable-only iterator (read_tier=kMemtableTier); data not flushed yet. it = db_->NewIterator(ropt, handles_[1]); int count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); count++; } ASSERT_TRUE(!it->Valid()); ASSERT_OK(it->status()); ASSERT_EQ(2, count); delete it; ASSERT_OK(Flush(1)); // After flushing // point lookups ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); ASSERT_EQ("first", value); ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); ASSERT_EQ("second", value); // nothing should be returned using memtable-only iterator after flushing. it = db_->NewIterator(ropt, handles_[1]); ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); count++; } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(0, count); ASSERT_OK(it->status()); delete it; // Add a key to memtable ASSERT_OK(Put(1, "foobar", "third")); it = db_->NewIterator(ropt, handles_[1]); ASSERT_OK(it->status()); count = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { ASSERT_TRUE(it->Valid()); ASSERT_EQ("foobar", it->key().ToString()); ASSERT_EQ("third", it->value().ToString()); count++; } ASSERT_TRUE(!it->Valid()); ASSERT_EQ(1, count); ASSERT_OK(it->status()); delete it; } TEST_F(DBTest2, LowPriWrite) { Options options = CurrentOptions(); // Compaction pressure should trigger since 6 files options.level0_file_num_compaction_trigger = 4; options.level0_slowdown_writes_trigger = 12; options.level0_stop_writes_trigger = 30; options.delayed_write_rate = 8 * 1024 * 1024; Reopen(options); std::atomic rate_limit_count(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "GenericRateLimiter::Request:1", [&](void* arg) { rate_limit_count.fetch_add(1); int64_t* rate_bytes_per_sec = static_cast(arg); ASSERT_EQ(1024 * 1024, *rate_bytes_per_sec); }); // Make a trivial L5 for L0 to compact into. L6 will be large so debt ratio // will not cause compaction pressure. 
Random rnd(301); ASSERT_OK(Put("", rnd.RandomString(102400))); ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_OK(Put("", "")); ASSERT_OK(Flush()); MoveFilesToLevel(5); // Block compaction ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBTest.LowPriWrite:0", "DBImpl::BGWorkCompaction"}, }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); WriteOptions wo; for (int i = 0; i < 6; i++) { wo.low_pri = false; ASSERT_OK(Put("", "", wo)); wo.low_pri = true; ASSERT_OK(Put("", "", wo)); ASSERT_OK(Flush()); } ASSERT_EQ(0, rate_limit_count.load()); wo.low_pri = true; ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = false; ASSERT_OK(Put("", "", wo)); ASSERT_EQ(1, rate_limit_count.load()); wo.low_pri = true; std::string big_value = std::string(1 * 1024 * 1024, 'x'); ASSERT_OK(Put("", big_value, wo)); ASSERT_LT(1, rate_limit_count.load()); // Reset rate_limit_count = 0; wo.low_pri = false; ASSERT_OK(Put("", big_value, wo)); ASSERT_EQ(0, rate_limit_count.load()); TEST_SYNC_POINT("DBTest.LowPriWrite:0"); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); wo.low_pri = true; ASSERT_OK(Put("", "", wo)); ASSERT_EQ(0, rate_limit_count.load()); wo.low_pri = false; ASSERT_OK(Put("", "", wo)); ASSERT_EQ(0, rate_limit_count.load()); } TEST_F(DBTest2, RateLimitedCompactionReads) { // compaction input has 512KB data const int kNumKeysPerFile = 128; const int kBytesPerKey = 1024; const int kNumL0Files = 4; for (int compaction_readahead_size : {0, 32 << 10}) { for (auto use_direct_io : {false, true}) { if (use_direct_io && !IsDirectIOSupported()) { continue; } Options options = CurrentOptions(); options.compaction_readahead_size = compaction_readahead_size; options.compression = kNoCompression; options.level0_file_num_compaction_trigger = kNumL0Files; options.memtable_factory.reset( test::NewSpecialSkipListFactory(kNumKeysPerFile)); // takes roughly one second, split into 100 x 10ms intervals. Each // interval permits 5.12KB, which is smaller than the block size, so this // test exercises the code for chunking reads. 
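  // Spelled out, the arithmetic behind the comment above: the rate below is
  // kNumL0Files * kNumKeysPerFile * kBytesPerKey = 4 * 128 * 1024 = 524,288
  // bytes/sec, i.e. the full compaction input per second. A 10ms refill
  // period means 100 refills per second, so each refill grants roughly
  // 524,288 / 100 = 5,243 bytes (~5.12KB), which is less than the 16KB block
  // size configured below, so reading one block takes several rate-limiter
  // requests.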
options.rate_limiter.reset(NewGenericRateLimiter( static_cast(kNumL0Files * kNumKeysPerFile * kBytesPerKey) /* rate_bytes_per_sec */, 10 * 1000 /* refill_period_us */, 10 /* fairness */, RateLimiter::Mode::kReadsOnly)); options.use_direct_reads = options.use_direct_io_for_flush_and_compaction = use_direct_io; BlockBasedTableOptions bbto; bbto.block_size = 16384; bbto.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); // To precisely control when to start bg compaction for excluding previous // rate-limited bytes of flush read for table verification std::shared_ptr sleeping_task( new test::SleepingBackgroundTask()); env_->SetBackgroundThreads(1, Env::LOW); env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, sleeping_task.get(), Env::Priority::LOW); sleeping_task->WaitUntilSleeping(); for (int i = 0; i < kNumL0Files; ++i) { for (int j = 0; j <= kNumKeysPerFile; ++j) { ASSERT_OK(Put(Key(j), DummyString(kBytesPerKey))); } ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); if (i + 1 < kNumL0Files) { ASSERT_EQ(i + 1, NumTableFilesAtLevel(0)); } } size_t rate_limited_bytes_start_bytes = options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL); sleeping_task->WakeUp(); sleeping_task->WaitUntilDone(); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ(0, NumTableFilesAtLevel(0)); // should be slightly above 512KB due to non-data blocks read. Arbitrarily // chose 1MB as the upper bound on the total bytes read. size_t rate_limited_bytes = static_cast( options.rate_limiter->GetTotalBytesThrough(Env::IO_TOTAL)) - rate_limited_bytes_start_bytes; // The charges can exist for `IO_LOW` and `IO_USER` priorities. size_t rate_limited_bytes_by_pri = options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) + options.rate_limiter->GetTotalBytesThrough(Env::IO_USER); ASSERT_EQ(rate_limited_bytes, static_cast(rate_limited_bytes_by_pri)); // Include the explicit prefetch of the footer in direct I/O case. size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0; ASSERT_GE( rate_limited_bytes, static_cast(kNumKeysPerFile * kBytesPerKey * kNumL0Files)); ASSERT_LT( rate_limited_bytes, static_cast(2 * kNumKeysPerFile * kBytesPerKey * kNumL0Files + direct_io_extra)); Iterator* iter = db_->NewIterator(ReadOptions()); ASSERT_OK(iter->status()); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_EQ(iter->value().ToString(), DummyString(kBytesPerKey)); } delete iter; // bytes read for user iterator shouldn't count against the rate limit. rate_limited_bytes_by_pri = options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW) + options.rate_limiter->GetTotalBytesThrough(Env::IO_USER); ASSERT_EQ(rate_limited_bytes, static_cast(rate_limited_bytes_by_pri)); } } } // Make sure DB can be reopen with reduced number of levels, given no file // is on levels higher than the new num_levels. 
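// (Aside: a minimal sketch of the preparatory step such a reopen requires,
// namely compacting all files down into the surviving level range first,
// assuming an already-open DB* db:
//
//   CompactRangeOptions cro;
//   cro.change_level = true;  // place the compaction output on a chosen level
//   cro.target_level = 1;     // must be below the new, smaller num_levels
//   ASSERT_OK(db->CompactRange(cro, nullptr, nullptr));
//   // The DB can now be reopened with a reduced options.num_levels.
// )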
TEST_F(DBTest2, ReduceLevel) { Options options; options.env = env_; options.disable_auto_compactions = true; options.num_levels = 7; Reopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,1", FilesPerLevel()); CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 1; ASSERT_OK(dbfull()->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel()); options.num_levels = 3; Reopen(options); ASSERT_EQ("0,1", FilesPerLevel()); } // Test that ReadCallback is actually used in both memtbale and sst tables TEST_F(DBTest2, ReadCallbackTest) { Options options; options.disable_auto_compactions = true; options.num_levels = 7; options.env = env_; Reopen(options); std::vector snapshots; // Try to create a db with multiple layers and a memtable const std::string key = "foo"; const std::string value = "bar"; // This test assumes that the seq start with 1 and increased by 1 after each // write batch of size 1. If that behavior changes, the test needs to be // updated as well. // TODO(myabandeh): update this test to use the seq number that is returned by // the DB instead of assuming what seq the DB used. int i = 1; for (; i < 10; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } ASSERT_OK(Flush()); for (; i < 20; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); // Take a snapshot to avoid the value being removed during compaction auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } ASSERT_OK(Flush()); MoveFilesToLevel(6); ASSERT_EQ("0,0,0,0,0,0,2", FilesPerLevel()); for (; i < 30; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } ASSERT_OK(Flush()); ASSERT_EQ("1,0,0,0,0,0,2", FilesPerLevel()); // And also add some values to the memtable for (; i < 40; i++) { ASSERT_OK(Put(key, value + std::to_string(i))); auto snapshot = dbfull()->GetSnapshot(); snapshots.push_back(snapshot); } class TestReadCallback : public ReadCallback { public: explicit TestReadCallback(SequenceNumber snapshot) : ReadCallback(snapshot), snapshot_(snapshot) {} bool IsVisibleFullCheck(SequenceNumber seq) override { return seq <= snapshot_; } private: SequenceNumber snapshot_; }; for (int seq = 1; seq < i; seq++) { PinnableSlice pinnable_val; ReadOptions roptions; TestReadCallback callback(seq); bool dont_care = true; DBImpl::GetImplOptions get_impl_options; get_impl_options.column_family = dbfull()->DefaultColumnFamily(); get_impl_options.value = &pinnable_val; get_impl_options.value_found = &dont_care; get_impl_options.callback = &callback; Status s = dbfull()->GetImpl(roptions, key, get_impl_options); ASSERT_TRUE(s.ok()); // Assuming that after each Put the DB increased seq by one, the value and // seq number must be equal since we also inc value by 1 after each Put. ASSERT_EQ(value + std::to_string(seq), pinnable_val.ToString()); } for (auto snapshot : snapshots) { dbfull()->ReleaseSnapshot(snapshot); } } TEST_F(DBTest2, LiveFilesOmitObsoleteFiles) { // Regression test for race condition where an obsolete file is returned to // user as a "live file" but then deleted, all while file deletions are // disabled. // // It happened like this: // // 1. [flush thread] Log file "x.log" found by FindObsoleteFiles // 2. 
[user thread] DisableFileDeletions, GetSortedWalFiles are called and the // latter returned "x.log" // 3. [flush thread] PurgeObsoleteFiles deleted "x.log" // 4. [user thread] Reading "x.log" failed // // Unfortunately the only regression test I can come up with involves sleep. // We cannot set SyncPoints to repro since, once the fix is applied, the // SyncPoints would cause a deadlock as the repro's sequence of events is now // prohibited. // // Instead, if we sleep for a second between Find and Purge, and ensure the // read attempt happens after purge, then the sequence of events will almost // certainly happen on the old code. ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBImpl::BackgroundCallFlush:FilesFound", "DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"}, {"DBImpl::PurgeObsoleteFiles:End", "DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"}, }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::PurgeObsoleteFiles:Begin", [&](void* /*arg*/) { env_->SleepForMicroseconds(1000000); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put("key", "val")); FlushOptions flush_opts; flush_opts.wait = false; ASSERT_OK(db_->Flush(flush_opts)); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:FlushTriggered"); ASSERT_OK(db_->DisableFileDeletions()); VectorWalPtr log_files; ASSERT_OK(db_->GetSortedWalFiles(log_files)); TEST_SYNC_POINT("DBTest2::LiveFilesOmitObsoleteFiles:LiveFilesCaptured"); for (const auto& log_file : log_files) { ASSERT_OK(env_->FileExists(LogFileName(dbname_, log_file->LogNumber()))); } ASSERT_OK(db_->EnableFileDeletions()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestNumPread) { Options options = CurrentOptions(); bool prefetch_supported = test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); // disable block cache BlockBasedTableOptions table_options; table_options.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); env_->count_random_reads_ = true; env_->random_file_open_counter_.store(0); ASSERT_OK(Put("bar", "foo")); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); if (prefetch_supported) { // After flush, we'll open the file and read footer, meta block, // property block and index block. ASSERT_EQ(4, env_->random_read_counter_.Read()); } else { // With prefetch not supported, we will do a single read into a buffer ASSERT_EQ(1, env_->random_read_counter_.Read()); } ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_EQ("bar", Get("foo")); ASSERT_EQ(1, env_->random_read_counter_.Read()); // All files are already opened. ASSERT_EQ(0, env_->random_file_open_counter_.load()); env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(Put("bar2", "foo2")); ASSERT_OK(Put("foo2", "bar2")); ASSERT_OK(Flush()); if (prefetch_supported) { // After flush, we'll open the file and read footer, meta block, // property block and index block. 
ASSERT_EQ(4, env_->random_read_counter_.Read()); } else { // With prefetch not supported, we will do a single read into a buffer ASSERT_EQ(1, env_->random_read_counter_.Read()); } ASSERT_EQ(1, env_->random_file_open_counter_.load()); env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); if (prefetch_supported) { // Compaction needs two input blocks, which requires 2 preads, and // generate a new SST file which needs 4 preads (footer, meta block, // property block and index block). In total 6. ASSERT_EQ(6, env_->random_read_counter_.Read()); } else { // With prefetch off, compaction needs two input blocks, // followed by a single buffered read. In total 3. ASSERT_EQ(3, env_->random_read_counter_.Read()); } // All compaction input files should have already been opened. ASSERT_EQ(1, env_->random_file_open_counter_.load()); // One pread per a normal data block read env_->random_file_open_counter_.store(0); env_->random_read_counter_.Reset(); ASSERT_EQ("foo2", Get("bar2")); ASSERT_EQ(1, env_->random_read_counter_.Read()); // SST files are already opened. ASSERT_EQ(0, env_->random_file_open_counter_.load()); } class TraceExecutionResultHandler : public TraceRecordResult::Handler { public: TraceExecutionResultHandler() = default; ~TraceExecutionResultHandler() override = default; Status Handle(const StatusOnlyTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } result.GetStatus().PermitUncheckedError(); switch (result.GetTraceType()) { case kTraceWrite: { total_latency_ += result.GetLatency(); cnt_++; writes_++; break; } default: return Status::Corruption("Type mismatch."); } return Status::OK(); } Status Handle(const SingleValueTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } result.GetStatus().PermitUncheckedError(); switch (result.GetTraceType()) { case kTraceGet: { total_latency_ += result.GetLatency(); cnt_++; gets_++; break; } default: return Status::Corruption("Type mismatch."); } return Status::OK(); } Status Handle(const MultiValuesTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } for (const Status& s : result.GetMultiStatus()) { s.PermitUncheckedError(); } switch (result.GetTraceType()) { case kTraceMultiGet: { total_latency_ += result.GetLatency(); cnt_++; multigets_++; break; } default: return Status::Corruption("Type mismatch."); } return Status::OK(); } Status Handle(const IteratorTraceExecutionResult& result) override { if (result.GetStartTimestamp() > result.GetEndTimestamp()) { return Status::InvalidArgument("Invalid timestamps."); } result.GetStatus().PermitUncheckedError(); switch (result.GetTraceType()) { case kTraceIteratorSeek: case kTraceIteratorSeekForPrev: { total_latency_ += result.GetLatency(); cnt_++; seeks_++; break; } default: return Status::Corruption("Type mismatch."); } return Status::OK(); } void Reset() { total_latency_ = 0; cnt_ = 0; writes_ = 0; gets_ = 0; seeks_ = 0; multigets_ = 0; } double GetAvgLatency() const { return cnt_ == 0 ? 
0.0 : 1.0 * total_latency_ / cnt_; } int GetNumWrites() const { return writes_; } int GetNumGets() const { return gets_; } int GetNumIterSeeks() const { return seeks_; } int GetNumMultiGets() const { return multigets_; } private: std::atomic total_latency_{0}; std::atomic cnt_{0}; std::atomic writes_{0}; std::atomic gets_{0}; std::atomic seeks_{0}; std::atomic multigets_{0}; }; TEST_F(DBTest2, TraceAndReplay) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); ReadOptions ro; WriteOptions wo; TraceOptions trace_opts; EnvOptions env_opts; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); Iterator* single_iter = nullptr; ASSERT_TRUE(db_->EndTrace().IsIOError()); std::string trace_filename = dbname_ + "/rocksdb.trace"; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); // 5 Writes ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Merge(0, "b", "2")); ASSERT_OK(Delete(0, "c")); ASSERT_OK(SingleDelete(0, "d")); ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); // 6th Write WriteBatch batch; ASSERT_OK(batch.Put("f", "11")); ASSERT_OK(batch.Merge("g", "12")); ASSERT_OK(batch.Delete("h")); ASSERT_OK(batch.SingleDelete("i")); ASSERT_OK(batch.DeleteRange("j", "k")); ASSERT_OK(db_->Write(wo, &batch)); // 2 Seek(ForPrev)s single_iter = db_->NewIterator(ro); single_iter->Seek("f"); // Seek 1 single_iter->SeekForPrev("g"); ASSERT_OK(single_iter->status()); delete single_iter; // 2 Gets ASSERT_EQ("1", Get(0, "a")); ASSERT_EQ("12", Get(0, "g")); // 7th and 8th Write, 3rd Get ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "rocksdb", "rocks")); ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); // Total Write x 8, Get x 3, Seek x 2. ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. ASSERT_OK(Put("hello", "world")); ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). 
DB* db2_init = nullptr; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; delete db2_init; DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); std::unique_ptr replayer; ASSERT_OK( db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); TraceExecutionResultHandler res_handler; std::function &&)> res_cb = [&res_handler](Status exec_s, std::unique_ptr&& res) { ASSERT_TRUE(exec_s.ok() || exec_s.IsNotSupported()); if (res != nullptr) { ASSERT_OK(res->Accept(&res_handler)); res.reset(); } }; // Unprepared replay should fail with Status::Incomplete() ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); ASSERT_OK(replayer->Prepare()); // Ok to repeatedly Prepare(). ASSERT_OK(replayer->Prepare()); // Replay using 1 thread, 1x speed. ASSERT_OK(replayer->Replay(ReplayOptions(1, 1.0), res_cb)); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); ASSERT_EQ("1", value); ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); ASSERT_EQ("12", value); ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); ASSERT_EQ("bar", value); ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); ASSERT_EQ("rocks", value); // Re-replay should fail with Status::Incomplete() if Prepare() was not // called. Currently we don't distinguish between unprepared and trace end. ASSERT_TRUE(replayer->Replay(ReplayOptions(), nullptr).IsIncomplete()); // Re-replay using 2 threads, 2x speed. ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(2, 2.0), res_cb)); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); // Re-replay using 2 threads, 1/2 speed. 
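  // (ReplayOptions takes the number of replay threads and a fast-forward
  // factor: 1.0 preserves the recorded timing, 2.0 replays roughly twice as
  // fast, and 0.5 at half speed. A sketch of the half-speed call made below:
  //
  //   ASSERT_OK(replayer->Prepare());
  //   ASSERT_OK(replayer->Replay(
  //       ReplayOptions(2 /* threads */, 0.5 /* speed ratio */), res_cb));
  // )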
ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(2, 0.5), res_cb)); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 8); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 2); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); replayer.reset(); for (auto handle : handles) { delete handle; } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); } TEST_F(DBTest2, TraceAndManualReplay) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); ReadOptions ro; WriteOptions wo; TraceOptions trace_opts; EnvOptions env_opts; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); Iterator* single_iter = nullptr; ASSERT_TRUE(db_->EndTrace().IsIOError()); std::string trace_filename = dbname_ + "/rocksdb.trace"; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Merge(0, "b", "2")); ASSERT_OK(Delete(0, "c")); ASSERT_OK(SingleDelete(0, "d")); ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); WriteBatch batch; ASSERT_OK(batch.Put("f", "11")); ASSERT_OK(batch.Merge("g", "12")); ASSERT_OK(batch.Delete("h")); ASSERT_OK(batch.SingleDelete("i")); ASSERT_OK(batch.DeleteRange("j", "k")); ASSERT_OK(db_->Write(wo, &batch)); single_iter = db_->NewIterator(ro); single_iter->Seek("f"); single_iter->SeekForPrev("g"); ASSERT_OK(single_iter->status()); delete single_iter; // Write some sequenced keys for testing lower/upper bounds of iterator. batch.Clear(); ASSERT_OK(batch.Put("iter-0", "iter-0")); ASSERT_OK(batch.Put("iter-1", "iter-1")); ASSERT_OK(batch.Put("iter-2", "iter-2")); ASSERT_OK(batch.Put("iter-3", "iter-3")); ASSERT_OK(batch.Put("iter-4", "iter-4")); ASSERT_OK(db_->Write(wo, &batch)); ReadOptions bounded_ro = ro; Slice lower_bound("iter-1"); Slice upper_bound("iter-3"); bounded_ro.iterate_lower_bound = &lower_bound; bounded_ro.iterate_upper_bound = &upper_bound; single_iter = db_->NewIterator(bounded_ro); single_iter->Seek("iter-0"); ASSERT_EQ(single_iter->key().ToString(), "iter-1"); single_iter->Seek("iter-2"); ASSERT_EQ(single_iter->key().ToString(), "iter-2"); single_iter->Seek("iter-4"); ASSERT_FALSE(single_iter->Valid()); single_iter->SeekForPrev("iter-0"); ASSERT_FALSE(single_iter->Valid()); single_iter->SeekForPrev("iter-2"); ASSERT_EQ(single_iter->key().ToString(), "iter-2"); single_iter->SeekForPrev("iter-4"); ASSERT_EQ(single_iter->key().ToString(), "iter-2"); ASSERT_OK(single_iter->status()); delete single_iter; ASSERT_EQ("1", Get(0, "a")); ASSERT_EQ("12", Get(0, "g")); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "rocksdb", "rocks")); ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); // Same as TraceAndReplay, Write x 8, Get x 3, Seek x 2. // Plus 1 WriteBatch for iterator with lower/upper bounds, and 6 // Seek(ForPrev)s. // Total Write x 9, Get x 3, Seek x 8 ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. ASSERT_OK(Put("hello", "world")); ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). 
DB* db2_init = nullptr; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; delete db2_init; DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); std::unique_ptr replayer; ASSERT_OK( db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); TraceExecutionResultHandler res_handler; // Manual replay for 2 times. The 2nd checks if the replay can restart. std::unique_ptr record; std::unique_ptr result; for (int i = 0; i < 2; i++) { // Next should fail if unprepared. ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); ASSERT_OK(replayer->Prepare()); Status s = Status::OK(); // Looping until trace end. while (s.ok()) { s = replayer->Next(&record); // Skip unsupported operations. if (s.IsNotSupported()) { continue; } if (s.ok()) { ASSERT_OK(replayer->Execute(record, &result)); if (result != nullptr) { ASSERT_OK(result->Accept(&res_handler)); if (record->GetTraceType() == kTraceIteratorSeek || record->GetTraceType() == kTraceIteratorSeekForPrev) { IteratorSeekQueryTraceRecord* iter_rec = dynamic_cast(record.get()); IteratorTraceExecutionResult* iter_res = dynamic_cast(result.get()); // Check if lower/upper bounds are correctly saved and decoded. std::string lower_str = iter_rec->GetLowerBound().ToString(); std::string upper_str = iter_rec->GetUpperBound().ToString(); std::string iter_key = iter_res->GetKey().ToString(); std::string iter_value = iter_res->GetValue().ToString(); if (!lower_str.empty() && !upper_str.empty()) { ASSERT_EQ(lower_str, "iter-1"); ASSERT_EQ(upper_str, "iter-3"); if (iter_res->GetValid()) { // If iterator is valid, then lower_bound <= key < upper_bound. ASSERT_GE(iter_key, lower_str); ASSERT_LT(iter_key, upper_str); } else { // If iterator is invalid, then // key < lower_bound or key >= upper_bound. ASSERT_TRUE(iter_key < lower_str || iter_key >= upper_str); } } // If iterator is invalid, the key and value should be empty. if (!iter_res->GetValid()) { ASSERT_TRUE(iter_key.empty()); ASSERT_TRUE(iter_value.empty()); } } result.reset(); } } } // Status::Incomplete() will be returned when manually reading the trace // end, or Prepare() was not called. 
ASSERT_TRUE(s.IsIncomplete()); ASSERT_TRUE(replayer->Next(nullptr).IsIncomplete()); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 9); ASSERT_EQ(res_handler.GetNumGets(), 3); ASSERT_EQ(res_handler.GetNumIterSeeks(), 8); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); } ASSERT_OK(db2->Get(ro, handles[0], "a", &value)); ASSERT_EQ("1", value); ASSERT_OK(db2->Get(ro, handles[0], "g", &value)); ASSERT_EQ("12", value); ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); ASSERT_OK(db2->Get(ro, handles[1], "foo", &value)); ASSERT_EQ("bar", value); ASSERT_OK(db2->Get(ro, handles[1], "rocksdb", &value)); ASSERT_EQ("rocks", value); // Test execution of artificially created TraceRecords. uint64_t fake_ts = 1U; // Write batch.Clear(); ASSERT_OK(batch.Put("trace-record-write1", "write1")); ASSERT_OK(batch.Put("trace-record-write2", "write2")); record.reset(new WriteQueryTraceRecord(batch.Data(), fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // Write x 1 ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write1", &value)); ASSERT_EQ("write1", value); ASSERT_OK(db2->Get(ro, handles[0], "trace-record-write2", &value)); ASSERT_EQ("write2", value); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 1); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); // Get related // Get an existing key. record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-write1", fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // Get x 1 // Get an non-existing key, should still return Status::OK(). record.reset(new GetQueryTraceRecord(handles[0]->GetID(), "trace-record-get", fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // Get x 2 // Get from an invalid (non-existing) cf_id. uint32_t invalid_cf_id = handles[1]->GetID() + 1; record.reset(new GetQueryTraceRecord(invalid_cf_id, "whatever", fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); ASSERT_TRUE(result == nullptr); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 2); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); // Iteration related for (IteratorSeekQueryTraceRecord::SeekType seekType : {IteratorSeekQueryTraceRecord::kSeek, IteratorSeekQueryTraceRecord::kSeekForPrev}) { // Seek to an existing key. record.reset(new IteratorSeekQueryTraceRecord( seekType, handles[0]->GetID(), "trace-record-write1", fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // Seek x 1 in one iteration // Seek to an non-existing key, should still return Status::OK(). record.reset(new IteratorSeekQueryTraceRecord( seekType, handles[0]->GetID(), "trace-record-get", fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // Seek x 2 in one iteration // Seek from an invalid cf_id. 
record.reset(new IteratorSeekQueryTraceRecord(seekType, invalid_cf_id, "whatever", fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); ASSERT_TRUE(result == nullptr); } ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 4); // Seek x 2 in two iterations ASSERT_EQ(res_handler.GetNumMultiGets(), 0); res_handler.Reset(); // MultiGet related // Get existing keys. record.reset(new MultiGetQueryTraceRecord( std::vector({handles[0]->GetID(), handles[1]->GetID()}), std::vector({"a", "foo"}), fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 1 // Get all non-existing keys, should still return Status::OK(). record.reset(new MultiGetQueryTraceRecord( std::vector({handles[0]->GetID(), handles[1]->GetID()}), std::vector({"no1", "no2"}), fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 2 // Get mixed of existing and non-existing keys, should still return // Status::OK(). record.reset(new MultiGetQueryTraceRecord( std::vector({handles[0]->GetID(), handles[1]->GetID()}), std::vector({"a", "no2"}), fake_ts++)); ASSERT_OK(replayer->Execute(record, &result)); ASSERT_TRUE(result != nullptr); MultiValuesTraceExecutionResult* mvr = dynamic_cast(result.get()); ASSERT_TRUE(mvr != nullptr); ASSERT_OK(mvr->GetMultiStatus()[0]); ASSERT_TRUE(mvr->GetMultiStatus()[1].IsNotFound()); ASSERT_EQ(mvr->GetValues()[0], "1"); ASSERT_EQ(mvr->GetValues()[1], ""); ASSERT_OK(result->Accept(&res_handler)); // MultiGet x 3 // Get from an invalid (non-existing) cf_id. 
record.reset(new MultiGetQueryTraceRecord( std::vector( {handles[0]->GetID(), handles[1]->GetID(), invalid_cf_id}), std::vector({"a", "foo", "whatever"}), fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsCorruption()); ASSERT_TRUE(result == nullptr); // Empty MultiGet record.reset(new MultiGetQueryTraceRecord( std::vector(), std::vector(), fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); ASSERT_TRUE(result == nullptr); // MultiGet size mismatch record.reset(new MultiGetQueryTraceRecord( std::vector({handles[0]->GetID(), handles[1]->GetID()}), std::vector({"a"}), fake_ts++)); ASSERT_TRUE(replayer->Execute(record, &result).IsInvalidArgument()); ASSERT_TRUE(result == nullptr); ASSERT_GE(res_handler.GetAvgLatency(), 0.0); ASSERT_EQ(res_handler.GetNumWrites(), 0); ASSERT_EQ(res_handler.GetNumGets(), 0); ASSERT_EQ(res_handler.GetNumIterSeeks(), 0); ASSERT_EQ(res_handler.GetNumMultiGets(), 3); res_handler.Reset(); replayer.reset(); for (auto handle : handles) { delete handle; } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); } TEST_F(DBTest2, TraceWithLimit) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); ReadOptions ro; WriteOptions wo; TraceOptions trace_opts; EnvOptions env_opts; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); // test the max trace file size options trace_opts.max_trace_file_size = 5; std::string trace_filename = dbname_ + "/rocksdb.trace1"; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Put(0, "b", "1")); ASSERT_OK(Put(0, "c", "1")); ASSERT_OK(db_->EndTrace()); std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay2"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). 
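  // (Aside: TraceOptions::max_trace_file_size caps the trace file in bytes;
  // once the cap is reached, further operations are dropped from the trace.
  // With the 5-byte cap above, none of the Puts fit, which is why the replay
  // below is expected to restore nothing. A hedged sketch of a realistic cap:
  //
  //   TraceOptions opts;
  //   opts.max_trace_file_size = 64 << 20;  // stop tracing past ~64MB
  // )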
DB* db2_init = nullptr; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; delete db2_init; DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); std::unique_ptr replayer; ASSERT_OK( db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); for (auto handle : handles) { delete handle; } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); } TEST_F(DBTest2, TraceWithSampling) { Options options = CurrentOptions(); ReadOptions ro; WriteOptions wo; TraceOptions trace_opts; EnvOptions env_opts; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); // test the trace file sampling options trace_opts.sampling_frequency = 2; std::string trace_filename = dbname_ + "/rocksdb.trace_sampling"; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Put(0, "b", "2")); ASSERT_OK(Put(0, "c", "3")); ASSERT_OK(Put(0, "d", "4")); ASSERT_OK(Put(0, "e", "5")); ASSERT_OK(db_->EndTrace()); std::string dbname2 = test::PerThreadDBPath(env_, "/db_replay_sampling"); std::string value; ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). 
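  // (Aside: TraceOptions::sampling_frequency = n records roughly one out of
  // every n operations; with n = 2 only every other Put above made it into
  // the trace, which is why the replay below is expected to restore "b" and
  // "d" but not "a", "c", or "e". A hedged sketch of a sparser setting:
  //
  //   TraceOptions opts;
  //   opts.sampling_frequency = 100;  // trace roughly 1% of operations
  // )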
DB* db2_init = nullptr; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; delete db2_init; DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "d", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound()); std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); std::unique_ptr replayer; ASSERT_OK( db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); replayer.reset(); ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_FALSE(db2->Get(ro, handles[0], "b", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "c", &value).IsNotFound()); ASSERT_FALSE(db2->Get(ro, handles[0], "d", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "e", &value).IsNotFound()); for (auto handle : handles) { delete handle; } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); } TEST_F(DBTest2, TraceWithFilter) { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreatePutOperator(); ReadOptions ro; WriteOptions wo; TraceOptions trace_opts; EnvOptions env_opts; CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); Iterator* single_iter = nullptr; trace_opts.filter = TraceFilterType::kTraceFilterWrite; std::string trace_filename = dbname_ + "/rocksdb.trace"; std::unique_ptr trace_writer; ASSERT_OK(NewFileTraceWriter(env_, env_opts, trace_filename, &trace_writer)); ASSERT_OK(db_->StartTrace(trace_opts, std::move(trace_writer))); ASSERT_OK(Put(0, "a", "1")); ASSERT_OK(Merge(0, "b", "2")); ASSERT_OK(Delete(0, "c")); ASSERT_OK(SingleDelete(0, "d")); ASSERT_OK(db_->DeleteRange(wo, dbfull()->DefaultColumnFamily(), "e", "f")); WriteBatch batch; ASSERT_OK(batch.Put("f", "11")); ASSERT_OK(batch.Merge("g", "12")); ASSERT_OK(batch.Delete("h")); ASSERT_OK(batch.SingleDelete("i")); ASSERT_OK(batch.DeleteRange("j", "k")); ASSERT_OK(db_->Write(wo, &batch)); single_iter = db_->NewIterator(ro); single_iter->Seek("f"); single_iter->SeekForPrev("g"); delete single_iter; ASSERT_EQ("1", Get(0, "a")); ASSERT_EQ("12", Get(0, "g")); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "rocksdb", "rocks")); ASSERT_EQ("NOT_FOUND", Get(1, "leveldb")); ASSERT_OK(db_->EndTrace()); // These should not get into the trace file as it is after EndTrace. ASSERT_OK(Put("hello", "world")); ASSERT_OK(Merge("foo", "bar")); // Open another db, replay, and verify the data std::string value; std::string dbname2 = test::PerThreadDBPath(env_, "db_replay"); ASSERT_OK(DestroyDB(dbname2, options)); // Using a different name than db2, to pacify infer's use-after-lifetime // warnings (http://fbinfer.com). 
DB* db2_init = nullptr; options.create_if_missing = true; ASSERT_OK(DB::Open(options, dbname2, &db2_init)); ColumnFamilyHandle* cf; ASSERT_OK( db2_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf)); delete cf; delete db2_init; DB* db2 = nullptr; std::vector column_families; ColumnFamilyOptions cf_options; cf_options.merge_operator = MergeOperators::CreatePutOperator(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); std::vector handles; DBOptions db_opts; db_opts.env = env_; ASSERT_OK(DB::Open(db_opts, dbname2, column_families, &handles, &db2)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); std::unique_ptr trace_reader; ASSERT_OK(NewFileTraceReader(env_, env_opts, trace_filename, &trace_reader)); std::unique_ptr replayer; ASSERT_OK( db2->NewDefaultReplayer(handles, std::move(trace_reader), &replayer)); ASSERT_OK(replayer->Prepare()); ASSERT_OK(replayer->Replay(ReplayOptions(), nullptr)); replayer.reset(); // All the key-values should not present since we filter out the WRITE ops. ASSERT_TRUE(db2->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "g", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "hello", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "world", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "foo", &value).IsNotFound()); ASSERT_TRUE(db2->Get(ro, handles[0], "rocksdb", &value).IsNotFound()); for (auto handle : handles) { delete handle; } delete db2; ASSERT_OK(DestroyDB(dbname2, options)); // Set up a new db. std::string dbname3 = test::PerThreadDBPath(env_, "db_not_trace_read"); ASSERT_OK(DestroyDB(dbname3, options)); DB* db3_init = nullptr; options.create_if_missing = true; ColumnFamilyHandle* cf3; ASSERT_OK(DB::Open(options, dbname3, &db3_init)); ASSERT_OK( db3_init->CreateColumnFamily(ColumnFamilyOptions(), "pikachu", &cf3)); delete cf3; delete db3_init; column_families.clear(); column_families.emplace_back("default", cf_options); column_families.emplace_back("pikachu", ColumnFamilyOptions()); handles.clear(); DB* db3 = nullptr; ASSERT_OK(DB::Open(db_opts, dbname3, column_families, &handles, &db3)); env_->SleepForMicroseconds(100); // Verify that the keys don't already exist ASSERT_TRUE(db3->Get(ro, handles[0], "a", &value).IsNotFound()); ASSERT_TRUE(db3->Get(ro, handles[0], "g", &value).IsNotFound()); // The tracer will not record the READ ops. 
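  // (TraceOptions::filter is a bitmask of TraceFilterType flags:
  // kTraceFilterWrite drops write records, kTraceFilterGet drops Get records,
  // and flags can be combined. A hedged sketch that filters out both:
  //
  //   TraceOptions opts;
  //   opts.filter = TraceFilterType::kTraceFilterGet |
  //                 TraceFilterType::kTraceFilterWrite;
  // )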
trace_opts.filter = TraceFilterType::kTraceFilterGet; std::string trace_filename3 = dbname_ + "/rocksdb.trace_3"; std::unique_ptr trace_writer3; ASSERT_OK( NewFileTraceWriter(env_, env_opts, trace_filename3, &trace_writer3)); ASSERT_OK(db3->StartTrace(trace_opts, std::move(trace_writer3))); ASSERT_OK(db3->Put(wo, handles[0], "a", "1")); ASSERT_OK(db3->Merge(wo, handles[0], "b", "2")); ASSERT_OK(db3->Delete(wo, handles[0], "c")); ASSERT_OK(db3->SingleDelete(wo, handles[0], "d")); ASSERT_OK(db3->Get(ro, handles[0], "a", &value)); ASSERT_EQ(value, "1"); ASSERT_TRUE(db3->Get(ro, handles[0], "c", &value).IsNotFound()); ASSERT_OK(db3->EndTrace()); for (auto handle : handles) { delete handle; } delete db3; ASSERT_OK(DestroyDB(dbname3, options)); std::unique_ptr trace_reader3; ASSERT_OK( NewFileTraceReader(env_, env_opts, trace_filename3, &trace_reader3)); // Count the number of records in the trace file; int count = 0; std::string data; Status s; while (true) { s = trace_reader3->Read(&data); if (!s.ok()) { break; } count += 1; } // We also need to count the header and footer // 4 WRITE + HEADER + FOOTER = 6 ASSERT_EQ(count, 6); } TEST_F(DBTest2, PinnableSliceAndMmapReads) { Options options = CurrentOptions(); options.env = env_; if (!IsMemoryMappedAccessSupported()) { ROCKSDB_GTEST_SKIP("Test requires default environment"); return; } options.allow_mmap_reads = true; options.max_open_files = 100; options.compression = kNoCompression; Reopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); PinnableSlice pinned_value; ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); // It is not safe to pin mmap files as they might disappear by compaction ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); ASSERT_OK(dbfull()->TEST_CompactRange( 0 /* level */, nullptr /* begin */, nullptr /* end */, nullptr /* column_family */, true /* disallow_trivial_move */)); // Ensure pinned_value doesn't rely on memory munmap'd by the above // compaction. It crashes if it does. ASSERT_EQ(pinned_value.ToString(), "bar"); pinned_value.Reset(); // Unsafe to pin mmap files when they could be kicked out of table cache Close(); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); ASSERT_FALSE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); pinned_value.Reset(); // In read-only mode with infinite capacity on table cache it should pin the // value and avoid the memcpy Close(); options.max_open_files = -1; ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ(Get("foo", &pinned_value), Status::OK()); ASSERT_TRUE(pinned_value.IsPinned()); ASSERT_EQ(pinned_value.ToString(), "bar"); } TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions bbto; bbto.no_block_cache = false; bbto.cache_index_and_filter_blocks = false; bbto.block_cache = NewLRUCache(100000); bbto.block_size = 400; // small block size options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); Random rnd(301); std::string v = rnd.RandomString(400); // Since v is the size of a block, each key should take a block // of 400+ bytes. ASSERT_OK(Put("1", v)); ASSERT_OK(Put("3", v)); ASSERT_OK(Put("5", v)); ASSERT_OK(Put("7", v)); ASSERT_OK(Flush()); ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); // Verify that iterators don't pin more than one data block in block cache // at each time. 
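  // (Aside, relating to the PinnableSlice behaviour verified above: the
  // zero-copy read path looks like the following sketch, assuming an
  // already-open DB* db backed by block-based tables:
  //
  //   PinnableSlice v;
  //   ASSERT_OK(db->Get(ReadOptions(), db->DefaultColumnFamily(), "foo", &v));
  //   if (v.IsPinned()) {
  //     // v references the cached block directly; no copy was made.
  //   }
  //   v.Reset();  // release the pin so the underlying block can be evicted
  // )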
{ std::unique_ptr iter(db_->NewIterator(ReadOptions())); iter->SeekToFirst(); for (int i = 0; i < 4; i++) { ASSERT_TRUE(iter->Valid()); // Block cache should contain exactly one block. ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); iter->Next(); } ASSERT_FALSE(iter->Valid()); iter->Seek("4"); ASSERT_TRUE(iter->Valid()); ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); iter->Seek("3"); ASSERT_TRUE(iter->Valid()); ASSERT_OK(iter->status()); ASSERT_GT(bbto.block_cache->GetPinnedUsage(), 0); ASSERT_LT(bbto.block_cache->GetPinnedUsage(), 800); } ASSERT_EQ(0, bbto.block_cache->GetPinnedUsage()); // Test compaction case ASSERT_OK(Put("2", v)); ASSERT_OK(Put("5", v)); ASSERT_OK(Put("6", v)); ASSERT_OK(Put("8", v)); ASSERT_OK(Flush()); // Clear existing data in block cache bbto.block_cache->SetCapacity(0); bbto.block_cache->SetCapacity(100000); // Verify compaction input iterators don't hold more than one data blocks at // one time. std::atomic finished(false); std::atomic block_newed(0); std::atomic block_destroyed(0); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Block::Block:0", [&](void* /*arg*/) { if (finished) { return; } // Two iterators. At most 2 outstanding blocks. EXPECT_GE(block_newed.load(), block_destroyed.load()); EXPECT_LE(block_newed.load(), block_destroyed.load() + 1); block_newed.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "Block::~Block", [&](void* /*arg*/) { if (finished) { return; } // Two iterators. At most 2 outstanding blocks. EXPECT_GE(block_newed.load(), block_destroyed.load() + 1); EXPECT_LE(block_newed.load(), block_destroyed.load() + 2); block_destroyed.fetch_add(1); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "CompactionJob::Run:BeforeVerify", [&](void* /*arg*/) { finished = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); // Two input files. Each of them has 4 data blocks. 
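// Note: with 2 input files of 4 data blocks each, the compaction should
// construct and destroy 2 * 4 = 8 Block objects in total, which the two
// counters below verify.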
ASSERT_EQ(8, block_newed.load()); ASSERT_EQ(8, block_destroyed.load()); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1", "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"}, {"TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2", "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"}, }); SyncPoint::GetInstance()->EnableProcessing(); CreateColumnFamilies({"test1", "test2"}, Options()); ASSERT_EQ(handles_.size(), 2); DBImpl* dbi = static_cast_with_check(db_); port::Thread user_thread1([&]() { auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[0]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); TEST_SYNC_POINT( "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked1"); TEST_SYNC_POINT( "TestGetColumnFamilyHandleUnlocked::ReadColumnFamilyHandle1"); ASSERT_EQ(cfh->GetID(), handles_[0]->GetID()); }); port::Thread user_thread2([&]() { TEST_SYNC_POINT( "TestGetColumnFamilyHandleUnlocked::PreGetColumnFamilyHandleUnlocked2"); auto cfh = dbi->GetColumnFamilyHandleUnlocked(handles_[1]->GetID()); ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); TEST_SYNC_POINT( "TestGetColumnFamilyHandleUnlocked::GetColumnFamilyHandleUnlocked2"); ASSERT_EQ(cfh->GetID(), handles_[1]->GetID()); }); user_thread1.join(); user_thread2.join(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_F(DBTest2, TestCompactFiles) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"TestCompactFiles::IngestExternalFile1", "TestCompactFiles::IngestExternalFile2"}, }); SyncPoint::GetInstance()->EnableProcessing(); Options options; options.env = env_; options.num_levels = 2; options.disable_auto_compactions = true; Reopen(options); auto* handle = db_->DefaultColumnFamily(); ASSERT_EQ(db_->NumberLevels(handle), 2); ROCKSDB_NAMESPACE::SstFileWriter sst_file_writer{ ROCKSDB_NAMESPACE::EnvOptions(), options}; std::string external_file1 = dbname_ + "/test_compact_files1.sst_t"; std::string external_file2 = dbname_ + "/test_compact_files2.sst_t"; std::string external_file3 = dbname_ + "/test_compact_files3.sst_t"; ASSERT_OK(sst_file_writer.Open(external_file1)); ASSERT_OK(sst_file_writer.Put("1", "1")); ASSERT_OK(sst_file_writer.Put("2", "2")); ASSERT_OK(sst_file_writer.Finish()); ASSERT_OK(sst_file_writer.Open(external_file2)); ASSERT_OK(sst_file_writer.Put("3", "3")); ASSERT_OK(sst_file_writer.Put("4", "4")); ASSERT_OK(sst_file_writer.Finish()); ASSERT_OK(sst_file_writer.Open(external_file3)); ASSERT_OK(sst_file_writer.Put("5", "5")); ASSERT_OK(sst_file_writer.Put("6", "6")); ASSERT_OK(sst_file_writer.Finish()); ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file3}, IngestExternalFileOptions())); ASSERT_EQ(NumTableFilesAtLevel(1, 0), 2); std::vector files; GetSstFiles(env_, dbname_, &files); ASSERT_EQ(files.size(), 2); Status user_thread1_status; port::Thread user_thread1([&]() { user_thread1_status = db_->CompactFiles(CompactionOptions(), handle, files, 1); }); Status user_thread2_status; port::Thread user_thread2([&]() { user_thread2_status = 
db_->IngestExternalFile(handle, {external_file2}, IngestExternalFileOptions()); TEST_SYNC_POINT("TestCompactFiles::IngestExternalFile1"); }); user_thread1.join(); user_thread2.join(); ASSERT_OK(user_thread1_status); ASSERT_OK(user_thread2_status); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_F(DBTest2, MultiDBParallelOpenTest) { const int kNumDbs = 2; Options options = CurrentOptions(); std::vector dbnames; for (int i = 0; i < kNumDbs; ++i) { dbnames.emplace_back(test::PerThreadDBPath(env_, "db" + std::to_string(i))); ASSERT_OK(DestroyDB(dbnames.back(), options)); } // Verify empty DBs can be created in parallel std::vector open_threads; std::vector dbs{static_cast(kNumDbs), nullptr}; options.create_if_missing = true; for (int i = 0; i < kNumDbs; ++i) { open_threads.emplace_back( [&](int dbnum) { ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum])); }, i); } // Now add some data and close, so next we can verify non-empty DBs can be // recovered in parallel for (int i = 0; i < kNumDbs; ++i) { open_threads[i].join(); ASSERT_OK(dbs[i]->Put(WriteOptions(), "xi", "gua")); delete dbs[i]; } // Verify non-empty DBs can be recovered in parallel open_threads.clear(); for (int i = 0; i < kNumDbs; ++i) { open_threads.emplace_back( [&](int dbnum) { ASSERT_OK(DB::Open(options, dbnames[dbnum], &dbs[dbnum])); }, i); } // Wait and cleanup for (int i = 0; i < kNumDbs; ++i) { open_threads[i].join(); delete dbs[i]; ASSERT_OK(DestroyDB(dbnames[i], options)); } } namespace { class DummyOldStats : public Statistics { public: const char* Name() const override { return "DummyOldStats"; } uint64_t getTickerCount(uint32_t /*ticker_type*/) const override { return 0; } void recordTick(uint32_t /* ticker_type */, uint64_t /* count */) override { num_rt++; } void setTickerCount(uint32_t /*ticker_type*/, uint64_t /*count*/) override {} uint64_t getAndResetTickerCount(uint32_t /*ticker_type*/) override { return 0; } void measureTime(uint32_t /*histogram_type*/, uint64_t /*count*/) override { num_mt++; } void histogramData( uint32_t /*histogram_type*/, ROCKSDB_NAMESPACE::HistogramData* const /*data*/) const override {} std::string getHistogramString(uint32_t /*type*/) const override { return ""; } bool HistEnabledForType(uint32_t /*type*/) const override { return false; } std::string ToString() const override { return ""; } std::atomic num_rt{0}; std::atomic num_mt{0}; }; } // anonymous namespace TEST_F(DBTest2, OldStatsInterface) { DummyOldStats* dos = new DummyOldStats(); std::shared_ptr stats(dos); Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = stats; Reopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo")); ASSERT_GT(dos->num_rt, 0); ASSERT_GT(dos->num_mt, 0); } TEST_F(DBTest2, CloseWithUnreleasedSnapshot) { const Snapshot* ss = db_->GetSnapshot(); for (auto h : handles_) { db_->DestroyColumnFamilyHandle(h); } handles_.clear(); ASSERT_NOK(db_->Close()); db_->ReleaseSnapshot(ss); ASSERT_OK(db_->Close()); delete db_; db_ = nullptr; } TEST_F(DBTest2, PrefixBloomReseek) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewCappedPrefixTransform(3)); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); // 
Construct two L1 files with keys: // f1:[aaa1 ccc1] f2:[ddd0] ASSERT_OK(Put("aaa1", "")); ASSERT_OK(Put("ccc1", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("ddd0", "")); ASSERT_OK(Flush()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ASSERT_OK(Put("bbb1", "")); Iterator* iter = db_->NewIterator(ReadOptions()); ASSERT_OK(iter->status()); // Seeking into f1, the iterator will check the bloom filter, which reports // the file iterator as invalid, and the cursor will be placed in f2, with // the next key being "ddd0". iter->Seek("bbb1"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("bbb1", iter->key().ToString()); // Reseeking to ccc1, the L1 iterator needs to go back to f1 and reseek. iter->Seek("ccc1"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("ccc1", iter->key().ToString()); delete iter; } TEST_F(DBTest2, PrefixBloomFilteredOut) { Options options = CurrentOptions(); options.create_if_missing = true; options.prefix_extractor.reset(NewCappedPrefixTransform(3)); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); // This test is also the primary test for prefix_seek_opt_in_only for (bool opt_in : {false, true}) { options.prefix_seek_opt_in_only = opt_in; DestroyAndReopen(options); // Construct two L1 files with keys: // f1:[aaa1 ccc1] f2:[ddd0] ASSERT_OK(Put("aaa1", "")); ASSERT_OK(Put("ccc1", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("ddd0", "")); ASSERT_OK(Flush()); CompactRangeOptions cro; cro.bottommost_level_compaction = BottommostLevelCompaction::kSkip; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); ReadOptions ropts; for (bool same : {false, true}) { ropts.prefix_same_as_start = same; std::unique_ptr<Iterator> iter(db_->NewIterator(ropts)); ASSERT_OK(iter->status()); iter->Seek("bbb1"); ASSERT_OK(iter->status()); if (opt_in && !same) { // Unbounded total order seek ASSERT_TRUE(iter->Valid()); ASSERT_EQ(iter->key(), "ccc1"); } else { // The seek prefix is filtered out by f1's bloom filter. When same == false, this is just // one valid position following the contract. Positioning to ccc1 or ddd0 // is also valid. This is just to validate the behavior of the current // implementation. If the underlying implementation changes, the test might // fail here.
ASSERT_FALSE(iter->Valid()); } } } } TEST_F(DBTest2, RowCacheSnapshot) { Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); options.row_cache = NewLRUCache(8 * 8192); DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_OK(Put("foo", "bar2")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo2", "bar")); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_OK(Put("foo3", "bar")); const Snapshot* s3 = db_->GetSnapshot(); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 0); ASSERT_EQ(Get("foo"), "bar2"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 0); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); ASSERT_EQ(Get("foo"), "bar2"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 1); ASSERT_EQ(Get("foo", s1), "bar1"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 1); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); ASSERT_EQ(Get("foo", s2), "bar2"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 2); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); ASSERT_EQ(Get("foo", s1), "bar1"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 3); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); ASSERT_EQ(Get("foo", s3), "bar2"); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_HIT), 4); ASSERT_EQ(TestGetTickerCount(options, ROW_CACHE_MISS), 2); db_->ReleaseSnapshot(s1); db_->ReleaseSnapshot(s2); db_->ReleaseSnapshot(s3); } // When DB is reopened with multiple column families, the manifest file // is written after the first CF is flushed, and it is written again // after each flush. If DB crashes between the flushes, the flushed CF // flushed will pass the latest log file, and now we require it not // to be corrupted, and triggering a corruption report. // We need to fix the bug and enable the test. TEST_F(DBTest2, CrashInRecoveryMultipleCF) { const std::vector sync_points = { "DBImpl::RecoverLogFiles:BeforeFlushFinalMemtable", "VersionSet::ProcessManifestWrites:BeforeWriteLastVersionEdit:0"}; for (const auto& test_sync_point : sync_points) { Options options = CurrentOptions(); // First destroy original db to ensure a clean start. DestroyAndReopen(options); options.create_if_missing = true; options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Flush(1)); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put(1, "foo", "bar")); // The value is large enough to be divided to two blocks. std::string large_value(400, ' '); ASSERT_OK(Put("foo1", large_value)); ASSERT_OK(Put("foo2", large_value)); Close(); // Corrupt the log file in the middle, so that it is not corrupted // in the tail. std::vector filenames; ASSERT_OK(env_->GetChildren(dbname_, &filenames)); for (const auto& f : filenames) { uint64_t number; FileType type; if (ParseFileName(f, &number, &type) && type == FileType::kWalFile) { std::string fname = dbname_ + "/" + f; std::string file_content; ASSERT_OK(ReadFileToString(env_, fname, &file_content)); file_content[400] = 'h'; file_content[401] = 'a'; ASSERT_OK(WriteStringToFile(env_, file_content, fname, false)); break; } } // Reopen and freeze the file system after the first manifest write. 
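// Note (sketch of the mechanism this test relies on): FaultInjectionTestEnv
// wraps the real Env, and once SetFilesystemActive(false) runs inside the
// sync-point callback, subsequent writes through the wrapped Env fail.
// Firing that at the chosen sync point simulates a crash right after the
// first manifest write, so the recovery attempt below is expected to return
// a non-OK status.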
FaultInjectionTestEnv fit_env(options.env); options.env = &fit_env; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( test_sync_point, [&](void* /*arg*/) { fit_env.SetFilesystemActive(false); }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); ASSERT_NOK(TryReopenWithColumnFamilies( {kDefaultColumnFamilyName, "pikachu"}, options)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); fit_env.SetFilesystemActive(true); // If we continue using failure ingestion Env, it will conplain something // when renaming current file, which is not expected. Need to investigate // why. options.env = env_; ASSERT_OK(TryReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options)); } } TEST_F(DBTest2, SeekFileRangeDeleteTail) { Options options = CurrentOptions(); options.prefix_extractor.reset(NewCappedPrefixTransform(1)); options.num_levels = 3; DestroyAndReopen(options); ASSERT_OK(Put("a", "a")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_OK( db_->DeleteRange(WriteOptions(), db_->DefaultColumnFamily(), "a", "f")); ASSERT_OK(Put("b", "a")); ASSERT_OK(Flush()); ASSERT_OK(Put("x", "a")); ASSERT_OK(Put("z", "a")); ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); { ReadOptions ro; ro.total_order_seek = true; std::unique_ptr iter(db_->NewIterator(ro)); ASSERT_OK(iter->status()); iter->Seek("e"); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("x", iter->key().ToString()); } db_->ReleaseSnapshot(s1); } TEST_F(DBTest2, BackgroundPurgeTest) { Options options = CurrentOptions(); options.write_buffer_manager = std::make_shared(1 << 20); options.avoid_unnecessary_blocking_io = true; DestroyAndReopen(options); size_t base_value = options.write_buffer_manager->memory_usage(); ASSERT_OK(Put("a", "a")); Iterator* iter = db_->NewIterator(ReadOptions()); ASSERT_OK(iter->status()); ASSERT_OK(Flush()); size_t value = options.write_buffer_manager->memory_usage(); ASSERT_GT(value, base_value); db_->GetEnv()->SetBackgroundThreads(1, Env::Priority::HIGH); test::SleepingBackgroundTask sleeping_task_after; db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_after, Env::Priority::HIGH); delete iter; Env::Default()->SleepForMicroseconds(100000); value = options.write_buffer_manager->memory_usage(); ASSERT_GT(value, base_value); sleeping_task_after.WakeUp(); sleeping_task_after.WaitUntilDone(); test::SleepingBackgroundTask sleeping_task_after2; db_->GetEnv()->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_after2, Env::Priority::HIGH); sleeping_task_after2.WakeUp(); sleeping_task_after2.WaitUntilDone(); value = options.write_buffer_manager->memory_usage(); ASSERT_EQ(base_value, value); } TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) { Options options = CurrentOptions(); DestroyAndReopen(options); options.max_manifest_file_size = 10; options.create_if_missing = true; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(2, handles_.size()); ASSERT_OK(Put("foo", "value")); const int kL0Files = options.level0_file_num_compaction_trigger; for (int i = 0; i < kL0Files; ++i) { ASSERT_OK(Put(/*cf=*/1, "a", std::to_string(i))); ASSERT_OK(Flush(/*cf=*/1)); } port::Thread thread([&]() { ASSERT_OK(Flush()); }); ASSERT_OK(dbfull()->TEST_WaitForCompact()); thread.join(); } TEST_F(DBTest2, SameSmallestInSameLevel) { // This test validates fractional casacading logic when several files at 
one // one level only contains the same user key. Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreateStringAppendOperator(); DestroyAndReopen(options); ASSERT_OK(Put("key", "1")); ASSERT_OK(Put("key", "2")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "3")); ASSERT_OK(db_->Merge(WriteOptions(), "key", "4")); ASSERT_OK(Flush()); CompactRangeOptions cro; cro.change_level = true; cro.target_level = 2; ASSERT_OK(dbfull()->CompactRange(cro, db_->DefaultColumnFamily(), nullptr, nullptr)); ASSERT_OK(db_->Merge(WriteOptions(), "key", "5")); ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "6")); ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "7")); ASSERT_OK(Flush()); ASSERT_OK(db_->Merge(WriteOptions(), "key", "8")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("0,4,1", FilesPerLevel()); ASSERT_EQ("2,3,4,5,6,7,8", Get("key")); } TEST_F(DBTest2, FileConsistencyCheckInOpen) { ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { Status* ret_s = static_cast(arg); *ret_s = Status::Corruption("fcc"); }); SyncPoint::GetInstance()->EnableProcessing(); Options options = CurrentOptions(); options.force_consistency_checks = true; ASSERT_NOK(TryReopen(options)); SyncPoint::GetInstance()->DisableProcessing(); } TEST_F(DBTest2, BlockBasedTablePrefixIndexSeekForPrev) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); table_options.block_size = 300; table_options.index_type = BlockBasedTableOptions::kHashSearch; table_options.index_shortening = BlockBasedTableOptions::IndexShorteningMode::kNoShortening; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); Reopen(options); Random rnd(301); std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); ASSERT_OK(Flush()); { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); ASSERT_OK(iterator->status()); iterator->SeekForPrev("x3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); iterator->SeekForPrev("a3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); iterator->SeekForPrev("y3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); // Query more than one non-existing prefix to cover the case both // of empty hash bucket and hash bucket conflict. iterator->SeekForPrev("b1"); // Result should be not valid or "a1". if (iterator->Valid()) { ASSERT_EQ("a1", iterator->key().ToString()); } iterator->SeekForPrev("c1"); // Result should be not valid or "a1". if (iterator->Valid()) { ASSERT_EQ("a1", iterator->key().ToString()); } iterator->SeekForPrev("d1"); // Result should be not valid or "a1". 
if (iterator->Valid()) { ASSERT_EQ("a1", iterator->key().ToString()); } iterator->SeekForPrev("y3"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); } } TEST_F(DBTest2, PartitionedIndexPrefetchFailure) { Options options = last_options_; options.env = env_; options.max_open_files = 20; BlockBasedTableOptions bbto; bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; bbto.metadata_block_size = 128; bbto.block_size = 128; bbto.block_cache = NewLRUCache(16777216); bbto.cache_index_and_filter_blocks = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); // Force no table cache so every read will preload the SST file. dbfull()->TEST_table_cache()->SetCapacity(0); bbto.block_cache->SetCapacity(0); Random rnd(301); for (int i = 0; i < 4096; i++) { ASSERT_OK(Put(Key(i), rnd.RandomString(32))); } ASSERT_OK(Flush()); // Try different random failures in table open for 300 times. for (int i = 0; i < 300; i++) { env_->num_reads_fails_ = 0; env_->rand_reads_fail_odd_ = 8; std::string value; Status s = dbfull()->Get(ReadOptions(), Key(1), &value); if (env_->num_reads_fails_ > 0) { ASSERT_NOK(s); } else { ASSERT_OK(s); } } env_->rand_reads_fail_odd_ = 0; } TEST_F(DBTest2, ChangePrefixExtractor) { for (bool use_partitioned_filter : {true, false}) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); options.prefix_seek_opt_in_only = false; // Use legacy prefix seek // Sometimes filter is checked based on upper bound. Assert counters // for that case. Otherwise, only check data correctness. bool expect_filter_check = !use_partitioned_filter; table_options.partition_filters = use_partitioned_filter; if (use_partitioned_filter) { table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; } table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.statistics = CreateDBStatistics(); options.prefix_extractor.reset(NewFixedPrefixTransform(2)); DestroyAndReopen(options); Random rnd(301); ASSERT_OK(Put("aa", "")); ASSERT_OK(Put("xb", "")); ASSERT_OK(Put("xx1", "")); ASSERT_OK(Put("xz1", "")); ASSERT_OK(Put("zz", "")); ASSERT_OK(Flush()); // After reopening DB with prefix size 2 => 1, prefix extractor // won't take effective unless it won't change results based // on upper bound and seek key. options.prefix_extractor.reset(NewFixedPrefixTransform(1)); Reopen(options); { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); ASSERT_OK(iterator->status()); iterator->Seek("xa"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xz"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xz1", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } std::string ub_str = "xg9"; Slice ub(ub_str); ReadOptions ro; ro.iterate_upper_bound = &ub; { std::unique_ptr iterator(db_->NewIterator(ro)); ASSERT_OK(iterator->status()); // SeekForPrev() never uses prefix bloom if it is changed. 
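// Note: since the prefix extractor changed (2 bytes -> 1 byte) after these
// files were written, the SeekForPrev below does not consult the prefix
// bloom filter at all, so the assertion expects zero filter-match ticks for
// this query.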
iterator->SeekForPrev("xg0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } ub_str = "xx9"; ub = Slice(ub_str); { std::unique_ptr iterator(db_->NewIterator(ro)); ASSERT_OK(iterator->status()); iterator->Seek("x"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(0, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xx0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } } CompactRangeOptions compact_range_opts; compact_range_opts.bottommost_level_compaction = BottommostLevelCompaction::kForce; ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); ASSERT_OK(db_->CompactRange(compact_range_opts, nullptr, nullptr)); // Re-execute similar queries after a full compaction { std::unique_ptr iterator(db_->NewIterator(ReadOptions())); iterator->Seek("x"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xg"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xz"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xz1", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); } { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->SeekForPrev("xx0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } iterator->Seek("xx0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xx1", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); } ub_str = "xg9"; ub = Slice(ub_str); { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->SeekForPrev("xg0"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("xb", iterator->key().ToString()); if (expect_filter_check) { EXPECT_EQ(1, PopTicker(options, NON_LAST_LEVEL_SEEK_FILTER_MATCH)); } ASSERT_OK(iterator->status()); } } } TEST_F(DBTest2, BlockBasedTablePrefixGetIndexNotFound) { // create a DB with block prefix index BlockBasedTableOptions table_options; Options options = CurrentOptions(); table_options.block_size = 300; table_options.index_type = BlockBasedTableOptions::kHashSearch; table_options.index_shortening = BlockBasedTableOptions::IndexShorteningMode::kNoShortening; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.level0_file_num_compaction_trigger = 8; Reopen(options); ASSERT_OK(Put("b1", "ok")); ASSERT_OK(Flush()); // Flushing several files so that the chance that hash bucket // is empty fo "b" in at least one of the files is high. 
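// Note: each flush below produces a small file containing only "aN" and
// "cN" keys, so with the kHashSearch index and the 1-byte prefix extractor
// the hash bucket for prefix "b" is likely empty in at least one of those
// files. Get("b1") must still return "ok" from the first file, exercising
// the index-not-found path.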
ASSERT_OK(Put("a1", "")); ASSERT_OK(Put("c1", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("a2", "")); ASSERT_OK(Put("c2", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("a3", "")); ASSERT_OK(Put("c3", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("a4", "")); ASSERT_OK(Put("c4", "")); ASSERT_OK(Flush()); ASSERT_OK(Put("a5", "")); ASSERT_OK(Put("c5", "")); ASSERT_OK(Flush()); ASSERT_EQ("ok", Get("b1")); } TEST_F(DBTest2, AutoPrefixMode1) { do { // create a DB with block prefix index Options options = CurrentOptions(); BlockBasedTableOptions table_options = *options.table_factory->GetOptions(); table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); options.statistics = CreateDBStatistics(); Reopen(options); Random rnd(301); std::string large_value = rnd.RandomString(500); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); ASSERT_OK(Flush()); ReadOptions ro; ro.total_order_seek = false; ro.auto_prefix_mode = true; const auto hit_stat = options.num_levels == 1 ? LAST_LEVEL_SEEK_FILTER_MATCH : NON_LAST_LEVEL_SEEK_FILTER_MATCH; const auto miss_stat = options.num_levels == 1 ? LAST_LEVEL_SEEK_FILTERED : NON_LAST_LEVEL_SEEK_FILTERED; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } Slice ub; ro.iterate_upper_bound = &ub; ub = "b9"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } ub = "z"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } ub = "c"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } ub = "c1"; { std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } // The same queries without recreating iterator { std::unique_ptr iterator(db_->NewIterator(ro)); ub = "b9"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); ub = "z"; iterator->Seek("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("x1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "c"; iterator->Seek("b1"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ub = "b9"; 
iterator->SeekForPrev("b1"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "zz"; iterator->SeekToLast(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); iterator->SeekToFirst(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); } // Similar, now with reverse comparator // Technically, we are violating axiom 2 of prefix_extractors, but // it should be revised because of major use-cases using // ReverseBytewiseComparator with capped/fixed prefix Seek. (FIXME) options.comparator = ReverseBytewiseComparator(); options.prefix_extractor.reset(NewFixedPrefixTransform(1)); DestroyAndReopen(options); ASSERT_OK(Put("a1", large_value)); ASSERT_OK(Put("x1", large_value)); ASSERT_OK(Put("y1", large_value)); ASSERT_OK(Flush()); { std::unique_ptr iterator(db_->NewIterator(ro)); ub = "b1"; iterator->Seek("b9"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); ub = "b1"; iterator->Seek("z"); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b1"; iterator->Seek("c"); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b"; iterator->Seek("c9"); ASSERT_FALSE(iterator->Valid()); // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // is "correctly" implemented. EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "a"; iterator->Seek("b9"); // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // is "correctly" implemented. ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b"; iterator->Seek("a"); ASSERT_FALSE(iterator->Valid()); // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // matches BytewiseComparator::IsSameLengthImmediateSuccessor. Upper // comparing before seek key prevents a real bug from surfacing. EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "b1"; iterator->SeekForPrev("b9"); ASSERT_TRUE(iterator->Valid()); // Fails if ReverseBytewiseComparator::IsSameLengthImmediateSuccessor // is "correctly" implemented. ASSERT_EQ("x1", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ub = "a"; iterator->SeekToLast(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("a1", iterator->key().ToString()); iterator->SeekToFirst(); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("y1", iterator->key().ToString()); } // Now something a bit different, related to "short" keys that // auto_prefix_mode can omit. See "BUG" section of auto_prefix_mode. options.comparator = BytewiseComparator(); for (const auto config : {"fixed:2", "capped:2"}) { ASSERT_OK(SliceTransform::CreateFromString(ConfigOptions(), config, &options.prefix_extractor)); // FIXME: kHashSearch, etc. 
requires all keys be InDomain if (StartsWith(config, "fixed") && (table_options.index_type == BlockBasedTableOptions::kHashSearch || StartsWith(options.memtable_factory->Name(), "Hash"))) { continue; } DestroyAndReopen(options); const char* a_end_stuff = "a\xffXYZ"; const char* b_begin_stuff = "b\x00XYZ"; ASSERT_OK(Put("a", large_value)); ASSERT_OK(Put("b", large_value)); ASSERT_OK(Put(Slice(b_begin_stuff, 3), large_value)); ASSERT_OK(Put("c", large_value)); ASSERT_OK(Flush()); // control showing valid optimization with auto_prefix mode ub = Slice(a_end_stuff, 4); ro.iterate_upper_bound = &ub; std::unique_ptr iterator(db_->NewIterator(ro)); iterator->Seek(Slice(a_end_stuff, 2)); ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); // test, cannot be validly optimized with auto_prefix_mode ub = Slice(b_begin_stuff, 2); ro.iterate_upper_bound = &ub; iterator->Seek(Slice(a_end_stuff, 2)); // !!! BUG !!! See "BUG" section of auto_prefix_mode. ASSERT_FALSE(iterator->Valid()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(1, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); // To prove that is the wrong result, now use total order seek ReadOptions tos_ro = ro; tos_ro.total_order_seek = true; tos_ro.auto_prefix_mode = false; iterator.reset(db_->NewIterator(tos_ro)); iterator->Seek(Slice(a_end_stuff, 2)); ASSERT_TRUE(iterator->Valid()); ASSERT_EQ("b", iterator->key().ToString()); EXPECT_EQ(0, TestGetAndResetTickerCount(options, hit_stat)); EXPECT_EQ(0, TestGetAndResetTickerCount(options, miss_stat)); ASSERT_OK(iterator->status()); } } while (ChangeOptions(kSkipPlainTable)); } class RenameCurrentTest : public DBTestBase, public testing::WithParamInterface { public: RenameCurrentTest() : DBTestBase("rename_current_test", /*env_do_fsync=*/true), sync_point_(GetParam()) {} ~RenameCurrentTest() override = default; void SetUp() override { env_->no_file_overwrite_.store(true, std::memory_order_release); } void TearDown() override { env_->no_file_overwrite_.store(false, std::memory_order_release); } void SetupSyncPoints() { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->SetCallBack(sync_point_, [&](void* arg) { Status* s = static_cast(arg); assert(s); *s = Status::IOError("Injected IO error."); }); } const std::string sync_point_; }; INSTANTIATE_TEST_CASE_P(DistributedFS, RenameCurrentTest, ::testing::Values("SetCurrentFile:BeforeRename", "SetCurrentFile:AfterRename")); TEST_P(RenameCurrentTest, Open) { Destroy(last_options_); Options options = GetDefaultOptions(); options.create_if_missing = true; SetupSyncPoints(); SyncPoint::GetInstance()->EnableProcessing(); Status s = TryReopen(options); ASSERT_NOK(s); SyncPoint::GetInstance()->DisableProcessing(); Reopen(options); } TEST_P(RenameCurrentTest, Flush) { Destroy(last_options_); Options options = GetDefaultOptions(); options.max_manifest_file_size = 1; options.create_if_missing = true; Reopen(options); ASSERT_OK(Put("key", "value")); SetupSyncPoints(); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_NOK(Flush()); ASSERT_NOK(Put("foo", "value")); SyncPoint::GetInstance()->DisableProcessing(); Reopen(options); ASSERT_EQ("value", Get("key")); ASSERT_EQ("NOT_FOUND", Get("foo")); } TEST_P(RenameCurrentTest, Compaction) { Destroy(last_options_); Options options = GetDefaultOptions(); options.max_manifest_file_size = 1; options.create_if_missing 
= true; Reopen(options); ASSERT_OK(Put("a", "a_value")); ASSERT_OK(Put("c", "c_value")); ASSERT_OK(Flush()); ASSERT_OK(Put("b", "b_value")); ASSERT_OK(Put("d", "d_value")); ASSERT_OK(Flush()); SetupSyncPoints(); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_NOK(db_->CompactRange(CompactRangeOptions(), /*begin=*/nullptr, /*end=*/nullptr)); ASSERT_NOK(Put("foo", "value")); SyncPoint::GetInstance()->DisableProcessing(); Reopen(options); ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ("d_value", Get("d")); } TEST_F(DBTest2, VariousFileTemperatures) { constexpr size_t kNumberFileTypes = static_cast(kBlobFile) + 1U; struct MyTestFS : public FileTemperatureTestFS { explicit MyTestFS(const std::shared_ptr& fs) : FileTemperatureTestFS(fs) { Reset(); } IOStatus NewWritableFile(const std::string& fname, const FileOptions& opts, std::unique_ptr* result, IODebugContext* dbg) override { IOStatus ios = FileTemperatureTestFS::NewWritableFile(fname, opts, result, dbg); if (ios.ok()) { uint64_t number; FileType type; if (ParseFileName(GetFileName(fname), &number, "LOG", &type)) { if (type == kTableFile) { // Not checked here } else if (type == kWalFile) { if (opts.temperature != expected_wal_temperature) { std::cerr << "Attempt to open " << fname << " with temperature " << temperature_to_string[opts.temperature] << " rather than " << temperature_to_string[expected_wal_temperature] << std::endl; assert(false); } } else if (type == kDescriptorFile) { if (opts.temperature != expected_manifest_temperature) { std::cerr << "Attempt to open " << fname << " with temperature " << temperature_to_string[opts.temperature] << " rather than " << temperature_to_string[expected_wal_temperature] << std::endl; assert(false); } } else if (opts.temperature != expected_other_metadata_temperature) { std::cerr << "Attempt to open " << fname << " with temperature " << temperature_to_string[opts.temperature] << " rather than " << temperature_to_string[expected_wal_temperature] << std::endl; assert(false); } UpdateCount(type, 1); } } return ios; } IOStatus RenameFile(const std::string& src, const std::string& dst, const IOOptions& options, IODebugContext* dbg) override { IOStatus ios = FileTemperatureTestFS::RenameFile(src, dst, options, dbg); if (ios.ok()) { uint64_t number; FileType src_type; FileType dst_type; assert(ParseFileName(GetFileName(src), &number, "LOG", &src_type)); assert(ParseFileName(GetFileName(dst), &number, "LOG", &dst_type)); UpdateCount(src_type, -1); UpdateCount(dst_type, 1); } return ios; } void UpdateCount(FileType type, int delta) { size_t i = static_cast(type); assert(i < kNumberFileTypes); counts[i].FetchAddRelaxed(delta); } std::map PopCounts() { std::map ret; for (size_t i = 0; i < kNumberFileTypes; ++i) { int c = counts[i].ExchangeRelaxed(0); if (c > 0) { ret[static_cast(i)] = c; } } return ret; } FileOptions OptimizeForLogWrite( const FileOptions& file_options, const DBOptions& /*db_options*/) const override { FileOptions opts = file_options; if (optimize_wal_temperature != Temperature::kUnknown) { opts.temperature = optimize_wal_temperature; } return opts; } FileOptions OptimizeForManifestWrite( const FileOptions& file_options) const override { FileOptions opts = file_options; if (optimize_manifest_temperature != Temperature::kUnknown) { opts.temperature = optimize_manifest_temperature; } return opts; } void Reset() { optimize_manifest_temperature = Temperature::kUnknown; optimize_wal_temperature = Temperature::kUnknown; expected_manifest_temperature = Temperature::kUnknown; 
expected_other_metadata_temperature = Temperature::kUnknown; expected_wal_temperature = Temperature::kUnknown; for (auto& c : counts) { c.StoreRelaxed(0); } } Temperature optimize_manifest_temperature; Temperature optimize_wal_temperature; Temperature expected_manifest_temperature; Temperature expected_other_metadata_temperature; Temperature expected_wal_temperature; std::array, kNumberFileTypes> counts; }; // We don't have enough non-unknown temps to confidently distinguish that // a specific setting caused a specific outcome, in a single run. This is a // reasonable work-around without blowing up test time. Only returns // non-unknown temperatures. auto RandomTemp = [] { static std::vector temps = { Temperature::kHot, Temperature::kWarm, Temperature::kCold}; return temps[Random::GetTLSInstance()->Uniform( static_cast(temps.size()))]; }; auto test_fs = std::make_shared(env_->GetFileSystem()); std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); for (bool use_optimize : {false, true}) { std::cerr << "use_optimize: " << std::to_string(use_optimize) << std::endl; for (bool use_temp_options : {false, true}) { std::cerr << "use_temp_options: " << std::to_string(use_temp_options) << std::endl; Options options = CurrentOptions(); // Currently require for last level temperature options.compaction_style = kCompactionStyleUniversal; options.env = env.get(); test_fs->Reset(); if (use_optimize) { test_fs->optimize_manifest_temperature = RandomTemp(); test_fs->expected_manifest_temperature = test_fs->optimize_manifest_temperature; test_fs->optimize_wal_temperature = RandomTemp(); test_fs->expected_wal_temperature = test_fs->optimize_wal_temperature; } if (use_temp_options) { options.metadata_write_temperature = RandomTemp(); test_fs->expected_manifest_temperature = options.metadata_write_temperature; test_fs->expected_other_metadata_temperature = options.metadata_write_temperature; options.wal_write_temperature = RandomTemp(); test_fs->expected_wal_temperature = options.wal_write_temperature; options.last_level_temperature = RandomTemp(); options.default_write_temperature = RandomTemp(); } DestroyAndReopen(options); Defer closer([&] { Close(); }); using FTC = std::map; // Files on DB startup ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, {kDescriptorFile, 2}, {kCurrentFile, 2}, {kIdentityFile, 1}, {kOptionsFile, 1}})); // Temperature count map using TCM = std::map; ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({})); ASSERT_OK(Put("foo", "1")); ASSERT_OK(Put("bar", "1")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo", "2")); ASSERT_OK(Put("bar", "2")); ASSERT_OK(Flush()); ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({{options.default_write_temperature, 2}})); ASSERT_OK(db_->CompactRange({}, nullptr, nullptr)); ASSERT_OK(dbfull()->TEST_WaitForBackgroundWork()); ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), TCM({{options.last_level_temperature, 1}})); ASSERT_OK(Put("foo", "3")); ASSERT_OK(Put("bar", "3")); ASSERT_OK(Flush()); // Just in memtable/WAL ASSERT_OK(Put("dog", "3")); { TCM expected; expected[options.default_write_temperature] += 1; expected[options.last_level_temperature] += 1; ASSERT_EQ(test_fs->CountCurrentSstFilesByTemp(), expected); } // New files during operation ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 3}, {kTableFile, 4}})); Reopen(options); // New files during re-open/recovery ASSERT_EQ(test_fs->PopCounts(), FTC({{kWalFile, 1}, {kTableFile, 1}, {kDescriptorFile, 1}, {kCurrentFile, 1}, {kOptionsFile, 1}})); Destroy(options); } } } TEST_F(DBTest2, 
LastLevelTemperature) { class TestListener : public EventListener { public: void OnFileReadFinish(const FileOperationInfo& info) override { UpdateFileTemperature(info); } void OnFileWriteFinish(const FileOperationInfo& info) override { UpdateFileTemperature(info); } void OnFileFlushFinish(const FileOperationInfo& info) override { UpdateFileTemperature(info); } void OnFileSyncFinish(const FileOperationInfo& info) override { UpdateFileTemperature(info); } void OnFileCloseFinish(const FileOperationInfo& info) override { UpdateFileTemperature(info); } bool ShouldBeNotifiedOnFileIO() override { return true; } std::unordered_map file_temperatures; private: void UpdateFileTemperature(const FileOperationInfo& info) { auto filename = GetFileName(info.path); uint64_t number; FileType type; ASSERT_TRUE(ParseFileName(filename, &number, &type)); if (type == kTableFile) { MutexLock l(&mutex_); auto ret = file_temperatures.insert({number, info.temperature}); if (!ret.second) { // the same file temperature should always be the same for all events ASSERT_TRUE(ret.first->second == info.temperature); } } } std::string GetFileName(const std::string& fname) { auto filename = fname.substr(fname.find_last_of(kFilePathSeparator) + 1); // workaround only for Windows that the file path could contain both // Windows FilePathSeparator and '/' filename = filename.substr(filename.find_last_of('/') + 1); return filename; } port::Mutex mutex_; }; const int kNumLevels = 7; const int kLastLevel = kNumLevels - 1; auto* listener = new TestListener(); Options options = CurrentOptions(); options.last_level_temperature = Temperature::kWarm; options.level0_file_num_compaction_trigger = 2; options.level_compaction_dynamic_level_bytes = true; options.num_levels = kNumLevels; options.statistics = CreateDBStatistics(); options.listeners.emplace_back(listener); Reopen(options); auto size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); get_iostats_context()->Reset(); IOStatsContext* iostats = get_iostats_context(); ColumnFamilyMetaData metadata; db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); SstFileMetaData meta = metadata.levels[kLastLevel].files[0]; ASSERT_EQ(Temperature::kWarm, meta.temperature); uint64_t number; FileType type; ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ("bar", Get("foo")); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); 
ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); // non-bottommost file still has unknown temperature ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("bar")); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 1); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); ASSERT_GT(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); meta = metadata.levels[0].files[0]; ASSERT_EQ(Temperature::kUnknown, meta.temperature); ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); meta = metadata.levels[kLastLevel].files[0]; ASSERT_EQ(Temperature::kWarm, meta.temperature); ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); // reopen and check the information is persisted Reopen(options); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); meta = metadata.levels[0].files[0]; ASSERT_EQ(Temperature::kUnknown, meta.temperature); ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); meta = metadata.levels[kLastLevel].files[0]; ASSERT_EQ(Temperature::kWarm, meta.temperature); ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); // check other non-exist temperatures size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kCold); ASSERT_EQ(size, 0); std::string prop; ASSERT_TRUE(dbfull()->GetProperty( DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), &prop)); ASSERT_EQ(std::atoi(prop.c_str()), 0); Reopen(options); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); meta = metadata.levels[0].files[0]; ASSERT_EQ(Temperature::kUnknown, meta.temperature); 
ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); meta = metadata.levels[kLastLevel].files[0]; ASSERT_EQ(Temperature::kWarm, meta.temperature); ASSERT_TRUE(ParseFileName(meta.name, &number, &type)); ASSERT_EQ(listener->file_temperatures.at(number), meta.temperature); } TEST_F(DBTest2, LastLevelTemperatureUniversal) { const int kTriggerNum = 3; const int kNumLevels = 5; const int kBottommostLevel = kNumLevels - 1; Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.level0_file_num_compaction_trigger = kTriggerNum; options.num_levels = kNumLevels; options.statistics = CreateDBStatistics(); DestroyAndReopen(options); auto size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); get_iostats_context()->Reset(); IOStatsContext* iostats = get_iostats_context(); for (int i = 0; i < kTriggerNum; i++) { ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); } ASSERT_OK(dbfull()->TEST_WaitForCompact()); ColumnFamilyMetaData metadata; db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[kBottommostLevel].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_read_count, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); ASSERT_EQ("bar", Get("foo")); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_read_count, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.hot_file_bytes_read, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.warm_file_bytes_read, 0); ASSERT_EQ(iostats->file_io_stats_by_temperature.cold_file_bytes_read, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); // Update last level temperature options.last_level_temperature = 
Temperature::kWarm; Reopen(options); db_->GetColumnFamilyMetaData(&metadata); // Should not impact existing ones ASSERT_EQ(Temperature::kUnknown, metadata.levels[kBottommostLevel].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); // new generated file should have the new settings ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); ASSERT_EQ(Temperature::kWarm, metadata.levels[kBottommostLevel].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(HOT_FILE_READ_COUNT), 0); ASSERT_GT(options.statistics->getTickerCount(WARM_FILE_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(COLD_FILE_READ_COUNT), 0); // non-bottommost file still has unknown temperature ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(2, metadata.file_count); ASSERT_EQ(Temperature::kUnknown, metadata.levels[0].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); // check other non-exist temperatures size = GetSstSizeHelper(Temperature::kHot); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kCold); ASSERT_EQ(size, 0); std::string prop; ASSERT_TRUE(dbfull()->GetProperty( DB::Properties::kLiveSstFilesSizeAtTemperature + std::to_string(22), &prop)); ASSERT_EQ(std::atoi(prop.c_str()), 0); // Update last level temperature dynamically with SetOptions auto s = db_->SetOptions({{"last_level_temperature", "kCold"}}); ASSERT_OK(s); ASSERT_EQ(db_->GetOptions().last_level_temperature, Temperature::kCold); db_->GetColumnFamilyMetaData(&metadata); // Should not impact the existing files ASSERT_EQ(Temperature::kWarm, metadata.levels[kBottommostLevel].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); size = GetSstSizeHelper(Temperature::kCold); ASSERT_EQ(size, 0); // new generated files should have the new settings ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); db_->GetColumnFamilyMetaData(&metadata); ASSERT_EQ(1, metadata.file_count); ASSERT_EQ(Temperature::kCold, metadata.levels[kBottommostLevel].files[0].temperature); size = GetSstSizeHelper(Temperature::kUnknown); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kWarm); ASSERT_EQ(size, 0); size = GetSstSizeHelper(Temperature::kCold); ASSERT_GT(size, 0); // kLastTemperature is an invalid temperature options.last_level_temperature = Temperature::kLastTemperature; s = TryReopen(options); ASSERT_TRUE(s.IsIOError()); } TEST_F(DBTest2, LastLevelStatistics) { for (bool write_time_default : {false, true}) { SCOPED_TRACE("write time default? 
" + std::to_string(write_time_default)); Options options = CurrentOptions(); options.last_level_temperature = Temperature::kWarm; if (write_time_default) { options.default_write_temperature = Temperature::kHot; ASSERT_EQ(options.default_temperature, Temperature::kUnknown); } else { options.default_temperature = Temperature::kHot; ASSERT_EQ(options.default_write_temperature, Temperature::kUnknown); } options.level0_file_num_compaction_trigger = 2; options.level_compaction_dynamic_level_bytes = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions bbto; bbto.no_block_cache = true; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); DestroyAndReopen(options); // generate 1 sst on level 0 ASSERT_OK(Put("foo1", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("bar")); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), 0); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), 0); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), 0); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), 0); // 2nd flush to trigger compaction ASSERT_OK(Put("foo2", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_EQ("bar", Get("bar")); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); auto pre_bytes = options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES); auto pre_count = options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT); // 3rd flush to generate 1 sst on level 0 ASSERT_OK(Put("foo3", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_EQ("bar", Get("foo1")); ASSERT_EQ("bar", Get("foo2")); ASSERT_EQ("bar", Get("foo3")); ASSERT_EQ("bar", Get("bar")); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), pre_bytes); ASSERT_GT(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), pre_count); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); // Control ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT)); // Not a realistic setting to make last level kWarm and default temp kCold. 
// This is just for testing default temp can be reset on reopen while the // last level temp is consistent across DB reopen because those file's temp // are persisted in manifest. options.default_temperature = Temperature::kCold; ASSERT_OK(options.statistics->Reset()); Reopen(options); ASSERT_EQ("bar", Get("foo1")); ASSERT_EQ("bar", Get("foo2")); ASSERT_EQ("bar", Get("foo3")); ASSERT_EQ("bar", Get("bar")); if (write_time_default) { // Unchanged ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(HOT_FILE_READ_COUNT)); ASSERT_LT(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_EQ(0, options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); } else { // Changed (in how we map kUnknown) ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(COLD_FILE_READ_COUNT)); ASSERT_EQ(0, options.statistics->getTickerCount(HOT_FILE_READ_BYTES)); ASSERT_LT(0, options.statistics->getTickerCount(COLD_FILE_READ_BYTES)); } ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_BYTES), options.statistics->getTickerCount(WARM_FILE_READ_BYTES)); ASSERT_EQ(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(WARM_FILE_READ_COUNT)); // Control ASSERT_NE(options.statistics->getTickerCount(LAST_LEVEL_READ_COUNT), options.statistics->getTickerCount(NON_LAST_LEVEL_READ_COUNT)); } } TEST_F(DBTest2, CheckpointFileTemperature) { class NoLinkTestFS : public FileTemperatureTestFS { using FileTemperatureTestFS::FileTemperatureTestFS; IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, IODebugContext*) override { // return not supported to force checkpoint copy the file instead of just // link return IOStatus::NotSupported(); } }; auto test_fs = std::make_shared(env_->GetFileSystem()); std::unique_ptr env(new CompositeEnvWrapper(env_, test_fs)); Options options = CurrentOptions(); options.last_level_temperature = Temperature::kWarm; // set dynamic_level to true so the compaction would compact the data to the // last level directly which will have the last_level_temperature options.level_compaction_dynamic_level_bytes = true; options.level0_file_num_compaction_trigger = 2; options.env = env.get(); Reopen(options); // generate a bottommost file and a non-bottommost file ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Put("bar", "bar")); ASSERT_OK(Flush()); auto size = GetSstSizeHelper(Temperature::kWarm); ASSERT_GT(size, 0); std::map temperatures; std::vector infos; ASSERT_OK( dbfull()->GetLiveFilesStorageInfo(LiveFilesStorageInfoOptions(), &infos)); for (const auto& info : infos) { temperatures.emplace(info.file_number, info.temperature); } test_fs->PopRequestedSstFileTemperatures(); Checkpoint* checkpoint; ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); ASSERT_OK( checkpoint->CreateCheckpoint(dbname_ + kFilePathSeparator + "tempcp")); // checking src file src_temperature hints: 2 sst files: 1 sst is kWarm, // another is kUnknown std::vector> requested_temps; 
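  // Each element pairs an SST file number with the temperature hint the
  // checkpoint passed to the FileSystem when it read that file; the loop
  // below checks those hints against the manifest-recorded temperatures
  // collected into `temperatures` above.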
  test_fs->PopRequestedSstFileTemperatures(&requested_temps);
  // Two requests
  ASSERT_EQ(requested_temps.size(), 2);
  std::set<uint64_t> distinct_requests;
  for (const auto& requested_temp : requested_temps) {
    // Matching manifest temperatures
    ASSERT_EQ(temperatures.at(requested_temp.first), requested_temp.second);
    distinct_requests.insert(requested_temp.first);
  }
  // Each request to distinct file
  ASSERT_EQ(distinct_requests.size(), requested_temps.size());

  delete checkpoint;
  Close();
}

TEST_F(DBTest2, FileTemperatureManifestFixup) {
  auto test_fs =
      std::make_shared<FileTemperatureTestFS>(env_->GetFileSystem());
  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, test_fs));
  Options options = CurrentOptions();
  options.last_level_temperature = Temperature::kWarm;
  // set dynamic_level to true so the compaction would compact the data to the
  // last level directly which will have the last_level_temperature
  options.level_compaction_dynamic_level_bytes = true;
  options.level0_file_num_compaction_trigger = 2;
  options.env = env.get();

  std::vector<std::string> cfs = {/*"default",*/ "test1", "test2"};
  CreateAndReopenWithCF(cfs, options);
  // Needed for later re-opens (weird)
  cfs.insert(cfs.begin(), kDefaultColumnFamilyName);

  // Generate a bottommost file in all CFs
  for (int cf = 0; cf < 3; ++cf) {
    ASSERT_OK(Put(cf, "a", "val"));
    ASSERT_OK(Put(cf, "c", "val"));
    ASSERT_OK(Flush(cf));
    ASSERT_OK(Put(cf, "b", "val"));
    ASSERT_OK(Put(cf, "d", "val"));
    ASSERT_OK(Flush(cf));
  }

  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  // verify
  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Generate a non-bottommost file in all CFs
  for (int cf = 0; cf < 3; ++cf) {
    ASSERT_OK(Put(cf, "e", "val"));
    ASSERT_OK(Flush(cf));
  }

  // re-verify
  ASSERT_GT(GetSstSizeHelper(Temperature::kWarm), 0);
  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Now change FS temperature on bottommost file(s) to kCold
  std::map<uint64_t, Temperature> current_temps;
  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
  for (auto e : current_temps) {
    if (e.second == Temperature::kWarm) {
      test_fs->OverrideSstFileTemperature(e.first, Temperature::kCold);
    }
  }

  // Metadata not yet updated
  ASSERT_EQ(Get("a"), "val");
  ASSERT_EQ(GetSstSizeHelper(Temperature::kCold), 0);

  // Update with Close and UpdateManifestForFilesState, but first save cf
  // descriptors
  std::vector<ColumnFamilyDescriptor> column_families;
  for (size_t i = 0; i < handles_.size(); ++i) {
    ColumnFamilyDescriptor cfdescriptor;
    handles_[i]->GetDescriptor(&cfdescriptor).PermitUncheckedError();
    column_families.push_back(cfdescriptor);
  }
  Close();
  experimental::UpdateManifestForFilesStateOptions update_opts;
  update_opts.update_temperatures = true;
  ASSERT_OK(experimental::UpdateManifestForFilesState(
      options, dbname_, column_families, update_opts));

  // Re-open and re-verify after update
  ReopenWithColumnFamilies(cfs, options);
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
  // Not supported: ASSERT_GT(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kHot), 0);

  // Change kUnknown to kHot
  test_fs->CopyCurrentSstFileTemperatures(&current_temps);
  for (auto e : current_temps) {
    if (e.second == Temperature::kUnknown) {
      test_fs->OverrideSstFileTemperature(e.first, Temperature::kHot);
    }
  }

  // Update with Close and UpdateManifestForFilesState
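  // As before: close the DB, then run the offline fixup so the MANIFEST picks
  // up the kUnknown -> kHot override applied through the test FileSystem, and
  // re-verify the per-temperature sizes after reopening.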
  Close();
  ASSERT_OK(experimental::UpdateManifestForFilesState(
      options, dbname_, column_families, update_opts));

  // Re-open and re-verify after update
  ReopenWithColumnFamilies(cfs, options);
  ASSERT_GT(GetSstSizeHelper(Temperature::kCold), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kUnknown), 0);
  ASSERT_EQ(GetSstSizeHelper(Temperature::kWarm), 0);
  ASSERT_GT(GetSstSizeHelper(Temperature::kHot), 0);

  Close();
}

// WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery.
TEST_F(DBTest2, PointInTimeRecoveryWithIOErrorWhileReadingWal) {
  Options options = CurrentOptions();
  DestroyAndReopen(options);
  ASSERT_OK(Put("foo", "value0"));
  Close();
  SyncPoint::GetInstance()->DisableProcessing();
  SyncPoint::GetInstance()->ClearAllCallBacks();
  bool should_inject_error = false;
  SyncPoint::GetInstance()->SetCallBack(
      "DBImpl::RecoverLogFiles:BeforeReadWal",
      [&](void* /*arg*/) { should_inject_error = true; });
  SyncPoint::GetInstance()->SetCallBack(
      "LogReader::ReadMore:AfterReadFile", [&](void* arg) {
        if (should_inject_error) {
          ASSERT_NE(nullptr, arg);
          *static_cast<Status*>(arg) = Status::IOError("Injected IOError");
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();
  options.avoid_flush_during_recovery = true;
  options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
  Status s = TryReopen(options);
  ASSERT_TRUE(s.IsIOError());
}

TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency(
      {{"DBImpl::BackgroundCallFlush:Start:1",
        "PointInTimeRecoveryWithSyncFailureInCFCreation:1"},
       {"PointInTimeRecoveryWithSyncFailureInCFCreation:2",
        "DBImpl::BackgroundCallFlush:Start:2"}});
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  CreateColumnFamilies({"test1"}, Options());
  ASSERT_OK(Put("foo", "bar"));

  // Creating a CF when a flush is going on, log is synced but the
  // closed log file is not synced and corrupted.
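  // The flush below is expected to fail (ASSERT_NOK) once corrupt_in_sync_ is
  // set, and the subsequent reopen with kPointInTimeRecovery should recover up
  // to the last consistent WAL record instead of reporting corruption.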
port::Thread flush_thread([&]() { ASSERT_NOK(Flush()); }); TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:1"); CreateColumnFamilies({"test2"}, Options()); env_->corrupt_in_sync_ = true; TEST_SYNC_POINT("PointInTimeRecoveryWithSyncFailureInCFCreation:2"); flush_thread.join(); env_->corrupt_in_sync_ = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); // Reopening the DB should not corrupt anything Options options = CurrentOptions(); options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; ReopenWithColumnFamilies({"default", "test1", "test2"}, options); } TEST_F(DBTest2, SortL0FilesByEpochNumber) { Options options = CurrentOptions(); options.num_levels = 1; options.compaction_style = kCompactionStyleUniversal; DestroyAndReopen(options); // Set up L0 files to be sorted by their epoch_number ASSERT_OK(Put("key1", "seq1")); SstFileWriter sst_file_writer{EnvOptions(), options}; std::string external_file1 = dbname_ + "/test_files1.sst"; std::string external_file2 = dbname_ + "/test_files2.sst"; ASSERT_OK(sst_file_writer.Open(external_file1)); ASSERT_OK(sst_file_writer.Put("key2", "seq0")); ASSERT_OK(sst_file_writer.Finish()); ASSERT_OK(sst_file_writer.Open(external_file2)); ASSERT_OK(sst_file_writer.Put("key3", "seq0")); ASSERT_OK(sst_file_writer.Finish()); ASSERT_OK(Put("key4", "seq2")); ASSERT_OK(Flush()); auto* handle = db_->DefaultColumnFamily(); ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file2}, IngestExternalFileOptions())); // To verify L0 files are sorted by epoch_number in descending order // instead of largest_seqno std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); ASSERT_EQ(level0_files.size(), 3); EXPECT_EQ(level0_files[0]->epoch_number, 3); EXPECT_EQ(level0_files[0]->fd.largest_seqno, 0); ASSERT_EQ(level0_files[0]->num_entries, 1); ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key3")); EXPECT_EQ(level0_files[1]->epoch_number, 2); EXPECT_EQ(level0_files[1]->fd.largest_seqno, 0); ASSERT_EQ(level0_files[1]->num_entries, 1); ASSERT_TRUE(level0_files[1]->largest.user_key() == Slice("key2")); EXPECT_EQ(level0_files[2]->epoch_number, 1); EXPECT_EQ(level0_files[2]->fd.largest_seqno, 2); ASSERT_EQ(level0_files[2]->num_entries, 2); ASSERT_TRUE(level0_files[2]->largest.user_key() == Slice("key4")); ASSERT_TRUE(level0_files[2]->smallest.user_key() == Slice("key1")); // To verify compacted file is assigned with the minimum epoch_number // among input files' ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); level0_files = GetLevelFileMetadatas(0 /* level*/); ASSERT_EQ(level0_files.size(), 1); EXPECT_EQ(level0_files[0]->epoch_number, 1); ASSERT_EQ(level0_files[0]->num_entries, 4); ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key4")); ASSERT_TRUE(level0_files[0]->smallest.user_key() == Slice("key1")); } TEST_F(DBTest2, SameEpochNumberAfterCompactRangeChangeLevel) { Options options = CurrentOptions(); options.num_levels = 7; options.compaction_style = CompactionStyle::kCompactionStyleLevel; options.disable_auto_compactions = true; DestroyAndReopen(options); // Set up the file in L1 to be moved to L0 in later step of CompactRange() ASSERT_OK(Put("key1", "seq1")); ASSERT_OK(Flush()); MoveFilesToLevel(1, 0); std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); ASSERT_EQ(level0_files.size(), 0); std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); ASSERT_EQ(level1_files.size(), 1); std::vector level2_files = GetLevelFileMetadatas(2 /* level*/); 
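  // Baseline: after MoveFilesToLevel() above, the only file sits in L1 while
  // L0 and L2 are empty, so the epoch_number checked later can only come from
  // that one file.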
ASSERT_EQ(level2_files.size(), 0); ASSERT_EQ(level1_files[0]->epoch_number, 1); // To verify CompactRange() moving file to L0 still keeps the file's // epoch_number CompactRangeOptions croptions; croptions.change_level = true; croptions.target_level = 0; ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); level0_files = GetLevelFileMetadatas(0 /* level*/); level1_files = GetLevelFileMetadatas(1 /* level*/); ASSERT_EQ(level0_files.size(), 1); ASSERT_EQ(level1_files.size(), 0); EXPECT_EQ(level0_files[0]->epoch_number, 1); ASSERT_EQ(level0_files[0]->num_entries, 1); ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key1")); } TEST_F(DBTest2, RecoverEpochNumber) { for (bool allow_ingest_behind : {true, false}) { Options options = CurrentOptions(); options.allow_ingest_behind = allow_ingest_behind; options.num_levels = 7; options.compaction_style = kCompactionStyleLevel; options.disable_auto_compactions = true; DestroyAndReopen(options); CreateAndReopenWithCF({"cf1"}, options); VersionSet* versions = dbfull()->GetVersionSet(); assert(versions); const ColumnFamilyData* default_cf = versions->GetColumnFamilySet()->GetDefault(); const ColumnFamilyData* cf1 = versions->GetColumnFamilySet()->GetColumnFamily("cf1"); // Set up files in default CF to recover in later step ASSERT_OK(Put("key1", "epoch1")); ASSERT_OK(Flush()); MoveFilesToLevel(1 /* level*/, 0 /* cf*/); ASSERT_OK(Put("key2", "epoch2")); ASSERT_OK(Flush()); std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); ASSERT_EQ(level0_files.size(), 1); ASSERT_EQ(level0_files[0]->epoch_number, allow_ingest_behind ? 2 + kReservedEpochNumberForFileIngestedBehind : 2); ASSERT_EQ(level0_files[0]->num_entries, 1); ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); ASSERT_EQ(level1_files.size(), 1); ASSERT_EQ(level1_files[0]->epoch_number, allow_ingest_behind ? 1 + kReservedEpochNumberForFileIngestedBehind : 1); ASSERT_EQ(level1_files[0]->num_entries, 1); ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); // Set up files in cf1 to recover in later step ASSERT_OK(Put(1 /* cf */, "cf1_key1", "epoch1")); ASSERT_OK(Flush(1 /* cf */)); std::vector level0_files_cf1 = GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); ASSERT_EQ(level0_files_cf1.size(), 1); ASSERT_EQ(level0_files_cf1[0]->epoch_number, allow_ingest_behind ? 1 + kReservedEpochNumberForFileIngestedBehind : 1); ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); ASSERT_EQ(default_cf->GetNextEpochNumber(), allow_ingest_behind ? 3 + kReservedEpochNumberForFileIngestedBehind : 3); ASSERT_EQ(cf1->GetNextEpochNumber(), allow_ingest_behind ? 2 + kReservedEpochNumberForFileIngestedBehind : 2); // To verify epoch_number of files of different levels/CFs are // persisted and recovered correctly ReopenWithColumnFamilies({"default", "cf1"}, options); versions = dbfull()->GetVersionSet(); assert(versions); default_cf = versions->GetColumnFamilySet()->GetDefault(); cf1 = versions->GetColumnFamilySet()->GetColumnFamily("cf1"); level0_files = GetLevelFileMetadatas(0 /* level*/); ASSERT_EQ(level0_files.size(), 1); EXPECT_EQ(level0_files[0]->epoch_number, allow_ingest_behind ? 
2 + kReservedEpochNumberForFileIngestedBehind : 2); ASSERT_EQ(level0_files[0]->num_entries, 1); ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); level1_files = GetLevelFileMetadatas(1 /* level*/); ASSERT_EQ(level1_files.size(), 1); EXPECT_EQ(level1_files[0]->epoch_number, allow_ingest_behind ? 1 + kReservedEpochNumberForFileIngestedBehind : 1); ASSERT_EQ(level1_files[0]->num_entries, 1); ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); level0_files_cf1 = GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); ASSERT_EQ(level0_files_cf1.size(), 1); EXPECT_EQ(level0_files_cf1[0]->epoch_number, allow_ingest_behind ? 1 + kReservedEpochNumberForFileIngestedBehind : 1); ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); // To verify next epoch number is recovered correctly EXPECT_EQ(default_cf->GetNextEpochNumber(), allow_ingest_behind ? 3 + kReservedEpochNumberForFileIngestedBehind : 3); EXPECT_EQ(cf1->GetNextEpochNumber(), allow_ingest_behind ? 2 + kReservedEpochNumberForFileIngestedBehind : 2); } } TEST_F(DBTest2, RenameDirectory) { Options options = CurrentOptions(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "value0")); Close(); auto old_dbname = dbname_; auto new_dbname = dbname_ + "_2"; EXPECT_OK(env_->RenameFile(dbname_, new_dbname)); options.create_if_missing = false; dbname_ = new_dbname; ASSERT_OK(TryReopen(options)); ASSERT_EQ("value0", Get("foo")); Destroy(options); dbname_ = old_dbname; } TEST_F(DBTest2, SstUniqueIdVerifyBackwardCompatible) { const int kNumSst = 3; const int kLevel0Trigger = 4; auto options = CurrentOptions(); options.level0_file_num_compaction_trigger = kLevel0Trigger; options.statistics = CreateDBStatistics(); // Skip for now options.verify_sst_unique_id_in_manifest = false; Reopen(options); std::atomic_int skipped = 0; std::atomic_int passed = 0; SyncPoint::GetInstance()->SetCallBack( "BlockBasedTable::Open::SkippedVerifyUniqueId", [&](void* /*arg*/) { skipped++; }); SyncPoint::GetInstance()->SetCallBack( "BlockBasedTable::Open::PassedVerifyUniqueId", [&](void* /*arg*/) { passed++; }); SyncPoint::GetInstance()->EnableProcessing(); // generate a few SSTs for (int i = 0; i < kNumSst; i++) { for (int j = 0; j < 100; j++) { ASSERT_OK(Put(Key(i * 10 + j), "value")); } ASSERT_OK(Flush()); } // Verification has been skipped on files so far EXPECT_EQ(skipped, kNumSst); EXPECT_EQ(passed, 0); // Reopen with verification options.verify_sst_unique_id_in_manifest = true; skipped = 0; passed = 0; Reopen(options); EXPECT_EQ(skipped, 0); EXPECT_EQ(passed, kNumSst); // Now simulate no unique id in manifest for next file // NOTE: this only works for loading manifest from disk, // not in-memory manifest, so we need to re-open below. 
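  // The callback below clears the unique id in the VersionEdit before it is
  // written, so the next SST recorded in the MANIFEST carries no unique id and
  // its verification is expected to be skipped (skipped == 1) after reopen.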
  SyncPoint::GetInstance()->SetCallBack(
      "VersionEdit::EncodeTo:UniqueId", [&](void* arg) {
        auto unique_id = static_cast<UniqueId64x2*>(arg);
        // remove id before writing it to manifest
        (*unique_id)[0] = 0;
        (*unique_id)[1] = 0;
      });

  // test compaction generated Sst
  for (int i = kNumSst; i < kLevel0Trigger; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,1", FilesPerLevel(0));

  // Reopen (with verification)
  ASSERT_TRUE(options.verify_sst_unique_id_in_manifest);
  skipped = 0;
  passed = 0;
  Reopen(options);
  EXPECT_EQ(skipped, 1);
  EXPECT_EQ(passed, 0);
}

TEST_F(DBTest2, SstUniqueIdVerify) {
  const int kNumSst = 3;
  const int kLevel0Trigger = 4;

  auto options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kLevel0Trigger;
  // Allow mismatch for now
  options.verify_sst_unique_id_in_manifest = false;
  Reopen(options);

  SyncPoint::GetInstance()->SetCallBack(
      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
        auto props = static_cast<TableProperties*>(props_vs);
        // update table property session_id to a different one, which
        // changes unique ID
        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
      });
  SyncPoint::GetInstance()->EnableProcessing();

  // generate a few SSTs
  for (int i = 0; i < kNumSst; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }

  // Reopen with verification should report corruption
  options.verify_sst_unique_id_in_manifest = true;
  auto s = TryReopen(options);
  ASSERT_TRUE(s.IsCorruption());

  // Reopen without verification should be fine
  options.verify_sst_unique_id_in_manifest = false;
  Reopen(options);

  // test compaction generated Sst
  for (int i = kNumSst; i < kLevel0Trigger; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush());
  }
  ASSERT_OK(dbfull()->TEST_WaitForCompact());
  ASSERT_EQ("0,1", FilesPerLevel(0));

  // Reopen with verification should fail
  options.verify_sst_unique_id_in_manifest = true;
  s = TryReopen(options);
  ASSERT_TRUE(s.IsCorruption());
}

TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
  const int kNumSst = 3;
  const int kLevel0Trigger = 4;

  auto options = CurrentOptions();
  options.level0_file_num_compaction_trigger = kLevel0Trigger;
  // Allow mismatch for now
  options.verify_sst_unique_id_in_manifest = false;
  CreateAndReopenWithCF({"one", "two"}, options);

  // generate good SSTs
  for (int cf_num : {0, 2}) {
    for (int i = 0; i < kNumSst; i++) {
      for (int j = 0; j < 100; j++) {
        ASSERT_OK(Put(cf_num, Key(i * 10 + j), "value"));
      }
      ASSERT_OK(Flush(cf_num));
    }
  }

  // generate SSTs with bad unique id
  SyncPoint::GetInstance()->SetCallBack(
      "PropertyBlockBuilder::AddTableProperty:Start", [&](void* props_vs) {
        auto props = static_cast<TableProperties*>(props_vs);
        // update table property session_id to a different one
        props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
      });
  SyncPoint::GetInstance()->EnableProcessing();
  for (int i = 0; i < kNumSst; i++) {
    for (int j = 0; j < 100; j++) {
      ASSERT_OK(Put(1, Key(i * 10 + j), "value"));
    }
    ASSERT_OK(Flush(1));
  }

  // Reopen with verification should report corruption
  options.verify_sst_unique_id_in_manifest = true;
  auto s = TryReopenWithColumnFamilies({"default", "one", "two"}, options);
  ASSERT_TRUE(s.IsCorruption());
}

TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
  const auto tamper_with_uniq_id = [&](void* arg) {
    auto props = static_cast<TableProperties*>(arg);
    assert(props);
    // update table property session_id to a different one
    props->db_session_id =
DBImpl::GenerateDbSessionId(nullptr); }; const auto assert_db = [&](size_t expected_count, const std::string& expected_v) { std::unique_ptr it(db_->NewIterator(ReadOptions())); size_t cnt = 0; for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) { ASSERT_EQ(std::to_string(cnt), it->key()); ASSERT_EQ(expected_v, it->value()); } EXPECT_OK(it->status()); ASSERT_EQ(expected_count, cnt); }; const int num_l0_compaction_trigger = 8; const int num_l0 = num_l0_compaction_trigger - 1; Options options = CurrentOptions(); options.level0_file_num_compaction_trigger = num_l0_compaction_trigger; for (int k = 0; k < num_l0; ++k) { // Allow mismatch for now options.verify_sst_unique_id_in_manifest = false; DestroyAndReopen(options); constexpr size_t num_keys_per_file = 10; for (int i = 0; i < num_l0; ++i) { for (size_t j = 0; j < num_keys_per_file; ++j) { ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i))); } if (i == k) { SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->SetCallBack( "PropertyBlockBuilder::AddTableProperty:Start", tamper_with_uniq_id); SyncPoint::GetInstance()->EnableProcessing(); } ASSERT_OK(Flush()); } options.verify_sst_unique_id_in_manifest = true; Status s = TryReopen(options); ASSERT_TRUE(s.IsCorruption()); options.best_efforts_recovery = true; Reopen(options); assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); // Reopen with regular recovery options.best_efforts_recovery = false; Reopen(options); assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1)); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); for (size_t i = 0; i < num_keys_per_file; ++i) { ASSERT_OK(Put(std::to_string(i), "v")); } ASSERT_OK(Flush()); Reopen(options); { for (size_t i = 0; i < num_keys_per_file; ++i) { ASSERT_EQ("v", Get(std::to_string(i))); } } } } TEST_F(DBTest2, GetLatestSeqAndTsForKey) { Destroy(last_options_); Options options = CurrentOptions(); options.max_write_buffer_size_to_maintain = 64 << 10; options.create_if_missing = true; options.disable_auto_compactions = true; options.comparator = test::BytewiseComparatorWithU64TsWrapper(); options.statistics = CreateDBStatistics(); Reopen(options); constexpr uint64_t kTsU64Value = 12; for (uint64_t key = 0; key < 100; ++key) { std::string ts; PutFixed64(&ts, kTsU64Value); std::string key_str; PutFixed64(&key_str, key); std::reverse(key_str.begin(), key_str.end()); ASSERT_OK(db_->Put(WriteOptions(), key_str, ts, "value")); } ASSERT_OK(Flush()); constexpr bool cache_only = true; constexpr SequenceNumber lower_bound_seq = 0; auto* cfhi = static_cast_with_check( dbfull()->DefaultColumnFamily()); assert(cfhi); assert(cfhi->cfd()); SuperVersion* sv = cfhi->cfd()->GetSuperVersion(); for (uint64_t key = 0; key < 100; ++key) { std::string key_str; PutFixed64(&key_str, key); std::reverse(key_str.begin(), key_str.end()); std::string ts; SequenceNumber seq = kMaxSequenceNumber; bool found_record_for_key = false; bool is_blob_index = false; const Status s = dbfull()->GetLatestSequenceForKey( sv, key_str, cache_only, lower_bound_seq, &seq, &ts, &found_record_for_key, &is_blob_index); ASSERT_OK(s); std::string expected_ts; PutFixed64(&expected_ts, kTsU64Value); ASSERT_EQ(expected_ts, ts); ASSERT_TRUE(found_record_for_key); ASSERT_FALSE(is_blob_index); } // Verify that no read to SST files. 
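  // With cache_only=true, GetLatestSequenceForKey() is expected to be answered
  // from the memtables kept alive by max_write_buffer_size_to_maintain, so the
  // L0 Get-hit counter should stay at zero.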
  ASSERT_EQ(0, options.statistics->getTickerCount(GET_HIT_L0));
}

#if defined(ZSTD_ADVANCED)
TEST_F(DBTest2, ZSTDChecksum) {
  // Verify that corruption during decompression is caught.
  Options options = CurrentOptions();
  options.create_if_missing = true;
  options.compression = kZSTD;
  options.compression_opts.max_compressed_bytes_per_kb = 1024;
  options.compression_opts.checksum = true;
  DestroyAndReopen(options);
  Random rnd(33);
  ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10)));
  SyncPoint::GetInstance()->SetCallBack(
      "BlockBasedTableBuilder::WriteBlock:TamperWithCompressedData",
      [&](void* arg) {
        std::string* output = static_cast<std::string*>(arg);
        // https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#zstandard-frames
        // Checksum is the last 4 bytes, corrupting that part in unit test is
        // more controllable.
        output->data()[output->size() - 1]++;
      });
  SyncPoint::GetInstance()->EnableProcessing();
  ASSERT_OK(Flush());
  PinnableSlice val;
  Status s = Get(Key(0), &val);
  ASSERT_TRUE(s.IsCorruption());

  // Corruption caught during flush.
  options.paranoid_file_checks = true;
  DestroyAndReopen(options);
  ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10)));
  s = Flush();
  ASSERT_TRUE(s.IsCorruption());
}
#endif

TEST_F(DBTest2, TableCacheMissDuringReadFromBlockCacheTier) {
  Options options = CurrentOptions();
  options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
  Reopen(options);

  // Give table cache zero capacity to prevent preloading tables. That way,
  // `kBlockCacheTier` reads will fail due to table cache misses.
  dbfull()->TEST_table_cache()->SetCapacity(0);
  ASSERT_OK(Put("foo", "bar"));
  ASSERT_OK(Flush());

  uint64_t orig_num_file_opens = TestGetTickerCount(options, NO_FILE_OPENS);

  ReadOptions non_blocking_opts;
  non_blocking_opts.read_tier = kBlockCacheTier;
  std::string value;
  ASSERT_TRUE(db_->Get(non_blocking_opts, "foo", &value).IsIncomplete());
  ASSERT_EQ(orig_num_file_opens, TestGetTickerCount(options, NO_FILE_OPENS));
}

}  // namespace ROCKSDB_NAMESPACE

int main(int argc, char** argv) {
  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
  ::testing::InitGoogleTest(&argc, argv);
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
}