// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include #include "db/db_test_util.h" #include "options/options_helper.h" #include "port/stack_trace.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" #include "rocksdb/table.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" #include "table/block_based/block_builder.h" #include "test_util/sync_point.h" #include "util/file_checksum_helper.h" #include "util/random.h" #include "utilities/counted_fs.h" #include "utilities/fault_injection_env.h" #include "utilities/fault_injection_fs.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { namespace { class MyFlushBlockPolicy : public FlushBlockPolicy { public: explicit MyFlushBlockPolicy(const int num_keys_in_block, const BlockBuilder& data_block_builder) : num_keys_in_block_(num_keys_in_block), num_keys_(0), data_block_builder_(data_block_builder) {} bool Update(const Slice& /*key*/, const Slice& /*value*/) override { if (data_block_builder_.empty()) { // First key in this block num_keys_ = 1; return false; } // Flush every 10 keys if (num_keys_ == num_keys_in_block_) { num_keys_ = 1; return true; } num_keys_++; return false; } private: const int num_keys_in_block_; int num_keys_; const BlockBuilder& data_block_builder_; }; class MyFlushBlockPolicyFactory : public FlushBlockPolicyFactory { public: explicit MyFlushBlockPolicyFactory(const int num_keys_in_block) : num_keys_in_block_(num_keys_in_block) {} virtual const char* Name() const override { return "MyFlushBlockPolicyFactory"; } virtual FlushBlockPolicy* NewFlushBlockPolicy( const BlockBasedTableOptions& /*table_options*/, const BlockBuilder& data_block_builder) const override { return new MyFlushBlockPolicy(num_keys_in_block_, data_block_builder); } private: const int num_keys_in_block_; }; } // namespace static bool enable_io_uring = true; extern "C" bool RocksDbIOUringEnable() { return enable_io_uring; } class DBBasicTest : public DBTestBase { public: DBBasicTest() : DBTestBase("db_basic_test", /*env_do_fsync=*/false) {} }; TEST_F(DBBasicTest, OpenWhenOpen) { Options options = CurrentOptions(); options.env = env_; DB* db2 = nullptr; Status s = DB::Open(options, dbname_, &db2); ASSERT_NOK(s) << [db2]() { delete db2; return "db2 open: ok"; }(); ASSERT_EQ(Status::Code::kIOError, s.code()); ASSERT_EQ(Status::SubCode::kNone, s.subcode()); ASSERT_TRUE(strstr(s.getState(), "lock ") != nullptr); delete db2; } TEST_F(DBBasicTest, EnableDirectIOWithZeroBuf) { if (!IsDirectIOSupported()) { ROCKSDB_GTEST_BYPASS("Direct IO not supported"); return; } Options options = GetDefaultOptions(); options.create_if_missing = true; options.use_direct_io_for_flush_and_compaction = true; options.writable_file_max_buffer_size = 0; ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); options.writable_file_max_buffer_size = 1024; Reopen(options); const std::unordered_map new_db_opts = { {"writable_file_max_buffer_size", "0"}}; ASSERT_TRUE(db_->SetDBOptions(new_db_opts).IsInvalidArgument()); } TEST_F(DBBasicTest, UniqueSession) { Options options = CurrentOptions(); std::string sid1, sid2, sid3, sid4; ASSERT_OK(db_->GetDbSessionId(sid1)); Reopen(options); ASSERT_OK(db_->GetDbSessionId(sid2)); ASSERT_OK(Put("foo", "v1")); ASSERT_OK(db_->GetDbSessionId(sid4)); Reopen(options); ASSERT_OK(db_->GetDbSessionId(sid3)); ASSERT_NE(sid1, sid2); ASSERT_NE(sid1, sid3); ASSERT_NE(sid2, sid3); ASSERT_EQ(sid2, sid4); // Expected compact format for session ids (see notes in implementation) TestRegex expected("[0-9A-Z]{20}"); EXPECT_MATCHES_REGEX(sid1, expected); EXPECT_MATCHES_REGEX(sid2, expected); EXPECT_MATCHES_REGEX(sid3, expected); Close(); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_OK(db_->GetDbSessionId(sid1)); // Test uniqueness between readonly open (sid1) and regular open (sid3) ASSERT_NE(sid1, sid3); Close(); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_OK(db_->GetDbSessionId(sid2)); ASSERT_EQ("v1", Get("foo")); ASSERT_OK(db_->GetDbSessionId(sid3)); ASSERT_NE(sid1, sid2); ASSERT_EQ(sid2, sid3); CreateAndReopenWithCF({"goku"}, options); ASSERT_OK(db_->GetDbSessionId(sid1)); ASSERT_OK(Put("bar", "e1")); ASSERT_OK(db_->GetDbSessionId(sid2)); ASSERT_EQ("e1", Get("bar")); ASSERT_OK(db_->GetDbSessionId(sid3)); ReopenWithColumnFamilies({"default", "goku"}, options); ASSERT_OK(db_->GetDbSessionId(sid4)); ASSERT_EQ(sid1, sid2); ASSERT_EQ(sid2, sid3); ASSERT_NE(sid1, sid4); } TEST_F(DBBasicTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); ASSERT_OK(Put("foo", "v3")); Close(); auto verify_one_iter = [&](Iterator* iter) { int count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); ++count; } ASSERT_OK(iter->status()); // Always expect two keys: "foo" and "bar" ASSERT_EQ(count, 2); }; auto verify_all_iters = [&]() { Iterator* iter = db_->NewIterator(ReadOptions()); verify_one_iter(iter); delete iter; std::vector iters; ASSERT_OK(db_->NewIterators(ReadOptions(), {dbfull()->DefaultColumnFamily()}, &iters)); ASSERT_EQ(static_cast(1), iters.size()); verify_one_iter(iters[0]); delete iters[0]; }; auto options = CurrentOptions(); assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); verify_all_iters(); Close(); // Reopen and flush memtable. Reopen(options); ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); verify_all_iters(); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); } // TODO akanksha: Update the test to check that combination // does not actually write to FS (use open read-only with // CompositeEnvWrapper+ReadOnlyFileSystem). TEST_F(DBBasicTest, DISABLED_ReadOnlyDBWithWriteDBIdToManifestSet) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); ASSERT_OK(Put("foo", "v3")); Close(); auto options = CurrentOptions(); options.write_dbid_to_manifest = true; assert(options.env == env_); ASSERT_OK(ReadOnlyReopen(options)); std::string db_id1; ASSERT_OK(db_->GetDbIdentity(db_id1)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); Iterator* iter = db_->NewIterator(ReadOptions()); int count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ASSERT_OK(iter->status()); ++count; } ASSERT_EQ(count, 2); delete iter; Close(); // Reopen and flush memtable. Reopen(options); ASSERT_OK(Flush()); Close(); // Now check keys in read only mode. ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); ASSERT_TRUE(db_->SyncWAL().IsNotSupported()); std::string db_id2; ASSERT_OK(db_->GetDbIdentity(db_id2)); ASSERT_EQ(db_id1, db_id2); } TEST_F(DBBasicTest, CompactedDB) { const uint64_t kFileSize = 1 << 20; Options options = CurrentOptions(); options.disable_auto_compactions = true; options.write_buffer_size = kFileSize; options.target_file_size_base = kFileSize; options.max_bytes_for_level_base = 1 << 30; options.compression = kNoCompression; Reopen(options); // 1 L0 file, use CompactedDB if max_open_files = -1 ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); ASSERT_OK(Flush()); Close(); ASSERT_OK(ReadOnlyReopen(options)); Status s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported operation in read only mode."); ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); Close(); options.max_open_files = -1; ASSERT_OK(ReadOnlyReopen(options)); s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported in compacted db mode."); ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); Close(); Reopen(options); // Add more L0 files ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); ASSERT_OK(Flush()); ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 'a'))); ASSERT_OK(Flush()); ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); ASSERT_OK(Flush()); ASSERT_OK(Put("something_not_flushed", "x")); Close(); ASSERT_OK(ReadOnlyReopen(options)); // Fallback to read-only DB s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported operation in read only mode."); // TODO: validate that other write ops return NotImplemented // (DBImplReadOnly is missing some overrides) // Ensure no deadlock on flush triggered by another API function // (Old deadlock bug depends on something_not_flushed above.) std::vector files; uint64_t manifest_file_size; ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true)); LiveFilesStorageInfoOptions lfsi_opts; lfsi_opts.wal_size_for_flush = 0; // always std::vector files2; ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2)); Close(); // Full compaction Reopen(options); // Add more keys ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); ASSERT_EQ(3, NumTableFilesAtLevel(1)); Close(); // CompactedDB ASSERT_OK(ReadOnlyReopen(options)); s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported in compacted db mode."); ASSERT_EQ("NOT_FOUND", Get("abc")); ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa")); ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb")); ASSERT_EQ("NOT_FOUND", Get("ccc")); ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee")); ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff")); ASSERT_EQ("NOT_FOUND", Get("ggg")); ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh")); ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii")); ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); ASSERT_EQ("NOT_FOUND", Get("kkk")); // TODO: validate that other write ops return NotImplemented // (CompactedDB is missing some overrides) // Ensure no deadlock on flush triggered by another API function ASSERT_OK(db_->GetLiveFiles(files, &manifest_file_size, /*flush*/ true)); ASSERT_OK(db_->GetLiveFilesStorageInfo(lfsi_opts, &files2)); // MultiGet std::vector values; std::vector status_list = dbfull()->MultiGet( ReadOptions(), std::vector({Slice("aaa"), Slice("ccc"), Slice("eee"), Slice("ggg"), Slice("iii"), Slice("kkk")}), &values); ASSERT_EQ(status_list.size(), static_cast(6)); ASSERT_EQ(values.size(), static_cast(6)); ASSERT_OK(status_list[0]); ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); ASSERT_TRUE(status_list[1].IsNotFound()); ASSERT_OK(status_list[2]); ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]); ASSERT_TRUE(status_list[3].IsNotFound()); ASSERT_OK(status_list[4]); ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]); ASSERT_TRUE(status_list[5].IsNotFound()); Reopen(options); // Add a key ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); Close(); ASSERT_OK(ReadOnlyReopen(options)); s = Put("new", "value"); ASSERT_EQ(s.ToString(), "Not implemented: Not supported operation in read only mode."); } TEST_F(DBBasicTest, LevelLimitReopen) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); const std::string value(1024 * 1024, ' '); int i = 0; while (NumTableFilesAtLevel(2, 1) == 0) { ASSERT_OK(Put(1, Key(i++), value)); ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable()); ASSERT_OK(dbfull()->TEST_WaitForCompact()); } options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } TEST_F(DBBasicTest, PutDeleteGet) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ("v2", Get(1, "foo")); ASSERT_OK(Delete(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); } while (ChangeOptions()); } TEST_F(DBBasicTest, PutSingleDeleteGet) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_OK(Put(1, "foo2", "v2")); ASSERT_EQ("v2", Get(1, "foo2")); ASSERT_OK(SingleDelete(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); // Ski FIFO and universal compaction because they do not apply to the test // case. Skip MergePut because single delete does not get removed when it // encounters a merge. } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | kSkipMergePut)); } TEST_F(DBBasicTest, TimedPutBasic) { do { Options options = CurrentOptions(); options.merge_operator = MergeOperators::CreateStringAppendOperator(); CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(TimedPut(1, "foo", "v1", /*write_unix_time=*/0)); // Read from memtable ASSERT_EQ("v1", Get(1, "foo")); ASSERT_OK(TimedPut(1, "foo", "v2.1", /*write_unix_time=*/3)); ASSERT_EQ("v2.1", Get(1, "foo")); // Read from sst file ASSERT_OK(db_->Flush(FlushOptions(), handles_[1])); ASSERT_OK(Merge(1, "foo", "v2.2")); ASSERT_EQ("v2.1,v2.2", Get(1, "foo")); ASSERT_OK(Delete(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(1, "foo")); ASSERT_OK(TimedPut(1, "bar", "bv1", /*write_unix_time=*/0)); ASSERT_EQ("bv1", Get(1, "bar")); ASSERT_OK(TimedPut(1, "baz", "bzv1", /*write_unix_time=*/0)); ASSERT_EQ("bzv1", Get(1, "baz")); if (option_config_ != kRowCache) { std::string range_del_begin = "b"; std::string range_del_end = "baz"; Slice begin_rdel = range_del_begin, end_rdel = range_del_end; ASSERT_OK( db_->DeleteRange(WriteOptions(), handles_[1], begin_rdel, end_rdel)); ASSERT_EQ("NOT_FOUND", Get(1, "bar")); } ASSERT_EQ("bzv1", Get(1, "baz")); ASSERT_OK(SingleDelete(1, "baz")); ASSERT_EQ("NOT_FOUND", Get(1, "baz")); } while (ChangeOptions(kSkipPlainTable)); } TEST_F(DBBasicTest, EmptyFlush) { // It is possible to produce empty flushes when using single deletes. Tests // whether empty flushes cause issues. do { Random rnd(301); Options options = CurrentOptions(); options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "a", Slice())); ASSERT_OK(SingleDelete(1, "a")); ASSERT_OK(Flush(1)); ASSERT_EQ("[ ]", AllEntriesFor("a", 1)); // Skip FIFO and universal compaction as they do not apply to the test // case. Skip MergePut because merges cannot be combined with single // deletions. } while (ChangeOptions(kSkipFIFOCompaction | kSkipUniversalCompaction | kSkipMergePut)); } TEST_F(DBBasicTest, GetFromVersions) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); } while (ChangeOptions()); } TEST_F(DBBasicTest, GetSnapshot) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); // Try with both a short key and a long key for (int i = 0; i < 2; i++) { std::string key = (i == 0) ? std::string("foo") : std::string(200, 'x'); ASSERT_OK(Put(1, key, "v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_OK(Put(1, key, "v2")); ASSERT_EQ("v2", Get(1, key)); ASSERT_EQ("v1", Get(1, key, s1)); ASSERT_OK(Flush(1)); ASSERT_EQ("v2", Get(1, key)); ASSERT_EQ("v1", Get(1, key, s1)); db_->ReleaseSnapshot(s1); } } while (ChangeOptions()); } TEST_F(DBBasicTest, CheckLock) { do { DB* localdb = nullptr; Options options = CurrentOptions(); ASSERT_OK(TryReopen(options)); // second open should fail Status s = DB::Open(options, dbname_, &localdb); ASSERT_NOK(s) << [localdb]() { delete localdb; return "localdb open: ok"; }(); #ifdef OS_LINUX ASSERT_TRUE(s.ToString().find("lock ") != std::string::npos); #endif // OS_LINUX } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, FlushMultipleMemtable) { do { Options options = CurrentOptions(); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; options.max_write_buffer_size_to_maintain = -1; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); ASSERT_OK(Flush(1)); } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, FlushEmptyColumnFamily) { // Block flush thread and disable compaction thread env_->SetBackgroundThreads(1, Env::HIGH); env_->SetBackgroundThreads(1, Env::LOW); test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); test::SleepingBackgroundTask sleeping_task_high; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, Env::Priority::HIGH); Options options = CurrentOptions(); // disable compaction options.disable_auto_compactions = true; WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; options.max_write_buffer_number = 2; options.min_write_buffer_number_to_merge = 1; options.max_write_buffer_size_to_maintain = static_cast(options.write_buffer_size); CreateAndReopenWithCF({"pikachu"}, options); // Compaction can still go through even if no thread can flush the // mem table. ASSERT_OK(Flush(0)); ASSERT_OK(Flush(1)); // Insert can go through ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); ASSERT_EQ("v1", Get(0, "foo")); ASSERT_EQ("v1", Get(1, "bar")); sleeping_task_high.WakeUp(); sleeping_task_high.WaitUntilDone(); // Flush can still go through. ASSERT_OK(Flush(0)); ASSERT_OK(Flush(1)); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); } TEST_F(DBBasicTest, Flush) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; SetPerfLevel(kEnableTime); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); // this will now also flush the last 2 writes ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); get_perf_context()->Reset(); Get(1, "foo"); ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0); ASSERT_EQ(2, (int)get_perf_context()->get_read_bytes); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); writeOpt.disableWAL = true; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v2")); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); ASSERT_OK(Flush(1)); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v2", Get(1, "bar")); get_perf_context()->Reset(); ASSERT_EQ("v2", Get(1, "foo")); ASSERT_TRUE((int)get_perf_context()->get_from_output_files_time > 0); writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v3")); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); ASSERT_OK(Flush(1)); ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // 'foo' should be there because its put // has WAL enabled. ASSERT_EQ("v3", Get(1, "foo")); ASSERT_EQ("v3", Get(1, "bar")); SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, ManifestRollOver) { do { Options options; options.max_manifest_file_size = 10; // 10 bytes options = CurrentOptions(options); CreateAndReopenWithCF({"pikachu"}, options); { ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); ASSERT_OK(Put(1, "manifest_key3", std::string(1000, '3'))); uint64_t manifest_before_flush = dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_OK(Flush(1)); // This should trigger LogAndApply. uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_GT(manifest_after_flush, manifest_before_flush); ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); // check if a new manifest file got inserted or not. ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); ASSERT_EQ(std::string(1000, '2'), Get(1, "manifest_key2")); ASSERT_EQ(std::string(1000, '3'), Get(1, "manifest_key3")); } } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, IdentityAcrossRestarts) { constexpr size_t kMinIdSize = 10; do { for (bool with_manifest : {false, true}) { for (bool write_file : {false, true}) { std::string idfilename = IdentityFileName(dbname_); std::string id1, tmp; ASSERT_OK(db_->GetDbIdentity(id1)); ASSERT_GE(id1.size(), kMinIdSize); Options options = CurrentOptions(); options.write_dbid_to_manifest = with_manifest; options.write_identity_file = true; // initially Reopen(options); std::string id2; ASSERT_OK(db_->GetDbIdentity(id2)); // id2 should match id1 because identity was not regenerated ASSERT_EQ(id1, id2); ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); ASSERT_EQ(tmp, id2); if (write_file) { // Recover from deleted/missing IDENTITY ASSERT_OK(env_->DeleteFile(idfilename)); } else { // Transition to no IDENTITY file options.write_identity_file = false; if (!with_manifest) { // Incompatible options, should fail ASSERT_NOK(TryReopen(options)); // Back to a usable config and continue options.write_identity_file = true; Reopen(options); continue; } } Reopen(options); std::string id3; ASSERT_OK(db_->GetDbIdentity(id3)); if (with_manifest) { // id3 should match id1 because identity was restored from manifest ASSERT_EQ(id1, id3); } else { // id3 should NOT match id1 because identity was regenerated ASSERT_NE(id1, id3); ASSERT_GE(id3.size(), kMinIdSize); } if (write_file) { ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); ASSERT_EQ(tmp, id3); // Recover from truncated IDENTITY std::unique_ptr w; ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); ASSERT_OK(w->Close()); } else { ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); } Reopen(options); std::string id4; ASSERT_OK(db_->GetDbIdentity(id4)); if (with_manifest) { // id4 should match id1 because identity was restored from manifest ASSERT_EQ(id1, id4); } else { // id4 should NOT match id1 because identity was regenerated ASSERT_NE(id1, id4); ASSERT_GE(id4.size(), kMinIdSize); } std::string silly_id = "asdf123456789"; if (write_file) { ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); ASSERT_EQ(tmp, id4); // Recover from overwritten IDENTITY std::unique_ptr w; ASSERT_OK(env_->NewWritableFile(idfilename, &w, EnvOptions())); ASSERT_OK(w->Append(silly_id)); ASSERT_OK(w->Close()); } else { ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); } Reopen(options); std::string id5; ASSERT_OK(db_->GetDbIdentity(id5)); if (with_manifest) { // id4 should match id1 because identity was restored from manifest ASSERT_EQ(id1, id5); } else { ASSERT_EQ(id5, silly_id); } if (write_file) { ASSERT_OK(ReadFileToString(env_, idfilename, &tmp)); ASSERT_EQ(tmp, id5); } else { ASSERT_TRUE(env_->FileExists(idfilename).IsNotFound()); } } } } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, LockFileRecovery) { Options options = CurrentOptions(); // Regardless of best_efforts_recovery for (bool ber : {false, true}) { options.best_efforts_recovery = ber; DestroyAndReopen(options); std::string id1, id2; ASSERT_OK(db_->GetDbIdentity(id1)); Close(); // Should be OK to re-open DB after lock file deleted std::string lockfilename = LockFileName(dbname_); ASSERT_OK(env_->DeleteFile(lockfilename)); Reopen(options); // Should be same DB as before ASSERT_OK(db_->GetDbIdentity(id2)); ASSERT_EQ(id1, id2); } } TEST_F(DBBasicTest, Snapshot) { env_->SetMockSleep(); anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); ASSERT_OK(Put(0, "foo", "0v1")); ASSERT_OK(Put(1, "foo", "1v1")); const Snapshot* s1 = db_->GetSnapshot(); ASSERT_EQ(1U, GetNumSnapshots()); uint64_t time_snap1 = GetTimeOldestSnapshots(); ASSERT_GT(time_snap1, 0U); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); ASSERT_OK(Put(0, "foo", "0v2")); ASSERT_OK(Put(1, "foo", "1v2")); env_->MockSleepForSeconds(1); const Snapshot* s2 = db_->GetSnapshot(); ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); ASSERT_OK(Put(0, "foo", "0v3")); ASSERT_OK(Put(1, "foo", "1v3")); { ManagedSnapshot s3(db_); ASSERT_EQ(3U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); ASSERT_OK(Put(0, "foo", "0v4")); ASSERT_OK(Put(1, "foo", "1v4")); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); ASSERT_EQ("1v2", Get(1, "foo", s2)); ASSERT_EQ("0v3", Get(0, "foo", s3.snapshot())); ASSERT_EQ("1v3", Get(1, "foo", s3.snapshot())); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); } ASSERT_EQ(2U, GetNumSnapshots()); ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s1->GetSequenceNumber()); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); ASSERT_EQ("1v2", Get(1, "foo", s2)); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s1); ASSERT_EQ("0v2", Get(0, "foo", s2)); ASSERT_EQ("1v2", Get(1, "foo", s2)); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); ASSERT_EQ(1U, GetNumSnapshots()); ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), s2->GetSequenceNumber()); db_->ReleaseSnapshot(s2); ASSERT_EQ(0U, GetNumSnapshots()); ASSERT_EQ(GetSequenceOldestSnapshots(), 0); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); } while (ChangeOptions()); } class DBBasicMultiConfigs : public DBBasicTest, public ::testing::WithParamInterface { public: DBBasicMultiConfigs() { option_config_ = GetParam(); } static std::vector GenerateOptionConfigs() { std::vector option_configs; for (int option_config = kDefault; option_config < kEnd; ++option_config) { if (!ShouldSkipOptions(option_config, kSkipFIFOCompaction)) { option_configs.push_back(option_config); } } return option_configs; } }; TEST_P(DBBasicMultiConfigs, CompactBetweenSnapshots) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; Options options = CurrentOptions(options_override); options.disable_auto_compactions = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); ASSERT_OK(Put(1, "foo", "first")); const Snapshot* snapshot1 = db_->GetSnapshot(); ASSERT_OK(Put(1, "foo", "second")); ASSERT_OK(Put(1, "foo", "third")); ASSERT_OK(Put(1, "foo", "fourth")); const Snapshot* snapshot2 = db_->GetSnapshot(); ASSERT_OK(Put(1, "foo", "fifth")); ASSERT_OK(Put(1, "foo", "sixth")); // All entries (including duplicates) exist // before any compaction or flush is triggered. ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fifth, fourth, third, second, first ]"); ASSERT_EQ("sixth", Get(1, "foo")); ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); ASSERT_EQ("first", Get(1, "foo", snapshot1)); // After a flush, "second", "third" and "fifth" should // be removed ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth, first ]"); // after we release the snapshot1, only two values left db_->ReleaseSnapshot(snapshot1); FillLevels("a", "z", 1); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); // We have only one valid snapshot snapshot2. Since snapshot1 is // not valid anymore, "first" should be removed by a compaction. ASSERT_EQ("sixth", Get(1, "foo")); ASSERT_EQ("fourth", Get(1, "foo", snapshot2)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth, fourth ]"); // after we release the snapshot2, only one value should be left db_->ReleaseSnapshot(snapshot2); FillLevels("a", "z", 1); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ("sixth", Get(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ sixth ]"); } INSTANTIATE_TEST_CASE_P( DBBasicMultiConfigs, DBBasicMultiConfigs, ::testing::ValuesIn(DBBasicMultiConfigs::GenerateOptionConfigs())); TEST_F(DBBasicTest, DBOpen_Options) { Options options = CurrentOptions(); Close(); Destroy(options); // Does not exist, and create_if_missing == false: error DB* db = nullptr; options.create_if_missing = false; Status s = DB::Open(options, dbname_, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); ASSERT_TRUE(db == nullptr); // Does not exist, and create_if_missing == true: OK options.create_if_missing = true; s = DB::Open(options, dbname_, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); delete db; db = nullptr; // Does exist, and error_if_exists == true: error options.create_if_missing = false; options.error_if_exists = true; s = DB::Open(options, dbname_, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); ASSERT_TRUE(db == nullptr); // Does exist, and error_if_exists == false: OK options.create_if_missing = true; options.error_if_exists = false; s = DB::Open(options, dbname_, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); delete db; db = nullptr; } TEST_F(DBBasicTest, CompactOnFlush) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; do { Options options = CurrentOptions(options_override); options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v1 ]"); // Write two new keys ASSERT_OK(Put(1, "a", "begin")); ASSERT_OK(Put(1, "z", "end")); ASSERT_OK(Flush(1)); // Case1: Delete followed by a put ASSERT_OK(Delete(1, "foo")); ASSERT_OK(Put(1, "foo", "v2")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, DEL, v1 ]"); // After the current memtable is flushed, the DEL should // have been removed ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2, v1 ]"); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v2 ]"); // Case 2: Delete followed by another delete ASSERT_OK(Delete(1, "foo")); ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, DEL, v2 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v2 ]"); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 3: Put followed by a delete ASSERT_OK(Put(1, "foo", "v3")); ASSERT_OK(Delete(1, "foo")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL, v3 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ DEL ]"); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 4: Put followed by another Put ASSERT_OK(Put(1, "foo", "v4")); ASSERT_OK(Put(1, "foo", "v5")); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5, v4 ]"); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v5 ]"); // clear database ASSERT_OK(Delete(1, "foo")); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: Put followed by snapshot followed by another Put // Both puts should remain. ASSERT_OK(Put(1, "foo", "v6")); const Snapshot* snapshot = db_->GetSnapshot(); ASSERT_OK(Put(1, "foo", "v7")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v7, v6 ]"); db_->ReleaseSnapshot(snapshot); // clear database ASSERT_OK(Delete(1, "foo")); ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ ]"); // Case 5: snapshot followed by a put followed by another Put // Only the last put should remain. const Snapshot* snapshot1 = db_->GetSnapshot(); ASSERT_OK(Put(1, "foo", "v8")); ASSERT_OK(Put(1, "foo", "v9")); ASSERT_OK(Flush(1)); ASSERT_EQ(AllEntriesFor("foo", 1), "[ v9 ]"); db_->ReleaseSnapshot(snapshot1); } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, FlushOneColumnFamily) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, options); ASSERT_OK(Put(0, "Default", "Default")); ASSERT_OK(Put(1, "pikachu", "pikachu")); ASSERT_OK(Put(2, "ilya", "ilya")); ASSERT_OK(Put(3, "muromec", "muromec")); ASSERT_OK(Put(4, "dobrynia", "dobrynia")); ASSERT_OK(Put(5, "nikitich", "nikitich")); ASSERT_OK(Put(6, "alyosha", "alyosha")); ASSERT_OK(Put(7, "popovich", "popovich")); for (int i = 0; i < 8; ++i) { ASSERT_OK(Flush(i)); auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), i + 1U); } } TEST_F(DBBasicTest, MultiGetSimple) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); std::vector values(20, "Temporary data to be overwritten"); std::vector cfs(keys.size(), handles_[1]); get_perf_context()->Reset(); std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(values[0], "v1"); ASSERT_EQ(values[1], "v2"); ASSERT_EQ(values[2], "v3"); ASSERT_EQ(values[4], "v5"); // four kv pairs * two bytes per value ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); ASSERT_OK(s[0]); ASSERT_OK(s[1]); ASSERT_OK(s[2]); ASSERT_TRUE(s[3].IsNotFound()); ASSERT_OK(s[4]); ASSERT_TRUE(s[5].IsNotFound()); SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } TEST_F(DBBasicTest, MultiGetEmpty) { do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Empty Key Set std::vector keys; std::vector values; std::vector cfs; std::vector s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(s.size(), 0U); // Empty Database, Empty Key Set Options options = CurrentOptions(); options.create_if_missing = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(s.size(), 0U); // Empty Database, Search for Keys keys.resize(2); keys[0] = "a"; keys[1] = "b"; cfs.push_back(handles_[0]); cfs.push_back(handles_[1]); s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(static_cast(s.size()), 2); ASSERT_TRUE(s[0].IsNotFound() && s[1].IsNotFound()); } while (ChangeCompactOptions()); } class DBBlockChecksumTest : public DBBasicTest, public testing::WithParamInterface {}; INSTANTIATE_TEST_CASE_P(FormatVersions, DBBlockChecksumTest, testing::ValuesIn(test::kFooterFormatVersionsToTest)); TEST_P(DBBlockChecksumTest, BlockChecksumTest) { BlockBasedTableOptions table_options; table_options.format_version = GetParam(); Options options = CurrentOptions(); const int kNumPerFile = 2; const auto algs = GetSupportedChecksums(); const int algs_size = static_cast(algs.size()); // generate one table with each type of checksum for (int i = 0; i < algs_size; ++i) { table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); for (int j = 0; j < kNumPerFile; ++j) { ASSERT_OK(Put(Key(i * kNumPerFile + j), Key(i * kNumPerFile + j))); } ASSERT_OK(Flush()); } // with each valid checksum type setting... for (int i = 0; i < algs_size; ++i) { table_options.checksum = algs[i]; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); // verify every type of checksum (should be regardless of that setting) for (int j = 0; j < algs_size * kNumPerFile; ++j) { ASSERT_EQ(Key(j), Get(Key(j))); } } // Now test invalid checksum type table_options.checksum = static_cast(123); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); } // On Windows you can have either memory mapped file or a file // with unbuffered access. So this asserts and does not make // sense to run #ifndef OS_WIN TEST_F(DBBasicTest, MmapAndBufferOptions) { if (!IsMemoryMappedAccessSupported()) { return; } Options options = CurrentOptions(); options.use_direct_reads = true; options.allow_mmap_reads = true; ASSERT_NOK(TryReopen(options)); // All other combinations are acceptable options.use_direct_reads = false; ASSERT_OK(TryReopen(options)); if (IsDirectIOSupported()) { options.use_direct_reads = true; options.allow_mmap_reads = false; ASSERT_OK(TryReopen(options)); } options.use_direct_reads = false; ASSERT_OK(TryReopen(options)); } #endif class TestEnv : public EnvWrapper { public: explicit TestEnv(Env* base_env) : EnvWrapper(base_env), close_count(0) {} static const char* kClassName() { return "TestEnv"; } const char* Name() const override { return kClassName(); } class TestLogger : public Logger { public: using Logger::Logv; explicit TestLogger(TestEnv* env_ptr) : Logger() { env = env_ptr; } ~TestLogger() override { if (!closed_) { CloseHelper().PermitUncheckedError(); } } void Logv(const char* /*format*/, va_list /*ap*/) override {} protected: Status CloseImpl() override { return CloseHelper(); } private: Status CloseHelper() { env->CloseCountInc(); ; return Status::IOError(); } TestEnv* env; }; void CloseCountInc() { close_count++; } int GetCloseCount() { return close_count; } Status NewLogger(const std::string& /*fname*/, std::shared_ptr* result) override { result->reset(new TestLogger(this)); return Status::OK(); } private: int close_count; }; TEST_F(DBBasicTest, DBClose) { Options options = GetDefaultOptions(); std::string dbname = test::PerThreadDBPath("db_close_test"); ASSERT_OK(DestroyDB(dbname, options)); DB* db = nullptr; TestEnv* env = new TestEnv(env_); std::unique_ptr local_env_guard(env); options.create_if_missing = true; options.env = env; Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); s = db->Close(); ASSERT_EQ(env->GetCloseCount(), 1); ASSERT_EQ(s, Status::IOError()); delete db; ASSERT_EQ(env->GetCloseCount(), 1); // Do not call DB::Close() and ensure our logger Close() still gets called s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); delete db; ASSERT_EQ(env->GetCloseCount(), 2); // close by WaitForCompact() with close_db option options.create_if_missing = false; s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); WaitForCompactOptions wait_for_compact_options = WaitForCompactOptions(); wait_for_compact_options.close_db = true; s = db->WaitForCompact(wait_for_compact_options); ASSERT_EQ(env->GetCloseCount(), 3); // see TestLogger::CloseHelper() ASSERT_EQ(s, Status::IOError()); delete db; ASSERT_EQ(env->GetCloseCount(), 3); // Provide our own logger and ensure DB::Close() does not close it options.info_log.reset(new TestEnv::TestLogger(env)); s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); s = db->Close(); ASSERT_EQ(s, Status::OK()); delete db; ASSERT_EQ(env->GetCloseCount(), 3); options.info_log.reset(); ASSERT_EQ(env->GetCloseCount(), 4); } TEST_F(DBBasicTest, DBCloseAllDirectoryFDs) { Options options = GetDefaultOptions(); std::string dbname = test::PerThreadDBPath("db_close_all_dir_fds_test"); // Configure a specific WAL directory options.wal_dir = dbname + "_wal_dir"; // Configure 3 different data directories options.db_paths.emplace_back(dbname + "_1", 512 * 1024); options.db_paths.emplace_back(dbname + "_2", 4 * 1024 * 1024); options.db_paths.emplace_back(dbname + "_3", 1024 * 1024 * 1024); ASSERT_OK(DestroyDB(dbname, options)); DB* db = nullptr; std::unique_ptr env = NewCompositeEnv( std::make_shared(FileSystem::Default())); options.create_if_missing = true; options.env = env.get(); Status s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); // Explicitly close the database to ensure the open and close counter for // directories are equivalent s = db->Close(); auto* counted_fs = options.env->GetFileSystem()->CheckedCast(); ASSERT_TRUE(counted_fs != nullptr); ASSERT_EQ(counted_fs->counters()->dir_opens, counted_fs->counters()->dir_closes); ASSERT_OK(s); delete db; } TEST_F(DBBasicTest, DBCloseFlushError) { std::unique_ptr fault_injection_env( new FaultInjectionTestEnv(env_)); Options options = GetDefaultOptions(); options.create_if_missing = true; options.manual_wal_flush = true; options.write_buffer_size = 100; options.env = fault_injection_env.get(); Reopen(options); ASSERT_OK(Put("key1", "value1")); ASSERT_OK(Put("key2", "value2")); ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(Put("key3", "value3")); fault_injection_env->SetFilesystemActive(false); Status s = dbfull()->Close(); ASSERT_NE(s, Status::OK()); // retry should return the same error s = dbfull()->Close(); ASSERT_NE(s, Status::OK()); fault_injection_env->SetFilesystemActive(true); // retry close() is no-op even the system is back. Could be improved if // Close() is retry-able: #9029 s = dbfull()->Close(); ASSERT_NE(s, Status::OK()); Destroy(options); } class DBMultiGetTestWithParam : public DBBasicTest, public testing::WithParamInterface> {}; TEST_P(DBMultiGetTestWithParam, MultiGetMultiCF) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, options); // tuples std::vector> cf_kv_vec; static const int num_keys = 24; cf_kv_vec.reserve(num_keys); for (int i = 0; i < num_keys; ++i) { int cf = i / 3; int cf_key = 1 % 3; cf_kv_vec.emplace_back( cf, "cf" + std::to_string(cf) + "_key_" + std::to_string(cf_key), "cf" + std::to_string(cf) + "_val_" + std::to_string(cf_key)); ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), std::get<2>(cf_kv_vec[i]))); } int get_sv_count = 0; ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { // After MultiGet refs a couple of CFs, flush all CFs so MultiGet // is forced to repeat the process for (int i = 0; i < num_keys; ++i) { int cf = i / 3; int cf_key = i % 8; if (cf_key == 0) { ASSERT_OK(Flush(cf)); } ASSERT_OK(Put(std::get<0>(cf_kv_vec[i]), std::get<1>(cf_kv_vec[i]), std::get<2>(cf_kv_vec[i]) + "_2")); } } if (get_sv_count == 11) { for (int i = 0; i < 8; ++i) { auto* cfd = static_cast_with_check( db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_EQ(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); std::vector cfs; std::vector keys; std::vector values; for (int i = 0; i < num_keys; ++i) { cfs.push_back(std::get<0>(cf_kv_vec[i])); keys.push_back(std::get<1>(cf_kv_vec[i])); } values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()), std::get<1>(GetParam())); ASSERT_EQ(values.size(), num_keys); for (unsigned int j = 0; j < values.size(); ++j) { ASSERT_EQ(values[j], std::get<2>(cf_kv_vec[j]) + "_2"); } keys.clear(); cfs.clear(); cfs.push_back(std::get<0>(cf_kv_vec[0])); keys.push_back(std::get<1>(cf_kv_vec[0])); cfs.push_back(std::get<0>(cf_kv_vec[3])); keys.push_back(std::get<1>(cf_kv_vec[3])); cfs.push_back(std::get<0>(cf_kv_vec[4])); keys.push_back(std::get<1>(cf_kv_vec[4])); values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()), std::get<1>(GetParam())); ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[0]) + "_2"); ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[3]) + "_2"); ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[4]) + "_2"); keys.clear(); cfs.clear(); cfs.push_back(std::get<0>(cf_kv_vec[7])); keys.push_back(std::get<1>(cf_kv_vec[7])); cfs.push_back(std::get<0>(cf_kv_vec[6])); keys.push_back(std::get<1>(cf_kv_vec[6])); cfs.push_back(std::get<0>(cf_kv_vec[1])); keys.push_back(std::get<1>(cf_kv_vec[1])); values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()), std::get<1>(GetParam())); ASSERT_EQ(values[0], std::get<2>(cf_kv_vec[7]) + "_2"); ASSERT_EQ(values[1], std::get<2>(cf_kv_vec[6]) + "_2"); ASSERT_EQ(values[2], std::get<2>(cf_kv_vec[1]) + "_2"); for (int cf = 0; cf < 8; ++cf) { auto* cfd = static_cast_with_check( static_cast_with_check(db_)->GetColumnFamilyHandle(cf)) ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVObsolete); } } TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFMutex) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, options); for (int i = 0; i < 8; ++i) { ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", "cf" + std::to_string(i) + "_val")); } int get_sv_count = 0; int retries = 0; bool last_try = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::LastTry", [&](void* /*arg*/) { last_try = true; }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) { if (last_try) { return; } if (++get_sv_count == 2) { ++retries; get_sv_count = 0; for (int i = 0; i < 8; ++i) { ASSERT_OK(Flush(i)); ASSERT_OK(Put( i, "cf" + std::to_string(i) + "_key", "cf" + std::to_string(i) + "_val" + std::to_string(retries))); } } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency({ {"DBImpl::MultiCFSnapshot::AfterLastTryRefSV", "DBMultiGetTestWithParam::MultiGetMultiCFMutex:BeforeCreateSV"}, {"DBMultiGetTestWithParam::MultiGetMultiCFMutex:AfterCreateSV", "DBImpl::MultiCFSnapshot::BeforeLastTryUnRefSV"}, }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); port::Thread create_sv_thread([this]() { TEST_SYNC_POINT( "DBMultiGetTestWithParam::MultiGetMultiCFMutex:BeforeCreateSV"); // Create a new SuperVersion for each column family after last_try // of MultiGet ref SuperVersion and before unref it. for (int i = 0; i < 8; ++i) { ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", "cf" + std::to_string(i) + "_val_after_last_try")); ASSERT_OK(Flush(i)); } TEST_SYNC_POINT( "DBMultiGetTestWithParam::MultiGetMultiCFMutex:AfterCreateSV"); }); std::vector cfs; std::vector keys; std::vector values; for (int i = 0; i < 8; ++i) { cfs.push_back(i); keys.push_back("cf" + std::to_string(i) + "_key"); } values = MultiGet(cfs, keys, nullptr, std::get<0>(GetParam()), std::get<1>(GetParam())); create_sv_thread.join(); ASSERT_TRUE(last_try); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val" + std::to_string(retries)); } for (int i = 0; i < 8; ++i) { auto* cfd = static_cast_with_check( static_cast_with_check(db_)->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFSnapshot) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, options); for (int i = 0; i < 8; ++i) { ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", "cf" + std::to_string(i) + "_val")); } int get_sv_count = 0; ROCKSDB_NAMESPACE::DBImpl* db = static_cast_with_check(db_); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "DBImpl::MultiCFSnapshot::AfterRefSV", [&](void* /*arg*/) { if (++get_sv_count == 2) { for (int i = 0; i < 8; ++i) { ASSERT_OK(Flush(i)); ASSERT_OK(Put(i, "cf" + std::to_string(i) + "_key", "cf" + std::to_string(i) + "_val2")); } } if (get_sv_count == 8) { for (int i = 0; i < 8; ++i) { auto* cfd = static_cast_with_check( db->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_TRUE( (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVInUse) || (cfd->TEST_GetLocalSV()->Get() == SuperVersion::kSVObsolete)); } } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); std::vector cfs; std::vector keys; std::vector values; for (int i = 0; i < 8; ++i) { cfs.push_back(i); keys.push_back("cf" + std::to_string(i) + "_key"); } const Snapshot* snapshot = db_->GetSnapshot(); values = MultiGet(cfs, keys, snapshot, std::get<0>(GetParam()), std::get<1>(GetParam())); db_->ReleaseSnapshot(snapshot); ASSERT_EQ(values.size(), 8); for (unsigned int j = 0; j < values.size(); ++j) { ASSERT_EQ(values[j], "cf" + std::to_string(j) + "_val"); } for (int i = 0; i < 8; ++i) { auto* cfd = static_cast_with_check( static_cast_with_check(db_)->GetColumnFamilyHandle(i)) ->cfd(); ASSERT_NE(cfd->TEST_GetLocalSV()->Get(), SuperVersion::kSVInUse); } } TEST_P(DBMultiGetTestWithParam, MultiGetMultiCFUnsorted) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES Options options = CurrentOptions(); CreateAndReopenWithCF({"one", "two"}, options); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(2, "baz", "xyz")); ASSERT_OK(Put(1, "abc", "def")); // Note: keys for the same CF do not form a consecutive range std::vector cfs{1, 2, 1}; std::vector keys{"foo", "baz", "abc"}; std::vector values; values = MultiGet(cfs, keys, /* snapshot */ nullptr, /* batched */ std::get<0>(GetParam()), /* async */ std::get<1>(GetParam())); ASSERT_EQ(values.size(), 3); ASSERT_EQ(values[0], "bar"); ASSERT_EQ(values[1], "xyz"); ASSERT_EQ(values[2], "def"); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSimpleUnsorted) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); get_perf_context()->Reset(); std::vector keys({"no_key", "k5", "k4", "k3", "k2", "k1"}); std::vector values(keys.size()); std::vector cfs(keys.size(), handles_[1]); std::vector s(keys.size()); ReadOptions ro; ro.async_io = std::get<1>(GetParam()); db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v2"); ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); // four kv pairs * two bytes per value ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); ASSERT_TRUE(s[0].IsNotFound()); ASSERT_OK(s[1]); ASSERT_TRUE(s[2].IsNotFound()); ASSERT_OK(s[3]); ASSERT_OK(s[4]); ASSERT_OK(s[5]); SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedSortedMultiFile) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); // To expand the power of this test, generate > 1 table file and // mix with memtable ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k5", "v5")); ASSERT_OK(Delete(1, "no_key")); get_perf_context()->Reset(); std::vector keys({"k1", "k2", "k3", "k4", "k5", "no_key"}); std::vector values(keys.size()); std::vector cfs(keys.size(), handles_[1]); std::vector s(keys.size()); ReadOptions ro; ro.async_io = std::get<1>(GetParam()); db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), true); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v2"); ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v5"); // four kv pairs * two bytes per value ASSERT_EQ(8, (int)get_perf_context()->multiget_read_bytes); ASSERT_OK(s[0]); ASSERT_OK(s[1]); ASSERT_OK(s[2]); ASSERT_TRUE(s[3].IsNotFound()); ASSERT_OK(s[4]); ASSERT_TRUE(s[5].IsNotFound()); SetPerfLevel(kDisable); } while (ChangeOptions()); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedDuplicateKeys) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } Options opts = CurrentOptions(); opts.merge_operator = MergeOperators::CreateStringAppendOperator(); CreateAndReopenWithCF({"pikachu"}, opts); SetPerfLevel(kEnableCount); // To expand the power of this test, generate > 1 table file and // mix with memtable ASSERT_OK(Merge(1, "k1", "v1")); ASSERT_OK(Merge(1, "k2", "v2")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k3", "v3")); ASSERT_OK(Merge(1, "k4", "v4")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k4", "v4_2")); ASSERT_OK(Merge(1, "k6", "v6")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); ASSERT_OK(Merge(1, "k7", "v7")); ASSERT_OK(Merge(1, "k8", "v8")); ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); get_perf_context()->Reset(); std::vector keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"}); std::vector values(keys.size()); std::vector cfs(keys.size(), handles_[1]); std::vector s(keys.size()); ReadOptions ro; ro.async_io = std::get<1>(GetParam()); db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8"); ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8"); ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2"); ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2"); ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3"); ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes); for (Status& status : s) { ASSERT_OK(status); } SetPerfLevel(kDisable); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevel) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } Options options = CurrentOptions(); options.disable_auto_compactions = true; Reopen(options); int num_keys = 0; for (int i = 0; i < 128; ++i) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); for (int i = 0; i < 128; i += 3) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); for (int i = 0; i < 128; i += 5) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); for (int i = 0; i < 128; i += 9) { ASSERT_OK(Put("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); } std::vector keys; std::vector values; for (int i = 64; i < 80; ++i) { keys.push_back("key_" + std::to_string(i)); } values = MultiGet(keys, nullptr, std::get<1>(GetParam())); ASSERT_EQ(values.size(), 16); for (unsigned int j = 0; j < values.size(); ++j) { int key = j + 64; if (key % 9 == 0) { ASSERT_EQ(values[j], "val_mem_" + std::to_string(key)); } else if (key % 5 == 0) { ASSERT_EQ(values[j], "val_l0_" + std::to_string(key)); } else if (key % 3 == 0) { ASSERT_EQ(values[j], "val_l1_" + std::to_string(key)); } else { ASSERT_EQ(values[j], "val_l2_" + std::to_string(key)); } } } TEST_P(DBMultiGetTestWithParam, MultiGetDuplicatesEmptyLevel) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } std::shared_ptr fault_fs( new FaultInjectionTestFS(env_->GetFileSystem())); std::unique_ptr env(new CompositeEnvWrapper(env_, fault_fs)); Options options = CurrentOptions(); options.env = env.get(); options.disable_auto_compactions = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); LRUCacheOptions cache_opts; cache_opts.capacity = 1 << 20; BlockBasedTableOptions table_opts; table_opts.metadata_cache_options.top_level_index_pinning = PinningTier::kAll; table_opts.metadata_cache_options.partition_pinning = PinningTier::kNone; table_opts.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_opts.cache_index_and_filter_blocks = true; table_opts.block_cache = cache_opts.MakeSharedCache(); table_opts.flush_block_policy_factory.reset(new MyFlushBlockPolicyFactory(1)); options.table_factory.reset(new BlockBasedTableFactory(table_opts)); Reopen(options); int key; // Setup the LSM so that the following search bounds are generated for // key 9 for each level - // Level 1 - lb = 0, rb = max // Level 2 - lb = 0, rb = 0 // Level 3 - lb = 0, rb = -1 // Level 4 - lb = 0, rb = 0 key = 9; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(4); key = 5; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); key = 9; ASSERT_OK( Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); const Snapshot* snap = dbfull()->GetSnapshot(); ASSERT_OK( Merge("key_" + std::to_string(key), "val_l2_ext_" + std::to_string(key))); ASSERT_OK(Flush()); // Leave level 3 empty MoveFilesToLevel(2); key = 2; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); key = 6; ASSERT_OK( Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(1); std::vector keys; std::vector values; keys.push_back("key_" + std::to_string(9)); keys.push_back("key_" + std::to_string(9)); int num_reads = 0; SyncPoint::GetInstance()->SetCallBack( "FaultInjectionTestFS::RandomRead", [&](void*) { ++num_reads; // Fail on the 2nd read. First read is index partition, // second read is data block in level 1 if (num_reads == 2) { fault_fs->SetFilesystemActive(false); } else { fault_fs->SetFilesystemActive(true); } }); SyncPoint::GetInstance()->EnableProcessing(); size_t capacity = table_opts.block_cache->GetCapacity(); table_opts.block_cache->SetCapacity(0); table_opts.block_cache->SetCapacity(capacity); values = MultiGet(keys, nullptr, std::get<1>(GetParam())); ASSERT_EQ(values.size(), 2); SyncPoint::GetInstance()->DisableProcessing(); dbfull()->ReleaseSnapshot(snap); Destroy(options); } TEST_P(DBMultiGetTestWithParam, MultiGetDuplicatesNonEmptyLevel) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } std::shared_ptr fault_fs( new FaultInjectionTestFS(env_->GetFileSystem())); std::unique_ptr env(new CompositeEnvWrapper(env_, fault_fs)); Options options = CurrentOptions(); options.env = env.get(); options.disable_auto_compactions = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); LRUCacheOptions cache_opts; cache_opts.capacity = 1 << 20; BlockBasedTableOptions table_opts; table_opts.metadata_cache_options.top_level_index_pinning = PinningTier::kAll; table_opts.metadata_cache_options.partition_pinning = PinningTier::kNone; table_opts.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_opts.cache_index_and_filter_blocks = true; table_opts.block_cache = cache_opts.MakeSharedCache(); table_opts.flush_block_policy_factory.reset(new MyFlushBlockPolicyFactory(1)); options.table_factory.reset(new BlockBasedTableFactory(table_opts)); Reopen(options); int key; // Setup the LSM so that the following search bounds are generated for // key 9 for each level - // Level 1 - lb = 0, rb = max // Level 2 - lb = 0, rb = 0 // Level 3 - lb = 0, rb = 1 // Level 4 - N/A key = 8; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(4); key = 7; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); key = 9; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(3); key = 5; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); key = 9; ASSERT_OK( Merge("key_" + std::to_string(key), "merge1_l2_" + std::to_string(key))); const Snapshot* snap = dbfull()->GetSnapshot(); ASSERT_OK( Merge("key_" + std::to_string(key), "merge2_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(2); key = 2; ASSERT_OK(Put("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); key = 6; ASSERT_OK( Merge("key_" + std::to_string(key), "val_l2_" + std::to_string(key))); ASSERT_OK(Flush()); MoveFilesToLevel(1); std::vector keys; std::vector values; keys.push_back("key_" + std::to_string(9)); keys.push_back("key_" + std::to_string(9)); int num_reads = 0; SyncPoint::GetInstance()->SetCallBack( "FaultInjectionTestFS::RandomRead", [&](void*) { ++num_reads; // Fail on the 2nd read. First read is index partition, // second read is data block in level 1 if (num_reads == 2) { fault_fs->SetFilesystemActive(false); } else { fault_fs->SetFilesystemActive(true); } }); SyncPoint::GetInstance()->EnableProcessing(); size_t capacity = table_opts.block_cache->GetCapacity(); table_opts.block_cache->SetCapacity(0); table_opts.block_cache->SetCapacity(capacity); values = MultiGet(keys, nullptr, std::get<1>(GetParam())); ASSERT_EQ(values.size(), 2); ASSERT_EQ(values[0], "Corruption: Not active"); ASSERT_EQ(values[1], "val_l2_9,merge1_l2_9,merge2_l2_9"); SyncPoint::GetInstance()->DisableProcessing(); dbfull()->ReleaseSnapshot(snap); Destroy(options); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedMultiLevelMerge) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } Options options = CurrentOptions(); options.disable_auto_compactions = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); int num_keys = 0; for (int i = 0; i < 128; ++i) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); for (int i = 0; i < 128; i += 3) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); for (int i = 0; i < 128; i += 5) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); for (int i = 0; i < 128; i += 9) { ASSERT_OK( Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); } std::vector keys; std::vector values; for (int i = 32; i < 80; ++i) { keys.push_back("key_" + std::to_string(i)); } values = MultiGet(keys, nullptr, std::get<1>(GetParam())); ASSERT_EQ(values.size(), keys.size()); for (unsigned int j = 0; j < 48; ++j) { int key = j + 32; std::string value; value.append("val_l2_" + std::to_string(key)); if (key % 3 == 0) { value.append(","); value.append("val_l1_" + std::to_string(key)); } if (key % 5 == 0) { value.append(","); value.append("val_l0_" + std::to_string(key)); } if (key % 9 == 0) { value.append(","); value.append("val_mem_" + std::to_string(key)); } ASSERT_EQ(values[j], value); } } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeInMemory) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k1", "v_1")); ASSERT_OK(Put(1, "k2", "v_2")); ASSERT_OK(Put(1, "k3", "v_3")); ASSERT_OK(Put(1, "k4", "v_4")); ASSERT_OK(Put(1, "k5", "v_5")); ASSERT_OK(Put(1, "k6", "v_6")); std::vector keys = {"k1", "k2", "k3", "k4", "k5", "k6"}; std::vector values(keys.size()); std::vector s(keys.size()); std::vector cfs(keys.size(), handles_[1]); get_perf_context()->Reset(); ReadOptions ro; ro.value_size_soft_limit = 11; ro.async_io = std::get<1>(GetParam()); db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(values.size(), keys.size()); for (unsigned int i = 0; i < 4; i++) { ASSERT_EQ(std::string(values[i].data(), values[i].size()), "v_" + std::to_string(i + 1)); } for (unsigned int i = 4; i < 6; i++) { ASSERT_TRUE(s[i].IsAborted()); } ASSERT_EQ(12, (int)get_perf_context()->multiget_read_bytes); SetPerfLevel(kDisable); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSize) { #ifndef USE_COROUTINES if (std::get<1>(GetParam())) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { return; } do { CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k6", "v6")); ASSERT_OK(Put(1, "k7", "v7_")); ASSERT_OK(Put(1, "k3", "v3_")); ASSERT_OK(Put(1, "k4", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Put(1, "k11", "v11")); ASSERT_OK(Delete(1, "no_key")); ASSERT_OK(Put(1, "k8", "v8_")); ASSERT_OK(Put(1, "k13", "v13")); ASSERT_OK(Put(1, "k14", "v14")); ASSERT_OK(Put(1, "k15", "v15")); ASSERT_OK(Put(1, "k16", "v16")); ASSERT_OK(Put(1, "k17", "v17")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k1", "v1_")); ASSERT_OK(Put(1, "k2", "v2_")); ASSERT_OK(Put(1, "k5", "v5_")); ASSERT_OK(Put(1, "k9", "v9_")); ASSERT_OK(Put(1, "k10", "v10")); ASSERT_OK(Delete(1, "k2")); ASSERT_OK(Delete(1, "k6")); get_perf_context()->Reset(); std::vector keys({"k1", "k10", "k11", "k12", "k13", "k14", "k15", "k16", "k17", "k2", "k3", "k4", "k5", "k6", "k7", "k8", "k9", "no_key"}); std::vector values(keys.size()); std::vector cfs(keys.size(), handles_[1]); std::vector s(keys.size()); ReadOptions ro; ro.value_size_soft_limit = 20; ro.async_io = std::get<1>(GetParam()); db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(values.size(), keys.size()); // In memory keys ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v1_"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v10"); ASSERT_TRUE(s[9].IsNotFound()); // k2 ASSERT_EQ(std::string(values[12].data(), values[12].size()), "v5_"); ASSERT_TRUE(s[13].IsNotFound()); // k6 ASSERT_EQ(std::string(values[16].data(), values[16].size()), "v9_"); // In sst files ASSERT_EQ(std::string(values[2].data(), values[1].size()), "v11"); ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v13"); ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v14"); // Remaining aborted after value_size exceeds. ASSERT_TRUE(s[3].IsAborted()); ASSERT_TRUE(s[6].IsAborted()); ASSERT_TRUE(s[7].IsAborted()); ASSERT_TRUE(s[8].IsAborted()); ASSERT_TRUE(s[10].IsAborted()); ASSERT_TRUE(s[11].IsAborted()); ASSERT_TRUE(s[14].IsAborted()); ASSERT_TRUE(s[15].IsAborted()); ASSERT_TRUE(s[17].IsAborted()); // 6 kv pairs * 3 bytes per value (i.e. 18) ASSERT_EQ(21, (int)get_perf_context()->multiget_read_bytes); SetPerfLevel(kDisable); } while (ChangeCompactOptions()); } TEST_P(DBMultiGetTestWithParam, MultiGetBatchedValueSizeMultiLevelMerge) { if (std::get<1>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test needs to be fixed for async IO"); return; } // Skip for unbatched MultiGet if (!std::get<0>(GetParam())) { ROCKSDB_GTEST_BYPASS("This test is only for batched MultiGet"); return; } Options options = CurrentOptions(); options.disable_auto_compactions = true; options.merge_operator = MergeOperators::CreateStringAppendOperator(); BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); int num_keys = 0; for (int i = 0; i < 64; ++i) { ASSERT_OK(Put("key_" + std::to_string(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); for (int i = 0; i < 64; i += 3) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(1); for (int i = 0; i < 64; i += 5) { ASSERT_OK(Merge("key_" + std::to_string(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { ASSERT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { ASSERT_OK(Flush()); num_keys = 0; } ASSERT_EQ(0, num_keys); for (int i = 0; i < 64; i += 9) { ASSERT_OK( Merge("key_" + std::to_string(i), "val_mem_" + std::to_string(i))); } std::vector keys_str; for (int i = 10; i < 50; ++i) { keys_str.push_back("key_" + std::to_string(i)); } std::vector keys(keys_str.size()); for (int i = 0; i < 40; i++) { keys[i] = Slice(keys_str[i]); } std::vector values(keys_str.size()); std::vector statuses(keys_str.size()); ReadOptions read_options; read_options.verify_checksums = true; read_options.value_size_soft_limit = 380; read_options.async_io = std::get<1>(GetParam()); db_->MultiGet(read_options, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), keys.size()); for (unsigned int j = 0; j < 26; ++j) { int key = j + 10; std::string value; value.append("val_l2_" + std::to_string(key)); if (key % 3 == 0) { value.append(","); value.append("val_l1_" + std::to_string(key)); } if (key % 5 == 0) { value.append(","); value.append("val_l0_" + std::to_string(key)); } if (key % 9 == 0) { value.append(","); value.append("val_mem_" + std::to_string(key)); } ASSERT_EQ(values[j], value); ASSERT_OK(statuses[j]); } // All remaning keys status is set Status::Abort for (unsigned int j = 26; j < 40; j++) { ASSERT_TRUE(statuses[j].IsAborted()); } } INSTANTIATE_TEST_CASE_P(DBMultiGetTestWithParam, DBMultiGetTestWithParam, testing::Combine(testing::Bool(), testing::Bool())); #if USE_COROUTINES class DBMultiGetAsyncIOTest : public DBBasicTest, public ::testing::WithParamInterface { public: DBMultiGetAsyncIOTest() : DBBasicTest(), statistics_(ROCKSDB_NAMESPACE::CreateDBStatistics()) { BlockBasedTableOptions bbto; bbto.filter_policy.reset(NewBloomFilterPolicy(10)); options_ = CurrentOptions(); options_.disable_auto_compactions = true; options_.statistics = statistics_; options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); options_.env = Env::Default(); Reopen(options_); int num_keys = 0; // Put all keys in the bottommost level, and overwrite some keys // in L0 and L1 for (int i = 0; i < 256; ++i) { EXPECT_OK(Put(Key(i), "val_l2_" + std::to_string(i))); num_keys++; if (num_keys == 8) { EXPECT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { EXPECT_OK(Flush()); num_keys = 0; } MoveFilesToLevel(2); for (int i = 0; i < 128; i += 3) { EXPECT_OK(Put(Key(i), "val_l1_" + std::to_string(i))); num_keys++; if (num_keys == 8) { EXPECT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { EXPECT_OK(Flush()); num_keys = 0; } // Put some range deletes in L1 for (int i = 128; i < 256; i += 32) { std::string range_begin = Key(i); std::string range_end = Key(i + 16); EXPECT_OK(dbfull()->DeleteRange(WriteOptions(), dbfull()->DefaultColumnFamily(), range_begin, range_end)); // Also do some Puts to force creation of bloom filter for (int j = i + 16; j < i + 32; ++j) { if (j % 3 == 0) { EXPECT_OK(Put(Key(j), "val_l1_" + std::to_string(j))); } } EXPECT_OK(Flush()); } MoveFilesToLevel(1); for (int i = 0; i < 128; i += 5) { EXPECT_OK(Put(Key(i), "val_l0_" + std::to_string(i))); num_keys++; if (num_keys == 8) { EXPECT_OK(Flush()); num_keys = 0; } } if (num_keys > 0) { EXPECT_OK(Flush()); num_keys = 0; } EXPECT_EQ(0, num_keys); } const std::shared_ptr& statistics() { return statistics_; } protected: void PrepareDBForTest() { #ifdef ROCKSDB_IOURING_PRESENT Reopen(options_); #else // ROCKSDB_IOURING_PRESENT // Warm up the block cache so we don't need to use the IO uring Iterator* iter = dbfull()->NewIterator(ReadOptions()); for (iter->SeekToFirst(); iter->Valid() && iter->status().ok(); iter->Next()) ; EXPECT_OK(iter->status()); delete iter; #endif // ROCKSDB_IOURING_PRESENT } void ReopenDB() { Reopen(options_); } private: std::shared_ptr statistics_; Options options_; }; TEST_P(DBMultiGetAsyncIOTest, GetFromL0) { // All 3 keys in L0. The L0 files should be read serially. std::vector key_strs{Key(0), Key(40), Key(80)}; std::vector keys{key_strs[0], key_strs[1], key_strs[2]}; std::vector values(key_strs.size()); std::vector statuses(key_strs.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); ASSERT_OK(statuses[0]); ASSERT_OK(statuses[1]); ASSERT_OK(statuses[2]); ASSERT_EQ(values[0], "val_l0_" + std::to_string(0)); ASSERT_EQ(values[1], "val_l0_" + std::to_string(40)); ASSERT_EQ(values[2], "val_l0_" + std::to_string(80)); HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); // With async IO, lookups will happen in parallel for each key #ifdef ROCKSDB_IOURING_PRESENT if (GetParam()) { ASSERT_EQ(multiget_io_batch_size.count, 1); ASSERT_EQ(multiget_io_batch_size.max, 3); ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); } else { // Without Async IO, MultiGet will call MultiRead 3 times, once for each // L0 file ASSERT_EQ(multiget_io_batch_size.count, 3); } #else // ROCKSDB_IOURING_PRESENT ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); #endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL1) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(statuses[2], Status::OK()); ASSERT_EQ(values[0], "val_l1_" + std::to_string(33)); ASSERT_EQ(values[1], "val_l1_" + std::to_string(54)); ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); #ifdef ROCKSDB_IOURING_PRESENT // A batch of 3 async IOs is expected, one for each overlapping file in L1 ASSERT_EQ(multiget_io_batch_size.count, 1); ASSERT_EQ(multiget_io_batch_size.max, 3); ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); #else // ROCKSDB_IOURING_PRESENT ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); #endif // ROCKSDB_IOURING_PRESENT } #ifdef ROCKSDB_IOURING_PRESENT TEST_P(DBMultiGetAsyncIOTest, GetFromL1Error) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); int count = 0; SyncPoint::GetInstance()->SetCallBack( "TableCache::GetTableReader:BeforeOpenFile", [&](void* status) { count++; // Fail the last table reader open, which is the 6th SST file // since 3 overlapping L0 files + 3 L1 files containing the keys if (count == 6) { Status* s = static_cast(status); *s = Status::IOError(); } }); // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache // is allocated with max_open_files - 10 as capacity. So override // max_open_files to 11 so table cache capacity will become 1. This will // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { int* max_open_files = (int*)arg; *max_open_files = 11; }); SyncPoint::GetInstance()->EnableProcessing(); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); SyncPoint::GetInstance()->DisableProcessing(); ASSERT_EQ(values.size(), 3); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(statuses[2], Status::IOError()); HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); // A batch of 3 async IOs is expected, one for each overlapping file in L1 ASSERT_EQ(multiget_io_batch_size.count, 1); ASSERT_EQ(multiget_io_batch_size.max, 2); ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2); } #endif // ROCKSDB_IOURING_PRESENT TEST_P(DBMultiGetAsyncIOTest, LastKeyInFile) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; // 21 is the last key in the first L1 file key_strs.push_back(Key(21)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(statuses[2], Status::OK()); ASSERT_EQ(values[0], "val_l1_" + std::to_string(21)); ASSERT_EQ(values[1], "val_l1_" + std::to_string(54)); ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); #ifdef ROCKSDB_IOURING_PRESENT HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); // Since the first MultiGet key is the last key in a file, the MultiGet is // expected to lookup in that file first, before moving on to other files. // So the first file lookup will issue one async read, and the next lookup // will lookup 2 files in parallel and issue 2 async reads ASSERT_EQ(multiget_io_batch_size.count, 2); ASSERT_EQ(multiget_io_batch_size.max, 2); #endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; // 33 and 102 are in L1, and 56 is in L2 key_strs.push_back(Key(33)); key_strs.push_back(Key(56)); key_strs.push_back(Key(102)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(statuses[2], Status::OK()); ASSERT_EQ(values[0], "val_l1_" + std::to_string(33)); ASSERT_EQ(values[1], "val_l2_" + std::to_string(56)); ASSERT_EQ(values[2], "val_l1_" + std::to_string(102)); #ifdef ROCKSDB_IOURING_PRESENT HistogramData multiget_io_batch_size; statistics()->histogramData(MULTIGET_IO_BATCH_SIZE, &multiget_io_batch_size); // There are 2 keys in L1 in twp separate files, and 1 in L2. With // optimize_multiget_for_io, all three lookups will happen in parallel. // Otherwise, the L2 lookup will happen after L1. ASSERT_EQ(multiget_io_batch_size.count, GetParam() ? 1 : 2); ASSERT_EQ(multiget_io_batch_size.max, GetParam() ? 3 : 2); #endif // ROCKSDB_IOURING_PRESENT } TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeOverlapL0L1) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; // 19 and 26 are in L2, but overlap with L0 and L1 file ranges key_strs.push_back(Key(19)); key_strs.push_back(Key(26)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 2); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(values[0], "val_l2_" + std::to_string(19)); ASSERT_EQ(values[1], "val_l2_" + std::to_string(26)); #ifdef ROCKSDB_IOURING_PRESENT // Bloom filters in L0/L1 will avoid the coroutine calls in those levels ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2); #else // ROCKSDB_IOURING_PRESENT ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); #endif // ROCKSDB_IOURING_PRESENT } #ifdef ROCKSDB_IOURING_PRESENT TEST_P(DBMultiGetAsyncIOTest, GetFromL2WithRangeDelInL1) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; // 139 and 163 are in L2, but overlap with a range deletes in L1 key_strs.push_back(Key(139)); key_strs.push_back(Key(163)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 2); ASSERT_EQ(statuses[0], Status::NotFound()); ASSERT_EQ(statuses[1], Status::NotFound()); // Bloom filters in L0/L1 will avoid the coroutine calls in those levels ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 2); } TEST_P(DBMultiGetAsyncIOTest, GetFromL1AndL2WithRangeDelInL1) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; // 139 and 163 are in L2, but overlap with a range deletes in L1 key_strs.push_back(Key(139)); key_strs.push_back(Key(144)); key_strs.push_back(Key(163)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); PrepareDBForTest(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(statuses[0], Status::NotFound()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(values[1], "val_l1_" + std::to_string(144)); ASSERT_EQ(statuses[2], Status::NotFound()); // Bloom filters in L0/L1 will avoid the coroutine calls in those levels ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 3); } #endif // ROCKSDB_IOURING_PRESENT TEST_P(DBMultiGetAsyncIOTest, GetNoIOUring) { std::vector key_strs; std::vector keys; std::vector values; std::vector statuses; key_strs.push_back(Key(33)); key_strs.push_back(Key(54)); key_strs.push_back(Key(102)); keys.emplace_back(key_strs[0]); keys.emplace_back(key_strs[1]); keys.emplace_back(key_strs[2]); values.resize(keys.size()); statuses.resize(keys.size()); enable_io_uring = false; ReopenDB(); ReadOptions ro; ro.async_io = true; ro.optimize_multiget_for_io = GetParam(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data()); ASSERT_EQ(values.size(), 3); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::OK()); ASSERT_EQ(statuses[2], Status::OK()); HistogramData async_read_bytes; statistics()->histogramData(ASYNC_READ_BYTES, &async_read_bytes); // A batch of 3 async IOs is expected, one for each overlapping file in L1 ASSERT_EQ(async_read_bytes.count, 0); ASSERT_EQ(statistics()->getTickerCount(MULTIGET_COROUTINE_COUNT), 0); } INSTANTIATE_TEST_CASE_P(DBMultiGetAsyncIOTest, DBMultiGetAsyncIOTest, testing::Bool()); #endif // USE_COROUTINES TEST_F(DBBasicTest, MultiGetStats) { Options options; options.create_if_missing = true; options.disable_auto_compactions = true; options.env = env_; options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); BlockBasedTableOptions table_options; table_options.block_size = 1; table_options.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; table_options.partition_filters = true; table_options.no_block_cache = true; table_options.cache_index_and_filter_blocks = false; table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); CreateAndReopenWithCF({"pikachu"}, options); int total_keys = 2000; std::vector keys_str(total_keys); std::vector keys(total_keys); static size_t kMultiGetBatchSize = 100; std::vector values(kMultiGetBatchSize); std::vector s(kMultiGetBatchSize); ReadOptions read_opts; Random rnd(309); // Create Multiple SST files at multiple levels. for (int i = 0; i < 500; ++i) { keys_str[i] = "k" + std::to_string(i); keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { ASSERT_OK(Flush(1)); } } ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 501; i < 1000; ++i) { keys_str[i] = "k" + std::to_string(i); keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { ASSERT_OK(Flush(1)); } } ASSERT_OK(Flush(1)); MoveFilesToLevel(2, 1); for (int i = 1001; i < total_keys; ++i) { keys_str[i] = "k" + std::to_string(i); keys[i] = Slice(keys_str[i]); ASSERT_OK(Put(1, "k" + std::to_string(i), rnd.RandomString(1000))); if (i % 100 == 0) { ASSERT_OK(Flush(1)); } } ASSERT_OK(Flush(1)); MoveFilesToLevel(1, 1); Close(); ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_OK(options.statistics->Reset()); db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[1250], values.data(), s.data(), false); ASSERT_EQ(values.size(), kMultiGetBatchSize); HistogramData hist_level; HistogramData hist_index_and_filter_blocks; HistogramData hist_sst; options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level); options.statistics->histogramData(NUM_INDEX_AND_FILTER_BLOCKS_READ_PER_LEVEL, &hist_index_and_filter_blocks); options.statistics->histogramData(NUM_SST_READ_PER_LEVEL, &hist_sst); // Maximum number of blocks read from a file system in a level. ASSERT_EQ(hist_level.max, 1); ASSERT_GT(hist_index_and_filter_blocks.max, 0); // Maximum number of sst files read from file system in a level. ASSERT_EQ(hist_sst.max, 2); // Minimun number of blocks read in a level. ASSERT_EQ(hist_level.min, 1); ASSERT_GT(hist_index_and_filter_blocks.min, 0); // Minimun number of sst files read in a level. ASSERT_EQ(hist_sst.min, 1); for (PinnableSlice& value : values) { value.Reset(); } for (Status& status : s) { status = Status::OK(); } db_->MultiGet(read_opts, handles_[1], kMultiGetBatchSize, &keys[950], values.data(), s.data(), false); options.statistics->histogramData(NUM_LEVEL_READ_PER_MULTIGET, &hist_level); ASSERT_EQ(hist_level.max, 2); } // Test class for batched MultiGet with prefix extractor // Param bool - If true, use partitioned filters // If false, use full filter block class MultiGetPrefixExtractorTest : public DBBasicTest, public ::testing::WithParamInterface { }; TEST_P(MultiGetPrefixExtractorTest, Batched) { Options options = CurrentOptions(); options.prefix_extractor.reset(NewFixedPrefixTransform(2)); options.memtable_prefix_bloom_size_ratio = 10; BlockBasedTableOptions bbto; if (GetParam()) { bbto.index_type = BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; bbto.partition_filters = true; } bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); bbto.whole_key_filtering = false; bbto.cache_index_and_filter_blocks = false; options.table_factory.reset(NewBlockBasedTableFactory(bbto)); Reopen(options); SetPerfLevel(kEnableCount); get_perf_context()->Reset(); ASSERT_OK(Put("k", "v0")); ASSERT_OK(Put("kk1", "v1")); ASSERT_OK(Put("kk2", "v2")); ASSERT_OK(Put("kk3", "v3")); ASSERT_OK(Put("kk4", "v4")); std::vector keys( {"k", "kk1", "kk2", "kk3", "kk4", "rofl", "lmho"}); std::vector expected( {"v0", "v1", "v2", "v3", "v4", "NOT_FOUND", "NOT_FOUND"}); std::vector values; values = MultiGet(keys, nullptr); ASSERT_EQ(values, expected); // One key ("k") is not queried against the filter because it is outside // the prefix_extractor domain, leaving 6 keys with queried prefixes. ASSERT_EQ(get_perf_context()->bloom_memtable_miss_count, 2); ASSERT_EQ(get_perf_context()->bloom_memtable_hit_count, 4); ASSERT_OK(Flush()); get_perf_context()->Reset(); values = MultiGet(keys, nullptr); ASSERT_EQ(values, expected); ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2); ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4); // Also check Get stat get_perf_context()->Reset(); for (size_t i = 0; i < keys.size(); ++i) { values[i] = Get(keys[i]); } ASSERT_EQ(values, expected); ASSERT_EQ(get_perf_context()->bloom_sst_miss_count, 2); ASSERT_EQ(get_perf_context()->bloom_sst_hit_count, 4); } INSTANTIATE_TEST_CASE_P(MultiGetPrefix, MultiGetPrefixExtractorTest, ::testing::Bool()); class DBMultiGetRowCacheTest : public DBBasicTest, public ::testing::WithParamInterface {}; TEST_P(DBMultiGetRowCacheTest, MultiGetBatched) { do { option_config_ = kRowCache; Options options = CurrentOptions(); options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); SetPerfLevel(kEnableCount); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); ASSERT_OK(Put(1, "k4", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "k5", "v5")); const Snapshot* snap1 = dbfull()->GetSnapshot(); ASSERT_OK(Delete(1, "k4")); ASSERT_OK(Flush(1)); const Snapshot* snap2 = dbfull()->GetSnapshot(); get_perf_context()->Reset(); std::vector keys({"no_key", "k5", "k4", "k3", "k1"}); std::vector values(keys.size()); std::vector cfs(keys.size(), handles_[1]); std::vector s(keys.size()); ReadOptions ro; bool use_snapshots = GetParam(); if (use_snapshots) { ro.snapshot = snap2; } db_->MultiGet(ro, handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(values.size(), keys.size()); ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v1"); ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v3"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); // four kv pairs * two bytes per value ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); ASSERT_TRUE(s[0].IsNotFound()); ASSERT_OK(s[1]); ASSERT_TRUE(s[2].IsNotFound()); ASSERT_OK(s[3]); ASSERT_OK(s[4]); // Call MultiGet() again with some intersection with the previous set of // keys. Those should already be in the row cache. keys.assign({"no_key", "k5", "k3", "k2"}); for (size_t i = 0; i < keys.size(); ++i) { values[i].Reset(); s[i] = Status::OK(); } get_perf_context()->Reset(); if (use_snapshots) { ro.snapshot = snap1; } db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), values.data(), s.data(), false); ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v2"); ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v3"); ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v5"); // four kv pairs * two bytes per value ASSERT_EQ(6, (int)get_perf_context()->multiget_read_bytes); ASSERT_TRUE(s[0].IsNotFound()); ASSERT_OK(s[1]); ASSERT_OK(s[2]); ASSERT_OK(s[3]); if (use_snapshots) { // Only reads from the first SST file would have been cached, since // snapshot seq no is > fd.largest_seqno ASSERT_EQ(1, TestGetTickerCount(options, ROW_CACHE_HIT)); } else { ASSERT_EQ(2, TestGetTickerCount(options, ROW_CACHE_HIT)); } SetPerfLevel(kDisable); dbfull()->ReleaseSnapshot(snap1); dbfull()->ReleaseSnapshot(snap2); } while (ChangeCompactOptions()); } INSTANTIATE_TEST_CASE_P(DBMultiGetRowCacheTest, DBMultiGetRowCacheTest, testing::Values(true, false)); TEST_F(DBBasicTest, GetAllKeyVersions) { Options options = CurrentOptions(); options.env = env_; options.create_if_missing = true; options.disable_auto_compactions = true; CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(2, handles_.size()); const size_t kNumInserts = 4; const size_t kNumDeletes = 4; const size_t kNumUpdates = 4; // Check default column family for (size_t i = 0; i != kNumInserts; ++i) { ASSERT_OK(Put(std::to_string(i), "value")); } for (size_t i = 0; i != kNumUpdates; ++i) { ASSERT_OK(Put(std::to_string(i), "value1")); } for (size_t i = 0; i != kNumDeletes; ++i) { ASSERT_OK(Delete(std::to_string(i))); } std::vector key_versions; ASSERT_OK(GetAllKeyVersions(db_, Slice(), Slice(), std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); for (size_t i = 0; i < kNumInserts + kNumDeletes + kNumUpdates; i++) { if (i % 3 == 0) { ASSERT_EQ(key_versions[i].GetTypeName(), "TypeDeletion"); } else { ASSERT_EQ(key_versions[i].GetTypeName(), "TypeValue"); } } ASSERT_OK(GetAllKeyVersions(db_, handles_[0], Slice(), Slice(), std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates, key_versions.size()); // Check non-default column family for (size_t i = 0; i + 1 != kNumInserts; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value")); } for (size_t i = 0; i + 1 != kNumUpdates; ++i) { ASSERT_OK(Put(1, std::to_string(i), "value1")); } for (size_t i = 0; i + 1 != kNumDeletes; ++i) { ASSERT_OK(Delete(1, std::to_string(i))); } ASSERT_OK(GetAllKeyVersions(db_, handles_[1], Slice(), Slice(), std::numeric_limits::max(), &key_versions)); ASSERT_EQ(kNumInserts + kNumDeletes + kNumUpdates - 3, key_versions.size()); } TEST_F(DBBasicTest, ValueTypeString) { KeyVersion key_version; // when adding new type, please also update `value_type_string_map` for (unsigned char i = ValueType::kTypeDeletion; i < ValueType::kTypeMaxValid; i++) { key_version.type = i; ASSERT_TRUE(key_version.GetTypeName() != "Invalid"); } } TEST_F(DBBasicTest, MultiGetIOBufferOverrun) { Options options = CurrentOptions(); Random rnd(301); BlockBasedTableOptions table_options; table_options.pin_l0_filter_and_index_blocks_in_cache = true; table_options.block_size = 16 * 1024; ASSERT_TRUE(table_options.block_size > BlockBasedTable::kMultiGetReadStackBufSize); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); Reopen(options); std::string zero_str(128, '\0'); for (int i = 0; i < 100; ++i) { // Make the value compressible. A purely random string doesn't compress // and the resultant data block will not be compressed std::string value(rnd.RandomString(128) + zero_str); assert(Put(Key(i), value) == Status::OK()); } ASSERT_OK(Flush()); std::vector key_data(10); std::vector keys; // We cannot resize a PinnableSlice vector, so just set initial size to // largest we think we will need std::vector values(10); std::vector statuses; ReadOptions ro; // Warm up the cache first key_data.emplace_back(Key(0)); keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); } TEST_F(DBBasicTest, IncrementalRecoveryNoCorrupt) { Options options = CurrentOptions(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu", "eevee"}, options); size_t num_cfs = handles_.size(); ASSERT_EQ(3, num_cfs); WriteOptions write_opts; write_opts.disableWAL = true; for (size_t cf = 0; cf != num_cfs; ++cf) { for (size_t i = 0; i != 10000; ++i) { std::string key_str = Key(static_cast(i)); std::string value_str = std::to_string(cf) + "_" + std::to_string(i); ASSERT_OK(Put(static_cast(cf), key_str, value_str)); if (0 == (i % 1000)) { ASSERT_OK(Flush(static_cast(cf))); } } } for (size_t cf = 0; cf != num_cfs; ++cf) { ASSERT_OK(Flush(static_cast(cf))); } Close(); options.best_efforts_recovery = true; ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu", "eevee"}, options); num_cfs = handles_.size(); ASSERT_EQ(3, num_cfs); for (size_t cf = 0; cf != num_cfs; ++cf) { for (int i = 0; i != 10000; ++i) { std::string key_str = Key(static_cast(i)); std::string expected_value_str = std::to_string(cf) + "_" + std::to_string(i); ASSERT_EQ(expected_value_str, Get(static_cast(cf), key_str)); } } } TEST_F(DBBasicTest, BestEffortsRecoveryWithVersionBuildingFailure) { Options options = CurrentOptions(); DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); ASSERT_OK(Flush()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "VersionBuilder::CheckConsistencyBeforeReturn", [&](void* arg) { ASSERT_NE(nullptr, arg); *(static_cast(arg)) = Status::Corruption("Inject corruption"); }); SyncPoint::GetInstance()->EnableProcessing(); options.best_efforts_recovery = true; Status s = TryReopen(options); ASSERT_TRUE(s.IsCorruption()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); } namespace { class TableFileListener : public EventListener { public: void OnTableFileCreated(const TableFileCreationInfo& info) override { InstrumentedMutexLock lock(&mutex_); cf_to_paths_[info.cf_name].push_back(info.file_path); } std::vector& GetFiles(const std::string& cf_name) { InstrumentedMutexLock lock(&mutex_); return cf_to_paths_[cf_name]; } private: InstrumentedMutex mutex_; std::unordered_map> cf_to_paths_; }; class FlushTableFileListener : public EventListener { public: void OnTableFileCreated(const TableFileCreationInfo& info) override { InstrumentedMutexLock lock(&mutex_); if (info.reason != TableFileCreationReason::kFlush) { return; } cf_to_flushed_files_[info.cf_name].push_back(info.file_path); } std::vector& GetFlushedFiles(const std::string& cf_name) { InstrumentedMutexLock lock(&mutex_); return cf_to_flushed_files_[cf_name]; } private: InstrumentedMutex mutex_; std::unordered_map> cf_to_flushed_files_; }; class FlushBlobFileListener : public EventListener { public: void OnBlobFileCreated(const BlobFileCreationInfo& info) override { InstrumentedMutexLock lock(&mutex_); if (info.reason != BlobFileCreationReason::kFlush) { return; } cf_to_flushed_blobs_files_[info.cf_name].push_back(info.file_path); } std::vector& GetFlushedBlobFiles(const std::string& cf_name) { InstrumentedMutexLock lock(&mutex_); return cf_to_flushed_blobs_files_[cf_name]; } private: InstrumentedMutex mutex_; std::unordered_map> cf_to_flushed_blobs_files_; }; } // anonymous namespace TEST_F(DBBasicTest, LastSstFileNotInManifest) { // If the last sst file is not tracked in MANIFEST, // or the VersionEdit for the last sst file is not synced, // on recovery, the last sst file should be deleted, // and new sst files shouldn't reuse its file number. Options options = CurrentOptions(); DestroyAndReopen(options); Close(); // Manually add a sst file. constexpr uint64_t kSstFileNumber = 100; const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber); ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content", /* fname = */ kSstFile, /* should_sync = */ true)); ASSERT_OK(env_->FileExists(kSstFile)); TableFileListener* listener = new TableFileListener(); options.listeners.emplace_back(listener); Reopen(options); // kSstFile should already be deleted. ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound()); ASSERT_OK(Put("k", "v")); ASSERT_OK(Flush()); // New sst file should have file number > kSstFileNumber. std::vector& files = listener->GetFiles(kDefaultColumnFamilyName); ASSERT_EQ(files.size(), 1); const std::string fname = files[0].erase(0, (dbname_ + "/").size()); uint64_t number = 0; FileType type = kTableFile; ASSERT_TRUE(ParseFileName(fname, &number, &type)); ASSERT_EQ(type, kTableFile); ASSERT_GT(number, kSstFileNumber); } TEST_F(DBBasicTest, RecoverWithMissingFiles) { Options options = CurrentOptions(); DestroyAndReopen(options); TableFileListener* listener = new TableFileListener(); // Disable auto compaction to simplify SST file name tracking. options.disable_auto_compactions = true; options.listeners.emplace_back(listener); CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", "eevee"}; size_t num_cfs = handles_.size(); ASSERT_EQ(3, num_cfs); for (size_t cf = 0; cf != num_cfs; ++cf) { ASSERT_OK(Put(static_cast(cf), "a", "0_value")); ASSERT_OK(Flush(static_cast(cf))); ASSERT_OK(Put(static_cast(cf), "b", "0_value")); ASSERT_OK(Flush(static_cast(cf))); ASSERT_OK(Put(static_cast(cf), "c", "0_value")); ASSERT_OK(Flush(static_cast(cf))); } // Delete and corrupt files for (size_t i = 0; i < all_cf_names.size(); ++i) { std::vector& files = listener->GetFiles(all_cf_names[i]); ASSERT_EQ(3, files.size()); std::string corrupted_data; ASSERT_OK(ReadFileToString(env_, files[files.size() - 1], &corrupted_data)); ASSERT_OK(WriteStringToFile( env_, corrupted_data.substr(0, corrupted_data.size() - 2), files[files.size() - 1], /*should_sync=*/true)); for (int j = static_cast(files.size() - 2); j >= static_cast(i); --j) { ASSERT_OK(env_->DeleteFile(files[j])); } } options.best_efforts_recovery = true; ReopenWithColumnFamilies(all_cf_names, options); // Verify data ReadOptions read_opts; read_opts.total_order_seek = true; { std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); iter.reset(db_->NewIterator(read_opts, handles_[1])); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("a", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); iter.reset(db_->NewIterator(read_opts, handles_[2])); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("a", iter->key()); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("b", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } } // Param 0: whether to enable blob DB. // Param 1: when blob DB is enabled, whether to also delete the missing L0 // file's associated blob file. class BestEffortsRecoverIncompleteVersionTest : public DBTestBase, public testing::WithParamInterface> { public: BestEffortsRecoverIncompleteVersionTest() : DBTestBase("best_efforts_recover_incomplete_version_test", /*env_do_fsync=*/false) {} }; TEST_P(BestEffortsRecoverIncompleteVersionTest, Basic) { Options options = CurrentOptions(); options.enable_blob_files = std::get<0>(GetParam()); bool delete_blob_file_too = std::get<1>(GetParam()); DestroyAndReopen(options); FlushTableFileListener* flush_table_listener = new FlushTableFileListener(); FlushBlobFileListener* flush_blob_listener = new FlushBlobFileListener(); // Disable auto compaction to simplify SST file name tracking. options.disable_auto_compactions = true; options.listeners.emplace_back(flush_table_listener); options.listeners.emplace_back(flush_blob_listener); CreateAndReopenWithCF({"pikachu", "eevee"}, options); std::vector all_cf_names = {kDefaultColumnFamilyName, "pikachu", "eevee"}; int num_cfs = static_cast(handles_.size()); ASSERT_EQ(3, num_cfs); std::string start = "a"; Slice start_slice = start; std::string end = "d"; Slice end_slice = end; for (int cf = 0; cf != num_cfs; ++cf) { ASSERT_OK(Put(cf, "a", "a_value")); ASSERT_OK(Flush(cf)); // Compact file to L1 to avoid trivial file move in the next compaction ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], &start_slice, &end_slice)); ASSERT_OK(Put(cf, "a", "a_value_new")); ASSERT_OK(Flush(cf)); ASSERT_OK(Put(cf, "b", "b_value")); ASSERT_OK(Flush(cf)); ASSERT_OK(Put(cf, "f", "f_value")); ASSERT_OK(Flush(cf)); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], &start_slice, &end_slice)); } dbfull()->TEST_DeleteObsoleteFiles(); // Delete the most recent L0 file which is before a compaction. for (int i = 0; i < num_cfs; ++i) { std::vector& files = flush_table_listener->GetFlushedFiles(all_cf_names[i]); ASSERT_EQ(4, files.size()); ASSERT_OK(env_->DeleteFile(files[files.size() - 1])); if (options.enable_blob_files) { std::vector& blob_files = flush_blob_listener->GetFlushedBlobFiles(all_cf_names[i]); ASSERT_EQ(4, blob_files.size()); if (delete_blob_file_too) { ASSERT_OK(env_->DeleteFile(blob_files[files.size() - 1])); } } } options.best_efforts_recovery = true; ReopenWithColumnFamilies(all_cf_names, options); for (int i = 0; i < num_cfs; ++i) { auto cfh = static_cast(handles_[i]); ColumnFamilyData* cfd = cfh->cfd(); VersionStorageInfo* vstorage = cfd->current()->storage_info(); // The L0 file flushed right before the last compaction is missing. ASSERT_EQ(0, vstorage->LevelFiles(0).size()); // Only the output of the last compaction is available. ASSERT_EQ(1, vstorage->LevelFiles(1).size()); } // Verify data ReadOptions read_opts; read_opts.total_order_seek = true; for (int i = 0; i < num_cfs; ++i) { std::unique_ptr iter(db_->NewIterator(read_opts, handles_[i])); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_OK(iter->status()); ASSERT_EQ("a", iter->key()); ASSERT_EQ("a_value_new", iter->value()); iter->Next(); ASSERT_TRUE(iter->Valid()); ASSERT_OK(iter->status()); ASSERT_EQ("b", iter->key()); ASSERT_EQ("b_value", iter->value()); iter->Next(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } // Write more data. for (int cf = 0; cf < num_cfs; ++cf) { ASSERT_OK(Put(cf, "g", "g_value")); ASSERT_OK(Flush(cf)); ASSERT_OK(db_->CompactRange(CompactRangeOptions(), handles_[cf], nullptr, nullptr)); std::string value; ASSERT_OK(db_->Get(ReadOptions(), handles_[cf], "g", &value)); ASSERT_EQ("g_value", value); } } INSTANTIATE_TEST_CASE_P(BestEffortsRecoverIncompleteVersionTest, BestEffortsRecoverIncompleteVersionTest, testing::Values(std::make_tuple(false, false), std::make_tuple(true, false), std::make_tuple(true, true))); TEST_F(DBBasicTest, BestEffortsRecoveryTryMultipleManifests) { Options options = CurrentOptions(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value0")); ASSERT_OK(Flush()); Close(); { // Hack by adding a new MANIFEST with high file number std::string garbage(10, '\0'); ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/MANIFEST-001000", /*should_sync=*/true)); } { // Hack by adding a corrupted SST not referenced by any MANIFEST std::string garbage(10, '\0'); ASSERT_OK(WriteStringToFile(env_, garbage, dbname_ + "/001001.sst", /*should_sync=*/true)); } options.best_efforts_recovery = true; Reopen(options); ASSERT_OK(Put("bar", "value")); } TEST_F(DBBasicTest, RecoverWithNoCurrentFile) { Options options = CurrentOptions(); options.env = env_; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); options.best_efforts_recovery = true; ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ(2, handles_.size()); ASSERT_OK(Put("foo", "value")); ASSERT_OK(Put(1, "bar", "value")); ASSERT_OK(Flush()); ASSERT_OK(Flush(1)); Close(); ASSERT_OK(env_->DeleteFile(CurrentFileName(dbname_))); ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); std::vector cf_names; ASSERT_OK(DB::ListColumnFamilies(DBOptions(options), dbname_, &cf_names)); ASSERT_EQ(2, cf_names.size()); for (const auto& name : cf_names) { ASSERT_TRUE(name == kDefaultColumnFamilyName || name == "pikachu"); } } TEST_F(DBBasicTest, RecoverWithNoManifest) { Options options = CurrentOptions(); options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("foo", "value")); ASSERT_OK(Flush()); Close(); { // Delete all MANIFEST. std::vector files; ASSERT_OK(env_->GetChildren(dbname_, &files)); for (const auto& file : files) { uint64_t number = 0; FileType type = kWalFile; if (ParseFileName(file, &number, &type) && type == kDescriptorFile) { ASSERT_OK(env_->DeleteFile(dbname_ + "/" + file)); } } } options.best_efforts_recovery = true; options.create_if_missing = false; Status s = TryReopen(options); ASSERT_TRUE(s.IsInvalidArgument()); options.create_if_missing = true; Reopen(options); // Since no MANIFEST exists, best-efforts recovery creates a new, empty db. ASSERT_EQ("NOT_FOUND", Get("foo")); } TEST_F(DBBasicTest, SkipWALIfMissingTableFiles) { Options options = CurrentOptions(); DestroyAndReopen(options); TableFileListener* listener = new TableFileListener(); options.listeners.emplace_back(listener); CreateAndReopenWithCF({"pikachu"}, options); std::vector kAllCfNames = {kDefaultColumnFamilyName, "pikachu"}; size_t num_cfs = handles_.size(); ASSERT_EQ(2, num_cfs); for (int cf = 0; cf < static_cast(kAllCfNames.size()); ++cf) { ASSERT_OK(Put(cf, "a", "0_value")); ASSERT_OK(Flush(cf)); ASSERT_OK(Put(cf, "b", "0_value")); } // Delete files for (size_t i = 0; i < kAllCfNames.size(); ++i) { std::vector& files = listener->GetFiles(kAllCfNames[i]); ASSERT_EQ(1, files.size()); for (int j = static_cast(files.size() - 1); j >= static_cast(i); --j) { ASSERT_OK(env_->DeleteFile(files[j])); } } options.best_efforts_recovery = true; ReopenWithColumnFamilies(kAllCfNames, options); // Verify WAL is not applied ReadOptions read_opts; read_opts.total_order_seek = true; std::unique_ptr iter(db_->NewIterator(read_opts, handles_[0])); iter->SeekToFirst(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); iter.reset(db_->NewIterator(read_opts, handles_[1])); iter->SeekToFirst(); ASSERT_TRUE(iter->Valid()); ASSERT_EQ("a", iter->key()); iter->Next(); ASSERT_FALSE(iter->Valid()); ASSERT_OK(iter->status()); } TEST_F(DBBasicTest, DisableTrackWal) { // If WAL tracking was enabled, and then disabled during reopen, // the previously tracked WALs should be removed from MANIFEST. Options options = CurrentOptions(); options.track_and_verify_wals_in_manifest = true; // extremely small write buffer size, // so that new WALs are created more frequently. options.write_buffer_size = 100; options.env = env_; DestroyAndReopen(options); for (int i = 0; i < 100; i++) { ASSERT_OK(Put("foo" + std::to_string(i), "value" + std::to_string(i))); } ASSERT_OK(dbfull()->TEST_SwitchMemtable()); ASSERT_OK(db_->SyncWAL()); // Some WALs are tracked. ASSERT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); Close(); // Disable WAL tracking. options.track_and_verify_wals_in_manifest = false; options.create_if_missing = false; ASSERT_OK(TryReopen(options)); // Previously tracked WALs are cleared. ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); Close(); // Re-enable WAL tracking again. options.track_and_verify_wals_in_manifest = true; options.create_if_missing = false; ASSERT_OK(TryReopen(options)); ASSERT_TRUE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); Close(); } TEST_F(DBBasicTest, ManifestChecksumMismatch) { Options options = CurrentOptions(); DestroyAndReopen(options); ASSERT_OK(Put("bar", "value")); ASSERT_OK(Flush()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "LogWriter::EmitPhysicalRecord:BeforeEncodeChecksum", [&](void* arg) { auto* crc = reinterpret_cast(arg); *crc = *crc + 1; }); SyncPoint::GetInstance()->EnableProcessing(); WriteOptions write_opts; write_opts.disableWAL = true; Status s = db_->Put(write_opts, "foo", "value"); ASSERT_OK(s); ASSERT_OK(Flush()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); ASSERT_OK(Put("foo", "value1")); ASSERT_OK(Flush()); s = TryReopen(options); ASSERT_TRUE(s.IsCorruption()); } TEST_F(DBBasicTest, ConcurrentlyCloseDB) { Options options = CurrentOptions(); DestroyAndReopen(options); std::vector workers; for (int i = 0; i < 10; i++) { workers.emplace_back([&]() { auto s = db_->Close(); ASSERT_OK(s); }); } for (auto& w : workers) { w.join(); } } class DBBasicTestTrackWal : public DBTestBase, public testing::WithParamInterface { public: DBBasicTestTrackWal() : DBTestBase("db_basic_test_track_wal", /*env_do_fsync=*/false) {} int CountWalFiles() { VectorLogPtr log_files; EXPECT_OK(dbfull()->GetSortedWalFiles(log_files)); return static_cast(log_files.size()); }; }; TEST_P(DBBasicTestTrackWal, DoNotTrackObsoleteWal) { // If a WAL becomes obsolete after flushing, but is not deleted from disk yet, // then if SyncWAL is called afterwards, the obsolete WAL should not be // tracked in MANIFEST. Options options = CurrentOptions(); options.create_if_missing = true; options.track_and_verify_wals_in_manifest = true; options.atomic_flush = GetParam(); DestroyAndReopen(options); CreateAndReopenWithCF({"cf"}, options); ASSERT_EQ(handles_.size(), 2); // default, cf // Do not delete WALs. ASSERT_OK(db_->DisableFileDeletions()); constexpr int n = 10; std::vector> wals(n); for (size_t i = 0; i < n; i++) { // Generate a new WAL for each key-value. const int cf = i % 2; ASSERT_OK(db_->GetCurrentWalFile(&wals[i])); ASSERT_OK(Put(cf, "k" + std::to_string(i), "v" + std::to_string(i))); ASSERT_OK(Flush({0, 1})); } ASSERT_EQ(CountWalFiles(), n); // Since all WALs are obsolete, no WAL should be tracked in MANIFEST. ASSERT_OK(db_->SyncWAL()); // Manually delete all WALs. Close(); for (const auto& wal : wals) { ASSERT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); } // If SyncWAL tracks the obsolete WALs in MANIFEST, // reopen will fail because the WALs are missing from disk. ASSERT_OK(TryReopenWithColumnFamilies({"default", "cf"}, options)); Destroy(options); } INSTANTIATE_TEST_CASE_P(DBBasicTestTrackWal, DBBasicTestTrackWal, testing::Bool()); class DBBasicTestMultiGet : public DBTestBase { public: DBBasicTestMultiGet(std::string test_dir, int num_cfs, bool uncompressed_cache, bool _compression_enabled, bool _fill_cache, uint32_t compression_parallel_threads) : DBTestBase(test_dir, /*env_do_fsync=*/false) { compression_enabled_ = _compression_enabled; fill_cache_ = _fill_cache; if (uncompressed_cache) { std::shared_ptr cache = NewLRUCache(1048576); uncompressed_cache_ = std::make_shared(cache); } env_->count_random_reads_ = true; Options options = CurrentOptions(); Random rnd(301); BlockBasedTableOptions table_options; if (compression_enabled_) { std::vector compression_types; compression_types = GetSupportedCompressions(); // Not every platform may have compression libraries available, so // dynamically pick based on what's available CompressionType tmp_type = kNoCompression; for (auto c_type : compression_types) { if (c_type != kNoCompression) { tmp_type = c_type; break; } } if (tmp_type != kNoCompression) { options.compression = tmp_type; } else { compression_enabled_ = false; } } table_options.block_cache = uncompressed_cache_; if (table_options.block_cache == nullptr) { table_options.no_block_cache = true; } else { table_options.pin_l0_filter_and_index_blocks_in_cache = true; } table_options.flush_block_policy_factory.reset( new MyFlushBlockPolicyFactory(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); if (!compression_enabled_) { options.compression = kNoCompression; } else { options.compression_opts.parallel_threads = compression_parallel_threads; } options_ = options; Reopen(options); if (num_cfs > 1) { for (int cf = 0; cf < num_cfs; ++cf) { cf_names_.emplace_back("cf" + std::to_string(cf)); } CreateColumnFamilies(cf_names_, options); cf_names_.emplace_back("default"); } std::string zero_str(128, '\0'); for (int cf = 0; cf < num_cfs; ++cf) { for (int i = 0; i < 100; ++i) { // Make the value compressible. A purely random string doesn't compress // and the resultant data block will not be compressed values_.emplace_back(rnd.RandomString(128) + zero_str); assert(((num_cfs == 1) ? Put(Key(i), values_[i]) : Put(cf, Key(i), values_[i])) == Status::OK()); } if (num_cfs == 1) { EXPECT_OK(Flush()); } else { EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } for (int i = 0; i < 100; ++i) { // block cannot gain space by compression uncompressable_values_.emplace_back(rnd.RandomString(256) + '\0'); std::string tmp_key = "a" + Key(i); assert(((num_cfs == 1) ? Put(tmp_key, uncompressable_values_[i]) : Put(cf, tmp_key, uncompressable_values_[i])) == Status::OK()); } if (num_cfs == 1) { EXPECT_OK(Flush()); } else { EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } } // Clear compressed cache, which is always pre-populated if (compressed_cache_) { compressed_cache_->SetCapacity(0); compressed_cache_->SetCapacity(1048576); } } bool CheckValue(int i, const std::string& value) { if (values_[i].compare(value) == 0) { return true; } return false; } bool CheckUncompressableValue(int i, const std::string& value) { if (uncompressable_values_[i].compare(value) == 0) { return true; } return false; } const std::vector& GetCFNames() const { return cf_names_; } int num_lookups() { return uncompressed_cache_->num_lookups(); } int num_found() { return uncompressed_cache_->num_found(); } int num_inserts() { return uncompressed_cache_->num_inserts(); } int num_lookups_compressed() { return compressed_cache_->num_lookups(); } int num_found_compressed() { return compressed_cache_->num_found(); } int num_inserts_compressed() { return compressed_cache_->num_inserts(); } bool fill_cache() { return fill_cache_; } bool compression_enabled() { return compression_enabled_; } bool has_compressed_cache() { return compressed_cache_ != nullptr; } bool has_uncompressed_cache() { return uncompressed_cache_ != nullptr; } Options get_options() { return options_; } static void SetUpTestCase() {} static void TearDownTestCase() {} protected: class MyBlockCache : public CacheWrapper { public: explicit MyBlockCache(std::shared_ptr target) : CacheWrapper(target), num_lookups_(0), num_found_(0), num_inserts_(0) {} const char* Name() const override { return "MyBlockCache"; } Status Insert(const Slice& key, Cache::ObjectPtr value, const CacheItemHelper* helper, size_t charge, Handle** handle = nullptr, Priority priority = Priority::LOW, const Slice& compressed = Slice(), CompressionType type = kNoCompression) override { num_inserts_++; return target_->Insert(key, value, helper, charge, handle, priority, compressed, type); } Handle* Lookup(const Slice& key, const CacheItemHelper* helper, CreateContext* create_context, Priority priority = Priority::LOW, Statistics* stats = nullptr) override { num_lookups_++; Handle* handle = target_->Lookup(key, helper, create_context, priority, stats); if (handle != nullptr) { num_found_++; } return handle; } int num_lookups() { return num_lookups_; } int num_found() { return num_found_; } int num_inserts() { return num_inserts_; } private: int num_lookups_; int num_found_; int num_inserts_; }; std::shared_ptr compressed_cache_; std::shared_ptr uncompressed_cache_; Options options_; bool compression_enabled_; std::vector values_; std::vector uncompressable_values_; bool fill_cache_; std::vector cf_names_; }; class DBBasicTestWithParallelIO : public DBBasicTestMultiGet, public testing::WithParamInterface< std::tuple> { public: DBBasicTestWithParallelIO() : DBBasicTestMultiGet("/db_basic_test_with_parallel_io", 1, std::get<0>(GetParam()), std::get<1>(GetParam()), std::get<2>(GetParam()), std::get<3>(GetParam())) {} }; TEST_P(DBBasicTestWithParallelIO, MultiGet) { std::vector key_data(10); std::vector keys; // We cannot resize a PinnableSlice vector, so just set initial size to // largest we think we will need std::vector values(10); std::vector statuses; ReadOptions ro; ro.fill_cache = fill_cache(); // Warm up the cache first key_data.emplace_back(Key(0)); keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); ASSERT_TRUE(CheckValue(50, values[1].ToString())); int random_reads = env_->random_read_counter_.Read(); key_data[0] = Key(1); key_data[1] = Key(51); keys[0] = Slice(key_data[0]); keys[1] = Slice(key_data[1]); values[0].Reset(); values[1].Reset(); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(1, values[0].ToString())); ASSERT_TRUE(CheckValue(51, values[1].ToString())); bool read_from_cache = false; if (fill_cache()) { if (has_uncompressed_cache()) { read_from_cache = true; } else if (has_compressed_cache() && compression_enabled()) { read_from_cache = true; } } int expected_reads = random_reads + (read_from_cache ? 0 : 2); ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); keys.resize(10); statuses.resize(10); std::vector key_ints{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; for (size_t i = 0; i < key_ints.size(); ++i) { key_data[i] = Key(key_ints[i]); keys[i] = Slice(key_data[i]); statuses[i] = Status::OK(); values[i].Reset(); } dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); for (size_t i = 0; i < key_ints.size(); ++i) { ASSERT_OK(statuses[i]); ASSERT_TRUE(CheckValue(key_ints[i], values[i].ToString())); } if (compression_enabled() && !has_compressed_cache()) { expected_reads += (read_from_cache ? 2 : 3); } else { expected_reads += (read_from_cache ? 2 : 4); } ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); keys.resize(10); statuses.resize(10); std::vector key_uncmp{1, 2, 15, 16, 55, 81, 82, 83, 84, 85}; for (size_t i = 0; i < key_uncmp.size(); ++i) { key_data[i] = "a" + Key(key_uncmp[i]); keys[i] = Slice(key_data[i]); statuses[i] = Status::OK(); values[i].Reset(); } dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); for (size_t i = 0; i < key_uncmp.size(); ++i) { ASSERT_OK(statuses[i]); ASSERT_TRUE(CheckUncompressableValue(key_uncmp[i], values[i].ToString())); } if (compression_enabled() && !has_compressed_cache()) { expected_reads += (read_from_cache ? 3 : 3); } else { expected_reads += (read_from_cache ? 4 : 4); } ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); keys.resize(5); statuses.resize(5); std::vector key_tr{1, 2, 15, 16, 55}; for (size_t i = 0; i < key_tr.size(); ++i) { key_data[i] = "a" + Key(key_tr[i]); keys[i] = Slice(key_data[i]); statuses[i] = Status::OK(); values[i].Reset(); } dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); for (size_t i = 0; i < key_tr.size(); ++i) { ASSERT_OK(statuses[i]); ASSERT_TRUE(CheckUncompressableValue(key_tr[i], values[i].ToString())); } if (compression_enabled() && !has_compressed_cache()) { expected_reads += (read_from_cache ? 0 : 2); ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); } else { if (has_uncompressed_cache()) { expected_reads += (read_from_cache ? 0 : 3); ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); } else { // A rare case, even we enable the block compression but some of data // blocks are not compressed due to content. If user only enable the // compressed cache, the uncompressed blocks will not tbe cached, and // block reads will be triggered. The number of reads is related to // the compression algorithm. ASSERT_TRUE(env_->random_read_counter_.Read() >= expected_reads); } } } TEST_P(DBBasicTestWithParallelIO, MultiGetDirectIO) { class FakeDirectIOEnv : public EnvWrapper { class FakeDirectIOSequentialFile; class FakeDirectIORandomAccessFile; public: FakeDirectIOEnv(Env* env) : EnvWrapper(env) {} static const char* kClassName() { return "FakeDirectIOEnv"; } const char* Name() const override { return kClassName(); } Status NewRandomAccessFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { std::unique_ptr file; assert(options.use_direct_reads); EnvOptions opts = options; opts.use_direct_reads = false; Status s = target()->NewRandomAccessFile(fname, &file, opts); if (!s.ok()) { return s; } result->reset(new FakeDirectIORandomAccessFile(std::move(file))); return s; } private: class FakeDirectIOSequentialFile : public SequentialFileWrapper { public: FakeDirectIOSequentialFile(std::unique_ptr&& file) : SequentialFileWrapper(file.get()), file_(std::move(file)) {} ~FakeDirectIOSequentialFile() {} bool use_direct_io() const override { return true; } size_t GetRequiredBufferAlignment() const override { return 1; } private: std::unique_ptr file_; }; class FakeDirectIORandomAccessFile : public RandomAccessFileWrapper { public: FakeDirectIORandomAccessFile(std::unique_ptr&& file) : RandomAccessFileWrapper(file.get()), file_(std::move(file)) {} ~FakeDirectIORandomAccessFile() {} bool use_direct_io() const override { return true; } size_t GetRequiredBufferAlignment() const override { return 1; } private: std::unique_ptr file_; }; }; std::unique_ptr env(new FakeDirectIOEnv(env_)); Options opts = get_options(); opts.env = env.get(); opts.use_direct_reads = true; Reopen(opts); std::vector key_data(10); std::vector keys; // We cannot resize a PinnableSlice vector, so just set initial size to // largest we think we will need std::vector values(10); std::vector statuses; ReadOptions ro; ro.fill_cache = fill_cache(); // Warm up the cache first key_data.emplace_back(Key(0)); keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); ASSERT_TRUE(CheckValue(50, values[1].ToString())); int random_reads = env_->random_read_counter_.Read(); key_data[0] = Key(1); key_data[1] = Key(51); keys[0] = Slice(key_data[0]); keys[1] = Slice(key_data[1]); values[0].Reset(); values[1].Reset(); if (uncompressed_cache_) { uncompressed_cache_->SetCapacity(0); uncompressed_cache_->SetCapacity(1048576); } dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(1, values[0].ToString())); ASSERT_TRUE(CheckValue(51, values[1].ToString())); bool read_from_cache = false; if (fill_cache()) { if (has_uncompressed_cache()) { read_from_cache = true; } else if (has_compressed_cache() && compression_enabled()) { read_from_cache = true; } } int expected_reads = random_reads; if (!compression_enabled() || !has_compressed_cache()) { expected_reads += 2; } else { expected_reads += (read_from_cache ? 0 : 2); } if (env_->random_read_counter_.Read() != expected_reads) { ASSERT_EQ(env_->random_read_counter_.Read(), expected_reads); } Close(); } TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { std::vector key_data(10); std::vector keys; // We cannot resize a PinnableSlice vector, so just set initial size to // largest we think we will need std::vector values(10); std::vector statuses; int read_count = 0; ReadOptions ro; ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) { Status* s = static_cast(status); read_count++; if (read_count == 2) { *s = Status::Corruption(); } }); SyncPoint::GetInstance()->EnableProcessing(); // Warm up the cache first key_data.emplace_back(Key(0)); keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_TRUE(CheckValue(0, values[0].ToString())); // ASSERT_TRUE(CheckValue(50, values[1].ToString())); ASSERT_EQ(statuses[0], Status::OK()); ASSERT_EQ(statuses[1], Status::Corruption()); SyncPoint::GetInstance()->DisableProcessing(); } TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { std::vector key_data(10); std::vector keys; // We cannot resize a PinnableSlice vector, so just set initial size to // largest we think we will need std::vector values(10); std::vector statuses; ReadOptions ro; ro.fill_cache = fill_cache(); SyncPoint::GetInstance()->SetCallBack( "TableCache::MultiGet:FindTable", [&](void* status) { Status* s = static_cast(status); *s = Status::IOError(); }); // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache // is allocated with max_open_files - 10 as capacity. So override // max_open_files to 11 so table cache capacity will become 1. This will // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { int* max_open_files = (int*)arg; *max_open_files = 11; }); SyncPoint::GetInstance()->EnableProcessing(); Reopen(CurrentOptions()); // Warm up the cache first key_data.emplace_back(Key(0)); keys.emplace_back(key_data.back()); key_data.emplace_back(Key(50)); keys.emplace_back(key_data.back()); statuses.resize(keys.size()); dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), keys.data(), values.data(), statuses.data(), true); ASSERT_EQ(statuses[0], Status::IOError()); ASSERT_EQ(statuses[1], Status::IOError()); SyncPoint::GetInstance()->DisableProcessing(); } INSTANTIATE_TEST_CASE_P(ParallelIO, DBBasicTestWithParallelIO, // Params are as follows - // Param 0 - Uncompressed cache enabled // Param 1 - Data compression enabled // Param 2 - ReadOptions::fill_cache // Param 3 - CompressionOptions::parallel_threads ::testing::Combine(::testing::Bool(), ::testing::Bool(), ::testing::Bool(), ::testing::Values(1, 4))); // Forward declaration class DeadlineFS; class DeadlineRandomAccessFile : public FSRandomAccessFileOwnerWrapper { public: DeadlineRandomAccessFile(DeadlineFS& fs, std::unique_ptr& file) : FSRandomAccessFileOwnerWrapper(std::move(file)), fs_(fs) {} IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, Slice* result, char* scratch, IODebugContext* dbg) const override; IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) override; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) override; private: DeadlineFS& fs_; std::unique_ptr file_; }; class DeadlineFS : public FileSystemWrapper { public: // The error_on_delay parameter specifies whether a IOStatus::TimedOut() // status should be returned after delaying the IO to exceed the timeout, // or to simply delay but return success anyway. The latter mimics the // behavior of PosixFileSystem, which does not enforce any timeout explicit DeadlineFS(SpecialEnv* env, bool error_on_delay) : FileSystemWrapper(env->GetFileSystem()), deadline_(std::chrono::microseconds::zero()), io_timeout_(std::chrono::microseconds::zero()), env_(env), timedout_(false), ignore_deadline_(false), error_on_delay_(error_on_delay) {} static const char* kClassName() { return "DeadlineFileSystem"; } const char* Name() const override { return kClassName(); } IOStatus NewRandomAccessFile(const std::string& fname, const FileOptions& opts, std::unique_ptr* result, IODebugContext* dbg) override { std::unique_ptr file; IOStatus s = target()->NewRandomAccessFile(fname, opts, &file, dbg); EXPECT_OK(s); result->reset(new DeadlineRandomAccessFile(*this, file)); const std::chrono::microseconds deadline = GetDeadline(); const std::chrono::microseconds io_timeout = GetIOTimeout(); if (deadline.count() || io_timeout.count()) { AssertDeadline(deadline, io_timeout, opts.io_options); } return ShouldDelay(opts.io_options); } // Set a vector of {IO counter, delay in microseconds, return status} tuples // that control when to inject a delay and duration of the delay void SetDelayTrigger(const std::chrono::microseconds deadline, const std::chrono::microseconds io_timeout, const int trigger) { delay_trigger_ = trigger; io_count_ = 0; deadline_ = deadline; io_timeout_ = io_timeout; timedout_ = false; } // Increment the IO counter and return a delay in microseconds IOStatus ShouldDelay(const IOOptions& opts) { if (timedout_) { return IOStatus::TimedOut(); } else if (!deadline_.count() && !io_timeout_.count()) { return IOStatus::OK(); } if (!ignore_deadline_ && delay_trigger_ == io_count_++) { env_->SleepForMicroseconds(static_cast(opts.timeout.count() + 1)); timedout_ = true; if (error_on_delay_) { return IOStatus::TimedOut(); } } return IOStatus::OK(); } const std::chrono::microseconds GetDeadline() { return ignore_deadline_ ? std::chrono::microseconds::zero() : deadline_; } const std::chrono::microseconds GetIOTimeout() { return ignore_deadline_ ? std::chrono::microseconds::zero() : io_timeout_; } bool TimedOut() { return timedout_; } void IgnoreDeadline(bool ignore) { ignore_deadline_ = ignore; } void AssertDeadline(const std::chrono::microseconds deadline, const std::chrono::microseconds io_timeout, const IOOptions& opts) const { // Give a leeway of +- 10us as it can take some time for the Get/ // MultiGet call to reach here, in order to avoid false alarms std::chrono::microseconds now = std::chrono::microseconds(env_->NowMicros()); std::chrono::microseconds timeout; if (deadline.count()) { timeout = deadline - now; if (io_timeout.count()) { timeout = std::min(timeout, io_timeout); } } else { timeout = io_timeout; } if (opts.timeout != timeout) { ASSERT_EQ(timeout, opts.timeout); } } private: // The number of IOs to trigger the delay after int delay_trigger_; // Current IO count int io_count_; // ReadOptions deadline for the Get/MultiGet/Iterator std::chrono::microseconds deadline_; // ReadOptions io_timeout for the Get/MultiGet/Iterator std::chrono::microseconds io_timeout_; SpecialEnv* env_; // Flag to indicate whether we injected a delay bool timedout_; // Temporarily ignore deadlines/timeouts bool ignore_deadline_; // Return IOStatus::TimedOut() or IOStatus::OK() bool error_on_delay_; }; IOStatus DeadlineRandomAccessFile::Read(uint64_t offset, size_t len, const IOOptions& opts, Slice* result, char* scratch, IODebugContext* dbg) const { const std::chrono::microseconds deadline = fs_.GetDeadline(); const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); IOStatus s; if (deadline.count() || io_timeout.count()) { fs_.AssertDeadline(deadline, io_timeout, opts); } if (s.ok()) { s = FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch, dbg); } if (s.ok()) { s = fs_.ShouldDelay(opts); } return s; } IOStatus DeadlineRandomAccessFile::ReadAsync( FSReadRequest& req, const IOOptions& opts, std::function cb, void* cb_arg, void** io_handle, IOHandleDeleter* del_fn, IODebugContext* dbg) { const std::chrono::microseconds deadline = fs_.GetDeadline(); const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); IOStatus s; if (deadline.count() || io_timeout.count()) { fs_.AssertDeadline(deadline, io_timeout, opts); } if (s.ok()) { s = FSRandomAccessFileWrapper::ReadAsync(req, opts, cb, cb_arg, io_handle, del_fn, dbg); } if (s.ok()) { s = fs_.ShouldDelay(opts); } return s; } IOStatus DeadlineRandomAccessFile::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { const std::chrono::microseconds deadline = fs_.GetDeadline(); const std::chrono::microseconds io_timeout = fs_.GetIOTimeout(); IOStatus s; if (deadline.count() || io_timeout.count()) { fs_.AssertDeadline(deadline, io_timeout, options); } if (s.ok()) { s = FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg); } if (s.ok()) { s = fs_.ShouldDelay(options); } return s; } // A test class for intercepting random reads and injecting artificial // delays. Used for testing the MultiGet deadline feature class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet, public testing::WithParamInterface { public: DBBasicTestMultiGetDeadline() : DBBasicTestMultiGet( "db_basic_test_multiget_deadline" /*Test dir*/, 10 /*# of column families*/, true /*uncompressed cache enabled*/, true /*compression enabled*/, true /*ReadOptions.fill_cache*/, 1 /*# of parallel compression threads*/) {} inline void CheckStatus(std::vector& statuses, size_t num_ok) { for (size_t i = 0; i < statuses.size(); ++i) { if (i < num_ok) { EXPECT_OK(statuses[i]); } else { if (statuses[i] != Status::TimedOut()) { EXPECT_EQ(statuses[i], Status::TimedOut()); } } } } }; TEST_P(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { #ifndef USE_COROUTINES if (GetParam()) { ROCKSDB_GTEST_SKIP("This test requires coroutine support"); return; } #endif // USE_COROUTINES std::shared_ptr fs = std::make_shared(env_, false); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); Options options = CurrentOptions(); std::shared_ptr cache = NewLRUCache(1048576); BlockBasedTableOptions table_options; table_options.block_cache = cache; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.env = env.get(); SetTimeElapseOnlySleepOnReopen(&options); ReopenWithColumnFamilies(GetCFNames(), options); // Test batched MultiGet with an IO delay in the first data block read. // Both keys in the first CF should succeed as they're in the same data // block and would form one batch, and we check for deadline between // batches. std::vector key_str; size_t i; for (i = 0; i < 10; ++i) { key_str.emplace_back(Key(static_cast(i))); } std::vector cfs(key_str.size()); std::vector keys(key_str.size()); std::vector pin_values(keys.size()); for (i = 0; i < key_str.size(); ++i) { // 2 keys per CF cfs[i] = handles_[i / 2]; keys[i] = Slice(key_str[i].data(), key_str[i].size()); } ReadOptions ro; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.async_io = GetParam(); // Delay the first IO fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 0); std::vector statuses(key_str.size()); dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), pin_values.data(), statuses.data()); // The first key is successful because we check after the lookup, but // subsequent keys fail due to deadline exceeded CheckStatus(statuses, 2); // Similar to the previous one, but an IO delay in the third CF data block // read for (PinnableSlice& value : pin_values) { value.Reset(); } cache->SetCapacity(0); cache->SetCapacity(1048576); statuses.clear(); statuses.resize(keys.size()); ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 2); dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), pin_values.data(), statuses.data()); CheckStatus(statuses, 6); // Similar to the previous one, but an IO delay in the last but one CF for (PinnableSlice& value : pin_values) { value.Reset(); } cache->SetCapacity(0); cache->SetCapacity(1048576); statuses.clear(); statuses.resize(keys.size()); ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 3); dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), pin_values.data(), statuses.data()); CheckStatus(statuses, 8); // Test batched MultiGet with single CF and lots of keys. Inject delay // into the second batch of keys. As each batch is 32, the first 64 keys, // i.e first two batches, should succeed and the rest should time out for (PinnableSlice& value : pin_values) { value.Reset(); } cache->SetCapacity(0); cache->SetCapacity(1048576); key_str.clear(); for (i = 0; i < 100; ++i) { key_str.emplace_back(Key(static_cast(i))); } keys.resize(key_str.size()); pin_values.clear(); pin_values.resize(key_str.size()); for (i = 0; i < key_str.size(); ++i) { keys[i] = Slice(key_str[i].data(), key_str[i].size()); } statuses.clear(); statuses.resize(keys.size()); ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; fs->SetDelayTrigger(ro.deadline, ro.io_timeout, 1); dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(), pin_values.data(), statuses.data()); CheckStatus(statuses, 64); Close(); } INSTANTIATE_TEST_CASE_P(DeadlineIO, DBBasicTestMultiGetDeadline, ::testing::Bool()); TEST_F(DBBasicTest, ManifestWriteFailure) { Options options = GetDefaultOptions(); options.create_if_missing = true; options.disable_auto_compactions = true; options.env = env_; options.enable_blob_files = true; options.blob_file_size = 0; DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_OK(Flush()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "VersionSet::ProcessManifestWrites:AfterSyncManifest", [&](void* arg) { ASSERT_NE(nullptr, arg); auto* s = static_cast(arg); ASSERT_OK(*s); // Manually overwrite return status *s = Status::IOError(); }); SyncPoint::GetInstance()->EnableProcessing(); ASSERT_OK(Put("key", "value")); ASSERT_NOK(Flush()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->EnableProcessing(); Reopen(options); // The IO error was a mocked one from the `AfterSyncManifest` callback. The // Flush's VersionEdit actually made it into the Manifest. So these keys can // be read back. Read them to check all live sst files and blob files. ASSERT_EQ("bar", Get("foo")); ASSERT_EQ("value", Get("key")); } TEST_F(DBBasicTest, DestroyDefaultCfHandle) { Options options = GetDefaultOptions(); options.create_if_missing = true; DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); for (const auto* h : handles_) { ASSERT_NE(db_->DefaultColumnFamily(), h); } // We have two handles to the default column family. The two handles point to // different ColumnFamilyHandle objects. assert(db_->DefaultColumnFamily()); ASSERT_EQ(0U, db_->DefaultColumnFamily()->GetID()); assert(handles_[0]); ASSERT_EQ(0U, handles_[0]->GetID()); // You can destroy handles_[...]. for (auto* h : handles_) { ASSERT_OK(db_->DestroyColumnFamilyHandle(h)); } handles_.clear(); // But you should not destroy db_->DefaultColumnFamily(), since it's going to // be deleted in `DBImpl::CloseHelper()`. Before that, it may be used // elsewhere internally too. ColumnFamilyHandle* default_cf = db_->DefaultColumnFamily(); ASSERT_TRUE(db_->DestroyColumnFamilyHandle(default_cf).IsInvalidArgument()); } TEST_F(DBBasicTest, FailOpenIfLoggerCreationFail) { Options options = GetDefaultOptions(); options.create_if_missing = true; SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); SyncPoint::GetInstance()->SetCallBack( "rocksdb::CreateLoggerFromOptions:AfterGetPath", [&](void* arg) { auto* s = static_cast(arg); assert(s); *s = Status::IOError("Injected"); }); SyncPoint::GetInstance()->EnableProcessing(); Status s = TryReopen(options); ASSERT_EQ(nullptr, options.info_log); ASSERT_TRUE(s.IsIOError()); SyncPoint::GetInstance()->DisableProcessing(); SyncPoint::GetInstance()->ClearAllCallBacks(); } TEST_F(DBBasicTest, VerifyFileChecksums) { Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = env_; DestroyAndReopen(options); ASSERT_OK(Put("a", "value")); ASSERT_OK(Flush()); ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); Reopen(options); ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); // Write an L0 with checksum computed. ASSERT_OK(Put("b", "value")); ASSERT_OK(Flush()); ASSERT_OK(db_->VerifyFileChecksums(ReadOptions())); // Does the right thing but with the wrong name -- using it should lead to an // error. class MisnamedFileChecksumGenerator : public FileChecksumGenCrc32c { public: MisnamedFileChecksumGenerator(const FileChecksumGenContext& context) : FileChecksumGenCrc32c(context) {} const char* Name() const override { return "sha1"; } }; class MisnamedFileChecksumGenFactory : public FileChecksumGenCrc32cFactory { public: std::unique_ptr CreateFileChecksumGenerator( const FileChecksumGenContext& context) override { return std::unique_ptr( new MisnamedFileChecksumGenerator(context)); } }; options.file_checksum_gen_factory.reset(new MisnamedFileChecksumGenFactory()); Reopen(options); ASSERT_TRUE(db_->VerifyFileChecksums(ReadOptions()).IsInvalidArgument()); } TEST_F(DBBasicTest, VerifyFileChecksumsReadahead) { Options options = GetDefaultOptions(); options.create_if_missing = true; options.env = env_; options.file_checksum_gen_factory = GetFileChecksumGenCrc32cFactory(); DestroyAndReopen(options); Random rnd(301); int alignment = 256 * 1024; for (int i = 0; i < 16; ++i) { ASSERT_OK(Put("key" + std::to_string(i), rnd.RandomString(alignment))); } ASSERT_OK(Flush()); std::vector filenames; int sst_cnt = 0; std::string sst_name; uint64_t sst_size; uint64_t number; FileType type; ASSERT_OK(env_->GetChildren(dbname_, &filenames)); for (const auto& name : filenames) { if (ParseFileName(name, &number, &type)) { if (type == kTableFile) { sst_cnt++; sst_name = name; } } } ASSERT_EQ(sst_cnt, 1); ASSERT_OK(env_->GetFileSize(dbname_ + '/' + sst_name, &sst_size)); bool last_read = false; ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( "GenerateOneFileChecksum::Chunk:0", [&](void* /*arg*/) { if (env_->random_read_bytes_counter_.load() == sst_size) { EXPECT_FALSE(last_read); last_read = true; } else { ASSERT_EQ(env_->random_read_bytes_counter_.load() & (alignment - 1), 0); } }); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); env_->count_random_reads_ = true; env_->random_read_bytes_counter_ = 0; env_->random_read_counter_.Reset(); ReadOptions ro; ro.readahead_size = alignment; ASSERT_OK(db_->VerifyFileChecksums(ro)); ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); ASSERT_TRUE(last_read); ASSERT_EQ(env_->random_read_counter_.Read(), (sst_size + alignment - 1) / (alignment)); } // TODO: re-enable after we provide finer-grained control for WAL tracking to // meet the needs of different use cases, durability levels and recovery modes. TEST_F(DBBasicTest, DISABLED_ManualWalSync) { Options options = CurrentOptions(); options.track_and_verify_wals_in_manifest = true; options.wal_recovery_mode = WALRecoveryMode::kAbsoluteConsistency; DestroyAndReopen(options); ASSERT_OK(Put("x", "y")); // This does not create a new WAL. ASSERT_OK(db_->SyncWAL()); EXPECT_FALSE(dbfull()->GetVersionSet()->GetWalSet().GetWals().empty()); std::unique_ptr wal; Status s = db_->GetCurrentWalFile(&wal); ASSERT_OK(s); Close(); EXPECT_OK(env_->DeleteFile(LogFileName(dbname_, wal->LogNumber()))); ASSERT_TRUE(TryReopen(options).IsCorruption()); } // A test class for intercepting random reads and injecting artificial // delays. Used for testing the deadline/timeout feature class DBBasicTestDeadline : public DBBasicTest, public testing::WithParamInterface> {}; TEST_P(DBBasicTestDeadline, PointLookupDeadline) { std::shared_ptr fs = std::make_shared(env_, true); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); bool set_deadline = std::get<0>(GetParam()); bool set_timeout = std::get<1>(GetParam()); for (int option_config = kDefault; option_config < kEnd; ++option_config) { if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { continue; } option_config_ = option_config; Options options = CurrentOptions(); if (options.use_direct_reads) { continue; } options.env = env.get(); options.disable_auto_compactions = true; Cache* block_cache = nullptr; // Fileter block reads currently don't cause the request to get // aborted on a read timeout, so its possible those block reads // may get issued even if the deadline is past SyncPoint::GetInstance()->SetCallBack( "BlockBasedTable::Get:BeforeFilterMatch", [&](void* /*arg*/) { fs->IgnoreDeadline(true); }); SyncPoint::GetInstance()->SetCallBack( "BlockBasedTable::Get:AfterFilterMatch", [&](void* /*arg*/) { fs->IgnoreDeadline(false); }); // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache // is allocated with max_open_files - 10 as capacity. So override // max_open_files to 11 so table cache capacity will become 1. This will // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { int* max_open_files = (int*)arg; *max_open_files = 11; }); SyncPoint::GetInstance()->EnableProcessing(); SetTimeElapseOnlySleepOnReopen(&options); Reopen(options); if (options.table_factory) { block_cache = options.table_factory->GetOptions( TableFactory::kBlockCacheOpts()); } Random rnd(301); for (int i = 0; i < 400; ++i) { std::string key = "k" + std::to_string(i); ASSERT_OK(Put(key, rnd.RandomString(100))); } ASSERT_OK(Flush()); bool timedout = true; // A timeout will be forced when the IO counter reaches this value int io_deadline_trigger = 0; // Keep incrementing io_deadline_trigger and call Get() until there is an // iteration that doesn't cause a timeout. This ensures that we cover // all file reads in the point lookup path that can potentially timeout // and cause the Get() to fail. while (timedout) { ReadOptions ro; if (set_deadline) { ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; } if (set_timeout) { ro.io_timeout = std::chrono::microseconds{5000}; } fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); block_cache->SetCapacity(0); block_cache->SetCapacity(1048576); std::string value; Status s = dbfull()->Get(ro, "k50", &value); if (fs->TimedOut()) { ASSERT_EQ(s, Status::TimedOut()); } else { timedout = false; ASSERT_OK(s); } io_deadline_trigger++; } // Reset the delay sequence in order to avoid false alarms during Reopen fs->SetDelayTrigger(std::chrono::microseconds::zero(), std::chrono::microseconds::zero(), 0); } Close(); } TEST_P(DBBasicTestDeadline, IteratorDeadline) { std::shared_ptr fs = std::make_shared(env_, true); std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); bool set_deadline = std::get<0>(GetParam()); bool set_timeout = std::get<1>(GetParam()); for (int option_config = kDefault; option_config < kEnd; ++option_config) { if (ShouldSkipOptions(option_config, kSkipPlainTable | kSkipMmapReads)) { continue; } Options options = CurrentOptions(); if (options.use_direct_reads) { continue; } options.env = env.get(); options.disable_auto_compactions = true; Cache* block_cache = nullptr; // DB open will create table readers unless we reduce the table cache // capacity. // SanitizeOptions will set max_open_files to minimum of 20. Table cache // is allocated with max_open_files - 10 as capacity. So override // max_open_files to 11 so table cache capacity will become 1. This will // prevent file open during DB open and force the file to be opened // during MultiGet SyncPoint::GetInstance()->SetCallBack( "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { int* max_open_files = (int*)arg; *max_open_files = 11; }); SyncPoint::GetInstance()->EnableProcessing(); SetTimeElapseOnlySleepOnReopen(&options); Reopen(options); if (options.table_factory) { block_cache = options.table_factory->GetOptions( TableFactory::kBlockCacheOpts()); } Random rnd(301); for (int i = 0; i < 400; ++i) { std::string key = "k" + std::to_string(i); ASSERT_OK(Put(key, rnd.RandomString(100))); } ASSERT_OK(Flush()); bool timedout = true; // A timeout will be forced when the IO counter reaches this value int io_deadline_trigger = 0; // Keep incrementing io_deadline_trigger and call Get() until there is an // iteration that doesn't cause a timeout. This ensures that we cover // all file reads in the point lookup path that can potentially timeout while (timedout) { ReadOptions ro; if (set_deadline) { ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; } if (set_timeout) { ro.io_timeout = std::chrono::microseconds{5000}; } fs->SetDelayTrigger(ro.deadline, ro.io_timeout, io_deadline_trigger); block_cache->SetCapacity(0); block_cache->SetCapacity(1048576); Iterator* iter = dbfull()->NewIterator(ro); int count = 0; iter->Seek("k50"); while (iter->Valid() && count++ < 100) { iter->Next(); } if (fs->TimedOut()) { ASSERT_FALSE(iter->Valid()); ASSERT_EQ(iter->status(), Status::TimedOut()); } else { timedout = false; ASSERT_OK(iter->status()); } delete iter; io_deadline_trigger++; } // Reset the delay sequence in order to avoid false alarms during Reopen fs->SetDelayTrigger(std::chrono::microseconds::zero(), std::chrono::microseconds::zero(), 0); } Close(); } // Param 0: If true, set read_options.deadline // Param 1: If true, set read_options.io_timeout INSTANTIATE_TEST_CASE_P(DBBasicTestDeadline, DBBasicTestDeadline, ::testing::Values(std::make_tuple(true, false), std::make_tuple(false, true), std::make_tuple(true, true))); } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); }