// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).

#pragma once

#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <type_traits>
#include <vector>

#include "db/dbformat.h"
#include "db/post_memtable_callback.h"
#include "db/pre_release_callback.h"
#include "db/write_callback.h"
#include "monitoring/instrumented_mutex.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"
#include "rocksdb/types.h"
#include "rocksdb/write_batch.h"
#include "util/autovector.h"

namespace ROCKSDB_NAMESPACE {

class WriteThread {
 public:
  enum State : uint8_t {
    // The initial state of a writer.  This is a Writer that is
    // waiting in JoinBatchGroup.  This state can be left when another
    // thread informs the waiter that it has become a group leader
    // (-> STATE_GROUP_LEADER), when a leader that has chosen to be
    // non-parallel informs a follower that its writes have been committed
    // (-> STATE_COMPLETED), or when a leader that has chosen to perform
    // updates in parallel needs this Writer to apply its batch
    // (-> STATE_PARALLEL_MEMTABLE_WRITER).
    STATE_INIT = 1,

    // The state used to inform a waiting Writer that it has become the
    // leader, and it should now build a write batch group.  Tricky:
    // this state is not used if newest_writer_ is empty when a writer
    // enqueues itself, because there is no need to wait (or even to
    // create the mutex and condvar used to wait) in that case.  This is
    // a terminal state unless the leader chooses to make this a parallel
    // batch, in which case the last parallel worker to finish will move
    // the leader to STATE_COMPLETED.
    STATE_GROUP_LEADER = 2,

    // The state used to inform a waiting writer that it has become the
    // leader of a memtable writer group. The leader will either write
    // the memtable for the whole group, or launch a parallel group write
    // to the memtable by calling LaunchParallelMemTableWriters.
    STATE_MEMTABLE_WRITER_LEADER = 4,

    // The state used to inform a waiting writer that it has become a
    // parallel memtable writer. It can be the group leader who launched the
    // parallel writer group, or one of the followers. The writer should then
    // apply its batch to the memtable concurrently and call
    // CompleteParallelMemTableWriter.
    STATE_PARALLEL_MEMTABLE_WRITER = 8,

    // A follower whose writes have been applied, or a parallel leader
    // whose followers have all finished their work.  This is a terminal
    // state.
    STATE_COMPLETED = 16,

    // A state indicating that the thread may be waiting using StateMutex()
    // and StateCV().
    STATE_LOCKED_WAITING = 32,
  };

  struct Writer;

  struct WriteGroup {
    Writer* leader = nullptr;
    Writer* last_writer = nullptr;
    SequenceNumber last_sequence;
    // before running goes to zero, status needs leader->StateMutex()
    Status status;
    std::atomic<size_t> running;
    size_t size = 0;

    struct Iterator {
      Writer* writer;
      Writer* last_writer;

      explicit Iterator(Writer* w, Writer* last)
          : writer(w), last_writer(last) {}

      Writer* operator*() const { return writer; }

      Iterator& operator++() {
        assert(writer != nullptr);
        if (writer == last_writer) {
          writer = nullptr;
        } else {
          writer = writer->link_newer;
        }
        return *this;
      }

      bool operator!=(const Iterator& other) const {
        return writer != other.writer;
      }
    };

    Iterator begin() const { return Iterator(leader, last_writer); }
    Iterator end() const { return Iterator(nullptr, nullptr); }
  };
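  // Illustrative sketch (hypothetical caller code, not part of this header):
  // once EnterAsBatchGroupLeader has populated a WriteGroup, the begin()/end()
  // pair above lets the leader walk the group from leader to last_writer with
  // a range-based for loop:
  //
  //   for (WriteThread::Writer* member : write_group) {
  //     if (member->ShouldWriteToMemtable()) {
  //       // apply member->batch to the memtable
  //     }
  //   }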
  // Information kept for every waiting writer.
  struct Writer {
    WriteBatch* batch;
    bool sync;
    bool no_slowdown;
    bool disable_wal;
    Env::IOPriority rate_limiter_priority;
    bool disable_memtable;
    size_t batch_cnt;  // if non-zero, number of sub-batches in the write batch
    size_t protection_bytes_per_key;
    PreReleaseCallback* pre_release_callback;
    PostMemTableCallback* post_memtable_callback;
    uint64_t log_used;  // log number that this batch was inserted into
    uint64_t log_ref;   // log number that memtable insert should reference
    WriteCallback* callback;
    bool made_waitable;          // records lazy construction of mutex and cv
    std::atomic<uint8_t> state;  // write under StateMutex() or pre-link
    WriteGroup* write_group;
    SequenceNumber sequence;  // the sequence number to use for the first key
    Status status;
    Status callback_status;  // status returned by callback->Callback()

    std::aligned_storage<sizeof(std::mutex)>::type state_mutex_bytes;
    std::aligned_storage<sizeof(std::condition_variable)>::type state_cv_bytes;

    Writer* link_older;  // read/write only before linking, or as leader
    Writer* link_newer;  // lazy, read/write only before linking, or as leader

    Writer()
        : batch(nullptr),
          sync(false),
          no_slowdown(false),
          disable_wal(false),
          rate_limiter_priority(Env::IOPriority::IO_TOTAL),
          disable_memtable(false),
          batch_cnt(0),
          protection_bytes_per_key(0),
          pre_release_callback(nullptr),
          post_memtable_callback(nullptr),
          log_used(0),
          log_ref(0),
          callback(nullptr),
          made_waitable(false),
          state(STATE_INIT),
          write_group(nullptr),
          sequence(kMaxSequenceNumber),
          link_older(nullptr),
          link_newer(nullptr) {}

    Writer(const WriteOptions& write_options, WriteBatch* _batch,
           WriteCallback* _callback, uint64_t _log_ref, bool _disable_memtable,
           size_t _batch_cnt = 0,
           PreReleaseCallback* _pre_release_callback = nullptr,
           PostMemTableCallback* _post_memtable_callback = nullptr)
        : batch(_batch),
          // TODO: store a copy of WriteOptions instead of its separated data
          // members
          sync(write_options.sync),
          no_slowdown(write_options.no_slowdown),
          disable_wal(write_options.disableWAL),
          rate_limiter_priority(write_options.rate_limiter_priority),
          disable_memtable(_disable_memtable),
          batch_cnt(_batch_cnt),
          protection_bytes_per_key(_batch->GetProtectionBytesPerKey()),
          pre_release_callback(_pre_release_callback),
          post_memtable_callback(_post_memtable_callback),
          log_used(0),
          log_ref(_log_ref),
          callback(_callback),
          made_waitable(false),
          state(STATE_INIT),
          write_group(nullptr),
          sequence(kMaxSequenceNumber),
          link_older(nullptr),
          link_newer(nullptr) {}

    ~Writer() {
      if (made_waitable) {
        StateMutex().~mutex();
        StateCV().~condition_variable();
      }
      status.PermitUncheckedError();
      callback_status.PermitUncheckedError();
    }

    bool CheckCallback(DB* db) {
      if (callback != nullptr) {
        callback_status = callback->Callback(db);
      }
      return callback_status.ok();
    }

    void CreateMutex() {
      if (!made_waitable) {
        // Note that made_waitable is tracked separately from state
        // transitions, because we can't atomically create the mutex and
        // link into the list.
        made_waitable = true;
        new (&state_mutex_bytes) std::mutex;
        new (&state_cv_bytes) std::condition_variable;
      }
    }
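    // Illustrative sketch (hypothetical caller code, not part of this
    // header): a write path typically stack-allocates a Writer around its
    // batch before handing it to WriteThread::JoinBatchGroup:
    //
    //   WriteBatch batch;
    //   batch.Put("key", "value");  // status ignored for brevity
    //   WriteThread::Writer w(write_options, &batch, /*_callback=*/nullptr,
    //                         /*_log_ref=*/0, /*_disable_memtable=*/false);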
    // Returns the aggregate status of this Writer.
    Status FinalStatus() {
      if (!status.ok()) {
        // a non-ok memtable write status takes precedence
        assert(callback == nullptr || callback_status.ok());
        return status;
      } else if (!callback_status.ok()) {
        // if the callback failed then that is the status we want
        // because a memtable insert should not have been attempted
        assert(callback != nullptr);
        assert(status.ok());
        return callback_status;
      } else {
        // if there is no callback then we only care about
        // the memtable insert status
        assert(callback == nullptr || callback_status.ok());
        return status;
      }
    }

    bool CallbackFailed() {
      return (callback != nullptr) && !callback_status.ok();
    }

    bool ShouldWriteToMemtable() {
      return status.ok() && !CallbackFailed() && !disable_memtable;
    }

    bool ShouldWriteToWAL() {
      return status.ok() && !CallbackFailed() && !disable_wal;
    }

    // No other mutexes may be acquired while holding StateMutex(); it is
    // always last in the lock order.
    std::mutex& StateMutex() {
      assert(made_waitable);
      return *static_cast<std::mutex*>(static_cast<void*>(&state_mutex_bytes));
    }

    std::condition_variable& StateCV() {
      assert(made_waitable);
      return *static_cast<std::condition_variable*>(
          static_cast<void*>(&state_cv_bytes));
    }
  };

  struct AdaptationContext {
    const char* name;
    std::atomic<int32_t> value;

    explicit AdaptationContext(const char* name0) : name(name0), value(0) {}
  };

  explicit WriteThread(const ImmutableDBOptions& db_options);

  virtual ~WriteThread() = default;

  // IMPORTANT: None of the methods in this class rely on the db mutex
  // for correctness. All of the methods except JoinBatchGroup and
  // EnterUnbatched may be called either with or without the db mutex held.
  // Correctness is maintained by ensuring that only a single thread is
  // a leader at a time.

  // Registers w as ready to become part of a batch group, waits until the
  // caller should perform some work, and returns the current state of the
  // writer.  If w has become the leader of a write batch group, returns
  // STATE_GROUP_LEADER.  If w has been made part of a sequential batch
  // group and the leader has performed the write, returns STATE_COMPLETED.
  // If w has been made part of a parallel batch group and is responsible
  // for updating the memtable, returns STATE_PARALLEL_MEMTABLE_WRITER.
  //
  // The db mutex SHOULD NOT be held when calling this function, because
  // it will block.
  //
  // Writer* w: Writer to be executed as part of a batch group
  void JoinBatchGroup(Writer* w);

  // Constructs a write batch group led by leader, which should be a
  // Writer passed to JoinBatchGroup on the current thread.
  //
  // Writer* leader: Writer that is STATE_GROUP_LEADER
  // WriteGroup* write_group: Out-param of group members
  // returns: Total batch group byte size
  size_t EnterAsBatchGroupLeader(Writer* leader, WriteGroup* write_group);

  // Unlinks the Writer-s in a batch group, wakes up the non-leaders,
  // and wakes up the next leader (if any).
  //
  // WriteGroup& write_group: the write group
  // Status& status: Status of the write operation
  void ExitAsBatchGroupLeader(WriteGroup& write_group, Status& status);

  // Exits the batch group on behalf of the batch group leader.
  void ExitAsBatchGroupFollower(Writer* w);
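  // Illustrative call sequence (hypothetical, loosely modeled on
  // DBImpl::WriteImpl; not part of this header) for the non-pipelined,
  // non-parallel path, assuming `write_thread` is a WriteThread and `w` is
  // a Writer in STATE_INIT:
  //
  //   write_thread.JoinBatchGroup(&w);
  //   if (w.state == WriteThread::STATE_GROUP_LEADER) {
  //     WriteThread::WriteGroup group;
  //     size_t group_bytes = write_thread.EnterAsBatchGroupLeader(&w, &group);
  //     Status s;  // write the WAL and memtable for the whole group here
  //     write_thread.ExitAsBatchGroupLeader(group, s);
  //   }
  //   // Non-leaders return from JoinBatchGroup only once their write has
  //   // been carried out (STATE_COMPLETED) or they have been asked to do
  //   // their own memtable insert (STATE_PARALLEL_MEMTABLE_WRITER).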
  // Constructs a write batch group led by leader from the
  // newest_memtable_writers_ list. The leader should either write the
  // memtable for the whole group and call ExitAsMemTableWriter, or launch
  // a parallel memtable write through LaunchParallelMemTableWriters.
  void EnterAsMemTableWriter(Writer* leader, WriteGroup* write_group);

  // The memtable writer group leader, or the last finished writer in a
  // parallel write group, exits from the newest_memtable_writers_ list,
  // and wakes up the next leader if needed.
  void ExitAsMemTableWriter(Writer* self, WriteGroup& write_group);

  // Causes JoinBatchGroup to return STATE_PARALLEL_MEMTABLE_WRITER for all of
  // the non-leader members of this write batch group.  Sets Writer::sequence
  // before waking them up.
  //
  // WriteGroup* write_group: Extra state used to coordinate the parallel add
  void LaunchParallelMemTableWriters(WriteGroup* write_group);

  // Reports the completion of w's batch to the parallel group leader, and
  // waits for the rest of the parallel batch to complete.  Returns true
  // if this thread is the last to complete, and hence should advance
  // the sequence number and then exit on behalf of the whole group
  // (ExitAsBatchGroupFollower, or ExitAsMemTableWriter in the pipelined
  // case); false if someone else has already taken responsibility for that.
  bool CompleteParallelMemTableWriter(Writer* w);

  // Waits for all preceding writers (unlocking mu while waiting), then
  // registers w as the currently proceeding writer.
  //
  // Writer* w: A Writer not eligible for batching
  // InstrumentedMutex* mu: The db mutex, to unlock while waiting
  // REQUIRES: db mutex held
  void EnterUnbatched(Writer* w, InstrumentedMutex* mu);

  // Completes a Writer begun with EnterUnbatched, unblocking subsequent
  // writers.
  void ExitUnbatched(Writer* w);

  // Waits for all parallel memtable writers to finish, in case pipelined
  // write is enabled.
  void WaitForMemTableWriters();

  SequenceNumber UpdateLastSequence(SequenceNumber sequence) {
    if (sequence > last_sequence_) {
      last_sequence_ = sequence;
    }
    return last_sequence_;
  }

  // Inserts a dummy writer at the tail of the write queue to indicate a
  // write stall, and fails any writers in the queue with no_slowdown set
  // to true.
  // REQUIRES: db mutex held, no other stall on this queue outstanding
  void BeginWriteStall();

  // Removes the dummy writer and wakes up waiting writers.
  // REQUIRES: db mutex held
  void EndWriteStall();

  // Returns the number of BeginWriteStall() calls begun so far, or 0 if
  // there is no active stall in the write queue.
  // REQUIRES: db mutex held
  uint64_t GetBegunCountOfOutstandingStall();

  // Waits for the number of completed EndWriteStall() calls to reach
  // >= `stall_count`, which will generally have come from
  // GetBegunCountOfOutstandingStall().
  // (Does not require db mutex held)
  void WaitForStallEndedCount(uint64_t stall_count);
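  // Illustrative sketch (hypothetical caller code, not part of this header)
  // of waiting out a stall without holding the db mutex across the wait,
  // using the two calls above:
  //
  //   uint64_t stall_count = write_thread.GetBegunCountOfOutstandingStall();
  //   if (stall_count > 0) {
  //     db_mutex->Unlock();
  //     write_thread.WaitForStallEndedCount(stall_count);
  //     db_mutex->Lock();
  //   }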
 private:
  // See AwaitState.
  const uint64_t max_yield_usec_;
  const uint64_t slow_yield_usec_;

  // Allow multiple writers to write to the memtable concurrently.
  const bool allow_concurrent_memtable_write_;

  // Enable pipelined write to WAL and memtable.
  const bool enable_pipelined_write_;

  // The maximum limit of number of bytes that are written in a single batch
  // of WAL or memtable write. It is followed when the leader write size
  // is larger than 1/8 of this limit.
  const uint64_t max_write_batch_group_size_bytes;

  // Points to the newest pending writer. Only the leader can remove
  // elements; adding can be done lock-free by anybody.
  std::atomic<Writer*> newest_writer_;

  // Points to the newest pending memtable writer. Used only when pipelined
  // write is enabled.
  std::atomic<Writer*> newest_memtable_writer_;

  // The last sequence that has been consumed by a writer. The sequence
  // is not necessarily visible to reads because the writer can be ongoing.
  SequenceNumber last_sequence_;

  // A dummy writer to indicate a write stall condition. This will be
  // inserted at the tail of the writer queue by the leader, so newer
  // writers can just check for this and bail.
  Writer write_stall_dummy_;

  // Mutex and condvar for writers to block on a write stall. During a write
  // stall, writers with no_slowdown set to false will wait on this rather
  // than on the writer queue.
  port::Mutex stall_mu_;
  port::CondVar stall_cv_;

  // Count of the number of stalls begun, so that we can check whether
  // a particular stall has cleared (even if caught in another stall).
  // Controlled by DB mutex.
  // Because of the contract on BeginWriteStall() / EndWriteStall(),
  // stall_ended_count_ <= stall_begun_count_ <= stall_ended_count_ + 1.
  uint64_t stall_begun_count_ = 0;

  // Count of the number of stalls ended, so that we can check whether
  // a particular stall has cleared (even if caught in another stall).
  // Writes controlled by DB mutex + stall_mu_, signalled by stall_cv_.
  // Read with stall_mu_ or DB mutex.
  uint64_t stall_ended_count_ = 0;

  // Waits for w->state & goal_mask using w->StateMutex().  Returns
  // the state that satisfies goal_mask.
  uint8_t BlockingAwaitState(Writer* w, uint8_t goal_mask);

  // Blocks until w->state & goal_mask, returning the state value
  // that satisfied the predicate.  Uses ctx to adaptively use
  // std::this_thread::yield() to avoid mutex overheads.  ctx should be
  // a context-dependent static.
  uint8_t AwaitState(Writer* w, uint8_t goal_mask, AdaptationContext* ctx);

  // Sets the writer's state and wakes the writer up if it is waiting.
  void SetState(Writer* w, uint8_t new_state);

  // Links w into the newest_writer list. Returns true if w was linked
  // directly into the leader position.  Safe to call from multiple threads
  // without external locking.
  bool LinkOne(Writer* w, std::atomic<Writer*>* newest_writer);

  // Links a write group into the newest_writer list as a whole, while
  // keeping the order of the writers unchanged. Returns true if the group
  // was linked directly into the leader position.
  bool LinkGroup(WriteGroup& write_group, std::atomic<Writer*>* newest_writer);

  // Computes any missing link_newer links.  Should not be called
  // concurrently with itself.
  void CreateMissingNewerLinks(Writer* head);

  // Sets the leader in write_group to the completed state and removes it
  // from the write group.
  void CompleteLeader(WriteGroup& write_group);

  // Sets a follower in write_group to the completed state and removes it
  // from the write group.
  void CompleteFollower(Writer* w, WriteGroup& write_group);
};

}  // namespace ROCKSDB_NAMESPACE