// Copyright 2019 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef DARWINN_DRIVER_SINGLE_TPU_REQUEST_H_ #define DARWINN_DRIVER_SINGLE_TPU_REQUEST_H_ #include #include #include #include #include // NOLINT #include #include #include #include "api/buffer.h" #include "driver/allocator.h" #include "driver/device_buffer.h" #include "driver/device_buffer_mapper.h" #include "driver/dma_info.h" #include "driver/dma_info_extractor.h" #include "driver/executable_util.h" #include "driver/memory/address_space.h" #include "driver/memory/dram_allocator.h" #include "driver/package_registry.h" #include "driver/request.h" #include "driver/tpu_request.h" #include "executable/executable_generated.h" #include "port/array_slice.h" #include "port/integral_types.h" #include "port/statusor.h" #include "port/thread_annotations.h" namespace platforms { namespace darwinn { namespace driver { // An single inference request to TPU. This class is thread-safe. class SingleTpuRequest : public TpuRequest { public: // Constructs a request object for executing the given |executable_reference|. // |done| is the callback function executed when request is complete. // TODO: Make this constructor private and create request objects // through a factory function in the driver. explicit SingleTpuRequest( int id, const std::shared_ptr parent_request, const ExecutableReference* executable_reference, Allocator* allocator, DramAllocator* dram_allocator, std::unique_ptr device_buffer_mapper, const DmaInfoExtractor* extractor, uint64 alignment_bytes, RequestType type); explicit SingleTpuRequest( int id, const std::shared_ptr parent_request, const ExecutableReference* executable_reference, Allocator* allocator, DramAllocator* dram_allocator, std::unique_ptr device_buffer_mapper, const DmaInfoExtractor* extractor, uint64 alignment_bytes, Done done, RequestType type); SingleTpuRequest(SingleTpuRequest&& rhs) = delete; SingleTpuRequest& operator=(SingleTpuRequest&& rhs) = delete; SingleTpuRequest(const SingleTpuRequest&) = delete; SingleTpuRequest& operator=(const SingleTpuRequest&) = delete; ~SingleTpuRequest() override; Status SetDone(Done done) LOCKS_EXCLUDED(mutex_) override; Status AddInput(const std::string& name, const Buffer& input) LOCKS_EXCLUDED(mutex_) override; Status AddOutput(const std::string& name, Buffer output) LOCKS_EXCLUDED(mutex_) override; Status AddNoopInputs(const std::string& name, int count) LOCKS_EXCLUDED(mutex_) override; Status AddNoopOutputs(const std::string& name, int count) LOCKS_EXCLUDED(mutex_) override; const Buffer& InputBuffer(const std::string& name, int batch) const override LOCKS_EXCLUDED(mutex_); Buffer OutputBuffer(const std::string& name, int batch) const override LOCKS_EXCLUDED(mutex_); Status Validate() LOCKS_EXCLUDED(mutex_) override; Status Prepare() LOCKS_EXCLUDED(mutex_) override; Status Cancel() LOCKS_EXCLUDED(mutex_) override; // TODO: The following functions needs to restricted for use // by the driver only. Status NotifyRequestSubmitted() LOCKS_EXCLUDED(mutex_) override; Status NotifyRequestActive() LOCKS_EXCLUDED(mutex_) override; Status NotifyCompletion(Status status) LOCKS_EXCLUDED(mutex_) override; int id() const override { return id_; } RequestType type() const override { return type_; } int num_instruction_bitstream_chunks() const override { return executable().instruction_bitstreams()->size(); } StatusOr> GetDmaInfos() const LOCKS_EXCLUDED(mutex_) override; const ExecutableReference& executable_reference() const override { return executable_reference_; } DeviceBufferMapper* device_buffer_mapper() const { return device_buffer_mapper_.get(); } private: // Compute request state. State transitions : // kUninitialized -> kCreated -> kSubmitted -> kActive -> kDone // kUninitialized -> kCreated -> kSubmitted -> kDone [if cancelled]. enum State { kUninitialized, // Request not initialized. kCreated, // Request created, but pending issue to DarwiNN. kSubmitted, // Request submitted and in queue for issuing to DarwiNN. kActive, // Request issued to DarwiNN, pending results. kDone, // Request in terminal state. }; // Attempts a state transition to the given state. Status SetState(State next_state) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Validates that we are in the expected state. Status ValidateState(State expected_state) const SHARED_LOCKS_REQUIRED(mutex_); // Maps all data buffers (input, output, parameters) to the DarwiNN address // space. Status MapDataBuffers() EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Map instruction buffers to the DarwiNN address space. Status MapInstructionBuffers() EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Unmaps all buffers and frees the allocated instruction and parameter // buffers if any. Reverse of what is done in #Prepare(). Status Cleanup() EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Convenience function that returns the backing executable in // |executable_reference_|. const darwinn::Executable& executable() const { return executable_reference_.executable(); } // Returns true if the alignment requirement for a provided buffer is met. bool IsBufferAligned(const Buffer& buffer); // Post processes the output buffers. This includes: // 1. Relayout the outputs in host_outputs_ to user-expected layouts and // store them in the user_outputs_. Some outputs do not need a relayout // and for those we set the same user-provided buffer in the host_outputs_. // Those are ignored by this method. // 2. Perform sign conversion. Status PostProcessOutputBuffers() EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Tries to create a TPU DRAM buffer. If it fails, it falls back to create a // host DRAM buffer. Buffer TryCreateDramBuffer(size_t size_bytes); // Creates and returns a new buffer for |batches| copies of the activations of // a provided layer. Buffer CreateActivationBuffer(const api::LayerInformation* layer, int batches); // Gets a contiguous buffer that holds all batched host_outputs_ for a given // layer. Lazily creates the buffer on first access, but always returns the // same buffer when called for the same layer. Buffer GetOrCreateBatchOutput(const api::LayerInformation* layer, const std::string& name) EXCLUSIVE_LOCKS_REQUIRED(mutex_); // Copies a provided input buffer in such a way that inputs of each iteration // has the alignment requirements. Buffer ScatterInput(const Buffer& input, const api::LayerInformation* layer); // Unique ID for request. const int id_; // Track the type of TPU request for logging purposes. const RequestType type_; // The parent driver request this TpuRequest is a part of. We mostly hold on // to a shared_pointer to the parent here to make sure it outlives all its // TPU requests. const std::shared_ptr parent_request_; // Executable for the compute request. const ExecutableReference& executable_reference_; // Buffer allocator. Allocator* const allocator_; // On-Chip DRAM Buffer allocator. DramAllocator* dram_allocator_; // Maps and stores all device buffers. std::unique_ptr device_buffer_mapper_; // DMA info extractor. const DmaInfoExtractor& extractor_; // Maintains integrity of the request object. mutable std::mutex mutex_; // Request state. State state_ GUARDED_BY(mutex_){kUninitialized}; // Infeed and outfeed host buffers. // host_*[layer_name][batch_id] = buffer. Buffer::NamedMap host_inputs_ GUARDED_BY(mutex_); Buffer::NamedMap host_outputs_ GUARDED_BY(mutex_); // Cache of batch-sized output buffers that are used to ensure // host_outputs_ are contiguous. std::unordered_map batch_outputs_ GUARDED_BY(mutex_); // Buffers to contain the user-facing outputs. The difference between user and // host outputs is that host_outputs have a TPU-friendly layout while user // outputs have a user-friendly layout. For DMAs and basically anything from // driver down, we mostly deal host_outputs_. And for anything from driver up // or methods exposed in the API, we deal with user_outputs_. Buffer::NamedMap user_outputs_ GUARDED_BY(mutex_); // Final request completion callback. Done done_ GUARDED_BY(mutex_); // A copy of the mapped parameters owned by executable reference. const DeviceBuffer parameter_device_buffer_ GUARDED_BY(mutex_); // Buffers for instructions. std::unique_ptr instruction_buffers_ GUARDED_BY(mutex_); // The alignment requirement for input and output buffers provided by the // user. const uint64 alignment_bytes_; }; } // namespace driver } // namespace darwinn } // namespace platforms #endif // DARWINN_DRIVER_SINGLE_TPU_REQUEST_H_