/*! * Copyright (c) 2020-2021 IBM Corporation, Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See LICENSE file in the project root for license information. */ #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ #ifdef USE_CUDA #include #include #include #include #include #include #include namespace LightGBM { typedef unsigned long long atomic_add_long_t; #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { LightGBM::Log::Fatal("[CUDA] %s %s %d\n", cudaGetErrorString(code), file, line); if (abort) exit(code); } } #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } void SetCUDADevice(int gpu_device_id, const char* file, int line); int GetCUDADevice(const char* file, int line); template void AllocateCUDAMemory(T** out_ptr, size_t size, const char* file, const int line) { void* tmp_ptr = nullptr; CUDASUCCESS_OR_FATAL_OUTER(cudaMalloc(&tmp_ptr, size * sizeof(T))); *out_ptr = reinterpret_cast(tmp_ptr); } template void CopyFromHostToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyHostToDevice)); } template void InitCUDAMemoryFromHostMemory(T** dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { AllocateCUDAMemory(dst_ptr, size, file, line); CopyFromHostToCUDADevice(*dst_ptr, src_ptr, size, file, line); } template void CopyFromCUDADeviceToHost(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost)); } template void CopyFromCUDADeviceToHostAsync(T* dst_ptr, const T* src_ptr, size_t size, cudaStream_t stream, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToHost, stream)); } template void CopyFromCUDADeviceToCUDADevice(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpy(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); } template void CopyFromCUDADeviceToCUDADeviceAsync(T* dst_ptr, const T* src_ptr, size_t size, const char* file, const int line) { void* void_dst_ptr = reinterpret_cast(dst_ptr); const void* void_src_ptr = reinterpret_cast(src_ptr); size_t size_in_bytes = size * sizeof(T); CUDASUCCESS_OR_FATAL_OUTER(cudaMemcpyAsync(void_dst_ptr, void_src_ptr, size_in_bytes, cudaMemcpyDeviceToDevice)); } void SynchronizeCUDADevice(const char* file, const int line); template void SetCUDAMemory(T* dst_ptr, int value, size_t size, const char* file, const int line) { CUDASUCCESS_OR_FATAL_OUTER(cudaMemset(reinterpret_cast(dst_ptr), value, size * sizeof(T))); SynchronizeCUDADevice(file, line); } template void DeallocateCUDAMemory(T** ptr, const char* file, const int line) { if (*ptr != nullptr) { CUDASUCCESS_OR_FATAL_OUTER(cudaFree(reinterpret_cast(*ptr))); *ptr = nullptr; } } void PrintLastCUDAError(); template class CUDAVector { public: CUDAVector() { size_ = 0; data_ = nullptr; } explicit CUDAVector(size_t size) { size_ = size; AllocateCUDAMemory(&data_, size_, __FILE__, __LINE__); } void Resize(size_t size) { if (size == size_) { return; } if (size == 0) { Clear(); return; } T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); if (size_ > 0 && data_ != nullptr) { const size_t size_for_old_content = std::min(size_, size); CopyFromCUDADeviceToCUDADevice(new_data, data_, size_for_old_content, __FILE__, __LINE__); } DeallocateCUDAMemory(&data_, __FILE__, __LINE__); data_ = new_data; size_ = size; } void InitFromHostVector(const std::vector& host_vector) { Resize(host_vector.size()); CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__); } void Clear() { if (size_ > 0 && data_ != nullptr) { DeallocateCUDAMemory(&data_, __FILE__, __LINE__); } size_ = 0; } void PushBack(const T* values, size_t len) { T* new_data = nullptr; AllocateCUDAMemory(&new_data, size_ + len, __FILE__, __LINE__); if (size_ > 0 && data_ != nullptr) { CopyFromCUDADeviceToCUDADevice(new_data, data_, size_, __FILE__, __LINE__); } CopyFromCUDADeviceToCUDADevice(new_data + size_, values, len, __FILE__, __LINE__); DeallocateCUDAMemory(&data_, __FILE__, __LINE__); size_ += len; data_ = new_data; } size_t Size() { return size_; } ~CUDAVector() { DeallocateCUDAMemory(&data_, __FILE__, __LINE__); } std::vector ToHost() { std::vector host_vector(size_); if (size_ > 0 && data_ != nullptr) { CopyFromCUDADeviceToHost(host_vector.data(), data_, size_, __FILE__, __LINE__); } return host_vector; } T* RawData() const { return data_; } void SetValue(int value) { SetCUDAMemory(data_, value, size_, __FILE__, __LINE__); } const T* RawDataReadOnly() const { return data_; } private: T* data_; size_t size_; }; template static __device__ T SafeLog(T x) { if (x > 0) { return std::log(x); } else { return -INFINITY; } } } // namespace LightGBM #endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_