#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <cuda_runtime.h>

// Abort on any CUDA runtime error, reporting file, line, and reason.
#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        printf("ERROR: %s:%d,", __FILE__, __LINE__);\
        printf("code:%d,reason:%s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

// Synchronize the device, then surface any error left by the last kernel launch.
#define CHECK_LAST_KERN()\
{\
    cudaDeviceSynchronize();\
    const cudaError_t error = cudaGetLastError();\
    if (error != cudaSuccess)\
    {\
        printf("ERROR: %s:%d,", __FILE__, __LINE__);\
        printf("code:%d,reason:%s\n", error, cudaGetErrorString(error));\
        exit(1);\
    }\
}

// RAII wrapper that owns a CUDA stream for the lifetime of the object.
struct StreamWrapper
{
    cudaStream_t stream;

    StreamWrapper()
    {
        CHECK(cudaStreamCreate(&stream));
    }

    ~StreamWrapper()
    {
        CHECK(cudaStreamDestroy(this->stream));
    }

    cudaStream_t get()
    {
        return this->stream;
    }
};

extern "C"
{
    void* create_stream()
    {
        StreamWrapper* s = new StreamWrapper();
        return s;
    }

    // Frees a stream created by create_stream(). The exported name's spelling
    // is part of the public C ABI.
    void destory_stream(void* stream)
    {
        StreamWrapper* _stream = (StreamWrapper*)stream;
        delete _stream;
    }

    void wait_stream(void* stream)
    {
        StreamWrapper* s = (StreamWrapper*)stream;
        CHECK(cudaStreamSynchronize(s->get()));
    }

    void wait_default_stream()
    {
        CHECK(cudaStreamSynchronize(0));
    }

    // Returns a pointer to the underlying cudaStream_t handle.
    void* get_inner_stream(void* stream)
    {
        StreamWrapper* s = (StreamWrapper*)stream;
        return (void*)&s->stream;
    }

    // Page-locked (pinned) host memory enables faster, async-capable transfers.
    uint8_t* alloc_locked_buffer(size_t bytes)
    {
        uint8_t* buffer;
        CHECK(cudaMallocHost(&buffer, bytes));
        return buffer;
    }

    void free_locked_buffer(uint8_t* buffer)
    {
        CHECK(cudaFreeHost(buffer));
    }

    void set_device(size_t index)
    {
        CHECK(cudaSetDevice((int)index));
    }

    void set_device_flags(uint32_t flags)
    {
        CHECK(cudaSetDeviceFlags(flags));
    }

    void device_to_host(uint8_t* device, uint8_t* host, size_t bytes)
    {
        CHECK(cudaMemcpy(host, device, bytes, cudaMemcpyDeviceToHost));
    }

    void host_to_device(uint8_t* host, uint8_t* device, size_t bytes)
    {
        CHECK(cudaMemcpy(device, host, bytes, cudaMemcpyHostToDevice));
    }

    void device_to_device(uint8_t* dst, uint8_t* src, size_t bytes)
    {
        CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToDevice));
    }

    // Copies a width x height region between pitched buffers; both pitches
    // and the width are in bytes.
    void host_to_device_2d(uint8_t* host, uint8_t* device,
                           size_t hostPitch, size_t devPitch,
                           size_t width, size_t height)
    {
        CHECK(cudaMemcpy2D(device, devPitch, host, hostPitch, width, height,
                           cudaMemcpyHostToDevice));
    }

    uint8_t* alloc_gpu_buffer(size_t bytes)
    {
        uint8_t* buf;
        CHECK(cudaMalloc(&buf, bytes));
        return buf;
    }

    void free_gpu_buffer(uint8_t* buf)
    {
        CHECK(cudaFree(buf));
    }

    void check_and_sync()
    {
        CHECK_LAST_KERN();
    }

    void sync_device()
    {
        CHECK(cudaDeviceSynchronize());
    }

    void reset_device()
    {
        CHECK(cudaDeviceReset());
    }

    uint32_t get_last_error()
    {
        const cudaError_t error = cudaGetLastError();
        return (uint32_t)error;
    }

    const char* get_error_string(uint32_t error_code)
    {
        return cudaGetErrorString((cudaError_t)error_code);
    }

    uint32_t get_device_count()
    {
        int count;
        CHECK(cudaGetDeviceCount(&count));
        return (uint32_t)count;
    }

    uint32_t get_sm_count()
    {
        cudaDeviceProp devProp;
        CHECK(cudaGetDeviceProperties(&devProp, 0));
        return devProp.multiProcessorCount;
    }

    size_t get_total_memory()
    {
        cudaDeviceProp devProp;
        CHECK(cudaGetDeviceProperties(&devProp, 0));
        return devProp.totalGlobalMem;
    }

    // Derives the CUDA-core count of device 0 from its SM count and the
    // per-SM core count of each compute-capability generation.
    uint32_t get_device_cuda_core_count()
    {
        uint32_t cores = 0;
        cudaDeviceProp devProp;
        CHECK(cudaGetDeviceProperties(&devProp, 0));
        uint32_t mp = devProp.multiProcessorCount;
        switch (devProp.major)
        {
        case 2: // Fermi: 48 cores/SM on 2.1, 32 on 2.0
            if (devProp.minor == 1) cores = mp * 48;
            else cores = mp * 32;
            break;
        case 3: // Kepler: 192 cores/SM
            cores = mp * 192;
            break;
        case 5: // Maxwell: 128 cores/SM
            cores = mp * 128;
            break;
        case 6: // Pascal: 128 cores/SM on 6.1/6.2, 64 on 6.0
            if ((devProp.minor == 1) || (devProp.minor == 2)) cores = mp * 128;
            else if (devProp.minor == 0) cores = mp * 64;
            break;
        case 7: // Volta (7.0) and Turing (7.5): 64 cores/SM
            if ((devProp.minor == 0) || (devProp.minor == 5)) cores = mp * 64;
            break;
        case 8: // Ampere: 64 cores/SM on 8.0, 128 on 8.6
            if (devProp.minor == 0) cores = mp * 64;
            else if (devProp.minor == 6) cores = mp * 128;
            break;
        default:
            break;
        }
        return cores;
    }

    void cu_memset(uint8_t* devPtr, uint8_t value, size_t count)
    {
        CHECK(cudaMemset((void*)devPtr, value, count));
    }
}
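
// A minimal usage sketch, not part of the exported API: it exercises the
// allocation, transfer, and synchronization entry points above with a
// host -> device -> host round trip. The CUDA_WRAPPER_SMOKE_TEST guard and
// the test pattern are illustrative assumptions, not an established harness;
// compile this file with nvcc and -DCUDA_WRAPPER_SMOKE_TEST to try it.
#ifdef CUDA_WRAPPER_SMOKE_TEST
#include <string.h>

int main()
{
    set_device(0);
    printf("devices: %u, SMs: %u, CUDA cores: %u\n",
           get_device_count(), get_sm_count(), get_device_cuda_core_count());

    const size_t bytes = 1024;
    uint8_t* host = alloc_locked_buffer(bytes); // pinned staging buffer
    uint8_t* dev = alloc_gpu_buffer(bytes);

    memset(host, 0xAB, bytes);
    host_to_device(host, dev, bytes); // upload the pattern
    memset(host, 0, bytes);           // scrub the host copy
    device_to_host(dev, host, bytes); // download it back
    sync_device();

    printf("round trip %s\n", host[0] == 0xAB ? "ok" : "FAILED");

    free_gpu_buffer(dev);
    free_locked_buffer(host);
    reset_device();
    return 0;
}
#endif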