/***************************************************************************
                                nvd_memory.h
                             -------------------
                               W. Michael Brown

  CUDA Driver Specific Memory Management and Vector/Matrix Containers

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jan 21 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef NVD_MEMORY_H
#define NVD_MEMORY_H

#include <cassert>   // assert
#include <cstring>   // memset, memcpy
#include <cstdlib>   // malloc, free
#include "nvd_macros.h"
#include "ucl_types.h"

namespace ucl_cudadr {

// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------

//typedef dim3 ucl_kernel_dim;

// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------

typedef CUdeviceptr device_ptr;

// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

// Allocate n bytes of host memory for mat and copy the command queue from
// cm.  UCL_NOT_PINNED uses plain malloc; UCL_WRITE_ONLY uses write-combined
// page-locked memory; any other kind uses page-locked memory.
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  CUresult err=CUDA_SUCCESS;
  if (kind==UCL_NOT_PINNED)
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  else if (kind==UCL_WRITE_ONLY)
    err=cuMemHostAlloc((void **)mat.host_ptr(),n,
                       CU_MEMHOSTALLOC_WRITECOMBINED);
  else
    err=cuMemAllocHost((void **)mat.host_ptr(),n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  return UCL_SUCCESS;
}

// Same as above, but the command queue is taken from the device dev.
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                       const enum UCL_MEMOPT kind, const enum UCL_MEMOPT kind2){
  CUresult err=CUDA_SUCCESS;
  if (kind==UCL_NOT_PINNED)
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  else if (kind==UCL_WRITE_ONLY)
    err=cuMemHostAlloc((void **)mat.host_ptr(),n,
                       CU_MEMHOSTALLOC_WRITECOMBINED);
  else
    err=cuMemAllocHost((void **)mat.host_ptr(),n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  return UCL_SUCCESS;
}

// Free host memory allocated by _host_alloc; views do not own their memory.
template <class mat_type>
inline void _host_free(mat_type &mat) {
  if (mat.kind()==UCL_VIEW)
    return;
  else if (mat.kind()!=UCL_NOT_PINNED)
    CU_DESTRUCT_CALL(cuMemFreeHost(mat.begin()));
  else
    free(mat.begin());
}

// Free and reallocate n bytes of host memory, keeping the allocation kind.
template <class mat_type>
inline int _host_resize(mat_type &mat, const size_t n) {
  _host_free(mat);
  CUresult err=CUDA_SUCCESS;
  if (mat.kind()==UCL_NOT_PINNED)
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  else if (mat.kind()==UCL_WRITE_ONLY)
    err=cuMemHostAlloc((void **)mat.host_ptr(),n,
                       CU_MEMHOSTALLOC_WRITECOMBINED);
  else
    err=cuMemAllocHost((void **)mat.host_ptr(),n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}
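// Example (illustrative sketch, not part of this header): requesting
// write-combined pinned storage through _host_alloc().  UCL_Device and
// UCL_H_Vec are assumed to come from the higher-level UCL headers that
// include this file:
//
//   UCL_Device dev;                          // assumed device wrapper
//   UCL_H_Vec<float> vec;                    // assumed host container
//   if (_host_alloc(vec,dev,100*sizeof(float),
//                   UCL_WRITE_ONLY,UCL_WRITE_ONLY)!=UCL_SUCCESS)
//     ;                                      // handle UCL_MEMORY_ERROR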
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------

// Allocate n bytes of global device memory and copy the command queue
// from cm.
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
                         const enum UCL_MEMOPT kind) {
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  return UCL_SUCCESS;
}

// Same as above, but the command queue is taken from the device dev.
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
                         const enum UCL_MEMOPT kind) {
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=dev.cq();
  return UCL_SUCCESS;
}

// Allocate a pitched 2D region of rows x cols elements; the row pitch in
// bytes is returned in pitch.
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  CUresult err;
  CUDA_INT_TYPE upitch;
  err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                      cols*sizeof(typename mat_type::data_type),rows,16);
  pitch=static_cast<size_t>(upitch);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=cm.cq();
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
                         const size_t cols, size_t &pitch,
                         const enum UCL_MEMOPT kind) {
  CUresult err;
  // CUDA_INT_TYPE (not plain unsigned) so the pitch argument matches the
  // width cuMemAllocPitch expects on 64-bit builds.
  CUDA_INT_TYPE upitch;
  err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                      cols*sizeof(typename mat_type::data_type),rows,16);
  pitch=static_cast<size_t>(upitch);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  mat.cq()=d.cq();
  return UCL_SUCCESS;
}

// Free device memory; views do not own their memory.
template <class mat_type>
inline void _device_free(mat_type &mat) {
  if (mat.kind()!=UCL_VIEW)
    CU_DESTRUCT_CALL(cuMemFree(mat.cbegin()));
}

template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t n) {
  _device_free(mat);
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

template <class mat_type>
inline int _device_resize(mat_type &mat, const size_t rows,
                          const size_t cols, size_t &pitch) {
  _device_free(mat);
  CUresult err;
  CUDA_INT_TYPE upitch;
  err=cuMemAllocPitch(&mat.cbegin(),&upitch,
                      cols*sizeof(typename mat_type::data_type),rows,16);
  pitch=static_cast<size_t>(upitch);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
  return UCL_SUCCESS;
}

// Create a view on an existing device allocation, optionally at an element
// offset.  The raw-pointer overloads exist only to satisfy the generic UCL
// interface and always produce a null view.
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
  *ptr=in;
}

template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
  *ptr=0;
}

inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
                         const size_t offset, const size_t numsize) {
  *ptr=in+offset*numsize;
}

template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
                         const size_t offset, const size_t numsize) {
  *ptr=0;
}

// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------

// Image (texture array) allocation is not implemented for the CUDA driver
// backend; these stubs fail an assertion if ever reached.
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm,
                                const size_t rows, const size_t cols) {
  assert(0==1);
}

template <class mat_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d,
                                const size_t rows, const size_t cols) {
  assert(0==1);
}

template <class mat_type>
inline void _device_image_free(mat_type &mat) {
  assert(0==1);
}

// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------

inline void _host_zero(void *ptr, const size_t n) {
  memset(ptr,0,n);
}

// Asynchronously zero n bytes of device memory, selecting the memset word
// size from the alignment of the byte count.
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  if (n%32==0)
    CU_SAFE_CALL(cuMemsetD32Async(mat.cbegin(),0,n/4,cq));
  else if (n%16==0)
    CU_SAFE_CALL(cuMemsetD16Async(mat.cbegin(),0,n/2,cq));
  else
    CU_SAFE_CALL(cuMemsetD8Async(mat.cbegin(),0,n,cq));
}
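// Example (sketch): pairing the pitched allocator with the asynchronous
// zero routine.  dmat is an assumed device container exposing the cbegin(),
// kind(), and cq() accessors used above, and UCL_READ_WRITE is assumed to
// be defined in ucl_types.h:
//
//   size_t pitch;
//   if (_device_alloc(dmat,dev,rows,cols,pitch,UCL_READ_WRITE)==UCL_SUCCESS)
//     _device_zero(dmat,pitch*rows,dev.cq());  // clear the padded block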
// --------------------------------------------------------------------------
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------

// Fill in the geometry fields of a CUDA_MEMCPY2D descriptor (pitches and
// widths are in bytes).
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
                            const size_t spitch, const size_t cols,
                            const size_t rows) {
  ins.srcXInBytes=0;
  ins.srcY=0;
  ins.srcPitch=spitch;
  ins.dstXInBytes=0;
  ins.dstY=0;
  ins.dstPitch=dpitch;
  ins.WidthInBytes=cols;
  ins.Height=rows;
}

// Map a container's MEM_TYPE constant (0=device, 1=host, 2=image/array)
// to the corresponding CUmemorytype.
template <int mem_type> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1>
  { static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2>
  { static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem_type> struct _nvd_set_2D_mem
  { static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };

// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------

// _ucl_memcpy<dst_mem,src_mem> dispatches on the memory spaces of the
// destination and source containers; 1D copies involving image types are
// unsupported and fail an assertion.
template <int mem1, int mem2> struct _ucl_memcpy;

// Both are images
template <> struct _ucl_memcpy<2,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Destination is texture, source on device
template <> struct _ucl_memcpy<2,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};
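// Example (sketch): for a destination container whose MEM_TYPE is 2 (image)
// and a host source (MEM_TYPE 1), the ucl_mv_cpy() front-ends at the end of
// this file resolve to _ucl_memcpy<2,1> below, issuing a 2D transfer.  All
// names here are illustrative:
//
//   ucl_mv_cpy(tex_mat,tex_pitch,host_mat,host_pitch,cols_in_bytes,rows);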
// Destination is texture, source on host
template <> struct _ucl_memcpy<2,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Source is texture, destination on device
template <> struct _ucl_memcpy<0,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Source is texture, destination on host
template <> struct _ucl_memcpy<1,2> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { assert(0==1); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither is a texture, destination on host
template <> struct _ucl_memcpy<1,0> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};
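// Example (sketch): blocking and asynchronous device-to-host copies of n
// bytes through the specialization above (hmat and dmat are assumed UCL
// host/device containers; stream is an assumed CUstream):
//
//   _ucl_memcpy<1,0>::mc(hmat,dmat,n);         // cuMemcpyDtoH
//   _ucl_memcpy<1,0>::mc(hmat,dmat,n,stream);  // cuMemcpyDtoHAsync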
// Neither is a texture, source on host
template <> struct _ucl_memcpy<0,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither is a texture, both on host
template <> struct _ucl_memcpy<1,1> {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { memcpy(dst.begin(),src.begin(),n); }
  // Host-to-host copies are synchronous even when a stream is supplied.
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { memcpy(dst.begin(),src.begin(),n); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
  }
};

// Neither is a texture, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n)
    { CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n, CUstream &cq)
    { CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq)); }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows) {
    if (p1::PADDED==0 || p2::PADDED==0) {
      // At least one container is not pitched: copy row by row.
      size_t src_offset=0, dst_offset=0;
      for (size_t i=0; i<rows; i++) {
        CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
                                  src.cbegin()+src_offset,cols));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    } else {
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2D(&ins));
    }
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
                        const size_t spitch, const size_t cols,
                        const size_t rows, CUstream &cq) {
    if (p1::PADDED==0 || p2::PADDED==0) {
      size_t src_offset=0, dst_offset=0;
      for (size_t i=0; i<rows; i++) {
        CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
                                       src.cbegin()+src_offset,cols,cq));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    } else {
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
    }
  }
};
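// The ucl_mv_cpy() front-ends below select a specialization above from the
// MEM_TYPE constants of the two containers, so callers never name
// _ucl_memcpy directly.  Example (sketch; the containers and pitch
// variables are assumptions about the surrounding UCL code):
//
//   ucl_mv_cpy(hmat,dmat,rows*cols*sizeof(float));      // contiguous copy
//   ucl_mv_cpy(hmat,host_pitch,dmat,dev_pitch,
//              cols*sizeof(float),rows);                // pitched 2D copy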
template <class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n);
}

template <class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
                       CUstream &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq);
}

template <class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,
                                                 cols,rows);
}

template <class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
                       const size_t spitch, const size_t cols,
                       const size_t rows, CUstream &cq) {
  _ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,
                                                 cols,rows,cq);
}

} // namespace ucl_cudadr

#endif