/***************************************************************************
                                ucl_d_mat.h
                             -------------------
                               W. Michael Brown

  Matrix Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// 2D Matrix on device (can have extra column storage to get correct alignment)
/** Storage is row major; element (row,col) lives at
  * array[row*row_size()+col], where row_size() may exceed cols() because
  * of padding.
  * \note No copy constructor or assignment operator is declared here, so
  *       the compiler-generated ones apply; the destructor calls
  *       _device_free() unconditionally. **/
template <class numtyp>
class UCL_D_Mat : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 1,
    ROW_MAJOR = 1,
    VECTOR = 0
  };
  typedef numtyp data_type;

  /// Construct an empty matrix (cols()==0); no device memory is requested
  UCL_D_Mat() : _cols(0) {}
  ~UCL_D_Mat() { _device_free(*this); }

  /// Construct with specified rows and cols
  /** \sa alloc() **/
  UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
    _cols(0) { alloc(rows,cols,device,kind); }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param cq Default command queue for operations copied from another mat
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();

    int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _kind=kind;
    _set_alloc_layout(rows,cols);
    return err;
  }

  /// Row major matrix on device
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \note - Coalesced access using adjacent cols on same row
    *         UCL_D_Mat(row,col) given by array[row*row_size()+col]
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();

    int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate "
                << rows*cols*sizeof(numtyp) << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _kind=kind;
    _set_alloc_layout(rows,cols);
    return err;
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) {
    _begin_view(rows,cols,stride);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    // The view shares the cl_mem and queue, so bump their reference counts
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin());
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    // NOTE(review): this is the end of the first row only; for rows>1 it is
    // not one past the last element of the view - confirm intended meaning
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is taken from input.row_size() **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols)
    { view(input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view
    * - The resulting view has a single row of cols elements **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view
    * - The view takes input.rows() and input.cols() as its dimensions **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows(),input.cols()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev) {
    _begin_view(rows,cols,stride);
    this->_cq=dev.cq();
    _array=input;
    #ifndef _UCL_DEVICE_PTR_MAT
    // NOTE(review): end of the first row only when rows>1 - confirm
    _end=_array+_cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is set to cols (no padding) **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev)
    { view(input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The resulting view has a single row of cols elements **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param offset Number of elements to skip from the start of input
    * \param stride Number of _elements_ between the start of each row **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols, const size_t stride) {
    _begin_view(rows,cols,stride);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _array=input.begin();
    // For OpenCL the handle is shared; the offset is tracked separately
    _offset=offset+input.offset();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    // NOTE(review): end of the first row only when rows>1 - confirm
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is taken from input.row_size() **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
                          const size_t cols)
    { view_offset(offset,input,rows,cols,input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view
    * - The resulting view has a single row of cols elements **/
  template <class ucl_type>
  inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for view
    * - For a single-row input the view holds the remaining
    *   input.cols()-offset elements; otherwise the row count is reduced by
    *   offset/input.row_size() and the column count is kept **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input) {
    if (input.rows()==1)
      view_offset(offset,input,1,input.cols()-offset);
    else
      view_offset(offset,input,input.rows()-offset/input.row_size(),
                  input.cols());
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param offset Number of elements to skip from the start of input
    * \param stride Number of _elements_ between the start of each row **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols,const size_t stride,
                          UCL_Device &dev) {
    _begin_view(rows,cols,stride);
    this->_cq=dev.cq();

    #ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #else
      #ifdef _UCL_DEVICE_PTR_MAT
      // The handle is advanced by a byte count in this configuration
      _array=input+offset*sizeof(numtyp);
      #else
      _array=input+offset;
      #endif
    #endif

    #ifndef _UCL_DEVICE_PTR_MAT
    // NOTE(review): end of the first row only when rows>1 - confirm
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The row stride is set to cols (no padding) **/
  template <class ptr_type>
  inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,rows,cols,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - The resulting view has a single row of cols elements **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }

  /// Free memory and set size to 0
  /** Afterwards cols()==0 and the container is marked as UCL_VIEW. **/
  inline void clear()
    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }

  /// Resize the allocation to contain rows x cols elements
  /** \note Cannot be used on views
    * \return UCL_SUCCESS if the resize is successful **/
  inline int resize(const int rows, const int cols) {
    assert(_kind!=UCL_VIEW);

    int err=_device_resize(*this,rows,cols,_pitch);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      // Widen before multiplying so the reported size cannot overflow int
      std::cerr << "UCL Error: Could not allocate "
                << static_cast<size_t>(rows)*static_cast<size_t>(cols)*
                   sizeof(numtyp)
                << " bytes on device.\n";
      UCL_GERYON_EXIT;
      #endif
      return err;
    }

    _set_alloc_layout(rows,cols);
    return err;
  }

  /// Resize (only if bigger) the allocation to contain rows x cols elements
  /** \note Cannot be used on views
    * \return UCL_SUCCESS if no resize was needed or the resize succeeded **/
  inline int resize_ib(const int rows, const int cols) {
    // Same unsigned comparison the implicit conversion performed before
    if (static_cast<size_t>(cols)>_cols || static_cast<size_t>(rows)>_rows)
      return resize(rows,cols);
    else
      return UCL_SUCCESS;
  }

  /// Set each element to zero asynchronously in the default command_queue
  inline void zero() { zero(_cq); }
  /// Set first n elements to zero asynchronously in the default command_queue
  inline void zero(const int n) { zero(n,_cq); }
  /// Set each element to zero asynchronously
  /** Zeroes row_bytes()*rows() bytes, i.e. row padding is included. **/
  inline void zero(command_queue &cq)
    { _device_zero(*this,row_bytes()*_rows,cq); }
  /// Set first n elements to zero asynchronously
  inline void zero(const int n, command_queue &cq)
    { _device_zero(*this,n*sizeof(numtyp),cq); }


  #ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline device_ptr & begin() { return _array; }
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline const device_ptr & begin() const { return _array; }
  #else
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * & begin() { return _array; }
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * const & begin() const { return _array; }
  /// For CUDA-RT, get device pointer to one past last element
  /** After alloc()/resize() this is begin()+row_size()*rows(); after any
    * view()/view_offset() call it is begin()+cols(). **/
  inline numtyp * end() { return _end; }
  /// For CUDA-RT, get device pointer to one past last element
  /** After alloc()/resize() this is begin()+row_size()*rows(); after any
    * view()/view_offset() call it is begin()+cols(). **/
  inline const numtyp * end() const { return _end; }
  #endif

  #ifdef _UCL_DEVICE_PTR_MAT
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline device_ptr & cbegin() { return _array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline const device_ptr & cbegin() const { return _array; }
  #else
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline numtyp ** cbegin() { return &_array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp* const* (address of the stored pointer;
    *   the pointer itself cannot be modified through a const matrix) **/
  inline numtyp * const * cbegin() const { return &_array; }
  #endif

  /// Get the number of elements
  inline size_t numel() const { return _cols*_rows; }
  /// Get the number of rows
  inline size_t rows() const { return _rows; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _row_size; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _pitch; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  #ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
  #else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  #endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  /// Record dimensions after _pitch has been filled in by an (re)allocation
  /** Derives the row size in elements from _pitch, resets the OpenCL offset
    * and, where a raw pointer is stored, points _end at the end of the last
    * row (padding included). **/
  inline void _set_alloc_layout(const size_t rows, const size_t cols) {
    _rows=rows;
    _cols=cols;
    _row_size=_pitch/sizeof(numtyp);
    #ifndef _UCL_DEVICE_PTR_MAT
    // rows rows of _row_size elements each
    _end=_array+_row_size*rows;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
  }

  /// Release current contents and record the shape of a new view
  /** \param stride Number of _elements_ between the start of each row **/
  inline void _begin_view(const size_t rows, const size_t cols,
                          const size_t stride) {
    clear();
    _kind=UCL_VIEW;
    _rows=rows;
    _cols=cols;
    _pitch=stride*sizeof(numtyp);
    _row_size=stride;
  }

  // _pitch is the row length in bytes, _row_size the row length in elements
  size_t _pitch, _row_size, _rows, _cols;

  #ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
  #else
  numtyp *_array,*_end;
  #endif

  #ifdef _OCL_MAT
  size_t _offset;
  #endif
};

#endif