/***************************************************************************
                                 ucl_d_vec.h
                             -------------------
                               W. Michael Brown

                       Vector Container on Device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Thu Jun 25 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW

/// Row vector on device
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
 public:
  // Traits for copying data
  // MEM_TYPE is 0 for device, 1 for host, and 2 for image
  enum traits {
    DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
    MEM_TYPE = 0,
    PADDED = 0,
    ROW_MAJOR = 1,
    VECTOR = 1
  };
  typedef numtyp data_type;

  UCL_D_Vec() : _cols(0) {}
  ~UCL_D_Vec() { _device_free(*this); }

  /// Construct with n columns
  /** \sa alloc() **/
  UCL_D_Vec(const size_t n, UCL_Device &device,
            const enum UCL_MEMOPT kind=UCL_READ_WRITE) : _cols(0)
    { alloc(n,device,kind); }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param cq Default command queue for operations copied from another mat
    * \return UCL_SUCCESS if the memory allocation is successful **/
  template <class mat_type>
  inline int alloc(const size_t cols, mat_type &cq,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,cq,_row_bytes,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }
    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Set up device vector with 'cols' columns and reserve memory
  /** The kind parameter controls memory optimizations as follows:
    * - UCL_READ_WRITE - Specify that you will read and write in kernels
    * - UCL_WRITE_ONLY - Specify that you will only write in kernels
    * - UCL_READ_ONLY  - Specify that you will only read in kernels
    * \param device Used to get the default command queue for operations
    * \return UCL_SUCCESS if the memory allocation is successful **/
  inline int alloc(const size_t cols, UCL_Device &device,
                   const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
    clear();
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_alloc(*this,device,_row_bytes,kind);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }
    _kind=kind;
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }
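
  // Illustrative allocation sketch (not part of the library API; assumes a
  // hypothetical, already-initialized UCL_Device named `dev`):
  //
  //   UCL_D_Vec<float> vec;                      // empty device vector
  //   if (vec.alloc(1024,dev,UCL_READ_WRITE) != UCL_SUCCESS)
  //     { /* handle allocation failure */ }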
  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _offset=input.offset();
    _array=input.cbegin();
    CL_SAFE_CALL(clRetainMemObject(input.cbegin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin());
    #endif
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *        (ignored here, since a row vector has a single row) **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t rows, const size_t cols,
                   const size_t stride) { view(input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input, const size_t cols)
    { view(input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view(ucl_type &input)
    { view(input,input.rows()*input.row_size()); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   UCL_Device &dev) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();
    _array=input;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *        (ignored here, since a row vector has a single row) **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t rows, const size_t cols,
                   const size_t stride, UCL_Device &dev)
    { view(input,rows,cols,dev); }
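
  // Illustrative view sketch (hypothetical names); a view aliases existing
  // device memory and never frees it:
  //
  //   UCL_D_Vec<float> big(100,dev);   // owns 100 elements on the device
  //   UCL_D_Vec<float> head;
  //   head.view(big,10);               // alias the first 10 elements of big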
  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
    { view(input,1,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input,
                          const size_t rows, const size_t cols) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    #ifdef _OCL_MAT
    _array=input.begin();
    _offset=offset+input.offset();
    CL_SAFE_CALL(clRetainMemObject(input.begin()));
    CL_SAFE_CALL(clRetainCommandQueue(input.cq()));
    #else
    _device_view(&_array,input.begin(),offset,sizeof(numtyp));
    #endif
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *        (ignored here, since a row vector has a single row) **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input,
                          const size_t rows, const size_t cols,
                          const size_t stride)
    { view_offset(offset,input,rows,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input,
                          const size_t cols)
    { view_offset(offset,input,1,cols); }

  /// Do not allocate memory, instead use an existing allocation from Geryon
  /** This function must be passed a Geryon vector or matrix container.
    * No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * - If a matrix is used as input, all elements (including padding)
    *   will be used for the view **/
  template <class ucl_type>
  inline void view_offset(const size_t offset, ucl_type &input)
    { view_offset(offset,input,input.rows()*input.row_size()-offset); }
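
  // Illustrative view_offset sketch (hypothetical names); aliases a
  // sub-range of an existing allocation:
  //
  //   UCL_D_Vec<float> big(100,dev);
  //   UCL_D_Vec<float> tail;
  //   tail.view_offset(90,big,10);   // alias elements [90,100) of big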
  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t rows, const size_t cols,
                          UCL_Device &dev) {
    #ifdef UCL_DEBUG
    assert(rows==1);
    #endif
    clear();
    _kind=UCL_VIEW;
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=dev.cq();
    #ifdef _OCL_MAT
    _array=input;
    _offset=offset;
    CL_SAFE_CALL(clRetainMemObject(input));
    CL_SAFE_CALL(clRetainCommandQueue(dev.cq()));
    #else
    #ifdef _UCL_DEVICE_PTR_MAT
    _array=input+offset*sizeof(numtyp);
    #else
    _array=input+offset;
    #endif
    #endif
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+_cols;
    #endif
  }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs
    * \param stride Number of _elements_ between the start of each row
    *        (ignored here, since a row vector has a single row) **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t rows, const size_t cols,
                          const size_t stride, UCL_Device &dev)
    { view_offset(offset,input,rows,cols,dev); }

  /// Do not allocate memory, instead use an existing allocation
  /** - No memory is freed when the object is destructed.
    * - The view does not prevent the memory from being freed by the
    *   allocating container when using CUDA APIs **/
  template <class ptr_type>
  inline void view_offset(const size_t offset, ptr_type input,
                          const size_t cols, UCL_Device &dev)
    { view_offset(offset,input,1,cols,dev); }

  /// Free memory and set size to 0
  inline void clear()
    { _device_free(*this); _cols=0; _kind=UCL_VIEW; }

  /// Resize the allocation to contain cols elements
  /** \note Cannot be used on views **/
  inline int resize(const int cols) {
    assert(_kind!=UCL_VIEW);
    _row_bytes=cols*sizeof(numtyp);
    int err=_device_resize(*this,_row_bytes);
    if (err!=UCL_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << "UCL Error: Could not allocate " << _row_bytes
                << " bytes on device.\n";
      _row_bytes=0;
      UCL_GERYON_EXIT;
      #endif
      _row_bytes=0;
      return err;
    }
    _cols=cols;
    #ifndef _UCL_DEVICE_PTR_MAT
    _end=_array+cols;
    #endif
    #ifdef _OCL_MAT
    _offset=0;
    #endif
    return err;
  }

  /// Resize (only if bigger) the allocation to contain cols elements
  /** \note Cannot be used on views **/
  inline int resize_ib(const int cols)
    { if (cols>_cols) return resize(cols); else return UCL_SUCCESS; }

  /// Set each element to zero asynchronously in the default command_queue
  inline void zero() { zero(_cq); }
  /// Set first n elements to zero asynchronously in the default command_queue
  inline void zero(const int n) { zero(n,_cq); }
  /// Set each element to zero asynchronously
  inline void zero(command_queue &cq) { _device_zero(*this,row_bytes(),cq); }
  /// Set first n elements to zero asynchronously
  inline void zero(const int n, command_queue &cq)
    { _device_zero(*this,n*sizeof(numtyp),cq); }
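
  // Illustrative resize/zero sketch (assumes `vec` is a previously
  // allocated, non-view vector):
  //
  //   vec.resize_ib(2048);   // grow only if the current size is smaller
  //   vec.zero();            // asynchronously zero every element
  //   vec.zero(100);         // asynchronously zero the first 100 elements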
  #ifdef _UCL_DEVICE_PTR_MAT
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline device_ptr & begin() { return _array; }
  /// For OpenCL, returns a (void *) device pointer to memory allocation
  inline const device_ptr & begin() const { return _array; }
  #else
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * & begin() { return _array; }
  /// For CUDA-RT, get device pointer to first element
  inline numtyp * const & begin() const { return _array; }
  /// For CUDA-RT, get device pointer to one past last element
  inline numtyp * end() { return _end; }
  /// For CUDA-RT, get device pointer to one past last element
  inline numtyp * end() const { return _end; }
  #endif

  #ifdef _UCL_DEVICE_PTR_MAT
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline device_ptr & cbegin() { return _array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns void** **/
  inline const device_ptr & cbegin() const { return _array; }
  #else
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline numtyp ** cbegin() { return &_array; }
  /// Returns an API specific device pointer
  /** - For OpenCL, returns a &cl_mem object
    * - For CUDA Driver, returns a &CUdeviceptr
    * - For CUDA-RT, returns numtyp** **/
  inline const numtyp ** cbegin() const { return &_array; }

  /// For CUDA-RT, allocate row vector and bind texture
  inline void safe_alloc(const size_t cols, UCL_Device &dev,
                         textureReference *t)
    { alloc(cols,dev); assign_texture(t); bind(); }

  /// For CUDA-RT, assign a texture to matrix
  inline void assign_texture(textureReference *t) { _tex_ptr=t; }

  /// For CUDA-RT, bind to texture
  inline void bind() {
    cuda_gb_get_channel<numtyp>(_channel);
    (*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
    (*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
    (*_tex_ptr).filterMode = cudaFilterModePoint;
    (*_tex_ptr).normalized = false;
    CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
  }

  /// For CUDA-RT, unbind texture
  inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
  #endif

  /// Get the number of elements
  inline size_t numel() const { return _cols; }
  /// Get the number of rows
  inline size_t rows() const { return 1; }
  /// Get the number of columns
  inline size_t cols() const { return _cols; }
  /// Get the size of a row (including any padding) in elements
  inline size_t row_size() const { return _cols; }
  /// Get the size of a row (including any padding) in bytes
  inline size_t row_bytes() const { return _row_bytes; }
  /// Get the size in bytes of 1 element
  inline int element_size() const { return sizeof(numtyp); }

  #ifdef _OCL_MAT
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return _offset; }
  #else
  /// Return the offset (in elements) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t offset() const { return 0; }
  #endif

  /// Return the offset (in bytes) from begin() pointer where data starts
  /** \note Always 0 for host matrices and CUDA APIs **/
  inline size_t byteoff() const { return offset()*sizeof(numtyp); }

 private:
  size_t _row_bytes, _row_size, _rows, _cols;
  #ifdef _UCL_DEVICE_PTR_MAT
  device_ptr _array;
  #else
  numtyp *_array,*_end;
  cudaChannelFormatDesc _channel;
  textureReference *_tex_ptr;
  #endif
  #ifdef _OCL_MAT
  size_t _offset;
  #endif
};

#endif
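
// Illustrative CUDA-RT texture sketch (hypothetical names; requires a
// matching global textureReference, shown here as `vec_tex`):
//
//   UCL_D_Vec<float> vec;
//   vec.safe_alloc(1024,dev,&vec_tex);  // alloc + assign_texture + bind
//   // ... launch kernels that read through vec_tex ...
//   vec.unbind();                       // unbind before freeing the vector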