/***************************************************************************
                                 ucl_copy.h
                             -------------------
                               W. Michael Brown

  Routines for copying matrix/vector data onto and off coprocessor device

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Jan 4 2010
    copyright            : (C) 2010 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2010) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

/***************************************************************************
   The ucl_copy and ucl_cast_copy routines provide a general prototype
   for copying data between host and device memory (including texture
   memory) for the matrix and vector types in nvc_memory.

   For host/host and host/device transfers, typecasting is performed
   automatically as necessary.

   The routines are written so that all branches can be removed by the
   compiler during template instantiation.

   The routines currently assume row-major ordering for all types.

   For asynchronous copy in the default command queue, set async to
   boolean true. For asynchronous copy in a specified command queue,
   pass the command queue as the async argument. Otherwise, set async
   to boolean false.

   When performing frequent data copies that require casting, it is more
   efficient to allocate a casting buffer once and then pass that buffer
   to the copy routine. This can be accomplished with the ucl_cast_copy
   routines.
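   For illustration, a minimal sketch of the three forms of the async
   argument (hypothetical caller code, not part of this header; "dev" is
   an already initialized UCL_Device and n an element count):

     UCL_H_Vec<double> host;   host.alloc(n,dev);   // vector on host
     UCL_D_Vec<double> device; device.alloc(n,dev); // vector on device
     ucl_copy(device,host,true);      // asynchronous, default command queue
     ucl_copy(device,host,dev.cq());  // asynchronous, specified command queue
     ucl_copy(device,host,false);     // blocking copy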
   Examples
      (x's represent alignment padding - to maintain alignment)
      (o's represent a larger matrix in memory)
      (vectors represented as single row)
      ----------------------------------------------------------------
          dst              src              command
      ----------------------------------------------------------------
      0 1 2 3 4    <--  0 1 2 3 4       ucl_copy(dst,src,async)

      0 1 2 3      <--  0 1 2 3 4       ucl_copy(dst,src,4,async)

      0 1 2        <--  0 1 2 3 4 5     ucl_copy(dst,src,async)
      3 4 5

      0 1 2 3 4 5  <--  0 1 2           ucl_copy(dst,src,async)
                        3 4 5

      0 1 2        <--  0 1 2           ucl_copy(dst,src,async)
      3 4 5             3 4 5

      0 1 2        <--  0 1 2           ucl_copy(dst,src,6,async)
      3 4 5             3 4 5
                        5 6 7

      0 1 2        <--  0 1 2 3         ucl_copy(dst,src,2,3,async)
      4 5 6             4 5 6 7
                        8 9 10 11

      0 1 2 x x    <--  0 1 2           ucl_copy(dst,src,async)
      3 4 5 x x         3 4 5

      0 1 2        <--  0 1 2 x x       ucl_copy(dst,src,async)
      3 4 5             3 4 5 x x

      0 1 2 o o    <--  0 1 2           ucl_copy(dst,src,2,3,async)
      3 4 5 o o         3 4 5
      o o o o o

      0 1 2 o o    <--  0 1 2 3 4 5     ucl_copy(dst,src,2,3,async)
      3 4 5 o o
      o o o o o

      0 1 o o o    <--  0 1 2 3 4 5     ucl_copy(dst,src,2,2,async)
      2 3 o o o
      o o o o o

      0 1 2 o o    <--  0 1 2 3 4       ucl_copy(dst,src,2,3,async)
      5 6 7 o o         5 6 7 8 9
      o o o o o         10 11 12 13 14

      0 1 2 5 6 7  <--  0 1 2 3 4       ucl_copy(dst,src,2,3,async)
                        5 6 7 8 9
                        10 11 12 13 14
 ***************************************************************************/

// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW

// --------------------------------------------------------------------------
// - CHECK PERMISSIONS FOR SOURCE AND DESTINATION IN COPY
// --------------------------------------------------------------------------
template <class mat1, class mat2>
inline void _check_ucl_copy_perm(mat1 &dst, mat2 &src) {
  if ((int)mat1::MEM_TYPE==(int)mat2::MEM_TYPE) {
    if (dst.kind()==UCL_READ_ONLY) {
      std::cerr << "Attempt to copy where destination is UCL_READ_ONLY\n";
      assert(0==1);
    } else if (src.kind()==UCL_WRITE_ONLY) {
      std::cerr << "Attempt to copy where source is UCL_WRITE_ONLY\n";
      assert(0==1);
    }
  } else {
    if (dst.kind()==UCL_WRITE_ONLY) {
      std::cerr << "Destination in host-device copy cannot be UCL_WRITE_ONLY\n";
      assert(0==1);
    } else if (src.kind()==UCL_READ_ONLY) {
      std::cerr << "Source in host-device copy cannot be UCL_READ_ONLY\n";
      assert(0==1);
    }
  }
}

// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------

// Have to use specialization because some types don't have operator[]
template <int host_t1, int host_t2> struct _host_host_copy;

// Both on host
template <> struct _host_host_copy<1,1> {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
    #ifdef UCL_DEBUG
    assert(mat1::PADDED==0 && mat2::PADDED==0);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
      #ifdef _OCL_MAT
      if (dst.begin()==src.begin()) {
        #ifdef UCL_DBG_MEM_TRACE
        std::cerr << "UCL_COPY 7S\n";
        #endif
        return;
      }
      #endif
      memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 7NS\n";
      #endif
    } else
      for (size_t i=0; i<numel; i++)
        dst[i]=static_cast<typename mat1::data_type>(src[i]);
  }

  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    size_t dst_row_size, src_row_size;
    if (mat1::VECTOR)
      dst_row_size=cols;
    else
      dst_row_size=dst.row_size();
    if (mat2::VECTOR)
      src_row_size=cols;
    else
      src_row_size=src.row_size();
    if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0) {
      #ifdef _OCL_MAT
      if (dst.begin()==src.begin()) {
        #ifdef UCL_DBG_MEM_TRACE
        std::cerr << "UCL_COPY 8S\n";
        #endif
        return;
      }
      #endif
      #ifdef UCL_DBG_MEM_TRACE
      std::cerr << "UCL_COPY 8NS\n";
      #endif
      for (size_t i=0; i<rows; i++)
        memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
               cols*sizeof(typename mat1::data_type));
    } else
      for (size_t j=0; j<rows; j++) {
        size_t dst_i=j*dst_row_size;
        size_t d_end=dst_i+cols;
        size_t src_i=j*src_row_size;
        for (; dst_i<d_end; dst_i++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
          src_i++;
        }
      }
  }
};

// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel)
    { assert(0==1); }
  template <class mat1, class mat2>
  static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
                         const size_t cols)
    { assert(0==1); }
};
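// For example (an illustrative sketch, not part of this header; "dev" is a
// hypothetical initialized UCL_Device), a host-host copy between differing
// element types takes the element-wise static_cast branch above instead of
// the memcpy branch:
//
//   UCL_H_Vec<double> d_vec; d_vec.alloc(4,dev);
//   UCL_H_Vec<float> f_vec;  f_vec.alloc(4,dev);
//   ucl_copy(f_vec,d_vec,false); // casts each double to float on the host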
// --------------------------------------------------------------------------
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
// --------------------------------------------------------------------------

// Helper functions for ucl_cast_copy
template <int host_t1, int host_t2> struct _ucl_cast_copy;

// Destination is on host
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
    cast_buffer.sync();
    for (size_t i=0; i<numel; i++)
      dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows);
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    // Asynchronous currently pointless here
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    #endif
    if (mat1::VECTOR) {
      ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                 src.row_bytes(),cols*sizeof(typename mat2::data_type),
                 rows,cq);
      cast_buffer.sync();
      for (size_t i=0; i<rows*cols; i++)
        dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
    } else {
      if (mat2::VECTOR)
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   cols*sizeof(typename mat2::data_type),
                   cols*sizeof(typename mat2::data_type),rows,cq);
      else
        ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
                   src.row_bytes(),cols*sizeof(typename mat2::data_type),
                   rows,cq);
      cast_buffer.sync();
      size_t dst_i=0, buff_i=0, doff=dst.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
          buff_i++;
          dst_i++;
        }
        dst_i+=doff;
      }
    }
  }
};
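// These helpers are normally reached through ucl_cast_copy below; a sketch
// of the destination-on-host case (hypothetical names, using the buffer
// kind that the ucl_copy routines below choose for this direction):
//
//   UCL_H_Vec<float> buffer; buffer.alloc(n,host_vec,UCL_READ_ONLY);
//   ucl_cast_copy(host_vec,dev_vec,n,buffer,false); // device data is moved
//                                                   // into buffer, then cast
//                                                   // element-wise into dst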
// Source is on host
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat1::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq) {
    for (size_t i=0; i<numel; i++)
      cast_buffer[i]=static_cast<typename mat1::data_type>(src[i]);
    ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
    #endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat1::data_type>(src[si]);
            ci++;
            si++;
          }
          ci+=co;
          si+=so;
        }
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows);
      } else {
        for (size_t i=0; i<rows*cols; i++)
          cast_buffer[i]=static_cast<typename mat1::data_type>(src[i]);
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
                   cols*sizeof(typename mat1::data_type),
                   cols*sizeof(typename mat1::data_type),rows);
      }
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat1::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat1::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows);
    }
  }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq) {
    #ifdef UCL_DEBUG
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
    if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
    if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
    if (mat3::VECTOR==0) {
      assert(cast_buffer.rows()>=rows && cast_buffer.cols()>=cols);
      assert(dst.rows()>=rows && dst.cols()>=cols);
    }
    #endif
    if (mat2::VECTOR) {
      if (mat3::VECTOR==0) {
        size_t ci=0, si=0, co=cast_buffer.cols()-cols, so=src.cols()-cols;
        for (size_t i=0; i<rows; i++) {
          for (size_t j=0; j<cols; j++) {
            cast_buffer[ci]=static_cast<typename mat1::data_type>(src[si]);
            ci++;
            si++;
          }
          ci+=co;
          si+=so;
        }
        ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,cast_buffer.row_bytes(),
                   cols*sizeof(typename mat1::data_type),rows,cq);
      } else {
        for (size_t i=0; i<rows*cols; i++)
          cast_buffer[i]=static_cast<typename mat1::data_type>(src[i]);
        ucl_mv_cpy(dst,dst.row_bytes(),
                   cast_buffer,cols*sizeof(typename mat1::data_type),
                   cols*sizeof(typename mat1::data_type),rows,cq);
      }
    } else if (mat1::VECTOR) {
      size_t src_i=0, buf_i=0, soff=src.cols()-cols;
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat1::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=soff;
      }
      ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,
                 cq);
    } else {
      size_t src_i=0, buf_i=0, so=src.cols()-cols, co, spitch;
      if (mat3::VECTOR==0) {
        co=cast_buffer.cols()-cols;
        spitch=cast_buffer.row_bytes();
      } else {
        co=0;
        spitch=cols*sizeof(typename mat1::data_type);
      }
      for (size_t i=0; i<rows; i++) {
        for (size_t j=0; j<cols; j++) {
          cast_buffer[buf_i]=static_cast<typename mat1::data_type>(src[src_i]);
          buf_i++;
          src_i++;
        }
        src_i+=so;
        buf_i+=co;
      }
      ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,spitch,
                 cols*sizeof(typename mat1::data_type),rows,cq);
    }
  }
};
// Neither on host or both on host
template <> struct _ucl_cast_copy<1,1> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq)
    { assert(0==1); }
};

// Neither on host or both on host
template <> struct _ucl_cast_copy<0,0> {
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer, command_queue &cq)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
                        mat3 &cast_buffer)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer)
    { assert(0==1); }
  template <class mat1, class mat2, class mat3>
  static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
                        const size_t cols, mat3 &cast_buffer,
                        command_queue &cq)
    { assert(0==1); }
};

// --------------------------------------------------------------------------
// - 1D COPY - SPECIFIED NUMBER OF ELEMENTS
// --------------------------------------------------------------------------

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,cq);
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,cq);
  }
}

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
  * \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
                          mat3 &cast_buffer, const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.numel()>=numel && src.numel()>=numel);
  assert(cast_buffer.numel()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,numel,async);
  else if (async)
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer,dst.cq());
  else
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                      cast_buffer);
}
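// A sketch of the buffer-reuse pattern described in the header comments
// (hypothetical names; the buffer is allocated once, then reused so no
// temporary allocation occurs inside the loop):
//
//   UCL_H_Vec<float> buffer; buffer.alloc(n,dev_vec,UCL_WRITE_ONLY);
//   for (int iter=0; iter<steps; iter++) {
//     // ... update host_vec on the host ...
//     ucl_cast_copy(dev_vec,host_vec,n,buffer,true); // host -> device w/cast
//   }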
/// Asynchronous copy of matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - Padding for 2D matrices is not considered in this routine
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     command_queue &cq) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel &&
         src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer,cq);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}

/// Copy matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
  * \param async Perform non-blocking copy (ignored for host to host copy)
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - Padding for 2D matrices is not considered in this routine
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
                     const bool async) {
  #ifdef UCL_DEBUG
  assert(dst.row_size()*dst.rows()>=numel &&
         src.row_size()*src.rows()>=numel);
  assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
  else if (async)
    ucl_copy(dst,src,numel,dst.cq());
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(numel,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
                                                        cast_buffer);
    }
  } else
    ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
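// The one-shot equivalent (illustrative): when the element types differ,
// each call below allocates a temporary UCL_H_Vec casting buffer
// internally, which the explicit-buffer ucl_cast_copy variants avoid:
//
//   ucl_copy(dev_vec,host_vec,n,false); // casts through a temporary buffer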
// --------------------------------------------------------------------------
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
// --------------------------------------------------------------------------

/// Asynchronous copy of subset matrix rows/cols with cast (Device/Host
/// transfer)
/** \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper-left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper-left tile of the
  *   matrix
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,async);
  else if (async)
    ucl_cast_copy(dst,src,rows,cols,cast_buffer,dst.cq());
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer);
  }
}

/// Asynchronous copy of subset matrix rows,cols with cast (Device/Host
/// transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper-left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper-left tile of the
  *   matrix
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
                          const size_t cols, mat3 &cast_buffer,
                          command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,rows,cols,cq);
  else {
    #ifdef UCL_DEBUG
    _check_ucl_copy_perm(dst,src);
    #endif
    _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                      cast_buffer,cq);
  }
}
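// A sketch of a 2D tile copy with cast (hypothetical names; with dst on the
// host, the buffer matches the device element type, as in ucl_copy below):
//
//   UCL_H_Vec<float> buffer; buffer.alloc(6,host_mat,UCL_READ_ONLY);
//   ucl_cast_copy(host_mat,dev_mat,2,3,buffer,false); // upper-left 2x3 tile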
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper-left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper-left tile of the
  *   matrix
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, command_queue &cq) {
  #ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
  #endif
  if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer,cq);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows,cq);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows,cq);
  }
}
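// For example, the 2x3-tile case from the table at the top of this file
// (an illustrative sketch; "dev" is a hypothetical initialized UCL_Device):
//
//   UCL_D_Mat<float> big;  big.alloc(3,5,dev);  // 3x5 matrix on the device
//   UCL_H_Vec<float> tile; tile.alloc(6,dev);   // room for a 2x3 tile
//   ucl_copy(tile,big,2,3,tile.cq()); // rows 0-1, cols 0-2 -> host vector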
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
  * - If src is a vector, routine assumes a row-major rows by cols copy
  * - If src is a matrix, routine will copy the upper-left tile of the matrix
  * - If dst is a vector, routine assumes a row-major rows by cols copy
  * - If dst is a matrix, routine will copy into the upper-left tile of the
  *   matrix
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
                     const size_t cols, const bool async) {
  #ifdef UCL_DEBUG
  _check_ucl_copy_perm(dst,src);
  #endif
  if (async)
    ucl_copy(dst,src,rows,cols,dst.cq());
  else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
    _host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
  else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
           (mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
    if (mat1::MEM_TYPE==1) {
      UCL_H_Vec<typename mat2::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_READ_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    } else {
      UCL_H_Vec<typename mat1::data_type> cast_buffer;
      cast_buffer.alloc(rows*cols,dst,UCL_WRITE_ONLY);
      _ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
                                                        cast_buffer);
    }
  // If we are here, at least one of the matrices must have VECTOR=0
  } else if (mat1::VECTOR) {
    #ifdef UCL_DEBUG
    assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat2::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  } else if (mat2::VECTOR) {
    #ifdef UCL_DEBUG
    assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    assert(mat1::VECTOR==0);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
               cols*sizeof(typename mat1::data_type),rows);
  } else {
    #ifdef UCL_DEBUG
    assert(src.rows()>=rows && src.cols()>=cols);
    assert(dst.rows()>=rows && dst.cols()>=cols);
    assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
    #endif
    ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
               cols*sizeof(typename mat1::data_type),rows);
  }
}

// --------------------------------------------------------------------------
// - 1D/2D COPY
// --------------------------------------------------------------------------

/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
  * \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - The number of bytes copied is determined by the entire src data
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, const bool async) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}
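// Whole-container form (illustrative): the geometry is taken from the
// containers themselves, and padded matrices dispatch to the rows/cols
// routines above so that alignment padding is skipped:
//
//   ucl_cast_copy(host_mat,dev_mat,buffer,false); // entire matrix with cast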
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
  * - If the data types for the two matrices are the same, no cast is
  *   performed
  * - The number of bytes copied is determined by the entire src data
  * - Padding for 2D matrices is not considered in this routine
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
                          mat3 &cast_buffer, command_queue &cq) {
  if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
  else if (mat1::PADDED==1)
    ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
  else
    ucl_cast_copy(dst,src,src.numel(),cast_buffer,cq);
}

/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by the entire src data
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
  if (dst.row_bytes()==src.row_bytes() &&
      src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
      (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),cq);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),cq);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
  else
    ucl_copy(dst,src,src.numel(),cq);
}

/// Copy matrix/vector (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
  * - The number of bytes copied is determined by the entire src data
  * - If the data types of the two matrices are not the same,
  *   casting will be performed automatically as long as the copy is
  *   not device to device. For host/device transfers, a temporary
  *   buffer is created for the copy. When multiple casts occur, it is
  *   more efficient to create a permanent casting buffer that can
  *   be passed to an alternative copy routine.
  * - The copy should handle padding for 2D alignment correctly
  * - Copy from vector to matrix and vice versa is allowed
  * - The default stream is used for asynchronous copy
  * - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
  if (async)
    ucl_copy(dst,src,dst.cq());
  else if (dst.row_bytes()==src.row_bytes() &&
           src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
           (int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
    ucl_copy(dst,src,src.row_size()*src.rows(),async);
  else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
    ucl_copy(dst,src,src.rows(),src.cols(),async);
  else if (mat1::PADDED==1)
    ucl_copy(dst,src,dst.rows(),dst.cols(),async);
  else
    ucl_copy(dst,src,src.numel(),async);
}

#endif  // UCL_COPY_ALLOW