/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__
#define __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__

#include "../common.hpp"
#include "../vec_traits.hpp"
#include "../functional.hpp"

//! @cond IGNORED

namespace cv { namespace cuda { namespace device
{
    namespace transform_detail
    {
        //! Read Write Traits

        template <typename T, typename D, int shift> struct UnaryReadWriteTraits
        {
            typedef typename TypeVec<T, shift>::vec_type read_type;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };

        template <typename T1, typename T2, typename D, int shift> struct BinaryReadWriteTraits
        {
            typedef typename TypeVec<T1, shift>::vec_type read_type1;
            typedef typename TypeVec<T2, shift>::vec_type read_type2;
            typedef typename TypeVec<D, shift>::vec_type write_type;
        };
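        // Illustrative note (not from the original header): for single-channel element
        // types these traits collapse to the vector types provided by TypeVec in
        // ../vec_traits.hpp. For example, assuming uchar input and float output with
        // shift = 4:
        //
        //   UnaryReadWriteTraits<uchar, float, 4>::read_type   // -> uchar4
        //   UnaryReadWriteTraits<uchar, float, 4>::write_type  // -> float4
        //
        // so one vectorized load/store moves four consecutive pixels per thread.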
        //! Transform kernels

        template <int shift> struct OpUnroller;

        template <> struct OpUnroller<1>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
            }
        };

        template <> struct OpUnroller<2>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
            }
        };

        template <> struct OpUnroller<3>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
            }
        };

        template <> struct OpUnroller<4>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src.w);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.x = op(src1.x, src2.x);
                if (mask(y, x_shifted + 1))
                    dst.y = op(src1.y, src2.y);
                if (mask(y, x_shifted + 2))
                    dst.z = op(src1.z, src2.z);
                if (mask(y, x_shifted + 3))
                    dst.w = op(src1.w, src2.w);
            }
        };

        template <> struct OpUnroller<8>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T& src, D& dst, const Mask& mask, const UnOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src.a7);
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static __device__ __forceinline__ void unroll(const T1& src1, const T2& src2, D& dst, const Mask& mask, const BinOp& op, int x_shifted, int y)
            {
                if (mask(y, x_shifted))
                    dst.a0 = op(src1.a0, src2.a0);
                if (mask(y, x_shifted + 1))
                    dst.a1 = op(src1.a1, src2.a1);
                if (mask(y, x_shifted + 2))
                    dst.a2 = op(src1.a2, src2.a2);
                if (mask(y, x_shifted + 3))
                    dst.a3 = op(src1.a3, src2.a3);
                if (mask(y, x_shifted + 4))
                    dst.a4 = op(src1.a4, src2.a4);
                if (mask(y, x_shifted + 5))
                    dst.a5 = op(src1.a5, src2.a5);
                if (mask(y, x_shifted + 6))
                    dst.a6 = op(src1.a6, src2.a6);
                if (mask(y, x_shifted + 7))
                    dst.a7 = op(src1.a7, src2.a7);
            }
        };
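        // Illustrative note: OpUnroller<1..4> address the .x/.y/.z/.w fields of the CUDA
        // built-in vector types, while OpUnroller<8> addresses the .a0 .. .a7 fields of
        // the 8-element helper structs declared in ../vec_traits.hpp. The mask is tested
        // per column, so a partially masked vector updates only the selected lanes, e.g.
        // (sketch, assuming smart_shift == 4):
        //
        //   OpUnroller<4>::unroll(src_vec, dst_vec, mask, op, x * 4, y);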
        template <typename T, typename D, typename UnOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T> src_, PtrStep<D> dst_, const Mask mask, const UnOp op)
        {
            typedef TransformFunctorTraits<UnOp> ft;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::read_type read_type;
            typedef typename UnaryReadWriteTraits<T, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src_.rows)
            {
                const T* src = src_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src_.cols)
                {
                    const read_type src_n_el = ((const read_type*)src)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Tail of the row: fall back to element-wise processing.
                    for (int real_x = x_shifted; real_x < src_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src[real_x]);
                    }
                }
            }
        }

        template <typename T, typename D, typename UnOp, typename Mask>
        __global__ static void transformSimple(const PtrStepSz<T> src, PtrStep<D> dst, const Mask mask, const UnOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src.cols && y < src.rows && mask(y, x))
            {
                dst.ptr(y)[x] = op(src.ptr(y)[x]);
            }
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSmart(const PtrStepSz<T1> src1_, const PtrStep<T2> src2_, PtrStep<D> dst_,
            const Mask mask, const BinOp op)
        {
            typedef TransformFunctorTraits<BinOp> ft;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type1 read_type1;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::read_type2 read_type2;
            typedef typename BinaryReadWriteTraits<T1, T2, D, ft::smart_shift>::write_type write_type;

            const int x = threadIdx.x + blockIdx.x * blockDim.x;
            const int y = threadIdx.y + blockIdx.y * blockDim.y;
            const int x_shifted = x * ft::smart_shift;

            if (y < src1_.rows)
            {
                const T1* src1 = src1_.ptr(y);
                const T2* src2 = src2_.ptr(y);
                D* dst = dst_.ptr(y);

                if (x_shifted + ft::smart_shift - 1 < src1_.cols)
                {
                    const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                    const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                    write_type dst_n_el = ((const write_type*)dst)[x];

                    OpUnroller<ft::smart_shift>::unroll(src1_n_el, src2_n_el, dst_n_el, mask, op, x_shifted, y);

                    ((write_type*)dst)[x] = dst_n_el;
                }
                else
                {
                    // Tail of the row: fall back to element-wise processing.
                    for (int real_x = x_shifted; real_x < src1_.cols; ++real_x)
                    {
                        if (mask(y, real_x))
                            dst[real_x] = op(src1[real_x], src2[real_x]);
                    }
                }
            }
        }

        template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
        static __global__ void transformSimple(const PtrStepSz<T1> src1, const PtrStep<T2> src2, PtrStep<D> dst,
            const Mask mask, const BinOp op)
        {
            const int x = blockDim.x * blockIdx.x + threadIdx.x;
            const int y = blockDim.y * blockIdx.y + threadIdx.y;

            if (x < src1.cols && y < src1.rows && mask(y, x))
            {
                const T1 src1_data = src1.ptr(y)[x];
                const T2 src2_data = src2.ptr(y)[x];
                dst.ptr(y)[x] = op(src1_data, src2_data);
            }
        }
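        // For reference, a sketch (assumed, not part of this header) of how the public
        // wrapper in ../transform.hpp typically selects one of the dispatchers below:
        // the vectorized path is chosen only for single-channel types whose functor
        // traits report smart_shift != 1, roughly:
        //
        //   template <typename T, typename D, typename UnOp, typename Mask>
        //   static inline void transform(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, const Mask& mask, cudaStream_t stream)
        //   {
        //       typedef TransformFunctorTraits<UnOp> ft;
        //       transform_detail::TransformDispatcher<VecTraits<T>::cn == 1 && VecTraits<D>::cn == 1 && ft::smart_shift != 1>::call(src, dst, op, mask, stream);
        //   }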
        template <bool UseSmart> struct TransformDispatcher;

        template<> struct TransformDispatcher<false>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x), divUp(src.rows, threads.y), 1);

                transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                const dim3 threads(ft::simple_block_dim_x, ft::simple_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x), divUp(src1.rows, threads.y), 1);

                transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };

        template<> struct TransformDispatcher<true>
        {
            template <typename T, typename D, typename UnOp, typename Mask>
            static void call(PtrStepSz<T> src, PtrStepSz<D> dst, UnOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<UnOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                // Fall back to the simple kernel when data pointers or steps are not
                // aligned for vectorized (smart_shift-wide) access.
                if (!isAligned(src.data, ft::smart_shift * sizeof(T)) || !isAligned(src.step, ft::smart_shift * sizeof(T)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src.cols, threads.x * ft::smart_shift), divUp(src.rows, threads.y), 1);

                transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }

            template <typename T1, typename T2, typename D, typename BinOp, typename Mask>
            static void call(PtrStepSz<T1> src1, PtrStepSz<T2> src2, PtrStepSz<D> dst, BinOp op, Mask mask, cudaStream_t stream)
            {
                typedef TransformFunctorTraits<BinOp> ft;

                CV_StaticAssert(ft::smart_shift != 1, "");

                // Same alignment fallback for the binary case.
                if (!isAligned(src1.data, ft::smart_shift * sizeof(T1)) || !isAligned(src1.step, ft::smart_shift * sizeof(T1)) ||
                    !isAligned(src2.data, ft::smart_shift * sizeof(T2)) || !isAligned(src2.step, ft::smart_shift * sizeof(T2)) ||
                    !isAligned(dst.data, ft::smart_shift * sizeof(D)) || !isAligned(dst.step, ft::smart_shift * sizeof(D)))
                {
                    TransformDispatcher<false>::call(src1, src2, dst, op, mask, stream);
                    return;
                }

                const dim3 threads(ft::smart_block_dim_x, ft::smart_block_dim_y, 1);
                const dim3 grid(divUp(src1.cols, threads.x * ft::smart_shift), divUp(src1.rows, threads.y), 1);

                transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
                cudaSafeCall( cudaGetLastError() );

                if (stream == 0)
                    cudaSafeCall( cudaDeviceSynchronize() );
            }
        };
    } // namespace transform_detail
}}} // namespace cv { namespace cuda { namespace device

//! @endcond

#endif // __OPENCV_CUDA_TRANSFORM_DETAIL_HPP__