/*M/////////////////////////////////////////////////////////////////////////////////////// // // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. // // By downloading, copying, installing or using the software you agree to this license. // If you do not agree to this license, do not download, install, // copy or use the software. // // // License Agreement // For Open Source Computer Vision Library // // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. // Copyright (C) 2009, Willow Garage Inc., all rights reserved. // Copyright (C) 2013, OpenCV Foundation, all rights reserved. // Third party copyrights are property of their respective owners. // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: // // * Redistribution's of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // // * Redistribution's in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * The name of the copyright holders may not be used to endorse or promote products // derived from this software without specific prior written permission. // // This software is provided by the copyright holders and contributors "as is" and // any express or implied warranties, including, but not limited to, the implied // warranties of merchantability and fitness for a particular purpose are disclaimed. // In no event shall the Intel Corporation or contributors be liable for any direct, // indirect, incidental, special, exemplary, or consequential damages // (including, but not limited to, procurement of substitute goods or services; // loss of use, data, or profits; or business interruption) however caused // and on any theory of liability, whether in contract, strict liability, // or tort (including negligence or otherwise) arising in any way out of // the use of this software, even if advised of the possibility of such damage. // //M*/ #pragma once #ifndef __OPENCV_CUDEV_BLOCK_REDUCE_DETAIL_HPP__ #define __OPENCV_CUDEV_BLOCK_REDUCE_DETAIL_HPP__ #include "../../common.hpp" #include "../../util/tuple.hpp" #include "../../util/type_traits.hpp" #include "../../warp/warp.hpp" #include "../../warp/shuffle.hpp" namespace cv { namespace cudev { namespace block_reduce_detail { // GetType template struct GetType; template struct GetType { typedef T type; }; template struct GetType { typedef T type; }; template struct GetType { typedef T type; }; // For template struct For { template __device__ static void loadToSmem(const PointerTuple& smem, const ValTuple& val, uint tid) { get(smem)[tid] = get(val); For::loadToSmem(smem, val, tid); } template __device__ static void loadFromSmem(const PointerTuple& smem, const ValTuple& val, uint tid) { get(val) = get(smem)[tid]; For::loadFromSmem(smem, val, tid); } template __device__ static void merge(const PointerTuple& smem, const ValTuple& val, uint tid, uint delta, const OpTuple& op) { typename GetType::type>::type reg = get(smem)[tid + delta]; get(smem)[tid] = get(val) = get(op)(get(val), reg); For::merge(smem, val, tid, delta, op); } #if CV_CUDEV_ARCH >= 300 template __device__ static void mergeShfl(const ValTuple& val, uint delta, uint width, const OpTuple& op) { typename GetType::type>::type reg = shfl_down(get(val), delta, width); get(val) = get(op)(get(val), reg); For::mergeShfl(val, delta, width, op); } #endif }; template struct For { template __device__ __forceinline__ static void loadToSmem(const PointerTuple&, const ValTuple&, uint) { } template __device__ __forceinline__ static void loadFromSmem(const PointerTuple&, const ValTuple&, uint) { } template __device__ __forceinline__ static void merge(const PointerTuple&, const ValTuple&, uint, uint, const OpTuple&) { } #if CV_CUDEV_ARCH >= 300 template __device__ __forceinline__ static void mergeShfl(const ValTuple&, uint, uint, const OpTuple&) { } #endif }; // loadToSmem / loadFromSmem template __device__ __forceinline__ void loadToSmem(volatile T* smem, T& val, uint tid) { smem[tid] = val; } template __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& val, uint tid) { val = smem[tid]; } template __device__ __forceinline__ void loadToSmem(const tuple& smem, const tuple& val, uint tid) { For<0, tuple_size >::value>::loadToSmem(smem, val, tid); } template __device__ __forceinline__ void loadFromSmem(const tuple& smem, const tuple& val, uint tid) { For<0, tuple_size >::value>::loadFromSmem(smem, val, tid); } // merge template __device__ __forceinline__ void merge(volatile T* smem, T& val, uint tid, uint delta, const Op& op) { T reg = smem[tid + delta]; smem[tid] = val = op(val, reg); } template __device__ __forceinline__ void merge(const tuple& smem, const tuple& val, uint tid, uint delta, const tuple& op) { For<0, tuple_size >::value>::merge(smem, val, tid, delta, op); } // mergeShfl #if CV_CUDEV_ARCH >= 300 template __device__ __forceinline__ void mergeShfl(T& val, uint delta, uint width, const Op& op) { T reg = shfl_down(val, delta, width); val = op(val, reg); } template __device__ __forceinline__ void mergeShfl(const tuple& val, uint delta, uint width, const tuple& op) { For<0, tuple_size >::value>::mergeShfl(val, delta, width, op); } #endif // Generic template struct Generic { template __device__ static void reduce(Pointer smem, Reference val, uint tid, Op op) { loadToSmem(smem, val, tid); if (N >= 32) __syncthreads(); if (N >= 2048) { if (tid < 1024) merge(smem, val, tid, 1024, op); __syncthreads(); } if (N >= 1024) { if (tid < 512) merge(smem, val, tid, 512, op); __syncthreads(); } if (N >= 512) { if (tid < 256) merge(smem, val, tid, 256, op); __syncthreads(); } if (N >= 256) { if (tid < 128) merge(smem, val, tid, 128, op); __syncthreads(); } if (N >= 128) { if (tid < 64) merge(smem, val, tid, 64, op); __syncthreads(); } if (N >= 64) { if (tid < 32) merge(smem, val, tid, 32, op); } if (tid < 16) { merge(smem, val, tid, 16, op); merge(smem, val, tid, 8, op); merge(smem, val, tid, 4, op); merge(smem, val, tid, 2, op); merge(smem, val, tid, 1, op); } } }; // Unroll template struct Unroll { __device__ static void loop(Pointer smem, Reference val, uint tid, Op op) { merge(smem, val, tid, I, op); Unroll::loop(smem, val, tid, op); } #if CV_CUDEV_ARCH >= 300 __device__ static void loopShfl(Reference val, Op op, uint N) { mergeShfl(val, I, N, op); Unroll::loopShfl(val, op, N); } #endif }; template struct Unroll<0, Pointer, Reference, Op> { __device__ __forceinline__ static void loop(Pointer, Reference, uint, Op) { } #if CV_CUDEV_ARCH >= 300 __device__ __forceinline__ static void loopShfl(Reference, Op, uint) { } #endif }; // WarpOptimized template struct WarpOptimized { template __device__ static void reduce(Pointer smem, Reference val, uint tid, Op op) { #if CV_CUDEV_ARCH >= 300 (void) smem; (void) tid; Unroll::loopShfl(val, op, N); #else loadToSmem(smem, val, tid); if (tid < N / 2) Unroll::loop(smem, val, tid, op); #endif } }; // GenericOptimized32 template struct GenericOptimized32 { enum { M = N / 32 }; template __device__ static void reduce(Pointer smem, Reference val, uint tid, Op op) { const uint laneId = Warp::laneId(); #if CV_CUDEV_ARCH >= 300 Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize); if (laneId == 0) loadToSmem(smem, val, tid / 32); #else loadToSmem(smem, val, tid); if (laneId < 16) Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op); __syncthreads(); if (laneId == 0) loadToSmem(smem, val, tid / 32); #endif __syncthreads(); loadFromSmem(smem, val, tid); if (tid < 32) { #if CV_CUDEV_ARCH >= 300 Unroll::loopShfl(val, op, M); #else Unroll::loop(smem, val, tid, op); #endif } } }; template struct Dispatcher { typedef typename SelectIf< (N <= 32) && IsPowerOf2::value, WarpOptimized, typename SelectIf< (N <= 1024) && IsPowerOf2::value, GenericOptimized32, Generic >::type >::type reductor; }; } }} #endif