/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * \file * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. */ #pragma once #include "specializations/block_reduce_raking.cuh" #include "specializations/block_reduce_raking_commutative_only.cuh" #include "specializations/block_reduce_warp_reductions.cuh" #include "../config.cuh" #include "../util_ptx.cuh" #include "../util_type.cuh" #include "../thread/thread_operators.cuh" CUB_NAMESPACE_BEGIN /****************************************************************************** * Algorithmic variants ******************************************************************************/ /** * BlockReduceAlgorithm enumerates alternative algorithms for parallel * reduction across a CUDA thread block. */ enum BlockReduceAlgorithm { /** * \par Overview * An efficient "raking" reduction algorithm that only supports commutative * reduction operators (true for most operations, e.g., addition). * * \par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Threads in warps other than the first warp place * their partial reductions into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within the first * warp continue to accumulate by raking across segments of shared partial reductions * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par * \image html block_reduce.png *

\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.

* * \par Performance Considerations * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE * and is preferable when the reduction operator is commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall * throughput across the GPU when suitably occupied. However, turn-around latency may be * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable * when the GPU is under-occupied. */ BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, /** * \par Overview * An efficient "raking" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. \blocked. * * \par * Execution is comprised of three phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction * of its item(s) into shared memory. * -# Upsweep sequential reduction in shared memory. Threads within a * single warp rake across segments of shared partial reductions. * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. * * \par * \image html block_reduce.png *

\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.

* * \par Performance Considerations * - This variant performs more communication than BLOCK_REDUCE_RAKING * and is only preferable when the reduction operator is non-commutative. This variant * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall * throughput across the GPU when suitably occupied. However, turn-around latency may be * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable * when the GPU is under-occupied. */ BLOCK_REDUCE_RAKING, /** * \par Overview * A quick "tiled warp-reductions" reduction algorithm that supports commutative * (e.g., addition) and non-commutative (e.g., string concatenation) reduction * operators. * * \par * Execution is comprised of four phases: * -# Upsweep sequential reduction in registers (if threads contribute more * than one input each). Each thread then places the partial reduction * of its item(s) into shared memory. * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style * reduction within each warp. * -# A propagation phase where the warp reduction outputs in each warp are * updated with the aggregate from each preceding warp. * * \par * \image html block_scan_warpscans.png *

\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.

* * \par Performance Considerations * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall * throughput across the GPU. However turn-around latency may be lower and * thus useful when the GPU is under-occupied. */ BLOCK_REDUCE_WARP_REDUCTIONS, }; /****************************************************************************** * Block reduce ******************************************************************************/ /** * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) * \ingroup BlockModule * * \tparam T Data type being reduced * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) * \tparam PTX_ARCH [optional] \ptxversion * * \par Overview * - A reduction (or fold) * uses a binary combining operator to compute a single aggregate from a list of input elements. * - \rowmajor * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) * * \par Performance Considerations * - \granularity * - Very efficient (only one synchronization barrier). * - Incurs zero bank conflicts for most types * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: * - Summation (vs. generic reduction) * - \p BLOCK_THREADS is a multiple of the architecture's warp size * - Every thread has a valid input (i.e., full vs. partial-tiles) * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives * * \par A Simple Example * \blockcollective{BlockReduce} * \par * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * * \par Re-using dynamically allocating shared memory * The following example under the examples/block folder illustrates usage of * dynamically shared memory with BlockReduce and how to re-purpose * the same memory region: * example_block_reduce_dyn_smem.cu */ template < typename T, int BLOCK_DIM_X, BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, int BLOCK_DIM_Y = 1, int BLOCK_DIM_Z = 1, int PTX_ARCH = CUB_PTX_ARCH> class BlockReduce { private: /****************************************************************************** * Constants and type definitions ******************************************************************************/ /// Constants enum { /// The thread block size in threads BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, }; typedef BlockReduceWarpReductions WarpReductions; typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; typedef BlockReduceRaking Raking; /// Internal specialization type using InternalBlockReduce = cub::detail::conditional_t< ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS, WarpReductions, cub::detail::conditional_t>; // BlockReduceRaking /// Shared memory storage layout type for BlockReduce typedef typename InternalBlockReduce::TempStorage _TempStorage; /****************************************************************************** * Utility methods ******************************************************************************/ /// Internal storage allocator __device__ __forceinline__ _TempStorage& PrivateStorage() { __shared__ _TempStorage private_storage; return private_storage; } /****************************************************************************** * Thread fields ******************************************************************************/ /// Shared storage reference _TempStorage &temp_storage; /// Linear thread-id unsigned int linear_tid; public: /// \smemstorage{BlockReduce} struct TempStorage : Uninitialized<_TempStorage> {}; /******************************************************************//** * \name Collective constructors *********************************************************************/ //@{ /** * \brief Collective constructor using a private static allocation of shared memory as temporary storage. */ __device__ __forceinline__ BlockReduce() : temp_storage(PrivateStorage()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} /** * \brief Collective constructor using the specified memory allocation as temporary storage. */ __device__ __forceinline__ BlockReduce( TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage : temp_storage(temp_storage.Alias()), linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) {} //@} end member group /******************************************************************//** * \name Generic reductions *********************************************************************/ //@{ /** * \brief Computes a block-wide reduction for thread₀ using the specified binary reduction functor. Each thread contributes one input element. * * \par * - The return value is undefined in threads other than thread₀. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * ... * * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * * \endcode * * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op) ///< [in] Binary reduction functor { return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); } /** * \brief Computes a block-wide reduction for thread₀ using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. * * \par * - The return value is undefined in threads other than thread₀. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); * * \endcode * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template < int ITEMS_PER_THREAD, typename ReductionOp> __device__ __forceinline__ T Reduce( T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment ReductionOp reduction_op) ///< [in] Binary reduction functor { // Reduce partials T partial = internal::ThreadReduce(inputs, reduction_op); return Reduce(partial, reduction_op); } /** * \brief Computes a block-wide reduction for thread₀ using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. * * \par * - The return value is undefined in threads other than thread₀. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a max reduction of a partially-full tile of integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * if (threadIdx.x < num_valid) thread_data = ... * * // Compute the block-wide max for thread0 * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); * * \endcode * * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) */ template __device__ __forceinline__ T Reduce( T input, ///< [in] Calling thread's input ReductionOp reduction_op, ///< [in] Binary reduction functor int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) { // Determine if we scan skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } else { return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); } } //@} end member group /******************************************************************//** * \name Summation reductions *********************************************************************/ //@{ /** * \brief Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. Each thread contributes one input element. * * \par * - The return value is undefined in threads other than thread₀. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of 128 integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item * int thread_data; * ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * */ __device__ __forceinline__ T Sum( T input) ///< [in] Calling thread's input { return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); } /** * \brief Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. * * \par * - The return value is undefined in threads other than thread₀. * - \granularity * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of 512 integer items that * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads * where each thread owns 4 consecutive items. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[4]; * ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data); * * \endcode * * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. */ template __device__ __forceinline__ T Sum( T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment { // Reduce partials T partial = internal::ThreadReduce(inputs, cub::Sum()); return Sum(partial); } /** * \brief Computes a block-wide reduction for thread₀ using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. * * \par * - The return value is undefined in threads other than thread₀. * - \rowmajor * - \smemreuse * * \par Snippet * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that * are partitioned across 128 threads. * \par * \code * #include // or equivalently * * __global__ void ExampleKernel(int num_valid, ...) * { * // Specialize BlockReduce for a 1D block of 128 threads of type int * typedef cub::BlockReduce BlockReduce; * * // Allocate shared memory for BlockReduce * __shared__ typename BlockReduce::TempStorage temp_storage; * * // Each thread obtains an input item (up to num_items) * int thread_data; * if (threadIdx.x < num_valid) * thread_data = ... * * // Compute the block-wide sum for thread0 * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); * * \endcode * */ __device__ __forceinline__ T Sum( T input, ///< [in] Calling thread's input int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) { // Determine if we scan skip bounds checking if (num_valid >= BLOCK_THREADS) { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } else { return InternalBlockReduce(temp_storage).template Sum(input, num_valid); } } //@} end member group }; /** * \example example_block_reduce.cu */ CUB_NAMESPACE_END