/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /** * @file * The cub::WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. */ #pragma once #include #include #include CUB_NAMESPACE_BEGIN /** * @brief The WarpExchange class provides [collective](index.html#sec0) * methods for rearranging data partitioned across a CUDA warp. * @ingroup WarpModule * * @tparam T * The data type to be exchanged. * * @tparam ITEMS_PER_THREAD * The number of items partitioned onto each thread. * * @tparam LOGICAL_WARP_THREADS * [optional] The number of threads per "logical" warp (may be less * than the number of hardware warp threads). Default is the warp size of the * targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a * power of two. * * @tparam PTX_ARCH * [optional] \ptxversion * * @par Overview * - It is commonplace for a warp of threads to rearrange data items between * threads. For example, the global memory accesses prefer patterns where * data items are "striped" across threads (where consecutive threads access * consecutive items), yet most warp-wide operations prefer a "blocked" * partitioning of items across threads (where consecutive items belong to a * single thread). * - WarpExchange supports the following types of data exchanges: * - Transposing between [blocked](index.html#sec5sec3) and * [striped](index.html#sec5sec3) arrangements * - Scattering ranked items to a * [striped arrangement](index.html#sec5sec3) * * @par A Simple Example * @par * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads where * each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = * cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. */ template class WarpExchange { static_assert(PowerOfTwo::VALUE, "LOGICAL_WARP_THREADS must be a power of two"); constexpr static int ITEMS_PER_TILE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS + 1; constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH); constexpr static int LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH); // Insert padding if the number of items per thread is a power of two // and > 4 (otherwise we can typically use 128b loads) constexpr static bool INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE); constexpr static int PADDING_ITEMS = INSERT_PADDING ? (ITEMS_PER_TILE >> LOG_SMEM_BANKS) : 0; union _TempStorage { InputT items_shared[ITEMS_PER_TILE + PADDING_ITEMS]; }; // union TempStorage /// Shared storage reference _TempStorage &temp_storage; const unsigned int lane_id; const unsigned int warp_id; const unsigned int member_mask; public: /// \smemstorage{WarpExchange} struct TempStorage : Uninitialized<_TempStorage> {}; /*************************************************************************//** * @name Collective constructors ****************************************************************************/ //@{ WarpExchange() = delete; /** * @brief Collective constructor using the specified memory allocation as * temporary storage. */ explicit __device__ __forceinline__ WarpExchange(TempStorage &temp_storage) : temp_storage(temp_storage.Alias()) , lane_id(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS)) , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS)) , member_mask(WarpMask(warp_id)) { } //@} end member group /*************************************************************************//** * @name Data movement ****************************************************************************/ //@{ /** * @brief Transposes data items from blocked arrangement to * striped arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "blocked" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).BlockedToStriped(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * The corresponding output @p thread_data in those threads will be * { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * * @param[in] input_items * Items to exchange, converting between blocked and * striped arrangements. * * @param[out] output_items * Items from exchange, converting between striped and * blocked arrangements. May be aliased to @p input_items. */ template __device__ __forceinline__ void BlockedToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; output_items[item] = temp_storage.items_shared[idx]; } } /** * @brief Transposes data items from striped arrangement to * blocked arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "striped" to a * "blocked" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Load a tile of data striped across threads * int thread_data[items_per_thread]; * // ... * * // Collectively exchange data into a blocked arrangement across threads * WarpExchangeT(temp_storage[warp_id]).StripedToBlocked(thread_data, thread_data); * @endcode * @par * Suppose the set of striped input @p thread_data across the block of threads * is { [0,16,32,48], [1,17,33,49], ..., [15, 32, 47, 63] }. * The corresponding output @p thread_data in those threads will be * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [60,61,62,63] }. * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. */ template __device__ __forceinline__ void StripedToBlocked(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD]) { for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = LOGICAL_WARP_THREADS * item + lane_id; temp_storage.items_shared[idx] = input_items[item]; } WARP_SYNC(member_mask); for (int item = 0; item < ITEMS_PER_THREAD; item++) { const int idx = ITEMS_PER_THREAD * lane_id + item; output_items[item] = temp_storage.items_shared[idx]; } } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_data[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_data, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_data across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding output @p thread_data in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in,out] items Items to exchange * @param[in] ranks Corresponding scatter ranks */ template __device__ __forceinline__ void ScatterToStriped(InputT (&items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { ScatterToStriped(items, items, ranks); } /** * @brief Exchanges valid data items annotated by rank * into striped arrangement. * * @par * \smemreuse * * @par Snippet * The code snippet below illustrates the conversion from a "scatter" to a * "striped" arrangement of 64 integer items partitioned across 16 threads * where each thread owns 4 items. * @par * @code * #include // or equivalently * * __global__ void ExampleKernel(int *d_data, ...) * { * constexpr int warp_threads = 16; * constexpr int block_threads = 256; * constexpr int items_per_thread = 4; * constexpr int warps_per_block = block_threads / warp_threads; * const int warp_id = static_cast(threadIdx.x) / warp_threads; * * // Specialize WarpExchange for a virtual warp of 16 threads owning 4 integer items each * using WarpExchangeT = cub::WarpExchange; * * // Allocate shared memory for WarpExchange * __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_per_block]; * * // Obtain a segment of consecutive items that are blocked across threads * int thread_input[items_per_thread]; * int thread_ranks[items_per_thread]; * // ... * * // Collectively exchange data into a striped arrangement across threads * int thread_output[items_per_thread]; * WarpExchangeT(temp_storage[warp_id]).ScatterToStriped( * thread_input, thread_output, thread_ranks); * @endcode * @par * Suppose the set of input @p thread_input across the block of threads * is `{ [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }`, and the set of * @p thread_ranks is `{ [63,62,61,60], ..., [7,6,5,4], [3,2,1,0] }`. The * corresponding @p thread_output in those threads will be * `{ [63, 47, 31, 15], [62, 46, 30, 14], ..., [48, 32, 16, 0] }`. * * @tparam OffsetT [inferred] Signed integer type for local offsets * * @param[in] input_items * Items to exchange * * @param[out] output_items * Items from exchange. May be aliased to @p input_items. * * @param[in] ranks * Corresponding scatter ranks */ template __device__ __forceinline__ void ScatterToStriped(const InputT (&input_items)[ITEMS_PER_THREAD], OutputT (&output_items)[ITEMS_PER_THREAD], OffsetT (&ranks)[ITEMS_PER_THREAD]) { #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { if (INSERT_PADDING) { ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); } temp_storage.items_shared[ranks[ITEM]] = input_items[ITEM]; } WARP_SYNC(member_mask); #pragma unroll for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) { int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; if (INSERT_PADDING) { item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); } output_items[ITEM] = temp_storage.items_shared[item_offset]; } } //@} end member group }; CUB_NAMESPACE_END