/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * @file
 * Operations for writing linear segments of data from the CUDA warp
 */

#pragma once

#include <iterator>
#include <type_traits>

#include <cub/block/block_store.cuh>
#include <cub/config.cuh>
#include <cub/util_ptx.cuh>
#include <cub/util_type.cuh>
#include <cub/warp/warp_exchange.cuh>

CUB_NAMESPACE_BEGIN


/**
 * @brief cub::WarpStoreAlgorithm enumerates alternative algorithms for
 *        cub::WarpStore to write a blocked arrangement of items across a CUDA
 *        warp to a linear segment of memory.
 */
enum WarpStoreAlgorithm
{
  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) of data is written
   * directly to memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) decreases as the
   * access stride between threads increases (i.e., the number of items per
   * thread).
   */
  WARP_STORE_DIRECT,

  /**
   * @par Overview
   * A [striped arrangement](index.html#sec5sec3) of data is written
   * directly to memory.
   *
   * @par Performance Considerations
   * The utilization of memory transactions (coalescing) remains high
   * regardless of the number of items written per thread.
   */
  WARP_STORE_STRIPED,

  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) of data is written
   * directly to memory using CUDA's built-in vectorized stores as a
   * coalescing optimization. For example, st.global.v4.s32 instructions
   * will be generated when @p T = @p int and @p ITEMS_PER_THREAD % 4 == 0.
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high until
   *   the access stride between threads (i.e., the number of items per
   *   thread) exceeds the maximum vector store width (typically 4 items or
   *   64B, whichever is lower).
   * - The following conditions will prevent vectorization and cause writes
   *   to fall back to cub::WARP_STORE_DIRECT:
   *   - @p ITEMS_PER_THREAD is odd
   *   - The @p OutputIteratorT is not a simple pointer type
   *   - The block output offset is not quadword-aligned
   *   - The data type @p T is not a built-in primitive or CUDA vector type
   *     (e.g., @p short, @p int2, @p double, @p float2, etc.)
   */
  WARP_STORE_VECTORIZE,

  /**
   * @par Overview
   * A [blocked arrangement](index.html#sec5sec3) is locally
   * transposed and then efficiently written to memory as a
   * [striped arrangement](index.html#sec5sec3).
   *
   * @par Performance Considerations
   * - The utilization of memory transactions (coalescing) remains high
   *   regardless of the number of items written per thread.
   * - The local reordering incurs slightly longer latency and lower
   *   throughput than the direct cub::WARP_STORE_DIRECT and
   *   cub::WARP_STORE_VECTORIZE alternatives.
   */
  WARP_STORE_TRANSPOSE
};


/**
 * @brief The WarpStore class provides [collective](index.html#sec0)
 *        data movement methods for writing a
 *        [blocked arrangement](index.html#sec5sec3) of items partitioned
 *        across a CUDA warp to a linear segment of memory.
 *
 * @ingroup WarpModule
 * @ingroup UtilIo
 *
 * @tparam T
 *   The type of data to be written.
 *
 * @tparam ITEMS_PER_THREAD
 *   The number of consecutive items partitioned onto each thread.
 *
 * @tparam ALGORITHM
 *   [optional] cub::WarpStoreAlgorithm tuning policy enumeration.
 *   Default: cub::WARP_STORE_DIRECT.
 *
 * @tparam LOGICAL_WARP_THREADS
 *   [optional] The number of threads per "logical" warp (may be less than
 *   the number of hardware warp threads). Default is the warp size of the
 *   targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
 *   power of two.
 *
 * @tparam PTX_ARCH
 *   [optional] \ptxversion
 *
 * @par Overview
 * - The WarpStore class provides a single data movement abstraction that can
 *   be specialized to implement different cub::WarpStoreAlgorithm strategies.
 *   This facilitates different performance policies for different
 *   architectures, data types, granularity sizes, etc.
 * - WarpStore can be optionally specialized by different data movement
 *   strategies:
 *   -# cub::WARP_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3)
 *      of data is written directly to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_STRIPED. A [striped arrangement](index.html#sec5sec3)
 *      of data is written directly to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3)
 *      of data is written directly to memory using CUDA's built-in vectorized
 *      stores as a coalescing optimization.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 *   -# cub::WARP_STORE_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3)
 *      is locally transposed into a [striped arrangement](index.html#sec5sec3)
 *      which is then written to memory.
 *      [More...](@ref cub::WarpStoreAlgorithm)
 * - \rowmajor
 *
 * @par A Simple Example
 * @par
 * The code snippet below illustrates the storing of a "blocked" arrangement
 * of 64 integers across 16 threads (where each thread owns 4 consecutive
 * items) into a linear segment of memory. The store is specialized for
 * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among threads
 * so that memory references will be efficiently coalesced using a
 * warp-striped access pattern.
 * @par
 * @code
 * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
 *
 * __global__ void ExampleKernel(int *d_data, ...)
 * {
 *     constexpr int warp_threads = 16;
 *     constexpr int block_threads = 256;
 *     constexpr int items_per_thread = 4;
 *
 *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
 *     using WarpStoreT = WarpStore<int,
 *                                  items_per_thread,
 *                                  WARP_STORE_TRANSPOSE,
 *                                  warp_threads>;
 *
 *     constexpr int warps_in_block = block_threads / warp_threads;
 *     constexpr int tile_size = items_per_thread * warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *
 *     // Allocate shared memory for WarpStore
 *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_data[4];
 *     ...
 *
 *     // Store items to linear memory
 *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
 * @endcode
 * @par
 * Suppose the set of @p thread_data across the warp threads is
 * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
 * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
 */
template <typename           T,
          int                ITEMS_PER_THREAD,
          WarpStoreAlgorithm ALGORITHM            = WARP_STORE_DIRECT,
          int                LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
          int                PTX_ARCH             = CUB_PTX_ARCH>
class WarpStore
{
  static_assert(PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
                "LOGICAL_WARP_THREADS must be a power of two");

  constexpr static bool IS_ARCH_WARP = LOGICAL_WARP_THREADS ==
                                       CUB_WARP_THREADS(PTX_ARCH);

private:

  /// Store helper
  template <WarpStoreAlgorithm _POLICY, int DUMMY>
  struct StoreInternal;

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_DIRECT, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlocked(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_STRIPED, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items,
                                               valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_VECTORIZE, DUMMY>
  {
    typedef NullType TempStorage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage & /*temp_storage*/,
                                             int linear_tid)
        : linear_tid(linear_tid)
    {}

    __device__ __forceinline__ void Store(T *block_ptr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      StoreDirectBlocked(linear_tid, block_itr, items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
    }
  };

  template <int DUMMY>
  struct StoreInternal<WARP_STORE_TRANSPOSE, DUMMY>
  {
    using WarpExchangeT =
      WarpExchange<T, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, PTX_ARCH>;

    struct _TempStorage : WarpExchangeT::TempStorage
    {};

    struct TempStorage : Uninitialized<_TempStorage> {};

    _TempStorage &temp_storage;

    int linear_tid;

    __device__ __forceinline__ StoreInternal(TempStorage &temp_storage,
                                             int linear_tid)
        : temp_storage(temp_storage.Alias())
        , linear_tid(linear_tid)
    {}

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD])
    {
      WarpExchangeT(temp_storage).BlockedToStriped(items, items);
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items);
    }

    template <typename OutputIteratorT>
    __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                          T (&items)[ITEMS_PER_THREAD],
                                          int valid_items)
    {
      WarpExchangeT(temp_storage).BlockedToStriped(items, items);
      StoreDirectStriped<LOGICAL_WARP_THREADS>(linear_tid,
                                               block_itr,
                                               items,
                                               valid_items);
    }
  };

  /// Internal store implementation to use
  using InternalStore = StoreInternal<ALGORITHM, 0>;

  /// Shared memory storage layout type
  using _TempStorage = typename InternalStore::TempStorage;

  __device__ __forceinline__ _TempStorage& PrivateStorage()
  {
    __shared__ _TempStorage private_storage;
    return private_storage;
  }

  _TempStorage &temp_storage;

  int linear_tid;

public:

  struct TempStorage : Uninitialized<_TempStorage> {};

  /*************************************************************************//**
   * @name Collective constructors
   ****************************************************************************/
  //@{

  /**
   * @brief Collective constructor using a private static allocation of shared
   *        memory as temporary storage.
   */
  __device__ __forceinline__ WarpStore()
      : temp_storage(PrivateStorage())
      , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  /**
   * @brief Collective constructor using the specified memory allocation as
   *        temporary storage.
   */
  __device__ __forceinline__ WarpStore(TempStorage &temp_storage)
      : temp_storage(temp_storage.Alias())
      , linear_tid(IS_ARCH_WARP ? LaneId() : (LaneId() % LOGICAL_WARP_THREADS))
  {}

  //@}  end member group
  /*************************************************************************//**
   * @name Data movement
   ****************************************************************************/
  //@{

  /**
   * @brief Store items into a linear segment of memory.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @par
   * The code snippet below illustrates the storing of a "blocked" arrangement
   * of 64 integers across 16 threads (where each thread owns 4 consecutive
   * items) into a linear segment of memory. The store is specialized for
   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among
   * threads so that memory references will be efficiently coalesced using a
   * warp-striped access pattern.
   * @code
   * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   *     using WarpStoreT = WarpStore<int,
   *                                  items_per_thread,
   *                                  WARP_STORE_TRANSPOSE,
   *                                  warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpStore
   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
   *
   *     // Obtain a segment of consecutive items that are blocked across threads
   *     int thread_data[4];
   *     ...
   *
   *     // Store items to linear memory
   *     WarpStoreT(temp_storage[warp_id]).Store(d_data + warp_id * tile_size, thread_data);
   * @endcode
   * @par
   * Suppose the set of @p thread_data across the warp threads is
   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] }.
   * The output @p d_data will be 0, 1, 2, 3, 4, 5, ....
   *
   * @param[out] block_itr The thread block's base output iterator for storing to
   * @param[in] items Data to store
   */
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD])
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items);
  }

  /**
   * @brief Store items into a linear segment of memory, guarded by range.
   *
   * @par
   * \smemreuse
   *
   * @par Snippet
   * @par
   * The code snippet below illustrates the storing of a "blocked" arrangement
   * of 64 integers across 16 threads (where each thread owns 4 consecutive
   * items) into a linear segment of memory. The store is specialized for
   * @p WARP_STORE_TRANSPOSE, meaning items are locally reordered among
   * threads so that memory references will be efficiently coalesced using a
   * warp-striped access pattern.
   * @code
   * #include <cub/warp/warp_store.cuh>   // or equivalently <warp_store.cuh>
   *
   * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
   * {
   *     constexpr int warp_threads = 16;
   *     constexpr int block_threads = 256;
   *     constexpr int items_per_thread = 4;
   *
   *     // Specialize WarpStore for a virtual warp of 16 threads owning 4 integer items each
   *     using WarpStoreT = WarpStore<int,
   *                                  items_per_thread,
   *                                  WARP_STORE_TRANSPOSE,
   *                                  warp_threads>;
   *
   *     constexpr int warps_in_block = block_threads / warp_threads;
   *     constexpr int tile_size = items_per_thread * warp_threads;
   *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
   *
   *     // Allocate shared memory for WarpStore
   *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
   *
   *     // Obtain a segment of consecutive items that are blocked across threads
   *     int thread_data[4];
   *     ...
   *
   *     // Store items to linear memory
   *     WarpStoreT(temp_storage[warp_id]).Store(
   *       d_data + warp_id * tile_size, thread_data, valid_items);
   * @endcode
   * @par
   * Suppose the set of @p thread_data across the warp threads is
   * { [0,1,2,3], [4,5,6,7], ..., [60,61,62,63] } and @p valid_items
   * is @p 5. The output @p d_data will be 0, 1, 2, 3, 4, ?, ?, ...,
   * with only the first two threads being unmasked to store portions of
   * valid data.
   *
   * @param[out] block_itr The thread block's base output iterator for storing to
   * @param[in] items Data to store
   * @param[in] valid_items Number of valid items to write
   */
  template <typename OutputIteratorT>
  __device__ __forceinline__ void Store(OutputIteratorT block_itr,
                                        T (&items)[ITEMS_PER_THREAD],
                                        int valid_items)
  {
    InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
  }

  //@}  end member group
};

CUB_NAMESPACE_END
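
/*
 * Illustrative usage sketch. The Doxygen examples above leave the production
 * of thread_data and the kernel launch elided; the snippet below fills in
 * those pieces under stated assumptions: the ExampleStoreKernel name, the
 * lane-indexed fill of thread_data, and the <<<1, block_threads>>> launch are
 * hypothetical choices consistent with the 16-lane / 4-items-per-thread
 * configuration documented above, not an API defined by this file.
 *
 * #include <cub/warp/warp_store.cuh>
 *
 * __global__ void ExampleStoreKernel(int *d_data)
 * {
 *     constexpr int warp_threads     = 16;
 *     constexpr int block_threads    = 256;
 *     constexpr int items_per_thread = 4;
 *
 *     // One logical 16-thread warp per 64-item tile, using WARP_STORE_TRANSPOSE
 *     using WarpStoreT = cub::WarpStore<int,
 *                                       items_per_thread,
 *                                       cub::WARP_STORE_TRANSPOSE,
 *                                       warp_threads>;
 *
 *     constexpr int warps_in_block = block_threads / warp_threads;
 *     constexpr int tile_size      = items_per_thread * warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *     const int lane_id = static_cast<int>(threadIdx.x) % warp_threads;
 *
 *     // Allocate one TempStorage per logical warp in the block
 *     __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block];
 *
 *     // Each lane owns items_per_thread consecutive values of its warp's tile
 *     int thread_data[items_per_thread];
 *     for (int i = 0; i < items_per_thread; ++i)
 *     {
 *         thread_data[i] = lane_id * items_per_thread + i;
 *     }
 *
 *     // Cooperatively transpose and write the tile as a coalesced striped store
 *     WarpStoreT(temp_storage[warp_id])
 *         .Store(d_data + warp_id * tile_size, thread_data);
 * }
 *
 * // Host side: one 256-thread block writes warps_in_block tiles of 64 ints,
 * // so d_data must hold at least block_threads * items_per_thread integers.
 * // ExampleStoreKernel<<<1, block_threads>>>(d_data);
 */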