/******************************************************************************
 * Copyright (c) 2011-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

#pragma once

// std::is_same is used directly below; include it explicitly rather than
// relying on a transitive include.
#include <type_traits>

// NOTE(review): the original include targets were lost in extraction
// (four bare `#include` directives). Restored from the upstream CUB layout
// of this file — verify against the repository.
#include "../config.cuh"
#include "../block/block_merge_sort.cuh"
#include "../util_ptx.cuh"
#include "../util_type.cuh"

CUB_NAMESPACE_BEGIN

/**
 * @brief The WarpMergeSort class provides methods for sorting items
 *        partitioned across a CUDA warp using a merge sorting method.
 * @ingroup WarpModule
 *
 * @tparam KeyT
 *   Key type
 *
 * @tparam ITEMS_PER_THREAD
 *   The number of items per thread
 *
 * @tparam LOGICAL_WARP_THREADS
 *   <b>[optional]</b> The number of threads per "logical" warp (may be less
 *   than the number of hardware warp threads). Default is the warp size of the
 *   targeted CUDA compute-capability (e.g., 32 threads for SM86). Must be a
 *   power of two.
 *
 * @tparam ValueT
 *   <b>[optional]</b> Value type (default: cub::NullType, which indicates a
 *   keys-only sort)
 *
 * @tparam PTX_ARCH
 *   <b>[optional]</b> \ptxversion
 *
 * @par Overview
 *   WarpMergeSort arranges items into ascending order using a comparison
 *   functor with less-than semantics. Merge sort can handle arbitrary types
 *   and comparison functors.
 *
 * @par A Simple Example
 * @par
 *   The code snippet below illustrates a sort of 64 integer keys that are
 *   partitioned across 16 threads where each thread owns 4 consecutive items.
 * @par
 * @code
 * #include <cub/cub.cuh>  // or equivalently <cub/warp/warp_merge_sort.cuh>
 *
 * struct CustomLess
 * {
 *   template <typename DataType>
 *   __device__ bool operator()(const DataType &lhs, const DataType &rhs)
 *   {
 *     return lhs < rhs;
 *   }
 * };
 *
 * __global__ void ExampleKernel(...)
 * {
 *     constexpr int warp_threads = 16;
 *     constexpr int block_threads = 256;
 *     constexpr int items_per_thread = 4;
 *     constexpr int warps_per_block = block_threads / warp_threads;
 *     const int warp_id = static_cast<int>(threadIdx.x) / warp_threads;
 *
 *     // Specialize WarpMergeSort for a virtual warp of 16 threads
 *     // owning 4 integer items each
 *     using WarpMergeSortT =
 *       cub::WarpMergeSort<int, items_per_thread, warp_threads>;
 *
 *     // Allocate shared memory for WarpMergeSortT
 *     __shared__ typename WarpMergeSortT::TempStorage temp_storage[warps_per_block];
 *
 *     // Obtain a segment of consecutive items that are blocked across threads
 *     int thread_keys[items_per_thread];
 *     // ...
 *
 *     WarpMergeSortT(temp_storage[warp_id]).Sort(thread_keys, CustomLess());
 *     // ...
 * }
 * @endcode
 * @par
 *   Suppose the set of input @p thread_keys across the block of threads is
 *   <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
 *   The corresponding output @p thread_keys in those threads will be
 *   <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
 */
template <
  typename KeyT,
  int ITEMS_PER_THREAD,
  int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS,
  typename ValueT          = NullType,
  int PTX_ARCH             = CUB_PTX_ARCH>
class WarpMergeSort
    // NOTE(review): the base-specifier's template arguments were stripped in
    // extraction; the injected-class-name is not usable here without an
    // argument list, so the current instantiation is spelled out in full.
    : public BlockMergeSortStrategy<
        KeyT,
        ValueT,
        LOGICAL_WARP_THREADS,
        ITEMS_PER_THREAD,
        WarpMergeSort<KeyT, ITEMS_PER_THREAD, LOGICAL_WARP_THREADS, ValueT, PTX_ARCH>>
{
private:
  // True when the logical warp spans an entire hardware warp; lets the
  // constructor skip the modulo/division when mapping lanes.
  constexpr static bool IS_ARCH_WARP =
    LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH);

  // Keys-only sort is signalled by the sentinel value type cub::NullType.
  constexpr static bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;

  // Total number of items handled by one logical warp.
  constexpr static int TILE_SIZE = ITEMS_PER_THREAD * LOGICAL_WARP_THREADS;

  using BlockMergeSortStrategyT =
    BlockMergeSortStrategy<KeyT,
                           ValueT,
                           LOGICAL_WARP_THREADS,
                           ITEMS_PER_THREAD,
                           WarpMergeSort>;

  // Index of this logical warp within the hardware warp.
  const unsigned int warp_id;

  // Lane mask covering the members of this logical warp; used for warp sync.
  const unsigned int member_mask;

public:
  WarpMergeSort() = delete;

  /**
   * @brief Collective constructor.
   *
   * @param temp_storage
   *   Reference to this logical warp's shared-memory temporary storage.
   *
   * Computes the linear thread id within the logical warp, the logical warp's
   * index, and the member mask used for warp-synchronous operations.
   */
  __device__ __forceinline__
  WarpMergeSort(typename BlockMergeSortStrategyT::TempStorage &temp_storage)
      : BlockMergeSortStrategyT(temp_storage,
                                IS_ARCH_WARP
                                  ? LaneId()
                                  : (LaneId() % LOGICAL_WARP_THREADS))
      , warp_id(IS_ARCH_WARP ? 0 : (LaneId() / LOGICAL_WARP_THREADS))
      , member_mask(WarpMask<LOGICAL_WARP_THREADS, PTX_ARCH>(warp_id))
  {
  }

  /// Returns the lane mask of the threads belonging to this logical warp.
  __device__ __forceinline__ unsigned int get_member_mask() const
  {
    return member_mask;
  }

private:
  // Barrier used by the BlockMergeSortStrategy CRTP base between merge
  // passes: synchronizes only the lanes of this logical warp.
  __device__ __forceinline__ void SyncImplementation() const
  {
    WARP_SYNC(member_mask);
  }

  friend BlockMergeSortStrategyT;
};

CUB_NAMESPACE_END