/****************************************************************************** * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ #pragma once #include "../config.cuh" #include "../util_namespace.cuh" #include "dispatch/dispatch_merge_sort.cuh" CUB_NAMESPACE_BEGIN /** * @brief DeviceMergeSort provides device-wide, parallel operations for * computing a merge sort across a sequence of data items residing within * device-accessible memory. * * @ingroup SingleModule * * @par Overview * - DeviceMergeSort arranges items into ascending order using a comparison * functor with less-than semantics. Merge sort can handle arbitrary types (as * long as a value of these types is a model of [LessThan Comparable]) and * comparison functors, but is slower than DeviceRadixSort when sorting * arithmetic types into ascending/descending order. * - Another difference from RadixSort is the fact that DeviceMergeSort can * handle arbitrary random-access iterators, as shown below. * * @par A Simple Example * @par * The code snippet below illustrates a thrust reverse iterator usage. * @par * @code * #include // or equivalently * * struct CustomLess * { * template * __device__ bool operator()(const DataType &lhs, const DataType &rhs) * { * return lhs < rhs; * } * }; * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * thrust::device_vector d_keys(num_items); * thrust::device_vector d_values(num_items); * // ... * * // Initialize iterator * using KeyIterator = typename thrust::device_vector::iterator; * thrust::reverse_iterator reverse_iter(d_keys.end()); * * // Determine temporary device storage requirements * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * nullptr, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, * temp_storage_bytes, * reverse_iter, * thrust::raw_pointer_cast(d_values.data()), * num_items, * CustomLess()); * @endcode * * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ struct DeviceMergeSort { /** * @brief Sorts items using a merge sorting method. * * @par * SortPairs is not guaranteed to be stable. That is, suppose that i and j are * equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_items, d_keys, d_items, num_items, compare_op, stream, debug_synchronous); } /** * @brief Sorts items using a merge sorting method. * * @par * - SortPairsCopy is not guaranteed to be stable. That is, suppose * that `i` and `j` are equivalent: neither one is less than the * other. It is not guaranteed that the relative order of these * two elements will be preserved by sort. * - Input arrays `d_input_keys` and `d_input_items` are not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortPairsCopy( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 2, 1, 0, 6] * * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam ValueInputIteratorT * is a model of [Random Access Iterator]. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[in] d_input_items * Pointer to the input sequence of unsorted input values * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[out] d_output_items * Pointer to the output sequence of sorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns `true` if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortPairsCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, ValueInputIteratorT d_input_items, KeyIteratorT d_output_keys, ValueIteratorT d_output_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, d_input_items, d_output_keys, d_output_items, num_items, compare_op, stream, debug_synchronous); } /** * @brief Sorts items using a merge sorting method. * * @par * SortKeys is not guaranteed to be stable. That is, suppose that `i` and `j` * are equivalent: neither one is less than the other. It is not guaranteed * that the relative order of these two elements will be preserved by sort. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers * // for sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, static_cast(nullptr), d_keys, static_cast(nullptr), num_items, compare_op, stream, debug_synchronous); } /** * @brief Sorts items using a merge sorting method. * * @par * - SortKeysCopy is not guaranteed to be stable. That is, suppose that `i` * and `j` are equivalent: neither one is less than the other. It is not * guaranteed that the relative order of these two elements will be * preserved by sort. * - Input array d_input_keys is not modified. * - Note that the behavior is undefined if the input and output ranges * overlap in any way. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of * `int` keys. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::SortKeysCopy( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyInputIteratorT * is a model of [Random Access Iterator]. Its `value_type` is a model of * [LessThan Comparable]. This `value_type`'s ordering relation is a * *strict weak ordering* as defined in the [LessThan Comparable] * requirements. * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in] d_input_keys * Pointer to the input sequence of unsorted input keys * * @param[out] d_output_keys * Pointer to the output sequence of sorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t SortKeysCopy(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyInputIteratorT d_input_keys, KeyIteratorT d_output_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { using DispatchMergeSortT = DispatchMergeSort; return DispatchMergeSortT::Dispatch(d_temp_storage, temp_storage_bytes, d_input_keys, static_cast(nullptr), d_output_keys, static_cast(nullptr), num_items, compare_op, stream, debug_synchronous); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortPairs is stable: it preserves the relative ordering of equivalent * elements. That is, if x and y are elements such that x precedes y, * and if the two elements are equivalent (neither x < y nor y < x) then * a postcondition of stable_sort is that x still precedes y. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys with associated vector of `int` values. * @par * @code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 6, 5, 3, 0, 9] * int *d_values; // e.g., [0, 1, 2, 3, 4, 5, 6] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortPairs( * d_temp_storage, temp_storage_bytes, * d_keys, d_values, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 6, 8, 9] * // d_values <-- [5, 4, 3, 1, 2, 0, 6] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam ValueIteratorT * is a model of [Random Access Iterator], and `ValueIteratorT` is mutable. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in,out] d_items * Pointer to the input sequence of unsorted input values * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortPairs(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, ValueIteratorT d_items, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { return SortPairs( d_temp_storage, temp_storage_bytes, d_keys, d_items, num_items, compare_op, stream, debug_synchronous); } /** * @brief Sorts items using a merge sorting method. * * @par * StableSortKeys is stable: it preserves the relative ordering of equivalent * elements. That is, if `x` and `y` are elements such that `x` precedes `y`, * and if the two elements are equivalent (neither `x < y` nor `y < x`) then * a postcondition of stable_sort is that `x` still precedes `y`. * * @par Snippet * The code snippet below illustrates the sorting of a device vector of `int` * keys. * \par * \code * #include * // or equivalently * * // Declare, allocate, and initialize device-accessible pointers for * // sorting data * int num_items; // e.g., 7 * int *d_keys; // e.g., [8, 6, 7, 5, 3, 0, 9] * ... * * // Initialize comparator * CustomOpT custom_op; * * // Determine temporary device storage requirements * void *d_temp_storage = nullptr; * std::size_t temp_storage_bytes = 0; * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // Allocate temporary storage * cudaMalloc(&d_temp_storage, temp_storage_bytes); * * // Run sorting operation * cub::DeviceMergeSort::StableSortKeys( * d_temp_storage, temp_storage_bytes, * d_keys, num_items, custom_op); * * // d_keys <-- [0, 3, 5, 6, 7, 8, 9] * @endcode * * @tparam KeyIteratorT * is a model of [Random Access Iterator]. `KeyIteratorT` is mutable, and * its `value_type` is a model of [LessThan Comparable]. This `value_type`'s * ordering relation is a *strict weak ordering* as defined in * the [LessThan Comparable] requirements. * * @tparam OffsetT * is an integer type for global offsets. * * @tparam CompareOpT * is a type of callable object with the signature * `bool operator()(KeyT lhs, KeyT rhs)` that models * the [Strict Weak Ordering] concept. * * @param[in] d_temp_storage * Device-accessible allocation of temporary storage. When `nullptr`, the * required allocation size is written to `temp_storage_bytes` and no work * is done. * * @param[in,out] temp_storage_bytes * Reference to size in bytes of `d_temp_storage` allocation * * @param[in,out] d_keys * Pointer to the input sequence of unsorted input keys * * @param[in] num_items * Number of items to sort * * @param[in] compare_op * Comparison function object which returns true if the first argument is * ordered before the second * * @param[in] stream * **[optional]** CUDA stream to launch kernels within. Default is * stream0. * * @param[in] debug_synchronous * **[optional]** Whether or not to synchronize the stream after every * kernel launch to check for errors. Also causes launch configurations to * be printed to the console. Default is `false`. * * [Random Access Iterator]: https://en.cppreference.com/w/cpp/iterator/random_access_iterator * [Strict Weak Ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order * [LessThan Comparable]: https://en.cppreference.com/w/cpp/named_req/LessThanComparable */ template CUB_RUNTIME_FUNCTION static cudaError_t StableSortKeys(void *d_temp_storage, std::size_t &temp_storage_bytes, KeyIteratorT d_keys, OffsetT num_items, CompareOpT compare_op, cudaStream_t stream = 0, bool debug_synchronous = false) { return SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, compare_op, stream, debug_synchronous); } }; CUB_NAMESPACE_END