/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceSelect::If and DevicePartition::If utilities
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <iostream>
#include <typeinfo>

#include <thrust/device_ptr.h>
#include <thrust/copy.h>
#include <thrust/partition.h>
#include <thrust/iterator/reverse_iterator.h>

#include <cub/util_allocator.cuh>
#include <cub/util_type.cuh>
#include <cub/device/device_select.cuh>
#include <cub/device/device_partition.cuh>
#include <cub/iterator/counting_input_iterator.cuh>

#include "test_util.h"

using namespace cub;


//---------------------------------------------------------------------
// Globals, constants and typedefs
//---------------------------------------------------------------------

bool                    g_verbose           = false;
int                     g_timing_iterations = 0;
int                     g_repeat            = 0;
float                   g_device_giga_bandwidth;
CachingDeviceAllocator  g_allocator(true);

// Dispatch types
enum Backend
{
    CUB,        // CUB method
    THRUST,     // Thrust method
    CDP,        // GPU-based (dynamic parallelism) dispatch to CUB method
};


// Selection functor type
template <typename T>
struct LessThan
{
    T compare;

    __host__ __device__ __forceinline__
    LessThan(T compare) : compare(compare) {}

    __host__ __device__ __forceinline__
    bool operator()(const T &a) const
    {
        return (a < compare);
    }
};
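
//---------------------------------------------------------------------
// Illustrative sketch (not part of the test harness): the two-phase
// temp-storage idiom that every Dispatch wrapper below exercises.  The first
// DeviceSelect::If call only computes temp_storage_bytes; the second performs
// the selection.  Function and variable names here are illustrative only.
//---------------------------------------------------------------------
inline cudaError_t ExampleSelectIf(int* d_in, int* d_out, int* d_num_selected_out, int num_items)
{
    LessThan<int> select_op(50);        // keep items whose value is less than 50

    // First pass: query the required temporary-storage size
    void*   d_temp_storage      = NULL;
    size_t  temp_storage_bytes  = 0;
    cudaError_t error = DeviceSelect::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);
    if (error != cudaSuccess) return error;

    // Second pass: allocate scratch space and run the selection
    error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;
    error = DeviceSelect::If(d_temp_storage, temp_storage_bytes,
        d_in, d_out, d_num_selected_out, num_items, select_op);
    cudaFree(d_temp_storage);
    return error;
}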
//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSelect entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to select if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB> dispatch_to, Int2Type<false> is_flagged, Int2Type<false> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSelect::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to partition if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB> dispatch_to, Int2Type<false> is_flagged, Int2Type<true> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DevicePartition::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to select flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB> dispatch_to, Int2Type<true> is_flagged, Int2Type<false> partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes,
            d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
    }
    return error;
}


/**
 * Dispatch to partition flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB> dispatch_to, Int2Type<true> is_flagged, Int2Type<true> partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DevicePartition::Flagged(d_temp_storage, temp_storage_bytes,
            d_in, d_flags, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
    }
    return error;
}


//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------
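// The Thrust-based overloads below serve as reference/timing baselines for the
// CUB entrypoints above.  Selection maps to thrust::copy_if and partitioning to
// thrust::partition_copy, with rejected items routed through a reverse_iterator
// so they land at the tail of d_out in back-to-front order (mirroring
// DevicePartition's output layout).  On the sizing pass (d_temp_storage == 0)
// they simply report a 1-byte temp-storage requirement, since Thrust manages
// its own scratch memory internally.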
/**
 * Dispatch to select if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST> dispatch_to, Int2Type<false> is_flagged, Int2Type<false> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_out_wrapper_end;
        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::copy_if(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, select_op);
        }

        OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}


/**
 * Dispatch to partition if entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST> dispatch_to, Int2Type<false> is_flagged, Int2Type<true> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;

        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);

        ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::partition_copy(
                d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper, d_out_unselected, select_op);
        }

        OffsetT num_selected = d_out_wrapper_end.first - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}


/**
 * Dispatch to select flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST> dispatch_to, Int2Type<true> is_flagged, Int2Type<false> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    // The flag type
    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_out_wrapper_end;
        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::copy_if(
                d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, Cast<bool>());
        }

        OffsetT num_selected = d_out_wrapper_end - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
/**
 * Dispatch to partition flagged entrypoint
 */
template <typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST> dispatch_to, Int2Type<true> is_flagged, Int2Type<true> is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    // The flag type
    typedef typename std::iterator_traits<FlagIteratorT>::value_type FlagT;

    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),  // OutputT = (if output iterator's value type is void) ?
        typename std::iterator_traits<InputIteratorT>::value_type,                                          // ... then the input iterator's value type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;                          // ... else the output iterator's value type

    typedef thrust::reverse_iterator<thrust::device_ptr<OutputT> > ReverseOutputIteratorT;

    if (d_temp_storage == 0)
    {
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::pair<thrust::device_ptr<OutputT>, ReverseOutputIteratorT> d_out_wrapper_end;

        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);
        thrust::device_ptr<FlagT>   d_flags_wrapper(d_flags);

        ReverseOutputIteratorT d_out_unselected(d_out_wrapper + num_items);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::partition_copy(
                d_in_wrapper, d_in_wrapper + num_items, d_flags_wrapper, d_out_wrapper, d_out_unselected, Cast<bool>());
        }

        OffsetT num_selected = d_out_wrapper_end.first - d_out_wrapper;
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}


//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------
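// The CDP backend re-dispatches to the CUB entrypoints from inside a
// single-thread kernel (device-side launch).  It is exercised only when the
// test is compiled with CUB_CDP defined (which in turn requires building with
// relocatable device code and the CUDA device runtime); otherwise the wrapper
// kernel below simply reports cudaErrorNotSupported.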
/**
 * Simple wrapper kernel to invoke DeviceSelect
 */
template <typename IsFlaggedTag, typename IsPartitionTag,
          typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
__global__ void CnpDispatchKernel(
    IsFlaggedTag is_flagged, IsPartitionTag is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    bool debug_synchronous)
{
#ifndef CUB_CDP
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), is_flagged, is_partition, timing_timing_iterations,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, debug_synchronous);

    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}


/**
 * Dispatch to CDP kernel
 */
template <typename IsFlaggedTag, typename IsPartitionTag,
          typename InputIteratorT, typename FlagIteratorT, typename OutputIteratorT,
          typename NumSelectedIteratorT, typename OffsetT, typename SelectOpT>
cudaError_t Dispatch(
    Int2Type<CDP> dispatch_to, IsFlaggedTag is_flagged, IsPartitionTag is_partition,
    int timing_timing_iterations, size_t* d_temp_storage_bytes, cudaError_t* d_cdp_error,
    void* d_temp_storage, size_t& temp_storage_bytes,
    InputIteratorT d_in, FlagIteratorT d_flags, OutputIteratorT d_out,
    NumSelectedIteratorT d_num_selected_out, OffsetT num_items, SelectOpT select_op,
    cudaStream_t stream, bool debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(is_flagged, is_partition, timing_timing_iterations,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));
    return retval;
}


//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Initialize problem
 */
template <typename T>
void Initialize(
    T*  h_in,
    int num_items)
{
    for (int i = 0; i < num_items; ++i)
    {
        // Initialize each item to a randomly selected value from [0..126]
        unsigned int value;
        RandomBits(value, 0, 0, 7);
        if (value == 127) value = 126;
        InitValue(INTEGER_SEED, h_in[i], value);
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}


/**
 * Solve selection problem (and set corresponding flags)
 */
template <
    typename InputIteratorT,
    typename FlagIteratorT,
    typename SelectOpT,
    typename T>
int Solve(
    InputIteratorT  h_in,
    SelectOpT       select_op,
    T*              h_reference,
    FlagIteratorT   h_flags,
    int             num_items)
{
    int num_selected = 0;
    for (int i = 0; i < num_items; ++i)
    {
        if ((h_flags[i] = select_op(h_in[i])))
        {
            h_reference[num_selected] = h_in[i];
            num_selected++;
        }
        else
        {
            h_reference[num_items - (i - num_selected) - 1] = h_in[i];
        }
    }

    return num_selected;
}
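// Note on the reference layout produced by Solve() above: selected items are
// packed at the front of h_reference in input order, while rejected items are
// written from the back of the array toward the middle (i.e., they appear at
// the tail in reverse input order).  This mirrors DevicePartition's output
// layout, which is why the partition tests compare all num_items entries while
// the select tests compare only the first num_selected entries.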
/**
 * Test DeviceSelect for a given problem input
 */
template <
    Backend     BACKEND,
    bool        IS_FLAGGED,
    bool        IS_PARTITION,
    typename    DeviceInputIteratorT,
    typename    FlagT,
    typename    SelectOpT,
    typename    T>
void Test(
    DeviceInputIteratorT    d_in,
    FlagT*                  h_flags,
    SelectOpT               select_op,
    T*                      h_reference,
    int                     num_selected,
    int                     num_items)
{
    // Allocate device flags, output, and num-selected
    FlagT*  d_flags             = NULL;
    T*      d_out               = NULL;
    int*    d_num_selected_out  = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_flags, sizeof(FlagT) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));

    // Allocate CDP device arrays
    size_t*         d_temp_storage_bytes    = NULL;
    cudaError_t*    d_cdp_error             = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void*   d_temp_storage      = NULL;
    size_t  temp_storage_bytes  = 0;
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Copy flags and clear device output array
    CubDebugExit(cudaMemcpy(d_flags, h_flags, sizeof(FlagT) * num_items, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), 1,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, true));

    // Check for correctness (and display results, if specified)
    int compare1 = (IS_PARTITION) ?
        CompareDeviceResults(h_reference, d_out, num_items, true, g_verbose) :
        CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
    printf("\t Data %s\n", compare1 ? "FAIL" : "PASS");

    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
    printf("\t Count %s\n", compare2 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), Int2Type<IS_FLAGGED>(), Int2Type<IS_PARTITION>(), g_timing_iterations,
        d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes,
        d_in, d_flags, d_out, d_num_selected_out, num_items, select_op, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis        = elapsed_millis / g_timing_iterations;
        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        int num_output_items    = (IS_PARTITION) ? num_items : num_selected;
        int num_flag_items      = (IS_FLAGGED) ? num_items : 0;

        // "Logical" bytes per invocation: num_items item-reads plus
        // num_output_items item-writes, plus flag reads when flagged
        size_t num_bytes        = sizeof(T) * (num_items + num_output_items) + sizeof(FlagT) * num_flag_items;
        float giga_bandwidth    = float(num_bytes) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak",
            avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (d_flags) CubDebugExit(g_allocator.DeviceFree(d_flags));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare1 | compare2);
}


/**
 * Test on pointer type
 */
template <
    Backend     BACKEND,
    bool        IS_FLAGGED,
    bool        IS_PARTITION,
    typename    T>
void TestPointer(
    int     num_items,
    float   select_ratio)
{
    typedef char FlagT;

    // Allocate host arrays
    T*      h_in        = new T[num_items];
    FlagT*  h_flags     = new FlagT[num_items];
    T*      h_reference = new T[num_items];

    // Initialize input
    Initialize(h_in, num_items);

    // Select a comparison value that is select_ratio through the space of [0,127]
    T compare;
    if (select_ratio <= 0.0)
        InitValue(INTEGER_SEED, compare, 0);        // select none
    else if (select_ratio >= 1.0)
        InitValue(INTEGER_SEED, compare, 127);      // select all
    else
        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));

    LessThan<T> select_op(compare);
    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);

    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
    printf("\nPointer %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
        (IS_FLAGGED) ? "Flagged" : "If",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_selected, float(num_selected) / num_items,
        typeid(T).name(), (int) sizeof(T));
    fflush(stdout);

    // Allocate problem device arrays
    T* d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));

    // Run Test
    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(d_in, h_flags, select_op, h_reference, num_selected, num_items);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_reference) delete[] h_reference;
    if (h_flags) delete[] h_flags;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
}


/**
 * Test on iterator type
 */
template <
    Backend     BACKEND,
    bool        IS_FLAGGED,
    bool        IS_PARTITION,
    typename    T>
void TestIterator(
    int     num_items,
    float   select_ratio)
{
    typedef char FlagT;

    // Allocate host arrays
    T*      h_reference = new T[num_items];
    FlagT*  h_flags     = new FlagT[num_items];

    // Use counting iterator as the input
    CountingInputIterator<T, int> h_in(0);

    // Select a comparison value that is select_ratio through the space of [0,127]
    T compare;
    if (select_ratio <= 0.0)
        InitValue(INTEGER_SEED, compare, 0);        // select none
    else if (select_ratio >= 1.0)
        InitValue(INTEGER_SEED, compare, 127);      // select all
    else
        InitValue(INTEGER_SEED, compare, int(double(double(127) * select_ratio)));

    LessThan<T> select_op(compare);
    int num_selected = Solve(h_in, select_op, h_reference, h_flags, num_items);

    if (g_verbose) std::cout << "\nComparison item: " << compare << "\n";
    printf("\nIterator %s cub::%s::%s %d items, %d selected (select ratio %.3f), %s %d-byte elements\n",
        (IS_PARTITION) ? "DevicePartition" : "DeviceSelect",
        (IS_FLAGGED) ? "Flagged" : "If",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_selected, float(num_selected) / num_items,
        typeid(T).name(), (int) sizeof(T));
    fflush(stdout);

    // Run Test
    Test<BACKEND, IS_FLAGGED, IS_PARTITION>(h_in, h_flags, select_op, h_reference, num_selected, num_items);

    // Cleanup
    if (h_reference) delete[] h_reference;
    if (h_flags) delete[] h_flags;
}


/**
 * Test different selection ratios
 */
template <
    Backend     BACKEND,
    bool        IS_FLAGGED,
    bool        IS_PARTITION,
    typename    T>
void Test(
    int num_items)
{
    for (float select_ratio = 0.0f; select_ratio <= 1.0f; select_ratio += 0.2f)
    {
        TestPointer<BACKEND, IS_FLAGGED, IS_PARTITION, T>(num_items, select_ratio);
    }
}
/**
 * Test (select vs. partition) and (flagged vs. functor)
 */
template <
    Backend     BACKEND,
    typename    T>
void TestMethod(
    int num_items)
{
    // Functor
    Test<BACKEND, false, false, T>(num_items);
    Test<BACKEND, false, true, T>(num_items);

    // Flagged
    Test<BACKEND, true, false, T>(num_items);
    Test<BACKEND, true, true, T>(num_items);
}


/**
 * Test different dispatch
 */
template <
    typename T>
void TestOp(
    int num_items)
{
    TestMethod<CUB, T>(num_items);

#ifdef CUB_CDP
    TestMethod<CDP, T>(num_items);
#endif
}


/**
 * Test different input sizes
 */
template <typename T>
void Test(
    int num_items)
{
    if (num_items < 0)
    {
        TestOp<T>(0);
        TestOp<T>(1);
        TestOp<T>(100);
        TestOp<T>(10000);
        TestOp<T>(1000000);
    }
    else
    {
        TestOp<T>(num_items);
    }
}


/**
 * Test select/partition on pointer types
 */
template <typename T>
void ComparePointer(
    int     num_items,
    float   select_ratio)
{
    printf("-- Select-if ----------------------------\n");
    TestPointer<CUB, false, false, T>(num_items, select_ratio);
    TestPointer<THRUST, false, false, T>(num_items, select_ratio);

    printf("-- Partition-if ----------------------------\n");
    TestPointer<CUB, false, true, T>(num_items, select_ratio);
    TestPointer<THRUST, false, true, T>(num_items, select_ratio);

    printf("-- Select-flagged ----------------------------\n");
    TestPointer<CUB, true, false, T>(num_items, select_ratio);
    TestPointer<THRUST, true, false, T>(num_items, select_ratio);

    printf("-- Partition-flagged ----------------------------\n");
    TestPointer<CUB, true, true, T>(num_items, select_ratio);
    TestPointer<THRUST, true, true, T>(num_items, select_ratio);
}


//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_items       = -1;
    float select_ratio  = 0.5;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("ratio", select_ratio);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--ratio=<selection ratio, default 0.5>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    g_device_giga_bandwidth = args.device_giga_bandwidth;
    printf("\n");

#ifdef QUICKER_TEST

    // Compile/run basic CUB test
    if (num_items < 0) num_items = 32000000;

    printf("-- Select-if ----------------------------\n");
    TestPointer<CUB, false, false, int>(num_items, select_ratio);

    printf("-- Partition-if ----------------------------\n");
    TestPointer<CUB, false, true, int>(num_items, select_ratio);

    printf("-- Select-flagged ----------------------------\n");
    TestPointer<CUB, true, false, int>(num_items, select_ratio);

    printf("-- Partition-flagged ----------------------------\n");
    TestPointer<CUB, true, true, int>(num_items, select_ratio);

#elif defined(QUICK_TEST)

    // Get device ordinal
    int device_ordinal;
    CubDebugExit(cudaGetDevice(&device_ordinal));

    // Get device SM version
    int sm_version;
    CubDebugExit(SmVersion(sm_version, device_ordinal));

    // Compile/run quick tests
    if (num_items < 0) num_items = 32000000;

    printf("-- Iterator ----------------------------\n");
    TestIterator<CUB, false, false, int>(num_items, select_ratio);

    ComparePointer<char>(       num_items * ((sm_version <= 130) ? 1 : 4),  select_ratio);
    ComparePointer<short>(      num_items * ((sm_version <= 130) ? 1 : 2),  select_ratio);
    ComparePointer<int>(        num_items,                                  select_ratio);
    ComparePointer<long long>(  num_items / 2,                              select_ratio);
    ComparePointer<TestFoo>(    num_items / 4,                              select_ratio);

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        Test<unsigned char>(num_items);
        Test<unsigned short>(num_items);
        Test<unsigned int>(num_items);
        Test<unsigned long long>(num_items);

        Test<uchar2>(num_items);
        Test<ushort2>(num_items);
        Test<uint2>(num_items);
        Test<ulonglong2>(num_items);

        Test<uchar4>(num_items);
        Test<ushort4>(num_items);
        Test<uint4>(num_items);
        Test<ulonglong4>(num_items);

        Test<TestFoo>(num_items);
        Test<TestBar>(num_items);
    }

#endif

    return 0;
}
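
//---------------------------------------------------------------------
// Example invocation (illustrative; the executable name depends on how the
// test is built):
//
//   test_device_select_if --n=1000000 --i=100 --ratio=0.35
//
// runs 100 timing iterations of each enabled variant over one million items,
// selecting roughly 35% of them.  Add --v for verbose input/result dumps and
// --device=<device-id> to pick a particular GPU.
//---------------------------------------------------------------------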