/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Test of BlockScan utilities ******************************************************************************/ // Ensure printing of CUDA runtime errors to console #define CUB_STDERR #include #include #include #include #include #include #include #include #include #include "test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- bool g_verbose = false; int g_repeat = 0; CachingDeviceAllocator g_allocator(true); /** * Primitive variant to test */ enum TestMode { BASIC, AGGREGATE, PREFIX, }; /** * Scan mode to test */ enum ScanMode { EXCLUSIVE, INCLUSIVE }; /** * \brief WrapperFunctor (for precluding test-specialized dispatch to *Sum variants) */ template struct WrapperFunctor { OpT op; WrapperFunctor(OpT op) : op(op) {} template __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const { return op(a, b); } }; /** * Stateful prefix functor */ template < typename T, typename ScanOpT> struct BlockPrefixCallbackOp { int linear_tid; T prefix; ScanOpT scan_op; __device__ __forceinline__ BlockPrefixCallbackOp(int linear_tid, T prefix, ScanOpT scan_op) : linear_tid(linear_tid), prefix(prefix), scan_op(scan_op) {} __device__ __forceinline__ T operator()(T block_aggregate) { // For testing purposes T retval = (linear_tid == 0) ? prefix : T(); prefix = scan_op(prefix, block_aggregate); return retval; } }; //--------------------------------------------------------------------- // Exclusive scan //--------------------------------------------------------------------- /// Exclusive scan (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op); } /// Exclusive scan (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, initial_value, scan_op); } /// Exclusive scan (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], initial_value, scan_op, block_aggregate); } /// Exclusive scan (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, initial_value, scan_op, block_aggregate); } /// Exclusive scan (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data[0], data[0], scan_op, prefix_op); } /// Exclusive scan (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.ExclusiveScan(data, data, scan_op, prefix_op); } //--------------------------------------------------------------------- // Exclusive sum //--------------------------------------------------------------------- /// Exclusive sum (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0]); } /// Exclusive sum (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data); } /// Exclusive sum (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0], block_aggregate); } /// Exclusive sum (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data, block_aggregate); } /// Exclusive sum (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data[0], data[0], prefix_op); } /// Exclusive sum (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.ExclusiveSum(data, data, prefix_op); } //--------------------------------------------------------------------- // Inclusive scan //--------------------------------------------------------------------- /// Inclusive scan (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op); } /// Inclusive scan (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op); } /// Inclusive scan (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op, block_aggregate); } /// Inclusive scan (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op, block_aggregate); } /// Inclusive scan (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data[0], data[0], scan_op, prefix_op); } /// Inclusive scan (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, ScanOpT &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, IsPrimitiveT is_primitive) { block_scan.InclusiveScan(data, data, scan_op, prefix_op); } //--------------------------------------------------------------------- // Inclusive sum //--------------------------------------------------------------------- /// Inclusive sum (BASIC, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0]); } /// Inclusive sum (BASIC, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data); } /// Inclusive sum (AGGREGATE, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0], block_aggregate); } /// Inclusive sum (AGGREGATE, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data, block_aggregate); } /// Inclusive sum (PREFIX, 1) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[1], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data[0], data[0], prefix_op); } /// Inclusive sum (PREFIX, ITEMS_PER_THREAD) template __device__ __forceinline__ void DeviceTest( BlockScanT &block_scan, T (&data)[ITEMS_PER_THREAD], T &initial_value, Sum &scan_op, T &block_aggregate, PrefixCallbackOp &prefix_op, Int2Type scan_mode, Int2Type test_mode, Int2Type is_primitive) { block_scan.InclusiveSum(data, data, prefix_op); } //--------------------------------------------------------------------- // Test kernels //--------------------------------------------------------------------- /** * BlockScan test kernel. */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename T, typename ScanOpT> __launch_bounds__ (BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) __global__ void BlockScanKernel( T *d_in, T *d_out, T *d_aggregate, ScanOpT scan_op, T initial_value, clock_t *d_elapsed) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Parameterize BlockScan type for our thread block typedef BlockScan BlockScanT; // Allocate temp storage in shared memory __shared__ typename BlockScanT::TempStorage temp_storage; int linear_tid = RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); // Per-thread tile data T data[ITEMS_PER_THREAD]; LoadDirectBlocked(linear_tid, d_in, data); __threadfence_block(); // workaround to prevent clock hoisting clock_t start = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Test scan T block_aggregate; BlockScanT block_scan(temp_storage); BlockPrefixCallbackOp prefix_op(linear_tid, initial_value, scan_op); DeviceTest(block_scan, data, initial_value, scan_op, block_aggregate, prefix_op, Int2Type(), Int2Type(), Int2Type::PRIMITIVE>()); // Stop cycle timer __threadfence_block(); // workaround to prevent clock hoisting clock_t stop = clock(); __threadfence_block(); // workaround to prevent clock hoisting // Store output StoreDirectBlocked(linear_tid, d_out, data); // Store block_aggregate if (TEST_MODE != BASIC) d_aggregate[linear_tid] = block_aggregate; // Store prefix if (TEST_MODE == PREFIX) { if (linear_tid == 0) d_out[TILE_SIZE] = prefix_op.prefix; } // Store time if (linear_tid == 0) *d_elapsed = (start > stop) ? start - stop : stop - start; } //--------------------------------------------------------------------- // Host utility subroutines //--------------------------------------------------------------------- /** * Initialize exclusive-scan problem (and solution) */ template T Initialize( GenMode gen_mode, T *h_in, T *h_reference, int num_items, ScanOpT scan_op, T initial_value, Int2Type) { InitValue(gen_mode, h_in[0], 0); T block_aggregate = h_in[0]; h_reference[0] = initial_value; T inclusive = scan_op(initial_value, h_in[0]); for (int i = 1; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); h_reference[i] = inclusive; inclusive = scan_op(inclusive, h_in[i]); block_aggregate = scan_op(block_aggregate, h_in[i]); } return block_aggregate; } /** * Initialize inclusive-scan problem (and solution) */ template T Initialize( GenMode gen_mode, T *h_in, T *h_reference, int num_items, ScanOpT scan_op, T initial_value, Int2Type) { InitValue(gen_mode, h_in[0], 0); T block_aggregate = h_in[0]; T inclusive = scan_op(initial_value, h_in[0]); h_reference[0] = inclusive; for (int i = 1; i < num_items; ++i) { InitValue(gen_mode, h_in[i], i); inclusive = scan_op(inclusive, h_in[i]); block_aggregate = scan_op(block_aggregate, h_in[i]); h_reference[i] = inclusive; } return block_aggregate; } /** * Test threadblock scan. (Specialized for sufficient resources) */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value, Int2Type sufficient_resources) { const int BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z; const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays T *h_in = new T[TILE_SIZE]; T *h_reference = new T[TILE_SIZE]; T *h_aggregate = new T[BLOCK_THREADS]; // Initialize problem T block_aggregate = Initialize( gen_mode, h_in, h_reference, TILE_SIZE, scan_op, initial_value, Int2Type()); // Test reference block_aggregate is returned in all threads for (int i = 0; i < BLOCK_THREADS; ++i) { h_aggregate[i] = block_aggregate; } // Run kernel printf("Test-mode %d, gen-mode %d, policy %d, %s %s BlockScan, %d (%d,%d,%d) threadblock threads, %d items per thread, %d tile size, %s (%d bytes) elements:\n", TEST_MODE, gen_mode, ALGORITHM, (SCAN_MODE == INCLUSIVE) ? "Inclusive" : "Exclusive", typeid(ScanOpT).name(), BLOCK_THREADS, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, ITEMS_PER_THREAD, TILE_SIZE, typeid(T).name(), (int) sizeof(T)); fflush(stdout); // Initialize/clear device arrays T *d_in = NULL; T *d_out = NULL; T *d_aggregate = NULL; clock_t *d_elapsed = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_elapsed, sizeof(unsigned long long))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * TILE_SIZE)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * (TILE_SIZE + 2))); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_aggregate, sizeof(T) * BLOCK_THREADS)); CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * TILE_SIZE, cudaMemcpyHostToDevice)); CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * (TILE_SIZE + 1))); CubDebugExit(cudaMemset(d_aggregate, 0, sizeof(T) * BLOCK_THREADS)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) { std::cout << CoutCast(h_in[i]) << ", "; } printf("\n\n"); } // Run block_aggregate/prefix kernel dim3 block_dims(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z); BlockScanKernel<<<1, block_dims>>>( d_in, d_out, d_aggregate, scan_op, initial_value, d_elapsed); CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Copy out and display results printf("\tScan results: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); if (TEST_MODE == AGGREGATE) { // Copy out and display block_aggregate printf("\tScan block aggregate: "); compare = CompareDeviceResults(h_aggregate, d_aggregate, BLOCK_THREADS, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); } if (TEST_MODE == PREFIX) { // Copy out and display updated prefix printf("\tScan running total: "); T running_total = scan_op(initial_value, block_aggregate); compare = CompareDeviceResults(&running_total, d_out + TILE_SIZE, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); } printf("\tElapsed clocks: "); DisplayDeviceResults(d_elapsed, 1); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_aggregate) delete[] h_aggregate; if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in)); if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out)); if (d_aggregate) CubDebugExit(g_allocator.DeviceFree(d_aggregate)); if (d_elapsed) CubDebugExit(g_allocator.DeviceFree(d_elapsed)); } /** * Test threadblock scan. (Specialized for insufficient resources) */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value, Int2Type sufficient_resources) {} /** * Test threadblock scan. */ template < int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_DIM_Z, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { // Check size of smem storage for the target arch to make sure it will fit typedef BlockScan BlockScanT; enum { #if defined(SM100) || defined(SM110) || defined(SM130) sufficient_smem = (sizeof(typename BlockScanT::TempStorage) <= 16 * 1024), sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 512), #else sufficient_smem = (sizeof(typename BlockScanT::TempStorage) <= 16 * 1024), sufficient_threads = ((BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z) <= 1024), #endif #if defined(_WIN32) || defined(_WIN64) // Accommodate ptxas crash bug (access violation) on Windows special_skip = ((TEST_ARCH <= 130) && (Equals::VALUE) && (BLOCK_DIM_Z > 1)), #else special_skip = false, #endif sufficient_resources = (sufficient_smem && sufficient_threads && !special_skip), }; Test( gen_mode, scan_op, initial_value, Int2Type()); } /** * Run test for different threadblock dimensions */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, BlockScanAlgorithm ALGORITHM, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { Test(gen_mode, scan_op, initial_value); Test(gen_mode, scan_op, initial_value); } /** * Run test for different policy types */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, ScanMode SCAN_MODE, TestMode TEST_MODE, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T initial_value) { #ifdef TEST_RAKING Test(gen_mode, scan_op, initial_value); #endif #ifdef TEST_RAKING_MEMOIZE Test(gen_mode, scan_op, initial_value); #endif #ifdef TEST_WARP_SCANS Test(gen_mode, scan_op, initial_value); #endif } /** * Run tests for different primitive variants */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, typename ScanOpT, typename T> void Test( GenMode gen_mode, ScanOpT scan_op, T identity, T initial_value) { // Exclusive (use identity as initial value because it will dispatch to *Sum variants that don't take initial values) Test(gen_mode, scan_op, identity); Test(gen_mode, scan_op, identity); Test(gen_mode, scan_op, identity); // Exclusive (non-specialized, so we can use initial-value) Test(gen_mode, WrapperFunctor(scan_op), initial_value); Test(gen_mode, WrapperFunctor(scan_op), initial_value); Test(gen_mode, WrapperFunctor(scan_op), initial_value); // Inclusive Test(gen_mode, scan_op, identity); // This scan doesn't take an initial value Test(gen_mode, scan_op, identity); // This scan doesn't take an initial value Test(gen_mode, scan_op, initial_value); } /** * Run tests for different problem-generation options */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, typename ScanOpT, typename T> void Test( ScanOpT scan_op, T identity, T initial_value) { Test(UNIFORM, scan_op, identity, initial_value); Test(INTEGER_SEED, scan_op, identity, initial_value); // Don't test randomly-generated floats b/c of stability if (Traits::CATEGORY != FLOATING_POINT) Test(RANDOM, scan_op, identity, initial_value); } /** * Run tests for different data types and scan ops */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD> void Test() { // Get ptx version int ptx_version; CubDebugExit(PtxVersion(ptx_version)); // primitive Test(Sum(), (unsigned char) 0, (unsigned char) 99); Test(Sum(), (unsigned short) 0, (unsigned short) 99); Test(Sum(), (unsigned int) 0, (unsigned int) 99); Test(Sum(), (unsigned long long) 0, (unsigned long long) 99); Test(Sum(), (float) 0, (float) 99); // primitive (alternative scan op) Test(Max(), std::numeric_limits::min(), (char) 99); Test(Max(), std::numeric_limits::min(), (short) 99); Test(Max(), std::numeric_limits::min(), (int) 99); Test(Max(), std::numeric_limits::min(), (long long) 99); if (ptx_version > 120) // Don't check doubles on PTX120 or below because they're down-converted Test(Max(), std::numeric_limits::max() * -1, (double) 99); // vec-1 Test(Sum(), make_uchar1(0), make_uchar1(17)); // vec-2 Test(Sum(), make_uchar2(0, 0), make_uchar2(17, 21)); Test(Sum(), make_ushort2(0, 0), make_ushort2(17, 21)); Test(Sum(), make_uint2(0, 0), make_uint2(17, 21)); Test(Sum(), make_ulonglong2(0, 0), make_ulonglong2(17, 21)); // vec-4 Test(Sum(), make_char4(0, 0, 0, 0), make_char4(17, 21, 32, 85)); Test(Sum(), make_short4(0, 0, 0, 0), make_short4(17, 21, 32, 85)); Test(Sum(), make_int4(0, 0, 0, 0), make_int4(17, 21, 32, 85)); Test(Sum(), make_longlong4(0, 0, 0, 0), make_longlong4(17, 21, 32, 85)); // complex Test(Sum(), TestFoo::MakeTestFoo(0, 0, 0, 0), TestFoo::MakeTestFoo(17, 21, 32, 85)); Test(Sum(), TestBar(0, 0), TestBar(17, 21)); } /** * Run tests for different items per thread */ template void Test() { Test(); Test(); Test(); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("repeat", g_repeat); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--repeat=]" "[--v] " "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); #ifdef QUICK_TEST Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0)); // Compile/run quick tests Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), int(0)); Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), int(0)); Test<128, 1, 1, 4, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_RAKING_MEMOIZE>(UNIFORM, Sum(), int(0)); Test<128, 1, 1, 2, INCLUSIVE, PREFIX, BLOCK_SCAN_RAKING>(INTEGER_SEED, Sum(), TestFoo::MakeTestFoo(17, 21, 32, 85)); Test<128, 1, 1, 1, EXCLUSIVE, AGGREGATE, BLOCK_SCAN_WARP_SCANS>(UNIFORM, Sum(), make_longlong4(17, 21, 32, 85)); #else // Compile/run thorough tests for (int i = 0; i <= g_repeat; ++i) { // Run tests for different threadblock sizes Test<17>(); Test<32>(); Test<62>(); Test<65>(); // Test<96>(); // TODO: file bug for UNREACHABLE error for Test<96, 9, BASIC, BLOCK_SCAN_RAKING>(UNIFORM, Sum(), NullType(), make_ulonglong2(17, 21)); Test<128>(); } #endif return 0; }