/****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the NVIDIA CORPORATION nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ******************************************************************************/ /****************************************************************************** * Simple demonstration of cub::BlockScan * * To compile using the command line: * nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3 * ******************************************************************************/ // Ensure printing of CUDA runtime errors to console (define before including cub.h) #define CUB_STDERR #include #include #include #include #include #include "../../test/test_util.h" using namespace cub; //--------------------------------------------------------------------- // Globals, constants and typedefs //--------------------------------------------------------------------- /// Verbose output bool g_verbose = false; /// Timing iterations int g_timing_iterations = 100; /// Default grid size int g_grid_size = 1; //--------------------------------------------------------------------- // Kernels //--------------------------------------------------------------------- /** * Simple kernel for performing a block-wide exclusive prefix sum over integers */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockScanAlgorithm ALGORITHM> __global__ void BlockPrefixSumKernel( int *d_in, // Tile of input int *d_out, // Tile of output clock_t *d_elapsed) // Elapsed cycle count of block scan { // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) typedef BlockLoad BlockLoadT; // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) typedef BlockStore BlockStoreT; // Specialize BlockScan type for our thread block typedef BlockScan BlockScanT; // Shared memory __shared__ union TempStorage { typename BlockLoadT::TempStorage load; typename BlockStoreT::TempStorage store; typename BlockScanT::TempStorage scan; } temp_storage; // Per-thread tile data int data[ITEMS_PER_THREAD]; // Load items into a blocked arrangement BlockLoadT(temp_storage.load).Load(d_in, data); // Barrier for smem reuse __syncthreads(); // Start cycle timer clock_t start = clock(); // Compute exclusive prefix sum int aggregate; BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate); // Stop cycle timer clock_t stop = clock(); // Barrier for smem reuse __syncthreads(); // Store items from a blocked arrangement BlockStoreT(temp_storage.store).Store(d_out, data); // Store aggregate and elapsed clocks if (threadIdx.x == 0) { *d_elapsed = (start > stop) ? start - stop : stop - start; d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; } } //--------------------------------------------------------------------- // Host utilities //--------------------------------------------------------------------- /** * Initialize exclusive prefix sum problem (and solution). * Returns the aggregate */ int Initialize( int *h_in, int *h_reference, int num_items) { int inclusive = 0; for (int i = 0; i < num_items; ++i) { h_in[i] = i % 17; h_reference[i] = inclusive; inclusive += h_in[i]; } return inclusive; } /** * Test thread block scan */ template < int BLOCK_THREADS, int ITEMS_PER_THREAD, BlockScanAlgorithm ALGORITHM> void Test() { const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; // Allocate host arrays int *h_in = new int[TILE_SIZE]; int *h_reference = new int[TILE_SIZE]; int *h_gpu = new int[TILE_SIZE + 1]; // Initialize problem and reference output on host int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); // Initialize device arrays int *d_in = NULL; int *d_out = NULL; clock_t *d_elapsed = NULL; cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); // Display input problem data if (g_verbose) { printf("Input data: "); for (int i = 0; i < TILE_SIZE; i++) printf("%d, ", h_in[i]); printf("\n\n"); } // Kernel props int max_sm_occupancy; CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel, BLOCK_THREADS)); // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS", TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); // Run aggregate/prefix kernel BlockPrefixSumKernel<<>>( d_in, d_out, d_elapsed); // Check results printf("\tOutput items: "); int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Check total aggregate printf("\tAggregate: "); compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Run this several times and average the performance results GpuTimer timer; float elapsed_millis = 0.0; clock_t elapsed_clocks = 0; for (int i = 0; i < g_timing_iterations; ++i) { // Copy problem to device cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); timer.Start(); // Run aggregate/prefix kernel BlockPrefixSumKernel<<>>( d_in, d_out, d_elapsed); timer.Stop(); elapsed_millis += timer.ElapsedMillis(); // Copy clocks from device clock_t clocks; CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); elapsed_clocks += clocks; } // Check for kernel errors and STDIO from the kernel, if any CubDebugExit(cudaPeekAtLastError()); CubDebugExit(cudaDeviceSynchronize()); // Display timing results float avg_millis = elapsed_millis / g_timing_iterations; float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; float avg_clocks = float(elapsed_clocks) / g_timing_iterations; float avg_clocks_per_item = avg_clocks / TILE_SIZE; printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks); printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item); printf("\tAverage kernel millis: %.4f\n", avg_millis); printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); // Cleanup if (h_in) delete[] h_in; if (h_reference) delete[] h_reference; if (h_gpu) delete[] h_gpu; if (d_in) cudaFree(d_in); if (d_out) cudaFree(d_out); if (d_elapsed) cudaFree(d_elapsed); } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); args.GetCmdLineArgument("grid-size", g_grid_size); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s " "[--device=] " "[--i=]" "[--grid-size=]" "[--v] " "\n", argv[0], g_timing_iterations, g_grid_size); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Run tests Test<1024, 1, BLOCK_SCAN_RAKING>(); Test<512, 2, BLOCK_SCAN_RAKING>(); Test<256, 4, BLOCK_SCAN_RAKING>(); Test<128, 8, BLOCK_SCAN_RAKING>(); Test<64, 16, BLOCK_SCAN_RAKING>(); Test<32, 32, BLOCK_SCAN_RAKING>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>(); Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>(); printf("-------------\n"); Test<1024, 1, BLOCK_SCAN_WARP_SCANS>(); Test<512, 2, BLOCK_SCAN_WARP_SCANS>(); Test<256, 4, BLOCK_SCAN_WARP_SCANS>(); Test<128, 8, BLOCK_SCAN_WARP_SCANS>(); Test<64, 16, BLOCK_SCAN_WARP_SCANS>(); Test<32, 32, BLOCK_SCAN_WARP_SCANS>(); return 0; }