/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2016, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * An implementation of COO SpMV using prefix scan to implement a
 * reduce-value-by-row strategy
 ******************************************************************************/

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <iterator>
#include <vector>
#include <string>
#include <algorithm>
#include <stdio.h>

#include <cub/cub.cuh>

#include "coo_graph.cuh"
#include "../test/test_util.h"

using namespace cub;
using namespace std;


/******************************************************************************
 * Globals, constants, and typedefs
 ******************************************************************************/

typedef int     VertexId;   // 32-bit signed integers as vertex ids
typedef double  Value;      // double-precision floating point values

bool                    g_verbose           = false;
int                     g_timing_iterations = 1;
CachingDeviceAllocator  g_allocator;


/******************************************************************************
 * Texture referencing
 ******************************************************************************/

/**
 * Templated texture reference type for multiplicand vector
 */
template <typename Value>
struct TexVector
{
    // Texture type to actually use (e.g., because CUDA doesn't load doubles as texture items)
    typedef typename If<(Equals<Value, double>::VALUE), uint2, Value>::Type CastType;

    // Texture reference type
    typedef texture<CastType, cudaTextureType1D, cudaReadModeElementType> TexRef;

    static TexRef ref;

    /**
     * Bind textures
     */
    static void BindTexture(void *d_in, int elements)
    {
        cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<CastType>();
        if (d_in)
        {
            size_t offset;
            size_t bytes = sizeof(CastType) * elements;
            CubDebugExit(cudaBindTexture(&offset, ref, d_in, tex_desc, bytes));
        }
    }

    /**
     * Unbind textures
     */
    static void UnbindTexture()
    {
        CubDebugExit(cudaUnbindTexture(ref));
    }

    /**
     * Load
     */
    static __device__ __forceinline__ Value Load(int offset)
    {
        Value output;
        reinterpret_cast<typename TexVector<Value>::CastType &>(output) =
            tex1Dfetch(TexVector<Value>::ref, offset);
        return output;
    }
};

// Texture reference definitions
template <typename Value>
typename TexVector<Value>::TexRef TexVector<Value>::ref = 0;


/******************************************************************************
 * Utility types
 ******************************************************************************/

/**
 * A partial dot-product sum paired with a corresponding row-id
 */
template <typename VertexId, typename Value>
struct PartialProduct
{
    VertexId    row;            /// Row-id
    Value       partial;        /// PartialProduct sum
};


/**
 * A partial dot-product sum paired with a corresponding row-id (specialized for double-int pairings)
 */
template <>
struct PartialProduct<int, double>
{
    long long   row;            /// Row-id
    double      partial;        /// PartialProduct sum
};


/**
 * Reduce-value-by-row scan operator
 */
struct ReduceByKeyOp
{
    template <typename PartialProduct>
    __device__ __forceinline__ PartialProduct operator()(
        const PartialProduct &first,
        const PartialProduct &second)
    {
        PartialProduct retval;

        retval.partial = (second.row != first.row) ?
            second.partial :
            first.partial + second.partial;

        retval.row = second.row;
        return retval;
    }
};


/**
 * Stateful block-wide prefix operator for BlockScan
 */
template <typename PartialProduct>
struct BlockPrefixCallbackOp
{
    // Running block-wide prefix
    PartialProduct running_prefix;

    /**
     * Returns the block-wide running_prefix in thread-0
     */
    __device__ __forceinline__ PartialProduct operator()(
        const PartialProduct &block_aggregate)      ///< The aggregate sum of the BlockScan inputs
    {
        ReduceByKeyOp scan_op;

        PartialProduct retval = running_prefix;
        running_prefix = scan_op(running_prefix, block_aggregate);
        return retval;
    }
};


/**
 * Operator for detecting discontinuities in a list of row identifiers.
 */
struct NewRowOp
{
    /// Returns true if row_b is the start of a new row
    template <typename VertexId>
    __device__ __forceinline__ bool operator()(
        const VertexId& row_a,
        const VertexId& row_b)
    {
        return (row_a != row_b);
    }
};
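
// A small worked example of how ReduceByKeyOp behaves under an exclusive
// prefix scan (the (row, value) items and the symbols a0, a1, b0, b1, c0 are
// illustrative, not taken from the code):
//
//     input:   (0,a0) (0,a1) (1,b0)    (1,b1) (2,c0)     identity prefix (0,0)
//     output:  (0,0)  (0,a0) (0,a0+a1) (1,b0) (1,b0+b1)
//
// At each position where the row id changes (a row "head"), the scanned value
// holds the completed total of the previous row and can be scattered directly
// to that row's slot in the output vector.  The final row's total, (2,c0)
// here, remains in the running prefix and is written out separately, either at
// the end of the kernel or through the block-partials fix-up pass.
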
/******************************************************************************
 * Persistent thread block types
 ******************************************************************************/

/**
 * SpMV threadblock abstraction for processing a contiguous segment of
 * sparse COO tiles.
 */
template <
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD,
    typename    VertexId,
    typename    Value>
struct PersistentBlockSpmv
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Constants
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Head flag type
    typedef int HeadFlag;

    // Partial dot product type
    typedef PartialProduct<VertexId, Value> PartialProduct;

    // Parameterized BlockScan type for reduce-value-by-row scan
    typedef BlockScan<PartialProduct, BLOCK_THREADS> BlockScan;

    // Parameterized BlockExchange type for exchanging rows between warp-striped -> blocked arrangements
    typedef BlockExchange<VertexId, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeRows;

    // Parameterized BlockExchange type for exchanging values between warp-striped -> blocked arrangements
    typedef BlockExchange<Value, BLOCK_THREADS, ITEMS_PER_THREAD> BlockExchangeValues;

    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;

    // Shared memory type for this threadblock
    struct TempStorage
    {
        union
        {
            typename BlockExchangeRows::TempStorage         exchange_rows;      // Smem needed for BlockExchangeRows
            typename BlockExchangeValues::TempStorage       exchange_values;    // Smem needed for BlockExchangeValues

            struct
            {
                typename BlockScan::TempStorage             scan;               // Smem needed for BlockScan
                typename BlockDiscontinuity::TempStorage    discontinuity;      // Smem needed for BlockDiscontinuity
            };
        };

        VertexId    first_block_row;    ///< The first row-ID seen by this thread block
        VertexId    last_block_row;     ///< The last row-ID seen by this thread block
        Value       first_product;      ///< The first dot-product written by this thread block
    };

    //---------------------------------------------------------------------
    // Thread fields
    //---------------------------------------------------------------------

    TempStorage                             &temp_storage;
    BlockPrefixCallbackOp<PartialProduct>   prefix_op;

    VertexId                                *d_rows;
    VertexId                                *d_columns;
    Value                                   *d_values;
    Value                                   *d_vector;
    Value                                   *d_result;
    PartialProduct                          *d_block_partials;
    int                                     block_offset;
    int                                     block_end;

    //---------------------------------------------------------------------
    // Operations
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__
    PersistentBlockSpmv(
        TempStorage     &temp_storage,
        VertexId        *d_rows,
        VertexId        *d_columns,
        Value           *d_values,
        Value           *d_vector,
        Value           *d_result,
        PartialProduct  *d_block_partials,
        int             block_offset,
        int             block_end)
    :
        temp_storage(temp_storage),
        d_rows(d_rows),
        d_columns(d_columns),
        d_values(d_values),
        d_vector(d_vector),
        d_result(d_result),
        d_block_partials(d_block_partials),
        block_offset(block_offset),
        block_end(block_end)
    {
        // Initialize scalar shared memory values
        if (threadIdx.x == 0)
        {
            VertexId first_block_row            = d_rows[block_offset];
            VertexId last_block_row             = d_rows[block_end - 1];

            temp_storage.first_block_row        = first_block_row;
            temp_storage.last_block_row         = last_block_row;
            temp_storage.first_product          = Value(0);

            // Initialize prefix_op to identity
            prefix_op.running_prefix.row        = first_block_row;
            prefix_op.running_prefix.partial    = Value(0);
        }

        __syncthreads();
    }

    /**
     * Processes a COO input tile of edges, outputting dot products for each row
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ProcessTile(
        int block_offset,
        int guarded_items = 0)
    {
        VertexId        columns[ITEMS_PER_THREAD];
        VertexId        rows[ITEMS_PER_THREAD];
        Value           values[ITEMS_PER_THREAD];
        PartialProduct  partial_sums[ITEMS_PER_THREAD];
        HeadFlag        head_flags[ITEMS_PER_THREAD];

        // Load a threadblock-striped tile of A (sparse row-ids, column-ids, and values)
        if (FULL_TILE)
        {
            // Unguarded loads
            LoadDirectWarpStriped(threadIdx.x, d_columns + block_offset, columns);
            LoadDirectWarpStriped(threadIdx.x, d_values + block_offset, values);
            LoadDirectWarpStriped(threadIdx.x, d_rows + block_offset, rows);
        }
        else
        {
            // This is a partial tile (e.g., the last tile of input).  Extend the
            // coordinates of the last vertex for out-of-bound items, but make them zero-valued
            LoadDirectWarpStriped(threadIdx.x, d_columns + block_offset, columns, guarded_items, VertexId(0));
            LoadDirectWarpStriped(threadIdx.x, d_values + block_offset, values, guarded_items, Value(0));
            LoadDirectWarpStriped(threadIdx.x, d_rows + block_offset, rows, guarded_items, temp_storage.last_block_row);
        }

        // Load the referenced values from x and compute the dot-product partial sums
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
#if CUB_PTX_ARCH >= 350
            values[ITEM] *= ThreadLoad<LOAD_LDG>(d_vector + columns[ITEM]);
#else
            values[ITEM] *= TexVector<Value>::Load(columns[ITEM]);
#endif
        }

        // Transpose from warp-striped to blocked arrangement
        BlockExchangeValues(temp_storage.exchange_values).WarpStripedToBlocked(values);

        __syncthreads();

        // Transpose from warp-striped to blocked arrangement
        BlockExchangeRows(temp_storage.exchange_rows).WarpStripedToBlocked(rows);

        // Barrier for smem reuse and coherence
        __syncthreads();

        // Flag row heads by looking for discontinuities
        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
            head_flags,                     // (Out) Head flags
            rows,                           // Original row ids
            NewRowOp(),                     // Functor for detecting start of new rows
            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile

        // Assemble partial product structures
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            partial_sums[ITEM].partial  = values[ITEM];
            partial_sums[ITEM].row      = rows[ITEM];
        }

        // Reduce-value-by-row across partial_sums using an exclusive prefix scan
        PartialProduct block_aggregate;
        BlockScan(temp_storage.scan).ExclusiveScan(
            partial_sums,                   // Scan input
            partial_sums,                   // Scan output
            ReduceByKeyOp(),                // Scan operator
            block_aggregate,                // Block-wide total (unused)
            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total

        // Barrier for smem reuse and coherence
        __syncthreads();

        // Scatter an accumulated dot product if it is the head of a valid row
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (head_flags[ITEM])
            {
                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;

                // Save off the first partial product that this thread block will scatter
                if (partial_sums[ITEM].row == temp_storage.first_block_row)
                {
                    temp_storage.first_product = partial_sums[ITEM].partial;
                }
            }
        }
    }
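
    // Note on the tile loop below: each thread block owns the contiguous
    // nonzero range [block_offset, block_end) handed to it by GridEvenShare.
    // Full tiles of TILE_ITEMS nonzeros are processed with unguarded loads and
    // any remainder is processed with guarded loads.  Rows that begin and end
    // inside this block's range are completed here; the block's first and last
    // partial products are exported to d_block_partials so that rows straddling
    // block boundaries can be stitched together by a second, single-block kernel.
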
    /**
     * Iterate over input tiles belonging to this thread block
     */
    __device__ __forceinline__ void ProcessTiles()
    {
        // Process full tiles
        while (block_offset <= block_end - TILE_ITEMS)
        {
            ProcessTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Process the last, partially-full tile (if present)
        int guarded_items = block_end - block_offset;
        if (guarded_items)
        {
            ProcessTile<false>(block_offset, guarded_items);
        }

        if (threadIdx.x == 0)
        {
            if (gridDim.x == 1)
            {
                // Scatter the final aggregate (this kernel contains only 1 threadblock)
                d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
            }
            else
            {
                // Write the first and last partial products from this thread block so
                // that they can be subsequently "fixed up" in the next kernel.
                PartialProduct first_product;
                first_product.row       = temp_storage.first_block_row;
                first_product.partial   = temp_storage.first_product;

                d_block_partials[blockIdx.x * 2]        = first_product;
                d_block_partials[(blockIdx.x * 2) + 1]  = prefix_op.running_prefix;
            }
        }
    }
};


/**
 * Threadblock abstraction for "fixing up" an array of interblock SpMV partial products.
 */
template <
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD,
    typename    VertexId,
    typename    Value>
struct FinalizeSpmvBlock
{
    //---------------------------------------------------------------------
    // Types and constants
    //---------------------------------------------------------------------

    // Constants
    enum
    {
        TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD,
    };

    // Head flag type
    typedef int HeadFlag;

    // Partial dot product type
    typedef PartialProduct<VertexId, Value> PartialProduct;

    // Parameterized BlockScan type for reduce-value-by-row scan
    typedef BlockScan<PartialProduct, BLOCK_THREADS> BlockScan;

    // Parameterized BlockDiscontinuity type for setting head-flags for each new row segment
    typedef BlockDiscontinuity<HeadFlag, BLOCK_THREADS> BlockDiscontinuity;

    // Shared memory type for this threadblock
    struct TempStorage
    {
        typename BlockScan::TempStorage             scan;           // Smem needed for reduce-value-by-row scan
        typename BlockDiscontinuity::TempStorage    discontinuity;  // Smem needed for head-flagging

        VertexId last_block_row;
    };

    //---------------------------------------------------------------------
    // Thread fields
    //---------------------------------------------------------------------

    TempStorage                             &temp_storage;
    BlockPrefixCallbackOp<PartialProduct>   prefix_op;

    Value                                   *d_result;
    PartialProduct                          *d_block_partials;
    int                                     num_partials;

    //---------------------------------------------------------------------
    // Operations
    //---------------------------------------------------------------------

    /**
     * Constructor
     */
    __device__ __forceinline__
    FinalizeSpmvBlock(
        TempStorage     &temp_storage,
        Value           *d_result,
        PartialProduct  *d_block_partials,
        int             num_partials)
    :
        temp_storage(temp_storage),
        d_result(d_result),
        d_block_partials(d_block_partials),
        num_partials(num_partials)
    {
        // Initialize scalar shared memory values
        if (threadIdx.x == 0)
        {
            VertexId first_block_row            = d_block_partials[0].row;
            VertexId last_block_row             = d_block_partials[num_partials - 1].row;
            temp_storage.last_block_row         = last_block_row;

            // Initialize prefix_op to identity
            prefix_op.running_prefix.row        = first_block_row;
            prefix_op.running_prefix.partial    = Value(0);
        }

        __syncthreads();
    }

    /**
     * Processes a COO input tile of edges, outputting dot products for each row
     */
    template <bool FULL_TILE>
    __device__ __forceinline__ void ProcessTile(
        int block_offset,
        int guarded_items = 0)
    {
        VertexId        rows[ITEMS_PER_THREAD];
        PartialProduct  partial_sums[ITEMS_PER_THREAD];
        HeadFlag        head_flags[ITEMS_PER_THREAD];

        // Load a tile of block partials from previous kernel
        if (FULL_TILE)
        {
            // Full tile
#if CUB_PTX_ARCH >= 350
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#else
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums);
#endif
        }
        else
        {
            // Partial tile (extend zero-valued coordinates of the last partial-product for out-of-bounds items)
            PartialProduct default_sum;
            default_sum.row     = temp_storage.last_block_row;
            default_sum.partial = Value(0);

#if CUB_PTX_ARCH >= 350
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#else
            LoadDirectBlocked(threadIdx.x, d_block_partials + block_offset, partial_sums, guarded_items, default_sum);
#endif
        }

        // Copy out row IDs for row-head flagging
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            rows[ITEM] = partial_sums[ITEM].row;
        }

        // Flag row heads by looking for discontinuities
        BlockDiscontinuity(temp_storage.discontinuity).FlagHeads(
            head_flags,                     // (Out) Head flags
            rows,                           // Original row ids
            NewRowOp(),                     // Functor for detecting start of new rows
            prefix_op.running_prefix.row);  // Last row ID from previous tile to compare with first row ID in this tile

        // Reduce-value-by-row across partial_sums using an exclusive prefix scan
        PartialProduct block_aggregate;
        BlockScan(temp_storage.scan).ExclusiveScan(
            partial_sums,                   // Scan input
            partial_sums,                   // Scan output
            ReduceByKeyOp(),                // Scan operator
            block_aggregate,                // Block-wide total (unused)
            prefix_op);                     // Prefix operator for seeding the block-wide scan with the running total

        // Scatter an accumulated dot product if it is the head of a valid row
        #pragma unroll
        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
        {
            if (head_flags[ITEM])
            {
                d_result[partial_sums[ITEM].row] = partial_sums[ITEM].partial;
            }
        }
    }

    /**
     * Iterate over input tiles belonging to this thread block
     */
    __device__ __forceinline__ void ProcessTiles()
    {
        // Process full tiles
        int block_offset = 0;
        while (block_offset <= num_partials - TILE_ITEMS)
        {
            ProcessTile<true>(block_offset);
            block_offset += TILE_ITEMS;
        }

        // Process final partial tile (if present)
        int guarded_items = num_partials - block_offset;
        if (guarded_items)
        {
            ProcessTile<false>(block_offset, guarded_items);
        }

        // Scatter the final aggregate (this kernel contains only 1 threadblock)
        if (threadIdx.x == 0)
        {
            d_result[prefix_op.running_prefix.row] = prefix_op.running_prefix.partial;
        }
    }
};


/******************************************************************************
 * Kernel entrypoints
 ******************************************************************************/

/**
 * SpMV kernel whose thread blocks each process a contiguous segment of sparse COO tiles.
 */
template <
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD,
    typename    VertexId,
    typename    Value>
__launch_bounds__ (BLOCK_THREADS)
__global__ void CooKernel(
    GridEvenShare<int>                  even_share,
    PartialProduct<VertexId, Value>     *d_block_partials,
    VertexId                            *d_rows,
    VertexId                            *d_columns,
    Value                               *d_values,
    Value                               *d_vector,
    Value                               *d_result)
{
    // Specialize SpMV threadblock abstraction type
    typedef PersistentBlockSpmv<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> PersistentBlockSpmv;

    // Shared memory allocation
    __shared__ typename PersistentBlockSpmv::TempStorage temp_storage;

    // Initialize threadblock even-share to tell us where to start and stop our tile-processing
    even_share.BlockInit();

    // Construct persistent thread block
    PersistentBlockSpmv persistent_block(
        temp_storage,
        d_rows,
        d_columns,
        d_values,
        d_vector,
        d_result,
        d_block_partials,
        even_share.block_offset,
        even_share.block_end);

    // Process input tiles
    persistent_block.ProcessTiles();
}
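
// Note: CooKernel above has each thread block emit two entries into
// d_block_partials (its first partial product and its final running prefix),
// so the array holds 2 * grid_size items.  When more than one block is
// launched, rows that straddle block boundaries are still split across those
// entries; the single-block kernel below runs the same reduce-value-by-row
// scan over d_block_partials to produce the corrected row totals.
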
/**
 * Kernel for "fixing up" an array of interblock SpMV partial products.
 */
template <
    int         BLOCK_THREADS,
    int         ITEMS_PER_THREAD,
    typename    VertexId,
    typename    Value>
__launch_bounds__ (BLOCK_THREADS, 1)
__global__ void CooFinalizeKernel(
    PartialProduct<VertexId, Value>     *d_block_partials,
    int                                 num_partials,
    Value                               *d_result)
{
    // Specialize "fix-up" threadblock abstraction type
    typedef FinalizeSpmvBlock<BLOCK_THREADS, ITEMS_PER_THREAD, VertexId, Value> FinalizeSpmvBlock;

    // Shared memory allocation
    __shared__ typename FinalizeSpmvBlock::TempStorage temp_storage;

    // Construct persistent thread block
    FinalizeSpmvBlock persistent_block(temp_storage, d_result, d_block_partials, num_partials);

    // Process input tiles
    persistent_block.ProcessTiles();
}


//---------------------------------------------------------------------
// Host subroutines
//---------------------------------------------------------------------

/**
 * Simple test of device
 */
template <
    int                         COO_BLOCK_THREADS,
    int                         COO_ITEMS_PER_THREAD,
    int                         COO_SUBSCRIPTION_FACTOR,
    int                         FINALIZE_BLOCK_THREADS,
    int                         FINALIZE_ITEMS_PER_THREAD,
    typename                    VertexId,
    typename                    Value>
void TestDevice(
    CooGraph<VertexId, Value>&  coo_graph,
    Value*                      h_vector,
    Value*                      h_reference)
{
    typedef PartialProduct<VertexId, Value> PartialProduct;

    const int COO_TILE_SIZE = COO_BLOCK_THREADS * COO_ITEMS_PER_THREAD;

    // SOA device storage
    VertexId        *d_rows;            // SOA graph row coordinates
    VertexId        *d_columns;         // SOA graph col coordinates
    Value           *d_values;          // SOA graph values
    Value           *d_vector;          // Vector multiplicand
    Value           *d_result;          // Output row
    PartialProduct  *d_block_partials;  // Temporary storage for communicating dot product partials between threadblocks

    // Create SOA version of coo_graph on host
    int num_edges       = coo_graph.coo_tuples.size();
    VertexId *h_rows    = new VertexId[num_edges];
    VertexId *h_columns = new VertexId[num_edges];
    Value *h_values     = new Value[num_edges];
    for (int i = 0; i < num_edges; i++)
    {
        h_rows[i]       = coo_graph.coo_tuples[i].row;
        h_columns[i]    = coo_graph.coo_tuples[i].col;
        h_values[i]     = coo_graph.coo_tuples[i].val;
    }

    // Get CUDA properties
    Device device_props;
    CubDebugExit(device_props.Init());

    // Determine launch configuration from kernel properties
    int coo_sm_occupancy;
    CubDebugExit(device_props.MaxSmOccupancy(
        coo_sm_occupancy,
        CooKernel<COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, VertexId, Value>,
        COO_BLOCK_THREADS));
    int max_coo_grid_size = device_props.sm_count * coo_sm_occupancy * COO_SUBSCRIPTION_FACTOR;

    // Construct an even-share work distribution
    GridEvenShare<int> even_share(num_edges, max_coo_grid_size, COO_TILE_SIZE);
    int coo_grid_size   = even_share.grid_size;
    int num_partials    = coo_grid_size * 2;

    // Allocate COO device arrays
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_rows,           sizeof(VertexId) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_columns,        sizeof(VertexId) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values,         sizeof(Value) * num_edges));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_vector,         sizeof(Value) * coo_graph.col_dim));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_result,         sizeof(Value) * coo_graph.row_dim));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_block_partials, sizeof(PartialProduct) * num_partials));

    // Copy host arrays to device
    CubDebugExit(cudaMemcpy(d_rows,    h_rows,    sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_columns, h_columns, sizeof(VertexId) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_values,  h_values,  sizeof(Value) * num_edges, cudaMemcpyHostToDevice));
    CubDebugExit(cudaMemcpy(d_vector,  h_vector,  sizeof(Value) * coo_graph.col_dim, cudaMemcpyHostToDevice));

    // Bind textures
    TexVector<Value>::BindTexture(d_vector, coo_graph.col_dim);
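
    // The timed runs below first select the eight-byte shared memory bank mode
    // (presumably to avoid bank conflicts for double-precision values) and then
    // launch the kernels g_timing_iterations + 1 times, treating the first
    // iteration as an untimed warm-up.
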
info printf("CooKernel<%d, %d><<<%d, %d>>>(...), Max SM occupancy: %d\n", COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, coo_grid_size, COO_BLOCK_THREADS, coo_sm_occupancy); if (coo_grid_size > 1) { printf("CooFinalizeKernel<<<1, %d>>>(...)\n", FINALIZE_BLOCK_THREADS); } fflush(stdout); CubDebugExit(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte)); // Run kernel (always run one iteration without timing) GpuTimer gpu_timer; float elapsed_millis = 0.0; for (int i = 0; i <= g_timing_iterations; i++) { gpu_timer.Start(); // Initialize output CubDebugExit(cudaMemset(d_result, 0, coo_graph.row_dim * sizeof(Value))); // Run the COO kernel CooKernel<<>>( even_share, d_block_partials, d_rows, d_columns, d_values, d_vector, d_result); if (coo_grid_size > 1) { // Run the COO finalize kernel CooFinalizeKernel<<<1, FINALIZE_BLOCK_THREADS>>>( d_block_partials, num_partials, d_result); } gpu_timer.Stop(); if (i > 0) elapsed_millis += gpu_timer.ElapsedMillis(); } // Force any kernel stdio to screen CubDebugExit(cudaThreadSynchronize()); fflush(stdout); // Display timing if (g_timing_iterations > 0) { float avg_elapsed = elapsed_millis / g_timing_iterations; int total_bytes = ((sizeof(VertexId) + sizeof(VertexId)) * 2 * num_edges) + (sizeof(Value) * coo_graph.row_dim); printf("%d iterations, average elapsed (%.3f ms), utilized bandwidth (%.3f GB/s), GFLOPS(%.3f)\n", g_timing_iterations, avg_elapsed, total_bytes / avg_elapsed / 1000.0 / 1000.0, num_edges * 2 / avg_elapsed / 1000.0 / 1000.0); } // Check results int compare = CompareDeviceResults(h_reference, d_result, coo_graph.row_dim, true, g_verbose); printf("%s\n", compare ? "FAIL" : "PASS"); AssertEquals(0, compare); // Cleanup TexVector::UnbindTexture(); CubDebugExit(g_allocator.DeviceFree(d_block_partials)); CubDebugExit(g_allocator.DeviceFree(d_rows)); CubDebugExit(g_allocator.DeviceFree(d_columns)); CubDebugExit(g_allocator.DeviceFree(d_values)); CubDebugExit(g_allocator.DeviceFree(d_vector)); CubDebugExit(g_allocator.DeviceFree(d_result)); delete[] h_rows; delete[] h_columns; delete[] h_values; } /** * Compute reference answer on CPU */ template void ComputeReference( CooGraph& coo_graph, Value* h_vector, Value* h_reference) { for (VertexId i = 0; i < coo_graph.row_dim; i++) { h_reference[i] = 0.0; } for (VertexId i = 0; i < coo_graph.coo_tuples.size(); i++) { h_reference[coo_graph.coo_tuples[i].row] += coo_graph.coo_tuples[i].val * h_vector[coo_graph.coo_tuples[i].col]; } } /** * Assign arbitrary values to vector items */ template void AssignVectorValues(Value *vector, int col_dim) { for (int i = 0; i < col_dim; i++) { vector[i] = 1.0; } } /** * Main */ int main(int argc, char** argv) { // Initialize command line CommandLineArgs args(argc, argv); g_verbose = args.CheckCmdLineFlag("v"); args.GetCmdLineArgument("i", g_timing_iterations); // Print usage if (args.CheckCmdLineFlag("help")) { printf("%s\n [--device=] [--v] [--iterations=] [--grid-size=]\n" "\t--type=wheel --spokes=\n" "\t--type=grid2d --width= [--no-self-loops]\n" "\t--type=grid3d --width= [--no-self-loops]\n" "\t--type=market --file=\n" "\n", argv[0]); exit(0); } // Initialize device CubDebugExit(args.DeviceInit()); // Get graph type string type; args.GetCmdLineArgument("type", type); // Generate graph structure CpuTimer timer; timer.Start(); CooGraph coo_graph; if (type == string("grid2d")) { VertexId width; args.GetCmdLineArgument("width", width); bool self_loops = !args.CheckCmdLineFlag("no-self-loops"); printf("Generating %s grid2d width(%d)... ", (self_loops) ? 
"5-pt" : "4-pt", width); fflush(stdout); if (coo_graph.InitGrid2d(width, self_loops)) exit(1); } else if (type == string("grid3d")) { VertexId width; args.GetCmdLineArgument("width", width); bool self_loops = !args.CheckCmdLineFlag("no-self-loops"); printf("Generating %s grid3d width(%d)... ", (self_loops) ? "7-pt" : "6-pt", width); fflush(stdout); if (coo_graph.InitGrid3d(width, self_loops)) exit(1); } else if (type == string("wheel")) { VertexId spokes; args.GetCmdLineArgument("spokes", spokes); printf("Generating wheel spokes(%d)... ", spokes); fflush(stdout); if (coo_graph.InitWheel(spokes)) exit(1); } else if (type == string("market")) { string filename; args.GetCmdLineArgument("file", filename); printf("Generating MARKET for %s... ", filename.c_str()); fflush(stdout); if (coo_graph.InitMarket(filename)) exit(1); } else { printf("Unsupported graph type\n"); exit(1); } timer.Stop(); printf("Done (%.3fs). %d non-zeros, %d rows, %d columns\n", timer.ElapsedMillis() / 1000.0, coo_graph.coo_tuples.size(), coo_graph.row_dim, coo_graph.col_dim); fflush(stdout); if (g_verbose) { cout << coo_graph << "\n"; } // Create vector Value *h_vector = new Value[coo_graph.col_dim]; AssignVectorValues(h_vector, coo_graph.col_dim); if (g_verbose) { printf("Vector[%d]: ", coo_graph.col_dim); DisplayResults(h_vector, coo_graph.col_dim); printf("\n\n"); } // Compute reference answer Value *h_reference = new Value[coo_graph.row_dim]; ComputeReference(coo_graph, h_vector, h_reference); if (g_verbose) { printf("Results[%d]: ", coo_graph.row_dim); DisplayResults(h_reference, coo_graph.row_dim); printf("\n\n"); } // Parameterization for SM35 enum { COO_BLOCK_THREADS = 64, COO_ITEMS_PER_THREAD = 10, COO_SUBSCRIPTION_FACTOR = 4, FINALIZE_BLOCK_THREADS = 256, FINALIZE_ITEMS_PER_THREAD = 4, }; // Run GPU version TestDevice< COO_BLOCK_THREADS, COO_ITEMS_PER_THREAD, COO_SUBSCRIPTION_FACTOR, FINALIZE_BLOCK_THREADS, FINALIZE_ITEMS_PER_THREAD>(coo_graph, h_vector, h_reference); // Cleanup delete[] h_vector; delete[] h_reference; return 0; }