// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * scan_cta.cu
 *
 * @brief CUDPP CTA-level scan routines
 */

/** \defgroup cudpp_cta CUDPP CTA-Level API
 *  The CUDPP CTA-Level API contains functions that run on the GPU
 *  device.  These are CUDA \c __device__ functions that are called
 *  from within other CUDA device functions (typically
 *  \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
 *  They are called CTA-level functions because they typically process
 *  data "owned" by each CTA within shared memory, and are agnostic of
 *  any other CTAs that may be running (or how many CTAs are running),
 *  other than to compute appropriate global memory addresses.
 * @{
 */

/** @name Scan Functions
 * @{
 */

#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>

/**
 * @brief Macro to insert necessary __syncthreads() in device emulation mode
 */
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif

/**
 * @brief Template class containing compile-time parameters to the scan functions
 *
 * ScanTraits is passed as a template parameter to all scan functions.  By
 * using these compile-time parameters we can enable generic code while
 * maintaining the highest performance.  This is crucial for the performance
 * of low-level workhorse algorithms like scan.
 *
 * @param T The datatype of the scan
 * @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
 * @param multiRow True if this is a multi-row scan
 * @param unroll True if scan inner loops should be unrolled
 * @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
 * @param backward True if this is a backward scan
 * @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELTS_PER_THREAD elements)
 * @param exclusive True for exclusive scans, false for inclusive scans
 */
template <class T, CUDPPOperator oper, bool multiRow, bool unroll, bool sums,
          bool backward, bool fullBlock, bool exclusive>
class ScanTraits
{
public:
    //! Returns true if this is a backward scan
    static __device__ bool isBackward()  { return backward; };
    //! Returns true if this is an exclusive scan
    static __device__ bool isExclusive() { return exclusive; };
    //! Returns true if this is a multi-row scan
    static __device__ bool isMultiRow()  { return multiRow; };
    //! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
    static __device__ bool writeSums()   { return sums; };
    //! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELTS_PER_THREAD elements
    static __device__ bool isFullBlock() { return fullBlock; };

    //! The operator function used for the scan
    static __device__ T op(const T a, const T b)
    {
        return Operator<T, oper>::op(a, b);
    }

    //! The identity value used by the scan
    static __device__ T identity() { return Operator<T, oper>::identity(); }
};

//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80.  This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1
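/* Illustrative only (not part of the library source): one way the traits class
 * above might be instantiated -- here for a forward, exclusive, single-row add
 * scan over full blocks, with per-block sums written out.  The parameter order
 * follows the @param list in the ScanTraits documentation; CUDPP_ADD is the
 * addition member of the ::CUDPPOperator enum.
 */
typedef ScanTraits<float,      // T         : element type
                   CUDPP_ADD,  // oper      : scan operator
                   false,      // multiRow  : single-row scan
                   true,       // unroll    : unroll inner loops
                   true,       // sums      : write per-block sums
                   false,      // backward  : forward scan
                   true,       // fullBlock : every block is full
                   true>       // exclusive : exclusive scan
        ExampleExclusiveAddScanTraits;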
/**
 * @brief Handles loading input data from global memory to shared memory
 * (vec4 version)
 *
 * Load a chunk of 8*blockDim.x elements from global memory into a
 * shared memory array.  Each thread loads two T4 elements (where
 * T4 is, e.g., int4 or float4), computes the scan of those two vec4s in
 * thread-local arrays (in registers), and writes the two total sums of the
 * vec4s into shared memory, where they will be cooperatively scanned with
 * the other partial sums by all threads in the CTA.
 *
 * @param[out] s_out The output (shared) memory array
 * @param[out] threadScan0 Intermediate per-thread partial sums array 1
 * @param[out] threadScan1 Intermediate per-thread partial sums array 2
 * @param[in] d_in The input (device) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] iDataOffset The offset (in vec4 elements) of this thread block's
 * chunk within the input array in global memory
 * @param[out] ai The shared memory address for the thread's first element
 * (returned for reuse)
 * @param[out] bi The shared memory address for the thread's second element
 * (returned for reuse)
 * @param[out] aiDev The device memory address for this thread's first element
 * (returned for reuse)
 * @param[out] biDev The device memory address for this thread's second element
 * (returned for reuse)
 */
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T        *s_out,
                                        T        threadScan0[4],
                                        T        threadScan1[4],
                                        const T  *d_in,
                                        int      numElements,
                                        int      iDataOffset,
                                        int      &ai,
                                        int      &bi,
                                        int      &aiDev,
                                        int      &biDev)
{
    int thid = threadIdx.x;
    aiDev = iDataOffset + thid;
    biDev = aiDev + blockDim.x;

    // convert to 4-vector
    typename typeToVector<T, 4>::Result  tempData;
    typename typeToVector<T, 4>::Result* inData = (typename typeToVector<T, 4>::Result*)d_in;

    ai = thid;
    bi = thid + blockDim.x;
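    // Backward scans reuse the forward CTA-scan machinery: each vec4 is scanned
    // from .w down to .x in registers, and the array of per-vec4 totals in
    // shared memory is then reversed in place, so the (forward) CTA-level scan
    // of those totals yields, for every vec4, the combined value of all elements
    // that follow it in the original array.  storeSharedChunkToMem4() reverses
    // the shared array again before writing results back out.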
    // read into tempData
    if (traits::isBackward())
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[3] = tempData.w;
            threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
            threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op(tempData.x, threadScan0[1]);
        }
        else
        {
            threadScan0[3] = traits::identity();
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
            threadScan0[0] = s_out[ai]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[3] = tempData.w;
            threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
            threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op(tempData.x, threadScan1[1]);
        }
        else
        {
            threadScan1[3] = traits::identity();
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
            threadScan1[0] = s_out[bi]
                           = traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
        }

        __syncthreads();

        // reverse the per-vec4 totals in shared memory
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx  = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp = s_out[leftIdx];
                s_out[leftIdx]  = s_out[rightIdx];
                s_out[rightIdx] = tmp;
            }
        }
        __syncthreads();
    }
    else
    {
        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[aiDev];
            threadScan0[0] = tempData.x;
            threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
            threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(tempData.w, threadScan0[2]);
        }
        else
        {
            threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
            threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
            threadScan0[3] = s_out[ai]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            tempData       = inData[biDev];
            threadScan1[0] = tempData.x;
            threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
            threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(tempData.w, threadScan1[2]);
        }
        else
        {
            threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
            threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
            threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
            threadScan1[3] = s_out[bi]
                           = traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
        }
        __syncthreads();
    }
}
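/* Illustrative sketch (not part of the library): the in-register step that
 * loadSharedChunkFromMem4() performs for each vec4, shown in isolation and
 * specialized to float and addition.  One 128-bit float4 load feeds a serial
 * scan held entirely in registers; only the total has to go to shared memory
 * for the CTA-wide scan.
 */
static __device__ float exampleSerialScanFloat4(float4 v, float r[4])
{
    r[0] = v.x;            // inclusive scan of the four lanes ...
    r[1] = r[0] + v.y;
    r[2] = r[1] + v.z;
    r[3] = r[2] + v.w;
    return r[3];           // ... and the vec4's total, scanned across the CTA next
}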
/**
 * @brief Handles storing result data from shared memory to global memory
 * (vec4 version)
 *
 * Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
 * into a device memory array.  Each thread reads two elements from shared
 * memory, adds them to the intermediate sums computed in
 * loadSharedChunkFromMem4(), and writes two T4 elements (where
 * T4 is, e.g., int4 or float4) to global memory.
 *
 * @param[out] d_out The output (device) memory array
 * @param[in] threadScan0 Intermediate per-thread partial sums array 1
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] threadScan1 Intermediate per-thread partial sums array 2
 * (contents computed in loadSharedChunkFromMem4())
 * @param[in] s_in The input (shared) memory array
 * @param[in] numElements The number of elements in the array being scanned
 * @param[in] oDataOffset The offset of the output array in global memory
 * for this thread block
 * @param[in] ai The shared memory address for the thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] bi The shared memory address for the thread's second element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] aiDev The device memory address for this thread's first element
 * (computed in loadSharedChunkFromMem4())
 * @param[in] biDev The device memory address for this thread's second element
 * (computed in loadSharedChunkFromMem4())
 */
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T   *d_out,
                                       T   threadScan0[4],
                                       T   threadScan1[4],
                                       T   *s_in,
                                       int numElements,
                                       int oDataOffset,
                                       int ai,
                                       int bi,
                                       int aiDev,
                                       int biDev)
{
    // Convert to 4-vector
    typename typeToVector<T, 4>::Result tempData;
    typename typeToVector<T, 4>::Result* outData = (typename typeToVector<T, 4>::Result*)d_out;
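    // At this point each entry of s_in holds the CTA-level exclusive prefix of
    // the corresponding vec4's total (computed by scanCTA()).  Each thread
    // folds that prefix into the per-element partials it kept in registers
    // (threadScan0/threadScan1) to produce four final results per vec4,
    // written with a single vec4 store when the block is full.  For backward
    // scans s_in is first reversed again so each slot lines up with its
    // original vec4.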
    // write results to global memory
    if (traits::isBackward())
    {
        if (ai < CTA_SIZE)
        {
            unsigned int leftIdx  = ai;
            unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;

            if (leftIdx < rightIdx)
            {
                T tmp = s_in[leftIdx];
                s_in[leftIdx]  = s_in[rightIdx];
                s_in[rightIdx] = tmp;
            }
        }
        __syncthreads();

        T temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan0[3]);
            tempData.y = traits::op(temp, threadScan0[2]);
            tempData.x = traits::op(temp, threadScan0[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan0[3]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.x = traits::op(temp, threadScan0[0]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            // we can't use a vec4 store because the original array isn't a
            // multiple of 4 elements
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.w = temp;
            tempData.z = traits::op(temp, threadScan1[3]);
            tempData.y = traits::op(temp, threadScan1[2]);
            tempData.x = traits::op(temp, threadScan1[1]);
        }
        else
        {
            tempData.w = traits::op(temp, threadScan1[3]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.x = traits::op(temp, threadScan1[0]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            // partial block: element-wise guarded stores again
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }
    }
    else
    {
        T temp;
        temp = s_in[ai];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan0[0]);
            tempData.z = traits::op(temp, threadScan0[1]);
            tempData.w = traits::op(temp, threadScan0[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan0[0]);
            tempData.y = traits::op(temp, threadScan0[1]);
            tempData.z = traits::op(temp, threadScan0[2]);
            tempData.w = traits::op(temp, threadScan0[3]);
        }

        int i = aiDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[aiDev] = tempData;
        }
        else
        {
            // we can't use a vec4 store because the original array isn't a
            // multiple of 4 elements
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }

#ifdef DISALLOW_LOADSTORE_OVERLAP
        __syncthreads();
#endif

        temp = s_in[bi];

        if (traits::isExclusive())
        {
            tempData.x = temp;
            tempData.y = traits::op(temp, threadScan1[0]);
            tempData.z = traits::op(temp, threadScan1[1]);
            tempData.w = traits::op(temp, threadScan1[2]);
        }
        else
        {
            tempData.x = traits::op(temp, threadScan1[0]);
            tempData.y = traits::op(temp, threadScan1[1]);
            tempData.z = traits::op(temp, threadScan1[2]);
            tempData.w = traits::op(temp, threadScan1[3]);
        }

        i = biDev * 4;
        if (traits::isFullBlock() || i + 3 < numElements)
        {
            outData[biDev] = tempData;
        }
        else
        {
            // we can't use a vec4 store because the original array isn't a
            // multiple of 4 elements
            if ( i    < numElements) { d_out[i]   = tempData.x;
            if ((i+1) < numElements) { d_out[i+1] = tempData.y;
            if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
        }
    }
}
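/* Illustrative counterpart to exampleSerialScanFloat4() above (not part of the
 * library): how storeSharedChunkToMem4() turns the CTA-level prefix for a vec4
 * back into four per-element results, shown for the forward exclusive case
 * with float addition.
 */
static __device__ float4 exampleCombinePrefixFloat4(float prefix, const float r[4])
{
    float4 out;
    out.x = prefix;           // exclusive: each element's own value is dropped,
    out.y = prefix + r[0];    // so the vec4 total r[3] is not needed here
    out.z = prefix + r[1];
    out.w = prefix + r[2];
    return out;
}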
/** @brief Scan all warps of a CTA without synchronization
 *
 * The warp-scan algorithm breaks a block of data into warp-sized chunks, and
 * scans the chunks independently with a warp of threads each.  Because warps
 * execute instructions in SIMD fashion, there is no need to synchronize in
 * order to share data within a warp (only across warps).  Also, in SIMD the
 * most efficient algorithm is a step-efficient algorithm.  Therefore, within
 * each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
 * to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
 * work-efficient tree-based algorithm described by Guy Blelloch [1990] that
 * takes 2 * log2(N) steps and is in general more complex to implement.
 * Previous versions of CUDPP used the Blelloch algorithm.  For current GPUs,
 * the warp size is 32, so this takes five steps per warp.
 *
 * Each thread is responsible for a single element of the array to be scanned.
 * Each thread inputs a single value to the scan via \a val and returns
 * its own scanned result element.  The threads of each warp cooperate
 * via the shared memory array \a s_data to scan WARP_SIZE elements.
 *
 * Template parameter \a maxlevel allows this warpscan to be performed on
 * partial warps.  For example, if only the first 8 elements of each warp need
 * to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
 *
 * The computation uses 2 * WARP_SIZE elements of shared memory per warp to
 * enable warps to offset beyond their input data and receive the identity
 * element without using any branch instructions.
 *
 * \note s_data is declared volatile here to prevent the compiler from
 * optimizing away writes to shared memory, and to ensure correct intra-warp
 * communication in the absence of __syncthreads.
 *
 * @return The result of the warp scan for the current thread
 * @param[in] val The current thread's input to the scan
 * @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
 * elements used to compute the warp scans
 */
template <class T, class traits, int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
    // The following is the same as 2 * 32 * warpId + threadInWarp =
    // 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
    int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
    s_data[idx] = traits::identity();
    idx += WARP_SIZE;
    T t = s_data[idx] = val;
    __EMUSYNC;

    // This code is needed because the warp size of device emulation
    // is only 1 thread, so sync-less cooperation within a warp doesn't
    // work.
#ifdef __DEVICE_EMULATION__
    t = s_data[idx -  1]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx], t); __EMUSYNC;
    t = s_data[idx -  2]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx], t); __EMUSYNC;
    t = s_data[idx -  4]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx], t); __EMUSYNC;
    t = s_data[idx -  8]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx], t); __EMUSYNC;
    t = s_data[idx - 16]; __EMUSYNC;
    s_data[idx] = traits::op(s_data[idx], t); __EMUSYNC;
#else
    if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -  1]); }
    if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -  2]); }
    if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -  4]); }
    if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -  8]); }
    if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 16]); }
#endif

    return s_data[idx - 1];      // convert inclusive -> exclusive
}
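/* Illustrative reference (not part of the library): what a single warp's pass
 * through warpscan() computes, rewritten as scalar loops over the padded array
 * and specialized to float addition (identity 0).  The low WARP_SIZE slots hold
 * the identity so the offset reads in each step never branch on lane index;
 * iterating downward mimics the read-before-write behavior of a SIMD step.
 */
static __device__ void exampleWarpScanReference(const float in[32], float out[32])
{
    float padded[64];
    for (int i = 0; i < 32; ++i) padded[i]      = 0.0f;   // identity pad
    for (int i = 0; i < 32; ++i) padded[32 + i] = in[i];

    // log2(32) = 5 Hillis-Steele steps; after the last step each slot holds
    // the inclusive scan of the 32 inputs up to and including its own lane
    for (int offset = 1; offset <= 16; offset *= 2)
        for (int i = 63; i >= 32; --i)
            padded[i] = padded[i] + padded[i - offset];

    for (int i = 0; i < 32; ++i)
        out[i] = padded[32 + i - 1];                      // inclusive -> exclusive
}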
/** @brief Perform a full CTA scan using the warp-scan algorithm
 *
 * As described in the comment for warpscan(), the warp-scan algorithm breaks
 * a block of data into warp-sized chunks, and scans the chunks independently
 * with a warp of threads each.  To complete the scan, each warp j then
 * writes its total (the last element of its inclusive scan) to element j of a
 * temporary shared array.  Then a single warp exclusive-scans these
 * "warp sums".  Finally, each thread adds the result of the warp-sum scan to
 * the result of the scan from the first pass.
 *
 * Because each CTA scans 2*CTA_SIZE elements (two per thread), we have to
 * call warpscan() twice.
 *
 * @param x The first input value for the current thread
 * @param y The second input value for the current thread
 * @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
 * performing the scan
 */
template <class T, class traits>
__device__ void scanWarps(T x, T y,
                          T *s_data)
{
    T val  = warpscan<T, traits, 4>(x, s_data);
    __syncthreads();
    T val2 = warpscan<T, traits, 4>(y, s_data);

    int idx = threadIdx.x;

    if ((idx & 31) == 31)
    {
        s_data[idx >> 5]                = traits::op(val, x);
        s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
    }
    __syncthreads();

#ifndef __DEVICE_EMULATION__
    if (idx < 32)
#endif
    {
        // exclusive-scan the warp sums (only the first warp participates)
        s_data[idx] = warpscan<T, traits, 4>(s_data[idx], s_data);
    }
    __syncthreads();

    val  = traits::op(val, s_data[idx >> 5]);
    val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);

    __syncthreads();

    s_data[idx]              = val;
    s_data[idx + blockDim.x] = val2;
}

/**
 * @brief CTA-level scan routine; scans s_data in shared memory in each thread block
 *
 * This function is the main CTA-level scan function.  It may be called by other
 * CUDA __global__ or __device__ functions.  This function scans 2 * CTA_SIZE elements.
 * Each thread is responsible for one element in each half of the input array.
 * \note This code is intended to be run on a CTA of 128 threads.  Other sizes are
 * untested.
 *
 * @param[in,out] s_data The array to be scanned in shared memory
 * @param[out] d_blockSums Array of per-block sums
 * @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
 */
template <class T, class traits>
__device__ void scanCTA(T            *s_data,
                        T            *d_blockSums,
                        unsigned int blockSumIndex)
{
    T val  = s_data[threadIdx.x];
    T val2 = s_data[threadIdx.x + blockDim.x];
    __syncthreads();

    scanWarps<T, traits>(val, val2, s_data);
    __syncthreads();

    if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
    {
        d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
    }

#ifdef __DEVICE_EMULATION__
    // must sync in emulation mode when doing backward scans, because otherwise
    // the shared memory array will get reversed before the block sums are read!
    if (traits::isBackward())
        __syncthreads();
#endif
}

/** @} */ // end scan functions
/** @} */ // end cudpp_cta
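/* Illustrative sketch only (not part of CUDPP): roughly how a kernel-level
 * wrapper might compose the CTA-level routines above for a single-row scan.
 * The real kernels live elsewhere in the library (see the Kernel-Level API);
 * the kernel name, its exact signature, and the shared-memory sizing below are
 * assumptions for illustration.  Assumes blockDim.x == CTA_SIZE == 128, with
 * each CTA processing 8 * CTA_SIZE consecutive elements.
 */
template <class T, class traits>
__global__ void exampleScan4Kernel(T       *d_out,
                                   const T *d_in,
                                   T       *d_blockSums,
                                   int      numElements)
{
    // one partial sum per vec4 handled by the CTA
    __shared__ T s_data[2 * CTA_SIZE];

    T threadScan0[4];            // in-register scan of this thread's first vec4
    T threadScan1[4];            // in-register scan of this thread's second vec4
    int ai, bi, aiDev, biDev;

    // offset of this CTA's chunk, measured in vec4 elements
    int devOffset = blockIdx.x * (blockDim.x << 1);

    loadSharedChunkFromMem4<T, traits>(s_data, threadScan0, threadScan1,
                                       d_in, numElements, devOffset,
                                       ai, bi, aiDev, biDev);

    scanCTA<T, traits>(s_data, d_blockSums, blockIdx.x);

    // make the scanned partial sums visible to all threads before they are
    // (possibly reversed and) written back out
    __syncthreads();

    storeSharedChunkToMem4<T, traits>(d_out, threadScan0, threadScan1,
                                      s_data, numElements, devOffset,
                                      ai, bi, aiDev, biDev);
}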