// ------------------------------------------------------------- // CUDPP -- CUDA Data Parallel Primitives library // ------------------------------------------------------------- // $Revision$ // $Date$ // ------------------------------------------------------------- // This source code is distributed under the terms of license.txt // in the root directory of this source distribution. // ------------------------------------------------------------- /** * @file * radixsort_app.cu * * @brief CUDPP application-level radix sorting routines */ /** @addtogroup cudpp_app * @{ */ /** @name RadixSort Functions * @{ */ #include "cudpp.h" #include "cudpp_util.h" #include "cudpp_radixsort.h" #include "cudpp_scan.h" #include "kernel/radixsort_kernel.cu" #include #include #include #include typedef unsigned int uint; /** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, * starting at startbit. * * Uses cudppScanDispatch() for the prefix sum of radix counters. * * @param[in,out] keys Keys to be sorted. * @param[in,out] values Associated values to be sorted (through keys). * @param[in] plan Configuration information for RadixSort. * @param[in] numElements Number of elements in the sort. **/ template void radixSortStep(uint *keys, uint *values, const CUDPPRadixSortPlan *plan, uint numElements) { const uint eltsPerBlock = SORT_CTA_SIZE * 4; const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; bool fullBlocks = ((numElements % eltsPerBlock) == 0); uint numBlocks = (fullBlocks) ? (numElements / eltsPerBlock) : (numElements / eltsPerBlock + 1); uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? (numElements / eltsPerBlock2) : (numElements / eltsPerBlock2 + 1); bool loop = numBlocks > 65535; uint blocks = loop ? 65535 : numBlocks; uint blocksFind = loop ? 65535 : numBlocks2; uint blocksReorder = loop ? 65535 : numBlocks2; uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0]; bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); if (persist) { loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); blocks = numBlocks; blocksFind = numBlocks2; blocksReorder = numBlocks2; // Run an empty kernel -- this seems to reset some of the CTA scheduling hardware // on GT200, resulting in better scheduling and lower run times if (startbit > 0) { emptyKernel<<>>(); } } if (fullBlocks) { if (loop) { if (persist) { blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) : numCTAs(radixSortBlocks<4, 0, true, false, true>); } radixSortBlocks <<>> ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); } else { radixSortBlocks <<>> ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); } } else { if (loop) { if (persist) { blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) : numCTAs(radixSortBlocks<4, 0, false, false, true>); } radixSortBlocks <<>> ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); } else { radixSortBlocks <<>> ((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks); } } CUT_CHECK_ERROR("radixSortBlocks"); if (fullBlocks) { if (loop) { if (persist) { blocksFind = numCTAs(findRadixOffsets<0, true, true>); } findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } else { findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } } else { if (loop) { if (persist) { blocksFind = numCTAs(findRadixOffsets<0, false, true>); } findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } else { findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } } CUT_CHECK_ERROR("findRadixOffsets"); cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); if (fullBlocks) { if (plan->m_bManualCoalesce) { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) : numCTAs(reorderData<0, true, true, false, true>); } reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } else { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) : numCTAs(reorderData<0, true, false, false, true>); } reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } } else { if (plan->m_bManualCoalesce) { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderData<0, false, true, true, true>) : numCTAs(reorderData<0, false, true, false, true>); } reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } else { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderData<0, false, false, true, true>) : numCTAs(reorderData<0, false, false, false, true>); } reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { reorderData <<>> (keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } } CUT_CHECK_ERROR("radixSortStep"); } /** * @brief Single-block optimization for sorts of fewer than 4 * CTA_SIZE elements * * @param[in,out] keys Keys to be sorted. * @param[in,out] values Associated values to be sorted (through keys). * @param numElements Number of elements in the sort. **/ template void radixSortSingleBlock(uint *keys, uint *values, uint numElements) { bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); if (fullBlocks) { radixSortBlocks<32, 0, true, flip, false> <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> ((uint4*)keys, (uint4*)values, (uint4*)keys, (uint4*)values, numElements, 0); } else { radixSortBlocks<32, 0, false, flip, false> <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> ((uint4*)keys, (uint4*)values, (uint4*)keys, (uint4*)values, numElements, 0); } if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); CUT_CHECK_ERROR("radixSortSingleBlock"); } /** * @brief Main radix sort function * * Main radix sort function. Sorts in place in the keys and values arrays, * but uses the other device arrays as temporary storage. All pointer * parameters are device pointers. Uses cudppScan() for the prefix sum of * radix counters. * * @param[in,out] keys Keys to be sorted. * @param[in,out] values Associated values to be sorted (through keys). * @param[in] plan Configuration information for RadixSort. * @param[in] numElements Number of elements in the sort. * @param[in] flipBits Is set true if key datatype is a float * (neg. numbers) for special float sorting operations. * @param[in] keyBits Number of interesting bits in the key **/ void radixSort(uint *keys, uint* values, const CUDPPRadixSortPlan *plan, size_t numElements, bool flipBits, int keyBits) { if(numElements <= WARP_SIZE) { if (flipBits) radixSortSingleWarp<<<1, numElements>>> (keys, values, numElements); else radixSortSingleWarp<<<1, numElements>>> (keys, values, numElements); CUT_CHECK_ERROR("radixSortSingleWarp"); return; } #ifdef __DEVICE_EMULATION__ printf("bits: %d\n", keyBits); #endif if(numElements <= SORT_CTA_SIZE * 4) { if (flipBits) radixSortSingleBlock(keys, values, numElements); else radixSortSingleBlock(keys, values, numElements); return; } // flip float bits on the first pass, unflip on the last pass if (flipBits) { radixSortStep<4, 0, true, false> (keys, values, plan, numElements); } else { radixSortStep<4, 0, false, false> (keys, values, plan, numElements); } if (keyBits > 4) { radixSortStep<4, 4, false, false> (keys, values, plan, numElements); } if (keyBits > 8) { radixSortStep<4, 8, false, false> (keys, values, plan, numElements); } if (keyBits > 12) { radixSortStep<4, 12, false, false> (keys, values, plan, numElements); } if (keyBits > 16) { radixSortStep<4, 16, false, false> (keys, values, plan, numElements); } if (keyBits > 20) { radixSortStep<4, 20, false, false> (keys, values, plan, numElements); } if (keyBits > 24) { radixSortStep<4, 24, false, false> (keys, values, plan, numElements); } if (keyBits > 28) { if (flipBits) // last pass { radixSortStep<4, 28, false, true> (keys, values, plan, numElements); } else { radixSortStep<4, 28, false, false> (keys, values, plan, numElements); } } } /** * @brief Wrapper to call main radix sort function. For float configuration. * * Calls the main radix sort function. For float configuration. * * @param[in,out] keys Keys to be sorted. * @param[in,out] values Associated values to be sorted (through keys). * @param[in] plan Configuration information for RadixSort. * @param[in] numElements Number of elements in the sort. * @param[in] negativeKeys Is set true if key datatype has neg. numbers. * @param[in] keyBits Number of interesting bits in the key **/ extern "C" void radixSortFloatKeys(float* keys, uint* values, const CUDPPRadixSortPlan *plan, size_t numElements, bool negativeKeys, int keyBits) { radixSort((uint*)keys, (uint*)values, plan, numElements, negativeKeys, keyBits); } /** @brief Perform one step of the radix sort. Sorts by nbits key bits per step, * starting at startbit. * * @param[in,out] keys Keys to be sorted. * @param[in] plan Configuration information for RadixSort. * @param[in] numElements Number of elements in the sort. **/ template void radixSortStepKeysOnly(uint *keys, const CUDPPRadixSortPlan *plan, uint numElements) { const uint eltsPerBlock = SORT_CTA_SIZE * 4; const uint eltsPerBlock2 = SORT_CTA_SIZE * 2; bool fullBlocks = ((numElements % eltsPerBlock) == 0); uint numBlocks = (fullBlocks) ? (numElements / eltsPerBlock) : (numElements / eltsPerBlock + 1); uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ? (numElements / eltsPerBlock2) : (numElements / eltsPerBlock2 + 1); bool loop = numBlocks > 65535; uint blocks = loop ? 65535 : numBlocks; uint blocksFind = loop ? 65535 : numBlocks2; uint blocksReorder = loop ? 65535 : numBlocks2; uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1]; bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold); if (persist) { loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536); blocks = numBlocks; blocksFind = numBlocks2; blocksReorder = numBlocks2; } if (fullBlocks) { if (loop) { if (persist) { blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) : numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>); } radixSortBlocksKeysOnly <<>> ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); } else radixSortBlocksKeysOnly <<>> ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); } else { if (loop) { if (persist) { blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) : numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>); } radixSortBlocksKeysOnly <<>> ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); } else radixSortBlocksKeysOnly <<>> ((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks); } if (fullBlocks) { if (loop) { if (persist) { blocksFind = numCTAs(findRadixOffsets<0, true, true>); } findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } else findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } else { if (loop) { if (persist) { blocksFind = numCTAs(findRadixOffsets<0, false, true>); } findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } else findRadixOffsets <<>> ((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2); } cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan); if (fullBlocks) { if (plan->m_bManualCoalesce) { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderDataKeysOnly<0, true, true, true, true>) : numCTAs(reorderDataKeysOnly<0, true, true, false, true>); } reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderDataKeysOnly<0, true, false, true, true>) : numCTAs(reorderDataKeysOnly<0, true, false, false, true>); } reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } else { if (plan->m_bManualCoalesce) { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderDataKeysOnly<0, false, true, true, true>) : numCTAs(reorderDataKeysOnly<0, false, true, false, true>); } reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else { if (loop) { if (persist) { blocksReorder = unflip ? numCTAs(reorderDataKeysOnly<0, false, false, true, true>) : numCTAs(reorderDataKeysOnly<0, false, false, false, true>); } reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } else reorderDataKeysOnly <<>> (keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2); } } CUT_CHECK_ERROR("radixSortStepKeysOnly"); } /** * @brief Optimization for sorts of fewer than 4 * CTA_SIZE elements (keys only). * * @param[in,out] keys Keys to be sorted. * @param numElements Number of elements in the sort. **/ template void radixSortSingleBlockKeysOnly(uint *keys, uint numElements) { bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0); if (fullBlocks) { radixSortBlocksKeysOnly<32, 0, true, flip, false> <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> ((uint4*)keys, (uint4*)keys, numElements, 1 ); } else { radixSortBlocksKeysOnly<32, 0, false, flip, false> <<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>> ((uint4*)keys, (uint4*)keys, numElements, 1 ); } if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements); CUT_CHECK_ERROR("radixSortSingleBlock"); } /** * @brief Main radix sort function. For keys only configuration. * * Main radix sort function. Sorts in place in the keys array, * but uses the other device arrays as temporary storage. All pointer * parameters are device pointers. Uses scan for the prefix sum of * radix counters. * * @param[in,out] keys Keys to be sorted. * @param[in] plan Configuration information for RadixSort. * @param[in] flipBits Is set true if key datatype is a float (neg. numbers) * for special float sorting operations. * @param[in] numElements Number of elements in the sort. * @param[in] keyBits Number of interesting bits in the key **/ extern "C" void radixSortKeysOnly(uint *keys, const CUDPPRadixSortPlan *plan, bool flipBits, size_t numElements, int keyBits) { if(numElements <= WARP_SIZE) { if (flipBits) radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); else radixSortSingleWarpKeysOnly<<<1, numElements>>>(keys, numElements); return; } if(numElements <= SORT_CTA_SIZE * 4) { if (flipBits) radixSortSingleBlockKeysOnly(keys, numElements); else radixSortSingleBlockKeysOnly(keys, numElements); return; } // flip float bits on the first pass, unflip on the last pass if (flipBits) { radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements); } else { radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements); } if (keyBits > 4) { radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements); } if (keyBits > 8) { radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements); } if (keyBits > 12) { radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements); } if (keyBits > 16) { radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements); } if (keyBits > 20) { radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements); } if (keyBits > 24) { radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements); } if (keyBits > 28) { if (flipBits) // last pass { radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements); } else { radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements); } } } /** * @brief Wrapper to call main radix sort function. For floats and keys only. * * Calls the radixSortKeysOnly function setting parameters for floats. * * @param[in,out] keys Keys to be sorted. * @param[in] plan Configuration information for RadixSort. * @param[in] negativeKeys Is set true if key flipBits is to be true in * radixSortKeysOnly(). * @param[in] numElements Number of elements in the sort. * @param[in] keyBits Number of interesting bits in the key **/ extern "C" void radixSortFloatKeysOnly(float *keys, const CUDPPRadixSortPlan *plan, bool negativeKeys, size_t numElements, int keyBits) { radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits); } extern "C" void initDeviceParameters(CUDPPRadixSortPlan *plan) { int deviceID = -1; if (cudaSuccess == cudaGetDevice(&deviceID)) { cudaDeviceProp devprop; cudaGetDeviceProperties(&devprop, deviceID); int smVersion = devprop.major * 10 + devprop.minor; // sm_12 and later devices don't need help with coalesce in reorderData kernel plan->m_bManualCoalesce = (smVersion < 12); // sm_20 and later devices are better off not using persistent CTAs plan->m_bUsePersistentCTAs = (smVersion < 20); if (plan->m_bUsePersistentCTAs) { // The following is only true on pre-sm_20 devices (pre-Fermi): // Empirically we have found that for some (usually larger) sort // sizes it is better to use exactly as many "persistent" CTAs // as can fill the GPU, which loop over the "blocks" of work. For smaller // arrays it is better to use the typical CUDA approach of launching one CTA // per block of work. // 0-element of these two-element arrays is for key-value sorts // 1-element is for key-only sorts plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288; plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288; plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608; plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0; // create a map of function pointers to register counts for more accurate occupancy calculation // Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it // Note we only insert the "loop" version of the kernels (the one with the last template param = true) // Because those are the only ones that require persistent CTAs that maximally fill the device. computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE); computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE); computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE); computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE); } } } /** * @brief From the programmer-specified sort configuration, * creates internal memory for performing the sort. * * @param[in] plan Pointer to CUDPPRadixSortPlan object **/ extern "C" void allocRadixSortStorage(CUDPPRadixSortPlan *plan) { unsigned int numElements = plan->m_numElements; unsigned int numBlocks = ((numElements % (SORT_CTA_SIZE * 4)) == 0) ? (numElements / (SORT_CTA_SIZE * 4)) : (numElements / (SORT_CTA_SIZE * 4) + 1); switch(plan->m_config.datatype) { case CUDPP_UINT: CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, numElements * sizeof(unsigned int))); if (!plan->m_bKeysOnly) CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, numElements * sizeof(unsigned int))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, WARP_SIZE * numBlocks * sizeof(unsigned int))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, WARP_SIZE * numBlocks * sizeof(unsigned int))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, WARP_SIZE * numBlocks * sizeof(unsigned int))); break; case CUDPP_FLOAT: CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys, numElements * sizeof(float))); if (!plan->m_bKeysOnly) CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues, numElements * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters, WARP_SIZE * numBlocks * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum, WARP_SIZE * numBlocks * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets, WARP_SIZE * numBlocks * sizeof(float))); break; } initDeviceParameters(plan); } /** @brief Deallocates intermediate memory from allocRadixSortStorage. * * * @param[in] plan Pointer to CUDPPRadixSortPlan object **/ extern "C" void freeRadixSortStorage(CUDPPRadixSortPlan* plan) { CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys)); CUDA_SAFE_CALL( cudaFree(plan->m_tempValues)); CUDA_SAFE_CALL( cudaFree(plan->m_counters)); CUDA_SAFE_CALL( cudaFree(plan->m_countersSum)); CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets)); } /** @brief Dispatch function to perform a sort on an array with * a specified configuration. * * This is the dispatch routine which calls radixSort...() with * appropriate template parameters and arguments as specified by * the plan. * @param[in,out] keys Keys to be sorted. * @param[in,out] values Associated values to be sorted (through keys). * @param[in] numElements Number of elements in the sort. * @param[in] keyBits Number of interesting bits in the key* * @param[in] plan Configuration information for RadixSort. **/ extern "C" void cudppRadixSortDispatch(void *keys, void *values, size_t numElements, int keyBits, const CUDPPRadixSortPlan *plan) { if(plan->m_bKeysOnly) { switch(plan->m_config.datatype) { case CUDPP_UINT: radixSortKeysOnly((uint*)keys, plan, false, numElements, keyBits); break; case CUDPP_FLOAT: radixSortFloatKeysOnly((float*)keys, plan, true, numElements, keyBits); } } else { switch(plan->m_config.datatype) { case CUDPP_UINT: radixSort((uint*)keys, (uint*) values, plan, numElements, false, keyBits); break; case CUDPP_FLOAT: radixSortFloatKeys((float*)keys, (uint*) values, plan, numElements, true, keyBits); } } } /** @} */ // end radixsort functions /** @} */ // end cudpp_app