// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------

/**
 * @file
 * cudpp.cpp
 *
 * @brief Main library source file. Implements wrappers for public
 * interface.
 *
 * Main library source file. Implements wrappers for the public
 * interface. These wrappers call application-level operators.
 * As this file grows we may decide to partition it into multiple
 * source files.
 */

/**
 * \defgroup publicInterface CUDPP Public Interface
 * The CUDPP public interface comprises the functions, structs, and enums
 * defined in cudpp.h. Public interface functions call functions in the
 * \link cudpp_app Application-Level\endlink interface. The public
 * interface functions include Plan Interface functions and Algorithm
 * Interface functions. Plan Interface functions are used for creating
 * CUDPP Plan objects, which contain configuration details, intermediate
 * storage space, and, in the case of cudppSparseMatrix(), data. The
 * Algorithm Interface is the set of functions that do the real work
 * of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
 *
 * @{
 */

/** @name Algorithm Interface
 * @{
 */

#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"

/**
 * @brief Performs a scan operation of numElements on its input in
 * GPU memory (d_in) and places the output in GPU memory
 * (d_out), with the scan parameters specified in the plan pointed to by
 * planHandle.
 *
 * The input to a scan operation is an input array, a binary associative
 * operator (like + or max), and an identity element for that operator
 * (+'s identity is 0). The output of scan is the same size as its input.
 * Informally, the output at each element is the result of the operator
 * applied to each input that comes before it. For instance, the
 * output of sum-scan at each element is the sum of all the input
 * elements before that input.
 *
 * More formally, for associative operator
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
 * out<sub>i</sub> = in<sub>0</sub>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * in<sub>1</sub>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * in<sub>i-1</sub>.
 *
 * CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
 * an exclusive scan computes the sum of all input elements before the
 * current element, while an inclusive scan computes the sum of all input
 * elements up to and including the current element.
 *
 * Before calling scan, create an internal plan using cudppPlan().
 *
 * After you are finished with the scan plan, clean up with cudppDestroyPlan().
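 *
 * For example, an exclusive sum-scan of floats might be set up and run as
 * follows (a minimal sketch; it assumes the cudppPlan() signature declared
 * in cudpp.h, and that \a d_in and \a d_out are device pointers with room
 * for \a numElements floats):
 * \code
 * CUDPPConfiguration config;
 * config.op        = CUDPP_ADD;
 * config.datatype  = CUDPP_FLOAT;
 * config.algorithm = CUDPP_SCAN;
 * config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
 *
 * CUDPPHandle scanplan = 0;
 * if (cudppPlan(&scanplan, config, numElements, 1, 0) == CUDPP_SUCCESS)
 * {
 *     // Exclusive sum-scan: d_out[i] = d_in[0] + ... + d_in[i-1]
 *     cudppScan(scanplan, d_out, d_in, numElements);
 *     cudppDestroyPlan(scanplan);
 * }
 * \endcode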
 *
 * @param[in]  planHandle  Handle to plan for this scan
 * @param[out] d_out       output of scan, in GPU memory
 * @param[in]  d_in        input to scan, in GPU memory
 * @param[in]  numElements number of elements to scan
 *
 * @see cudppPlan, cudppDestroyPlan
 */
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
                      void        *d_out,
                      const void  *d_in,
                      size_t      numElements)
{
    CUDPPScanPlan *plan =
        (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        cudppScanDispatch(d_out, d_in, numElements, 1, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}

/**
 * @brief Performs a segmented scan operation of numElements on its input in
 * GPU memory (d_idata) and places the output in GPU memory
 * (d_out), with the scan parameters specified in the plan pointed to by
 * planHandle.
 *
 * The input to a segmented scan operation is an input array of data,
 * an input array of flags which demarcate segments, a binary associative
 * operator (like + or max), and an identity element for that operator
 * (+'s identity is 0). The array of flags is the same length as the input,
 * with 1 marking the first element of a segment and 0 otherwise. The
 * output of segmented scan is the same size as its input. Informally, the
 * output at each element is the result of the operator applied to each
 * input that comes before it in that segment. For instance, the output of
 * segmented sum-scan at each element is the sum of all the input elements
 * before that input in that segment.
 *
 * More formally, for associative operator
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
 * out<sub>i</sub> = in<sub>k</sub>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * in<sub>k+1</sub>
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
 * @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
 * in<sub>i-1</sub>,
 * where k is the index of the first element of the segment in which i lies.
 *
 * We support both "exclusive" and "inclusive" variants. For a segmented
 * sum-scan, the exclusive variant computes the sum of all input elements
 * before the current element in that segment, while the inclusive variant
 * computes the sum of all input elements up to and including the current
 * element, in that segment.
 *
 * Before calling segmented scan, create an internal plan using cudppPlan().
 *
 * After you are finished with the scan plan, clean up with cudppDestroyPlan().
 *
 * @param[in]  planHandle  Handle to plan for this scan
 * @param[out] d_out       output of segmented scan, in GPU memory
 * @param[in]  d_idata     input data to segmented scan, in GPU memory
 * @param[in]  d_iflags    input flags to segmented scan, in GPU memory
 * @param[in]  numElements number of elements to perform segmented scan on
 *
 * @see cudppPlan, cudppDestroyPlan
 */
#if 0 // disabled in this revision: cudpp_segscan.h is not included above
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle        planHandle,
                               void               *d_out,
                               const void         *d_idata,
                               const unsigned int *d_iflags,
                               size_t             numElements)
{
    CUDPPSegmentedScanPlan *plan =
        (CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
#endif

/**
 * @brief Performs numRows parallel scan operations of numElements
 * each on its input (d_in) and places the output in d_out,
 * with the scan parameters set by config. Exactly like cudppScan
 * except that it runs on multiple rows in parallel.
 *
 * Note that to achieve good performance with cudppMultiScan one should
 * allocate the device arrays passed to it so that all rows are aligned
 * to the correct boundaries for the architecture the app is running on.
 * The easiest way to do this is to use cudaMallocPitch() to allocate a
 * 2D array on the device, and to pass the device pitch it returns to
 * cudppPlan() via the \a rowPitch parameter, as in the sketch below.
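 *
 * For example (a minimal sketch; it assumes the cudppPlan() signature
 * declared in cudpp.h, and that \a rowPitch is expressed in elements, so
 * the byte pitch returned by cudaMallocPitch() is divided by the element
 * size):
 * \code
 * float  *d_in = 0, *d_out = 0;
 * size_t pitch = 0; // row pitch in bytes, set by cudaMallocPitch()
 * cudaMallocPitch((void**)&d_in,  &pitch, numElements * sizeof(float), numRows);
 * cudaMallocPitch((void**)&d_out, &pitch, numElements * sizeof(float), numRows);
 *
 * CUDPPConfiguration config;
 * config.op        = CUDPP_ADD;
 * config.datatype  = CUDPP_FLOAT;
 * config.algorithm = CUDPP_SCAN;
 * config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;
 *
 * CUDPPHandle multiscanplan = 0;
 * if (cudppPlan(&multiscanplan, config, numElements, numRows,
 *               pitch / sizeof(float)) == CUDPP_SUCCESS)
 * {
 *     // Scan each of the numRows rows independently, in parallel
 *     cudppMultiScan(multiscanplan, d_out, d_in, numElements, numRows);
 *     cudppDestroyPlan(multiscanplan);
 * }
 * \endcode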
 *
 * @param[in]  planHandle  handle to CUDPPScanPlan
 * @param[out] d_out       output of scan, in GPU memory
 * @param[in]  d_in        input to scan, in GPU memory
 * @param[in]  numElements number of elements (per row) to scan
 * @param[in]  numRows     number of rows to scan in parallel
 *
 * @see cudppScan, cudppPlan
 */
#if 0 // disabled in this revision
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
                           void        *d_out,
                           const void  *d_in,
                           size_t      numElements,
                           size_t      numRows)
{
    CUDPPScanPlan *plan =
        (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
#endif

/**
 * @brief Given an array \a d_in and an array of 1/0 flags in
 * \a d_isValid, returns in \a d_out a compacted array containing only
 * the "valid" values from \a d_in.
 *
 * Takes as input an array of elements in GPU memory
 * (\a d_in) and an equal-sized unsigned int array in GPU memory
 * (\a d_isValid) that indicates which of those input elements are
 * valid. The output is a packed array, in GPU memory, of only those
 * elements marked as valid.
 *
 * Internally, uses cudppScan.
 *
 * Example:
 * \code
 * d_in      = [ a b c d e f ]
 * d_isValid = [ 1 0 1 1 0 1 ]
 * d_out     = [ a c d f ]
 * \endcode
 *
 * @todo [MJH] We need to evaluate whether cudppCompact should be a core member
 * of the public interface. It's not clear to me that what the user always
 * wants is a final compacted array. Often one just wants the array of indices
 * to which each input element should go in the output. The split() routine used
 * in radix sort might make more sense to expose.
 *
 * @param[in]  planHandle         handle to CUDPPCompactPlan
 * @param[out] d_out              compacted output
 * @param[out] d_numValidElements set during cudppCompact; receives the number
 *                                of elements of d_in marked valid in d_isValid
 * @param[in]  d_in               input to compact
 * @param[in]  d_isValid          which elements in d_in are valid
 * @param[in]  numElements        number of elements in d_in
 */
#if 0 // disabled in this revision: cudpp_compact.h is not included above
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle        planHandle,
                         void               *d_out,
                         size_t             *d_numValidElements,
                         const void         *d_in,
                         const unsigned int *d_isValid,
                         size_t             numElements)
{
    CUDPPCompactPlan *plan =
        (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
                             numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
#endif

/**
 * @brief Sorts key-value pairs or keys only
 *
 * Takes as input an array of keys in GPU memory
 * (d_keys) and an optional array of corresponding values,
 * and outputs sorted arrays of keys and (optionally) values in place.
 * Key-value and key-only sort is selected through the configuration of
 * the plan, using the options CUDPP_OPTION_KEYS_ONLY and
 * CUDPP_OPTION_KEY_VALUE_PAIRS.
 *
 * Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
 * any 32-bit type (internally, values are treated only as a payload
 * and cast to unsigned int).
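 *
 * For example, a key-value radix sort of 32-bit unsigned keys might look
 * like the following (a minimal sketch; it assumes the cudppPlan()
 * signature declared in cudpp.h, and that \a d_keys and \a d_values are
 * device arrays of \a numElements unsigned ints):
 * \code
 * CUDPPConfiguration config;
 * config.datatype  = CUDPP_UINT;
 * config.algorithm = CUDPP_SORT_RADIX;
 * config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;
 *
 * CUDPPHandle sortplan = 0;
 * if (cudppPlan(&sortplan, config, numElements, 1, 0) == CUDPP_SUCCESS)
 * {
 *     // Sort on all 32 bits of each key; keys and values move in place
 *     cudppSort(sortplan, d_keys, d_values, 32, numElements);
 *     cudppDestroyPlan(sortplan);
 * }
 * \endcode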
 *
 * @todo Determine if we need to provide an "out of place" sort interface.
 *
 * @param[in]     planHandle  handle to CUDPPSortPlan
 * @param[in,out] d_keys      keys by which key-value pairs will be sorted
 * @param[in,out] d_values    values to be sorted
 * @param[in]     keyBits     the number of least significant bits in each
 *                            element of d_keys to sort by
 * @param[in]     numElements number of elements in d_keys and d_values
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
                      void        *d_keys,
                      void        *d_values,
                      int         keyBits,
                      size_t      numElements)
{
    CUDPPRadixSortPlan *plan =
        (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}

/**
 * @brief Perform matrix-vector multiply y = A*x for arbitrary sparse
 * matrix A and vector x
 *
 * Given a matrix object handle (which has been initialized using
 * cudppSparseMatrix()), this function multiplies the input vector \a d_x
 * by the matrix referred to by \a sparseMatrixHandle, returning the
 * result in \a d_y.
 *
 * @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
 * @param d_y                The output vector, y
 * @param d_x                The input vector, x
 *
 * @see cudppSparseMatrix, cudppDestroySparseMatrix
 */
#if 0 // disabled in this revision: cudpp_spmvmult.h is not included above
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
                                            void        *d_y,
                                            const void  *d_x)
{
    CUDPPSparseMatrixVectorMultiplyPlan *plan =
        (CUDPPSparseMatrixVectorMultiplyPlan*)
        CUDPPPlanManager::GetPlan(sparseMatrixHandle);

    if (plan != NULL)
    {
        cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
    }
}
#endif

/**
 * @brief Rand puts \a numElements random 32-bit elements into \a d_out
 *
 * Outputs \a numElements random values to \a d_out. \a d_out must be of
 * type unsigned int, allocated in device memory.
 *
 * The algorithm used for the random number generation is stored in
 * \a planHandle. Depending on the specification of the pseudo-random
 * number generator (PRNG), the generator may have one or more seeds.
 * To set the seed, use cudppRandSeed().
 *
 * @todo Currently only the MD5 PRNG is supported. We may provide more
 * rand routines in the future.
 *
 * @param[in]  planHandle  Handle to plan for rand
 * @param[in]  numElements number of elements in d_out.
 * @param[out] d_out       output of rand, in GPU memory. Should be an
 *                         array of unsigned integers.
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
#if 0 // disabled in this revision: cudpp_rand.h is not included above
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle, void *d_out, size_t numElements)
{
    CUDPPRandPlan *plan =
        (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan != NULL)
    {
        // dispatch the rand algorithm here
        cudppRandDispatch(d_out, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
    {
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
    }
}
#endif

/**
 * @brief Sets the seed used for rand
 *
 * The seed is crucial to any random number generator, as it allows a
 * sequence of random numbers to be replicated. Since there may be
 * multiple different rand algorithms in CUDPP, cudppRandSeed
 * uses \a planHandle to determine which seed to set. Each rand
 * algorithm has its own unique set of seeds depending on what
 * the algorithm needs.
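 *
 * For example, to seed the MD5 generator and then fill a device array
 * with random words (a minimal sketch; it assumes the cudppPlan()
 * signature declared in cudpp.h, and that \a d_out is a device array of
 * \a numElements unsigned ints):
 * \code
 * CUDPPConfiguration config;
 * config.datatype  = CUDPP_UINT;
 * config.algorithm = CUDPP_RAND_MD5;
 * config.options   = 0;
 *
 * CUDPPHandle randplan = 0;
 * if (cudppPlan(&randplan, config, numElements, 1, 0) == CUDPP_SUCCESS)
 * {
 *     cudppRandSeed(randplan, 95123u); // fixed seed => reproducible sequence
 *     cudppRand(randplan, d_out, numElements);
 *     cudppDestroyPlan(randplan);
 * }
 * \endcode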
 *
 * @param[in] planHandle the handle to the plan which specifies which rand seed to set
 * @param[in] seed       the value to which the internal CUDPP seed will be set
 */
#if 0 // disabled in this revision: cudpp_rand.h is not included above
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
    CUDPPRandPlan *plan =
        (CUDPPRandPlan*)CUDPPPlanManager::GetPlan(planHandle);

    if (plan == NULL)
        return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors

    // switch on the plan to figure out which seed to update
    switch (plan->m_config.algorithm)
    {
    case CUDPP_RAND_MD5:
        plan->m_seed = seed;
        break;
    default:
        break;
    }

    return CUDPP_SUCCESS;
} // end cudppRandSeed
#endif

/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group

// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End: