// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// ------------------------------------------------------------- 
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// ------------------------------------------------------------- 
#include "cudpp_maximal_launch.h"
 
inline size_t min(size_t x, size_t y)
{
    return (x <= y) ? x : y;
}

inline size_t max(size_t x, size_t y)
{
    return (x >= y) ? x : y;
}

// computes next highest multiple of f from x
inline size_t multiple(size_t x, size_t f)
{
    return ((x + (f-1)) / f);
}


// MS Excel-style CEIL() function
// Rounds x up to nearest multiple of f
inline size_t ceiling(size_t x, size_t f)
{
    return multiple(x, f) * f;
}

extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs, 
                 cudaDeviceProp &devprop, 
                 size_t bytesDynamicSharedMem,
                 size_t threadsPerBlock)
{
    
    // Determine the maximum number of CTAs that can be run simultaneously for each kernel
    // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
    const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
    const unsigned int warpAllocationMultiple = 2;
    const unsigned int smemAllocationUnit = 512;                                                 // in bytes
    const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024;  // sm_12 GPUs increase threads/SM to 1024
    const unsigned int maxBlocksPerSM = 8;

    // Number of warps (round up to nearest whole multiple of warp size)
    size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
    // Round up to warp allocation multiple
    numWarps = ceiling(numWarps, warpAllocationMultiple);

    // Number of regs is regs per thread times number of warps times warp size
    size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
    // Round up to multiple of register allocation unit size
    regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
    
    size_t smemBytes  = attribs.sharedSizeBytes + bytesDynamicSharedMem;
    size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);

    size_t ctaLimitRegs    = regsPerCTA > 0 ? devprop.regsPerBlock      / regsPerCTA : maxBlocksPerSM;
    size_t ctaLimitSMem    = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
    size_t ctaLimitThreads =                  maxThreadsPerSM           / threadsPerBlock;

    return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}

extern "C"
size_t maxBlocksFromPointer(void*  kernel, 
                            size_t bytesDynamicSharedMem,
                            size_t threadsPerBlock)
{
    cudaDeviceProp devprop;
    int deviceID = -1;
    cudaError_t err = cudaGetDevice(&deviceID);
    if (err == cudaSuccess)
    {
        err = cudaGetDeviceProperties(&devprop, deviceID);
        if (err != cudaSuccess)
            return -1;

        cudaFuncAttributes attr;
        err = cudaFuncGetAttributes(&attr, (const char*)kernel);
        if (err != cudaSuccess)
            return -1;

        return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
    }

    return -1;    
}