//------------------------------------------------------------------------------
// CHOLMOD/GPU/cholmod_gpu: GPU utilities for CHOLMOD
//------------------------------------------------------------------------------

// CHOLMOD/GPU Module.  Copyright (C) 2005-2022, Timothy A. Davis.
// All Rights Reserved.
// SPDX-License-Identifier: GPL-2.0+

//------------------------------------------------------------------------------

/* Primary routines:
 * -----------------
 * cholmod_gpu_memorysize       determine free memory on current GPU
 * cholmod_gpu_probe            ensure a GPU is available
 * cholmod_gpu_allocate         allocate GPU resources
 * cholmod_gpu_deallocate       free GPU resources
 */

#include "cholmod_internal.h"

#ifdef CHOLMOD_HAS_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif

#define MINSIZE (64 * 1024 * 1024)

/* ========================================================================== */
/* === cholmod_gpu_memorysize =============================================== */
/* ========================================================================== */

/* Determine the amount of free memory on the current GPU.  To use another
 * GPU, use cudaSetDevice (k) prior to calling this routine, where k is an
 * integer in the range 0 to the number of devices-1.   If the free size is
 * less than 64 MB, then the routine returns 1 (failure) and *available_mem
 * is left at zero.  Normal usage:
 *
 *  Common->useGPU = 1 ;
 *  err = cholmod_gpu_memorysize (&totmem, &availmem, Common);
 *  Returns 1 if GPU requested but not available, 0 otherwise
 */

/* poll_gpu: test whether one block of s bytes can be allocated on the current
 * GPU.  The block is released again before returning, so the call leaves no
 * allocation behind.  A request of zero bytes is reported as FALSE.  Without
 * CHOLMOD_HAS_CUDA the result is always FALSE. */

static int poll_gpu (size_t s)          /* TRUE if OK, FALSE otherwise */
{
#ifdef CHOLMOD_HAS_CUDA
    void *p = NULL ;
    if (s == 0)
    {
        return (FALSE) ;
    }
    if (cudaMalloc (&p, s) != cudaSuccess)
    {
        /* A failed allocation is an expected outcome when probing.  Reset the
           runtime's recorded last error so that a later cudaGetLastError
           elsewhere does not report this deliberate failure. */
        (void) cudaGetLastError ( ) ;
        return (FALSE) ;
    }
    if (cudaFree (p) != cudaSuccess)
    {
        /* the block could not be released; do not report this size as OK */
        (void) cudaGetLastError ( ) ;
        return (FALSE) ;
    }
    return (TRUE) ;
#else
    return (FALSE) ;
#endif
}

/* Reports the total memory of the current GPU and the size of a block that
 * was actually allocated (and freed again) on it.
 *
 * total_mem:      set to the total memory reported by cudaMemGetInfo, or 0.
 * available_mem:  set to a block size for which poll_gpu succeeded, or 0.
 * Common:         only Common->useGPU is read; nothing is done unless it is 1.
 *
 * Both outputs are zeroed first.  When CHOLMOD_INT64 is not defined the
 * routine returns 0 immediately with both outputs still zero.  Returns 1 if
 * the memory query fails or if not even 64 MB (MINSIZE) can be allocated,
 * and 0 otherwise. */

int CHOLMOD(gpu_memorysize)      /* returns 1 on error, 0 otherwise */
(
    size_t         *total_mem,
    size_t         *available_mem,
    cholmod_common *Common
)
{
    *total_mem = 0;
    *available_mem = 0;
#ifndef CHOLMOD_INT64
    return 0;
#endif

    if (Common->useGPU != 1)
    {
        return (0) ;                    /* not using the GPU at all */
    }

#ifdef CHOLMOD_HAS_CUDA
    {
        size_t good, bad, s ;
        size_t total_free = 0, total_memory = 0 ;
        int k ;

        /* find the total amount of free memory; if the query itself fails
           the two sizes are meaningless, so report failure instead of
           continuing with them */
        if (cudaMemGetInfo (&total_free, &total_memory) != cudaSuccess)
        {
            return (1) ;                /* GPU requested but not usable */
        }

        *total_mem = total_memory;

        if (total_free < MINSIZE)
        {
            return (1) ;                /* not even 64MB; return failure code */
        }

        /* try a bit less than the total free memory */
        s = MAX (MINSIZE, total_free*0.98) ;
        if (poll_gpu (s))
        {
            *available_mem = s;
            return (0) ;  /* no error */
        }

        /* s is the smallest size known to fail; use it as the upper bound */
        bad = s ;

        /* ensure s = 64 MB is OK */
        if (!poll_gpu (MINSIZE))
        {
            return (1) ;                /* not even 64MB; return failure code */
        }

        /* 8 iterations of binary search between a size that is known to
           succeed and a size that is known to fail */
        good = MINSIZE ;                /* already known to be OK */
        for (k = 0 ; k < 8 ; k++)
        {
            s = good + (bad - good) / 2 ;   /* midpoint, no overflow */
            if (poll_gpu (s))
            {
                good = s ;              /* s is OK, increase good */
            }
            else
            {
                bad = s ;               /* s failed, decrease bad */
            }
        }

        *available_mem = good;
    }
#endif

    return (0) ; /* no error */
}


/* ========================================================================== */
/* === cholmod_gpu_probe ==================================================== */
/* ========================================================================== */
/*
 * Used to ensure that a suitable GPU is available.  As this version of
 * CHOLMOD can only utilize a single GPU, only the default (i.e. selected as
 * 'best' by the NVIDIA driver) is verified as suitable.  If this selection
 * is incorrect, the user can select the proper GPU with the
 * CUDA_VISIBLE_DEVICES environment variable.
 *
 * To be considered suitable, the GPU must have a compute capability major
 * version greater than 1 and more than 1 GB (1e9 bytes) of device memory.
 */

/* Checks whether the currently selected CUDA device is suitable.
 *
 * Returns 1 when Common->useGPU is 1 and the current device has a compute
 * capability major version greater than 1 and more than 1e9 bytes of global
 * memory.  Returns 0 otherwise: GPU not requested, no device present, a
 * failed CUDA query, an unsuitable device, or a build without
 * CHOLMOD_HAS_CUDA.  A warning is printed through CHOLMOD_GPU_PRINTF when the
 * GPU was requested but no suitable device was found. */

int CHOLMOD(gpu_probe) ( cholmod_common *Common )
{

#ifdef CHOLMOD_HAS_CUDA
    int ngpus = 0, idevice = 0 ;
    struct cudaDeviceProp gpuProp;

    if (Common->useGPU != 1)
    {
        return (0) ;
    }

    /* Every query must succeed before its result is inspected; a failed
       query is treated the same as "no suitable GPU". */
    if ( cudaGetDeviceCount (&ngpus) == cudaSuccess && ngpus > 0
        && cudaGetDevice (&idevice) == cudaSuccess
        && cudaGetDeviceProperties (&gpuProp, idevice) == cudaSuccess )
    {
        if ( gpuProp.major > 1 && 1.0e-9*gpuProp.totalGlobalMem > 1.0 ) {
            return 1;  /* useGPU = 1 */
        }
    }
    CHOLMOD_GPU_PRINTF (("GPU WARNING: useGPUs was selected, "
        "but no applicable GPUs were found. useGPU reset to FALSE.\n")) ;
#endif

    /* no GPU is available */
    return 0;  /* useGPU = 0 */
}

/* ========================================================================== */
/* === cholmod_gpu_deallocate =============================================== */
/* ========================================================================== */

/*
 * Deallocate all GPU related buffers.
 */

/* Releases the device memory pool and the pinned host memory pool held in
 * Common (when present), clears the matching pointer and size fields, and
 * then calls gpu_end.  A failed free is reported through ERROR with
 * CHOLMOD_GPU_PROBLEM; the fields are cleared regardless.  Always returns 0.
 * Without CHOLMOD_HAS_CUDA nothing is done. */

int CHOLMOD(gpu_deallocate)
(
    cholmod_common *Common
)
{

#ifdef CHOLMOD_HAS_CUDA
    cudaError_t status ;

    /* device pool */
    if (Common->dev_mempool != NULL)
    {
        status = cudaFree (Common->dev_mempool) ;
        if (status != cudaSuccess)
        {
            ERROR (CHOLMOD_GPU_PROBLEM,
                "GPU error when freeing device memory.") ;
        }
    }
    Common->dev_mempool_size = 0 ;
    Common->dev_mempool = NULL ;

    /* pinned host pool */
    if (Common->host_pinned_mempool != NULL)
    {
        status = cudaFreeHost (Common->host_pinned_mempool) ;
        if (status != cudaSuccess)
        {
            ERROR (CHOLMOD_GPU_PROBLEM,
                "GPU error when freeing host pinned memory.") ;
        }
    }
    Common->host_pinned_mempool_size = 0 ;
    Common->host_pinned_mempool = NULL ;

    /* cuBLAS handle, streams, and events */
    CHOLMOD (gpu_end) (Common) ;
#endif

    return (0) ;
}

//------------------------------------------------------------------------------
// cholmod_gpu_start: allocate the cublasHandle and the streams
//------------------------------------------------------------------------------

/* Creates the cuBLAS handle, the CUDA streams, and the CUDA events stored in
 * Common.  If Common->cublasHandle is already non-NULL, nothing is created.
 * On any creation failure the error is reported through ERROR with
 * CHOLMOD_GPU_PROBLEM, gpu_end is called to destroy what exists, and 0 is
 * returned.  Returns 1 otherwise (including builds without
 * CHOLMOD_HAS_CUDA). */

int CHOLMOD(gpu_start)
(
    cholmod_common *Common
)
{
#ifdef CHOLMOD_HAS_CUDA

    /* a non-NULL handle means there is nothing to create here */
    if (Common->cublasHandle != NULL)
    {
        return (1) ;
    }

    /* cuBLAS handle */
    if (cublasCreate (&(Common->cublasHandle)) != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "CUBLAS initialization") ;
        CHOLMOD (gpu_end) (Common) ;
        return (0) ;
    }

    int i ;

    /* one stream per host supernode buffer */
    for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++)
    {
        if (cudaStreamCreate (&(Common->gpuStream [i])) != cudaSuccess)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA stream") ;
            CHOLMOD (gpu_end) (Common) ;
            return (0) ;
        }
    }

    /* the three potrf events, created with timing disabled */
    for (i = 0 ; i < 3 ; i++)
    {
        if (cudaEventCreateWithFlags (&(Common->cublasEventPotrf [i]),
                cudaEventDisableTiming) != cudaSuccess)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
            CHOLMOD (gpu_end) (Common) ;
            return (0) ;
        }
    }

    /* one "buffer free" event per host supernode buffer */
    for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++)
    {
        if (cudaEventCreateWithFlags (&(Common->updateCBuffersFree [i]),
                cudaEventDisableTiming) != cudaSuccess)
        {
            ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event") ;
            CHOLMOD (gpu_end) (Common) ;
            return (0) ;
        }
    }

    /* the single "kernels complete" event */
    if (cudaEventCreateWithFlags (&(Common->updateCKernelsComplete),
            cudaEventDisableTiming) != cudaSuccess)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "CUDA updateCKernelsComplete event") ;
        CHOLMOD (gpu_end) (Common) ;
        return (0) ;
    }
#endif

    return (1) ;
}

//------------------------------------------------------------------------------
// cholmod_gpu_end: destroy the cublasHandle and the streams
//------------------------------------------------------------------------------

/* Destroys the cuBLAS handle, the CUDA streams, and the CUDA events stored in
 * Common.  Each object is destroyed only if its field is non-NULL, and the
 * field is set to NULL afterwards.  The return values of the destroy calls
 * are not examined.  Without CHOLMOD_HAS_CUDA nothing is done. */

void CHOLMOD(gpu_end)
(
    cholmod_common *Common
)
{
#ifdef CHOLMOD_HAS_CUDA
    int i ;

    /* cuBLAS handle */
    if (Common->cublasHandle != NULL)
    {
        cublasDestroy (Common->cublasHandle) ;
        Common->cublasHandle = NULL ;
    }

    /* streams */
    for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++)
    {
        if (Common->gpuStream [i] == NULL) continue ;
        cudaStreamDestroy (Common->gpuStream [i]) ;
        Common->gpuStream [i] = NULL ;
    }

    /* potrf events */
    for (i = 0 ; i < 3 ; i++)
    {
        if (Common->cublasEventPotrf [i] == NULL) continue ;
        cudaEventDestroy (Common->cublasEventPotrf [i]) ;
        Common->cublasEventPotrf [i] = NULL ;
    }

    /* "buffer free" events */
    for (i = 0 ; i < CHOLMOD_HOST_SUPERNODE_BUFFERS ; i++)
    {
        if (Common->updateCBuffersFree [i] == NULL) continue ;
        cudaEventDestroy (Common->updateCBuffersFree [i]) ;
        Common->updateCBuffersFree [i] = NULL ;
    }

    /* "kernels complete" event */
    if (Common->updateCKernelsComplete != NULL)
    {
        cudaEventDestroy (Common->updateCKernelsComplete) ;
        Common->updateCKernelsComplete = NULL ;
    }
#endif
}

/* ========================================================================== */
/* === cholmod_gpu_allocate ================================================= */
/* ========================================================================== */
/*
 * Allocate both host and device memory needed for GPU computation.
 *
 * Memory allocation is expensive and should be done once and reused for
 * multiple factorizations.
 *
 * When gpu_allocate is called, the requested amount of device (and by
 * association host) memory is computed.  If that amount or more memory has
 * already been allocated, then nothing is done.  (i.e. memory allocation is
 * not reduced.)  If the requested amount is more than is currently allocated
 * then both device and pinned host memory are freed and the new amount
 * allocated.
 *
 * This routine will allocate the minimum of either:
 *
 * maxGpuMemBytes - size of requested device allocation in bytes
 *
 * maxGpuMemFraction - size of requested device allocation as a fraction of
 *                     total GPU memory
 *
 * If both maxGpuMemBytes and maxGpuMemFraction are zero, this will allocate
 * the maximum amount of GPU memory possible.
 *
 * Note that since the GPU driver requires some memory, it is not advisable
 * to request maxGpuMemFraction of 1.0 (which will request all GPU memory and
 * will fail).  If maximum memory is requested then call this routine with
 * both maxGpuMemBytes and maxGpuMemFraction of 0.0.
 *
 */

/* Allocates the pinned host memory pool and the device memory pool recorded
 * in Common.
 *
 * Nothing is done unless Common->useGPU is 1, and nothing is done if
 * Common->dev_mempool is already non-NULL.  The request size is derived from
 * Common->maxGpuMemBytes and Common->maxGpuMemFraction (see the comment
 * above).  The pinned host pool is allocated first; if that fails the
 * request is halved and retried until it succeeds or the size cannot be
 * reduced further.  The device pool is then sized to match the host pool.
 *
 * Always returns 0.  Failures are reported through ERROR with
 * CHOLMOD_GPU_PROBLEM; after a failure no pool is left allocated and both
 * pool sizes are zero. */

int CHOLMOD(gpu_allocate) ( cholmod_common *Common )
{

#ifdef CHOLMOD_HAS_CUDA

    size_t fdm, tdm;    /* free and total device memory from gpu_memorysize */
    size_t requestedDeviceMemory, requestedHostMemory;
    cudaError_t cudaErr;
    size_t maxGpuMemBytes;
    double maxGpuMemFraction;
    /* amount always left free for driver use: 50 MB */
    const size_t driverReserve = ((size_t) 50) * 1024 * 1024 ;

    if (Common->useGPU != 1) return (0) ;

    if (Common->dev_mempool != NULL)
    {
        // memory pool is already allocated
        return (0) ;
    }

    maxGpuMemBytes = Common->maxGpuMemBytes;
    maxGpuMemFraction = Common->maxGpuMemFraction;

    /* ensure valid input: clamp the fraction to [0,1].  maxGpuMemBytes is an
       unsigned size_t here, so it cannot be negative and needs no check. */
    if ( maxGpuMemFraction < 0 ) maxGpuMemFraction = 0;
    if ( maxGpuMemFraction > 1 ) maxGpuMemFraction = 1;

    int err = CHOLMOD(gpu_memorysize) (&tdm,&fdm,Common) ;
    if (err)
    {
        printf ("GPU failure in cholmod_gpu: gpu_memorysize %g %g MB\n",
            ((double) tdm) / (1024*1024),
            ((double) fdm) / (1024*1024)) ;
        ERROR (CHOLMOD_GPU_PROBLEM, "gpu memorysize failure\n") ;
        /* fdm is zero on failure; sizing a request from it would underflow */
        return (0) ;
    }

    /* compute the amount of device memory requested */
    if ( maxGpuMemBytes == 0 && maxGpuMemFraction == 0 ) {
        /* no specific request - take all available GPU memory
         *  (other programs could already have allocated some GPU memory,
         *  possibly even previous calls to gpu_allocate).  Always leave
         *  50 MB free for driver use.  The subtraction is guarded so that
         *  an unsigned underflow cannot turn into a huge request. */
        size_t candidate = fdm + Common->dev_mempool_size ;
        requestedDeviceMemory = (candidate > driverReserve) ?
            (candidate - driverReserve) : 0 ;
    }
    else if ( maxGpuMemBytes > 0 && maxGpuMemFraction > 0 ) {
        /* both byte and fraction limits - take the lowest of the two */
        requestedDeviceMemory = maxGpuMemBytes;
        if ( requestedDeviceMemory > tdm*maxGpuMemFraction ) {
            requestedDeviceMemory = tdm*maxGpuMemFraction;
        }
    }
    else if ( maxGpuMemFraction > 0 ) {
        /* just a fraction requested */
        requestedDeviceMemory = maxGpuMemFraction * tdm;
    }
    else {
        /* specific number of bytes requested */
        requestedDeviceMemory = maxGpuMemBytes;
        if ( maxGpuMemBytes > fdm ) {
            CHOLMOD_GPU_PRINTF ((
                "GPU WARNING: Requested amount of device memory not available\n"
                )) ;
            requestedDeviceMemory = fdm;
        }
    }

    /* do nothing if sufficient memory has already been allocated */
    if ( requestedDeviceMemory <= Common->dev_mempool_size ) {

        /* %zu, not %d: both values are sizes */
        CHOLMOD_GPU_PRINTF (("requested = %zu, mempool = %zu \n",
            (size_t) requestedDeviceMemory,
            (size_t) Common->dev_mempool_size));
        CHOLMOD_GPU_PRINTF (("GPU NOTE:  gpu_allocate did nothing \n"));
        return 0;
    }

    CHOLMOD(gpu_deallocate) (Common);

    /* allocated corresponding pinned host memory */
    requestedHostMemory = requestedDeviceMemory*CHOLMOD_HOST_SUPERNODE_BUFFERS/
        CHOLMOD_DEVICE_SUPERNODE_BUFFERS;

    cudaErr = cudaMallocHost ( (void**)&(Common->host_pinned_mempool),
                               requestedHostMemory );
    while ( cudaErr != cudaSuccess && requestedHostMemory > 1 ) {
        /* insufficient host memory, try again with less.  The loop stops
           once the size cannot be halved any further, so a failure that is
           unrelated to the size cannot spin forever. */
        requestedHostMemory /= 2;
        cudaErr = cudaMallocHost ( (void**)&(Common->host_pinned_mempool),
                                   requestedHostMemory );
    }
    if ( cudaErr != cudaSuccess )
    {
        Common->host_pinned_mempool = NULL;
        Common->host_pinned_mempool_size = 0;
        ERROR (CHOLMOD_GPU_PROBLEM, "pinned host memory allocation failure\n") ;
        return (0);
    }
    Common->host_pinned_mempool_size = requestedHostMemory;

    /* size the device pool to match the host pool that was obtained */
    requestedDeviceMemory = requestedHostMemory*
        CHOLMOD_DEVICE_SUPERNODE_BUFFERS/CHOLMOD_HOST_SUPERNODE_BUFFERS;

    /* Split up the memory allocations into required device buffers.  Each
       buffer size is rounded down to a multiple of 0x20000 bytes. */
    Common->devBuffSize = requestedDeviceMemory/
        (size_t)CHOLMOD_DEVICE_SUPERNODE_BUFFERS;
    Common->devBuffSize -= Common->devBuffSize%0x20000;

    cudaErr = cudaMalloc ( &(Common->dev_mempool), requestedDeviceMemory );
    if (cudaErr != cudaSuccess)
    {
        printf ("GPU failure in cholmod_gpu: requested %g MB\n",
            ((double) requestedDeviceMemory) / (1024*1024)) ;
        /* do not record a pool that does not exist, and release the pinned
           host pool that was allocated for it */
        Common->dev_mempool = NULL;
        Common->dev_mempool_size = 0;
        CHOLMOD(gpu_deallocate) (Common);
        ERROR (CHOLMOD_GPU_PROBLEM, "device memory allocation failure\n") ;
        return (0);
    }

    Common->dev_mempool_size = requestedDeviceMemory;

#endif

    return (0);
}