// =============================================================================
// === GPUQREngine/Source/GPUQREngine_ExpertDense.cpp ==========================
// =============================================================================

// GPUQREngine, Copyright (c) 2013, Timothy A Davis, Sencer Nuri Yeralan,
// and Sanjay Ranka.  All Rights Reserved.
// SPDX-License-Identifier: GPL-2.0+

//------------------------------------------------------------------------------
//
// This file contains the dense GPUQREngine wrapper that finds the staircase,
// makes a copy of the user's front data, then calls down into the Internal
// GPUQREngine factorization routine.
//
// Other functions include:
//  - GPUQREngine_Cleanup:       Cleans up relevant workspaces in the dense
//                               factorization depending on how we're exiting.
//  - GPUQREngine_FindStaircase: Finds the staircase for a front and returns
//                               the staircase as an Int* list
// =============================================================================


#ifdef SPQR_HAS_CUDA

#include "GPUQREngine_Internal.hpp"

template <typename Int>
QREngineResultCode GPUQREngine_Cleanup
(
    QREngineResultCode code,    // The result code that we're exiting with
    Front <Int> *userFronts,          // The user-provided list of fronts
    Front <Int> *fronts,              // The internal copy of the user's fronts
    Int numFronts,              // The number of fronts to be factorized
    Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
    Workspace *wsMongoR         // Pointer to the total CPU R workspace
);

template QREngineResultCode GPUQREngine_Cleanup
(
    QREngineResultCode code,    // The result code that we're exiting with
    Front <int32_t> *userFronts,          // The user-provided list of fronts
    Front <int32_t> *fronts,              // The internal copy of the user's fronts
    int32_t numFronts,              // The number of fronts to be factorized
    Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
    Workspace *wsMongoR         // Pointer to the total CPU R workspace
);
template QREngineResultCode GPUQREngine_Cleanup
(
    QREngineResultCode code,    // The result code that we're exiting with
    Front <int64_t> *userFronts,          // The user-provided list of fronts
    Front <int64_t> *fronts,              // The internal copy of the user's fronts
    int64_t numFronts,              // The number of fronts to be factorized
    Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
    Workspace *wsMongoR         // Pointer to the total CPU R workspace
);

template <typename Int>
QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front <Int> *userFronts,      // The list of fronts to factorize
    Int numFronts,          // The number of fronts to factorize
    QREngineStats <Int> *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
)
{
    /* Allocate workspaces */
    Front  <Int> *fronts = (Front <Int> *) SuiteSparse_calloc(numFronts, sizeof(Front <Int>));
    if(!fronts)
    {
        return QRENGINE_OUTOFMEMORY;
    }

    size_t FSize, RSize;
    FSize = RSize = 0;
    for(int f=0; f<numFronts; f++)
    {
        /* Configure the front */
        Front <Int> *userFront = &(userFronts[f]);
        Int m = userFront->fm;
        Int n = userFront->fn;
        Front <Int> *front = new (&fronts[f]) Front <Int> (f, EMPTY, m, n);
        FSize += front->getNumFrontValues();
        RSize += front->getNumRValues();
    }

    // We have to allocate page-locked CPU-GPU space to leverage asynchronous
    // memory transfers.  This has to be done in a way that the CUDA driver is
    // aware of, which unfortunately means making a copy of the user input.

    // calloc pagelocked space on CPU, and calloc space on the GPU
    Workspace *wsMongoF = Workspace::allocate(FSize,    // CPU and GPU
        sizeof(double), true, true, true, true);

    // calloc pagelocked space on the CPU.  Nothing on the GPU
    Workspace *wsMongoR = Workspace::allocate(RSize,    // CPU
        sizeof(double), true, true, false, true);

    /* Cleanup and return if we ran out of memory. */
    if(!wsMongoF || !wsMongoR)
    {
        return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Prepare the fronts for GPU execution. */
    size_t FOffset, ROffset;
    FOffset = ROffset = 0;
    for(int f=0; f<numFronts; f++)
    {
        // Set the front pointers; make the copy from user data into front data.
        Front <Int> *front = &(fronts[f]);
        front->F    = CPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->gpuF = GPU_REFERENCE(wsMongoF, double*) + FOffset;
        front->cpuR = CPU_REFERENCE(wsMongoR, double*) + ROffset;
        FOffset += front->getNumFrontValues();
        ROffset += front->getNumRValues();

        /* COPY USER DATA (user's F to our F) */
        Front <Int> *userFront = &(userFronts[f]);
        double *userF = userFront->F;
        double *F = front->F;
        Int m = userFront->fm;
        Int n = userFront->fn;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<m; i++)
        {
            for(Int j=0; j<n; j++)
            {
                F[i*n+j] = (isColMajor ? userF[j*ldn+i] : userF[i*ldn+j]);
            }
        }

        /* Attach either the user-specified Stair, or compute it. */
        front->Stair = userFront->Stair;
        if(!front->Stair) front->Stair = GPUQREngine_FindStaircase(front);

        /* Cleanup and return if we ran out of memory building the staircase */
        if(!front->Stair)
        {
            return GPUQREngine_Cleanup (QRENGINE_OUTOFMEMORY,
                userFronts, fronts, numFronts, wsMongoF, wsMongoR);
        }
    }

    /* Transfer the fronts to the GPU. */
    if(!wsMongoF->transfer(cudaMemcpyHostToDevice))
    {
        return GPUQREngine_Cleanup (QRENGINE_GPUERROR,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* Do the factorization for this set of fronts. */
    QREngineResultCode result = GPUQREngine_Internal <Int>(gpuMemorySize, fronts,
        numFronts, NULL, NULL, NULL, stats);
    if(result != QRENGINE_SUCCESS)
    {
        return GPUQREngine_Cleanup (result,
            userFronts, fronts, numFronts, wsMongoF, wsMongoR);
    }

    /* COPY USER DATA (our R back to user's R) */
    for(int f=0; f<numFronts; f++)
    {
        Front <Int> *userFront = &(userFronts[f]);
        double *R = (&fronts[f])->cpuR;
        double *userR = userFront->cpuR;
        Int m = userFront->fm;
        Int n = userFront->fn;
        Int rank = userFront->rank;
        bool isColMajor = userFront->isColMajor;
        Int ldn = userFront->ldn;
        for(Int i=0; i<rank; i++)
        {
            for(Int j=0; j<n; j++)
            {
                userR[i*ldn+j] = (isColMajor ? R[j*n+i] : R[i*n+j]);
            }
        }
    }

    /* Return that the factorization was successful. */
    return GPUQREngine_Cleanup (QRENGINE_SUCCESS,
        userFronts, fronts, numFronts, wsMongoF, wsMongoR);
}

template QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front <int32_t> *userFronts,      // The list of fronts to factorize
    int32_t numFronts,          // The number of fronts to factorize
    QREngineStats <int32_t> *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
) ;
template QREngineResultCode GPUQREngine
(
    size_t gpuMemorySize,   // The total available GPU memory size in bytes
    Front <int64_t> *userFronts,      // The list of fronts to factorize
    int64_t numFronts,          // The number of fronts to factorize
    QREngineStats <int64_t> *stats    // An optional parameter. If present, statistics
                            // are collected and passed back to the caller
                            // via this struct
) ;

template <typename Int>
QREngineResultCode GPUQREngine_Cleanup
(
    QREngineResultCode code,    // The result code that we're exiting with
    Front <Int> *userFronts,          // The user-provided list of fronts
    Front <Int> *fronts,              // The internal copy of the user's fronts
    Int numFronts,              // The number of fronts to be factorized
    Workspace *wsMongoF,        // Pointer to the total GPU Front workspace
    Workspace *wsMongoR         // Pointer to the total CPU R workspace
)
{
    /* Cleanup fronts. */
    for(int f=0; f<numFronts; f++)
    {
        Front <Int> *userFront = (&userFronts[f]);
        Front <Int> *front = &(fronts[f]);
        if(front != NULL)
        {
            /* If we had to attach our own stair, clean it up. */
            if(userFront->Stair == NULL && front->Stair != NULL)
            {
                front->Stair = (Int *) SuiteSparse_free(front->Stair);
            }

            /* Detach front data since it's managed by the mongo. */
            front->F = NULL;
        }
    }
    fronts = (Front <Int> *) SuiteSparse_free(fronts);

    /* Free the mongo structures. Note that Workspace checks for NULL. */
    wsMongoF = Workspace::destroy(wsMongoF);
    wsMongoR = Workspace::destroy(wsMongoR);

    return code;
}

template <typename Int>
Int *GPUQREngine_FindStaircase
(
    Front <Int> *front                // The front whose staircase we are computing
)
{
    Int fm = front->fm;
    Int fn = front->fn;

    double *F = front->F;
    Int *Stair = (Int*) SuiteSparse_malloc(fn, sizeof(Int));
    if(!F || !Stair) return NULL;

    Int lastStair = 0;
    for(int j=0; j<fn; j++)
    {
        int i;
        for(i=fm-1; i>lastStair && F[i*fn+j] == 0.0; i--);
        Stair[j] = lastStair = i;
    }

    return Stair;
}

template int32_t *GPUQREngine_FindStaircase
(
    Front <int32_t> *front                // The front whose staircase we are computing
) ;
template int64_t *GPUQREngine_FindStaircase
(
    Front <int64_t> *front                // The front whose staircase we are computing
) ;
#endif