//------------------------------------------------------------------------------
// GraphBLAS/CUDA/GB_cuda_reduce_to_scalar: reduce on the GPU with a monoid
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Reduce a matrix A to a scalar s, or to a smaller matrix V if the GPU was
// only able to do a partial reduction.  This case occurs if the GPU cannot
// do an atomic update for the monoid.  To handle this case, the GPU returns
// a full GrB_Matrix V, of size gridsize-by-1, with one entry per threadblock.
// Then GB_reduce_to_scalar on the CPU sees this V as the result, and calls
// itself recursively to continue the reduction.

#define GB_FREE_ALL                                         \
{                                                           \
    GB_FREE_WORK (&zscalar, zscalar_size) ;                 \
    GB_Matrix_free (&V) ;                                   \
    if (stream != nullptr) cudaStreamDestroy (stream) ;     \
    stream = nullptr ;                                      \
}

#include "GB_cuda_reduce.hpp"

GrB_Info GB_cuda_reduce_to_scalar
(
    // output:
    GB_void *s,                 // note: statically allocated on CPU stack;
                                // if the result is in s then V is NULL.
    GrB_Matrix *V_handle,       // partial result if unable to reduce to
                                // scalar; NULL if result is in s.
    // input:
    const GrB_Monoid monoid,
    const GrB_Matrix A
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GB_void *zscalar = NULL ;
    size_t zscalar_size = 0 ;
    GrB_Matrix V = NULL ;
    (*V_handle) = NULL ;
    GrB_Info info = GrB_SUCCESS ;
    cudaStream_t stream = nullptr ;

    //--------------------------------------------------------------------------
    // create the stream
    //--------------------------------------------------------------------------

    // FIXME: use the stream pool
    CUDA_OK (cudaStreamCreate (&stream)) ;

    //--------------------------------------------------------------------------
    // determine problem characteristics and allocate workspace
    //--------------------------------------------------------------------------

    int threads_per_block = 320 ;
    int work_per_thread = 256 ;
    // int number_of_sms = GB_Global_gpu_sm_get (0) ;

    GrB_Type ztype = monoid->op->ztype ;
    size_t zsize = ztype->size ;

    // determine kernel launch geometry
    int64_t anvals = GB_nnz_held (A) ;
    int blocksz = threads_per_block ;
    int gridsz =
        // FIXME: this is a lot of blocks.  Use a smaller number (cap at,
        // say, 64K), to simplify the non-atomic reductions
        (anvals + work_per_thread*threads_per_block - 1) /
        (work_per_thread*threads_per_block) ;
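    // Illustrative example of the launch geometry above (the matrix size is
    // hypothetical, not taken from this file): each threadblock covers
    // work_per_thread * threads_per_block = 256 * 320 = 81920 entries, so a
    // matrix with anvals = 1,000,000 entries held gives
    // gridsz = ceil (1000000 / 81920) = 13 blocks of 320 threads each.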
    // FIXME: GB_enumify_reduce is called twice: here (to get
    // has_cheeseburger) and in GB_cuda_reduce_to_scalar_jit.  Can we just
    // call it once?  One solution: the code from here to the call to
    // GB_cuda_reduce_to_scalar_jit could be added to the
    // GB_cuda_reduce_to_scalar_jit function itself.

    uint64_t rcode ;
    GB_enumify_reduce (&rcode, monoid, A) ;
    bool has_cheeseburger = GB_RSHIFT (rcode, 27, 1) ;
    GBURBLE ("has_cheeseburger %d\n", has_cheeseburger) ;

    // determine the kind of reduction: partial (to V), or complete (to the
    // scalar output s)

    if (has_cheeseburger)
    {
        // the kernel launch can reduce A to zscalar all by itself;
        // allocate and initialize zscalar (upscaling it to at least 32 bits)
        size_t zscalar_space = GB_IMAX (zsize, sizeof (uint32_t)) ;
        zscalar = GB_MALLOC (zscalar_space, GB_void, &zscalar_size) ;
        if (zscalar == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }
        GB_cuda_upscale_identity (zscalar, monoid) ;
    }
    else
    {
        // allocate a full GrB_Matrix V for the partial result, of size
        // gridsz-by-1, and of type ztype.  V is allocated but not
        // initialized.
        GB_OK (GB_new_bix (&V, ztype, gridsz, 1, GB_Ap_null, true,
            GxB_FULL, false, 0, -1, gridsz, true, false)) ;
    }

    GBURBLE ("(cuda reduce launch %d threads in %d blocks)",
        blocksz, gridsz) ;

    //--------------------------------------------------------------------------
    // reduce A to a scalar via the CUDA JIT
    //--------------------------------------------------------------------------

    // the final call will look like this:
    // GB_OK (GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A,
    //     stream, gridsz, blocksz)) ;

    // debugging for now, to fail early if the CUDA JIT kernel fails to
    // compile, load, or run:
    info = GB_cuda_reduce_to_scalar_jit (zscalar, V, monoid, A,
        stream, gridsz, blocksz) ;
    if (info == GrB_NO_VALUE) info = GrB_PANIC ;
    GB_OK (info) ;

    //--------------------------------------------------------------------------
    // return result and destroy the stream
    //--------------------------------------------------------------------------

    CUDA_OK (cudaStreamSynchronize (stream)) ;

    if (has_cheeseburger)
    {
        // return the scalar result: s = zscalar (but only the first zsize
        // bytes of it)
        memcpy (s, zscalar, zsize) ;
        GB_FREE_WORK (&zscalar, zscalar_size) ;
    }
    else
    {
        // return the partial reduction
        (*V_handle) = V ;
    }

    CUDA_OK (cudaStreamDestroy (stream)) ;
    return (GrB_SUCCESS) ;
}
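//------------------------------------------------------------------------------
// caller-side sketch
//------------------------------------------------------------------------------

// A minimal sketch (in a comment, like the "final call" above) of how a
// caller consumes the two possible outcomes.  The buffer size and control
// flow here are illustrative assumptions; the real handling is in
// GB_reduce_to_scalar on the CPU:
//
//      GB_void s [256] ;               // assumed large enough for any ztype
//      GrB_Matrix V = NULL ;
//      GB_OK (GB_cuda_reduce_to_scalar (s, &V, monoid, A)) ;
//      if (V == NULL)
//      {
//          // complete reduction: the scalar result is in s
//      }
//      else
//      {
//          // partial reduction: V is gridsz-by-1 with one entry per
//          // threadblock; continue reducing V (recursively, on the CPU)
//          // to obtain s, then free V
//          GB_Matrix_free (&V) ;
//      }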