//------------------------------------------------------------------------------
// GraphBLAS/CUDA/JitKernels/GB_cuda_jit_AxB_dot3_phase2end.cuh
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2024, All Rights Reserved.
// This file: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// GB_cuda_AxB_dot3_phase2end_kernel: fill the global buckets
//------------------------------------------------------------------------------

__global__ void GB_cuda_AxB_dot3_phase2end_kernel
(
    // inputs, not modified:
    const int64_t *__restrict__ nanobuckets,    // array of size
                                        // NBUCKETS-by-blockDim.x-by-nblocks
    const int64_t *__restrict__ blockbucket,    // global bucket count, of size
                                        // NBUCKETS*nblocks
    const int64_t *__restrict__ bucketp,        // global bucket cumsum,
                                        // of size NBUCKETS+1
    // output:
    int64_t *__restrict__ bucket,               // global buckets, of size
                                        // cnz == mnz
    // inputs, not modified:
    const int64_t *__restrict__ offset,         // global offsets for each bucket
    // input/output:
    const GrB_Matrix C,                         // output matrix; this kernel
                                        // updates C->i in place
    // input, not modified:
    const int64_t cnz                           // number of entries in C and M
)
{

    //--------------------------------------------------------------------------
    // get C information
    //--------------------------------------------------------------------------

    // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a
    // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector
    // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and where
    // bucket is the bucket assignment for C(i,j).  This phase does not need
    // k, just the bucket for each entry C(i,j).

    int64_t *__restrict__ Ci = C->i ;   // for zombies, or bucket assignment

    //--------------------------------------------------------------------------
    // load and shift the nanobuckets for this thread block
    //--------------------------------------------------------------------------

    // The taskbucket for this threadblock is an array of size
    // NBUCKETS-by-blockDim.x, held by row.  It forms a 2D array within the 3D
    // nanobuckets array.

    const int64_t *taskbucket = nanobuckets +
        blockIdx.x * (NBUCKETS * blockDim.x) ;

    // Each thread in this threadblock owns one column of this taskbucket, for
    // its set of NBUCKETS nanobuckets.  The nanobuckets are a column of
    // length NBUCKETS, with stride equal to blockDim.x.

    const int64_t *nanobucket = taskbucket + threadIdx.x ;

    // Each thread loads its NBUCKETS nanobucket values into registers.

    int64_t my_bucket [NBUCKETS] ;

    #pragma unroll
    for (int b = 0 ; b < NBUCKETS ; b++)
    {
        my_bucket [b] = nanobucket [b * blockDim.x]
                      + blockbucket [b * gridDim.x + blockIdx.x]
                      + bucketp [b] ;
    }

    // Now each thread has an index into the global set of NBUCKETS buckets,
    // held in bucket, of where to place its own entries.
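
    // A small worked illustration (hypothetical values, not taken from the
    // library): with NBUCKETS = 3, gridDim.x = 2 threadblocks, and
    // blockDim.x = 4 threads, consider bucket b = 1 as seen by thread 2 of
    // threadblock 1.  Assuming the earlier phases leave exclusive cumulative
    // sums in these arrays, the three addends of my_bucket [1] are:
    //
    //      bucketp [1]                     where bucket 1 starts in the
    //                                      global bucket array
    //      blockbucket [1*gridDim.x + 1]   entries that threadblock 0
    //                                      contributes to bucket 1
    //      nanobucket [1*blockDim.x]       entries that threads 0 and 1 of
    //                                      this threadblock contribute to
    //                                      bucket 1
    //
    // Their sum is the first free slot in bucket 1 owned by this thread; it
    // is post-incremented once for each entry placed in the loops below.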
    //--------------------------------------------------------------------------
    // construct the global buckets
    //--------------------------------------------------------------------------

    // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and
    // C, which is the part of C operated on by this threadblock.

    // FIXME: why is bucket_idx needed?
    __shared__ int64_t bucket_idx [chunk_size] ;

    for (int64_t pfirst = blockIdx.x << log2_chunk_size ;
                 pfirst < cnz ;
                 pfirst += gridDim.x << log2_chunk_size)
    {
        // this threadblock handles entries pfirst:plast-1 of this chunk
        int64_t plast = pfirst + chunk_size ;
        plast = GB_IMIN (plast, cnz) ;

        // first pass: record where each entry C(i,j) goes in the global
        // bucket array, and clear the bucket tag of entries in bucket 0
        for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x)
        {
            // get the entry C(i,j), and extract its bucket.  Then place the
            // entry C(i,j) in the global bucket it belongs to.

            // TODO: these writes to global are not coalesced.  Instead: each
            // threadblock could buffer its writes to NBUCKETS buffers and
            // when the buffers are full they can be written to global.

            int tid = p - pfirst ;
            int ibucket = Ci [p] & 0xF ;
            bucket_idx [tid] = my_bucket [ibucket]++ ;

            // branch-free form of:  if (ibucket == 0) Ci [p] >>= 4 ;
            // entries in any other bucket leave Ci [p] unchanged
            Ci [p] = (ibucket == 0) * (Ci [p] >> 4) + (ibucket > 0) * Ci [p] ;
        }

        // FIXME: can't this be merged with the loop above?  Or is it a
        // partial implementation of a coalesced write to the global bucket
        // array?

        // second pass: scatter the entry positions into the global buckets
        for (int64_t p = pfirst + threadIdx.x ; p < plast ; p += blockDim.x)
        {
            int tid = p - pfirst ;
            bucket [bucket_idx [tid]] = p ;
        }
    }
}
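
//------------------------------------------------------------------------------
// launch sketch (illustration only)
//------------------------------------------------------------------------------

// A minimal launch sketch, not part of the library: nblocks, blockdim, and
// stream below are assumed names for this illustration.  The kernel must be
// launched with the same gridDim.x and blockDim.x used to build nanobuckets
// and blockbucket, so the per-thread and per-threadblock offsets computed
// above line up with those arrays:
//
//      dim3 grid (nblocks) ;
//      dim3 block (blockdim) ;
//      GB_cuda_AxB_dot3_phase2end_kernel <<<grid, block, 0, stream>>>
//          (nanobuckets, blockbucket, bucketp, bucket, offset, C, cnz) ;
//      cudaStreamSynchronize (stream) ;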