//------------------------------------------------------------------------------
// GB_subassign_23_template: C += A where C is full
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

#include "GB_unused.h"

#undef  GB_FREE_ALL
#define GB_FREE_ALL                             \
{                                               \
    GB_WERK_POP (A_ek_slicing, int64_t) ;       \
}

{

    //--------------------------------------------------------------------------
    // get inputs
    //--------------------------------------------------------------------------

    #ifdef GB_JIT_KERNEL
    #define A_is_bitmap GB_A_IS_BITMAP
    #define A_is_full   GB_A_IS_FULL
    #define A_iso       GB_A_ISO
    #else
    bool A_is_bitmap = GB_IS_BITMAP (A) ;
    bool A_is_full = GB_IS_FULL (A) ;
    const bool A_iso = A->iso ;
    #endif

    //--------------------------------------------------------------------------
    // slice the A matrix
    //--------------------------------------------------------------------------

    GB_WERK_DECLARE (A_ek_slicing, int64_t) ;
    int A_ntasks, A_nthreads ;

    GB_A_NHELD (anz) ;      // int64_t anz = GB_nnz_held (A) ;
    double work = anz + A->nvec ;

    if (A_is_bitmap || A_is_full)
    {
        // C is full and A is bitmap or full
        A_nthreads = GB_nthreads (work, chunk, nthreads_max) ;
        A_ntasks = 0 ;      // unused
        ASSERT (A_ek_slicing == NULL) ;
    }
    else
    {
        // create tasks to compute over the matrix A
        GB_SLICE_MATRIX_WORK (A, 32, work, anz) ;
        ASSERT (A_ek_slicing != NULL) ;
    }

    //--------------------------------------------------------------------------
    // get C and A
    //--------------------------------------------------------------------------

    ASSERT (!C->iso) ;

    const GB_A_TYPE *restrict Ax = (GB_A_TYPE *) A->x ;
    GB_C_TYPE *restrict Cx = (GB_C_TYPE *) C->x ;

    ASSERT (GB_IS_FULL (C)) ;
    GB_C_NHELD (cnz) ;      // const int64_t cnz = GB_nnz_held (C) ;

    GB_DECLAREY (ywork) ;
    if (A_iso)
    {
        // get the iso value of A and typecast it to Y
        // ywork = (ytype) Ax [0]
        GB_COPY_aij_to_ywork (ywork, Ax, 0, true) ;
    }

    if (A_is_bitmap)
    {

        //----------------------------------------------------------------------
        // C += A when C is full and A is bitmap
        //----------------------------------------------------------------------

        const int8_t *restrict Ab = A->b ;
        int64_t p ;
        #pragma omp parallel for num_threads(A_nthreads) schedule(static)
        for (p = 0 ; p < cnz ; p++)
        {
            if (!Ab [p]) continue ;
            // Cx [p] += (ytype) Ax [p], with typecasting
            GB_ACCUMULATE_aij (Cx, p, Ax, p, A_iso, ywork) ;
        }

    }
    else if (A_is_full)
    {

        //----------------------------------------------------------------------
        // C += A when both C and A are full
        //----------------------------------------------------------------------

        int64_t p ;
        #pragma omp parallel for num_threads(A_nthreads) schedule(static)
        for (p = 0 ; p < cnz ; p++)
        {
            // Cx [p] += (ytype) Ax [p], with typecasting
            GB_ACCUMULATE_aij (Cx, p, Ax, p, A_iso, ywork) ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // C += A when C is full and A is sparse
        //----------------------------------------------------------------------

        ASSERT (GB_JUMBLED_OK (A)) ;
        const int64_t *restrict Ap = A->p ;
        const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;
        const int64_t avlen = A->vlen ;
        const int64_t cvlen = C->vlen ;
        bool A_jumbled = A->jumbled ;

        const int64_t *restrict kfirst_Aslice = A_ek_slicing ;
        const int64_t *restrict klast_Aslice  = kfirst_Aslice + A_ntasks ;
        const int64_t *restrict pstart_Aslice = klast_Aslice + A_ntasks ;
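
        // A_ek_slicing holds these three arrays back-to-back:
        // kfirst_Aslice [taskid] and klast_Aslice [taskid] give the first and
        // last vectors of A assigned to each task, and pstart_Aslice gives the
        // entry offsets at which a task starts within its first and last
        // vectors (used by GB_GET_PA below), so a single long vector of A can
        // be shared by several tasks.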
        int taskid ;
        #pragma omp parallel for num_threads(A_nthreads) schedule(dynamic,1)
        for (taskid = 0 ; taskid < A_ntasks ; taskid++)
        {

            // if kfirst > klast then taskid does no work at all
            int64_t kfirst = kfirst_Aslice [taskid] ;
            int64_t klast  = klast_Aslice  [taskid] ;

            //------------------------------------------------------------------
            // C(:,kfirst:klast) += A(:,kfirst:klast)
            //------------------------------------------------------------------

            for (int64_t k = kfirst ; k <= klast ; k++)
            {

                //--------------------------------------------------------------
                // find the part of A(:,k) and C(:,k) for this task
                //--------------------------------------------------------------

                int64_t j = GBH_A (Ah, k) ;
                int64_t pA_start = GBP_A (Ap, k, avlen) ;
                int64_t pA_end   = GBP_A (Ap, k+1, avlen) ;
                GB_GET_PA (my_pA_start, my_pA_end, taskid, k, kfirst, klast,
                    pstart_Aslice, pA_start, pA_end) ;
                bool ajdense = ((pA_end - pA_start) == cvlen) ;

                // pC points to the start of C(:,j)
                int64_t pC = j * cvlen ;

                //--------------------------------------------------------------
                // C(:,j) += A(:,j)
                //--------------------------------------------------------------

                if (ajdense && !A_jumbled)
                {

                    //----------------------------------------------------------
                    // A(:,j) is dense
                    //----------------------------------------------------------

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t pA = my_pA_start ; pA < my_pA_end ; pA++)
                    {
                        int64_t i = pA - pA_start ;
                        int64_t p = pC + i ;
                        // Cx [p] += (ytype) Ax [pA], with typecasting
                        GB_ACCUMULATE_aij (Cx, p, Ax, pA, A_iso, ywork) ;
                    }

                }
                else
                {

                    //----------------------------------------------------------
                    // A(:,j) is sparse
                    //----------------------------------------------------------

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t pA = my_pA_start ; pA < my_pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        int64_t p = pC + i ;
                        // Cx [p] += (ytype) Ax [pA], with typecasting
                        GB_ACCUMULATE_aij (Cx, p, Ax, pA, A_iso, ywork) ;
                    }
                }
            }
        }
    }

    GB_FREE_ALL ;
}

#undef  GB_FREE_ALL
#define GB_FREE_ALL ;