//------------------------------------------------------------------------------
// GB_AxB_saxpy4_template: C+=A*B, C is full, A is sparse/hyper, B bitmap/full
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C is full.  A is hyper/sparse, B is bitmap/full.  M is not present.
// C += A*B is computed with the accumulator identical to the monoid.

// This template is used by Template/GB_AxB_saxpy4_meta.  It is not used
// for the generic case, nor for the ANY_PAIR case.  It is only used for the
// pre-generated kernels, and for the JIT.

#ifndef GB_B_SIZE
#define GB_B_SIZE sizeof (GB_B_TYPE)
#endif

#ifndef GB_C_SIZE
#define GB_C_SIZE sizeof (GB_C_TYPE)
#endif

{

    if (use_coarse_tasks)
    {

        //----------------------------------------------------------------------
        // C += A*B using coarse tasks
        //----------------------------------------------------------------------

        int64_t cvlenx = cvlen * GB_C_SIZE ;

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vectors of B and C for this coarse task
            //------------------------------------------------------------------

            int64_t jstart, jend ;
            GB_PARTITION (jstart, jend, bvdim, tid, ntasks) ;
            int64_t jtask = jend - jstart ;
            int64_t jpanel = GB_IMIN (jtask, GB_SAXPY4_PANEL_SIZE) ;

            //------------------------------------------------------------------
            // get the workspace for this task
            //------------------------------------------------------------------

            // Hx workspace to compute the panel of C
            GB_C_TYPE *restrict Hx = (GB_C_TYPE *)
                (Wcx + H_slice [tid] * cvlenx) ;

            //------------------------------------------------------------------
            // C(:,jstart:jend-1) += A * B(:,jstart:jend-1) by panel
            //------------------------------------------------------------------

            for (int64_t j1 = jstart ; j1 < jend ; j1 += jpanel)
            {

                //--------------------------------------------------------------
                // get the panel of np vectors j1:j2-1
                //--------------------------------------------------------------

                int64_t j2 = GB_IMIN (jend, j1 + jpanel) ;
                int64_t np = j2 - j1 ;

                //--------------------------------------------------------------
                // G = B(:,j1:j2-1), of size bvlen-by-np, in column major order
                //--------------------------------------------------------------

                int8_t *restrict Gb = (int8_t *) (Bb + (j1 * bvlen)) ;
                GB_B_TYPE *restrict Gx = (GB_B_TYPE *)
                    (((GB_void *) (B->x)) +
                     (B_iso ? 0 : ((j1 * bvlen) * GB_B_SIZE))) ;

                //--------------------------------------------------------------
                // clear the panel H to compute C(:,j1:j2-1)
                //--------------------------------------------------------------

                if (np == 1)
                {
                    // Make H an alias to C(:,j1)
                    int64_t j = j1 ;
                    int64_t pC_start = j * cvlen ;    // get pointer to C(:,j)
                    // Hx is GB_C_TYPE, not GB_void, so pointer arithmetic on
                    // it is by units of size sizeof (GB_C_TYPE), not bytes.
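                    // For example, if GB_C_TYPE is double, the assignment
                    // below advances Hx by pC_start doubles, which is
                    // pC_start * sizeof (double) bytes past Cx, landing
                    // exactly at the start of C(:,j) in the full matrix C.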
                    Hx = Cx + pC_start ;
                }
                else
                {
                    // C is full: set Hx = identity
                    int64_t nc = np * cvlen ;
                    #if GB_HAS_IDENTITY_BYTE
                        memset (Hx, GB_IDENTITY_BYTE, nc * GB_C_SIZE) ;
                    #else
                        for (int64_t i = 0 ; i < nc ; i++)
                        {
                            GB_HX_WRITE (i, zidentity) ;    // Hx(i) = identity
                        }
                    #endif
                }

                #if GB_IS_PLUS_FC32_MONOID
                float  *restrict Hx_real = (float *) Hx ;
                float  *restrict Hx_imag = Hx_real + 1 ;
                #elif GB_IS_PLUS_FC64_MONOID
                double *restrict Hx_real = (double *) Hx ;
                double *restrict Hx_imag = Hx_real + 1 ;
                #endif

                //--------------------------------------------------------------
                // H += A*G for one panel
                //--------------------------------------------------------------

                #undef  GB_B_kj_PRESENT
                #if GB_B_IS_BITMAP
                #define GB_B_kj_PRESENT(b) b
                #else
                #define GB_B_kj_PRESENT(b) 1
                #endif

                #undef  GB_MULT_A_ik_G_kj
                #if ( GB_IS_PAIR_MULTIPLIER && !GB_Z_IS_COMPLEX )
                    // t = A(i,k) * B(k,j) is already #defined as 1
                    #define GB_MULT_A_ik_G_kj(gkj,jj)
                #else
                    // t = A(i,k) * B(k,j)
                    #define GB_MULT_A_ik_G_kj(gkj,jj)                       \
                        GB_CIJ_DECLARE (t) ;                                \
                        GB_MULT (t, aik, gkj, i, k, j1 + jj)
                #endif

                #undef  GB_HX_COMPUTE
                #define GB_HX_COMPUTE(gkj,gb,jj)                            \
                {                                                           \
                    /* H (i,jj) += A(i,k) * B(k,j) */                       \
                    if (GB_B_kj_PRESENT (gb))                               \
                    {                                                       \
                        /* t = A(i,k) * B(k,j) */                           \
                        GB_MULT_A_ik_G_kj (gkj, jj) ;                       \
                        /* Hx(i,jj) += t */                                 \
                        GB_HX_UPDATE (pH+jj, t) ;                           \
                    }                                                       \
                }

                #include "GB_AxB_saxpy4_panel.c"

                //--------------------------------------------------------------
                // C(:,j1:j2-1) += H
                //--------------------------------------------------------------

                if (np == 1)
                {
                    // Hx is already aliased to Cx; no more work to do
                    continue ;
                }

                for (int64_t jj = 0 ; jj < np ; jj++)
                {

                    //----------------------------------------------------------
                    // C(:,j) += H (:,jj)
                    //----------------------------------------------------------

                    int64_t j = j1 + jj ;
                    int64_t pC_start = j * cvlen ;    // get pointer to C(:,j)

                    for (int64_t i = 0 ; i < cvlen ; i++)
                    {
                        int64_t pC = pC_start + i ;     // pointer to C(i,j)
                        int64_t pH = i * np + jj ;      // pointer to H(i,jj)
                        // C(i,j) += H(i,jj)
                        GB_CIJ_GATHER_UPDATE (pC, pH) ;
                    }
                }
            }
        }

    }
    else if (use_atomics)
    {

        //----------------------------------------------------------------------
        // C += A*B using fine tasks and atomics
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.
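            // As a concrete sketch of this mapping (assuming, say,
            // nfine_tasks_per_vector = 4): tasks 0..3 all update C(:,0),
            // each with its own slice of A; tasks 4..7 update C(:,1), etc.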
            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;
            int64_t klast  = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j)
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*

            // for Hx Gustavson workspace: use C(:,j) in-place:
            GB_C_TYPE *restrict Hx = (GB_C_TYPE *)
                (((GB_void *) Cx) + (pC_start * GB_C_SIZE)) ;

            #if !GB_Z_HAS_ATOMIC_UPDATE
            int8_t *restrict Hf = Wf + pC_start ;
            #endif

            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // C(:,j) += A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // C(:,j) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH_A (Ah, kk) ;    // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                #if GB_B_IS_BITMAP
                if (!GBB_B (Bb, pB)) continue ;
                #endif
                int64_t pA = Ap [kk] ;
                int64_t pA_end = Ap [kk+1] ;
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {

                    //----------------------------------------------------------
                    // get A(i,k)
                    //----------------------------------------------------------

                    int64_t i = Ai [pA] ;       // get A(i,k) index

                    //----------------------------------------------------------
                    // C(i,j) += A(i,k) * B(k,j)
                    //----------------------------------------------------------

                    GB_MULT_A_ik_B_kj ;         // t = A(i,k) * B(k,j)

                    #if GB_Z_HAS_ATOMIC_UPDATE
                    {
                        // the monoid has an atomic update
                        GB_Z_ATOMIC_UPDATE_HX (i, t) ;  // C(i,j) += t
                    }
                    #else
                    {
                        // The update must be done in a critical section using
                        // the mutex byte Hf (i,j), located at Hf [i].  If
                        // zero, the mutex is unlocked.
                        int8_t f ;
                        do
                        {
                            // spin until the mutex is acquired, doing this
                            // atomically: { f = Hf [i] ; Hf [i] = 1 ; }
                            GB_ATOMIC_CAPTURE_INT8 (f, Hf [i], 1) ;
                        }
                        while (f == 1) ;
                        GB_Z_ATOMIC_UPDATE_HX (i, t) ;  // C(i,j) += t
                        GB_ATOMIC_WRITE
                        Hf [i] = 0 ;                    // unlock the mutex
                    }
                    #endif
                }
            }
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // C += A*B using fine tasks and workspace, with no atomics
        //----------------------------------------------------------------------

        // Each fine task is given size-cvlen workspace to compute its result
        // in the first phase, W(:,tid) = A(:,k1:k2) * B(k1:k2,j), where k1:k2
        // is defined by the fine_tid of the task.  The workspaces are then
        // summed into C in the second phase.

        //----------------------------------------------------------------------
        // first phase: W (:,tid) = A (:,k1:k2) * B (k1:k2,j) for each fine task
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.
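            // W is held in the Wcx workspace as a cvlen-by-ntasks array,
            // one column per fine task, so this task's column W(:,tid)
            // starts pW_start = tid * cvlen entries into the workspace.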
            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;
            int64_t klast  = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pW_start = tid * cvlen ;    // pointer to W for this thread
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*
            GB_C_TYPE *restrict Hx = (GB_C_TYPE *)
                (Wcx + (pW_start * GB_C_SIZE)) ;

            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // clear the panel
            //------------------------------------------------------------------

            // Hx = identity
            #if GB_HAS_IDENTITY_BYTE
            {
                memset (Hx, GB_IDENTITY_BYTE, cvlen * GB_C_SIZE) ;
            }
            #else
            {
                for (int64_t i = 0 ; i < cvlen ; i++)
                {
                    GB_HX_WRITE (i, zidentity) ;    // Hx(i) = identity
                }
            }
            #endif

            //------------------------------------------------------------------
            // W = A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // W(:,tid) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH_A (Ah, kk) ;    // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                #if GB_B_IS_BITMAP
                if (!GBB_B (Bb, pB)) continue ;
                #endif
                int64_t pA = Ap [kk] ;
                int64_t pA_end = Ap [kk+1] ;
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {
                    int64_t i = Ai [pA] ;       // get A(i,k) index
                    // W(i,tid) += A(i,k) * B(k,j)
                    GB_MULT_A_ik_B_kj ;         // t = A(i,k)*B(k,j)
                    GB_HX_UPDATE (i, t) ;       // Hx(i) += t
                }
            }
        }

        //----------------------------------------------------------------------
        // second phase: C += reduce (W)
        //----------------------------------------------------------------------

        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the W and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(i1:i2,j) and W(i1:i2,w1:w2), where
            // i1:i2 is defined by the fine task id.  Its fine task id ranges
            // from 0 to nfine_tasks_per_vector-1.
            // w1:w2 are the updates to C(:,j), where w1:w2 =
            // [j*nfine_tasks_per_vector : (j+1)*nfine_tasks_per_vector-1].
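            // For example (assuming nfine_tasks_per_vector = 4 and j = 2):
            // wstart = 8 and wend = 12, so this task sums the four columns
            // W(istart:iend-1, 8:11) into C(istart:iend-1, 2).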
            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t istart, iend ;
            GB_PARTITION (istart, iend, cvlen, fine_tid,
                nfine_tasks_per_vector) ;
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j)
            int64_t wstart = j * nfine_tasks_per_vector ;
            int64_t wend = (j + 1) * nfine_tasks_per_vector ;

            // Hx = (typecasted) Wcx workspace
            GB_C_TYPE *restrict Hx = ((GB_C_TYPE *) Wcx) ;
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // C(i1:i2,j) += reduce (W (i1:i2, wstart:wend))
            //------------------------------------------------------------------

            for (int64_t w = wstart ; w < wend ; w++)
            {

                //--------------------------------------------------------------
                // C(i1:i2,j) += W (i1:i2,w)
                //--------------------------------------------------------------

                int64_t pW_start = w * cvlen ;  // pointer to W (:,w)

                for (int64_t i = istart ; i < iend ; i++)
                {
                    int64_t pW = pW_start + i ; // pointer to W(i,w)
                    int64_t pC = pC_start + i ; // pointer to C(i,j)
                    // C(i,j) += W(i,w)
                    GB_CIJ_GATHER_UPDATE (pC, pW) ;
                }
            }
        }
    }
}