//------------------------------------------------------------------------------
// GB_convert_bitmap_worker: construct triplets or CSC/CSR from bitmap
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// JIT: needed.

// If A is iso and Ax_new is not NULL, the iso scalar is expanded into the
// non-iso array Ax_new.  Otherwise, if Ax_new and Ax are NULL then no values
// are extracted.

// TODO allow this function to do typecasting.

#include "GB.h"
#include "GB_partition.h"
#include "GB_unused.h"

GrB_Info GB_convert_bitmap_worker   // extract CSC/CSR or triplets from bitmap
(
    // outputs:
    int64_t *restrict Ap,           // vector pointers for CSC/CSR form
    int64_t *restrict Ai,           // indices for CSC/CSR or triplet form
    int64_t *restrict Aj,           // vector indices for triplet form
    GB_void *restrict Ax_new,       // values for CSC/CSR or triplet form
    int64_t *anvec_nonempty,        // # of non-empty vectors
    // inputs: not modified
    const GrB_Matrix A,             // matrix to extract; not modified
    GB_Werk Werk
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (GB_IS_BITMAP (A)) ;
    ASSERT (Ap != NULL) ;           // must be provided on input, size avdim+1

    int64_t *restrict W = NULL ; size_t W_size = 0 ;
    const int64_t avdim = A->vdim ;
    const int64_t avlen = A->vlen ;
    const size_t asize = A->type->size ;

    //--------------------------------------------------------------------------
    // count the entries in each vector
    //--------------------------------------------------------------------------

    const int8_t *restrict Ab = A->b ;

    int nthreads_max = GB_Context_nthreads_max ( ) ;
    double chunk = GB_Context_chunk ( ) ;
    int nthreads = GB_nthreads (avlen*avdim, chunk, nthreads_max) ;
    bool by_vector = (nthreads <= avdim) ;

    if (by_vector)
    {

        //----------------------------------------------------------------------
        // compute all vectors in parallel (no workspace)
        //----------------------------------------------------------------------

        int64_t j ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (j = 0 ; j < avdim ; j++)
        {
            // ajnz = nnz (A (:,j))
            int64_t ajnz = 0 ;
            int64_t pA_start = j * avlen ;
            for (int64_t i = 0 ; i < avlen ; i++)
            { 
                // see if A(i,j) is present in the bitmap
                int64_t p = i + pA_start ;
                ajnz += Ab [p] ;
                ASSERT (Ab [p] == 0 || Ab [p] == 1) ;
            }
            Ap [j] = ajnz ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // compute blocks of rows in parallel
        //----------------------------------------------------------------------

        // allocate one row of W per thread, each row of length avdim
        W = GB_MALLOC_WORK (nthreads * avdim, int64_t, &W_size) ;
        if (W == NULL)
        {
            // out of memory
            return (GrB_OUT_OF_MEMORY) ;
        }

        int taskid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (taskid = 0 ; taskid < nthreads ; taskid++)
        {
            int64_t *restrict Wtask = W + taskid * avdim ;
            int64_t istart, iend ;
            GB_PARTITION (istart, iend, avlen, taskid, nthreads) ;
            for (int64_t j = 0 ; j < avdim ; j++)
            {
                // ajnz = nnz (A (istart:iend-1,j))
                int64_t ajnz = 0 ;
                int64_t pA_start = j * avlen ;
                for (int64_t i = istart ; i < iend ; i++)
                { 
                    // see if A(i,j) is present in the bitmap
                    int64_t p = i + pA_start ;
                    ajnz += Ab [p] ;
                    ASSERT (Ab [p] == 0 || Ab [p] == 1) ;
                }
                Wtask [j] = ajnz ;
            }
        }

        // cumulative sum to compute nnz(A(:,j)) for each vector j
        int64_t j ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (j = 0 ; j < avdim ; j++)
        {
            int64_t ajnz = 0 ;
            for (int taskid = 0 ; taskid < nthreads ; taskid++)
            { 
                int64_t *restrict Wtask = W + taskid * avdim ;
                int64_t c = Wtask [j] ;
                Wtask [j] = ajnz ;
                ajnz += c ;
            }
            Ap [j] = ajnz ;
        }
    }

    //--------------------------------------------------------------------------
    // cumulative sum of Ap 
    //--------------------------------------------------------------------------

    int nth = GB_nthreads (avdim, chunk, nthreads_max) ;
    GB_cumsum (Ap, avdim, anvec_nonempty, nth, Werk) ;
    ASSERT (Ap [avdim] == A->nvals) ;

    //--------------------------------------------------------------------------
    // gather the pattern and values from the bitmap
    //--------------------------------------------------------------------------

    // TODO: add type-specific versions for built-in types

    const GB_void *restrict Ax = (GB_void *) (A->x) ;
    const bool A_iso = A->iso ;
    const bool numeric = (Ax_new != NULL && Ax != NULL) ;

    if (by_vector)
    {

        //----------------------------------------------------------------------
        // construct all vectors in parallel (no workspace)
        //----------------------------------------------------------------------

        int64_t j ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (j = 0 ; j < avdim ; j++)
        {
            // gather from the bitmap into the new A (:,j)
            int64_t pnew = Ap [j] ;
            int64_t pA_start = j * avlen ;
            for (int64_t i = 0 ; i < avlen ; i++)
            {
                int64_t p = i + pA_start ;
                if (Ab [p])
                {
                    // A(i,j) is in the bitmap
                    if (Ai != NULL) Ai [pnew] = i ;
                    if (Aj != NULL) Aj [pnew] = j ;
                    if (numeric)
                    { 
                        // Ax_new [pnew] = Ax [p])
                        memcpy (Ax_new +(pnew)*asize,
                            Ax +(A_iso ? 0:(p)*asize), asize) ;
                    }
                    pnew++ ;
                }
            }
            ASSERT (pnew == Ap [j+1]) ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // compute blocks of rows in parallel
        //----------------------------------------------------------------------

        int taskid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (taskid = 0 ; taskid < nthreads ; taskid++)
        {
            int64_t *restrict Wtask = W + taskid * avdim ;
            int64_t istart, iend ;
            GB_PARTITION (istart, iend, avlen, taskid, nthreads) ;
            for (int64_t j = 0 ; j < avdim ; j++)
            {
                // gather from the bitmap into the new A (:,j)
                int64_t pnew = Ap [j] + Wtask [j] ;
                int64_t pA_start = j * avlen ;
                for (int64_t i = istart ; i < iend ; i++)
                {
                    // see if A(i,j) is present in the bitmap
                    int64_t p = i + pA_start ;
                    if (Ab [p])
                    {
                        // A(i,j) is in the bitmap
                        if (Ai != NULL) Ai [pnew] = i ;
                        if (Aj != NULL) Aj [pnew] = j ;
                        if (numeric)
                        { 
                            // Ax_new [pnew] = Ax [p] ;
                            memcpy (Ax_new +(pnew)*asize,
                                Ax +(A_iso ? 0:(p)*asize), asize) ;
                        }
                        pnew++ ;
                    }
                }
            }
        }
    }

    //--------------------------------------------------------------------------
    // free workspace return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK (&W, W_size) ;
    return (GrB_SUCCESS) ;
}