//------------------------------------------------------------------------------
// GB_helper.c: helper functions for @GrB interface
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// JIT: not needed.  Only one variant possible.

// These functions are only used by the @GrB interface for
// SuiteSparse:GraphBLAS.

#include "GB_helper.h"

bool GB_factory_kernels_enabled = true ;

//------------------------------------------------------------------------------
// GB_NTHREADS_HELPER: determine the number of threads to use
//------------------------------------------------------------------------------

#define GB_NTHREADS_HELPER(work)                                \
    int nthreads_max = GB_Context_nthreads_max ( ) ;            \
    double chunk = GB_Context_chunk ( ) ;                       \
    int nthreads = GB_nthreads (work, chunk, nthreads_max) ;

//------------------------------------------------------------------------------
// GB_ALLOCATE_WORK: allocate per-thread workspace
//------------------------------------------------------------------------------

#define GB_ALLOCATE_WORK(work_type)                                         \
    size_t Work_size ;                                                      \
    work_type *Work = GB_MALLOC_WORK (nthreads, work_type, &Work_size) ;    \
    if (Work == NULL) return (false) ;

//------------------------------------------------------------------------------
// GB_FREE_WORKSPACE: free per-thread workspace
//------------------------------------------------------------------------------

#define GB_FREE_WORKSPACE                                                   \
    GB_FREE_WORK (&Work, Work_size) ;

//------------------------------------------------------------------------------
// GB_helper0: get the current wall-clock time from OpenMP
//------------------------------------------------------------------------------

double GB_helper0 (void)
{
    return (GB_OPENMP_GET_WTIME) ;
}

//------------------------------------------------------------------------------
// GB_helper1: convert 0-based indices to 1-based for gbextracttuples
//------------------------------------------------------------------------------

void GB_helper1              // convert zero-based indices to one-based
(
    double *restrict I_double,      // output array
    const GrB_Index *restrict I,    // input array
    int64_t nvals                   // size of input and output arrays
)
{

    GB_NTHREADS_HELPER (nvals) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (k = 0 ; k < nvals ; k++)
    {
        I_double [k] = (double) (I [k] + 1) ;
    }
}

//------------------------------------------------------------------------------
// GB_helper1i: convert 0-based indices to 1-based for gbextracttuples
//------------------------------------------------------------------------------

void GB_helper1i             // convert zero-based indices to one-based
(
    int64_t *restrict I,            // input/output array
    int64_t nvals                   // size of input/output array
)
{

    GB_NTHREADS_HELPER (nvals) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (k = 0 ; k < nvals ; k++)
    {
        I [k] ++ ;
    }
}

//------------------------------------------------------------------------------
// GB_helper3: convert 1-based indices to 0-based for gb_mxarray_to_list
//------------------------------------------------------------------------------

bool GB_helper3             // return true if OK, false on error
(
    int64_t *restrict List,             // size len, output array
    const double *restrict List_double, // size len, input array
    int64_t len,
    int64_t *List_max       // also compute the max entry in the list (1-based)
)
{

    GB_NTHREADS_HELPER (len) ;

    ASSERT (List != NULL) ;
    ASSERT (List_double != NULL) ;
    ASSERT (List_max != NULL) ;

    bool ok = true ;
    int64_t listmax = -1 ;

    GB_ALLOCATE_WORK (int64_t) ;

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        bool my_ok = true ;
        int64_t k1, k2, my_listmax = -1 ;
        GB_PARTITION (k1, k2, len, tid, nthreads) ;
        for (int64_t k = k1 ; k < k2 ; k++)
        {
            double x = List_double [k] ;
            int64_t i = (int64_t) x ;
            my_ok = my_ok && (x == (double) i) ;
            my_listmax = GB_IMAX (my_listmax, i) ;
            List [k] = i - 1 ;
        }
        // rather than create a separate per-thread boolean workspace, just
        // use a sentinal value of INT64_MIN if non-integer indices appear
        // in List_double.
        Work [tid] = my_ok ? my_listmax : INT64_MIN ;
    }

    // wrapup
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        listmax = GB_IMAX (listmax, Work [tid]) ;
        ok = ok && (Work [tid] != INT64_MIN) ;
    }

    GB_FREE_WORKSPACE ;

    (*List_max) = listmax ;
    return (ok) ;
}

//------------------------------------------------------------------------------
// GB_helper3i: convert 1-based indices to 0-based for gb_mxarray_to_list
//------------------------------------------------------------------------------

bool GB_helper3i        // return true if OK, false on error
(
    int64_t *restrict List,             // size len, output array
    const int64_t *restrict List_int64, // size len, input array
    int64_t len,
    int64_t *List_max   // also compute the max entry in the list (1-based)
)
{

    GB_NTHREADS_HELPER (len) ;

    int64_t listmax = -1 ;

    GB_ALLOCATE_WORK (int64_t) ;

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        int64_t k1, k2, my_listmax = -1 ;
        GB_PARTITION (k1, k2, len, tid, nthreads) ;
        for (int64_t k = k1 ; k < k2 ; k++)
        {
            int64_t i = List_int64 [k] ;
            my_listmax = GB_IMAX (my_listmax, i) ;
            List [k] = i - 1 ;
        }
        Work [tid] = my_listmax ;
    }

    // wrapup
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        listmax = GB_IMAX (listmax, Work [tid]) ;
    }

    GB_FREE_WORKSPACE ;

    (*List_max) = listmax ;
    return (true) ;
}

//------------------------------------------------------------------------------
// GB_helper4: find the max entry in a list of type GrB_Index
//------------------------------------------------------------------------------

bool GB_helper4             // return true if OK, false on error
(
    const GrB_Index *restrict I,    // array of size len
    const int64_t len,
    GrB_Index *List_max     // also compute the max entry in the list (1-based,
                            // which is max(I)+1)
)
{

    GB_NTHREADS_HELPER (len) ;

    GrB_Index listmax = 0 ;

    GB_ALLOCATE_WORK (GrB_Index) ;

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        int64_t k1, k2 ;
        GrB_Index my_listmax = 0 ;
        GB_PARTITION (k1, k2, len, tid, nthreads) ;
        for (int64_t k = k1 ; k < k2 ; k++)
        {
            my_listmax = GB_IMAX (my_listmax, I [k]) ;
        }
        Work [tid] = my_listmax ;
    }

    // wrapup
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        listmax = GB_IMAX (listmax, Work [tid]) ;
    }

    GB_FREE_WORKSPACE ;

    if (len > 0) listmax++ ;
    (*List_max) = listmax ;
    return (true) ;
}

//------------------------------------------------------------------------------
// GB_helper5: construct pattern of S for gblogassign
//------------------------------------------------------------------------------

void GB_helper5              // construct pattern of S
(
    GrB_Index *restrict Si,         // array of size anz
    GrB_Index *restrict Sj,         // array of size anz
    const GrB_Index *restrict Mi,   // array of size mnz, M->i, may be NULL
    const GrB_Index *restrict Mj,   // array of size mnz,
    const int64_t mvlen,            // M->vlen
    GrB_Index *restrict Ai,         // array of size anz, A->i, may be NULL
    const int64_t avlen,            // M->vlen
    const GrB_Index anz
)
{

    GB_NTHREADS_HELPER (anz) ;
    ASSERT (Mj != NULL) ;
    ASSERT (Si != NULL) ;
    ASSERT (Sj != NULL) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (k = 0 ; k < anz ; k++)
    {
        int64_t i = GBI (Ai, k, avlen) ;
        Si [k] = GBI (Mi, i, mvlen) ;
        Sj [k] = Mj [i] ;
    }
}

//------------------------------------------------------------------------------
// GB_helper7: Kx = uint64 (0:mnz-1), for gblogextract
//------------------------------------------------------------------------------

// TODO: use GrB_apply with a positional operator instead

void GB_helper7              // Kx = uint64 (0:mnz-1)
(
    uint64_t *restrict Kx,          // array of size mnz
    const GrB_Index mnz
)
{

    GB_NTHREADS_HELPER (mnz) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (k = 0 ; k < mnz ; k++)
    {
        Kx [k] = k ;
    }
}

//------------------------------------------------------------------------------
// GB_helper8: expand a scalar into an array for gbbuild
//------------------------------------------------------------------------------

// TODO: use GrB_assign instead

void GB_helper8
(
    GB_void *C,         // output array of size nvals * s
    GB_void *A,         // input scalar of size s
    GrB_Index nvals,    // size of C
    size_t s            // size of each scalar
)
{

    GB_NTHREADS_HELPER (nvals) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (k = 0 ; k < nvals ; k++)
    {
        // C [k] = A [0]
        memcpy (C + k * s, A, s) ;
    }
}

//------------------------------------------------------------------------------
// GB_helper10: compute norm (x-y,p) of two dense FP32 or FP64 vectors
//------------------------------------------------------------------------------

// p can be:

//      0 or 2:     2-norm, sqrt (sum ((x-y).^2))
//      1:          1-norm, sum (abs (x-y))
//      INT64_MAX   inf-norm, max (abs (x-y))
//      INT64_MIN   (-inf)-norm, min (abs (x-y))
//      other:      p-norm not yet computed

double GB_helper10       // norm (x-y,p), or -1 on error
(
    GB_void *x_arg,             // float or double, depending on type parameter
    bool x_iso,                 // true if x is iso
    GB_void *y_arg,             // same type as x, treat as zero if NULL
    bool y_iso,                 // true if x is iso
    GrB_Type type,              // GrB_FP32 or GrB_FP64
    int64_t p,                  // 0, 1, 2, INT64_MIN, or INT64_MAX
    GrB_Index n
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    if (!(type == GrB_FP32 || type == GrB_FP64))
    {
        // type of x and y must be GrB_FP32 or GrB_FP64
        return ((double) -1) ;
    }

    if (n == 0)
    {
        return ((double) 0) ;
    }

    //--------------------------------------------------------------------------
    // allocate workspace and determine # of threads to use
    //--------------------------------------------------------------------------

    GB_NTHREADS_HELPER (n) ;
    GB_ALLOCATE_WORK (double) ;

    #define xx(k) x [x_iso ? 0 : k]
    #define yy(k) y [y_iso ? 0 : k]

    //--------------------------------------------------------------------------
    // each thread computes its partial norm
    //--------------------------------------------------------------------------

    int tid ;
    #pragma omp parallel for num_threads(nthreads) schedule(static)
    for (tid = 0 ; tid < nthreads ; tid++)
    {
        int64_t k1, k2 ;
        GB_PARTITION (k1, k2, n, tid, nthreads) ;

        if (type == GrB_FP32)
        {

            //------------------------------------------------------------------
            // FP32 case
            //------------------------------------------------------------------

            float my_s = 0 ;
            const float *x = (float *) x_arg ;
            const float *y = (float *) y_arg ;
            switch (p)
            {
                case 0:     // Frobenius norm
                case 2:     // 2-norm: sqrt of sum of (x-y).^2
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            float t = xx (k) ;
                            my_s += (t*t) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            float t = (xx (k) - yy (k)) ;
                            my_s += (t*t) ;
                        }
                    }
                }
                break ;

                case 1:     // 1-norm: sum (abs (x-y))
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s += fabsf (xx (k)) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s += fabsf (xx (k) - yy (k)) ;
                        }
                    }
                }
                break ;

                case INT64_MAX:     // inf-norm: max (abs (x-y))
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmaxf (my_s, fabsf (xx (k))) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmaxf (my_s, fabsf (xx (k) - yy (k))) ;
                        }
                    }
                }
                break ;

                case INT64_MIN:     // (-inf)-norm: min (abs (x-y))
                {
                    my_s = INFINITY ;
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fminf (my_s, fabsf (xx (k))) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fminf (my_s, fabsf (xx (k) - yy (k))) ;
                        }
                    }
                }
                break ;

                default: ;  // p-norm not yet supported
            }
            Work [tid] = (double) my_s ;

        }
        else
        {

            //------------------------------------------------------------------
            // FP64 case
            //------------------------------------------------------------------

            double my_s = 0 ;
            const double *x = (double *) x_arg ;
            const double *y = (double *) y_arg ;
            switch (p)
            {
                case 0:     // Frobenius norm
                case 2:     // 2-norm: sqrt of sum of (x-y).^2
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            double t = xx (k) ;
                            my_s += (t*t) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            double t = (xx (k) - yy (k)) ;
                            my_s += (t*t) ;
                        }
                    }
                }
                break ;

                case 1:     // 1-norm: sum (abs (x-y))
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s += fabs (xx (k)) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s += fabs (xx (k) - yy (k)) ;
                        }
                    }
                }
                break ;

                case INT64_MAX:     // inf-norm: max (abs (x-y))
                {
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmax (my_s, fabs (xx (k))) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmax (my_s, fabs (xx (k) - yy (k))) ;
                        }
                    }
                }
                break ;

                case INT64_MIN:     // (-inf)-norm: min (abs (x-y))
                {
                    my_s = INFINITY ;
                    if (y == NULL)
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmin (my_s, fabs (xx (k))) ;
                        }
                    }
                    else
                    {
                        for (int64_t k = k1 ; k < k2 ; k++)
                        {
                            my_s = fmin (my_s, fabs (xx (k) - yy (k))) ;
                        }
                    }
                }
                break ;

                default: ;  // p-norm not yet supported
            }

            Work [tid] = my_s ;
        }
    }

    //--------------------------------------------------------------------------
    // combine results of each thread
    //--------------------------------------------------------------------------

    double s = 0 ;
    switch (p)
    {
        case 0:     // Frobenius norm
        case 2:     // 2-norm: sqrt of sum of (x-y).^2
        {
            for (int64_t tid = 0 ; tid < nthreads ; tid++)
            {
                s += Work [tid] ;
            }
            s = sqrt (s) ;
        }
        break ;

        case 1:     // 1-norm: sum (abs (x-y))
        {
            for (int64_t tid = 0 ; tid < nthreads ; tid++)
            {
                s += Work [tid] ;
            }
        }
        break ;

        case INT64_MAX:     // inf-norm: max (abs (x-y))
        {
            for (int64_t tid = 0 ; tid < nthreads ; tid++)
            {
                s = fmax (s, Work [tid]) ;
            }
        }
        break ;

        case INT64_MIN:     // (-inf)-norm: min (abs (x-y))
        {
            s = Work [0] ;
            for (int64_t tid = 1 ; tid < nthreads ; tid++)
            {
                s = fmin (s, Work [tid]) ;
            }
        }
        break ;

        default:    // p-norm not yet supported
            s = -1 ;
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORKSPACE ;
    return (s) ;
}

//------------------------------------------------------------------------------
// GB_make_shallow.c: force a matrix to have purely shallow components
//------------------------------------------------------------------------------

void GB_make_shallow (GrB_Matrix A)
{
    if (A == NULL) return ;
    A->p_shallow = (A->p != NULL) ;
    A->h_shallow = (A->h != NULL) ;
    A->b_shallow = (A->b != NULL) ;
    A->i_shallow = (A->i != NULL) ;
    A->x_shallow = (A->x != NULL) ;
    #ifdef GB_MEMDUMP
    printf ("remove from memtable: Ap:%p Ah:%p Ab:%p Ai:%p Ax:%p\n", // MEMDUMP
        A->p, A->h, A->b, A->i, A->x) ;
    #endif
    if (A->p != NULL) GB_Global_memtable_remove (A->p) ;
    if (A->h != NULL) GB_Global_memtable_remove (A->h) ;
    if (A->b != NULL) GB_Global_memtable_remove (A->b) ;
    if (A->i != NULL) GB_Global_memtable_remove (A->i) ;
    if (A->x != NULL) GB_Global_memtable_remove (A->x) ;
    GB_make_shallow (A->Y) ;
}