//------------------------------------------------------------------------------
// GB_reduce_to_scalar: reduce a matrix to a scalar
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// JIT: done.

// c = accum (c, reduce_to_scalar(A)): reduce the entries in a matrix to a
// scalar.  Does the work for GrB_*_reduce_TYPE, for both matrices and vectors.

// This function does not need to know if A is hypersparse or not, and its
// result is the same if A is in CSR or CSC format.

// This function is the only place in all of GraphBLAS where the identity value
// of a monoid is required, but only in one special case: it is required as the
// value returned in c when A has no entries.  The identity value is also used
// internally, in the parallel methods below, to initialize a scalar value in
// each task.  The methods could be rewritten to avoid the use of the identity
// value.  Since this function requires it anyway, for the special case when
// nvals(A) is zero, the existence of the identity value makes the code a
// little simpler.

#include "GB_reduce.h"
#include "GB_binop.h"
#include "GB_stringify.h"
#ifndef GBCOMPACT
#include "GB_red__include.h"
#endif
#include "GB_monoid_shared_definitions.h"

#define GB_FREE_ALL                 \
{                                   \
    GB_WERK_POP (F, bool) ;         \
    GB_WERK_POP (W, GB_void) ;      \
}

GrB_Info GB_reduce_to_scalar    // z = reduce_to_scalar (A)
(
    void *c,                    // result scalar
    const GrB_Type ctype,       // the type of scalar, c
    const GrB_BinaryOp accum,   // for c = accum(c,z)
    const GrB_Monoid monoid,    // monoid to do the reduction
    const GrB_Matrix A,         // matrix to reduce
    GB_Werk Werk
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;
    GB_RETURN_IF_NULL_OR_FAULTY (monoid) ;
    GB_RETURN_IF_FAULTY_OR_POSITIONAL (accum) ;
    GB_RETURN_IF_NULL (c) ;
    GB_WERK_DECLARE (W, GB_void) ;
    GB_WERK_DECLARE (F, bool) ;

    ASSERT_TYPE_OK (ctype, "type of scalar c", GB0) ;
    ASSERT_MONOID_OK (monoid, "monoid for reduce_to_scalar", GB0) ;
    ASSERT_BINARYOP_OK_OR_NULL (accum, "accum for reduce_to_scalar", GB0) ;
    ASSERT_MATRIX_OK (A, "A for reduce_to_scalar", GB0) ;

    // check domains and dimensions for c = accum (c,z)
    GrB_Type ztype = monoid->op->ztype ;
    GB_OK (GB_compatible (ctype, NULL, NULL, false, accum, ztype, Werk)) ;

    // z = monoid (z,A) must be compatible
    if (!GB_Type_compatible (A->type, ztype))
    {
        return (GrB_DOMAIN_MISMATCH) ;
    }

    //--------------------------------------------------------------------------
    // assemble any pending tuples; zombies are OK
    //--------------------------------------------------------------------------

    GB_MATRIX_WAIT_IF_PENDING (A) ;
    GB_BURBLE_DENSE (A, "(A %s) ") ;

    ASSERT (GB_ZOMBIES_OK (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;
    ASSERT (!GB_PENDING (A)) ;

    //--------------------------------------------------------------------------
    // get A
    //--------------------------------------------------------------------------

    int64_t asize = A->type->size ;
    int64_t zsize = ztype->size ;
    int64_t anz = GB_nnz_held (A) ;
    ASSERT (anz >= A->nzombies) ;

    // z = identity
    GB_void z [GB_VLA(zsize)] ;
    memcpy (z, monoid->identity, zsize) ;   // required, if nnz(A) is zero
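    // The reduction methods below are attempted in order of decreasing
    // specialization: the CUDA kernel (when GRAPHBLAS_HAS_CUDA is defined),
    // the iso-valued kernel, the factory kernel, the JIT or PreJIT kernel,
    // and finally the generic kernel.  info starts as GrB_NO_VALUE and is
    // set to GrB_SUCCESS by whichever method completes the reduction.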

    //--------------------------------------------------------------------------
    // z = reduce_to_scalar (A) on the GPU(s) or CPU
    //--------------------------------------------------------------------------

    info = GrB_NO_VALUE ;

    #if defined ( GRAPHBLAS_HAS_CUDA )
    if (GB_cuda_reduce_to_scalar_branch (monoid, A))
    {

        //----------------------------------------------------------------------
        // via the CUDA kernel
        //----------------------------------------------------------------------

        GrB_Matrix V = NULL ;
        info = GB_cuda_reduce_to_scalar (z, &V, monoid, A) ;

        if (V != NULL)
        {
            // reduction must continue.  Result is in V, not the scalar z.
            ASSERT (info == GrB_SUCCESS) ;
            if (V->vlen == 1)
            {
                // the CUDA kernel has reduced A to a single scalar, V [0],
                // with a single threadblock; no more recursion.  Copy the
                // single scalar into z, for the accum phase below.
                memcpy (z, V->x, zsize) ;
                GB_Matrix_free (&V) ;
            }
            else
            {
                // the CUDA kernel has reduced A to an array V; keep going
                info = GB_reduce_to_scalar (c, ctype, accum, monoid, V, Werk) ;
                GB_Matrix_free (&V) ;
                return (info) ;
            }
        }

        // GB_cuda_reduce_to_scalar may refuse to do the reduction and
        // indicate this by returning GrB_NO_VALUE.  If so, the CPU will do it
        // below.

        if (!(info == GrB_SUCCESS || info == GrB_NO_VALUE))
        {
            // GB_cuda_reduce_to_scalar has returned an error
            // (out of memory, or other error)
            return (info) ;
        }
    }
    #endif

    if (info == GrB_NO_VALUE)
    {

        //----------------------------------------------------------------------
        // use OpenMP on the CPU threads
        //----------------------------------------------------------------------

        int nthreads_max = GB_Context_nthreads_max ( ) ;
        double chunk = GB_Context_chunk ( ) ;
        int nthreads = GB_nthreads (anz, chunk, nthreads_max) ;
        int ntasks = (nthreads == 1) ? 1 : (64 * nthreads) ;
        ntasks = GB_IMIN (ntasks, anz) ;
        ntasks = GB_IMAX (ntasks, 1) ;

        //----------------------------------------------------------------------
        // allocate workspace
        //----------------------------------------------------------------------

        GB_WERK_PUSH (W, ntasks * zsize, GB_void) ;
        GB_WERK_PUSH (F, ntasks, bool) ;
        if (W == NULL || F == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        //----------------------------------------------------------------------
        // z = reduce_to_scalar (A)
        //----------------------------------------------------------------------

        // get terminal value, if any
        GB_void *restrict zterminal = (GB_void *) monoid->terminal ;

        if (anz == A->nzombies)
        {

            //------------------------------------------------------------------
            // no live entries in A; nothing to do
            //------------------------------------------------------------------

            info = GrB_SUCCESS ;

        }
        else if (A->iso)
        {

            //------------------------------------------------------------------
            // via the iso kernel
            //------------------------------------------------------------------

            // this takes at most O(log(nvals(A))) time, for any monoid
            GB_reduce_to_scalar_iso (z, monoid, A) ;
            info = GrB_SUCCESS ;

        }
        else if (A->type == ztype)
        {

            //------------------------------------------------------------------
            // via the factory kernel
            //------------------------------------------------------------------

            #ifndef GBCOMPACT
            GB_IF_FACTORY_KERNELS_ENABLED
            {

                //--------------------------------------------------------------
                // define the worker for the switch factory
                //--------------------------------------------------------------

                #define GB_red(opname,aname) \
                    GB (_red_ ## opname ## aname)

                #define GB_RED_WORKER(opname,aname,ztype)                   \
                {                                                           \
                    info = GB_red (opname, aname) ((ztype *) z, A, W, F,    \
                        ntasks, nthreads) ;                                 \
                }                                                           \
                break ;
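                // GB_red_factory.c, included below, is a switch statement
                // that expands GB_RED_WORKER for the built-in monoid/type
                // combinations it supports.  If no case matches, info is
                // left as GrB_NO_VALUE and the JIT or generic kernels below
                // perform the reduction instead.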

                //--------------------------------------------------------------
                // launch the switch factory
                //--------------------------------------------------------------

                // controlled by opcode and typecode
                GB_Opcode opcode = monoid->op->opcode ;
                GB_Type_code typecode = A->type->code ;
                ASSERT (typecode <= GB_UDT_code) ;

                #include "GB_red_factory.c"
            }
            #endif
        }

        //----------------------------------------------------------------------
        // via the JIT or PreJIT kernel
        //----------------------------------------------------------------------

        if (info == GrB_NO_VALUE)
        {
            info = GB_reduce_to_scalar_jit (z, monoid, A, W, F, ntasks,
                nthreads) ;
        }
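        // If the JIT or PreJIT kernel cannot be used (GB_reduce_to_scalar_jit
        // then returns GrB_NO_VALUE), the generic kernel below performs the
        // reduction via function pointers and memcpy, with typecasting if
        // needed.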

        //----------------------------------------------------------------------
        // via the generic kernel
        //----------------------------------------------------------------------

        if (info == GrB_NO_VALUE)
        {

            //------------------------------------------------------------------
            // generic worker
            //------------------------------------------------------------------

            #include "GB_generic.h"

            GxB_binary_function freduce = monoid->op->binop_function ;

            // ztype z = identity
            #define GB_DECLARE_IDENTITY(z)                      \
                GB_void z [GB_VLA(zsize)] ;                     \
                memcpy (z, monoid->identity, zsize) ;

            // const zidentity = identity
            #define GB_DECLARE_IDENTITY_CONST(z)                \
                const GB_void *z = monoid->identity ;

            // const zterminal = terminal_value
            #undef  GB_DECLARE_TERMINAL_CONST
            #define GB_DECLARE_TERMINAL_CONST(zterminal)        \
                const GB_void *zterminal = monoid->terminal ;

            #define GB_A_TYPE GB_void

            // no panel used
            #define GB_PANEL 1
            #define GB_NO_PANEL_CASE

            // W [k] = z, no typecast
            #define GB_COPY_SCALAR_TO_ARRAY(W, k, z)            \
                memcpy (W +(k*zsize), z, zsize)

            // z += W [k], no typecast
            #define GB_ADD_ARRAY_TO_SCALAR(z,W,k)               \
                freduce (z, z, W +((k)*zsize))

            if (A->type == ztype)
            {

                //--------------------------------------------------------------
                // generic worker: sum up the entries, no typecasting
                //--------------------------------------------------------------

                GB_BURBLE_MATRIX (A, "(generic reduce to scalar: %s) ",
                    monoid->op->name) ;

                // the switch factory didn't handle this case

                // t += (ztype) Ax [p], but no typecasting needed
                #define GB_GETA_AND_UPDATE(t,Ax,p)              \
                    freduce (t, t, Ax +((p)*zsize))

                if (zterminal == NULL)
                {
                    // monoid is not terminal
                    #undef  GB_MONOID_IS_TERMINAL
                    #define GB_MONOID_IS_TERMINAL 0
                    #undef  GB_TERMINAL_CONDITION
                    #define GB_TERMINAL_CONDITION(z,zterminal) 0
                    #undef  GB_IF_TERMINAL_BREAK
                    #define GB_IF_TERMINAL_BREAK(z,zterminal)
                    #include "GB_reduce_to_scalar_template.c"
                }
                else
                {
                    // break if terminal value reached
                    #undef  GB_MONOID_IS_TERMINAL
                    #define GB_MONOID_IS_TERMINAL 1
                    #undef  GB_TERMINAL_CONDITION
                    #define GB_TERMINAL_CONDITION(z,zterminal)  \
                        (memcmp (z, zterminal, zsize) == 0)
                    #undef  GB_IF_TERMINAL_BREAK
                    #define GB_IF_TERMINAL_BREAK(z,zterminal)   \
                        if (GB_TERMINAL_CONDITION (z, zterminal)) break
                    #include "GB_reduce_to_scalar_template.c"
                }

            }
            else
            {

                //--------------------------------------------------------------
                // generic worker: sum up the entries, with typecasting
                //--------------------------------------------------------------

                GB_BURBLE_MATRIX (A, "(generic reduce to scalar, with typecast:"
                    " %s) ", monoid->op->name) ;

                GB_cast_function
                    cast_A_to_Z = GB_cast_factory (ztype->code, A->type->code) ;

                // t += (ztype) Ax [p], with typecast
                #undef  GB_GETA_AND_UPDATE
                #define GB_GETA_AND_UPDATE(t,Ax,p)              \
                    GB_void awork [GB_VLA(zsize)] ;             \
                    cast_A_to_Z (awork, Ax +((p)*asize), asize) ;  \
                    freduce (t, t, awork)

                if (zterminal == NULL)
                {
                    // monoid is not terminal
                    #undef  GB_MONOID_IS_TERMINAL
                    #define GB_MONOID_IS_TERMINAL 0
                    #undef  GB_TERMINAL_CONDITION
                    #define GB_TERMINAL_CONDITION(z,zterminal) 0
                    #undef  GB_IF_TERMINAL_BREAK
                    #define GB_IF_TERMINAL_BREAK(z,zterminal)
                    #include "GB_reduce_to_scalar_template.c"
                }
                else
                {
                    // break if terminal value reached
                    #undef  GB_MONOID_IS_TERMINAL
                    #define GB_MONOID_IS_TERMINAL 1
                    #undef  GB_TERMINAL_CONDITION
                    #define GB_TERMINAL_CONDITION(z,zterminal)  \
                        (memcmp (z, zterminal, zsize) == 0)
                    #undef  GB_IF_TERMINAL_BREAK
                    #define GB_IF_TERMINAL_BREAK(z,zterminal)   \
                        if (GB_TERMINAL_CONDITION (z, zterminal)) break
                    #include "GB_reduce_to_scalar_template.c"
                }
            }

            info = GrB_SUCCESS ;
        }
    }

    if (info != GrB_SUCCESS)
    {
        // out of memory, or other error
        GB_FREE_ALL ;
        return (info) ;
    }

    //--------------------------------------------------------------------------
    // c = z or c = accum (c,z)
    //--------------------------------------------------------------------------

    // This operation does not use GB_accum_mask, since c and z are scalars,
    // not matrices.  There is no scalar mask.

    if (accum == NULL)
    {
        // c = (ctype) z
        GB_cast_function
            cast_Z_to_C = GB_cast_factory (ctype->code, ztype->code) ;
        cast_Z_to_C (c, z, ctype->size) ;
    }
    else
    {
        GxB_binary_function faccum = accum->binop_function ;

        GB_cast_function cast_C_to_xaccum, cast_Z_to_yaccum, cast_zaccum_to_C ;
        cast_C_to_xaccum = GB_cast_factory (accum->xtype->code, ctype->code) ;
        cast_Z_to_yaccum = GB_cast_factory (accum->ytype->code, ztype->code) ;
        cast_zaccum_to_C = GB_cast_factory (ctype->code, accum->ztype->code) ;

        // scalar workspace
        GB_void xaccum [GB_VLA(accum->xtype->size)] ;
        GB_void yaccum [GB_VLA(accum->ytype->size)] ;
        GB_void zaccum [GB_VLA(accum->ztype->size)] ;

        // xaccum = (accum->xtype) c
        cast_C_to_xaccum (xaccum, c, ctype->size) ;

        // yaccum = (accum->ytype) z
        cast_Z_to_yaccum (yaccum, z, zsize) ;

        // zaccum = xaccum "+" yaccum
        faccum (zaccum, xaccum, yaccum) ;

        // c = (ctype) zaccum
        cast_zaccum_to_C (c, zaccum, ctype->size) ;
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_ALL ;
    #pragma omp flush
    return (GrB_SUCCESS) ;
}
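
//------------------------------------------------------------------------------
// example: typical route to this function (illustrative sketch only)
//------------------------------------------------------------------------------

// A minimal sketch, assuming the standard user-level GraphBLAS API; the
// intermediate GrB_*_reduce_* wrappers are not shown.  A call such as
//
//      double sum = 0 ;
//      GrB_Matrix_reduce_FP64 (&sum, NULL, GrB_PLUS_MONOID_FP64, A, NULL) ;
//
// ends up in GB_reduce_to_scalar with c = &sum, ctype = GrB_FP64,
// accum = NULL, and monoid = GrB_PLUS_MONOID_FP64, so that on return c holds
// the sum of all entries in A (or the monoid identity, 0, if A has no
// entries).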