//------------------------------------------------------------------------------ // GB_AxB_saxpy4: compute C+=A*B: C full, A sparse/hyper, B bitmap/full //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ // GB_AxB_saxpy4 computes C+=A*B where C is as-if-full, A is // sparse/hypersparse, and B is bitmap/full (or as-if-full). No mask is // present, C_replace is false, the accum matches the monoid, no typecasting is // needed, and no user-defined types or operators are used. // The ANY monoid is not supported, since its use as accum would be unusual. // The monoid must have an atomic implementation, so the TIMES monoid for // complex types is not supported. // JIT: done. //------------------------------------------------------------------------------ #include "GB_mxm.h" #include "GB_control.h" #include "GB_stringify.h" #ifndef GBCOMPACT #include "GB_AxB__include2.h" #endif #define GB_FREE_WORKSPACE \ { \ GB_WERK_POP (A_slice, int64_t) ; \ GB_WERK_POP (H_slice, int64_t) ; \ GB_FREE_WORK (&Wf, Wf_size) ; \ GB_FREE_WORK (&Wcx, Wcx_size) ; \ } #define GB_FREE_ALL \ { \ GB_FREE_WORKSPACE ; \ GB_phybix_free (C) ; \ } //------------------------------------------------------------------------------ // GB_AxB_saxpy4: compute C+=A*B: C full, A sparse/hyper, B bitmap/full //------------------------------------------------------------------------------ GrB_Info GB_AxB_saxpy4 // C += A*B ( GrB_Matrix C, // users input/output matrix const GrB_Matrix A, // input matrix A const GrB_Matrix B, // input matrix B const GrB_Semiring semiring, // semiring that defines C=A*B and accum const bool flipxy, // if true, do z=fmult(b,a) vs fmult(a,b) bool *done_in_place, // if true, saxpy4 has computed the result GB_Werk Werk ) { //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- GrB_Info info ; GB_WERK_DECLARE (A_slice, int64_t) ; GB_WERK_DECLARE (H_slice, int64_t) ; GB_void *restrict Wcx= NULL ; size_t Wcx_size = 0 ; int8_t *restrict Wf = NULL ; size_t Wf_size = 0 ; ASSERT_MATRIX_OK (C, "C for saxpy4 C+=A*B", GB0) ; ASSERT (GB_IS_FULL (C)) ; ASSERT (!GB_PENDING (C)) ; ASSERT (!GB_JUMBLED (C)) ; ASSERT (!GB_ZOMBIES (C)) ; ASSERT_MATRIX_OK (A, "A for saxpy4 C+=A*B", GB0) ; ASSERT (GB_IS_SPARSE (A) || GB_IS_HYPERSPARSE (A)) ; ASSERT (!GB_PENDING (A)) ; ASSERT (GB_JUMBLED_OK (A)) ; ASSERT (!GB_ZOMBIES (A)) ; ASSERT_MATRIX_OK (B, "B for saxpy4 C+=A*B", GB0) ; ASSERT (GB_IS_BITMAP (B) || GB_IS_FULL (B)) ; ASSERT (!GB_PENDING (B)) ; ASSERT (!GB_JUMBLED (B)) ; ASSERT (!GB_ZOMBIES (B)) ; ASSERT_SEMIRING_OK (semiring, "semiring for saxpy4 C+=A*B", GB0) ; ASSERT (A->vdim == B->vlen) ; //-------------------------------------------------------------------------- // get the semiring operators //-------------------------------------------------------------------------- GrB_BinaryOp mult = semiring->multiply ; ASSERT (mult->ztype == semiring->add->op->ztype) ; bool A_is_pattern, B_is_pattern ; GB_binop_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ; GB_Opcode mult_binop_code, add_binop_code ; GB_Type_code xcode, ycode, zcode ; bool builtin_semiring = GB_AxB_semiring_builtin (A, A_is_pattern, B, B_is_pattern, semiring, flipxy, &mult_binop_code, &add_binop_code, &xcode, &ycode, &zcode) ; if (add_binop_code == GB_ANY_binop_code) { // The semiring cannot use the ANY monoid. // The semiring must be builtin, or use the JIT (no generic method). GBURBLE ("(punt) ") ; return (GrB_NO_VALUE) ; } // the complex TIMES and ANY monoids do not have an atomic update bool z_has_no_atomic_update = (zcode >= GB_FC32_code) && (add_binop_code == GB_TIMES_binop_code) ; GBURBLE ("(saxpy4: %s += %s*%s) ", GB_sparsity_char_matrix (C), GB_sparsity_char_matrix (A), GB_sparsity_char_matrix (B)) ; //-------------------------------------------------------------------------- // ensure C is non-iso //-------------------------------------------------------------------------- GB_OK (GB_convert_any_to_non_iso (C, true)) ; //-------------------------------------------------------------------------- // determine the # of threads to use and the parallel tasks //-------------------------------------------------------------------------- int nthreads, ntasks, nfine_tasks_per_vector ; bool use_coarse_tasks, use_atomics ; GB_AxB_saxpy4_tasks (&ntasks, &nthreads, &nfine_tasks_per_vector, &use_coarse_tasks, &use_atomics, GB_nnz (A), GB_nnz_held (B), B->vdim, C->vlen) ; //-------------------------------------------------------------------------- // allocate workspace and slice A //-------------------------------------------------------------------------- size_t wspace = 0 ; if (use_coarse_tasks) { //---------------------------------------------------------------------- // allocate workspace for coarse tasks //---------------------------------------------------------------------- GB_WERK_PUSH (H_slice, ntasks, int64_t) ; if (H_slice == NULL) { // out of memory GB_FREE_ALL ; return (GrB_OUT_OF_MEMORY) ; } int64_t hwork = 0 ; for (int tid = 0 ; tid < ntasks ; tid++) { int64_t jstart, jend ; GB_PARTITION (jstart, jend, B->vdim, tid, ntasks) ; int64_t jtask = jend - jstart ; int64_t jpanel = GB_IMIN (jtask, GB_SAXPY4_PANEL_SIZE) ; H_slice [tid] = hwork ; // full case needs Hx workspace only if jpanel > 1 if (jpanel > 1) { hwork += jpanel ; } } wspace = hwork * C->vlen * (C->type->size) ; } else { //---------------------------------------------------------------------- // allocate workspace for fine tasks (both atomic and non-atomic) //---------------------------------------------------------------------- // slice A for each team of fine tasks (atomic and non-atomic) GB_WERK_PUSH (A_slice, nfine_tasks_per_vector + 1, int64_t) ; if (A_slice == NULL) { // out of memory GB_FREE_ALL ; return (GrB_OUT_OF_MEMORY) ; } GB_pslice (A_slice, A->p, A->nvec, nfine_tasks_per_vector, true) ; if (!use_atomics) { // Each non-atomic fine task is given size-cvlen workspace to // compute its result in the first phase, W(:,tid) = A(:,k1:k2) * // B(k1:k2,j), where k1:k2 is defined by the fine_tid of the task. // The workspaces are then summed into C in the second phase. // Atomic fine takes do not require any Wcx workspace; they // use just A_slice. wspace = (C->vlen) * ntasks * (C->type->size) ; } else if (z_has_no_atomic_update) { // The atomic fine tasks use the monoid's atomic update, which is // available for most factory kernels. The TIMES monoid for the // complex types (FC32 and FC64) requires a critical section for // each C(i,j) scalar. User-defined monoids for JIT kernels also // require this mutex. Wf = GB_CALLOC_WORK (C->vlen * C->vdim, int8_t, &Wf_size) ; if (Wf == NULL) { // out of memory GB_FREE_ALL ; return (GrB_OUT_OF_MEMORY) ; } } } if (wspace > 0) { Wcx = GB_MALLOC_WORK (wspace, GB_void, &Wcx_size) ; if (Wcx == NULL) { // out of memory GB_FREE_ALL ; return (GrB_OUT_OF_MEMORY) ; } } //-------------------------------------------------------------------------- // via the factory kernel //-------------------------------------------------------------------------- info = GrB_NO_VALUE ; #ifndef GBCOMPACT GB_IF_FACTORY_KERNELS_ENABLED { //---------------------------------------------------------------------- // define the worker for the switch factory //---------------------------------------------------------------------- #define GB_Asaxpy4B(add,mult,xname) \ GB (_Asaxpy4B_ ## add ## mult ## xname) #define GB_AxB_WORKER(add,mult,xname) \ { \ info = GB_Asaxpy4B (add,mult,xname) (C, A, B, ntasks, nthreads, \ nfine_tasks_per_vector, use_coarse_tasks, use_atomics, \ A_slice, H_slice, Wcx, Wf) ; \ } \ break ; //---------------------------------------------------------------------- // launch the switch factory //---------------------------------------------------------------------- // disabled the ANY monoid #define GB_NO_ANY_MONOID if (builtin_semiring) { #include "GB_AxB_factory.c" } } #endif //-------------------------------------------------------------------------- // via the JIT or PreJIT kernel //-------------------------------------------------------------------------- if (info == GrB_NO_VALUE) { info = GB_AxB_saxpy4_jit (C, A, B, semiring, flipxy, ntasks, nthreads, nfine_tasks_per_vector, use_coarse_tasks, use_atomics, A_slice, H_slice, Wcx, Wf) ; } //-------------------------------------------------------------------------- // free workspace and return result //-------------------------------------------------------------------------- GB_FREE_WORKSPACE ; if (info == GrB_NO_VALUE) { // saxpy4 doesn't handle this case; punt to saxpy3, bitmap saxpy, etc GBURBLE ("(punt) ") ; } else if (info == GrB_SUCCESS) { ASSERT_MATRIX_OK (C, "saxpy4: output", GB0) ; (*done_in_place) = true ; } else { // out of memory, or other error GB_FREE_ALL ; } return (info) ; }