//------------------------------------------------------------------------------
// GB_wait: finish all pending computations on a single matrix
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// JIT: not needed.  Only one variant possible.

// CALLS: GB_builder

// The matrix A has zombies and/or pending tuples placed there by
// GrB_setElement, GrB_*assign, or GB_mxm.  Zombies must now be deleted, and
// pending tuples must now be assembled together and added into the matrix.
// The indices in A might also be jumbled; if so, they are sorted now.

// When the function returns, all pending tuples and zombies have been
// deleted.  This is true even if the function fails due to lack of memory (in
// that case, the matrix is cleared as well).

// If A is hypersparse, the time taken is at most O(nnz(A) + t log t), where t
// is the number of pending tuples in A, and nnz(A) includes both zombies and
// live entries.  There is no O(m) or O(n) time component, if A is m-by-n.
// If the number of non-empty vectors of A grows too large, then A can be
// converted to non-hypersparse.

// If A is non-hypersparse, then O(n) is added in the worst case, to prune
// zombies and to update the vector pointers for A.

// If A->nvec_nonempty is unknown (-1), it is computed.

// The A->Y hyper_hash is freed if the A->h hyperlist has to be constructed.
// It is not recomputed here; instead, it is left pending (as NULL).  It is
// not modified if A->h doesn't change.

// If the method is successful, it does an OpenMP flush just before returning.

#define GB_FREE_WORKSPACE           \
{                                   \
    GB_Matrix_free (&Y) ;           \
    GB_Matrix_free (&T) ;           \
    GB_Matrix_free (&S) ;           \
    GB_Matrix_free (&A1) ;          \
}

#define GB_FREE_ALL                 \
{                                   \
    GB_FREE_WORKSPACE ;             \
    GB_phybix_free (A) ;            \
}

#include "GB_select.h"
#include "GB_add.h"
#include "GB_binop.h"
#include "GB_Pending.h"
#include "GB_build.h"
#include "GB_jappend.h"
#include "GB_scalar_wrap.h"

GrB_Info GB_wait                // finish all pending computations
(
    GrB_Matrix A,               // matrix with pending computations
    const char *name,           // name of the matrix
    GB_Werk Werk
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info = GrB_SUCCESS ;
    struct GB_Matrix_opaque T_header, A1_header, S_header ;
    GrB_Matrix T = NULL, A1 = NULL, S = NULL, Y = NULL ;

    ASSERT_MATRIX_OK (A, "A to wait", GB_FLIP (GB0)) ;

    if (GB_IS_FULL (A) || GB_IS_BITMAP (A))
    {
        // full and bitmap matrices never have any pending work
        ASSERT (!GB_ZOMBIES (A)) ;
        ASSERT (!GB_JUMBLED (A)) ;
        ASSERT (!GB_PENDING (A)) ;
        ASSERT (A->nvec_nonempty >= 0) ;
        // ensure the matrix is written to memory
        #pragma omp flush
        return (GrB_SUCCESS) ;
    }

    // only sparse and hypersparse matrices can have pending work
    ASSERT (GB_IS_SPARSE (A) || GB_IS_HYPERSPARSE (A)) ;
    ASSERT (GB_ZOMBIES_OK (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;
    ASSERT (GB_PENDING_OK (A)) ;

    //--------------------------------------------------------------------------
    // get the zombie and pending count, and burble if work needs to be done
    //--------------------------------------------------------------------------

    int64_t nzombies = A->nzombies ;
    int64_t npending = GB_Pending_n (A) ;
    const bool A_iso = A->iso ;
    if (nzombies > 0 || npending > 0 || A->jumbled || A->nvec_nonempty < 0)
    {
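        // report the pending work in the burble diagnostic: the zombie count,
        // pending-tuple count, jumbled state, and unknown nvec_nonempty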
        GB_BURBLE_MATRIX (A, "(%swait:%s " GBd " %s, " GBd " pending%s%s) ",
            A_iso ? "iso " : "", name, nzombies,
            (nzombies == 1) ? "zombie" : "zombies", npending,
            A->jumbled ? ", jumbled" : "",
            A->nvec_nonempty < 0 ? ", nvec" : "") ;
    }

    //--------------------------------------------------------------------------
    // determine the max # of threads to use
    //--------------------------------------------------------------------------

    int nthreads_max = GB_Context_nthreads_max ( ) ;
    double chunk = GB_Context_chunk ( ) ;

    //--------------------------------------------------------------------------
    // check if only A->nvec_nonempty is needed
    //--------------------------------------------------------------------------

    if (npending == 0 && nzombies == 0 && !A->jumbled)
    {
        // A->Y is not modified.  If not NULL, it remains valid.
        if (A->nvec_nonempty < 0)
        {
            A->nvec_nonempty = GB_nvec_nonempty (A) ;
        }
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // check if A only needs to be unjumbled
    //--------------------------------------------------------------------------

    if (npending == 0 && nzombies == 0)
    {
        // A is not conformed, so the sparsity structure of A is not modified.
        // That is, if A has no pending tuples and no zombies, but is just
        // jumbled, then it stays sparse or hypersparse.  A->Y is not modified
        // nor accessed, and remains NULL if it is NULL on input.  If it is
        // present, it remains valid.
        GB_OK (GB_unjumble (A, Werk)) ;
        ASSERT (GB_IMPLIES (info == GrB_SUCCESS, A->nvec_nonempty >= 0)) ;
        return (info) ;
    }

    //--------------------------------------------------------------------------
    // assemble the pending tuples into T
    //--------------------------------------------------------------------------

    int64_t anz_orig = GB_nnz (A) ;
    int64_t asize = A->type->size ;

    int64_t tnz = 0 ;
    if (npending > 0)
    {

        //----------------------------------------------------------------------
        // construct a new hypersparse matrix T with just the pending tuples
        //----------------------------------------------------------------------

        // T has the same type as A->type, which can differ from the type of
        // the pending tuples, A->Pending->type.  The Pending->op can be NULL
        // (an implicit SECOND function), or it can be any accum operator.  The
        // z=accum(x,y) operator can have any types, and it does not have to be
        // associative.  T is constructed as iso if A is iso.

        GB_void *S_input = (A_iso) ? ((GB_void *) A->x) : NULL ;
        GrB_Type stype = (A_iso) ?
            A->type : A->Pending->type ;
        GB_CLEAR_STATIC_HEADER (T, &T_header) ;
        info = GB_builder
        (
            T,                      // create T using a static header
            A->type,                // T->type = A->type
            A->vlen,                // T->vlen = A->vlen
            A->vdim,                // T->vdim = A->vdim
            A->is_csc,              // T->is_csc = A->is_csc
            &(A->Pending->i),       // iwork_handle, becomes T->i on output
            &(A->Pending->i_size),
            &(A->Pending->j),       // jwork_handle, free on output
            &(A->Pending->j_size),
            &(A->Pending->x),       // Swork_handle, free on output
            &(A->Pending->x_size),
            A->Pending->sorted,     // tuples may or may not be sorted
            false,                  // there might be duplicates; look for them
            A->Pending->nmax,       // size of Pending->[ijx] arrays
            true,                   // is_matrix: unused
            NULL, NULL, S_input,    // original I,J,S_input tuples
            A_iso,                  // pending tuples are iso if A is iso
            npending,               // # of tuples
            A->Pending->op,         // dup operator for assembling duplicates,
                                    // NULL if A is iso
            stype,                  // type of Pending->x
            true,                   // burble is allowed
            Werk
        ) ;

        //----------------------------------------------------------------------
        // free pending tuples
        //----------------------------------------------------------------------

        // The tuples have been converted to T, which is more compact, and
        // duplicates have been removed.  The following work needs to be done
        // even if the builder fails.

        // GB_builder frees A->Pending->j and A->Pending->x.  If successful,
        // A->Pending->i is now T->i.  Otherwise A->Pending->i is freed.  In
        // both cases, A->Pending->i is NULL.
        ASSERT (A->Pending->i == NULL) ;
        ASSERT (A->Pending->j == NULL) ;
        ASSERT (A->Pending->x == NULL) ;

        // free the list of pending tuples
        GB_Pending_free (&(A->Pending)) ;
        ASSERT (!GB_PENDING (A)) ;

        ASSERT_MATRIX_OK (A, "A after moving pending tuples to T", GB0) ;

        //----------------------------------------------------------------------
        // check the status of the builder
        //----------------------------------------------------------------------

        // Finally, check the status of the builder.  The pending tuples must
        // be freed (just above), whether or not the builder is successful.
        if (info != GrB_SUCCESS)
        {
            // out of memory in GB_builder
            GB_FREE_ALL ;
            return (info) ;
        }

        ASSERT_MATRIX_OK (T, "T = hypersparse matrix of pending tuples", GB0) ;
        ASSERT (GB_IS_HYPERSPARSE (T)) ;
        ASSERT (!GB_ZOMBIES (T)) ;
        ASSERT (!GB_JUMBLED (T)) ;
        ASSERT (!GB_PENDING (T)) ;

        tnz = GB_nnz (T) ;
        ASSERT (tnz > 0) ;
    }

    //--------------------------------------------------------------------------
    // delete zombies
    //--------------------------------------------------------------------------

    // A zombie is an entry A(i,j) in the matrix that has been marked for
    // deletion, but hasn't been deleted yet.  It is marked by "negating" its
    // index i, replacing it with GB_FLIP(i).

    // TODO: pass tnz to GB_selector, to pad the reallocated A matrix

    ASSERT_MATRIX_OK (A, "A before zombies removed", GB0) ;

    if (nzombies > 0)
    {
        // remove all zombies from A
        // GB_selector frees A->Y if it changes A->h, or leaves it
        // unmodified (and valid) otherwise.
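        // Zombie deletion is done as a select operation: GxB_NONZOMBIE keeps
        // only the entries whose index has not been flipped by GB_FLIP, so
        // all zombies are dropped in a single pass over A.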
        struct GB_Scalar_opaque Thunk_header ;
        int64_t k = 0 ;
        GrB_Scalar Thunk = GB_Scalar_wrap (&Thunk_header, GrB_INT64, &k) ;
        GB_OK (GB_selector (NULL, GxB_NONZOMBIE, false, A, Thunk, Werk)) ;
        ASSERT (A->nzombies == (anz_orig - GB_nnz (A))) ;
        A->nzombies = 0 ;
    }

    ASSERT_MATRIX_OK (A, "A after zombies removed", GB0) ;

    // all the zombies are gone, and pending tuples are now in T
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;
    ASSERT (!GB_PENDING (A)) ;

    //--------------------------------------------------------------------------
    // unjumble the matrix
    //--------------------------------------------------------------------------

    GB_OK (GB_unjumble (A, Werk)) ;

    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (!GB_JUMBLED (A)) ;
    ASSERT (!GB_PENDING (A)) ;

    //--------------------------------------------------------------------------
    // check for pending tuples
    //--------------------------------------------------------------------------

    if (npending == 0)
    {
        // conform A to its desired sparsity structure and return result
        info = GB_conform (A, Werk) ;
        ASSERT (GB_IMPLIES (info == GrB_SUCCESS, A->nvec_nonempty >= 0)) ;
        #pragma omp flush
        return (info) ;
    }

    //--------------------------------------------------------------------------
    // check for quick transplant
    //--------------------------------------------------------------------------

    int64_t anz = GB_nnz (A) ;
    if (anz == 0)
    {
        // A has no entries so just transplant T into A, then free T and
        // conform A to its desired hypersparsity.
        info = GB_transplant_conform (A, A->type, &T, Werk) ;
        ASSERT (GB_IMPLIES (info == GrB_SUCCESS, A->nvec_nonempty >= 0)) ;
        #pragma omp flush
        return (info) ;
    }

    //--------------------------------------------------------------------------
    // create the SECOND_ATYPE binary operator
    //--------------------------------------------------------------------------

    struct GB_BinaryOp_opaque op_header ;
    GrB_BinaryOp op_2nd = GB_binop_second (A->type, &op_header) ;

    //--------------------------------------------------------------------------
    // determine the method for A = A+T
    //--------------------------------------------------------------------------

    // If anz > 0, T is hypersparse, even if A is a GrB_Vector
    ASSERT (GB_IS_HYPERSPARSE (T)) ;
    ASSERT (tnz > 0) ;
    ASSERT (T->nvec > 0) ;
    ASSERT (A->nvec > 0) ;

    // tjfirst = first vector in T
    int64_t tjfirst = T->h [0] ;
    int64_t anz0 = 0 ;
    int64_t kA = 0 ;
    int64_t jlast ;

    int64_t *restrict Ap = A->p ;
    int64_t *restrict Ah = A->h ;
    int64_t *restrict Ai = A->i ;
    GB_void *restrict Ax = (GB_void *) A->x ;

    int64_t anvec = A->nvec ;

    // anz0 = nnz (A0) = nnz (A (:, 0:tjfirst-1)), the region not modified by T
    if (A->h != NULL)
    {
        // find tjfirst in A->h
        int64_t pright = anvec - 1 ;
        bool found ;
        GB_SPLIT_BINARY_SEARCH (tjfirst, A->h, kA, pright, found) ;
        // A->h [0 ... kA-1] excludes vector tjfirst.  The list
        // A->h [kA ... anvec-1] includes tjfirst.
        ASSERT (kA >= 0 && kA <= anvec) ;
        ASSERT (GB_IMPLIES (kA > 0 && kA < anvec, A->h [kA-1] < tjfirst)) ;
        ASSERT (GB_IMPLIES (found, A->h [kA] == tjfirst)) ;
        jlast = (kA > 0) ?
            A->h [kA-1] : (-1) ;
    }
    else
    {
        kA = tjfirst ;
        jlast = tjfirst - 1 ;
    }

    // anz1 = nnz (A1) = nnz (A (:, kA:end)), the region modified by T
    anz0 = A->p [kA] ;
    int64_t anz1 = anz - anz0 ;
    bool ignore ;

    // A + T will have anz_new entries
    int64_t anz_new = anz + tnz ;       // must have at least this space

    if (2 * anz1 < anz0)
    {

        //----------------------------------------------------------------------
        // append new tuples to A
        //----------------------------------------------------------------------

        // A is growing incrementally.  It splits into two parts: A = [A0 A1],
        // where A0 = A (:, 0:kA-1) and A1 = A (:, kA:end).  The first part
        // (A0, with anz0 = nnz (A0) entries) is not modified.  The second
        // part (A1, with anz1 = nnz (A1) entries) overlaps with T.  If anz1
        // is zero, or small compared to anz0, then it is faster to leave A0
        // unmodified, and to update just A1.

        // TODO: if A also had zombies, GB_selector could pad A so that
        // GB_nnz_max (A) is equal to anz + tnz.

        // make sure A has enough space for the new tuples
        if (anz_new > GB_nnz_max (A))
        {
            // double the size if not enough space
            GB_OK (GB_ix_realloc (A, 2 * anz_new)) ;
            Ai = A->i ;
            Ax = (GB_void *) A->x ;
        }

        //----------------------------------------------------------------------
        // T = A1 + T
        //----------------------------------------------------------------------

        if (anz1 > 0)
        {

            //------------------------------------------------------------------
            // extract A1 = A (:, kA:end) as a shallow copy
            //------------------------------------------------------------------

            // A1 = [0, A (:, kA:end)], hypersparse with same dimensions as A
            GB_CLEAR_STATIC_HEADER (A1, &A1_header) ;
            GB_OK (GB_new (&A1, // hyper, existing header
                A->type, A->vlen, A->vdim, GB_Ap_malloc, A->is_csc,
                GxB_HYPERSPARSE, GB_ALWAYS_HYPER, anvec - kA)) ;

            // the A1->i and A1->x contents are shallow copies of A(:,kA:end).
            // They are not allocated pointers, but point to space inside
            // Ai and Ax.
            A1->x = (void *) (Ax + (A_iso ? 0 : (asize * anz0))) ;
            A1->x_size = (A_iso ?
                1 : anz1) * asize ;
            A1->x_shallow = true ;
            A1->i = Ai + anz0 ;
            A1->i_size = anz1 * sizeof (int64_t) ;
            A1->i_shallow = true ;
            A1->iso = A_iso ;   // OK

            // fill A1->h and A1->p with A->h and A->p, shifted
            int64_t *restrict A1p = A1->p ;
            int64_t *restrict A1h = A1->h ;
            int64_t a1nvec = 0 ;
            for (int64_t k = kA ; k < anvec ; k++)
            {
                // get A (:,k)
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                if (pA_end > pA_start)
                {
                    // add this column to A1 if A (:,k) is not empty
                    int64_t j = GBH (Ah, k) ;
                    A1p [a1nvec] = pA_start - anz0 ;
                    A1h [a1nvec] = j ;
                    a1nvec++ ;
                }
            }

            // finalize A1
            A1p [a1nvec] = anz1 ;
            A1->nvec = a1nvec ;
            A1->nvec_nonempty = a1nvec ;
            A1->nvals = anz1 ;
            A1->magic = GB_MAGIC ;
            ASSERT_MATRIX_OK (A1, "A1 slice for GB_wait", GB0) ;

            //------------------------------------------------------------------
            // S = A1 + T, with no operator or mask
            //------------------------------------------------------------------

            GB_CLEAR_STATIC_HEADER (S, &S_header) ;
            GB_OK (GB_add (S, A->type, A->is_csc, NULL, 0, 0, &ignore, A1, T,
                false, NULL, NULL, op_2nd, true, Werk)) ;
            ASSERT_MATRIX_OK (S, "S = A1+T", GB0) ;

            // free A1 and T
            GB_Matrix_free (&T) ;
            GB_Matrix_free (&A1) ;

            //------------------------------------------------------------------
            // replace T with S
            //------------------------------------------------------------------

            T = S ;
            S = NULL ;
            tnz = GB_nnz (T) ;

            //------------------------------------------------------------------
            // remove A1 from the vectors of A, if A is hypersparse
            //------------------------------------------------------------------

            if (A->h != NULL)
            {
                A->nvec = kA ;
            }
        }

        //----------------------------------------------------------------------
        // append T to the end of A0
        //----------------------------------------------------------------------

        const int64_t *restrict Tp = T->p ;
        const int64_t *restrict Th = T->h ;
        const int64_t *restrict Ti = T->i ;
        int64_t tnvec = T->nvec ;

        anz = anz0 ;
        int64_t anz_last = anz ;

        int nthreads = GB_nthreads (tnz, chunk, nthreads_max) ;

        // append the indices and values of T to the end of A
        GB_memcpy (Ai + anz, Ti, tnz * sizeof (int64_t), nthreads) ;
        if (!A_iso)
        {
            const GB_void *restrict Tx = (GB_void *) T->x ;
            GB_memcpy (Ax + anz * asize, Tx, tnz * asize, nthreads) ;
        }

        // append the vectors of T to the end of A
        for (int64_t k = 0 ; k < tnvec ; k++)
        {
            int64_t j = Th [k] ;
            ASSERT (j >= tjfirst) ;
            anz += (Tp [k+1] - Tp [k]) ;
            GB_OK (GB_jappend (A, j, &jlast, anz, &anz_last, Werk)) ;
        }

        GB_jwrapup (A, jlast, anz) ;
        ASSERT (anz == anz_new) ;

        // need to recompute the # of non-empty vectors in GB_conform
        A->nvec_nonempty = -1 ;     // recomputed just below

        // A->h has been modified so A->Y is now invalid
        GB_hyper_hash_free (A) ;

        ASSERT_MATRIX_OK (A, "A after GB_wait:append", GB0) ;

        GB_Matrix_free (&T) ;

        // conform A to its desired sparsity structure
        GB_OK (GB_conform (A, Werk)) ;
        ASSERT (A->nvec_nonempty >= 0) ;

    }
    else
    {

        //----------------------------------------------------------------------
        // A = A+T
        //----------------------------------------------------------------------

        // The update is not incremental since most of A is changing.  Just do
        // a single parallel add: S=A+T, free T, and then transplant S back
        // into A.  The nzmax of A is tight, with no room for future
        // incremental growth.

        // FUTURE:: if GB_add could tolerate zombies in A, then the initial
        // prune of zombies can be skipped.

        // T->Y is not present (GB_builder does not create it).  The old A->Y
        // is still valid, if present, for the matrix A prior to adding the
        // pending tuples in T.
        // GB_add may need A->Y to compute S, but it does not compute S->Y.

        GB_CLEAR_STATIC_HEADER (S, &S_header) ;
        GB_OK (GB_add (S, A->type, A->is_csc, NULL, 0, 0, &ignore, A, T,
            false, NULL, NULL, op_2nd, true, Werk)) ;
        GB_Matrix_free (&T) ;
        ASSERT_MATRIX_OK (S, "S after GB_wait:add", GB0) ;

        if (GB_IS_HYPERSPARSE (A) && GB_IS_HYPERSPARSE (S) && A->Y != NULL
            && !A->Y_shallow && !GB_is_shallow (A->Y))
        {
            // A and S are both hypersparse, and the old A->Y exists and is
            // not shallow.  Check if S->h and A->h are identical.  If so,
            // remove A->Y from A and save it.  Then after the transplant of S
            // into A, below, if A is still hypersparse, transplant Y back
            // into A->Y.
            if (S->nvec == anvec)
            {
                // A and S have the same number of vectors.  Compare Ah and Sh.
                int64_t *restrict Ah = A->h ;
                int64_t *restrict Sh = S->h ;
                bool hsame = true ;
                int nthreads = GB_nthreads (anvec, chunk, nthreads_max) ;
                if (nthreads == 1)
                {
                    // compare Ah and Sh with a single thread
                    hsame = (memcmp (Ah, Sh, anvec * sizeof (int64_t)) == 0) ;
                }
                else
                {
                    // compare Ah and Sh with several threads
                    int ntasks = 64 * nthreads ;
                    int tid ;
                    #pragma omp parallel for num_threads(nthreads) \
                        schedule(dynamic)
                    for (tid = 0 ; tid < ntasks ; tid++)
                    {
                        int64_t kstart, kend ;
                        GB_PARTITION (kstart, kend, anvec, tid, ntasks) ;
                        bool my_hsame ;
                        GB_ATOMIC_READ
                        my_hsame = hsame ;
                        if (my_hsame)
                        {
                            // compare this task's region of Ah and Sh
                            my_hsame = (memcmp (Ah + kstart, Sh + kstart,
                                (kend - kstart) * sizeof (int64_t)) == 0) ;
                            if (!my_hsame)
                            {
                                // tell other tasks to exit early
                                GB_ATOMIC_WRITE
                                hsame = false ;
                            }
                        }
                    }
                }
                if (hsame)
                {
                    // Ah and Sh are the same, so keep A->Y
                    Y = A->Y ;
                    A->Y = NULL ;
                    A->Y_shallow = false ;
                }
            }
        }

        // transplant S into A
        GB_OK (GB_transplant_conform (A, A->type, &S, Werk)) ;
        ASSERT (A->nvec_nonempty >= 0) ;

        if (Y != NULL && GB_IS_HYPERSPARSE (A) && A->Y == NULL)
        {
            // The hyperlist of A has not changed.  A is still hypersparse,
            // and has no A->Y after the transplant/conform above.  The
            // original A->Y is valid, so transplant it back into A.
            A->Y = Y ;
            A->Y_shallow = false ;
            Y = NULL ;
        }

        ASSERT_MATRIX_OK (A, "A after GB_wait:add", GB0) ;
    }

    //--------------------------------------------------------------------------
    // flush the matrix and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORKSPACE ;
    #pragma omp flush
    return (info) ;
}
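
//------------------------------------------------------------------------------
// usage sketch (illustrative only, not part of the library)
//------------------------------------------------------------------------------

// GB_wait is an internal method; user code does not call it directly.  It is
// reached through the user-visible GrB_*_wait functions, or implicitly when an
// operation needs a fully materialized matrix.  The sketch below is a hedged
// illustration of how pending work accumulates and is then flushed, using only
// standard GraphBLAS C API calls and assuming an initialized GrB context:
//
//      GrB_Matrix A ;
//      GrB_Matrix_new (&A, GrB_FP64, 1000, 1000) ;
//
//      // each setElement leaves a pending tuple inside A
//      GrB_Matrix_setElement_FP64 (A, 3.14159, 0, 1) ;
//      GrB_Matrix_setElement_FP64 (A, 2.71828, 42, 7) ;
//
//      // force all pending work to finish: pending tuples are assembled,
//      // zombies (if any) are deleted, and jumbled vectors are sorted
//      GrB_Matrix_wait (A, GrB_MATERIALIZE) ;
//
//      GrB_Matrix_free (&A) ;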