//------------------------------------------------------------------------------
// GB_subassign_06d_template: C<A> = A
//------------------------------------------------------------------------------
// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0
//------------------------------------------------------------------------------
#undef GB_FREE_ALL
#define GB_FREE_ALL \
{ \
GB_WERK_POP (A_ek_slicing, int64_t) ; \
}
{
//--------------------------------------------------------------------------
// get the inputs
//--------------------------------------------------------------------------
#ifdef GB_JIT_KERNEL
#define Mask_struct GB_MASK_STRUCT
#define C_is_bitmap GB_C_IS_BITMAP
#define A_is_bitmap GB_A_IS_BITMAP
#define A_is_full GB_A_IS_FULL
#define A_iso GB_A_ISO
#define GB_AX_MASK(Ax,pA,asize) GB_MCAST (((GB_M_TYPE *) Ax), pA, asize)
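// GB_AX_MASK (Ax,pA,asize) evaluates the valued mask: it typecasts the entry
// of A at position pA to boolean (via GB_MCAST in the JIT case).  For the
// generic and factory cases, GB_AX_MASK is assumed to be defined by the file
// that includes this template.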
#else
const bool C_is_bitmap = GB_IS_BITMAP (C) ;
const bool A_is_bitmap = GB_IS_BITMAP (A) ;
const bool A_is_full = GB_IS_FULL (A) ;
const bool A_iso = A->iso ;
const size_t asize = A->type->size ;
#endif
ASSERT (C_is_bitmap || GB_IS_FULL (C)) ;
//--------------------------------------------------------------------------
// Parallel: slice A into equal-sized chunks
//--------------------------------------------------------------------------
GB_WERK_DECLARE (A_ek_slicing, int64_t) ;
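// anz is the number of entries held in A; when A is bitmap or full this is
// the entire A->vlen * A->vdim space, which is used as the loop bound below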
GB_A_NHELD (anz) ;
int A_ntasks, A_nthreads ;
double work = anz + A->nvec ;
chunk = 32 * chunk ; // method 06d needs a larger chunk
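// (each entry requires only a simple copy, so a larger chunk keeps small
// problems from being split across many threads)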
if (A_is_bitmap || A_is_full)
{
// no need to construct tasks
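// the bitmap/full code below partitions the anz entries evenly across
// A_nthreads threads with GB_PARTITION, so no ek-slice tasks are needed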
A_nthreads = GB_nthreads (work, chunk, nthreads_max) ;
A_ntasks = (A_nthreads == 1) ? 1 : (8 * A_nthreads) ;
}
else
{
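// GB_SLICE_MATRIX_WORK slices A by vector and entry into tasks, filling
// the A_ek_slicing workspace and setting A_ntasks and A_nthreads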
GB_SLICE_MATRIX_WORK (A, 8, work, anz) ;
}
//--------------------------------------------------------------------------
// get C and A
//--------------------------------------------------------------------------
ASSERT (!GB_ZOMBIES (A)) ;
ASSERT (GB_JUMBLED_OK (A)) ;
ASSERT (!GB_PENDING (A)) ;
const int64_t *restrict Ap = A->p ;
const int64_t *restrict Ah = A->h ;
const int64_t *restrict Ai = A->i ;
const int8_t *restrict Ab = A->b ;
const int64_t avlen = A->vlen ;
// since A is the mask, if A->iso is true, Mask_struct has been set true
ASSERT (GB_IMPLIES (A_iso, Mask_struct)) ;
int8_t *restrict Cb = C->b ;
const int64_t cvlen = C->vlen ;
#ifdef GB_ISO_ASSIGN
// C is iso, and A is either iso or effectively iso (with a single entry
// and not in bitmap form). This case is only used by GB_subassign_06d
// directly, and it is not needed for any kernel (generic, factory, or JIT).
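// In this case no numeric values are copied below; only the C bitmap
// (Cb and C->nvals) is updated, when C is bitmap.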
ASSERT (C->iso) ;
GB_A_NVALS (e) ;
ASSERT (A_iso || (e == 1 && !A_is_bitmap)) ;
ASSERT (Mask_struct) ;
#else
const GB_A_TYPE *restrict Ax = (GB_A_TYPE *) A->x ;
GB_C_TYPE *restrict Cx = (GB_C_TYPE *) C->x ;
GB_DECLAREC (cwork) ;
if (A_iso)
{
// get the iso value of A and typecast to C->type
// cwork = (ctype) Ax [0]
GB_COPY_aij_to_cwork (cwork, Ax, 0, true) ;
}
#endif
//--------------------------------------------------------------------------
// C<A> = A
//--------------------------------------------------------------------------
int64_t cnvals = C->nvals ; // for C bitmap
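// cnvals is the running count of entries in the C bitmap: each parallel
// region below accumulates a per-task count (task_cnvals), and the total is
// written back to C->nvals at the end of this method.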
// future:: divide this template into sub-templates (Mask_struct, etc.)
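// Two cases follow.  If the mask is structural, every entry present in A is
// copied into C.  Otherwise the mask is valued, and A(i,j) is copied only if
// GB_AX_MASK reports its value as true.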
if (Mask_struct)
{
//----------------------------------------------------------------------
// C<A> = A where A can be iso or non-iso; mask is structural
//----------------------------------------------------------------------
if (A_is_full)
{
//------------------------------------------------------------------
// A is full: all entries present
//------------------------------------------------------------------
#ifndef GB_ISO_ASSIGN
{
int64_t p ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static)
for (p = 0 ; p < anz ; p++)
{
// Cx [p] = Ax [p]
GB_COPY_aij_to_C (Cx, p, Ax, p, A_iso, cwork) ;
}
}
#endif
if (C_is_bitmap)
{
GB_memset (Cb, 1, anz, A_nthreads) ;
cnvals = anz ;
}
}
else if (A_is_bitmap)
{
//------------------------------------------------------------------
// A is bitmap
//------------------------------------------------------------------
if (C_is_bitmap)
{
//--------------------------------------------------------------
// C is bitmap, A is bitmap
//--------------------------------------------------------------
int tid ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static) reduction(+:cnvals)
for (tid = 0 ; tid < A_nthreads ; tid++)
{
int64_t pA_start, pA_end, task_cnvals = 0 ;
GB_PARTITION (pA_start, pA_end, anz, tid, A_nthreads) ;
for (int64_t p = pA_start ; p < pA_end ; p++)
{
if (Ab [p])
{
// Cx [p] = Ax [p]
#ifndef GB_ISO_ASSIGN
GB_COPY_aij_to_C (Cx, p, Ax, p, A_iso, cwork) ;
#endif
task_cnvals += (Cb [p] == 0) ;
Cb [p] = 1 ;
}
}
cnvals += task_cnvals ;
}
}
else
{
//--------------------------------------------------------------
// C is hypersparse, sparse, or full, with all entries present
//--------------------------------------------------------------
#ifndef GB_ISO_ASSIGN
{
// this method is used by LAGraph_bfs_parent when q is
// a bitmap and pi is full.
int64_t p ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static)
for (p = 0 ; p < anz ; p++)
{
// Cx [p] = Ax [p]
if (Ab [p])
{
GB_COPY_aij_to_C (Cx, p, Ax, p, A_iso, cwork) ;
}
}
}
#endif
}
}
else
{
//------------------------------------------------------------------
// A is hypersparse or sparse; C is full or a bitmap
//------------------------------------------------------------------
const int64_t *restrict kfirst_Aslice = A_ek_slicing ;
const int64_t *restrict klast_Aslice = A_ek_slicing + A_ntasks ;
const int64_t *restrict pstart_Aslice = A_ek_slicing + A_ntasks * 2 ;
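// The A_ek_slicing workspace holds the three arrays kfirst_Aslice,
// klast_Aslice, and pstart_Aslice, which describe the slice of A assigned
// to each task.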
int taskid ;
if (C_is_bitmap)
{
//--------------------------------------------------------------
// C is bitmap, mask is structural
//--------------------------------------------------------------
#pragma omp parallel for num_threads(A_nthreads) \
schedule(dynamic,1) reduction(+:cnvals)
for (taskid = 0 ; taskid < A_ntasks ; taskid++)
{
// if kfirst > klast then taskid does no work at all
int64_t kfirst = kfirst_Aslice [taskid] ;
int64_t klast = klast_Aslice [taskid] ;
int64_t task_cnvals = 0 ;
// C = A(:,kfirst:klast)
for (int64_t k = kfirst ; k <= klast ; k++)
{
// get A(:,j), the kth vector of A
int64_t j = GBH_A (Ah, k) ;
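// GB_GET_PA computes this task's portion [pA_start,pA_end) of A(:,j):
// interior vectors belong entirely to the task, while the task's first and
// last vectors may be shared with neighboring tasks, as recorded in
// pstart_Aslice.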
GB_GET_PA (pA_start, pA_end, taskid, k,
kfirst, klast, pstart_Aslice,
GBP_A (Ap, k, avlen), GBP_A (Ap, k+1, avlen)) ;
// pC is the start of C(:,j)
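// (C is full or bitmap, so the entry C(i,j) is at position i + j*cvlen)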
int64_t pC = j * cvlen ;
// C=A(:,j) with C bitmap, A sparse
GB_PRAGMA_SIMD_REDUCTION (+,task_cnvals)
for (int64_t pA = pA_start ; pA < pA_end ; pA++)
{
int64_t p = pC + Ai [pA] ;
// Cx [p] = Ax [pA]
#ifndef GB_ISO_ASSIGN
GB_COPY_aij_to_C (Cx, p, Ax, pA, A_iso, cwork) ;
#endif
task_cnvals += (Cb [p] == 0) ;
Cb [p] = 1 ;
}
}
cnvals += task_cnvals ;
}
}
else
{
//--------------------------------------------------------------
// C is full, mask is structural
//--------------------------------------------------------------
#ifndef GB_ISO_ASSIGN
{
#pragma omp parallel for num_threads(A_nthreads) \
schedule(dynamic,1)
for (taskid = 0 ; taskid < A_ntasks ; taskid++)
{
// if kfirst > klast then taskid does no work at all
int64_t kfirst = kfirst_Aslice [taskid] ;
int64_t klast = klast_Aslice [taskid] ;
// C = A(:,kfirst:klast)
for (int64_t k = kfirst ; k <= klast ; k++)
{
// get A(:,j), the kth vector of A
int64_t j = GBH_A (Ah, k) ;
GB_GET_PA (pA_start, pA_end, taskid, k,
kfirst, klast, pstart_Aslice,
GBP_A (Ap, k, avlen), GBP_A (Ap, k+1, avlen)) ;
// pC is the start of C(:,j)
int64_t pC = j * cvlen ;
// C=A(:,j) with C full, A sparse
GB_PRAGMA_SIMD_VECTORIZE
for (int64_t pA = pA_start ; pA < pA_end ; pA++)
{
int64_t p = pC + Ai [pA] ;
// Cx [p] = Ax [pA]
GB_COPY_aij_to_C (Cx, p, Ax, pA, A_iso, cwork) ;
}
}
}
}
#endif
}
}
}
#ifndef GB_ISO_ASSIGN
else
{
//----------------------------------------------------------------------
// C<A> = A where A must be non-iso, and the mask is valued
//----------------------------------------------------------------------
if (A_is_full)
{
//------------------------------------------------------------------
// A is full: all entries present
//------------------------------------------------------------------
if (C_is_bitmap)
{
//--------------------------------------------------------------
// C is bitmap, A is full
//--------------------------------------------------------------
int tid ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static) reduction(+:cnvals)
for (tid = 0 ; tid < A_nthreads ; tid++)
{
int64_t pA_start, pA_end, task_cnvals = 0 ;
GB_PARTITION (pA_start, pA_end, anz, tid, A_nthreads) ;
for (int64_t p = pA_start ; p < pA_end ; p++)
{
if (GB_AX_MASK (Ax, p, asize))
{
// Cx [p] = Ax [p]
GB_COPY_aij_to_C (Cx, p, Ax, p, false, cwork) ;
task_cnvals += (Cb [p] == 0) ;
Cb [p] = 1 ;
}
}
cnvals += task_cnvals ;
}
}
else
{
//--------------------------------------------------------------
// C is hypersparse, sparse, or full, with all entries present
//--------------------------------------------------------------
int64_t p ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static)
for (p = 0 ; p < anz ; p++)
{
if (GB_AX_MASK (Ax, p, asize))
{
// Cx [p] = Ax [p]
GB_COPY_aij_to_C (Cx, p, Ax, p, false, cwork) ;
}
}
}
}
else if (A_is_bitmap)
{
//------------------------------------------------------------------
// A is bitmap
//------------------------------------------------------------------
if (C_is_bitmap)
{
//--------------------------------------------------------------
// C is bitmap, A is bitmap
//--------------------------------------------------------------
int tid ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static) reduction(+:cnvals)
for (tid = 0 ; tid < A_nthreads ; tid++)
{
int64_t pA_start, pA_end, task_cnvals = 0 ;
GB_PARTITION (pA_start, pA_end, anz, tid, A_nthreads) ;
for (int64_t p = pA_start ; p < pA_end ; p++)
{
if (Ab [p] && GB_AX_MASK (Ax, p, asize))
{
// Cx [p] = Ax [p]
GB_COPY_aij_to_C (Cx, p, Ax, p, false, cwork) ;
task_cnvals += (Cb [p] == 0) ;
Cb [p] = 1 ;
}
}
cnvals += task_cnvals ;
}
}
else
{
//--------------------------------------------------------------
// C is hypersparse, sparse, or full, with all entries present
//--------------------------------------------------------------
int64_t p ;
#pragma omp parallel for num_threads(A_nthreads) \
schedule(static)
for (p = 0 ; p < anz ; p++)
{
if (Ab [p] && GB_AX_MASK (Ax, p, asize))
{
// Cx [p] = Ax [p]
GB_COPY_aij_to_C (Cx, p, Ax, p, false, cwork) ;
}
}
}
}
else
{
//------------------------------------------------------------------
// A is hypersparse or sparse; C is full or bitmap
//------------------------------------------------------------------
const int64_t *restrict kfirst_Aslice = A_ek_slicing ;
const int64_t *restrict klast_Aslice = A_ek_slicing + A_ntasks ;
const int64_t *restrict pstart_Aslice = A_ek_slicing + A_ntasks * 2 ;
int taskid ;
if (C_is_bitmap)
{
//--------------------------------------------------------------
// C is bitmap
//--------------------------------------------------------------
#pragma omp parallel for num_threads(A_nthreads) \
schedule(dynamic,1) reduction(+:cnvals)
for (taskid = 0 ; taskid < A_ntasks ; taskid++)
{
// if kfirst > klast then taskid does no work at all
int64_t kfirst = kfirst_Aslice [taskid] ;
int64_t klast = klast_Aslice [taskid] ;
int64_t task_cnvals = 0 ;
// C = A(:,kfirst:klast)
for (int64_t k = kfirst ; k <= klast ; k++)
{
// get A(:,j), the kth vector of A
int64_t j = GBH_A (Ah, k) ;
GB_GET_PA (pA_start, pA_end, taskid, k,
kfirst, klast, pstart_Aslice,
GBP_A (Ap, k, avlen), GBP_A (Ap, k+1, avlen)) ;
// pC is the start of C(:,j)
int64_t pC = j * cvlen ;
// C=A(:,j) with C bitmap, A sparse
GB_PRAGMA_SIMD_REDUCTION (+,task_cnvals)
for (int64_t pA = pA_start ; pA < pA_end ; pA++)
{
if (GB_AX_MASK (Ax, pA, asize))
{
int64_t p = pC + Ai [pA] ;
// Cx [p] = Ax [pA]
GB_COPY_aij_to_C (Cx, p, Ax, pA, A_iso, cwork) ;
task_cnvals += (Cb [p] == 0) ;
Cb [p] = 1 ;
}
}
}
cnvals += task_cnvals ;
}
}
else
{
//--------------------------------------------------------------
// C is full
//--------------------------------------------------------------
#pragma omp parallel for num_threads(A_nthreads) \
schedule(dynamic,1)
for (taskid = 0 ; taskid < A_ntasks ; taskid++)
{
// if kfirst > klast then taskid does no work at all
int64_t kfirst = kfirst_Aslice [taskid] ;
int64_t klast = klast_Aslice [taskid] ;
// C = A(:,kfirst:klast)
for (int64_t k = kfirst ; k <= klast ; k++)
{
// get A(:,j), the kth vector of A
int64_t j = GBH_A (Ah, k) ;
GB_GET_PA (pA_start, pA_end, taskid, k,
kfirst, klast, pstart_Aslice,
GBP_A (Ap, k, avlen), GBP_A (Ap, k+1, avlen)) ;
// pC is the start of C(:,j)
int64_t pC = j * cvlen ;
// C=A(:,j) with C full, A sparse
GB_PRAGMA_SIMD_VECTORIZE
for (int64_t pA = pA_start ; pA < pA_end ; pA++)
{
if (GB_AX_MASK (Ax, pA, asize))
{
int64_t p = pC + Ai [pA] ;
// Cx [p] = Ax [pA]
GB_COPY_aij_to_C (Cx, p, Ax, pA, A_iso, cwork) ;
}
}
}
}
}
}
}
#endif
//--------------------------------------------------------------------------
// log the number of entries in the C bitmap
//--------------------------------------------------------------------------
if (C_is_bitmap)
{
C->nvals = cnvals ;
}
GB_FREE_ALL ;
}
#undef GB_ISO_ASSIGN
#undef GB_FREE_ALL
#define GB_FREE_ALL ;