//------------------------------------------------------------------------------ // GB_AxB_saxpy3_coarseGus_M_phase5: C=A*B, coarse Gustavson, phase5 //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ { //-------------------------------------------------------------------------- // phase5: coarse Gustavson task, C=A*B //-------------------------------------------------------------------------- // Initially, Hf [...] < mark for all of Hf. // Hf [i] < mark : M(i,j)=0, C(i,j) is ignored. // Hf [i] == mark : M(i,j)=1, and C(i,j) not yet seen. // Hf [i] == mark+1 : M(i,j)=1, and C(i,j) has been seen. for (int64_t kk = kfirst ; kk <= klast ; kk++) { int64_t pC = Cp [kk] ; int64_t cjnz = Cp [kk+1] - pC ; if (cjnz == 0) continue ; // nothing to do GB_GET_B_j ; // get B(:,j) #ifndef GB_GENERIC if (cjnz == cvlen) // C(:,j) is dense { // This is not used for the generic saxpy3. GB_COMPUTE_DENSE_C_j ; // C(:,j) = A*B(:,j) continue ; } #endif GB_GET_M_j ; // get M(:,j) GB_GET_M_j_RANGE (64) ; // get first and last in M(:,j) mark += 2 ; int64_t mark1 = mark+1 ; // scatter M(:,j) into the Gustavson workspace GB_SCATTER_M_j (pM_start, pM_end, mark) ; if (16 * cjnz > cvlen) { //------------------------------------------------------------------ // C(:,j) is not very sparse //------------------------------------------------------------------ for ( ; pB < pB_end ; pB++) // scan B(:,j) { GB_GET_B_kj_INDEX ; // get k of B(k,j) GB_GET_A_k ; // get A(:,k) if (aknz == 0) continue ; GB_GET_B_kj ; // bkj = B(k,j) #define GB_IKJ \ { \ int64_t hf = Hf [i] ; \ if (hf == mark) \ { \ /* C(i,j) = A(i,k) * B(k,j) */ \ Hf [i] = mark1 ; /* mark as seen */ \ GB_MULT_A_ik_B_kj ; /* t = aik*bkj */ \ GB_HX_WRITE (i, t) ; /* Hx [i] = t */ \ } \ else if (hf == mark1) \ { \ /* C(i,j) += A(i,k) * B(k,j) */ \ GB_MULT_A_ik_B_kj ; /* t = aik*bkj */ \ GB_HX_UPDATE (i, t) ; /* Hx [i] += t */ \ } \ } GB_SCAN_M_j_OR_A_k (A_ok_for_binary_search) ; #undef GB_IKJ } GB_GATHER_ALL_C_j(mark1) ; // gather into C(:,j) } else { //------------------------------------------------------------------ // C(:,j) is very sparse //------------------------------------------------------------------ for ( ; pB < pB_end ; pB++) // scan B(:,j) { GB_GET_B_kj_INDEX ; // get k of B(k,j) GB_GET_A_k ; // get A(:,k) if (aknz == 0) continue ; GB_GET_B_kj ; // bkj = B(k,j) #define GB_IKJ \ { \ int64_t hf = Hf [i] ; \ if (hf == mark) \ { \ /* C(i,j) = A(i,k) * B(k,j) */ \ Hf [i] = mark1 ; /* mark as seen */ \ GB_MULT_A_ik_B_kj ; /* t = aik*bkj */ \ GB_HX_WRITE (i, t) ; /* Hx [i] = t */ \ Ci [pC++] = i ; /* C(:,j) pattern */ \ } \ else if (hf == mark1) \ { \ /* C(i,j) += A(i,k) * B(k,j) */ \ GB_MULT_A_ik_B_kj ; /* t = aik*bkj */ \ GB_HX_UPDATE (i, t) ; /* Hx [i] += t */ \ } \ } GB_SCAN_M_j_OR_A_k (A_ok_for_binary_search) ; #undef GB_IKJ } GB_SORT_AND_GATHER_C_j ; // gather into C(:,j) } } }