//------------------------------------------------------------------------------ // GB_task_cumsum: cumulative sum of Cp and fine tasks in TaskList //------------------------------------------------------------------------------ // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2023, All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 //------------------------------------------------------------------------------ // Cp is never NULL. C is created as sparse or hypersparse. #include "GB.h" void GB_task_cumsum ( int64_t *Cp, // size Cnvec+1 const int64_t Cnvec, int64_t *Cnvec_nonempty, // # of non-empty vectors in C GB_task_struct *restrict TaskList, // array of structs const int ntasks, // # of tasks const int nthreads, // # of threads GB_Werk Werk ) { //-------------------------------------------------------------------------- // check inputs //-------------------------------------------------------------------------- ASSERT (Cp != NULL) ; ASSERT (Cnvec >= 0) ; ASSERT (Cnvec_nonempty != NULL) ; ASSERT (TaskList != NULL) ; ASSERT (ntasks >= 0) ; ASSERT (nthreads >= 1) ; //-------------------------------------------------------------------------- // local cumulative sum of the fine tasks //-------------------------------------------------------------------------- for (int taskid = 0 ; taskid < ntasks ; taskid++) { int64_t k = TaskList [taskid].kfirst ; if (TaskList [taskid].klast < 0) { // Compute the sum of all fine tasks for vector k, in Cp [k]. Also // compute the cumulative sum of TaskList [taskid].pC, for the // tasks that work on vector k. The first fine task for vector k // starts with TaskList [taskid].pC = 0, which is an offset from // the final Cp [k]. A subsequent fine task t for a vector k // starts on offset of TaskList [t].pC from the start of C(:,k). // Cp [k] has not been cumsum'd across all of Cp. It is just the // count of the entries in C(:,k). The final Cp [k] is added to // each fine task below, after the GB_cumsum of Cp. int64_t pC = Cp [k] ; Cp [k] += TaskList [taskid].pC ; TaskList [taskid].pC = pC ; } } //-------------------------------------------------------------------------- // replace Cp with its cumulative sum //-------------------------------------------------------------------------- GB_cumsum (Cp, Cnvec, Cnvec_nonempty, nthreads, Werk) ; //-------------------------------------------------------------------------- // shift the cumulative sum of the fine tasks //-------------------------------------------------------------------------- for (int taskid = 0 ; taskid < ntasks ; taskid++) { int64_t k = TaskList [taskid].kfirst ; if (TaskList [taskid].klast < 0) { // TaskList [taskid].pC is currently an offset for this task into // C(:,k). The first fine task for vector k has an offset of zero, // the 2nd fine task has an offset equal to the # of entries // computed by the first task, and so on. Cp [k] needs to be added // to all offsets to get the final starting position for each fine // task in C. TaskList [taskid].pC += Cp [k] ; } else { // The last fine task to operate on vector k needs know its own // pC_end, which is Cp [k+1]. Suppose that task is taskid-1. If // this taskid is the first fine task for vector k, then TaskList // [taskid].pC is set to Cp [k] above. If all coarse tasks are // also given TaskList [taskid].pC = Cp [k], then taskid-1 will // always know its pC_end, which is TaskList [taskid].pC, // regardless of whether taskid is a fine or coarse task. TaskList [taskid].pC = Cp [k] ; } } // The last task is ntasks-1. It may be a fine task, in which case it // computes the entries in C from TaskList [ntasks-1].pC to // TaskList [ntasks].pC-1, inclusive. TaskList [ntasks].pC = Cp [Cnvec] ; //-------------------------------------------------------------------------- // check result //-------------------------------------------------------------------------- #ifdef GB_DEBUG // nthreads, ntasks, Cnvec) ; for (int t = 0 ; t < ntasks ; t++) { int64_t k = TaskList [t].kfirst ; int64_t klast = TaskList [t].klast ; if (klast < 0) { // this is a fine task for vector k int64_t pA = TaskList [t].pA ; int64_t pA_end = TaskList [t].pA_end ; int64_t pB = TaskList [t].pB ; int64_t pB_end = TaskList [t].pB_end ; int64_t pC = TaskList [t].pC ; int64_t pC_end = TaskList [t+1].pC ; int64_t pM = TaskList [t].pM ; int64_t pM_end = TaskList [t].pM_end ; ASSERT (k >= 0 && k < Cnvec) ; // pA:(pA_end-1) must reside inside A(:,j), and pB:(pB_end-1) must // reside inside B(:,j), but these cannot be checked here since A // and B are not available. These basic checks can be done: ASSERT (pA == -1 || (0 <= pA && pA <= pA_end)) ; ASSERT (pB == -1 || (0 <= pB && pB <= pB_end)) ; ASSERT (pM == -1 || (0 <= pM && pM <= pM_end)) ; // pC and pC_end can be checked exactly. This task t computes // entries pC:(pC_end-1) of C, inclusive. ASSERT (Cp [k] <= pC && pC <= pC_end && pC_end <= Cp [k+1]) ; } else { // this is a coarse task for vectors k:klast, inclusive ASSERT (k >= 0 && k < Cnvec) ; ASSERT (klast >= 0 && klast <= Cnvec) ; ASSERT (k <= klast) ; } } #endif }