//------------------------------------------------------------------------------
// CHOLMOD/GPU/t_cholmod_gpu: template for cholmod_super_numeric
//------------------------------------------------------------------------------
// CHOLMOD/GPU Module.  Copyright (C) 2005-2022, Timothy A. Davis.
// All Rights Reserved.
// SPDX-License-Identifier: GPL-2.0+
//------------------------------------------------------------------------------

/* GPU BLAS template routine for cholmod_super_numeric. */

/* ========================================================================== */
/* === include files and definitions ======================================== */
/* ========================================================================== */

#ifdef CHOLMOD_HAS_CUDA

#include <string.h>
#include "cholmod_template.h"
#include "cholmod_gpu_kernels.h"
#include <fenv.h>
#include <cuda.h>
#include <cuda_runtime.h>

#undef L_ENTRY
#ifdef REAL
#define L_ENTRY 1
#else
#define L_ENTRY 2
#endif

/* ========================================================================== */
/* === gpu_clear_memory ===================================================== */
/* ========================================================================== */
/*
 * Ensure that Lx is zeroed before the factor is formed.  This is a
 * significant cost in the GPU case, so this parallel memset code is used
 * for efficiency.
 */

void TEMPLATE2 (CHOLMOD (gpu_clear_memory))
(
    double* buff,
    size_t size,
    int num_threads
)
{
    int chunk_multiplier = 5;
    int num_chunks = chunk_multiplier * num_threads;
    size_t chunksize = size / num_chunks;
    int i ;

    #pragma omp parallel for num_threads(num_threads) schedule(dynamic)
    for (i = 0 ; i < num_chunks ; i++)
    {
        size_t chunkoffset = i * chunksize;
        if (i == num_chunks - 1)
        {
            memset (buff + chunkoffset, 0,
                (size - chunksize*(num_chunks - 1)) * sizeof(double)) ;
        }
        else
        {
            memset (buff + chunkoffset, 0, chunksize * sizeof(double)) ;
        }
    }
}

/* ========================================================================== */
/* === gpu_init ============================================================= */
/* ========================================================================== */
/*
 * Performs the required initialization for GPU computing.
 *
 * Returns 0 if there is an error, so the intended use is
 *
 *     useGPU = CHOLMOD(gpu_init)
 *
 * which locally turns off GPU processing if the initialization failed.
 */

int TEMPLATE2 (CHOLMOD (gpu_init))
(
    void *Cwork,
    cholmod_factor *L,
    cholmod_common *Common,
    int64_t nsuper,
    int64_t n,
    int64_t nls,
    cholmod_gpu_pointers *gpu_p
)
{
    int64_t i, k, maxSize ;
    cublasStatus_t cublasError ;
    cudaError_t cudaErr ;
    size_t maxBytesSize, HostPinnedSize ;

    // feenableexcept (FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW );

    maxSize = L->maxcsize;

    /* #define PAGE_SIZE (4*1024) */
    CHOLMOD_GPU_PRINTF (("gpu_init : %p\n",
        (void *) ((size_t) Cwork & ~(4*1024-1)))) ;

    /* make sure the assumed buffer sizes are large enough */
    if ( (nls+2*n+4)*sizeof(int64_t) > Common->devBuffSize )
    {
        ERROR (CHOLMOD_GPU_PROBLEM,"\n\n"
            "GPU Memory allocation error.  Ls, Map and RelativeMap exceed\n"
            "devBuffSize.  It is not clear if this is due to insufficient\n"
            "device or host memory or both.  You can try:\n"
            "     1) increasing the amount of GPU memory requested\n"
            "     2) reducing CHOLMOD_NUM_HOST_BUFFERS\n"
            "     3) using a GPU & host with more memory\n"
            "This issue is a known limitation and should be fixed in a\n"
            "future release of CHOLMOD.\n") ;
        return (0) ;
    }
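    /* A sketch of the dev_mempool layout assumed by the pointer assignments
     * below: six slabs of Common->devBuffSize bytes each, named by the
     * gpu_p fields they are assigned to.
     *
     *   slab 0: d_Lx[0]     slab 1: d_Lx[1]     slab 2: d_C
     *   slab 3: d_A[0]      slab 4: d_A[1]
     *   slab 5: d_Ls ((nls+1) int64_t's), then d_Map ((n+1) int64_t's),
     *           then d_RelativeMap
     *
     * The size check above ((nls+2*n+4)*sizeof(int64_t) <= devBuffSize)
     * guarantees that slab 5 can hold Ls, Map, and RelativeMap together.
     */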
    /* divvy up the memory in dev_mempool */
    // these d_* are (double *) pointers, so add an explicit typecast:
    gpu_p->d_Lx[0] = (double *) (Common->dev_mempool) ;
    gpu_p->d_Lx[1] = (double *) ((char*)Common->dev_mempool +
        Common->devBuffSize) ;
    gpu_p->d_C = (double *) ((char*)Common->dev_mempool +
        2 * Common->devBuffSize) ;
    gpu_p->d_A[0] = (double *) ((char*)Common->dev_mempool +
        3 * Common->devBuffSize) ;
    gpu_p->d_A[1] = (double *) ((char*)Common->dev_mempool +
        4 * Common->devBuffSize) ;
    // these d_* pointers are (void *):
    gpu_p->d_Ls = (char*)Common->dev_mempool + 5 * Common->devBuffSize;
    gpu_p->d_Map = (char*)gpu_p->d_Ls + (nls + 1) * sizeof(int64_t);
    gpu_p->d_RelativeMap = (char*)gpu_p->d_Map + (n + 1) * sizeof(int64_t) ;

    /* Copy all of the Ls and Lpi data to the device.  If any supernodes are
     * to be computed on the device then this will be needed, so might as
     * well do it now.  */
    cudaErr = cudaMemcpy ( gpu_p->d_Ls, L->s, nls*sizeof(int64_t),
        cudaMemcpyHostToDevice );
    CHOLMOD_HANDLE_CUDA_ERROR(cudaErr,"cudaMemcpy(d_Ls)");

    //--------------------------------------------------------------------------
    // create the cublasHandle and streams
    //--------------------------------------------------------------------------

    if (!CHOLMOD (gpu_start) (Common))
    {
        return (0) ;
    }

    //--------------------------------------------------------------------------

    gpu_p->h_Lx[0] = (double*)(Common->host_pinned_mempool);
    for ( k=1; k<CHOLMOD_HOST_SUPERNODE_BUFFERS; k++ )
    {
        gpu_p->h_Lx[k] = (double*)((char *)(Common->host_pinned_mempool) +
            k*Common->devBuffSize);
    }

    return (1);  /* initialization successful, useGPU = 1 */
}

#if defined(NCOMPAR_FN_T)
typedef int (*__compar_fn_t) (const void *, const void *) ;
#endif

/* ========================================================================== */
/* === gpu_reorder_descendants ============================================== */
/* ========================================================================== */
/* Reorder the descendant supernodes as follows:
 *   1st - descendant supernodes eligible for processing on the GPU,
 *         in increasing (by flops) order
 *   2nd - supernodes whose processing is to remain on the CPU,
 *         in any order
 *
 * All of the GPU-eligible supernodes will be scheduled first.  All
 * CPU-eligible descendants will overlap with the last (largest)
 * CHOLMOD_HOST_SUPERNODE_BUFFERS GPU-eligible descendants.
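 *
 * The partition is driven by a per-descendant score; a minimal sketch of
 * the rule used in the loop below, for reference:
 *
 *     score = ndrow2 * ndcol ;                     // rough flop count
 *     if (ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT &&
 *         ndcol*L_ENTRY  >= CHOLMOD_ND_COL_LIMIT)
 *     {
 *         score += Common->devBuffSize ;           // GPU-eligible
 *     }
 *
 * The devBuffSize boost keeps every GPU-eligible descendant on one side of
 * the sorted order, so the two groups never interleave.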
*/ void TEMPLATE2 (CHOLMOD (gpu_reorder_descendants)) ( cholmod_common *Common, int64_t *Super, int64_t *locals, int64_t *Lpi, int64_t *Lpos, int64_t *Head, int64_t *Next, int64_t *Previous, int64_t *ndescendants, int64_t *tail, int64_t *mapCreatedOnGpu, cholmod_gpu_pointers *gpu_p ) { int64_t prevd, nextd, firstcpu, d, kd1, kd2, ndcol, pdi, pdend, pdi1; int64_t dnext, ndrow2, p; int64_t n_descendant = 0; double score; /* use h_Lx[0] to buffer the GPU-eligible descendants */ struct cholmod_descendant_score_t* scores = (struct cholmod_descendant_score_t*) gpu_p->h_Lx[0]; double cpuref = 0.0; d = Head[*locals]; prevd = -1; firstcpu = -1; *mapCreatedOnGpu = 0; while ( d != EMPTY ) { /* Get the parameters for the current descendant supernode */ kd1 = Super [d] ; /* d contains cols kd1 to kd2-1 of L */ kd2 = Super [d+1] ; ndcol = kd2 - kd1 ; /* # of columns in all of d */ pdi = Lpi [d] ; /* pointer to first row of d in Ls */ pdend = Lpi [d+1] ; /* pointer just past last row of d in Ls */ p = Lpos [d] ; /* offset of 1st row of d affecting s */ pdi1 = pdi + p ; /* ptr to 1st row of d affecting s in Ls */ ndrow2 = pdend - pdi1; nextd = Next[d]; /* compute a rough flops 'score' for this descendant supernode */ score = ndrow2 * ndcol; if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT && ndcol*L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) { score += Common->devBuffSize; } /* place in sort buffer */ scores[n_descendant].score = score; scores[n_descendant].d = d; n_descendant++; d = nextd; } /* Sort the GPU-eligible supernodes */ qsort ( scores, n_descendant, sizeof(struct cholmod_descendant_score_t), (__compar_fn_t) CHOLMOD(score_comp) ); /* Place sorted data back in descendant supernode linked list*/ if ( n_descendant > 0 ) { Head[*locals] = scores[0].d; if ( n_descendant > 1 ) { #ifdef _OPENMP int nthreads = cholmod_nthreads ((double) n_descendant, Common) ; #endif int64_t k ; #pragma omp parallel for num_threads(nthreads) \ if (n_descendant > 64) for (k = 1 ; k < n_descendant ; k++) { Next[scores[k-1].d] = scores[k].d; } } Next[scores[n_descendant-1].d] = firstcpu; } /* reverse the first CHOLMOD_HOST_SUPERNODE_BUFFERS to better hide PCIe communications */ if ( Head[*locals] != EMPTY && Next[Head[*locals]] != EMPTY ) { int64_t previousd = Head[*locals]; d = Next[Head[*locals]]; int64_t nreverse = 1; while ( d!=EMPTY && nreverse < CHOLMOD_HOST_SUPERNODE_BUFFERS ) { kd1 = Super [d] ; /* d contains cols kd1 to kd2-1 of L */ kd2 = Super [d+1] ; ndcol = kd2 - kd1 ; /* # of columns in all of d */ pdi = Lpi [d] ; /* pointer to first row of d in Ls */ pdend = Lpi [d+1] ; /* pointer just past last row of d in Ls */ p = Lpos [d] ; /* offset of 1st row of d affecting s */ pdi1 = pdi + p ; /* ptr to 1st row of d affecting s in Ls */ ndrow2 = pdend - pdi1; nextd = Next[d]; nreverse++; if ( ndrow2*L_ENTRY >= CHOLMOD_ND_ROW_LIMIT && ndcol*L_ENTRY >= CHOLMOD_ND_COL_LIMIT ) { /* place this supernode at the front of the list */ Next[previousd] = Next[d]; Next[d] = Head[*locals]; Head[*locals] = d; } else { previousd = d; } d = nextd; } } /* create a 'previous' list so we can traverse backwards */ *ndescendants = 0; if ( Head[*locals] != EMPTY ) { Previous[Head[*locals]] = EMPTY; for (d = Head [*locals] ; d != EMPTY ; d = dnext) { (*ndescendants)++; dnext = Next[d]; if ( dnext != EMPTY ) { Previous[dnext] = d; } else { *tail = d; } } } return; } /* ========================================================================== */ /* === gpu_initialize_supernode ============================================= */ /* 
========================================================================== */

/* Initialize the supernode on the device: clear the supernode assembly
 * buffer d_A[0], and create the Map for the current supernode on the
 * device. */

void TEMPLATE2 (CHOLMOD (gpu_initialize_supernode))
(
    cholmod_common *Common,
    int64_t nscol,
    int64_t nsrow,
    int64_t psi,
    cholmod_gpu_pointers *gpu_p
)
{
    cudaError_t cuErr;

    /* initialize the device supernode assembly memory to zero */
    cuErr = cudaMemset ( gpu_p->d_A[0], 0, nscol*nsrow*L_ENTRY*sizeof(double) );
    CHOLMOD_HANDLE_CUDA_ERROR(cuErr,"cudaMemset(d_A)");

    /* Create the Map on the device */
    createMapOnDevice ( (int64_t *)(gpu_p->d_Map),
        (int64_t *)(gpu_p->d_Ls), psi, nsrow );

    return;
}

/* ========================================================================== */
/* === gpu_updateC ========================================================== */
/* ========================================================================== */

/* C = L (k1:n-1, kd1:kd2-1) * L (k1:k2-1, kd1:kd2-1)', except that k1:n-1
 * refers to all of the rows in L, but many of the rows are all zero.
 * Supernode d holds columns kd1 to kd2-1 of L.  Nonzero rows in the range
 * k1:k2-1 are in the list Ls [pdi1 ... pdi2-1], of size ndrow1.  Nonzero rows
 * in the range k2:n-1 are in the list Ls [pdi2 ... pdend], of size ndrow2.
 * Let L1 = L (Ls [pdi1 ... pdi2-1], kd1:kd2-1), and let L2 = L (Ls [pdi2 ...
 * pdend], kd1:kd2-1).  C is ndrow2-by-ndrow1.  Let C1 be the first ndrow1
 * rows of C and let C2 be the last ndrow2-ndrow1 rows of C.  Only the lower
 * triangular part of C1 needs to be computed since C1 is symmetric.
 *
 * UpdateC is completely asynchronous w.r.t. the GPU.  Once the input buffer
 * d_Lx[] has been filled, all of the device operations are issued, and the
 * host can continue with filling the next input buffer, or start processing
 * all of the descendant supernodes which are not eligible for processing on
 * the device (since they are too small and will not fill the device).
 */
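/* A small picture of the update computed below (illustrative only; the
 * dimensions follow the description above):
 *
 *          C (ndrow2-by-ndrow1)
 *        +------------------+
 *    C1  |  lower triangle  |   <- SYRK/HERK:  C1 = L1*L1'  (ndrow1-by-ndrow1)
 *        +------------------+
 *    C2  |    full block    |   <- GEMM:       C2 = L2*L1'  (ndrow3-by-ndrow1,
 *        +------------------+                  with ndrow3 = ndrow2-ndrow1)
 *
 * C is then scattered into the supernode assembly buffer d_A[0] on the
 * device through the RelativeMap (addUpdateOnDevice or
 * addComplexUpdateOnDevice).
 */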
int TEMPLATE2 (CHOLMOD (gpu_updateC))
(
    int64_t ndrow1,     /* C is ndrow2-by-ndrow1 */
    int64_t ndrow2,
    int64_t ndrow,      /* leading dimension of Lx */
    int64_t ndcol,      /* L1 is ndrow1-by-ndcol */
    int64_t nsrow,
    int64_t pdx1,       /* L1 starts at Lx + L_ENTRY*pdx1 */
                        /* L2 starts at Lx + L_ENTRY*(pdx1 + ndrow1) */
    int64_t pdi1,
    double *Lx,
    double *C,
    cholmod_common *Common,
    cholmod_gpu_pointers *gpu_p
)
{
    double *devPtrLx, *devPtrC ;
    double alpha, beta ;
    cublasStatus_t cublasStatus ;
    cudaError_t cudaStat [2] ;
    int64_t ndrow3 ;
    int iHostBuff, iDevBuff ;

#ifdef BLAS_TIMER
    double tstart = 0;
#endif

    if ((ndrow2*L_ENTRY < CHOLMOD_ND_ROW_LIMIT) ||
        (ndcol*L_ENTRY < CHOLMOD_ND_COL_LIMIT))
    {
        /* too small for the CUDA BLAS; use the CPU instead */
        return (0) ;
    }

    ndrow3 = ndrow2 - ndrow1 ;

#ifdef BLAS_TIMER
    Common->syrkStart = SUITESPARSE_TIME ;
    Common->CHOLMOD_GPU_SYRK_CALLS++ ;
#endif

    /* ---------------------------------------------------------------------- */
    /* allocate workspace on the GPU */
    /* ---------------------------------------------------------------------- */

    iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS;
    iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS;

    /* cycle the device Lx buffer, d_Lx, through CHOLMOD_DEVICE_STREAMS,
       usually 2, so we can overlap the copy of this descendant supernode
       with the compute of the previous descendant supernode */
    devPtrLx = (double *)(gpu_p->d_Lx[iDevBuff]);

    /* very little overlap between kernels for different descendant
       supernodes (since we enforce that the supernodes must be large enough
       to fill the device), so we only need one C buffer */
    devPtrC = (double *)(gpu_p->d_C);

    /* ---------------------------------------------------------------------- */
    /* copy Lx to the GPU */
    /* ---------------------------------------------------------------------- */

#ifdef _OPENMP
    double work = (double) ndcol * (double) ndrow2 * L_ENTRY ;
    int nthreads = cholmod_nthreads (work, Common) ;
#endif

    /* copy host data to the pinned buffer first for better H2D bandwidth */
    int64_t icol ;
    #pragma omp parallel for num_threads(nthreads) if (ndcol > 32)
    for (icol = 0 ; icol < ndcol ; icol++)
    {
        for (int64_t irow = 0 ; irow < ndrow2*L_ENTRY ; irow++)
        {
            gpu_p->h_Lx[iHostBuff][icol*ndrow2*L_ENTRY+irow] =
                Lx[pdx1*L_ENTRY+icol*ndrow*L_ENTRY + irow];
        }
    }

    cudaStat[0] = cudaMemcpyAsync ( devPtrLx,
        gpu_p->h_Lx[iHostBuff],
        ndrow2*ndcol*L_ENTRY*sizeof(devPtrLx[0]),
        cudaMemcpyHostToDevice,
        Common->gpuStream[iDevBuff] );

    if ( cudaStat[0] )
    {
        CHOLMOD_GPU_PRINTF ((" ERROR cudaMemcpyAsync = %d \n", cudaStat[0]));
        return (0);
    }

    /* make the current stream wait for kernels in previous streams */
    cudaStreamWaitEvent ( Common->gpuStream[iDevBuff],
        Common->updateCKernelsComplete, 0 ) ;

    /* ---------------------------------------------------------------------- */
    /* create the relative map for this descendant supernode */
    /* ---------------------------------------------------------------------- */

    createRelativeMapOnDevice ( (int64_t *)(gpu_p->d_Map),
        (int64_t *)(gpu_p->d_Ls),
        (int64_t *)(gpu_p->d_RelativeMap),
        pdi1, ndrow2,
        &(Common->gpuStream[iDevBuff]) );

    /* ---------------------------------------------------------------------- */
    /* do the CUDA SYRK */
    /* ---------------------------------------------------------------------- */

    // printf ("cublasHandle %p\n", Common->cublasHandle) ;
    // printf ("stream %p\n", Common->gpuStream [iDevBuff]) ;

    cublasStatus = cublasSetStream (Common->cublasHandle,
        Common->gpuStream[iDevBuff]) ;
    if (cublasStatus != CUBLAS_STATUS_SUCCESS)
    {
        ERROR (CHOLMOD_GPU_PROBLEM, "GPU
CUBLAS stream") ; } alpha = 1.0 ; beta = 0.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol */ &alpha, /* ALPHA: 1 */ devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ devPtrC, ndrow2) ; /* C, LDC: C1 */ #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, (int) ndrow1, (int) ndcol, /* N, K: L1 is ndrow1-by-ndcol*/ &alpha, /* ALPHA: 1 */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* A, LDA: L1, ndrow2 */ &beta, /* BETA: 0 */ (cuDoubleComplex *) devPtrC, ndrow2) ; /* C, LDC: C1 */ #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } #ifdef BLAS_TIMER Common->CHOLMOD_GPU_SYRK_TIME += SUITESPARSE_TIME - Common->syrkStart; #endif /* ---------------------------------------------------------------------- */ /* compute remaining (ndrow2-ndrow1)-by-ndrow1 block of C, C2 = L2*L1' */ /* ---------------------------------------------------------------------- */ #ifdef BLAS_TIMER Common->CHOLMOD_GPU_GEMM_CALLS++ ; tstart = SUITESPARSE_TIME; #endif if (ndrow3 > 0) { #ifndef REAL cuDoubleComplex calpha = {1.0,0.0} ; cuDoubleComplex cbeta = {0.0,0.0} ; #endif /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ #ifdef REAL alpha = 1.0 ; beta = 0.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, ndrow3, ndrow1, ndcol, /* M, N, K */ &alpha, /* ALPHA: 1 */ devPtrLx + L_ENTRY*(ndrow1), /* A, LDA: L2*/ ndrow2, /* ndrow */ devPtrLx, /* B, LDB: L1 */ ndrow2, /* ndrow */ &beta, /* BETA: 0 */ devPtrC + L_ENTRY*ndrow1, /* C, LDC: C2 */ ndrow2) ; #else cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, ndrow3, ndrow1, ndcol, /* M, N, K */ &calpha, /* ALPHA: 1 */ (const cuDoubleComplex*) devPtrLx + ndrow1, ndrow2, /* ndrow */ (const cuDoubleComplex *) devPtrLx, ndrow2, /* ndrow */ &cbeta, /* BETA: 0 */ (cuDoubleComplex *)devPtrC + ndrow1, ndrow2) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } #ifdef BLAS_TIMER Common->CHOLMOD_GPU_GEMM_TIME += SUITESPARSE_TIME - tstart; #endif /* ------------------------------------------------------------------ */ /* Assemble the update C on the device using the d_RelativeMap */ /* ------------------------------------------------------------------ */ #ifdef REAL addUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #else addComplexUpdateOnDevice ( gpu_p->d_A[0], devPtrC, gpu_p->d_RelativeMap, ndrow1, ndrow2, nsrow, &(Common->gpuStream[iDevBuff]) ); #endif /* Record an event indicating that kernels for this descendant are complete */ cudaEventRecord ( Common->updateCKernelsComplete, Common->gpuStream[iDevBuff]); cudaEventRecord ( Common->updateCBuffersFree[iHostBuff], Common->gpuStream[iDevBuff]); return (1) ; } /* ========================================================================== */ /* === gpu_final_assembly =================================================== */ /* ========================================================================== */ /* If the supernode was assembled on both the CPU and the GPU, this will * complete the supernode assembly on both the GPU and CPU. 
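 *
 * A rough outline of the two paths taken below (descriptive only):
 *
 *   - large supernode (nscol*L_ENTRY >= CHOLMOD_POTRF_LIMIT): the portion of
 *     the update assembled on the CPU is staged in a pinned host buffer and
 *     copied H2D into d_A[1]; the GPU-assembled updates in d_A[0] are copied
 *     D2H; the two are combined so that the fully assembled supernode ends
 *     up in a pinned host buffer, ready for gpu_lower_potrf.
 *
 *   - smaller supernode: the GPU-assembled updates are copied D2H and
 *     subtracted directly from Lx on the host, and the factorization of this
 *     supernode proceeds on the CPU.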
*/ void TEMPLATE2 (CHOLMOD (gpu_final_assembly)) ( cholmod_common *Common, double *Lx, int64_t psx, int64_t nscol, int64_t nsrow, int supernodeUsedGPU, int *iHostBuff, int *iDevBuff, cholmod_gpu_pointers *gpu_p ) { int64_t iHostBuff2 ; int64_t iDevBuff2 ; if ( supernodeUsedGPU ) { /* ------------------------------------------------------------------ */ /* Apply all of the Shur-complement updates, computed on the gpu, to */ /* the supernode. */ /* ------------------------------------------------------------------ */ *iHostBuff = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; *iDevBuff = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) { /* If this supernode is going to be factored using the GPU (potrf) * then it will need the portion of the update assembled ont the * CPU. So copy that to a pinned buffer an H2D copy to device. */ /* wait until a buffer is free */ cudaEventSynchronize ( Common->updateCBuffersFree[*iHostBuff] ); /* copy update assembled on CPU to a pinned buffer */ #ifdef _OPENMP double work = (double) nscol * (double) nsrow * L_ENTRY ; int nthreads = cholmod_nthreads (work, Common) ; #endif int64_t j ; #pragma omp parallel for num_threads(nthreads) \ if (nscol > 32) for (j = 0 ; j < nscol ; j++) { for (int64_t i = j ; i < nsrow*L_ENTRY ; i++) { int64_t iidx = j*nsrow*L_ENTRY+i; gpu_p->h_Lx[*iHostBuff][iidx] = Lx[psx*L_ENTRY+iidx]; } } /* H2D transfer of update assembled on CPU */ cudaMemcpyAsync ( gpu_p->d_A[1], gpu_p->h_Lx[*iHostBuff], nscol*nsrow*L_ENTRY*sizeof(double), cudaMemcpyHostToDevice, Common->gpuStream[*iDevBuff] ); } Common->ibuffer++; iHostBuff2 = (Common->ibuffer)%CHOLMOD_HOST_SUPERNODE_BUFFERS; iDevBuff2 = (Common->ibuffer)%CHOLMOD_DEVICE_STREAMS; /* wait for all kernels to complete */ cudaEventSynchronize( Common->updateCKernelsComplete ); /* copy assembled Schur-complement updates computed on GPU */ cudaMemcpyAsync ( gpu_p->h_Lx[iHostBuff2], gpu_p->d_A[0], nscol*nsrow*L_ENTRY*sizeof(double), cudaMemcpyDeviceToHost, Common->gpuStream[iDevBuff2] ); if ( nscol * L_ENTRY >= CHOLMOD_POTRF_LIMIT ) { /* with the current implementation, potrf still uses data from the * CPU - so put the fully assembled supernode in a pinned buffer for * fastest access */ /* need both H2D and D2H copies to be complete */ cudaDeviceSynchronize(); /* sum updates from cpu and device on device */ #ifdef REAL sumAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol ); #else sumComplexAOnDevice ( gpu_p->d_A[1], gpu_p->d_A[0], -1.0, nsrow, nscol ); #endif /* place final assembled supernode in pinned buffer */ #ifdef _OPENMP double work = (double) nscol * (double) nscol * L_ENTRY ; int nthreads = cholmod_nthreads (work, Common) ; #endif int64_t j ; #pragma omp parallel for num_threads(nthreads) \ if (nscol > 32) for (j = 0 ; j < nscol ; j++) { for (int64_t i = j*L_ENTRY; i < nscol*L_ENTRY ; i++) { int64_t iidx = j*nsrow*L_ENTRY+i; gpu_p->h_Lx[*iHostBuff][iidx] -= gpu_p->h_Lx[iHostBuff2][iidx]; } } } else { /* assemble with CPU updates */ cudaDeviceSynchronize(); #ifdef _OPENMP double work = (double) nscol * (double) nsrow * L_ENTRY ; int nthreads = cholmod_nthreads (work, Common) ; #endif int64_t j ; #pragma omp parallel for num_threads(nthreads) \ if (nscol > 32) for (j = 0 ; j < nscol ; j++) { for (int64_t i = j*L_ENTRY ; i < nsrow*L_ENTRY ; i++) { int64_t iidx = j*nsrow*L_ENTRY+i; Lx[psx*L_ENTRY+iidx] -= gpu_p->h_Lx[iHostBuff2][iidx]; } } } } return; } /* ========================================================================== */ /* === 
gpu_lower_potrf ====================================================== */ /* ========================================================================== */ /* Cholesky factorzation (dpotrf) of a matrix S, operating on the lower * triangular part only. S is nscol2-by-nscol2 with leading dimension nsrow. * * S is the top part of the supernode (the lower triangular matrx). * This function also copies the bottom rectangular part of the supernode (B) * onto the GPU, in preparation for gpu_triangular_solve. */ /* * On entry, d_A[1] contains the fully assembled supernode */ int TEMPLATE2 (CHOLMOD (gpu_lower_potrf)) ( int64_t nscol2, /* S is nscol2-by-nscol2 */ int64_t nsrow, /* leading dimension of S */ int64_t psx, /* S is located at Lx + L_ENTRY*psx */ double *Lx, /* contains S; overwritten with Cholesky factor */ int64_t *info, /* BLAS info return value */ cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrA, *devPtrB, *A ; double alpha, beta ; cudaError_t cudaStat ; cublasStatus_t cublasStatus ; int64_t j, nsrow2, nb, n, gpu_lda, lda, gpu_ldb ; int ilda, ijb ; int64_t iinfo ; #ifdef BLAS_TIMER double tstart ; #endif if (nscol2 * L_ENTRY < CHOLMOD_POTRF_LIMIT) { /* too small for the CUDA BLAS; use the CPU instead */ return (0) ; } #ifdef BLAS_TIMER tstart = SUITESPARSE_TIME ; Common->CHOLMOD_GPU_POTRF_CALLS++ ; #endif nsrow2 = nsrow - nscol2 ; /* ---------------------------------------------------------------------- */ /* heuristic to get the block size depending of the problem size */ /* ---------------------------------------------------------------------- */ nb = 128 ; if (nscol2 > 4096) nb = 256 ; if (nscol2 > 8192) nb = 384 ; n = nscol2 ; gpu_lda = ((nscol2+31)/32)*32 ; lda = nsrow ; A = gpu_p->h_Lx[(Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1)% CHOLMOD_HOST_SUPERNODE_BUFFERS]; /* ---------------------------------------------------------------------- */ /* determine the GPU leading dimension of B */ /* ---------------------------------------------------------------------- */ gpu_ldb = 0 ; if (nsrow2 > 0) { gpu_ldb = ((nsrow2+31)/32)*32 ; } /* ---------------------------------------------------------------------- */ /* remember where device memory is, to be used by triangular solve later */ /* ---------------------------------------------------------------------- */ devPtrA = gpu_p->d_Lx[0]; devPtrB = gpu_p->d_Lx[1]; /* ---------------------------------------------------------------------- */ /* copy A from device to device */ /* ---------------------------------------------------------------------- */ cudaStat = cudaMemcpy2DAsync ( devPtrA, gpu_lda * L_ENTRY * sizeof (devPtrA[0]), gpu_p->d_A[1], nsrow * L_ENTRY * sizeof (Lx[0]), nscol2 * L_ENTRY * sizeof (devPtrA[0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0] ); if ( cudaStat ) { ERROR ( CHOLMOD_GPU_PROBLEM, "GPU memcopy device to device"); } /* ---------------------------------------------------------------------- */ /* copy B in advance, for gpu_triangular_solve */ /* ---------------------------------------------------------------------- */ if (nsrow2 > 0) { cudaStat = cudaMemcpy2DAsync (devPtrB, gpu_ldb * L_ENTRY * sizeof (devPtrB [0]), gpu_p->d_A[1] + L_ENTRY*nscol2, nsrow * L_ENTRY * sizeof (Lx [0]), nsrow2 * L_ENTRY * sizeof (devPtrB [0]), nscol2, cudaMemcpyDeviceToDevice, Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } /* ------------------------------------------------------------------ */ /* define the dpotrf stream */ /* 
------------------------------------------------------------------ */ cublasStatus = cublasSetStream (Common->cublasHandle, Common->gpuStream [0]) ; if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream") ; } /* ---------------------------------------------------------------------- */ /* block Cholesky factorization of S */ /* ---------------------------------------------------------------------- */ for (j = 0 ; j < n ; j += nb) { int64_t jb = nb < (n-j) ? nb : (n-j) ; /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dsyrk */ /* ------------------------------------------------------------------ */ alpha = -1.0 ; beta = 1.0 ; #ifdef REAL cublasStatus = cublasDsyrk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, devPtrA + j, gpu_lda, &beta, devPtrA + j + j*gpu_lda, gpu_lda) ; #else cublasStatus = cublasZherk (Common->cublasHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, jb, j, &alpha, (cuDoubleComplex*)devPtrA + j, gpu_lda, &beta, (cuDoubleComplex*)devPtrA + j + j*gpu_lda, gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* ------------------------------------------------------------------ */ cudaStat = cudaEventRecord (Common->cublasEventPotrf [0], Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream [1], Common->cublasEventPotrf [0], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } /* ------------------------------------------------------------------ */ /* copy back the jb columns on two different streams */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + j*lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)*jb, jb, cudaMemcpyDeviceToHost, Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dgemm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = -1.0 ; beta = 1.0 ; cublasStatus = cublasDgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T, (n-j-jb), jb, j, &alpha, devPtrA + (j+jb), gpu_lda, devPtrA + (j) , gpu_lda, &beta, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {-1.0,0.0} ; cuDoubleComplex cbeta = { 1.0,0.0} ; cublasStatus = cublasZgemm (Common->cublasHandle, CUBLAS_OP_N, CUBLAS_OP_C, (n-j-jb), jb, j, &calpha, (cuDoubleComplex*)devPtrA + (j+jb), gpu_lda, (cuDoubleComplex*)devPtrA + (j), gpu_lda, &cbeta, (cuDoubleComplex*)devPtrA + (j+jb + j*gpu_lda), gpu_lda ) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } } cudaStat = cudaStreamSynchronize (Common->gpuStream [1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* compute the Cholesky factorization of the jbxjb block on the CPU */ /* ------------------------------------------------------------------ */ ilda = (int) lda ; ijb = jb ; bool ok = true ; #ifdef REAL SUITESPARSE_LAPACK_dpotrf ("L", ijb, A + L_ENTRY * (j + j*lda), ilda, iinfo, ok) ; #else SUITESPARSE_LAPACK_zpotrf ("L", ijb, A + 
L_ENTRY * (j + j*lda), ilda, iinfo, ok) ; #endif *info = iinfo ; if (!ok) { ERROR (CHOLMOD_TOO_LARGE, "problem too large for the BLAS") ; } if (*info != 0) { *info = *info + j ; break ; } /* ------------------------------------------------------------------ */ /* copy the result back to the GPU */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync (devPtrA + L_ENTRY*(j + j*gpu_lda), gpu_lda * L_ENTRY * sizeof (double), A + L_ENTRY * (j + j*lda), lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double) * jb, jb, cudaMemcpyHostToDevice, Common->gpuStream [0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } /* ------------------------------------------------------------------ */ /* do the CUDA BLAS dtrsm */ /* ------------------------------------------------------------------ */ if ((j+jb) < n) { #ifdef REAL alpha = 1.0 ; cublasStatus = cublasDtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &alpha, devPtrA + (j + j*gpu_lda), gpu_lda, devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #else cuDoubleComplex calpha = {1.0,0.0}; cublasStatus = cublasZtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, (n-j-jb), jb, &calpha, (cuDoubleComplex *)devPtrA + (j + j*gpu_lda), gpu_lda, (cuDoubleComplex *)devPtrA + (j+jb + j*gpu_lda), gpu_lda) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* -------------------------------------------------------------- */ /* Copy factored column back to host. */ /* -------------------------------------------------------------- */ cudaStat = cudaEventRecord (Common->cublasEventPotrf[2], Common->gpuStream[0]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaStreamWaitEvent (Common->gpuStream[1], Common->cublasEventPotrf[2], 0) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "CUDA event failure") ; } cudaStat = cudaMemcpy2DAsync (A + L_ENTRY*(j + jb + j * lda), lda * L_ENTRY * sizeof (double), devPtrA + L_ENTRY* (j + jb + j * gpu_lda), gpu_lda * L_ENTRY * sizeof (double), L_ENTRY * sizeof (double)* (n - j - jb), jb, cudaMemcpyDeviceToHost, Common->gpuStream[1]) ; if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy to device") ; } } } #ifdef BLAS_TIMER Common->CHOLMOD_GPU_POTRF_TIME += SUITESPARSE_TIME - tstart ; #endif return (1) ; } /* ========================================================================== */ /* === gpu_triangular_solve ================================================= */ /* ========================================================================== */ /* The current supernode is columns k1 to k2-1 of L. Let L1 be the diagonal * block (factorized by dpotrf/zpotrf above; rows/cols k1:k2-1), and L2 be rows * k2:n-1 and columns k1:k2-1 of L. The triangular system to solve is L2*L1' = * S2, where S2 is overwritten with L2. More precisely, L2 = S2 / L1' in * MATLAB notation. 
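 *
 * As a rough guide to the implementation below: the solve is performed in
 * chunks of at most gpu_row_max_chunk rows of L2 (768 in the real case,
 * 256 in the complex case).  Each chunk is solved with a right-sided TRSM
 * on its own stream, and the device-to-host copy of a finished chunk
 * overlaps with the TRSM of the next chunk; completed chunks are then
 * copied back into Lx as their host buffers become free.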
*/ /* Version with pre-allocation in POTRF */ int TEMPLATE2 (CHOLMOD (gpu_triangular_solve)) ( int64_t nsrow2, /* L1 and S2 are nsrow2-by-nscol2 */ int64_t nscol2, /* L1 is nscol2-by-nscol2 */ int64_t nsrow, /* leading dimension of L1, L2, and S2 */ int64_t psx, /* L1 is at Lx+L_ENTRY*psx; * L2 at Lx+L_ENTRY*(psx+nscol2)*/ double *Lx, /* holds L1, L2, and S2 */ cholmod_common *Common, cholmod_gpu_pointers *gpu_p ) { double *devPtrA, *devPtrB ; cudaError_t cudaStat ; cublasStatus_t cublasStatus ; int64_t gpu_lda, gpu_ldb, gpu_rowstep ; int64_t gpu_row_start = 0 ; int64_t gpu_row_max_chunk, gpu_row_chunk; int ibuf = 0; int iblock = 0; int iHostBuff = (Common->ibuffer+CHOLMOD_HOST_SUPERNODE_BUFFERS-1) % CHOLMOD_HOST_SUPERNODE_BUFFERS; int iwrap; #ifdef BLAS_TIMER double tstart ; #endif #ifdef REAL double alpha = 1.0 ; gpu_row_max_chunk = 768; #else cuDoubleComplex calpha = {1.0,0.0} ; gpu_row_max_chunk = 256; #endif if ( nsrow2 <= 0 ) { return (0) ; } #ifdef BLAS_TIMER tstart = SUITESPARSE_TIME ; Common->CHOLMOD_GPU_TRSM_CALLS++ ; #endif gpu_lda = ((nscol2+31)/32)*32 ; gpu_ldb = ((nsrow2+31)/32)*32 ; devPtrA = gpu_p->d_Lx[0]; devPtrB = gpu_p->d_Lx[1]; /* make sure the copy of B has completed */ cudaStreamSynchronize( Common->gpuStream[0] ); /* ---------------------------------------------------------------------- */ /* do the CUDA BLAS dtrsm */ /* ---------------------------------------------------------------------- */ while ( gpu_row_start < nsrow2 ) { gpu_row_chunk = nsrow2 - gpu_row_start; if ( gpu_row_chunk > gpu_row_max_chunk ) { gpu_row_chunk = gpu_row_max_chunk; } cublasStatus = cublasSetStream ( Common->cublasHandle, Common->gpuStream[ibuf] ); if ( cublasStatus != CUBLAS_STATUS_SUCCESS ) { ERROR ( CHOLMOD_GPU_PROBLEM, "GPU CUBLAS stream"); } #ifdef REAL cublasStatus = cublasDtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, gpu_row_chunk, nscol2, &alpha, devPtrA, gpu_lda, devPtrB + gpu_row_start, gpu_ldb) ; #else cublasStatus = cublasZtrsm (Common->cublasHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, gpu_row_chunk, nscol2, &calpha, (const cuDoubleComplex *) devPtrA, gpu_lda, (cuDoubleComplex *)devPtrB + gpu_row_start , gpu_ldb) ; #endif if (cublasStatus != CUBLAS_STATUS_SUCCESS) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU CUBLAS routine failure") ; } /* ------------------------------------------------------------------ */ /* copy result back to the CPU */ /* ------------------------------------------------------------------ */ cudaStat = cudaMemcpy2DAsync ( gpu_p->h_Lx[iHostBuff] + L_ENTRY*(nscol2+gpu_row_start), nsrow * L_ENTRY * sizeof (Lx [0]), devPtrB + L_ENTRY*gpu_row_start, gpu_ldb * L_ENTRY * sizeof (devPtrB [0]), gpu_row_chunk * L_ENTRY * sizeof (devPtrB [0]), nscol2, cudaMemcpyDeviceToHost, Common->gpuStream[ibuf]); if (cudaStat) { ERROR (CHOLMOD_GPU_PROBLEM, "GPU memcopy from device") ; } cudaEventRecord ( Common->updateCBuffersFree[ibuf], Common->gpuStream[ibuf] ); gpu_row_start += gpu_row_chunk; ibuf++; ibuf = ibuf % CHOLMOD_HOST_SUPERNODE_BUFFERS; iblock ++; if ( iblock >= CHOLMOD_HOST_SUPERNODE_BUFFERS ) { int64_t gpu_row_start2 ; int64_t gpu_row_end ; /* then CHOLMOD_HOST_SUPERNODE_BUFFERS worth of work has been * scheduled, so check for completed events and copy result into * Lx before continuing. 
             */
            cudaEventSynchronize ( Common->updateCBuffersFree
                [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */
            gpu_row_start2 = nscol2 +
                (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)*gpu_row_max_chunk;
            gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

#ifdef _OPENMP
            double work = (double) nscol2 *
                (double) (gpu_row_end - gpu_row_start2) * L_ENTRY ;
            int nthreads = cholmod_nthreads (work, Common) ;
#endif

            int64_t j ;
            #pragma omp parallel for num_threads(nthreads) if (nscol2 > 32)
            for (j = 0 ; j < nscol2 ; j++)
            {
                for (int64_t i = gpu_row_start2*L_ENTRY ;
                    i < gpu_row_end*L_ENTRY ; i++)
                {
                    int64_t iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
    }

    /* Convenient to copy the L1 block here */

#ifdef _OPENMP
    double work = (double) nscol2 * (double) nscol2 * L_ENTRY ;
    int nthreads = cholmod_nthreads (work, Common) ;
#endif

    int64_t j ;
    #pragma omp parallel for num_threads(nthreads) if (nscol2 > 32)
    for (j = 0 ; j < nscol2 ; j++)
    {
        for (int64_t i = j*L_ENTRY ; i < nscol2*L_ENTRY ; i++)
        {
            int64_t iidx = j*nsrow*L_ENTRY + i;
            Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
        }
    }

    /* now account for the last HSTREAMS buffers */
    for ( iwrap=0; iwrap<CHOLMOD_HOST_SUPERNODE_BUFFERS; iwrap++ )
    {
        int64_t gpu_row_start2 = nscol2 +
            (iblock-CHOLMOD_HOST_SUPERNODE_BUFFERS)*gpu_row_max_chunk;
        if ( gpu_row_start2 >= 0 && gpu_row_start2 < nsrow )
        {
            int64_t gpu_row_end = gpu_row_start2+gpu_row_max_chunk;
            if ( gpu_row_end > nsrow ) gpu_row_end = nsrow;

            cudaEventSynchronize ( Common->updateCBuffersFree
                [iblock%CHOLMOD_HOST_SUPERNODE_BUFFERS] );

            /* copy into Lx */

#ifdef _OPENMP
            double work = (double) nscol2 *
                (double) (gpu_row_end - gpu_row_start2) * L_ENTRY ;
            int nthreads = cholmod_nthreads (work, Common) ;
#endif

            int64_t j ;
            #pragma omp parallel for num_threads(nthreads) if (nscol2 > 32)
            for (j = 0 ; j < nscol2 ; j++)
            {
                for (int64_t i = gpu_row_start2*L_ENTRY ;
                    i < gpu_row_end*L_ENTRY; i++)
                {
                    int64_t iidx = j*nsrow*L_ENTRY+i;
                    Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
                }
            }
        }
        iblock++;
    }

    /* ---------------------------------------------------------------------- */
    /* return */
    /* ---------------------------------------------------------------------- */

#ifdef BLAS_TIMER
    Common->CHOLMOD_GPU_TRSM_TIME += SUITESPARSE_TIME - tstart ;
#endif

    return (1) ;
}

/* ========================================================================== */
/* === gpu_copy_supernode =================================================== */
/* ========================================================================== */
/*
 * In the event gpu_triangular_solve is not needed or called, this routine
 * copies the factored diagonal block from the GPU to the CPU.
 */

void TEMPLATE2 (CHOLMOD (gpu_copy_supernode))
(
    cholmod_common *Common,
    double *Lx,
    int64_t psx,
    int64_t nscol,
    int64_t nscol2,
    int64_t nsrow,
    int supernodeUsedGPU,
    int iHostBuff,
    cholmod_gpu_pointers *gpu_p
)
{
    if ( supernodeUsedGPU && nscol2 * L_ENTRY >= CHOLMOD_POTRF_LIMIT )
    {
        cudaDeviceSynchronize();

#ifdef _OPENMP
        double work = (double) nscol * (double) nscol * L_ENTRY ;
        int nthreads = cholmod_nthreads (work, Common) ;
#endif

        int64_t j ;
        #pragma omp parallel for num_threads(nthreads) if (nscol > 32)
        for (j = 0 ; j < nscol ; j++)
        {
            for (int64_t i = j*L_ENTRY ; i < nscol*L_ENTRY ; i++)
            {
                int64_t iidx = j*nsrow*L_ENTRY+i;
                Lx[psx*L_ENTRY+iidx] = gpu_p->h_Lx[iHostBuff][iidx];
            }
        }
    }
    return;
}

#endif

#undef REAL
#undef COMPLEX
#undef ZOMPLEX
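/* A rough sketch (not the exact control flow of the caller) of how these
 * template routines fit together in the supernodal numeric factorization;
 * the real driver, t_cholmod_super_numeric, adds the eligibility tests,
 * buffer rotation, and CPU fallback paths that are omitted here:
 *
 *     useGPU = gpu_init (...) ;                 // once per factorization
 *     for each supernode s:
 *         gpu_reorder_descendants (...) ;       // GPU-eligible descendants first
 *         gpu_initialize_supernode (...) ;      // clear d_A[0], build the Map
 *         for each descendant d:
 *             gpu_updateC (...) ;               // returns 0 if d is CPU-only
 *         gpu_final_assembly (...) ;            // merge CPU- and GPU-side updates
 *         gpu_lower_potrf (...) ;               // returns 0 if s is CPU-only
 *         gpu_triangular_solve (...) ;          // or, when no solve is needed,
 *         gpu_copy_supernode (...) ;            // copy the factored block back
 */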