// ============================================================================= // === GPUQREngine/Include/Kernel/Factorize/factorize_vt.cu ==================== // ============================================================================= // GPUQREngine, Copyright (c) 2013, Timothy A Davis, Sencer Nuri Yeralan, // and Sanjay Ranka. All Rights Reserved. // SPDX-License-Identifier: GPL-2.0+ //------------------------------------------------------------------------------ // Constraints: // MCHUNK = M / BITTYROWS must be an integer // MCHUNK * N must be <= NUMTHREADS #ifdef FACTORIZE // TODO allow EPSILON to be defined by the caller #define EPSILON (1e-90) __device__ void FACTORIZE ( ) { //-------------------------------------------------------------------------- // The bitty block //-------------------------------------------------------------------------- // The matrix A is M by N, T is N by N (if present) // Each thread owns an r-by-1 bitty column of A. If its first entry is // A(ia,ja), and then it owns entries in every mchunk rows after that // [A(ia,ja), A(ia+mchunk,ja), ... ]. // Each column is operated on by mchunk threads. #define MCHUNK (M / BITTYROWS) #define MYBITTYROW(ii) (ii*MCHUNK + (threadIdx.x % MCHUNK)) #define MYBITTYCOL (threadIdx.x / MCHUNK) #define ATHREADS (MCHUNK * N) #define WORKER (ATHREADS == NUMTHREADS || threadIdx.x < ATHREADS) double rbitA [BITTYROWS] ; // bitty block for A double rbitV [BITTYROWS] ; // bitty block for V double sigma ; // only used by thread zero //-------------------------------------------------------------------------- // shared memory usage //-------------------------------------------------------------------------- #define shA shMemory.factorize.A #define shZ shMemory.factorize.Z #define shRdiag shMemory.factorize.A1 #define RSIGMA(i) shMemory.factorize.V1 [i] #ifdef WHOLE_FRONT // T is not computed, and there is no list of tiles #define TAU shMemory.factorize.tau #else // T is computed and saved in the VT tile, work on a set of tiles #define TAU shT [k][k] #define shT shMemory.factorize.T #define shV1 shMemory.factorize.V1 #endif //-------------------------------------------------------------------------- // Grab my task from the queue. //-------------------------------------------------------------------------- int fn = myTask.fn ; #ifdef WHOLE_FRONT int fm = myTask.fm ; int nv = MIN (fm, fn) ; // If nv is a constant, it allows the outer loop to be unrolled: // #define nv N #define j1 0 // The whole front always considers the edge case #ifndef EDGE_CASE #define EDGE_CASE #endif #else int j1 = myTask.extra[4] ; #ifdef EDGE_CASE int fm = myTask.fm ; int nv = MIN (fm, fn) - j1 ; nv = MIN (nv, N) ; nv = MIN (nv, M) ; #else #define nv N #endif double (*glVT)[TILESIZE] = (double (*)[TILESIZE]) myTask.AuxAddress[0] ; #endif #ifdef EDGE_CASE // Check if an entry is inside the front. #define INSIDE(test) (test) #else // The entry is guaranteed to reside inside the frontal matrix. #define INSIDE(test) (1) #endif // bool is_false = (fn < 0) ; #define glA(i,j) (myTask.F[((i)*fn + (j))]) //-------------------------------------------------------------------------- // Load A into shared memory //-------------------------------------------------------------------------- // ACHUNKSIZE must be an integer #define it (threadIdx.x / N) #define jt (threadIdx.x % N) #define ACHUNKSIZE (NUMTHREADS / N) #ifdef WHOLE_FRONT // all threads load the entire front (no tiles). // always check the edge case. // M / ACHUNKSIZE must be an integer. #define NACHUNKS (M / ACHUNKSIZE) for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + it ; shA [i][jt] = (i < fm && jt < fn) ? glA (i, jt) : 0 ; } #else // when all threads work on a tile. // (N*N / NUMTHREADS) does not have to be an integer. With a tile // size of N=32, and NUMTHREADS=384, it isn't. So compute the ceiling, // and handle the clean up by testing i < N below. #define NACHUNKS CEIL (N*N, NUMTHREADS) /* If we're not coming from an apply-factorize, load from F. */ if(IsApplyFactorize == 0) { // Load tiles from the frontal matrix // accounts for 25% of the total time on Kepler, 13% on Fermi for (int t = 0 ; t < ROW_PANELSIZE ; t++) { int rowTile = myTask.extra[t]; if (INSIDE (rowTile != EMPTY)) { /* load the tile of A from global memory */ for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + it ; if (ii < NACHUNKS-1 || i < N) { shA [i + t*TILESIZE][jt] = (INSIDE (i+rowTile < fm) && INSIDE (jt+j1 < fn)) ? glA (i+rowTile, jt+j1) : 0 ; } } } else { /* clear the tile of A */ for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + it ; if (ii < NACHUNKS-1 || i < N) { shA [i + t*TILESIZE][jt] = 0 ; } } } } } // clear the tile T for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + it ; if (ii < NACHUNKS-1 || i < N) { shT [i][jt] = 0 ; } } #endif /* We need all of A to be loaded and T to be cleared before proceeding. */ __syncthreads(); //-------------------------------------------------------------------------- // load A into the bitty block //-------------------------------------------------------------------------- if (WORKER) { #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; rbitA [ii] = shA [i][MYBITTYCOL] ; } } //-------------------------------------------------------------------------- // compute the first sigma = sum (A (1:m,1).^2) //-------------------------------------------------------------------------- if (WORKER && MYBITTYCOL == 0) { // each thread that owns column 0 computes sigma for its // own bitty block double s = 0 ; #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; if (i >= 1) { s += rbitA [ii] * rbitA [ii] ; } } RSIGMA (threadIdx.x) = s ; } // thread zero must wait for RSIGMA __syncthreads ( ) ; if (threadIdx.x == 0) { sigma = 0 ; #pragma unroll for (int ii = 0 ; ii < MCHUNK ; ii++) { sigma += RSIGMA (ii) ; } } //-------------------------------------------------------------------------- // Do the block householder factorization //-------------------------------------------------------------------------- // loop unrolling has no effect on the edge case (it is not unrolled), // but greatly speeds up the non-edge case. #pragma unroll for (int k = 0 ; k < nv ; k++) { //---------------------------------------------------------------------- // write the kth column of A back into shared memory //---------------------------------------------------------------------- if (WORKER && MYBITTYCOL == k && k > 0) { // the bitty block for threads that own column k contains // the kth column of R and the kth column of v. #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; shA [i][k] = rbitA [ii] ; } } __syncthreads ( ) ; // At this point, A (:,k) is held in both shared memory, and in the // threads that own that column. A (:,k) is not yet the kth // Householder vector, except for the diagnal (which is computed // below). A (0:k-1,k) is now the kth column of R (above the // diagonal). //---------------------------------------------------------------------- // compute the Householder coefficients //---------------------------------------------------------------------- // This is costly, accounting for about 25% of the total time on // Kepler, and 22% on Fermi, when A is loaded from global memory. This // means the work here is even a higher fraction when A is in shared. if (threadIdx.x == 0) { double x1 = shA [k][k] ; // the diagonal A (k,k) double s, v1, tau ; if (sigma <= EPSILON) { // printf ("hit eps %g\n", sigma) ; s = x1 ; v1 = 0 ; tau = 0 ; } else { s = sqrt (x1*x1 + sigma) ; v1 = (x1 <= 0) ? (x1 - s) : (-sigma / (x1 + s)) ; tau = -1 / (s * v1) ; } shRdiag [k] = s ; // the diagonal entry of R shA [k][k] = v1 ; // the topmost entry of the vector v TAU = tau ; // tile case: T (k,k) = tau } // All threads need v1, and later on they need tau __syncthreads ( ) ; // A (0:k-1,k) now holds the kth column of R (excluding the diagonal). // A (k:m-1,k) holds the kth Householder vector (incl. the diagonal). //---------------------------------------------------------------------- // z = (-tau) * v' * A (k:m-1,:), where v is A (k:m-1,k) //---------------------------------------------------------------------- if (WORKER) // && (COMPUTE_T || MYBITTYCOL > k)) { // Load the vector v from A (k:m-1,k) into the V bitty block. // If T is not computed and MYBITTYCOL <= k, then this code can // be skipped, but the code is slower when that test is made. #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; // only i >= k is needed, but it's faster to load it all rbitV [ii] = shA [i][k] ; } // compute v' * A (k:m-1,:), each thread in its own column { double z = 0.0 ; #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; if (i >= k) { z += rbitV [ii] * rbitA [ii] ; } } // store z into the reduction space in shared memory shZ [MYBITTYROW(0)][MYBITTYCOL] = z ; } } // All threads need to see the reduction space for Z __syncthreads ( ) ; // Reduce Z into a single row vector z, using the first warp only if (threadIdx.x < N) // && (COMPUTE_T || threadIdx.x > k)) { double z = 0 ; #pragma unroll for (int ii = 0 ; ii < MCHUNK ; ii++) { z += shZ [ii][threadIdx.x] ; } shZ [0][threadIdx.x] = - z * TAU ; } // All threads need to see the z vector __syncthreads ( ) ; //---------------------------------------------------------------------- // update A (in register) and compute the next sigma //---------------------------------------------------------------------- if (WORKER && MYBITTYCOL > k) { // A (k:m,k+1:n) = A (k:,k+1:n) + v * z (k+1:n) ; // only threads that own a column MYBITTYCOL > k do any work { double z = shZ [0][MYBITTYCOL] ; #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; if (i >= k) { rbitA [ii] += rbitV [ii] * z ; } } } // sigma = sum (A ((k+2):m,k+1).^2), except for the reduction if (MYBITTYCOL == k+1) { // each thread that owns column k+1 computes sigma for its // own bitty block double s = 0 ; #pragma unroll for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; if (i >= k+2) { s += rbitA [ii] * rbitA [ii] ; } } RSIGMA (MYBITTYROW(0)) = s ; } } //---------------------------------------------------------------------- // construct the kth column of T //---------------------------------------------------------------------- #ifndef WHOLE_FRONT // T (0:k-1,k) = T (0:k-1,0:k-1) * z (0:k-1)' if (threadIdx.x < k) { double t_ik = 0 ; for (int jj = 0 ; jj < k ; jj++) { t_ik += shT [threadIdx.x][jj] * shZ [0][jj] ; } shT [threadIdx.x][k] = t_ik ; } #endif //---------------------------------------------------------------------- // reduce sigma into a single scalar for the next iteration //---------------------------------------------------------------------- // Thread zero must wait for RSIGMA __syncthreads ( ) ; if (threadIdx.x == 0) { sigma = 0 ; #pragma unroll for (int ii = 0 ; ii < MCHUNK ; ii++) { sigma += RSIGMA (ii) ; } } } // tril (A) now holds all the Householder vectors, including the diagonal. // triu (A,1) now holds R, excluding the diagonal. // shRdiag holds the diagonal of R. //-------------------------------------------------------------------------- // write out the remaining columns of R, if any //-------------------------------------------------------------------------- if (WORKER && MYBITTYCOL >= nv) { for (int ii = 0 ; ii < BITTYROWS ; ii++) { int i = MYBITTYROW (ii) ; shA [i][MYBITTYCOL] = rbitA [ii] ; } } //-------------------------------------------------------------------------- /* Have a warp shuffle memory around. */ if (threadIdx.x < N) { #ifndef WHOLE_FRONT shV1 [threadIdx.x] = shA [threadIdx.x][threadIdx.x]; #endif shA [threadIdx.x][threadIdx.x] = shRdiag [threadIdx.x]; } // Wait for the memory shuffle to finish before saving A to global memory __syncthreads(); //-------------------------------------------------------------------------- // save results back to global memory //-------------------------------------------------------------------------- #ifdef WHOLE_FRONT if (jt < fn) { for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + it ; if (i < fm) glA (i, jt) = shA [i][jt] ; } } #else // Save VT back to global memory & clear out // lower-triangular part of the first tile (leaving R). for (int th=threadIdx.x; th