// ============================================================================= // === GPUQREngine/Include/Kernel/Apply/cevta_tile.cu ========================== // ============================================================================= // GPUQREngine, Copyright (c) 2013, Timothy A Davis, Sencer Nuri Yeralan, // and Sanjay Ranka. All Rights Reserved. // SPDX-License-Identifier: GPL-2.0+ //------------------------------------------------------------------------------ // cevta_tile macro //------------------------------------------------------------------------------ // C=V'*A for a single tile, computed in two chunks of HALFTILE each. Also // loads in the A matrix into the SHA buffer. The first tile is lower // triangular (selected by defining FIRST_TILE in the caller). Subsequent // tiles are square (and FIRST_TILE is not defined). This file is #include'd // into block_apply_chunk.cu. It is not a stand-alone function. #ifdef FIRST_TILE #define this_tile 0 #define TRIL(test) (test) #else #define this_tile t #define TRIL(test) (1) #endif { #pragma unroll for (int p = 0 ; p < 2 ; p++) { //---------------------------------------------------------------------- // move the prefetched A from register into shared for this halftile //---------------------------------------------------------------------- // Write next halftile of 16 rows of A from register to shared, for // _this_ iteration of p. The abuffer was read from global to register // in the prior iteration of p (the prefetch below), or in the prefetch // in block_apply_chunk.cu. If column fjload is outside the front, // rbitA is all zero, which safely clears SHA. #pragma unroll for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + iaload ; if (ii < NACHUNKS-1 || i < HALFTILE) { SHA (p*HALFTILE+i, jaload) = rbitA (ii) ; } } // V and A for this iteration of p are now loaded into shared __syncthreads ( ) ; //---------------------------------------------------------------------- // prefetch the next halftile of A from global into register //---------------------------------------------------------------------- // Load the next halftile of A from global memory to register, // not for this iteration of p, but the next. if (p == 0) { // Read the next halftile of A for this row tile. #pragma unroll for (int ii = 0 ; ii < NACHUNKS ; ii++) { rbitA (ii) = 0 ; } #pragma unroll for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + iaload ; if (ii < NACHUNKS-1 || i < HALFTILE) { int fi = IFRONT (this_tile, HALFTILE+i) ; if (aloader && INSIDE_ROW (fi < fm)) { rbitA (ii) = glF [fi * fn + fjload] ; } } } } else if (this_tile+1 < ROW_PANELSIZE) // p is 1 for this case { // Read the first halftile of V and A for the next row tile, // but not if we are computing with the very last tile. #pragma unroll for (int ii = 0 ; ii < NACHUNKS ; ii++) { rbitA (ii) = 0 ; } #pragma unroll for (int ii = 0 ; ii < NACHUNKS ; ii++) { int i = ii * ACHUNKSIZE + iaload ; if (ii < NACHUNKS-1 || i < HALFTILE) { int fi = IFRONT (this_tile+1, i) ; if (aloader && INSIDE_ROW (fi < fm)) { rbitA (ii) = glF [fi * fn + fjload] ; } } } } //---------------------------------------------------------------------- // C = tril (V)'*A for this halffile //---------------------------------------------------------------------- // For square tiles (all but the first tile of V), TRIL is always // true, and this code is simpler and faster as a result. if (CTHREADS == NUMTHREADS || threadIdx.x < CTHREADS) { // C=tril(V)'*A, compute with a halftile of A and V #pragma unroll for (int pp = 0 ; pp < HALFTILE ; pp++) { int i = p * HALFTILE + pp ; #pragma unroll for (int ii = 0 ; ii < CBITTYROWS ; ii++) { int j = MYCBITTYROW (ii) ; if (TRIL (i >= j)) { rrow [ii] = SHV (this_tile, i, j) ; } } #pragma unroll for (int jj = 0 ; jj < CBITTYCOLS ; jj++) { int j = MYCBITTYCOL (jj) ; rcol [jj] = SHA (i, j) ; } #pragma unroll for (int ii = 0 ; ii < CBITTYROWS ; ii++) { if (TRIL (i >= MYCBITTYROW (ii))) { #pragma unroll for (int jj = 0 ; jj < CBITTYCOLS ; jj++) { rbit [ii][jj] += rrow [ii] * rcol [jj] ; } } } } } } } #undef FIRST_TILE #undef this_tile #undef TRIL