// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED: http://github.com/ceed

/// @file
/// Internal header for Magma non-tensor basis gradient
#ifndef CEED_MAGMA_GRAD_NONTENSOR_H
#define CEED_MAGMA_GRAD_NONTENSOR_H

#include "magma-common-nontensor.h"

////////////////////////////////////////////////////////////////////////////////
// Non-transpose gradient: for each dimension d, C[d] = A[d]^T * B,
// where A[d] is (BASIS_P x BASIS_Q), B is (BASIS_P x n), and C[d] is (BASIS_Q x n)
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_Q, MAGMA_MAXTHREADS_1D)) __global__
    void magma_grad_nontensor_n(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_N);
  const int myn     = min(BASIS_NB_GRAD_N, n - id * BASIS_NB_GRAD_N);

  dB += id * BASIS_NB_GRAD_N * lddb;
  dC += id * BASIS_NB_GRAD_N * lddc;

  // A is BASIS_P x BASIS_Q
  const int   slda = BASIS_P;
  const int   sldb = BASIS_P;
  CeedScalar *sA   = (CeedScalar *)shared_data;
  CeedScalar *sB   = sA + BASIS_Q * BASIS_P;
  sB += ty * sldb * BASIS_NB_GRAD_N;

  // read B once for all C's
  if (id < nblocks) {
    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
  }

  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
  for (int d = 0; d < BASIS_DIM; d++) {
    // read A (BASIS_P x BASIS_Q) using all threads
    CeedScalar rA[BASIS_P] = {MAGMA_D_ZERO};
    __syncthreads();
    read_A_trans_g2r_1D_nosync(tx, ty, dA, ldda, sA, slda, rA);

    // init rC
    CeedScalar rC[BASIS_NB_GRAD_N] = {MAGMA_D_ZERO};
    if (id < nblocks) {
      mul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);
    }

    // write C
    if (id < nblocks) {
      write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
    }

    dA += BASIS_Q * BASIS_P;
    dC += BASIS_Q * n;
  }
}

////////////////////////////////////////////////////////////////////////////////
// Transpose gradient: C = sum over dimensions d of A[d] * B[d],
// where A[d] is (BASIS_P x BASIS_Q), B[d] is (BASIS_Q x n), and C is (BASIS_P x n)
extern "C" __launch_bounds__(MAGMA_BASIS_BOUNDS(BASIS_P, MAGMA_MAXTHREADS_1D)) __global__
    void magma_grad_nontensor_t(int n, CeedScalar const *dA, int ldda, CeedScalar const *dB, int lddb, CeedScalar *dC, int lddc) {
  MAGMA_DEVICE_SHARED(CeedScalar, shared_data);

  const int tx      = threadIdx.x;
  const int ty      = threadIdx.y;
  const int id      = blockIdx.x * blockDim.y + ty;
  const int nblocks = MAGMA_CEILDIV(n, BASIS_NB_GRAD_T);
  const int myn     = min(BASIS_NB_GRAD_T, n - id * BASIS_NB_GRAD_T);

  // terminate threads with no work
  if (id >= nblocks) return;

  dB += id * BASIS_NB_GRAD_T * lddb;
  dC += id * BASIS_NB_GRAD_T * lddc;

  // A is BASIS_P x BASIS_Q
  const int   sldb = BASIS_Q;
  CeedScalar *sB   = (CeedScalar *)shared_data;
  sB += ty * sldb * BASIS_NB_GRAD_T;

  // init rA, rC
  CeedScalar rA[BASIS_Q]         = {MAGMA_D_ZERO};
  CeedScalar rC[BASIS_NB_GRAD_T] = {MAGMA_D_ZERO};

  // unrolling this loop yields dramatic performance drop using hipcc, let the compiler decide (no pragma unroll)
  for (int d = 0; d < BASIS_DIM; d++) {
    // read A
    read_A_notrans_g2r_1D_nosync(tx, dA, ldda, NULL, 0, rA);

    // read B
    __syncthreads();
    read_B_g2s_1D_nosync(tx, myn, dB, lddb, sB, sldb);
    __syncthreads();

    addmul_rAsBrC_1D_nosync(tx, rA, sB, sldb, rC);

    dA += BASIS_P * BASIS_Q;
    dB += BASIS_Q * n;
  }

  // write C
  write_C_r2g_1D_nosync(tx, myn, rC, dC, lddc);
}

#endif  // CEED_MAGMA_GRAD_NONTENSOR_H