// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced // at the Lawrence Livermore National Laboratory. All Rights reserved. See files // LICENSE and NOTICE for details. LLNL-CODE-806117. // // This file is part of the MFEM library. For more information and source code // availability visit https://mfem.org. // // MFEM is free software; you can redistribute it and/or modify it under the // terms of the BSD-3 license. We welcome feedback and contributions, see file // CONTRIBUTING.md for details. #include "lor_h1.hpp" #include "lor_util.hpp" #include "../../linalg/dtensor.hpp" #include "../../general/forall.hpp" namespace mfem { template void BatchedLOR_H1::Assemble2D() { const int nel_ho = fes_ho.GetNE(); static constexpr int nv = 4; static constexpr int dim = 2; static constexpr int ddm2 = (dim*(dim+1))/2; static constexpr int nd1d = ORDER + 1; static constexpr int ndof_per_el = nd1d*nd1d; static constexpr int nnz_per_row = 9; static constexpr int sz_local_mat = nv*nv; const bool const_mq = c1.Size() == 1; const auto MQ = const_mq ? Reshape(c1.Read(), 1, 1, 1) : Reshape(c1.Read(), nd1d, nd1d, nel_ho); const bool const_dq = c2.Size() == 1; const auto DQ = const_dq ? Reshape(c2.Read(), 1, 1, 1) : Reshape(c2.Read(), nd1d, nd1d, nel_ho); sparse_ij.SetSize(nnz_per_row*ndof_per_el*nel_ho); auto V = Reshape(sparse_ij.Write(), nnz_per_row, nd1d, nd1d, nel_ho); auto X = X_vert.Read(); mfem::forall_2D(nel_ho, ORDER, ORDER, [=] MFEM_HOST_DEVICE (int iel_ho) { // Assemble a sparse matrix over the macro-element by looping over each // subelement. // V(j,ix,iy) stores the jth nonzero in the row of the sparse matrix // corresponding to local DOF (ix, iy). MFEM_FOREACH_THREAD(iy,y,nd1d) { MFEM_FOREACH_THREAD(ix,x,nd1d) { for (int j=0; j Q(Q_, ddm2 + 1, 2, 2); DeviceTensor<2> local_mat(local_mat_, nv, nv); for (int i=0; i(X, iel_ho, kx, ky, vx, vy); for (int iqy=0; iqy<2; ++iqy) { for (int iqx=0; iqx<2; ++iqx) { const double x = iqx; const double y = iqy; const double w = 1.0/4.0; double J_[2*2]; DeviceTensor<2> J(J_, 2, 2); Jacobian2D(x, y, vx, vy, J); const double detJ = Det2D(J); const double w_detJ = w/detJ; Q(0,iqy,iqx) = w_detJ * (J(0,1)*J(0,1) + J(1,1)*J(1,1)); // 1,1 Q(1,iqy,iqx) = -w_detJ * (J(0,1)*J(0,0) + J(1,1)*J(1,0)); // 1,2 Q(2,iqy,iqx) = w_detJ * (J(0,0)*J(0,0) + J(1,0)*J(1,0)); // 2,2 Q(3,iqy,iqx) = w*detJ; } } for (int iqx=0; iqx<2; ++iqx) { for (int iqy=0; iqy<2; ++iqy) { const double mq = const_mq ? MQ(0,0,0) : MQ(kx+iqx, ky+iqy, iel_ho); const double dq = const_dq ? DQ(0,0,0) : DQ(kx+iqx, ky+iqy, iel_ho); for (int jy=0; jy<2; ++jy) { const double bjy = (jy == iqy) ? 1.0 : 0.0; const double gjy = (jy == 0) ? -1.0 : 1.0; for (int jx=0; jx<2; ++jx) { const double bjx = (jx == iqx) ? 1.0 : 0.0; const double gjx = (jx == 0) ? -1.0 : 1.0; const double djx = gjx*bjy; const double djy = bjx*gjy; int jj_loc = jx + 2*jy; for (int iy=0; iy<2; ++iy) { const double biy = (iy == iqy) ? 1.0 : 0.0; const double giy = (iy == 0) ? -1.0 : 1.0; for (int ix=0; ix<2; ++ix) { const double bix = (ix == iqx) ? 1.0 : 0.0; const double gix = (ix == 0) ? -1.0 : 1.0; const double dix = gix*biy; const double diy = bix*giy; int ii_loc = ix + 2*iy; // Only store the lower-triangular part of // the matrix (by symmetry). if (jj_loc > ii_loc) { continue; } double val = 0.0; val += dix*djx*Q(0,iqy,iqx); val += (dix*djy + diy*djx)*Q(1,iqy,iqx); val += diy*djy*Q(2,iqy,iqx); val *= dq; val += mq*bix*biy*bjx*bjy*Q(3,iqy,iqx); local_mat(ii_loc, jj_loc) += val; } } } } } } // Assemble the local matrix into the macro-element sparse matrix // in a format similar to coordinate format. The (I,J) arrays // are implicit (not stored explicitly). for (int ii_loc=0; ii_loc 0) ? iy - 1 : 0; const int jy_end = (iy < ORDER) ? iy + 1 : ORDER; for (int ix=0; ix 0) ? ix - 1 : 0; const int jx_end = (ix < ORDER) ? ix + 1 : ORDER; const int ii_el = ix + nd1d*iy; for (int jy=jy_begin; jy<=jy_end; ++jy) { for (int jx=jx_begin; jx<=jx_end; ++jx) { const int jj_off = (jx-ix+1) + 3*(jy-iy+1); const int jj_el = jx + nd1d*jy; map(jj_off, ii_el) = jj_el; } } } } } template void BatchedLOR_H1::Assemble3D() { const int nel_ho = fes_ho.GetNE(); static constexpr int nv = 8; static constexpr int dim = 3; static constexpr int ddm2 = (dim*(dim+1))/2; static constexpr int nd1d = ORDER + 1; static constexpr int ndof_per_el = nd1d*nd1d*nd1d; static constexpr int nnz_per_row = 27; static constexpr int sz_grad_A = 3*3*2*2*2*2; static constexpr int sz_grad_B = sz_grad_A*2; static constexpr int sz_mass_A = 2*2*2*2; static constexpr int sz_mass_B = sz_mass_A*2; static constexpr int sz_local_mat = nv*nv; const bool const_mq = c1.Size() == 1; const auto MQ = const_mq ? Reshape(c1.Read(), 1, 1, 1, 1) : Reshape(c1.Read(), nd1d, nd1d, nd1d, nel_ho); const bool const_dq = c2.Size() == 1; const auto DQ = const_dq ? Reshape(c2.Read(), 1, 1, 1, 1) : Reshape(c2.Read(), nd1d, nd1d, nd1d, nel_ho); sparse_ij.SetSize(nel_ho*ndof_per_el*nnz_per_row); auto V = Reshape(sparse_ij.Write(), nnz_per_row, nd1d, nd1d, nd1d, nel_ho); auto X = X_vert.Read(); // Last thread dimension is lowered to avoid "too many resources" error mfem::forall_3D(nel_ho, ORDER, ORDER, (ORDER>6)?4:ORDER, [=] MFEM_HOST_DEVICE (int iel_ho) { // Assemble a sparse matrix over the macro-element by looping over each // subelement. // V(j,i) stores the jth nonzero in the ith row of the sparse matrix. MFEM_FOREACH_THREAD(iz,z,nd1d) { MFEM_FOREACH_THREAD(iy,y,nd1d) { MFEM_FOREACH_THREAD(ix,x,nd1d) { MFEM_UNROLL(nnz_per_row) for (int j=0; j Q(Q_, ddm2 + 1, 2, 2, 2); DeviceTensor<2> local_mat(local_mat_, nv, nv); DeviceTensor<6> grad_A(grad_A_, 3, 3, 2, 2, 2, 2); DeviceTensor<7> grad_B(grad_B_, 3, 3, 2, 2, 2, 2, 2); DeviceTensor<4> mass_A(mass_A_, 2, 2, 2, 2); DeviceTensor<5> mass_B(mass_B_, 2, 2, 2, 2, 2); // local_mat is the local (dense) stiffness matrix for (int i=0; i(X, iel_ho, kx, ky, kz, vx, vy, vz); //MFEM_UNROLL(2) for (int iqz=0; iqz<2; ++iqz) { //MFEM_UNROLL(2) for (int iqy=0; iqy<2; ++iqy) { //MFEM_UNROLL(2) for (int iqx=0; iqx<2; ++iqx) { const double x = iqx; const double y = iqy; const double z = iqz; const double w = 1.0/8.0; double J_[3*3]; DeviceTensor<2> J(J_, 3, 3); Jacobian3D(x, y, z, vx, vy, vz, J); const double detJ = Det3D(J); const double w_detJ = w/detJ; // adj(J) double A_[3*3]; DeviceTensor<2> A(A_, 3, 3); Adjugate3D(J, A); Q(0,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(0,0)+A(0,1)*A(0,1)+A(0,2)*A(0,2)); // 1,1 Q(1,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(1,0)+A(0,1)*A(1,1)+A(0,2)*A(1,2)); // 2,1 Q(2,iqz,iqy,iqx) = w_detJ*(A(0,0)*A(2,0)+A(0,1)*A(2,1)+A(0,2)*A(2,2)); // 3,1 Q(3,iqz,iqy,iqx) = w_detJ*(A(1,0)*A(1,0)+A(1,1)*A(1,1)+A(1,2)*A(1,2)); // 2,2 Q(4,iqz,iqy,iqx) = w_detJ*(A(1,0)*A(2,0)+A(1,1)*A(2,1)+A(1,2)*A(2,2)); // 3,2 Q(5,iqz,iqy,iqx) = w_detJ*(A(2,0)*A(2,0)+A(2,1)*A(2,1)+A(2,2)*A(2,2)); // 3,3 Q(6,iqz,iqy,iqx) = w*detJ; } } } //MFEM_UNROLL(2) for (int iqx=0; iqx<2; ++iqx) { //MFEM_UNROLL(2) for (int jz=0; jz<2; ++jz) { // Note loop starts at iz=jz here, taking advantage of // symmetries. //MFEM_UNROLL(2) for (int iz=jz; iz<2; ++iz) { //MFEM_UNROLL(2) for (int iqy=0; iqy<2; ++iqy) { //MFEM_UNROLL(2) for (int iqz=0; iqz<2; ++iqz) { const double mq = const_mq ? MQ(0,0,0,0) : MQ(kx+iqx, ky+iqy, kz+iqz, iel_ho); const double dq = const_dq ? DQ(0,0,0,0) : DQ(kx+iqx, ky+iqy, kz+iqz, iel_ho); const double biz = (iz == iqz) ? 1.0 : 0.0; const double giz = (iz == 0) ? -1.0 : 1.0; const double bjz = (jz == iqz) ? 1.0 : 0.0; const double gjz = (jz == 0) ? -1.0 : 1.0; const double J11 = Q(0,iqz,iqy,iqx); const double J21 = Q(1,iqz,iqy,iqx); const double J31 = Q(2,iqz,iqy,iqx); const double J12 = J21; const double J22 = Q(3,iqz,iqy,iqx); const double J32 = Q(4,iqz,iqy,iqx); const double J13 = J31; const double J23 = J32; const double J33 = Q(5,iqz,iqy,iqx); grad_A(0,0,iqy,iz,jz,iqx) += dq*J11*biz*bjz; grad_A(1,0,iqy,iz,jz,iqx) += dq*J21*biz*bjz; grad_A(2,0,iqy,iz,jz,iqx) += dq*J31*giz*bjz; grad_A(0,1,iqy,iz,jz,iqx) += dq*J12*biz*bjz; grad_A(1,1,iqy,iz,jz,iqx) += dq*J22*biz*bjz; grad_A(2,1,iqy,iz,jz,iqx) += dq*J32*giz*bjz; grad_A(0,2,iqy,iz,jz,iqx) += dq*J13*biz*gjz; grad_A(1,2,iqy,iz,jz,iqx) += dq*J23*biz*gjz; grad_A(2,2,iqy,iz,jz,iqx) += dq*J33*giz*gjz; double wdetJ = Q(6,iqz,iqy,iqx); mass_A(iqy,iz,jz,iqx) += mq*wdetJ*biz*bjz; } //MFEM_UNROLL(2) for (int jy=0; jy<2; ++jy) { //MFEM_UNROLL(2) for (int iy=0; iy<2; ++iy) { const double biy = (iy == iqy) ? 1.0 : 0.0; const double giy = (iy == 0) ? -1.0 : 1.0; const double bjy = (jy == iqy) ? 1.0 : 0.0; const double gjy = (jy == 0) ? -1.0 : 1.0; grad_B(0,0,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(0,0,iqy,iz,jz,iqx); grad_B(1,0,iy,jy,iz,jz,iqx) += giy*bjy*grad_A(1,0,iqy,iz,jz,iqx); grad_B(2,0,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(2,0,iqy,iz,jz,iqx); grad_B(0,1,iy,jy,iz,jz,iqx) += biy*gjy*grad_A(0,1,iqy,iz,jz,iqx); grad_B(1,1,iy,jy,iz,jz,iqx) += giy*gjy*grad_A(1,1,iqy,iz,jz,iqx); grad_B(2,1,iy,jy,iz,jz,iqx) += biy*gjy*grad_A(2,1,iqy,iz,jz,iqx); grad_B(0,2,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(0,2,iqy,iz,jz,iqx); grad_B(1,2,iy,jy,iz,jz,iqx) += giy*bjy*grad_A(1,2,iqy,iz,jz,iqx); grad_B(2,2,iy,jy,iz,jz,iqx) += biy*bjy*grad_A(2,2,iqy,iz,jz,iqx); mass_B(iy,jy,iz,jz,iqx) += biy*bjy*mass_A(iqy,iz,jz,iqx); } } } //MFEM_UNROLL(2) for (int jy=0; jy<2; ++jy) { //MFEM_UNROLL(2) for (int jx=0; jx<2; ++jx) { //MFEM_UNROLL(2) for (int iy=0; iy<2; ++iy) { //MFEM_UNROLL(2) for (int ix=0; ix<2; ++ix) { const double bix = (ix == iqx) ? 1.0 : 0.0; const double gix = (ix == 0) ? -1.0 : 1.0; const double bjx = (jx == iqx) ? 1.0 : 0.0; const double gjx = (jx == 0) ? -1.0 : 1.0; int ii_loc = ix + 2*iy + 4*iz; int jj_loc = jx + 2*jy + 4*jz; // Only store the lower-triangular part of // the matrix (by symmetry). if (jj_loc > ii_loc) { continue; } double val = 0.0; val += gix*gjx*grad_B(0,0,iy,jy,iz,jz,iqx); val += bix*gjx*grad_B(1,0,iy,jy,iz,jz,iqx); val += bix*gjx*grad_B(2,0,iy,jy,iz,jz,iqx); val += gix*bjx*grad_B(0,1,iy,jy,iz,jz,iqx); val += bix*bjx*grad_B(1,1,iy,jy,iz,jz,iqx); val += bix*bjx*grad_B(2,1,iy,jy,iz,jz,iqx); val += gix*bjx*grad_B(0,2,iy,jy,iz,jz,iqx); val += bix*bjx*grad_B(2,2,iy,jy,iz,jz,iqx); val += bix*bjx*grad_B(1,2,iy,jy,iz,jz,iqx); val += bix*bjx*mass_B(iy,jy,iz,jz,iqx); local_mat(ii_loc, jj_loc) += val; } } } } } } } // Assemble the local matrix into the macro-element sparse matrix // in a format similar to coordinate format. The (I,J) arrays // are implicit (not stored explicitly). //MFEM_UNROLL(8) for (int ii_loc=0; ii_loc 0) ? iz - 1 : 0; const int jz_end = (iz < ORDER) ? iz + 1 : ORDER; for (int iy=0; iy 0) ? iy - 1 : 0; const int jy_end = (iy < ORDER) ? iy + 1 : ORDER; for (int ix=0; ix 0) ? ix - 1 : 0; const int jx_end = (ix < ORDER) ? ix + 1 : ORDER; const int ii_el = ix + nd1d*(iy + nd1d*iz); for (int jz=jz_begin; jz<=jz_end; ++jz) { for (int jy=jy_begin; jy<=jy_end; ++jy) { for (int jx=jx_begin; jx<=jx_end; ++jx) { const int jj_off = (jx-ix+1) + 3*(jy-iy+1) + 9*(jz-iz+1); const int jj_el = jx + nd1d*(jy + nd1d*jz); map(jj_off, ii_el) = jj_el; } } } } } } } // Explicit template instantiations template void BatchedLOR_H1::Assemble2D<1>(); template void BatchedLOR_H1::Assemble2D<2>(); template void BatchedLOR_H1::Assemble2D<3>(); template void BatchedLOR_H1::Assemble2D<4>(); template void BatchedLOR_H1::Assemble2D<5>(); template void BatchedLOR_H1::Assemble2D<6>(); template void BatchedLOR_H1::Assemble2D<7>(); template void BatchedLOR_H1::Assemble2D<8>(); template void BatchedLOR_H1::Assemble3D<1>(); template void BatchedLOR_H1::Assemble3D<2>(); template void BatchedLOR_H1::Assemble3D<3>(); template void BatchedLOR_H1::Assemble3D<4>(); template void BatchedLOR_H1::Assemble3D<5>(); template void BatchedLOR_H1::Assemble3D<6>(); template void BatchedLOR_H1::Assemble3D<7>(); template void BatchedLOR_H1::Assemble3D<8>(); BatchedLOR_H1::BatchedLOR_H1(BilinearForm &a, FiniteElementSpace &fes_ho_, Vector &X_vert_, Vector &sparse_ij_, Array &sparse_mapping_) : BatchedLORKernel(fes_ho_, X_vert_, sparse_ij_, sparse_mapping_) { ProjectLORCoefficient(a, c1); ProjectLORCoefficient(a, c2); } } // namespace mfem