/* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * Copyright (C) 2020-2023 Max-Planck-Society * Author: Martin Reinecke */ #ifndef DUCC0_TOTALCONVOLVE_H #define DUCC0_TOTALCONVOLVE_H #include #include #include #include #include #include #include #include #include #include "ducc0/infra/error_handling.h" #include "ducc0/infra/threading.h" #include "ducc0/math/constants.h" #include "ducc0/math/gridding_kernel.h" #include "ducc0/infra/mav.h" #include "ducc0/infra/simd.h" #include "ducc0/infra/aligned_array.h" #include "ducc0/infra/useful_macros.h" #include "ducc0/infra/bucket_sort.h" #include "ducc0/sht/sht.h" #include "ducc0/sht/alm.h" #include "ducc0/fft/fft1d.h" #include "ducc0/fft/fft.h" #include "ducc0/math/math_utils.h" #include "ducc0/nufft/nufft.h" namespace ducc0 { namespace detail_totalconvolve { using namespace std; template class ConvolverPlan { protected: constexpr static auto vlen = min(8, native_simd::size()); using Tsimd = typename simd_select::type; size_t nthreads; size_t lmax, kmax; // _s: small grid // _b: oversampled grid // no suffix: grid with borders size_t nphi_s, ntheta_s, npsi_s; size_t kernel_index; shared_ptr kernel; size_t nphi_b, ntheta_b, npsi_b; double dphi, dtheta, dpsi, xdphi, xdtheta, xdpsi; size_t nbphi, nbtheta; size_t nphi, ntheta; double phi0, theta0; auto getKernel(size_t axlen, size_t axlen2) const { auto axlen_big = max(axlen, axlen2); auto axlen_small = min(axlen, axlen2); auto fct = kernel->corfunc(axlen_small/2+1, 1./axlen_big, nthreads); return fct; } quick_array getIdx(const cmav &theta, const cmav &phi, const cmav &psi, size_t patch_ntheta, size_t patch_nphi, size_t itheta0, size_t iphi0, size_t supp) const { size_t nptg = theta.shape(0); constexpr size_t cellsize=8; size_t nct = patch_ntheta/cellsize+1, ncp = patch_nphi/cellsize+1, ncpsi = npsi_b/cellsize+1; double theta0 = (int(itheta0)-int(nbtheta))*dtheta, phi0 = (int(iphi0)-int(nbphi))*dphi; double theta_lo=theta0, theta_hi=theta_lo+(patch_ntheta+1)*dtheta; double phi_lo=phi0, phi_hi=phi_lo+(patch_nphi+1)*dphi; MR_assert(uint64_t(nct)*uint64_t(ncp)*uint64_t(ncpsi)<(uint64_t(1)<<32), "key space too large"); quick_array key(nptg); execParallel(nptg, nthreads, [&](size_t lo, size_t hi) { for (size_t i=lo; i=theta_lo) && (theta(i)<=theta_hi), "theta out of range: ", theta(i)); MR_assert((phi(i)>=phi_lo) && (phi(i)<=phi_hi), "phi out of range: ", phi(i)); auto ftheta = (theta(i)-theta0)*xdtheta-supp*0.5; auto itheta = size_t(ftheta+1); auto fphi = (phi(i)-phi0)*xdphi-supp*0.5; auto iphi = size_t(fphi+1); auto fpsi = psi(i)*xdpsi; fpsi = fmodulo(fpsi, double(npsi_b)); size_t ipsi = size_t(fpsi); ipsi /= cellsize; itheta /= cellsize; iphi /= cellsize; MR_assert(itheta res(key.size()); bucket_sort2(key, res, ncp*nct*ncpsi, nthreads); return res; } template class WeightHelper { public: static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; const ConvolverPlan &plan; union kbuf { T scalar[3*nvec*vlen]; Tsimd simd[3*nvec]; #if defined(_MSC_VER) kbuf() {} #endif }; kbuf buf; private: TemplateKernel tkrn; double mytheta0, myphi0; public: WeightHelper(const ConvolverPlan &plan_, const mav_info<3> &info, size_t itheta0, size_t iphi0) : plan(plan_), tkrn(*plan.kernel), mytheta0(plan.theta0+itheta0*plan.dtheta), myphi0(plan.phi0+iphi0*plan.dphi), wpsi(&buf.scalar[0]), wtheta(&buf.scalar[nvec*vlen]), wphi(&buf.simd[2*nvec]), jumptheta(info.stride(1)) { MR_assert(info.stride(2)==1, "last axis of cube must be contiguous"); } void prep(double theta, double phi, double psi) { auto ftheta = (theta-mytheta0)*plan.xdtheta-supp*0.5; itheta = size_t(ftheta+1); ftheta = -1+(itheta-ftheta)*2; auto fphi = (phi-myphi0)*plan.xdphi-supp*0.5; iphi = size_t(fphi+1); fphi = -1+(iphi-fphi)*2; auto fpsi = psi*plan.xdpsi-supp*0.5; fpsi = fmodulo(fpsi, double(plan.npsi_b)); ipsi = size_t(fpsi+1); fpsi = -1+(ipsi-fpsi)*2; if (ipsi>=plan.npsi_b) ipsi-=plan.npsi_b; tkrn.eval3(T(fpsi), T(ftheta), T(fphi), &buf.simd[0]); } size_t itheta, iphi, ipsi; const T * DUCC0_RESTRICT wpsi; const T * DUCC0_RESTRICT wtheta; const Tsimd * DUCC0_RESTRICT wphi; ptrdiff_t jumptheta; }; // prefetching distance static constexpr size_t pfdist=2; template void interpolx(size_t supp_, const cmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &psi, vmav &signal) const { if constexpr (supp>=8) if (supp_<=supp/2) return interpolx(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); if constexpr (supp>4) if (supp_(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); MR_assert(supp_==supp, "requested support out of range"); MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch"); static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; MR_assert(cube.shape(0)==npsi_b, "bad psi dimension"); auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { WeightHelper hlp(*this, cube, itheta0, iphi0); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind=npsi_b) ipsi=0; ptr = &cube(ipsi,hlp.itheta,hlp.iphi); } res *= hlp.wphi[0]; } else { for (size_t ipsic=0; ipsic=npsi_b) ipsi=0; ptr = &cube(ipsi,hlp.itheta,hlp.iphi); } } signal(i) = reduce(res, std::plus<>()); } }); } template void deinterpolx(size_t supp_, vmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &psi, const cmav &signal) const { if constexpr (supp>=8) if (supp_<=supp/2) return deinterpolx(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); if constexpr (supp>4) if (supp_(supp_, cube, itheta0, iphi0, theta, phi, psi, signal); MR_assert(supp_==supp, "requested support out of range"); MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(psi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(signal.shape(0)==theta.shape(0), "array shape mismatch"); static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; MR_assert(cube.shape(0)==npsi_b, "bad psi dimension"); auto idx = getIdx(theta, phi, psi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); constexpr size_t cellsize=16; size_t nct = cube.shape(1)/cellsize+10, ncp = cube.shape(2)/cellsize+10; vmav locks({nct,ncp}); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { size_t b_theta=~(size_t(0)), b_phi=~(size_t(0)); WeightHelper hlp(*this, cube, itheta0, iphi0); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind=npsi_b) ipsi=0; ptr = &cube(ipsi,hlp.itheta,hlp.iphi); } } else { for (size_t ipsic=0; ipsic=npsi_b) ipsi=0; ptr = &cube(ipsi,hlp.itheta,hlp.iphi); } } } } if (b_theta(epsilon, sigma_min, sigma_max, {(2*ntheta_s-2), nphi_s, npsi_s}, npoints, true, nthreads)), kernel(selectKernel(kernel_index)), nphi_b(std::max(20,2*good_size_real(size_t((2*lmax+1)*ducc0::getKernel(kernel_index).ofactor/2.)))), ntheta_b(std::max(21,good_size_real(size_t((lmax+1)*ducc0::getKernel(kernel_index).ofactor))+1)), npsi_b(size_t(npsi_s*ducc0::getKernel(kernel_index).ofactor+0.99999)), dphi(2*pi/nphi_b), dtheta(pi/(ntheta_b-1)), dpsi(2*pi/npsi_b), xdphi(1./dphi), xdtheta(1./dtheta), xdpsi(1./dpsi), nbphi((kernel->support()+1)/2), nbtheta((kernel->support()+1)/2), nphi((nphi_b+2*nbphi+vlen)+((nphi_b+2*nbphi+vlen)&1)), // we need this to be even ntheta(ntheta_b+2*nbtheta), phi0(nbphi*(-dphi)), theta0(nbtheta*(-dtheta)) { auto supp = kernel->support(); MR_assert((supp<=ntheta) && (supp<=nphi_b), "kernel support too large!"); } size_t Lmax() const { return lmax; } size_t Kmax() const { return kmax; } size_t Ntheta() const { return ntheta; } size_t Nphi() const { return nphi; } size_t Npsi() const { return npsi_b; } vector getPatchInfo(double theta_lo, double theta_hi, double phi_lo, double phi_hi) const { vector res(4); auto tmp = (theta_lo-theta0)*xdtheta-nbtheta; res[0] = min(size_t(max(0., tmp)), ntheta); tmp = (theta_hi-theta0)*xdtheta+nbtheta+1.; res[1] = min(size_t(max(0., tmp)), ntheta); tmp = (phi_lo-phi0)*xdphi-nbphi; res[2] = min(size_t(max(0., tmp)), nphi); tmp = (phi_hi-phi0)*xdphi+nbphi+1.+vlen; res[3] = min(size_t(max(0., tmp)), nphi); return res; } void getPlane(const cmav,2> &vslm, const cmav,2> &vblm, size_t mbeam, vmav &planes) const { size_t nplanes=1+(mbeam>0); auto ncomp = vslm.shape(0); MR_assert(ncomp>0, "need at least one component"); MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors"); Alm_Base islm(lmax, lmax), iblm(lmax, kmax); MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension"); MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension"); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); MR_assert(mbeam <= kmax, "mbeam too high"); vector lnorm(lmax+1); for (size_t i=0; i<=lmax; ++i) lnorm[i]=T(std::sqrt(4*pi/(2*i+1.))); Alm_Base base(lmax, lmax); vmav,2> aarr({nplanes,base.Num_Alms()}, UNINITIALIZED); for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) { aarr(0, base.index(l,m))=0.; if (mbeam>0) aarr(1, base.index(l,m))=0.; if (l>=mbeam) { auto norm = (mbeam>0) ? -lnorm[l] : lnorm[l]; for (size_t i=0; i0) aarr(1,base.index(l,m)) += vslm(i,islm.index(l,m))*tmp.imag(); } } } auto subplanes=subarray<3>(planes,{{}, {nbtheta, nbtheta+ntheta_s}, {nbphi, nbphi+nphi_s}}); MR_assert(planes.stride(2)==1, "last axis must have stride 1"); MR_assert((planes.stride(1)&1)==0, "stride must be even"); MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(lmax+1)<=nphi_b, "aargh"); vmav,3> leg_s(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_s, lmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav,3> leg_b(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_b, lmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i mval({lmax+1}, UNINITIALIZED); vmav mstart({lmax+1}, UNINITIALIZED); size_t ofs=0; for (size_t i=0; i<=lmax; ++i) { mval(i) = i; mstart(i) = ofs-i; ofs += lmax+1-i; } alm2leg(aarr, leg_s, mbeam, lmax, mval, mstart, 1, theta, nthreads, STANDARD); auto kernel = getKernel(2*ntheta_s-2, 2*ntheta_b-2); ducc0::detail_sht::resample_and_convolve_theta (leg_s, true, true, leg_b, true, true, kernel, mbeam, nthreads, false); // fix phi size_t nj=2*lmax+1; auto phikrn = getKernel(nphi_s, nphi_b); vmav phikrn2({nj}); for (size_t j=0; j rplan(nphi_b); for (size_t iplane=0; iplane(planes, {{iplane},{nbtheta, nbtheta+ntheta_b}, {nbphi, nbphi+nphi_b}}); execParallel(ntheta_b, nthreads, [&](size_t lo, size_t hi) { vmav buf({rplan.bufsize()}); for (size_t i=lo; i=nphi_b) j2-=nphi_b; planes(iplane,nbtheta-1-i,j2+nbphi) = fct*planes(iplane,nbtheta+1+i,j+nbphi); planes(iplane,nbtheta+ntheta_b+i,j2+nbphi) = fct*planes(iplane,nbtheta+ntheta_b-2-i,j+nbphi); } for (size_t i=0; i,1> &slm, const cmav,1> &blm, size_t mbeam, vmav &planes) const { cmav,2> vslm(&slm(0), {1,slm.shape(0)}, {0,slm.stride(0)}); cmav,2> vblm(&blm(0), {1,blm.shape(0)}, {0,blm.stride(0)}); getPlane(vslm, vblm, mbeam, planes); } void interpol(const cmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &psi, vmav &signal) const { constexpr size_t maxsupp = is_same::value ? 16 : 8; interpolx(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal); } void deinterpol(vmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &psi, const cmav &signal) const { constexpr size_t maxsupp = is_same::value ? 16 : 8; deinterpolx(kernel->support(), cube, itheta0, iphi0, theta, phi, psi, signal); } void updateSlm(vmav,2> &vslm, const cmav,2> &vblm, size_t mbeam, vmav &planes) const { size_t nplanes=1+(mbeam>0); auto ncomp = vslm.shape(0); MR_assert(ncomp>0, "need at least one component"); MR_assert(vblm.shape(0)==ncomp, "inconsistent slm and blm vectors"); Alm_Base islm(lmax, lmax), iblm(lmax, kmax); MR_assert(islm.Num_Alms()==vslm.shape(1), "bad array dimension"); MR_assert(iblm.Num_Alms()==vblm.shape(1), "bad array dimension"); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); MR_assert(mbeam <= kmax, "mbeam too high"); // move stuff from border regions onto the main grid for (size_t iplane=0; iplane=nphi_b) j2-=nphi_b; planes(iplane,nbtheta+1+i,j+nbphi) += fct*planes(iplane,nbtheta-1-i,j2+nbphi); planes(iplane,nbtheta+ntheta_b-2-i, j+nbphi) += fct*planes(iplane,nbtheta+ntheta_b+i,j2+nbphi); } } // fix phi size_t nj=2*lmax+1; auto phikrn = getKernel(nphi_b, nphi_s); vmav phikrn2({nj}); for (size_t j=0; j rplan(nphi_b); for (size_t iplane=0; iplane(planes, {{iplane}, {nbtheta, nbtheta+ntheta_b}, {nbphi,nbphi+nphi_b}}); execParallel(ntheta_b, nthreads, [&](size_t lo, size_t hi) { vmav buf({rplan.bufsize()}); for (size_t i=lo; i(planes, {{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi,nbphi+nphi_s}}); MR_assert(planes.stride(2)==1, "last axis must have stride 1"); MR_assert((planes.stride(1)&1)==0, "stride must be even"); MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(lmax+1)<=nphi_b, "aargh"); vmav,3> leg_s(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_s, lmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav,3> leg_b(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_b, lmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i mval({lmax+1}, UNINITIALIZED); vmav mstart({lmax+1}, UNINITIALIZED); size_t ofs=0; for (size_t i=0; i<=lmax; ++i) { mval(i) = i; mstart(i) = ofs-i; ofs += lmax+1-i; } auto kernel = getKernel(2*ntheta_b-2, 2*ntheta_s-2); ducc0::detail_sht::resample_and_convolve_theta (leg_b, true, true, leg_s, true, true, kernel, mbeam, nthreads, true); vectorlnorm(lmax+1); for (size_t i=0; i<=lmax; ++i) lnorm[i]=T(std::sqrt(4*pi/(2*i+1.))); Alm_Base base(lmax,lmax); vmav,2> aarr({nplanes, base.Num_Alms()}, UNINITIALIZED); leg2alm(aarr, leg_s, mbeam, lmax, mval, mstart, 1, theta, nthreads, STANDARD); for (size_t m=0; m<=lmax; ++m) for (size_t l=m; l<=lmax; ++l) if (l>=mbeam) for (size_t i=0; i0) vslm(i,islm.index(l,m)) += aarr(1,base.index(l,m))*tmp.imag(); } } void updateSlm(vmav,1> &slm, const cmav,1> &blm, size_t mbeam, vmav &planes) const { auto vslm(slm.prepend_1()); auto vblm(blm.prepend_1()); updateSlm(vslm, vblm, mbeam, planes); } void prepPsi(vmav &subcube) const { MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension"); auto newpart = subarray<3>(subcube, {{npsi_s, MAXIDX}, {}, {}}); mav_apply([](T &v){v=T(0);}, nthreads, newpart); auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads); for (size_t k=0; k fsubcube(subcube); r2r_fftpack(fsubcube, fsubcube, {0}, false, true, T(1), nthreads); } void deprepPsi(vmav &subcube) const { MR_assert(subcube.shape(0)==npsi_b, "bad psi dimension"); vfmav fsubcube(subcube); r2r_fftpack(fsubcube, fsubcube, {0}, true, false, T(1), nthreads); auto fct = kernel->corfunc(npsi_s/2+1, 1./npsi_b, nthreads); for (size_t k=0; k buildCube(size_t nplanes) const { auto cube_ = vmav::build_noncritical({nplanes, Npsi(), Ntheta(), (Nphi()+1)/2, 2}, UNINITIALIZED); vmav cube = cube_.template reinterpret<4>( {nplanes, Npsi(), Ntheta(), Nphi()}, {cube_.stride(0), cube_.stride(1), cube_.stride(2), 1}); return cube; } }; } using detail_totalconvolve::ConvolverPlan; } #endif