/* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* * Copyright (C) 2020-2023 Max-Planck-Society * Author: Martin Reinecke */ #ifndef DUCC0_SPHERE_INTERPOL_H #define DUCC0_SPHERE_INTERPOL_H #include #include #include #include #include #include #include #include #include #include "ducc0/infra/error_handling.h" #include "ducc0/infra/threading.h" #include "ducc0/math/constants.h" #include "ducc0/math/gridding_kernel.h" #include "ducc0/infra/mav.h" #include "ducc0/infra/simd.h" #include "ducc0/infra/aligned_array.h" #include "ducc0/infra/useful_macros.h" #include "ducc0/infra/bucket_sort.h" #include "ducc0/sht/sht.h" #include "ducc0/sht/alm.h" #include "ducc0/fft/fft1d.h" #include "ducc0/fft/fft.h" #include "ducc0/math/math_utils.h" #include "ducc0/sht/sht_utils.h" #include "ducc0/infra/timers.h" #include "ducc0/nufft/nufft.h" namespace ducc0 { namespace detail_sphereinterpol { using namespace std; template class SphereInterpol { protected: constexpr static auto vlen = min(8, native_simd::size()); using Tsimd = typename simd_select::type; size_t nthreads; size_t lmax, mmax, spin; // _s: small grid // _b: oversampled grid // no suffix: grid with borders size_t nphi_s, ntheta_s; size_t kernel_index; shared_ptr kernel; size_t nphi_b, ntheta_b; double dphi, dtheta, xdphi, xdtheta; size_t nbphi, nbtheta; size_t nphi, ntheta; double phi0, theta0; auto getKernel(size_t axlen, size_t axlen2) const { auto axlen_big = max(axlen, axlen2); auto axlen_small = min(axlen, axlen2); auto fct = kernel->corfunc(axlen_small/2+1, 1./axlen_big, nthreads); return fct; } templatequick_array getIdx(const cmav &theta, const cmav &phi, size_t patch_ntheta, size_t patch_nphi, size_t itheta0, size_t iphi0, size_t supp) const { size_t nptg = theta.shape(0); constexpr size_t cellsize=8; size_t nct = patch_ntheta/cellsize+1, ncp = patch_nphi/cellsize+1; double theta0 = (int(itheta0)-int(nbtheta))*dtheta, phi0 = (int(iphi0)-int(nbphi))*dphi; double theta_lo=theta0, theta_hi=theta_lo+(patch_ntheta+1)*dtheta; double phi_lo=phi0, phi_hi=phi_lo+(patch_nphi+1)*dphi; MR_assert(uint64_t(nct)*uint64_t(ncp)<(uint64_t(1)<<32), "key space too large"); quick_array key(nptg); execParallel(nptg, nthreads, [&](size_t lo, size_t hi) { for (size_t i=lo; i=theta_lo) && (theta(i)<=theta_hi), "theta out of range: ", theta(i)); MR_assert((phi(i)>=phi_lo) && (phi(i)<=phi_hi), "phi out of range: ", phi(i)); auto ftheta = (theta(i)-theta0)*xdtheta-supp*0.5; auto itheta = size_t(ftheta+1); auto fphi = (phi(i)-phi0)*xdphi-supp*0.5; auto iphi = size_t(fphi+1); itheta /= cellsize; iphi /= cellsize; MR_assert(itheta res(key.size()); bucket_sort2(key, res, ncp*nct, nthreads); return res; } template class WeightHelper { public: static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; const SphereInterpol &plan; union kbuf { T scalar[2*nvec*vlen]; Tsimd simd[2*nvec]; #if defined(_MSC_VER) kbuf() {} #endif }; kbuf buf; private: TemplateKernel tkrn; double mytheta0, myphi0; public: WeightHelper(const SphereInterpol &plan_, const mav_info<3> &info, size_t itheta0, size_t iphi0) : plan(plan_), tkrn(*plan.kernel), mytheta0(plan.theta0+itheta0*plan.dtheta), myphi0(plan.phi0+iphi0*plan.dphi), wtheta(&buf.scalar[0]), wphi(&buf.simd[nvec]), jumptheta(info.stride(1)) { MR_assert(info.stride(2)==1, "last axis of cube must be contiguous"); } void prep(double theta, double phi) { auto ftheta = (theta-mytheta0)*plan.xdtheta-supp*0.5; itheta = size_t(ftheta+1); ftheta = -1+(itheta-ftheta)*2; auto fphi = (phi-myphi0)*plan.xdphi-supp*0.5; iphi = size_t(fphi+1); fphi = -1+(iphi-fphi)*2; tkrn.eval2(T(ftheta), T(fphi), &buf.simd[0]); } size_t itheta, iphi; const T * DUCC0_RESTRICT wtheta; const Tsimd * DUCC0_RESTRICT wphi; ptrdiff_t jumptheta; }; // prefetching distance static constexpr size_t pfdist=2; template void interpolx(size_t supp_, const cmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, vmav &signal) const { if constexpr (supp>=8) if (supp_<=supp/2) return interpolx(supp_, cube, itheta0, iphi0, theta, phi, signal); if constexpr (supp>4) if (supp_(supp_, cube, itheta0, iphi0, theta, phi, signal); MR_assert(supp_==supp, "requested support out of range"); MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(signal.shape(1)==theta.shape(0), "array shape mismatch"); const auto ncomp = cube.shape(0); MR_assert(signal.shape(0)==ncomp, "array shape mismatch"); static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; auto idx = getIdx(theta, phi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { WeightHelper hlp(*this, cube, itheta0, iphi0); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind()); signal(1, i) = reduce(tres1*hlp.wphi[0], std::plus<>()); } else { const T * DUCC0_RESTRICT ptr0 = &cube(0, hlp.itheta,hlp.iphi); const T * DUCC0_RESTRICT ptr1 = &cube(1, hlp.itheta,hlp.iphi); Tsimd tres0=0, tres1=0; for (size_t itheta=0; itheta()); signal(1, i) = reduce(tres1, std::plus<>()); } } else { if constexpr(nvec==1) for (size_t icomp=0; icomp()); } else for (size_t icomp=0; icomp()); } } } }); } template void deinterpolx(size_t supp_, vmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &signal) const { if constexpr (supp>=8) if (supp_<=supp/2) return deinterpolx(supp_, cube, itheta0, iphi0, theta, phi, signal); if constexpr (supp>4) if (supp_(supp_, cube, itheta0, iphi0, theta, phi, signal); MR_assert(supp_==supp, "requested support out of range"); MR_assert(cube.stride(2)==1, "last axis of cube must be contiguous"); MR_assert(phi.shape(0)==theta.shape(0), "array shape mismatch"); MR_assert(signal.shape(1)==theta.shape(0), "array shape mismatch"); const auto ncomp = cube.shape(0); MR_assert(signal.shape(0)==ncomp, "array shape mismatch"); static constexpr size_t vlen = Tsimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; auto idx = getIdx(theta, phi, cube.shape(1), cube.shape(2), itheta0, iphi0, supp); constexpr size_t cellsize=16; size_t nct = cube.shape(1)/cellsize+10, ncp = cube.shape(2)/cellsize+10; vmav locks({nct,ncp}); execStatic(idx.size(), nthreads, 0, [&](Scheduler &sched) { size_t b_theta=~(size_t(0)), b_phi=~(size_t(0)); WeightHelper hlp(*this, cube, itheta0, iphi0); while (auto rng=sched.getNext()) for(auto ind=rng.lo; ind(epsilon, sigma_min, sigma_max, {(2*ntheta_s-2), nphi_s}, npoints, true, nthreads)), kernel(selectKernel(kernel_index)), nphi_b(std::max(20,2*good_size_real(size_t((2*mmax+1)*ducc0::getKernel(kernel_index).ofactor/2.)))), ntheta_b(std::max(21,good_size_real(size_t((lmax+1)*ducc0::getKernel(kernel_index).ofactor))+1)), dphi(2*pi/nphi_b), dtheta(pi/(ntheta_b-1)), xdphi(1./dphi), xdtheta(1./dtheta), nbphi((kernel->support()+1)/2), nbtheta((kernel->support()+1)/2), nphi(nphi_b+2*nbphi+vlen), ntheta(ntheta_b+2*nbtheta), phi0(nbphi*(-dphi)), theta0(nbtheta*(-dtheta)) { auto supp = kernel->support(); MR_assert((supp<=ntheta) && (supp<=nphi_b), "kernel support too large!"); } size_t Lmax() const { return lmax; } size_t Mmax() const { return mmax; } size_t Spin() const { return spin; } size_t Ntheta() const { return ntheta; } size_t Nphi() const { return nphi; } vector getPatchInfo(double theta_lo, double theta_hi, double phi_lo, double phi_hi) const { vector res(4); auto tmp = (theta_lo-theta0)*xdtheta-nbtheta; res[0] = min(size_t(max(0., tmp)), ntheta); tmp = (theta_hi-theta0)*xdtheta+nbtheta+1.; res[1] = min(size_t(max(0., tmp)), ntheta); tmp = (phi_lo-phi0)*xdphi-nbphi; res[2] = min(size_t(max(0., tmp)), nphi); tmp = (phi_hi-phi0)*xdphi+nbphi+1.+vlen; res[3] = min(size_t(max(0., tmp)), nphi); return res; } void getPlane(const cmav,2> &valm, const cmav &mstart, ptrdiff_t lstride, vmav &planes, SHT_mode mode, TimerHierarchy &timers) const { timers.push("alm2leg"); size_t nplanes=1+(spin>0); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); auto subplanes=subarray<3>(planes,{{}, {nbtheta, nbtheta+ntheta_s}, {nbphi, nbphi+nphi_s}}); MR_assert(planes.stride(2)==1, "last axis must have stride 1"); MR_assert((planes.stride(1)&1)==0, "stride must be even"); MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(mmax+1)<=nphi_b, "aargh"); vmav,3> leg_s(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav,3> leg_b(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_b, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i mval({mmax+1}, UNINITIALIZED); for (size_t i=0; i<=mmax; ++i) mval(i) = i; alm2leg(valm, leg_s, spin, lmax, mval, mstart, lstride, theta, nthreads, mode); timers.poppush("theta resampling and deconvolution"); auto kernel = getKernel(2*ntheta_s-2, 2*ntheta_b-2); ducc0::detail_sht::resample_and_convolve_theta (leg_s, true, true, leg_b, true, true, kernel, spin, nthreads, false); timers.poppush("phi FFT and dconvolution"); // fix phi size_t nj=2*mmax+1; auto phikrn = getKernel(nphi_s, nphi_b); vmav phikrn2({nj}); for (size_t j=0; j rplan(nphi_b); for (size_t iplane=0; iplane(planes, {{iplane},{nbtheta, nbtheta+ntheta_b}, {nbphi, nbphi+nphi_b}}); execParallel(ntheta_b, nthreads, [&](size_t lo, size_t hi) { vmav buf({rplan.bufsize()}); for (size_t i=lo; i=nphi_b) j2-=nphi_b; planes(iplane,nbtheta-1-i,j2+nbphi) = fct*planes(iplane,nbtheta+1+i,j+nbphi); planes(iplane,nbtheta+ntheta_b+i,j2+nbphi) = fct*planes(iplane,nbtheta+ntheta_b-2-i,j+nbphi); } for (size_t i=0; i,1> &alm, vmav &planes) const { cmav,2> valm(&alm(0), {1,alm.shape(0)}, {0,alm.stride(0)}); getPlane(valm, planes); } template void interpol(const cmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, vmav &signal) const { constexpr size_t maxsupp = is_same::value ? 16 : 8; interpolx(kernel->support(), cube, itheta0, iphi0, theta, phi, signal); } template void deinterpol(vmav &cube, size_t itheta0, size_t iphi0, const cmav &theta, const cmav &phi, const cmav &signal) const { constexpr size_t maxsupp = is_same::value ? 16 : 8; deinterpolx(kernel->support(), cube, itheta0, iphi0, theta, phi, signal); } void updateAlm(vmav,2> &valm, const cmav &mstart, ptrdiff_t lstride, vmav &planes, SHT_mode mode, TimerHierarchy &timers) const { size_t nplanes=1+(spin>0); MR_assert(planes.conformable({nplanes, Ntheta(), Nphi()}), "bad planes shape"); timers.push("dealing with borders"); // move stuff from border regions onto the main grid for (size_t iplane=0; iplane=nphi_b) j2-=nphi_b; planes(iplane,nbtheta+1+i,j+nbphi) += fct*planes(iplane,nbtheta-1-i,j2+nbphi); planes(iplane,nbtheta+ntheta_b-2-i, j+nbphi) += fct*planes(iplane,nbtheta+ntheta_b+i,j2+nbphi); } } timers.poppush("phi FFT and deconvolution"); // fix phi size_t nj=2*mmax+1; auto phikrn = getKernel(nphi_b, nphi_s); vmav phikrn2({nj}); for (size_t j=0; j rplan(nphi_b); for (size_t iplane=0; iplane(planes, {{iplane}, {nbtheta, nbtheta+ntheta_b}, {nbphi,nbphi+nphi_b}}); execParallel(ntheta_b, nthreads, [&](size_t lo, size_t hi) { vmav buf({rplan.bufsize()}); for (size_t i=lo; i(planes, {{0, nplanes}, {nbtheta, nbtheta+ntheta_s}, {nbphi,nbphi+nphi_s}}); MR_assert(planes.stride(2)==1, "last axis must have stride 1"); MR_assert((planes.stride(1)&1)==0, "stride must be even"); MR_assert((planes.stride(0)&1)==0, "stride must be even"); MR_assert(2*(mmax+1)<=nphi_b, "aargh"); vmav,3> leg_s(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_s, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav,3> leg_b(reinterpret_cast *>(&planes(0,nbtheta,nbphi-1)), {nplanes, ntheta_b, mmax+1}, {subplanes.stride(0)/2, subplanes.stride(1)/2, 1}); vmav theta({ntheta_s}, UNINITIALIZED); for (size_t i=0; i mval({mmax+1}, UNINITIALIZED); for (size_t i=0; i<=mmax; ++i) mval(i) = i; auto kernel = getKernel(2*ntheta_b-2, 2*ntheta_s-2); ducc0::detail_sht::resample_and_convolve_theta (leg_b, true, true, leg_s, true, true, kernel, spin, nthreads, true); timers.poppush("leg2alm"); leg2alm(valm, leg_s, spin, lmax, mval, mstart, lstride, theta, nthreads, mode); timers.pop(); } void updateAlm(vmav,1> &alm, vmav &planes, SHT_mode mode) const { auto valm(alm.prepend_1()); updateAlm(valm, planes, mode); } vmav build_planes() const { size_t nplanes=1+(spin>0); auto planes_ = vmav::build_noncritical({nplanes, Ntheta(), (Nphi()+1)/2, 2}, UNINITIALIZED); vmav planes = planes_.template reinterpret<3>( {nplanes, Ntheta(), Nphi()}, {planes_.stride(0), planes_.stride(1), 1}); return planes; } }; } using detail_sphereinterpol::SphereInterpol; } #endif