/* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /*! \file sht.cc * Functionality related to spherical harmonic transforms * * Copyright (C) 2020-2023 Max-Planck-Society * \author Martin Reinecke */ #include #include #include #if ((!defined(DUCC0_NO_SIMD)) && defined(__AVX__) && (!defined(__AVX512F__))) #include #endif #include "ducc0/infra/simd.h" #include "ducc0/sht/sht.h" #include "ducc0/sht/sphere_interpol.h" #include "ducc0/fft/fft1d.h" #include "ducc0/nufft/nufft.h" #include "ducc0/math/math_utils.h" #include "ducc0/math/gl_integrator.h" #include "ducc0/math/constants.h" #include "ducc0/math/solvers.h" #include "ducc0/sht/sht_utils.h" #include "ducc0/infra/timers.h" namespace ducc0 { namespace detail_sht { using namespace std; // the next line is necessary to address some sloppy name choices in hipSYCL using std::min, std::max; static constexpr double sharp_fbig=0x1p+800,sharp_fsmall=0x1p-800; static constexpr double sharp_fbighalf=0x1p+400; struct ringdata { size_t mlim, idx, midx; double cth, sth; }; class YlmBase { public: size_t lmax, mmax, s; vector powlimit; /* used if s==0 */ vector mfac; protected: /* used if s!=0 */ vector flm1, flm2, inv; public: vector prefac; vector fscale; protected: inline void normalize (double &val, int &scale, double xfmax) { while (abs(val)>xfmax) { val*=sharp_fsmall; ++scale; } if (val!=0.) while (abs(val) get_norm(size_t lmax, size_t spin) { /* sign convention for H=1 (LensPix paper) */ #if 1 double spinsign = (spin>0) ? -1.0 : 1.0; #else double spinsign = 1.0; #endif if (spin==0) return vector(lmax+1,1.); vector res(lmax+1); spinsign = (spin&1) ? -spinsign : spinsign; for (size_t l=0; l<=lmax; ++l) res[l] = (l get_d1norm(size_t lmax) { vector res(lmax+1); for (size_t l=0; l<=lmax; ++l) res[l] = (l<1) ? 0. : 0.5*sqrt(l*(l+1.)*(2*l+1.)/(4*pi)); return res; } YlmBase(size_t l_max, size_t m_max, size_t spin) : lmax(l_max), mmax(m_max), s(spin), powlimit(mmax+s+1), mfac((s==0) ? (mmax+1) : 0), flm1((s==0) ? 0 : (2*lmax+3)), flm2((s==0) ? 0 : (2*lmax+3)), inv((s==0) ? 0 : (lmax+2)), prefac((s==0) ? 0 : (mmax+1)), fscale((s==0) ? 0 : (mmax+1)) { MR_assert(l_max>=spin,"incorrect l_max: must be >= spin"); MR_assert(l_max>=m_max,"incorrect l_max: must be >= m_max"); powlimit[0]=0.; constexpr double expo=-400*ln2; for (size_t i=1; i<=m_max+spin; ++i) powlimit[i]=exp(expo/i); if (s==0) { mfac[0] = inv_sqrt4pi; for (size_t i=1; i<=mmax; ++i) mfac[i] = mfac[i-1]*sqrt((2*i+1.)/(2*i)); } else { inv[0]=0; for (size_t i=1; i fac(2*lmax+1); vector facscale(2*lmax+1); fac[0]=1; facscale[0]=0; for (size_t i=1; i<2*lmax+1; ++i) { fac[i]=fac[i-1]*sqrt(i); facscale[i]=facscale[i-1]; normalize(fac[i],facscale[i],sharp_fbighalf); } for (size_t i=0; i<=mmax; ++i) { size_t mlo_=min(s,i), mhi_=max(s,i); double tfac=fac[2*mhi_]/fac[mhi_+mlo_]; int tscale=facscale[2*mhi_]-facscale[mhi_+mlo_]; normalize(tfac,tscale,sharp_fbighalf); tfac/=fac[mhi_-mlo_]; tscale-=facscale[mhi_-mlo_]; normalize(tfac,tscale,sharp_fbighalf); prefac[i]=tfac; fscale[i]=tscale; } } } }; class Ylmgen: public YlmBase { public: struct dbl2 { double a, b; }; size_t m; vector alpha; vector coef; /* used if s==0 */ vector eps; /* used if s!=0 */ size_t sinPow, cosPow; bool preMinus_p, preMinus_m; size_t mlo, mhi; Ylmgen(const YlmBase &base) : YlmBase(base), m(~size_t(0)), alpha((s==0) ? (lmax/2+2) : (lmax+3), 0.), coef((s==0) ? (lmax/2+2) : (lmax+3), {0.,0.}), eps((s==0) ? (lmax+4) : 0), mlo(~size_t(0)), mhi(~size_t(0)) {} void prepare (size_t m_) { if (m_==m) return; m = m_; if (s==0) { eps[m] = 0.; for (size_t l=m+1; lmhi) alpha[l+1] = alpha[l-1]*flp12; else alpha[l+1] = 1.; coef[l+1].a = flp10*alpha[l]/alpha[l+1]; coef[l+1].b = flp11*coef[l+1].a; } } preMinus_p = preMinus_m = false; if (mhi==m) { cosPow = mhi+s; sinPow = mhi-s; preMinus_p = preMinus_m = ((mhi-s)&1); } else { cosPow = mhi+m; sinPow = mhi-m; preMinus_m = ((mhi+m)&1); } } } }; struct ringhelper { using dcmplx = complex; double phi0_; vector shiftarr; size_t s_shift; unique_ptr> plan; vector buf; size_t length; bool norot; ringhelper() : phi0_(0), s_shift(0), length(0), norot(false) {} void update(size_t nph, size_t mmax, double phi0) { norot = (abs(phi0)<1e-14); if (!norot) if ((mmax!=s_shift-1) || (!approx(phi0,phi0_,1e-15))) { shiftarr.resize(mmax+1); s_shift = mmax+1; phi0_ = phi0; MultiExp mexp(phi0, mmax+1); for (size_t m=0; m<=mmax; ++m) shiftarr[m] = mexp[m]; } if (nph!=length) { plan=make_unique>(nph); buf.resize(plan->bufsize()); length=nph; } } template DUCC0_NOINLINE void phase2ring (size_t nph, double phi0, vmav &data, size_t mmax, const cmav,1> &phase) { update (nph, mmax, phi0); if (nph>=2*mmax+1) { if (norot) for (size_t m=0; m<=mmax; ++m) { data(2*m)=phase(m).real(); data(2*m+1)=phase(m).imag(); } else for (size_t m=0; m<=mmax; ++m) { dcmplx tmp = dcmplx(phase(m))*shiftarr[m]; data(2*m)=tmp.real(); data(2*m+1)=tmp.imag(); } for (size_t m=2*(mmax+1); mexec_copyback(&(data(1)), buf.data(), 1., false); } template DUCC0_NOINLINE void ring2phase (size_t nph, double phi0, vmav &data, size_t mmax, vmav,1> &phase) { update (nph, mmax, -phi0); plan->exec_copyback(&(data(1)), buf.data(), 1., true); data(0)=data(1); data(1)=data(nph+1)=0.; if (mmax<=nph/2) { if (norot) for (size_t m=0; m<=mmax; ++m) phase(m) = complex(T(data(2*m)), T(data(2*m+1))); else for (size_t m=0; m<=mmax; ++m) phase(m) = complex(dcmplx(data(2*m), data(2*m+1)) * shiftarr[m]); } else { for (size_t m=0, idx=0; m<=mmax; ++m, idx=(idx+1==nph) ? 0 : idx+1) { dcmplx val; if (idx<(nph-idx)) val = dcmplx(data(2*idx), data(2*idx+1)); else val = dcmplx(data(2*(nph-idx)), -data(2*(nph-idx)+1)); if (!norot) val *= shiftarr[m]; phase(m)=complex(val); } } } }; using Tv=native_simd; static constexpr size_t VLEN=Tv::size(); #if ((!defined(DUCC0_NO_SIMD)) && defined(__AVX__) && (!defined(__AVX512F__))) static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, complex * DUCC0_RESTRICT cc) { auto tmp1=_mm256_hadd_pd(__m256d(a),__m256d(b)), tmp2=_mm256_hadd_pd(__m256d(c),__m256d(d)); auto tmp3=_mm256_permute2f128_pd(tmp1,tmp2,49), tmp4=_mm256_permute2f128_pd(tmp1,tmp2,32); tmp1=tmp3+tmp4; cc[0]+=complex(tmp1[0], tmp1[1]); cc[1]+=complex(tmp1[2], tmp1[3]); } #else static inline void vhsum_cmplx_special (Tv a, Tv b, Tv c, Tv d, complex * DUCC0_RESTRICT cc) { cc[0] += complex(reduce(a,std::plus<>()),reduce(b,std::plus<>())); cc[1] += complex(reduce(c,std::plus<>()),reduce(d,std::plus<>())); } #endif using dcmplx = complex; static constexpr double sharp_ftol=0x1p-60; constexpr size_t nv0 = 128/VLEN; constexpr size_t nvx = 64/VLEN; using Tbv0 = std::array; using Tbs0 = std::array; struct s0data_v { Tbv0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; }; struct s0data_s { Tbs0 sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; }; union s0data_u { s0data_v v; s0data_s s; #if defined(_MSC_VER) s0data_u() {} #endif }; using Tbvx = std::array; using Tbsx = std::array; struct sxdata_v { Tbvx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; }; struct sxdata_s { Tbsx sth, cfp, cfm, scp, scm, l1p, l2p, l1m, l2m, cth, p1pr, p1pi, p2pr, p2pi, p1mr, p1mi, p2mr, p2mi; }; union sxdata_u { sxdata_v v; sxdata_s s; #if defined(_MSC_VER) sxdata_u() {} #endif }; static inline void Tvnormalize (Tv & DUCC0_RESTRICT val_, Tv & DUCC0_RESTRICT scale_, double maxval) { // This copying is necessary for MSVC ... no idea why Tv val = val_; Tv scale = scale_; const Tv vfmin=sharp_fsmall*maxval, vfmax=maxval; const Tv vfsmall=sharp_fsmall, vfbig=sharp_fbig; auto mask = abs(val)>vfmax; while (any_of(mask)) { where(mask,val)*=vfsmall; where(mask,scale)+=1; mask = abs(val)>vfmax; } mask = (abs(val) &powlimit, Tv & DUCC0_RESTRICT resd, Tv & DUCC0_RESTRICT ress) { Tv vminv=powlimit[npow]; auto mask = abs(val)>=1); resd=res; ress=0; } else { Tv scale=0, scaleint=0, res=1; Tvnormalize(val,scaleint,sharp_fbighalf); do { if (npow&1) { res*=val; scale+=scaleint; Tvnormalize(res,scale,sharp_fbighalf); } val*=val; scaleint+=scaleint; Tvnormalize(val,scaleint,sharp_fbighalf); } while(npow>>=1); resd=res; ress=scale; } } static inline void getCorfac(Tv scale, Tv & DUCC0_RESTRICT corfac) { // not sure why, but MSVC miscompiles the default code #if defined(_MSC_VER) for (size_t i=0; i0.5,corfac)=sharp_fbig; #endif } static inline bool rescale(Tv &v1, Tv &v2, Tv &s, Tv eps) { auto mask = abs(v2)>eps; if (any_of(mask)) { where(mask,v1)*=sharp_fsmall; where(mask,v2)*=sharp_fsmall; where(mask,s)+=1; return true; } return false; } DUCC0_NOINLINE static void iter_to_ieee(const Ylmgen &gen, s0data_v & DUCC0_RESTRICT d, size_t & DUCC0_RESTRICT l_, size_t & DUCC0_RESTRICT il_, size_t nv2) { size_t l=gen.m, il=0; Tv mfac = (gen.m&1) ? -gen.mfac[gen.m]:gen.mfac[gen.m]; bool below_limit = true; for (size_t i=0; igen.lmax) {l_=gen.lmax+1;return;} below_limit=1; Tv a1=gen.coef[il ].a, b1=gen.coef[il ].b; Tv a2=gen.coef[il+1].a, b2=gen.coef[il+1].b; for (size_t i=0; i &coef, const dcmplx * DUCC0_RESTRICT alm, size_t l, size_t il, size_t lmax, size_t nv2) { #if 0 for (; l+6<=lmax; il+=4, l+=8) { Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv ar3=alm[l+2].real(), ai3=alm[l+2].imag(); Tv ar4=alm[l+3].real(), ai4=alm[l+3].imag(); Tv ar5=alm[l+4].real(), ai5=alm[l+4].imag(); Tv ar6=alm[l+5].real(), ai6=alm[l+5].imag(); Tv ar7=alm[l+6].real(), ai7=alm[l+6].imag(); Tv ar8=alm[l+7].real(), ai8=alm[l+7].imag(); Tv a1=coef[il ].a, b1=coef[il ].b; Tv a2=coef[il+1].a, b2=coef[il+1].b; Tv a3=coef[il+2].a, b3=coef[il+2].b; Tv a4=coef[il+3].a, b4=coef[il+3].b; for (size_t i=0; ilmax) return; auto &coef = gen.coef; bool full_ieee=true; for (size_t i=0; i=0); } while((!full_ieee) && (l<=lmax)) { Tv ar1=alm[l ].real(), ai1=alm[l ].imag(); Tv ar2=alm[l+1].real(), ai2=alm[l+1].imag(); Tv a=coef[il].a, b=coef[il].b; full_ieee=1; for (size_t i=0; i=0); } l+=2; ++il; } if (l>lmax) return; for (size_t i=0; i &coef, dcmplx * DUCC0_RESTRICT alm, size_t l, size_t il, size_t lmax, size_t nv2) { for (; l+2<=lmax; il+=2, l+=4) { Tv a1=coef[il ].a, b1=coef[il ].b; Tv a2=coef[il+1].a, b2=coef[il+1].b; Tv atmp1[4] = {0,0,0,0}; Tv atmp2[4] = {0,0,0,0}; for (size_t i=0; ilmax) return; auto &coef = gen.coef; bool full_ieee=true; for (size_t i=0; i=0); } while((!full_ieee) && (l<=lmax)) { Tv a=coef[il].a, b=coef[il].b; Tv atmp[4] = {0,0,0,0}; full_ieee=1; for (size_t i=0; i=0); } vhsum_cmplx_special (atmp[0], atmp[1], atmp[2], atmp[3], &alm[l]); l+=2; ++il; } if (l>lmax) return; for (size_t i=0; igen.lmax) {l_=gen.lmax+1;return;} below_limit=1; Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; for (size_t i=0; i &fx, const dcmplx * DUCC0_RESTRICT alm, size_t l, size_t lmax, size_t nv2) { size_t lsave = l; while (l<=lmax) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); for (size_t i=0; ilmax) return; const auto &fx = gen.coef; bool full_ieee=true; for (size_t i=0; i=0) && all_of(d.scm[i]>=0); } while((!full_ieee) && (l<=lmax)) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv agr1=alm[2*l ].real(), agi1=alm[2*l ].imag(), acr1=alm[2*l+1].real(), aci1=alm[2*l+1].imag(); Tv agr2=alm[2*l+2].real(), agi2=alm[2*l+2].imag(), acr2=alm[2*l+3].real(), aci2=alm[2*l+3].imag(); full_ieee=true; for (size_t i=0; i=0); if (rescale(d.l1m[i], d.l2m[i], d.scm[i], sharp_ftol)) getCorfac(d.scm[i], d.cfm[i]); full_ieee &= all_of(d.scm[i]>=0); } l+=2; } // if (l>lmax) return; for (size_t i=0; i &fx, dcmplx * DUCC0_RESTRICT alm, size_t l, size_t lmax, size_t nv2) { size_t lsave=l; while (l<=lmax) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv agr1=0, agi1=0, acr1=0, aci1=0; Tv agr2=0, agi2=0, acr2=0, aci2=0; for (size_t i=0; ilmax) return; const auto &fx = gen.coef; bool full_ieee=true; for (size_t i=0; i=0) && all_of(d.scm[i]>=0); } for (size_t i=0; i=0); if (rescale(d.l1m[i], d.l2m[i], d.scm[i], sharp_ftol)) getCorfac(d.scm[i], d.cfm[i]); full_ieee &= all_of(d.scm[i]>=0); } vhsum_cmplx_special (agr1,agi1,acr1,aci1,&alm[2*l]); vhsum_cmplx_special (agr2,agi2,acr2,aci2,&alm[2*l+2]); l+=2; } if (l>lmax) return; for (size_t i=0; i &fx, const dcmplx * DUCC0_RESTRICT alm, size_t l, size_t lmax, size_t nv2) { size_t lsave=l; while (l<=lmax) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), ar2=alm[l+1].real(), ai2=alm[l+1].imag(); for (size_t i=0; ilmax) return; const auto &fx = gen.coef; bool full_ieee=true; for (size_t i=0; i=0) && all_of(d.scm[i]>=0); } while((!full_ieee) && (l<=lmax)) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv ar1=alm[l ].real(), ai1=alm[l ].imag(), ar2=alm[l+1].real(), ai2=alm[l+1].imag(); full_ieee=true; for (size_t i=0; i=0); if (rescale(d.l1m[i], d.l2m[i], d.scm[i], sharp_ftol)) getCorfac(d.scm[i], d.cfm[i]); full_ieee &= all_of(d.scm[i]>=0); } l+=2; } // if (l>lmax) return; for (size_t i=0; i &fx, dcmplx * DUCC0_RESTRICT alm, size_t l, size_t lmax, size_t nv2) { size_t lsave=l; while (l<=lmax) { Tv fx10=fx[l+1].a,fx11=fx[l+1].b; Tv fx20=fx[l+2].a,fx21=fx[l+2].b; Tv agr1=0, agi1=0; Tv agr2=0, agi2=0; for (size_t i=0; ilmax) return; const auto &fx = gen.coef; bool full_ieee=true; for (size_t i=0; i=0) && all_of(d.scm[i]>=0); } for (size_t i=0; i=0); if (rescale(d.l1m[i], d.l2m[i], d.scm[i], sharp_ftol)) getCorfac(d.scm[i], d.cfm[i]); full_ieee &= all_of(d.scm[i]>=0); } vhsum_cmplx_special (agr1,agi1,agr2,agi2,&alm[l]); l+=2; } if (l>lmax) return; for (size_t i=0; i DUCC0_NOINLINE static void inner_loop_a2m(SHT_mode mode, vmav,2> &almtmp, vmav,3> &phase, const vector &rdata, Ylmgen &gen, size_t mi) { if (gen.s==0) { // adjust the a_lm for the new algorithm MR_assert(almtmp.stride(1)==1, "bad stride"); dcmplx * DUCC0_RESTRICT alm=almtmp.data(); for (size_t il=0, l=gen.m; l<=gen.lmax; ++il,l+=2) { dcmplx al = alm[l]; dcmplx al1 = (l+1>gen.lmax) ? 0. : alm[l+1]; dcmplx al2 = (l+2>gen.lmax) ? 0. : alm[l+2]; alm[l ] = gen.alpha[il]*(gen.eps[l+1]*al + gen.eps[l+2]*al2); alm[l+1] = gen.alpha[il]*al1; } constexpr size_t nval=nv0*VLEN; s0data_u d; array idx, midx; Tbv0 cth; size_t ith=0; while (ith=gen.m) { idx[nth] = rdata[ith].idx; midx[nth] = rdata[ith].midx; auto lcth = rdata[ith].cth; cth[nth/VLEN][nth%VLEN] = lcth; if (abs(lcth)>0.99) d.s.csq[nth]=(1.-rdata[ith].sth)*(1.+rdata[ith].sth); else d.s.csq[nth]=lcth*lcth; d.s.sth[nth]=rdata[ith].sth; ++nth; } else phase(0, rdata[ith].idx, mi) = phase(0, rdata[ith].midx, mi) = 0; ++ith; } if (nth>0) { size_t nvec = (nth+VLEN-1)/VLEN; size_t i2 = nvec*VLEN; for (auto i=nth; i(T(d.s.p1r[i]),T(d.s.p1i[i])); if (idx[i]!=midx[i]) phase(0, midx[i], mi) = complex(T(d.s.p2r[i]),T(d.s.p2i[i])); } } } } else { //adjust the a_lm for the new algorithm for (size_t l=gen.mhi; l<=gen.lmax+1; ++l) for (size_t i=0; i idx, midx; size_t ith=0; while (ith=gen.m) { idx[nth] = rdata[ith].idx; midx[nth] = rdata[ith].midx; d.s.cth[nth]=rdata[ith].cth; d.s.sth[nth]=rdata[ith].sth; ++nth; } else { phase(0, rdata[ith].idx, mi) = phase(0, rdata[ith].midx, mi) = 0; phase(1, rdata[ith].idx, mi) = phase(1, rdata[ith].midx, mi) = 0; } ++ith; } if (nth>0) { size_t nvec = (nth+VLEN-1)/VLEN; size_t i2 = nvec*VLEN; for (size_t i=nth; i(T(d.s.p1pr[i]), T(d.s.p1pi[i])); phase(1, idx[i], mi) = complex(T(d.s.p1mr[i]), T(d.s.p1mi[i])); if (idx[i]!=midx[i]) { phase(0, midx[i], mi) = complex(T(d.s.p2pr[i]), T(d.s.p2pi[i])); phase(1, midx[i], mi) = complex(T(d.s.p2mr[i]), T(d.s.p2mi[i])); } } } } } } template DUCC0_NOINLINE static void inner_loop_m2a(SHT_mode mode, vmav,2> &almtmp, const cmav,3> &phase, const vector &rdata, Ylmgen &gen, size_t mi) { if (gen.s==0) { constexpr size_t nval=nv0*VLEN; size_t ith=0; while (ith=gen.m) { if (abs(rdata[ith].cth)>0.99) d.s.csq[nth]=(1.-rdata[ith].sth)*(1.+rdata[ith].sth); else d.s.csq[nth]=rdata[ith].cth*rdata[ith].cth; d.s.sth[nth]=rdata[ith].sth; dcmplx ph1=phase(0, rdata[ith].idx, mi); dcmplx ph2=(rdata[ith].idx==rdata[ith].midx) ? 0 : phase(0, rdata[ith].midx, mi); d.s.p1r[nth]=(ph1+ph2).real(); d.s.p1i[nth]=(ph1+ph2).imag(); d.s.p2r[nth]=(ph1-ph2).real(); d.s.p2i[nth]=(ph1-ph2).imag(); //adjust for new algorithm d.s.p2r[nth]*=rdata[ith].cth; d.s.p2i[nth]*=rdata[ith].cth; ++nth; } ++ith; } if (nth>0) { size_t i2=((nth+VLEN-1)/VLEN)*VLEN; for (size_t i=nth; igen.lmax) ? 0. : alm[l+1]; alm[l ] = gen.alpha[il]*gen.eps[l+1]*al + alold*gen.eps[l]*alm2; alm[l+1] = gen.alpha[il]*al1; alm2=al; alold=gen.alpha[il]; } } else { constexpr size_t nval=nvx*VLEN; size_t ith=0; while (ith=gen.m) { d.s.cth[nth]=rdata[ith].cth; d.s.sth[nth]=rdata[ith].sth; dcmplx p1Q=phase(0, rdata[ith].idx, mi), p1U=phase(1, rdata[ith].idx, mi), p2Q=(rdata[ith].idx!=rdata[ith].midx) ? phase(0, rdata[ith].midx, mi):0., p2U=(rdata[ith].idx!=rdata[ith].midx) ? phase(1, rdata[ith].midx, mi):0.; if ((gen.mhi-gen.m+gen.s)&1) { p2Q=-p2Q; p2U=-p2U; } d.s.p1pr[nth]=(p1Q+p2Q).real(); d.s.p1pi[nth]=(p1Q+p2Q).imag(); d.s.p1mr[nth]=(p1U+p2U).real(); d.s.p1mi[nth]=(p1U+p2U).imag(); d.s.p2pr[nth]=(p1Q-p2Q).real(); d.s.p2pi[nth]=(p1Q-p2Q).imag(); d.s.p2mr[nth]=(p1U-p2U).real(); d.s.p2mi[nth]=(p1U-p2U).imag(); ++nth; } ++ith; } if (nth>0) { size_t i2=((nth+VLEN-1)/VLEN)*VLEN; for (size_t i=nth; i &mval, size_t lmax) { size_t nm=mval.shape(0); size_t mmax=0; vector present(lmax+1, false); for (size_t mi=0; mi make_ringdata(const cmav &theta, size_t lmax, size_t spin) { size_t nrings = theta.shape(0); struct ringinfo { double theta, cth, sth; size_t idx; }; vector tmp(nrings); for (size_t i=0; i res; size_t pos=0; while (pos get_dh_weights(size_t nrings) { vector weight(nrings); weight[0]=2.; for (size_t k=1; k<=(nrings/2-1); ++k) weight[2*k-1]=2./(1.-4.*k*k); weight[2*(nrings/2)-1]=(nrings-3.)/(2*(nrings/2)-1) -1.; pocketfft_r plan(nrings); plan.exec(weight.data(), 1., false); weight[0] = 0.; // ensure that this is an exact zero return weight; } void get_gridweights(const string &type, vmav &wgt) { size_t nrings=wgt.shape(0); if (type=="GL") // Gauss-Legendre { ducc0::GL_Integrator integ(nrings); auto xwgt = integ.weights(); for (size_t m=0; m xwgt(nrings); xwgt[0]=2.; UnityRoots roots(2*nrings); for (size_t k=1; k<=(nrings-1)/2; ++k) { auto tmp = roots[k]; xwgt[2*k-1]=2./(1.-4.*k*k)*tmp.real(); xwgt[2*k ]=2./(1.-4.*k*k)*tmp.imag(); } if ((nrings&1)==0) xwgt[nrings-1]=0.; pocketfft_r plan(nrings); plan.exec(xwgt.data(), 1., false); for (size_t m=0; m<(nrings+1)/2; ++m) wgt(m)=wgt(nrings-1-m)=xwgt[m]*2*pi/nrings; } else if (type=="CC") // Clenshaw-Curtis { /* Weights from Waldvogel 2006: BIT Numerical Mathematics 46, p. 195 */ MR_assert(nrings>1, "too few rings for Clenshaw-Curtis grid"); size_t n=nrings-1; double dw=-1./(n*n-1.+(n&1)); vector xwgt(nrings); xwgt[0]=2.+dw; for (size_t k=1; k1) xwgt[2*(n/2)-1]=(n-3.)/(2*(n/2)-1) -1. -dw*((2-(n&1))*n-1); pocketfft_r plan(n); plan.exec(xwgt.data(), 1., false); for (size_t m=0; m<(nrings+1)/2; ++m) wgt(m)=wgt(nrings-1-m)=xwgt[m]*2*pi/n; } else if (type=="F2") // Fejer 2 { auto xwgt = get_dh_weights(nrings+1); for (size_t m=0; m get_gridweights(const string &type, size_t nrings) { vmav wgt({nrings}, UNINITIALIZED); get_gridweights(type, wgt); return wgt; } bool downsampling_ok(const cmav &theta, size_t lmax, bool &npi, bool &spi, size_t &ntheta_out) { size_t ntheta = theta.shape(0); if (ntheta<=500) return false; // not worth thinking about shortcuts npi = abs_approx(theta(0), 0., 1e-14); spi = abs_approx(theta(ntheta-1), pi, 1e-14); size_t nthetafull = 2*ntheta-npi-spi; double dtheta = 2*pi/nthetafull; for (size_t i=0; i void alm2leg( // associated Legendre transform const cmav,2> &alm, // (ncomp, lmidx) vmav,3> &leg, // (ncomp, nrings, nm) size_t spin, size_t lmax, const cmav &mval, // (nm) const cmav &mstart, // (nm) ptrdiff_t lstride, const cmav &theta, // (nrings) size_t nthreads, SHT_mode mode, bool theta_interpol) { // sanity checks auto nrings=theta.shape(0); MR_assert(nrings==leg.shape(1), "nrings mismatch"); auto nm=mval.shape(0); MR_assert(nm==mstart.shape(0), "nm mismatch"); MR_assert(nm==leg.shape(2), "nm mismatch"); auto nalm=alm.shape(0); auto mmax = get_mmax(mval, lmax); if (mode==DERIV1) { spin=1; MR_assert(nalm==1, "need one a_lm component"); MR_assert(leg.shape(0)==2, "need two Legendre components"); } else if (mode==GRAD_ONLY) { MR_assert(spin>0, "spin must be positive for grad-only SHTs"); MR_assert(nalm==1, "need one a_lm component"); MR_assert(leg.shape(0)==2, "need two Legendre components"); } else { size_t ncomp = (spin==0) ? 1 : 2; MR_assert(nalm==ncomp, "incorrect number of a_lm components"); MR_assert(leg.shape(0)==ncomp, "incorrect number of Legendre components"); } if (even_odd_m(mval)) { bool npi, spi; size_t ntheta_tmp; if (downsampling_ok(theta, lmax, npi, spi, ntheta_tmp)) { vmav theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i(leg, {{},{0,ntheta_tmp},{}})); alm2leg(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); resample_theta(leg_tmp, true, true, leg, npi, spi, spin, nthreads, false); } else { auto leg_tmp(vmav,3>::build_noncritical({leg.shape(0),ntheta_tmp,leg.shape(2)}, UNINITIALIZED)); alm2leg(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); resample_theta(leg_tmp, true, true, leg, npi, spi, spin, nthreads, false); } return; } if (theta_interpol && (nrings>500) && (nrings>1.5*lmax)) // irregular and worth resampling { auto ntheta_tmp = good_size_complex(lmax+1)+1; vmav theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i,3> leg_tmp({leg.shape(0), ntheta_tmp, leg.shape(2)},UNINITIALIZED); alm2leg(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); resample_leg_CC_to_irregular(leg_tmp, leg, theta, spin, mval, nthreads); return; } } auto norm_l = (mode==DERIV1) ? Ylmgen::get_d1norm (lmax) : Ylmgen::get_norm (lmax, spin); auto rdata = make_ringdata(theta, lmax, spin); YlmBase base(lmax, mmax, spin); ducc0::execDynamic(nm, nthreads, 1, [&](ducc0::Scheduler &sched) { Ylmgen gen(base); vmav,2> almtmp({lmax+2,nalm}, UNINITIALIZED); while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi void leg2alm( // associated Legendre transform vmav,2> &alm, // (ncomp, lmidx) const cmav,3> &leg, // (ncomp, nrings, nm) size_t spin, size_t lmax, const cmav &mval, // (nm) const cmav &mstart, // (nm) ptrdiff_t lstride, const cmav &theta, // (nrings) size_t nthreads, SHT_mode mode, bool theta_interpol) { // sanity checks auto nrings=theta.shape(0); MR_assert(nrings==leg.shape(1), "nrings mismatch"); auto nm=mval.shape(0); MR_assert(nm==mstart.shape(0), "nm mismatch"); MR_assert(nm==leg.shape(2), "nm mismatch"); auto mmax = get_mmax(mval, lmax); auto nalm = alm.shape(0); if (mode==DERIV1) MR_fail("DERIV1 mode not supported in map->alm direction"); else if (mode==GRAD_ONLY) { MR_assert(spin>0, "spin must be positive for grad-only SHTs"); MR_assert(nalm==1, "need one a_lm component"); MR_assert(leg.shape(0)==2, "need two Legendre components"); } else { size_t ncomp = (spin==0) ? 1 : 2; MR_assert(nalm==ncomp, "incorrect number of a_lm components"); MR_assert(leg.shape(0)==ncomp, "incorrect number of Legendre components"); } if (even_odd_m(mval)) { bool npi, spi; size_t ntheta_tmp; if (downsampling_ok(theta, lmax, npi, spi, ntheta_tmp)) { vmav theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i,3>::build_noncritical({leg.shape(0), ntheta_tmp, leg.shape(2)}, UNINITIALIZED)); resample_theta(leg, npi, spi, leg_tmp, true, true, spin, nthreads, true); leg2alm(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); return; } if (theta_interpol && (nrings>500) && (nrings>1.5*lmax)) // irregular and worth resampling { auto ntheta_tmp = good_size_complex(lmax+1)+1; vmav theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i,3> leg_tmp({leg.shape(0), ntheta_tmp, leg.shape(2)},UNINITIALIZED); resample_leg_irregular_to_CC(leg, leg_tmp, theta, spin, mval, nthreads); leg2alm(alm, leg_tmp, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode); return; } } auto norm_l = Ylmgen::get_norm (lmax, spin); auto rdata = make_ringdata(theta, lmax, spin); YlmBase base(lmax, mmax, spin); ducc0::execDynamic(nm, nthreads, 1, [&](ducc0::Scheduler &sched) { Ylmgen gen(base); vmav,2> almtmp({lmax+2,nalm}, UNINITIALIZED); while (auto rng=sched.getNext()) for(auto mi=rng.lo; mi(almtmp(l,ialm)*norm_l[l]); } }); /* end of parallel region */ } template void leg2map( // FFT vmav &map, // (ncomp, pix) const cmav,3> &leg, // (ncomp, nrings, mmax+1) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads) { size_t ncomp=map.shape(0); MR_assert(ncomp==leg.shape(0), "number of components mismatch"); size_t nrings=leg.shape(1); MR_assert(nrings>=1, "need at least one ring"); MR_assert((nrings==nphi.shape(0)) && (nrings==ringstart.shape(0)) && (nrings==phi0.shape(0)), "inconsistent number of rings"); MR_assert(leg.shape(2)>=1, "bad mmax"); size_t mmax=leg.shape(2)-1; // bool well_behaved=true; // if (nrings==1) well_behaved=false; // size_t dring = (nrings>1) ? ringstart(1)-ringstart(0) : ~size_t(0); size_t nphmax=0; for (size_t i=0; i0) && (ringstart(i)-ringstart(i-1) != dring)) well_behaved=false; } // if (nphmax<2*mmax+1) well_behaved=false; #if 0 if (well_behaved) { auto xmap(map.template reinterpret<3>({ncomp, nrings, nphmax}, {map.stride(0), ptrdiff_t(dring*map.stride(1)), pixstride*map.stride(1)})); execParallel(nrings, nthreads, [&](size_t lo, size_t hi) { if (lo==hi) return; for (size_t icomp=0; icomp xmapf=subarray<2>(xmap,{{icomp},{lo,hi},{}}); r2r_fftpack(xmapf,xmapf,{1},false,false,T(1),1); } }); } else #endif execDynamic(nrings, nthreads, 4, [&](Scheduler &sched) { ringhelper helper; vmav ringtmp({nphmax+2}, UNINITIALIZED); while (auto rng=sched.getNext()) for(auto ith=rng.lo; ith(leg, {{icomp}, {ith}, {}}); helper.phase2ring (nphi(ith),phi0(ith),ringtmp,mmax,ltmp); for (size_t i=0; i void map2leg( // FFT const cmav &map, // (ncomp, pix) vmav,3> &leg, // (ncomp, nrings, mmax+1) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads) { size_t ncomp=map.shape(0); MR_assert(ncomp==leg.shape(0), "number of components mismatch"); size_t nrings=leg.shape(1); MR_assert(nrings>=1, "need at least one ring"); MR_assert((nrings==nphi.shape(0)) && (nrings==ringstart.shape(0)) && (nrings==phi0.shape(0)), "inconsistent number of rings"); MR_assert(leg.shape(2)>=1, "bad mmax"); size_t mmax=leg.shape(2)-1; // bool well_behaved=true; // if (nrings==1) well_behaved=false; // size_t dring = (nrings>1) ? ringstart(1)-ringstart(0) : ~size_t(0); size_t nphmax=0; for (size_t i=0; i0) && (ringstart(i)-ringstart(i-1) != dring)) well_behaved=false; } // if (nphmax<2*mmax+1) well_behaved=false; #if 0 if (well_behaved) { auto xmap(map.template reinterpret<3>({ncomp, nrings, nphmax}, {map.stride(0), ptrdiff_t(dring*map.stride(1)), pixstride*map.stride(1)})); size_t blksz=min(nrings,64); size_t nblocks = (nrings+blksz-1)/blksz; execParallel(nblocks, nthreads, [&](size_t lo, size_t hi) { if (lo==hi) return; vmav buf({blksz, nphmax}, UNINITIALIZED); for (size_t icomp=0; icomp xmapf=subarray<2>(xmap,{{icomp},{r0,r1},{}}); vfmav buff=subarray<2>(buf,{{0,r1-r0},{}}); r2r_fftpack(xmapf,buff,{1},true,true,T(1),1); for (size_t iring=r0; iring(buf(iring-r0, 2*m-1), buf(iring-r0, 2*m)); } } }); } else #endif execDynamic(nrings, nthreads, 4, [&](Scheduler &sched) { ringhelper helper; vmav ringtmp({nphmax+2}, UNINITIALIZED); while (auto rng=sched.getNext()) for(auto ith=rng.lo; ith(leg, {{icomp}, {ith}, {}}); helper.ring2phase (nphi(ith),phi0(ith),ringtmp,mmax,ltmp); } } }); /* end of parallel region */ } template void resample_to_prepared_CC(const cmav,3> &legi, bool npi, bool spi, vmav,3> &lego, size_t spin, size_t lmax, size_t nthreads) { constexpr size_t chunksize=64; MR_assert(legi.shape(0)==lego.shape(0), "number of components mismatch"); auto nm = legi.shape(2); MR_assert(lego.shape(2)==nm, "dimension mismatch"); size_t nrings_in = legi.shape(1); size_t nfull_in = 2*nrings_in-npi-spi; size_t nrings_out = lego.shape(1); size_t nfull_out = 2*nrings_out-2; bool need_first_resample = !(npi&&spi&&(nrings_in>=2*lmax+2)); size_t nfull = need_first_resample ? 2*nfull_out : nfull_in; vector> shift(npi ? 0 : nrings_in+1); if (!npi) { UnityRoots> roots(2*nfull_in); for (size_t i=0; i plan_in(need_first_resample ? nfull_in : 1), plan_out(nfull_out), plan_full(nfull); execDynamic((nm+1)/2, nthreads, chunksize, [&](Scheduler &sched) { vmav,1> tmp({max(nfull,nfull_in)}, UNINITIALIZED); vmav,1> buf({max(plan_in.bufsize(), max(plan_out.bufsize(), plan_full.bufsize()))}, UNINITIALIZED); while (auto rng=sched.getNext()) { for (size_t n=0; n(legi, {{n},{},{2*rng.lo,MAXIDX}})); auto llego(subarray<2>(lego, {{n},{},{2*rng.lo,MAXIDX}})); for (size_t j=0; j+rng.lo v1 = llegi(i,2*j); complex v2 = ((2*j+1) *)tmp.data(), (Cmplx *)buf.data(), T(1), true); // shift if (!npi) for (size_t i=1, im=nfull_in-1; (infull_in) // pad { size_t dist = nfull-nfull_in; size_t nmove = nfull_in/2; for (size_t i=nfull-1; i+1+nmove>nfull; --i) tmp(i) = tmp(i-dist); for (size_t i=nfull-nmove-dist; i+nmove *)tmp.data(), (Cmplx *)buf.data(), T(1), false); } for (size_t i=0, im=nfull; i<=im; ++i, --im) { tmp(i) *= T(wgt(i)); if ((i==0) || (i==im)) tmp(i)*=2; if ((im *)tmp.data(), (Cmplx *)buf.data(), T(1), true); if (nfull_out *)tmp.data(), (Cmplx *)buf.data(), T(1), false); auto norm = T(.5/(nfull_out*((need_first_resample ? nfull_in : 1)))); for (size_t i=0; i void resample_from_prepared_CC(const cmav,3> &legi, vmav,3> &lego, bool npo, bool spo, size_t spin, size_t lmax, size_t nthreads) { constexpr size_t chunksize=64; MR_assert(legi.shape(0)==lego.shape(0), "number of components mismatch"); auto nm = legi.shape(2); MR_assert(lego.shape(2)==nm, "dimension mismatch"); size_t nrings_in = legi.shape(1); size_t nfull_in = 2*nrings_in-2; size_t nrings_out = lego.shape(1); size_t nfull_out = 2*nrings_out-npo-spo; bool need_second_resample = !(npo&&spo&&(nrings_out>=2*lmax+2)); size_t nfull = need_second_resample ? 2*nfull_in : nfull_out; vector> shift(npo ? 0 : nrings_out+1); if (!npo) { UnityRoots> roots(2*nfull_out); for (size_t i=0; i plan_in(nfull_in), plan_out(need_second_resample ? nfull_out : 1), plan_full(nfull); execDynamic((nm+1)/2, nthreads, chunksize, [&](Scheduler &sched) { vmav,1> tmp({max(nfull,nfull_out)}, UNINITIALIZED); vmav,1> buf({max(plan_in.bufsize(), max(plan_out.bufsize(), plan_full.bufsize()))}, UNINITIALIZED); while (auto rng=sched.getNext()) { for (size_t n=0; n(legi, {{n},{},{2*rng.lo,MAXIDX}})); auto llego(subarray<2>(lego, {{n},{},{2*rng.lo,MAXIDX}})); for (size_t j=0; j+rng.lo v1 = llegi(i,2*j); complex v2 = ((2*j+1) *)tmp.data(), (Cmplx *)buf.data(), T(1), false); // zero padding to full-resolution CC grid if (nfull>nfull_in) // pad { size_t dist = nfull-nfull_in; size_t nmove = nfull_in/2; for (size_t i=nfull-1; i+1+nmove>nfull; --i) tmp(i) = tmp(i-dist); for (size_t i=nfull-nmove-dist; i+nmove=nfull_in, "must not happen"); plan_full.exec_copyback((Cmplx *)tmp.data(), (Cmplx *)buf.data(), T(1), true); for (size_t i=0, im=nfull; i<=im; ++i, --im) { tmp(i) *= T(wgt(i)); if ((i==0) || (i==im)) tmp(i)*=2; if ((im *)tmp.data(), (Cmplx *)buf.data(), T(1), false); if (nfull_out>nfull) // pad { size_t dist = nfull_out-nfull; size_t nmove = nfull/2; for (size_t i=nfull_out-1; i+1+nmove>nfull_out; --i) tmp(i) = tmp(i-dist); for (size_t i=nfull_out-nmove-dist; i+nmove *)tmp.data(), (Cmplx *)buf.data(), T(1), true); } auto norm = T(.5/(nfull_in*((need_second_resample ? nfull_out : 1)))); for (size_t i=0; i &alm, // (ncomp, *) size_t lmax, const cmav &mstart, // (mmax+1) const mav_info<2> &map, // (ncomp, *) const cmav &theta, // (nrings) const mav_info<1> &phi0, // (nrings) const cmav &nphi, // (nrings) const cmav &ringstart, // (nrings) size_t spin, SHT_mode mode) { size_t nm = mstart.shape(0); MR_assert(nm>0, "mstart too small"); size_t mmax = nm-1; MR_assert(lmax>=mmax, "lmax must be >= mmax"); size_t nrings = theta.shape(0); MR_assert(nrings>0, "need at least one ring"); MR_assert((phi0.shape(0)==nrings) && (nphi.shape(0)==nrings) && (ringstart.shape(0)==nrings), "inconsistency in the number of rings"); if ((mode==DERIV1) || (mode==GRAD_ONLY)) { MR_assert(spin>0, "DERIV and GRAD_ONLY modes require spin>0"); MR_assert((alm.shape(0)==1) && (map.shape(0)==2), "inconsistent number of components"); } else { size_t ncomp = 1+(spin>0); MR_assert((alm.shape(0)==ncomp) && (map.shape(0)==ncomp), "inconsistent number of components"); } } template void synthesis( const cmav,2> &alm, // (ncomp, *) vmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const cmav &theta, // (nrings) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, SHT_mode mode, bool theta_interpol) { sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, mode); vmav mval({mstart.shape(0)}, UNINITIALIZED); for (size_t i=0; i theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i,3>::build_noncritical({map.shape(0),max(theta.shape(0),ntheta_tmp),mstart.shape(0)}, UNINITIALIZED)); auto legi(subarray<3>(leg, {{},{0,ntheta_tmp},{}})); auto lego(subarray<3>(leg, {{},{0,theta.shape(0)},{}})); alm2leg(alm, legi, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads, mode, theta_interpol); resample_theta(legi, true, true, lego, npi, spi, spin, nthreads, false); leg2map(map, lego, nphi, phi0, ringstart, pixstride, nthreads); } else { auto leg(vmav,3>::build_noncritical({map.shape(0),theta.shape(0),mstart.shape(0)}, UNINITIALIZED)); alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, mode, theta_interpol); leg2map(map, leg, nphi, phi0, ringstart, pixstride, nthreads); } } void get_ringtheta_2d(const string &type, vmav &theta) { auto nrings = theta.shape(0); if (type=="GL") // Gauss-Legendre { ducc0::GL_Integrator integ(nrings); auto cth = integ.coords(); for (size_t m=0; m void synthesis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode) { auto nphi = cmav::build_uniform({map.shape(1)}, map.shape(2)); auto phi0 = cmav::build_uniform({map.shape(1)}, 0.); vmav mstart({mmax+1}, UNINITIALIZED); for (size_t i=0, ofs=0; i<=mmax; ++i) { mstart(i) = ofs-i; ofs += lmax+1-i; } vmav ringstart({map.shape(1)}, UNINITIALIZED); auto ringstride = map.stride(1); auto pixstride = map.stride(2); for (size_t i=0; i({map.shape(0), map.shape(1)*map.shape(2)}, {map.stride(0), 1})); vmav theta({map.shape(1)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); synthesis(alm, map2, spin, lmax, mstart, 1, theta, nphi, phi0, ringstart, pixstride, nthreads, mode); } template void synthesis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode); template void synthesis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode); template void adjoint_synthesis( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const cmav &theta, // (nrings) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, SHT_mode mode, bool theta_interpol) { sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, mode); vmav mval({mstart.shape(0)}, UNINITIALIZED); for (size_t i=0; i theta_tmp({ntheta_tmp}, UNINITIALIZED); for (size_t i=0; i,3>::build_noncritical({map.shape(0),max(theta.shape(0),ntheta_tmp),mstart.shape(0)}, UNINITIALIZED)); auto legi(subarray<3>(leg, {{},{0,theta.shape(0)},{}})); auto lego(subarray<3>(leg, {{},{0,ntheta_tmp},{}})); map2leg(map, legi, nphi, phi0, ringstart, pixstride, nthreads); resample_theta(legi, npi, spi, lego, true, true, spin, nthreads, true); leg2alm(alm, lego, spin, lmax, mval, mstart, lstride, theta_tmp, nthreads,mode,theta_interpol); } else { auto leg(vmav,3>::build_noncritical({map.shape(0),theta.shape(0),mstart.shape(0)}, UNINITIALIZED)); map2leg(map, leg, nphi, phi0, ringstart, pixstride, nthreads); leg2alm(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, mode, theta_interpol); } } template tuple pseudo_analysis( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const cmav &theta, // (nrings) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, size_t maxiter, double epsilon, bool theta_interpol) { auto op = [&](const cmav,2> &xalm, vmav &xmap) { synthesis(xalm, xmap, spin, lmax, mstart, lstride, theta, nphi, phi0, ringstart, pixstride, nthreads, STANDARD, theta_interpol); }; auto op_adj = [&](const cmav &xmap, vmav,2> &xalm) { adjoint_synthesis(xalm, xmap, spin, lmax, mstart, lstride, theta, nphi, phi0, ringstart, pixstride, nthreads, STANDARD, theta_interpol); }; auto mapnorm = [&](const cmav &xmap) { double res=0; for (size_t icomp=0; icomp,2> &xalm) { double res=0; for (size_t icomp=0; icomp pseudo_analysis( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const cmav &theta, // (nrings) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, size_t maxiter, double epsilon, bool theta_interpol); template tuple pseudo_analysis( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const cmav &theta, // (nrings) const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads, size_t maxiter, double epsilon, bool theta_interpol); template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode) { auto nphi = cmav::build_uniform({map.shape(1)}, map.shape(2)); auto phi0 = cmav::build_uniform({map.shape(1)}, 0.); vmav mstart({mmax+1}, UNINITIALIZED); for (size_t i=0, ofs=0; i<=mmax; ++i) { mstart(i) = ofs-i; ofs += lmax+1-i; } vmav ringstart({map.shape(1)}, UNINITIALIZED); auto ringstride = map.stride(1); auto pixstride = map.stride(2); for (size_t i=0; i({map.shape(0), map.shape(1)*map.shape(2)}, {map.stride(0), 1})); vmav theta({map.shape(1)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); adjoint_synthesis(alm, map2, spin, lmax, mstart, 1, theta, nphi, phi0, ringstart, pixstride, nthreads, mode); } template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode); template void adjoint_synthesis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads, SHT_mode mode); template void analysis_2d( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const string &geometry, const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads) { size_t nrings_min = lmax+1; if (geometry=="CC") nrings_min = lmax+2; else if (geometry=="DH") nrings_min = 2*lmax+2; else if (geometry=="F2") nrings_min = 2*lmax+1; MR_assert(map.shape(1)>=nrings_min, "too few rings for analysis up to requested lmax"); vmav mval({mstart.shape(0)}, UNINITIALIZED); for (size_t i=0; i theta({nphi.shape(0)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, STANDARD); if ((geometry=="CC")||(geometry=="F1")||(geometry=="MW")||(geometry=="MWflip")) { bool npi, spi; if (geometry=="CC") { npi=spi=true; } else if (geometry=="F1") { npi=spi=false; } else if (geometry=="MW") { npi=false; spi=true; } else { npi=true; spi=false; } size_t ntheta_leg = good_size_complex(lmax+1)+1; auto leg(vmav,3>::build_noncritical({map.shape(0), max(ntheta_leg,theta.shape(0)), mstart.shape(0)}, UNINITIALIZED)); auto legi(subarray<3>(leg, {{},{0,theta.shape(0)},{}})); auto lego(subarray<3>(leg, {{},{0,ntheta_leg},{}})); map2leg(map, legi, nphi, phi0, ringstart, pixstride, nthreads); for (size_t i=0; i newtheta({ntheta_leg}, UNINITIALIZED); for (size_t i=0; i,3>::build_noncritical({map.shape(0), theta.shape(0), mstart.shape(0)}, UNINITIALIZED)); map2leg(map, leg, nphi, phi0, ringstart, pixstride, nthreads); for (size_t i=0; i void analysis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads) { auto nphi = cmav::build_uniform({map.shape(1)}, map.shape(2)); auto phi0 = cmav::build_uniform({map.shape(1)}, 0.); vmav mstart({mmax+1}, UNINITIALIZED); for (size_t i=0, ofs=0; i<=mmax; ++i) { mstart(i) = ofs-i; ofs += lmax+1-i; } vmav ringstart({map.shape(1)}, UNINITIALIZED); auto ringstride = map.stride(1); auto pixstride = map.stride(2); for (size_t i=0; i({map.shape(0), map.shape(1)*map.shape(2)}, {map.stride(0), 1})); analysis_2d(alm, map2, spin, lmax, mstart, 1, geometry, nphi, phi0, ringstart, pixstride, nthreads); } template void analysis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads); template void analysis_2d(vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads); template void adjoint_analysis_2d( const cmav,2> &alm, // (ncomp, *) vmav &map, // (ncomp, *) size_t spin, size_t lmax, const cmav &mstart, // (mmax+1) ptrdiff_t lstride, const string &geometry, const cmav &nphi, // (nrings) const cmav &phi0, // (nrings) const cmav &ringstart, // (nrings) ptrdiff_t pixstride, size_t nthreads) { size_t nrings_min = lmax+1; if (geometry=="CC") nrings_min = lmax+2; else if (geometry=="DH") nrings_min = 2*lmax+2; else if (geometry=="F2") nrings_min = 2*lmax+1; MR_assert(map.shape(1)>=nrings_min, "too few rings for adjoint analysis up to requested lmax"); vmav mval({mstart.shape(0)}, UNINITIALIZED); for (size_t i=0; i theta({nphi.shape(0)}, UNINITIALIZED); get_ringtheta_2d(geometry, theta); sanity_checks(alm, lmax, mstart, map, theta, phi0, nphi, ringstart, spin, STANDARD); if ((geometry=="CC")||(geometry=="F1")||(geometry=="MW")||(geometry=="MWflip")) { bool npo, spo; if (geometry=="CC") { npo=spo=true; } else if (geometry=="F1") { npo=spo=false; } else if (geometry=="MW") { npo=false; spo=true; } else { npo=true; spo=false; } size_t ntheta_leg = good_size_complex(lmax+1)+1; auto leg(vmav,3>::build_noncritical({map.shape(0), max(ntheta_leg,theta.shape(0)), mstart.shape(0)}, UNINITIALIZED)); auto legi(subarray<3>(leg, {{},{0,ntheta_leg},{}})); auto lego(subarray<3>(leg, {{},{0,theta.shape(0)},{}})); vmav theta_tmp({ntheta_leg}, UNINITIALIZED); for (size_t i=0; i,3>::build_noncritical({map.shape(0), theta.shape(0), mstart.shape(0)}, UNINITIALIZED)); alm2leg(alm, leg, spin, lmax, mval, mstart, lstride, theta, nthreads, STANDARD); for (size_t i=0; i void adjoint_analysis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads) { auto nphi = cmav::build_uniform({map.shape(1)}, map.shape(2)); auto phi0 = cmav::build_uniform({map.shape(1)}, 0.); vmav mstart({mmax+1}, UNINITIALIZED); for (size_t i=0, ofs=0; i<=mmax; ++i) { mstart(i) = ofs-i; ofs += lmax+1-i; } vmav ringstart({map.shape(1)}, UNINITIALIZED); auto ringstride = map.stride(1); auto pixstride = map.stride(2); for (size_t i=0; i({map.shape(0), map.shape(1)*map.shape(2)}, {map.stride(0), 1})); vmav theta({map.shape(1)}, UNINITIALIZED); adjoint_analysis_2d(alm, map2, spin, lmax, mstart, 1, geometry, nphi, phi0, ringstart, pixstride, nthreads); } template void adjoint_analysis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads); template void adjoint_analysis_2d(const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, size_t mmax, const string &geometry, size_t nthreads); template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose) { TimerHierarchy timers("synthesis_general"); timers.push("setup"); MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); MR_assert(mstart.shape(0)>0, "need at least m=0"); size_t nalm = (spin==0) ? 1 : ((mode==STANDARD) ? 2 : 1); MR_assert(alm.shape(0)==nalm, "number of components mismatch in alm"); size_t nmaps = (spin==0) ? 1 : 2; MR_assert(map.shape(0)==nmaps, "number of components mismatch in map"); timers.poppush("SphereInterpol setup"); SphereInterpol inter(lmax, mstart.shape(0)-1, spin, loc.shape(0), sigma_min, sigma_max, epsilon, nthreads); timers.poppush("build_planes"); auto planes = inter.build_planes(); timers.poppush("getPlane"); inter.getPlane(alm, mstart, lstride, planes, mode, timers); auto xtheta = subarray<1>(loc, {{},{0}}); auto xphi = subarray<1>(loc, {{},{1}}); timers.poppush("interpol (u2nu)"); inter.interpol(planes, 0, 0, xtheta, xphi, map); timers.pop(); if (verbose) timers.report(cerr); } template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose); template void synthesis_general( const cmav,2> &alm, vmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose); template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose) { TimerHierarchy timers("adjoint_synthesis_general"); timers.push("setup"); MR_assert(loc.shape(1)==2, "last dimension of loc must have size 2"); size_t nalm = (spin==0) ? 1 : ((mode==STANDARD) ? 2 : 1); MR_assert(alm.shape(0)==nalm, "number of components mismatch in alm"); size_t nmaps = (spin==0) ? 1 : 2; MR_assert(map.shape(0)==nmaps, "number of components mismatch in map"); MR_assert(mstart.shape(0)>0, "need at least m=0"); timers.poppush("SphereInterpol setup"); SphereInterpol inter(lmax, mstart.shape(0)-1, spin, loc.shape(0), sigma_min, sigma_max, epsilon, nthreads); timers.poppush("build_planes"); auto planes = inter.build_planes(); mav_apply([](auto &v){v=0;}, nthreads, planes); timers.poppush("deinterpol (nu2u)"); auto xtheta = subarray<1>(loc, {{},{0}}); auto xphi = subarray<1>(loc, {{},{1}}); inter.deinterpol(planes, 0, 0, xtheta, xphi, map); timers.poppush("updateAlm"); inter.updateAlm(alm, mstart, lstride, planes, mode, timers); timers.pop(); if (verbose) timers.report(cerr); } template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose); template void adjoint_synthesis_general( vmav,2> &alm, const cmav &map, size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, double epsilon, double sigma_min, double sigma_max, size_t nthreads, SHT_mode mode, bool verbose); template tuple pseudo_analysis_general( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, npix) size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, // (npix,2) double sigma_min, double sigma_max, size_t nthreads, size_t maxiter, double epsilon) { auto op = [&](const cmav,2> &xalm, vmav &xmap) { synthesis_general(xalm, xmap, spin, lmax, mstart, lstride, loc, 1e-1*epsilon, sigma_min, sigma_max, nthreads, STANDARD); }; auto op_adj = [&](const cmav &xmap, vmav,2> &xalm) { adjoint_synthesis_general(xalm, xmap, spin, lmax, mstart, lstride, loc, 1e-1*epsilon, sigma_min, sigma_max, nthreads, STANDARD); }; auto mapnorm = [&](const cmav &xmap) { double res=0; for (size_t icomp=0; icomp,2> &xalm) { double res=0; for (size_t icomp=0; icomp pseudo_analysis_general( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, npix) size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, // (npix,2) double sigma_min, double sigma_max, size_t nthreads, size_t maxiter, double epsilon); template tuple pseudo_analysis_general( vmav,2> &alm, // (ncomp, *) const cmav &map, // (ncomp, npix) size_t spin, size_t lmax, const cmav &mstart, ptrdiff_t lstride, const cmav &loc, // (npix,2) double sigma_min, double sigma_max, size_t nthreads, size_t maxiter, double epsilon); }}