/*
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* Copyright (C) 2019-2023 Max-Planck-Society
   Author: Martin Reinecke */

#ifndef DUCC0_WGRIDDER_H
#define DUCC0_WGRIDDER_H

// Standard library headers. The original include targets were lost when this
// copy of the file was extracted; the list below is inferred from the symbols
// used in this file.
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <complex>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
#if ((!defined(DUCC0_NO_SIMD)) && (defined(__AVX__)||defined(__SSE3__)))
#include <x86intrin.h>
#endif

#include "ducc0/infra/error_handling.h"
#include "ducc0/math/constants.h"
#include "ducc0/fft/fft.h"
#include "ducc0/infra/threading.h"
#include "ducc0/infra/misc_utils.h"
#include "ducc0/infra/useful_macros.h"
#include "ducc0/infra/mav.h"
#include "ducc0/infra/simd.h"
#include "ducc0/infra/timers.h"
#include "ducc0/math/gridding_kernel.h"
#include "ducc0/math/rangeset.h"

namespace ducc0 {

namespace detail_gridder {

using namespace std;
// the next line is necessary to address some sloppy name choices in hipSYCL
using std::min, std::max;

template<typename T> constexpr inline int mysimdlen
  = min<int>(8, native_simd<T>::size());

template<typename T> using mysimd = typename simd_select<T,mysimdlen<T>>::type;

template<typename T> T sqr(T val) { return val*val; }

template<typename T> void quickzero(vmav<T,2> &arr, size_t nthreads)
  {
#if 0
  arr.fill(T(0));
#else
  MR_assert((arr.stride(0)>0) && (arr.stride(1)>0), "bad memory ordering");
  MR_assert(arr.stride(0)>=arr.stride(1), "bad memory ordering");
  size_t s0=arr.shape(0), s1=arr.shape(1);
  execParallel(s0, nthreads, [&](size_t lo, size_t hi)
    {
    if (arr.stride(1)==1)
      {
      if (size_t(arr.stride(0))==arr.shape(1))
        memset(reinterpret_cast<char *>(&arr(lo,0)), 0, sizeof(T)*s1*(hi-lo));
      else
        for (auto i=lo; i<hi; ++i)
          memset(reinterpret_cast<char *>(&arr(i,0)), 0, sizeof(T)*s1);
      }
    else
      for (auto i=lo; i<hi; ++i)
        for (size_t j=0; j<s1; ++j)
          arr(i,j) = T(0);
    });
#endif
  }

template<typename T, typename F> [[gnu::hot]] void expi(vector<complex<T>> &res,
  vector<T> &buf, F getang)
  {
  using Tsimd = native_simd<T>;
  static constexpr auto vlen = Tsimd::size();
  auto n=res.size();
  for (size_t j=0; j<n; ++j)
    buf[j] = getang(j);
  size_t i=0;
  // vectorized sin/cos evaluation (loop internals partially reconstructed
  // from the surviving fragments)
  for (; i+vlen<=n; i+=vlen)
    {
    Tsimd vcos, vsin;
    for (size_t ii=0; ii<vlen; ++ii)
      { vcos[ii] = cos(buf[i+ii]); vsin[ii] = sin(buf[i+ii]); }
    for (size_t ii=0; ii<vlen; ++ii)
      res[i+ii] = complex<T>(vcos[ii], vsin[ii]);
    }
  for (; i<n; ++i)
    res[i] = complex<T>(cos(buf[i]), sin(buf[i]));
  }

template<typename T> complex<T> hsum_cmplx(mysimd<T> vr, mysimd<T> vi)
  { return complex<T>(reduce(vr, plus<>()), reduce(vi, plus<>())); }

#if (!defined(DUCC0_NO_SIMD))
#if (defined(__AVX__))
#if 1
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
  auto t1 = _mm256_hadd_ps(__m256(vr), __m256(vi));
  auto t2 = _mm_hadd_ps(_mm256_extractf128_ps(t1, 0), _mm256_extractf128_ps(t1, 1));
  t2 += _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,0,3,2));
  return complex<float>(t2[0], t2[1]);
  }
#else
// this version may be slightly faster, but this needs more benchmarking
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
  auto t1 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(0,2,0,2));
  auto t2 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(1,3,1,3));
  auto t3 = _mm256_add_ps(t1,t2);
  t3 = _mm256_shuffle_ps(t3, t3, _MM_SHUFFLE(3,0,2,1));
  auto t4 = _mm_add_ps(_mm256_extractf128_ps(t3, 1), _mm256_castps256_ps128(t3));
  auto t5 = _mm_add_ps(t4, _mm_movehl_ps(t4, t4));
  return complex<float>(t5[0], t5[1]);
  }
#endif
#elif defined(__SSE3__)
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
auto t1 = _mm_hadd_ps(__m128(vr), __m128(vi)); t1 += _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)); return complex(t1[0], t1[2]); } #endif #endif template void checkShape (const array &shp1, const array &shp2) { MR_assert(shp1==shp2, "shape mismatch"); } // // Start of real gridder functionality // template void complex2hartley (const cmav, 2> &grid, vmav &grid2, size_t nthreads) { MR_assert(grid.conformable(grid2), "shape mismatch"); size_t nu=grid.shape(0), nv=grid.shape(1); execParallel(nu, nthreads, [&](size_t lo, size_t hi) { for(auto u=lo, xu=(u==0) ? 0 : nu-u; u void hartley2complex (const cmav &grid, vmav,2> &grid2, size_t nthreads) { MR_assert(grid.conformable(grid2), "shape mismatch"); size_t nu=grid.shape(0), nv=grid.shape(1); execParallel(nu, nthreads, [&](size_t lo, size_t hi) { for(size_t u=lo, xu=(u==0) ? 0 : nu-u; u(T(.5)*(grid(u,v)+grid(xu,xv)), T(.5)*(grid(xu,xv)-grid(u,v))); }); } template void hartley2_2D(vmav &arr, size_t vlim, bool first_fast, size_t nthreads) { size_t nu=arr.shape(0), nv=arr.shape(1); vfmav farr(arr); if (2*vlim coord; vector f_over_c; size_t nrows, nchan; double umax, vmax; public: Baselines() = default; template Baselines(const cmav &coord_, const cmav &freq, bool negate_v=false) { constexpr double speedOfLight = 299792458.; MR_assert(coord_.shape(1)==3, "dimension mismatch"); nrows = coord_.shape(0); nchan = freq.shape(0); f_over_c.resize(nchan); double fcmax = 0; for (size_t i=0; i0, "negative channel frequency encountered"); if (i>0) MR_assert(freq(i)>=freq(i-1), "channel frequencies must e sorted in ascending order"); f_over_c[i] = freq(i)/speedOfLight; fcmax = max(fcmax, abs(f_over_c[i])); } coord.resize(nrows); double vfac = negate_v ? -1 : 1; umax=vmax=0; for (size_t i=0; i class Wgridder { private: constexpr static int log2tile=is_same::value ? 
5 : 4; bool gridding; TimerHierarchy timers; const cmav,2> &ms_in; vmav,2> &ms_out; const cmav &dirty_in; vmav &dirty_out; const cmav &wgt; const cmav &mask; vmav lmask; double pixsize_x, pixsize_y; size_t nxdirty, nydirty; double epsilon; bool do_wgridding; size_t nthreads; size_t verbosity; bool negate_v, divide_by_n; double sigma_min, sigma_max; Baselines bl; vector ranges; vector> blockstart; double wmin_d, wmax_d; size_t nvis; double wmin, dw, xdw, wshift; size_t nplanes; double nm1min, nm1max; double lshift, mshift, nshift; bool shifting, lmshift, no_nshift; size_t nu, nv; double ofactor; shared_ptr krn; size_t supp, nsafe; double ushift, vshift; int maxiu0, maxiv0; size_t vlim; bool uv_side_fast; vector> uranges, vranges; static_assert(sizeof(Tcalc)<=sizeof(Tacc), "bad type combination"); static_assert(sizeof(Tms)<=sizeof(Tcalc), "bad type combination"); static_assert(sizeof(Timg)<=sizeof(Tcalc), "bad type combination"); static double phase(double x, double y, double w, bool adjoint, double nshift) { double tmp = 1.-x-y; if (tmp<=0) return 0; // no phase factor beyond the horizon double nm1 = (-x-y)/(sqrt(tmp)+1); // more accurate form of sqrt(1-x-y)-1 double phs = w*(nm1+nshift); if (adjoint) phs *= -1; if constexpr (is_same::value) return twopi*phs; // we are reducing accuracy, so let's better do range reduction first return twopi*(phs-floor(phs)); } void grid2dirty_post(vmav &tmav, vmav &dirty) const { checkShape(dirty.shape(), {nxdirty, nydirty}); auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads); auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads); execParallel(nxdirty, nthreads, [&](size_t lo, size_t hi) { for (auto i=lo; i=nu) i2-=nu; size_t j2 = nv-nydirty/2+j; if (j2>=nv) j2-=nv; dirty(i,j) = Timg(tmav(i2,j2)*cfu[icfu]*cfv[icfv]); } } }); } void grid2dirty_post2(vmav,2> &tmav, vmav &dirty, double w) { timers.push("wscreen+grid correction"); checkShape(dirty.shape(), {nxdirty,nydirty}); double x0 = lshift-0.5*nxdirty*pixsize_x, y0 = mshift-0.5*nydirty*pixsize_y; size_t nxd = lmshift ? nxdirty : (nxdirty/2+1); execParallel(nxd, nthreads, [&](size_t lo, size_t hi) { vector> phases(lmshift ? nydirty : (nydirty/2+1)); vector buf(lmshift ? nydirty : (nydirty/2+1)); for (auto i=lo; i=nu) ix-=nu; expi(phases, buf, [&](size_t i) { return Tcalc(phase(fx, sqr(y0+i*pixsize_y), w, true, nshift)); }); if (lmshift) for (size_t j=0, jx=nv-nydirty/2; j=nv)? jx+1-nv : jx+1) { dirty(i,j) += Timg(tmav(ix,jx).real()*phases[j].real() - tmav(ix,jx).imag()*phases[j].imag()); tmav(ix,jx) = complex(0); } else { size_t i2 = nxdirty-i; size_t ix2 = nu-nxdirty/2+i2; if (ix2>=nu) ix2-=nu; if ((i>0)&&(i=nv)? jx+1-nv : jx+1) { size_t j2 = min(j, nydirty-j); Tcalc re = phases[j2].real(), im = phases[j2].imag(); dirty(i ,j) += Timg(tmav(ix ,jx).real()*re - tmav(ix ,jx).imag()*im); dirty(i2,j) += Timg(tmav(ix2,jx).real()*re - tmav(ix2,jx).imag()*im); tmav(ix,jx) = tmav(ix2,jx) = complex(0); } else for (size_t j=0, jx=nv-nydirty/2; j=nv)? 
jx+1-nv : jx+1) { size_t j2 = min(j, nydirty-j); Tcalc re = phases[j2].real(), im = phases[j2].imag(); dirty(i,j) += Timg(tmav(ix,jx).real()*re - tmav(ix,jx).imag()*im); // lower left tmav(ix,jx) = complex(0); } } } }); timers.poppush("zeroing grid"); // only zero the parts of the grid that have not been zeroed before { auto a0 = subarray<2>(tmav, {{0,nxdirty/2}, {nydirty/2,nv-nydirty/2}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(tmav, {{nxdirty/2, nu-nxdirty/2}, {}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(tmav, {{nu-nxdirty/2,MAXIDX}, {nydirty/2, nv-nydirty/2}}); quickzero(a0, nthreads); } timers.pop(); } void grid2dirty_overwrite(vmav &grid, vmav &dirty) { timers.push("FFT"); checkShape(grid.shape(), {nu,nv}); hartley2_2D(grid, vlim, uv_side_fast, nthreads); timers.poppush("grid correction"); grid2dirty_post(grid, dirty); timers.pop(); } void grid2dirty_c_overwrite_wscreen_add (vmav,2> &grid, vmav &dirty, double w, size_t iplane) { timers.push("FFT"); checkShape(grid.shape(), {nu,nv}); vfmav> inout(grid); const auto &rsu(uranges[iplane]); const auto &rsv(vranges[iplane]); auto cost_ufirst = nxdirty*log(nv)*nv + rsv.nval()*log(nu)*nu; auto cost_vfirst = nydirty*log(nu)*nu + rsu.nval()*log(nv)*nv; if (cost_ufirst &dirty, vmav &grid) { timers.push("zeroing grid"); checkShape(grid.shape(), {nu, nv}); // only zero the parts of the grid that are not filled afterwards anyway { auto a0 = subarray<2>(grid, {{0,nxdirty/2}, {nydirty/2,nv-nydirty/2}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(grid, {{nxdirty/2, nu-nxdirty/2}, {}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(grid, {{nu-nxdirty/2,MAXIDX}, {nydirty/2, nv-nydirty/2}}); quickzero(a0, nthreads); } timers.poppush("grid correction"); checkShape(dirty.shape(), {nxdirty, nydirty}); auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads); auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads); execParallel(nxdirty, nthreads, [&](size_t lo, size_t hi) { for (auto i=lo; i=nu) i2-=nu; size_t j2 = nv-nydirty/2+j; if (j2>=nv) j2-=nv; grid(i2,j2) = dirty(i,j)*Tcalc(cfu[icfu]*cfv[icfv]); } } }); timers.pop(); } void dirty2grid_pre2(const cmav &dirty, vmav,2> &grid, double w) { timers.push("zeroing grid"); checkShape(dirty.shape(), {nxdirty, nydirty}); checkShape(grid.shape(), {nu, nv}); // only zero the parts of the grid that are not filled afterwards anyway { auto a0 = subarray<2>(grid, {{0,nxdirty/2}, {nydirty/2, nv-nydirty/2}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(grid, {{nxdirty/2,nu-nxdirty/2}, {}}); quickzero(a0, nthreads); } { auto a0 = subarray<2>(grid, {{nu-nxdirty/2,MAXIDX}, {nydirty/2,nv-nydirty/2}}); quickzero(a0, nthreads); } timers.poppush("wscreen+grid correction"); double x0 = lshift-0.5*nxdirty*pixsize_x, y0 = mshift-0.5*nydirty*pixsize_y; size_t nxd = lmshift ? nxdirty : (nxdirty/2+1); execParallel(nxd, nthreads, [&](size_t lo, size_t hi) { vector> phases(lmshift ? nydirty : (nydirty/2+1)); vector buf(lmshift ? nydirty : (nydirty/2+1)); for(auto i=lo; i=nu) ix-=nu; expi(phases, buf, [&](size_t i) { return Tcalc(phase(fx, sqr(y0+i*pixsize_y), w, false, nshift)); }); if (lmshift) for (size_t j=0, jx=nv-nydirty/2; j=nv)? jx+1-nv : jx+1) grid(ix,jx) = Tcalc(dirty(i,j))*phases[j]; else { size_t i2 = nxdirty-i; size_t ix2 = nu-nxdirty/2+i2; if (ix2>=nu) ix2-=nu; if ((i>0)&&(i=nv)? 
jx+1-nv : jx+1) { size_t j2 = min(j, nydirty-j); grid(ix ,jx) = Tcalc(dirty(i ,j))*phases[j2]; // lower left grid(ix2,jx) = Tcalc(dirty(i2,j))*phases[j2]; // lower right } else for (size_t j=0, jx=nv-nydirty/2; j=nv)? jx+1-nv : jx+1) grid(ix,jx) = Tcalc(dirty(i,j))*phases[min(j, nydirty-j)]; // lower left } } }); timers.pop(); } void dirty2grid(const cmav &dirty, vmav &grid) { dirty2grid_pre(dirty, grid); timers.push("FFT"); hartley2_2D(grid, vlim, !uv_side_fast, nthreads); timers.pop(); } void dirty2grid_c_wscreen(const cmav &dirty, vmav,2> &grid, double w, size_t iplane) { dirty2grid_pre2(dirty, grid, w); timers.push("FFT"); vfmav> inout(grid); const auto &rsu(uranges[iplane]); const auto &rsv(vranges[iplane]); auto cost_ufirst = nydirty*log(nu)*nu + rsu.nval()*log(nv)*nv; auto cost_vfirst = nxdirty*log(nv)*nv + rsv.nval()*log(nu)*nu; if (cost_ufirst>log2tile; iv0 = (iv0+nsafe)>>log2tile; iw = do_wgridding ? max(0,int((uvw.w+wshift)*xdw)) : 0; return Uvwidx(iu0, iv0, iw); } void countRanges() { timers.push("building index"); size_t nrow=bl.Nrows(), nchan=bl.Nchannels(); if (do_wgridding) { dw = 0.5/ofactor/max(abs(nm1max+nshift), abs(nm1min+nshift)); xdw = 1./dw; nplanes = size_t((wmax_d-wmin_d)/dw+supp); MR_assert(nplanes<(size_t(1)<<16), "too many w planes"); wmin = (wmin_d+wmax_d)*0.5 - 0.5*(nplanes-1)*dw; wshift = dw-(0.5*supp*dw)-wmin; } else dw = wmin = xdw = wshift = nplanes = 0; size_t nbunch = do_wgridding ? supp : 1; // we want a maximum deviation of 1% in gridding time between threads constexpr double max_asymm = 0.01; size_t max_allowed = size_t(nvis/double(nbunch*nthreads)*max_asymm); checkShape(wgt.shape(),{nrow,nchan}); checkShape(ms_in.shape(), {nrow,nchan}); checkShape(mask.shape(), {nrow,nchan}); size_t ntiles_u = (nu>>log2tile) + 3; size_t ntiles_v = (nv>>log2tile) + 3; size_t nwmin = do_wgridding ? 
nplanes-supp+3 : 1; timers.push("counting"); // align members with cache lines struct alignas(64) spaced_size_t { atomic v; }; vector buf(ntiles_u*ntiles_v*nwmin+1); auto chunk = max(1, nrow/(20*nthreads)); execDynamic(nrow, nthreads, chunk, [&](Scheduler &sched) { while (auto rng=sched.getNext()) for(auto irow=rng.lo; irow(nchan,ch0+1); while( (ch1 void { if (ch_lo+1==ch_hi) { if (uvw_lo!=uvw_hi) inc(uvw_hi,ch_hi); } else { auto ch_mid = ch_lo+(ch_hi-ch_lo)/2; auto uvw_mid = get_uvwidx(uvwbase, ch_mid); if (uvw_lo!=uvw_mid) recurse(ch_lo, ch_mid, uvw_lo, uvw_mid, recurse); if (uvw_mid!=uvw_hi) recurse(ch_mid, ch_hi, uvw_mid, uvw_hi, recurse); } }; if (ch0!=ch1) { auto uvw0 = get_uvwidx(uvwbase,ch0); inc0(uvw0); if (ch0+10) blockstart.push_back({Uvwidx(tu,tv,mp),acc}); buf[i].v = acc; acc += tmp; } buf.back().v=acc; } timers.poppush("filling"); ranges.resize(buf.back().v); execDynamic(nrow, nthreads, chunk, [&](Scheduler &sched) { vector> interbuf; while (auto rng=sched.getNext()) for(auto irow=rng.lo; irow vissum; vissum.reserve(ranges.size()+1); size_t visacc=0; for (size_t i=0; i> bs2; swap(blockstart, bs2); for (size_t i=0; imax_allowed) { blockstart.push_back({bs2[i].first, j}); acc=0; } } } lmask.dealloc(); timers.pop(); // compute which grid regions are required if (do_wgridding) { timers.poppush("grid regions"); vmav tmpu({nplanes,(nu>>log2tile)+1}), tmpv({nplanes,(nv>>log2tile)+1}); for (const auto &rng: blockstart) for (size_t i=0; iint(nu)) { int tmp = rsu.ivend(rsu.size()-1); rsu.remove(nu,tmp); rsu.add(0, tmp-nu); } for (size_t j=0; jint(nv)) { int tmp = rsv.ivend(rsv.size()-1); rsv.remove(nv,tmp); rsv.add(0, tmp-nv); } } } timers.pop(); } template class HelperX2g2 { public: static constexpr size_t vlen = mysimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; private: static constexpr int nsafe = (supp+1)/2; static constexpr int su = 2*nsafe+(1<> tkrn; vmav,2> &grid; int iu0, iv0; // start index of the current visibility int bu0, bv0; // start index of the current buffer vmav bufr, bufi; Tacc *px0r, *px0i; double w0, xdw; vector &locks; DUCC0_NOINLINE void dump() { int inu = int(parent->nu); int inv = int(parent->nv); if (bu0<-nsafe) return; // nothing written into buffer yet int idxu = (bu0+inu)%inu; int idxv0 = (bv0+inv)%inv; for (int iu=0; iu(Tcalc(bufr(iu,iv)), Tcalc(bufi(iu,iv))); bufr(iu,iv) = bufi(iu,iv) = 0; if (++idxv>=inv) idxv=0; } } if (++idxu>=inu) idxu=0; } } public: Tacc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i; union kbuf { Tacc scalar[2*nvec*vlen]; mysimd simd[2*nvec]; #if defined(_MSC_VER) kbuf() {} #endif }; kbuf buf; HelperX2g2(const Wgridder *parent_, vmav,2> &grid_, vector &locks_, double w0_=-1, double dw_=-1) : parent(parent_), tkrn(*parent->krn), grid(grid_), iu0(-1000000), iv0(-1000000), bu0(-1000000), bv0(-1000000), bufr({size_t(su),size_t(svvec)}), bufi({size_t(su),size_t(svvec)}), px0r(bufr.data()), px0i(bufi.data()), w0(w0_), xdw(1./dw_), locks(locks_) { checkShape(grid.shape(), {parent->nu,parent->nv}); } ~HelperX2g2() { dump(); } constexpr int lineJump() const { return svvec; } [[gnu::always_inline]] [[gnu::hot]] void prep(const UVW &in, [[maybe_unused]] size_t nth=0) { double ufrac, vfrac; auto iu0old = iu0; auto iv0old = iv0; parent->getpix(in.u, in.v, ufrac, vfrac, iu0, iv0); auto x0 = -ufrac*2+(supp-1); auto y0 = -vfrac*2+(supp-1); if constexpr(wgrid) tkrn.eval2s(Tacc(x0), Tacc(y0), Tacc(xdw*(w0-in.w)), nth, &buf.simd[0]); else tkrn.eval2(Tacc(x0), Tacc(y0), &buf.simd[0]); if ((iu0==iu0old) && (iv0==iv0old)) return; if 
((iu0bu0+su) || (iv0+int(supp)>bv0+sv)) { dump(); bu0=((((iu0+nsafe)>>log2tile)<>log2tile)< class HelperG2x2 { public: static constexpr size_t vlen = mysimd::size(); static constexpr size_t nvec = (supp+vlen-1)/vlen; private: static constexpr int nsafe = (supp+1)/2; static constexpr int su = 2*nsafe+(1<> tkrn; const cmav,2> &grid; int iu0, iv0; // start index of the current visibility int bu0, bv0; // start index of the current buffer vmav bufr, bufi; const Tcalc *px0r, *px0i; double w0, xdw; DUCC0_NOINLINE void load() { int inu = int(parent->nu); int inv = int(parent->nv); int idxu = (bu0+inu)%inu; int idxv0 = (bv0+inv)%inv; for (int iu=0; iu=inv) idxv=0; } if (++idxu>=inu) idxu=0; } } public: const Tcalc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i; union kbuf { Tcalc scalar[2*nvec*vlen]; mysimd simd[2*nvec]; #if defined(_MSC_VER) kbuf() {} #endif }; kbuf buf; HelperG2x2(const Wgridder *parent_, const cmav,2> &grid_, double w0_=-1, double dw_=-1) : parent(parent_), tkrn(*parent->krn), grid(grid_), iu0(-1000000), iv0(-1000000), bu0(-1000000), bv0(-1000000), bufr({size_t(su),size_t(svvec)}), bufi({size_t(su),size_t(svvec)}), px0r(bufr.data()), px0i(bufi.data()), w0(w0_), xdw(1./dw_) { checkShape(grid.shape(), {parent->nu,parent->nv}); } constexpr int lineJump() const { return svvec; } [[gnu::always_inline]] [[gnu::hot]] void prep(const UVW &in, [[maybe_unused]] size_t nth=0) { double ufrac, vfrac; auto iu0old = iu0; auto iv0old = iv0; parent->getpix(in.u, in.v, ufrac, vfrac, iu0, iv0); auto x0 = -ufrac*2+(supp-1); auto y0 = -vfrac*2+(supp-1); if constexpr(wgrid) tkrn.eval2s(Tcalc(x0), Tcalc(y0), Tcalc(xdw*(w0-in.w)), nth, &buf.simd[0]); else tkrn.eval2(Tcalc(x0), Tcalc(y0), &buf.simd[0]); if ((iu0==iu0old) && (iv0==iv0old)) return; if ((iu0bu0+su) || (iv0+int(supp)>bv0+sv)) { bu0=((((iu0+nsafe)>>log2tile)<>log2tile)<> &phases, vector &buf, Tcalc imflip, const UVW &bcoord, const RowchanRange &rcr) { phases.resize(rcr.ch_end-rcr.ch_begin); buf.resize(rcr.ch_end-rcr.ch_begin); double fct = imflip*(bcoord.u*lshift + bcoord.v*mshift + bcoord.w*nshift); expi(phases, buf, [&](size_t i) { auto tmp = fct*bl.ffact(rcr.ch_begin+i); if constexpr (is_same::value) return Tcalc(twopi*tmp); // we are reducing accuracy, // so let's better do range reduction first return Tcalc(twopi*(tmp-floor(tmp))); }); } template [[gnu::hot]] void x2grid_c_helper (size_t supp, vmav,2> &grid, size_t p0, double w0) { if constexpr (SUPP>=8) if (supp<=SUPP/2) return x2grid_c_helper(supp, grid, p0, w0); if constexpr (SUPP>4) if (supp(supp, grid, p0, w0); MR_assert(supp==SUPP, "requested support out of range"); vector locks(nu); execDynamic(blockstart.size(), nthreads, wgrid ? 
SUPP : 1, [&](Scheduler &sched) { constexpr auto vlen=mysimd::size(); constexpr auto NVEC((SUPP+vlen-1)/vlen); HelperX2g2 hlp(this, grid, locks, w0, dw); constexpr auto jump = hlp.lineJump(); const auto * DUCC0_RESTRICT ku = hlp.buf.scalar; const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; vector> phases; vector buf; while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix=ranges.size()) ix -=ranges.size(); const auto &uvwidx(blockstart[ix].first); if ((!wgrid) || ((uvwidx.minplane+SUPP>p0)&&(uvwidx.minplane<=p0))) { //bool lastplane = (!wgrid) || (uvwidx.minplane+SUPP-1==p0); size_t nth = p0-uvwidx.minplane; size_t iend = (ix+1 vr=v.real()*kv[0], vi=v.imag()*imflip*kv[0]; for (size_t cu=0; cu(pxr,element_aligned_tag()); auto ti = mysimd(pxi,element_aligned_tag()); tr += vr*ku[cu]; ti += vi*ku[cu]; tr.copy_to(pxr,element_aligned_tag()); ti.copy_to(pxi,element_aligned_tag()); } } else { mysimd vr(v.real()), vi(v.imag()*imflip); for (size_t cu=0; cu tmpr=vr*ku[cu], tmpi=vi*ku[cu]; for (size_t cv=0; cv(pxr,element_aligned_tag()); tr += tmpr*kv[cv]; tr.copy_to(pxr,element_aligned_tag()); auto ti = mysimd(pxi, element_aligned_tag()); ti += tmpi*kv[cv]; ti.copy_to(pxi,element_aligned_tag()); } } } } } } } }); } template void x2grid_c(vmav,2> &grid, size_t p0, double w0=-1) { checkShape(grid.shape(), {nu, nv}); constexpr size_t maxsupp = is_same::value ? 16 : 8; x2grid_c_helper(supp, grid, p0, w0); } template [[gnu::hot]] void grid2x_c_helper (size_t supp, const cmav,2> &grid, size_t p0, double w0) { if constexpr (SUPP>=8) if (supp<=SUPP/2) return grid2x_c_helper(supp, grid, p0, w0); if constexpr (SUPP>4) if (supp(supp, grid, p0, w0); MR_assert(supp==SUPP, "requested support out of range"); // Loop over sampling points execDynamic(blockstart.size(), nthreads, wgrid ? SUPP : 1, [&](Scheduler &sched) { constexpr size_t vlen=mysimd::size(); constexpr size_t NVEC((SUPP+vlen-1)/vlen); HelperG2x2 hlp(this, grid, w0, dw); constexpr int jump = hlp.lineJump(); const auto * DUCC0_RESTRICT ku = hlp.buf.scalar; const auto * DUCC0_RESTRICT kv = hlp.buf.simd+NVEC; vector> phases; vector buf; while (auto rng=sched.getNext()) for(auto ix=rng.lo; ixp0)&&(uvwidx.minplane<=p0))) { bool firstplane = (!wgrid) || (uvwidx.minplane==p0); bool lastplane = (!wgrid) || (uvwidx.minplane+SUPP-1==p0); size_t nth = p0-uvwidx.minplane; size_t iend = (ix+1 rr=0, ri=0; if constexpr (NVEC==1) { for (size_t cu=0; cu(pxr,element_aligned_tag())*ku[cu]; ri += mysimd(pxi,element_aligned_tag())*ku[cu]; } rr *= kv[0]; ri *= kv[0]; } else { for (size_t cu=0; cu tmpr(0), tmpi(0); for (size_t cv=0; cv(pxr,element_aligned_tag()); tmpi += kv[cv]*mysimd(pxi,element_aligned_tag()); } rr += ku[cu]*tmpr; ri += ku[cu]*tmpi; } } ri *= imflip; auto r = hsum_cmplx(rr,ri); if (!firstplane) r += ms_out(row, ch); if (lastplane) r *= shifting ? complex(phases[ch-rcr.ch_begin]*Tcalc(wgt(row, ch))) : wgt(row, ch); ms_out(row, ch) = r; } } } } }); } template void grid2x_c(const cmav,2> &grid, size_t p0, double w0=-1) { checkShape(grid.shape(), {nu, nv}); constexpr size_t maxsupp = is_same::value ? 16 : 8; grid2x_c_helper(supp, grid, p0, w0); } void apply_global_corrections(vmav &dirty) { timers.push("global corrections"); double x0 = lshift-0.5*nxdirty*pixsize_x, y0 = mshift-0.5*nydirty*pixsize_y; auto cfu = krn->corfunc(nxdirty/2+1, 1./nu, nthreads); auto cfv = krn->corfunc(nydirty/2+1, 1./nv, nthreads); size_t nxd = lmshift ? nxdirty : (nxdirty/2+1); size_t nyd = lmshift ? 
nydirty : (nydirty/2+1); execParallel(nxd, nthreads, [&](size_t lo, size_t hi) { for(auto i=lo; i=0) { auto nm1 = (-fx-fy)/(sqrt(tmp)+1); // accurate form of sqrt(1-x-y)-1 fct = krn->corfunc((nm1+nshift)*dw); if (divide_by_n) fct /= nm1+1; } else // beyond the horizon, don't really know what to do here fct = divide_by_n ? 0 : krn->corfunc((sqrt(-tmp)-1)*dw); if (lmshift) { auto i2=min(i, nxdirty-i), j2=min(j, nydirty-j); fct *= cfu[nxdirty/2-i2]*cfv[nydirty/2-j2]; dirty(i,j)*=Timg(fct); } else { fct *= cfu[nxdirty/2-i]*cfv[nydirty/2-j]; size_t i2 = nxdirty-i, j2 = nydirty-j; dirty(i,j)*=Timg(fct); if ((i>0)&&(i0)&&(j0)&&(j,2>::build_noncritical({nu,nv}); timers.pop(); for (size_t pl=0; pl(grid, pl, w); timers.pop(); grid2dirty_c_overwrite_wscreen_add(grid, dirty_out, w, pl); } // correct for w gridding etc. apply_global_corrections(dirty_out); } else { timers.push("allocating grid"); auto grid = vmav,2>::build_noncritical({nu,nv}); timers.poppush("gridding proper"); x2grid_c(grid, 0); timers.poppush("allocating rgrid"); auto rgrid = vmav::build_noncritical(grid.shape(), UNINITIALIZED); timers.poppush("complex2hartley"); complex2hartley(grid, rgrid, nthreads); timers.pop(); grid2dirty_overwrite(rgrid, dirty_out); } } void dirty2x() { if (do_wgridding) { timers.push("copying dirty image"); vmav tdirty({nxdirty,nydirty}, UNINITIALIZED); mav_apply([](Timg &a, const Timg &b) {a=b;}, nthreads, tdirty, dirty_in); timers.pop(); // correct for w gridding etc. apply_global_corrections(tdirty); timers.push("allocating grid"); auto grid = vmav,2>::build_noncritical({nu,nv}, UNINITIALIZED); timers.pop(); for (size_t pl=0; pl(grid, pl, w); timers.pop(); } } else { timers.push("allocating grid"); auto rgrid = vmav::build_noncritical({nu,nv}, UNINITIALIZED); timers.pop(); dirty2grid(dirty_in, rgrid); timers.push("allocating grid"); auto grid = vmav,2>::build_noncritical(rgrid.shape()); timers.poppush("hartley2complex"); hartley2complex(rgrid, grid, nthreads); timers.poppush("degridding proper"); grid2x_c(grid, 0); timers.pop(); } } auto getNuNv() { timers.push("parameter calculation"); double xmin = lshift - 0.5*nxdirty*pixsize_x, xmax = xmin + (nxdirty-1)*pixsize_x, ymin = mshift - 0.5*nydirty*pixsize_y, ymax = ymin + (nydirty-1)*pixsize_y; vector xext{xmin, xmax}, yext{ymin, ymax}; if (xmin*xmax<0) xext.push_back(0); if (ymin*ymax<0) yext.push_back(0); nm1min = 1e300, nm1max = -1e300; for (auto xc: xext) for (auto yc: yext) { double tmp = xc*xc+yc*yc; double nval = (tmp<=1.) ? (sqrt(1.-tmp)-1.) : (-sqrt(tmp-1.)-1.); nm1min = min(nm1min, nval); nm1max = max(nm1max, nval); } nshift = (no_nshift||(!do_wgridding)) ? 0. : -0.5*(nm1max+nm1min); shifting = lmshift || (nshift!=0); auto idx = getAvailableKernels(epsilon, do_wgridding ? 3 : 2, sigma_min, sigma_max); double mincost = 1e300; constexpr double nref_fft=2048; constexpr double costref_fft=0.0693; size_t minnu=0, minnv=0, minidx=~(size_t(0)); size_t vlen = gridding ? 
mysimd::size() : mysimd::size(); for (size_t i=0; i(nu,16); nv = max(nv,16); double logterm = log(nu*nv)/log(nref_fft*nref_fft); double fftcost = nu/nref_fft*nv/nref_fft*logterm*costref_fft; double gridcost = 2.2e-10*nvis*(supp*nvec*vlen + ((2*nvec+1)*(supp+3)*vlen)); if (gridding) gridcost *= sizeof(Tacc)/sizeof(Tcalc); if (do_wgridding) { double dw = 0.5/ofactor/max(abs(nm1max+nshift), abs(nm1min+nshift)); size_t nplanes = size_t((wmax_d-wmin_d)/dw+supp); fftcost *= nplanes; gridcost *= supp; } // FIXME: heuristics could be improved gridcost /= nthreads; // assume perfect scaling for now constexpr double max_fft_scaling = 6; constexpr double scaling_power=2; auto sigmoid = [](double x, double m, double s) { auto x2 = x-1; auto m2 = m-1; return 1.+x2/pow((1.+pow(x2/m2,s)),1./s); }; fftcost /= sigmoid(nthreads, max_fft_scaling, scaling_power); double cost = fftcost+gridcost; if (cost &uvw, const cmav &freq, const cmav,2> &ms_in_, vmav,2> &ms_out_, const cmav &dirty_in_, vmav &dirty_out_, const cmav &wgt_, const cmav &mask_, double pixsize_x_, double pixsize_y_, double epsilon_, bool do_wgridding_, size_t nthreads_, size_t verbosity_, bool negate_v_, bool divide_by_n_, double sigma_min_, double sigma_max_, double center_x, double center_y, bool allow_nshift) : gridding(ms_out_.size()==0), timers(gridding ? "gridding" : "degridding"), ms_in(ms_in_), ms_out(ms_out_), dirty_in(dirty_in_), dirty_out(dirty_out_), wgt(wgt_), mask(mask_), lmask(gridding ? ms_in.shape() : ms_out.shape()), pixsize_x(pixsize_x_), pixsize_y(pixsize_y_), nxdirty(gridding ? dirty_out.shape(0) : dirty_in.shape(0)), nydirty(gridding ? dirty_out.shape(1) : dirty_in.shape(1)), epsilon(epsilon_), do_wgridding(do_wgridding_), nthreads(adjust_nthreads(nthreads_)), verbosity(verbosity_), negate_v(negate_v_), divide_by_n(divide_by_n_), sigma_min(sigma_min_), sigma_max(sigma_max_), lshift(center_x), mshift(negate_v ? -center_y : center_y), lmshift((lshift!=0) || (mshift!=0)), no_nshift(!allow_nshift) { timers.push("Baseline construction"); bl = Baselines(uvw, freq, negate_v); MR_assert(bl.Nrows()<(uint64_t(1)<<32), "too many rows in the MS"); MR_assert(bl.Nchannels()<(uint64_t(1)<<16), "too many channels in the MS"); timers.pop(); scanData(); if (nvis==0) { if (gridding) mav_apply([](Timg &v){v=Timg(0);}, nthreads, dirty_out); return; } auto kidx = getNuNv(); MR_assert((nu>>log2tile)<(size_t(1)<<16), "nu too large"); MR_assert((nv>>log2tile)<(size_t(1)<<16), "nv too large"); ofactor = min(double(nu)/nxdirty, double(nv)/nydirty); krn = selectKernel(kidx); supp = krn->support(); nsafe = (supp+1)/2; ushift = supp*(-0.5)+1+nu; vshift = supp*(-0.5)+1+nv; maxiu0 = (nu+nsafe)-supp; maxiv0 = (nv+nsafe)-supp; vlim = min(nv/2, size_t(nv*bl.Vmax()*pixsize_y+0.5*supp+1)); uv_side_fast = true; size_t vlim2 = (nydirty+1)/2+(supp+1)/2; if (vlim2=2*nsafe, "nu too small"); MR_assert(nv>=2*nsafe, "nv too small"); MR_assert((nxdirty&1)==0, "nx_dirty must be even"); MR_assert((nydirty&1)==0, "ny_dirty must be even"); MR_assert((nu&1)==0, "nu must be even"); MR_assert((nv&1)==0, "nv must be even"); MR_assert(epsilon>0, "epsilon must be positive"); MR_assert(pixsize_x>0, "pixsize_x must be positive"); MR_assert(pixsize_y>0, "pixsize_y must be positive"); countRanges(); report(); gridding ? 
x2dirty() : dirty2x(); if (verbosity>0) timers.report(cout); } }; template void ms2dirty(const cmav &uvw, const cmav &freq, const cmav,2> &ms, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav &dirty, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0, bool allow_nshift=true) { auto ms_out(vmav,2>::build_empty()); auto dirty_in(vmav::build_empty()); auto wgt(wgt_.size()!=0 ? wgt_ : wgt_.build_uniform(ms.shape(), 1.)); auto mask(mask_.size()!=0 ? mask_ : mask_.build_uniform(ms.shape(), 1)); Wgridder par(uvw, freq, ms, ms_out, dirty_in, dirty, wgt, mask, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y, allow_nshift); } template void dirty2ms(const cmav &uvw, const cmav &freq, const cmav &dirty, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav,2> &ms, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0, bool allow_nshift=true) { if (ms.size()==0) return; // nothing to do auto ms_in(ms.build_uniform(ms.shape(),1.)); auto dirty_out(vmav::build_empty()); auto wgt(wgt_.size()!=0 ? wgt_ : wgt_.build_uniform(ms.shape(), 1.)); auto mask(mask_.size()!=0 ? mask_ : mask_.build_uniform(ms.shape(), 1)); Wgridder par(uvw, freq, ms_in, ms, dirty, dirty_out, wgt, mask, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y, allow_nshift); } tuple get_facet_data(size_t npix_x, size_t npix_y, size_t nfx, size_t nfy, size_t ifx, size_t ify, double pixsize_x, double pixsize_y, double center_x, double center_y); template void ms2dirty_faceted(size_t nfx, size_t nfy, const cmav &uvw, const cmav &freq, const cmav,2> &ms, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav &dirty, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0) { size_t npix_x=dirty.shape(0), npix_y=dirty.shape(1); for (size_t i=0; i(dirty, {{startx, stopx}, {starty, stopy}}); ms2dirty(uvw, freq, ms, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, subdirty, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, cx, cy, true); } } template void dirty2ms_faceted(size_t nfx,size_t nfy, const cmav &uvw, const cmav &freq, const cmav &dirty, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav,2> &ms, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0) { size_t npix_x=dirty.shape(0), npix_y=dirty.shape(1); size_t istep = (npix_x+nfx-1) / nfx; istep += istep%2; // make even size_t jstep = (npix_y+nfy-1) / nfy; jstep += jstep%2; // make even vmav,2> ms2(ms.shape(), UNINITIALIZED); mav_apply([](complex &v){v=complex(0);},nthreads,ms); for (size_t i=0; i(dirty, {{startx, stopx}, {starty, stopy}}); dirty2ms(uvw, freq, subdirty, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, ms2, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, cx, cy, true); mav_apply([](complex &v1, const complex 
&v2){v1+=v2;},nthreads,ms,ms2); } } tuple,size_t,size_t, size_t> get_tuning_parameters(const cmav &uvw, const cmav &freq, const cmav &mask_, size_t npix_x, size_t npix_y, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, size_t verbosity, double center_x, double center_y); template void ms2dirty_tuning(const cmav &uvw, const cmav &freq, const cmav,2> &ms, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav &dirty, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0) { { size_t npix_x=dirty.shape(0), npix_y=dirty.shape(1); bool xodd=npix_x&1, yodd=npix_y&1; if (xodd || yodd) // odd image size { vmav tdirty({npix_x+xodd, npix_y+yodd}, UNINITIALIZED); ms2dirty_tuning(uvw, freq, ms, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, tdirty, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x+0.5*pixsize_x*xodd, center_y+0.5*pixsize_y*yodd); for (size_t i=0; i(uvw, freq, ms, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, dirty, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y, true); else ms2dirty_faceted(nfx, nfy, uvw, freq, ms, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, dirty, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y); } else { auto mask(mask_.size()!=0 ? mask_ : mask_.build_uniform(ms.shape(), 1)); vmav mask2({uvw.shape(0),freq.shape(0)}, UNINITIALIZED); auto icut_local = icut; // FIXME: stupid hack to work around an oversight in the standard(?) mav_apply([&](uint8_t i1, uint8_t i2, uint8_t &out) { out = (i1!=0) && (i2>=icut_local); }, nthreads, mask, bin, mask2); ms2dirty_faceted(nfx, nfy, uvw, freq, ms, wgt_, mask2, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, dirty, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y); vmav dirty2(dirty.shape(), UNINITIALIZED); mav_apply([&](uint8_t i1, uint8_t i2, uint8_t &out) { out = (i1!=0) && (i2(uvw, freq, ms, wgt_, mask2, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, dirty2, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y, true); mav_apply([&](Timg &v1, Timg v2) {v1+=v2;}, nthreads, dirty, dirty2); } } template void dirty2ms_tuning(const cmav &uvw, const cmav &freq, const cmav &dirty, const cmav &wgt_, const cmav &mask_, double pixsize_x, double pixsize_y, double epsilon, bool do_wgridding, size_t nthreads, vmav,2> &ms, size_t verbosity, bool negate_v=false, bool divide_by_n=true, double sigma_min=1.1, double sigma_max=2.6, double center_x=0, double center_y=0) { { size_t npix_x=dirty.shape(0), npix_y=dirty.shape(1); bool xodd=npix_x&1, yodd=npix_y&1; if (xodd || yodd) // odd image size { vmav tdirty({npix_x+xodd, npix_y+yodd}, UNINITIALIZED); for (size_t i=0; i(uvw, freq, tdirty, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, ms, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y); return; } } auto [bin, icut, nfx, nfy] = get_tuning_parameters(uvw, freq, mask_, dirty.shape(0), dirty.shape(1), pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, verbosity, center_x, center_y); if (bin.size()==0) { if (nfx==0) // traditional algorithm dirty2ms(uvw, freq, dirty, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, ms, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, 
center_y, true); else dirty2ms_faceted(nfx, nfy, uvw, freq, dirty, wgt_, mask_, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, ms, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y); } else { auto mask(mask_.size()!=0 ? mask_ : mask_.build_uniform(ms.shape(), 1)); vmav mask2({uvw.shape(0),freq.shape(0)}, UNINITIALIZED); auto icut_local = icut; // FIXME: stupid hack to work around an oversight in the standard(?) mav_apply([&](uint8_t i1, uint8_t i2, uint8_t &out) { out = (i1!=0) && (i2>=icut_local); }, nthreads, mask, bin, mask2); dirty2ms_faceted(nfx, nfy, uvw, freq, dirty, wgt_, mask2, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, ms, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y); mav_apply([&](uint8_t i1, uint8_t i2, uint8_t &out) { out = (i1!=0) && (i2,2> tms(ms.shape(), UNINITIALIZED); dirty2ms(uvw, freq, dirty, wgt_, mask2, pixsize_x, pixsize_y, epsilon, do_wgridding, nthreads, tms, verbosity, negate_v, divide_by_n, sigma_min, sigma_max, center_x, center_y, true); mav_apply([&](complex &v1, complex v2) {v1+=v2;}, nthreads, ms, tms); } } } // namespace detail_gridder // public names using detail_gridder::ms2dirty; using detail_gridder::dirty2ms; using detail_gridder::ms2dirty_tuning; using detail_gridder::dirty2ms_tuning; } // namespace ducc0 #endif
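
/*
Usage sketch (illustrative only, not part of the header proper): calling the
public entry points ms2dirty() and dirty2ms() exported above. The argument
order follows the declarations in this file; the explicit template arguments
(assumed here to be <Tcalc, Tacc>, with the visibility and image value types
deduced), the vmav shape constructor, and the element types chosen for the
weight (float) and mask (uint8_t) arrays are assumptions, since the template
parameter lists were stripped from this copy of the file.

  #include <complex>
  #include <cstdint>
  #include "ducc0/wgridder/wgridder.h"

  int main()
    {
    using namespace ducc0;
    using namespace std;
    size_t nrow=1000, nchan=4, nx=256, ny=256;   // image dimensions must be even
    vmav<double,2> uvw({nrow,3});                // (u,v,w) per row, in metres
    vmav<double,1> freq({nchan});                // channel frequencies in Hz, ascending
    vmav<complex<float>,2> ms({nrow,nchan});     // measured visibilities
    // ... fill uvw, freq and ms ...
    auto wgt  = vmav<float,2>::build_empty();    // empty => uniform weights
    auto mask = vmav<uint8_t,2>::build_empty();  // empty => use all samples
    vmav<float,2> dirty({nx,ny});
    double pixsize = 1e-5;                       // pixel size in radians
    // visibilities -> dirty image (gridding)
    ms2dirty<float,float>(uvw, freq, ms, wgt, mask, pixsize, pixsize,
                          1e-4, true, 4, dirty, 0);
    // dirty image -> model visibilities (degridding, the adjoint direction)
    vmav<complex<float>,2> model({nrow,nchan});
    dirty2ms<float,float>(uvw, freq, dirty, wgt, mask, pixsize, pixsize,
                          1e-4, true, 4, model, 0);
    return 0;
    }
*/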
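
/*
Note on the w-stacking geometry set up in Wgridder::countRanges(), restated
from the code above: the plane spacing and plane count are

  dw      = 0.5 / ofactor / max(|nm1max + nshift|, |nm1min + nshift|)
  nplanes = (wmax_d - wmin_d)/dw + supp

so every visibility is gridded onto `supp` consecutive w planes spaced dw
apart. As a purely illustrative example (the numbers are made up): with
ofactor = 1.8, max|nm1 + nshift| = 0.01, wmax_d - wmin_d = 500 wavelengths
and a kernel support of 8, this gives dw ~ 27.8 and nplanes ~ 26.
*/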