/*
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* Copyright (C) 2019-2023 Max-Planck-Society
   Author: Martin Reinecke */

#ifndef DUCC0_NUFFT_H
#define DUCC0_NUFFT_H

#include <cstring>
#include <cstdint>
#include <cmath>
#include <complex>
#include <algorithm>
#include <array>
#include <vector>
#include <tuple>
#include <memory>
#include <numeric>
#include <functional>
#include <iostream>
#include <sstream>
#include <mutex>
#include <type_traits>
#include <utility>
#if ((!defined(DUCC0_NO_SIMD)) && (defined(__AVX__)||defined(__SSE3__)))
#include <x86intrin.h>
#endif

#include "ducc0/infra/error_handling.h"
#include "ducc0/math/constants.h"
#include "ducc0/fft/fft.h"
#include "ducc0/infra/threading.h"
#include "ducc0/infra/misc_utils.h"
#include "ducc0/infra/useful_macros.h"
#include "ducc0/infra/mav.h"
#include "ducc0/infra/simd.h"
#include "ducc0/infra/timers.h"
#include "ducc0/infra/bucket_sort.h"
#include "ducc0/math/gridding_kernel.h"

namespace ducc0 {

namespace detail_nufft {

using namespace std;
// the next line is necessary to address some sloppy name choices in hipSYCL
using std::min, std::max;

// Generally we want to use SIMD types with the largest possible size, but not
// larger than 8; length-16 SIMD types (like full AVX512 float32 vectors) would
// be overkill for typical kernel supports (we don't let float32 kernels have
// a support larger than 8 anyway).
template<typename T> constexpr inline int good_simdlen
  = min<int>(8, native_simd<T>::size());

template<typename T> using mysimd = typename simd_select<T,good_simdlen<T>>::type;

/// convenience function for squaring a number
template<typename T> T sqr(T val) { return val*val; }
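// Illustrative sanity checks (not part of the original library): they merely
// restate the clamping property of good_simdlen defined above and hold on any
// target, whether or not hardware SIMD is available.
static_assert((good_simdlen<float> >= 1) && (good_simdlen<float> <= 8),
  "float SIMD length must lie in [1,8]");
static_assert((good_simdlen<double> >= 1) && (good_simdlen<double> <= 8),
  "double SIMD length must lie in [1,8]");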
/// Function for quickly zeroing a 2D array with arbitrary strides.
template<typename T> void quickzero(vmav<T,2> &arr, size_t nthreads)
  {
#if 0
  arr.fill(T(0));
#else
  MR_assert((arr.stride(0)>0) && (arr.stride(1)>0), "bad memory ordering");
  MR_assert(arr.stride(0)>=arr.stride(1), "bad memory ordering");
  size_t s0=arr.shape(0), s1=arr.shape(1);
  execParallel(s0, nthreads, [&](size_t lo, size_t hi)
    {
    if (arr.stride(1)==1)
      {
      if (size_t(arr.stride(0))==arr.shape(1))
        memset(reinterpret_cast<char *>(&arr(lo,0)), 0, sizeof(T)*s1*(hi-lo));
      else
        for (auto i=lo; i<hi; ++i)
          memset(reinterpret_cast<char *>(&arr(i,0)), 0, sizeof(T)*s1);
      }
    else
      for (auto i=lo; i<hi; ++i)
        for (size_t j=0; j<s1; ++j)
          arr(i,j) = T(0);
    });
#endif
  }

template<typename T> complex<T> hsum_cmplx(mysimd<T> vr, mysimd<T> vi)
  { return complex<T>(reduce(vr, plus<>()), reduce(vi, plus<>())); }

#if (!defined(DUCC0_NO_SIMD))
#if (defined(__AVX__))
#if 1
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
  auto t1 = _mm256_hadd_ps(__m256(vr), __m256(vi));
  auto t2 = _mm_hadd_ps(_mm256_extractf128_ps(t1, 0), _mm256_extractf128_ps(t1, 1));
  t2 += _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(1,0,3,2));
  return complex<float>(t2[0], t2[1]);
  }
#else
// this version may be slightly faster, but this needs more benchmarking
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
  auto t1 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(0,2,0,2));
  auto t2 = _mm256_shuffle_ps(vr, vi, _MM_SHUFFLE(1,3,1,3));
  auto t3 = _mm256_add_ps(t1,t2);
  t3 = _mm256_shuffle_ps(t3, t3, _MM_SHUFFLE(3,0,2,1));
  auto t4 = _mm_add_ps(_mm256_extractf128_ps(t3, 1), _mm256_castps256_ps128(t3));
  auto t5 = _mm_add_ps(t4, _mm_movehl_ps(t4, t4));
  return complex<float>(t5[0], t5[1]);
  }
#endif
#elif defined(__SSE3__)
template<> inline complex<float> hsum_cmplx<float>(mysimd<float> vr, mysimd<float> vi)
  {
  auto t1 = _mm_hadd_ps(__m128(vr), __m128(vi));
  t1 += _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1));
  return complex<float>(t1[0], t1[2]);
  }
#endif
#endif

[[gnu::always_inline]] [[gnu::hot]]
inline auto comp_indices(size_t idx, size_t nuni, size_t nbig, bool fft_order)
  {
  int icf = abs(int(nuni/2)-int(idx));
  size_t i1 = fft_order ? nuni-nuni/2+idx : idx;
  if (i1>=nuni) i1-=nuni;
  size_t i2 = nbig-nuni/2+idx;
  if (i2>=nbig) i2-=nbig;
  return make_tuple(icf, i1, i2);
  }
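/* Worked example for comp_indices (illustration only; the values follow
   directly from the arithmetic above): with nuni=4, nbig=8 and
   fft_order=true, the monotonic input index idx (representing frequency
   idx-nuni/2) maps to
     idx : 0  1  2  3    (frequencies -2 -1  0  1)
     icf : 2  1  0  1    (distance from the zero mode, for the correction factor)
     i1  : 2  3  0  1    (position in the FFT-ordered uniform grid)
     i2  : 6  7  0  1    (position in the oversampled grid)
   With fft_order=false, i1 would simply be 0 1 2 3. */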
/*! Selects the most efficient combination of gridding kernel and
    oversampled grid size for the provided problem parameters. */
template<typename Tcalc, typename Tacc> auto findNufftParameters(double epsilon,
  double sigma_min, double sigma_max, const vector<size_t> &dims,
  size_t npoints, bool gridding, size_t nthreads)
  {
  auto vlen = gridding ? mysimd<Tacc>::size() : mysimd<Tcalc>::size();
  auto ndim = dims.size();
  auto idx = getAvailableKernels<Tcalc>(epsilon, ndim, sigma_min, sigma_max);
  double mincost = 1e300;
  constexpr double nref_fft=2048;
  constexpr double costref_fft=0.0693;
  vector<size_t> bigdims(ndim, 0);
  size_t minidx=~(size_t(0));
  for (size_t i=0; i<idx.size(); ++i)
    {
    const auto &krn(getKernel(idx[i]));
    auto supp = krn.W;
    auto nvec = (supp+vlen-1)/vlen;
    auto ofactor = krn.ofactor;
    vector<size_t> lbigdims(ndim,0);
    double gridsize=1;
    for (size_t idim=0; idim<ndim; ++idim)
      {
      lbigdims[idim] = 2*good_size_complex(size_t(dims[idim]*ofactor*0.5)+1);
      lbigdims[idim] = max<size_t>(lbigdims[idim], 16);
      gridsize *= lbigdims[idim];
      }
    double logterm = log(gridsize)/log(nref_fft*nref_fft);
    double fftcost = gridsize/(nref_fft*nref_fft)*logterm*costref_fft;
    size_t kernelpoints = nvec*vlen;
    for (size_t idim=0; idim+1<ndim; ++idim)
      kernelpoints*=supp;
    double gridcost = 2.2e-10*npoints*(kernelpoints + (ndim*nvec*(supp+3)*vlen));
    if (gridding) gridcost *= sizeof(Tacc)/sizeof(Tcalc);
    // assume (pessimistically) that FFT scaling saturates, while gridding
    // scales perfectly with the number of threads
    gridcost /= nthreads;
    constexpr double max_fft_scaling = 6;
    constexpr double scaling_power = 2;
    auto sigmoid = [](double x, double m, double s)
      {
      auto x2 = x-1;
      auto m2 = m-1;
      return 1.+x2/pow((1.+pow(x2/m2,s)),1./s);
      };
    fftcost /= sigmoid(nthreads, max_fft_scaling, scaling_power);
    double cost = fftcost+gridcost;
    if (cost<mincost)
      {
      mincost=cost;
      bigdims=lbigdims;
      minidx=idx[i];
      }
    }
  return make_tuple(minidx, bigdims);
  }

/*! Selects the most efficient gridding kernel for the provided problem
    parameters, assuming the oversampled grid dimensions are chosen
    accordingly. */
template<typename Tcalc, typename Tacc> auto findNufftKernel(double epsilon,
  double sigma_min, double sigma_max, const vector<size_t> &dims,
  size_t npoints, bool gridding, size_t nthreads)
  {
  auto vlen = gridding ? mysimd<Tacc>::size() : mysimd<Tcalc>::size();
  auto ndim = dims.size();
  auto idx = getAvailableKernels<Tcalc>(epsilon, ndim, sigma_min, sigma_max);
  double mincost = 1e300;
  constexpr double nref_fft=2048;
  constexpr double costref_fft=0.0693;
  size_t minidx=~(size_t(0));
  for (size_t i=0; i<idx.size(); ++i)
    {
    const auto &krn(getKernel(idx[i]));
    auto supp = krn.W;
    auto nvec = (supp+vlen-1)/vlen;
    auto ofactor = krn.ofactor;
    double gridsize=1;
    for (size_t idim=0; idim<ndim; ++idim)
      {
      size_t bigdim = 2*good_size_complex(size_t(dims[idim]*ofactor*0.5)+1);
      bigdim = max<size_t>(bigdim, 16);
      gridsize *= bigdim;
      }
    double logterm = log(gridsize)/log(nref_fft*nref_fft);
    double fftcost = gridsize/(nref_fft*nref_fft)*logterm*costref_fft;
    size_t kernelpoints = nvec*vlen;
    for (size_t idim=0; idim+1<ndim; ++idim)
      kernelpoints*=supp;
    double gridcost = 2.2e-10*npoints*(kernelpoints + (ndim*nvec*(supp+3)*vlen));
    if (gridding) gridcost *= sizeof(Tacc)/sizeof(Tcalc);
    gridcost /= nthreads;
    constexpr double max_fft_scaling = 6;
    constexpr double scaling_power = 2;
    auto sigmoid = [](double x, double m, double s)
      {
      auto x2 = x-1;
      auto m2 = m-1;
      return 1.+x2/pow((1.+pow(x2/m2,s)),1./s);
      };
    fftcost /= sigmoid(nthreads, max_fft_scaling, scaling_power);
    double cost = fftcost+gridcost;
    if (cost<mincost)
      {
      mincost=cost;
      minidx=idx[i];
      }
    }
  return minidx;
  }
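/* Sketch of the cost model used above (illustration only): the FFT cost is
   assumed to grow as N*log(N), normalized so that a complex FFT over
   nref_fft^2 = 2048^2 grid points costs costref_fft = 0.0693 units
   (presumably seconds, measured once on a reference machine), while the
   spreading/interpolation cost is proportional to the number of nonuniform
   points times the number of kernel evaluations per point. Kernels with
   larger support permit smaller oversampling factors (cheaper FFTs) at the
   price of more expensive gridding; the loops simply pick the candidate with
   the minimum summed estimate. */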
// Base-2 logarithm of the linear tile size, depending on dimensionality and
// the calculation/accumulation precisions. (The exact parameter combinations
// were unrecoverable from the damaged source; the values are preserved.)
template<size_t ndim, typename Tcalc, typename Tacc> constexpr inline int log2tile_=-1;
template<> constexpr inline int log2tile_<1,float ,float > = 9;
template<> constexpr inline int log2tile_<1,float ,double> = 9;
template<> constexpr inline int log2tile_<1,double,double> = 9;
template<> constexpr inline int log2tile_<2,float ,float > = 4;
template<> constexpr inline int log2tile_<2,float ,double> = 4;
template<> constexpr inline int log2tile_<2,double,double> = 5;
#ifdef NEW_DUMP
template<> constexpr inline int log2tile_<3,float ,float > = 5;
template<> constexpr inline int log2tile_<3,float ,double> = 5;
#else
template<> constexpr inline int log2tile_<3,float ,float > = 4;
template<> constexpr inline int log2tile_<3,float ,double> = 4;
#endif
template<> constexpr inline int log2tile_<3,double,double> = 4;

template<size_t ndim> constexpr inline size_t max_ntile=-1;
template<> constexpr inline size_t max_ntile<1> = (~uint32_t(0))-10;
template<> constexpr inline size_t max_ntile<2> = (uint32_t(1<<16))-10;
template<> constexpr inline size_t max_ntile<3> = (uint32_t(1<<10))-10;

template<size_t ndim, typename Tcalc, typename Tacc, typename Tcoord>
  class Nufft_ancestor
  {
  protected:
    TimerHierarchy timers;
    // requested epsilon value for this transform.
    double epsilon;
    // number of threads to use for this transform.
    size_t nthreads;
    // 1./(periodicity of the coordinates)
    double coordfct;
    // if true, start with zero mode
    // if false, start with most negative mode
    bool fft_order;
    // number of non-uniform points
    size_t npoints;
    // uniform grid dimensions
    array<size_t,ndim> nuni;
    // oversampled grid dimensions
    array<size_t,ndim> nover;
    // holds the indices of the nonuniform points in the order in which they
    // should be processed
    quick_array<uint32_t> coord_idx;

    shared_ptr<PolynomialKernel> krn;
    size_t supp, nsafe;
    array<double,ndim> shift;
    array<int,ndim> maxi0;
    vector<vector<double>> corfac;

    // the base-2 logarithm of the linear dimension of a computational tile.
    constexpr static int log2tile = log2tile_<ndim,Tcalc,Tacc>;

    static_assert(sizeof(Tcalc)<=sizeof(Tacc),
      "Tacc must be at least as accurate as Tcalc");

    /*! Compute minimum index in the oversampled grid touched by the kernel
        around coordinate \a in. */
    template<typename Tcoord2> [[gnu::always_inline]] void getpix(
      array<Tcoord2,ndim> in, array<double,ndim> &out,
      array<int,ndim> &out0) const
      {
      // do range reduction in long double when Tcoord is double,
      // to avoid inaccuracies with very large grids
      using Tbig = typename conditional<is_same<Tcoord2,double>::value,
        long double, double>::type;
      for (size_t i=0; i<ndim; ++i)
        {
        auto tmp = in[i]*coordfct;
        auto tmp2 = Tbig(tmp-floor(tmp))*nover[i];
        out0[i] = min(int(tmp2+shift[i])-int(nover[i]), maxi0[i]);
        out[i] = double(tmp2-out0[i]);
        }
      }

    /*! Compute index of the tile into which \a in falls. */
    template<typename Tcoord2> [[gnu::always_inline]] array<uint32_t,ndim>
      get_tile(const array<Tcoord2,ndim> &in) const
      {
      array<double,ndim> dum;
      array<int,ndim> i0;
      getpix(in, dum, i0);
      array<uint32_t,ndim> res;
      for (size_t i=0; i<ndim; ++i)
        res[i] = uint32_t((i0[i]+nsafe)>>log2tile);
      return res;
      }
    template<typename Tcoord2> [[gnu::always_inline]] array<uint32_t,ndim>
      get_tile(const array<Tcoord2,ndim> &in, size_t lsq2) const
      {
      array<double,ndim> dum;
      array<int,ndim> i0;
      getpix(in, dum, i0);
      array<uint32_t,ndim> res;
      for (size_t i=0; i<ndim; ++i)
        res[i] = uint32_t((i0[i]+nsafe)>>lsq2);
      return res;
      }

    template<typename Tcoord2> void sort_coords(const cmav<Tcoord2,2> &coords,
      vmav<Tcoord2,2> &coords_sorted)
      {
      timers.push("sorting coords");
      execParallel(npoints, nthreads, [&](size_t lo, size_t hi)
        {
        for (size_t i=lo; i<hi; ++i)
          for (size_t d=0; d<ndim; ++d)
            coords_sorted(i,d) = coords(coord_idx[i],d);
        });
      timers.pop();
      }
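    /* Worked example (illustration only, using the shift convention defined
       in the constructor below): with periodicity 1 (coordfct==1),
       nover[0]==16 and supp==4 (so nsafe==2), the coordinate u=0.3 is first
       reduced to [0,1), then scaled to x=4.8 on the oversampled grid;
       getpix returns i0[0]=3 (the first grid cell touched by the kernel,
       which covers cells 3..6) and the fractional offset frac=1.8 used for
       the kernel evaluation. get_tile then shifts i0 into the nonnegative
       range and divides by the tile size 2^log2tile. */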
"nu2u" : "u2nu"), epsilon(epsilon_), nthreads(adjust_nthreads(nthreads_)), coordfct(1./periodicity), fft_order(fft_order_), npoints(npoints_), nuni(uniform_shape) { MR_assert(npoints<=(~uint32_t(0)), "too many nonuniform points"); timers.push("parameter calculation"); vector tdims{nuni.begin(), nuni.end()}; auto [kidx, dims] = findNufftParameters (epsilon, sigma_min, sigma_max, tdims, npoints, gridding, nthreads); for (size_t i=0; i>log2tile)<=max_ntile, "oversampled grid too large"); } timers.pop(); krn = selectKernel(kidx); supp = krn->support(); nsafe = (supp+1)/2; for (size_t i=0; i=2*nsafe, "oversampled length too small"); MR_assert((nover[i]&1)==0, "oversampled dimensions must be even"); } MR_assert(epsilon>0, "epsilon must be positive"); timers.push("correction factors"); for (size_t i=0; icorfunc(nuni[i]/2+1, 1./nover[i], nthreads)); else corfac.push_back(corfac.back()); timers.pop(); } }; template class Nufft; #define DUCC0_NUFFT_BOILERPLATE \ private: \ using parent=Nufft_ancestor; \ using parent::coord_idx, parent::nthreads, parent::npoints, parent::supp, \ parent::timers, parent::krn, parent::fft_order, parent::nuni, \ parent::nover, parent::shift, parent::maxi0, parent::report, \ parent::log2tile, parent::corfac, parent::sort_coords, \ parent::prep_nu2u, parent::prep_u2nu; \ \ vmav coords_sorted; \ \ public: \ using parent::parent; /* inherit constructor */ \ Nufft(bool gridding, const cmav &coords, \ const array &uniform_shape_, double epsilon_, \ size_t nthreads_, double sigma_min, double sigma_max, \ double periodicity, bool fft_order_) \ : parent(gridding, coords.shape(0), uniform_shape_, epsilon_, nthreads_, \ sigma_min, sigma_max, periodicity, fft_order_), \ coords_sorted({npoints,ndim},UNINITIALIZED) \ { \ build_index(coords); \ sort_coords(coords, coords_sorted); \ } \ \ template void nu2u(bool forward, size_t verbosity, \ const cmav,1> &points, vmav,ndim> &uniform) \ { \ if (prep_nu2u(points, uniform)) return; \ MR_assert(coords_sorted.size()!=0, "bad call"); \ if (verbosity>0) report(true); \ nonuni2uni(forward, coords_sorted, points, uniform); \ if (verbosity>0) timers.report(cout); \ } \ template void u2nu(bool forward, size_t verbosity, \ const cmav,ndim> &uniform, vmav,1> &points) \ { \ if (prep_u2nu(points, uniform)) return; \ MR_assert(coords_sorted.size()!=0, "bad call"); \ if (verbosity>0) report(false); \ uni2nonuni(forward, uniform, coords_sorted, points); \ if (verbosity>0) timers.report(cout); \ } \ template void nu2u(bool forward, size_t verbosity, \ const cmav &coords, const cmav,1> &points, \ vmav,ndim> &uniform) \ { \ if (prep_nu2u(points, uniform)) return; \ MR_assert(coords_sorted.size()==0, "bad call"); \ if (verbosity>0) report(true); \ build_index(coords); \ nonuni2uni(forward, coords, points, uniform); \ if (verbosity>0) timers.report(cout); \ } \ template void u2nu(bool forward, size_t verbosity, \ const cmav,ndim> &uniform, const cmav &coords, \ vmav,1> &points) \ { \ if (prep_u2nu(points, uniform)) return; \ MR_assert(coords_sorted.size()==0, "bad call"); \ if (verbosity>0) report(false); \ build_index(coords); \ uni2nonuni(forward, uniform, coords, points); \ if (verbosity>0) timers.report(cout); \ } /*! Helper class for carrying out 1D nonuniform FFTs of types 1 and 2. Tcalc: the floating-point type in which all kernel-related calculations are performed Tacc: the floating-point type used for the grid on which data is accumulated in nu2u transforms. 
/*! Helper class for carrying out 1D nonuniform FFTs of types 1 and 2.
    Tcalc: the floating-point type in which all kernel-related calculations
           are performed
    Tacc:  the floating-point type used for the grid on which data is
           accumulated in nu2u transforms. Can usually be the same as Tcalc,
           but may be chosen to be more accurate in specific situations.
    Tpoints: the floating-point type used for storing the values at the
           non-uniform points
    Tgrid: the floating-point type used for storing the values on the
           uniform grid.
    Tcoord: the floating-point type used for storing the coordinates of the
           non-uniform points. */
template<typename Tcalc, typename Tacc, typename Tcoord>
  class Nufft<1, Tcalc, Tacc, Tcoord>:
    public Nufft_ancestor<1, Tcalc, Tacc, Tcoord>
  {
  private:
    static constexpr size_t ndim=1;

DUCC0_NUFFT_BOILERPLATE

  private:
    template<size_t supp> class HelperNu2u
      {
      public:
        static constexpr size_t vlen = mysimd<Tacc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = 2*nsafe+(1<<log2tile);
        static constexpr int suvec = su+vlen-1;
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tacc>> tkrn;
        vmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the current nonuniform point
        array<int,ndim> b0; // start index of the current buffer
        vmav<Tacc,ndim> bufr, bufi;
        Tacc *px0r, *px0i;
        Mutex &mylock;

        // add the accumulated local tile to the global oversampled grid
        DUCC0_NOINLINE void dump()
          {
          if (b0[0]<-nsafe) return; // nothing written into buffer yet
          int inu = int(parent->nover[0]);
          {
          LockGuard lock(mylock);
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            {
            grid(idxu) += complex<Tcalc>(Tcalc(bufr(iu)), Tcalc(bufi(iu)));
            bufr(iu) = bufi(iu) = 0;
            }
          }
          }

      public:
        Tacc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i;
        union kbuf {
          Tacc scalar[nvec*vlen];
          mysimd<Tacc> simd[nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperNu2u(const Nufft *parent_, vmav<complex<Tcalc>,ndim> &grid_,
          Mutex &mylock_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000}, b0{-1000000},
            bufr({size_t(suvec)}), bufi({size_t(suvec)}),
            px0r(bufr.data()), px0i(bufi.data()), mylock(mylock_) {}
        ~HelperNu2u() { dump(); }

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          tkrn.eval1(Tacc(x0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[0]+int(supp)>b0[0]+su))
            {
            dump();
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            }
          p0r = px0r+i0[0]-b0[0];
          p0i = px0i+i0[0]-b0[0];
          }
      };

    template<size_t supp> class HelperU2nu
      {
      public:
        static constexpr size_t vlen = mysimd<Tcalc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = 2*nsafe+(1<<log2tile);
        static constexpr int suvec = su+vlen-1;
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tcalc>> tkrn;
        const cmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the current nonuniform point
        array<int,ndim> b0; // start index of the current buffer
        vmav<Tcalc,ndim> bufr, bufi;
        const Tcalc *px0r, *px0i;

        // load a tile from the global oversampled grid into local buffer
        DUCC0_NOINLINE void load()
          {
          int inu = int(parent->nover[0]);
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            {
            bufr(iu) = grid(idxu).real();
            bufi(iu) = grid(idxu).imag();
            }
          }

      public:
        const Tcalc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i;
        union kbuf {
          Tcalc scalar[nvec*vlen];
          mysimd<Tcalc> simd[nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperU2nu(const Nufft *parent_, const cmav<complex<Tcalc>,ndim> &grid_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000}, b0{-1000000},
            bufr({size_t(suvec)}), bufi({size_t(suvec)}),
            px0r(bufr.data()), px0i(bufi.data()) {}

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          tkrn.eval1(Tcalc(x0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[0]+int(supp)>b0[0]+su))
            {
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            load();
            }
          p0r = px0r+i0[0]-b0[0];
          p0i = px0i+i0[0]-b0[0];
          }
      };
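    /* Note on the buffering strategy above (illustration): both helpers keep
       a small contiguous window of the oversampled grid (one tile of
       2^log2tile entries plus 2*nsafe boundary cells) in thread-local
       storage. Because the nonuniform points were bucket-sorted by tile,
       consecutive points usually hit the same window, so the expensive
       wrap-around addressing into the global grid happens only inside dump()
       resp. load(), once per tile change rather than once per point. */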
    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void spreading_helper
      (size_t supp, const cmav<Tcoord,2> &coords,
       const cmav<complex<Tpoints>,1> &points,
       vmav<complex<Tcalc>,ndim> &grid) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return spreading_helper<SUPP/2>(supp, coords, points, grid);
      if constexpr (SUPP>4)
        if (supp<SUPP) return spreading_helper<SUPP-1>(supp, coords, points, grid);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      Mutex mylock;
      size_t chunksz = max<size_t>(1000, npoints/(10*nthreads));
      execDynamic(npoints, nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperNu2u<SUPP> hlp(this, grid, mylock);
        const auto * DUCC0_RESTRICT ku = hlp.buf.simd;
        constexpr size_t lookahead=10;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          if (ix+lookahead<rng.hi)
            {
            auto nextidx = sorted ? ix+lookahead : coord_idx[ix+lookahead];
            DUCC0_PREFETCH_R(&points(nextidx));
            if (!sorted) DUCC0_PREFETCH_R(&coords(nextidx,0));
            }
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0)});
          auto v(points(row));
          mysimd<Tacc> vr(v.real()), vi(v.imag());
          for (size_t cu=0; cu<hlp.nvec; ++cu)
            {
            auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*hlp.vlen;
            auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*hlp.vlen;
            auto tr = mysimd<Tacc>(pxr,element_aligned_tag());
            tr += vr*ku[cu];
            tr.copy_to(pxr,element_aligned_tag());
            auto ti = mysimd<Tacc>(pxi, element_aligned_tag());
            ti += vi*ku[cu];
            ti.copy_to(pxi,element_aligned_tag());
            }
          }
        });
      }

    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void interpolation_helper
      (size_t supp, const cmav<complex<Tcalc>,ndim> &grid,
       const cmav<Tcoord,2> &coords, vmav<complex<Tpoints>,1> &points) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return interpolation_helper<SUPP/2>(supp, grid, coords, points);
      if constexpr (SUPP>4)
        if (supp<SUPP) return interpolation_helper<SUPP-1>(supp, grid, coords, points);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      size_t chunksz = max<size_t>(1000, npoints/(10*nthreads));
      execDynamic(npoints, nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperU2nu<SUPP> hlp(this, grid);
        const auto * DUCC0_RESTRICT ku = hlp.buf.simd;
        constexpr size_t lookahead=10;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          if (ix+lookahead<rng.hi)
            {
            auto nextidx = sorted ? ix+lookahead : coord_idx[ix+lookahead];
            DUCC0_PREFETCH_W(&points(nextidx));
            if (!sorted) DUCC0_PREFETCH_R(&coords(nextidx,0));
            }
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0)});
          mysimd<Tcalc> rr=0, ri=0;
          for (size_t cu=0; cu<hlp.nvec; ++cu)
            {
            const auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*hlp.vlen;
            const auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*hlp.vlen;
            rr += ku[cu]*mysimd<Tcalc>(pxr,element_aligned_tag());
            ri += ku[cu]*mysimd<Tcalc>(pxi,element_aligned_tag());
            }
          points(row) = hsum_cmplx(rr,ri);
          }
        });
      }

    template<typename Tpoints, typename Tgrid> void nonuni2uni(bool forward,
      const cmav<Tcoord,2> &coords, const cmav<complex<Tpoints>,1> &points,
      vmav<complex<Tgrid>,ndim> &uniform)
      {
      timers.push("nu2u proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      mav_apply([](complex<Tcalc> &v){v=complex<Tcalc>(0);},nthreads,grid);
      timers.poppush("spreading");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      spreading_helper<maxsupp>(supp, coords, points, grid);
      timers.poppush("FFT");
      vfmav<complex<Tcalc>> fgrid(grid);
      c2c(fgrid, fgrid, {0}, forward, Tcalc(1), nthreads);
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iout, iin] = comp_indices(i, nuni[0], nover[0], fft_order);
          uniform(iout) = complex<Tgrid>(grid(iin)*Tcalc(corfac[0][icfu]));
          }
        });
      timers.pop();
      timers.pop();
      }

    template<typename Tpoints, typename Tgrid> void uni2nonuni(bool forward,
      const cmav<complex<Tgrid>,ndim> &uniform, const cmav<Tcoord,2> &coords,
      vmav<complex<Tpoints>,1> &points)
      {
      timers.push("u2nu proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      mav_apply([](complex<Tcalc> &v){v=complex<Tcalc>(0);},nthreads,grid);
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iin, iout] = comp_indices(i, nuni[0], nover[0], fft_order);
          grid(iout) = complex<Tcalc>(uniform(iin))*Tcalc(corfac[0][icfu]);
          }
        });
      timers.poppush("FFT");
      vfmav<complex<Tcalc>> fgrid(grid);
      c2c(fgrid, fgrid, {0}, forward, Tcalc(1), nthreads);
      timers.poppush("interpolation");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      interpolation_helper<maxsupp>(supp, grid, coords, points);
      timers.pop();
      timers.pop();
      }
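    /* The spreading/interpolation helpers above use a compile-time dispatch:
       they are instantiated for the maximum support (8 for float Tcalc, 16
       for double) and recursively forward to instantiations with smaller
       SUPP (first halving, then decrementing) until SUPP matches the runtime
       kernel support. This keeps all inner loops fully unrolled for every
       support actually in use, at the cost of some extra code size. */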
    void build_index(const cmav<Tcoord,2> &coords)
      {
      timers.push("building index");
      MR_assert(coords.shape(0)==npoints, "number of coords mismatch");
      MR_assert(coords.shape(1)==ndim, "ndim mismatch");
      size_t ntiles_u = (nover[0]>>log2tile) + 3;
      coord_idx.resize(npoints);
      quick_array<uint32_t> key(npoints);
      execParallel(npoints, nthreads, [&](size_t lo, size_t hi)
        {
        for (size_t i=lo; i<hi; ++i)
          key[i] = parent::template get_tile<Tcoord>({coords(i,0)})[0];
        });
      bucket_sort2(key, coord_idx, ntiles_u, nthreads);
      timers.pop();
      }
  };

template<typename Tcalc, typename Tacc, typename Tcoord>
  class Nufft<2, Tcalc, Tacc, Tcoord>:
    public Nufft_ancestor<2, Tcalc, Tacc, Tcoord>
  {
  private:
    static constexpr size_t ndim=2;

DUCC0_NUFFT_BOILERPLATE

    template<size_t supp> class HelperNu2u
      {
      public:
        static constexpr size_t vlen = mysimd<Tacc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = supp+(1<<log2tile);
        static constexpr int sv = supp+(1<<log2tile);
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tacc>> tkrn;
        vmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the current nonuniform point
        array<int,ndim> b0; // start index of the current buffer
        vmav<complex<Tacc>,ndim> gbuf;
        complex<Tacc> *px0;
        vector<Mutex> &locks;

        DUCC0_NOINLINE void dump()
          {
          if (b0[0]<-nsafe) return; // nothing written into buffer yet
          int inu = int(parent->nover[0]);
          int inv = int(parent->nover[1]);
          int idxv0 = (b0[1]+inv)%inv;
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            {
            LockGuard lock(locks[idxu]);
            for (int iv=0, idxv=idxv0; iv<sv; ++iv, idxv=(idxv+1==inv)?0:idxv+1)
              {
              grid(idxu,idxv) += complex<Tcalc>(gbuf(iu,iv));
              gbuf(iu,iv) = 0;
              }
            }
          }

      public:
        complex<Tacc> * DUCC0_RESTRICT p0;
        union kbuf {
          Tacc scalar[2*nvec*vlen];
          mysimd<Tacc> simd[2*nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperNu2u(const Nufft *parent_, vmav<complex<Tcalc>,ndim> &grid_,
          vector<Mutex> &locks_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000, -1000000}, b0{-1000000, -1000000},
            gbuf({size_t(su+1),size_t(sv)}), px0(gbuf.data()), locks(locks_) {}
        ~HelperNu2u() { dump(); }

        constexpr int lineJump() const { return sv; }

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          auto y0 = -frac[1]*2+(supp-1);
          tkrn.eval2(Tacc(x0), Tacc(y0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[1]<b0[1])
           || (i0[0]+int(supp)>b0[0]+su) || (i0[1]+int(supp)>b0[1]+sv))
            {
            dump();
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[1]=((((i0[1]+nsafe)>>log2tile)<<log2tile))-nsafe;
            }
          p0 = px0 + (i0[0]-b0[0])*sv + (i0[1]-b0[1]);
          }
      };

    template<size_t supp> class HelperU2nu
      {
      public:
        static constexpr size_t vlen = mysimd<Tcalc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = supp+(1<<log2tile);
        static constexpr int sv = supp+(1<<log2tile);
        static constexpr int svvec = max<size_t>(sv, ((supp+2*vlen-2)/vlen)*vlen);
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tcalc>> tkrn;
        const cmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the current nonuniform point
        array<int,ndim> b0; // start index of the current buffer
        vmav<Tcalc,ndim> bufri;
        const Tcalc *px0r, *px0i;

        DUCC0_NOINLINE void load()
          {
          int inu = int(parent->nover[0]);
          int inv = int(parent->nover[1]);
          int idxv0 = (b0[1]+inv)%inv;
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            for (int iv=0, idxv=idxv0; iv<sv; ++iv, idxv=(idxv+1==inv)?0:idxv+1)
              {
              bufri(2*iu  ,iv) = grid(idxu, idxv).real();
              bufri(2*iu+1,iv) = grid(idxu, idxv).imag();
              }
          }

      public:
        const Tcalc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i;
        union kbuf {
          Tcalc scalar[2*nvec*vlen];
          mysimd<Tcalc> simd[2*nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperU2nu(const Nufft *parent_, const cmav<complex<Tcalc>,ndim> &grid_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000, -1000000}, b0{-1000000, -1000000},
            bufri({size_t(2*su+1),size_t(svvec)}),
            px0r(bufri.data()), px0i(bufri.data()+svvec) {}

        constexpr int lineJump() const { return 2*svvec; }

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          auto y0 = -frac[1]*2+(supp-1);
          tkrn.eval2(Tcalc(x0), Tcalc(y0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[1]<b0[1])
           || (i0[0]+int(supp)>b0[0]+su) || (i0[1]+int(supp)>b0[1]+sv))
            {
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[1]=((((i0[1]+nsafe)>>log2tile)<<log2tile))-nsafe;
            load();
            }
          p0r = px0r + (i0[0]-b0[0])*2*svvec + (i0[1]-b0[1]);
          p0i = px0i + (i0[0]-b0[0])*2*svvec + (i0[1]-b0[1]);
          }
      };
#if 1 // data staging via union; the #else branch below uses memcpy instead
    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void spreading_helper
      (size_t supp, const cmav<Tcoord,2> &coords,
       const cmav<complex<Tpoints>,1> &points,
       vmav<complex<Tcalc>,ndim> &grid) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return spreading_helper<SUPP/2>(supp, coords, points, grid);
      if constexpr (SUPP>4)
        if (supp<SUPP) return spreading_helper<SUPP-1>(supp, coords, points, grid);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      vector<Mutex> locks(nover[0]);
      size_t chunksz = max<size_t>(1000, coord_idx.size()/(10*nthreads));
      execDynamic(coord_idx.size(), nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperNu2u<SUPP> hlp(this, grid, locks);
        constexpr auto jump = hlp.lineJump();
        const auto * DUCC0_RESTRICT ku = hlp.buf.scalar;
        const auto * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.nvec*hlp.vlen;
        constexpr size_t NVEC2 = (2*SUPP+hlp.vlen-1)/hlp.vlen;
        union Txdata{
          array<complex<Tacc>,SUPP> c;
          array<mysimd<Tacc>,NVEC2> v;
          Txdata(){for (size_t i=0; i<v.size(); ++i) v[i]=0;}
          };
        Txdata xdata;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0), coords(row,1)});
          complex<Tacc> v(points(row));
          for (size_t cv=0; cv<SUPP; ++cv)
            xdata.c[cv] = kv[cv]*v;
          Tacc * DUCC0_RESTRICT xpx = reinterpret_cast<Tacc *>(hlp.p0);
          for (size_t cu=0; cu<SUPP; ++cu)
            {
            Tacc tmpx=ku[cu];
            for (size_t cv=0; cv<NVEC2; ++cv)
              {
              auto * DUCC0_RESTRICT px = xpx+cu*2*jump+cv*hlp.vlen;
              auto tval = mysimd<Tacc>(px,element_aligned_tag());
              tval += tmpx*xdata.v[cv];
              tval.copy_to(px,element_aligned_tag());
              }
            }
          }
        });
      }
#else
    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void spreading_helper
      (size_t supp, const cmav<Tcoord,2> &coords,
       const cmav<complex<Tpoints>,1> &points,
       vmav<complex<Tcalc>,ndim> &grid) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return spreading_helper<SUPP/2>(supp, coords, points, grid);
      if constexpr (SUPP>4)
        if (supp<SUPP) return spreading_helper<SUPP-1>(supp, coords, points, grid);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      vector<Mutex> locks(nover[0]);
      size_t chunksz = max<size_t>(1000, coord_idx.size()/(10*nthreads));
      execDynamic(coord_idx.size(), nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperNu2u<SUPP> hlp(this, grid, locks);
        constexpr auto jump = hlp.lineJump();
        const auto * DUCC0_RESTRICT ku = hlp.buf.scalar;
        const auto * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.nvec*hlp.vlen;
        constexpr size_t NVEC2 = (2*SUPP+hlp.vlen-1)/hlp.vlen;
        array<complex<Tacc>,SUPP> cdata;
        array<mysimd<Tacc>,NVEC2> vdata;
        for (size_t i=0; i<vdata.size(); ++i) vdata[i]=0;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0), coords(row,1)});
          complex<Tacc> v(points(row));
          for (size_t cv=0; cv<SUPP; ++cv)
            cdata[cv] = kv[cv]*v;
          memcpy(reinterpret_cast<char *>(vdata.data()),
                 reinterpret_cast<const char *>(cdata.data()),
                 SUPP*sizeof(complex<Tacc>));
          Tacc * DUCC0_RESTRICT xpx = reinterpret_cast<Tacc *>(hlp.p0);
          for (size_t cu=0; cu<SUPP; ++cu)
            {
            Tacc tmpx=ku[cu];
            for (size_t cv=0; cv<NVEC2; ++cv)
              {
              auto * DUCC0_RESTRICT px = xpx+cu*2*jump+cv*hlp.vlen;
              auto tval = mysimd<Tacc>(px,element_aligned_tag());
              tval += tmpx*vdata[cv];
              tval.copy_to(px,element_aligned_tag());
              }
            }
          }
        });
      }
#endif

    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void interpolation_helper
      (size_t supp, const cmav<complex<Tcalc>,ndim> &grid,
       const cmav<Tcoord,2> &coords, vmav<complex<Tpoints>,1> &points) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return interpolation_helper<SUPP/2>(supp, grid, coords, points);
      if constexpr (SUPP>4)
        if (supp<SUPP) return interpolation_helper<SUPP-1>(supp, grid, coords, points);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      size_t chunksz = max<size_t>(1000, coord_idx.size()/(10*nthreads));
      execDynamic(npoints, nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperU2nu<SUPP> hlp(this, grid);
        constexpr int jump = hlp.lineJump();
        const auto * DUCC0_RESTRICT ku = hlp.buf.scalar;
        const auto * DUCC0_RESTRICT kv = hlp.buf.simd+hlp.nvec;
        constexpr size_t lookahead=3;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          if (ix+lookahead<rng.hi)
            {
            auto nextidx = sorted ? ix+lookahead : coord_idx[ix+lookahead];
            DUCC0_PREFETCH_W(&points(nextidx));
            if (!sorted) DUCC0_PREFETCH_R(&coords(nextidx,0));
            }
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0), coords(row,1)});
          mysimd<Tcalc> rr=0, ri=0;
          if constexpr (hlp.nvec==1)
            {
            for (size_t cu=0; cu<SUPP; ++cu)
              {
              const auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*jump;
              const auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*jump;
              rr += mysimd<Tcalc>(pxr,element_aligned_tag())*ku[cu];
              ri += mysimd<Tcalc>(pxi,element_aligned_tag())*ku[cu];
              }
            rr *= kv[0];
            ri *= kv[0];
            }
          else
            {
            for (size_t cu=0; cu<SUPP; ++cu)
              {
              mysimd<Tcalc> tmpr(0), tmpi(0);
              for (size_t cv=0; cv<hlp.nvec; ++cv)
                {
                const auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*jump+hlp.vlen*cv;
                const auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*jump+hlp.vlen*cv;
                tmpr += kv[cv]*mysimd<Tcalc>(pxr,element_aligned_tag());
                tmpi += kv[cv]*mysimd<Tcalc>(pxi,element_aligned_tag());
                }
              rr += ku[cu]*tmpr;
              ri += ku[cu]*tmpi;
              }
            }
          points(row) = hsum_cmplx(rr,ri);
          }
        });
      }
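    /* Layout note for the 2D helpers (illustration): HelperNu2u stages the
       products kernel_v*value as interleaved (re,im) pairs so that one SIMD
       vector holds vlen/2 complex entries, letting the u-loop update a whole
       buffer line with plain fused multiply-adds; HelperU2nu instead keeps
       separate real and imaginary rows (offset svvec apart) so that the
       final horizontal reduction can be done with hsum_cmplx. */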
    template<typename Tpoints, typename Tgrid> void nonuni2uni(bool forward,
      const cmav<Tcoord,2> &coords, const cmav<complex<Tpoints>,1> &points,
      vmav<complex<Tgrid>,ndim> &uniform)
      {
      timers.push("nu2u proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      mav_apply([](complex<Tcalc> &v){v=complex<Tcalc>(0);},nthreads,grid);
      timers.poppush("spreading");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      spreading_helper<maxsupp>(supp, coords, points, grid);
      timers.poppush("FFT");
      {
      vfmav<complex<Tcalc>> fgrid(grid);
      c2c(fgrid, fgrid, {1}, forward, Tcalc(1), nthreads);
      auto fgridl=fgrid.subarray({{},{0,(nuni[1]+1)/2}});
      c2c(fgridl, fgridl, {0}, forward, Tcalc(1), nthreads);
      if (nuni[1]>1)
        {
        auto fgridh=fgrid.subarray({{},{fgrid.shape(1)-nuni[1]/2,MAXIDX}});
        c2c(fgridh, fgridh, {0}, forward, Tcalc(1), nthreads);
        }
      }
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iout, iin] = comp_indices(i, nuni[0], nover[0], fft_order);
          for (size_t j=0; j<nuni[1]; ++j)
            {
            auto [icfv, jout, jin] = comp_indices(j, nuni[1], nover[1], fft_order);
            uniform(iout,jout) = complex<Tgrid>(grid(iin,jin)
              *Tcalc(corfac[0][icfu]*corfac[1][icfv]));
            }
          }
        });
      timers.pop();
      timers.pop();
      }

    template<typename Tpoints, typename Tgrid> void uni2nonuni(bool forward,
      const cmav<complex<Tgrid>,ndim> &uniform, const cmav<Tcoord,2> &coords,
      vmav<complex<Tpoints>,1> &points)
      {
      timers.push("u2nu proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      // only zero the parts of the grid that are not filled afterwards anyway
      {
      auto a0 = subarray<2>(grid, {{0,(nuni[0]+1)/2}, {nuni[1]/2,nover[1]-nuni[1]/2}});
      quickzero(a0, nthreads);
      }
      {
      auto a0 = subarray<2>(grid, {{(nuni[0]+1)/2, nover[0]-nuni[0]/2}, {}});
      quickzero(a0, nthreads);
      }
      if (nuni[0]>1)
        {
        auto a0 = subarray<2>(grid, {{nover[0]-nuni[0]/2,MAXIDX}, {nuni[1]/2, nover[1]-nuni[1]/2+1}});
        quickzero(a0, nthreads);
        }
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iin, iout] = comp_indices(i, nuni[0], nover[0], fft_order);
          for (size_t j=0; j<nuni[1]; ++j)
            {
            auto [icfv, jin, jout] = comp_indices(j, nuni[1], nover[1], fft_order);
            grid(iout,jout) = complex<Tcalc>(uniform(iin,jin))
              *Tcalc(corfac[0][icfu]*corfac[1][icfv]);
            }
          }
        });
      timers.poppush("FFT");
      {
      vfmav<complex<Tcalc>> fgrid(grid);
      auto fgridl=fgrid.subarray({{},{0,(nuni[1]+1)/2}});
      c2c(fgridl, fgridl, {0}, forward, Tcalc(1), nthreads);
      if (nuni[1]>1)
        {
        auto fgridh=fgrid.subarray({{},{fgrid.shape(1)-nuni[1]/2,MAXIDX}});
        c2c(fgridh, fgridh, {0}, forward, Tcalc(1), nthreads);
        }
      c2c(fgrid, fgrid, {1}, forward, Tcalc(1), nthreads);
      }
      timers.poppush("interpolation");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      interpolation_helper<maxsupp>(supp, grid, coords, points);
      timers.pop();
      timers.pop();
      }
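    /* FFT trick used above (illustration): after spreading, only the
       low-frequency part of the oversampled grid is eventually copied to the
       uniform grid, so along axis 0 the transform is carried out only on the
       column blocks that contain those frequencies; conversely, in
       uni2nonuni only the grid regions that would otherwise stay stale are
       zeroed explicitly. For an oversampling factor of 2 this saves roughly
       half of the FFT work along the partial axes. */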
    void build_index(const cmav<Tcoord,2> &coords)
      {
      timers.push("building index");
      size_t ntiles_u = (nover[0]>>log2tile) + 3;
      size_t ntiles_v = (nover[1]>>log2tile) + 3;
      coord_idx.resize(npoints);
      quick_array<uint32_t> key(npoints);
      execParallel(npoints, nthreads, [&](size_t lo, size_t hi)
        {
        for (size_t i=lo; i<hi; ++i)
          {
          auto tile = parent::template get_tile<Tcoord>({coords(i,0), coords(i,1)});
          key[i] = tile[0]*ntiles_v + tile[1];
          }
        });
      bucket_sort2(key, coord_idx, ntiles_u*ntiles_v, nthreads);
      timers.pop();
      }
  };

template<typename Tcalc, typename Tacc, typename Tcoord>
  class Nufft<3, Tcalc, Tacc, Tcoord>:
    public Nufft_ancestor<3, Tcalc, Tacc, Tcoord>
  {
  private:
    static constexpr size_t ndim=3;

DUCC0_NUFFT_BOILERPLATE

    template<size_t supp> class HelperNu2u
      {
      public:
        static constexpr size_t vlen = mysimd<Tacc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = supp+(1<<log2tile);
        static constexpr int sv = supp+(1<<log2tile);
        static constexpr int sw = supp+(1<<log2tile);
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tacc>> tkrn;
        vmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the current nonuniform point
        array<int,ndim> b0; // start index of the current buffer
#ifdef NEW_DUMP
        array<int,ndim> imin,imax;
#endif
        vmav<complex<Tacc>,ndim> gbuf;
        complex<Tacc> *px0;
        vector<Mutex> &locks;

        DUCC0_NOINLINE void dump()
          {
          if (b0[0]<-nsafe) return; // nothing written into buffer yet
          int inu = int(parent->nover[0]);
          int inv = int(parent->nover[1]);
          int inw = int(parent->nover[2]);
#ifdef NEW_DUMP
          int idxv0 = (imin[1]+b0[1]+inv)%inv;
          int idxw0 = (imin[2]+b0[2]+inw)%inw;
          for (int iu=imin[0], idxu=(imin[0]+b0[0]+inu)%inu; iu<imax[0]; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            {
            LockGuard lock(locks[idxu]);
            for (int iv=imin[1], idxv=idxv0; iv<imax[1]; ++iv, idxv=(idxv+1==inv)?0:idxv+1)
              for (int iw=imin[2], idxw=idxw0; iw<imax[2]; ++iw, idxw=(idxw+1==inw)?0:idxw+1)
                {
                auto t = gbuf(iu,iv,iw);
                grid(idxu,idxv,idxw) += complex<Tcalc>(t);
                gbuf(iu,iv,iw) = 0;
                }
            }
          imin={1000,1000,1000}; imax={-1000,-1000,-1000};
#else
          int idxv0 = (b0[1]+inv)%inv;
          int idxw0 = (b0[2]+inw)%inw;
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            {
            LockGuard lock(locks[idxu]);
            for (int iv=0, idxv=idxv0; iv<sv; ++iv, idxv=(idxv+1==inv)?0:idxv+1)
              for (int iw=0, idxw=idxw0; iw<sw; ++iw, idxw=(idxw+1==inw)?0:idxw+1)
                {
                auto t = gbuf(iu,iv,iw);
                grid(idxu,idxv,idxw) += complex<Tcalc>(t);
                gbuf(iu,iv,iw) = 0;
                }
            }
#endif
          }

      public:
        complex<Tacc> * DUCC0_RESTRICT p0;
        union kbuf {
          Tacc scalar[3*nvec*vlen];
          mysimd<Tacc> simd[3*nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperNu2u(const Nufft *parent_, vmav<complex<Tcalc>,ndim> &grid_,
          vector<Mutex> &locks_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000, -1000000, -1000000}, b0{-1000000, -1000000, -1000000},
#ifdef NEW_DUMP
            imin{1000,1000,1000},imax{-1000,-1000,-1000},
#endif
            gbuf({size_t(su),size_t(sv),size_t(sw)}),
            px0(gbuf.data()), locks(locks_) {}
        ~HelperNu2u() { dump(); }

        constexpr int lineJump() const { return sw; }
        constexpr int planeJump() const { return sv*sw; }

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          auto y0 = -frac[1]*2+(supp-1);
          auto z0 = -frac[2]*2+(supp-1);
          tkrn.eval3(Tacc(x0), Tacc(y0), Tacc(z0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[1]<b0[1]) || (i0[2]<b0[2])
           || (i0[0]+int(supp)>b0[0]+su) || (i0[1]+int(supp)>b0[1]+sv)
           || (i0[2]+int(supp)>b0[2]+sw))
            {
            dump();
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[1]=((((i0[1]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[2]=((((i0[2]+nsafe)>>log2tile)<<log2tile))-nsafe;
            }
#ifdef NEW_DUMP
          for (size_t i=0; i<ndim; ++i)
            {
            imin[i]=min<int>(imin[i],i0[i]-b0[i]);
            imax[i]=max(imax[i],i0[i]-b0[i]+int(supp));
            }
#endif
          p0 = px0 + (i0[0]-b0[0])*sv*sw + (i0[1]-b0[1])*sw + (i0[2]-b0[2]);
          }
      };
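    /* The NEW_DUMP variant above (illustration): the plain dump() always
       writes back the full su*sv*sw buffer, even if only a few points fell
       into the current tile. When NEW_DUMP is defined, prep() tracks the
       bounding box (imin,imax) of the buffer cells actually touched, and
       dump() copies only that box, which pays off for very nonuniformly
       distributed points at the price of two extra updates per point. */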
    template<size_t supp> class HelperU2nu
      {
      public:
        static constexpr size_t vlen = mysimd<Tcalc>::size();
        static constexpr size_t nvec = (supp+vlen-1)/vlen;

      private:
        static constexpr int nsafe = (supp+1)/2;
        static constexpr int su = 2*nsafe+(1<<log2tile);
        static constexpr int sv = 2*nsafe+(1<<log2tile);
        static constexpr int sw = 2*nsafe+(1<<log2tile);
        static constexpr int swvec = max<size_t>(sw, ((supp+2*nvec-2)/nvec)*nvec);
        static constexpr double xsupp=2./supp;
        const Nufft *parent;
        TemplateKernel<supp, mysimd<Tcalc>> tkrn;
        const cmav<complex<Tcalc>,ndim> &grid;
        array<int,ndim> i0; // start index of the nonuniform point
        array<int,ndim> b0; // start index of the current buffer
        vmav<Tcalc,ndim> bufri;
        const Tcalc *px0r, *px0i;

        DUCC0_NOINLINE void load()
          {
          int inu = int(parent->nover[0]);
          int inv = int(parent->nover[1]);
          int inw = int(parent->nover[2]);
          int idxv0 = (b0[1]+inv)%inv;
          int idxw0 = (b0[2]+inw)%inw;
          for (int iu=0, idxu=(b0[0]+inu)%inu; iu<su; ++iu, idxu=(idxu+1==inu)?0:idxu+1)
            for (int iv=0, idxv=idxv0; iv<sv; ++iv, idxv=(idxv+1==inv)?0:idxv+1)
              for (int iw=0, idxw=idxw0; iw<sw; ++iw, idxw=(idxw+1==inw)?0:idxw+1)
                {
                bufri(iu,2*iv  ,iw) = grid(idxu,idxv,idxw).real();
                bufri(iu,2*iv+1,iw) = grid(idxu,idxv,idxw).imag();
                }
          }

      public:
        const Tcalc * DUCC0_RESTRICT p0r, * DUCC0_RESTRICT p0i;
        union kbuf {
          Tcalc scalar[3*nvec*vlen];
          mysimd<Tcalc> simd[3*nvec];
#if defined(_MSC_VER)
          kbuf() {}
#endif
          };
        kbuf buf;

        HelperU2nu(const Nufft *parent_, const cmav<complex<Tcalc>,ndim> &grid_)
          : parent(parent_), tkrn(*parent->krn), grid(grid_),
            i0{-1000000, -1000000, -1000000}, b0{-1000000, -1000000, -1000000},
            bufri({size_t(su+1),size_t(2*sv),size_t(swvec)}),
            px0r(bufri.data()), px0i(bufri.data()+swvec) {}

        constexpr int lineJump() const { return 2*swvec; }
        constexpr int planeJump() const { return 2*sv*swvec; }

        [[gnu::always_inline]] [[gnu::hot]] void prep(array<Tcoord,ndim> in)
          {
          array<double,ndim> frac;
          auto i0old = i0;
          parent->template getpix<Tcoord>(in, frac, i0);
          auto x0 = -frac[0]*2+(supp-1);
          auto y0 = -frac[1]*2+(supp-1);
          auto z0 = -frac[2]*2+(supp-1);
          tkrn.eval3(Tcalc(x0), Tcalc(y0), Tcalc(z0), &buf.simd[0]);
          if (i0==i0old) return;
          if ((i0[0]<b0[0]) || (i0[1]<b0[1]) || (i0[2]<b0[2])
           || (i0[0]+int(supp)>b0[0]+su) || (i0[1]+int(supp)>b0[1]+sv)
           || (i0[2]+int(supp)>b0[2]+sw))
            {
            b0[0]=((((i0[0]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[1]=((((i0[1]+nsafe)>>log2tile)<<log2tile))-nsafe;
            b0[2]=((((i0[2]+nsafe)>>log2tile)<<log2tile))-nsafe;
            load();
            }
          p0r = px0r + (i0[0]-b0[0])*2*sv*swvec + (i0[1]-b0[1])*2*swvec + (i0[2]-b0[2]);
          p0i = px0i + (i0[0]-b0[0])*2*sv*swvec + (i0[1]-b0[1])*2*swvec + (i0[2]-b0[2]);
          }
      };

    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void spreading_helper
      (size_t supp, const cmav<Tcoord,2> &coords,
       const cmav<complex<Tpoints>,1> &points,
       vmav<complex<Tcalc>,ndim> &grid) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return spreading_helper<SUPP/2>(supp, coords, points, grid);
      if constexpr (SUPP>4)
        if (supp<SUPP) return spreading_helper<SUPP-1>(supp, coords, points, grid);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      vector<Mutex> locks(nover[0]);
      size_t chunksz = max<size_t>(1000, npoints/(10*nthreads));
      execDynamic(npoints, nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperNu2u<SUPP> hlp(this, grid, locks);
        constexpr auto ljump = hlp.lineJump();
        constexpr auto pjump = hlp.planeJump();
        const auto * DUCC0_RESTRICT ku = hlp.buf.scalar;
        const auto * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec;
        const auto * DUCC0_RESTRICT kw = hlp.buf.scalar+2*hlp.vlen*hlp.nvec;
        union Txdata{
          array<complex<Tacc>,SUPP> c;
          array<Tacc,2*SUPP> f;
          Txdata(){for (size_t i=0; i<f.size(); ++i) f[i]=0;}
          };
        Txdata xdata;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0), coords(row,1), coords(row,2)});
          complex<Tacc> v(points(row));
          for (size_t cw=0; cw<SUPP; ++cw)
            xdata.c[cw] = kw[cw]*v;
          Tacc * DUCC0_RESTRICT xpx = reinterpret_cast<Tacc *>(hlp.p0);
          const auto j1 = 2*ljump;
          const auto j2 = 2*(pjump-SUPP*ljump);
          for (size_t cu=0; cu<SUPP; ++cu, xpx+=j2)
            for (size_t cv=0; cv<SUPP; ++cv, xpx+=j1)
              {
              Tacc tmp2x = Tacc(ku[cu]*kv[cv]);
              for (size_t cw=0; cw<2*SUPP; ++cw)
                xpx[cw] += tmp2x*xdata.f[cw];
              }
          }
        });
      }

    template<size_t SUPP, typename Tpoints> [[gnu::hot]] void interpolation_helper
      (size_t supp, const cmav<complex<Tcalc>,ndim> &grid,
       const cmav<Tcoord,2> &coords, vmav<complex<Tpoints>,1> &points) const
      {
      if constexpr (SUPP>=8)
        if (supp<=SUPP/2) return interpolation_helper<SUPP/2>(supp, grid, coords, points);
      if constexpr (SUPP>4)
        if (supp<SUPP) return interpolation_helper<SUPP-1>(supp, grid, coords, points);
      MR_assert(supp==SUPP, "requested support out of range");

      bool sorted = coords_sorted.size()!=0;
      size_t chunksz = max<size_t>(1000, npoints/(10*nthreads));
      execDynamic(npoints, nthreads, chunksz, [&](Scheduler &sched)
        {
        HelperU2nu<SUPP> hlp(this, grid);
        constexpr auto ljump = hlp.lineJump();
        constexpr auto pjump = hlp.planeJump();
        const auto * DUCC0_RESTRICT ku = hlp.buf.scalar;
        const auto * DUCC0_RESTRICT kv = hlp.buf.scalar+hlp.vlen*hlp.nvec;
        const auto * DUCC0_RESTRICT kw = hlp.buf.simd+2*hlp.nvec;
        while (auto rng=sched.getNext()) for(auto ix=rng.lo; ix<rng.hi; ++ix)
          {
          size_t row = sorted ? ix : coord_idx[ix];
          hlp.prep({coords(row,0), coords(row,1), coords(row,2)});
          mysimd<Tcalc> rr=0, ri=0;
          if constexpr (hlp.nvec==1)
            {
            for (size_t cu=0; cu<SUPP; ++cu)
              {
              mysimd<Tcalc> r2r=0, r2i=0;
              for (size_t cv=0; cv<SUPP; ++cv)
                {
                const auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*pjump+cv*ljump;
                const auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*pjump+cv*ljump;
                r2r += mysimd<Tcalc>(pxr,element_aligned_tag())*kv[cv];
                r2i += mysimd<Tcalc>(pxi,element_aligned_tag())*kv[cv];
                }
              rr += r2r*ku[cu];
              ri += r2i*ku[cu];
              }
            rr *= kw[0];
            ri *= kw[0];
            }
          else
            {
            for (size_t cu=0; cu<SUPP; ++cu)
              {
              mysimd<Tcalc> tmpr(0), tmpi(0);
              for (size_t cv=0; cv<SUPP; ++cv)
                {
                mysimd<Tcalc> tmp2r(0), tmp2i(0);
                for (size_t cw=0; cw<hlp.nvec; ++cw)
                  {
                  const auto * DUCC0_RESTRICT pxr = hlp.p0r+cu*pjump+cv*ljump+hlp.vlen*cw;
                  const auto * DUCC0_RESTRICT pxi = hlp.p0i+cu*pjump+cv*ljump+hlp.vlen*cw;
                  tmp2r += kw[cw]*mysimd<Tcalc>(pxr,element_aligned_tag());
                  tmp2i += kw[cw]*mysimd<Tcalc>(pxi,element_aligned_tag());
                  }
                tmpr += kv[cv]*tmp2r;
                tmpi += kv[cv]*tmp2i;
                }
              rr += ku[cu]*tmpr;
              ri += ku[cu]*tmpi;
              }
            }
          points(row) = hsum_cmplx(rr,ri);
          }
        });
      }
    template<typename Tpoints, typename Tgrid> void nonuni2uni(bool forward,
      const cmav<Tcoord,2> &coords, const cmav<complex<Tpoints>,1> &points,
      vmav<complex<Tgrid>,ndim> &uniform)
      {
      timers.push("nu2u proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      mav_apply([](complex<Tcalc> &v){v=complex<Tcalc>(0);},nthreads,grid);
      timers.poppush("spreading");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      spreading_helper<maxsupp>(supp, coords, points, grid);
      timers.poppush("FFT");
      {
      vfmav<complex<Tcalc>> fgrid(grid);
      slice slz{0,(nuni[2]+1)/2}, shz{fgrid.shape(2)-nuni[2]/2,MAXIDX};
      slice sly{0,(nuni[1]+1)/2}, shy{fgrid.shape(1)-nuni[1]/2,MAXIDX};
      c2c(fgrid, fgrid, {2}, forward, Tcalc(1), nthreads);
      auto fgridl=fgrid.subarray({{},{},slz});
      c2c(fgridl, fgridl, {1}, forward, Tcalc(1), nthreads);
      if (nuni[2]>1)
        {
        auto fgridh=fgrid.subarray({{},{},shz});
        c2c(fgridh, fgridh, {1}, forward, Tcalc(1), nthreads);
        }
      auto fgridll=fgrid.subarray({{},sly,slz});
      c2c(fgridll, fgridll, {0}, forward, Tcalc(1), nthreads);
      if (nuni[2]>1)
        {
        auto fgridlh=fgrid.subarray({{},sly,shz});
        c2c(fgridlh, fgridlh, {0}, forward, Tcalc(1), nthreads);
        }
      if (nuni[1]>1)
        {
        auto fgridhl=fgrid.subarray({{},shy,slz});
        c2c(fgridhl, fgridhl, {0}, forward, Tcalc(1), nthreads);
        if (nuni[2]>1)
          {
          auto fgridhh=fgrid.subarray({{},shy,shz});
          c2c(fgridhh, fgridhh, {0}, forward, Tcalc(1), nthreads);
          }
        }
      }
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iout, iin] = comp_indices(i, nuni[0], nover[0], fft_order);
          for (size_t j=0; j<nuni[1]; ++j)
            {
            auto [icfv, jout, jin] = comp_indices(j, nuni[1], nover[1], fft_order);
            for (size_t k=0; k<nuni[2]; ++k)
              {
              auto [icfw, kout, kin] = comp_indices(k, nuni[2], nover[2], fft_order);
              uniform(iout,jout,kout) = complex<Tgrid>(grid(iin,jin,kin)
                *Tcalc(corfac[0][icfu]*corfac[1][icfv]*corfac[2][icfw]));
              }
            }
          }
        });
      timers.pop();
      timers.pop();
      }

    template<typename Tpoints, typename Tgrid> void uni2nonuni(bool forward,
      const cmav<complex<Tgrid>,ndim> &uniform, const cmav<Tcoord,2> &coords,
      vmav<complex<Tpoints>,1> &points)
      {
      timers.push("u2nu proper");
      timers.push("allocating grid");
      auto grid = vmav<complex<Tcalc>,ndim>::build_noncritical(nover, UNINITIALIZED);
      timers.poppush("zeroing grid");
      // TODO: not all entries need to be zeroed, perhaps some time can be saved here
      mav_apply([](complex<Tcalc> &v){v=complex<Tcalc>(0);},nthreads,grid);
      timers.poppush("grid correction");
      execParallel(nuni[0], nthreads, [&](size_t lo, size_t hi)
        {
        for (auto i=lo; i<hi; ++i)
          {
          auto [icfu, iin, iout] = comp_indices(i, nuni[0], nover[0], fft_order);
          for (size_t j=0; j<nuni[1]; ++j)
            {
            auto [icfv, jin, jout] = comp_indices(j, nuni[1], nover[1], fft_order);
            for (size_t k=0; k<nuni[2]; ++k)
              {
              auto [icfw, kin, kout] = comp_indices(k, nuni[2], nover[2], fft_order);
              grid(iout,jout,kout) = complex<Tcalc>(uniform(iin,jin,kin))
                *Tcalc(corfac[0][icfu]*corfac[1][icfv]*corfac[2][icfw]);
              }
            }
          }
        });
      timers.poppush("FFT");
      {
      vfmav<complex<Tcalc>> fgrid(grid);
      slice slz{0,(nuni[2]+1)/2}, shz{fgrid.shape(2)-nuni[2]/2,MAXIDX};
      slice sly{0,(nuni[1]+1)/2}, shy{fgrid.shape(1)-nuni[1]/2,MAXIDX};
      auto fgridll=fgrid.subarray({{},sly,slz});
      c2c(fgridll, fgridll, {0}, forward, Tcalc(1), nthreads);
      if (nuni[2]>1)
        {
        auto fgridlh=fgrid.subarray({{},sly,shz});
        c2c(fgridlh, fgridlh, {0}, forward, Tcalc(1), nthreads);
        }
      if (nuni[1]>1)
        {
        auto fgridhl=fgrid.subarray({{},shy,slz});
        c2c(fgridhl, fgridhl, {0}, forward, Tcalc(1), nthreads);
        if (nuni[2]>1)
          {
          auto fgridhh=fgrid.subarray({{},shy,shz});
          c2c(fgridhh, fgridhh, {0}, forward, Tcalc(1), nthreads);
          }
        }
      auto fgridl=fgrid.subarray({{},{},slz});
      c2c(fgridl, fgridl, {1}, forward, Tcalc(1), nthreads);
      if (nuni[2]>1)
        {
        auto fgridh=fgrid.subarray({{},{},shz});
        c2c(fgridh, fgridh, {1}, forward, Tcalc(1), nthreads);
        }
      c2c(fgrid, fgrid, {2}, forward, Tcalc(1), nthreads);
      }
      timers.poppush("interpolation");
      constexpr size_t maxsupp = is_same<Tcalc, float>::value ? 8 : 16;
      interpolation_helper<maxsupp>(supp, grid, coords, points);
      timers.pop();
      timers.pop();
      }
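    /* Index-building note (illustration for the function below): in 3D a
       single-pass bucket sort over full-resolution tile indices could need
       excessive bucket memory. The code therefore refines the sort
       granularity adaptively: starting from the tile size 2^log2tile it
       subdivides each tile into 2^ssmall sub-tiles per axis as long as the
       total bucket count stays below 2^28, and packs the sub-tile bits into
       the low bits of the key, (hikey<<(3*ssmall))|lowkey, so the resulting
       order is contiguous per tile and still local within each tile. */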
    void build_index(const cmav<Tcoord,2> &coords)
      {
      timers.push("building index");
      size_t ntiles_u = (nover[0]>>log2tile) + 3;
      size_t ntiles_v = (nover[1]>>log2tile) + 3;
      size_t ntiles_w = (nover[2]>>log2tile) + 3;
      size_t lsq2 = log2tile;
      while ((lsq2>=1) &&
             (((ntiles_u*ntiles_v*ntiles_w)<<(3*(log2tile-lsq2)))<(size_t(1)<<28)))
        --lsq2;
      auto ssmall = log2tile-lsq2;
      auto msmall = (size_t(1)<<ssmall)-1;
      coord_idx.resize(npoints);
      quick_array<uint64_t> key(npoints);
      execParallel(npoints, nthreads, [&](size_t lo, size_t hi)
        {
        for (size_t i=lo; i<hi; ++i)
          {
          auto tile = parent::template get_tile<Tcoord>(
            {coords(i,0),coords(i,1),coords(i,2)},lsq2);
          auto lowkey = ((tile[0]&msmall)<<(2*ssmall))
                      | ((tile[1]&msmall)<<   ssmall)
                      |  (tile[2]&msmall);
          auto hikey = ((tile[0]>>ssmall)*ntiles_v*ntiles_w)
                     + ((tile[1]>>ssmall)*ntiles_w)
                     +  (tile[2]>>ssmall);
          key[i] = (hikey<<(3*ssmall)) | lowkey;
          }
        });
      bucket_sort2(key, coord_idx, (ntiles_u*ntiles_v*ntiles_w)<<(3*ssmall), nthreads);
      timers.pop();
      }
  };

#undef DUCC0_NUFFT_BOILERPLATE

template<typename Tcalc, typename Tacc, typename Tpoints, typename Tgrid,
  typename Tcoord> void nu2u(const cmav<Tcoord,2> &coord,
  const cmav<complex<Tpoints>,1> &points, bool forward, double epsilon,
  size_t nthreads, vfmav<complex<Tgrid>> &uniform, size_t verbosity,
  double sigma_min, double sigma_max, double periodicity, bool fft_order)
  {
  auto ndim = uniform.ndim();
  MR_assert((ndim>=1) && (ndim<=3), "transform must be 1D/2D/3D");
  MR_assert(ndim==coord.shape(1), "dimensionality mismatch");
  if (ndim==1)
    {
    vmav<complex<Tgrid>,1> uniform2(uniform);
    Nufft<1, Tcalc, Tacc, Tcoord> nufft(true, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.nu2u(forward, verbosity, coord, points, uniform2);
    }
  else if (ndim==2)
    {
    vmav<complex<Tgrid>,2> uniform2(uniform);
    Nufft<2, Tcalc, Tacc, Tcoord> nufft(true, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.nu2u(forward, verbosity, coord, points, uniform2);
    }
  else if (ndim==3)
    {
    vmav<complex<Tgrid>,3> uniform2(uniform);
    Nufft<3, Tcalc, Tacc, Tcoord> nufft(true, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.nu2u(forward, verbosity, coord, points, uniform2);
    }
  }

template<typename Tcalc, typename Tacc, typename Tpoints, typename Tgrid,
  typename Tcoord> void u2nu(const cmav<Tcoord,2> &coord,
  const cfmav<complex<Tgrid>> &uniform, bool forward, double epsilon,
  size_t nthreads, vmav<complex<Tpoints>,1> &points, size_t verbosity,
  double sigma_min, double sigma_max, double periodicity, bool fft_order)
  {
  auto ndim = uniform.ndim();
  MR_assert((ndim>=1) && (ndim<=3), "transform must be 1D/2D/3D");
  MR_assert(ndim==coord.shape(1), "dimensionality mismatch");
  if (ndim==1)
    {
    cmav<complex<Tgrid>,1> uniform2(uniform);
    Nufft<1, Tcalc, Tacc, Tcoord> nufft(false, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.u2nu(forward, verbosity, uniform2, coord, points);
    }
  else if (ndim==2)
    {
    cmav<complex<Tgrid>,2> uniform2(uniform);
    Nufft<2, Tcalc, Tacc, Tcoord> nufft(false, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.u2nu(forward, verbosity, uniform2, coord, points);
    }
  else if (ndim==3)
    {
    cmav<complex<Tgrid>,3> uniform2(uniform);
    Nufft<3, Tcalc, Tacc, Tcoord> nufft(false, points.shape(0), uniform2.shape(),
      epsilon, nthreads, sigma_min, sigma_max, periodicity, fft_order);
    nufft.u2nu(forward, verbosity, uniform2, coord, points);
    }
  }

} // namespace detail_nufft

// public names
using detail_nufft::findNufftKernel;
using detail_nufft::u2nu;
using detail_nufft::nu2u;
using detail_nufft::Nufft;

} // namespace ducc0

#endif