/* This file is part of the ducc FFT library.

   Copyright (C) 2010-2023 Max-Planck-Society
   Copyright (C) 2019 Peter Bell

   For the odd-sized DCT-IV transforms:
     Copyright (C) 2003, 2007-14 Matteo Frigo
     Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology

   Authors: Martin Reinecke, Peter Bell */

/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */

/*
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice, this
  list of conditions and the following disclaimer in the documentation and/or
  other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its contributors may
  be used to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 *  This code is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This code is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this code; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef DUCC0_FFT_H
#define DUCC0_FFT_H

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdlib>
#include <complex>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <type_traits>
#include <vector>
#include "ducc0/infra/useful_macros.h"
#include "ducc0/infra/error_handling.h"
#include "ducc0/infra/threading.h"
#include "ducc0/infra/misc_utils.h"
#include "ducc0/infra/simd.h"
#include "ducc0/infra/mav.h"
#include "ducc0/infra/aligned_array.h"
#include "ducc0/math/cmplx.h"
#include "ducc0/math/unity_roots.h"
#include "ducc0/fft/fft1d.h"

/** \file fft.h
 *  Implementation of multi-dimensional Fast Fourier and related transforms
 *  \copyright Copyright (C) 2010-2021 Max-Planck-Society
 *  \copyright Copyright (C) 2019 Peter Bell
 *  \copyright
 *  \copyright For the odd-sized DCT-IV transforms:
 *  \copyright   Copyright (C) 2003, 2007-14 Matteo Frigo
 *  \copyright   Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
 *
 *  \authors Martin Reinecke, Peter Bell
 */

namespace ducc0 {

namespace detail_fft {

// the next line is necessary to address some sloppy name choices in hipSYCL
using std::min, std::max;
using std::is_same, std::copy_n, std::make_unique, std::vector, std::find,
      std::sort, std::swap;

template<typename T> constexpr inline size_t fft_simdlen
  = min<size_t>(8, native_simd<T>::size());
template<> constexpr inline size_t fft_simdlen<double>
  = min<size_t>(4, native_simd<double>::size());
template<> constexpr inline size_t fft_simdlen<float>
  = min<size_t>(8, native_simd<float>::size());
template<typename T> using fft_simd
  = typename simd_select<T,fft_simdlen<T>>::type;
template<typename T> constexpr inline bool fft_simd_exists
  = (fft_simdlen<T> > 1);

using shape_t=fmav_info::shape_t;
using stride_t=fmav_info::stride_t;

constexpr bool FORWARD  = true,
               BACKWARD = false;

struct util // hack to avoid duplicate symbols
  {
  static void sanity_check_axes(size_t ndim, const shape_t &axes)
    {
    if (ndim==1)
      {
      if ((axes.size()!=1) || (axes[0]!=0))
        throw std::invalid_argument("bad axes");
      return;
      }
    shape_t tmp(ndim,0);
    if (axes.empty()) throw std::invalid_argument("no axes specified");
    for (auto ax : axes)
      {
      if (ax>=ndim) throw std::invalid_argument("bad axis number");
      if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
      }
    }

  DUCC0_NOINLINE static void sanity_check_onetype(const fmav_info &a1,
    const fmav_info &a2, bool inplace, const shape_t &axes)
    {
    sanity_check_axes(a1.ndim(), axes);
    MR_assert(a1.conformable(a2), "array sizes are not conformable");
    if (inplace) MR_assert(a1.stride()==a2.stride(), "stride mismatch");
    }

  DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac,
    const fmav_info &ar, const shape_t &axes)
    {
    sanity_check_axes(ac.ndim(), axes);
    MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch");
    for (size_t i=0; i<ac.ndim(); ++i)
      MR_assert(ac.shape(i) == ((i==axes.back()) ? (ar.shape(i)/2+1) : ar.shape(i)),
        "axis length mismatch");
    }

  DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac,
    const fmav_info &ar, const size_t axis)
    {
    if (axis>=ac.ndim()) throw std::invalid_argument("bad axis number");
    MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch");
    for (size_t i=0; i<ac.ndim(); ++i)
      MR_assert(ac.shape(i) == ((i==axis) ? (ar.shape(i)/2+1) : ar.shape(i)),
        "axis length mismatch");
    }

  // heuristic: how many threads can do useful work on this problem?
  static size_t thread_count(size_t nthreads, const fmav_info &info,
    size_t axis, size_t vlen)
    {
    if (nthreads==1) return 1;
    size_t size = info.size();
    size_t parallel = size / (info.shape(axis) * vlen);
    if (info.shape(axis) < 1000)
      parallel /= 4;
    return max(size_t(1), min(parallel, nthreads));
    }
  };


//
// sine/cosine transforms using FFT infrastructure
//

template<typename T0> class T_dct1
  {
  private:
    pocketfft_r<T0> fftplan;

  public:
    DUCC0_NOINLINE T_dct1(size_t length, bool /*vectorize*/=false)
      : fftplan(2*(length-1)) {}

    template<typename T> DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct,
      bool ortho, int /*type*/, bool /*cosine*/, size_t nthreads=1) const
      {
      constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
      size_t N=fftplan.length(), n=N/2+1;
      if (ortho)
        { c[0]*=sqrt2; c[n-1]*=sqrt2; }
      auto tmp=&buf[0];
      tmp[0] = c[0];
      // mirror the input into an even sequence of length 2*(n-1)
      for (size_t i=1; i<n; ++i)
        tmp[i] = tmp[N-i] = c[i];
      auto res = fftplan.exec(tmp, &buf[N], fct, true, nthreads);
      c[0] = res[0];
      for (size_t i=1; i<n; ++i)
        c[i] = res[2*i-1];
      if (ortho)
        { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
      return c;
      }

    template<typename T> DUCC0_NOINLINE void exec_copyback(T c[], T buf[],
      T0 fct, bool ortho, int /*type*/, bool /*cosine*/,
      size_t nthreads=1) const
      { exec(c, buf, fct, ortho, 1, true, nthreads); }

    template<typename T> DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho,
      int /*type*/,
bool /*cosine*/, size_t nthreads=1) const { quick_array buf(bufsize()); exec_copyback(c, buf.data(), fct, ortho, 1, true, nthreads); } size_t length() const { return fftplan.length()/2+1; } size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } }; template class T_dst1 { private: pocketfft_r fftplan; public: DUCC0_NOINLINE T_dst1(size_t length, bool /*vectorize*/=false) : fftplan(2*(length+1)) {} template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const { size_t N=fftplan.length(), n=N/2-1; auto tmp = &buf[0]; tmp[0] = tmp[n+1] = c[0]*0; for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const { exec(c, buf, fct, true, 1, false, nthreads); } template DUCC0_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads) const { quick_array buf(bufsize()); exec_copyback(c, buf.data(), fct, true, 1, false, nthreads); } size_t length() const { return fftplan.length()/2-1; } size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } }; template class T_dcst23 { private: pocketfft_r fftplan; std::vector twiddle; public: DUCC0_NOINLINE T_dcst23(size_t length, bool /*vectorize*/=false) : fftplan(length), twiddle(length) { UnityRoots> tw(4*length); for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, int type, bool cosine, size_t nthreads=1) const { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); size_t N=length(); size_t NS2 = (N+1)/2; if (type==2) { c[0] *= 2; if ((N&1)==0) c[N-1]*=2; if (cosine) for (size_t k=1; k DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool ortho, int type, bool cosine, size_t nthreads=1) const { exec(c, buf, fct, ortho, type, cosine, nthreads); } template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, int type, bool cosine, size_t nthreads=1) const { quick_array buf(bufsize()); exec(c, &buf[0], fct, ortho, type, cosine, nthreads); } size_t length() const { return fftplan.length(); } size_t bufsize() const { return fftplan.bufsize(); } }; template class T_dcst4 { private: size_t N; std::unique_ptr> fft; std::unique_ptr> rfft; quick_array> C2; size_t bufsz; public: DUCC0_NOINLINE T_dcst4(size_t length, bool /*vectorize*/=false) : N(length), fft((N&1) ? nullptr : make_unique>(N/2)), rfft((N&1)? make_unique>(N) : nullptr), C2((N&1) ? 0 : N/2), bufsz((N&1) ? (N+rfft->bufsize()) : (N+2*fft->bufsize())) { if ((N&1)==0) { UnityRoots> tw(16*N); for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads) const { size_t n2 = N/2; if (!cosine) for (size_t k=0, kc=N-1; kexec(y, y+N, fct, true, nthreads); { auto SGN = [](size_t i) { constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); return (i&2) ? 
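// bit 1 of the index selects the sign, giving the repeating pattern +,+,-,-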
-sqrt2 : sqrt2; }; c[n2] = res[0]*SGN(n2+1); size_t i=0, i1=1, k=1; for (; k *>(buf); for(size_t i=0; iexec(y2, y2+N/2, fct, true, nthreads); for(size_t i=0, ic=n2-1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const { exec(c, buf, fct, true, 4, cosine, nthreads); } template DUCC0_NOINLINE void exec(T c[], T0 fct, bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const { quick_array buf(bufsize()); exec(c, &buf[0], fct, true, 4, cosine, nthreads); } size_t length() const { return N; } size_t bufsize() const { return bufsz; } }; // // multi-D infrastructure // template std::shared_ptr get_plan(size_t length, bool vectorize=false) { #ifdef DUCC0_NO_FFT_CACHE return std::make_shared(length, vectorize); #else constexpr size_t nmax=10; struct entry { size_t n; bool vectorize; std::shared_ptr ptr; }; static std::array cache{{{0,0,nullptr}}}; static std::array last_access{{0}}; static size_t access_counter = 0; static Mutex mut; auto find_in_cache = [&]() -> std::shared_ptr { for (size_t i=0; i(length, vectorize); { LockGuard lock(mut); auto p = find_in_cache(); if (p) return p; size_t lru = 0; for (size_t i=1; i class multi_iter { private: shape_t shp, pos; stride_t str_i, str_o; size_t cshp_i, cshp_o, rem; ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N]; bool uni_i, uni_o; void advance_i() { for (size_t i=0; i=1, "not enough dimensions"); // Sort the extraneous dimensions in order of ascending output stride; // this should improve overall cache re-use and avoid clashes between // threads as much as possible. shape_t idx(iarr.ndim()); std::iota(idx.begin(), idx.end(), 0); sort(idx.begin(), idx.end(), [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);}); for (auto i: idx) if (i!=idim) { pos.push_back(0); MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch"); shp.push_back(iarr.shape(i)); str_i.push_back(iarr.stride(i)); str_o.push_back(oarr.stride(i)); } MR_assert(idim0) { sstr_i = str_i[0]; sstr_o = str_o[0]; } if (nshares==1) return; if (nshares==0) throw std::runtime_error("can't run with zero threads"); if (myshare>=nshares) throw std::runtime_error("impossible share requested"); auto [lo, hi] = calcShare(nshares, myshare, rem); size_t todo = hi-lo; size_t chunk = rem; for (size_t i2=0, i=pos.size()-1; i2(stride_in() *tsz)&4095)==0) || ((abs(stride_out()*tsz)&4095)==0); } bool critical_stride_other(size_t tsz) const { if (unistride_i()==0) return false; // it's just one transform return ((abs(unistride_i()*tsz)&4095)==0) || ((abs(unistride_o()*tsz)&4095)==0); } }; template class TmpStorage { private: aligned_array d; size_t dofs, dstride; public: TmpStorage(size_t n_trafo, size_t bufsize_data, size_t bufsize_trafo, size_t n_simultaneous, bool inplace) { if (inplace) { d.resize(bufsize_trafo); return; } constexpr auto vlen = fft_simdlen; // FIXME: when switching to C++20, use bit_floor(othersize) size_t buffct = std::min(vlen, n_trafo); size_t datafct = std::min(vlen, n_trafo); if (n_trafo>=n_simultaneous*vlen) datafct = n_simultaneous*vlen; dstride = bufsize_data; // critical stride avoidance if ((dstride&256)==0) dstride+=3; d.resize(buffct*(bufsize_trafo+17) + datafct*dstride); dofs = bufsize_trafo + 17; } template T2 *transformBuf() { return reinterpret_cast(d.data()); } template T2 *dataBuf() { return reinterpret_cast(d.data()) + dofs; } size_t data_stride() const { return dstride; } }; template class TmpStorage2 { private: TmpStorage &stg; public: using 
datatype = T2; TmpStorage2(TmpStorage &stg_): stg(stg_) {} T2 *transformBuf() { return stg.template transformBuf(); } T2 *dataBuf() { return stg.template dataBuf(); } size_t data_stride() const { return stg.data_stride(); } }; // Yes, this looks strange. But this is currently the only way I found to // stop compilers from vectorizing the copying loops and messing up the ordering // of the memory accesses, which is really important here. template DUCC0_NOINLINE void copy_inputx2(const Titer &it, const cfmav> &src, Ts *DUCC0_RESTRICT dst, size_t vlen) { for (size_t i=0; i DUCC0_NOINLINE void copy_inputx(const Titer &it, const cfmav> &src, Ts *DUCC0_RESTRICT dst, size_t vlen) { if (it.stride_in()==1) return copy_inputx2(it, src, dst, vlen); for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav> &src, Cmplx *DUCC0_RESTRICT dst) { constexpr auto vlen=Tsimd::size(); copy_inputx(it, src, reinterpret_cast(dst),vlen); } template DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav &src, Tsimd *DUCC0_RESTRICT dst) { constexpr auto vlen=Tsimd::size(); for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav &src, T *DUCC0_RESTRICT dst) { if (dst == &src.raw(it.iofs(0))) return; // in-place for (size_t i=0; i DUCC0_NOINLINE void copy_outputx2(const Titer &it, const Ts *DUCC0_RESTRICT src, vfmav> &dst, size_t vlen) { Cmplx * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_outputx(const Titer &it, const Ts *DUCC0_RESTRICT src, vfmav> &dst, size_t vlen) { if (it.stride_out()==1) return copy_outputx2(it,src,dst,vlen); Cmplx * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const Cmplx *DUCC0_RESTRICT src, vfmav> &dst) { constexpr auto vlen=Tsimd::size(); copy_outputx(it, reinterpret_cast(src), dst, vlen); } template DUCC0_NOINLINE void copy_output(const Titer &it, const Tsimd *DUCC0_RESTRICT src, vfmav &dst) { constexpr auto vlen=Tsimd::size(); auto ptr=dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const T *DUCC0_RESTRICT src, vfmav &dst) { auto ptr=dst.data(); if (src == &dst.raw(it.oofs(0))) return; // in-place for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav> &src, Cmplx *dst, size_t nvec, size_t vstr) { constexpr auto vlen=Tsimd::size(); for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav> &src, Cmplx *dst, size_t nvec, size_t vstr) { for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav &src, Tsimd *dst, size_t nvec, size_t vstr) { constexpr auto vlen=Tsimd::size(); for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, const cfmav &src, T *dst, size_t nvec, size_t vstr) { for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const Cmplx *src, vfmav> &dst, size_t nvec, size_t vstr) { constexpr auto vlen=Tsimd::size(); Cmplx * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const Cmplx *src, vfmav> &dst, size_t nvec, size_t vstr) { Cmplx * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const Tsimd *src, vfmav &dst, size_t nvec, size_t vstr) { constexpr auto vlen=Tsimd::size(); typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, const T *src, vfmav &dst, size_t nvec, size_t vstr) { T * DUCC0_RESTRICT ptr = dst.data(); for (size_t i=0; i struct 
add_vec { using type = typename simd_select::type; }; template struct add_vec, vlen> { using type = Cmplx::type>; }; template using add_vec_t = typename add_vec::type; template DUCC0_NOINLINE void general_nd(const cfmav &in, vfmav &out, const shape_t &axes, T0 fct, size_t nthreads, const Exec &exec, const bool /*allow_inplace*/=true) { if ((in.ndim()==1)&&(in.stride(0)==1)&&(out.stride(0)==1)) { auto plan = get_plan(in.shape(0), true); exec.exec_simple(in.data(), out.data(), *plan, fct, nthreads); return; } std::shared_ptr plan; size_t nth1d = (in.ndim()==1) ? nthreads : 1; for (size_t iax=0; iaxlength())) plan = get_plan(len, in.ndim()==1); execParallel(util::thread_count(nthreads, in, axes[iax], fft_simdlen), [&](Scheduler &sched) { constexpr auto vlen = fft_simdlen; constexpr size_t nmax = 16; const auto &tin(iax==0? in : out); multi_iter it(tin, out, axes[iax], sched.num_threads(), sched.thread_num()); //size_t working_set_size = (2-(tin.data()==out.data()))*len*sizeof(T)+plan->bufsize()*sizeof(T); size_t working_set_size = sizeof(T)*(len+plan->bufsize()); size_t n_simul=1; while(n_simul*2*working_set_size<=262144) n_simul*=2; n_simul = min(n_simul, vlen); size_t n_bunch = n_simul; if ((in.stride(axes[iax])!=1) || (out.stride(axes[iax])!=1)) // might make sense to bunch //while ((n_bunch storage(in.size()/len, len, plan->bufsize(), (n_bunch+vlen-1)/vlen, inplace); // first, do all possible steps of size n_bunch, then n_simul if (n_bunch>1) { #ifndef DUCC0_NO_SIMD if constexpr (vlen>1) { constexpr size_t lvlen = vlen; if (n_simul>=lvlen) { if (it.remaining()>=n_bunch) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=n_bunch) { it.advance(n_bunch); exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); } } } if (n_simul==lvlen) { if (it.remaining()>=lvlen) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=lvlen) { it.advance(lvlen); exec(it, tin, out, storage2, *plan, fct, nth1d); } } } } if constexpr ((vlen>2) && (simd_exists)) { constexpr size_t lvlen = vlen/2; if (n_simul>=lvlen) { if (it.remaining()>=n_bunch) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=n_bunch) { it.advance(n_bunch); exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); } } } if (n_simul==lvlen) { if (it.remaining()>=lvlen) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=lvlen) { it.advance(lvlen); exec(it, tin, out, storage2, *plan, fct, nth1d); } } } } if constexpr ((vlen>4) && (simd_exists)) { constexpr size_t lvlen = vlen/4; if (n_simul>=lvlen) { if (it.remaining()>=n_bunch) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=n_bunch) { it.advance(n_bunch); exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); } } } if (n_simul==lvlen) { if (it.remaining()>=lvlen) { TmpStorage2,T,T0> storage2(storage); while (it.remaining()>=lvlen) { it.advance(lvlen); exec(it, tin, out, storage2, *plan, fct, nth1d); } } } } #endif { TmpStorage2 storage2(storage); while (it.remaining()>=n_bunch) { it.advance(n_bunch); exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch, nth1d); } } } { TmpStorage2 storage2(storage); while (it.remaining()>0) { it.advance(1); exec(it, tin, out, storage2, *plan, fct, nth1d, inplace); } } }); // end of parallel region fct = T0(1); // factor has been applied, use 1 for remaining axes } } struct ExecC2C { bool forward; template DUCC0_NOINLINE void operator() ( const Titer &it, const cfmav> &in, vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, size_t nthreads, bool 
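// 'inplace' requests that the result be computed directly in 'out',
// avoiding the bounce through the data buffer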
inplace=false) const { using T = typename Tstorage::datatype; if constexpr(is_same, T>::value) if (inplace) { if (in.data()!=out.data()) copy_input(it, in, out.data()+it.oofs(0)); plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); return; } T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2); auto res = plan.exec(buf2, buf1, fct, forward, nthreads); copy_output(it, res, out); } template DUCC0_NOINLINE void exec_n ( const Titer &it, const cfmav> &in, vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, size_t nvec, size_t nthreads) const { using T = typename Tstorage::datatype; size_t dstr = storage.data_stride(); T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2, nvec, dstr); for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( const Cmplx *in, Cmplx *out, const pocketfft_c &plan, T0 fct, size_t nthreads) const { if (in!=out) copy_n(in, plan.length(), out); plan.exec(out, fct, forward, nthreads); } }; struct ExecHartley { template DUCC0_NOINLINE void operator() ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nthreads, bool inplace=false) const { using T = typename Tstorage::datatype; if constexpr(is_same::value) if (inplace) { if (in.data()!=out.data()) copy_input(it, in, out.data()+it.oofs(0)); plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); return; } T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2); auto res = plan.exec(buf2, buf1, fct, nthreads); copy_output(it, res, out); } template DUCC0_NOINLINE void exec_n ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nvec, size_t nthreads) const { using T = typename Tstorage::datatype; size_t dstr = storage.data_stride(); T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2, nvec, dstr); for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( const T0 *in, T0 *out, const pocketfft_hartley &plan, T0 fct, size_t nthreads) const { if (in!=out) copy_n(in, plan.length(), out); plan.exec(out, fct, nthreads); } }; struct ExecFHT { template DUCC0_NOINLINE void operator() ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nthreads, bool inplace=false) const { using T = typename Tstorage::datatype; if constexpr(is_same::value) if (inplace) { if (in.data()!=out.data()) copy_input(it, in, out.data()+it.oofs(0)); plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); return; } T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2); auto res = plan.exec(buf2, buf1, fct, nthreads); copy_output(it, res, out); } template DUCC0_NOINLINE void exec_n ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nvec, size_t nthreads) const { using T = typename Tstorage::datatype; size_t dstr = storage.data_stride(); T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2, nvec, dstr); for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( const T0 *in, T0 *out, const pocketfft_fht &plan, T0 fct, size_t nthreads) const { if (in!=out) copy_n(in, plan.length(), out); plan.exec(out, fct, nthreads); } }; struct ExecFFTW { bool forward; template DUCC0_NOINLINE void operator() ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_fftw 
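// real-FFT plan whose data layout follows FFTW's halfcomplex ordering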
&plan, T0 fct, size_t nthreads, bool inplace=false) const { using T = typename Tstorage::datatype; if constexpr(is_same::value) if (inplace) { if (in.data()!=out.data()) copy_input(it, in, out.data()+it.oofs(0)); plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); return; } T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2); auto res = plan.exec(buf2, buf1, fct, forward, nthreads); copy_output(it, res, out); } template DUCC0_NOINLINE void exec_n ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nvec, size_t nthreads) const { using T = typename Tstorage::datatype; size_t dstr = storage.data_stride(); T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2, nvec, dstr); for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( const T0 *in, T0 *out, const pocketfft_fftw &plan, T0 fct, size_t nthreads) const { if (in!=out) copy_n(in, plan.length(), out); plan.exec(out, fct, forward, nthreads); } }; struct ExecDcst { bool ortho; int type; bool cosine; template DUCC0_NOINLINE void operator() (const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nthreads, bool inplace=false) const { using T = typename Tstorage::datatype; if constexpr(is_same::value) if (inplace) { if (in.data()!=out.data()) copy_input(it, in, out.data()+it.oofs(0)); plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, ortho, type, cosine, nthreads); return; } T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2); auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine, nthreads); copy_output(it, res, out); } template DUCC0_NOINLINE void exec_n ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nvec, size_t nthreads) const { using T = typename Tstorage::datatype; size_t dstr = storage.data_stride(); T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); copy_input(it, in, buf2, nvec, dstr); for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( const T0 *in, T0 *out, const Tplan &plan, T0 fct, size_t nthreads) const { if (in!=out) copy_n(in, plan.length(), out); plan.exec(out, fct, ortho, type, cosine, nthreads); } }; template DUCC0_NOINLINE void general_r2c( const cfmav &in, vfmav> &out, size_t axis, bool forward, T fct, size_t nthreads) { size_t nth1d = (in.ndim()==1) ? 
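// a single 1-D transform may use all threads internally; in the multi-D
// case parallelism comes from running many 1-D transforms concurrently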
nthreads : 1; auto plan = std::make_unique>(in.shape(axis)); size_t len=in.shape(axis); execParallel( util::thread_count(nthreads, in, axis, fft_simdlen), [&](Scheduler &sched) { constexpr auto vlen = fft_simdlen; TmpStorage storage(in.size()/len, len, plan->bufsize(), 1, false); multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); #ifndef DUCC0_NO_SIMD if constexpr (vlen>1) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); while (it.remaining()>=vlen) { it.advance(vlen); copy_input(it, in, dbuf); auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); auto vout = out.data(); for (size_t j=0; j2) if constexpr (simd_exists) if (it.remaining()>=vlen/2) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); it.advance(vlen/2); copy_input(it, in, dbuf); auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); auto vout = out.data(); for (size_t j=0; j4) if constexpr( simd_exists) if (it.remaining()>=vlen/4) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); it.advance(vlen/4); copy_input(it, in, dbuf); auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); auto vout = out.data(); for (size_t j=0; j storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); while (it.remaining()>0) { it.advance(1); copy_input(it, in, dbuf); auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); auto vout = out.data(); vout[it.oofs(0)].Set(res[0]); size_t i=1, ii=1; if (forward) for (; i DUCC0_NOINLINE void general_c2r( const cfmav> &in, vfmav &out, size_t axis, bool forward, T fct, size_t nthreads) { size_t nth1d = (in.ndim()==1) ? nthreads : 1; auto plan = std::make_unique>(out.shape(axis)); size_t len=out.shape(axis); execParallel( util::thread_count(nthreads, in, axis, fft_simdlen), [&](Scheduler &sched) { constexpr auto vlen = fft_simdlen; TmpStorage storage(out.size()/len, len, plan->bufsize(), 1, false); multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); #ifndef DUCC0_NO_SIMD if constexpr (vlen>1) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); while (it.remaining()>=vlen) { it.advance(vlen); for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); copy_output(it, res, out); } } if constexpr (vlen>2) if constexpr (simd_exists) if (it.remaining()>=vlen/2) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); it.advance(vlen/2); for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); copy_output(it, res, out); } if constexpr (vlen>4) if constexpr(simd_exists) if (it.remaining()>=vlen/4) { TmpStorage2,T,T> storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); it.advance(vlen/4); for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); copy_output(it, res, out); } #endif { TmpStorage2 storage2(storage); auto dbuf = storage2.dataBuf(); auto tbuf = storage2.transformBuf(); while (it.remaining()>0) { it.advance(1); dbuf[0]=in.raw(it.iofs(0)).r; { size_t i=1, ii=1; if (forward) for (; iexec(dbuf, tbuf, fct, false, nth1d); copy_output(it, res, out); } } }); // end of parallel region } struct ExecR2R { bool r2c, forward; template DUCC0_NOINLINE void operator() ( const Titer &it, const cfmav &in, vfmav &out, Tstorage &storage, const pocketfft_r &plan, T0 fct, size_t nthreads, bool inplace=false) const { using T = typename Tstorage::datatype; if 
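// the in-place fast path exists only when the storage datatype matches
// the array's element type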
constexpr(is_same<T0, T>::value)
      if (inplace)
        {
        T *buf1=storage.transformBuf(), *buf2=out.data();
        if (in.data()!=buf2)
          copy_input(it, in, buf2);
        if ((!r2c) && forward)
          for (size_t i=2; i<it.length_out(); i+=2)
            buf2[it.oofs(i)] = -buf2[it.oofs(i)];
        plan.exec_copyback(buf2+it.oofs(0), buf1, fct, forward, nthreads);
        if (r2c && (!forward))
          for (size_t i=2; i<it.length_out(); i+=2)
            buf2[it.oofs(i)] = -buf2[it.oofs(i)];
        return;
        }
    T *buf1=storage.transformBuf(), *buf2=storage.dataBuf();
    copy_input(it, in, buf2);
    if ((!r2c) && forward)
      for (size_t i=2; i<it.length_out(); i+=2)
        buf2[i] = -buf2[i];
    auto res = plan.exec(buf2, buf1, fct, forward, nthreads);
    if (r2c && (!forward))
      for (size_t i=2; i<it.length_out(); i+=2)
        res[i] = -res[i];
    copy_output(it, res, out);
    }

  template <typename T0, typename Tstorage, typename Titer>
  DUCC0_NOINLINE void exec_n (
    const Titer &it, const cfmav<T0> &in, vfmav<T0> &out,
    Tstorage &storage, const pocketfft_r<T0> &plan, T0 fct, size_t nvec,
    size_t nthreads) const
    {
    using T = typename Tstorage::datatype;
    size_t dstr = storage.data_stride();
    T *buf1=storage.transformBuf(), *buf2=storage.dataBuf();
    copy_input(it, in, buf2, nvec, dstr);
    if ((!r2c) && forward)
      for (size_t k=0; k<nvec; ++k)
        for (size_t i=2; i<it.length_out(); i+=2)
          buf2[k*dstr+i] = -buf2[k*dstr+i];
    for (size_t i=0; i<nvec; ++i)
      plan.exec_copyback(buf2+i*dstr, buf1, fct, forward, nthreads);
    if (r2c && (!forward))
      for (size_t k=0; k<nvec; ++k)
        for (size_t i=2; i<it.length_out(); i+=2)
          buf2[k*dstr+i] = -buf2[k*dstr+i];
    copy_output(it, buf2, out, nvec, dstr);
    }

  template <typename T0> DUCC0_NOINLINE void exec_simple (
    const T0 *in, T0 *out, const pocketfft_r<T0> &plan, T0 fct,
    size_t nthreads) const
    {
    if (in!=out) copy_n(in, plan.length(), out);
    if ((!r2c) && forward)
      for (size_t i=2; i<plan.length(); i+=2)
        out[i] = -out[i];
    plan.exec(out, fct, forward, nthreads);
    if (r2c && (!forward))
      for (size_t i=2; i<plan.length(); i+=2)
        out[i] = -out[i];
    }
  };

/// Complex-to-complex Fast Fourier Transform
/** This executes a Fast Fourier Transform on \a in and stores the result in
 *  \a out.
 *
 *  \a in and \a out must have identical shapes; they may point to the same
 *  memory; in this case their strides must also be identical.
 *
 *  \a axes specifies the axes over which the transform is carried out.
 *
 *  If \a forward is true, a minus sign will be used in the exponent.
 *
 *  No normalization factors will be applied by default; if multiplication by
 *  a constant is desired, it can be supplied in \a fct.
 *
 *  If the underlying array has more than one dimension, the computation will
 *  be distributed over \a nthreads threads.
 */
template<typename T> DUCC0_NOINLINE void c2c(const cfmav<std::complex<T>> &in,
  vfmav<std::complex<T>> &out, const shape_t &axes, bool forward,
  T fct, size_t nthreads=1)
  {
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  const auto &in2(reinterpret_cast<const cfmav<Cmplx<T>>&>(in));
  auto &out2(reinterpret_cast<vfmav<Cmplx<T>>&>(out));
  if ((axes.size()>1) && (in.data()!=out.data()))  // optimize axis order
    for (size_t i=1; i<axes.size(); ++i)
      if (in.stride(axes[i])==1)
        {
        auto axes2(axes);
        swap(axes2[i], axes2.back());
        general_nd<pocketfft_c<T>>(in2, out2, axes2, fct, nthreads,
          ExecC2C{forward});
        return;
        }
  general_nd<pocketfft_c<T>>(in2, out2, axes, fct, nthreads, ExecC2C{forward});
  }

/// Fast Discrete Cosine Transform
/** This executes a DCT on \a in and stores the result in \a out.
 *
 *  \a in and \a out must have identical shapes; they may point to the same
 *  memory; in this case their strides must also be identical.
 *
 *  \a axes specifies the axes over which the transform is carried out.
 *
 *  The inverse of a type 2 DCT is a type 3 DCT of the same length (and vice
 *  versa); types 1 and 4 are their own inverses up to normalization.
 *
 *  \a type specifies the desired type (1-4) of the transform.
 *
 *  No normalization factors will be applied by default; if multiplication by
 *  a constant is desired, it can be supplied in \a fct.
 *
 *  If \a ortho is true, the first and last array entries are corrected (if
 *  necessary) to allow an orthonormalized transform.
 *
 *  If the underlying array has more than one dimension, the computation will
 *  be distributed over \a nthreads threads.
 */
template<typename T> DUCC0_NOINLINE void dct(const cfmav<T> &in, vfmav<T> &out,
  const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  const ExecDcst exec{ortho, type, true};
  if (type==1)
    general_nd<T_dct1<T>>(in, out, axes, fct, nthreads, exec);
  else if (type==4)
    general_nd<T_dcst4<T>>(in, out, axes, fct, nthreads, exec);
  else
    general_nd<T_dcst23<T>>(in, out, axes, fct, nthreads, exec);
  }
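/* Usage sketch (illustrative only, not part of the library): an out-of-place
   forward 2-D FFT followed by a normalized in-place inverse, using the c2c
   entry point above. Shapes, values and the thread count are arbitrary
   example choices; vfmav::build_noncritical is the array factory already
   used inside this header.

     void c2c_demo()
       {
       using namespace ducc0;
       auto a = vfmav<std::complex<double>>::build_noncritical({256,256}, UNINITIALIZED);
       auto b = vfmav<std::complex<double>>::build_noncritical({256,256}, UNINITIALIZED);
       // ... fill a ...
       c2c(a, b, {0,1}, FORWARD, 1., 4);               // unnormalized forward transform
       c2c(b, b, {0,1}, BACKWARD, 1./(256.*256.), 4);  // inverse, scaled by 1/N
       }
*/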
/// Fast Discrete Sine Transform
/** This executes a DST on \a in and stores the result in \a out.
 *
 *  \a in and \a out must have identical shapes; they may point to the same
 *  memory; in this case their strides must also be identical.
 *
 *  \a axes specifies the axes over which the transform is carried out.
 *
 *  The inverse of a type 2 DST is a type 3 DST of the same length (and vice
 *  versa); types 1 and 4 are their own inverses up to normalization.
 *
 *  \a type specifies the desired type (1-4) of the transform.
 *
 *  No normalization factors will be applied by default; if multiplication by
 *  a constant is desired, it can be supplied in \a fct.
 *
 *  If \a ortho is true, the first and last array entries are corrected (if
 *  necessary) to allow an orthonormalized transform.
 *
 *  If the underlying array has more than one dimension, the computation will
 *  be distributed over \a nthreads threads.
 */
template<typename T> DUCC0_NOINLINE void dst(const cfmav<T> &in, vfmav<T> &out,
  const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1)
  {
  if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  const ExecDcst exec{ortho, type, false};
  if (type==1)
    general_nd<T_dst1<T>>(in, out, axes, fct, nthreads, exec);
  else if (type==4)
    general_nd<T_dcst4<T>>(in, out, axes, fct, nthreads, exec);
  else
    general_nd<T_dcst23<T>>(in, out, axes, fct, nthreads, exec);
  }

template<typename T> DUCC0_NOINLINE void r2c(const cfmav<T> &in,
  vfmav<std::complex<T>> &out, size_t axis, bool forward, T fct,
  size_t nthreads=1)
  {
  util::sanity_check_cr(out, in, axis);
  if (in.size()==0) return;
  auto &out2(reinterpret_cast<vfmav<Cmplx<T>>&>(out));
  general_r2c(in, out2, axis, forward, fct, nthreads);
  }

template<typename T> DUCC0_NOINLINE void r2c(const cfmav<T> &in,
  vfmav<std::complex<T>> &out, const shape_t &axes,
  bool forward, T fct, size_t nthreads=1)
  {
  util::sanity_check_cr(out, in, axes);
  if (in.size()==0) return;
  r2c(in, out, axes.back(), forward, fct, nthreads);
  if (axes.size()==1) return;
  auto newaxes = shape_t{axes.begin(), --axes.end()};
  c2c(out, out, newaxes, forward, T(1), nthreads);
  }

template<typename T> DUCC0_NOINLINE void c2r(const cfmav<std::complex<T>> &in,
  vfmav<T> &out, size_t axis, bool forward, T fct, size_t nthreads=1)
  {
  util::sanity_check_cr(in, out, axis);
  if (in.size()==0) return;
  const auto &in2(reinterpret_cast<const cfmav<Cmplx<T>>&>(in));
  general_c2r(in2, out, axis, forward, fct, nthreads);
  }

template<typename T> DUCC0_NOINLINE void c2r(const cfmav<std::complex<T>> &in,
  vfmav<T> &out, const shape_t &axes, bool forward, T fct,
  size_t nthreads=1)
  {
  if (axes.size()==1)
    return c2r(in, out, axes[0], forward, fct, nthreads);
  util::sanity_check_cr(in, out, axes);
  if (in.size()==0) return;
  auto atmp(vfmav<std::complex<T>>::build_noncritical(in.shape(), UNINITIALIZED));
  auto newaxes = shape_t{axes.begin(), --axes.end()};
  c2c(in, atmp, newaxes, forward, T(1), nthreads);
  c2r(atmp, out, axes.back(), forward, fct, nthreads);
  }

template<typename T> DUCC0_NOINLINE void c2r_mut(vfmav<std::complex<T>> &in,
  vfmav<T> &out, const shape_t &axes, bool forward, T fct,
  size_t nthreads=1)
  {
  if (axes.size()==1)
    return c2r(in, out, axes[0], forward, fct, nthreads);
  util::sanity_check_cr(in, out, axes);
  if (in.size()==0) return;
  auto newaxes = shape_t{axes.begin(), --axes.end()};
  c2c(in, in, newaxes, forward, T(1), nthreads);
  c2r(in, out, axes.back(), forward, fct, nthreads);
  }

template<typename T> DUCC0_NOINLINE void r2r_fftpack(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, bool real2hermitian, bool forward,
  T fct, size_t nthreads=1)
  {
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  general_nd<pocketfft_r<T>>(in, out, axes, fct, nthreads,
    ExecR2R{real2hermitian, forward});
  }

template<typename T> DUCC0_NOINLINE void r2r_fftw(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, bool forward,
  T fct, size_t nthreads=1)
  {
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  general_nd<pocketfft_fftw<T>>(in, out, axes, fct, nthreads,
    ExecFFTW{forward});
  }

template<typename T> DUCC0_NOINLINE void r2r_separable_hartley(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, T fct, size_t nthreads=1)
  {
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  general_nd<pocketfft_hartley<T>>(in, out, axes, fct, nthreads,
    ExecHartley{}, false);
  }

template<typename T> DUCC0_NOINLINE void r2r_separable_fht(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, T fct, size_t nthreads=1)
  {
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  general_nd<pocketfft_fht<T>>(in, out, axes, fct, nthreads,
    ExecFHT{}, false);
  }
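/* Usage sketch (illustrative only): a real-to-halfcomplex transform and its
   normalized inverse using the r2c/c2r entry points above. For a real array
   of shape {n0,n1} transformed over both axes, the complex array stores only
   the non-redundant half of the last transformed axis, i.e. n1/2+1 entries.
   All concrete sizes below are example values.

     void r2c_demo()
       {
       using namespace ducc0;
       size_t n0=100, n1=64;
       auto r = vfmav<float>::build_noncritical({n0, n1}, UNINITIALIZED);
       auto c = vfmav<std::complex<float>>::build_noncritical({n0, n1/2+1}, UNINITIALIZED);
       // ... fill r ...
       r2c(r, c, {0,1}, FORWARD, 1.f, 4);
       c2r(c, r, {0,1}, BACKWARD, 1.f/float(n0*n1), 4);  // recovers the input up to rounding
       }
*/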
template<typename T, typename Func> void hermiteHelper(size_t idim,
  ptrdiff_t iin, ptrdiff_t iout0, ptrdiff_t iout1,
  const cfmav<std::complex<T>> &c, vfmav<T> &r, const shape_t &axes,
  Func func, size_t nthreads)
  {
  auto cstr=c.stride(idim), str=r.stride(idim);
  auto len=r.shape(idim);

  if (idim+1==c.ndim()) // last dimension, not much gain in parallelizing
    {
    if (idim==axes.back()) // halfcomplex axis
      for (size_t i=0,ic=0; i<c.shape(idim); ++i, ic=len-i)
        func(c.raw(iin+ptrdiff_t(i)*cstr), r.raw(iout0+ptrdiff_t(i)*str),
             r.raw(iout1+ptrdiff_t(ic)*str));
    else if (find(axes.begin(), axes.end(), idim) != axes.end()) // full FFT axis
      for (size_t i=0,ic=0; i<len; ++i, ic=len-i)
        func(c.raw(iin+ptrdiff_t(i)*cstr), r.raw(iout0+ptrdiff_t(i)*str),
             r.raw(iout1+ptrdiff_t(ic)*str));
    else // axis not transformed: the mirrored index stays identical
      for (size_t i=0; i<len; ++i)
        func(c.raw(iin+ptrdiff_t(i)*cstr), r.raw(iout0+ptrdiff_t(i)*str),
             r.raw(iout1+ptrdiff_t(i)*str));
    }
  else
    {
    if (idim==axes.back()) // halfcomplex axis
      for (size_t i=0,ic=0; i<c.shape(idim); ++i, ic=len-i)
        hermiteHelper(idim+1, iin+ptrdiff_t(i)*cstr, iout0+ptrdiff_t(i)*str,
          iout1+ptrdiff_t(ic)*str, c, r, axes, func, nthreads);
    else if (find(axes.begin(), axes.end(), idim) != axes.end()) // full FFT axis
      for (size_t i=0,ic=0; i<len; ++i, ic=len-i)
        hermiteHelper(idim+1, iin+ptrdiff_t(i)*cstr, iout0+ptrdiff_t(i)*str,
          iout1+ptrdiff_t(ic)*str, c, r, axes, func, nthreads);
    else
      for (size_t i=0; i<len; ++i)
        hermiteHelper(idim+1, iin+ptrdiff_t(i)*cstr, iout0+ptrdiff_t(i)*str,
          iout1+ptrdiff_t(i)*str, c, r, axes, func, nthreads);
    }
  }

template<typename T> void oscarize(vfmav<T> &data, size_t ax0, size_t ax1,
  size_t nthreads)
  {
  auto nu=data.shape(ax0), nv=data.shape(ax1);
  if ((nu<3)||(nv<3)) return;
  vector<slice> slc(data.ndim());
  slc[ax0] = slice(1,(nu+1)/2);
  slc[ax1] = slice(1,(nv+1)/2);
  auto all = subarray(data, slc);
  slc[ax0] = slice(nu-1,nu/2,-1);
  auto ahl = subarray(data, slc);
  slc[ax1] = slice(nv-1,nv/2,-1);
  auto ahh = subarray(data, slc);
  slc[ax0] = slice(1,(nu+1)/2);
  auto alh = subarray(data, slc);
  mav_apply([](T &ll, T &hl, T &hh, T &lh)
    {
    T tll=ll, thl=hl, tlh=lh, thh=hh;
    T v = T(0.5)*(tll+tlh+thl+thh);
    ll = v-thh;
    hl = v-tlh;
    lh = v-thl;
    hh = v-tll;
    }, nthreads, all, ahl, ahh, alh);
  }

template<typename T> void r2r_genuine_hartley(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, T fct, size_t nthreads=1)
  {
  if (axes.size()==1)
    return r2r_separable_hartley(in, out, axes, fct, nthreads);
  if (axes.size()==2)
    {
    r2r_separable_hartley(in, out, axes, fct, nthreads);
    oscarize(out, axes[0], axes[1], nthreads);
    return;
    }
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  shape_t tshp(in.shape());
  tshp[axes.back()] = tshp[axes.back()]/2+1;
  auto atmp(vfmav<std::complex<T>>::build_noncritical(tshp, UNINITIALIZED));
  r2c(in, atmp, axes, true, fct, nthreads);
  hermiteHelper(0, 0, 0, 0, atmp, out, axes,
    [](const std::complex<T> &c, T &r0, T &r1)
      { r0 = c.real()+c.imag(); r1 = c.real()-c.imag(); }, nthreads);
  }

template<typename T> void r2r_genuine_fht(const cfmav<T> &in,
  vfmav<T> &out, const shape_t &axes, T fct, size_t nthreads=1)
  {
  if (axes.size()==1)
    return r2r_separable_fht(in, out, axes, fct, nthreads);
  if (axes.size()==2)
    {
    r2r_separable_fht(in, out, axes, fct, nthreads);
    oscarize(out, axes[0], axes[1], nthreads);
    return;
    }
  util::sanity_check_onetype(in, out, in.data()==out.data(), axes);
  if (in.size()==0) return;
  shape_t tshp(in.shape());
  tshp[axes.back()] = tshp[axes.back()]/2+1;
  auto atmp(vfmav<std::complex<T>>::build_noncritical(tshp, UNINITIALIZED));
  r2c(in, atmp, axes, true, fct, nthreads);
  hermiteHelper(0, 0, 0, 0, atmp, out, axes,
    [](const std::complex<T> &c, T &r0, T &r1)
      { r0 = c.real()-c.imag(); r1 = c.real()+c.imag(); }, nthreads);
  }
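/* Usage sketch (illustrative only) for the convolve_axis entry points defined
   below: each 1-D line along the chosen axis is circularly convolved with a
   kernel, here combined with resampling from length 100 to length 150. The
   shapes and the thread count are arbitrary example values; the kernel length
   must equal in.shape(axis).

     void convolve_demo()
       {
       using namespace ducc0;
       auto in  = vfmav<double>::build_noncritical({8, 100}, UNINITIALIZED);
       auto out = vfmav<double>::build_noncritical({8, 150}, UNINITIALIZED);
       vmav<double,1> kernel({100}, UNINITIALIZED);
       // ... fill in and kernel ...
       convolve_axis(in, out, 1, kernel, 4);
       }
*/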
template<typename Tplan, typename T0, typename T, typename Exec>
DUCC0_NOINLINE void general_convolve_axis(const cfmav<T> &in, vfmav<T> &out,
  const size_t axis, const cmav<T,1> &kernel, size_t nthreads,
  const Exec &exec)
  {
  std::unique_ptr<Tplan> plan1, plan2;

  size_t l_in=in.shape(axis), l_out=out.shape(axis);
  MR_assert(kernel.size()==l_in, "bad kernel size");
  plan1 = std::make_unique<Tplan>(l_in);
  plan2 = std::make_unique<Tplan>(l_out);
  size_t bufsz = max(plan1->bufsize(), plan2->bufsize());

  // transform the kernel once, with 1/l_in normalization folded in
  vmav<T,1> fkernel({kernel.shape(0)}, UNINITIALIZED);
  for (size_t i=0; i<kernel.shape(0); ++i)
    fkernel(i) = kernel(i);
  plan1->exec(fkernel.data(), T0(1)/T0(l_in), true, nthreads);

  execParallel(
    util::thread_count(nthreads, in, axis, fft_simdlen<T0>),
    [&](Scheduler &sched) {
      constexpr auto vlen = fft_simdlen<T0>;
      TmpStorage<T,T0> storage(in.size()/l_in, l_in+l_out, bufsz, 1, false);
      multi_iter<vlen> it(in, out, axis, sched.num_threads(), sched.thread_num());
#ifndef DUCC0_NO_SIMD
      if constexpr (vlen>1)
        {
        TmpStorage2<add_vec_t<T,vlen>,T,T0> storage2(storage);
        while (it.remaining()>=vlen)
          {
          it.advance(vlen);
          exec(it, in, out, storage2, *plan1, *plan2, fkernel);
          }
        }
      if constexpr (vlen>2)
        if constexpr (simd_exists<T0,vlen/2>)
          if (it.remaining()>=vlen/2)
            {
            TmpStorage2<add_vec_t<T,vlen/2>,T,T0> storage2(storage);
            it.advance(vlen/2);
            exec(it, in, out, storage2, *plan1, *plan2, fkernel);
            }
      if constexpr (vlen>4)
        if constexpr (simd_exists<T0,vlen/4>)
          if (it.remaining()>=vlen/4)
            {
            TmpStorage2<add_vec_t<T,vlen/4>,T,T0> storage2(storage);
            it.advance(vlen/4);
            exec(it, in, out, storage2, *plan1, *plan2, fkernel);
            }
#endif
      {
      TmpStorage2<T,T,T0> storage2(storage);
      while (it.remaining()>0)
        {
        it.advance(1);
        exec(it, in, out, storage2, *plan1, *plan2, fkernel);
        }
      }
    });  // end of parallel region
  }

struct ExecConv1R
  {
  template <typename T0, typename Tstorage, typename Titer> void operator() (
    const Titer &it, const cfmav<T0> &in, vfmav<T0> &out,
    Tstorage &storage, const pocketfft_r<T0> &plan1,
    const pocketfft_r<T0> &plan2, const cmav<T0,1> &fkernel) const
    {
    using T = typename Tstorage::datatype;
    size_t l_in = plan1.length(),
           l_out = plan2.length(),
           l_min = std::min(l_in, l_out);
    T *buf1=storage.transformBuf(), *buf2=storage.dataBuf();
    copy_input(it, in, buf2);
    plan1.exec_copyback(buf2, buf1, T0(1), true);
    auto res = buf2;
    {
    res[0] *= fkernel(0);
    size_t i;
    for (i=1; 2*i<l_min; ++i)
      {
      Cmplx<T> t1(res[2*i-1], res[2*i]);
      Cmplx<T0> t2(fkernel(2*i-1), fkernel(2*i));
      auto t3 = t1*t2;
      res[2*i-1] = t3.r;
      res[2*i] = t3.i;
      }
    if (2*i==l_min)
      {
      if (l_min<l_in) // shortening the axis: fold in the mirrored frequency
        {
        Cmplx<T> t1(res[2*i-1], res[2*i]);
        Cmplx<T0> t2(fkernel(2*i-1), fkernel(2*i));
        res[2*i-1] = (t1*t2).r*T0(2);
        }
      else
        res[2*i-1] *= fkernel(2*i-1);
      }
    }
    for (size_t i=l_in; i<l_out; ++i) res[i] = T(0); // zero-pad when upsampling
    auto res2 = plan2.exec(res, buf1, T0(1), false);
    copy_output(it, res2, out);
    }
  };

struct ExecConv1C
  {
  template <typename T0, typename Tstorage, typename Titer> void operator() (
    const Titer &it, const cfmav<Cmplx<T0>> &in, vfmav<Cmplx<T0>> &out,
    Tstorage &storage, const pocketfft_c<T0> &plan1,
    const pocketfft_c<T0> &plan2, const cmav<Cmplx<T0>,1> &fkernel) const
    {
    using T = typename Tstorage::datatype;
    size_t l_in = plan1.length(),
           l_out = plan2.length(),
           l_min = std::min(l_in, l_out);
    T *buf1=storage.transformBuf(), *buf2=storage.dataBuf();
    copy_input(it, in, buf2);
    auto res = plan1.exec(buf2, buf1, T0(1), true);
    auto res2 = buf2+l_in;
    {
    res2[0] = res[0]*fkernel(0);
    size_t i;
    for (i=1; 2*i<l_min; ++i)
      {
      res2[i] = res[i]*fkernel(i);
      res2[l_out-i] = res[l_in-i]*fkernel(l_in-i);
      }
    if (2*i==l_min) // the shared middle frequency
      {
      auto t = res[i]*fkernel(i);
      if (l_out>l_in) // upsampling: split it between both copies
        {
        res2[i] = Cmplx<T>(t.r*T0(0.5), t.i*T0(0.5));
        res2[l_out-i] = res2[i];
        }
      else if (l_out<l_in) // downsampling: fold in the mirrored frequency
        res2[i] = t + res[l_in-i]*fkernel(l_in-i);
      else
        res2[i] = t;
      ++i;
      }
    for (; 2*i<=l_out; ++i) // zero frequencies without a counterpart
      {
      res2[i] = Cmplx<T>(T(0),T(0));
      if (2*i<l_out) res2[l_out-i] = Cmplx<T>(T(0),T(0));
      }
    }
    auto res3 = plan2.exec(res2, buf1, T0(1), false);
    copy_output(it, res3, out);
    }
  };

template<typename T> DUCC0_NOINLINE void convolve_axis(const cfmav<T> &in,
  vfmav<T> &out, size_t axis, const cmav<T,1> &kernel, size_t nthreads=1)
  {
  MR_assert(axis<in.ndim(), "bad axis number");
  MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
  for (size_t i=0; i<in.ndim(); ++i)
    if (i!=axis)
      MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
  general_convolve_axis<pocketfft_r<T>, T>(in, out, axis, kernel, nthreads,
    ExecConv1R());
  }

template<typename T> DUCC0_NOINLINE void convolve_axis(
  const cfmav<std::complex<T>> &in, vfmav<std::complex<T>> &out, size_t axis,
  const cmav<std::complex<T>,1> &kernel, size_t nthreads=1)
  {
  MR_assert(axis<in.ndim(), "bad axis number");
  MR_assert(in.ndim()==out.ndim(), "dimensionality mismatch");
  for (size_t i=0; i<in.ndim(); ++i)
    if (i!=axis)
      MR_assert(in.shape(i)==out.shape(i), "shape mismatch");
  const auto &in2(reinterpret_cast<const cfmav<Cmplx<T>>&>(in));
  auto &out2(reinterpret_cast<vfmav<Cmplx<T>>&>(out));
  const auto &kernel2(reinterpret_cast<const cmav<Cmplx<T>,1>&>(kernel));
  general_convolve_axis<pocketfft_c<T>, T>(in2, out2, axis, kernel2, nthreads,
    ExecConv1C());
  }

} // namespace detail_fft

using detail_fft::FORWARD;
using detail_fft::BACKWARD;
using detail_fft::c2c;
using detail_fft::c2r;
using detail_fft::c2r_mut;
using detail_fft::r2c;
using detail_fft::r2r_fftpack;
using detail_fft::r2r_fftw;
using detail_fft::r2r_separable_hartley;
using detail_fft::r2r_genuine_hartley;
using detail_fft::r2r_separable_fht;
using detail_fft::r2r_genuine_fht;
using detail_fft::dct;
using detail_fft::dst;
using detail_fft::convolve_axis;

} // namespace ducc0

#endif // DUCC0_FFT_H