/* This file is part of the ducc FFT library Copyright (C) 2010-2022 Max-Planck-Society Copyright (C) 2019 Peter Bell Authors: Martin Reinecke, Peter Bell */ /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ /* All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef DUCC0_FFT1D_H #define DUCC0_FFT1D_H #include #include #include #include #include #include #include #include #include #include "ducc0/infra/useful_macros.h" #include "ducc0/math/cmplx.h" #include "ducc0/infra/error_handling.h" #include "ducc0/infra/aligned_array.h" #include "ducc0/infra/simd.h" #include "ducc0/infra/threading.h" #include "ducc0/math/unity_roots.h" namespace ducc0 { namespace detail_fft { using namespace std; // the next line is necessary to address some sloppy name choices in hipSYCL using std::min, std::max; template constexpr inline size_t fft1d_simdlen = min(8, native_simd::size()); template<> constexpr inline size_t fft1d_simdlen = min(4, native_simd::size()); template<> constexpr inline size_t fft1d_simdlen = min(8, native_simd::size()); template using fft1d_simd = typename simd_select>::type; template constexpr inline bool fft1d_simd_exists = (fft1d_simdlen > 1); // Always use std:: for functions template T cos(T) = delete; template T sin(T) = delete; template T sqrt(T) = delete; template inline void PM(T &a, T &b, T c, T d) { a=c+d; b=c-d; } template inline void PMINPLACE(T &a, T &b) { T t = a; a+=b; b=t-b; } template inline void MPINPLACE(T &a, T &b) { T t = a; a-=b; b=t+b; } template void special_mul (const Cmplx &v1, const Cmplx &v2, Cmplx &res) { res = fwd ? Cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) : Cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); } template void ROTX90(Cmplx &a) { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } template inline auto tidx() { return type_index(typeid(T)); } struct util1d // hack to avoid duplicate symbols { /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ DUCC0_NOINLINE static size_t good_size_cmplx(size_t n) { if (n<=12) return n; size_t bestfac=2*n; for (size_t f11=1; f11n) { if (x>=1; } else return n; } } return bestfac; } /* returns the smallest composite of 2, 3, 5 which is >= n */ DUCC0_NOINLINE static size_t good_size_real(size_t n) { if (n<=6) return n; size_t bestfac=2*n; for (size_t f5=1; f5n) { if (x>=1; } else return n; } } return bestfac; } DUCC0_NOINLINE static vector prime_factors(size_t N) { MR_assert(N>0, "need a positive number"); vector factors; while ((N&1)==0) { N>>=1; factors.push_back(2); } for (size_t divisor=3; divisor*divisor<=N; divisor+=2) while ((N%divisor)==0) { factors.push_back(divisor); N/=divisor; } if (N>1) factors.push_back(N); return factors; } }; template using Troots = shared_ptr>>; // T: "type", f/c: "float/complex", s/v: "scalar/vector" template class cfftpass { public: virtual ~cfftpass(){} using Tcs = Cmplx; // number of Tcd values required as scratch space during "exec" // will be provided in "buf" virtual size_t bufsize() const = 0; virtual bool needs_copy() const = 0; virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, bool fwd, size_t nthreads=1) const = 0; static vector factorize(size_t N) { MR_assert(N>0, "need a positive number"); vector factors; factors.reserve(15); while ((N&3)==0) { factors.push_back(4); N>>=2; } if ((N&1)==0) { N>>=1; // factor 2 should be at the front of the factor list factors.push_back(2); swap(factors[0], factors.back()); } for (size_t divisor=3; divisor*divisor<=N; divisor+=2) while ((N%divisor)==0) { factors.push_back(divisor); N/=divisor; } if (N>1) factors.push_back(N); return factors; } static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, const Troots &roots, bool vectorize=false); static shared_ptr make_pass(size_t ip, bool vectorize=false) { return make_pass(1,1,ip,make_shared>>(ip), vectorize); } }; #define POCKETFFT_EXEC_DISPATCH \ virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ bool fwd, size_t nthreads=1) const \ { \ static const auto tics = tidx(); \ if (ti==tics) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ if constexpr (fft1d_simdlen > 1) \ if constexpr (simd_exists>) \ { \ using Tfv = typename simd_select>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 2) \ if constexpr (simd_exists/2>) \ { \ using Tfv = typename simd_select/2>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 4) \ if constexpr (simd_exists/4>) \ { \ using Tfv = typename simd_select/4>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 8) \ if constexpr (simd_exists/8>) \ { \ using Tfv = typename simd_select/8>::type; \ using Tcv = Cmplx; \ static const auto ticv = tidx(); \ if (ti==ticv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ MR_fail("impossible vector length requested"); \ } template using Tcpass = shared_ptr>; template class cfftp1: public cfftpass { public: cfftp1() {} virtual size_t bufsize() const { return 0; } virtual bool needs_copy() const { return false; } virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const { return in; } }; template class cfftp2: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=2; quick_array wa; auto WA(size_t i) const { return wa[i-1]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tcd& { return ch[b+l1*c]; }; auto CC = [cc](size_t b, size_t c) -> const Tcd& { return cc[b+ip*c]; }; for (size_t k=0; k Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); } } return ch; } } public: cfftp2(size_t l1_, size_t ido_, const Troots &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp3: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=3; quick_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r=-0.5, tw1i= (fwd ? -1: 1) * Tfs(0.8660254037844386467637231707529362L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP3(idx) \ Tcd t0 = CC(idx,0,k), t1, t2; \ PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ CH(idx,k,0)=t0+t1; #define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ { \ Tcd ca=t0+t1*twr; \ Tcd cb{-t2.i*twi, t2.r*twi}; \ PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ } #define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ { \ Tcd ca=t0+t1*twr; \ Tcd cb{-t2.i*twi, t2.r*twi}; \ special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp4: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=4; quick_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tcd& { return ch[b+l1*c]; }; auto CC = [cc](size_t b, size_t c) -> const Tcd& { return cc[b+ip*c]; }; for (size_t k=0; k(t4); PM(CH(k,0),CH(k,2),t2,t3); PM(CH(k,1),CH(k,3),t1,t4); } } else { auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; for (size_t k=0; k(t4); PM(CH(0,k,0),CH(0,k,2),t2,t3); PM(CH(0,k,1),CH(0,k,3),t1,t4); } for (size_t i=1; i(t4); CH(i,k,0) = t2+t3; special_mul(t1+t4,WA(0,i),CH(i,k,1)); special_mul(t2-t3,WA(1,i),CH(i,k,2)); special_mul(t1-t4,WA(2,i),CH(i,k,3)); } } } return ch; } public: cfftp4(size_t l1_, size_t ido_, const Troots &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp5: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=5; quick_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), tw2r= Tfs(-0.8090169943749474241022934171828191L), tw2i= (fwd ? -1: 1) * Tfs(0.5877852522924731291687059546390728L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP5(idx) \ Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ CH(idx,k,0).r=t0.r+t1.r+t2.r; \ CH(idx,k,0).i=t0.i+t1.i+t2.i; #define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ { \ Tcd ca,cb; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ } #define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ { \ Tcd ca,cb,da,db; \ ca.r=t0.r+twar*t1.r+twbr*t2.r; \ ca.i=t0.i+twar*t1.i+twbr*t2.i; \ cb.i=twai*t4.r twbi*t3.r; \ cb.r=-(twai*t4.i twbi*t3.i); \ special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp7: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=7; quick_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), tw2r= Tfs(-0.2225209339563144042889025644967948L), tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), tw3r= Tfs(-0.9009688679024191262361023195074451L), tw3i= (fwd ? -1 : 1) * Tfs(0.433883739117558120475768332848359L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP7(idx) \ Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; #define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ { \ Tcd ca,cb; \ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ cb.i=y1*t7.r y2*t6.r y3*t5.r; \ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) #define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ { \ Tcd da,db; \ POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftp11: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; static constexpr size_t ip=11; quick_array wa; auto WA(size_t x, size_t i) const { return wa[x+(i-1)*(ip-1)]; } template [[gnu::hot]] Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), tw2r= Tfs(0.4154150130018864255292741492296232L), tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), tw3r= Tfs(-0.1423148382732851404437926686163697L), tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), tw4r= Tfs(-0.6548607339452850640569250724662936L), tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L), tw5r= Tfs(-0.9594929736144973898903680570663277L), tw5i= (fwd ? -1 : 1) * Tfs(0.2817325568414296977114179153466169L); auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; #define POCKETFFT_PREP11(idx) \ Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; #define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ { \ Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ cb; \ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ PM(out1,out2,ca,cb); \ } #define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) #define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ { \ Tcd da,db; \ POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ } if (ido==1) for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t i=1; i class cfftpg: public cfftpass { private: using typename cfftpass::Tcs; size_t l1, ido; size_t ip; quick_array wa; quick_array csarr; auto WA(size_t x, size_t i) const { return wa[i-1+x*(ido-1)]; } template Tcd *exec_ (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const { size_t ipph = (ip+1)/2; size_t idl1 = ido*l1; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& { return cc[a+ido*(b+ip*c)]; }; auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& { return cc[a+ido*(b+l1*c)]; }; auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& { return cc[a+idl1*b]; }; auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& { return ch[a+idl1*b]; }; for (size_t k=0; kip) iwal-=ip; Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; iwal+=l; if (iwal>ip) iwal-=ip; Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; for (size_t ik=0; ikip) iwal-=ip; Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); idij=(jc-1)*(ido-1)+i-1; special_mul(x2,wa[idij],CX(i,k,jc)); } } } return cc; } public: cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots &roots) : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) { MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class cfftpblue: public cfftpass { private: using typename cfftpass::Tcs; const size_t l1, ido, ip; const size_t ip2; const Tcpass subplan; quick_array wa, bk, bkf; size_t bufsz; bool need_cpy; auto WA(size_t x, size_t i) const { return wa[i-1+x*(ido-1)]; } template Tcd *exec_ (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * DUCC0_RESTRICT buf, size_t nthreads) const { static const auto ti=tidx(); Tcd *akf = &buf[0]; Tcd *akf2 = &buf[ip2]; Tcd *subbuf = &buf[2*ip2]; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& { return cc[a+ido*(b+ip*c)]; }; //FIXME: parallelize here? for (size_t k=0; k(CC(i,m,k),bk[m],akf[m]); auto zero = akf[0]*Tfs(0); for (size_t m=ip; m(subplan->exec(ti,akf,akf2, subbuf, true, nthreads)); /* do the convolution */ res[0] = res[0].template special_mul(bkf[0]); for (size_t m=1; m<(ip2+1)/2; ++m) { res[m] = res[m].template special_mul(bkf[m]); res[ip2-m] = res[ip2-m].template special_mul(bkf[m]); } if ((ip2&1)==0) res[ip2/2] = res[ip2/2].template special_mul(bkf[ip2/2]); /* inverse FFT */ res = static_cast(subplan->exec(ti, res, (res==akf) ? akf2 : akf, subbuf, false, nthreads)); /* multiply by b_k and write to output buffer */ if (l1>1) { if (i==0) for (size_t m=0; m(bk[m]); else { CH(i,k,0) = res[0].template special_mul(bk[0]); for (size_t m=1; m(bk[m]*WA(m-1,i)); } } else { if (i==0) for (size_t m=0; m(bk[m]); else { CC(i,0,0) = res[0].template special_mul(bk[0]); for (size_t m=1; m(bk[m]*WA(m-1,i)); } } } return (l1>1) ? ch : cc; } public: cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots &roots, bool vectorize=false) : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), subplan(cfftpass::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), bk(ip), bkf(ip2/2+1) { size_t N=ip*l1*ido; auto rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; jsize()/(2*ip))*2*ip==roots->size()) ? roots : make_shared>(2*ip); size_t rfct2 = roots2->size()/(2*ip); for (size_t m=1; m=2*ip) coeff-=2*ip; bk[m] = (*roots2)[coeff*rfct2]; } /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */ quick_array tbkf(ip2), tbkf2(ip2); Tfs xn2 = Tfs(1)/Tfs(ip2); tbkf[0] = bk[0]*xn2; for (size_t m=1; m buf(subplan->bufsize()); static const auto tics=tidx(); auto res = static_cast(subplan->exec(tics, tbkf.data(), tbkf2.data(), buf.data(), true)); for (size_t i=0; i1; bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); } virtual size_t bufsize() const { return bufsz; } virtual bool needs_copy() const { return need_cpy; } POCKETFFT_EXEC_DISPATCH }; template class cfft_multipass: public cfftpass { private: using typename cfftpass::Tcs; static constexpr size_t bunchsize=8; const size_t l1, ido; size_t ip; vector> passes; size_t bufsz; bool need_cpy; size_t rfct; Troots myroots; // FIXME split into sub-functions. This is too long! template Cmplx *exec_(Cmplx *cc, Cmplx *ch, Cmplx *buf, size_t nthreads) const { using Tc = Cmplx; if ((l1==1) && (ido==1)) // no chance at vectorizing { static const auto tic=tidx(); Tc *p1=cc, *p2=ch; for(const auto &pass: passes) { auto res = static_cast(pass->exec(tic, p1, p2, buf, fwd, nthreads)); if (res==p2) swap (p1,p2); } return p1; } else { if constexpr(is_same::value && fft1d_simd_exists) // we can vectorize! { using Tfv = fft1d_simd; using Tcv = Cmplx; constexpr size_t vlen = Tfv::size(); size_t nvtrans = (l1*ido + vlen-1)/vlen; // NOTE: removed "static" here, because it leads to trouble with gcc 7 // static const type_index ticv = tidx(); const type_index ticv = tidx(); if (ido==1) { auto CH = [ch,this](size_t b, size_t c) -> Tc& { return ch[b+l1*c]; }; auto CC = [cc,this](size_t b, size_t c) -> Tc& { return cc[b+ip*c]; }; execStatic(nvtrans, nthreads, 0, [&](auto &sched) { quick_array tbuf(2*ip+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip]; auto buf2 = &tbuf[2*ip]; while (auto rng=sched.getNext()) for(auto itrans=rng.lo; itrans(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m Tc& { return cc[a+ido*b]; }; execStatic(nvtrans, nthreads, 0, [&](auto &sched) { quick_array tbuf(2*ip+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip]; auto buf2 = &tbuf[2*ip]; while (auto rng=sched.getNext()) for(auto itrans=rng.lo; itrans(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m= ido) break; if (i==0) CC(0,m) = { p1[m].r[n], p1[m].i[n] }; else { if (m==0) CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; else CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*m*i]); } } } }); return cc; } MR_fail("must not get here"); #if 0 //FIXME this code path is currently unused quick_array tbuf(2*ip+bufsize()); auto cc2 = &tbuf[0]; auto ch2 = &tbuf[ip]; auto buf2 = &tbuf[2*ip]; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& { return cc[a+ido*(b+ip*c)]; }; //FIXME parallelize? for (size_t itrans=0; itrans ix, kx; size_t ixcur = (itrans*vlen)%ido; size_t kxcur = (itrans*vlen)/ido; for (size_t n=0; n(pass->exec(ticv, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } for (size_t m=0; m= l1*ido) break; if (i==0) CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; else { if (m==0) CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; else CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*l1*m*i]); } } } return ch; #endif } else { static const auto tic = tidx *>(); if (ido==1) { // parallelize here! for (size_t n=0; n *p1=&cc[n*ip], *p2=ch; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf, fwd)); if (res==p2) swap (p1,p2); } if (res != &cc[n*ip]) copy(res, res+ip, cc+n*ip); } // transpose size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; // parallelize here! for (size_t ibunch=0; ibunch Tc& { return cc[a+ido*b]; }; // parallelize here! for (size_t ibunch=0; ibunch *p1=&cc2[n*ip], *p2=ch2; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } if (res==&cc2[n*ip]) // no copying necessary { if (i!=0) { for (size_t m=1; m((*myroots)[rfct*m*i]); } } else { if (i==0) for (size_t m=0; m((*myroots)[rfct*m*i]); } } } for (size_t m=0; m Tc& { return ch[a+ido*(b+l1*c)]; }; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& { return cc[a+ido*(b+ip*c)]; }; // parallelize here! for (size_t ibunch=0; ibunch ix, kx; size_t ixcur = (ibunch*bunchsize)%ido; size_t kxcur = (ibunch*bunchsize)/ido; for (size_t n=0; n *p1=&cc2[n*ip], *p2=ch2; Cmplx *res = nullptr; for(const auto &pass: passes) { res = static_cast *>(pass->exec(tic, p1, p2, buf2, fwd)); if (res==p2) swap (p1,p2); } if (res==&cc2[n*ip]) // no copying necessary { if (i!=0) { for (size_t m=1; m((*myroots)[rfct*l1*m*i]); } } else { if (i==0) for (size_t m=0; m((*myroots)[rfct*l1*m*i]); } } } for (size_t m=0; m &roots, bool /*vectorize*/=false) : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), myroots(roots) { size_t N=ip*l1*ido; rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); // FIXME TBD // do we need the vectorize flag at all? size_t lim = 10000; //vectorize ? 10000 : 10000; if (ip<=lim) { auto factors = cfftpass::factorize(ip); size_t l1l=1; for (auto fct: factors) { passes.push_back(cfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); l1l*=fct; } } else { vector packets(2,1); auto factors = util1d::prime_factors(ip); sort(factors.begin(), factors.end(), std::greater()); for (auto fct: factors) (packets[0]>packets[1]) ? packets[1]*=fct : packets[0]*=fct; size_t l1l=1; for (auto pkt: packets) { passes.push_back(cfftpass::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false)); l1l*=pkt; } } for (const auto &pass: passes) { bufsz = max(bufsz, pass->bufsize()); need_cpy |= pass->needs_copy(); } if ((l1!=1)||(ido!=1)) { need_cpy=true; bufsz += (bunchsize+1)*ip; } } virtual size_t bufsize() const { return bufsz; } virtual bool needs_copy() const { return need_cpy; } POCKETFFT_EXEC_DISPATCH }; #undef POCKETFFT_EXEC_DISPATCH #if 0 // leaving in for potential future use; but doesn't seem beneficial template class cfftp_vecpass: public cfftpass { private: static_assert(simd_exists, "bad vlen"); using typename cfftpass::Tcs; using Tfv=typename simd_select::type; using Tcv=Cmplx; size_t ip; Tcpass spass; Tcpass vpass; size_t bufsz; template Tcs *exec_ (Tcs *cc, Tcs * /*ch*/, Tcs * /*buf*/, size_t nthreads) const { quick_array buf(2*ip+bufsz); auto * cc2 = buf.data(); auto * ch2 = buf.data()+ip; auto * buf2 = buf.data()+2*ip; static const auto tics = tidx(); // run scalar pass auto res = static_cast(spass->exec(tics, cc, reinterpret_cast(ch2), reinterpret_cast(buf2), fwd, nthreads)); // arrange input in SIMD-friendly way // FIXME: swap loops? for (size_t i=0; i(); auto res2 = static_cast(vpass->exec(ticv, cc2, ch2, buf2, fwd, nthreads)); // de-SIMDify for (size_t i=0; i &roots) : ip(ip_), spass(cfftpass::make_pass(1, ip/vlen, vlen, roots)), vpass(cfftpass::make_pass(1, 1, ip/vlen, roots)), bufsz(0) { MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size"); bufsz=2*ip+max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen); } virtual size_t bufsize() const { return 0; } virtual bool needs_copy() const { return false; } virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, bool fwd, size_t nthreads=1) const { static const auto tics = tidx(); MR_assert(ti==tics, "bad input type"); auto in1 = static_cast(in); auto copy1 = static_cast(copy); auto buf1 = static_cast(buf); return fwd ? exec_(in1, copy1, buf1, nthreads) : exec_(in1, copy1, buf1, nthreads); } }; #endif template Tcpass cfftpass::make_pass(size_t l1, size_t ido, size_t ip, const Troots &roots, bool vectorize) { MR_assert(ip>=1, "no zero-sized FFTs"); #if 0 if (vectorize && (ip>300) && (ip<32768) && (l1==1) && (ido==1)) { constexpr auto vlen = native_simd::size(); if constexpr(vlen>1) if ((ip&(vlen-1))==0) return make_shared>(ip, roots); } #endif if (ip==1) return make_shared>(); auto factors=cfftpass::factorize(ip); if (factors.size()==1) { switch(ip) { case 2: return make_shared>(l1, ido, roots); case 3: return make_shared>(l1, ido, roots); case 4: return make_shared>(l1, ido, roots); case 5: return make_shared>(l1, ido, roots); case 7: return make_shared>(l1, ido, roots); case 11: return make_shared>(l1, ido, roots); default: if (ip<110) return make_shared>(l1, ido, ip, roots); else return make_shared>(l1, ido, ip, roots, vectorize); } } else // more than one factor, need a multipass return make_shared>(l1, ido, ip, roots, vectorize); } template class pocketfft_c { private: size_t N; size_t critbuf; Tcpass plan; public: pocketfft_c(size_t n, bool vectorize=false) : N(n), critbuf(((N&1023)==0) ? 16 : 0), plan(cfftpass::make_pass(n,vectorize)) {} size_t length() const { return N; } size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); } template DUCC0_NOINLINE Cmplx *exec(Cmplx *in, Cmplx *buf, Tfs fct, bool fwd, size_t nthreads=1) const { static const auto tic = tidx *>(); auto res = static_cast *>(plan->exec(tic, in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads)); if (fct!=Tfs(1)) for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Cmplx *in, Cmplx *buf, Tfs fct, bool fwd, size_t nthreads=1) const { static const auto tic = tidx *>(); auto res = static_cast *>(plan->exec(tic, in, buf, buf+N*plan->needs_copy(), fwd, nthreads)); if (res==in) { if (fct!=Tfs(1)) for (size_t i=0; i DUCC0_NOINLINE void exec(Cmplx *in, Tfs fct, bool fwd, size_t nthreads=1) const { quick_array> buf(N*plan->needs_copy()+plan->bufsize()); exec_copyback(in, buf.data(), fct, fwd, nthreads); } }; template class rfftpass { public: virtual ~rfftpass(){} // number of Tfd values required as scratch space during "exec" // will be provided in "buf" virtual size_t bufsize() const = 0; virtual bool needs_copy() const = 0; virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, bool fwd, size_t nthreads=1) const = 0; static vector factorize(size_t N) { MR_assert(N>0, "need a positive number"); vector factors; while ((N&3)==0) { factors.push_back(4); N>>=2; } if ((N&1)==0) { N>>=1; // factor 2 should be at the front of the factor list factors.push_back(2); swap(factors[0], factors.back()); } for (size_t divisor=3; divisor*divisor<=N; divisor+=2) while ((N%divisor)==0) { factors.push_back(divisor); N/=divisor; } if (N>1) factors.push_back(N); return factors; } static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, const Troots &roots, bool vectorize=false); static shared_ptr make_pass(size_t ip, bool vectorize=false) { return make_pass(1,1,ip,make_shared>>(ip), vectorize); } }; #define POCKETFFT_EXEC_DISPATCH \ virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ bool fwd, size_t nthreads) const \ { \ static const auto tifs=tidx(); \ if (ti==tifs) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ if constexpr (fft1d_simdlen > 1) \ if constexpr (simd_exists>) \ { \ using Tfv = typename simd_select>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 2) \ if constexpr (simd_exists/2>) \ { \ using Tfv = typename simd_select/2>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 4) \ if constexpr (simd_exists/4>) \ { \ using Tfv = typename simd_select/4>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ if constexpr (fft1d_simdlen > 8) \ if constexpr (simd_exists/8>) \ { \ using Tfv = typename simd_select/8>::type; \ static const auto tifv=tidx(); \ if (ti==tifv) \ { \ auto in1 = static_cast(in); \ auto copy1 = static_cast(copy); \ auto buf1 = static_cast(buf); \ return fwd ? exec_(in1, copy1, buf1, nthreads) \ : exec_(in1, copy1, buf1, nthreads); \ } \ } \ MR_fail("impossible vector length requested"); \ } template using Trpass = shared_ptr>; /* (a+ib) = conj(c+id) * (e+if) */ template inline void MULPM (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) { a=c*e+d*f; b=c*f-d*e; } template class rfftp1: public rfftpass { public: rfftp1() {} virtual size_t bufsize() const { return 0; } virtual bool needs_copy() const { return false; } virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const { return in; } }; template class rfftp2: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=2; quick_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp3: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=3; quick_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs taur=Tfs(-0.5), taui=Tfs(0.8660254037844386467637231707529362L); if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp4: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=4; quick_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftp5: public rfftpass { private: size_t l1, ido; static constexpr size_t ip=5; quick_array wa; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), ti11= Tfs(0.9510565162951535721164393333793821L), tr12= Tfs(-0.8090169943749474241022934171828191L), ti12= Tfs(0.5877852522924731291687059546390728L); if constexpr(fwd) { auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+ip*c)]; }; for (size_t k=0; k const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; for (size_t k=0; k &roots) : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftpg: public rfftpass { private: size_t l1, ido; size_t ip; quick_array wa, csarr; template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const { if constexpr(fwd) { size_t ipph=(ip+1)/2; size_t idl1 = ido*l1; auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& { return ch[a+idl1*b]; }; if (ido>1) { for (size_t j=1, jc=ip-1; j=ip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if (iang>=ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ik=ip) iang-=ip; Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; for (size_t ik=0; ik const Tfd& { return cc[a+ido*(b+ip*c)]; }; auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& { return ch[a+ido*(b+l1*c)]; }; auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& { return cc[a+ido*(b+l1*c)]; }; auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& { return cc[a+idl1*b]; }; auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& { return ch[a+idl1*b]; }; for (size_t k=0; kip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; iang+=l; if(iang>ip) iang-=ip; Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; for (size_t ik=0; ikip) iang-=ip; Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; for (size_t ik=0; ik &roots) : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) { MR_assert(ido&1, "ido must be odd"); size_t N=ip*l1*ido; size_t rfct = roots->size()/N; MR_assert(roots->size()==N*rfct, "mismatch"); for (size_t j=1; j class rfftpblue: public rfftpass { private: const size_t l1, ido, ip; quick_array wa; const Tcpass cplan; size_t bufsz; bool need_cpy; auto WA(size_t x, size_t i) const { return wa[i+x*(ido-1)]; } template