/*
 * High-speed vectorized FFT tree for arbitrary `logn`.
 *
 * =============================================================================
 * Copyright (c) 2023 by Cryptographic Engineering Research Group (CERG)
 * ECE Department, George Mason University
 * Fairfax, VA, U.S.A.
 * Author: Duc Tri Nguyen
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * =============================================================================
 * @author Duc Tri Nguyen
 */

#include "inner.h"
#include "macrof.h"
#include "macrofx4.h"

/*
 * 1 layer of Merge FFT for 2 complex points (4 coefficients).
 */
static inline void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(fpr *f, const fpr *f0, const fpr *f1) {
    fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
    a_re = f0[0];
    a_im = f0[1];
    s = fpr_tab_log2[0];
    b_re = f1[0] * s;
    b_im = f1[1] * s;

    d_re = b_re - b_im;
    d_im = b_re + b_im;

    f[0] = a_re + d_re;
    f[2] = a_im + d_im;
    f[1] = a_re - d_re;
    f[3] = a_im - d_im;
}

/*
 * Vectorized 1 layer of Merge FFT for 4 complex points (8 coefficients).
 */
static inline void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(fpr *f, const fpr *f0, const fpr *f1) {
    // Total SIMD registers: 12 = 10 + 2
    float64x2x2_t g1, g0, g_re, g_im, s_re_im; // 10
    float64x2_t t_re, t_im;                    // 2

    vloadx2(g1, &f1[0]);
    vload2(s_re_im, &fpr_tab_log3[0]);

    FWD_TOP(t_re, t_im, g1.val[0], g1.val[1], s_re_im.val[0], s_re_im.val[1]);

    vloadx2(g0, &f0[0]);

    FPC_ADD(g_re.val[0], g_im.val[0], g0.val[0], g0.val[1], t_re, t_im);
    FPC_SUB(g_re.val[1], g_im.val[1], g0.val[0], g0.val[1], t_re, t_im);

    vstore2(&f[0], g_re);
    vstore2(&f[4], g_im);
}

/*
 * Vectorized 1 layer of Merge FFT for 8 or more complex points
 * (16 or more coefficients, i.e. logn >= 4).
 */
static inline void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(fpr *f, const fpr *f0, const fpr *f1, const unsigned logn) {
    const unsigned n = 1 << logn;
    const unsigned ht = n >> 2;
    const fpr *fpr_merge = fpr_table[logn];

    // Total SIMD registers: 22 = 14 + 8
    float64x2x2_t g1_re, g1_im, g0_re, g0_im, s_re_im, t_re, t_im; // 14
    float64x2x4_t g_re, g_im;                                      // 8

    for (unsigned j = 0; j < ht; j += 4) {
        vload2(g1_re, &f1[j]);
        vload2(g1_im, &f1[j + ht]);
        vload2(s_re_im, &fpr_merge[j]);

        FWD_TOP(t_re.val[0], t_im.val[0], g1_re.val[0], g1_im.val[0], s_re_im.val[0], s_re_im.val[1]);
        vload2(g0_re, &f0[j]);

        FWD_TOP(t_re.val[1], t_im.val[1], g1_re.val[1], g1_im.val[1], s_re_im.val[0], s_re_im.val[1]);
        vload2(g0_im, &f0[j + ht]);

        FPC_ADD(g_re.val[0], g_im.val[0], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);
        FPC_SUB(g_re.val[1], g_im.val[1], g0_re.val[0], g0_im.val[0], t_re.val[0], t_im.val[0]);

        FPC_ADDJ(g_re.val[2], g_im.val[2], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);
        FPC_SUBJ(g_re.val[3], g_im.val[3], g0_re.val[1], g0_im.val[1], t_re.val[1], t_im.val[1]);

        vstore4(&f[j << 1], g_re);
        vstore4(&f[(j + ht) << 1], g_im);
    }
}
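/*
 * Reference-only sketch (added for documentation; not from upstream and kept
 * out of the build): the scalar butterfly that the three merge kernels above
 * vectorize. With a = f0[j] + i*f0[j + ht], b = f1[j] + i*f1[j + ht] and a
 * twiddle s = s_re + i*s_im, one merge layer writes a + s*b and a - s*b into
 * adjacent output slots. The arrays tw_re/tw_im are hypothetical stand-ins
 * for the twiddle layout; the NEON code instead reads fpr_table[logn] with
 * its own interleaving.
 */
#if 0
static void
merge_fft_scalar_sketch(fpr *f, const fpr *f0, const fpr *f1,
                        const fpr *tw_re, const fpr *tw_im, unsigned logn) {
    const unsigned n = 1u << logn;
    const unsigned hn = n >> 1; // imaginary parts live in the upper half of f
    const unsigned ht = n >> 2; // number of complex points in f0 and in f1
    for (unsigned j = 0; j < ht; j++) {
        fpr a_re = f0[j], a_im = f0[j + ht];
        fpr b_re = f1[j], b_im = f1[j + ht];
        // t = s * b (complex multiplication by the twiddle factor)
        fpr t_re = b_re * tw_re[j] - b_im * tw_im[j];
        fpr t_im = b_re * tw_im[j] + b_im * tw_re[j];
        // f[2j] = a + t, f[2j+1] = a - t, in split re/im storage
        f[(j << 1) + 0] = a_re + t_re;
        f[(j << 1) + 1] = a_re - t_re;
        f[(j << 1) + 0 + hn] = a_im + t_im;
        f[(j << 1) + 1 + hn] = a_im - t_im;
    }
}
#endif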
/*
 * 1 layer of Split FFT for 2 complex points (4 coefficients).
 */
static void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(fpr *restrict f0, fpr *restrict f1, const fpr *restrict f) {
    fpr a_re, a_im, b_re, b_im, d_re, d_im, s;
    a_re = f[0];
    b_re = f[1];
    a_im = f[2];
    b_im = f[3];
    s = fpr_tab_log2[0] * 0.5;

    f0[0] = (a_re + b_re) * 0.5;
    f0[1] = (a_im + b_im) * 0.5;

    d_re = (a_re - b_re) * s;
    d_im = (a_im - b_im) * s;

    f1[0] = d_im + d_re;
    f1[1] = d_im - d_re;
}

/*
 * Vectorized 1 layer of Split FFT for 4 complex points (8 coefficients).
 */
static inline void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(fpr *f0, fpr *f1, const fpr *f) {
    // Total SIMD registers: 12
    float64x2x2_t re, im, g0, g1, s_re_im, tm; // 12

    vload2(re, &f[0]);
    vload2(im, &f[4]);

    FPC_ADD(g0.val[0], g0.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);
    FPC_SUB(tm.val[0], tm.val[1], re.val[0], im.val[0], re.val[1], im.val[1]);

    vload2(s_re_im, &fpr_tab_log3[0]);

    vfmuln(g0.val[0], g0.val[0], 0.5);
    vfmuln(g0.val[1], g0.val[1], 0.5);
    vstorex2(&f0[0], g0);

    vfmuln(s_re_im.val[0], s_re_im.val[0], 0.5);
    vfmuln(s_re_im.val[1], s_re_im.val[1], 0.5);

    INV_BOTJ(g1.val[0], g1.val[1], tm.val[0], tm.val[1], s_re_im.val[0], s_re_im.val[1]);

    vstorex2(&f1[0], g1);
}

/*
 * Vectorized 1 layer of Split FFT for 8 or more complex points
 * (16 or more coefficients, i.e. logn >= 4).
 */
static inline void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(fpr *f0, fpr *f1, const fpr *f, const unsigned logn) {
    const unsigned n = 1 << logn;
    const unsigned hn = n >> 1;
    const unsigned ht = n >> 2;
    const fpr *fpr_split = fpr_table[logn];

    // Total SIMD registers: 23 = 1 + 8 + 14
    float64x2_t half;                                              // 1
    float64x2x4_t g_re, g_im;                                      // 8
    float64x2x2_t s_re_im, t_re, t_im, g1_re, g1_im, g0_re, g0_im; // 14

    half = vdupq_n_f64(0.5);
    for (unsigned j = 0; j < ht; j += 4) {
        unsigned j2 = j << 1;
        vload4(g_re, &f[j2]);
        vload4(g_im, &f[j2 + hn]);

        FPC_ADD(g0_re.val[0], g0_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
        FPC_ADD(g0_re.val[1], g0_im.val[1], g_re.val[2], g_im.val[2], g_re.val[3], g_im.val[3]);

        FPC_SUB(t_re.val[0], t_im.val[0], g_re.val[0], g_im.val[0], g_re.val[1], g_im.val[1]);
        FPC_SUB(t_re.val[1], t_im.val[1], g_re.val[3], g_im.val[3], g_re.val[2], g_im.val[2]);

        vload2(s_re_im, &fpr_split[j]);

        vfmul(g0_re.val[0], g0_re.val[0], half);
        vfmul(g0_re.val[1], g0_re.val[1], half);
        vstore2(&f0[j], g0_re);

        vfmul(g0_im.val[0], g0_im.val[0], half);
        vfmul(g0_im.val[1], g0_im.val[1], half);
        vstore2(&f0[j + ht], g0_im);

        vfmul(s_re_im.val[0], s_re_im.val[0], half);
        vfmul(s_re_im.val[1], s_re_im.val[1], half);

        INV_BOTJ(g1_re.val[0], g1_im.val[0], t_re.val[0], t_im.val[0], s_re_im.val[0], s_re_im.val[1]);
        INV_BOTJm(g1_re.val[1], g1_im.val[1], t_re.val[1], t_im.val[1], s_re_im.val[0], s_re_im.val[1]);

        vstore2(&f1[j], g1_re);
        vstore2(&f1[j + ht], g1_im);
    }
}

/*
 * Vectorized Split FFT implementation
 */
void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(fpr *restrict f0, fpr *restrict f1, const fpr *f, const unsigned logn) {
    switch (logn) {
    case 1:
        // n = 2; hn = 1; qn = 0;
        f0[0] = f[0];
        f1[0] = f[1];
        break;

    case 2:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log2(f0, f1, f);
        break;

    case 3:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log3(f0, f1, f);
        break;

    default:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_splitFFT_log4(f0, f1, f, logn);
        break;
    }
}
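/*
 * Reference-only sketch (added for documentation; not from upstream and kept
 * out of the build): the scalar inverse of the merge butterfly, as the split
 * kernels above vectorize it. For each output pair, f0 takes the half-sum of
 * the two points and f1 takes the half-difference rotated back by conj(s).
 * The NEON code folds the 0.5 factor into the twiddle values instead of
 * multiplying t separately; the result is the same. As above, tw_re/tw_im
 * are hypothetical stand-ins for the fpr_table[logn] layout.
 */
#if 0
static void
split_fft_scalar_sketch(fpr *f0, fpr *f1, const fpr *f,
                        const fpr *tw_re, const fpr *tw_im, unsigned logn) {
    const unsigned n = 1u << logn;
    const unsigned hn = n >> 1;
    const unsigned ht = n >> 2;
    for (unsigned j = 0; j < ht; j++) {
        fpr a_re = f[(j << 1) + 0], a_im = f[(j << 1) + 0 + hn];
        fpr b_re = f[(j << 1) + 1], b_im = f[(j << 1) + 1 + hn];
        // f0[j] = (a + b) / 2
        f0[j]      = (a_re + b_re) * 0.5;
        f0[j + ht] = (a_im + b_im) * 0.5;
        // f1[j] = ((a - b) / 2) * conj(s)
        fpr t_re = (a_re - b_re) * 0.5;
        fpr t_im = (a_im - b_im) * 0.5;
        f1[j]      = t_re * tw_re[j] + t_im * tw_im[j];
        f1[j + ht] = t_im * tw_re[j] - t_re * tw_im[j];
    }
}
#endif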
/*
 * Vectorized Merge FFT implementation
 */
void
PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(fpr *restrict f, const fpr *restrict f0,
        const fpr *restrict f1, const unsigned logn) {
    switch (logn) {
    case 1:
        // n = 2; hn = 1;
        f[0] = f0[0];
        f[1] = f1[0];
        break;

    case 2:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log2(f, f0, f1);
        break;

    case 3:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log3(f, f0, f1);
        break;

    default:
        PQCLEAN_FALCONPADDED1024_AARCH64_poly_mergeFFT_log4(f, f0, f1, logn);
        break;
    }
}
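
/*
 * Documentation-only note (added, not from upstream): poly_split_fft and
 * poly_merge_fft are mutual inverses up to floating-point rounding, which
 * gives a cheap sanity check for the kernels above. A hypothetical self-test
 * sketch, excluded from the build; it assumes <math.h>, that fpr is a plain
 * double (as in this AArch64 port), and an illustrative tolerance.
 */
#if 0
#include <math.h>
static int
split_merge_roundtrip_check(const fpr *f, unsigned logn) {
    fpr f0[512], f1[512], g[1024]; // sized for logn <= 10 (Falcon-1024)
    const unsigned n = 1u << logn;
    PQCLEAN_FALCONPADDED1024_AARCH64_poly_split_fft(f0, f1, f, logn);
    PQCLEAN_FALCONPADDED1024_AARCH64_poly_merge_fft(g, f0, f1, logn);
    for (unsigned i = 0; i < n; i++) {
        if (fabs(g[i] - f[i]) > 1e-9) { // tolerance is illustrative only
            return 0;
        }
    }
    return 1;
}
#endif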