/* * Floating-point operations. * * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2017-2019 Falcon Project * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the * "Software"), to deal in the Software without restriction, including * without limitation the rights to use, copy, modify, merge, publish, * distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * ===========================(LICENSE END)============================= * * @author Thomas Pornin */ /* ====================================================================== */ #include #include #include "macrof.h" /* * We wrap the native 'double' type into a structure so that the C compiler * complains if we inadvertently use raw arithmetic operators on the 'fpr' * type instead of using the inline functions below. This should have no * extra runtime cost, since all the functions below are 'inline'. */ typedef double fpr; static inline fpr FPR(double v) { fpr x; x = v; return x; } static inline fpr fpr_of(int64_t i) { return (double)i; } static const fpr fpr_q = 12289.0 ; static const fpr fpr_inverse_of_q = 1.0 / 12289.0 ; static const fpr fpr_inv_2sqrsigma0 = .150865048875372721532312163019 ; static const fpr fpr_inv_sigma_10 = 0.0059386453095331159950250124336477482 ; static const fpr fpr_sigma_min_10 = 1.2982803343442918539708792538826807 ; static const fpr fpr_log2 = 0.69314718055994530941723212146 ; static const fpr fpr_inv_log2 = 1.4426950408889634073599246810 ; static const fpr fpr_bnorm_max = 16822.4121 ; static const fpr fpr_zero = 0.0 ; static const fpr fpr_one = 1.0 ; static const fpr fpr_two = 2.0 ; static const fpr fpr_onehalf = 0.5 ; static const fpr fpr_invsqrt2 = 0.707106781186547524400844362105 ; static const fpr fpr_invsqrt8 = 0.353553390593273762200422181052 ; static const fpr fpr_ptwo31 = 2147483648.0 ; static const fpr fpr_ptwo31m1 = 2147483647.0 ; static const fpr fpr_mtwo31m1 = -2147483647.0 ; static const fpr fpr_ptwo63m1 = 9223372036854775807.0 ; static const fpr fpr_mtwo63m1 = -9223372036854775807.0 ; static const fpr fpr_ptwo63 = 9223372036854775808.0 ; static inline int64_t fpr_rint(fpr x) { int64_t t; __asm__ ( "fcvtns %x0, %d1": "=r" (t) : "w" (x)); return t; } static inline int64_t fpr_floor(fpr x) { int64_t r; /* * The cast performs a trunc() (rounding toward 0) and thus is * wrong by 1 for most negative values. The correction below is * constant-time as long as the compiler turns the * floating-point conversion result into a 0/1 integer without a * conditional branch or another non-constant-time construction. * This should hold on all modern architectures with an FPU (and * if it is false on a given arch, then chances are that the FPU * itself is not constant-time, making the point moot). */ r = (int64_t)x; return r - (x < (double)r); } static inline int64_t fpr_trunc(fpr x) { return (int64_t)x; } static inline fpr fpr_add(fpr x, fpr y) { return (x + y); } static inline fpr fpr_sub(fpr x, fpr y) { return (x - y); } static inline fpr fpr_neg(fpr x) { return (-x); } static inline fpr fpr_half(fpr x) { return (x * 0.5); } static inline fpr fpr_double(fpr x) { return (x + x); } static inline fpr fpr_mul(fpr x, fpr y) { return (x * y); } static inline fpr fpr_sqr(fpr x) { return (x * x); } static inline fpr fpr_inv(fpr x) { return (1.0 / x); } static inline fpr fpr_div(fpr x, fpr y) { return (x / y); } static inline fpr fpr_sqrt(fpr x) { __asm__ ( "fsqrt %d0, %d0" : "+w" (x) : : ); return x; } static inline int fpr_lt(fpr x, fpr y) { return x < y; } static inline uint64_t fpr_expm_p63(fpr x, fpr ccs) { static const double C_expm[] = { 1.000000000000000000000000000000, // c0 -0.999999999999994892974086724280, // c1 0.500000000000019206858326015208, // c2 -0.166666666666984014666397229121, // c3 0.041666666666110491190622155955, // c4 -0.008333333327800835146903501993, // c5 0.001388888894063186997887560103, // c6 -0.000198412739277311890541063977, // c7 0.000024801566833585381209939524, // c8 -0.000002755586350219122514855659, // c9 0.000000275607356160477811864927, // c10 -0.000000025299506379442070029551, // c11 0.000000002073772366009083061987, // c12 0.000000000000000000000000000000, }; float64x2_t neon_x, neon_1x, neon_x2, neon_x4, neon_x8, neon_x12, neon_ccs; float64x2x4_t neon_exp0; float64x2x3_t neon_exp1; float64x2_t y1, y2, y3, y; double ret; neon_exp0 = vld1q_f64_x4(&C_expm[0]); neon_exp1 = vld1q_f64_x3(&C_expm[8]); neon_ccs = vdupq_n_f64(ccs); neon_ccs = vmulq_n_f64(neon_ccs, fpr_ptwo63); // x | x neon_x = vdupq_n_f64(x); // 1 | x neon_1x = vsetq_lane_f64(1.0, neon_x, 0); neon_x2 = vmulq_f64(neon_x, neon_x); neon_x4 = vmulq_f64(neon_x2, neon_x2); neon_x8 = vmulq_f64(neon_x4, neon_x4); neon_x12 = vmulq_f64(neon_x8, neon_x4); vfmla(y1, neon_exp0.val[0], neon_exp0.val[1], neon_x2); vfmla(y2, neon_exp0.val[2], neon_exp0.val[3], neon_x2); vfmla(y3, neon_exp1.val[0], neon_exp1.val[1], neon_x2); y1 = vmulq_f64(y1, neon_1x); y2 = vmulq_f64(y2, neon_1x); y3 = vmulq_f64(y3, neon_1x); vfmla(y, y1, y2, neon_x4); vfmla(y, y, y3, neon_x8); vfmla(y, y, neon_exp1.val[2], neon_x12); y = vmulq_f64( y, neon_ccs); ret = vaddvq_f64(y); return (uint64_t) ret; } #define fpr_p2_tab PQCLEAN_FALCON1024_AARCH64_fpr_p2_tab extern const fpr fpr_p2_tab[]; #define fpr_tab_log2 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log2 #define fpr_tab_log3 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log3 #define fpr_tab_log4 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log4 #define fpr_tab_log5 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log5 #define fpr_tab_log6 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log6 #define fpr_tab_log7 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log7 #define fpr_tab_log8 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log8 #define fpr_tab_log9 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log9 #define fpr_tab_log10 PQCLEAN_FALCON1024_AARCH64_fpr_tab_log10 #define fpr_table PQCLEAN_FALCON1024_AARCH64_fpr_table extern const fpr fpr_tab_log2[]; extern const fpr fpr_tab_log3[]; extern const fpr fpr_tab_log4[]; extern const fpr fpr_tab_log5[]; extern const fpr fpr_tab_log6[]; extern const fpr fpr_tab_log7[]; extern const fpr fpr_tab_log8[]; extern const fpr fpr_tab_log9[]; extern const fpr fpr_tab_log10[]; extern const fpr *fpr_table[]; /* ====================================================================== */