#include "crypto_core_multsntrup1013_ntt.h" #include <immintrin.h> // auto-generated; do not edit #define _mm256_permute2x128_si256_lo(f0,f1) _mm256_permute2x128_si256(f0,f1,0x20) #define _mm256_permute2x128_si256_hi(f0,f1) _mm256_permute2x128_si256(f0,f1,0x31) #define int16x16 __m256i typedef int16_t int16; typedef int32_t int32; typedef union { int16 data[106 * 16]; __m256i _dummy; } vec1696; static const vec1696 qdata_7681 = { .data = { #define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0) -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, #define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16) -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, #define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32) -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, #define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48) 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, #define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64) -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, #define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80) -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, #define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96) -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, #define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112) -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, #define 
precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128) -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, #define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144) -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, #define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160) -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, #define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176) 3777, -121, 1100, -2319, 3625, -834, -2194, -1701, 3777, -121, 1100, -2319, 3625, -834, -2194, -1701, #define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192) -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, #define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208) -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, #define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224) -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, #define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240) -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, #define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256) -3593, 514, 3364, 438, 1701, 2555, -1599, -1738, 2194, 103, 2557, 1881, 834, -549, -2816, 638, #define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272) -3625, -1399, 617, -1760, 2319, 2535, 2006, 3266, -1100, -1431, -1296, 3174, 121, 3153, 1986, -810, #define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const 
int16x16 *)(qdata+288) -3777, 2956, -2830, -679, 1414, 2440, -1993, -3689, 2456, 2804, 1525, 3555, 2495, 1535, -2088, -7, #define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304) 3182, -1321, -1921, -1305, 2876, -3772, -3706, 3600, -3696, -2043, 1483, -396, 2250, -2310, -2237, 1887, #define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320) -3593, -1887, 2237, 2310, -2250, 396, -1483, 2043, 3696, -3600, 3706, 3772, -2876, 1305, 1921, 1321, #define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336) -3182, 7, 2088, -1535, -2495, -3555, -1525, -2804, -2456, 3689, 1993, -2440, -1414, 679, 2830, -2956, #define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352) 3777, 810, -1986, -3153, -121, -3174, 1296, 1431, 1100, -3266, -2006, -2535, -2319, 1760, -617, 1399, #define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368) 3625, -638, 2816, 549, -834, -1881, -2557, -103, -2194, 1738, 1599, -2555, -1701, -438, -3364, -514, #define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384) -3593, -1532, 514, -373, 3364, -3816, 438, -3456, 1701, 783, 2555, 2883, -1599, 727, -1738, -2385, #define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400) 2194, -2160, 103, -2391, 2557, 2762, 1881, -2426, 834, 3310, -549, -1350, -2816, 1386, 638, -194, #define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416) -3625, 404, -1399, -3692, 617, -2764, -1760, -1054, 2319, 1799, 2535, -3588, 2006, 1533, 3266, 2113, #define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432) -1100, -2579, -1431, -1756, -1296, 1598, 3174, -2, 121, -3480, 3153, -2572, 1986, 2743, -810, 2919, #define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const 
int16x16 *)(qdata+448) -3593, 2789, -1887, -921, 2237, -1497, 2310, -2133, -2250, -915, 396, 1390, -1483, 3135, 2043, -859, #define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464) 3696, 2732, -3600, -1464, 3706, 2224, 3772, -2665, -2876, 1698, 1305, 2835, 1921, 730, 1321, 486, #define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480) -3182, 3417, 7, -3428, 2088, -3145, -1535, 1168, -2495, -3831, -3555, -3750, -1525, 660, -2804, 2649, #define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496) -2456, 3405, 3689, -1521, 1993, 1681, -2440, 1056, -1414, 1166, 679, -2233, 2830, 2175, -2956, -1919, #define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512) -3593, -1404, -1532, 451, 514, -402, -373, 1278, 3364, -509, -3816, -3770, 438, -2345, -3456, -226, #define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528) 1701, -1689, 783, -1509, 2555, 2963, 2883, 1242, -1599, 1669, 727, 2719, -1738, 642, -2385, -436, #define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544) 2194, 3335, -2160, 1779, 103, 3745, -2391, 17, 2557, 2812, 2762, -1144, 1881, 83, -2426, -1181, #define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560) 834, -1519, 3310, 3568, -549, -796, -1350, 2072, -2816, -2460, 1386, 2891, 638, -2083, -194, -715, #define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576) -3593, -402, -3816, -226, 2555, 1669, -2385, 1779, 2557, 83, 3310, 2072, 638, 1012, -3692, 1295, #define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592) 2319, -3208, 1533, -2071, -1431, -2005, -2, 1586, 1986, -293, 1919, -929, -679, 777, -1681, -3461, #define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 
*)(qdata+608) 2456, 3366, 3750, -1203, 1535, -3657, -3417, -1712, -1921, 2515, 2665, -1070, 3600, 2532, -3135, -2589, #define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624) 2250, -2258, 921, -658, -514, 509, 3456, 1509, 1599, -642, 2160, -17, -1881, 1519, 1350, -2891, #define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640) -3593, -3434, -1497, 893, 396, -2422, -859, 2965, 3706, -2339, 1698, -2937, 1321, -670, -3428, -3163, #define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656) -2495, -1072, 660, 1084, 3689, -179, 1056, -1338, 2830, 2786, -2919, -3677, -3153, -151, -1598, 3334, #define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672) 1100, -3314, 3588, 2262, 1760, -2230, -404, 2083, 2816, -3568, 2426, -2812, -103, 436, -727, -2963, #define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688) -1701, 3770, 373, 1404, 1887, -1649, 2133, -826, 1483, 434, -2732, 3287, -3772, -2378, -2835, 3723, #define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704) -3593, 658, 2789, 370, -1887, -3434, -921, -3752, 2237, 1649, -1497, 2258, 2310, 3581, -2133, 893, #define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720) -2250, 3794, -915, 826, 396, 2589, 1390, 592, -1483, -2422, 3135, 3214, 2043, -434, -859, -2532, #define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736) 3696, 1121, 2732, 2965, -3600, 2998, -1464, -3287, 3706, 1070, 2224, -589, 3772, -2339, -2665, 2070, #define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752) -2876, 2378, 1698, -2515, 1305, -2815, 2835, -2937, 1921, -1348, 730, -3723, 1321, 1712, 486, 2130, #define q_x16 *(const int16x16 *)(qdata+768) 7681, 7681, 7681, 7681, 7681, 
7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, #define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784) -9, -9, -9, -9, -16425, -16425, -16425, -16425, -9, -9, -9, -9, -16425, -16425, -16425, -16425, #define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800) -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, #define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816) -9, -9, -9, -9, -10350, -10350, -10350, -10350, -9, -9, -9, -9, -10350, -10350, -10350, -10350, #define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832) 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, #define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848) -9, -9, -9, -9, -4974, -4974, -4974, -4974, -9, -9, -9, -9, -4974, -4974, -4974, -4974, #define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864) -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, #define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880) -9, -9, -9, -9, -4496, -4496, -4496, -4496, -9, -9, -9, -9, -4496, -4496, -4496, -4496, #define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896) -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, #define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912) -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, #define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928) -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, -28865, 20870, 14744, 
-22593, 10350, 828, 4496, 23754, #define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944) -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, #define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960) 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, #define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976) -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, #define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992) -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, #define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008) -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, #define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024) -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, #define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040) -9, -14846, -10972, -21066, -20315, -24581, 23489, -23242, -4974, -4505, 25597, -26279, 18242, 21467, -2816, 15998, #define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056) -16425, -4983, -19351, 14624, 18191, -2073, -3114, 20674, -7244, -21399, -9488, 6246, -11655, -29103, 19394, -5930, #define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072) -28865, -23668, -26382, -28839, 20870, 6536, -31177, 16279, 14744, 29428, 20469, 29667, -22593, 9215, -22568, -11783, #define 
qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088) 10350, -14121, 5759, -5913, 828, -1724, 15750, 11792, 4496, 25093, 15307, 26228, 23754, -21766, 7491, -6817, #define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104) -9, 6817, -7491, 21766, -23754, -26228, -15307, -25093, -4496, -11792, -15750, 1724, -828, 5913, -5759, 14121, #define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120) -10350, 11783, 22568, -9215, 22593, -29667, -20469, -29428, -14744, -16279, 31177, -6536, -20870, 28839, 26382, 23668, #define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136) 28865, 5930, -19394, 29103, 11655, -6246, 9488, 21399, 7244, -20674, 3114, 2073, -18191, -14624, 19351, 4983, #define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152) 16425, -15998, 2816, -21467, -18242, 26279, -25597, 4505, 4974, 23242, -23489, 24581, 20315, 21066, 10972, 14846, #define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168) -9, -32252, -14846, -19317, -10972, 8472, -21066, -3456, -20315, 16655, -24581, 12611, 23489, -12073, -23242, 29871, #define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184) -4974, 6032, -4505, 10409, 25597, 24266, -26279, 17030, 18242, 10478, 21467, 11962, -2816, -26262, 15998, -17602, #define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200) -16425, -22124, -4983, -26220, -19351, -8908, 14624, 32738, 18191, 13575, -2073, 27132, -3114, 24573, 20674, 27201, #define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216) -7244, 12269, -21399, -16092, -9488, -15810, 6246, 15358, -11655, -15768, -29103, 24052, 19394, -26441, -5930, -1689, #define 
qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232) -9, 13541, 6817, -5529, -7491, 26663, 21766, -4693, -23754, 13933, -26228, 8558, -15307, -21953, -25093, -22875, #define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248) -4496, -7508, -11792, -30136, -15750, 26800, 1724, 17303, -828, 2722, 5913, -12013, -5759, 30426, 14121, 3558, #define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264) -10350, -24743, 11783, -21860, 22568, -32329, -9215, 9360, 22593, -7415, -29667, 25946, -20469, -21868, -29428, -25511, #define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280) -14744, 1869, -16279, 14351, 31177, 2193, -6536, 17440, -20870, 24718, 28839, -23225, 26382, 9855, 23668, -9599, #define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296) -9, -32124, -32252, 10179, -14846, 6766, -19317, 16638, -10972, -23549, 8472, -17082, -21066, -15145, -3456, 31518, #define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312) -20315, -6297, 16655, -12261, -24581, -11885, 12611, 30938, 23489, 28805, -12073, 26783, -23242, -14718, 29871, 5708, #define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328) -4974, 15111, 6032, -29453, -4505, 12449, 10409, 529, 25597, -32004, 24266, 2952, -26279, 18003, 17030, 24931, #define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344) 18242, -1007, 10478, -4624, 21467, 17636, 11962, 14360, -2816, 15972, -26262, 16715, 15998, 4573, -17602, -14539, #define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360) -9, 6766, 8472, 31518, -24581, 28805, 29871, -29453, 25597, 18003, 10478, 14360, 15998, 27636, -26220, 17167, #define 
qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376) 18191, -7304, 24573, -22039, -21399, -4565, 15358, 10802, 19394, 21723, 9599, -9633, -28839, -2807, -2193, -30597, #define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392) 14744, -26330, -25946, -2739, 9215, 32695, 24743, -26288, 5759, 20435, -17303, 24530, 11792, 20964, 21953, 23523, #define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408) 23754, -27858, 5529, 6510, 14846, 23549, 3456, 12261, -23489, 14718, -6032, -529, 26279, 1007, -11962, -16715, #define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424) -9, 24214, 26663, 23933, -26228, -13686, -22875, -27243, -15750, 4317, 2722, 8839, 14121, -32414, -21860, -25179, #define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440) 22593, -25648, -21868, -964, -16279, -1715, 17440, -14650, 26382, -28958, 1689, -10333, 29103, -20119, 15810, 22790, #define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456) 7244, 20238, -27132, -2858, -14624, 19274, 22124, -4573, 2816, 4624, -17030, 32004, 4505, -5708, 12073, 11885, #define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472) 20315, 17082, 19317, 32124, -6817, 14223, 4693, -14138, 15307, 9650, 7508, -9513, -1724, -23882, 12013, -15221, #define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488) -9, -6510, 13541, -23182, 6817, 24214, -5529, -24232, -7491, -14223, 26663, 27858, 21766, 26621, -4693, 23933, #define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504) -23754, 29394, 13933, 14138, -26228, -23523, 8558, -23984, -15307, -13686, -21953, 26766, -25093, -9650, -22875, -20964, #define 
qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520) -4496, -22943, -7508, -27243, -11792, -18506, -30136, 9513, -15750, -24530, 26800, 947, 1724, 4317, 17303, 29718, #define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536) -828, 23882, 2722, -20435, 5913, -10495, -12013, 8839, -5759, -3396, 30426, 15221, 14121, 26288, 3558, 27730, #define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552) -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, #define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568) 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, #define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584) -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, #define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600) -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, #define qround32_x16 *(const int16x16 *)(qdata+1616) 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, #define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632) -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, #define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648) 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, #define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664) -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, #define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680) -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, } } ; static const vec1696 qdata_10753 = { .data = 
{ // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -223, -223, -223, -223, -4188, -4188, -4188, -4188, -223, -223, -223, -223, -4188, -4188, -4188, -4188, // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 223, 223, 223, 223, -3688, -3688, -3688, -3688, 223, 223, 223, 223, -3688, -3688, -3688, -3688, // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018, 1018, 1018, 1018, -376, -376, -376, -376, 1018, 1018, 1018, 1018, -376, -376, -376, -376, // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, // precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 4188, 4188, 4188, 4188, -357, -357, -357, -357, 4188, 4188, 4188, 4188, -357, -357, -357, -357, // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018, -3364, -376, 4855, 3688, 425, -3686, 2695, 1018, -3364, -376, 4855, 3688, 425, -3686, 2695, // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -223, -3784, 357, -2236, -4188, 4544, 2413, 730, -223, -3784, 357, -2236, -4188, 4544, 2413, 730, // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018, -730, -2413, -4544, 4188, 2236, -357, 3784, 1018, -730, -2413, -4544, 4188, 2236, -357, 3784, // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 223, -2695, 3686, -425, -3688, -4855, 376, 3364, 223, -2695, 3686, -425, -3688, -4855, 376, 3364, // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 
3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 1018, 5063, -730, 341, -2413, -3012, -4544, -5213, 1018, 5063, -730, 341, -2413, -3012, -4544, -5213, // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 4188, 1520, 2236, 1931, -357, 918, 3784, 4095, 4188, 1520, 2236, 1931, -357, 918, 3784, 4095, // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 3085, -5175, 2982, -3364, -4744, 2503, -4129, -376, -2576, 1341, -193, 4855, 3062, -4875, 4, // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 3688, 2388, -2629, -4513, 425, 4742, -4347, 2935, -3686, -544, 3823, -2178, 2695, 847, -4035, 268, // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -223, -1299, -4095, -1287, -3784, -4876, -918, 3091, 357, -4189, -1931, 4616, -2236, 2984, -1520, -3550, // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -4188, -1009, 5213, -205, 4544, -4102, 3012, 2790, 2413, -1085, -341, -2565, 730, -4379, -5063, -1284, // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 1284, 5063, 4379, -730, 2565, 341, 1085, -2413, -2790, -3012, 4102, -4544, 205, -5213, 1009, // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 4188, 3550, 1520, -2984, 2236, -4616, 1931, 4189, -357, -3091, 918, 4876, 3784, 1287, 4095, 1299, // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 223, -268, 4035, -847, -2695, 2178, -3823, 544, 3686, -2935, 4347, -4742, -425, 4513, 2629, -2388, // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -3688, -4, 4875, -3062, -4855, 193, -1341, 2576, 376, 4129, -2503, 4744, 3364, -2982, 5175, -3085, // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 5116, 3085, -3615, -5175, 400, 2982, 3198, -3364, 2234, -4744, -4828, 2503, 326, -4129, -512, // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -376, 1068, -2576, -4580, 1341, 3169, -193, -2998, 4855, -635, 
3062, -4808, -4875, -2740, 4, 675, // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 3688, -1324, 2388, 5114, -2629, 5294, -4513, -794, 425, -864, 4742, -886, -4347, 336, 2935, -2045, // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -3686, -3715, -544, 4977, 3823, -2737, -2178, 3441, 2695, 467, 847, 454, -4035, -779, 268, 2213, // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 1615, 1284, 2206, 5063, 5064, 4379, 472, -730, -5341, 2565, -4286, 341, 2981, 1085, -1268, // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -2413, -3057, -2790, -2884, -3012, -1356, 4102, -3337, -4544, 5023, 205, -636, -5213, 909, 1009, -2973, // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 4188, 2271, 3550, -1572, 1520, 1841, -2984, 970, 2236, -4734, -4616, 578, 1931, -116, 4189, 1586, // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -357, -2774, -3091, -1006, 918, -5156, 4876, 4123, 3784, -567, 1287, 151, 4095, 1458, 1299, 2684, // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, -3260, 5116, -1722, 3085, 5120, -3615, 3760, -5175, 73, 400, 4254, 2982, 2788, 3198, -2657, // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -3364, 569, 2234, 1930, -4744, -2279, -4828, 5215, 2503, -4403, 326, 1639, -4129, 5068, -512, -5015, // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -376, -4859, 1068, -40, -2576, 4003, -4580, -4621, 1341, 2487, 3169, -2374, -193, 2625, -2998, 4784, // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 4855, 825, -635, 2118, 3062, -2813, -4808, -4250, -4875, -2113, -2740, -4408, 4, -1893, 675, 458, // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 5120, 400, -2657, -4744, -4403, -512, -40, 1341, 2625, -635, -4250, 4, -3360, 5114, -5313, // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 425, -2151, 336, -2662, -544, 5334, 3441, 2117, -4035, 2205, -2684, -3570, -1287, -4973, 5156, 2419, // 
precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 357, 1204, -578, 1635, 2984, -1111, -2271, 4359, 5213, -2449, 3337, 3453, 2790, 554, -2981, -1409, // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 730, -279, -2206, 3524, -3085, -73, -3198, -1930, -2503, -5068, -1068, 4621, 193, -825, 4808, 4408, // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, 4428, 5064, -4000, 2565, 573, -1268, 3125, -3012, -4144, 5023, 1927, 1009, -2139, -1572, 3535, // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 2236, 663, -116, 4967, -3091, -854, 4123, 1160, 4095, -1349, -2213, 1782, -847, 2062, 2737, 624, // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 3686, -2283, 886, 4889, 4513, -4601, 1324, 1893, 4875, -2118, 2998, -2487, 2576, 5015, -326, 2279, // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 3364, -4254, 3615, 3260, -1284, -1381, -472, -3891, -341, 2087, 3057, 4720, -4102, 3410, 636, 1689, // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 1018, -3524, 1615, 5268, 1284, 4428, 2206, -834, 5063, 1381, 5064, 279, 4379, 2439, 472, -4000, // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -730, -2015, -5341, 3891, 2565, 1409, -4286, 2605, 341, 573, 2981, 5356, 1085, -2087, -1268, -554, // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -2413, 3135, -3057, 3125, -2790, -778, -2884, -4720, -3012, -3453, -1356, -355, 4102, -4144, -3337, -152, // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -4544, -3410, 5023, 2449, 205, -97, -636, 1927, -5213, 2624, 909, -1689, 1009, -4359, -2973, -3419, // q_x16 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6, -6, -6, -6, -408, -408, -408, -408, -6, -6, -6, -6, -408, -408, -408, -408, // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, 
-27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6, -6, -6, -6, -1956, -1956, -1956, -1956, -6, -6, -6, -6, -1956, -1956, -1956, -1956, // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 27359, 27359, 27359, 27359, 408, 408, 408, 408, 27359, 27359, 27359, 27359, 408, 408, 408, 408, // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6, -6, -6, -6, -20856, -20856, -20856, -20856, -6, -6, -6, -6, -20856, -20856, -20856, -20856, // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -408, -408, -408, -408, -21094, -21094, -21094, -21094, -408, -408, -408, -408, -21094, -21094, -21094, -21094, // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 -6, -6, -6, -6, -10093, -10093, -10093, -10093, -6, -6, -6, -6, -10093, -10093, -10093, -10093, // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, // 
qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, -5619, -3639, -12378, -9508, 15736, 25543, 23007, -20856, -27152, 829, -22209, -29449, -20490, -17675, 22532, // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -408, 16724, 18363, 22623, 18345, 5766, 7429, -31369, -21094, 15840, -10001, 19326, -7033, 3407, -4547, 2316, // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -27359, 6381, -14847, 8441, -16072, -6924, -26518, -4589, 28517, 12707, -14731, -15864, -12476, 31656, 23056, 24098, // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 1956, -31217, -11683, -24269, -28224, -5126, -7228, 20198, 10093, -573, -3925, -14341, 16090, 23781, -28103, -23812, // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, 23812, 28103, -23781, -16090, 14341, 3925, 573, -10093, -20198, 7228, 5126, 28224, 24269, 11683, 31217, // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -1956, -24098, -23056, -31656, 12476, 15864, 14731, -12707, -28517, 4589, 26518, 6924, 16072, -8441, 14847, -6381, // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 27359, -2316, 4547, -3407, 7033, -19326, 10001, -15840, 21094, 31369, -7429, -5766, -18345, -22623, -18363, -16724, // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 408, -22532, 17675, 20490, 29449, 22209, -829, 27152, 20856, -23007, -25543, -15736, 9508, 12378, 3639, 5619, // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, -17412, -5619, 2017, -3639, 24976, -12378, 24702, -9508, -31558, 15736, 1316, 25543, -31418, 23007, -512, // 
qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -20856, -13268, -27152, 22044, 829, 8801, -22209, -12214, -29449, 11141, -20490, -17096, -17675, 32076, 22532, 17571, // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -408, 13012, 16724, 4090, 18363, -30546, 22623, 16614, 18345, -17248, 5766, 22666, 7429, -7856, -31369, 31235, // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -21094, 28541, 15840, -30351, -10001, -177, 19326, -31887, -7033, 25555, 3407, -31290, -4547, -13579, 2316, -2395, // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, 4175, 23812, 7326, 28103, 17352, -23781, -28200, -16090, 11555, 14341, 6978, 3925, -1627, 573, 780, // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -10093, 32271, -20198, 7356, 7228, 29364, 5126, 27895, 28224, -609, 24269, 21892, 11683, -7795, 31217, -18845, // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -1956, 29407, -24098, -7716, -23056, -719, -31656, -8246, 12476, -26238, 15864, 11842, 14731, 1932, -12707, -11726, // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -28517, 4394, 4589, 2066, 26518, -11300, 6924, -24037, 16072, 969, -8441, 14999, 14847, -11854, -6381, -19844, // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, -13500, -17412, 32070, -5619, 5120, 2017, 11952, -3639, 1609, 24976, 9374, -12378, -23836, 24702, -8289, // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -9508, -22471, -31558, 25482, 15736, -8935, 1316, 32351, 25543, 19661, -31418, 8295, 23007, -25652, -512, -19863, // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -20856, 6917, -13268, -28712, -27152, 20899, 22044, 4083, 829, 951, 8801, 29370, -22209, 24641, -12214, 12976, // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 -29449, -22215, 11141, -29626, -20490, 30467, -17096, 13158, -17675, -24129, 32076, 7880, 22532, -30053, 17571, -8758, // 
qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, 5120, 24976, -8289, 15736, 19661, -512, -28712, 829, 24641, 11141, 13158, 22532, 13024, 4090, -27329, // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 18345, -8807, -7856, -20070, 15840, -1834, -31887, -18875, -4547, 18077, 19844, -23026, 8441, -12653, 11300, 11123, // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 28517, 31924, -11842, -14237, 31656, 16809, -29407, -5369, -11683, -16273, -27895, -29827, 20198, 7722, 1627, 9343, // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 16090, -15127, -7326, -6716, 5619, -1609, -24702, -25482, -25543, 25652, 13268, -4083, 22209, 22215, 17096, -7880, // qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, -26292, 17352, 12384, 14341, 61, 780, 23093, 7228, -12336, -609, -7801, 31217, -6747, -7716, 6095, // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 12476, 15511, 1932, 11623, 4589, 6314, -24037, -19320, 14847, 19643, 2395, -21770, -3407, -17394, 177, -23952, // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 21094, -31467, -22666, -1767, -22623, -14329, -13012, 30053, 17675, 29626, 12214, -951, 27152, 19863, 31418, 8935, // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 9508, -9374, -2017, 13500, -23812, -29541, 28200, 20173, -3925, -24025, -32271, -19856, -5126, -26286, -21892, -4967, // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 -6, 6716, 4175, -13164, 23812, -26292, 7326, -12098, 28103, 29541, 17352, 15127, -23781, -7289, -28200, 12384, // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 -16090, -29151, 11555, -20173, 14341, -9343, 6978, -22483, 3925, 61, -1627, 23788, 573, 24025, 780, -7722, // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 -10093, -18881, 32271, 23093, -20198, -24330, 7356, 19856, 7228, 29827, 29364, 15517, 5126, -12336, 27895, -4248, // 
qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
28224, 26286, -609, 16273, 24269, -5729, 21892, -7801, 11683, -30144, -7795, 4967, 31217, 5369, -18845, -8027,
// qinvscaledzeta_x16_4_1
-27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359,
// qinvscaledzeta_x16_4_3
27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359,
// qinvscaledzeta_x16_8_1
-408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408,
// qinvscaledzeta_x16_8_7
-1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956,
// qround32_x16
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// scaledzeta_x16_4_1
-223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223,
// scaledzeta_x16_4_3
223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223,
// scaledzeta_x16_8_1
3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688,
// scaledzeta_x16_8_7
4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188,
} } ;

// Lanewise addition of 16 signed 16-bit coefficients. No modular reduction
// is performed here; the generated NTT code below tracks coefficient bounds
// (see the "assertranges" comments) to decide when an explicit reduce is due.
static inline int16x16 add_x16(int16x16 a, int16x16 b) { return _mm256_add_epi16(a, b); }

// Lanewise subtraction of 16 signed 16-bit coefficients; like add_x16,
// unreduced — callers rely on the bound tracking in the generated code.
static inline int16x16 sub_x16(int16x16 a, int16x16 b) { return _mm256_sub_epi16(a, b); }

// Lanewise Montgomery-style multiplication of x by a precomputed constant:
// each lane returns mulhi(x,y) - mulhi(mullo(x,yqinv), q), i.e. a signed
// representative of x*y/2^16 mod q. The tables above store the pairs:
// yqinv satisfies yqinv*q == y (mod 2^16) (verified for the scaledzeta /
// qinvscaledzeta rows, e.g. -408*10753 == 3688 mod 2^16), so the low 16
// bits of x*y and d*q cancel and the subtraction of high halves is exact.
// q_x16 is a macro reading the modulus out of qdata, hence the parameter.
static inline int16x16 mulmod_scaled_x16(int16x16 x, int16x16 y, int16x16 yqinv, const int16 *qdata) { int16x16 b = _mm256_mulhi_epi16(x, y); int16x16 d = _mm256_mullo_epi16(x, yqinv); int16x16 e = _mm256_mulhi_epi16(d, q_x16); return sub_x16(b, e); }

// Lanewise modular reduction: y = mulhrs(x, qround32) ~= round(x/q) since
// qround32_x16 holds round(2^15/q) (the 3s above match q = 10753 — this
// appears to be the tail of the 10753 constant table; TODO confirm), so
// x - y*q is a centered representative of x mod q. The exact output bounds
// are the ones asserted by the generated assertranges comments below.
static inline int16x16 reduce_x16(int16x16 x, const int16 *qdata) { int16x16 y = _mm256_mulhrs_epi16(x, qround32_x16); y = _mm256_mullo_epi16(y, q_x16); return sub_x16(x, y); }

// ----- codegen pass 1
//
// startntt 512
// startbatch 512
//
// ----- PRECONDITIONS
// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800) // assertranges ... // // // ----- LAYER 1 // // // butterfly(0,256,1,256,1,0) // butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // ----- POSTCONDITIONS AFTER LAYER 1 // // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600) // assertranges ... // // // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600) // assertranges ... // // // ----- LAYER 2 // // // reduce_ifreverse(0,64,1) // reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // reduce_ifreverse(256,320,1) // reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(0,128,1,128,1,0) // butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(256,384,1,128,4,1) // butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // ----- POSTCONDITIONS AFTER LAYER 2 // // transform size 512 // // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8] // // transforms per batch 1 // // batch indexing [] // // total batch size 512 // // // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200) // assertranges ... // // // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200) // assertranges ... // // // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016) // assertranges ... // // // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016) // assertranges ... 
// // // ----- LAYER 3 // // // reduce_ifforward(64,128,1) // reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(0,64,1,64,1,0) // butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(128,192,1,64,4,1) // butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(256,320,1,64,8,1) // butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // butterfly(384,448,1,64,8,-1) // butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // reduce(0,64,1) // reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(64,128,1,128,1) // twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(128,192,1,256,1) // twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(192,256,1,256,-1) // twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(256,320,1,512,1) // twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(320,384,1,512,5) // twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(384,448,1,512,-1) // twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // twist(448,512,1,512,-5) // twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) () // // // physical_permute(3,6) // physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) () // // // fold(256) // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) () // physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,) // // // fold(128) // physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,) // physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8) // // // fold(64) // physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8) // physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8) // // // nextbatch() // stopbatch 512 // startbatch 512 // // // halfbatch() // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8) // stopbatch 512 // doublereps // startbatch 256 // physical_map (0, 1, 2, 6, 4, 5) (3, 7) // // // halfbatch() // physical_unmap (0, 1, 2, 6, 4, 5) (3, 7) // stopbatch 256 // doublereps // startbatch 128 // 
physical_map (0, 1, 2, 6, 4, 5) (3,) // // // ----- POSTCONDITIONS AFTER LAYER 3 // // transform size 64 // // transform indexing [0, 1, 2, 6, 4, 5] // // transforms per batch 2 // // batch indexing [3] // // total batch size 128 // // // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ... // assertranges ... // // // ----- LAYER 4 // // // butterfly(0,32,1,32,1,0) // butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,) // // // ----- POSTCONDITIONS AFTER LAYER 4 // // transform size 64 // // transform indexing [0, 1, 2, 6, 4, 5] // // transforms per batch 2 // // batch indexing [3] // // total batch size 128 // // // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... // assertranges ... // // // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ... // assertranges ... 
// // // ----- LAYER 5 // // // butterfly(0,16,1,16,1,0) // butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,) // // // butterfly(32,48,1,16,4,1) // butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,) // // // reduce(0,16,1) // reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(16,32,1,32,1) // twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(32,48,1,64,1) // twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,) // // // twist(48,64,1,64,-1) // twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,) // // // physical_permute(0,1,2,5) // physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,) // // // fold(32) // physical_unmap (1, 2, 5, 6, 4, 0) (3,) // physical_map (1, 2, 5, 6, 4) (0, 3) // // // fold(16) // physical_unmap (1, 2, 5, 6, 4) (0, 3) // physical_map (1, 2, 5, 6) (0, 3, 4) // // // ----- POSTCONDITIONS AFTER LAYER 5 // // transform size 16 // // transform indexing [1, 2, 5, 6] // // transforms per batch 8 // // batch indexing [0, 3, 4] // // total batch size 128 // // // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ... // assertranges ... // // // ----- LAYER 6 // // // butterfly(0,8,1,8,1,0) // butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4) // // // physical_permute(1,2,4) // physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1) // // // nextbatch() // stopbatch 128 // startbatch 128 // // // ----- POSTCONDITIONS AFTER LAYER 6 // // transform size 16 // // transform indexing [2, 4, 5, 6] // // transforms per batch 8 // // batch indexing [0, 3, 1] // // total batch size 128 // // // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) // assertranges ... 
// // // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555) // assertranges ... // // // ----- LAYER 7 // // // butterfly(0,4,1,4,1,0) // butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1) // // // butterfly(8,12,1,4,4,1) // butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1) // // // reduce(0,4,1) // reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(4,8,1,8,1) // twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(8,12,1,16,1) // twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1) // // // twist(12,16,1,16,-1) // twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1) // // // physical_permute(2,6) // physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1) // // // fold(8) // physical_unmap (6, 4, 5, 2) (0, 3, 1) // physical_map (6, 4, 5) (0, 1, 2, 3) // // // fold(4) // physical_unmap (6, 4, 5) (0, 1, 2, 3) // physical_map (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 7 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157) // assertranges ... // // // ----- LAYER 8 // // // butterfly(0,2,1,2,1,0) // butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 8 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) // assertranges ... // // // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095) // assertranges ... 
// // // ----- LAYER 9 // // // butterfly(0,1,1,1,1,0) // butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5) // // // butterfly(2,3,1,1,4,1) // butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5) // // // ----- POSTCONDITIONS AFTER LAYER 9 // // transform size 4 // // transform indexing [6, 4] // // transforms per batch 32 // // batch indexing [0, 1, 2, 3, 5] // // total batch size 128 // // // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416) // assertranges ... // // // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416) // assertranges ... // // // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745) // assertranges ... // // // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745) // assertranges ... // stopbatch 128 // physical_unmap (6, 4) (0, 1, 2, 3, 5) // stopntt 512 // ----- codegen pass 2 // // startntt 512 // startbatch 512 // vector_butterfly 0 256 1 0 // vector_butterfly 128 384 1 0 // vector_butterfly 64 320 1 0 // vector_butterfly 192 448 1 0 // vector_reduce_ifreverse 0 // vector_reduce_ifreverse 256 // vector_butterfly 0 128 1 0 // vector_butterfly 64 192 1 0 // vector_butterfly 256 384 4 1 // vector_butterfly 320 448 4 1 // vector_reduce_ifforward 64 // vector_butterfly 0 64 1 0 // vector_butterfly 128 192 4 1 // vector_butterfly 256 320 8 1 // vector_butterfly 384 448 8 7 // vector_reduce 0 // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 256 320 
_mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 16 272 1 0 // vector_butterfly 144 400 1 0 // vector_butterfly 80 336 1 0 // vector_butterfly 208 464 1 0 // vector_reduce_ifreverse 16 // vector_reduce_ifreverse 272 // vector_butterfly 16 144 1 0 // vector_butterfly 80 208 1 0 // vector_butterfly 272 400 4 1 // vector_butterfly 336 464 4 1 // vector_reduce_ifforward 80 // vector_butterfly 16 80 1 0 // vector_butterfly 144 208 4 1 // vector_butterfly 272 336 8 1 // vector_butterfly 400 464 8 7 // vector_reduce 16 // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 32 288 1 0 // vector_butterfly 160 416 1 0 // vector_butterfly 96 352 1 0 // vector_butterfly 224 480 1 0 // vector_reduce_ifreverse 32 // vector_reduce_ifreverse 288 // vector_butterfly 32 160 1 0 // vector_butterfly 96 224 1 0 // vector_butterfly 288 416 4 1 // vector_butterfly 352 480 4 1 // vector_reduce_ifforward 96 // vector_butterfly 32 96 1 0 // vector_butterfly 160 224 4 1 // vector_butterfly 288 352 8 1 // 
vector_butterfly 416 480 8 7 // vector_reduce 32 // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // startbatch 512 // vector_butterfly 48 304 1 0 // vector_butterfly 176 432 1 0 // vector_butterfly 112 368 1 0 // vector_butterfly 240 496 1 0 // vector_reduce_ifreverse 48 // vector_reduce_ifreverse 304 // vector_butterfly 48 176 1 0 // vector_butterfly 112 240 1 0 // vector_butterfly 304 432 4 1 // vector_butterfly 368 496 4 1 // vector_reduce_ifforward 112 // vector_butterfly 48 112 1 0 // vector_butterfly 176 240 4 1 // vector_butterfly 304 368 8 1 // vector_butterfly 432 496 8 7 // vector_reduce 48 // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 // vector_permute 48 112 
_mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi // stopbatch 512 // doublereps // doublereps // startbatch 128 // vector_butterfly 0 32 1 0 // vector_butterfly 64 96 1 0 // vector_butterfly 16 48 1 0 // vector_butterfly 80 112 1 0 // vector_butterfly 0 16 1 0 // vector_butterfly 64 80 1 0 // vector_butterfly 32 48 4 1 // vector_butterfly 96 112 4 1 // vector_reduce 0 // vector_reduce 64 // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 // vector_butterfly 0 64 1 0 // vector_butterfly 32 96 1 0 // vector_butterfly 16 80 1 0 // vector_butterfly 48 112 1 0 // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 // stopbatch 128 // startbatch 128 // vector_butterfly 0 32 1 0 // vector_butterfly 16 48 1 0 // vector_butterfly 64 96 4 1 // vector_butterfly 80 112 4 1 // vector_reduce 0 // vector_reduce 16 // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // 
vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 // vector_butterfly 0 16 1 0 // vector_butterfly 64 80 1 0 // vector_butterfly 32 48 1 0 // vector_butterfly 96 112 1 0 // vector_butterfly 0 64 1 0 // vector_butterfly 32 96 1 0 // vector_butterfly 16 80 4 1 // vector_butterfly 48 112 4 1 // stopbatch 128 // stopntt 512 // startntt 512 static void ntt512(int16 *f, int reps, const int16 *qdata) { // startbatch 512 for (long long r = 0; r < reps; ++r) { // vector_butterfly 0 256 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256)); int16x16 b0 = add_x16(a0, a16); int16x16 b16 = sub_x16(a0, a16); // vector_butterfly 128 384 1 0 int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128)); int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384)); int16x16 b8 = add_x16(a8, a24); int16x16 b24 = sub_x16(a8, a24); // vector_butterfly 64 320 1 0 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320)); int16x16 b4 = add_x16(a4, a20); int16x16 b20 = sub_x16(a4, a20); // vector_butterfly 192 448 1 0 int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192)); int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448)); int16x16 b12 = add_x16(a12, a28); int16x16 b28 = sub_x16(a12, a28); // vector_reduce_ifreverse 0 // vector_reduce_ifreverse 256 // vector_butterfly 0 128 1 0 int16x16 c0 = add_x16(b0, b8); int16x16 c8 = sub_x16(b0, b8); // vector_butterfly 64 192 1 0 int16x16 c4 = add_x16(b4, b12); int16x16 c12 = sub_x16(b4, 
b12); // vector_butterfly 256 384 4 1 b24 = mulmod_scaled_x16(b24, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c16 = add_x16(b16, b24); int16x16 c24 = sub_x16(b16, b24); // vector_butterfly 320 448 4 1 b28 = mulmod_scaled_x16(b28, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c20 = add_x16(b20, b28); int16x16 c28 = sub_x16(b20, b28); // vector_reduce_ifforward 64 c4 = reduce_x16(c4, qdata); // vector_butterfly 0 64 1 0 int16x16 d0 = add_x16(c0, c4); int16x16 d4 = sub_x16(c0, c4); // vector_butterfly 128 192 4 1 c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 d8 = add_x16(c8, c12); int16x16 d12 = sub_x16(c8, c12); // vector_butterfly 256 320 8 1 c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); int16x16 d16 = add_x16(c16, c20); int16x16 d20 = sub_x16(c16, c20); // vector_butterfly 384 448 8 7 c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); int16x16 d24 = add_x16(c24, c28); int16x16 d28 = sub_x16(c24, c28); // vector_reduce 0 d0 = reduce_x16(d0, qdata); // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d4 = mulmod_scaled_x16(d4, precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d8 = mulmod_scaled_x16(d8, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d12 = mulmod_scaled_x16(d12, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d16 = mulmod_scaled_x16(d16, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 
11 12 13 14 15 d20 = mulmod_scaled_x16(d20, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d24 = mulmod_scaled_x16(d24, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 d28 = mulmod_scaled_x16(d28, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e0 = _mm256_permute2x128_si256_lo(d0, d4); int16x16 e4 = _mm256_permute2x128_si256_hi(d0, d4); // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e8 = _mm256_permute2x128_si256_lo(d8, d12); int16x16 e12 = _mm256_permute2x128_si256_hi(d8, d12); // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e16 = _mm256_permute2x128_si256_lo(d16, d20); int16x16 e20 = _mm256_permute2x128_si256_hi(d16, d20); // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e24 = _mm256_permute2x128_si256_lo(d24, d28); int16x16 e28 = _mm256_permute2x128_si256_hi(d24, d28); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f + 0), e0); _mm256_storeu_si256((int16x16 *) (f + 64), e4); _mm256_storeu_si256((int16x16 *) (f + 128), e8); _mm256_storeu_si256((int16x16 *) (f + 192), e12); _mm256_storeu_si256((int16x16 *) (f + 256), e16); _mm256_storeu_si256((int16x16 *) (f + 320), e20); _mm256_storeu_si256((int16x16 *) (f + 384), e24); _mm256_storeu_si256((int16x16 *) (f + 448), e28); f += 512; } f -= 512 * reps; // startbatch 512 for (long long r = 0; r < reps; ++r) { // vector_butterfly 16 272 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 
272)); int16x16 b1 = add_x16(a1, a17); int16x16 b17 = sub_x16(a1, a17); // vector_butterfly 144 400 1 0 int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144)); int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400)); int16x16 b9 = add_x16(a9, a25); int16x16 b25 = sub_x16(a9, a25); // vector_butterfly 80 336 1 0 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336)); int16x16 b5 = add_x16(a5, a21); int16x16 b21 = sub_x16(a5, a21); // vector_butterfly 208 464 1 0 int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208)); int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464)); int16x16 b13 = add_x16(a13, a29); int16x16 b29 = sub_x16(a13, a29); // vector_reduce_ifreverse 16 // vector_reduce_ifreverse 272 // vector_butterfly 16 144 1 0 int16x16 c1 = add_x16(b1, b9); int16x16 c9 = sub_x16(b1, b9); // vector_butterfly 80 208 1 0 int16x16 c5 = add_x16(b5, b13); int16x16 c13 = sub_x16(b5, b13); // vector_butterfly 272 400 4 1 b25 = mulmod_scaled_x16(b25, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c17 = add_x16(b17, b25); int16x16 c25 = sub_x16(b17, b25); // vector_butterfly 336 464 4 1 b29 = mulmod_scaled_x16(b29, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c21 = add_x16(b21, b29); int16x16 c29 = sub_x16(b21, b29); // vector_reduce_ifforward 80 c5 = reduce_x16(c5, qdata); // vector_butterfly 16 80 1 0 int16x16 d1 = add_x16(c1, c5); int16x16 d5 = sub_x16(c1, c5); // vector_butterfly 144 208 4 1 c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 d9 = add_x16(c9, c13); int16x16 d13 = sub_x16(c9, c13); // vector_butterfly 272 336 8 1 c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); int16x16 d17 = add_x16(c17, c21); int16x16 d21 = sub_x16(c17, c21); // vector_butterfly 400 464 8 7 c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); int16x16 d25 = add_x16(c25, c29); int16x16 d29 = 
sub_x16(c25, c29); // vector_reduce 16 d1 = reduce_x16(d1, qdata); // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d5 = mulmod_scaled_x16(d5, precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d9 = mulmod_scaled_x16(d9, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d13 = mulmod_scaled_x16(d13, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d17 = mulmod_scaled_x16(d17, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d21 = mulmod_scaled_x16(d21, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d25 = mulmod_scaled_x16(d25, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 d29 = mulmod_scaled_x16(d29, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e1 = _mm256_permute2x128_si256_lo(d1, d5); int16x16 e5 = _mm256_permute2x128_si256_hi(d1, d5); // vector_permute 144 208 
_mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e9 = _mm256_permute2x128_si256_lo(d9, d13); int16x16 e13 = _mm256_permute2x128_si256_hi(d9, d13); // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e17 = _mm256_permute2x128_si256_lo(d17, d21); int16x16 e21 = _mm256_permute2x128_si256_hi(d17, d21); // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e25 = _mm256_permute2x128_si256_lo(d25, d29); int16x16 e29 = _mm256_permute2x128_si256_hi(d25, d29); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f + 16), e1); _mm256_storeu_si256((int16x16 *) (f + 80), e5); _mm256_storeu_si256((int16x16 *) (f + 144), e9); _mm256_storeu_si256((int16x16 *) (f + 208), e13); _mm256_storeu_si256((int16x16 *) (f + 272), e17); _mm256_storeu_si256((int16x16 *) (f + 336), e21); _mm256_storeu_si256((int16x16 *) (f + 400), e25); _mm256_storeu_si256((int16x16 *) (f + 464), e29); f += 512; } f -= 512 * reps; // startbatch 512 for (long long r = 0; r < reps; ++r) { // vector_butterfly 32 288 1 0 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288)); int16x16 b2 = add_x16(a2, a18); int16x16 b18 = sub_x16(a2, a18); // vector_butterfly 160 416 1 0 int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160)); int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416)); int16x16 b10 = add_x16(a10, a26); int16x16 b26 = sub_x16(a10, a26); // vector_butterfly 96 352 1 0 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352)); int16x16 b6 = add_x16(a6, a22); int16x16 b22 = sub_x16(a6, a22); // vector_butterfly 224 480 1 0 int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224)); int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480)); int16x16 b14 = add_x16(a14, a30); int16x16 b30 = sub_x16(a14, a30); // vector_reduce_ifreverse 32 // vector_reduce_ifreverse 288 // vector_butterfly 32 160 
1 0 int16x16 c2 = add_x16(b2, b10); int16x16 c10 = sub_x16(b2, b10); // vector_butterfly 96 224 1 0 int16x16 c6 = add_x16(b6, b14); int16x16 c14 = sub_x16(b6, b14); // vector_butterfly 288 416 4 1 b26 = mulmod_scaled_x16(b26, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c18 = add_x16(b18, b26); int16x16 c26 = sub_x16(b18, b26); // vector_butterfly 352 480 4 1 b30 = mulmod_scaled_x16(b30, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c22 = add_x16(b22, b30); int16x16 c30 = sub_x16(b22, b30); // vector_reduce_ifforward 96 c6 = reduce_x16(c6, qdata); // vector_butterfly 32 96 1 0 int16x16 d2 = add_x16(c2, c6); int16x16 d6 = sub_x16(c2, c6); // vector_butterfly 160 224 4 1 c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 d10 = add_x16(c10, c14); int16x16 d14 = sub_x16(c10, c14); // vector_butterfly 288 352 8 1 c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); int16x16 d18 = add_x16(c18, c22); int16x16 d22 = sub_x16(c18, c22); // vector_butterfly 416 480 8 7 c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); int16x16 d26 = add_x16(c26, c30); int16x16 d30 = sub_x16(c26, c30); // vector_reduce 32 d2 = reduce_x16(d2, qdata); // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d6 = mulmod_scaled_x16(d6, precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d10 = mulmod_scaled_x16(d10, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d14 = mulmod_scaled_x16(d14, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // 
vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d18 = mulmod_scaled_x16(d18, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d22 = mulmod_scaled_x16(d22, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d26 = mulmod_scaled_x16(d26, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 d30 = mulmod_scaled_x16(d30, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e2 = _mm256_permute2x128_si256_lo(d2, d6); int16x16 e6 = _mm256_permute2x128_si256_hi(d2, d6); // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e10 = _mm256_permute2x128_si256_lo(d10, d14); int16x16 e14 = _mm256_permute2x128_si256_hi(d10, d14); // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e18 = _mm256_permute2x128_si256_lo(d18, d22); int16x16 e22 = _mm256_permute2x128_si256_hi(d18, d22); // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e26 = _mm256_permute2x128_si256_lo(d26, d30); int16x16 e30 = _mm256_permute2x128_si256_hi(d26, d30); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f + 32), e2); _mm256_storeu_si256((int16x16 *) (f + 96), e6); _mm256_storeu_si256((int16x16 *) (f + 160), e10); _mm256_storeu_si256((int16x16 *) (f + 224), e14); _mm256_storeu_si256((int16x16 *) (f + 288), 
e18); _mm256_storeu_si256((int16x16 *) (f + 352), e22); _mm256_storeu_si256((int16x16 *) (f + 416), e26); _mm256_storeu_si256((int16x16 *) (f + 480), e30); f += 512; } f -= 512 * reps; // startbatch 512 for (long long r = 0; r < reps; ++r) { // vector_butterfly 48 304 1 0 int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304)); int16x16 b3 = add_x16(a3, a19); int16x16 b19 = sub_x16(a3, a19); // vector_butterfly 176 432 1 0 int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176)); int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432)); int16x16 b11 = add_x16(a11, a27); int16x16 b27 = sub_x16(a11, a27); // vector_butterfly 112 368 1 0 int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368)); int16x16 b7 = add_x16(a7, a23); int16x16 b23 = sub_x16(a7, a23); // vector_butterfly 240 496 1 0 int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240)); int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496)); int16x16 b15 = add_x16(a15, a31); int16x16 b31 = sub_x16(a15, a31); // vector_reduce_ifreverse 48 // vector_reduce_ifreverse 304 // vector_butterfly 48 176 1 0 int16x16 c3 = add_x16(b3, b11); int16x16 c11 = sub_x16(b3, b11); // vector_butterfly 112 240 1 0 int16x16 c7 = add_x16(b7, b15); int16x16 c15 = sub_x16(b7, b15); // vector_butterfly 304 432 4 1 b27 = mulmod_scaled_x16(b27, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c19 = add_x16(b19, b27); int16x16 c27 = sub_x16(b19, b27); // vector_butterfly 368 496 4 1 b31 = mulmod_scaled_x16(b31, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c23 = add_x16(b23, b31); int16x16 c31 = sub_x16(b23, b31); // vector_reduce_ifforward 112 c7 = reduce_x16(c7, qdata); // vector_butterfly 48 112 1 0 int16x16 d3 = add_x16(c3, c7); int16x16 d7 = sub_x16(c3, c7); // vector_butterfly 176 240 4 1 c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 d11 = 
add_x16(c11, c15); int16x16 d15 = sub_x16(c11, c15); // vector_butterfly 304 368 8 1 c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); int16x16 d19 = add_x16(c19, c23); int16x16 d23 = sub_x16(c19, c23); // vector_butterfly 432 496 8 7 c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); int16x16 d27 = add_x16(c27, c31); int16x16 d31 = sub_x16(c27, c31); // vector_reduce 48 d3 = reduce_x16(d3, qdata); // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d7 = mulmod_scaled_x16(d7, precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d11 = mulmod_scaled_x16(d11, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d15 = mulmod_scaled_x16(d15, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d19 = mulmod_scaled_x16(d19, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d23 = mulmod_scaled_x16(d23, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d27 = mulmod_scaled_x16(d27, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 d31 
= mulmod_scaled_x16(d31, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e3 = _mm256_permute2x128_si256_lo(d3, d7); int16x16 e7 = _mm256_permute2x128_si256_hi(d3, d7); // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e11 = _mm256_permute2x128_si256_lo(d11, d15); int16x16 e15 = _mm256_permute2x128_si256_hi(d11, d15); // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e19 = _mm256_permute2x128_si256_lo(d19, d23); int16x16 e23 = _mm256_permute2x128_si256_hi(d19, d23); // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 e27 = _mm256_permute2x128_si256_lo(d27, d31); int16x16 e31 = _mm256_permute2x128_si256_hi(d27, d31); // stopbatch 512 _mm256_storeu_si256((int16x16 *) (f + 48), e3); _mm256_storeu_si256((int16x16 *) (f + 112), e7); _mm256_storeu_si256((int16x16 *) (f + 176), e11); _mm256_storeu_si256((int16x16 *) (f + 240), e15); _mm256_storeu_si256((int16x16 *) (f + 304), e19); _mm256_storeu_si256((int16x16 *) (f + 368), e23); _mm256_storeu_si256((int16x16 *) (f + 432), e27); _mm256_storeu_si256((int16x16 *) (f + 496), e31); f += 512; } f -= 512 * reps; // doublereps reps *= 2; // doublereps reps *= 2; // startbatch 128 for (long long r = 0; r < reps; ++r) { // vector_butterfly 0 32 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); int16x16 b0 = add_x16(a0, a2); int16x16 b2 = sub_x16(a0, a2); // vector_butterfly 64 96 1 0 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); int16x16 b4 = add_x16(a4, a6); int16x16 b6 = sub_x16(a4, a6); // vector_butterfly 16 48 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); int16x16 a3 = 
_mm256_loadu_si256((int16x16 *) (f + 48)); int16x16 b1 = add_x16(a1, a3); int16x16 b3 = sub_x16(a1, a3); // vector_butterfly 80 112 1 0 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); int16x16 b5 = add_x16(a5, a7); int16x16 b7 = sub_x16(a5, a7); // vector_butterfly 0 16 1 0 int16x16 c0 = add_x16(b0, b1); int16x16 c1 = sub_x16(b0, b1); // vector_butterfly 64 80 1 0 int16x16 c4 = add_x16(b4, b5); int16x16 c5 = sub_x16(b4, b5); // vector_butterfly 32 48 4 1 b3 = mulmod_scaled_x16(b3, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c2 = add_x16(b2, b3); int16x16 c3 = sub_x16(b2, b3); // vector_butterfly 96 112 4 1 b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 c6 = add_x16(b6, b7); int16x16 c7 = sub_x16(b6, b7); // vector_reduce 0 c0 = reduce_x16(c0, qdata); // vector_reduce 64 c4 = reduce_x16(c4, qdata); // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c1 = mulmod_scaled_x16(c1, precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 c5 = mulmod_scaled_x16(c5, precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c2 = mulmod_scaled_x16(c2, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 c6 = mulmod_scaled_x16(c6, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 c3 = mulmod_scaled_x16(c3, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 
11 12 13 14 15 c7 = mulmod_scaled_x16(c7, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d0 = _mm256_unpacklo_epi16(c0, c2); int16x16 d2 = _mm256_unpackhi_epi16(c0, c2); // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d1 = _mm256_unpacklo_epi16(c1, c3); int16x16 d3 = _mm256_unpackhi_epi16(c1, c3); // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d4 = _mm256_unpacklo_epi16(c4, c6); int16x16 d6 = _mm256_unpackhi_epi16(c4, c6); // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 d5 = _mm256_unpacklo_epi16(c5, c7); int16x16 d7 = _mm256_unpackhi_epi16(c5, c7); // vector_butterfly 0 64 1 0 int16x16 e0 = add_x16(d0, d4); int16x16 e4 = sub_x16(d0, d4); // vector_butterfly 32 96 1 0 int16x16 e2 = add_x16(d2, d6); int16x16 e6 = sub_x16(d2, d6); // vector_butterfly 16 80 1 0 int16x16 e1 = add_x16(d1, d5); int16x16 e5 = sub_x16(d1, d5); // vector_butterfly 48 112 1 0 int16x16 e3 = add_x16(d3, d7); int16x16 e7 = sub_x16(d3, d7); // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f0 = _mm256_unpacklo_epi32(e0, e1); int16x16 f1 = _mm256_unpackhi_epi32(e0, e1); // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f2 = _mm256_unpacklo_epi32(e2, e3); int16x16 f3 = _mm256_unpackhi_epi32(e2, e3); // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f4 = _mm256_unpacklo_epi32(e4, e5); int16x16 f5 = _mm256_unpackhi_epi32(e4, e5); // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 f6 = _mm256_unpacklo_epi32(e6, e7); int16x16 f7 = _mm256_unpackhi_epi32(e6, e7); // stopbatch 128 _mm256_storeu_si256((int16x16 *) (f + 0), f0); _mm256_storeu_si256((int16x16 *) (f + 16), f1); _mm256_storeu_si256((int16x16 *) (f + 32), f2); _mm256_storeu_si256((int16x16 *) (f 
+ 48), f3); _mm256_storeu_si256((int16x16 *) (f + 64), f4); _mm256_storeu_si256((int16x16 *) (f + 80), f5); _mm256_storeu_si256((int16x16 *) (f + 96), f6); _mm256_storeu_si256((int16x16 *) (f + 112), f7); f += 128; } f -= 128 * reps; // startbatch 128 for (long long r = 0; r < reps; ++r) { // vector_butterfly 0 32 1 0 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); int16x16 b0 = add_x16(a0, a2); int16x16 b2 = sub_x16(a0, a2); // vector_butterfly 16 48 1 0 int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); int16x16 b1 = add_x16(a1, a3); int16x16 b3 = sub_x16(a1, a3); // vector_butterfly 64 96 4 1 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); a6 = mulmod_scaled_x16(a6, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 b4 = add_x16(a4, a6); int16x16 b6 = sub_x16(a4, a6); // vector_butterfly 80 112 4 1 int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); a7 = mulmod_scaled_x16(a7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 b5 = add_x16(a5, a7); int16x16 b7 = sub_x16(a5, a7); // vector_reduce 0 b0 = reduce_x16(b0, qdata); // vector_reduce 16 b1 = reduce_x16(b1, qdata); // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b2 = mulmod_scaled_x16(b2, precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b3 = mulmod_scaled_x16(b3, precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b4 = mulmod_scaled_x16(b4, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b5 = 
mulmod_scaled_x16(b5, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 b6 = mulmod_scaled_x16(b6, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 b7 = mulmod_scaled_x16(b7, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c0 = _mm256_unpacklo_epi64(b0, b4); int16x16 c4 = _mm256_unpackhi_epi64(b0, b4); // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c1 = _mm256_unpacklo_epi64(b1, b5); int16x16 c5 = _mm256_unpackhi_epi64(b1, b5); // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c2 = _mm256_unpacklo_epi64(b2, b6); int16x16 c6 = _mm256_unpackhi_epi64(b2, b6); // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 c3 = _mm256_unpacklo_epi64(b3, b7); int16x16 c7 = _mm256_unpackhi_epi64(b3, b7); // vector_butterfly 0 16 1 0 int16x16 d0 = add_x16(c0, c1); int16x16 d1 = sub_x16(c0, c1); // vector_butterfly 64 80 1 0 int16x16 d4 = add_x16(c4, c5); int16x16 d5 = sub_x16(c4, c5); // vector_butterfly 32 48 1 0 int16x16 d2 = add_x16(c2, c3); int16x16 d3 = sub_x16(c2, c3); // vector_butterfly 96 112 1 0 int16x16 d6 = add_x16(c6, c7); int16x16 d7 = sub_x16(c6, c7); // vector_butterfly 0 64 1 0 int16x16 e0 = add_x16(d0, d4); int16x16 e4 = sub_x16(d0, d4); // vector_butterfly 32 96 1 0 int16x16 e2 = add_x16(d2, d6); int16x16 e6 = sub_x16(d2, d6); // vector_butterfly 16 80 4 1 d5 = mulmod_scaled_x16(d5, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 e1 = add_x16(d1, d5); int16x16 e5 = sub_x16(d1, d5); // vector_butterfly 48 112 4 1 d7 = mulmod_scaled_x16(d7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata); int16x16 e3 = add_x16(d3, 
d7);
int16x16 e7 = sub_x16(d3, d7);
// stopbatch 128
_mm256_storeu_si256((int16x16 *) (f + 0), e0);
_mm256_storeu_si256((int16x16 *) (f + 16), e1);
_mm256_storeu_si256((int16x16 *) (f + 32), e2);
_mm256_storeu_si256((int16x16 *) (f + 48), e3);
_mm256_storeu_si256((int16x16 *) (f + 64), e4);
_mm256_storeu_si256((int16x16 *) (f + 80), e5);
_mm256_storeu_si256((int16x16 *) (f + 96), e6);
_mm256_storeu_si256((int16x16 *) (f + 112), e7);
f += 128;
}
// f -= 128*reps;
// stopntt 512
}

// NOTE(review): this file is auto-generated (see header: "do not edit");
// the comments below were added for review only and would be lost on
// regeneration.

// Public entry point: forward size-512 NTT on `reps` consecutive
// 512-coefficient batches of f, using the precomputed constant table for
// modulus 7681 (qdata_7681 holds the scaled twiddles and their q^-1
// companions consumed by mulmod_scaled_x16/reduce_x16 inside ntt512).
void PQCLEAN_NTRULPR1013_AVX2_ntt512_7681(int16 *f, int reps) {
    ntt512(f, reps, qdata_7681.data);
}

// Same transform for modulus 10753; only the constant table differs.
// (qdata_10753 is defined earlier in this file, outside this excerpt.)
void PQCLEAN_NTRULPR1013_AVX2_ntt512_10753(int16 *f, int reps) {
    ntt512(f, reps, qdata_10753.data);
}

// inv stopntt 512
// Inverse of ntt512: the generator emits the forward stages in reverse
// order, with each "inv" step undoing the corresponding forward step
// (e.g. the forward butterflies that multiplied by scaledzeta_x16_4_1
// are undone here with scaledzeta_x16_4_3).
static void invntt512(int16 *f, int reps, const int16 *qdata) {
    reps *= 4;
    // inv stopbatch 128
    for (long long r = 0; r < reps; ++r) {
        // inv vector_butterfly 48 112 4 1
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b3 = add_x16(a3, a7);
        int16x16 b7 = sub_x16(a3, a7);
        b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 16 80 4 1
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 b1 = add_x16(a1, a5);
        int16x16 b5 = sub_x16(a1, a5);
        b5 = mulmod_scaled_x16(b5, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 96 1 0
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 b2 = add_x16(a2, a6);
        int16x16 b6 = sub_x16(a2, a6);
        // inv vector_butterfly 0 64 1 0
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 b0 = add_x16(a0, a4);
        int16x16 b4 = sub_x16(a0, a4);
        // inv vector_butterfly 96 112 1 0
        int16x16 c6 = add_x16(b6, b7);
        int16x16 c7 = sub_x16(b6, b7);
        // inv vector_butterfly 32 48 1 0
        int16x16 c2 = add_x16(b2, b3);
int16x16 c3 = sub_x16(b2, b3); // inv vector_butterfly 64 80 1 0 int16x16 c4 = add_x16(b4, b5); int16x16 c5 = sub_x16(b4, b5); // inv vector_butterfly 0 16 1 0 int16x16 c0 = add_x16(b0, b1); int16x16 c1 = sub_x16(b0, b1); // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d3 = _mm256_unpacklo_epi64(c3, c7); int16x16 d7 = _mm256_unpackhi_epi64(c3, c7); // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d2 = _mm256_unpacklo_epi64(c2, c6); int16x16 d6 = _mm256_unpackhi_epi64(c2, c6); // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d1 = _mm256_unpacklo_epi64(c1, c5); int16x16 d5 = _mm256_unpackhi_epi64(c1, c5); // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64 int16x16 d0 = _mm256_unpacklo_epi64(c0, c4); int16x16 d4 = _mm256_unpackhi_epi64(c0, c4); // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d7 = mulmod_scaled_x16(d7, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d6 = mulmod_scaled_x16(d6, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d5 = mulmod_scaled_x16(d5, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d4 = mulmod_scaled_x16(d4, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3 d3 = mulmod_scaled_x16(d3, precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata); // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 d2 = mulmod_scaled_x16(d2, precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, 
qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata); // inv vector_reduce 16 d1 = reduce_x16(d1, qdata); // inv vector_reduce 0 d0 = reduce_x16(d0, qdata); // inv vector_butterfly 80 112 4 1 int16x16 e5 = add_x16(d5, d7); int16x16 e7 = sub_x16(d5, d7); e7 = mulmod_scaled_x16(e7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 64 96 4 1 int16x16 e4 = add_x16(d4, d6); int16x16 e6 = sub_x16(d4, d6); e6 = mulmod_scaled_x16(e6, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 16 48 1 0 int16x16 e1 = add_x16(d1, d3); int16x16 e3 = sub_x16(d1, d3); // inv vector_butterfly 0 32 1 0 int16x16 e0 = add_x16(d0, d2); int16x16 e2 = sub_x16(d0, d2); // inv startbatch 128 _mm256_storeu_si256((int16x16 *) (f + 0), e0); _mm256_storeu_si256((int16x16 *) (f + 16), e1); _mm256_storeu_si256((int16x16 *) (f + 32), e2); _mm256_storeu_si256((int16x16 *) (f + 48), e3); _mm256_storeu_si256((int16x16 *) (f + 64), e4); _mm256_storeu_si256((int16x16 *) (f + 80), e5); _mm256_storeu_si256((int16x16 *) (f + 96), e6); _mm256_storeu_si256((int16x16 *) (f + 112), e7); f += 128; } f -= 128 * reps; // inv stopbatch 128 for (long long r = 0; r < reps; ++r) { // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); int16x16 b6 = _mm256_unpacklo_epi32(a6, a7); int16x16 b7 = _mm256_unpackhi_epi32(a6, a7); int16x16 c6 = _mm256_unpacklo_epi32(b6, b7); int16x16 c7 = _mm256_unpackhi_epi32(b6, b7); // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); int16x16 b4 = _mm256_unpacklo_epi32(a4, a5); int16x16 b5 = _mm256_unpackhi_epi32(a4, a5); int16x16 c4 = _mm256_unpacklo_epi32(b4, b5); int16x16 c5 = _mm256_unpackhi_epi32(b4, b5); // inv vector_permute 32 48 _mm256_unpacklo_epi32 
_mm256_unpackhi_epi32 int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); int16x16 b2 = _mm256_unpacklo_epi32(a2, a3); int16x16 b3 = _mm256_unpackhi_epi32(a2, a3); int16x16 c2 = _mm256_unpacklo_epi32(b2, b3); int16x16 c3 = _mm256_unpackhi_epi32(b2, b3); // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32 int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); int16x16 b0 = _mm256_unpacklo_epi32(a0, a1); int16x16 b1 = _mm256_unpackhi_epi32(a0, a1); int16x16 c0 = _mm256_unpacklo_epi32(b0, b1); int16x16 c1 = _mm256_unpackhi_epi32(b0, b1); // inv vector_butterfly 48 112 1 0 int16x16 d3 = add_x16(c3, c7); int16x16 d7 = sub_x16(c3, c7); // inv vector_butterfly 16 80 1 0 int16x16 d1 = add_x16(c1, c5); int16x16 d5 = sub_x16(c1, c5); // inv vector_butterfly 32 96 1 0 int16x16 d2 = add_x16(c2, c6); int16x16 d6 = sub_x16(c2, c6); // inv vector_butterfly 0 64 1 0 int16x16 d0 = add_x16(c0, c4); int16x16 d4 = sub_x16(c0, c4); // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e5 = _mm256_unpacklo_epi16(d5, d7); int16x16 e7 = _mm256_unpackhi_epi16(d5, d7); int16x16 f5 = _mm256_unpacklo_epi16(e5, e7); int16x16 f7 = _mm256_unpackhi_epi16(e5, e7); int16x16 g5 = _mm256_unpacklo_epi16(f5, f7); int16x16 g7 = _mm256_unpackhi_epi16(f5, f7); // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e4 = _mm256_unpacklo_epi16(d4, d6); int16x16 e6 = _mm256_unpackhi_epi16(d4, d6); int16x16 f4 = _mm256_unpacklo_epi16(e4, e6); int16x16 f6 = _mm256_unpackhi_epi16(e4, e6); int16x16 g4 = _mm256_unpacklo_epi16(f4, f6); int16x16 g6 = _mm256_unpackhi_epi16(f4, f6); // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e1 = _mm256_unpacklo_epi16(d1, d3); int16x16 e3 = _mm256_unpackhi_epi16(d1, d3); int16x16 f1 = _mm256_unpacklo_epi16(e1, e3); int16x16 f3 = _mm256_unpackhi_epi16(e1, 
e3); int16x16 g1 = _mm256_unpacklo_epi16(f1, f3); int16x16 g3 = _mm256_unpackhi_epi16(f1, f3); // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16 int16x16 e0 = _mm256_unpacklo_epi16(d0, d2); int16x16 e2 = _mm256_unpackhi_epi16(d0, d2); int16x16 f0 = _mm256_unpacklo_epi16(e0, e2); int16x16 f2 = _mm256_unpackhi_epi16(e0, e2); int16x16 g0 = _mm256_unpacklo_epi16(f0, f2); int16x16 g2 = _mm256_unpackhi_epi16(f0, f2); // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g7 = mulmod_scaled_x16(g7, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g3 = mulmod_scaled_x16(g3, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g6 = mulmod_scaled_x16(g6, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g2 = mulmod_scaled_x16(g2, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 g5 = mulmod_scaled_x16(g5, precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 g1 = mulmod_scaled_x16(g1, precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata); // inv vector_reduce 64 g4 = reduce_x16(g4, qdata); // inv vector_reduce 0 g0 = reduce_x16(g0, qdata); // inv vector_butterfly 96 112 4 1 int16x16 h6 = add_x16(g6, g7); int16x16 h7 = sub_x16(g6, g7); h7 = mulmod_scaled_x16(h7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv 
vector_butterfly 32 48 4 1 int16x16 h2 = add_x16(g2, g3); int16x16 h3 = sub_x16(g2, g3); h3 = mulmod_scaled_x16(h3, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 64 80 1 0 int16x16 h4 = add_x16(g4, g5); int16x16 h5 = sub_x16(g4, g5); // inv vector_butterfly 0 16 1 0 int16x16 h0 = add_x16(g0, g1); int16x16 h1 = sub_x16(g0, g1); // inv vector_butterfly 80 112 1 0 int16x16 i5 = add_x16(h5, h7); int16x16 i7 = sub_x16(h5, h7); // inv vector_butterfly 16 48 1 0 int16x16 i1 = add_x16(h1, h3); int16x16 i3 = sub_x16(h1, h3); // inv vector_butterfly 64 96 1 0 int16x16 i4 = add_x16(h4, h6); int16x16 i6 = sub_x16(h4, h6); // inv vector_butterfly 0 32 1 0 int16x16 i0 = add_x16(h0, h2); int16x16 i2 = sub_x16(h0, h2); // inv startbatch 128 _mm256_storeu_si256((int16x16 *) (f + 0), i0); _mm256_storeu_si256((int16x16 *) (f + 16), i1); _mm256_storeu_si256((int16x16 *) (f + 32), i2); _mm256_storeu_si256((int16x16 *) (f + 48), i3); _mm256_storeu_si256((int16x16 *) (f + 64), i4); _mm256_storeu_si256((int16x16 *) (f + 80), i5); _mm256_storeu_si256((int16x16 *) (f + 96), i6); _mm256_storeu_si256((int16x16 *) (f + 112), i7); f += 128; } f -= 128 * reps; // inv doublereps reps /= 2; // inv doublereps reps /= 2; // inv stopbatch 512 for (long long r = 0; r < reps; ++r) { // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432)); int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496)); int16x16 b27 = _mm256_permute2x128_si256_lo(a27, a31); int16x16 b31 = _mm256_permute2x128_si256_hi(a27, a31); // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304)); int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368)); int16x16 b19 = _mm256_permute2x128_si256_lo(a19, a23); int16x16 b23 = _mm256_permute2x128_si256_hi(a19, a23); // inv vector_permute 176 240 _mm256_permute2x128_si256_lo 
_mm256_permute2x128_si256_hi int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176)); int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240)); int16x16 b11 = _mm256_permute2x128_si256_lo(a11, a15); int16x16 b15 = _mm256_permute2x128_si256_hi(a11, a15); // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48)); int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112)); int16x16 b3 = _mm256_permute2x128_si256_lo(a3, a7); int16x16 b7 = _mm256_permute2x128_si256_hi(a3, a7); // inv vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b31 = mulmod_scaled_x16(b31, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b27 = mulmod_scaled_x16(b27, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b23 = mulmod_scaled_x16(b23, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b19 = mulmod_scaled_x16(b19, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b15 = mulmod_scaled_x16(b15, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b11 = mulmod_scaled_x16(b11, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, 
qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 b7 = mulmod_scaled_x16(b7, precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata); // inv vector_reduce 48 b3 = reduce_x16(b3, qdata); // inv vector_butterfly 432 496 8 7 int16x16 c27 = add_x16(b27, b31); int16x16 c31 = sub_x16(b27, b31); c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); // inv vector_butterfly 304 368 8 1 int16x16 c19 = add_x16(b19, b23); int16x16 c23 = sub_x16(b19, b23); c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); // inv vector_butterfly 176 240 4 1 int16x16 c11 = add_x16(b11, b15); int16x16 c15 = sub_x16(b11, b15); c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 48 112 1 0 int16x16 c3 = add_x16(b3, b7); int16x16 c7 = sub_x16(b3, b7); // inv vector_reduce_ifforward 112 // inv vector_butterfly 368 496 4 1 int16x16 d23 = add_x16(c23, c31); int16x16 d31 = sub_x16(c23, c31); d31 = mulmod_scaled_x16(d31, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 304 432 4 1 int16x16 d19 = add_x16(c19, c27); int16x16 d27 = sub_x16(c19, c27); d27 = mulmod_scaled_x16(d27, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 112 240 1 0 int16x16 d7 = add_x16(c7, c15); int16x16 d15 = sub_x16(c7, c15); // inv vector_butterfly 48 176 1 0 int16x16 d3 = add_x16(c3, c11); int16x16 d11 = sub_x16(c3, c11); // inv vector_reduce_ifreverse 304 d19 = reduce_x16(d19, qdata); // inv vector_reduce_ifreverse 48 d3 = reduce_x16(d3, qdata); // inv vector_butterfly 240 496 1 0 int16x16 e15 = add_x16(d15, d31); int16x16 e31 = sub_x16(d15, d31); // inv vector_butterfly 112 368 1 0 int16x16 e7 = add_x16(d7, d23); int16x16 e23 = sub_x16(d7, d23); // inv vector_butterfly 176 432 1 
0 int16x16 e11 = add_x16(d11, d27); int16x16 e27 = sub_x16(d11, d27); // inv vector_butterfly 48 304 1 0 int16x16 e3 = add_x16(d3, d19); int16x16 e19 = sub_x16(d3, d19); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f + 48), e3); _mm256_storeu_si256((int16x16 *) (f + 112), e7); _mm256_storeu_si256((int16x16 *) (f + 176), e11); _mm256_storeu_si256((int16x16 *) (f + 240), e15); _mm256_storeu_si256((int16x16 *) (f + 304), e19); _mm256_storeu_si256((int16x16 *) (f + 368), e23); _mm256_storeu_si256((int16x16 *) (f + 432), e27); _mm256_storeu_si256((int16x16 *) (f + 496), e31); f += 512; } f -= 512 * reps; // inv stopbatch 512 for (long long r = 0; r < reps; ++r) { // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416)); int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480)); int16x16 b26 = _mm256_permute2x128_si256_lo(a26, a30); int16x16 b30 = _mm256_permute2x128_si256_hi(a26, a30); // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288)); int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352)); int16x16 b18 = _mm256_permute2x128_si256_lo(a18, a22); int16x16 b22 = _mm256_permute2x128_si256_hi(a18, a22); // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160)); int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224)); int16x16 b10 = _mm256_permute2x128_si256_lo(a10, a14); int16x16 b14 = _mm256_permute2x128_si256_hi(a10, a14); // inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32)); int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96)); int16x16 b2 = _mm256_permute2x128_si256_lo(a2, a6); int16x16 b6 = _mm256_permute2x128_si256_hi(a2, a6); // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 
44 45 46 47 b30 = mulmod_scaled_x16(b30, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b26 = mulmod_scaled_x16(b26, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b22 = mulmod_scaled_x16(b22, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b18 = mulmod_scaled_x16(b18, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b14 = mulmod_scaled_x16(b14, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b10 = mulmod_scaled_x16(b10, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 b6 = mulmod_scaled_x16(b6, precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata); // inv vector_reduce 32 b2 = reduce_x16(b2, qdata); // inv vector_butterfly 416 480 8 7 int16x16 c26 = add_x16(b26, b30); int16x16 c30 = sub_x16(b26, b30); c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); // inv vector_butterfly 288 352 8 1 int16x16 c18 = add_x16(b18, b22); int16x16 c22 = 
sub_x16(b18, b22); c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); // inv vector_butterfly 160 224 4 1 int16x16 c10 = add_x16(b10, b14); int16x16 c14 = sub_x16(b10, b14); c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 32 96 1 0 int16x16 c2 = add_x16(b2, b6); int16x16 c6 = sub_x16(b2, b6); // inv vector_reduce_ifforward 96 // inv vector_butterfly 352 480 4 1 int16x16 d22 = add_x16(c22, c30); int16x16 d30 = sub_x16(c22, c30); d30 = mulmod_scaled_x16(d30, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 288 416 4 1 int16x16 d18 = add_x16(c18, c26); int16x16 d26 = sub_x16(c18, c26); d26 = mulmod_scaled_x16(d26, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 96 224 1 0 int16x16 d6 = add_x16(c6, c14); int16x16 d14 = sub_x16(c6, c14); // inv vector_butterfly 32 160 1 0 int16x16 d2 = add_x16(c2, c10); int16x16 d10 = sub_x16(c2, c10); // inv vector_reduce_ifreverse 288 d18 = reduce_x16(d18, qdata); // inv vector_reduce_ifreverse 32 d2 = reduce_x16(d2, qdata); // inv vector_butterfly 224 480 1 0 int16x16 e14 = add_x16(d14, d30); int16x16 e30 = sub_x16(d14, d30); // inv vector_butterfly 96 352 1 0 int16x16 e6 = add_x16(d6, d22); int16x16 e22 = sub_x16(d6, d22); // inv vector_butterfly 160 416 1 0 int16x16 e10 = add_x16(d10, d26); int16x16 e26 = sub_x16(d10, d26); // inv vector_butterfly 32 288 1 0 int16x16 e2 = add_x16(d2, d18); int16x16 e18 = sub_x16(d2, d18); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f + 32), e2); _mm256_storeu_si256((int16x16 *) (f + 96), e6); _mm256_storeu_si256((int16x16 *) (f + 160), e10); _mm256_storeu_si256((int16x16 *) (f + 224), e14); _mm256_storeu_si256((int16x16 *) (f + 288), e18); _mm256_storeu_si256((int16x16 *) (f + 352), e22); _mm256_storeu_si256((int16x16 *) (f + 416), e26); _mm256_storeu_si256((int16x16 *) (f + 480), e30); f += 512; } f -= 512 * reps; // inv stopbatch 512 for (long 
long r = 0; r < reps; ++r) { // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400)); int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464)); int16x16 b25 = _mm256_permute2x128_si256_lo(a25, a29); int16x16 b29 = _mm256_permute2x128_si256_hi(a25, a29); // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272)); int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336)); int16x16 b17 = _mm256_permute2x128_si256_lo(a17, a21); int16x16 b21 = _mm256_permute2x128_si256_hi(a17, a21); // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144)); int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208)); int16x16 b9 = _mm256_permute2x128_si256_lo(a9, a13); int16x16 b13 = _mm256_permute2x128_si256_hi(a9, a13); // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16)); int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80)); int16x16 b1 = _mm256_permute2x128_si256_lo(a1, a5); int16x16 b5 = _mm256_permute2x128_si256_hi(a1, a5); // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b29 = mulmod_scaled_x16(b29, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b25 = mulmod_scaled_x16(b25, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b21 = mulmod_scaled_x16(b21, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, 
qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b17 = mulmod_scaled_x16(b17, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b13 = mulmod_scaled_x16(b13, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b9 = mulmod_scaled_x16(b9, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 b5 = mulmod_scaled_x16(b5, precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata); // inv vector_reduce 16 b1 = reduce_x16(b1, qdata); // inv vector_butterfly 400 464 8 7 int16x16 c25 = add_x16(b25, b29); int16x16 c29 = sub_x16(b25, b29); c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); // inv vector_butterfly 272 336 8 1 int16x16 c17 = add_x16(b17, b21); int16x16 c21 = sub_x16(b17, b21); c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); // inv vector_butterfly 144 208 4 1 int16x16 c9 = add_x16(b9, b13); int16x16 c13 = sub_x16(b9, b13); c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 16 80 1 0 int16x16 c1 = add_x16(b1, b5); int16x16 c5 = sub_x16(b1, b5); // inv vector_reduce_ifforward 80 // inv vector_butterfly 336 464 4 1 int16x16 d21 = add_x16(c21, c29); int16x16 d29 = sub_x16(c21, c29); d29 = mulmod_scaled_x16(d29, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv 
vector_butterfly 272 400 4 1 int16x16 d17 = add_x16(c17, c25); int16x16 d25 = sub_x16(c17, c25); d25 = mulmod_scaled_x16(d25, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 80 208 1 0 int16x16 d5 = add_x16(c5, c13); int16x16 d13 = sub_x16(c5, c13); // inv vector_butterfly 16 144 1 0 int16x16 d1 = add_x16(c1, c9); int16x16 d9 = sub_x16(c1, c9); // inv vector_reduce_ifreverse 272 d17 = reduce_x16(d17, qdata); // inv vector_reduce_ifreverse 16 d1 = reduce_x16(d1, qdata); // inv vector_butterfly 208 464 1 0 int16x16 e13 = add_x16(d13, d29); int16x16 e29 = sub_x16(d13, d29); // inv vector_butterfly 80 336 1 0 int16x16 e5 = add_x16(d5, d21); int16x16 e21 = sub_x16(d5, d21); // inv vector_butterfly 144 400 1 0 int16x16 e9 = add_x16(d9, d25); int16x16 e25 = sub_x16(d9, d25); // inv vector_butterfly 16 272 1 0 int16x16 e1 = add_x16(d1, d17); int16x16 e17 = sub_x16(d1, d17); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f + 16), e1); _mm256_storeu_si256((int16x16 *) (f + 80), e5); _mm256_storeu_si256((int16x16 *) (f + 144), e9); _mm256_storeu_si256((int16x16 *) (f + 208), e13); _mm256_storeu_si256((int16x16 *) (f + 272), e17); _mm256_storeu_si256((int16x16 *) (f + 336), e21); _mm256_storeu_si256((int16x16 *) (f + 400), e25); _mm256_storeu_si256((int16x16 *) (f + 464), e29); f += 512; } f -= 512 * reps; // inv stopbatch 512 for (long long r = 0; r < reps; ++r) { // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384)); int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448)); int16x16 b24 = _mm256_permute2x128_si256_lo(a24, a28); int16x16 b28 = _mm256_permute2x128_si256_hi(a24, a28); // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256)); int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320)); int16x16 b16 = _mm256_permute2x128_si256_lo(a16, a20); int16x16 
b20 = _mm256_permute2x128_si256_hi(a16, a20); // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128)); int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192)); int16x16 b8 = _mm256_permute2x128_si256_lo(a8, a12); int16x16 b12 = _mm256_permute2x128_si256_hi(a8, a12); // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0)); int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64)); int16x16 b0 = _mm256_permute2x128_si256_lo(a0, a4); int16x16 b4 = _mm256_permute2x128_si256_hi(a0, a4); // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b28 = mulmod_scaled_x16(b28, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b24 = mulmod_scaled_x16(b24, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b20 = mulmod_scaled_x16(b20, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b16 = mulmod_scaled_x16(b16, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b12 = mulmod_scaled_x16(b12, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 b8 = mulmod_scaled_x16(b8, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_twist 64 128 1 0 1 
2 3 4 5 6 7 8 9 10 11 12 13 14 15 b4 = mulmod_scaled_x16(b4, precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata); // inv vector_reduce 0 b0 = reduce_x16(b0, qdata); // inv vector_butterfly 384 448 8 7 int16x16 c24 = add_x16(b24, b28); int16x16 c28 = sub_x16(b24, b28); c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata); // inv vector_butterfly 256 320 8 1 int16x16 c16 = add_x16(b16, b20); int16x16 c20 = sub_x16(b16, b20); c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata); // inv vector_butterfly 128 192 4 1 int16x16 c8 = add_x16(b8, b12); int16x16 c12 = sub_x16(b8, b12); c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 0 64 1 0 int16x16 c0 = add_x16(b0, b4); int16x16 c4 = sub_x16(b0, b4); // inv vector_reduce_ifforward 64 // inv vector_butterfly 320 448 4 1 int16x16 d20 = add_x16(c20, c28); int16x16 d28 = sub_x16(c20, c28); d28 = mulmod_scaled_x16(d28, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 256 384 4 1 int16x16 d16 = add_x16(c16, c24); int16x16 d24 = sub_x16(c16, c24); d24 = mulmod_scaled_x16(d24, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata); // inv vector_butterfly 64 192 1 0 int16x16 d4 = add_x16(c4, c12); int16x16 d12 = sub_x16(c4, c12); // inv vector_butterfly 0 128 1 0 int16x16 d0 = add_x16(c0, c8); int16x16 d8 = sub_x16(c0, c8); // inv vector_reduce_ifreverse 256 d16 = reduce_x16(d16, qdata); // inv vector_reduce_ifreverse 0 d0 = reduce_x16(d0, qdata); // inv vector_butterfly 192 448 1 0 int16x16 e12 = add_x16(d12, d28); int16x16 e28 = sub_x16(d12, d28); // inv vector_butterfly 64 320 1 0 int16x16 e4 = add_x16(d4, d20); int16x16 e20 = sub_x16(d4, d20); // inv vector_butterfly 128 384 1 0 int16x16 e8 = add_x16(d8, d24); int16x16 e24 = sub_x16(d8, d24); // inv vector_butterfly 0 256 1 0 int16x16 e0 = add_x16(d0, d16); int16x16 e16 = 
sub_x16(d0, d16); // inv startbatch 512 _mm256_storeu_si256((int16x16 *) (f + 0), e0); _mm256_storeu_si256((int16x16 *) (f + 64), e4); _mm256_storeu_si256((int16x16 *) (f + 128), e8); _mm256_storeu_si256((int16x16 *) (f + 192), e12); _mm256_storeu_si256((int16x16 *) (f + 256), e16); _mm256_storeu_si256((int16x16 *) (f + 320), e20); _mm256_storeu_si256((int16x16 *) (f + 384), e24); _mm256_storeu_si256((int16x16 *) (f + 448), e28); f += 512; } // f -= 512*reps; // inv startntt 512 } void PQCLEAN_NTRULPR1013_AVX2_invntt512_7681(int16 *f, int reps) { invntt512(f, reps, qdata_7681.data); } void PQCLEAN_NTRULPR1013_AVX2_invntt512_10753(int16 *f, int reps) { invntt512(f, reps, qdata_10753.data); }