#include "crypto_core_multsntrup1013_ntt.h"
#include <immintrin.h>

// auto-generated; do not edit


// Shorthands for _mm256_permute2x128_si256 with a fixed selector immediate:
// 0x20 for the "lo" variant and 0x31 for the "hi" variant.
#define _mm256_permute2x128_si256_lo(a, b) _mm256_permute2x128_si256(a, b, 0x20)
#define _mm256_permute2x128_si256_hi(a, b) _mm256_permute2x128_si256(a, b, 0x31)
// Name used in this file for a 256-bit vector viewed as int16 lanes.
#define int16x16 __m256i

// Short aliases for the fixed-width signed 16-bit and 32-bit integer types.
typedef int16_t int16;
typedef int32_t int32;

// Storage for one constant table: 106 rows of 16 int16 values (1696 total).
// NOTE(review): the __m256i member is not accessed by name in the visible
// code; it presumably exists only to give the union the alignment of a
// 256-bit vector so rows can be read as int16x16 -- confirm.
typedef union {
    int16 data[106 * 16];
    __m256i _dummy;
} vec1696;

// Constant table for modulus 7681: 106 rows of 16 int16 values each.
//
// Every row is preceded by a #define that names it. Each macro expands to a
// dereference of a const int16x16 pointer at `qdata + offset`, where the
// offset is 16 times the row's position in this initializer. `qdata` is not
// declared in this part of the file; it must be in scope wherever one of the
// macros is used. Because the offsets are hard-coded, rows must never be
// reordered, inserted or removed without regenerating the macros.
//
// Layout (offsets in int16 units):
//      0 ..  752  precomp_*               (48 rows)
//    768          q_x16                   (7681 in every lane)
//    784 .. 1536  qinvprecomp_*           (48 rows, same order as precomp_*)
//   1552 .. 1600  qinvscaledzeta_x16_*    (4 rows)
//   1616          qround32_x16
//   1632 .. 1680  scaledzeta_x16_*        (4 rows)
//
// NOTE(review): each qinvprecomp_* / qinvscaledzeta_* entry looks like the
// matching precomp_* / scaledzeta_* entry multiplied by a fixed inverse of
// 7681 modulo 2^16 (e.g. -3593 <-> -9, -3777 <-> -28865) -- confirm against
// the generator before relying on this.
static const vec1696 qdata_7681 = { .data = {
        // --- precomp_* rows (offsets 0..752) ---
#define precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+0)
        -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625, -3593, -3593, -3593, -3593, -3625, -3625, -3625, -3625,
#define precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+16)
            -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182, -3777, -3777, -3777, -3777, 3182, 3182, 3182, 3182,
#define precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+32)
            -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182, -3593, -3593, -3593, -3593, -3182, -3182, -3182, -3182,
#define precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+48)
            3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625, 3777, 3777, 3777, 3777, 3625, 3625, 3625, 3625,
#define precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+64)
            -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194, -3593, -3593, -3593, -3593, 2194, 2194, 2194, 2194,
#define precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+80)
            -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100, -3625, -3625, -3625, -3625, -1100, -1100, -1100, -1100,
#define precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+96)
            -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696, -3593, -3593, -3593, -3593, 3696, 3696, 3696, 3696,
#define precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+112)
            -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456, -3182, -3182, -3182, -3182, -2456, -2456, -2456, -2456,
#define precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+128)
            -3593, 1701, 2194, 834, -3625, 2319, -1100, 121, -3593, 1701, 2194, 834, -3625, 2319, -1100, 121,
#define precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+144)
            -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250, -3777, 1414, 2456, 2495, 3182, 2876, -3696, 2250,
#define precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+160)
            -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414, -3593, -2250, 3696, -2876, -3182, -2495, -2456, -1414,
#define precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+176)
            3777, -121, 1100, -2319, 3625, -834, -2194, -1701, 3777, -121, 1100, -2319, 3625, -834, -2194, -1701,
#define precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+192)
            -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816, -3593, 3364, 1701, -1599, 2194, 2557, 834, -2816,
#define precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+208)
            -3625, 617, 2319, 2006, -1100, -1296, 121, 1986, -3625, 617, 2319, 2006, -1100, -1296, 121, 1986,
#define precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+224)
            -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921, -3593, 2237, -2250, -1483, 3696, 3706, -2876, 1921,
#define precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+240)
            -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830, -3182, 2088, -2495, -1525, -2456, 1993, -1414, 2830,
#define precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+256)
            -3593, 514, 3364, 438, 1701, 2555, -1599, -1738, 2194, 103, 2557, 1881, 834, -549, -2816, 638,
#define precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+272)
            -3625, -1399, 617, -1760, 2319, 2535, 2006, 3266, -1100, -1431, -1296, 3174, 121, 3153, 1986, -810,
#define precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+288)
            -3777, 2956, -2830, -679, 1414, 2440, -1993, -3689, 2456, 2804, 1525, 3555, 2495, 1535, -2088, -7,
#define precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+304)
            3182, -1321, -1921, -1305, 2876, -3772, -3706, 3600, -3696, -2043, 1483, -396, 2250, -2310, -2237, 1887,
#define precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+320)
            -3593, -1887, 2237, 2310, -2250, 396, -1483, 2043, 3696, -3600, 3706, 3772, -2876, 1305, 1921, 1321,
#define precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+336)
            -3182, 7, 2088, -1535, -2495, -3555, -1525, -2804, -2456, 3689, 1993, -2440, -1414, 679, 2830, -2956,
#define precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+352)
            3777, 810, -1986, -3153, -121, -3174, 1296, 1431, 1100, -3266, -2006, -2535, -2319, 1760, -617, 1399,
#define precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+368)
            3625, -638, 2816, 549, -834, -1881, -2557, -103, -2194, 1738, 1599, -2555, -1701, -438, -3364, -514,
#define precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+384)
            -3593, -1532, 514, -373, 3364, -3816, 438, -3456, 1701, 783, 2555, 2883, -1599, 727, -1738, -2385,
#define precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+400)
            2194, -2160, 103, -2391, 2557, 2762, 1881, -2426, 834, 3310, -549, -1350, -2816, 1386, 638, -194,
#define precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+416)
            -3625, 404, -1399, -3692, 617, -2764, -1760, -1054, 2319, 1799, 2535, -3588, 2006, 1533, 3266, 2113,
#define precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+432)
            -1100, -2579, -1431, -1756, -1296, 1598, 3174, -2, 121, -3480, 3153, -2572, 1986, 2743, -810, 2919,
#define precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+448)
            -3593, 2789, -1887, -921, 2237, -1497, 2310, -2133, -2250, -915, 396, 1390, -1483, 3135, 2043, -859,
#define precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+464)
            3696, 2732, -3600, -1464, 3706, 2224, 3772, -2665, -2876, 1698, 1305, 2835, 1921, 730, 1321, 486,
#define precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+480)
            -3182, 3417, 7, -3428, 2088, -3145, -1535, 1168, -2495, -3831, -3555, -3750, -1525, 660, -2804, 2649,
#define precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+496)
            -2456, 3405, 3689, -1521, 1993, 1681, -2440, 1056, -1414, 1166, 679, -2233, 2830, 2175, -2956, -1919,
#define precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+512)
            -3593, -1404, -1532, 451, 514, -402, -373, 1278, 3364, -509, -3816, -3770, 438, -2345, -3456, -226,
#define precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+528)
            1701, -1689, 783, -1509, 2555, 2963, 2883, 1242, -1599, 1669, 727, 2719, -1738, 642, -2385, -436,
#define precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+544)
            2194, 3335, -2160, 1779, 103, 3745, -2391, 17, 2557, 2812, 2762, -1144, 1881, 83, -2426, -1181,
#define precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+560)
            834, -1519, 3310, 3568, -549, -796, -1350, 2072, -2816, -2460, 1386, 2891, 638, -2083, -194, -715,
#define precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+576)
            -3593, -402, -3816, -226, 2555, 1669, -2385, 1779, 2557, 83, 3310, 2072, 638, 1012, -3692, 1295,
#define precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+592)
            2319, -3208, 1533, -2071, -1431, -2005, -2, 1586, 1986, -293, 1919, -929, -679, 777, -1681, -3461,
#define precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+608)
            2456, 3366, 3750, -1203, 1535, -3657, -3417, -1712, -1921, 2515, 2665, -1070, 3600, 2532, -3135, -2589,
#define precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+624)
            2250, -2258, 921, -658, -514, 509, 3456, 1509, 1599, -642, 2160, -17, -1881, 1519, 1350, -2891,
#define precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+640)
            -3593, -3434, -1497, 893, 396, -2422, -859, 2965, 3706, -2339, 1698, -2937, 1321, -670, -3428, -3163,
#define precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+656)
            -2495, -1072, 660, 1084, 3689, -179, 1056, -1338, 2830, 2786, -2919, -3677, -3153, -151, -1598, 3334,
#define precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+672)
            1100, -3314, 3588, 2262, 1760, -2230, -404, 2083, 2816, -3568, 2426, -2812, -103, 436, -727, -2963,
#define precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+688)
            -1701, 3770, 373, 1404, 1887, -1649, 2133, -826, 1483, 434, -2732, 3287, -3772, -2378, -2835, 3723,
#define precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+704)
            -3593, 658, 2789, 370, -1887, -3434, -921, -3752, 2237, 1649, -1497, 2258, 2310, 3581, -2133, 893,
#define precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+720)
            -2250, 3794, -915, 826, 396, 2589, 1390, 592, -1483, -2422, 3135, 3214, 2043, -434, -859, -2532,
#define precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+736)
            3696, 1121, 2732, 2965, -3600, 2998, -1464, -3287, 3706, 1070, 2224, -589, 3772, -2339, -2665, 2070,
#define precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+752)
            -2876, 2378, 1698, -2515, 1305, -2815, 2835, -2937, 1921, -1348, 730, -3723, 1321, 1712, 486, 2130,
            // --- the modulus itself, replicated in every lane (offset 768) ---
#define q_x16 *(const int16x16 *)(qdata+768)
            7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681, 7681,
            // --- qinvprecomp_* rows, one per precomp_* row in the same order (offsets 784..1536) ---
#define qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+784)
            -9, -9, -9, -9, -16425, -16425, -16425, -16425, -9, -9, -9, -9, -16425, -16425, -16425, -16425,
#define qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+800)
            -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350, -28865, -28865, -28865, -28865, 10350, 10350, 10350, 10350,
#define qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+816)
            -9, -9, -9, -9, -10350, -10350, -10350, -10350, -9, -9, -9, -9, -10350, -10350, -10350, -10350,
#define qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+832)
            28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425, 28865, 28865, 28865, 28865, 16425, 16425, 16425, 16425,
#define qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+848)
            -9, -9, -9, -9, -4974, -4974, -4974, -4974, -9, -9, -9, -9, -4974, -4974, -4974, -4974,
#define qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+864)
            -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244, -16425, -16425, -16425, -16425, -7244, -7244, -7244, -7244,
#define qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1 *(const int16x16 *)(qdata+880)
            -9, -9, -9, -9, -4496, -4496, -4496, -4496, -9, -9, -9, -9, -4496, -4496, -4496, -4496,
#define qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3 *(const int16x16 *)(qdata+896)
            -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744, -10350, -10350, -10350, -10350, -14744, -14744, -14744, -14744,
#define qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+912)
            -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655, -9, -20315, -4974, 18242, -16425, 18191, -7244, -11655,
#define qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+928)
            -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754, -28865, 20870, 14744, -22593, 10350, 828, 4496, 23754,
#define qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+944)
            -9, -23754, -4496, -828, -10350, 22593, -14744, -20870, -9, -23754, -4496, -828, -10350, 22593, -14744, -20870,
#define qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+960)
            28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315, 28865, 11655, 7244, -18191, 16425, -18242, 4974, 20315,
#define qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+976)
            -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816, -9, -10972, -20315, 23489, -4974, 25597, 18242, -2816,
#define qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+992)
            -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394, -16425, -19351, 18191, -3114, -7244, -9488, -11655, 19394,
#define qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7 *(const int16x16 *)(qdata+1008)
            -9, -7491, -23754, -15307, -4496, -15750, -828, -5759, -9, -7491, -23754, -15307, -4496, -15750, -828, -5759,
#define qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1024)
            -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382, -10350, 22568, 22593, -20469, -14744, 31177, -20870, 26382,
#define qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1040)
            -9, -14846, -10972, -21066, -20315, -24581, 23489, -23242, -4974, -4505, 25597, -26279, 18242, 21467, -2816, 15998,
#define qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1056)
            -16425, -4983, -19351, 14624, 18191, -2073, -3114, 20674, -7244, -21399, -9488, 6246, -11655, -29103, 19394, -5930,
#define qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1072)
            -28865, -23668, -26382, -28839, 20870, 6536, -31177, 16279, 14744, 29428, 20469, 29667, -22593, 9215, -22568, -11783,
#define qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1088)
            10350, -14121, 5759, -5913, 828, -1724, 15750, 11792, 4496, 25093, 15307, 26228, 23754, -21766, 7491, -6817,
#define qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1104)
            -9, 6817, -7491, 21766, -23754, -26228, -15307, -25093, -4496, -11792, -15750, 1724, -828, 5913, -5759, 14121,
#define qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1120)
            -10350, 11783, 22568, -9215, 22593, -29667, -20469, -29428, -14744, -16279, 31177, -6536, -20870, 28839, 26382, 23668,
#define qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1136)
            28865, 5930, -19394, 29103, 11655, -6246, 9488, 21399, 7244, -20674, 3114, 2073, -18191, -14624, 19351, 4983,
#define qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1152)
            16425, -15998, 2816, -21467, -18242, 26279, -25597, 4505, 4974, 23242, -23489, 24581, 20315, 21066, 10972, 14846,
#define qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1168)
            -9, -32252, -14846, -19317, -10972, 8472, -21066, -3456, -20315, 16655, -24581, 12611, 23489, -12073, -23242, 29871,
#define qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1184)
            -4974, 6032, -4505, 10409, 25597, 24266, -26279, 17030, 18242, 10478, 21467, 11962, -2816, -26262, 15998, -17602,
#define qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1200)
            -16425, -22124, -4983, -26220, -19351, -8908, 14624, 32738, 18191, 13575, -2073, 27132, -3114, 24573, 20674, 27201,
#define qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1216)
            -7244, 12269, -21399, -16092, -9488, -15810, 6246, 15358, -11655, -15768, -29103, 24052, 19394, -26441, -5930, -1689,
#define qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1232)
            -9, 13541, 6817, -5529, -7491, 26663, 21766, -4693, -23754, 13933, -26228, 8558, -15307, -21953, -25093, -22875,
#define qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1248)
            -4496, -7508, -11792, -30136, -15750, 26800, 1724, 17303, -828, 2722, 5913, -12013, -5759, 30426, 14121, 3558,
#define qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1264)
            -10350, -24743, 11783, -21860, 22568, -32329, -9215, 9360, 22593, -7415, -29667, 25946, -20469, -21868, -29428, -25511,
#define qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1280)
            -14744, 1869, -16279, 14351, 31177, 2193, -6536, 17440, -20870, 24718, 28839, -23225, 26382, 9855, 23668, -9599,
#define qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1296)
            -9, -32124, -32252, 10179, -14846, 6766, -19317, 16638, -10972, -23549, 8472, -17082, -21066, -15145, -3456, 31518,
#define qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1312)
            -20315, -6297, 16655, -12261, -24581, -11885, 12611, 30938, 23489, 28805, -12073, 26783, -23242, -14718, 29871, 5708,
#define qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1328)
            -4974, 15111, 6032, -29453, -4505, 12449, 10409, 529, 25597, -32004, 24266, 2952, -26279, 18003, 17030, 24931,
#define qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1344)
            18242, -1007, 10478, -4624, 21467, 17636, 11962, 14360, -2816, 15972, -26262, 16715, 15998, 4573, -17602, -14539,
#define qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1360)
            -9, 6766, 8472, 31518, -24581, 28805, 29871, -29453, 25597, 18003, 10478, 14360, 15998, 27636, -26220, 17167,
#define qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1376)
            18191, -7304, 24573, -22039, -21399, -4565, 15358, 10802, 19394, 21723, 9599, -9633, -28839, -2807, -2193, -30597,
#define qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1392)
            14744, -26330, -25946, -2739, 9215, 32695, 24743, -26288, 5759, 20435, -17303, 24530, 11792, 20964, 21953, 23523,
#define qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1408)
            23754, -27858, 5529, 6510, 14846, 23549, 3456, 12261, -23489, 14718, -6032, -529, 26279, 1007, -11962, -16715,
#define qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1424)
            -9, 24214, 26663, 23933, -26228, -13686, -22875, -27243, -15750, 4317, 2722, 8839, 14121, -32414, -21860, -25179,
#define qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1440)
            22593, -25648, -21868, -964, -16279, -1715, 17440, -14650, 26382, -28958, 1689, -10333, 29103, -20119, 15810, 22790,
#define qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1456)
            7244, 20238, -27132, -2858, -14624, 19274, 22124, -4573, 2816, 4624, -17030, 32004, 4505, -5708, 12073, 11885,
#define qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1472)
            20315, 17082, 19317, 32124, -6817, 14223, 4693, -14138, 15307, 9650, 7508, -9513, -1724, -23882, 12013, -15221,
#define qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15 *(const int16x16 *)(qdata+1488)
            -9, -6510, 13541, -23182, 6817, 24214, -5529, -24232, -7491, -14223, 26663, 27858, 21766, 26621, -4693, 23933,
#define qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31 *(const int16x16 *)(qdata+1504)
            -23754, 29394, 13933, 14138, -26228, -23523, 8558, -23984, -15307, -13686, -21953, 26766, -25093, -9650, -22875, -20964,
#define qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47 *(const int16x16 *)(qdata+1520)
            -4496, -22943, -7508, -27243, -11792, -18506, -30136, 9513, -15750, -24530, 26800, 947, 1724, 4317, 17303, 29718,
#define qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63 *(const int16x16 *)(qdata+1536)
            -828, 23882, 2722, -20435, 5913, -10495, -12013, 8839, -5759, -3396, 30426, 15221, 14121, 26288, 3558, 27730,
            // --- single values replicated across all 16 lanes (offsets 1552..1680) ---
#define qinvscaledzeta_x16_4_1 *(const int16x16 *)(qdata+1552)
            -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865, -28865,
#define qinvscaledzeta_x16_4_3 *(const int16x16 *)(qdata+1568)
            28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865, 28865,
#define qinvscaledzeta_x16_8_1 *(const int16x16 *)(qdata+1584)
            -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425, -16425,
#define qinvscaledzeta_x16_8_7 *(const int16x16 *)(qdata+1600)
            -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350, -10350,
#define qround32_x16 *(const int16x16 *)(qdata+1616)
            4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
#define scaledzeta_x16_4_1 *(const int16x16 *)(qdata+1632)
            -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777, -3777,
#define scaledzeta_x16_4_3 *(const int16x16 *)(qdata+1648)
            3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777, 3777,
#define scaledzeta_x16_8_1 *(const int16x16 *)(qdata+1664)
            -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625, -3625,
#define scaledzeta_x16_8_7 *(const int16x16 *)(qdata+1680)
            -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182, -3182,
        }
} ;

// Constant table for the modulus q = 10753 (the value stored in the q_x16 row).
// It holds 106 rows of 16 int16 lanes each, matching the 106 * 16 entries of
// vec1696. Each row's comment carries the name of an offset macro; wherever
// both tables are visible the names occur in the same order as in qdata_7681,
// so the macros (which index through a pointer named `qdata`) presumably select
// the corresponding row here when `qdata` points at this table.
// The qinv* rows appear to hold the matching non-qinv row multiplied by q^-1
// modulo 2^16 (spot-checked: 1018 -> -6 and -223 -> -27359).
static const vec1696 qdata_10753 = { .data = {
        // precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688, 1018, 1018, 1018, 1018, 3688, 3688, 3688, 3688,
        // precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        -223, -223, -223, -223, -4188, -4188, -4188, -4188, -223, -223, -223, -223, -4188, -4188, -4188, -4188,
        // precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188, 1018, 1018, 1018, 1018, 4188, 4188, 4188, 4188,
        // precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        223, 223, 223, 223, -3688, -3688, -3688, -3688, 223, 223, 223, 223, -3688, -3688, -3688, -3688,
        // precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        1018, 1018, 1018, 1018, -376, -376, -376, -376, 1018, 1018, 1018, 1018, -376, -376, -376, -376,
        // precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686, 3688, 3688, 3688, 3688, -3686, -3686, -3686, -3686,
        // precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413, 1018, 1018, 1018, 1018, -2413, -2413, -2413, -2413,
        // precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        4188, 4188, 4188, 4188, -357, -357, -357, -357, 4188, 4188, 4188, 4188, -357, -357, -357, -357,
        // precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        1018, -3364, -376, 4855, 3688, 425, -3686, 2695, 1018, -3364, -376, 4855, 3688, 425, -3686, 2695,
        // precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        -223, -3784, 357, -2236, -4188, 4544, 2413, 730, -223, -3784, 357, -2236, -4188, 4544, 2413, 730,
        // precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        1018, -730, -2413, -4544, 4188, 2236, -357, 3784, 1018, -730, -2413, -4544, 4188, 2236, -357, 3784,
        // precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        223, -2695, 3686, -425, -3688, -4855, 376, 3364, 223, -2695, 3686, -425, -3688, -4855, 376, 3364,
        // precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        1018, -5175, -3364, 2503, -376, 1341, 4855, -4875, 1018, -5175, -3364, 2503, -376, 1341, 4855, -4875,
        // precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        3688, -2629, 425, -4347, -3686, 3823, 2695, -4035, 3688, -2629, 425, -4347, -3686, 3823, 2695, -4035,
        // precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        1018, 5063, -730, 341, -2413, -3012, -4544, -5213, 1018, 5063, -730, 341, -2413, -3012, -4544, -5213,
        // precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        4188, 1520, 2236, 1931, -357, 918, 3784, 4095, 4188, 1520, 2236, 1931, -357, 918, 3784, 4095,
        // precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 3085, -5175, 2982, -3364, -4744, 2503, -4129, -376, -2576, 1341, -193, 4855, 3062, -4875, 4,
        // precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        3688, 2388, -2629, -4513, 425, 4742, -4347, 2935, -3686, -544, 3823, -2178, 2695, 847, -4035, 268,
        // precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -223, -1299, -4095, -1287, -3784, -4876, -918, 3091, 357, -4189, -1931, 4616, -2236, 2984, -1520, -3550,
        // precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -4188, -1009, 5213, -205, 4544, -4102, 3012, 2790, 2413, -1085, -341, -2565, 730, -4379, -5063, -1284,
        // precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 1284, 5063, 4379, -730, 2565, 341, 1085, -2413, -2790, -3012, 4102, -4544, 205, -5213, 1009,
        // precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        4188, 3550, 1520, -2984, 2236, -4616, 1931, 4189, -357, -3091, 918, 4876, 3784, 1287, 4095, 1299,
        // precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        223, -268, 4035, -847, -2695, 2178, -3823, 544, 3686, -2935, 4347, -4742, -425, 4513, 2629, -2388,
        // precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -3688, -4, 4875, -3062, -4855, 193, -1341, 2576, 376, 4129, -2503, 4744, 3364, -2982, 5175, -3085,
        // precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 5116, 3085, -3615, -5175, 400, 2982, 3198, -3364, 2234, -4744, -4828, 2503, 326, -4129, -512,
        // precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -376, 1068, -2576, -4580, 1341, 3169, -193, -2998, 4855, -635, 3062, -4808, -4875, -2740, 4, 675,
        // precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        3688, -1324, 2388, 5114, -2629, 5294, -4513, -794, 425, -864, 4742, -886, -4347, 336, 2935, -2045,
        // precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -3686, -3715, -544, 4977, 3823, -2737, -2178, 3441, 2695, 467, 847, 454, -4035, -779, 268, 2213,
        // precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 1615, 1284, 2206, 5063, 5064, 4379, 472, -730, -5341, 2565, -4286, 341, 2981, 1085, -1268,
        // precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -2413, -3057, -2790, -2884, -3012, -1356, 4102, -3337, -4544, 5023, 205, -636, -5213, 909, 1009, -2973,
        // precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        4188, 2271, 3550, -1572, 1520, 1841, -2984, 970, 2236, -4734, -4616, 578, 1931, -116, 4189, 1586,
        // precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -357, -2774, -3091, -1006, 918, -5156, 4876, 4123, 3784, -567, 1287, 151, 4095, 1458, 1299, 2684,
        // precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, -3260, 5116, -1722, 3085, 5120, -3615, 3760, -5175, 73, 400, 4254, 2982, 2788, 3198, -2657,
        // precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -3364, 569, 2234, 1930, -4744, -2279, -4828, 5215, 2503, -4403, 326, 1639, -4129, 5068, -512, -5015,
        // precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -376, -4859, 1068, -40, -2576, 4003, -4580, -4621, 1341, 2487, 3169, -2374, -193, 2625, -2998, 4784,
        // precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        4855, 825, -635, 2118, 3062, -2813, -4808, -4250, -4875, -2113, -2740, -4408, 4, -1893, 675, 458,
        // precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 5120, 400, -2657, -4744, -4403, -512, -40, 1341, 2625, -635, -4250, 4, -3360, 5114, -5313,
        // precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        425, -2151, 336, -2662, -544, 5334, 3441, 2117, -4035, 2205, -2684, -3570, -1287, -4973, 5156, 2419,
        // precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        357, 1204, -578, 1635, 2984, -1111, -2271, 4359, 5213, -2449, 3337, 3453, 2790, 554, -2981, -1409,
        // precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        730, -279, -2206, 3524, -3085, -73, -3198, -1930, -2503, -5068, -1068, 4621, 193, -825, 4808, 4408,
        // precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, 4428, 5064, -4000, 2565, 573, -1268, 3125, -3012, -4144, 5023, 1927, 1009, -2139, -1572, 3535,
        // precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        2236, 663, -116, 4967, -3091, -854, 4123, 1160, 4095, -1349, -2213, 1782, -847, 2062, 2737, 624,
        // precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        3686, -2283, 886, 4889, 4513, -4601, 1324, 1893, 4875, -2118, 2998, -2487, 2576, 5015, -326, 2279,
        // precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        3364, -4254, 3615, 3260, -1284, -1381, -472, -3891, -341, 2087, 3057, 4720, -4102, 3410, 636, 1689,
        // precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        1018, -3524, 1615, 5268, 1284, 4428, 2206, -834, 5063, 1381, 5064, 279, 4379, 2439, 472, -4000,
        // precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -730, -2015, -5341, 3891, 2565, 1409, -4286, 2605, 341, 573, 2981, 5356, 1085, -2087, -1268, -554,
        // precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -2413, 3135, -3057, 3125, -2790, -778, -2884, -4720, -3012, -3453, -1356, -355, 4102, -4144, -3337, -152,
        // precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -4544, -3410, 5023, 2449, 205, -97, -636, 1927, -5213, 2624, 909, -1689, 1009, -4359, -2973, -3419,
        // q_x16
        10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753, 10753,
        // qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        -6, -6, -6, -6, -408, -408, -408, -408, -6, -6, -6, -6, -408, -408, -408, -408,
        // qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956, -27359, -27359, -27359, -27359, 1956, 1956, 1956, 1956,
        // qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        -6, -6, -6, -6, -1956, -1956, -1956, -1956, -6, -6, -6, -6, -1956, -1956, -1956, -1956,
        // qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        27359, 27359, 27359, 27359, 408, 408, 408, 408, 27359, 27359, 27359, 27359, 408, 408, 408, 408,
        // qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        -6, -6, -6, -6, -20856, -20856, -20856, -20856, -6, -6, -6, -6, -20856, -20856, -20856, -20856,
        // qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        -408, -408, -408, -408, -21094, -21094, -21094, -21094, -408, -408, -408, -408, -21094, -21094, -21094, -21094,
        // qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1
        -6, -6, -6, -6, -10093, -10093, -10093, -10093, -6, -6, -6, -6, -10093, -10093, -10093, -10093,
        // qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3
        -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517, -1956, -1956, -1956, -1956, -28517, -28517, -28517, -28517,
        // qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        -6, -9508, -20856, -29449, -408, 18345, -21094, -7033, -6, -9508, -20856, -29449, -408, 18345, -21094, -7033,
        // qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090, -27359, -16072, 28517, -12476, 1956, -28224, 10093, 16090,
        // qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072, -6, -16090, -10093, 28224, -1956, 12476, -28517, 16072,
        // qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        27359, 7033, 21094, -18345, 408, 29449, 20856, 9508, 27359, 7033, 21094, -18345, 408, 29449, 20856, 9508,
        // qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        -6, -3639, -9508, 25543, -20856, 829, -29449, -17675, -6, -3639, -9508, 25543, -20856, 829, -29449, -17675,
        // qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547, -408, 18363, 18345, 7429, -21094, -10001, -7033, -4547,
        // qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7
        -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683, -6, 28103, -16090, 3925, -10093, 7228, 28224, 11683,
        // qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15
        -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847, -1956, -23056, 12476, 14731, -28517, 26518, 16072, 14847,
        // qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, -5619, -3639, -12378, -9508, 15736, 25543, 23007, -20856, -27152, 829, -22209, -29449, -20490, -17675, 22532,
        // qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -408, 16724, 18363, 22623, 18345, 5766, 7429, -31369, -21094, 15840, -10001, 19326, -7033, 3407, -4547, 2316,
        // qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -27359, 6381, -14847, 8441, -16072, -6924, -26518, -4589, 28517, 12707, -14731, -15864, -12476, 31656, 23056, 24098,
        // qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        1956, -31217, -11683, -24269, -28224, -5126, -7228, 20198, 10093, -573, -3925, -14341, 16090, 23781, -28103, -23812,
        // qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, 23812, 28103, -23781, -16090, 14341, 3925, 573, -10093, -20198, 7228, 5126, 28224, 24269, 11683, 31217,
        // qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -1956, -24098, -23056, -31656, 12476, 15864, 14731, -12707, -28517, 4589, 26518, 6924, 16072, -8441, 14847, -6381,
        // qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        27359, -2316, 4547, -3407, 7033, -19326, 10001, -15840, 21094, 31369, -7429, -5766, -18345, -22623, -18363, -16724,
        // qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        408, -22532, 17675, 20490, 29449, 22209, -829, 27152, 20856, -23007, -25543, -15736, 9508, 12378, 3639, 5619,
        // qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, -17412, -5619, 2017, -3639, 24976, -12378, 24702, -9508, -31558, 15736, 1316, 25543, -31418, 23007, -512,
        // qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -20856, -13268, -27152, 22044, 829, 8801, -22209, -12214, -29449, 11141, -20490, -17096, -17675, 32076, 22532, 17571,
        // qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -408, 13012, 16724, 4090, 18363, -30546, 22623, 16614, 18345, -17248, 5766, 22666, 7429, -7856, -31369, 31235,
        // qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -21094, 28541, 15840, -30351, -10001, -177, 19326, -31887, -7033, 25555, 3407, -31290, -4547, -13579, 2316, -2395,
        // qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, 4175, 23812, 7326, 28103, 17352, -23781, -28200, -16090, 11555, 14341, 6978, 3925, -1627, 573, 780,
        // qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -10093, 32271, -20198, 7356, 7228, 29364, 5126, 27895, 28224, -609, 24269, 21892, 11683, -7795, 31217, -18845,
        // qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -1956, 29407, -24098, -7716, -23056, -719, -31656, -8246, 12476, -26238, 15864, 11842, 14731, 1932, -12707, -11726,
        // qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -28517, 4394, 4589, 2066, 26518, -11300, 6924, -24037, 16072, 969, -8441, 14999, 14847, -11854, -6381, -19844,
        // qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, -13500, -17412, 32070, -5619, 5120, 2017, 11952, -3639, 1609, 24976, 9374, -12378, -23836, 24702, -8289,
        // qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -9508, -22471, -31558, 25482, 15736, -8935, 1316, 32351, 25543, 19661, -31418, 8295, 23007, -25652, -512, -19863,
        // qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -20856, 6917, -13268, -28712, -27152, 20899, 22044, 4083, 829, 951, 8801, 29370, -22209, 24641, -12214, 12976,
        // qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        -29449, -22215, 11141, -29626, -20490, 30467, -17096, 13158, -17675, -24129, 32076, 7880, 22532, -30053, 17571, -8758,
        // qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, 5120, 24976, -8289, 15736, 19661, -512, -28712, 829, 24641, 11141, 13158, 22532, 13024, 4090, -27329,
        // qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        18345, -8807, -7856, -20070, 15840, -1834, -31887, -18875, -4547, 18077, 19844, -23026, 8441, -12653, 11300, 11123,
        // qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        28517, 31924, -11842, -14237, 31656, 16809, -29407, -5369, -11683, -16273, -27895, -29827, 20198, 7722, 1627, 9343,
        // qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        16090, -15127, -7326, -6716, 5619, -1609, -24702, -25482, -25543, 25652, 13268, -4083, 22209, 22215, 17096, -7880,
        // qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, -26292, 17352, 12384, 14341, 61, 780, 23093, 7228, -12336, -609, -7801, 31217, -6747, -7716, 6095,
        // qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        12476, 15511, 1932, 11623, 4589, 6314, -24037, -19320, 14847, 19643, 2395, -21770, -3407, -17394, 177, -23952,
        // qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        21094, -31467, -22666, -1767, -22623, -14329, -13012, 30053, 17675, 29626, 12214, -951, 27152, 19863, 31418, 8935,
        // qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        9508, -9374, -2017, 13500, -23812, -29541, 28200, 20173, -3925, -24025, -32271, -19856, -5126, -26286, -21892, -4967,
        // qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15
        -6, 6716, 4175, -13164, 23812, -26292, 7326, -12098, 28103, 29541, 17352, 15127, -23781, -7289, -28200, 12384,
        // qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31
        -16090, -29151, 11555, -20173, 14341, -9343, 6978, -22483, 3925, 61, -1627, 23788, 573, 24025, 780, -7722,
        // qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47
        -10093, -18881, 32271, 23093, -20198, -24330, 7356, 19856, 7228, 29827, 29364, 15517, 5126, -12336, 27895, -4248,
        // qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63
        28224, 26286, -609, 16273, 24269, -5729, 21892, -7801, 11683, -30144, -7795, 4967, 31217, 5369, -18845, -8027,
        // qinvscaledzeta_x16_4_1
        -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359, -27359,
        // qinvscaledzeta_x16_4_3
        27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359, 27359,
        // qinvscaledzeta_x16_8_1
        -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408, -408,
        // qinvscaledzeta_x16_8_7
        -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956, -1956,
        // qround32_x16
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // scaledzeta_x16_4_1
        -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223, -223,
        // scaledzeta_x16_4_3
        223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223, 223,
        // scaledzeta_x16_8_1
        3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688, 3688,
        // scaledzeta_x16_8_7
        4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188, 4188,
    }
} ;

// Lane-wise sum of two vectors of sixteen 16-bit integers.
// Uses the plain (non-saturating) add, so each lane wraps modulo 2^16.
static inline int16x16 add_x16(int16x16 a, int16x16 b) {
    const int16x16 sum = _mm256_add_epi16(a, b);
    return sum;
}

// Lane-wise difference a - b of two vectors of sixteen 16-bit integers.
// Uses the plain (non-saturating) subtract, so each lane wraps modulo 2^16.
static inline int16x16 sub_x16(int16x16 a, int16x16 b) {
    const int16x16 difference = _mm256_sub_epi16(a, b);
    return difference;
}

// Lane-wise "scaled" modular product of x and y.
//
// Per lane this returns hi16(x*y) - hi16(lo16(x*yqinv) * q), where q is the
// modulus vector read via the q_x16 macro. When yqinv is y * q^-1 modulo 2^16
// (the relationship the qinv* table rows appear to have with their
// counterparts), the low halves of the two products cancel and the result is
// a Montgomery-style value congruent to x*y / 2^16 modulo q.
//
//   x      - first factor
//   y      - second factor (typically a precomputed table row)
//   yqinv  - companion of y, expected to be y * q^-1 mod 2^16
//   qdata  - base of the constant table; not named in the body but presumably
//            referenced by the q_x16 macro expansion (its definition is
//            outside this excerpt)
static inline int16x16 mulmod_scaled_x16(int16x16 x, int16x16 y, int16x16 yqinv, const int16 *qdata) {
    const int16x16 low_times_qinv = _mm256_mullo_epi16(x, yqinv);
    const int16x16 correction = _mm256_mulhi_epi16(low_times_qinv, q_x16);
    const int16x16 high_product = _mm256_mulhi_epi16(x, y);
    return sub_x16(high_product, correction);
}

// Lane-wise reduction of x modulo q: subtracts a multiple of q from each lane.
//
// The multiple is chosen as round(x * qround32 / 2^15), using the rounding
// high multiply. The qround32 rows in the tables hold 4 for q = 7681 and 3 for
// q = 10753, i.e. approximately 2^15 / q, so this estimates x / q. The result
// is congruent to x modulo q (arithmetic is per 16-bit lane).
//
//   x      - values to reduce
//   qdata  - base of the constant table; the qround32_x16 macro indexes
//            through it, and q_x16 presumably does too (its definition is
//            outside this excerpt)
static inline int16x16 reduce_x16(int16x16 x, const int16 *qdata) {
    const int16x16 quotient = _mm256_mulhrs_epi16(x, qround32_x16);
    const int16x16 multiple_of_q = _mm256_mullo_epi16(quotient, q_x16);
    return sub_x16(x, multiple_of_q);
}

// ----- codegen pass 1
//
// startntt 512
// startbatch 512
// // ----- PRECONDITIONS
// physical_map (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^512-1 pos 0:512 q 7681,10753 bound 512*(5629,5800)
// assertranges ...
//
// // ----- LAYER 1
//
// // butterfly(0,256,1,256,1,0)
// butterfly 0 256 1 256 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // ----- POSTCONDITIONS AFTER LAYER 1
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^256-1 pos 0:256 q 7681,10753 bound 256*(11258,11600)
// assertranges ...
//
// // modulus x^256+1 pos 256:512 q 7681,10753 bound 256*(11258,11600)
// assertranges ...
//
// // ----- LAYER 2
//
// // reduce_ifreverse(0,64,1)
// reduce_ifreverse 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // reduce_ifreverse(256,320,1)
// reduce_ifreverse 256 320 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(0,128,1,128,1,0)
// butterfly 0 128 1 128 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(256,384,1,128,4,1)
// butterfly 256 384 1 128 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // ----- POSTCONDITIONS AFTER LAYER 2
// // transform size 512
// // transform indexing [0, 1, 2, 3, 4, 5, 6, 7, 8]
// // transforms per batch 1
// // batch indexing []
// // total batch size 512
//
// // modulus x^128-1 pos 0:128 q 7681,10753 bound 128*(22516,23200)
// assertranges ...
//
// // modulus x^128+1 pos 128:256 q 7681,10753 bound 128*(22516,23200)
// assertranges ...
//
// // modulus x^128-zeta4 pos 256:384 q 7681,10753 bound 128*(15747,17016)
// assertranges ...
//
// // modulus x^128+zeta4 pos 384:512 q 7681,10753 bound 128*(15747,17016)
// assertranges ...
//
// // ----- LAYER 3
//
// // reduce_ifforward(64,128,1)
// reduce_ifforward 64 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(0,64,1,64,1,0)
// butterfly 0 64 1 64 1 0 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(128,192,1,64,4,1)
// butterfly 128 192 1 64 4 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(256,320,1,64,8,1)
// butterfly 256 320 1 64 8 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // butterfly(384,448,1,64,8,-1)
// butterfly 384 448 1 64 8 7 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // reduce(0,64,1)
// reduce 0 64 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(64,128,1,128,1)
// twist 64 128 1 128 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(128,192,1,256,1)
// twist 128 192 1 256 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(192,256,1,256,-1)
// twist 192 256 1 256 255 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(256,320,1,512,1)
// twist 256 320 1 512 1 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(320,384,1,512,5)
// twist 320 384 1 512 5 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(384,448,1,512,-1)
// twist 384 448 1 512 511 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // twist(448,512,1,512,-5)
// twist 448 512 1 512 507 (0, 1, 2, 3, 4, 5, 6, 7, 8) ()
//
// // physical_permute(3,6)
// physical_permute (3, 6) (0, 1, 2, 3, 4, 5, 6, 7, 8) () (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
//
// // fold(256)
// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7, 8) ()
// physical_map (0, 1, 2, 6, 4, 5, 3, 7) (8,)
//
// // fold(128)
// physical_unmap (0, 1, 2, 6, 4, 5, 3, 7) (8,)
// physical_map (0, 1, 2, 6, 4, 5, 3) (7, 8)
//
// // fold(64)
// physical_unmap (0, 1, 2, 6, 4, 5, 3) (7, 8)
// physical_map (0, 1, 2, 6, 4, 5) (3, 7, 8)
//
// // nextbatch()
// stopbatch 512
// startbatch 512
//
// // halfbatch()
// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7, 8)
// stopbatch 512
// doublereps
// startbatch 256
// physical_map (0, 1, 2, 6, 4, 5) (3, 7)
//
// // halfbatch()
// physical_unmap (0, 1, 2, 6, 4, 5) (3, 7)
// stopbatch 256
// doublereps
// startbatch 128
// physical_map (0, 1, 2, 6, 4, 5) (3,)
//
// // ----- POSTCONDITIONS AFTER LAYER 3
// // transform size 64
// // transform indexing [0, 1, 2, 6, 4, 5]
// // transforms per batch 2
// // batch indexing [3]
// // total batch size 128
//
// // modulus x^64-1 pos 0:64 q 7681,10753 bound 1*(5629,5827) 1*(5629,7613) 1*(5629,7666) 1*(5629,7264) 1*(5629,7639) 1*(5629,7591) 1*(5629,7291) 1*(5629,7204) ...
// assertranges ...
//
// // ----- LAYER 4
//
// // butterfly(0,32,1,32,1,0)
// butterfly 0 32 1 32 1 0 (0, 1, 2, 6, 4, 5) (3,)
//
// // ----- POSTCONDITIONS AFTER LAYER 4
// // transform size 64
// // transform indexing [0, 1, 2, 6, 4, 5]
// // transforms per batch 2
// // batch indexing [3]
// // total batch size 128
//
// // modulus x^32-1 pos 0:32 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
// assertranges ...
//
// // modulus x^32+1 pos 32:64 q 7681,10753 bound 1*(11258,13035) 1*(11258,14721) 1*(11258,14855) 1*(11258,14877) 1*(11258,14690) 1*(11258,15282) 1*(11258,14641) 1*(11258,14272) ...
// assertranges ...
//
// // ----- LAYER 5
//
// // butterfly(0,16,1,16,1,0)
// butterfly 0 16 1 16 1 0 (0, 1, 2, 6, 4, 5) (3,)
//
// // butterfly(32,48,1,16,4,1)
// butterfly 32 48 1 16 4 1 (0, 1, 2, 6, 4, 5) (3,)
//
// // reduce(0,16,1)
// reduce 0 16 1 (0, 1, 2, 6, 4, 5) (3,)
//
// // twist(16,32,1,32,1)
// twist 16 32 1 32 1 (0, 1, 2, 6, 4, 5) (3,)
//
// // twist(32,48,1,64,1)
// twist 32 48 1 64 1 (0, 1, 2, 6, 4, 5) (3,)
//
// // twist(48,64,1,64,-1)
// twist 48 64 1 64 63 (0, 1, 2, 6, 4, 5) (3,)
//
// // physical_permute(0,1,2,5)
// physical_permute (0, 1, 2, 5) (0, 1, 2, 6, 4, 5) (3,) (1, 2, 5, 6, 4, 0) (3,)
//
// // fold(32)
// physical_unmap (1, 2, 5, 6, 4, 0) (3,)
// physical_map (1, 2, 5, 6, 4) (0, 3)
//
// // fold(16)
// physical_unmap (1, 2, 5, 6, 4) (0, 3)
// physical_map (1, 2, 5, 6) (0, 3, 4)
//
// // ----- POSTCONDITIONS AFTER LAYER 5
// // transform size 16
// // transform indexing [1, 2, 5, 6]
// // transforms per batch 8
// // batch indexing [0, 3, 4]
// // total batch size 128
//
// // modulus x^16-1 pos 0:16 q 7681,10753 bound 1*(5629,5800) 1*(5629,6967) 1*(5629,6418) 1*(5629,7585) 1*(5629,7017) 1*(5629,6328) 1*(5629,7033) 1*(5629,6943) ...
// assertranges ...
//
// // ----- LAYER 6
//
// // butterfly(0,8,1,8,1,0)
// butterfly 0 8 1 8 1 0 (1, 2, 5, 6) (0, 3, 4)
//
// // physical_permute(1,2,4)
// physical_permute (1, 2, 4) (1, 2, 5, 6) (0, 3, 4) (2, 4, 5, 6) (0, 3, 1)
//
// // nextbatch()
// stopbatch 128
// startbatch 128
//
// // ----- POSTCONDITIONS AFTER LAYER 6
// // transform size 16
// // transform indexing [2, 4, 5, 6]
// // transforms per batch 8
// // batch indexing [0, 3, 1]
// // total batch size 128
//
// // modulus x^8-1 pos 0:8 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
// assertranges ...
//
// // modulus x^8+1 pos 8:16 q 7681,10753 bound 1*(11258,12447) 1*(11258,14071) 1*(11258,12488) 1*(11258,14310) 1*(11258,14287) 1*(11258,13674) 1*(11258,13574) 1*(11258,13555)
// assertranges ...
//
// // ----- LAYER 7
//
// // butterfly(0,4,1,4,1,0)
// butterfly 0 4 1 4 1 0 (2, 4, 5, 6) (0, 3, 1)
//
// // butterfly(8,12,1,4,4,1)
// butterfly 8 12 1 4 4 1 (2, 4, 5, 6) (0, 3, 1)
//
// // reduce(0,4,1)
// reduce 0 4 1 (2, 4, 5, 6) (0, 3, 1)
//
// // twist(4,8,1,8,1)
// twist 4 8 1 8 1 (2, 4, 5, 6) (0, 3, 1)
//
// // twist(8,12,1,16,1)
// twist 8 12 1 16 1 (2, 4, 5, 6) (0, 3, 1)
//
// // twist(12,16,1,16,-1)
// twist 12 16 1 16 15 (2, 4, 5, 6) (0, 3, 1)
//
// // physical_permute(2,6)
// physical_permute (2, 6) (2, 4, 5, 6) (0, 3, 1) (6, 4, 5, 2) (0, 3, 1)
//
// // fold(8)
// physical_unmap (6, 4, 5, 2) (0, 3, 1)
// physical_map (6, 4, 5) (0, 1, 2, 3)
//
// // fold(4)
// physical_unmap (6, 4, 5) (0, 1, 2, 3)
// physical_map (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 7
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^4-1 pos 0:4 q 7681,10753 bound 1*(5629,5800) 1*(5629,6938) 1*(5629,6521) 1*(5629,7157)
// assertranges ...
//
// // ----- LAYER 8
//
// // butterfly(0,2,1,2,1,0)
// butterfly 0 2 1 2 1 0 (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 8
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^2-1 pos 0:2 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
// assertranges ...
//
// // modulus x^2+1 pos 2:4 q 7681,10753 bound 1*(11258,12321) 1*(11258,14095)
// assertranges ...
//
// // ----- LAYER 9
//
// // butterfly(0,1,1,1,1,0)
// butterfly 0 1 1 1 1 0 (6, 4) (0, 1, 2, 3, 5)
//
// // butterfly(2,3,1,1,4,1)
// butterfly 2 3 1 1 4 1 (6, 4) (0, 1, 2, 3, 5)
//
// // ----- POSTCONDITIONS AFTER LAYER 9
// // transform size 4
// // transform indexing [6, 4]
// // transforms per batch 32
// // batch indexing [0, 1, 2, 3, 5]
// // total batch size 128
//
// // modulus x^1-1 pos 0:1 q 7681,10753 bound 1*(22516,26416)
// assertranges ...
//
// // modulus x^1+1 pos 1:2 q 7681,10753 bound 1*(22516,26416)
// assertranges ...
//
// // modulus x^1-zeta4 pos 2:3 q 7681,10753 bound 1*(15747,17745)
// assertranges ...
//
// // modulus x^1+zeta4 pos 3:4 q 7681,10753 bound 1*(15747,17745)
// assertranges ...
// stopbatch 128
// physical_unmap (6, 4) (0, 1, 2, 3, 5)
// stopntt 512

// ----- codegen pass 2
//
// startntt 512
// startbatch 512
// vector_butterfly 0 256 1 0
// vector_butterfly 128 384 1 0
// vector_butterfly 64 320 1 0
// vector_butterfly 192 448 1 0
// vector_reduce_ifreverse 0
// vector_reduce_ifreverse 256
// vector_butterfly 0 128 1 0
// vector_butterfly 64 192 1 0
// vector_butterfly 256 384 4 1
// vector_butterfly 320 448 4 1
// vector_reduce_ifforward 64
// vector_butterfly 0 64 1 0
// vector_butterfly 128 192 4 1
// vector_butterfly 256 320 8 1
// vector_butterfly 384 448 8 7
// vector_reduce 0
// vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
// vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_butterfly 16 272 1 0
// vector_butterfly 144 400 1 0
// vector_butterfly 80 336 1 0
// vector_butterfly 208 464 1 0
// vector_reduce_ifreverse 16
// vector_reduce_ifreverse 272
// vector_butterfly 16 144 1 0
// vector_butterfly 80 208 1 0
// vector_butterfly 272 400 4 1
// vector_butterfly 336 464 4 1
// vector_reduce_ifforward 80
// vector_butterfly 16 80 1 0
// vector_butterfly 144 208 4 1
// vector_butterfly 272 336 8 1
// vector_butterfly 400 464 8 7
// vector_reduce 16
// vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
// vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_butterfly 32 288 1 0
// vector_butterfly 160 416 1 0
// vector_butterfly 96 352 1 0
// vector_butterfly 224 480 1 0
// vector_reduce_ifreverse 32
// vector_reduce_ifreverse 288
// vector_butterfly 32 160 1 0
// vector_butterfly 96 224 1 0
// vector_butterfly 288 416 4 1
// vector_butterfly 352 480 4 1
// vector_reduce_ifforward 96
// vector_butterfly 32 96 1 0
// vector_butterfly 160 224 4 1
// vector_butterfly 288 352 8 1
// vector_butterfly 416 480 8 7
// vector_reduce 32
// vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
// vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// startbatch 512
// vector_butterfly 48 304 1 0
// vector_butterfly 176 432 1 0
// vector_butterfly 112 368 1 0
// vector_butterfly 240 496 1 0
// vector_reduce_ifreverse 48
// vector_reduce_ifreverse 304
// vector_butterfly 48 176 1 0
// vector_butterfly 112 240 1 0
// vector_butterfly 304 432 4 1
// vector_butterfly 368 496 4 1
// vector_reduce_ifforward 112
// vector_butterfly 48 112 1 0
// vector_butterfly 176 240 4 1
// vector_butterfly 304 368 8 1
// vector_butterfly 432 496 8 7
// vector_reduce 48
// vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
// vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
// stopbatch 512
// doublereps
// doublereps
// startbatch 128
// vector_butterfly 0 32 1 0
// vector_butterfly 64 96 1 0
// vector_butterfly 16 48 1 0
// vector_butterfly 80 112 1 0
// vector_butterfly 0 16 1 0
// vector_butterfly 64 80 1 0
// vector_butterfly 32 48 4 1
// vector_butterfly 96 112 4 1
// vector_reduce 0
// vector_reduce 64
// vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
// vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
// vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
// vector_butterfly 0 64 1 0
// vector_butterfly 32 96 1 0
// vector_butterfly 16 80 1 0
// vector_butterfly 48 112 1 0
// vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
// stopbatch 128
// startbatch 128
// vector_butterfly 0 32 1 0
// vector_butterfly 16 48 1 0
// vector_butterfly 64 96 4 1
// vector_butterfly 80 112 4 1
// vector_reduce 0
// vector_reduce 16
// vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
// vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
// vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
// vector_butterfly 0 16 1 0
// vector_butterfly 64 80 1 0
// vector_butterfly 32 48 1 0
// vector_butterfly 96 112 1 0
// vector_butterfly 0 64 1 0
// vector_butterfly 32 96 1 0
// vector_butterfly 16 80 4 1
// vector_butterfly 48 112 4 1
// stopbatch 128
// stopntt 512
// startntt 512

static void ntt512(int16 *f, int reps, const int16 *qdata) {
    // startbatch 512
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 0 256 1 0
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
        int16x16 b0 = add_x16(a0, a16);
        int16x16 b16 = sub_x16(a0, a16);
        // vector_butterfly 128 384 1 0
        int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
        int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
        int16x16 b8 = add_x16(a8, a24);
        int16x16 b24 = sub_x16(a8, a24);
        // vector_butterfly 64 320 1 0
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
        int16x16 b4 = add_x16(a4, a20);
        int16x16 b20 = sub_x16(a4, a20);
        // vector_butterfly 192 448 1 0
        int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
        int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
        int16x16 b12 = add_x16(a12, a28);
        int16x16 b28 = sub_x16(a12, a28);
        // vector_reduce_ifreverse 0
        // vector_reduce_ifreverse 256
        // vector_butterfly 0 128 1 0
        int16x16 c0 = add_x16(b0, b8);
        int16x16 c8 = sub_x16(b0, b8);
        // vector_butterfly 64 192 1 0
        int16x16 c4 = add_x16(b4, b12);
        int16x16 c12 = sub_x16(b4, b12);
        // vector_butterfly 256 384 4 1
        b24 = mulmod_scaled_x16(b24, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c16 = add_x16(b16, b24);
        int16x16 c24 = sub_x16(b16, b24);
        // vector_butterfly 320 448 4 1
        b28 = mulmod_scaled_x16(b28, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c20 = add_x16(b20, b28);
        int16x16 c28 = sub_x16(b20, b28);
        // vector_reduce_ifforward 64
        c4 = reduce_x16(c4, qdata);
        // vector_butterfly 0 64 1 0
        int16x16 d0 = add_x16(c0, c4);
        int16x16 d4 = sub_x16(c0, c4);
        // vector_butterfly 128 192 4 1
        c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 d8 = add_x16(c8, c12);
        int16x16 d12 = sub_x16(c8, c12);
        // vector_butterfly 256 320 8 1
        c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        int16x16 d16 = add_x16(c16, c20);
        int16x16 d20 = sub_x16(c16, c20);
        // vector_butterfly 384 448 8 7
        c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        int16x16 d24 = add_x16(c24, c28);
        int16x16 d28 = sub_x16(c24, c28);
        // vector_reduce 0
        d0 = reduce_x16(d0, qdata);
        // vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d4 = mulmod_scaled_x16(d4, precomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d8 = mulmod_scaled_x16(d8, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d12 = mulmod_scaled_x16(d12, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d16 = mulmod_scaled_x16(d16, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d20 = mulmod_scaled_x16(d20, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d24 = mulmod_scaled_x16(d24, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        d28 = mulmod_scaled_x16(d28, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e0 = _mm256_permute2x128_si256_lo(d0, d4);
        int16x16 e4 = _mm256_permute2x128_si256_hi(d0, d4);
        // vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e8 = _mm256_permute2x128_si256_lo(d8, d12);
        int16x16 e12 = _mm256_permute2x128_si256_hi(d8, d12);
        // vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e16 = _mm256_permute2x128_si256_lo(d16, d20);
        int16x16 e20 = _mm256_permute2x128_si256_hi(d16, d20);
        // vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e24 = _mm256_permute2x128_si256_lo(d24, d28);
        int16x16 e28 = _mm256_permute2x128_si256_hi(d24, d28);
        // stopbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 0), e0);
        _mm256_storeu_si256((int16x16 *) (f + 64), e4);
        _mm256_storeu_si256((int16x16 *) (f + 128), e8);
        _mm256_storeu_si256((int16x16 *) (f + 192), e12);
        _mm256_storeu_si256((int16x16 *) (f + 256), e16);
        _mm256_storeu_si256((int16x16 *) (f + 320), e20);
        _mm256_storeu_si256((int16x16 *) (f + 384), e24);
        _mm256_storeu_si256((int16x16 *) (f + 448), e28);
        f += 512;
    }
    f -= 512 * reps;
    // startbatch 512
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 16 272 1 0
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
        int16x16 b1 = add_x16(a1, a17);
        int16x16 b17 = sub_x16(a1, a17);
        // vector_butterfly 144 400 1 0
        int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
        int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
        int16x16 b9 = add_x16(a9, a25);
        int16x16 b25 = sub_x16(a9, a25);
        // vector_butterfly 80 336 1 0
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
        int16x16 b5 = add_x16(a5, a21);
        int16x16 b21 = sub_x16(a5, a21);
        // vector_butterfly 208 464 1 0
        int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
        int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
        int16x16 b13 = add_x16(a13, a29);
        int16x16 b29 = sub_x16(a13, a29);
        // vector_reduce_ifreverse 16
        // vector_reduce_ifreverse 272
        // vector_butterfly 16 144 1 0
        int16x16 c1 = add_x16(b1, b9);
        int16x16 c9 = sub_x16(b1, b9);
        // vector_butterfly 80 208 1 0
        int16x16 c5 = add_x16(b5, b13);
        int16x16 c13 = sub_x16(b5, b13);
        // vector_butterfly 272 400 4 1
        b25 = mulmod_scaled_x16(b25, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c17 = add_x16(b17, b25);
        int16x16 c25 = sub_x16(b17, b25);
        // vector_butterfly 336 464 4 1
        b29 = mulmod_scaled_x16(b29, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c21 = add_x16(b21, b29);
        int16x16 c29 = sub_x16(b21, b29);
        // vector_reduce_ifforward 80
        c5 = reduce_x16(c5, qdata);
        // vector_butterfly 16 80 1 0
        int16x16 d1 = add_x16(c1, c5);
        int16x16 d5 = sub_x16(c1, c5);
        // vector_butterfly 144 208 4 1
        c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 d9 = add_x16(c9, c13);
        int16x16 d13 = sub_x16(c9, c13);
        // vector_butterfly 272 336 8 1
        c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        int16x16 d17 = add_x16(c17, c21);
        int16x16 d21 = sub_x16(c17, c21);
        // vector_butterfly 400 464 8 7
        c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        int16x16 d25 = add_x16(c25, c29);
        int16x16 d29 = sub_x16(c25, c29);
        // vector_reduce 16
        d1 = reduce_x16(d1, qdata);
        // vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d5 = mulmod_scaled_x16(d5, precomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d9 = mulmod_scaled_x16(d9, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d13 = mulmod_scaled_x16(d13, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d17 = mulmod_scaled_x16(d17, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d21 = mulmod_scaled_x16(d21, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d25 = mulmod_scaled_x16(d25, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        d29 = mulmod_scaled_x16(d29, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e1 = _mm256_permute2x128_si256_lo(d1, d5);
        int16x16 e5 = _mm256_permute2x128_si256_hi(d1, d5);
        // vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e9 = _mm256_permute2x128_si256_lo(d9, d13);
        int16x16 e13 = _mm256_permute2x128_si256_hi(d9, d13);
        // vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e17 = _mm256_permute2x128_si256_lo(d17, d21);
        int16x16 e21 = _mm256_permute2x128_si256_hi(d17, d21);
        // vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e25 = _mm256_permute2x128_si256_lo(d25, d29);
        int16x16 e29 = _mm256_permute2x128_si256_hi(d25, d29);
        // stopbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 16), e1);
        _mm256_storeu_si256((int16x16 *) (f + 80), e5);
        _mm256_storeu_si256((int16x16 *) (f + 144), e9);
        _mm256_storeu_si256((int16x16 *) (f + 208), e13);
        _mm256_storeu_si256((int16x16 *) (f + 272), e17);
        _mm256_storeu_si256((int16x16 *) (f + 336), e21);
        _mm256_storeu_si256((int16x16 *) (f + 400), e25);
        _mm256_storeu_si256((int16x16 *) (f + 464), e29);
        f += 512;
    }
    f -= 512 * reps;
    // startbatch 512
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 32 288 1 0
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
        int16x16 b2 = add_x16(a2, a18);
        int16x16 b18 = sub_x16(a2, a18);
        // vector_butterfly 160 416 1 0
        int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
        int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
        int16x16 b10 = add_x16(a10, a26);
        int16x16 b26 = sub_x16(a10, a26);
        // vector_butterfly 96 352 1 0
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
        int16x16 b6 = add_x16(a6, a22);
        int16x16 b22 = sub_x16(a6, a22);
        // vector_butterfly 224 480 1 0
        int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
        int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
        int16x16 b14 = add_x16(a14, a30);
        int16x16 b30 = sub_x16(a14, a30);
        // vector_reduce_ifreverse 32
        // vector_reduce_ifreverse 288
        // vector_butterfly 32 160 1 0
        int16x16 c2 = add_x16(b2, b10);
        int16x16 c10 = sub_x16(b2, b10);
        // vector_butterfly 96 224 1 0
        int16x16 c6 = add_x16(b6, b14);
        int16x16 c14 = sub_x16(b6, b14);
        // vector_butterfly 288 416 4 1
        b26 = mulmod_scaled_x16(b26, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c18 = add_x16(b18, b26);
        int16x16 c26 = sub_x16(b18, b26);
        // vector_butterfly 352 480 4 1
        b30 = mulmod_scaled_x16(b30, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c22 = add_x16(b22, b30);
        int16x16 c30 = sub_x16(b22, b30);
        // vector_reduce_ifforward 96
        c6 = reduce_x16(c6, qdata);
        // vector_butterfly 32 96 1 0
        int16x16 d2 = add_x16(c2, c6);
        int16x16 d6 = sub_x16(c2, c6);
        // vector_butterfly 160 224 4 1
        c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 d10 = add_x16(c10, c14);
        int16x16 d14 = sub_x16(c10, c14);
        // vector_butterfly 288 352 8 1
        c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        int16x16 d18 = add_x16(c18, c22);
        int16x16 d22 = sub_x16(c18, c22);
        // vector_butterfly 416 480 8 7
        c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        int16x16 d26 = add_x16(c26, c30);
        int16x16 d30 = sub_x16(c26, c30);
        // vector_reduce 32
        d2 = reduce_x16(d2, qdata);
        // vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d6 = mulmod_scaled_x16(d6, precomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d10 = mulmod_scaled_x16(d10, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d14 = mulmod_scaled_x16(d14, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d18 = mulmod_scaled_x16(d18, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d22 = mulmod_scaled_x16(d22, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d26 = mulmod_scaled_x16(d26, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        d30 = mulmod_scaled_x16(d30, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e2 = _mm256_permute2x128_si256_lo(d2, d6);
        int16x16 e6 = _mm256_permute2x128_si256_hi(d2, d6);
        // vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e10 = _mm256_permute2x128_si256_lo(d10, d14);
        int16x16 e14 = _mm256_permute2x128_si256_hi(d10, d14);
        // vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e18 = _mm256_permute2x128_si256_lo(d18, d22);
        int16x16 e22 = _mm256_permute2x128_si256_hi(d18, d22);
        // vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e26 = _mm256_permute2x128_si256_lo(d26, d30);
        int16x16 e30 = _mm256_permute2x128_si256_hi(d26, d30);
        // stopbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 32), e2);
        _mm256_storeu_si256((int16x16 *) (f + 96), e6);
        _mm256_storeu_si256((int16x16 *) (f + 160), e10);
        _mm256_storeu_si256((int16x16 *) (f + 224), e14);
        _mm256_storeu_si256((int16x16 *) (f + 288), e18);
        _mm256_storeu_si256((int16x16 *) (f + 352), e22);
        _mm256_storeu_si256((int16x16 *) (f + 416), e26);
        _mm256_storeu_si256((int16x16 *) (f + 480), e30);
        f += 512;
    }
    f -= 512 * reps;
    // startbatch 512
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 48 304 1 0
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
        int16x16 b3 = add_x16(a3, a19);
        int16x16 b19 = sub_x16(a3, a19);
        // vector_butterfly 176 432 1 0
        int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
        int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
        int16x16 b11 = add_x16(a11, a27);
        int16x16 b27 = sub_x16(a11, a27);
        // vector_butterfly 112 368 1 0
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
        int16x16 b7 = add_x16(a7, a23);
        int16x16 b23 = sub_x16(a7, a23);
        // vector_butterfly 240 496 1 0
        int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
        int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
        int16x16 b15 = add_x16(a15, a31);
        int16x16 b31 = sub_x16(a15, a31);
        // vector_reduce_ifreverse 48
        // vector_reduce_ifreverse 304
        // vector_butterfly 48 176 1 0
        int16x16 c3 = add_x16(b3, b11);
        int16x16 c11 = sub_x16(b3, b11);
        // vector_butterfly 112 240 1 0
        int16x16 c7 = add_x16(b7, b15);
        int16x16 c15 = sub_x16(b7, b15);
        // vector_butterfly 304 432 4 1
        b27 = mulmod_scaled_x16(b27, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c19 = add_x16(b19, b27);
        int16x16 c27 = sub_x16(b19, b27);
        // vector_butterfly 368 496 4 1
        b31 = mulmod_scaled_x16(b31, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c23 = add_x16(b23, b31);
        int16x16 c31 = sub_x16(b23, b31);
        // vector_reduce_ifforward 112
        c7 = reduce_x16(c7, qdata);
        // vector_butterfly 48 112 1 0
        int16x16 d3 = add_x16(c3, c7);
        int16x16 d7 = sub_x16(c3, c7);
        // vector_butterfly 176 240 4 1
        c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 d11 = add_x16(c11, c15);
        int16x16 d15 = sub_x16(c11, c15);
        // vector_butterfly 304 368 8 1
        c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        int16x16 d19 = add_x16(c19, c23);
        int16x16 d23 = sub_x16(c19, c23);
        // vector_butterfly 432 496 8 7
        c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        int16x16 d27 = add_x16(c27, c31);
        int16x16 d31 = sub_x16(c27, c31);
        // vector_reduce 48
        d3 = reduce_x16(d3, qdata);
        // vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d7 = mulmod_scaled_x16(d7, precomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d11 = mulmod_scaled_x16(d11, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d15 = mulmod_scaled_x16(d15, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d19 = mulmod_scaled_x16(d19, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d23 = mulmod_scaled_x16(d23, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d27 = mulmod_scaled_x16(d27, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        d31 = mulmod_scaled_x16(d31, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e3 = _mm256_permute2x128_si256_lo(d3, d7);
        int16x16 e7 = _mm256_permute2x128_si256_hi(d3, d7);
        // vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e11 = _mm256_permute2x128_si256_lo(d11, d15);
        int16x16 e15 = _mm256_permute2x128_si256_hi(d11, d15);
        // vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e19 = _mm256_permute2x128_si256_lo(d19, d23);
        int16x16 e23 = _mm256_permute2x128_si256_hi(d19, d23);
        // vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 e27 = _mm256_permute2x128_si256_lo(d27, d31);
        int16x16 e31 = _mm256_permute2x128_si256_hi(d27, d31);
        // stopbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 48), e3);
        _mm256_storeu_si256((int16x16 *) (f + 112), e7);
        _mm256_storeu_si256((int16x16 *) (f + 176), e11);
        _mm256_storeu_si256((int16x16 *) (f + 240), e15);
        _mm256_storeu_si256((int16x16 *) (f + 304), e19);
        _mm256_storeu_si256((int16x16 *) (f + 368), e23);
        _mm256_storeu_si256((int16x16 *) (f + 432), e27);
        _mm256_storeu_si256((int16x16 *) (f + 496), e31);
        f += 512;
    }
    f -= 512 * reps;
    // doublereps
    reps *= 2;
    // doublereps
    reps *= 2;
    // startbatch 128
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 0 32 1 0
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 b0 = add_x16(a0, a2);
        int16x16 b2 = sub_x16(a0, a2);
        // vector_butterfly 64 96 1 0
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 b4 = add_x16(a4, a6);
        int16x16 b6 = sub_x16(a4, a6);
        // vector_butterfly 16 48 1 0
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 b1 = add_x16(a1, a3);
        int16x16 b3 = sub_x16(a1, a3);
        // vector_butterfly 80 112 1 0
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b5 = add_x16(a5, a7);
        int16x16 b7 = sub_x16(a5, a7);
        // vector_butterfly 0 16 1 0
        int16x16 c0 = add_x16(b0, b1);
        int16x16 c1 = sub_x16(b0, b1);
        // vector_butterfly 64 80 1 0
        int16x16 c4 = add_x16(b4, b5);
        int16x16 c5 = sub_x16(b4, b5);
        // vector_butterfly 32 48 4 1
        b3 = mulmod_scaled_x16(b3, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c2 = add_x16(b2, b3);
        int16x16 c3 = sub_x16(b2, b3);
        // vector_butterfly 96 112 4 1
        b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 c6 = add_x16(b6, b7);
        int16x16 c7 = sub_x16(b6, b7);
        // vector_reduce 0
        c0 = reduce_x16(c0, qdata);
        // vector_reduce 64
        c4 = reduce_x16(c4, qdata);
        // vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        c1 = mulmod_scaled_x16(c1, precomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        c5 = mulmod_scaled_x16(c5, precomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        c2 = mulmod_scaled_x16(c2, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        c6 = mulmod_scaled_x16(c6, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        c3 = mulmod_scaled_x16(c3, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        c7 = mulmod_scaled_x16(c7, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 d0 = _mm256_unpacklo_epi16(c0, c2);
        int16x16 d2 = _mm256_unpackhi_epi16(c0, c2);
        // vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 d1 = _mm256_unpacklo_epi16(c1, c3);
        int16x16 d3 = _mm256_unpackhi_epi16(c1, c3);
        // vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 d4 = _mm256_unpacklo_epi16(c4, c6);
        int16x16 d6 = _mm256_unpackhi_epi16(c4, c6);
        // vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 d5 = _mm256_unpacklo_epi16(c5, c7);
        int16x16 d7 = _mm256_unpackhi_epi16(c5, c7);
        // vector_butterfly 0 64 1 0
        int16x16 e0 = add_x16(d0, d4);
        int16x16 e4 = sub_x16(d0, d4);
        // vector_butterfly 32 96 1 0
        int16x16 e2 = add_x16(d2, d6);
        int16x16 e6 = sub_x16(d2, d6);
        // vector_butterfly 16 80 1 0
        int16x16 e1 = add_x16(d1, d5);
        int16x16 e5 = sub_x16(d1, d5);
        // vector_butterfly 48 112 1 0
        int16x16 e3 = add_x16(d3, d7);
        int16x16 e7 = sub_x16(d3, d7);
        // vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 f0 = _mm256_unpacklo_epi32(e0, e1);
        int16x16 f1 = _mm256_unpackhi_epi32(e0, e1);
        // vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 f2 = _mm256_unpacklo_epi32(e2, e3);
        int16x16 f3 = _mm256_unpackhi_epi32(e2, e3);
        // vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 f4 = _mm256_unpacklo_epi32(e4, e5);
        int16x16 f5 = _mm256_unpackhi_epi32(e4, e5);
        // vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 f6 = _mm256_unpacklo_epi32(e6, e7);
        int16x16 f7 = _mm256_unpackhi_epi32(e6, e7);
        // stopbatch 128
        _mm256_storeu_si256((int16x16 *) (f + 0), f0);
        _mm256_storeu_si256((int16x16 *) (f + 16), f1);
        _mm256_storeu_si256((int16x16 *) (f + 32), f2);
        _mm256_storeu_si256((int16x16 *) (f + 48), f3);
        _mm256_storeu_si256((int16x16 *) (f + 64), f4);
        _mm256_storeu_si256((int16x16 *) (f + 80), f5);
        _mm256_storeu_si256((int16x16 *) (f + 96), f6);
        _mm256_storeu_si256((int16x16 *) (f + 112), f7);
        f += 128;
    }
    f -= 128 * reps;
    // startbatch 128
    for (long long r = 0; r < reps; ++r) {
        // vector_butterfly 0 32 1 0
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 b0 = add_x16(a0, a2);
        int16x16 b2 = sub_x16(a0, a2);
        // vector_butterfly 16 48 1 0
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 b1 = add_x16(a1, a3);
        int16x16 b3 = sub_x16(a1, a3);
        // vector_butterfly 64 96 4 1
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        a6 = mulmod_scaled_x16(a6, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 b4 = add_x16(a4, a6);
        int16x16 b6 = sub_x16(a4, a6);
        // vector_butterfly 80 112 4 1
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        a7 = mulmod_scaled_x16(a7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 b5 = add_x16(a5, a7);
        int16x16 b7 = sub_x16(a5, a7);
        // vector_reduce 0
        b0 = reduce_x16(b0, qdata);
        // vector_reduce 16
        b1 = reduce_x16(b1, qdata);
        // vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        b2 = mulmod_scaled_x16(b2, precomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        b3 = mulmod_scaled_x16(b3, precomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        b4 = mulmod_scaled_x16(b4, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        b5 = mulmod_scaled_x16(b5, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        b6 = mulmod_scaled_x16(b6, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        b7 = mulmod_scaled_x16(b7, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 c0 = _mm256_unpacklo_epi64(b0, b4);
        int16x16 c4 = _mm256_unpackhi_epi64(b0, b4);
        // vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 c1 = _mm256_unpacklo_epi64(b1, b5);
        int16x16 c5 = _mm256_unpackhi_epi64(b1, b5);
        // vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 c2 = _mm256_unpacklo_epi64(b2, b6);
        int16x16 c6 = _mm256_unpackhi_epi64(b2, b6);
        // vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 c3 = _mm256_unpacklo_epi64(b3, b7);
        int16x16 c7 = _mm256_unpackhi_epi64(b3, b7);
        // vector_butterfly 0 16 1 0
        int16x16 d0 = add_x16(c0, c1);
        int16x16 d1 = sub_x16(c0, c1);
        // vector_butterfly 64 80 1 0
        int16x16 d4 = add_x16(c4, c5);
        int16x16 d5 = sub_x16(c4, c5);
        // vector_butterfly 32 48 1 0
        int16x16 d2 = add_x16(c2, c3);
        int16x16 d3 = sub_x16(c2, c3);
        // vector_butterfly 96 112 1 0
        int16x16 d6 = add_x16(c6, c7);
        int16x16 d7 = sub_x16(c6, c7);
        // vector_butterfly 0 64 1 0
        int16x16 e0 = add_x16(d0, d4);
        int16x16 e4 = sub_x16(d0, d4);
        // vector_butterfly 32 96 1 0
        int16x16 e2 = add_x16(d2, d6);
        int16x16 e6 = sub_x16(d2, d6);
        // vector_butterfly 16 80 4 1
        d5 = mulmod_scaled_x16(d5, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 e1 = add_x16(d1, d5);
        int16x16 e5 = sub_x16(d1, d5);
        // vector_butterfly 48 112 4 1
        d7 = mulmod_scaled_x16(d7, scaledzeta_x16_4_1, qinvscaledzeta_x16_4_1, qdata);
        int16x16 e3 = add_x16(d3, d7);
        int16x16 e7 = sub_x16(d3, d7);
        // stopbatch 128
        _mm256_storeu_si256((int16x16 *) (f + 0), e0);
        _mm256_storeu_si256((int16x16 *) (f + 16), e1);
        _mm256_storeu_si256((int16x16 *) (f + 32), e2);
        _mm256_storeu_si256((int16x16 *) (f + 48), e3);
        _mm256_storeu_si256((int16x16 *) (f + 64), e4);
        _mm256_storeu_si256((int16x16 *) (f + 80), e5);
        _mm256_storeu_si256((int16x16 *) (f + 96), e6);
        _mm256_storeu_si256((int16x16 *) (f + 112), e7);
        f += 128;
    }
    // f -= 128*reps;
    // stopntt 512
}

// Exported entry point: runs ntt512 on f using the qdata_7681 constant table.
//   f    - int16 buffer passed straight to ntt512, which loads from and
//          stores back to it (the transform is done in place).
//          NOTE(review): presumably reps blocks of 512 coefficients - confirm
//          against the start of ntt512.
//   reps - repetition count, forwarded unchanged.
void PQCLEAN_NTRULPR1013_AVX2_ntt512_7681(int16 *f, int reps) {
    const int16 *table = qdata_7681.data;

    ntt512(f, reps, table);
}

// Exported entry point: runs ntt512 on f using the qdata_10753 constant table.
//   f    - int16 buffer passed straight to ntt512, which loads from and
//          stores back to it (the transform is done in place).
//          NOTE(review): presumably reps blocks of 512 coefficients - confirm
//          against the start of ntt512.
//   reps - repetition count, forwarded unchanged.
void PQCLEAN_NTRULPR1013_AVX2_ntt512_10753(int16 *f, int reps) {
    const int16 *table = qdata_10753.data;

    ntt512(f, reps, table);
}
// inv stopntt 512

static void invntt512(int16 *f, int reps, const int16 *qdata) {
    reps *= 4;
    // inv stopbatch 128
    for (long long r = 0; r < reps; ++r) {
        // inv vector_butterfly 48 112 4 1
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b3 = add_x16(a3, a7);
        int16x16 b7 = sub_x16(a3, a7);
        b7 = mulmod_scaled_x16(b7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 16 80 4 1
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 b1 = add_x16(a1, a5);
        int16x16 b5 = sub_x16(a1, a5);
        b5 = mulmod_scaled_x16(b5, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 96 1 0
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 b2 = add_x16(a2, a6);
        int16x16 b6 = sub_x16(a2, a6);
        // inv vector_butterfly 0 64 1 0
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 b0 = add_x16(a0, a4);
        int16x16 b4 = sub_x16(a0, a4);
        // inv vector_butterfly 96 112 1 0
        int16x16 c6 = add_x16(b6, b7);
        int16x16 c7 = sub_x16(b6, b7);
        // inv vector_butterfly 32 48 1 0
        int16x16 c2 = add_x16(b2, b3);
        int16x16 c3 = sub_x16(b2, b3);
        // inv vector_butterfly 64 80 1 0
        int16x16 c4 = add_x16(b4, b5);
        int16x16 c5 = sub_x16(b4, b5);
        // inv vector_butterfly 0 16 1 0
        int16x16 c0 = add_x16(b0, b1);
        int16x16 c1 = sub_x16(b0, b1);
        // inv vector_permute 48 112 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 d3 = _mm256_unpacklo_epi64(c3, c7);
        int16x16 d7 = _mm256_unpackhi_epi64(c3, c7);
        // inv vector_permute 32 96 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 d2 = _mm256_unpacklo_epi64(c2, c6);
        int16x16 d6 = _mm256_unpackhi_epi64(c2, c6);
        // inv vector_permute 16 80 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 d1 = _mm256_unpacklo_epi64(c1, c5);
        int16x16 d5 = _mm256_unpackhi_epi64(c1, c5);
        // inv vector_permute 0 64 _mm256_unpacklo_epi64 _mm256_unpackhi_epi64
        int16x16 d0 = _mm256_unpacklo_epi64(c0, c4);
        int16x16 d4 = _mm256_unpackhi_epi64(c0, c4);
        // inv vector_twist 112 16 15 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        d7 = mulmod_scaled_x16(d7, precomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_1_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // inv vector_twist 96 16 15 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        d6 = mulmod_scaled_x16(d6, precomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_1_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // inv vector_twist 80 16 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        d5 = mulmod_scaled_x16(d5, precomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_16_15_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // inv vector_twist 64 16 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        d4 = mulmod_scaled_x16(d4, precomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_16_15_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // inv vector_twist 48 8 1 2 2 2 2 3 3 3 3 2 2 2 2 3 3 3 3
        d3 = mulmod_scaled_x16(d3, precomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qinvprecomp_8_7_2_2_2_2_3_3_3_3_2_2_2_2_3_3_3_3, qdata);
        // inv vector_twist 32 8 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1
        d2 = mulmod_scaled_x16(d2, precomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qinvprecomp_8_7_0_0_0_0_1_1_1_1_0_0_0_0_1_1_1_1, qdata);
        // inv vector_reduce 16
        d1 = reduce_x16(d1, qdata);
        // inv vector_reduce 0
        d0 = reduce_x16(d0, qdata);
        // inv vector_butterfly 80 112 4 1
        int16x16 e5 = add_x16(d5, d7);
        int16x16 e7 = sub_x16(d5, d7);
        e7 = mulmod_scaled_x16(e7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 64 96 4 1
        int16x16 e4 = add_x16(d4, d6);
        int16x16 e6 = sub_x16(d4, d6);
        e6 = mulmod_scaled_x16(e6, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 16 48 1 0
        int16x16 e1 = add_x16(d1, d3);
        int16x16 e3 = sub_x16(d1, d3);
        // inv vector_butterfly 0 32 1 0
        int16x16 e0 = add_x16(d0, d2);
        int16x16 e2 = sub_x16(d0, d2);
        // inv startbatch 128
        _mm256_storeu_si256((int16x16 *) (f + 0), e0);
        _mm256_storeu_si256((int16x16 *) (f + 16), e1);
        _mm256_storeu_si256((int16x16 *) (f + 32), e2);
        _mm256_storeu_si256((int16x16 *) (f + 48), e3);
        _mm256_storeu_si256((int16x16 *) (f + 64), e4);
        _mm256_storeu_si256((int16x16 *) (f + 80), e5);
        _mm256_storeu_si256((int16x16 *) (f + 96), e6);
        _mm256_storeu_si256((int16x16 *) (f + 112), e7);
        f += 128;
    }
    f -= 128 * reps;
    // inv stopbatch 128
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 96 112 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b6 = _mm256_unpacklo_epi32(a6, a7);
        int16x16 b7 = _mm256_unpackhi_epi32(a6, a7);
        int16x16 c6 = _mm256_unpacklo_epi32(b6, b7);
        int16x16 c7 = _mm256_unpackhi_epi32(b6, b7);
        // inv vector_permute 64 80 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 b4 = _mm256_unpacklo_epi32(a4, a5);
        int16x16 b5 = _mm256_unpackhi_epi32(a4, a5);
        int16x16 c4 = _mm256_unpacklo_epi32(b4, b5);
        int16x16 c5 = _mm256_unpackhi_epi32(b4, b5);
        // inv vector_permute 32 48 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 b2 = _mm256_unpacklo_epi32(a2, a3);
        int16x16 b3 = _mm256_unpackhi_epi32(a2, a3);
        int16x16 c2 = _mm256_unpacklo_epi32(b2, b3);
        int16x16 c3 = _mm256_unpackhi_epi32(b2, b3);
        // inv vector_permute 0 16 _mm256_unpacklo_epi32 _mm256_unpackhi_epi32
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 b0 = _mm256_unpacklo_epi32(a0, a1);
        int16x16 b1 = _mm256_unpackhi_epi32(a0, a1);
        int16x16 c0 = _mm256_unpacklo_epi32(b0, b1);
        int16x16 c1 = _mm256_unpackhi_epi32(b0, b1);
        // inv vector_butterfly 48 112 1 0
        int16x16 d3 = add_x16(c3, c7);
        int16x16 d7 = sub_x16(c3, c7);
        // inv vector_butterfly 16 80 1 0
        int16x16 d1 = add_x16(c1, c5);
        int16x16 d5 = sub_x16(c1, c5);
        // inv vector_butterfly 32 96 1 0
        int16x16 d2 = add_x16(c2, c6);
        int16x16 d6 = sub_x16(c2, c6);
        // inv vector_butterfly 0 64 1 0
        int16x16 d0 = add_x16(c0, c4);
        int16x16 d4 = sub_x16(c0, c4);
        // inv vector_permute 80 112 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 e5 = _mm256_unpacklo_epi16(d5, d7);
        int16x16 e7 = _mm256_unpackhi_epi16(d5, d7);
        int16x16 f5 = _mm256_unpacklo_epi16(e5, e7);
        int16x16 f7 = _mm256_unpackhi_epi16(e5, e7);
        int16x16 g5 = _mm256_unpacklo_epi16(f5, f7);
        int16x16 g7 = _mm256_unpackhi_epi16(f5, f7);
        // inv vector_permute 64 96 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 e4 = _mm256_unpacklo_epi16(d4, d6);
        int16x16 e6 = _mm256_unpackhi_epi16(d4, d6);
        int16x16 f4 = _mm256_unpacklo_epi16(e4, e6);
        int16x16 f6 = _mm256_unpackhi_epi16(e4, e6);
        int16x16 g4 = _mm256_unpacklo_epi16(f4, f6);
        int16x16 g6 = _mm256_unpackhi_epi16(f4, f6);
        // inv vector_permute 16 48 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 e1 = _mm256_unpacklo_epi16(d1, d3);
        int16x16 e3 = _mm256_unpackhi_epi16(d1, d3);
        int16x16 f1 = _mm256_unpacklo_epi16(e1, e3);
        int16x16 f3 = _mm256_unpackhi_epi16(e1, e3);
        int16x16 g1 = _mm256_unpacklo_epi16(f1, f3);
        int16x16 g3 = _mm256_unpackhi_epi16(f1, f3);
        // inv vector_permute 0 32 _mm256_unpacklo_epi16 _mm256_unpackhi_epi16
        int16x16 e0 = _mm256_unpacklo_epi16(d0, d2);
        int16x16 e2 = _mm256_unpackhi_epi16(d0, d2);
        int16x16 f0 = _mm256_unpacklo_epi16(e0, e2);
        int16x16 f2 = _mm256_unpackhi_epi16(e0, e2);
        int16x16 g0 = _mm256_unpacklo_epi16(f0, f2);
        int16x16 g2 = _mm256_unpackhi_epi16(f0, f2);
        // inv vector_twist 112 64 63 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        g7 = mulmod_scaled_x16(g7, precomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_1_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 48 64 63 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        g3 = mulmod_scaled_x16(g3, precomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_1_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // inv vector_twist 96 64 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        g6 = mulmod_scaled_x16(g6, precomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_64_63_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 32 64 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        g2 = mulmod_scaled_x16(g2, precomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_64_63_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // inv vector_twist 80 32 1 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15
        g5 = mulmod_scaled_x16(g5, precomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qinvprecomp_32_31_8_9_10_11_12_13_14_15_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 16 32 1 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
        g1 = mulmod_scaled_x16(g1, precomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qinvprecomp_32_31_0_1_2_3_4_5_6_7_0_1_2_3_4_5_6_7, qdata);
        // inv vector_reduce 64
        g4 = reduce_x16(g4, qdata);
        // inv vector_reduce 0
        g0 = reduce_x16(g0, qdata);
        // inv vector_butterfly 96 112 4 1
        int16x16 h6 = add_x16(g6, g7);
        int16x16 h7 = sub_x16(g6, g7);
        h7 = mulmod_scaled_x16(h7, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 48 4 1
        int16x16 h2 = add_x16(g2, g3);
        int16x16 h3 = sub_x16(g2, g3);
        h3 = mulmod_scaled_x16(h3, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 64 80 1 0
        int16x16 h4 = add_x16(g4, g5);
        int16x16 h5 = sub_x16(g4, g5);
        // inv vector_butterfly 0 16 1 0
        int16x16 h0 = add_x16(g0, g1);
        int16x16 h1 = sub_x16(g0, g1);
        // inv vector_butterfly 80 112 1 0
        int16x16 i5 = add_x16(h5, h7);
        int16x16 i7 = sub_x16(h5, h7);
        // inv vector_butterfly 16 48 1 0
        int16x16 i1 = add_x16(h1, h3);
        int16x16 i3 = sub_x16(h1, h3);
        // inv vector_butterfly 64 96 1 0
        int16x16 i4 = add_x16(h4, h6);
        int16x16 i6 = sub_x16(h4, h6);
        // inv vector_butterfly 0 32 1 0
        int16x16 i0 = add_x16(h0, h2);
        int16x16 i2 = sub_x16(h0, h2);
        // inv startbatch 128
        _mm256_storeu_si256((int16x16 *) (f + 0), i0);
        _mm256_storeu_si256((int16x16 *) (f + 16), i1);
        _mm256_storeu_si256((int16x16 *) (f + 32), i2);
        _mm256_storeu_si256((int16x16 *) (f + 48), i3);
        _mm256_storeu_si256((int16x16 *) (f + 64), i4);
        _mm256_storeu_si256((int16x16 *) (f + 80), i5);
        _mm256_storeu_si256((int16x16 *) (f + 96), i6);
        _mm256_storeu_si256((int16x16 *) (f + 112), i7);
        f += 128;
    }
    f -= 128 * reps;
    // inv doublereps
    reps /= 2;
    // inv doublereps
    reps /= 2;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 432 496 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a27 = _mm256_loadu_si256((int16x16 *) (f + 432));
        int16x16 a31 = _mm256_loadu_si256((int16x16 *) (f + 496));
        int16x16 b27 = _mm256_permute2x128_si256_lo(a27, a31);
        int16x16 b31 = _mm256_permute2x128_si256_hi(a27, a31);
        // inv vector_permute 304 368 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a19 = _mm256_loadu_si256((int16x16 *) (f + 304));
        int16x16 a23 = _mm256_loadu_si256((int16x16 *) (f + 368));
        int16x16 b19 = _mm256_permute2x128_si256_lo(a19, a23);
        int16x16 b23 = _mm256_permute2x128_si256_hi(a19, a23);
        // inv vector_permute 176 240 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a11 = _mm256_loadu_si256((int16x16 *) (f + 176));
        int16x16 a15 = _mm256_loadu_si256((int16x16 *) (f + 240));
        int16x16 b11 = _mm256_permute2x128_si256_lo(a11, a15);
        int16x16 b15 = _mm256_permute2x128_si256_hi(a11, a15);
        // inv vector_permute 48 112 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a3 = _mm256_loadu_si256((int16x16 *) (f + 48));
        int16x16 a7 = _mm256_loadu_si256((int16x16 *) (f + 112));
        int16x16 b3 = _mm256_permute2x128_si256_lo(a3, a7);
        int16x16 b7 = _mm256_permute2x128_si256_hi(a3, a7);
        // inv vector_twist 496 512 507 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b31 = mulmod_scaled_x16(b31, precomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_5_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 432 512 511 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b27 = mulmod_scaled_x16(b27, precomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 368 512 5 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b23 = mulmod_scaled_x16(b23, precomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_507_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 304 512 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b19 = mulmod_scaled_x16(b19, precomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_512_511_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 240 256 255 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b15 = mulmod_scaled_x16(b15, precomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_1_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 176 256 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b11 = mulmod_scaled_x16(b11, precomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_256_255_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_twist 112 128 1 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
        b7 = mulmod_scaled_x16(b7, precomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qinvprecomp_128_127_48_49_50_51_52_53_54_55_56_57_58_59_60_61_62_63, qdata);
        // inv vector_reduce 48
        b3 = reduce_x16(b3, qdata);
        // inv vector_butterfly 432 496 8 7
        int16x16 c27 = add_x16(b27, b31);
        int16x16 c31 = sub_x16(b27, b31);
        c31 = mulmod_scaled_x16(c31, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 304 368 8 1
        int16x16 c19 = add_x16(b19, b23);
        int16x16 c23 = sub_x16(b19, b23);
        c23 = mulmod_scaled_x16(c23, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 176 240 4 1
        int16x16 c11 = add_x16(b11, b15);
        int16x16 c15 = sub_x16(b11, b15);
        c15 = mulmod_scaled_x16(c15, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 48 112 1 0
        int16x16 c3 = add_x16(b3, b7);
        int16x16 c7 = sub_x16(b3, b7);
        // inv vector_reduce_ifforward 112
        // inv vector_butterfly 368 496 4 1
        int16x16 d23 = add_x16(c23, c31);
        int16x16 d31 = sub_x16(c23, c31);
        d31 = mulmod_scaled_x16(d31, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 304 432 4 1
        int16x16 d19 = add_x16(c19, c27);
        int16x16 d27 = sub_x16(c19, c27);
        d27 = mulmod_scaled_x16(d27, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 112 240 1 0
        int16x16 d7 = add_x16(c7, c15);
        int16x16 d15 = sub_x16(c7, c15);
        // inv vector_butterfly 48 176 1 0
        int16x16 d3 = add_x16(c3, c11);
        int16x16 d11 = sub_x16(c3, c11);
        // inv vector_reduce_ifreverse 304
        d19 = reduce_x16(d19, qdata);
        // inv vector_reduce_ifreverse 48
        d3 = reduce_x16(d3, qdata);
        // inv vector_butterfly 240 496 1 0
        int16x16 e15 = add_x16(d15, d31);
        int16x16 e31 = sub_x16(d15, d31);
        // inv vector_butterfly 112 368 1 0
        int16x16 e7 = add_x16(d7, d23);
        int16x16 e23 = sub_x16(d7, d23);
        // inv vector_butterfly 176 432 1 0
        int16x16 e11 = add_x16(d11, d27);
        int16x16 e27 = sub_x16(d11, d27);
        // inv vector_butterfly 48 304 1 0
        int16x16 e3 = add_x16(d3, d19);
        int16x16 e19 = sub_x16(d3, d19);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 48), e3);
        _mm256_storeu_si256((int16x16 *) (f + 112), e7);
        _mm256_storeu_si256((int16x16 *) (f + 176), e11);
        _mm256_storeu_si256((int16x16 *) (f + 240), e15);
        _mm256_storeu_si256((int16x16 *) (f + 304), e19);
        _mm256_storeu_si256((int16x16 *) (f + 368), e23);
        _mm256_storeu_si256((int16x16 *) (f + 432), e27);
        _mm256_storeu_si256((int16x16 *) (f + 496), e31);
        f += 512;
    }
    f -= 512 * reps;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 416 480 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a26 = _mm256_loadu_si256((int16x16 *) (f + 416));
        int16x16 a30 = _mm256_loadu_si256((int16x16 *) (f + 480));
        int16x16 b26 = _mm256_permute2x128_si256_lo(a26, a30);
        int16x16 b30 = _mm256_permute2x128_si256_hi(a26, a30);
        // inv vector_permute 288 352 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a18 = _mm256_loadu_si256((int16x16 *) (f + 288));
        int16x16 a22 = _mm256_loadu_si256((int16x16 *) (f + 352));
        int16x16 b18 = _mm256_permute2x128_si256_lo(a18, a22);
        int16x16 b22 = _mm256_permute2x128_si256_hi(a18, a22);
        // inv vector_permute 160 224 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a10 = _mm256_loadu_si256((int16x16 *) (f + 160));
        int16x16 a14 = _mm256_loadu_si256((int16x16 *) (f + 224));
        int16x16 b10 = _mm256_permute2x128_si256_lo(a10, a14);
        int16x16 b14 = _mm256_permute2x128_si256_hi(a10, a14);
        // inv vector_permute 32 96 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a2 = _mm256_loadu_si256((int16x16 *) (f + 32));
        int16x16 a6 = _mm256_loadu_si256((int16x16 *) (f + 96));
        int16x16 b2 = _mm256_permute2x128_si256_lo(a2, a6);
        int16x16 b6 = _mm256_permute2x128_si256_hi(a2, a6);
        // inv vector_twist 480 512 507 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b30 = mulmod_scaled_x16(b30, precomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_5_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 416 512 511 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b26 = mulmod_scaled_x16(b26, precomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 352 512 5 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b22 = mulmod_scaled_x16(b22, precomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_507_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 288 512 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b18 = mulmod_scaled_x16(b18, precomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_512_511_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 224 256 255 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b14 = mulmod_scaled_x16(b14, precomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_1_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 160 256 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b10 = mulmod_scaled_x16(b10, precomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_256_255_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_twist 96 128 1 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
        b6 = mulmod_scaled_x16(b6, precomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qinvprecomp_128_127_32_33_34_35_36_37_38_39_40_41_42_43_44_45_46_47, qdata);
        // inv vector_reduce 32
        b2 = reduce_x16(b2, qdata);
        // inv vector_butterfly 416 480 8 7
        int16x16 c26 = add_x16(b26, b30);
        int16x16 c30 = sub_x16(b26, b30);
        c30 = mulmod_scaled_x16(c30, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 288 352 8 1
        int16x16 c18 = add_x16(b18, b22);
        int16x16 c22 = sub_x16(b18, b22);
        c22 = mulmod_scaled_x16(c22, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 160 224 4 1
        int16x16 c10 = add_x16(b10, b14);
        int16x16 c14 = sub_x16(b10, b14);
        c14 = mulmod_scaled_x16(c14, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 32 96 1 0
        int16x16 c2 = add_x16(b2, b6);
        int16x16 c6 = sub_x16(b2, b6);
        // inv vector_reduce_ifforward 96
        // inv vector_butterfly 352 480 4 1
        int16x16 d22 = add_x16(c22, c30);
        int16x16 d30 = sub_x16(c22, c30);
        d30 = mulmod_scaled_x16(d30, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 288 416 4 1
        int16x16 d18 = add_x16(c18, c26);
        int16x16 d26 = sub_x16(c18, c26);
        d26 = mulmod_scaled_x16(d26, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 96 224 1 0
        int16x16 d6 = add_x16(c6, c14);
        int16x16 d14 = sub_x16(c6, c14);
        // inv vector_butterfly 32 160 1 0
        int16x16 d2 = add_x16(c2, c10);
        int16x16 d10 = sub_x16(c2, c10);
        // inv vector_reduce_ifreverse 288
        d18 = reduce_x16(d18, qdata);
        // inv vector_reduce_ifreverse 32
        d2 = reduce_x16(d2, qdata);
        // inv vector_butterfly 224 480 1 0
        int16x16 e14 = add_x16(d14, d30);
        int16x16 e30 = sub_x16(d14, d30);
        // inv vector_butterfly 96 352 1 0
        int16x16 e6 = add_x16(d6, d22);
        int16x16 e22 = sub_x16(d6, d22);
        // inv vector_butterfly 160 416 1 0
        int16x16 e10 = add_x16(d10, d26);
        int16x16 e26 = sub_x16(d10, d26);
        // inv vector_butterfly 32 288 1 0
        int16x16 e2 = add_x16(d2, d18);
        int16x16 e18 = sub_x16(d2, d18);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 32), e2);
        _mm256_storeu_si256((int16x16 *) (f + 96), e6);
        _mm256_storeu_si256((int16x16 *) (f + 160), e10);
        _mm256_storeu_si256((int16x16 *) (f + 224), e14);
        _mm256_storeu_si256((int16x16 *) (f + 288), e18);
        _mm256_storeu_si256((int16x16 *) (f + 352), e22);
        _mm256_storeu_si256((int16x16 *) (f + 416), e26);
        _mm256_storeu_si256((int16x16 *) (f + 480), e30);
        f += 512;
    }
    f -= 512 * reps;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 400 464 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a25 = _mm256_loadu_si256((int16x16 *) (f + 400));
        int16x16 a29 = _mm256_loadu_si256((int16x16 *) (f + 464));
        int16x16 b25 = _mm256_permute2x128_si256_lo(a25, a29);
        int16x16 b29 = _mm256_permute2x128_si256_hi(a25, a29);
        // inv vector_permute 272 336 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a17 = _mm256_loadu_si256((int16x16 *) (f + 272));
        int16x16 a21 = _mm256_loadu_si256((int16x16 *) (f + 336));
        int16x16 b17 = _mm256_permute2x128_si256_lo(a17, a21);
        int16x16 b21 = _mm256_permute2x128_si256_hi(a17, a21);
        // inv vector_permute 144 208 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a9 = _mm256_loadu_si256((int16x16 *) (f + 144));
        int16x16 a13 = _mm256_loadu_si256((int16x16 *) (f + 208));
        int16x16 b9 = _mm256_permute2x128_si256_lo(a9, a13);
        int16x16 b13 = _mm256_permute2x128_si256_hi(a9, a13);
        // inv vector_permute 16 80 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a1 = _mm256_loadu_si256((int16x16 *) (f + 16));
        int16x16 a5 = _mm256_loadu_si256((int16x16 *) (f + 80));
        int16x16 b1 = _mm256_permute2x128_si256_lo(a1, a5);
        int16x16 b5 = _mm256_permute2x128_si256_hi(a1, a5);
        // inv vector_twist 464 512 507 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b29 = mulmod_scaled_x16(b29, precomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_5_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 400 512 511 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b25 = mulmod_scaled_x16(b25, precomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 336 512 5 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b21 = mulmod_scaled_x16(b21, precomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_507_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 272 512 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b17 = mulmod_scaled_x16(b17, precomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_512_511_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 208 256 255 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b13 = mulmod_scaled_x16(b13, precomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_1_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 144 256 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b9 = mulmod_scaled_x16(b9, precomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_256_255_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_twist 80 128 1 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
        b5 = mulmod_scaled_x16(b5, precomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qinvprecomp_128_127_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31, qdata);
        // inv vector_reduce 16
        b1 = reduce_x16(b1, qdata);
        // inv vector_butterfly 400 464 8 7
        int16x16 c25 = add_x16(b25, b29);
        int16x16 c29 = sub_x16(b25, b29);
        c29 = mulmod_scaled_x16(c29, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 272 336 8 1
        int16x16 c17 = add_x16(b17, b21);
        int16x16 c21 = sub_x16(b17, b21);
        c21 = mulmod_scaled_x16(c21, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 144 208 4 1
        int16x16 c9 = add_x16(b9, b13);
        int16x16 c13 = sub_x16(b9, b13);
        c13 = mulmod_scaled_x16(c13, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 16 80 1 0
        int16x16 c1 = add_x16(b1, b5);
        int16x16 c5 = sub_x16(b1, b5);
        // inv vector_reduce_ifforward 80
        // inv vector_butterfly 336 464 4 1
        int16x16 d21 = add_x16(c21, c29);
        int16x16 d29 = sub_x16(c21, c29);
        d29 = mulmod_scaled_x16(d29, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 272 400 4 1
        int16x16 d17 = add_x16(c17, c25);
        int16x16 d25 = sub_x16(c17, c25);
        d25 = mulmod_scaled_x16(d25, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 80 208 1 0
        int16x16 d5 = add_x16(c5, c13);
        int16x16 d13 = sub_x16(c5, c13);
        // inv vector_butterfly 16 144 1 0
        int16x16 d1 = add_x16(c1, c9);
        int16x16 d9 = sub_x16(c1, c9);
        // inv vector_reduce_ifreverse 272
        d17 = reduce_x16(d17, qdata);
        // inv vector_reduce_ifreverse 16
        d1 = reduce_x16(d1, qdata);
        // inv vector_butterfly 208 464 1 0
        int16x16 e13 = add_x16(d13, d29);
        int16x16 e29 = sub_x16(d13, d29);
        // inv vector_butterfly 80 336 1 0
        int16x16 e5 = add_x16(d5, d21);
        int16x16 e21 = sub_x16(d5, d21);
        // inv vector_butterfly 144 400 1 0
        int16x16 e9 = add_x16(d9, d25);
        int16x16 e25 = sub_x16(d9, d25);
        // inv vector_butterfly 16 272 1 0
        int16x16 e1 = add_x16(d1, d17);
        int16x16 e17 = sub_x16(d1, d17);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 16), e1);
        _mm256_storeu_si256((int16x16 *) (f + 80), e5);
        _mm256_storeu_si256((int16x16 *) (f + 144), e9);
        _mm256_storeu_si256((int16x16 *) (f + 208), e13);
        _mm256_storeu_si256((int16x16 *) (f + 272), e17);
        _mm256_storeu_si256((int16x16 *) (f + 336), e21);
        _mm256_storeu_si256((int16x16 *) (f + 400), e25);
        _mm256_storeu_si256((int16x16 *) (f + 464), e29);
        f += 512;
    }
    f -= 512 * reps;
    // inv stopbatch 512
    for (long long r = 0; r < reps; ++r) {
        // inv vector_permute 384 448 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a24 = _mm256_loadu_si256((int16x16 *) (f + 384));
        int16x16 a28 = _mm256_loadu_si256((int16x16 *) (f + 448));
        int16x16 b24 = _mm256_permute2x128_si256_lo(a24, a28);
        int16x16 b28 = _mm256_permute2x128_si256_hi(a24, a28);
        // inv vector_permute 256 320 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a16 = _mm256_loadu_si256((int16x16 *) (f + 256));
        int16x16 a20 = _mm256_loadu_si256((int16x16 *) (f + 320));
        int16x16 b16 = _mm256_permute2x128_si256_lo(a16, a20);
        int16x16 b20 = _mm256_permute2x128_si256_hi(a16, a20);
        // inv vector_permute 128 192 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a8 = _mm256_loadu_si256((int16x16 *) (f + 128));
        int16x16 a12 = _mm256_loadu_si256((int16x16 *) (f + 192));
        int16x16 b8 = _mm256_permute2x128_si256_lo(a8, a12);
        int16x16 b12 = _mm256_permute2x128_si256_hi(a8, a12);
        // inv vector_permute 0 64 _mm256_permute2x128_si256_lo _mm256_permute2x128_si256_hi
        int16x16 a0 = _mm256_loadu_si256((int16x16 *) (f + 0));
        int16x16 a4 = _mm256_loadu_si256((int16x16 *) (f + 64));
        int16x16 b0 = _mm256_permute2x128_si256_lo(a0, a4);
        int16x16 b4 = _mm256_permute2x128_si256_hi(a0, a4);
        // inv vector_twist 448 512 507 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b28 = mulmod_scaled_x16(b28, precomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_5_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 384 512 511 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b24 = mulmod_scaled_x16(b24, precomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 320 512 5 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b20 = mulmod_scaled_x16(b20, precomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_507_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 256 512 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b16 = mulmod_scaled_x16(b16, precomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_512_511_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 192 256 255 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b12 = mulmod_scaled_x16(b12, precomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_1_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 128 256 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b8 = mulmod_scaled_x16(b8, precomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_256_255_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_twist 64 128 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
        b4 = mulmod_scaled_x16(b4, precomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qinvprecomp_128_127_0_1_2_3_4_5_6_7_8_9_10_11_12_13_14_15, qdata);
        // inv vector_reduce 0
        b0 = reduce_x16(b0, qdata);
        // inv vector_butterfly 384 448 8 7
        int16x16 c24 = add_x16(b24, b28);
        int16x16 c28 = sub_x16(b24, b28);
        c28 = mulmod_scaled_x16(c28, scaledzeta_x16_8_1, qinvscaledzeta_x16_8_1, qdata);
        // inv vector_butterfly 256 320 8 1
        int16x16 c16 = add_x16(b16, b20);
        int16x16 c20 = sub_x16(b16, b20);
        c20 = mulmod_scaled_x16(c20, scaledzeta_x16_8_7, qinvscaledzeta_x16_8_7, qdata);
        // inv vector_butterfly 128 192 4 1
        int16x16 c8 = add_x16(b8, b12);
        int16x16 c12 = sub_x16(b8, b12);
        c12 = mulmod_scaled_x16(c12, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 0 64 1 0
        int16x16 c0 = add_x16(b0, b4);
        int16x16 c4 = sub_x16(b0, b4);
        // inv vector_reduce_ifforward 64
        // inv vector_butterfly 320 448 4 1
        int16x16 d20 = add_x16(c20, c28);
        int16x16 d28 = sub_x16(c20, c28);
        d28 = mulmod_scaled_x16(d28, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 256 384 4 1
        int16x16 d16 = add_x16(c16, c24);
        int16x16 d24 = sub_x16(c16, c24);
        d24 = mulmod_scaled_x16(d24, scaledzeta_x16_4_3, qinvscaledzeta_x16_4_3, qdata);
        // inv vector_butterfly 64 192 1 0
        int16x16 d4 = add_x16(c4, c12);
        int16x16 d12 = sub_x16(c4, c12);
        // inv vector_butterfly 0 128 1 0
        int16x16 d0 = add_x16(c0, c8);
        int16x16 d8 = sub_x16(c0, c8);
        // inv vector_reduce_ifreverse 256
        d16 = reduce_x16(d16, qdata);
        // inv vector_reduce_ifreverse 0
        d0 = reduce_x16(d0, qdata);
        // inv vector_butterfly 192 448 1 0
        int16x16 e12 = add_x16(d12, d28);
        int16x16 e28 = sub_x16(d12, d28);
        // inv vector_butterfly 64 320 1 0
        int16x16 e4 = add_x16(d4, d20);
        int16x16 e20 = sub_x16(d4, d20);
        // inv vector_butterfly 128 384 1 0
        int16x16 e8 = add_x16(d8, d24);
        int16x16 e24 = sub_x16(d8, d24);
        // inv vector_butterfly 0 256 1 0
        int16x16 e0 = add_x16(d0, d16);
        int16x16 e16 = sub_x16(d0, d16);
        // inv startbatch 512
        _mm256_storeu_si256((int16x16 *) (f + 0), e0);
        _mm256_storeu_si256((int16x16 *) (f + 64), e4);
        _mm256_storeu_si256((int16x16 *) (f + 128), e8);
        _mm256_storeu_si256((int16x16 *) (f + 192), e12);
        _mm256_storeu_si256((int16x16 *) (f + 256), e16);
        _mm256_storeu_si256((int16x16 *) (f + 320), e20);
        _mm256_storeu_si256((int16x16 *) (f + 384), e24);
        _mm256_storeu_si256((int16x16 *) (f + 448), e28);
        f += 512;
    }
    // f -= 512*reps;
    // inv startntt 512
}

/*
 * Public wrapper around invntt512 that supplies the qdata_7681 constant
 * table (presumably the precomputed constants for modulus 7681 -- confirm
 * against the table generator).
 *
 * f    - buffer transformed in place; invntt512's batch loops advance f by
 *        512 int16 values per repetition, so it must hold 512 * reps values.
 * reps - number of consecutive 512-element batches to process.
 */
void PQCLEAN_NTRULPR1013_AVX2_invntt512_7681(int16 *f, int reps) {
    const int16 *table = qdata_7681.data;

    invntt512(f, reps, table);
}

/*
 * Public wrapper around invntt512 that supplies the qdata_10753 constant
 * table (presumably the precomputed constants for modulus 10753 -- confirm
 * against the table generator).
 *
 * f    - buffer transformed in place; invntt512's batch loops advance f by
 *        512 int16 values per repetition, so it must hold 512 * reps values.
 * reps - number of consecutive 512-element batches to process.
 */
void PQCLEAN_NTRULPR1013_AVX2_invntt512_10753(int16 *f, int reps) {
    const int16 *table = qdata_10753.data;

    invntt512(f, reps, table);
}