#include "crypto_encode_1013x2393round.h"
#include <immintrin.h>
/* auto-generated; do not edit */

#define int16 int16_t
#define uint16 uint16_t
#define uint32 uint32_t

/*
 * Vector kernel shared by the stages that emit two bytes per pair.
 *
 * x and x2 each hold 16 16-bit values. Every adjacent pair (even element = lo,
 * odd element = hi) is combined into the 32-bit value lo + hi * radix.
 * The low 16 bits of the 16 results are stored to out (32 bytes) and the
 * high 16 bits are stored to writing (16 values), both in pair order.
 */
static inline void combine_pairs_emit16(unsigned char *out, uint16 *writing,
                                        __m256i x, __m256i x2, int radix) {
    const __m256i low_half = _mm256_set1_epi32(65535);
    const __m256i multiplier = _mm256_set1_epi32(radix);
    /* Per 128-bit lane: low 16 bits of each dword go to bytes 0..7,
     * high 16 bits of each dword go to bytes 8..15. */
    const __m256i split = _mm256_set_epi8(
                              15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0,
                              15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0
                          );
    __m256i y, y2;

    y = _mm256_and_si256(x, low_half);
    y2 = _mm256_and_si256(x2, low_half);
    x = _mm256_srli_epi32(x, 16);
    x2 = _mm256_srli_epi32(x2, 16);
    x = _mm256_mullo_epi32(x, multiplier);
    x2 = _mm256_mullo_epi32(x2, multiplier);
    x = _mm256_add_epi32(y, x);
    x2 = _mm256_add_epi32(y2, x2);
    x = _mm256_shuffle_epi8(x, split);
    x2 = _mm256_shuffle_epi8(x2, split);
    /* Gather the low halves of both lanes into the lower 128 bits and the
     * high halves into the upper 128 bits. */
    x = _mm256_permute4x64_epi64(x, 0xd8);
    x2 = _mm256_permute4x64_epi64(x2, 0xd8);
    _mm256_storeu_si256((__m256i *) writing, _mm256_permute2f128_si256(x, x2, 0x31));
    _mm256_storeu_si256((__m256i *) out, _mm256_permute2f128_si256(x, x2, 0x20));
}

/*
 * Vector kernel shared by the stages that emit one byte per pair.
 *
 * x holds 16 16-bit values. Every adjacent pair (lo, hi) is combined into
 * lo + hi * radix. The lowest byte of each of the 8 results is written to
 * out (8 bytes) and bits 8..23 of each result are stored to writing
 * (8 values). Returns out advanced past the 8 bytes written.
 */
static inline unsigned char *combine_pairs_emit8(unsigned char *out, uint16 *writing,
        __m256i x, int radix) {
    /* Per 128-bit lane: bytes 1..2 of each dword go to bytes 0..7,
     * byte 0 of each dword goes to bytes 8..11 (and again to 12..15). */
    const __m256i split = _mm256_set_epi8(
                              12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1,
                              12, 8, 4, 0, 12, 8, 4, 0, 14, 13, 10, 9, 6, 5, 2, 1
                          );
    __m256i y;
    uint32 s0;

    y = _mm256_and_si256(x, _mm256_set1_epi32(65535));
    x = _mm256_srli_epi32(x, 16);
    x = _mm256_mullo_epi32(x, _mm256_set1_epi32(radix));
    x = _mm256_add_epi32(y, x);
    x = _mm256_shuffle_epi8(x, split);
    x = _mm256_permute4x64_epi64(x, 0xd8);
    _mm_storeu_si128((__m128i *) writing, _mm256_extractf128_si256(x, 0));

    /* Dword 4 carries the low bytes of results 0..3, dword 6 those of 4..7. */
    s0 = (uint32) _mm256_extract_epi32(x, 4);
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    s0 = (uint32) _mm256_extract_epi32(x, 6);
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    s0 >>= 8;
    *out++ = (unsigned char) s0;
    return out;
}

/*
 * Encode the 1013 int16 values at v into out.
 *
 * Each input is first mapped through a fixed rounding/offset sequence
 * (multiply-high by 10923, times 3, plus 3588, masked to 14 bits,
 * multiply-high by 21846). NOTE(review): this looks like "round to a
 * multiple of 3, shift into a non-negative range, divide by 3" - confirm
 * against the generator. The results are then merged pairwise
 * (lo + hi * multiplier) over successive stages, each stage emitting the
 * low byte(s) of every merged value and keeping the remainder in R.
 *
 * out: destination; this function writes 1423 bytes in total
 *      (1012 + 254 + 63 + 62 + 1 + 16 + 1 + 8 + 2 + 2 + 2).
 * v:   source; read as 1013 int16 values, unaligned loads are used.
 *
 * In each vector loop the final iteration steps the pointers back so that the
 * last full-width load/store ends exactly at the end of the data; the
 * overlapping elements are simply recomputed with the same result.
 */
void PQCLEAN_NTRULPR1013_AVX2_crypto_encode_1013x2393round(unsigned char *out, const void *v) {
    const int16 *R0 = v;
    /* XXX: caller could overlap R with input */
    uint16 R[507];
    long i;
    const uint16 *reading;
    uint16 *writing;
    uint16 r0, r1;
    uint32 r2;

    /* Stage 1: round inputs 0..1011 and merge pairs with multiplier 2393;
     * 1012 bytes out, 506 values kept in R[0..505]. */
    reading = (const uint16 *) R0;
    writing = R;
    i = 32;
    while (i > 0) {
        __m256i x, x2;
        --i;
        if (!i) {
            reading -= 12;
            writing -= 6;
            out -= 12;
        }
        x = _mm256_loadu_si256((const __m256i *) (reading + 0));
        x2 = _mm256_loadu_si256((const __m256i *) (reading + 16));
        x = _mm256_mulhrs_epi16(x, _mm256_set1_epi16(10923));
        x2 = _mm256_mulhrs_epi16(x2, _mm256_set1_epi16(10923));
        x = _mm256_add_epi16(x, _mm256_add_epi16(x, x));
        x2 = _mm256_add_epi16(x2, _mm256_add_epi16(x2, x2));
        x = _mm256_add_epi16(x, _mm256_set1_epi16(3588));
        x2 = _mm256_add_epi16(x2, _mm256_set1_epi16(3588));
        x = _mm256_and_si256(x, _mm256_set1_epi16(16383));
        x2 = _mm256_and_si256(x2, _mm256_set1_epi16(16383));
        x = _mm256_mulhi_epi16(x, _mm256_set1_epi16(21846));
        x2 = _mm256_mulhi_epi16(x2, _mm256_set1_epi16(21846));
        combine_pairs_emit16(out, writing, x, x2, 2393);
        reading += 32;
        writing += 16;
        out += 32;
    }
    /* Scalar form of the same rounding sequence for the unpaired last input. */
    R[506] = (uint16) ((((3 * ((10923 * R0[1012] + 16384) >> 15) + 3588) & 16383) * 10923) >> 15);

    /* Stage 2: merge 253 pairs with multiplier 88; no bytes emitted. */
    for (i = 0; i < 253; ++i) {
        r0 = R[2 * i];
        r1 = R[2 * i + 1];
        r2 = r0 + r1 * (uint32)88;
        R[i] = (uint16) r2;
    }
    R[253] = R[506];

    /* Stage 3: merge 127 pairs with multiplier 7744; 254 bytes out. */
    reading = (const uint16 *) R;
    writing = R;
    i = 8;
    while (i > 0) {
        __m256i x, x2;
        --i;
        if (!i) {
            reading -= 2;
            writing -= 1;
            out -= 2;
        }
        x = _mm256_loadu_si256((const __m256i *) (reading + 0));
        x2 = _mm256_loadu_si256((const __m256i *) (reading + 16));
        combine_pairs_emit16(out, writing, x, x2, 7744);
        reading += 32;
        writing += 16;
        out += 32;
    }

    /* Stage 4: merge 63 pairs with multiplier 916; 63 bytes out. */
    reading = (const uint16 *) R;
    writing = R;
    i = 8;
    while (i > 0) {
        __m256i x;
        --i;
        if (!i) {
            reading -= 2;
            writing -= 1;
            out -= 1;
        }
        x = _mm256_loadu_si256((const __m256i *) reading);
        out = combine_pairs_emit8(out, writing, x, 916);
        reading += 16;
        writing += 8;
    }
    R[63] = R[126];

    /* Stage 5: merge 31 pairs with multiplier 3278 (62 bytes out), then the
     * final pair in scalar code, which emits a single byte. */
    reading = (const uint16 *) R;
    writing = R;
    i = 2;
    while (i > 0) {
        __m256i x, x2;
        --i;
        if (!i) {
            reading -= 2;
            writing -= 1;
            out -= 2;
        }
        x = _mm256_loadu_si256((const __m256i *) (reading + 0));
        x2 = _mm256_loadu_si256((const __m256i *) (reading + 16));
        combine_pairs_emit16(out, writing, x, x2, 3278);
        reading += 32;
        writing += 16;
        out += 32;
    }
    r0 = R[62];
    r1 = R[63];
    r2 = r0 + r1 * (uint32)3278;
    *out++ = (unsigned char) r2;
    r2 >>= 8;
    R[31] = (uint16) r2;

    /* Stage 6: merge 16 pairs with multiplier 164; 16 bytes out. */
    reading = (const uint16 *) R;
    writing = R;
    i = 2;
    while (i > 0) {
        __m256i x;
        --i;
        x = _mm256_loadu_si256((const __m256i *) reading);
        out = combine_pairs_emit8(out, writing, x, 164);
        reading += 16;
        writing += 8;
    }

    /* Stage 7: merge 7 pairs with multiplier 106 without output; the eighth
     * pair emits one byte. */
    for (i = 0; i < 7; ++i) {
        r0 = R[2 * i];
        r1 = R[2 * i + 1];
        r2 = r0 + r1 * (uint32)106;
        R[i] = (uint16) r2;
    }
    r0 = R[14];
    r1 = R[15];
    r2 = r0 + r1 * (uint32)106;
    *out++ = (unsigned char) r2;
    r2 >>= 8;
    R[7] = (uint16) r2;

    /* Stage 8: merge 4 pairs with multiplier 11236; 2 bytes out per pair. */
    for (i = 0; i < 4; ++i) {
        r0 = R[2 * i];
        r1 = R[2 * i + 1];
        r2 = r0 + r1 * (uint32)11236;
        *out++ = (unsigned char) r2;
        r2 >>= 8;
        *out++ = (unsigned char) r2;
        r2 >>= 8;
        R[i] = (uint16) r2;
    }

    /* Stage 9: merge 2 pairs with multiplier 1927; 1 byte out per pair. */
    for (i = 0; i < 2; ++i) {
        r0 = R[2 * i];
        r1 = R[2 * i + 1];
        r2 = r0 + r1 * (uint32)1927;
        *out++ = (unsigned char) r2;
        r2 >>= 8;
        R[i] = (uint16) r2;
    }

    /* Stage 10: merge the last pair with multiplier 14506 (2 bytes out), then
     * flush the remaining 16-bit value as the final 2 bytes. */
    r0 = R[0];
    r1 = R[1];
    r2 = r0 + r1 * (uint32)14506;
    *out++ = (unsigned char) r2;
    r2 >>= 8;
    *out++ = (unsigned char) r2;
    r2 >>= 8;
    R[0] = (uint16) r2;

    r0 = R[0];
    *out++ = (unsigned char) r0;
    r0 >>= 8;
    *out++ = (unsigned char) r0;
}