#include "cdecl.h" .include "shuffle.inc" .macro butterfly l,h,zl0=1,zl1=1,zh0=2,zh1=2 vpsubd %ymm\l,%ymm\h,%ymm12 vpaddd %ymm\h,%ymm\l,%ymm\l vpmuldq %ymm\zl0,%ymm12,%ymm13 vmovshdup %ymm12,%ymm\h vpmuldq %ymm\zl1,%ymm\h,%ymm14 vpmuldq %ymm\zh0,%ymm12,%ymm12 vpmuldq %ymm\zh1,%ymm\h,%ymm\h vpmuldq %ymm0,%ymm13,%ymm13 vpmuldq %ymm0,%ymm14,%ymm14 vpsubd %ymm13,%ymm12,%ymm12 vpsubd %ymm14,%ymm\h,%ymm\h vmovshdup %ymm12,%ymm12 vpblendd $0xAA,%ymm\h,%ymm12,%ymm\h .endm .macro levels0t5 off vmovdqa 256*\off+ 0(%rdi),%ymm4 vmovdqa 256*\off+ 32(%rdi),%ymm5 vmovdqa 256*\off+ 64(%rdi),%ymm6 vmovdqa 256*\off+ 96(%rdi),%ymm7 vmovdqa 256*\off+128(%rdi),%ymm8 vmovdqa 256*\off+160(%rdi),%ymm9 vmovdqa 256*\off+192(%rdi),%ymm10 vmovdqa 256*\off+224(%rdi),%ymm11 /* level 0 */ vpermq $0x1B,(_ZETAS_QINV+296-8*\off-8)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+296-8*\off-8)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 4,5,1,3,2,15 vpermq $0x1B,(_ZETAS_QINV+296-8*\off-40)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+296-8*\off-40)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 6,7,1,3,2,15 vpermq $0x1B,(_ZETAS_QINV+296-8*\off-72)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+296-8*\off-72)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 8,9,1,3,2,15 vpermq $0x1B,(_ZETAS_QINV+296-8*\off-104)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+296-8*\off-104)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 10,11,1,3,2,15 /* level 1 */ vpermq $0x1B,(_ZETAS_QINV+168-8*\off-8)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+168-8*\off-8)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 4,6,1,3,2,15 butterfly 5,7,1,3,2,15 vpermq $0x1B,(_ZETAS_QINV+168-8*\off-40)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+168-8*\off-40)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 8,10,1,3,2,15 butterfly 9,11,1,3,2,15 /* level 2 */ vpermq $0x1B,(_ZETAS_QINV+104-8*\off-8)*4(%rsi),%ymm3 vpermq $0x1B,(_ZETAS+104-8*\off-8)*4(%rsi),%ymm15 vmovshdup %ymm3,%ymm1 vmovshdup %ymm15,%ymm2 butterfly 4,8,1,3,2,15 butterfly 5,9,1,3,2,15 butterfly 6,10,1,3,2,15 butterfly 7,11,1,3,2,15 /* level 3 */ shuffle2 4,5,3,5 shuffle2 6,7,4,7 shuffle2 8,9,6,9 shuffle2 10,11,8,11 vpermq $0x1B,(_ZETAS_QINV+72-8*\off-8)*4(%rsi),%ymm1 vpermq $0x1B,(_ZETAS+72-8*\off-8)*4(%rsi),%ymm2 butterfly 3,5 butterfly 4,7 butterfly 6,9 butterfly 8,11 /* level 4 */ shuffle4 3,4,10,4 shuffle4 6,8,3,8 shuffle4 5,7,6,7 shuffle4 9,11,5,11 vpermq $0x1B,(_ZETAS_QINV+40-8*\off-8)*4(%rsi),%ymm1 vpermq $0x1B,(_ZETAS+40-8*\off-8)*4(%rsi),%ymm2 butterfly 10,4 butterfly 3,8 butterfly 6,7 butterfly 5,11 /* level 5 */ shuffle8 10,3,9,3 shuffle8 6,5,10,5 shuffle8 4,8,6,8 shuffle8 7,11,4,11 vpbroadcastd (_ZETAS_QINV+7-\off)*4(%rsi),%ymm1 vpbroadcastd (_ZETAS+7-\off)*4(%rsi),%ymm2 butterfly 9,3 butterfly 10,5 butterfly 6,8 butterfly 4,11 vmovdqa %ymm9,256*\off+ 0(%rdi) vmovdqa %ymm10,256*\off+ 32(%rdi) vmovdqa %ymm6,256*\off+ 64(%rdi) vmovdqa %ymm4,256*\off+ 96(%rdi) vmovdqa %ymm3,256*\off+128(%rdi) vmovdqa %ymm5,256*\off+160(%rdi) vmovdqa %ymm8,256*\off+192(%rdi) vmovdqa %ymm11,256*\off+224(%rdi) .endm .macro levels6t7 off vmovdqa 0+32*\off(%rdi),%ymm4 vmovdqa 128+32*\off(%rdi),%ymm5 vmovdqa 256+32*\off(%rdi),%ymm6 vmovdqa 384+32*\off(%rdi),%ymm7 vmovdqa 512+32*\off(%rdi),%ymm8 vmovdqa 640+32*\off(%rdi),%ymm9 vmovdqa 768+32*\off(%rdi),%ymm10 vmovdqa 896+32*\off(%rdi),%ymm11 /* level 6 */ vpbroadcastd (_ZETAS_QINV+3)*4(%rsi),%ymm1 vpbroadcastd (_ZETAS+3)*4(%rsi),%ymm2 butterfly 4,6 butterfly 5,7 vpbroadcastd 
(_ZETAS_QINV+2)*4(%rsi),%ymm1 vpbroadcastd (_ZETAS+2)*4(%rsi),%ymm2 butterfly 8,10 butterfly 9,11 /* level 7 */ vpbroadcastd (_ZETAS_QINV+0)*4(%rsi),%ymm1 vpbroadcastd (_ZETAS+0)*4(%rsi),%ymm2 butterfly 4,8 butterfly 5,9 butterfly 6,10 butterfly 7,11 vmovdqa %ymm8,512+32*\off(%rdi) vmovdqa %ymm9,640+32*\off(%rdi) vmovdqa %ymm10,768+32*\off(%rdi) vmovdqa %ymm11,896+32*\off(%rdi) vmovdqa (_8XDIV_QINV)*4(%rsi),%ymm1 vmovdqa (_8XDIV)*4(%rsi),%ymm2 vpmuldq %ymm1,%ymm4,%ymm12 vpmuldq %ymm1,%ymm5,%ymm13 vmovshdup %ymm4,%ymm8 vmovshdup %ymm5,%ymm9 vpmuldq %ymm1,%ymm8,%ymm14 vpmuldq %ymm1,%ymm9,%ymm15 vpmuldq %ymm2,%ymm4,%ymm4 vpmuldq %ymm2,%ymm5,%ymm5 vpmuldq %ymm2,%ymm8,%ymm8 vpmuldq %ymm2,%ymm9,%ymm9 vpmuldq %ymm0,%ymm12,%ymm12 vpmuldq %ymm0,%ymm13,%ymm13 vpmuldq %ymm0,%ymm14,%ymm14 vpmuldq %ymm0,%ymm15,%ymm15 vpsubd %ymm12,%ymm4,%ymm4 vpsubd %ymm13,%ymm5,%ymm5 vpsubd %ymm14,%ymm8,%ymm8 vpsubd %ymm15,%ymm9,%ymm9 vmovshdup %ymm4,%ymm4 vmovshdup %ymm5,%ymm5 vpblendd $0xAA,%ymm8,%ymm4,%ymm4 vpblendd $0xAA,%ymm9,%ymm5,%ymm5 vpmuldq %ymm1,%ymm6,%ymm12 vpmuldq %ymm1,%ymm7,%ymm13 vmovshdup %ymm6,%ymm8 vmovshdup %ymm7,%ymm9 vpmuldq %ymm1,%ymm8,%ymm14 vpmuldq %ymm1,%ymm9,%ymm15 vpmuldq %ymm2,%ymm6,%ymm6 vpmuldq %ymm2,%ymm7,%ymm7 vpmuldq %ymm2,%ymm8,%ymm8 vpmuldq %ymm2,%ymm9,%ymm9 vpmuldq %ymm0,%ymm12,%ymm12 vpmuldq %ymm0,%ymm13,%ymm13 vpmuldq %ymm0,%ymm14,%ymm14 vpmuldq %ymm0,%ymm15,%ymm15 vpsubd %ymm12,%ymm6,%ymm6 vpsubd %ymm13,%ymm7,%ymm7 vpsubd %ymm14,%ymm8,%ymm8 vpsubd %ymm15,%ymm9,%ymm9 vmovshdup %ymm6,%ymm6 vmovshdup %ymm7,%ymm7 vpblendd $0xAA,%ymm8,%ymm6,%ymm6 vpblendd $0xAA,%ymm9,%ymm7,%ymm7 vmovdqa %ymm4, 0+32*\off(%rdi) vmovdqa %ymm5,128+32*\off(%rdi) vmovdqa %ymm6,256+32*\off(%rdi) vmovdqa %ymm7,384+32*\off(%rdi) .endm .text .global cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) .global _cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx) cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): _cdecl(PQCLEAN_DILITHIUM3_AVX2_invntt_avx): vmovdqa _8XQ*4(%rsi),%ymm0 levels0t5 0 levels0t5 1 levels0t5 2 levels0t5 3 levels6t7 0 levels6t7 1 levels6t7 2 levels6t7 3 ret
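/* For reference only (this block is a comment and is not assembled): a minimal
 * scalar C sketch of what one butterfly invocation above computes per
 * coefficient pair. It assumes the Dilithium parameters q = 8380417 and
 * QINV = 58728449 (q^-1 mod 2^32); the function and variable names are
 * illustrative, not taken from this repository.
 *
 *   #include <stdint.h>
 *
 *   #define Q    8380417
 *   #define QINV 58728449             // q^-1 mod 2^32
 *
 *   // Montgomery reduction: returns a*2^-32 mod q (up to the usual range bounds).
 *   static int32_t montgomery_reduce(int64_t a) {
 *       int32_t t = (int32_t)((int64_t)(int32_t)a * QINV);  // low 32 bits of a*q^-1
 *       return (int32_t)((a - (int64_t)t * Q) >> 32);       // high 32 bits of a - t*q
 *   }
 *
 *   // One inverse-NTT (Gentleman-Sande) butterfly, matching the butterfly macro:
 *   // l' = l + h, h' = zeta*(h - l)*2^-32 mod q.
 *   static void invntt_butterfly(int32_t *l, int32_t *h, int32_t zeta) {
 *       int32_t t = *h - *l;
 *       *l = *l + *h;
 *       *h = montgomery_reduce((int64_t)zeta * t);
 *   }
 *
 * The AVX2 code applies this to eight coefficient pairs at a time, with the zl
 * registers carrying zeta*QINV and the zh registers carrying zeta. */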