#include "cdecl.h"
.include "shuffle.inc"
.include "fq.inc"

/*
 * butterfly rl0,rl1,rl2,rl3, rh0,rh1,rh2,rh3 [, zl0, zl1, zh0, zh1]
 *
 * Performs four independent add/sub-and-multiply butterflies on vectors of
 * 16-bit lanes. All arguments are ymm register NUMBERS, not names.
 *
 * For each pair i = 0..3 (pairs 0,1 use zl0/zh0; pairs 2,3 use zl1/zh1):
 *     t     = rh_i - rl_i                  (wrapping 16-bit subtract)
 *     rl_i  = rl_i + rh_i                  (wrapping 16-bit add, no reduction)
 *     rh_i  = mulhi(t, zh) - mulhi(mullo(t, zl), ymm0)
 *
 * Inputs : ymm rl0..rl3, ymm rh0..rh3, the four twiddle registers, and ymm0.
 *          ymm0 is not a parameter; in this file it is loaded from _16XQ by
 *          the entry point before the first butterfly.
 * Outputs: ymm rl0..rl3 (sums) and ymm rh0..rh3 (multiplied differences).
 * Clobbers: ymm12, ymm13, ymm14, ymm15 (hold the differences t).
 * Defaults: zl0 = zl1 = 2 and zh0 = zh1 = 3, i.e. ymm2 / ymm3.
 *
 * NOTE(review): the mullo/mulhi/mulhi/sub shape looks like a signed
 * Montgomery multiplication with zl = zeta * q^-1 mod 2^16, zh = zeta and
 * ymm0 = q in every lane -- confirm against the constant table generator.
 * NOTE(review): the phases of the four pairs are interleaved, presumably to
 * hide multiplier latency; keep the order unless a reschedule is measured.
 */
.macro butterfly rl0,rl1,rl2,rl3,rh0,rh1,rh2,rh3,zl0=2,zl1=2,zh0=3,zh1=3
/* Phase 1: differences into ymm12-15, sums in place, low products into rh. */
/* ymm12 = rh0 - rl0 is taken before rl0 is overwritten with the sum. */
vpsubw		%ymm\rl0,%ymm\rh0,%ymm12
vpaddw		%ymm\rh0,%ymm\rl0,%ymm\rl0
vpsubw		%ymm\rl1,%ymm\rh1,%ymm13

/* rh0 = low 16 bits of (t0 * zl0); old rh0 is dead once ymm12 and rl0 exist. */
vpmullw		%ymm\zl0,%ymm12,%ymm\rh0
vpaddw		%ymm\rh1,%ymm\rl1,%ymm\rl1
vpsubw		%ymm\rl2,%ymm\rh2,%ymm14

vpmullw		%ymm\zl0,%ymm13,%ymm\rh1
vpaddw		%ymm\rh2,%ymm\rl2,%ymm\rl2
vpsubw		%ymm\rl3,%ymm\rh3,%ymm15

/* Pairs 2 and 3 switch to the second twiddle register pair (zl1/zh1). */
vpmullw		%ymm\zl1,%ymm14,%ymm\rh2
vpaddw		%ymm\rh3,%ymm\rl3,%ymm\rl3
vpmullw		%ymm\zl1,%ymm15,%ymm\rh3

/* Phase 2: ymm12-15 = high 16 bits of (t * zh), signed. */
vpmulhw		%ymm\zh0,%ymm12,%ymm12
vpmulhw		%ymm\zh0,%ymm13,%ymm13

vpmulhw		%ymm\zh1,%ymm14,%ymm14
vpmulhw		%ymm\zh1,%ymm15,%ymm15

/* Phase 3: rh = high 16 bits of (low product * ymm0), signed. */
vpmulhw		%ymm0,%ymm\rh0,%ymm\rh0

vpmulhw		%ymm0,%ymm\rh1,%ymm\rh1

vpmulhw		%ymm0,%ymm\rh2,%ymm\rh2
vpmulhw		%ymm0,%ymm\rh3,%ymm\rh3

#

#

/* Phase 4: rh = (phase 2 result) - (phase 3 result). */
vpsubw		%ymm\rh0,%ymm12,%ymm\rh0

vpsubw		%ymm\rh1,%ymm13,%ymm\rh1

vpsubw		%ymm\rh2,%ymm14,%ymm\rh2
vpsubw		%ymm\rh3,%ymm15,%ymm\rh3
.endm

/*
 * intt_levels0t5 off
 *
 * Runs six butterfly levels (labelled 0 to 5 below) in registers over one
 * block of 128 16-bit words and stores the block back in place.
 *
 * off  : 0 or 1 in this file. Selects the data block at word offset 128*off
 *        from rdi and the twiddle block at _ZETAS_EXP + (1-off)*224 words
 *        from rsi (so off = 0 reads the later twiddle block).
 * rdi  : base of the in/out data. Accessed with vmovdqa, so every touched
 *        address must be 32-byte aligned.
 * rsi  : base of the constant table (_16XFLO, _16XFHI, _16XV, _REVIDXB,
 *        _REVIDXD, _ZETAS_EXP are word indices into it, hence the *2).
 * ymm0 : read by butterfly; must already hold the _16XQ constant.
 *
 * Register use: ymm3-ymm11 carry data, ymm1/ymm2/ymm3/ymm7-ymm10/ymm15 are
 * reused for twiddles as they become free, ymm12-ymm15 are butterfly scratch.
 *
 * fqmulprecomp and red16 come from fq.inc, shuffle1/2/4/8 from shuffle.inc;
 * none are visible here, so notes about them below are assumptions to verify.
 */
.macro intt_levels0t5 off
/* level 0 */
/* ymm2/ymm3 = constant pair used to pre-multiply every input vector. */
vmovdqa		_16XFLO*2(%rsi),%ymm2
vmovdqa		_16XFHI*2(%rsi),%ymm3

/* First four vectors of the block (words 0-63), loaded as 0,32,16,48. */
vmovdqa         (128*\off+  0)*2(%rdi),%ymm4
vmovdqa         (128*\off+ 32)*2(%rdi),%ymm6
vmovdqa         (128*\off+ 16)*2(%rdi),%ymm5
vmovdqa         (128*\off+ 48)*2(%rdi),%ymm7

/* NOTE(review): presumably multiplies ymm4..7 in place by the constant whose */
/* low/high precomputed halves are in ymm2/ymm3 -- confirm in fq.inc. */
fqmulprecomp	2,3,4
fqmulprecomp	2,3,6
fqmulprecomp	2,3,5
fqmulprecomp	2,3,7

/* Remaining four vectors of the block (words 64-127), same treatment. */
vmovdqa         (128*\off+ 64)*2(%rdi),%ymm8
vmovdqa         (128*\off+ 96)*2(%rdi),%ymm10
vmovdqa         (128*\off+ 80)*2(%rdi),%ymm9
vmovdqa         (128*\off+112)*2(%rdi),%ymm11

fqmulprecomp	2,3,8
fqmulprecomp	2,3,10
fqmulprecomp	2,3,9
fqmulprecomp	2,3,11

/* Level-0 twiddles: four vectors, each loaded with its 128-bit halves */
/* swapped (vpermq imm 0x4E) and then byte-shuffled inside each half with */
/* the _REVIDXB mask. ymm2/ymm3 are overwritten; the premultiply constants */
/* are no longer needed. */
/* NOTE(review): the mask name suggests the net effect is a full reversal */
/* of the 16 words -- confirm against the table contents. */
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+208)*2(%rsi),%ymm15
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+176)*2(%rsi),%ymm1
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+224)*2(%rsi),%ymm2
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+192)*2(%rsi),%ymm3
vmovdqa		_REVIDXB*2(%rsi),%ymm12
vpshufb		%ymm12,%ymm15,%ymm15
vpshufb		%ymm12,%ymm1,%ymm1
vpshufb		%ymm12,%ymm2,%ymm2
vpshufb		%ymm12,%ymm3,%ymm3

/* Pairs (4,6),(5,7) use twiddles ymm15/ymm2; pairs (8,10),(9,11) use */
/* ymm1/ymm3. ymm12 (the mask) is dead here, butterfly reuses it. */
butterfly	4,5,8,9,6,7,10,11,15,1,2,3

/* level 1 */
/* One twiddle pair for all four butterflies, same swap + byte shuffle. */
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+144)*2(%rsi),%ymm2
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+160)*2(%rsi),%ymm3
vmovdqa		_REVIDXB*2(%rsi),%ymm1
vpshufb		%ymm1,%ymm2,%ymm2
vpshufb		%ymm1,%ymm3,%ymm3

/* Pairs (4,8),(5,9),(6,10),(7,11). */
butterfly	4,5,6,7,8,9,10,11,2,2,3,3

/* NOTE(review): as invoked, each shuffleN a,b,c,d appears to read ymm a and */
/* ymm b and write ymm c and ymm d -- confirm in shuffle.inc. After these */
/* four lines the live data registers are 3,5,4,7,6,9,8,11. */
shuffle1	4,5,3,5
shuffle1	6,7,4,7
shuffle1	8,9,6,9
shuffle1	10,11,8,11

/* level 2 */
/* Twiddles are loaded through vpermd with the dword index vector _REVIDXD. */
vmovdqa		_REVIDXD*2(%rsi),%ymm12
vpermd		(_ZETAS_EXP+(1-\off)*224+112)*2(%rsi),%ymm12,%ymm2
vpermd		(_ZETAS_EXP+(1-\off)*224+128)*2(%rsi),%ymm12,%ymm10

/* Pairs (3,5),(4,7),(6,9),(8,11); twiddles ymm2 (zl) and ymm10 (zh). */
butterfly	3,4,6,8,5,7,9,11,2,2,10,10

/* NOTE(review): ymm1 = _16XV is presumably the multiplier red16 expects; */
/* only ymm3 is reduced at this level -- confirm the bound analysis. */
vmovdqa		_16XV*2(%rsi),%ymm1
red16		3

/* Live data registers afterwards: 10,4,3,8,6,7,5,11. */
shuffle2	3,4,10,4
shuffle2	6,8,3,8
shuffle2	5,7,6,7
shuffle2	9,11,5,11

/* level 3 */
/* vpermq imm 0x1B reverses the order of the four 64-bit quadwords. */
vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+80)*2(%rsi),%ymm2
vpermq		$0x1B,(_ZETAS_EXP+(1-\off)*224+96)*2(%rsi),%ymm9

/* Pairs (10,4),(3,8),(6,7),(5,11); twiddles ymm2 (zl) and ymm9 (zh). */
butterfly	10,3,6,5,4,8,7,11,2,2,9,9

/* Live data registers afterwards: 9,3,10,5,6,8,4,11. */
shuffle4	10,3,9,3
shuffle4	6,5,10,5
shuffle4	4,8,6,8
shuffle4	7,11,4,11

/* level 4 */
/* vpermq imm 0x4E swaps the two 128-bit halves of each twiddle vector. */
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+48)*2(%rsi),%ymm2
vpermq		$0x4E,(_ZETAS_EXP+(1-\off)*224+64)*2(%rsi),%ymm7

/* Pairs (9,3),(10,5),(6,8),(4,11); twiddles ymm2 (zl) and ymm7 (zh). */
butterfly	9,10,6,4,3,5,8,11,2,2,7,7

/* No reload of ymm1 here: nothing visible in this macro writes ymm1 after */
/* the _16XV load at level 2. NOTE(review): this assumes the shuffle macros */
/* and red16 leave ymm1 intact -- confirm in shuffle.inc / fq.inc. */
red16		9

/* Live data registers afterwards: 7,10,9,4,6,5,3,11. */
shuffle8	9,10,7,10
shuffle8	6,4,9,4
shuffle8	3,5,6,5
shuffle8	8,11,3,11

/* level 5 */
/* Twiddles are used exactly as stored; no permutation at this level. */
vmovdqa		(_ZETAS_EXP+(1-\off)*224+16)*2(%rsi),%ymm2
vmovdqa		(_ZETAS_EXP+(1-\off)*224+32)*2(%rsi),%ymm8

/* Pairs (7,10),(9,4),(6,5),(3,11); twiddles ymm2 (zl) and ymm8 (zh). */
butterfly	7,9,6,3,10,4,5,11,2,2,8,8

/* Store the block back in place: the four sums go to words 0-63 and the */
/* four multiplied differences to words 64-127 of this block. */
vmovdqa         %ymm7,(128*\off+  0)*2(%rdi)
vmovdqa         %ymm9,(128*\off+ 16)*2(%rdi)
vmovdqa         %ymm6,(128*\off+ 32)*2(%rdi)
vmovdqa         %ymm3,(128*\off+ 48)*2(%rdi)
vmovdqa         %ymm10,(128*\off+ 64)*2(%rdi)
vmovdqa         %ymm4,(128*\off+ 80)*2(%rdi)
vmovdqa         %ymm5,(128*\off+ 96)*2(%rdi)
vmovdqa         %ymm11,(128*\off+112)*2(%rdi)
.endm

/*
 * intt_level6 off
 *
 * Final butterfly level: combines 64 words starting at word offset 64*off
 * with the 64 words located 128 words further on, in place.
 *
 * off  : 0 or 1 in this file. off = 0 pairs words 0-63 with 128-191,
 *        off = 1 pairs words 64-127 with 192-255.
 * rdi  : base of the in/out data (vmovdqa, so 32-byte aligned addresses).
 * rsi  : base of the constant table.
 * ymm0 : read by butterfly; must already hold the _16XQ constant.
 *
 * Uses ymm2/ymm3 for the twiddle pair, ymm4-ymm7 for the low vectors,
 * ymm8-ymm11 for the high vectors; butterfly clobbers ymm12-ymm15.
 */
.macro intt_level6 off
/* level 6 */
vmovdqa         (64*\off+  0)*2(%rdi),%ymm4
vmovdqa         (64*\off+128)*2(%rdi),%ymm8
vmovdqa         (64*\off+ 16)*2(%rdi),%ymm5
vmovdqa         (64*\off+144)*2(%rdi),%ymm9
/* ymm2 = the 64 bits (four words) at _ZETAS_EXP+0 repeated in every quadword. */
vpbroadcastq	(_ZETAS_EXP+0)*2(%rsi),%ymm2

vmovdqa         (64*\off+ 32)*2(%rdi),%ymm6
vmovdqa         (64*\off+160)*2(%rdi),%ymm10
vmovdqa         (64*\off+ 48)*2(%rdi),%ymm7
vmovdqa         (64*\off+176)*2(%rdi),%ymm11
/* ymm3 = the next four words (_ZETAS_EXP+4), broadcast the same way. */
vpbroadcastq	(_ZETAS_EXP+4)*2(%rsi),%ymm3

/* Pairs (4,8),(5,9),(6,10),(7,11) with the default twiddles ymm2/ymm3. */
butterfly	4,5,6,7,8,9,10,11

/* Only the first low vector of the off == 0 pass is reduced. ymm1 is not */
/* loaded in this macro. NOTE(review): red16 presumably relies on ymm1 still */
/* holding _16XV from intt_levels0t5, and the reason a single vector needs */
/* reducing is not visible here -- confirm both against fq.inc and the */
/* coefficient bound analysis. */
.if \off == 0
red16		4
.endif

/* Write all eight vectors back to the addresses they were loaded from. */
vmovdqa		%ymm4,(64*\off+  0)*2(%rdi)
vmovdqa		%ymm5,(64*\off+ 16)*2(%rdi)
vmovdqa		%ymm6,(64*\off+ 32)*2(%rdi)
vmovdqa		%ymm7,(64*\off+ 48)*2(%rdi)
vmovdqa		%ymm8,(64*\off+128)*2(%rdi)
vmovdqa		%ymm9,(64*\off+144)*2(%rdi)
vmovdqa		%ymm10,(64*\off+160)*2(%rdi)
vmovdqa		%ymm11,(64*\off+176)*2(%rdi)
.endm

/*
 * PQCLEAN_KYBER76890S_AVX2_invntt_avx
 *
 * In-place transform of 256 16-bit words: levels 0-5 on each 128-word half,
 * then level 6 across the two halves.
 *
 * In   : rdi = base of the in/out data, rsi = base of the constant table.
 *        Both are accessed with vmovdqa, so the touched addresses must be
 *        32-byte aligned.
 * Out  : data at rdi overwritten; no value is placed in rax by this code.
 * Clobb: ymm0-ymm15 (directly or through the macros). No stack use.
 *
 * NOTE(review): presumably called from C under the System V AMD64 ABI with
 * (int16_t *r, const qdata *) -- the prototype is not visible here.
 * NOTE(review): no vzeroupper is issued before ret; confirm callers are
 * built with VEX encodings if SSE/AVX transition cost matters.
 *
 * Both symbol spellings are exported so the routine links whether or not the
 * platform prefixes C symbols with an underscore (see cdecl.h).
 */
.text
.global cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx)
.global _cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx)
cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx):
_cdecl(PQCLEAN_KYBER76890S_AVX2_invntt_avx):
/* ymm0 = constant vector at _16XQ; every butterfly reads it. */
vmovdqa		_16XQ*2(%rsi),%ymm0

/* Levels 0-5: first the 128-word block at offset 0, then the one at 128. */
.irp half,0,1
intt_levels0t5	\half
.endr

/* Level 6: first the 64-word group at offset 0, then the one at 64. */
.irp group,0,1
intt_level6	\group
.endr

ret