# The eXtended Keccak Code Package (XKCP) # https://github.com/XKCP/XKCP # # Copyright (c) 2006-2017, CRYPTOGAMS by # Copyright (c) 2017 Ronny Van Keer # All rights reserved. # # The source code in this file is licensed under the CRYPTOGAMS license. # For further details see http://www.openssl.org/~appro/cryptogams/. # # Notes: # The code for the permutation (__KeccakF1600) was generated with # Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project # (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). # The rest of the code was written by Ronny Van Keer. # Adaptations for macOS by Stéphane Léon. # Adaptations for mingw-w64 (changes macOS too) by Jorrit Jongma. .text # ----------------------------------------------------------------------------- # # void KeccakP1600_Initialize(void *state); # .ifdef add_underscore .globl _KeccakP1600_Initialize _KeccakP1600_Initialize: .else .globl KeccakP1600_Initialize .hidden KeccakP1600_Initialize .ifndef no_type .type KeccakP1600_Initialize,@function .endif KeccakP1600_Initialize: .endif .balign 32 vpxor %ymm0,%ymm0,%ymm0 vmovdqu %ymm0,0*32(%rdi) vmovdqu %ymm0,1*32(%rdi) vmovdqu %ymm0,2*32(%rdi) vmovdqu %ymm0,3*32(%rdi) vmovdqu %ymm0,4*32(%rdi) vmovdqu %ymm0,5*32(%rdi) movq $0,6*32(%rdi) ret .ifndef no_size .size KeccakP1600_Initialize,.-KeccakP1600_Initialize .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); # %rdi %rsi %rdx # .ifdef add_underscore .globl _KeccakP1600_AddByte _KeccakP1600_AddByte: .else .globl KeccakP1600_AddByte .hidden KeccakP1600_AddByte .ifndef no_type .type KeccakP1600_AddByte,@function .endif KeccakP1600_AddByte: .endif .balign 32 mov %rdx, %rax and $7, %rax and $0xFFFFFFF8, %edx lea mapState(%rip), %r9 mov (%r9, %rdx), %rdx add %rdx, %rdi add %rax, %rdi xorb %sil, (%rdi) ret .ifndef no_size .size KeccakP1600_AddByte,.-KeccakP1600_AddByte .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); # %rdi %rsi %rdx %rcx # .ifdef add_underscore .globl _KeccakP1600_AddBytes _KeccakP1600_AddBytes: .else .globl KeccakP1600_AddBytes .hidden KeccakP1600_AddBytes .ifndef no_type .type KeccakP1600_AddBytes,@function .endif KeccakP1600_AddBytes: .endif .balign 32 cmp $0, %rcx jz KeccakP1600_AddBytes_Exit mov %rdx, %rax # rax offset in lane and $0xFFFFFFF8, %edx # rdx pointer into state index mapper lea mapState(%rip), %r9 add %r9, %rdx and $7, %rax jz KeccakP1600_AddBytes_LaneAlignedCheck mov $8, %r9 # r9 is (max) length of incomplete lane sub %rax, %r9 cmp %rcx, %r9 cmovae %rcx, %r9 sub %r9, %rcx # length -= length of incomplete lane add (%rdx), %rax # rax = pointer to state lane add $8, %rdx add %rdi, %rax KeccakP1600_AddBytes_NotAlignedLoop: mov (%rsi), %r8b inc %rsi xorb %r8b, (%rax) inc %rax dec %r9 jnz KeccakP1600_AddBytes_NotAlignedLoop jmp KeccakP1600_AddBytes_LaneAlignedCheck KeccakP1600_AddBytes_LaneAlignedLoop: mov (%rsi), %r8 add $8, %rsi mov (%rdx), %rax add $8, %rdx add %rdi, %rax xor %r8, (%rax) KeccakP1600_AddBytes_LaneAlignedCheck: sub $8, %rcx jnc KeccakP1600_AddBytes_LaneAlignedLoop KeccakP1600_AddBytes_LastIncompleteLane: add $8, %rcx jz KeccakP1600_AddBytes_Exit mov (%rdx), %rax add %rdi, %rax KeccakP1600_AddBytes_LastIncompleteLaneLoop: mov (%rsi), %r8b inc %rsi xor %r8b, (%rax) inc %rax dec %rcx jnz KeccakP1600_AddBytes_LastIncompleteLaneLoop KeccakP1600_AddBytes_Exit: ret .ifndef no_size .size KeccakP1600_AddBytes,.-KeccakP1600_AddBytes .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_OverwriteBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); # %rdi %rsi %rdx %rcx # .ifdef add_underscore .globl _KeccakP1600_OverwriteBytes _KeccakP1600_OverwriteBytes: .else .globl KeccakP1600_OverwriteBytes .hidden KeccakP1600_OverwriteBytes .ifndef no_type .type KeccakP1600_OverwriteBytes,@function .endif KeccakP1600_OverwriteBytes: .endif .balign 32 cmp $0, %rcx jz KeccakP1600_OverwriteBytes_Exit mov %rdx, %rax # rax offset in lane and $0xFFFFFFF8, %edx # rdx pointer into state index mapper lea mapState(%rip), %r9 add %r9, %rdx and $7, %rax jz KeccakP1600_OverwriteBytes_LaneAlignedCheck mov $8, %r9 # r9 is (max) length of incomplete lane sub %rax, %r9 cmp %rcx, %r9 cmovae %rcx, %r9 sub %r9, %rcx # length -= length of incomplete lane add (%rdx), %rax # rax = pointer to state lane add $8, %rdx add %rdi, %rax KeccakP1600_OverwriteBytes_NotAlignedLoop: mov (%rsi), %r8b inc %rsi mov %r8b, (%rax) inc %rax dec %r9 jnz KeccakP1600_OverwriteBytes_NotAlignedLoop jmp KeccakP1600_OverwriteBytes_LaneAlignedCheck KeccakP1600_OverwriteBytes_LaneAlignedLoop: mov (%rsi), %r8 add $8, %rsi mov (%rdx), %rax add $8, %rdx add %rdi, %rax mov %r8, (%rax) KeccakP1600_OverwriteBytes_LaneAlignedCheck: sub $8, %rcx jnc KeccakP1600_OverwriteBytes_LaneAlignedLoop KeccakP1600_OverwriteBytes_LastIncompleteLane: add $8, %rcx jz KeccakP1600_OverwriteBytes_Exit mov (%rdx), %rax add %rdi, %rax KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop: mov (%rsi), %r8b inc %rsi mov %r8b, (%rax) inc %rax dec %rcx jnz KeccakP1600_OverwriteBytes_LastIncompleteLaneLoop KeccakP1600_OverwriteBytes_Exit: ret .ifndef no_size .size KeccakP1600_OverwriteBytes,.-KeccakP1600_OverwriteBytes .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_OverwriteWithZeroes(void *state, unsigned int byteCount); # %rdi %rsi # .ifdef add_underscore .globl _KeccakP1600_OverwriteWithZeroes _KeccakP1600_OverwriteWithZeroes: .else .globl KeccakP1600_OverwriteWithZeroes .hidden KeccakP1600_OverwriteWithZeroes .ifndef no_type .type KeccakP1600_OverwriteWithZeroes,@function .endif KeccakP1600_OverwriteWithZeroes: .endif .balign 32 cmp $0, %rsi jz KeccakP1600_OverwriteWithZeroes_Exit lea mapState(%rip), %rdx # rdx pointer into state index mapper jmp KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop: mov (%rdx), %rax add $8, %rdx add %rdi, %rax movq $0, (%rax) KeccakP1600_OverwriteWithZeroes_LaneAlignedCheck: sub $8, %rsi jnc KeccakP1600_OverwriteWithZeroes_LaneAlignedLoop KeccakP1600_OverwriteWithZeroes_LastIncompleteLane: add $8, %rsi jz KeccakP1600_OverwriteWithZeroes_Exit mov (%rdx), %rax add %rdi, %rax KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop: movb $0, (%rax) inc %rax dec %rsi jnz KeccakP1600_OverwriteWithZeroes_LastIncompleteLaneLoop KeccakP1600_OverwriteWithZeroes_Exit: ret .ifndef no_size .size KeccakP1600_OverwriteWithZeroes,.-KeccakP1600_OverwriteWithZeroes .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); # %rdi %rsi %rdx %rcx # .ifdef add_underscore .globl _KeccakP1600_ExtractBytes _KeccakP1600_ExtractBytes: .else .globl KeccakP1600_ExtractBytes .hidden KeccakP1600_ExtractBytes .ifndef no_type .type KeccakP1600_ExtractBytes,@function .endif KeccakP1600_ExtractBytes: .endif .balign 32 push %rbx cmp $0, %rcx jz KeccakP1600_ExtractBytes_Exit mov %rdx, %rax # rax offset in lane and $0xFFFFFFF8, %edx # rdx pointer into state index mapper lea mapState(%rip), %r9 add %r9, %rdx and $7, %rax jz KeccakP1600_ExtractBytes_LaneAlignedCheck mov $8, %rbx # rbx is (max) length of incomplete lane sub %rax, %rbx cmp %rcx, %rbx cmovae %rcx, %rbx sub %rbx, %rcx # length -= length of incomplete lane mov (%rdx), %r9 add $8, %rdx add %rdi, %r9 add %rax, %r9 KeccakP1600_ExtractBytes_NotAlignedLoop: mov (%r9), %r8b inc %r9 mov %r8b, (%rsi) inc %rsi dec %rbx jnz KeccakP1600_ExtractBytes_NotAlignedLoop jmp KeccakP1600_ExtractBytes_LaneAlignedCheck KeccakP1600_ExtractBytes_LaneAlignedLoop: mov (%rdx), %rax add $8, %rdx add %rdi, %rax mov (%rax), %r8 mov %r8, (%rsi) add $8, %rsi KeccakP1600_ExtractBytes_LaneAlignedCheck: sub $8, %rcx jnc KeccakP1600_ExtractBytes_LaneAlignedLoop KeccakP1600_ExtractBytes_LastIncompleteLane: add $8, %rcx jz KeccakP1600_ExtractBytes_Exit mov (%rdx), %rax add %rdi, %rax mov (%rax), %r8 KeccakP1600_ExtractBytes_LastIncompleteLaneLoop: mov %r8b, (%rsi) shr $8, %r8 inc %rsi dec %rcx jnz KeccakP1600_ExtractBytes_LastIncompleteLaneLoop KeccakP1600_ExtractBytes_Exit: pop %rbx ret .ifndef no_size .size KeccakP1600_ExtractBytes,.-KeccakP1600_ExtractBytes .endif # ----------------------------------------------------------------------------- # # void KeccakP1600_ExtractAndAddBytes(const void *state, const unsigned char *input, unsigned char *output, unsigned int offset, unsigned int length); # %rdi %rsi %rdx %rcx %r8 # .ifdef add_underscore .globl _KeccakP1600_ExtractAndAddBytes _KeccakP1600_ExtractAndAddBytes: .else .globl KeccakP1600_ExtractAndAddBytes .hidden KeccakP1600_ExtractAndAddBytes .ifndef no_type .type KeccakP1600_ExtractAndAddBytes,@function .endif KeccakP1600_ExtractAndAddBytes: .endif .balign 32 push %rbx push %r10 cmp $0, %r8 jz KeccakP1600_ExtractAndAddBytes_Exit mov %rcx, %rax # rax offset in lane and $0xFFFFFFF8, %ecx # rcx pointer into state index mapper lea mapState(%rip), %r9 add %r9, %rcx and $7, %rax jz KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck mov $8, %rbx # rbx is (max) length of incomplete lane sub %rax, %rbx cmp %r8, %rbx cmovae %r8, %rbx sub %rbx, %r8 # length -= length of incomplete lane mov (%rcx), %r9 add $8, %rcx add %rdi, %r9 add %rax, %r9 KeccakP1600_ExtractAndAddBytes_NotAlignedLoop: mov (%r9), %r10b inc %r9 xor (%rsi), %r10b inc %rsi mov %r10b, (%rdx) inc %rdx dec %rbx jnz KeccakP1600_ExtractAndAddBytes_NotAlignedLoop jmp KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop: mov (%rcx), %rax add $8, %rcx add %rdi, %rax mov (%rax), %r10 xor (%rsi), %r10 add $8, %rsi mov %r10, (%rdx) add $8, %rdx KeccakP1600_ExtractAndAddBytes_LaneAlignedCheck: sub $8, %r8 jnc KeccakP1600_ExtractAndAddBytes_LaneAlignedLoop KeccakP1600_ExtractAndAddBytes_LastIncompleteLane: add $8, %r8 jz KeccakP1600_ExtractAndAddBytes_Exit mov (%rcx), %rax add %rdi, %rax mov (%rax), %r10 KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop: xor (%rsi), %r10b inc %rsi mov %r10b, (%rdx) inc %rdx shr $8, %r10 dec %r8 jnz KeccakP1600_ExtractAndAddBytes_LastIncompleteLaneLoop KeccakP1600_ExtractAndAddBytes_Exit: pop %r10 pop %rbx ret .ifndef no_size .size KeccakP1600_ExtractAndAddBytes,.-KeccakP1600_ExtractAndAddBytes .endif # ----------------------------------------------------------------------------- # # internal # .ifndef no_type .type __KeccakF1600,@function .endif .balign 32 __KeccakF1600: .Loop_avx2: ######################################### Theta vpshufd $0b01001110,%ymm2,%ymm13 vpxor %ymm3,%ymm5,%ymm12 vpxor %ymm6,%ymm4,%ymm9 vpxor %ymm1,%ymm12,%ymm12 vpxor %ymm9,%ymm12,%ymm12 # C[1..4] vpermq $0b10010011,%ymm12,%ymm11 vpxor %ymm2,%ymm13,%ymm13 vpermq $0b01001110,%ymm13,%ymm7 vpsrlq $63,%ymm12,%ymm8 vpaddq %ymm12,%ymm12,%ymm9 vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) vpermq $0b00111001,%ymm8,%ymm15 vpxor %ymm11,%ymm8,%ymm14 vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] vpxor %ymm0,%ymm13,%ymm13 vpxor %ymm7,%ymm13,%ymm13 # C[0..0] vpsrlq $63,%ymm13,%ymm7 vpaddq %ymm13,%ymm13,%ymm8 vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] ######################################### Rho + Pi + pre-Chi shuffle vpsllvq 0*32-96(%r8),%ymm2,%ymm10 vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 vpor %ymm10,%ymm2,%ymm2 vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta vpsllvq 2*32-96(%r8),%ymm3,%ymm11 vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 vpor %ymm11,%ymm3,%ymm3 vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta vpsllvq 3*32-96(%r8),%ymm4,%ymm12 vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 vpor %ymm12,%ymm4,%ymm4 vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta vpsllvq 4*32-96(%r8),%ymm5,%ymm13 vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 vpor %ymm13,%ymm5,%ymm5 vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 vpsllvq 5*32-96(%r8),%ymm6,%ymm14 vpsrlvq 5*32-96(%r9),%ymm6,%ymm8 vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 vpsllvq 1*32-96(%r8),%ymm1,%ymm15 vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 ######################################### Chi vpsrldq $8,%ymm8,%ymm14 vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] [2][0] vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] vpxor %ymm10,%ymm3,%ymm3 vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] vpxor %ymm12,%ymm5,%ymm5 vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] vpxor %ymm13,%ymm6,%ymm6 vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] [2][2] vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] vpxor %ymm9,%ymm2,%ymm2 vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle vpermq $0b10001101,%ymm5,%ymm5 vpermq $0b01110010,%ymm6,%ymm6 vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] vpxor %ymm7,%ymm0,%ymm0 vpxor %ymm8,%ymm1,%ymm1 vpxor %ymm11,%ymm4,%ymm4 ######################################### Iota vpxor (%r10),%ymm0,%ymm0 lea 32(%r10),%r10 dec %eax jnz .Loop_avx2 ret .ifndef no_size .size __KeccakF1600,.-__KeccakF1600 .endif .ifdef add_underscore .globl _KeccakP1600_Permute_24rounds _KeccakP1600_Permute_24rounds: .else .globl KeccakP1600_Permute_24rounds .hidden KeccakP1600_Permute_24rounds .ifndef no_type .type KeccakP1600_Permute_24rounds,@function .endif KeccakP1600_Permute_24rounds: .endif .balign 32 lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea iotas(%rip),%r10 mov $24,%eax lea 96(%rdi),%rdi vzeroupper vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 call __KeccakF1600 vmovq %xmm0,-96(%rdi) vmovdqu %ymm1,8+32*0-96(%rdi) vmovdqu %ymm2,8+32*1-96(%rdi) vmovdqu %ymm3,8+32*2-96(%rdi) vmovdqu %ymm4,8+32*3-96(%rdi) vmovdqu %ymm5,8+32*4-96(%rdi) vmovdqu %ymm6,8+32*5-96(%rdi) vzeroupper ret .ifndef no_size .size KeccakP1600_Permute_24rounds,.-KeccakP1600_Permute_24rounds .endif .ifdef add_underscore .globl _KeccakP1600_Permute_12rounds _KeccakP1600_Permute_12rounds: .else .globl KeccakP1600_Permute_12rounds .hidden KeccakP1600_Permute_12rounds .ifndef no_type .type KeccakP1600_Permute_12rounds,@function .endif KeccakP1600_Permute_12rounds: .endif .balign 32 lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea iotas+12*4*8(%rip),%r10 mov $12,%eax lea 96(%rdi),%rdi vzeroupper vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 call __KeccakF1600 vmovq %xmm0,-96(%rdi) vmovdqu %ymm1,8+32*0-96(%rdi) vmovdqu %ymm2,8+32*1-96(%rdi) vmovdqu %ymm3,8+32*2-96(%rdi) vmovdqu %ymm4,8+32*3-96(%rdi) vmovdqu %ymm5,8+32*4-96(%rdi) vmovdqu %ymm6,8+32*5-96(%rdi) vzeroupper ret .ifndef no_size .size KeccakP1600_Permute_12rounds,.-KeccakP1600_Permute_12rounds .endif .ifdef add_underscore .globl _KeccakP1600_Permute_Nrounds _KeccakP1600_Permute_Nrounds: .else .globl KeccakP1600_Permute_Nrounds .hidden KeccakP1600_Permute_Nrounds .ifndef no_type .type KeccakP1600_Permute_Nrounds,@function .endif KeccakP1600_Permute_Nrounds: .endif .balign 32 lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea iotas+24*4*8(%rip),%r10 mov %rsi,%rax shl $2+3,%rsi sub %rsi, %r10 lea 96(%rdi),%rdi vzeroupper vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 call __KeccakF1600 vmovq %xmm0,-96(%rdi) vmovdqu %ymm1,8+32*0-96(%rdi) vmovdqu %ymm2,8+32*1-96(%rdi) vmovdqu %ymm3,8+32*2-96(%rdi) vmovdqu %ymm4,8+32*3-96(%rdi) vmovdqu %ymm5,8+32*4-96(%rdi) vmovdqu %ymm6,8+32*5-96(%rdi) vzeroupper ret .ifndef no_size .size KeccakP1600_Permute_Nrounds,.-KeccakP1600_Permute_Nrounds .endif # ----------------------------------------------------------------------------- # # size_t KeccakF1600_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); # %rdi %rsi %rdx %rcx # .ifdef add_underscore .globl _KeccakF1600_FastLoop_Absorb _KeccakF1600_FastLoop_Absorb: .else .globl KeccakF1600_FastLoop_Absorb .hidden KeccakF1600_FastLoop_Absorb .ifndef no_type .type KeccakF1600_FastLoop_Absorb,@function .endif KeccakF1600_FastLoop_Absorb: .endif .balign 32 push %rbx push %r10 shr $3, %rcx # rcx = data length in lanes mov %rdx, %rbx # rbx = initial data pointer cmp %rsi, %rcx jb KeccakF1600_FastLoop_Absorb_Exit vzeroupper cmp $21, %rsi jnz KeccakF1600_FastLoop_Absorb_Not21Lanes sub $21, %rcx lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea 96(%rdi),%rdi vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 KeccakF1600_FastLoop_Absorb_Loop21Lanes: vpbroadcastq (%rdx),%ymm7 vmovdqu 8(%rdx),%ymm8 vmovdqa map2(%rip), %xmm15 vpcmpeqd %ymm14, %ymm14, %ymm14 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 vmovdqa mask3_21(%rip), %ymm14 vpxor %ymm10, %ymm10, %ymm10 vmovdqa map3(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 vmovdqa mask4_21(%rip), %ymm14 vpxor %ymm11, %ymm11, %ymm11 vmovdqa map4(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 vmovdqa mask5_21(%rip), %ymm14 vpxor %ymm12, %ymm12, %ymm12 vmovdqa map5(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 vmovdqa mask6_21(%rip), %ymm14 vpxor %ymm13, %ymm13, %ymm13 vmovdqa map6(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 vpxor %ymm7,%ymm0,%ymm0 vpxor %ymm8,%ymm1,%ymm1 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm11,%ymm4,%ymm4 vpxor %ymm12,%ymm5,%ymm5 vpxor %ymm13,%ymm6,%ymm6 add $21*8, %rdx lea iotas(%rip),%r10 mov $24,%eax call __KeccakF1600 sub $21, %rcx jnc KeccakF1600_FastLoop_Absorb_Loop21Lanes KeccakF1600_FastLoop_Absorb_SaveAndExit: vmovq %xmm0,-96(%rdi) vmovdqu %ymm1,8+32*0-96(%rdi) vmovdqu %ymm2,8+32*1-96(%rdi) vmovdqu %ymm3,8+32*2-96(%rdi) vmovdqu %ymm4,8+32*3-96(%rdi) vmovdqu %ymm5,8+32*4-96(%rdi) vmovdqu %ymm6,8+32*5-96(%rdi) KeccakF1600_FastLoop_Absorb_Exit: vzeroupper mov %rdx, %rax # return number of bytes processed sub %rbx, %rax pop %r10 pop %rbx ret KeccakF1600_FastLoop_Absorb_Not21Lanes: cmp $17, %rsi jnz KeccakF1600_FastLoop_Absorb_Not17Lanes sub $17, %rcx lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea 96(%rdi),%rdi vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 KeccakF1600_FastLoop_Absorb_Loop17Lanes: vpbroadcastq (%rdx),%ymm7 vmovdqu 8(%rdx),%ymm8 vmovdqa mask2_17(%rip), %ymm14 vpxor %ymm9, %ymm9, %ymm9 vmovdqa map2(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 vmovdqa mask3_17(%rip), %ymm14 vpxor %ymm10, %ymm10, %ymm10 vmovdqa map3(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 vmovdqa mask4_17(%rip), %ymm14 vpxor %ymm11, %ymm11, %ymm11 vmovdqa map4(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 vmovdqa mask5_17(%rip), %ymm14 vpxor %ymm12, %ymm12, %ymm12 vmovdqa map5(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 vmovdqa mask6_17(%rip), %ymm14 vpxor %ymm13, %ymm13, %ymm13 vmovdqa map6(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 vpxor %ymm7,%ymm0,%ymm0 vpxor %ymm8,%ymm1,%ymm1 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm11,%ymm4,%ymm4 vpxor %ymm12,%ymm5,%ymm5 vpxor %ymm13,%ymm6,%ymm6 add $17*8, %rdx lea iotas(%rip),%r10 mov $24,%eax call __KeccakF1600 sub $17, %rcx jnc KeccakF1600_FastLoop_Absorb_Loop17Lanes jmp KeccakF1600_FastLoop_Absorb_SaveAndExit KeccakF1600_FastLoop_Absorb_Not17Lanes: lea mapState(%rip), %r9 mov %rsi, %rax KeccakF1600_FastLoop_Absorb_LanesAddLoop: mov (%rdx), %r8 add $8, %rdx mov (%r9), %r10 add $8, %r9 add %rdi, %r10 xor %r8, (%r10) sub $1, %rax jnz KeccakF1600_FastLoop_Absorb_LanesAddLoop sub %rsi, %rcx push %rdi push %rsi push %rdx push %rcx .ifdef no_plt .ifdef add_underscore call _KeccakP1600_Permute_24rounds .else call KeccakP1600_Permute_24rounds .endif .else call KeccakP1600_Permute_24rounds@PLT .endif pop %rcx pop %rdx pop %rsi pop %rdi cmp %rsi, %rcx jae KeccakF1600_FastLoop_Absorb_Not17Lanes jmp KeccakF1600_FastLoop_Absorb_Exit .ifndef no_size .size KeccakF1600_FastLoop_Absorb,.-KeccakF1600_FastLoop_Absorb .endif # ----------------------------------------------------------------------------- # # size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); # %rdi %rsi %rdx %rcx # .ifdef add_underscore .globl _KeccakP1600_12rounds_FastLoop_Absorb _KeccakP1600_12rounds_FastLoop_Absorb: .else .globl KeccakP1600_12rounds_FastLoop_Absorb .hidden KeccakP1600_12rounds_FastLoop_Absorb .ifndef no_type .type KeccakP1600_12rounds_FastLoop_Absorb,@function .endif KeccakP1600_12rounds_FastLoop_Absorb: .endif .balign 32 push %rbx push %r10 shr $3, %rcx # rcx = data length in lanes mov %rdx, %rbx # rbx = initial data pointer cmp %rsi, %rcx jb KeccakP1600_12rounds_FastLoop_Absorb_Exit vzeroupper cmp $21, %rsi jnz KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes sub $21, %rcx lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea 96(%rdi),%rdi vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes: vpbroadcastq (%rdx),%ymm7 vmovdqu 8(%rdx),%ymm8 vmovdqa map2(%rip), %xmm15 vpcmpeqd %ymm14, %ymm14, %ymm14 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 vmovdqa mask3_21(%rip), %ymm14 vpxor %ymm10, %ymm10, %ymm10 vmovdqa map3(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 vmovdqa mask4_21(%rip), %ymm14 vpxor %ymm11, %ymm11, %ymm11 vmovdqa map4(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 vmovdqa mask5_21(%rip), %ymm14 vpxor %ymm12, %ymm12, %ymm12 vmovdqa map5(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 vmovdqa mask6_21(%rip), %ymm14 vpxor %ymm13, %ymm13, %ymm13 vmovdqa map6(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 vpxor %ymm7,%ymm0,%ymm0 vpxor %ymm8,%ymm1,%ymm1 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm11,%ymm4,%ymm4 vpxor %ymm12,%ymm5,%ymm5 vpxor %ymm13,%ymm6,%ymm6 add $21*8, %rdx lea iotas+12*4*8(%rip),%r10 mov $12,%eax call __KeccakF1600 sub $21, %rcx jnc KeccakP1600_12rounds_FastLoop_Absorb_Loop21Lanes KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit: vmovq %xmm0,-96(%rdi) vmovdqu %ymm1,8+32*0-96(%rdi) vmovdqu %ymm2,8+32*1-96(%rdi) vmovdqu %ymm3,8+32*2-96(%rdi) vmovdqu %ymm4,8+32*3-96(%rdi) vmovdqu %ymm5,8+32*4-96(%rdi) vmovdqu %ymm6,8+32*5-96(%rdi) KeccakP1600_12rounds_FastLoop_Absorb_Exit: vzeroupper mov %rdx, %rax # return number of bytes processed sub %rbx, %rax pop %r10 pop %rbx ret KeccakP1600_12rounds_FastLoop_Absorb_Not21Lanes: cmp $17, %rsi jnz KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes sub $17, %rcx lea rhotates_left+96(%rip),%r8 lea rhotates_right+96(%rip),%r9 lea 96(%rdi),%rdi vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] vmovdqu 8+32*0-96(%rdi),%ymm1 vmovdqu 8+32*1-96(%rdi),%ymm2 vmovdqu 8+32*2-96(%rdi),%ymm3 vmovdqu 8+32*3-96(%rdi),%ymm4 vmovdqu 8+32*4-96(%rdi),%ymm5 vmovdqu 8+32*5-96(%rdi),%ymm6 KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes: vpbroadcastq (%rdx),%ymm7 vmovdqu 8(%rdx),%ymm8 vmovdqa mask2_17(%rip), %ymm14 vpxor %ymm9, %ymm9, %ymm9 vmovdqa map2(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 vmovdqa mask3_17(%rip), %ymm14 vpxor %ymm10, %ymm10, %ymm10 vmovdqa map3(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 vmovdqa mask4_17(%rip), %ymm14 vpxor %ymm11, %ymm11, %ymm11 vmovdqa map4(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 vmovdqa mask5_17(%rip), %ymm14 vpxor %ymm12, %ymm12, %ymm12 vmovdqa map5(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 vmovdqa mask6_17(%rip), %ymm14 vpxor %ymm13, %ymm13, %ymm13 vmovdqa map6(%rip), %xmm15 vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 vpxor %ymm7,%ymm0,%ymm0 vpxor %ymm8,%ymm1,%ymm1 vpxor %ymm9,%ymm2,%ymm2 vpxor %ymm10,%ymm3,%ymm3 vpxor %ymm11,%ymm4,%ymm4 vpxor %ymm12,%ymm5,%ymm5 vpxor %ymm13,%ymm6,%ymm6 add $17*8, %rdx lea iotas+12*4*8(%rip),%r10 mov $12,%eax call __KeccakF1600 sub $17, %rcx jnc KeccakP1600_12rounds_FastLoop_Absorb_Loop17Lanes jmp KeccakP1600_12rounds_FastLoop_Absorb_SaveAndExit KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes: lea mapState(%rip), %r9 mov %rsi, %rax KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop: mov (%rdx), %r8 add $8, %rdx mov (%r9), %r10 add $8, %r9 add %rdi, %r10 xor %r8, (%r10) sub $1, %rax jnz KeccakP1600_12rounds_FastLoop_Absorb_LanesAddLoop sub %rsi, %rcx push %rdi push %rsi push %rdx push %rcx .ifdef no_plt .ifdef add_underscore call _KeccakP1600_Permute_12rounds .else call KeccakP1600_Permute_12rounds .endif .else call KeccakP1600_Permute_12rounds@PLT .endif pop %rcx pop %rdx pop %rsi pop %rdi cmp %rsi, %rcx jae KeccakP1600_12rounds_FastLoop_Absorb_Not17Lanes jmp KeccakP1600_12rounds_FastLoop_Absorb_Exit .ifndef no_size .size KeccakP1600_12rounds_FastLoop_Absorb,.-KeccakP1600_12rounds_FastLoop_Absorb .endif .equ ALLON, 0xFFFFFFFFFFFFFFFF .balign 64 rhotates_left: .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] rhotates_right: .quad 64-3, 64-18, 64-36, 64-41 .quad 64-1, 64-62, 64-28, 64-27 .quad 64-45, 64-6, 64-56, 64-39 .quad 64-10, 64-61, 64-55, 64-8 .quad 64-2, 64-15, 64-25, 64-20 .quad 64-44, 64-43, 64-21, 64-14 iotas: .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001 .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 mapState: .quad 0*8, 1*8, 2*8, 3*8, 4*8 .quad 7*8, 21*8, 10*8, 15*8, 20*8 .quad 5*8, 13*8, 22*8, 19*8, 12*8 .quad 8*8, 9*8, 18*8, 23*8, 16*8 .quad 6*8, 17*8, 14*8, 11*8, 24*8 .balign 16 map2: .long 10*8, 20*8, 5*8, 15*8 map3: .long 16*8, 7*8, 23*8, 14*8 map4: .long 11*8, 22*8, 8*8, 19*8 map5: .long 21*8, 17*8, 13*8, 9*8 map6: .long 6*8, 12*8, 18*8, 24*8 .balign 32 mask3_21: .quad ALLON, ALLON, 0, ALLON mask4_21: .quad ALLON, 0, ALLON, ALLON mask5_21: .quad 0, ALLON, ALLON, ALLON mask6_21: .quad ALLON, ALLON, ALLON, 0 mask2_17: .quad ALLON, 0, ALLON, ALLON mask3_17: .quad ALLON, ALLON, 0, ALLON mask4_17: .quad ALLON, 0, ALLON, 0 mask5_17: .quad 0, 0, ALLON, ALLON mask6_17: .quad ALLON, ALLON, 0, 0 .asciz "Keccak-1600 for AVX2, CRYPTOGAMS by "