; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. default rel %define XMMWORD %define YMMWORD %define ZMMWORD section .text code align=64 EXTERN GFp_ia32cap_P global GFp_poly1305_init_asm global GFp_poly1305_blocks global GFp_poly1305_emit ALIGN 32 GFp_poly1305_init_asm: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_GFp_poly1305_init_asm: mov rdi,rcx mov rsi,rdx mov rdx,r8 xor rax,rax mov QWORD[rdi],rax mov QWORD[8+rdi],rax mov QWORD[16+rdi],rax cmp rsi,0 je NEAR $L$no_key lea r10,[GFp_poly1305_blocks] lea r11,[GFp_poly1305_emit] mov r9,QWORD[((GFp_ia32cap_P+4))] lea rax,[poly1305_blocks_avx] lea rcx,[poly1305_emit_avx] bt r9,28 cmovc r10,rax cmovc r11,rcx lea rax,[poly1305_blocks_avx2] bt r9,37 cmovc r10,rax mov rax,0x0ffffffc0fffffff mov rcx,0x0ffffffc0ffffffc and rax,QWORD[rsi] and rcx,QWORD[8+rsi] mov QWORD[24+rdi],rax mov QWORD[32+rdi],rcx mov QWORD[rdx],r10 mov QWORD[8+rdx],r11 mov eax,1 $L$no_key: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_GFp_poly1305_init_asm: ALIGN 32 GFp_poly1305_blocks: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_GFp_poly1305_blocks: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 $L$blocks: shr rdx,4 jz NEAR $L$no_data push rbx push rbp push r12 push r13 push r14 push r15 $L$blocks_body: mov r15,rdx mov r11,QWORD[24+rdi] mov r13,QWORD[32+rdi] mov r14,QWORD[rdi] mov rbx,QWORD[8+rdi] mov rbp,QWORD[16+rdi] mov r12,r13 shr r13,2 mov rax,r12 add r13,r12 jmp NEAR $L$oop ALIGN 32 $L$oop: add r14,QWORD[rsi] adc rbx,QWORD[8+rsi] lea rsi,[16+rsi] adc rbp,rcx mul r14 mov r9,rax mov rax,r11 mov r10,rdx mul r14 mov r14,rax mov rax,r11 mov r8,rdx mul rbx add r9,rax mov rax,r13 adc r10,rdx mul rbx mov rbx,rbp add r14,rax adc r8,rdx imul rbx,r13 add r9,rbx mov rbx,r8 adc r10,0 imul rbp,r11 add rbx,r9 mov rax,-4 adc r10,rbp and rax,r10 mov rbp,r10 shr r10,2 and rbp,3 
add rax,r10 add r14,rax adc rbx,0 adc rbp,0 mov rax,r12 dec r15 jnz NEAR $L$oop mov QWORD[rdi],r14 mov QWORD[8+rdi],rbx mov QWORD[16+rdi],rbp mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rsp,[48+rsp] $L$no_data: $L$blocks_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_GFp_poly1305_blocks: ALIGN 32 GFp_poly1305_emit: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_GFp_poly1305_emit: mov rdi,rcx mov rsi,rdx mov rdx,r8 $L$emit: mov r8,QWORD[rdi] mov r9,QWORD[8+rdi] mov r10,QWORD[16+rdi] mov rax,r8 add r8,5 mov rcx,r9 adc r9,0 adc r10,0 shr r10,2 cmovnz rax,r8 cmovnz rcx,r9 add rax,QWORD[rdx] adc rcx,QWORD[8+rdx] mov QWORD[rsi],rax mov QWORD[8+rsi],rcx mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_GFp_poly1305_emit: ALIGN 32 __poly1305_block: mul r14 mov r9,rax mov rax,r11 mov r10,rdx mul r14 mov r14,rax mov rax,r11 mov r8,rdx mul rbx add r9,rax mov rax,r13 adc r10,rdx mul rbx mov rbx,rbp add r14,rax adc r8,rdx imul rbx,r13 add r9,rbx mov rbx,r8 adc r10,0 imul rbp,r11 add rbx,r9 mov rax,-4 adc r10,rbp and rax,r10 mov rbp,r10 shr r10,2 and rbp,3 add rax,r10 add r14,rax adc rbx,0 adc rbp,0 DB 0F3h,0C3h ;repret ALIGN 32 __poly1305_init_avx: mov r14,r11 mov rbx,r12 xor rbp,rbp lea rdi,[((48+64))+rdi] mov rax,r12 call __poly1305_block mov eax,0x3ffffff mov edx,0x3ffffff mov r8,r14 and eax,r14d mov r9,r11 and edx,r11d mov DWORD[((-64))+rdi],eax shr r8,26 mov DWORD[((-60))+rdi],edx shr r9,26 mov eax,0x3ffffff mov edx,0x3ffffff and eax,r8d and edx,r9d mov DWORD[((-48))+rdi],eax lea eax,[rax*4+rax] mov DWORD[((-44))+rdi],edx lea edx,[rdx*4+rdx] mov DWORD[((-32))+rdi],eax shr r8,26 mov DWORD[((-28))+rdi],edx shr r9,26 mov rax,rbx mov rdx,r12 shl rax,12 shl rdx,12 or rax,r8 or rdx,r9 and eax,0x3ffffff and edx,0x3ffffff mov DWORD[((-16))+rdi],eax lea 
eax,[rax*4+rax] mov DWORD[((-12))+rdi],edx lea edx,[rdx*4+rdx] mov DWORD[rdi],eax mov r8,rbx mov DWORD[4+rdi],edx mov r9,r12 mov eax,0x3ffffff mov edx,0x3ffffff shr r8,14 shr r9,14 and eax,r8d and edx,r9d mov DWORD[16+rdi],eax lea eax,[rax*4+rax] mov DWORD[20+rdi],edx lea edx,[rdx*4+rdx] mov DWORD[32+rdi],eax shr r8,26 mov DWORD[36+rdi],edx shr r9,26 mov rax,rbp shl rax,24 or r8,rax mov DWORD[48+rdi],r8d lea r8,[r8*4+r8] mov DWORD[52+rdi],r9d lea r9,[r9*4+r9] mov DWORD[64+rdi],r8d mov DWORD[68+rdi],r9d mov rax,r12 call __poly1305_block mov eax,0x3ffffff mov r8,r14 and eax,r14d shr r8,26 mov DWORD[((-52))+rdi],eax mov edx,0x3ffffff and edx,r8d mov DWORD[((-36))+rdi],edx lea edx,[rdx*4+rdx] shr r8,26 mov DWORD[((-20))+rdi],edx mov rax,rbx shl rax,12 or rax,r8 and eax,0x3ffffff mov DWORD[((-4))+rdi],eax lea eax,[rax*4+rax] mov r8,rbx mov DWORD[12+rdi],eax mov edx,0x3ffffff shr r8,14 and edx,r8d mov DWORD[28+rdi],edx lea edx,[rdx*4+rdx] shr r8,26 mov DWORD[44+rdi],edx mov rax,rbp shl rax,24 or r8,rax mov DWORD[60+rdi],r8d lea r8,[r8*4+r8] mov DWORD[76+rdi],r8d mov rax,r12 call __poly1305_block mov eax,0x3ffffff mov r8,r14 and eax,r14d shr r8,26 mov DWORD[((-56))+rdi],eax mov edx,0x3ffffff and edx,r8d mov DWORD[((-40))+rdi],edx lea edx,[rdx*4+rdx] shr r8,26 mov DWORD[((-24))+rdi],edx mov rax,rbx shl rax,12 or rax,r8 and eax,0x3ffffff mov DWORD[((-8))+rdi],eax lea eax,[rax*4+rax] mov r8,rbx mov DWORD[8+rdi],eax mov edx,0x3ffffff shr r8,14 and edx,r8d mov DWORD[24+rdi],edx lea edx,[rdx*4+rdx] shr r8,26 mov DWORD[40+rdi],edx mov rax,rbp shl rax,24 or r8,rax mov DWORD[56+rdi],r8d lea r8,[r8*4+r8] mov DWORD[72+rdi],r8d lea rdi,[((-48-64))+rdi] DB 0F3h,0C3h ;repret ALIGN 32 poly1305_blocks_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_poly1305_blocks_avx: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8d,DWORD[20+rdi] cmp rdx,128 jae NEAR $L$blocks_avx test r8d,r8d jz NEAR $L$blocks $L$blocks_avx: and rdx,-16 jz NEAR 
$L$no_data_avx vzeroupper test r8d,r8d jz NEAR $L$base2_64_avx test rdx,31 jz NEAR $L$even_avx push rbx push rbp push r12 push r13 push r14 push r15 $L$blocks_avx_body: mov r15,rdx mov r8,QWORD[rdi] mov r9,QWORD[8+rdi] mov ebp,DWORD[16+rdi] mov r11,QWORD[24+rdi] mov r13,QWORD[32+rdi] mov r14d,r8d and r8,-2147483648 mov r12,r9 mov ebx,r9d and r9,-2147483648 shr r8,6 shl r12,52 add r14,r8 shr rbx,12 shr r9,18 add r14,r12 adc rbx,r9 mov r8,rbp shl r8,40 shr rbp,24 add rbx,r8 adc rbp,0 mov r9,-4 mov r8,rbp and r9,rbp shr r8,2 and rbp,3 add r8,r9 add r14,r8 adc rbx,0 adc rbp,0 mov r12,r13 mov rax,r13 shr r13,2 add r13,r12 add r14,QWORD[rsi] adc rbx,QWORD[8+rsi] lea rsi,[16+rsi] adc rbp,rcx call __poly1305_block test rcx,rcx jz NEAR $L$store_base2_64_avx mov rax,r14 mov rdx,r14 shr r14,52 mov r11,rbx mov r12,rbx shr rdx,26 and rax,0x3ffffff shl r11,12 and rdx,0x3ffffff shr rbx,14 or r14,r11 shl rbp,24 and r14,0x3ffffff shr r12,40 and rbx,0x3ffffff or rbp,r12 sub r15,16 jz NEAR $L$store_base2_26_avx vmovd xmm0,eax vmovd xmm1,edx vmovd xmm2,r14d vmovd xmm3,ebx vmovd xmm4,ebp jmp NEAR $L$proceed_avx ALIGN 32 $L$store_base2_64_avx: mov QWORD[rdi],r14 mov QWORD[8+rdi],rbx mov QWORD[16+rdi],rbp jmp NEAR $L$done_avx ALIGN 16 $L$store_base2_26_avx: mov DWORD[rdi],eax mov DWORD[4+rdi],edx mov DWORD[8+rdi],r14d mov DWORD[12+rdi],ebx mov DWORD[16+rdi],ebp ALIGN 16 $L$done_avx: mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rsp,[48+rsp] $L$no_data_avx: $L$blocks_avx_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret ALIGN 32 $L$base2_64_avx: push rbx push rbp push r12 push r13 push r14 push r15 $L$base2_64_avx_body: mov r15,rdx mov r11,QWORD[24+rdi] mov r13,QWORD[32+rdi] mov r14,QWORD[rdi] mov rbx,QWORD[8+rdi] mov ebp,DWORD[16+rdi] mov r12,r13 mov rax,r13 shr r13,2 add r13,r12 test rdx,31 jz NEAR $L$init_avx add r14,QWORD[rsi] adc rbx,QWORD[8+rsi] lea 
rsi,[16+rsi] adc rbp,rcx sub r15,16 call __poly1305_block $L$init_avx: mov rax,r14 mov rdx,r14 shr r14,52 mov r8,rbx mov r9,rbx shr rdx,26 and rax,0x3ffffff shl r8,12 and rdx,0x3ffffff shr rbx,14 or r14,r8 shl rbp,24 and r14,0x3ffffff shr r9,40 and rbx,0x3ffffff or rbp,r9 vmovd xmm0,eax vmovd xmm1,edx vmovd xmm2,r14d vmovd xmm3,ebx vmovd xmm4,ebp mov DWORD[20+rdi],1 call __poly1305_init_avx $L$proceed_avx: mov rdx,r15 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rax,[48+rsp] lea rsp,[48+rsp] $L$base2_64_avx_epilogue: jmp NEAR $L$do_avx ALIGN 32 $L$even_avx: vmovd xmm0,DWORD[rdi] vmovd xmm1,DWORD[4+rdi] vmovd xmm2,DWORD[8+rdi] vmovd xmm3,DWORD[12+rdi] vmovd xmm4,DWORD[16+rdi] $L$do_avx: lea r11,[((-248))+rsp] sub rsp,0x218 vmovdqa XMMWORD[80+r11],xmm6 vmovdqa XMMWORD[96+r11],xmm7 vmovdqa XMMWORD[112+r11],xmm8 vmovdqa XMMWORD[128+r11],xmm9 vmovdqa XMMWORD[144+r11],xmm10 vmovdqa XMMWORD[160+r11],xmm11 vmovdqa XMMWORD[176+r11],xmm12 vmovdqa XMMWORD[192+r11],xmm13 vmovdqa XMMWORD[208+r11],xmm14 vmovdqa XMMWORD[224+r11],xmm15 $L$do_avx_body: sub rdx,64 lea rax,[((-32))+rsi] cmovc rsi,rax vmovdqu xmm14,XMMWORD[48+rdi] lea rdi,[112+rdi] lea rcx,[$L$const] vmovdqu xmm5,XMMWORD[32+rsi] vmovdqu xmm6,XMMWORD[48+rsi] vmovdqa xmm15,XMMWORD[64+rcx] vpsrldq xmm7,xmm5,6 vpsrldq xmm8,xmm6,6 vpunpckhqdq xmm9,xmm5,xmm6 vpunpcklqdq xmm5,xmm5,xmm6 vpunpcklqdq xmm8,xmm7,xmm8 vpsrlq xmm9,xmm9,40 vpsrlq xmm6,xmm5,26 vpand xmm5,xmm5,xmm15 vpsrlq xmm7,xmm8,4 vpand xmm6,xmm6,xmm15 vpsrlq xmm8,xmm8,30 vpand xmm7,xmm7,xmm15 vpand xmm8,xmm8,xmm15 vpor xmm9,xmm9,XMMWORD[32+rcx] jbe NEAR $L$skip_loop_avx vmovdqu xmm11,XMMWORD[((-48))+rdi] vmovdqu xmm12,XMMWORD[((-32))+rdi] vpshufd xmm13,xmm14,0xEE vpshufd xmm10,xmm14,0x44 vmovdqa XMMWORD[(-144)+r11],xmm13 vmovdqa XMMWORD[rsp],xmm10 vpshufd xmm14,xmm11,0xEE vmovdqu xmm10,XMMWORD[((-16))+rdi] vpshufd xmm11,xmm11,0x44 vmovdqa XMMWORD[(-128)+r11],xmm14 vmovdqa 
XMMWORD[16+rsp],xmm11 vpshufd xmm13,xmm12,0xEE vmovdqu xmm11,XMMWORD[rdi] vpshufd xmm12,xmm12,0x44 vmovdqa XMMWORD[(-112)+r11],xmm13 vmovdqa XMMWORD[32+rsp],xmm12 vpshufd xmm14,xmm10,0xEE vmovdqu xmm12,XMMWORD[16+rdi] vpshufd xmm10,xmm10,0x44 vmovdqa XMMWORD[(-96)+r11],xmm14 vmovdqa XMMWORD[48+rsp],xmm10 vpshufd xmm13,xmm11,0xEE vmovdqu xmm10,XMMWORD[32+rdi] vpshufd xmm11,xmm11,0x44 vmovdqa XMMWORD[(-80)+r11],xmm13 vmovdqa XMMWORD[64+rsp],xmm11 vpshufd xmm14,xmm12,0xEE vmovdqu xmm11,XMMWORD[48+rdi] vpshufd xmm12,xmm12,0x44 vmovdqa XMMWORD[(-64)+r11],xmm14 vmovdqa XMMWORD[80+rsp],xmm12 vpshufd xmm13,xmm10,0xEE vmovdqu xmm12,XMMWORD[64+rdi] vpshufd xmm10,xmm10,0x44 vmovdqa XMMWORD[(-48)+r11],xmm13 vmovdqa XMMWORD[96+rsp],xmm10 vpshufd xmm14,xmm11,0xEE vpshufd xmm11,xmm11,0x44 vmovdqa XMMWORD[(-32)+r11],xmm14 vmovdqa XMMWORD[112+rsp],xmm11 vpshufd xmm13,xmm12,0xEE vmovdqa xmm14,XMMWORD[rsp] vpshufd xmm12,xmm12,0x44 vmovdqa XMMWORD[(-16)+r11],xmm13 vmovdqa XMMWORD[128+rsp],xmm12 jmp NEAR $L$oop_avx ALIGN 32 $L$oop_avx: vpmuludq xmm10,xmm14,xmm5 vpmuludq xmm11,xmm14,xmm6 vmovdqa XMMWORD[32+r11],xmm2 vpmuludq xmm12,xmm14,xmm7 vmovdqa xmm2,XMMWORD[16+rsp] vpmuludq xmm13,xmm14,xmm8 vpmuludq xmm14,xmm14,xmm9 vmovdqa XMMWORD[r11],xmm0 vpmuludq xmm0,xmm9,XMMWORD[32+rsp] vmovdqa XMMWORD[16+r11],xmm1 vpmuludq xmm1,xmm2,xmm8 vpaddq xmm10,xmm10,xmm0 vpaddq xmm14,xmm14,xmm1 vmovdqa XMMWORD[48+r11],xmm3 vpmuludq xmm0,xmm2,xmm7 vpmuludq xmm1,xmm2,xmm6 vpaddq xmm13,xmm13,xmm0 vmovdqa xmm3,XMMWORD[48+rsp] vpaddq xmm12,xmm12,xmm1 vmovdqa XMMWORD[64+r11],xmm4 vpmuludq xmm2,xmm2,xmm5 vpmuludq xmm0,xmm3,xmm7 vpaddq xmm11,xmm11,xmm2 vmovdqa xmm4,XMMWORD[64+rsp] vpaddq xmm14,xmm14,xmm0 vpmuludq xmm1,xmm3,xmm6 vpmuludq xmm3,xmm3,xmm5 vpaddq xmm13,xmm13,xmm1 vmovdqa xmm2,XMMWORD[80+rsp] vpaddq xmm12,xmm12,xmm3 vpmuludq xmm0,xmm4,xmm9 vpmuludq xmm4,xmm4,xmm8 vpaddq xmm11,xmm11,xmm0 vmovdqa xmm3,XMMWORD[96+rsp] vpaddq xmm10,xmm10,xmm4 vmovdqa xmm4,XMMWORD[128+rsp] vpmuludq xmm1,xmm2,xmm6 
vpmuludq xmm2,xmm2,xmm5 vpaddq xmm14,xmm14,xmm1 vpaddq xmm13,xmm13,xmm2 vpmuludq xmm0,xmm3,xmm9 vpmuludq xmm1,xmm3,xmm8 vpaddq xmm12,xmm12,xmm0 vmovdqu xmm0,XMMWORD[rsi] vpaddq xmm11,xmm11,xmm1 vpmuludq xmm3,xmm3,xmm7 vpmuludq xmm7,xmm4,xmm7 vpaddq xmm10,xmm10,xmm3 vmovdqu xmm1,XMMWORD[16+rsi] vpaddq xmm11,xmm11,xmm7 vpmuludq xmm8,xmm4,xmm8 vpmuludq xmm9,xmm4,xmm9 vpsrldq xmm2,xmm0,6 vpaddq xmm12,xmm12,xmm8 vpaddq xmm13,xmm13,xmm9 vpsrldq xmm3,xmm1,6 vpmuludq xmm9,xmm5,XMMWORD[112+rsp] vpmuludq xmm5,xmm4,xmm6 vpunpckhqdq xmm4,xmm0,xmm1 vpaddq xmm14,xmm14,xmm9 vmovdqa xmm9,XMMWORD[((-144))+r11] vpaddq xmm10,xmm10,xmm5 vpunpcklqdq xmm0,xmm0,xmm1 vpunpcklqdq xmm3,xmm2,xmm3 vpsrldq xmm4,xmm4,5 vpsrlq xmm1,xmm0,26 vpand xmm0,xmm0,xmm15 vpsrlq xmm2,xmm3,4 vpand xmm1,xmm1,xmm15 vpand xmm4,xmm4,XMMWORD[rcx] vpsrlq xmm3,xmm3,30 vpand xmm2,xmm2,xmm15 vpand xmm3,xmm3,xmm15 vpor xmm4,xmm4,XMMWORD[32+rcx] vpaddq xmm0,xmm0,XMMWORD[r11] vpaddq xmm1,xmm1,XMMWORD[16+r11] vpaddq xmm2,xmm2,XMMWORD[32+r11] vpaddq xmm3,xmm3,XMMWORD[48+r11] vpaddq xmm4,xmm4,XMMWORD[64+r11] lea rax,[32+rsi] lea rsi,[64+rsi] sub rdx,64 cmovc rsi,rax vpmuludq xmm5,xmm9,xmm0 vpmuludq xmm6,xmm9,xmm1 vpaddq xmm10,xmm10,xmm5 vpaddq xmm11,xmm11,xmm6 vmovdqa xmm7,XMMWORD[((-128))+r11] vpmuludq xmm5,xmm9,xmm2 vpmuludq xmm6,xmm9,xmm3 vpaddq xmm12,xmm12,xmm5 vpaddq xmm13,xmm13,xmm6 vpmuludq xmm9,xmm9,xmm4 vpmuludq xmm5,xmm4,XMMWORD[((-112))+r11] vpaddq xmm14,xmm14,xmm9 vpaddq xmm10,xmm10,xmm5 vpmuludq xmm6,xmm7,xmm2 vpmuludq xmm5,xmm7,xmm3 vpaddq xmm13,xmm13,xmm6 vmovdqa xmm8,XMMWORD[((-96))+r11] vpaddq xmm14,xmm14,xmm5 vpmuludq xmm6,xmm7,xmm1 vpmuludq xmm7,xmm7,xmm0 vpaddq xmm12,xmm12,xmm6 vpaddq xmm11,xmm11,xmm7 vmovdqa xmm9,XMMWORD[((-80))+r11] vpmuludq xmm5,xmm8,xmm2 vpmuludq xmm6,xmm8,xmm1 vpaddq xmm14,xmm14,xmm5 vpaddq xmm13,xmm13,xmm6 vmovdqa xmm7,XMMWORD[((-64))+r11] vpmuludq xmm8,xmm8,xmm0 vpmuludq xmm5,xmm9,xmm4 vpaddq xmm12,xmm12,xmm8 vpaddq xmm11,xmm11,xmm5 vmovdqa xmm8,XMMWORD[((-48))+r11] vpmuludq 
xmm9,xmm9,xmm3 vpmuludq xmm6,xmm7,xmm1 vpaddq xmm10,xmm10,xmm9 vmovdqa xmm9,XMMWORD[((-16))+r11] vpaddq xmm14,xmm14,xmm6 vpmuludq xmm7,xmm7,xmm0 vpmuludq xmm5,xmm8,xmm4 vpaddq xmm13,xmm13,xmm7 vpaddq xmm12,xmm12,xmm5 vmovdqu xmm5,XMMWORD[32+rsi] vpmuludq xmm7,xmm8,xmm3 vpmuludq xmm8,xmm8,xmm2 vpaddq xmm11,xmm11,xmm7 vmovdqu xmm6,XMMWORD[48+rsi] vpaddq xmm10,xmm10,xmm8 vpmuludq xmm2,xmm9,xmm2 vpmuludq xmm3,xmm9,xmm3 vpsrldq xmm7,xmm5,6 vpaddq xmm11,xmm11,xmm2 vpmuludq xmm4,xmm9,xmm4 vpsrldq xmm8,xmm6,6 vpaddq xmm2,xmm12,xmm3 vpaddq xmm3,xmm13,xmm4 vpmuludq xmm4,xmm0,XMMWORD[((-32))+r11] vpmuludq xmm0,xmm9,xmm1 vpunpckhqdq xmm9,xmm5,xmm6 vpaddq xmm4,xmm14,xmm4 vpaddq xmm0,xmm10,xmm0 vpunpcklqdq xmm5,xmm5,xmm6 vpunpcklqdq xmm8,xmm7,xmm8 vpsrldq xmm9,xmm9,5 vpsrlq xmm6,xmm5,26 vmovdqa xmm14,XMMWORD[rsp] vpand xmm5,xmm5,xmm15 vpsrlq xmm7,xmm8,4 vpand xmm6,xmm6,xmm15 vpand xmm9,xmm9,XMMWORD[rcx] vpsrlq xmm8,xmm8,30 vpand xmm7,xmm7,xmm15 vpand xmm8,xmm8,xmm15 vpor xmm9,xmm9,XMMWORD[32+rcx] vpsrlq xmm13,xmm3,26 vpand xmm3,xmm3,xmm15 vpaddq xmm4,xmm4,xmm13 vpsrlq xmm10,xmm0,26 vpand xmm0,xmm0,xmm15 vpaddq xmm1,xmm11,xmm10 vpsrlq xmm10,xmm4,26 vpand xmm4,xmm4,xmm15 vpsrlq xmm11,xmm1,26 vpand xmm1,xmm1,xmm15 vpaddq xmm2,xmm2,xmm11 vpaddq xmm0,xmm0,xmm10 vpsllq xmm10,xmm10,2 vpaddq xmm0,xmm0,xmm10 vpsrlq xmm12,xmm2,26 vpand xmm2,xmm2,xmm15 vpaddq xmm3,xmm3,xmm12 vpsrlq xmm10,xmm0,26 vpand xmm0,xmm0,xmm15 vpaddq xmm1,xmm1,xmm10 vpsrlq xmm13,xmm3,26 vpand xmm3,xmm3,xmm15 vpaddq xmm4,xmm4,xmm13 ja NEAR $L$oop_avx $L$skip_loop_avx: vpshufd xmm14,xmm14,0x10 add rdx,32 jnz NEAR $L$ong_tail_avx vpaddq xmm7,xmm7,xmm2 vpaddq xmm5,xmm5,xmm0 vpaddq xmm6,xmm6,xmm1 vpaddq xmm8,xmm8,xmm3 vpaddq xmm9,xmm9,xmm4 $L$ong_tail_avx: vmovdqa XMMWORD[32+r11],xmm2 vmovdqa XMMWORD[r11],xmm0 vmovdqa XMMWORD[16+r11],xmm1 vmovdqa XMMWORD[48+r11],xmm3 vmovdqa XMMWORD[64+r11],xmm4 vpmuludq xmm12,xmm14,xmm7 vpmuludq xmm10,xmm14,xmm5 vpshufd xmm2,XMMWORD[((-48))+rdi],0x10 vpmuludq xmm11,xmm14,xmm6 vpmuludq 
xmm13,xmm14,xmm8 vpmuludq xmm14,xmm14,xmm9 vpmuludq xmm0,xmm2,xmm8 vpaddq xmm14,xmm14,xmm0 vpshufd xmm3,XMMWORD[((-32))+rdi],0x10 vpmuludq xmm1,xmm2,xmm7 vpaddq xmm13,xmm13,xmm1 vpshufd xmm4,XMMWORD[((-16))+rdi],0x10 vpmuludq xmm0,xmm2,xmm6 vpaddq xmm12,xmm12,xmm0 vpmuludq xmm2,xmm2,xmm5 vpaddq xmm11,xmm11,xmm2 vpmuludq xmm3,xmm3,xmm9 vpaddq xmm10,xmm10,xmm3 vpshufd xmm2,XMMWORD[rdi],0x10 vpmuludq xmm1,xmm4,xmm7 vpaddq xmm14,xmm14,xmm1 vpmuludq xmm0,xmm4,xmm6 vpaddq xmm13,xmm13,xmm0 vpshufd xmm3,XMMWORD[16+rdi],0x10 vpmuludq xmm4,xmm4,xmm5 vpaddq xmm12,xmm12,xmm4 vpmuludq xmm1,xmm2,xmm9 vpaddq xmm11,xmm11,xmm1 vpshufd xmm4,XMMWORD[32+rdi],0x10 vpmuludq xmm2,xmm2,xmm8 vpaddq xmm10,xmm10,xmm2 vpmuludq xmm0,xmm3,xmm6 vpaddq xmm14,xmm14,xmm0 vpmuludq xmm3,xmm3,xmm5 vpaddq xmm13,xmm13,xmm3 vpshufd xmm2,XMMWORD[48+rdi],0x10 vpmuludq xmm1,xmm4,xmm9 vpaddq xmm12,xmm12,xmm1 vpshufd xmm3,XMMWORD[64+rdi],0x10 vpmuludq xmm0,xmm4,xmm8 vpaddq xmm11,xmm11,xmm0 vpmuludq xmm4,xmm4,xmm7 vpaddq xmm10,xmm10,xmm4 vpmuludq xmm2,xmm2,xmm5 vpaddq xmm14,xmm14,xmm2 vpmuludq xmm1,xmm3,xmm9 vpaddq xmm13,xmm13,xmm1 vpmuludq xmm0,xmm3,xmm8 vpaddq xmm12,xmm12,xmm0 vpmuludq xmm1,xmm3,xmm7 vpaddq xmm11,xmm11,xmm1 vpmuludq xmm3,xmm3,xmm6 vpaddq xmm10,xmm10,xmm3 jz NEAR $L$short_tail_avx vmovdqu xmm0,XMMWORD[rsi] vmovdqu xmm1,XMMWORD[16+rsi] vpsrldq xmm2,xmm0,6 vpsrldq xmm3,xmm1,6 vpunpckhqdq xmm4,xmm0,xmm1 vpunpcklqdq xmm0,xmm0,xmm1 vpunpcklqdq xmm3,xmm2,xmm3 vpsrlq xmm4,xmm4,40 vpsrlq xmm1,xmm0,26 vpand xmm0,xmm0,xmm15 vpsrlq xmm2,xmm3,4 vpand xmm1,xmm1,xmm15 vpsrlq xmm3,xmm3,30 vpand xmm2,xmm2,xmm15 vpand xmm3,xmm3,xmm15 vpor xmm4,xmm4,XMMWORD[32+rcx] vpshufd xmm9,XMMWORD[((-64))+rdi],0x32 vpaddq xmm0,xmm0,XMMWORD[r11] vpaddq xmm1,xmm1,XMMWORD[16+r11] vpaddq xmm2,xmm2,XMMWORD[32+r11] vpaddq xmm3,xmm3,XMMWORD[48+r11] vpaddq xmm4,xmm4,XMMWORD[64+r11] vpmuludq xmm5,xmm9,xmm0 vpaddq xmm10,xmm10,xmm5 vpmuludq xmm6,xmm9,xmm1 vpaddq xmm11,xmm11,xmm6 vpmuludq xmm5,xmm9,xmm2 vpaddq xmm12,xmm12,xmm5 
vpshufd xmm7,XMMWORD[((-48))+rdi],0x32 vpmuludq xmm6,xmm9,xmm3 vpaddq xmm13,xmm13,xmm6 vpmuludq xmm9,xmm9,xmm4 vpaddq xmm14,xmm14,xmm9 vpmuludq xmm5,xmm7,xmm3 vpaddq xmm14,xmm14,xmm5 vpshufd xmm8,XMMWORD[((-32))+rdi],0x32 vpmuludq xmm6,xmm7,xmm2 vpaddq xmm13,xmm13,xmm6 vpshufd xmm9,XMMWORD[((-16))+rdi],0x32 vpmuludq xmm5,xmm7,xmm1 vpaddq xmm12,xmm12,xmm5 vpmuludq xmm7,xmm7,xmm0 vpaddq xmm11,xmm11,xmm7 vpmuludq xmm8,xmm8,xmm4 vpaddq xmm10,xmm10,xmm8 vpshufd xmm7,XMMWORD[rdi],0x32 vpmuludq xmm6,xmm9,xmm2 vpaddq xmm14,xmm14,xmm6 vpmuludq xmm5,xmm9,xmm1 vpaddq xmm13,xmm13,xmm5 vpshufd xmm8,XMMWORD[16+rdi],0x32 vpmuludq xmm9,xmm9,xmm0 vpaddq xmm12,xmm12,xmm9 vpmuludq xmm6,xmm7,xmm4 vpaddq xmm11,xmm11,xmm6 vpshufd xmm9,XMMWORD[32+rdi],0x32 vpmuludq xmm7,xmm7,xmm3 vpaddq xmm10,xmm10,xmm7 vpmuludq xmm5,xmm8,xmm1 vpaddq xmm14,xmm14,xmm5 vpmuludq xmm8,xmm8,xmm0 vpaddq xmm13,xmm13,xmm8 vpshufd xmm7,XMMWORD[48+rdi],0x32 vpmuludq xmm6,xmm9,xmm4 vpaddq xmm12,xmm12,xmm6 vpshufd xmm8,XMMWORD[64+rdi],0x32 vpmuludq xmm5,xmm9,xmm3 vpaddq xmm11,xmm11,xmm5 vpmuludq xmm9,xmm9,xmm2 vpaddq xmm10,xmm10,xmm9 vpmuludq xmm7,xmm7,xmm0 vpaddq xmm14,xmm14,xmm7 vpmuludq xmm6,xmm8,xmm4 vpaddq xmm13,xmm13,xmm6 vpmuludq xmm5,xmm8,xmm3 vpaddq xmm12,xmm12,xmm5 vpmuludq xmm6,xmm8,xmm2 vpaddq xmm11,xmm11,xmm6 vpmuludq xmm8,xmm8,xmm1 vpaddq xmm10,xmm10,xmm8 $L$short_tail_avx: vpsrldq xmm9,xmm14,8 vpsrldq xmm8,xmm13,8 vpsrldq xmm6,xmm11,8 vpsrldq xmm5,xmm10,8 vpsrldq xmm7,xmm12,8 vpaddq xmm13,xmm13,xmm8 vpaddq xmm14,xmm14,xmm9 vpaddq xmm10,xmm10,xmm5 vpaddq xmm11,xmm11,xmm6 vpaddq xmm12,xmm12,xmm7 vpsrlq xmm3,xmm13,26 vpand xmm13,xmm13,xmm15 vpaddq xmm14,xmm14,xmm3 vpsrlq xmm0,xmm10,26 vpand xmm10,xmm10,xmm15 vpaddq xmm11,xmm11,xmm0 vpsrlq xmm4,xmm14,26 vpand xmm14,xmm14,xmm15 vpsrlq xmm1,xmm11,26 vpand xmm11,xmm11,xmm15 vpaddq xmm12,xmm12,xmm1 vpaddq xmm10,xmm10,xmm4 vpsllq xmm4,xmm4,2 vpaddq xmm10,xmm10,xmm4 vpsrlq xmm2,xmm12,26 vpand xmm12,xmm12,xmm15 vpaddq xmm13,xmm13,xmm2 vpsrlq xmm0,xmm10,26 vpand 
xmm10,xmm10,xmm15 vpaddq xmm11,xmm11,xmm0 vpsrlq xmm3,xmm13,26 vpand xmm13,xmm13,xmm15 vpaddq xmm14,xmm14,xmm3 vmovd DWORD[(-112)+rdi],xmm10 vmovd DWORD[(-108)+rdi],xmm11 vmovd DWORD[(-104)+rdi],xmm12 vmovd DWORD[(-100)+rdi],xmm13 vmovd DWORD[(-96)+rdi],xmm14 vmovdqa xmm6,XMMWORD[80+r11] vmovdqa xmm7,XMMWORD[96+r11] vmovdqa xmm8,XMMWORD[112+r11] vmovdqa xmm9,XMMWORD[128+r11] vmovdqa xmm10,XMMWORD[144+r11] vmovdqa xmm11,XMMWORD[160+r11] vmovdqa xmm12,XMMWORD[176+r11] vmovdqa xmm13,XMMWORD[192+r11] vmovdqa xmm14,XMMWORD[208+r11] vmovdqa xmm15,XMMWORD[224+r11] lea rsp,[248+r11] $L$do_avx_epilogue: vzeroupper mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_poly1305_blocks_avx: ALIGN 32 poly1305_emit_avx: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_poly1305_emit_avx: mov rdi,rcx mov rsi,rdx mov rdx,r8 cmp DWORD[20+rdi],0 je NEAR $L$emit mov eax,DWORD[rdi] mov ecx,DWORD[4+rdi] mov r8d,DWORD[8+rdi] mov r11d,DWORD[12+rdi] mov r10d,DWORD[16+rdi] shl rcx,26 mov r9,r8 shl r8,52 add rax,rcx shr r9,12 add r8,rax adc r9,0 shl r11,14 mov rax,r10 shr r10,24 add r9,r11 shl rax,40 add r9,rax adc r10,0 mov rax,r10 mov rcx,r10 and r10,3 shr rax,2 and rcx,-4 add rax,rcx add r8,rax adc r9,0 adc r10,0 mov rax,r8 add r8,5 mov rcx,r9 adc r9,0 adc r10,0 shr r10,2 cmovnz rax,r8 cmovnz rcx,r9 add rax,QWORD[rdx] adc rcx,QWORD[8+rdx] mov QWORD[rsi],rax mov QWORD[8+rsi],rcx mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_poly1305_emit_avx: ALIGN 32 poly1305_blocks_avx2: mov QWORD[8+rsp],rdi ;WIN64 prologue mov QWORD[16+rsp],rsi mov rax,rsp $L$SEH_begin_poly1305_blocks_avx2: mov rdi,rcx mov rsi,rdx mov rdx,r8 mov rcx,r9 mov r8d,DWORD[20+rdi] cmp rdx,128 jae NEAR $L$blocks_avx2 test r8d,r8d jz NEAR $L$blocks $L$blocks_avx2: and rdx,-16 jz NEAR $L$no_data_avx2 vzeroupper test r8d,r8d jz NEAR $L$base2_64_avx2 test rdx,63 jz NEAR $L$even_avx2 push rbx push rbp push r12 push 
r13 push r14 push r15 $L$blocks_avx2_body: mov r15,rdx mov r8,QWORD[rdi] mov r9,QWORD[8+rdi] mov ebp,DWORD[16+rdi] mov r11,QWORD[24+rdi] mov r13,QWORD[32+rdi] mov r14d,r8d and r8,-2147483648 mov r12,r9 mov ebx,r9d and r9,-2147483648 shr r8,6 shl r12,52 add r14,r8 shr rbx,12 shr r9,18 add r14,r12 adc rbx,r9 mov r8,rbp shl r8,40 shr rbp,24 add rbx,r8 adc rbp,0 mov r9,-4 mov r8,rbp and r9,rbp shr r8,2 and rbp,3 add r8,r9 add r14,r8 adc rbx,0 adc rbp,0 mov r12,r13 mov rax,r13 shr r13,2 add r13,r12 $L$base2_26_pre_avx2: add r14,QWORD[rsi] adc rbx,QWORD[8+rsi] lea rsi,[16+rsi] adc rbp,rcx sub r15,16 call __poly1305_block mov rax,r12 test r15,63 jnz NEAR $L$base2_26_pre_avx2 test rcx,rcx jz NEAR $L$store_base2_64_avx2 mov rax,r14 mov rdx,r14 shr r14,52 mov r11,rbx mov r12,rbx shr rdx,26 and rax,0x3ffffff shl r11,12 and rdx,0x3ffffff shr rbx,14 or r14,r11 shl rbp,24 and r14,0x3ffffff shr r12,40 and rbx,0x3ffffff or rbp,r12 test r15,r15 jz NEAR $L$store_base2_26_avx2 vmovd xmm0,eax vmovd xmm1,edx vmovd xmm2,r14d vmovd xmm3,ebx vmovd xmm4,ebp jmp NEAR $L$proceed_avx2 ALIGN 32 $L$store_base2_64_avx2: mov QWORD[rdi],r14 mov QWORD[8+rdi],rbx mov QWORD[16+rdi],rbp jmp NEAR $L$done_avx2 ALIGN 16 $L$store_base2_26_avx2: mov DWORD[rdi],eax mov DWORD[4+rdi],edx mov DWORD[8+rdi],r14d mov DWORD[12+rdi],ebx mov DWORD[16+rdi],ebp ALIGN 16 $L$done_avx2: mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rsp,[48+rsp] $L$no_data_avx2: $L$blocks_avx2_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret ALIGN 32 $L$base2_64_avx2: push rbx push rbp push r12 push r13 push r14 push r15 $L$base2_64_avx2_body: mov r15,rdx mov r11,QWORD[24+rdi] mov r13,QWORD[32+rdi] mov r14,QWORD[rdi] mov rbx,QWORD[8+rdi] mov ebp,DWORD[16+rdi] mov r12,r13 mov rax,r13 shr r13,2 add r13,r12 test rdx,63 jz NEAR $L$init_avx2 $L$base2_64_pre_avx2: add r14,QWORD[rsi] adc rbx,QWORD[8+rsi] lea rsi,[16+rsi] 
adc rbp,rcx sub r15,16 call __poly1305_block mov rax,r12 test r15,63 jnz NEAR $L$base2_64_pre_avx2 $L$init_avx2: mov rax,r14 mov rdx,r14 shr r14,52 mov r8,rbx mov r9,rbx shr rdx,26 and rax,0x3ffffff shl r8,12 and rdx,0x3ffffff shr rbx,14 or r14,r8 shl rbp,24 and r14,0x3ffffff shr r9,40 and rbx,0x3ffffff or rbp,r9 vmovd xmm0,eax vmovd xmm1,edx vmovd xmm2,r14d vmovd xmm3,ebx vmovd xmm4,ebp mov DWORD[20+rdi],1 call __poly1305_init_avx $L$proceed_avx2: mov rdx,r15 mov r15,QWORD[rsp] mov r14,QWORD[8+rsp] mov r13,QWORD[16+rsp] mov r12,QWORD[24+rsp] mov rbp,QWORD[32+rsp] mov rbx,QWORD[40+rsp] lea rax,[48+rsp] lea rsp,[48+rsp] $L$base2_64_avx2_epilogue: jmp NEAR $L$do_avx2 ALIGN 32 $L$even_avx2: vmovd xmm0,DWORD[rdi] vmovd xmm1,DWORD[4+rdi] vmovd xmm2,DWORD[8+rdi] vmovd xmm3,DWORD[12+rdi] vmovd xmm4,DWORD[16+rdi] $L$do_avx2: lea r11,[((-248))+rsp] sub rsp,0x1c8 vmovdqa XMMWORD[80+r11],xmm6 vmovdqa XMMWORD[96+r11],xmm7 vmovdqa XMMWORD[112+r11],xmm8 vmovdqa XMMWORD[128+r11],xmm9 vmovdqa XMMWORD[144+r11],xmm10 vmovdqa XMMWORD[160+r11],xmm11 vmovdqa XMMWORD[176+r11],xmm12 vmovdqa XMMWORD[192+r11],xmm13 vmovdqa XMMWORD[208+r11],xmm14 vmovdqa XMMWORD[224+r11],xmm15 $L$do_avx2_body: lea rdi,[((48+64))+rdi] lea rcx,[$L$const] vmovdqu xmm9,XMMWORD[((-64))+rdi] and rsp,-512 vmovdqu xmm10,XMMWORD[((-48))+rdi] vmovdqu xmm6,XMMWORD[((-32))+rdi] vmovdqu xmm11,XMMWORD[((-16))+rdi] vmovdqu xmm12,XMMWORD[rdi] vmovdqu xmm13,XMMWORD[16+rdi] vmovdqu xmm14,XMMWORD[32+rdi] vpermq ymm9,ymm9,0x15 vmovdqu xmm15,XMMWORD[48+rdi] vpermq ymm10,ymm10,0x15 vpshufd ymm9,ymm9,0xc8 vmovdqu xmm5,XMMWORD[64+rdi] vpermq ymm6,ymm6,0x15 vpshufd ymm10,ymm10,0xc8 vmovdqa YMMWORD[rsp],ymm9 vpermq ymm11,ymm11,0x15 vpshufd ymm6,ymm6,0xc8 vmovdqa YMMWORD[32+rsp],ymm10 vpermq ymm12,ymm12,0x15 vpshufd ymm11,ymm11,0xc8 vmovdqa YMMWORD[64+rsp],ymm6 vpermq ymm13,ymm13,0x15 vpshufd ymm12,ymm12,0xc8 vmovdqa YMMWORD[96+rsp],ymm11 vpermq ymm14,ymm14,0x15 vpshufd ymm13,ymm13,0xc8 vmovdqa YMMWORD[128+rsp],ymm12 vpermq 
ymm15,ymm15,0x15 vpshufd ymm14,ymm14,0xc8 vmovdqa YMMWORD[160+rsp],ymm13 vpermq ymm5,ymm5,0x15 vpshufd ymm15,ymm15,0xc8 vmovdqa YMMWORD[192+rsp],ymm14 vpshufd ymm5,ymm5,0xc8 vmovdqa YMMWORD[224+rsp],ymm15 vmovdqa YMMWORD[256+rsp],ymm5 vmovdqa ymm5,YMMWORD[64+rcx] vmovdqu xmm7,XMMWORD[rsi] vmovdqu xmm8,XMMWORD[16+rsi] vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 lea rsi,[64+rsi] vpsrldq ymm9,ymm7,6 vpsrldq ymm10,ymm8,6 vpunpckhqdq ymm6,ymm7,ymm8 vpunpcklqdq ymm9,ymm9,ymm10 vpunpcklqdq ymm7,ymm7,ymm8 vpsrlq ymm10,ymm9,30 vpsrlq ymm9,ymm9,4 vpsrlq ymm8,ymm7,26 vpsrlq ymm6,ymm6,40 vpand ymm9,ymm9,ymm5 vpand ymm7,ymm7,ymm5 vpand ymm8,ymm8,ymm5 vpand ymm10,ymm10,ymm5 vpor ymm6,ymm6,YMMWORD[32+rcx] lea rax,[144+rsp] vpaddq ymm2,ymm9,ymm2 sub rdx,64 jz NEAR $L$tail_avx2 jmp NEAR $L$oop_avx2 ALIGN 32 $L$oop_avx2: vpaddq ymm0,ymm7,ymm0 vmovdqa ymm7,YMMWORD[rsp] vpaddq ymm1,ymm8,ymm1 vmovdqa ymm8,YMMWORD[32+rsp] vpaddq ymm3,ymm10,ymm3 vmovdqa ymm9,YMMWORD[96+rsp] vpaddq ymm4,ymm6,ymm4 vmovdqa ymm10,YMMWORD[48+rax] vmovdqa ymm5,YMMWORD[112+rax] vpmuludq ymm13,ymm7,ymm2 vpmuludq ymm14,ymm8,ymm2 vpmuludq ymm15,ymm9,ymm2 vpmuludq ymm11,ymm10,ymm2 vpmuludq ymm12,ymm5,ymm2 vpmuludq ymm6,ymm8,ymm0 vpmuludq ymm2,ymm8,ymm1 vpaddq ymm12,ymm12,ymm6 vpaddq ymm13,ymm13,ymm2 vpmuludq ymm6,ymm8,ymm3 vpmuludq ymm2,ymm4,YMMWORD[64+rsp] vpaddq ymm15,ymm15,ymm6 vpaddq ymm11,ymm11,ymm2 vmovdqa ymm8,YMMWORD[((-16))+rax] vpmuludq ymm6,ymm7,ymm0 vpmuludq ymm2,ymm7,ymm1 vpaddq ymm11,ymm11,ymm6 vpaddq ymm12,ymm12,ymm2 vpmuludq ymm6,ymm7,ymm3 vpmuludq ymm2,ymm7,ymm4 vmovdqu xmm7,XMMWORD[rsi] vpaddq ymm14,ymm14,ymm6 vpaddq ymm15,ymm15,ymm2 vinserti128 ymm7,ymm7,XMMWORD[32+rsi],1 vpmuludq ymm6,ymm8,ymm3 vpmuludq ymm2,ymm8,ymm4 vmovdqu xmm8,XMMWORD[16+rsi] vpaddq ymm11,ymm11,ymm6 vpaddq ymm12,ymm12,ymm2 vmovdqa ymm2,YMMWORD[16+rax] vpmuludq ymm6,ymm9,ymm1 vpmuludq ymm9,ymm9,ymm0 vpaddq ymm14,ymm14,ymm6 vpaddq ymm13,ymm13,ymm9 vinserti128 ymm8,ymm8,XMMWORD[48+rsi],1 lea 
rsi,[64+rsi] vpmuludq ymm6,ymm2,ymm1 vpmuludq ymm2,ymm2,ymm0 vpsrldq ymm9,ymm7,6 vpaddq ymm15,ymm15,ymm6 vpaddq ymm14,ymm14,ymm2 vpmuludq ymm6,ymm10,ymm3 vpmuludq ymm2,ymm10,ymm4 vpsrldq ymm10,ymm8,6 vpaddq ymm12,ymm12,ymm6 vpaddq ymm13,ymm13,ymm2 vpunpckhqdq ymm6,ymm7,ymm8 vpmuludq ymm3,ymm5,ymm3 vpmuludq ymm4,ymm5,ymm4 vpunpcklqdq ymm7,ymm7,ymm8 vpaddq ymm2,ymm13,ymm3 vpaddq ymm3,ymm14,ymm4 vpunpcklqdq ymm10,ymm9,ymm10 vpmuludq ymm4,ymm0,YMMWORD[80+rax] vpmuludq ymm0,ymm5,ymm1 vmovdqa ymm5,YMMWORD[64+rcx] vpaddq ymm4,ymm15,ymm4 vpaddq ymm0,ymm11,ymm0 vpsrlq ymm14,ymm3,26 vpand ymm3,ymm3,ymm5 vpaddq ymm4,ymm4,ymm14 vpsrlq ymm11,ymm0,26 vpand ymm0,ymm0,ymm5 vpaddq ymm1,ymm12,ymm11 vpsrlq ymm15,ymm4,26 vpand ymm4,ymm4,ymm5 vpsrlq ymm9,ymm10,4 vpsrlq ymm12,ymm1,26 vpand ymm1,ymm1,ymm5 vpaddq ymm2,ymm2,ymm12 vpaddq ymm0,ymm0,ymm15 vpsllq ymm15,ymm15,2 vpaddq ymm0,ymm0,ymm15 vpand ymm9,ymm9,ymm5 vpsrlq ymm8,ymm7,26 vpsrlq ymm13,ymm2,26 vpand ymm2,ymm2,ymm5 vpaddq ymm3,ymm3,ymm13 vpaddq ymm2,ymm2,ymm9 vpsrlq ymm10,ymm10,30 vpsrlq ymm11,ymm0,26 vpand ymm0,ymm0,ymm5 vpaddq ymm1,ymm1,ymm11 vpsrlq ymm6,ymm6,40 vpsrlq ymm14,ymm3,26 vpand ymm3,ymm3,ymm5 vpaddq ymm4,ymm4,ymm14 vpand ymm7,ymm7,ymm5 vpand ymm8,ymm8,ymm5 vpand ymm10,ymm10,ymm5 vpor ymm6,ymm6,YMMWORD[32+rcx] sub rdx,64 jnz NEAR $L$oop_avx2 DB 0x66,0x90 $L$tail_avx2: vpaddq ymm0,ymm7,ymm0 vmovdqu ymm7,YMMWORD[4+rsp] vpaddq ymm1,ymm8,ymm1 vmovdqu ymm8,YMMWORD[36+rsp] vpaddq ymm3,ymm10,ymm3 vmovdqu ymm9,YMMWORD[100+rsp] vpaddq ymm4,ymm6,ymm4 vmovdqu ymm10,YMMWORD[52+rax] vmovdqu ymm5,YMMWORD[116+rax] vpmuludq ymm13,ymm7,ymm2 vpmuludq ymm14,ymm8,ymm2 vpmuludq ymm15,ymm9,ymm2 vpmuludq ymm11,ymm10,ymm2 vpmuludq ymm12,ymm5,ymm2 vpmuludq ymm6,ymm8,ymm0 vpmuludq ymm2,ymm8,ymm1 vpaddq ymm12,ymm12,ymm6 vpaddq ymm13,ymm13,ymm2 vpmuludq ymm6,ymm8,ymm3 vpmuludq ymm2,ymm4,YMMWORD[68+rsp] vpaddq ymm15,ymm15,ymm6 vpaddq ymm11,ymm11,ymm2 vpmuludq ymm6,ymm7,ymm0 vpmuludq ymm2,ymm7,ymm1 vpaddq ymm11,ymm11,ymm6 vmovdqu 
ymm8,YMMWORD[((-12))+rax] vpaddq ymm12,ymm12,ymm2 vpmuludq ymm6,ymm7,ymm3 vpmuludq ymm2,ymm7,ymm4 vpaddq ymm14,ymm14,ymm6 vpaddq ymm15,ymm15,ymm2 vpmuludq ymm6,ymm8,ymm3 vpmuludq ymm2,ymm8,ymm4 vpaddq ymm11,ymm11,ymm6 vpaddq ymm12,ymm12,ymm2 vmovdqu ymm2,YMMWORD[20+rax] vpmuludq ymm6,ymm9,ymm1 vpmuludq ymm9,ymm9,ymm0 vpaddq ymm14,ymm14,ymm6 vpaddq ymm13,ymm13,ymm9 vpmuludq ymm6,ymm2,ymm1 vpmuludq ymm2,ymm2,ymm0 vpaddq ymm15,ymm15,ymm6 vpaddq ymm14,ymm14,ymm2 vpmuludq ymm6,ymm10,ymm3 vpmuludq ymm2,ymm10,ymm4 vpaddq ymm12,ymm12,ymm6 vpaddq ymm13,ymm13,ymm2 vpmuludq ymm3,ymm5,ymm3 vpmuludq ymm4,ymm5,ymm4 vpaddq ymm2,ymm13,ymm3 vpaddq ymm3,ymm14,ymm4 vpmuludq ymm4,ymm0,YMMWORD[84+rax] vpmuludq ymm0,ymm5,ymm1 vmovdqa ymm5,YMMWORD[64+rcx] vpaddq ymm4,ymm15,ymm4 vpaddq ymm0,ymm11,ymm0 vpsrldq ymm8,ymm12,8 vpsrldq ymm9,ymm2,8 vpsrldq ymm10,ymm3,8 vpsrldq ymm6,ymm4,8 vpsrldq ymm7,ymm0,8 vpaddq ymm12,ymm12,ymm8 vpaddq ymm2,ymm2,ymm9 vpaddq ymm3,ymm3,ymm10 vpaddq ymm4,ymm4,ymm6 vpaddq ymm0,ymm0,ymm7 vpermq ymm10,ymm3,0x2 vpermq ymm6,ymm4,0x2 vpermq ymm7,ymm0,0x2 vpermq ymm8,ymm12,0x2 vpermq ymm9,ymm2,0x2 vpaddq ymm3,ymm3,ymm10 vpaddq ymm4,ymm4,ymm6 vpaddq ymm0,ymm0,ymm7 vpaddq ymm12,ymm12,ymm8 vpaddq ymm2,ymm2,ymm9 vpsrlq ymm14,ymm3,26 vpand ymm3,ymm3,ymm5 vpaddq ymm4,ymm4,ymm14 vpsrlq ymm11,ymm0,26 vpand ymm0,ymm0,ymm5 vpaddq ymm1,ymm12,ymm11 vpsrlq ymm15,ymm4,26 vpand ymm4,ymm4,ymm5 vpsrlq ymm12,ymm1,26 vpand ymm1,ymm1,ymm5 vpaddq ymm2,ymm2,ymm12 vpaddq ymm0,ymm0,ymm15 vpsllq ymm15,ymm15,2 vpaddq ymm0,ymm0,ymm15 vpsrlq ymm13,ymm2,26 vpand ymm2,ymm2,ymm5 vpaddq ymm3,ymm3,ymm13 vpsrlq ymm11,ymm0,26 vpand ymm0,ymm0,ymm5 vpaddq ymm1,ymm1,ymm11 vpsrlq ymm14,ymm3,26 vpand ymm3,ymm3,ymm5 vpaddq ymm4,ymm4,ymm14 vmovd DWORD[(-112)+rdi],xmm0 vmovd DWORD[(-108)+rdi],xmm1 vmovd DWORD[(-104)+rdi],xmm2 vmovd DWORD[(-100)+rdi],xmm3 vmovd DWORD[(-96)+rdi],xmm4 vmovdqa xmm6,XMMWORD[80+r11] vmovdqa xmm7,XMMWORD[96+r11] vmovdqa xmm8,XMMWORD[112+r11] vmovdqa xmm9,XMMWORD[128+r11] vmovdqa 
xmm10,XMMWORD[144+r11] vmovdqa xmm11,XMMWORD[160+r11] vmovdqa xmm12,XMMWORD[176+r11] vmovdqa xmm13,XMMWORD[192+r11] vmovdqa xmm14,XMMWORD[208+r11] vmovdqa xmm15,XMMWORD[224+r11] lea rsp,[248+r11] $L$do_avx2_epilogue: vzeroupper mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_poly1305_blocks_avx2: ALIGN 64 $L$const: $L$mask24: DD 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 $L$129: DD 16777216,0,16777216,0,16777216,0,16777216,0 $L$mask26: DD 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 $L$five: DD 5,0,5,0,5,0,5,0 DB 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 DB 108,46,111,114,103,62,0 ALIGN 16 EXTERN __imp_RtlVirtualUnwind ALIGN 16 se_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] mov r13,QWORD[((-32))+rax] mov r14,QWORD[((-40))+rax] mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 jmp NEAR $L$common_seh_tail ALIGN 16 avx_handler: push rsi push rdi push rbx push rbp push r12 push r13 push r14 push r15 pushfq sub rsp,64 mov rax,QWORD[120+r8] mov rbx,QWORD[248+r8] mov rsi,QWORD[8+r9] mov r11,QWORD[56+r9] mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 jb NEAR $L$common_seh_tail mov rax,QWORD[152+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov rax,QWORD[208+r8] lea rsi,[80+rax] lea rax,[248+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc 
;-----------------------------------------------------------------------
; $L$common_seh_tail -- shared tail of the SEH handlers.
; On entry: rax = unwound stack pointer of the faulting function,
; r8 = CONTEXT, r9 = DISPATCHER_CONTEXT.  Recovers the rdi/rsi the WIN64
; prologue parked in the caller's home slots, patches Rsp/Rsi/Rdi into
; the CONTEXT, then invokes RtlVirtualUnwind so the system unwinder can
; continue past this frame.  Returns 1 (ExceptionContinueSearch).
;-----------------------------------------------------------------------
$L$common_seh_tail:
        mov     rdi,QWORD[8+rax]        ; rdi saved by the WIN64 prologue home slot
        mov     rsi,QWORD[16+rax]       ; rsi likewise
        mov     QWORD[152+r8],rax       ; CONTEXT.Rsp
        mov     QWORD[168+r8],rsi       ; CONTEXT.Rsi
        mov     QWORD[176+r8],rdi       ; CONTEXT.Rdi

        mov     rdi,QWORD[40+r9]        ; DISPATCHER_CONTEXT.ContextRecord
        mov     rsi,r8                  ; copy our patched CONTEXT over it
        mov     ecx,154                 ; 154 qwords = 1232 bytes = sizeof(CONTEXT)
        DD 0xa548f3fc                   ; cld; rep movsq (encoded as data)

        mov     rsi,r9
        xor     rcx,rcx                 ; HandlerType = 0
        mov     rdx,QWORD[8+rsi]        ; DISPATCHER_CONTEXT.ImageBase
        mov     r8,QWORD[rsi]           ; .ControlPc
        mov     r9,QWORD[16+rsi]        ; .FunctionEntry
        mov     r10,QWORD[40+rsi]       ; .ContextRecord
        lea     r11,[56+rsi]            ; &.HandlerData
        lea     r12,[24+rsi]            ; &.EstablisherFrame
        mov     QWORD[32+rsp],r10       ; stack args 5..8 of RtlVirtualUnwind
        mov     QWORD[40+rsp],r11
        mov     QWORD[48+rsp],r12
        mov     QWORD[56+rsp],rcx       ; ContextPointers = NULL
        call    QWORD[__imp_RtlVirtualUnwind]

        mov     eax,1                   ; ExceptionContinueSearch
        add     rsp,64
        popfq
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        pop     rdi
        pop     rsi
        DB 0F3h,0C3h ;repret

; .pdata: RUNTIME_FUNCTION entries -- (begin RVA, end RVA, unwind-info RVA)
; triples.  The AVX/AVX2 entry points are split into several ranges so that
; each sub-region is matched with unwind info describing its frame shape.
section .pdata rdata align=4
ALIGN 4
        DD $L$SEH_begin_GFp_poly1305_init_asm wrt ..imagebase
        DD $L$SEH_end_GFp_poly1305_init_asm wrt ..imagebase
        DD $L$SEH_info_GFp_poly1305_init_asm wrt ..imagebase

        DD $L$SEH_begin_GFp_poly1305_blocks wrt ..imagebase
        DD $L$SEH_end_GFp_poly1305_blocks wrt ..imagebase
        DD $L$SEH_info_GFp_poly1305_blocks wrt ..imagebase

        DD $L$SEH_begin_GFp_poly1305_emit wrt ..imagebase
        DD $L$SEH_end_GFp_poly1305_emit wrt ..imagebase
        DD $L$SEH_info_GFp_poly1305_emit wrt ..imagebase

        DD $L$SEH_begin_poly1305_blocks_avx wrt ..imagebase
        DD $L$base2_64_avx wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx_1 wrt ..imagebase

        DD $L$base2_64_avx wrt ..imagebase
        DD $L$even_avx wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx_2 wrt ..imagebase

        DD $L$even_avx wrt ..imagebase
        DD $L$SEH_end_poly1305_blocks_avx wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx_3 wrt ..imagebase

        DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase
        DD $L$SEH_end_poly1305_emit_avx wrt ..imagebase
        DD $L$SEH_info_poly1305_emit_avx wrt ..imagebase

        DD $L$SEH_begin_poly1305_blocks_avx2 wrt ..imagebase
        DD $L$base2_64_avx2 wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx2_1 wrt ..imagebase

        DD $L$base2_64_avx2 wrt ..imagebase
        DD $L$even_avx2 wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx2_2 wrt ..imagebase

        DD $L$even_avx2 wrt ..imagebase
        DD $L$SEH_end_poly1305_blocks_avx2 wrt ..imagebase
        DD $L$SEH_info_poly1305_blocks_avx2_3 wrt ..imagebase
; .xdata: UNWIND_INFO records referenced from .pdata.
; Each record is  DB 9,0,0,0  = version 1 | UNW_FLAG_EHANDLER, zero
; prologue bytes, zero unwind codes; followed by the language-handler RVA
; and that handler's private data: two image-relative offsets bracketing
; the region between prologue end and epilogue start.  Where both offsets
; are the function's begin label the region is empty, so the handler
; always takes its "before prologue / after epilogue" fast path.
section .xdata rdata align=8
ALIGN 8
$L$SEH_info_GFp_poly1305_init_asm:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$SEH_begin_GFp_poly1305_init_asm wrt ..imagebase,$L$SEH_begin_GFp_poly1305_init_asm wrt ..imagebase
$L$SEH_info_GFp_poly1305_blocks:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$blocks_body wrt ..imagebase,$L$blocks_epilogue wrt ..imagebase
$L$SEH_info_GFp_poly1305_emit:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$SEH_begin_GFp_poly1305_emit wrt ..imagebase,$L$SEH_begin_GFp_poly1305_emit wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_1:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$blocks_avx_body wrt ..imagebase,$L$blocks_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_2:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$base2_64_avx_body wrt ..imagebase,$L$base2_64_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx_3:
        DB 9,0,0,0
        DD avx_handler wrt ..imagebase  ; xmm-spilling frame -> AVX-aware handler
        DD $L$do_avx_body wrt ..imagebase,$L$do_avx_epilogue wrt ..imagebase
$L$SEH_info_poly1305_emit_avx:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$SEH_begin_poly1305_emit_avx wrt ..imagebase,$L$SEH_begin_poly1305_emit_avx wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_1:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$blocks_avx2_body wrt ..imagebase,$L$blocks_avx2_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_2:
        DB 9,0,0,0
        DD se_handler wrt ..imagebase
        DD $L$base2_64_avx2_body wrt ..imagebase,$L$base2_64_avx2_epilogue wrt ..imagebase
$L$SEH_info_poly1305_blocks_avx2_3:
        DB 9,0,0,0
        DD avx_handler wrt ..imagebase  ; xmm-spilling frame -> AVX-aware handler
        DD $L$do_avx2_body wrt ..imagebase,$L$do_avx2_epilogue wrt ..imagebase