; This file is generated from a similarly-named Perl script in the BoringSSL ; source tree. Do not edit by hand. %ifdef BORINGSSL_PREFIX %include "boringssl_prefix_symbols_nasm.inc" %endif %ifidn __OUTPUT_FORMAT__,obj section code use32 class=code align=64 %elifidn __OUTPUT_FORMAT__,win32 %ifdef __YASM_VERSION_ID__ %if __YASM_VERSION_ID__ < 01010000h %error yasm version 1.1.0 or later needed. %endif ; Yasm automatically includes .00 and complains about redefining it. ; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html %else $@feat.00 equ 1 %endif section .text code align=64 %else section .text code %endif ;extern _GFp_ia32cap_P align 64 global _GFp_poly1305_init_asm align 16 _GFp_poly1305_init_asm: L$_GFp_poly1305_init_asm_begin: push ebp push ebx push esi push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ebp,DWORD [28+esp] xor eax,eax mov DWORD [edi],eax mov DWORD [4+edi],eax mov DWORD [8+edi],eax mov DWORD [12+edi],eax mov DWORD [16+edi],eax mov DWORD [20+edi],eax cmp esi,0 je NEAR L$000nokey call L$001pic_point L$001pic_point: pop ebx lea eax,[(_GFp_poly1305_blocks-L$001pic_point)+ebx] lea edx,[(_GFp_poly1305_emit-L$001pic_point)+ebx] lea edi,[_GFp_ia32cap_P] mov ecx,DWORD [edi] and ecx,83886080 cmp ecx,83886080 lea eax,[(__poly1305_blocks_sse2-L$001pic_point)+ebx] lea edx,[(__poly1305_emit_sse2-L$001pic_point)+ebx] mov edi,DWORD [20+esp] mov DWORD [ebp],eax mov DWORD [4+ebp],edx mov eax,DWORD [esi] mov ebx,DWORD [4+esi] mov ecx,DWORD [8+esi] mov edx,DWORD [12+esi] and eax,268435455 and ebx,268435452 and ecx,268435452 and edx,268435452 mov DWORD [24+edi],eax mov DWORD [28+edi],ebx mov DWORD [32+edi],ecx mov DWORD [36+edi],edx mov eax,1 L$000nokey: pop edi pop esi pop ebx pop ebp ret global _GFp_poly1305_blocks align 16 _GFp_poly1305_blocks: L$_GFp_poly1305_blocks_begin: push ebp push ebx push esi push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] L$enter_blocks: and ecx,-15 jz NEAR L$002nodata sub esp,64 mov eax,DWORD [24+edi] mov ebx,DWORD [28+edi] lea ebp,[ecx*1+esi] mov ecx,DWORD [32+edi] mov edx,DWORD [36+edi] mov DWORD [92+esp],ebp mov ebp,esi mov DWORD [36+esp],eax mov eax,ebx shr eax,2 mov DWORD [40+esp],ebx add eax,ebx mov ebx,ecx shr ebx,2 mov DWORD [44+esp],ecx add ebx,ecx mov ecx,edx shr ecx,2 mov DWORD [48+esp],edx add ecx,edx mov DWORD [52+esp],eax mov DWORD [56+esp],ebx mov DWORD [60+esp],ecx mov eax,DWORD [edi] mov ebx,DWORD [4+edi] mov ecx,DWORD [8+edi] mov esi,DWORD [12+edi] mov edi,DWORD [16+edi] jmp NEAR L$003loop align 32 L$003loop: add eax,DWORD [ebp] adc ebx,DWORD [4+ebp] adc ecx,DWORD [8+ebp] adc esi,DWORD [12+ebp] lea ebp,[16+ebp] adc edi,DWORD [96+esp] mov DWORD [esp],eax mov DWORD [12+esp],esi mul DWORD [36+esp] mov DWORD [16+esp],edi mov edi,eax mov eax,ebx mov esi,edx mul DWORD [60+esp] add edi,eax mov eax,ecx adc esi,edx mul DWORD [56+esp] add edi,eax mov eax,DWORD [12+esp] adc esi,edx mul DWORD [52+esp] add edi,eax mov eax,DWORD [esp] adc esi,edx mul DWORD [40+esp] mov DWORD [20+esp],edi xor edi,edi add esi,eax mov eax,ebx adc edi,edx mul DWORD [36+esp] add esi,eax mov eax,ecx adc edi,edx mul DWORD [60+esp] add esi,eax mov eax,DWORD [12+esp] adc edi,edx mul DWORD [56+esp] add esi,eax mov eax,DWORD [16+esp] adc edi,edx imul eax,DWORD [52+esp] add esi,eax mov eax,DWORD [esp] adc edi,0 mul DWORD [44+esp] mov DWORD [24+esp],esi xor esi,esi add edi,eax mov eax,ebx adc esi,edx mul DWORD [40+esp] add edi,eax mov eax,ecx adc esi,edx mul DWORD [36+esp] add edi,eax mov eax,DWORD [12+esp] adc esi,edx mul DWORD [60+esp] add edi,eax mov eax,DWORD [16+esp] adc esi,edx imul eax,DWORD [56+esp] add edi,eax mov eax,DWORD [esp] adc esi,0 mul DWORD [48+esp] mov DWORD [28+esp],edi xor edi,edi add esi,eax mov eax,ebx adc edi,edx mul DWORD [44+esp] add esi,eax mov eax,ecx adc edi,edx mul DWORD [40+esp] add esi,eax mov eax,DWORD [12+esp] adc edi,edx mul DWORD [36+esp] add esi,eax mov ecx,DWORD [16+esp] adc edi,edx mov edx,ecx imul ecx,DWORD [60+esp] add esi,ecx mov eax,DWORD [20+esp] adc edi,0 imul edx,DWORD [36+esp] add edx,edi mov ebx,DWORD [24+esp] mov ecx,DWORD [28+esp] mov edi,edx shr edx,2 and edi,3 lea edx,[edx*4+edx] add eax,edx adc ebx,0 adc ecx,0 adc esi,0 adc edi,0 cmp ebp,DWORD [92+esp] jne NEAR L$003loop mov edx,DWORD [84+esp] add esp,64 mov DWORD [edx],eax mov DWORD [4+edx],ebx mov DWORD [8+edx],ecx mov DWORD [12+edx],esi mov DWORD [16+edx],edi L$002nodata: pop edi pop esi pop ebx pop ebp ret global _GFp_poly1305_emit align 16 _GFp_poly1305_emit: L$_GFp_poly1305_emit_begin: push ebp push ebx push esi push edi mov ebp,DWORD [20+esp] L$enter_emit: mov edi,DWORD [24+esp] mov eax,DWORD [ebp] mov ebx,DWORD [4+ebp] mov ecx,DWORD [8+ebp] mov edx,DWORD [12+ebp] mov esi,DWORD [16+ebp] add eax,5 adc ebx,0 adc ecx,0 adc edx,0 adc esi,0 shr esi,2 neg esi and eax,esi and ebx,esi and ecx,esi and edx,esi mov DWORD [edi],eax mov DWORD [4+edi],ebx mov DWORD [8+edi],ecx mov DWORD [12+edi],edx not esi mov eax,DWORD [ebp] mov ebx,DWORD [4+ebp] mov ecx,DWORD [8+ebp] mov edx,DWORD [12+ebp] mov ebp,DWORD [28+esp] and eax,esi and ebx,esi and ecx,esi and edx,esi or eax,DWORD [edi] or ebx,DWORD [4+edi] or ecx,DWORD [8+edi] or edx,DWORD [12+edi] add eax,DWORD [ebp] adc ebx,DWORD [4+ebp] adc ecx,DWORD [8+ebp] adc edx,DWORD [12+ebp] mov DWORD [edi],eax mov DWORD [4+edi],ebx mov DWORD [8+edi],ecx mov DWORD [12+edi],edx pop edi pop esi pop ebx pop ebp ret align 32 align 16 __poly1305_init_sse2: movdqu xmm4,[24+edi] lea edi,[48+edi] mov ebp,esp sub esp,224 and esp,-16 movq xmm7,[64+ebx] movdqa xmm0,xmm4 movdqa xmm1,xmm4 movdqa xmm2,xmm4 pand xmm0,xmm7 psrlq xmm1,26 psrldq xmm2,6 pand xmm1,xmm7 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 pand xmm2,xmm7 pand xmm3,xmm7 psrldq xmm4,13 lea edx,[144+esp] mov ecx,2 L$004square: movdqa [esp],xmm0 movdqa [16+esp],xmm1 movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 movdqa xmm6,xmm1 movdqa xmm5,xmm2 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm1 paddd xmm5,xmm2 movdqa [80+esp],xmm6 movdqa [96+esp],xmm5 movdqa xmm6,xmm3 movdqa xmm5,xmm4 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm3 paddd xmm5,xmm4 movdqa [112+esp],xmm6 movdqa [128+esp],xmm5 pshufd xmm6,xmm0,68 movdqa xmm5,xmm1 pshufd xmm1,xmm1,68 pshufd xmm2,xmm2,68 pshufd xmm3,xmm3,68 pshufd xmm4,xmm4,68 movdqa [edx],xmm6 movdqa [16+edx],xmm1 movdqa [32+edx],xmm2 movdqa [48+edx],xmm3 movdqa [64+edx],xmm4 pmuludq xmm4,xmm0 pmuludq xmm3,xmm0 pmuludq xmm2,xmm0 pmuludq xmm1,xmm0 pmuludq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+edx] movdqa xmm7,xmm6 pmuludq xmm6,[32+edx] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+edx] paddq xmm3,xmm6 movdqa xmm6,[80+esp] pmuludq xmm5,[edx] paddq xmm2,xmm7 pmuludq xmm6,[64+edx] movdqa xmm7,[32+esp] paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+edx] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+edx] paddq xmm4,xmm7 movdqa xmm7,[96+esp] pmuludq xmm6,[edx] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+edx] paddq xmm2,xmm6 pmuludq xmm5,[48+edx] movdqa xmm6,[48+esp] paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+edx] paddq xmm0,xmm5 movdqa xmm5,[112+esp] pmuludq xmm7,[edx] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+edx] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+edx] paddq xmm2,xmm5 pmuludq xmm7,[32+edx] movdqa xmm5,[64+esp] paddq xmm1,xmm6 movdqa xmm6,[128+esp] pmuludq xmm5,[edx] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+edx] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+edx] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+edx] paddq xmm0,xmm7 pmuludq xmm6,[48+edx] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 dec ecx jz NEAR L$005square_break punpcklqdq xmm0,[esp] punpcklqdq xmm1,[16+esp] punpcklqdq xmm2,[32+esp] punpcklqdq xmm3,[48+esp] punpcklqdq xmm4,[64+esp] jmp NEAR L$004square L$005square_break: psllq xmm0,32 psllq xmm1,32 psllq xmm2,32 psllq xmm3,32 psllq xmm4,32 por xmm0,[esp] por xmm1,[16+esp] por xmm2,[32+esp] por xmm3,[48+esp] por xmm4,[64+esp] pshufd xmm0,xmm0,141 pshufd xmm1,xmm1,141 pshufd xmm2,xmm2,141 pshufd xmm3,xmm3,141 pshufd xmm4,xmm4,141 movdqu [edi],xmm0 movdqu [16+edi],xmm1 movdqu [32+edi],xmm2 movdqu [48+edi],xmm3 movdqu [64+edi],xmm4 movdqa xmm6,xmm1 movdqa xmm5,xmm2 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm1 paddd xmm5,xmm2 movdqu [80+edi],xmm6 movdqu [96+edi],xmm5 movdqa xmm6,xmm3 movdqa xmm5,xmm4 pslld xmm6,2 pslld xmm5,2 paddd xmm6,xmm3 paddd xmm5,xmm4 movdqu [112+edi],xmm6 movdqu [128+edi],xmm5 mov esp,ebp lea edi,[edi-48] ret align 32 align 16 __poly1305_blocks_sse2: push ebp push ebx push esi push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] mov eax,DWORD [20+edi] and ecx,-16 jz NEAR L$006nodata cmp ecx,64 jae NEAR L$007enter_sse2 test eax,eax jz NEAR L$enter_blocks align 16 L$007enter_sse2: call L$008pic_point L$008pic_point: pop ebx lea ebx,[(L$const_sse2-L$008pic_point)+ebx] test eax,eax jnz NEAR L$009base2_26 call __poly1305_init_sse2 mov eax,DWORD [edi] mov ecx,DWORD [3+edi] mov edx,DWORD [6+edi] mov esi,DWORD [9+edi] mov ebp,DWORD [13+edi] mov DWORD [20+edi],1 shr ecx,2 and eax,67108863 shr edx,4 and ecx,67108863 shr esi,6 and edx,67108863 movd xmm0,eax movd xmm1,ecx movd xmm2,edx movd xmm3,esi movd xmm4,ebp mov esi,DWORD [24+esp] mov ecx,DWORD [28+esp] jmp NEAR L$010base2_32 align 16 L$009base2_26: movd xmm0,DWORD [edi] movd xmm1,DWORD [4+edi] movd xmm2,DWORD [8+edi] movd xmm3,DWORD [12+edi] movd xmm4,DWORD [16+edi] movdqa xmm7,[64+ebx] L$010base2_32: mov eax,DWORD [32+esp] mov ebp,esp sub esp,528 and esp,-16 lea edi,[48+edi] shl eax,24 test ecx,31 jz NEAR L$011even movdqu xmm6,[esi] lea esi,[16+esi] movdqa xmm5,xmm6 pand xmm6,xmm7 paddd xmm0,xmm6 movdqa xmm6,xmm5 psrlq xmm5,26 psrldq xmm6,6 pand xmm5,xmm7 paddd xmm1,xmm5 movdqa xmm5,xmm6 psrlq xmm6,4 pand xmm6,xmm7 paddd xmm2,xmm6 movdqa xmm6,xmm5 psrlq xmm5,30 pand xmm5,xmm7 psrldq xmm6,7 paddd xmm3,xmm5 movd xmm5,eax paddd xmm4,xmm6 movd xmm6,DWORD [12+edi] paddd xmm4,xmm5 movdqa [esp],xmm0 movdqa [16+esp],xmm1 movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 pmuludq xmm0,xmm6 pmuludq xmm1,xmm6 pmuludq xmm2,xmm6 movd xmm5,DWORD [28+edi] pmuludq xmm3,xmm6 pmuludq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+esp] movdqa xmm7,xmm6 pmuludq xmm6,[32+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movd xmm6,DWORD [92+edi] pmuludq xmm5,[esp] paddq xmm2,xmm7 pmuludq xmm6,[64+esp] movd xmm7,DWORD [44+edi] paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+esp] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+esp] paddq xmm4,xmm7 movd xmm7,DWORD [108+edi] pmuludq xmm6,[esp] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+esp] paddq xmm2,xmm6 pmuludq xmm5,[48+esp] movd xmm6,DWORD [60+edi] paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+esp] paddq xmm0,xmm5 movd xmm5,DWORD [124+edi] pmuludq xmm7,[esp] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+esp] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+esp] paddq xmm2,xmm5 pmuludq xmm7,[32+esp] movd xmm5,DWORD [76+edi] paddq xmm1,xmm6 movd xmm6,DWORD [140+edi] pmuludq xmm5,[esp] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+esp] paddq xmm0,xmm7 pmuludq xmm6,[48+esp] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 sub ecx,16 jz NEAR L$012done L$011even: lea edx,[384+esp] lea eax,[esi-32] sub ecx,64 movdqu xmm5,[edi] pshufd xmm6,xmm5,68 cmovb esi,eax pshufd xmm5,xmm5,238 movdqa [edx],xmm6 lea eax,[160+esp] movdqu xmm6,[16+edi] movdqa [edx-144],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [16+edx],xmm5 movdqu xmm5,[32+edi] movdqa [edx-128],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [32+edx],xmm6 movdqu xmm6,[48+edi] movdqa [edx-112],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [48+edx],xmm5 movdqu xmm5,[64+edi] movdqa [edx-96],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [64+edx],xmm6 movdqu xmm6,[80+edi] movdqa [edx-80],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [80+edx],xmm5 movdqu xmm5,[96+edi] movdqa [edx-64],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [96+edx],xmm6 movdqu xmm6,[112+edi] movdqa [edx-48],xmm5 pshufd xmm5,xmm6,68 pshufd xmm6,xmm6,238 movdqa [112+edx],xmm5 movdqu xmm5,[128+edi] movdqa [edx-32],xmm6 pshufd xmm6,xmm5,68 pshufd xmm5,xmm5,238 movdqa [128+edx],xmm6 movdqa [edx-16],xmm5 movdqu xmm5,[32+esi] movdqu xmm6,[48+esi] lea esi,[32+esi] movdqa [112+esp],xmm2 movdqa [128+esp],xmm3 movdqa [144+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] movdqa [80+esp],xmm0 movdqa [96+esp],xmm1 jbe NEAR L$013skip_loop jmp NEAR L$014loop align 32 L$014loop: movdqa xmm7,[edx-144] movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 movdqa xmm1,xmm5 pmuludq xmm5,xmm7 movdqa xmm0,xmm6 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 pmuludq xmm3,xmm7 pmuludq xmm4,xmm7 pmuludq xmm0,[edx-16] movdqa xmm7,xmm1 pmuludq xmm1,[edx-128] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-112] paddq xmm1,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-96] paddq xmm2,xmm7 movdqa xmm7,[16+eax] pmuludq xmm6,[edx-80] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-128] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-112] paddq xmm2,xmm7 movdqa xmm7,[32+eax] pmuludq xmm6,[edx-96] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-32] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-16] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[edx-128] paddq xmm1,xmm5 movdqa xmm5,[48+eax] pmuludq xmm7,[edx-112] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-48] paddq xmm4,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[edx-32] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-16] paddq xmm1,xmm6 movdqa xmm6,[64+eax] pmuludq xmm5,[edx-128] paddq xmm2,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[edx-16] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[edx-64] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[edx-48] paddq xmm0,xmm7 movdqa xmm7,[64+ebx] pmuludq xmm6,[edx-32] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqu xmm5,[esi-32] movdqu xmm6,[esi-16] lea esi,[32+esi] movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] lea eax,[esi-32] sub ecx,64 paddd xmm5,[80+esp] paddd xmm6,[96+esp] paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] cmovb esi,eax lea eax,[160+esp] movdqa xmm7,[edx] movdqa [16+esp],xmm1 movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 movdqa xmm1,xmm5 pmuludq xmm5,xmm7 paddq xmm5,xmm0 movdqa xmm0,xmm6 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 pmuludq xmm3,xmm7 pmuludq xmm4,xmm7 paddq xmm6,[16+esp] paddq xmm2,[32+esp] paddq xmm3,[48+esp] paddq xmm4,[64+esp] pmuludq xmm0,[128+edx] movdqa xmm7,xmm1 pmuludq xmm1,[16+edx] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+edx] paddq xmm1,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+edx] paddq xmm2,xmm7 movdqa xmm7,[16+eax] pmuludq xmm6,[64+edx] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+edx] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+edx] paddq xmm2,xmm7 movdqa xmm7,[32+eax] pmuludq xmm6,[48+edx] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[112+edx] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[128+edx] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+edx] paddq xmm1,xmm5 movdqa xmm5,[48+eax] pmuludq xmm7,[32+edx] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[96+edx] paddq xmm4,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[112+edx] paddq xmm0,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[128+edx] paddq xmm1,xmm6 movdqa xmm6,[64+eax] pmuludq xmm5,[16+edx] paddq xmm2,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[128+edx] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[80+edx] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[96+edx] paddq xmm0,xmm7 movdqa xmm7,[64+ebx] pmuludq xmm6,[112+edx] paddq xmm1,xmm5 paddq xmm2,xmm6 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 movdqu xmm5,[32+esi] movdqu xmm6,[48+esi] lea esi,[32+esi] movdqa [112+esp],xmm2 movdqa [128+esp],xmm3 movdqa [144+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] movdqa [80+esp],xmm0 movdqa [96+esp],xmm1 ja NEAR L$014loop L$013skip_loop: pshufd xmm7,[edx-144],16 add ecx,32 jnz NEAR L$015long_tail paddd xmm5,xmm0 paddd xmm6,xmm1 paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] L$015long_tail: movdqa [eax],xmm5 movdqa [16+eax],xmm6 movdqa [32+eax],xmm2 movdqa [48+eax],xmm3 movdqa [64+eax],xmm4 pmuludq xmm5,xmm7 pmuludq xmm6,xmm7 pmuludq xmm2,xmm7 movdqa xmm0,xmm5 pshufd xmm5,[edx-128],16 pmuludq xmm3,xmm7 movdqa xmm1,xmm6 pmuludq xmm4,xmm7 movdqa xmm6,xmm5 pmuludq xmm5,[48+eax] movdqa xmm7,xmm6 pmuludq xmm6,[32+eax] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+eax] paddq xmm3,xmm6 pshufd xmm6,[edx-64],16 pmuludq xmm5,[eax] paddq xmm2,xmm7 pmuludq xmm6,[64+eax] pshufd xmm7,[edx-112],16 paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+eax] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+eax] paddq xmm4,xmm7 pshufd xmm7,[edx-48],16 pmuludq xmm6,[eax] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+eax] paddq xmm2,xmm6 pmuludq xmm5,[48+eax] pshufd xmm6,[edx-96],16 paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+eax] paddq xmm0,xmm5 pshufd xmm5,[edx-32],16 pmuludq xmm7,[eax] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+eax] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+eax] paddq xmm2,xmm5 pmuludq xmm7,[32+eax] pshufd xmm5,[edx-80],16 paddq xmm1,xmm6 pshufd xmm6,[edx-16],16 pmuludq xmm5,[eax] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+eax] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+eax] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+eax] paddq xmm0,xmm7 pmuludq xmm6,[48+eax] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 jz NEAR L$016short_tail movdqu xmm5,[esi-32] movdqu xmm6,[esi-16] lea esi,[32+esi] movdqa [32+esp],xmm2 movdqa [48+esp],xmm3 movdqa [64+esp],xmm4 movdqa xmm2,xmm5 movdqa xmm3,xmm6 psrldq xmm2,6 psrldq xmm3,6 movdqa xmm4,xmm5 punpcklqdq xmm2,xmm3 punpckhqdq xmm4,xmm6 punpcklqdq xmm5,xmm6 movdqa xmm3,xmm2 psrlq xmm2,4 psrlq xmm3,30 movdqa xmm6,xmm5 psrlq xmm4,40 psrlq xmm6,26 pand xmm5,xmm7 pand xmm6,xmm7 pand xmm2,xmm7 pand xmm3,xmm7 por xmm4,[ebx] pshufd xmm7,[edx],16 paddd xmm5,[80+esp] paddd xmm6,[96+esp] paddd xmm2,[112+esp] paddd xmm3,[128+esp] paddd xmm4,[144+esp] movdqa [esp],xmm5 pmuludq xmm5,xmm7 movdqa [16+esp],xmm6 pmuludq xmm6,xmm7 paddq xmm0,xmm5 movdqa xmm5,xmm2 pmuludq xmm2,xmm7 paddq xmm1,xmm6 movdqa xmm6,xmm3 pmuludq xmm3,xmm7 paddq xmm2,[32+esp] movdqa [32+esp],xmm5 pshufd xmm5,[16+edx],16 paddq xmm3,[48+esp] movdqa [48+esp],xmm6 movdqa xmm6,xmm4 pmuludq xmm4,xmm7 paddq xmm4,[64+esp] movdqa [64+esp],xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[48+esp] movdqa xmm7,xmm6 pmuludq xmm6,[32+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 pshufd xmm6,[80+edx],16 pmuludq xmm5,[esp] paddq xmm2,xmm7 pmuludq xmm6,[64+esp] pshufd xmm7,[32+edx],16 paddq xmm1,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[32+esp] paddq xmm0,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[16+esp] paddq xmm4,xmm7 pshufd xmm7,[96+edx],16 pmuludq xmm6,[esp] paddq xmm3,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[64+esp] paddq xmm2,xmm6 pmuludq xmm5,[48+esp] pshufd xmm6,[48+edx],16 paddq xmm1,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[16+esp] paddq xmm0,xmm5 pshufd xmm5,[112+edx],16 pmuludq xmm7,[esp] paddq xmm4,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[64+esp] paddq xmm3,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[48+esp] paddq xmm2,xmm5 pmuludq xmm7,[32+esp] pshufd xmm5,[64+edx],16 paddq xmm1,xmm6 pshufd xmm6,[128+edx],16 pmuludq xmm5,[esp] paddq xmm0,xmm7 movdqa xmm7,xmm6 pmuludq xmm6,[64+esp] paddq xmm4,xmm5 movdqa xmm5,xmm7 pmuludq xmm7,[16+esp] paddq xmm3,xmm6 movdqa xmm6,xmm5 pmuludq xmm5,[32+esp] paddq xmm0,xmm7 pmuludq xmm6,[48+esp] movdqa xmm7,[64+ebx] paddq xmm1,xmm5 paddq xmm2,xmm6 L$016short_tail: pshufd xmm6,xmm4,78 pshufd xmm5,xmm3,78 paddq xmm4,xmm6 paddq xmm3,xmm5 pshufd xmm6,xmm0,78 pshufd xmm5,xmm1,78 paddq xmm0,xmm6 paddq xmm1,xmm5 pshufd xmm6,xmm2,78 movdqa xmm5,xmm3 pand xmm3,xmm7 psrlq xmm5,26 paddq xmm2,xmm6 paddq xmm5,xmm4 movdqa xmm6,xmm0 pand xmm0,xmm7 psrlq xmm6,26 movdqa xmm4,xmm5 paddq xmm6,xmm1 psrlq xmm5,26 pand xmm4,xmm7 movdqa xmm1,xmm6 psrlq xmm6,26 paddd xmm0,xmm5 psllq xmm5,2 paddq xmm6,xmm2 paddq xmm5,xmm0 pand xmm1,xmm7 movdqa xmm2,xmm6 psrlq xmm6,26 pand xmm2,xmm7 paddd xmm6,xmm3 movdqa xmm0,xmm5 psrlq xmm5,26 movdqa xmm3,xmm6 psrlq xmm6,26 pand xmm0,xmm7 paddd xmm1,xmm5 pand xmm3,xmm7 paddd xmm4,xmm6 L$012done: movd DWORD [edi-48],xmm0 movd DWORD [edi-44],xmm1 movd DWORD [edi-40],xmm2 movd DWORD [edi-36],xmm3 movd DWORD [edi-32],xmm4 mov esp,ebp L$006nodata: pop edi pop esi pop ebx pop ebp ret align 32 align 16 __poly1305_emit_sse2: push ebp push ebx push esi push edi mov ebp,DWORD [20+esp] cmp DWORD [20+ebp],0 je NEAR L$enter_emit mov eax,DWORD [ebp] mov edi,DWORD [4+ebp] mov ecx,DWORD [8+ebp] mov edx,DWORD [12+ebp] mov esi,DWORD [16+ebp] mov ebx,edi shl edi,26 shr ebx,6 add eax,edi mov edi,ecx adc ebx,0 shl edi,20 shr ecx,12 add ebx,edi mov edi,edx adc ecx,0 shl edi,14 shr edx,18 add ecx,edi mov edi,esi adc edx,0 shl edi,8 shr esi,24 add edx,edi adc esi,0 mov edi,esi and esi,3 shr edi,2 lea ebp,[edi*4+edi] mov edi,DWORD [24+esp] add eax,ebp mov ebp,DWORD [28+esp] adc ebx,0 adc ecx,0 adc edx,0 adc esi,0 movd xmm0,eax add eax,5 movd xmm1,ebx adc ebx,0 movd xmm2,ecx adc ecx,0 movd xmm3,edx adc edx,0 adc esi,0 shr esi,2 neg esi and eax,esi and ebx,esi and ecx,esi and edx,esi mov DWORD [edi],eax movd eax,xmm0 mov DWORD [4+edi],ebx movd ebx,xmm1 mov DWORD [8+edi],ecx movd ecx,xmm2 mov DWORD [12+edi],edx movd edx,xmm3 not esi and eax,esi and ebx,esi or eax,DWORD [edi] and ecx,esi or ebx,DWORD [4+edi] and edx,esi or ecx,DWORD [8+edi] or edx,DWORD [12+edi] add eax,DWORD [ebp] adc ebx,DWORD [4+ebp] mov DWORD [edi],eax adc ecx,DWORD [8+ebp] mov DWORD [4+edi],ebx adc edx,DWORD [12+ebp] mov DWORD [8+edi],ecx mov DWORD [12+edi],edx pop edi pop esi pop ebx pop ebp ret align 64 L$const_sse2: dd 16777216,0,16777216,0,16777216,0,16777216,0 dd 0,0,0,0,0,0,0,0 dd 67108863,0,67108863,0,67108863,0,67108863,0 dd 268435455,268435452,268435452,268435452 db 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 db 114,103,62,0 align 4 segment .bss common _GFp_ia32cap_P 16