#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL project. The module is,
# however, dual licensed under OpenSSL and CRYPTOGAMS licenses
# depending on where you obtain it. For further details see
# http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encryption exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use of
# processor resources and achieves better performance. SHA256 instruction
# sequences(*) are taken from sha512-x86_64.pl and AESNI code is woven
# into them. As SHA256 dominates execution time, stitch performance does
# not depend on AES key length. Below are performance numbers in cycles
# per processed byte (less is better) for standalone AESNI-CBC encrypt,
# standalone SHA256, and the stitched subroutine:
#
#                AES-128/-192/-256+SHA256   this(**)      gain
# Sandy Bridge   5.05/6.05/7.05+11.6        13.0          +28%/36%/43%
# Ivy Bridge     5.05/6.05/7.05+10.3        11.6          +32%/41%/50%
# Haswell        4.43/5.29/6.19+7.80        8.79          +39%/49%/59%
# Skylake        2.62/3.14/3.62+7.70        8.10          +27%/34%/40%
# Bulldozer      5.77/6.89/8.00+13.7        13.7          +42%/50%/58%
# Ryzen(***)     2.71/-/3.71+2.05           2.74/-/3.73   +74%/-/54%
# Goldmont(***)  3.82/-/5.35+4.16           4.73/-/5.94   +69%/-/60%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted from
#	the loop because the estimated gain was not high enough to justify
#	the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~
	   /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
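
# For reference, the rotation/shift constants above encode the SHA-256 round
# and message-schedule functions from FIPS 180-4; a minimal sketch, kept as
# comments and not used by the generator itself:
#
#	Sigma0(x) = ROTR(x, 2) ^ ROTR(x,13) ^ ROTR(x,22)
#	Sigma1(x) = ROTR(x, 6) ^ ROTR(x,11) ^ ROTR(x,25)
#	sigma0(x) = ROTR(x, 7) ^ ROTR(x,18) ^ (x >>  3)
#	sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)
#
#	sub ROTR   { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
#	sub Sigma1 { my ($x)=@_;    ROTR($x,6)^ROTR($x,11)^ROTR($x,25)    }
#
# body_00_15() below realizes Sigma1(e) with three ror instructions by
# chaining the differences 25-11, 11-6 and 6, which is why the constants are
# consumed as $Sigma1[2]-$Sigma1[1], $Sigma1[1]-$Sigma1[0] and $Sigma1[0].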
$rounds=64; ######################################################################## # void aesni_cbc_sha256_enc(const void *inp, # void *out, # size_t length, # const AES_KEY *key, # unsigned char *iv, # SHA256_CTX *ctx, # const void *in0); ($inp, $out, $len, $key, $ivp, $ctx, $in0) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $Tbl="%rbp"; $_inp="16*$SZ+0*8(%rsp)"; $_out="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_key="16*$SZ+3*8(%rsp)"; $_ivp="16*$SZ+4*8(%rsp)"; $_ctx="16*$SZ+5*8(%rsp)"; $_in0="16*$SZ+6*8(%rsp)"; $_rsp="`16*$SZ+7*8`(%rsp)"; $framesz=16*$SZ+8*8; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl $func .type $func,\@abi-omnipotent .align 16 $func: .cfi_startproc ___ if ($avx) { $code.=<<___; lea OPENSSL_ia32cap_P(%rip),%r11 mov \$1,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe mov 0(%r11),%eax mov 4(%r11),%r10 ___ $code.=<<___ if ($shaext); bt \$61,%r10 # check for SHA jc ${func}_shaext ___ $code.=<<___; mov %r10,%r11 shr \$32,%r11 test \$`1<<11`,%r10d # check for XOP jnz ${func}_xop ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 cmp \$`1<<8|1<<5|1<<3`,%r11d je ${func}_avx2 ___ $code.=<<___; and \$`1<<28`,%r10d # check for AVX jnz ${func}_avx ud2 ___ } $code.=<<___; xor %eax,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe ud2 .Lprobe: ret .cfi_endproc .size $func,.-$func .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 .long 0,0,0,0, 0,0,0,0 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by " .align 64 ___ ###################################################################### # SIMD code paths # {{{ ($iv,$inout,$roundkey,$temp, $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); $aesni_cbc_idx=0; @aesni_cbc_block = ( ## &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &vmovdqu ($inout,($inp)); ## &mov ($_inp,$inp); 
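## Each quoted element of this list is one slice of the AES-CBC round
## sequence (a round-key load plus aesenc/aesenclast, with the
## $mask10/$mask12/$mask14 constants selecting the aesenclast result that
## matches the actual number of AES rounds). body_00_15() consumes one
## slice per SHA-256 round through $aesni_cbc_idx, which is how the AES
## and SHA-256 instruction streams end up interleaved.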
'&vpxor ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', '&vpxor ($inout,$inout,$iv);', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', '&vpand ($iv,$temp,$mask10);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', '&vpand ($temp,$temp,$mask12);'. ' &vaesenc ($inout,$inout,$roundkey);'. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', '&vpor ($iv,$iv,$temp);'. ' &vaesenclast ($temp,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &mov ($inp,$_inp); ## &mov ($out,$_out); ## &vpand ($temp,$temp,$mask14); ## &vpor ($iv,$iv,$temp); ## &vmovdqu ($iv,($out,$inp); ## &lea (inp,16($inp)); ); my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&xor ($a0,$e)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&xor ($a2,$b)', # a^b, b^c in next round '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&add ($h,$a4)', # h+=Ch(e,f,g) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&add ($d,$h)', # d+=h '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. 
# h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } if ($avx) {{ ###################################################################### # XOP code path # $code.=<<___; .type ${func}_xop,\@function,6 .align 64 ${func}_xop: .cfi_startproc .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_xop: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey jmp .Lloop_xop ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub XOP_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); 
&vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[3],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq ($t3,$t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq ($t3,$t3,8); # 22 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_xop mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps `$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_xop: ret .cfi_endproc .size 
${func}_xop,.-${func}_xop ___ ###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: .cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) '&vpshufd ($t2,$t2,0b10000100)', '&vpsrldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += 
sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufd ($t2,$t2,0b11101000)', '&vpslldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_avx mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps `$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; sub bodyx_00_15 () { # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] '&and ($a4,$e)', # f&e '&rorx ($a0,$e,$Sigma1[2])', '&rorx ($a2,$e,$Sigma1[1])', '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past '&lea ($h,"($h,$a4)")', '&andn ($a4,$e,$g)', # ~e&g '&xor ($a0,$a2)', '&rorx ($a1,$e,$Sigma1[0])', '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) '&xor ($a0,$a1)', # Sigma1(e) '&mov ($a2,$a)', '&rorx ($a4,$a,$Sigma0[2])', '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) '&xor ($a2,$b)', # a^b, b^c in next round '&rorx ($a1,$a,$Sigma0[1])', '&rorx ($a0,$a,$Sigma0[0])', '&lea ($d,"($d,$h)")', # d+=h '&and ($a3,$a2)', # (b^c)&(a^b) @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a1,$a4)', '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&xor ($a1,$a0)', # Sigma0(a) '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) '&mov ($a4,$e)', # copy of f in future '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); # and at the finish one has to $a+=$a1 } $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: .cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later #mov $out,$_out # kept in $offload mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx2: vzeroall mov $inp,%r13 # borrow $a0 vpinsrq \$1,$out,$offload,$offload lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV lea -9(%r14),%r14 vmovdqa 0x00(%r12,%r14,8),$mask14 vmovdqa 0x10(%r12,%r14,8),$mask12 vmovdqa 0x20(%r12,%r14,8),$mask10 sub \$-16*$SZ,%r13 # inp++, size optimization mov $SZ*0(%r15),$A lea (%rsi,%r13),%r12 # borrow $a0 mov $SZ*1(%r15),$B cmp $len,%r13 # $_end mov $SZ*2(%r15),$C cmove %rsp,%r12 # next block or random data mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%ymm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); $code.=<<___; jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t3,@X[0],@X[0] vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t3,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[2],@X[2] lea -16*$SZ(%r13),%r13 vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] 
vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rsi as frame pointer mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... # so a second frame pointer is saved at -8(%rsp) # that is in the red zone mov %rsi,-8(%rsp) .cfi_cfa_expression %rsp-8,deref,+8 ___ $code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x20(%rsp) mov $F,$a4 sub \$-16*2*$SZ,$Tbl # size optimization jmp .Lavx2_00_47 .align 16 .Lavx2_00_47: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ sub AVX2_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; if (($j%2)==0) { &lea ("%rsp","-$PUSH8(%rsp)"); $code.=<<___ if (!$win64); .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 # copy secondary frame pointer to new location again at -8(%rsp) pushq $PUSH8-8(%rsp) .cfi_cfa_expression %rsp,deref,+8 lea 8(%rsp),%rsp .cfi_cfa_expression %rsp-8,deref,+8 ___ } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX2_256_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &vmovq ("%r13",$offload); # borrow $a0 &vpextrq ("%r15",$offload,1); # borrow $a2 &vpand ($temp,$temp,$mask14); &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r13)",$iv); # write output &lea ("%r13","16(%r13)"); # inp++ &lea ($Tbl,16*2*$SZ."($Tbl)"); &cmpb (($SZ-1)."($Tbl)",0); &jne (".Lavx2_00_47"); &vmovdqu ($inout,"(%r13)"); &vpinsrq ($offload,$offload,"%r13",0); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; foreach(bodyx_00_15()) { eval; } } } $code.=<<___; vpextrq \$1,$offload,%r12 # $_out, borrow $a4 vmovq $offload,%r13 # $_inp, borrow $a0 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),$Tbl vpand $mask14,$temp,$temp vpor $temp,$iv,$iv vmovdqu $iv,(%r12,%r13) # write output lea 16(%r13),%r13 add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) cmp `$PUSH8+2*8`($Tbl),%r13 # $_end je .Ldone_avx2 xor $a1,$a1 mov $B,$a3 mov $F,$a4 xor $C,$a3 # magic jmp .Lower_avx2 .align 16 .Lower_avx2: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base="+16($Tbl)"; foreach(bodyx_00_15()) { eval; } &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); } $code.=<<___; vmovq $offload,%r13 # borrow $a0 vpextrq \$1,$offload,%r15 # borrow $a2 vpand $mask14,$temp,$temp vpor $temp,$iv,$iv lea -$PUSH8($Tbl),$Tbl vmovdqu $iv,(%r15,%r13) # write output lea 16(%r13),%r13 # inp++ cmp %rsp,$Tbl jae .Lower_avx2 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 lea 16*$SZ(%r13),%r13 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),%rsp add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F 
add $SZ*6(%r15),$G lea (%rsi,%r13),%r12 add $SZ*7(%r15),$H cmp $_end,%r13 mov $A,$SZ*0(%r15) cmove %rsp,%r12 # next block or stale data mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jbe .Loop_avx2 lea (%rsp),$Tbl # temporarily use $Tbl as index to $_rsp # this avoids the need to save a secondary frame pointer at -8(%rsp) .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 .Ldone_avx2: mov 16*$SZ+4*8($Tbl),$ivp mov 16*$SZ+7*8($Tbl),%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`($Tbl),%xmm6 movaps `$framesz+16*1`($Tbl),%xmm7 movaps `$framesz+16*2`($Tbl),%xmm8 movaps `$framesz+16*3`($Tbl),%xmm9 movaps `$framesz+16*4`($Tbl),%xmm10 movaps `$framesz+16*5`($Tbl),%xmm11 movaps `$framesz+16*6`($Tbl),%xmm12 movaps `$framesz+16*7`($Tbl),%xmm13 movaps `$framesz+16*8`($Tbl),%xmm14 movaps `$framesz+16*9`($Tbl),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} }} {{ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my ($rounds,$Tbl)=("%r11d","%rbx"); my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); my @rndkey=("%xmm4","%xmm5"); my $r=0; my $sn=0; my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); my @MSG=map("%xmm$_",(10..13)); my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; movups `16*$n`($in0),$in # load input xorps $rndkey0,$in ___ $code.=<<___ if ($n); movups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; xorps $in,$iv movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv movups 16-112($key),$rndkey[1] # forward reference nop ___ } else { $code.=<<___; movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; if ($shaext) { my $Tbl="%rax"; $code.=<<___; .type ${func}_shaext,\@function,6 .align 32 ${func}_shaext: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument ___ $code.=<<___ if ($win64); lea `-8-10*16`(%rsp),%rsp movaps %xmm6,-8-10*16(%rax) movaps %xmm7,-8-9*16(%rax) movaps %xmm8,-8-8*16(%rax) movaps %xmm9,-8-7*16(%rax) movaps %xmm10,-8-6*16(%rax) movaps %xmm11,-8-5*16(%rax) movaps %xmm12,-8-4*16(%rax) movaps %xmm13,-8-3*16(%rax) movaps %xmm14,-8-2*16(%rax) movaps %xmm15,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x200-0x80($Tbl),$TMP # byte swap mask mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr 
\$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload movdqa $ABEF,$ABEF_SAVE # offload ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 1*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] lea 0x40($inp),$inp ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 2*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256msg1 @MSG[1],@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP paddd $TMP,@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 3*32-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256msg1 @MSG[2],@MSG[1] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; movdqa @MSG[0],$TMP palignr \$4,@MSG[3],$TMP paddd $TMP,@MSG[1] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { &$aesenc() if (($r%10)==0); $code.=<<___; movdqa $i*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 16-19... pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc() if ($r==19); $code.=<<___; sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 14*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 15*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF #pxor $CDGH,$rndkey0 # black magic ___ while ($r<40) { &$aesenc(); } # remaining aesenc's $code.=<<___; #xorps $CDGH,$rndkey0 # black magic paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF dec $len movups $iv,48($out,$in0) # write output lea 64($in0),$in0 jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movups $iv,($ivp) # write IV movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps 0*16(%rsp),%xmm6 movaps 1*16(%rsp),%xmm7 movaps 2*16(%rsp),%xmm8 movaps 3*16(%rsp),%xmm9 movaps 4*16(%rsp),%xmm10 movaps 5*16(%rsp),%xmm11 movaps 6*16(%rsp),%xmm12 movaps 7*16(%rsp),%xmm13 movaps 8*16(%rsp),%xmm14 movaps 9*16(%rsp),%xmm15 lea 8+10*16(%rsp),%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size ${func}_shaext,.-${func}_shaext ___ } }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64 && $avx) { $rec="%rcx"; $frame="%rdx"; 
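# The se_handler routine emitted below is only reached while unwinding on
# Win64: it compares context->Rip against the .Lprologue_*/.Lepilogue_*
# labels recorded in HandlerData and, when the fault lies inside a function
# body, recovers the saved %rsp, the non-volatile GPRs and the XMM save
# area so that RtlVirtualUnwind can continue up the call chain.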
$context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HanderlData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ $code.=<<___ if ($shaext); lea aesni_cbc_sha256_enc_shaext(%rip),%r10 cmp %r10,%rbx jb .Lnot_in_shaext lea (%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq lea 168(%rax),%rax # adjust stack pointer jmp .Lin_prologue .Lnot_in_shaext: ___ $code.=<<___ if ($avx>1); lea .Lavx2_shortcut(%rip),%r10 cmp %r10,%rbx # context->RipRbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .rva .LSEH_begin_${func}_xop .rva .LSEH_end_${func}_xop .rva .LSEH_info_${func}_xop .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx .rva .LSEH_info_${func}_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_${func}_avx2 .rva .LSEH_end_${func}_avx2 .rva .LSEH_info_${func}_avx2 ___ $code.=<<___ if ($shaext); .rva .LSEH_begin_${func}_shaext .rva .LSEH_end_${func}_shaext .rva .LSEH_info_${func}_shaext ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_${func}_xop: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] .LSEH_info_${func}_avx: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_${func}_avx2: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] ___ $code.=<<___ if ($shaext); .LSEH_info_${func}_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); unshift @opcode,$rex|0x40 if($rex); } { my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 
0xcc, "sha256msg2" => 0xcd ); sub sha256op38 { my $instr = shift; if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } } $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; print $code; close STDOUT or die "error closing STDOUT: $!";
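
# A typical invocation of this generator, assuming the standard perlasm
# layout (the exact flavour string depends on the target; "elf", "macosx",
# "mingw64", "nasm" and "masm" are the usual choices), would be:
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
#
# The emitted aesni_cbc_sha256_enc() is not intended as a public API; within
# OpenSSL it is driven by the EVP aes-*-cbc-hmac-sha256 "stitched" ciphers,
# as the 'speed -evp aes-256-cbc-hmac-sha256' note above suggests.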