#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
# December 2015
#
# ChaCha20 for s390x.
#
# 3 times faster than compiler-generated code.
#
# February 2019
#
# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
# 4x"vertical" submission [on z13] and >3x faster than scalar code.
# To keep overheads in check, inputs not longer than 256 bytes are
# instead handled by a transliteration of the VSX code path from the
# chacha-ppc module, which is also 4x"vertical".
#
# NB, compile with additional -Wa,-march=z13.

$flavour = shift;

if ($flavour =~ /3[12]/) {
	$SIZE_T=4;
	$g="";
} else {
	$SIZE_T=8;
	$g="g";
}

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    $code .= "\t$opcode\t".join(',',@_)."\n";
}

my $sp="%r15";

my $stdframe=16*$SIZE_T+4*8;
my $frame=$stdframe+4*20;

my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));

my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));

sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_)=map("\"$_\"",@t);
my @x=map("\"$_\"",@x);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe the 'c' column, you'll notice that a pair of
	# 'c's is invariant between rounds. This means that we have
	# to reload them once per round, in the middle. This is why
	# you'll see 'c' stores and loads in the middle, but none at
	# the beginning or end.
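
	# Worked example of the index derivation above: each derived
	# index is ($_&~3)+(($_+1)&3), i.e. it stays within its group
	# of four and rotates by one. For the even-round call
	# ROUND(0,4,8,12) the four quarter-rounds therefore operate on
	#
	#	(a,b,c,d) = (0,4, 8,12), (1,5, 9,13), (2,6,10,14), (3,7,11,15)
	#
	# and for the odd-round call ROUND(0,5,10,15) on
	#
	#	(a,b,c,d) = (0,5,10,15), (1,6,11,12), (2,7, 8,13), (3,4, 9,14)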
( "&alr (@x[$a0],@x[$b0])", # Q1 "&alr (@x[$a1],@x[$b1])", # Q2 "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],16)", "&rll (@x[$d1],@x[$d1],16)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],12)", "&rll (@x[$b1],@x[$b1],12)", "&alr (@x[$a0],@x[$b0])", "&alr (@x[$a1],@x[$b1])", "&xr (@x[$d0],@x[$a0])", "&xr (@x[$d1],@x[$a1])", "&rll (@x[$d0],@x[$d0],8)", "&rll (@x[$d1],@x[$d1],8)", "&alr ($xc,@x[$d0])", "&alr ($xc_,@x[$d1])", "&xr (@x[$b0],$xc)", "&xr (@x[$b1],$xc_)", "&rll (@x[$b0],@x[$b0],7)", "&rll (@x[$b1],@x[$b1],7)", "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')", "&alr (@x[$a2],@x[$b2])", # Q3 "&alr (@x[$a3],@x[$b3])", # Q4 "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],16)", "&rll (@x[$d3],@x[$d3],16)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],12)", "&rll (@x[$b3],@x[$b3],12)", "&alr (@x[$a2],@x[$b2])", "&alr (@x[$a3],@x[$b3])", "&xr (@x[$d2],@x[$a2])", "&xr (@x[$d3],@x[$a3])", "&rll (@x[$d2],@x[$d2],8)", "&rll (@x[$d3],@x[$d3],8)", "&alr ($xc,@x[$d2])", "&alr ($xc_,@x[$d3])", "&xr (@x[$b2],$xc)", "&xr (@x[$b3],$xc_)", "&rll (@x[$b2],@x[$b2],7)", "&rll (@x[$b3],@x[$b3],7)" ); } $code.=<<___; .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function .align 32 ChaCha20_ctr32: #ifndef __KERNEL__ larl %r1,OPENSSL_s390xcap_P lghi %r0,64 lt${g}r $len,$len # $len==0? bzr %r14 lg %r1,16(%r1) cl${g}r $len,%r0 jle .Lshort tmhh %r1,0x4000 # check for vx bit jnz .LChaCha20_ctr32_vx .Lshort: #endif a${g}hi $len,-64 l${g}hi %r1,-$frame stm${g} %r6,%r15,`6*$SIZE_T`($sp) sl${g}r $out,$inp # difference la $len,0($inp,$len) # end of input minus 64 larl %r7,.Lsigma lgr %r0,$sp la $sp,0(%r1,$sp) st${g} %r0,0($sp) lmg %r8,%r11,0($key) # load key lmg %r12,%r13,0($counter) # load counter lmg %r6,%r7,0(%r7) # load sigma constant la %r14,0($inp) st${g} $out,$frame+3*$SIZE_T($sp) st${g} $len,$frame+4*$SIZE_T($sp) stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack srlg @x[12],%r12,32 # 32-bit counter value .align 16 .Loop_outer: lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7] lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11] lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15] stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11] lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9] st @x[12],$stdframe+4*12($sp) # save counter st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer lhi %r14,10 j .Loop .align 4 .Loop: ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; brct %r14,.Loop l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9] lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp) al @x[0],$stdframe+4*0($sp) # accumulate key schedule al @x[1],$stdframe+4*1($sp) al @x[2],$stdframe+4*2($sp) al @x[3],$stdframe+4*3($sp) al @x[4],$stdframe+4*4($sp) al @x[5],$stdframe+4*5($sp) al @x[6],$stdframe+4*6($sp) al @x[7],$stdframe+4*7($sp) lrvr @x[0],@x[0] lrvr @x[1],@x[1] lrvr @x[2],@x[2] lrvr @x[3],@x[3] lrvr @x[4],@x[4] lrvr @x[5],@x[5] lrvr @x[6],@x[6] lrvr @x[7],@x[7] al @x[12],$stdframe+4*12($sp) al @x[13],$stdframe+4*13($sp) al @x[14],$stdframe+4*14($sp) al @x[15],$stdframe+4*15($sp) lrvr @x[12],@x[12] lrvr @x[13],@x[13] lrvr @x[14],@x[14] lrvr @x[15],@x[15] la @t[0],0(@t[0],%r14) # reconstruct output pointer cl${g}r %r14,@t[1] jh .Ltail x 
	x	@x[0],4*0(%r14)		# xor with input
	x	@x[1],4*1(%r14)
	st	@x[0],4*0(@t[0])	# store output
	x	@x[2],4*2(%r14)
	st	@x[1],4*1(@t[0])
	x	@x[3],4*3(%r14)
	st	@x[2],4*2(@t[0])
	x	@x[4],4*4(%r14)
	st	@x[3],4*3(@t[0])
	 lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)	# load x[8]-x[11]
	x	@x[5],4*5(%r14)
	st	@x[4],4*4(@t[0])
	x	@x[6],4*6(%r14)
	 al	@x[0],$stdframe+4*8($sp)
	st	@x[5],4*5(@t[0])
	x	@x[7],4*7(%r14)
	 al	@x[1],$stdframe+4*9($sp)
	st	@x[6],4*6(@t[0])
	x	@x[12],4*12(%r14)
	 al	@x[2],$stdframe+4*10($sp)
	st	@x[7],4*7(@t[0])
	x	@x[13],4*13(%r14)
	 al	@x[3],$stdframe+4*11($sp)
	st	@x[12],4*12(@t[0])
	x	@x[14],4*14(%r14)
	st	@x[13],4*13(@t[0])
	x	@x[15],4*15(%r14)
	st	@x[14],4*14(@t[0])
	 lrvr	@x[0],@x[0]
	st	@x[15],4*15(@t[0])
	 lrvr	@x[1],@x[1]
	 lrvr	@x[2],@x[2]
	 lrvr	@x[3],@x[3]
	lhi	@x[12],1
	 x	@x[0],4*8(%r14)
	al	@x[12],$stdframe+4*12($sp)	# increment counter
	 x	@x[1],4*9(%r14)
	 st	@x[0],4*8(@t[0])
	 x	@x[2],4*10(%r14)
	 st	@x[1],4*9(@t[0])
	 x	@x[3],4*11(%r14)
	 st	@x[2],4*10(@t[0])
	 st	@x[3],4*11(@t[0])

	cl${g}r	%r14,@t[1]		# done yet?
	la	%r14,64(%r14)
	jl	.Loop_outer

.Ldone:
	xgr	%r0,%r0
	xgr	%r1,%r1
	xgr	%r2,%r2
	xgr	%r3,%r3
	stmg	%r0,%r3,$stdframe+4*4($sp)	# wipe key copy
	stmg	%r0,%r3,$stdframe+4*12($sp)

	lm${g}	%r6,%r15,`$frame+6*$SIZE_T`($sp)
	br	%r14

.align	16
.Ltail:
	la	@t[1],64($t[1])
	stm	@x[0],@x[7],$stdframe+4*0($sp)
	sl${g}r	@t[1],%r14
	lm	@x[0],@x[3],$stdframe+4*8+4*8($sp)
	l${g}hi	@x[6],0
	stm	@x[12],@x[15],$stdframe+4*12($sp)
	al	@x[0],$stdframe+4*8($sp)
	al	@x[1],$stdframe+4*9($sp)
	al	@x[2],$stdframe+4*10($sp)
	al	@x[3],$stdframe+4*11($sp)
	lrvr	@x[0],@x[0]
	lrvr	@x[1],@x[1]
	lrvr	@x[2],@x[2]
	lrvr	@x[3],@x[3]
	stm	@x[0],@x[3],$stdframe+4*8($sp)

.Loop_tail:
	llgc	@x[4],0(@x[6],%r14)
	llgc	@x[5],$stdframe(@x[6],$sp)
	xr	@x[5],@x[4]
	stc	@x[5],0(@x[6],@t[0])
	la	@x[6],1(@x[6])
	brct	@t[1],.Loop_tail

	j	.Ldone
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
___

{{{
########################################################################
# The 4x"vertical" layout minimizes the instruction count, but leaves
# the pipeline underutilized [because of the vector instructions' high
# latency]. On the other hand, the minimum amount of data it takes to
# fully utilize the pipeline is higher, so short inputs are effectively
# processed slower. Hence this code path, which targets lengths of
# <=256 bytes.
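#
# To recap the dispatch (in non-__KERNEL__ builds): ChaCha20_ctr32
# processes inputs of up to 64 bytes, or all inputs when VX is not
# available, with the scalar code above; ChaCha20_ctr32_vx forwards
# inputs of up to 256 bytes to this 4x"vertical" code path and handles
# longer ones itself with the 6x"horizontal" code path further below.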
#
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("%v$_",(0..15));
my @K = map("%v$_",(16..19));
my $CTR = "%v26";
my ($xt0,$xt1,$xt2,$xt3) = map("%v$_",(27..30));
my $beperm = "%v31";
my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));
my $FRAME=$stdframe+4*16;

sub VX_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"%v$_\"",(0..15));

	(
	"&vaf (@x[$a0],@x[$a0],@x[$b0])",	# Q1
	 "&vx (@x[$d0],@x[$d0],@x[$a0])",
	  "&verllf (@x[$d0],@x[$d0],16)",
	"&vaf (@x[$a1],@x[$a1],@x[$b1])",	# Q2
	 "&vx (@x[$d1],@x[$d1],@x[$a1])",
	  "&verllf (@x[$d1],@x[$d1],16)",
	"&vaf (@x[$a2],@x[$a2],@x[$b2])",	# Q3
	 "&vx (@x[$d2],@x[$d2],@x[$a2])",
	  "&verllf (@x[$d2],@x[$d2],16)",
	"&vaf (@x[$a3],@x[$a3],@x[$b3])",	# Q4
	 "&vx (@x[$d3],@x[$d3],@x[$a3])",
	  "&verllf (@x[$d3],@x[$d3],16)",

	"&vaf (@x[$c0],@x[$c0],@x[$d0])",
	 "&vx (@x[$b0],@x[$b0],@x[$c0])",
	  "&verllf (@x[$b0],@x[$b0],12)",
	"&vaf (@x[$c1],@x[$c1],@x[$d1])",
	 "&vx (@x[$b1],@x[$b1],@x[$c1])",
	  "&verllf (@x[$b1],@x[$b1],12)",
	"&vaf (@x[$c2],@x[$c2],@x[$d2])",
	 "&vx (@x[$b2],@x[$b2],@x[$c2])",
	  "&verllf (@x[$b2],@x[$b2],12)",
	"&vaf (@x[$c3],@x[$c3],@x[$d3])",
	 "&vx (@x[$b3],@x[$b3],@x[$c3])",
	  "&verllf (@x[$b3],@x[$b3],12)",

	"&vaf (@x[$a0],@x[$a0],@x[$b0])",
	 "&vx (@x[$d0],@x[$d0],@x[$a0])",
	  "&verllf (@x[$d0],@x[$d0],8)",
	"&vaf (@x[$a1],@x[$a1],@x[$b1])",
	 "&vx (@x[$d1],@x[$d1],@x[$a1])",
	  "&verllf (@x[$d1],@x[$d1],8)",
	"&vaf (@x[$a2],@x[$a2],@x[$b2])",
	 "&vx (@x[$d2],@x[$d2],@x[$a2])",
	  "&verllf (@x[$d2],@x[$d2],8)",
	"&vaf (@x[$a3],@x[$a3],@x[$b3])",
	 "&vx (@x[$d3],@x[$d3],@x[$a3])",
	  "&verllf (@x[$d3],@x[$d3],8)",

	"&vaf (@x[$c0],@x[$c0],@x[$d0])",
	 "&vx (@x[$b0],@x[$b0],@x[$c0])",
	  "&verllf (@x[$b0],@x[$b0],7)",
	"&vaf (@x[$c1],@x[$c1],@x[$d1])",
	 "&vx (@x[$b1],@x[$b1],@x[$c1])",
	  "&verllf (@x[$b1],@x[$b1],7)",
	"&vaf (@x[$c2],@x[$c2],@x[$d2])",
	 "&vx (@x[$b2],@x[$b2],@x[$c2])",
	  "&verllf (@x[$b2],@x[$b2],7)",
	"&vaf (@x[$c3],@x[$c3],@x[$d3])",
	 "&vx (@x[$b3],@x[$b3],@x[$c3])",
	  "&verllf (@x[$b3],@x[$b3],7)"
	);
}

$code.=<<___;

.align	32
ChaCha20_ctr32_4x:
.LChaCha20_ctr32_4x:
	stm${g}	%r6,%r7,`6*$SIZE_T`($sp)
___
$code.=<<___ if ($flavour !~ /64/);
	std	%f4,`16*$SIZE_T+2*8`($sp)
	std	%f6,`16*$SIZE_T+3*8`($sp)
___
$code.=<<___;
	l${g}hi	%r1,-$FRAME
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)			# back-chain
___
$code.=<<___ if ($flavour =~ /64/);
	std	%f8,`$stdframe+8*0`($sp)
	std	%f9,`$stdframe+8*1`($sp)
	std	%f10,`$stdframe+8*2`($sp)
	std	%f11,`$stdframe+8*3`($sp)
	std	%f12,`$stdframe+8*4`($sp)
	std	%f13,`$stdframe+8*5`($sp)
	std	%f14,`$stdframe+8*6`($sp)
	std	%f15,`$stdframe+8*7`($sp)
___
$code.=<<___;
	larl	%r7,.Lsigma
	lhi	%r0,10
	lhi	%r1,0

	vl	@K[0],0(%r7)			# load sigma
	vl	@K[1],0($key)			# load key
	vl	@K[2],16($key)
	vl	@K[3],0($counter)		# load counter

	vl	$beperm,0x40(%r7)
	vl	$CTR,0x50(%r7)

#.Loop_outer_4x:
	vlm	$xa0,$xa3,0x60(%r7)		# load [smashed] sigma

	vrepf	$xb0,@K[1],0			# smash the key
	vrepf	$xb1,@K[1],1
	vrepf	$xb2,@K[1],2
	vrepf	$xb3,@K[1],3

	vrepf	$xd0,@K[3],0
	vrepf	$xd1,@K[3],1
	vrepf	$xd2,@K[3],2
	vrepf	$xd3,@K[3],3
	vaf	$xd0,$xd0,$CTR

	vrepf	$xc0,@K[2],0
	vrepf	$xc1,@K[2],1
	vrepf	$xc2,@K[2],2
	vrepf	$xc3,@K[2],3

.Loop_4x:
___
	foreach (&VX_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&VX_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	brct	%r0,.Loop_4x

	vaf	$xd0,$xd0,$CTR

	vmrhf	$xt0,$xa0,$xa1			# transpose data
	vmrhf	$xt1,$xa2,$xa3
	vmrlf	$xt2,$xa0,$xa1
	vmrlf	$xt3,$xa2,$xa3
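	# vmrhf/vmrlf above interleave matching words from pairs of
	# lanes; the vpdi steps below then select doubleword halves,
	# completing a 4x4 word transpose so that each vector register
	# again holds four consecutive words of a single 64-byte block.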
	vpdi	$xa0,$xt0,$xt1,0b0000
	vpdi	$xa1,$xt0,$xt1,0b0101
	vpdi	$xa2,$xt2,$xt3,0b0000
	vpdi	$xa3,$xt2,$xt3,0b0101

	vmrhf	$xt0,$xb0,$xb1
	vmrhf	$xt1,$xb2,$xb3
	vmrlf	$xt2,$xb0,$xb1
	vmrlf	$xt3,$xb2,$xb3
	vpdi	$xb0,$xt0,$xt1,0b0000
	vpdi	$xb1,$xt0,$xt1,0b0101
	vpdi	$xb2,$xt2,$xt3,0b0000
	vpdi	$xb3,$xt2,$xt3,0b0101

	vmrhf	$xt0,$xc0,$xc1
	vmrhf	$xt1,$xc2,$xc3
	vmrlf	$xt2,$xc0,$xc1
	vmrlf	$xt3,$xc2,$xc3
	vpdi	$xc0,$xt0,$xt1,0b0000
	vpdi	$xc1,$xt0,$xt1,0b0101
	vpdi	$xc2,$xt2,$xt3,0b0000
	vpdi	$xc3,$xt2,$xt3,0b0101

	vmrhf	$xt0,$xd0,$xd1
	vmrhf	$xt1,$xd2,$xd3
	vmrlf	$xt2,$xd0,$xd1
	vmrlf	$xt3,$xd2,$xd3
	vpdi	$xd0,$xt0,$xt1,0b0000
	vpdi	$xd1,$xt0,$xt1,0b0101
	vpdi	$xd2,$xt2,$xt3,0b0000
	vpdi	$xd3,$xt2,$xt3,0b0101

	#vrepif	$xt0,4
	#vaf	$CTR,$CTR,$xt0		# next counter value

	vaf	$xa0,$xa0,@K[0]
	vaf	$xb0,$xb0,@K[1]
	vaf	$xc0,$xc0,@K[2]
	vaf	$xd0,$xd0,@K[3]

	vperm	$xa0,$xa0,$xa0,$beperm
	vperm	$xb0,$xb0,$xb0,$beperm
	vperm	$xc0,$xc0,$xc0,$beperm
	vperm	$xd0,$xd0,$xd0,$beperm

	#cl${g}fi	$len,0x40
	#jl	.Ltail_4x

	vlm	$xt0,$xt3,0($inp)

	vx	$xt0,$xt0,$xa0
	vx	$xt1,$xt1,$xb0
	vx	$xt2,$xt2,$xc0
	vx	$xt3,$xt3,$xd0

	vstm	$xt0,$xt3,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	#je	.Ldone_4x

	vaf	$xa0,$xa1,@K[0]
	vaf	$xb0,$xb1,@K[1]
	vaf	$xc0,$xc1,@K[2]
	vaf	$xd0,$xd1,@K[3]

	vperm	$xa0,$xa0,$xa0,$beperm
	vperm	$xb0,$xb0,$xb0,$beperm
	vperm	$xc0,$xc0,$xc0,$beperm
	vperm	$xd0,$xd0,$xd0,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_4x

	vlm	$xt0,$xt3,0($inp)

	vx	$xt0,$xt0,$xa0
	vx	$xt1,$xt1,$xb0
	vx	$xt2,$xt2,$xc0
	vx	$xt3,$xt3,$xd0

	vstm	$xt0,$xt3,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_4x

	vaf	$xa0,$xa2,@K[0]
	vaf	$xb0,$xb2,@K[1]
	vaf	$xc0,$xc2,@K[2]
	vaf	$xd0,$xd2,@K[3]

	vperm	$xa0,$xa0,$xa0,$beperm
	vperm	$xb0,$xb0,$xb0,$beperm
	vperm	$xc0,$xc0,$xc0,$beperm
	vperm	$xd0,$xd0,$xd0,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_4x

	vlm	$xt0,$xt3,0($inp)

	vx	$xt0,$xt0,$xa0
	vx	$xt1,$xt1,$xb0
	vx	$xt2,$xt2,$xc0
	vx	$xt3,$xt3,$xd0

	vstm	$xt0,$xt3,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_4x

	vaf	$xa0,$xa3,@K[0]
	vaf	$xb0,$xb3,@K[1]
	vaf	$xc0,$xc3,@K[2]
	vaf	$xd0,$xd3,@K[3]

	vperm	$xa0,$xa0,$xa0,$beperm
	vperm	$xb0,$xb0,$xb0,$beperm
	vperm	$xc0,$xc0,$xc0,$beperm
	vperm	$xd0,$xd0,$xd0,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_4x

	vlm	$xt0,$xt3,0($inp)

	vx	$xt0,$xt0,$xa0
	vx	$xt1,$xt1,$xb0
	vx	$xt2,$xt2,$xc0
	vx	$xt3,$xt3,$xd0

	vstm	$xt0,$xt3,0($out)

	#la	$inp,0x40($inp)
	#la	$out,0x40($out)
	#lhi	%r0,10
	#a${g}hi	$len,-0x40
	#jne	.Loop_outer_4x

.Ldone_4x:
___
$code.=<<___ if ($flavour !~ /64/);
	ld	%f4,`$FRAME+16*$SIZE_T+2*8`($sp)
	ld	%f6,`$FRAME+16*$SIZE_T+3*8`($sp)
___
$code.=<<___ if ($flavour =~ /64/);
	ld	%f8,`$stdframe+8*0`($sp)
	ld	%f9,`$stdframe+8*1`($sp)
	ld	%f10,`$stdframe+8*2`($sp)
	ld	%f11,`$stdframe+8*3`($sp)
	ld	%f12,`$stdframe+8*4`($sp)
	ld	%f13,`$stdframe+8*5`($sp)
	ld	%f14,`$stdframe+8*6`($sp)
	ld	%f15,`$stdframe+8*7`($sp)
___
$code.=<<___;
	lm${g}	%r6,%r7,`$FRAME+6*$SIZE_T`($sp)
	la	$sp,$FRAME($sp)
	br	%r14

.align	16
.Ltail_4x:
___
$code.=<<___ if ($flavour !~ /64/);
	vlr	$xt0,$xb0
	ld	%f4,`$FRAME+16*$SIZE_T+2*8`($sp)
	ld	%f6,`$FRAME+16*$SIZE_T+3*8`($sp)

	vst	$xa0,`$stdframe+0x00`($sp)
	vst	$xt0,`$stdframe+0x10`($sp)
	vst	$xc0,`$stdframe+0x20`($sp)
	vst	$xd0,`$stdframe+0x30`($sp)
___
$code.=<<___ if ($flavour =~ /64/);
	vlr	$xt0,$xc0
	ld	%f8,`$stdframe+8*0`($sp)
	ld	%f9,`$stdframe+8*1`($sp)
	ld	%f10,`$stdframe+8*2`($sp)
	ld	%f11,`$stdframe+8*3`($sp)
	vlr	$xt1,$xd0
	ld	%f12,`$stdframe+8*4`($sp)
	ld	%f13,`$stdframe+8*5`($sp)
	ld	%f14,`$stdframe+8*6`($sp)
	ld	%f15,`$stdframe+8*7`($sp)

	vst	$xa0,`$stdframe+0x00`($sp)
	vst	$xb0,`$stdframe+0x10`($sp)
	vst	$xt0,`$stdframe+0x20`($sp)
	vst	$xt1,`$stdframe+0x30`($sp)
___
$code.=<<___;
	lghi	%r1,0

.Loop_tail_4x:
	llgc	%r5,0(%r1,$inp)
	llgc	%r6,$stdframe(%r1,$sp)
	xr	%r6,%r5
	stc	%r6,0(%r1,$out)
	la	%r1,1(%r1)
	brct	$len,.Loop_tail_4x

	lm${g}	%r6,%r7,`$FRAME+6*$SIZE_T`($sp)
	la	$sp,$FRAME($sp)
	br	%r14
.size	ChaCha20_ctr32_4x,.-ChaCha20_ctr32_4x
___
}}}

{{{
########################################################################
# The 6x"horizontal" layout is the optimal fit for the platform in its
# current shape, more specifically for the given vector instructions'
# latency. The computational part of an 8x"vertical" variant would be
# faster, but it consumes all registers, and dealing with that would
# diminish the return...
#
my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
    $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
    $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5) = map("%v$_",(0..23));
my @K = map("%v$_",(27,24..26));
my ($t0,$t1,$t2,$t3) = map("%v$_",27..30);
my $beperm = "%v31";
my $FRAME=$stdframe + 4*16;

sub VX_ROUND {
my ($a,$b,$c,$d,$odd)=@_;

	(
	"&vaf ('$a','$a','$b')",
	"&vx ('$d','$d','$a')",
	"&verllf ('$d','$d', 16)",
	"&vaf ('$c','$c','$d')",
	"&vx ('$b','$b','$c')",
	"&verllf ('$b','$b', 12)",
	"&vaf ('$a','$a','$b')",
	"&vx ('$d','$d','$a')",
	"&verllf ('$d','$d', 8)",
	"&vaf ('$c','$c','$d')",
	"&vx ('$b','$b','$c')",
	"&verllf ('$b','$b',7)",
	"&vsldb ('$c','$c','$c', 8)",
	"&vsldb ('$b','$b','$b', $odd ? 12 : 4)",
	"&vsldb ('$d','$d','$d', $odd ? 4 : 12)"
	);
}

$code.=<<___;

.globl	ChaCha20_ctr32_vx
.align	32
ChaCha20_ctr32_vx:
.LChaCha20_ctr32_vx:
	cl${g}fi	$len,256
	jle	.LChaCha20_ctr32_4x
	stm${g}	%r6,%r7,`6*$SIZE_T`($sp)
___
$code.=<<___ if ($flavour !~ /64/);
	std	%f4,`16*$SIZE_T+2*8`($sp)
	std	%f6,`16*$SIZE_T+3*8`($sp)
___
$code.=<<___;
	l${g}hi	%r1,-$FRAME
	lgr	%r0,$sp
	la	$sp,0(%r1,$sp)
	st${g}	%r0,0($sp)			# back-chain
___
$code.=<<___ if ($flavour =~ /64/);
	std	%f8,`$FRAME-8*8`($sp)
	std	%f9,`$FRAME-8*7`($sp)
	std	%f10,`$FRAME-8*6`($sp)
	std	%f11,`$FRAME-8*5`($sp)
	std	%f12,`$FRAME-8*4`($sp)
	std	%f13,`$FRAME-8*3`($sp)
	std	%f14,`$FRAME-8*2`($sp)
	std	%f15,`$FRAME-8*1`($sp)
___
$code.=<<___;
	larl	%r7,.Lsigma
	lhi	%r0,10

	vlm	@K[1],@K[2],0($key)		# load key
	vl	@K[3],0($counter)		# load counter

	vlm	@K[0],$beperm,0(%r7)		# load sigma, increments, ...
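
# Six interleaved "threads" are processed per outer iteration, each
# holding a full 4x4 state in its own a/b/c/d vector registers; their
# d rows get the block counter advanced by 0..5 (see the K[3]+n
# comments below).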
.Loop_outer_vx:
	vlr	$a0,@K[0]
	vlr	$b0,@K[1]
	vlr	$a1,@K[0]
	vlr	$b1,@K[1]
	vlr	$a2,@K[0]
	vlr	$b2,@K[1]
	vlr	$a3,@K[0]
	vlr	$b3,@K[1]
	vlr	$a4,@K[0]
	vlr	$b4,@K[1]
	vlr	$a5,@K[0]
	vlr	$b5,@K[1]

	vlr	$d0,@K[3]
	vaf	$d1,@K[3],$t1		# K[3]+1
	vaf	$d2,@K[3],$t2		# K[3]+2
	vaf	$d3,@K[3],$t3		# K[3]+3
	vaf	$d4,$d2,$t2		# K[3]+4
	vaf	$d5,$d2,$t3		# K[3]+5

	vlr	$c0,@K[2]
	vlr	$c1,@K[2]
	vlr	$c2,@K[2]
	vlr	$c3,@K[2]
	vlr	$c4,@K[2]
	vlr	$c5,@K[2]

	vlr	$t1,$d1
	vlr	$t2,$d2
	vlr	$t3,$d3

.align	4
.Loop_vx:
___
	my @thread0 = &VX_ROUND($a0,$b0,$c0,$d0,0);
	my @thread1 = &VX_ROUND($a1,$b1,$c1,$d1,0);
	my @thread2 = &VX_ROUND($a2,$b2,$c2,$d2,0);
	my @thread3 = &VX_ROUND($a3,$b3,$c3,$d3,0);
	my @thread4 = &VX_ROUND($a4,$b4,$c4,$d4,0);
	my @thread5 = &VX_ROUND($a5,$b5,$c5,$d5,0);

	foreach (@thread0) {
		eval;
		eval(shift(@thread1));
		eval(shift(@thread2));
		eval(shift(@thread3));
		eval(shift(@thread4));
		eval(shift(@thread5));
	}

	@thread0 = &VX_ROUND($a0,$b0,$c0,$d0,1);
	@thread1 = &VX_ROUND($a1,$b1,$c1,$d1,1);
	@thread2 = &VX_ROUND($a2,$b2,$c2,$d2,1);
	@thread3 = &VX_ROUND($a3,$b3,$c3,$d3,1);
	@thread4 = &VX_ROUND($a4,$b4,$c4,$d4,1);
	@thread5 = &VX_ROUND($a5,$b5,$c5,$d5,1);

	foreach (@thread0) {
		eval;
		eval(shift(@thread1));
		eval(shift(@thread2));
		eval(shift(@thread3));
		eval(shift(@thread4));
		eval(shift(@thread5));
	}

$code.=<<___;
	brct	%r0,.Loop_vx

	vaf	$a0,$a0,@K[0]
	vaf	$b0,$b0,@K[1]
	vaf	$c0,$c0,@K[2]
	vaf	$d0,$d0,@K[3]
	vaf	$a1,$a1,@K[0]
	vaf	$d1,$d1,$t1		# +K[3]+1

	vperm	$a0,$a0,$a0,$beperm
	vperm	$b0,$b0,$b0,$beperm
	vperm	$c0,$c0,$c0,$beperm
	vperm	$d0,$d0,$d0,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vaf	$d2,$d2,$t2		# +K[3]+2
	vaf	$d3,$d3,$t3		# +K[3]+3
	vlm	$t0,$t3,0($inp)

	vx	$a0,$a0,$t0
	vx	$b0,$b0,$t1
	vx	$c0,$c0,$t2
	vx	$d0,$d0,$t3

	vlm	@K[0],$t3,0(%r7)	# re-load sigma and increments

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_vx

	vaf	$b1,$b1,@K[1]
	vaf	$c1,$c1,@K[2]

	vperm	$a0,$a1,$a1,$beperm
	vperm	$b0,$b1,$b1,$beperm
	vperm	$c0,$c1,$c1,$beperm
	vperm	$d0,$d1,$d1,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vlm	$a1,$d1,0($inp)

	vx	$a0,$a0,$a1
	vx	$b0,$b0,$b1
	vx	$c0,$c0,$c1
	vx	$d0,$d0,$d1

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_vx

	vaf	$a2,$a2,@K[0]
	vaf	$b2,$b2,@K[1]
	vaf	$c2,$c2,@K[2]

	vperm	$a0,$a2,$a2,$beperm
	vperm	$b0,$b2,$b2,$beperm
	vperm	$c0,$c2,$c2,$beperm
	vperm	$d0,$d2,$d2,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vlm	$a1,$d1,0($inp)

	vx	$a0,$a0,$a1
	vx	$b0,$b0,$b1
	vx	$c0,$c0,$c1
	vx	$d0,$d0,$d1

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_vx

	vaf	$a3,$a3,@K[0]
	vaf	$b3,$b3,@K[1]
	vaf	$c3,$c3,@K[2]
	vaf	$d2,@K[3],$t3		# K[3]+3

	vperm	$a0,$a3,$a3,$beperm
	vperm	$b0,$b3,$b3,$beperm
	vperm	$c0,$c3,$c3,$beperm
	vperm	$d0,$d3,$d3,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vaf	$d3,$d2,$t1		# K[3]+4
	vlm	$a1,$d1,0($inp)

	vx	$a0,$a0,$a1
	vx	$b0,$b0,$b1
	vx	$c0,$c0,$c1
	vx	$d0,$d0,$d1

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_vx

	vaf	$a4,$a4,@K[0]
	vaf	$b4,$b4,@K[1]
	vaf	$c4,$c4,@K[2]
	vaf	$d4,$d4,$d3		# +K[3]+4
	vaf	$d3,$d3,$t1		# K[3]+5
	vaf	@K[3],$d2,$t3		# K[3]+=6

	vperm	$a0,$a4,$a4,$beperm
	vperm	$b0,$b4,$b4,$beperm
	vperm	$c0,$c4,$c4,$beperm
	vperm	$d0,$d4,$d4,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vlm	$a1,$d1,0($inp)

	vx	$a0,$a0,$a1
	vx	$b0,$b0,$b1
	vx	$c0,$c0,$c1
	vx	$d0,$d0,$d1

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	a${g}hi	$len,-0x40
	je	.Ldone_vx

	vaf	$a5,$a5,@K[0]
	vaf	$b5,$b5,@K[1]
	vaf	$c5,$c5,@K[2]
	vaf	$d5,$d5,$d3		# +K[3]+5

	vperm	$a0,$a5,$a5,$beperm
	vperm	$b0,$b5,$b5,$beperm
	vperm	$c0,$c5,$c5,$beperm
	vperm	$d0,$d5,$d5,$beperm

	cl${g}fi	$len,0x40
	jl	.Ltail_vx

	vlm	$a1,$d1,0($inp)

	vx	$a0,$a0,$a1
	vx	$b0,$b0,$b1
	vx	$c0,$c0,$c1
	vx	$d0,$d0,$d1

	vstm	$a0,$d0,0($out)

	la	$inp,0x40($inp)
	la	$out,0x40($out)
	lhi	%r0,10
	a${g}hi	$len,-0x40
	jne	.Loop_outer_vx

.Ldone_vx:
___
$code.=<<___ if ($flavour !~ /64/);
	ld	%f4,`$FRAME+16*$SIZE_T+2*8`($sp)
	ld	%f6,`$FRAME+16*$SIZE_T+3*8`($sp)
___
$code.=<<___ if ($flavour =~ /64/);
	ld	%f8,`$FRAME-8*8`($sp)
	ld	%f9,`$FRAME-8*7`($sp)
	ld	%f10,`$FRAME-8*6`($sp)
	ld	%f11,`$FRAME-8*5`($sp)
	ld	%f12,`$FRAME-8*4`($sp)
	ld	%f13,`$FRAME-8*3`($sp)
	ld	%f14,`$FRAME-8*2`($sp)
	ld	%f15,`$FRAME-8*1`($sp)
___
$code.=<<___;
	lm${g}	%r6,%r7,`$FRAME+6*$SIZE_T`($sp)
	la	$sp,$FRAME($sp)
	br	%r14

.align	16
.Ltail_vx:
___
$code.=<<___ if ($flavour !~ /64/);
	ld	%f4,`$FRAME+16*$SIZE_T+2*8`($sp)
	ld	%f6,`$FRAME+16*$SIZE_T+3*8`($sp)
___
$code.=<<___ if ($flavour =~ /64/);
	ld	%f8,`$FRAME-8*8`($sp)
	ld	%f9,`$FRAME-8*7`($sp)
	ld	%f10,`$FRAME-8*6`($sp)
	ld	%f11,`$FRAME-8*5`($sp)
	ld	%f12,`$FRAME-8*4`($sp)
	ld	%f13,`$FRAME-8*3`($sp)
	ld	%f14,`$FRAME-8*2`($sp)
	ld	%f15,`$FRAME-8*1`($sp)
___
$code.=<<___;
	vstm	$a0,$d0,$stdframe($sp)
	lghi	%r1,0

.Loop_tail_vx:
	llgc	%r5,0(%r1,$inp)
	llgc	%r6,$stdframe(%r1,$sp)
	xr	%r6,%r5
	stc	%r6,0(%r1,$out)
	la	%r1,1(%r1)
	brct	$len,.Loop_tail_vx

	lm${g}	%r6,%r7,`$FRAME+6*$SIZE_T`($sp)
	la	$sp,$FRAME($sp)
	br	%r14
.size	ChaCha20_ctr32_vx,.-ChaCha20_ctr32_vx
___
}}}

$code.=<<___;
.align	32
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	# endian-neutral
.long	1,0,0,0
.long	2,0,0,0
.long	3,0,0,0
.long	0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c	# byte swap
.long	0,1,2,3
.long	0x61707865,0x61707865,0x61707865,0x61707865	# smashed sigma
.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
.asciz	"ChaCha20 for s390x, CRYPTOGAMS by \@dot-asm"
.align	4
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	print $_,"\n";
}
close STDOUT;