#!/usr/bin/env perl # # ==================================================================== # Written by Andy Polyakov, @dot-asm, initially for the OpenSSL # project. # ==================================================================== # # December 2014 # # ChaCha20 for ARMv4. # # September 2018 # # Improve scalar performance per Eric Biggers' suggestion to eliminate # separate rotates. This requires b[0..3] and d[0..3] to be maintained # pre-rotated, hence odd twists prior inner loop and when accumulating # key material. Since amount of instructions is reduced as result, even # NEON performance is improved somewhat, most notably by ~9% on low-end # Cortex-A5/A7. Full unroll was shown to provide even better scalar # performance on Cortex-A5/A7, naturally at the cost of manyfold size # increase. I let it be. Oversized code works in benchmarks, but is not # necessarily optimal in real life, when it's likely to be out-of-cache # upon entry and evict significant part of cache upon completion. # # Performance in cycles per byte out of large buffer. # # IALU/gcc-4.4 1xNEON 3xNEON+1xIALU # # Cortex-A5 14.2(*)/+160% 21.8 12.9(**) # Cortex-A8 10.2(*)/+190% 13.9 6.10 # Cortex-A9 10.8(*)/+150% 14.3 6.50 # Cortex-A15 11.0/+40% 16.0 4.90 # Snapdragon S4 13.9(***)/+90% 13.6 4.90 # # (*) most "favourable" result for aligned data on little-endian # processor, result for misaligned data is 10-15% lower; # (**) pure 4xNEON [with "vertical" layout] was shown to provide ~8% # better performance on Cortex-A5/A7, but not on others; # (***) it's 17% slower than original, trade-off is considered # acceptable, because of improvement on others, specifically # +36% on Cortex-A5/A7 and +20% on Cortex-A9; $flavour = shift; if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; } else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} } if ($flavour && $flavour ne "void") { $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or die "can't locate arm-xlate.pl"; open STDOUT,"| \"$^X\" $xlate $flavour $output"; } else { open STDOUT,">$output"; } sub AUTOLOAD() # thunk [simplified] x86-style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; my $arg = pop; $arg = "#$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } my @x=map("r$_",(0..7,"x","x","x","x",12,"x",14,"x")); my @t=map("r$_",(8..11)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my $odd = $d0&1; my ($xc,$xc_) = (@t[0..1]); my ($xd,$xd_) = $odd ? (@t[2],@x[$d1]) : (@x[$d0],@t[2]); my @ret; # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' are permanently allocated in registers, @x[0..7], # while 'c's and pair of 'd's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. If you observe 'd' column, you'll # notice that 15 and 13 are reused in next pair of rounds. # This is why these two are chosen for offloading to memory, # to make loads count more. push @ret,( "&add (@x[$a0],@x[$a0],@x[$b0],'ror#13')", "&add (@x[$a1],@x[$a1],@x[$b1],'ror#13')", "&eor ($xd,@x[$a0],$xd,'ror#24')", "&eor ($xd_,@x[$a1],$xd_,'ror#24')", "&add ($xc,$xc,$xd,'ror#16')", "&add ($xc_,$xc_,$xd_,'ror#16')", "&eor (@x[$b0],$xc, @x[$b0],'ror#13')", "&eor (@x[$b1],$xc_,@x[$b1],'ror#13')", "&add (@x[$a0],@x[$a0],@x[$b0],'ror#20')", "&add (@x[$a1],@x[$a1],@x[$b1],'ror#20')", "&eor ($xd,@x[$a0],$xd,'ror#16')", "&eor ($xd_,@x[$a1],$xd_,'ror#16')" ); push @ret,( "&str ($xd,'[sp,#4*(16+$d0)]')" ) if ($odd); push @ret,( "&add ($xc,$xc,$xd,'ror#24')" ); push @ret,( "&ldr ($xd,'[sp,#4*(16+$d2)]')" ) if ($odd); push @ret,( "&str ($xd_,'[sp,#4*(16+$d1)]')" ) if (!$odd); push @ret,( "&add ($xc_,$xc_,$xd_,'ror#24')" ); push @ret,( "&ldr ($xd_,'[sp,#4*(16+$d3)]')" ) if (!$odd); push @ret,( "&str ($xc,'[sp,#4*(16+$c0)]')", "&eor (@x[$b0],@x[$b0],$xc,'ror#12')", "&str ($xc_,'[sp,#4*(16+$c1)]')", "&eor (@x[$b1],@x[$b1],$xc_,'ror#12')" ); $xd=@x[$d2] if (!$odd); $xd_=@x[$d3] if ($odd); push @ret,( "&ldr ($xc,'[sp,#4*(16+$c2)]')", "&add (@x[$a2],@x[$a2],@x[$b2],'ror#13')", "&ldr ($xc_,'[sp,#4*(16+$c3)]')", "&add (@x[$a3],@x[$a3],@x[$b3],'ror#13')", "&eor ($xd,@x[$a2],$xd,'ror#24')", "&eor ($xd_,@x[$a3],$xd_,'ror#24')", "&add ($xc,$xc,$xd,'ror#16')", "&add ($xc_,$xc_,$xd_,'ror#16')", "&eor (@x[$b2],$xc, @x[$b2],'ror#13')", "&eor (@x[$b3],$xc_,@x[$b3],'ror#13')", "&add (@x[$a2],@x[$a2],@x[$b2],'ror#20')", "&add (@x[$a3],@x[$a3],@x[$b3],'ror#20')", "&eor ($xd,@x[$a2],$xd,'ror#16')", "&eor ($xd_,@x[$a3],$xd_,'ror#16')", "&add ($xc,$xc,$xd,'ror#24')", "&add ($xc_,$xc_,$xd_,'ror#24')", "&eor (@x[$b2],@x[$b2],$xc,'ror#12')", "&eor (@x[$b3],@x[$b3],$xc_,'ror#12')" ); @ret; } $code.=<<___; #ifndef __KERNEL__ # include "arm_arch.h" #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__ # define ChaCha20_ctr32 chacha20_arm # define ChaCha20_neon chacha20_neon #endif #if defined(__thumb2__) || defined(__clang__) .syntax unified # define ldrhsb ldrbhs #endif #if defined(__thumb2__) .thumb #else .code 32 #endif .text .align 5 .Lsigma: .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral .Lone: .long 1,0,0,0 .Lrot8: .long 0x02010003,0x06050407 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) .LOPENSSL_armcap: # ifdef _WIN32 .word OPENSSL_armcap_P # else .word OPENSSL_armcap_P-.LChaCha20_ctr32 # endif #else .word -1 #endif .globl ChaCha20_ctr32 .type ChaCha20_ctr32,%function .align 5 ChaCha20_ctr32: .LChaCha20_ctr32: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} #ifndef __thumb2__ sub r14,pc,#16 @ ChaCha20_ctr32 #else adr r14,.LChaCha20_ctr32 #endif cmp r2,#0 @ len==0? #ifdef __thumb2__ itt eq #endif addeq sp,sp,#4*3 beq .Lno_data #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) cmp r2,#192 @ test len bls .Lshort ldr r4,.LOPENSSL_armcap # if !defined(_WIN32) ldr r4,[r14,r4] # endif # if defined(__APPLE__) || defined(_WIN32) ldr r4,[r4] # endif tst r4,#ARMV7_NEON bne .LChaCha20_neon .Lshort: #endif ldmia r12,{r4-r7} @ load counter and nonce sub sp,sp,#4*(16) @ off-load area sub r14,r14,#64 @ .Lsigma stmdb sp!,{r4-r7} @ copy counter and nonce ldmia r3,{r4-r11} @ load key ldmia r14,{r0-r3} @ load sigma stmdb sp!,{r4-r11} @ copy key stmdb sp!,{r0-r3} @ copy sigma str r10,[sp,#4*(16+10)] @ off-load "@x[10]" str r11,[sp,#4*(16+11)] @ off-load "@x[11]" b .Loop_outer_enter .align 4 .Loop_outer: ldmia sp,{r0-r9} @ load key material str @t[3],[sp,#4*(32+2)] @ save len str r12, [sp,#4*(32+1)] @ save inp str r14, [sp,#4*(32+0)] @ save out .Loop_outer_enter: ldr @t[3], [sp,#4*(15)] mov @x[4],@x[4],ror#19 @ twist b[0..3] ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load mov @x[5],@x[5],ror#19 ldr @t[2], [sp,#4*(13)] mov @x[6],@x[6],ror#19 ldr @x[14],[sp,#4*(14)] mov @x[7],@x[7],ror#19 mov @t[3],@t[3],ror#8 @ twist d[0..3] mov @x[12],@x[12],ror#8 mov @t[2],@t[2],ror#8 mov @x[14],@x[14],ror#8 str @t[3], [sp,#4*(16+15)] mov @t[3],#10 b .Loop .align 4 .Loop: subs @t[3],@t[3],#1 ___ foreach (&ROUND(0, 4, 8,12)) { eval; } foreach (&ROUND(0, 5,10,15)) { eval; } $code.=<<___; bne .Loop ldr @t[3],[sp,#4*(32+2)] @ load len str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store str @t[1], [sp,#4*(16+9)] str @x[12],[sp,#4*(16+12)] str @t[2], [sp,#4*(16+13)] str @x[14],[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ @x[0-7] and second half at sp+4*(16+8) cmp @t[3],#64 @ done yet? #ifdef __thumb2__ itete lo #endif addlo r12,sp,#4*(0) @ shortcut or ... ldrhs r12,[sp,#4*(32+1)] @ ... load inp addlo r14,sp,#4*(0) @ shortcut or ... ldrhs r14,[sp,#4*(32+0)] @ ... load out ldr @t[0],[sp,#4*(0)] @ load key material ldr @t[1],[sp,#4*(1)] #if __ARM_ARCH__>=6 || !defined(__ARMEB__) # if __ARM_ARCH__<7 orr @t[2],r12,r14 tst @t[2],#3 @ are input and output aligned? ldr @t[2],[sp,#4*(2)] bne .Lunaligned cmp @t[3],#64 @ restore flags # else ldr @t[2],[sp,#4*(2)] # endif ldr @t[3],[sp,#4*(3)] add @x[0],@x[0],@t[0] @ accumulate key material add @x[1],@x[1],@t[1] # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH__>=6 && defined(__ARMEB__) rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[0],@x[0],@t[0] @ xor with input eorhs @x[1],@x[1],@t[1] add @t[0],sp,#4*(4) str @x[0],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[2],@x[2],@t[2] eorhs @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[1],[r14,#-12] str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@t[0],@x[4],ror#13 @ accumulate key material add @x[5],@t[1],@x[5],ror#13 # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[6],@t[2],@x[6],ror#13 add @x[7],@t[3],@x[7],ror#13 # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH__>=6 && defined(__ARMEB__) rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[4],@x[4],@t[0] eorhs @x[5],@x[5],@t[1] add @t[0],sp,#4*(8) str @x[4],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[6],@x[6],@t[2] eorhs @x[7],@x[7],@t[3] str @x[5],[r14,#-12] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[6],[r14,#-8] add @x[0],sp,#4*(16+8) str @x[7],[r14,#-4] ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material add @x[1],@x[1],@t[1] # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] # ifdef __thumb2__ itt hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH__>=6 && defined(__ARMEB__) rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[0],@x[0],@t[0] eorhs @x[1],@x[1],@t[1] add @t[0],sp,#4*(12) str @x[0],[r14],#16 @ store output # ifdef __thumb2__ itt hs # endif eorhs @x[2],@x[2],@t[2] eorhs @x[3],@x[3],@t[3] str @x[1],[r14,#-12] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@t[0],@x[4],ror#24 @ accumulate key material add @x[5],@t[1],@x[5],ror#24 # ifdef __thumb2__ itt hi # endif addhi @t[0],@t[0],#1 @ next counter value strhi @t[0],[sp,#4*(12)] @ save next counter value # ifdef __thumb2__ itt hs # endif ldrhs @t[0],[r12],#16 @ load input ldrhs @t[1],[r12,#-12] add @x[6],@t[2],@x[6],ror#24 add @x[7],@t[3],@x[7],ror#24 # ifdef __thumb2__ itt hs # endif ldrhs @t[2],[r12,#-8] ldrhs @t[3],[r12,#-4] # if __ARM_ARCH__>=6 && defined(__ARMEB__) rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif # ifdef __thumb2__ itt hs # endif eorhs @x[4],@x[4],@t[0] eorhs @x[5],@x[5],@t[1] # ifdef __thumb2__ it ne # endif ldrne @t[0],[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ itt hs # endif eorhs @x[6],@x[6],@t[2] eorhs @x[7],@x[7],@t[3] str @x[4],[r14],#16 @ store output str @x[5],[r14,#-12] # ifdef __thumb2__ it hs # endif subhs @t[3],@t[0],#64 @ len-=64 str @x[6],[r14,#-8] str @x[7],[r14,#-4] bhi .Loop_outer beq .Ldone # if __ARM_ARCH__<7 b .Ltail .align 4 .Lunaligned: @ unaligned endian-neutral path cmp @t[3],#64 @ restore flags # endif #endif #if __ARM_ARCH__<7 ldr @t[3],[sp,#4*(3)] ___ for ($i=0;$i<16;$i+=4) { my $j=$i&0x7; my $twist=""; if ($i==4) { $twist = ",ror#13"; } elsif ($i==12) { $twist = ",ror#24"; } $code.=<<___ if ($i==4); add @x[0],sp,#4*(16+8) ___ $code.=<<___ if ($i==8); ldmia @x[0],{@x[0]-@x[7]} @ load second half # ifdef __thumb2__ itt hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" ___ $code.=<<___; add @x[$j+0],@t[0],@x[$j+0]$twist @ accumulate key material ___ $code.=<<___ if ($i==12); # ifdef __thumb2__ itt hi # endif addhi @t[0],@t[0],#1 @ next counter value strhi @t[0],[sp,#4*(12)] @ save next counter value ___ $code.=<<___; add @x[$j+1],@t[1],@x[$j+1]$twist add @x[$j+2],@t[2],@x[$j+2]$twist # ifdef __thumb2__ itete lo # endif eorlo @t[0],@t[0],@t[0] @ zero or ... ldrhsb @t[0],[r12],#16 @ ... load input eorlo @t[1],@t[1],@t[1] ldrhsb @t[1],[r12,#-12] add @x[$j+3],@t[3],@x[$j+3]$twist # ifdef __thumb2__ itete lo # endif eorlo @t[2],@t[2],@t[2] ldrhsb @t[2],[r12,#-8] eorlo @t[3],@t[3],@t[3] ldrhsb @t[3],[r12,#-4] eor @x[$j+0],@t[0],@x[$j+0] @ xor with input (or zero) eor @x[$j+1],@t[1],@x[$j+1] # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-15] @ load more input ldrhsb @t[1],[r12,#-11] eor @x[$j+2],@t[2],@x[$j+2] strb @x[$j+0],[r14],#16 @ store output eor @x[$j+3],@t[3],@x[$j+3] # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-7] ldrhsb @t[3],[r12,#-3] strb @x[$j+1],[r14,#-12] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+2],[r14,#-8] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-14] @ load more input ldrhsb @t[1],[r12,#-10] strb @x[$j+3],[r14,#-4] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+0],[r14,#-15] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-6] ldrhsb @t[3],[r12,#-2] strb @x[$j+1],[r14,#-11] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+2],[r14,#-7] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[0],[r12,#-13] @ load more input ldrhsb @t[1],[r12,#-9] strb @x[$j+3],[r14,#-3] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+0],[r14,#-14] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 # ifdef __thumb2__ itt hs # endif ldrhsb @t[2],[r12,#-5] ldrhsb @t[3],[r12,#-1] strb @x[$j+1],[r14,#-10] strb @x[$j+2],[r14,#-6] eor @x[$j+0],@t[0],@x[$j+0],lsr#8 strb @x[$j+3],[r14,#-2] eor @x[$j+1],@t[1],@x[$j+1],lsr#8 strb @x[$j+0],[r14,#-13] eor @x[$j+2],@t[2],@x[$j+2],lsr#8 strb @x[$j+1],[r14,#-9] eor @x[$j+3],@t[3],@x[$j+3],lsr#8 strb @x[$j+2],[r14,#-5] strb @x[$j+3],[r14,#-1] ___ $code.=<<___ if ($i<12); add @t[0],sp,#4*(4+$i) ldmia @t[0],{@t[0]-@t[3]} @ load key material ___ } $code.=<<___; # ifdef __thumb2__ it ne # endif ldrne @t[0],[sp,#4*(32+2)] @ re-load len # ifdef __thumb2__ it hs # endif subhs @t[3],@t[0],#64 @ len-=64 bhi .Loop_outer beq .Ldone #endif .Ltail: ldr r12,[sp,#4*(32+1)] @ load inp add @t[1],sp,#4*(0) ldr r14,[sp,#4*(32+0)] @ load out .Loop_tail: ldrb @t[2],[@t[1]],#1 @ read buffer on stack ldrb @t[3],[r12],#1 @ read input subs @t[0],@t[0],#1 eor @t[3],@t[3],@t[2] strb @t[3],[r14],#1 @ store output bne .Loop_tail .Ldone: add sp,sp,#4*(32+3) .Lno_data: #if __ARM_ARCH__>=5 ldmia sp!,{r4-r11,pc} #else ldmia sp!,{r4-r11,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .long 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size ChaCha20_ctr32,.-ChaCha20_ctr32 ___ {{{ my ($a0,$b0,$c0,$d0,$a1,$b1,$c1,$d1,$a2,$b2,$c2,$d2,$t0,$t1,$t2,$t3) = map("q$_",(0..15)); # This can replace vshr-by-24+vsli-by-8. It gives ~3% improvement on # Cortex-A5/A7, but hurts Cortex-A9 by 5% and Snapdragon S4 by 14%! sub vperm() { my ($dst,$src,$tbl) = @_; $code .= " vtbl.8 $dst#lo,{$src#lo},$tbl#lo\n"; $code .= " vtbl.8 $dst#hi,{$src#hi},$tbl#lo\n"; } sub NEONROUND { my $odd = pop; my ($a,$b,$c,$d,$t)=@_; ( "&vadd_i32 ($a,$a,$b)", "&veor ($d,$d,$a)", "&vrev32_16 ($d,$d)", # vrot ($d,16) "&vadd_i32 ($c,$c,$d)", "&veor ($t,$b,$c)", "&vshr_u32 ($b,$t,20)", "&vsli_32 ($b,$t,12)", "&vadd_i32 ($a,$a,$b)", "&veor ($t,$d,$a)", "&vshr_u32 ($d,$t,24)", "&vsli_32 ($d,$t,8)", #"&vperm ($d,$t,$t3)", "&vadd_i32 ($c,$c,$d)", "&veor ($t,$b,$c)", "&vshr_u32 ($b,$t,25)", "&vsli_32 ($b,$t,7)", "&vext_8 ($c,$c,$c,8)", "&vext_8 ($b,$b,$b,$odd?12:4)", "&vext_8 ($d,$d,$d,$odd?4:12)" ); } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .arch armv7-a .fpu neon # ifdef __KERNEL__ .globl ChaCha20_neon @ For optimal performance it's appropriate for caller to enforce @ minimum input length, 193 bytes is suggested. # endif .type ChaCha20_neon,%function .align 5 ChaCha20_neon: ldr r12,[sp,#0] @ pull pointer to counter and nonce stmdb sp!,{r0-r2,r4-r11,lr} .LChaCha20_neon: adr r14,.Lsigma vstmdb sp!,{d8-d15} @ ABI spec says so stmdb sp!,{r0-r3} vld1.32 {$b0-$c0},[r3] @ load key ldmia r3,{r4-r11} @ load key sub sp,sp,#4*(16+16) vld1.32 {$d0},[r12] @ load counter and nonce add r12,sp,#4*8 ldmia r14,{r0-r3} @ load sigma vld1.32 {$a0},[r14]! @ load sigma vld1.32 {$t0},[r14]! @ one @ vld1.32 {$t3#lo},[r14] @ rot8 vst1.32 {$c0-$d0},[r12] @ copy 1/2key|counter|nonce vst1.32 {$a0-$b0},[sp] @ copy sigma|1/2key str r10,[sp,#4*(16+10)] @ off-load "@x[10]" str r11,[sp,#4*(16+11)] @ off-load "@x[11]" vshl.i32 $t1#lo,$t0#lo,#1 @ two vstr $t0#lo,[sp,#4*(16+0)] vshl.i32 $t2#lo,$t0#lo,#2 @ four vstr $t1#lo,[sp,#4*(16+2)] vmov $a1,$a0 vstr $t2#lo,[sp,#4*(16+4)] vmov $a2,$a0 @ vstr $t3#lo,[sp,#4*(16+6)] vmov $b1,$b0 vmov $b2,$b0 b .Loop_neon_enter .align 4 .Loop_neon_outer: ldmia sp,{r0-r9} @ load key material cmp @t[3],#64*2 @ if len<=64*2 bls .Lbreak_neon @ switch to integer-only @ vldr $t3#lo,[sp,#4*(16+6)] @ rot8 vmov $a1,$a0 str @t[3],[sp,#4*(32+2)] @ save len vmov $a2,$a0 str r12, [sp,#4*(32+1)] @ save inp vmov $b1,$b0 str r14, [sp,#4*(32+0)] @ save out vmov $b2,$b0 .Loop_neon_enter: ldr @t[3], [sp,#4*(15)] mov @x[4],@x[4],ror#19 @ twist b[0..3] vadd.i32 $d1,$d0,$t0 @ counter+1 ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load mov @x[5],@x[5],ror#19 vmov $c1,$c0 ldr @t[2], [sp,#4*(13)] mov @x[6],@x[6],ror#19 vmov $c2,$c0 ldr @x[14],[sp,#4*(14)] mov @x[7],@x[7],ror#19 vadd.i32 $d2,$d1,$t0 @ counter+2 add @x[12],@x[12],#3 @ counter+3 mov @t[3],@t[3],ror#8 @ twist d[0..3] mov @x[12],@x[12],ror#8 mov @t[2],@t[2],ror#8 mov @x[14],@x[14],ror#8 str @t[3], [sp,#4*(16+15)] mov @t[3],#10 b .Loop_neon .align 4 .Loop_neon: subs @t[3],@t[3],#1 ___ my @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,0); my @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,0); my @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,0); my @thread3=&ROUND(0,4,8,12); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } @thread0=&NEONROUND($a0,$b0,$c0,$d0,$t0,1); @thread1=&NEONROUND($a1,$b1,$c1,$d1,$t1,1); @thread2=&NEONROUND($a2,$b2,$c2,$d2,$t2,1); @thread3=&ROUND(0,5,10,15); foreach (@thread0) { eval; eval(shift(@thread3)); eval(shift(@thread1)); eval(shift(@thread3)); eval(shift(@thread2)); eval(shift(@thread3)); } $code.=<<___; bne .Loop_neon add @t[3],sp,#32 vld1.32 {$t0-$t1},[sp] @ load key material vld1.32 {$t2-$t3},[@t[3]] ldr @t[3],[sp,#4*(32+2)] @ load len str @t[0], [sp,#4*(16+8)] @ modulo-scheduled store str @t[1], [sp,#4*(16+9)] str @x[12],[sp,#4*(16+12)] str @t[2], [sp,#4*(16+13)] str @x[14],[sp,#4*(16+14)] @ at this point we have first half of 512-bit result in @ @x[0-7] and second half at sp+4*(16+8) ldr r12,[sp,#4*(32+1)] @ load inp ldr r14,[sp,#4*(32+0)] @ load out vadd.i32 $a0,$a0,$t0 @ accumulate key material vadd.i32 $a1,$a1,$t0 vadd.i32 $a2,$a2,$t0 vldr $t0#lo,[sp,#4*(16+0)] @ one vadd.i32 $b0,$b0,$t1 vadd.i32 $b1,$b1,$t1 vadd.i32 $b2,$b2,$t1 vldr $t1#lo,[sp,#4*(16+2)] @ two vadd.i32 $c0,$c0,$t2 vadd.i32 $c1,$c1,$t2 vadd.i32 $c2,$c2,$t2 vadd.i32 $d1#lo,$d1#lo,$t0#lo @ counter+1 vadd.i32 $d2#lo,$d2#lo,$t1#lo @ counter+2 vadd.i32 $d0,$d0,$t3 vadd.i32 $d1,$d1,$t3 vadd.i32 $d2,$d2,$t3 cmp @t[3],#64*4 blo .Ltail_neon vld1.8 {$t0-$t1},[r12]! @ load input mov @t[3],sp vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 @ xor with input veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 vst1.8 {$a0-$b0},[r14]! @ store output veor $b1,$b1,$t1 vld1.8 {$t0-$t1},[r12]! veor $c1,$c1,$t2 vst1.8 {$c0-$d0},[r14]! veor $d1,$d1,$t3 vld1.8 {$t2-$t3},[r12]! veor $a2,$a2,$t0 vld1.32 {$a0-$b0},[@t[3]]! @ load for next iteration veor $t0#hi,$t0#hi,$t0#hi vldr $t0#lo,[sp,#4*(16+4)] @ four veor $b2,$b2,$t1 vld1.32 {$c0-$d0},[@t[3]] veor $c2,$c2,$t2 vst1.8 {$a1-$b1},[r14]! veor $d2,$d2,$t3 vst1.8 {$c1-$d1},[r14]! vadd.i32 $d0#lo,$d0#lo,$t0#lo @ next counter value vldr $t0#lo,[sp,#4*(16+0)] @ one ldmia sp,{@t[0]-@t[3]} @ load key material add @x[0],@x[0],@t[0] @ accumulate key material ldr @t[0],[r12],#16 @ load input vst1.8 {$a2-$b2},[r14]! add @x[1],@x[1],@t[1] ldr @t[1],[r12,#-12] vst1.8 {$c2-$d2},[r14]! add @x[2],@x[2],@t[2] ldr @t[2],[r12,#-8] add @x[3],@x[3],@t[3] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif eor @x[0],@x[0],@t[0] @ xor with input add @t[0],sp,#4*(4) eor @x[1],@x[1],@t[1] str @x[0],[r14],#16 @ store output eor @x[2],@x[2],@t[2] str @x[1],[r14,#-12] eor @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@t[0],@x[4],ror#13 @ accumulate key material ldr @t[0],[r12],#16 @ load input add @x[5],@t[1],@x[5],ror#13 ldr @t[1],[r12,#-12] add @x[6],@t[2],@x[6],ror#13 ldr @t[2],[r12,#-8] add @x[7],@t[3],@x[7],ror#13 ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif eor @x[4],@x[4],@t[0] add @t[0],sp,#4*(8) eor @x[5],@x[5],@t[1] str @x[4],[r14],#16 @ store output eor @x[6],@x[6],@t[2] str @x[5],[r14,#-12] eor @x[7],@x[7],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[6],[r14,#-8] add @x[0],sp,#4*(16+8) str @x[7],[r14,#-4] ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material ldr @t[0],[r12],#16 @ load input add @x[1],@x[1],@t[1] ldr @t[1],[r12,#-12] # ifdef __thumb2__ it hi # endif strhi @t[2],[sp,#4*(16+10)] @ copy "@x[10]" while at it add @x[2],@x[2],@t[2] ldr @t[2],[r12,#-8] # ifdef __thumb2__ it hi # endif strhi @t[3],[sp,#4*(16+11)] @ copy "@x[11]" while at it add @x[3],@x[3],@t[3] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] # endif eor @x[0],@x[0],@t[0] add @t[0],sp,#4*(12) eor @x[1],@x[1],@t[1] str @x[0],[r14],#16 @ store output eor @x[2],@x[2],@t[2] str @x[1],[r14,#-12] eor @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material str @x[2],[r14,#-8] str @x[3],[r14,#-4] add @x[4],@t[0],@x[4],ror#24 @ accumulate key material add @t[0],@t[0],#4 @ next counter value add @x[5],@t[1],@x[5],ror#24 str @t[0],[sp,#4*(12)] @ save next counter value ldr @t[0],[r12],#16 @ load input add @x[6],@t[2],@x[6],ror#24 add @x[4],@x[4],#3 @ counter+3 ldr @t[1],[r12,#-12] add @x[7],@t[3],@x[7],ror#24 ldr @t[2],[r12,#-8] ldr @t[3],[r12,#-4] # ifdef __ARMEB__ rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif eor @x[4],@x[4],@t[0] # ifdef __thumb2__ it hi # endif ldrhi @t[0],[sp,#4*(32+2)] @ re-load len eor @x[5],@x[5],@t[1] eor @x[6],@x[6],@t[2] str @x[4],[r14],#16 @ store output eor @x[7],@x[7],@t[3] str @x[5],[r14,#-12] sub @t[3],@t[0],#64*4 @ len-=64*4 str @x[6],[r14,#-8] str @x[7],[r14,#-4] bhi .Loop_neon_outer b .Ldone_neon .align 4 .Lbreak_neon: @ harmonize NEON and integer-only stack frames: load data @ from NEON frame, but save to integer-only one; distance @ between the two is 4*(32+4+16-32)=4*(20). str @t[3], [sp,#4*(20+32+2)] @ save len add @t[3],sp,#4*(32+4) str r12, [sp,#4*(20+32+1)] @ save inp str r14, [sp,#4*(20+32+0)] @ save out ldr @x[12],[sp,#4*(16+10)] ldr @x[14],[sp,#4*(16+11)] vldmia @t[3],{d8-d15} @ fulfill ABI requirement str @x[12],[sp,#4*(20+16+10)] @ copy "@x[10]" str @x[14],[sp,#4*(20+16+11)] @ copy "@x[11]" ldr @t[3], [sp,#4*(15)] mov @x[4],@x[4],ror#19 @ twist b[0..3] ldr @x[12],[sp,#4*(12)] @ modulo-scheduled load mov @x[5],@x[5],ror#19 ldr @t[2], [sp,#4*(13)] mov @x[6],@x[6],ror#19 ldr @x[14],[sp,#4*(14)] mov @x[7],@x[7],ror#19 mov @t[3],@t[3],ror#8 @ twist d[0..3] mov @x[12],@x[12],ror#8 mov @t[2],@t[2],ror#8 mov @x[14],@x[14],ror#8 str @t[3], [sp,#4*(20+16+15)] add @t[3],sp,#4*(20) vst1.32 {$a0-$b0},[@t[3]]! @ copy key add sp,sp,#4*(20) @ switch frame vst1.32 {$c0-$d0},[@t[3]] mov @t[3],#10 b .Loop @ go integer-only .align 4 .Ltail_neon: cmp @t[3],#64*3 bhs .L192_or_more_neon cmp @t[3],#64*2 bhs .L128_or_more_neon cmp @t[3],#64*1 bhs .L64_or_more_neon add @t[0],sp,#4*(8) vst1.8 {$a0-$b0},[sp] add @t[2],sp,#4*(0) vst1.8 {$c0-$d0},[@t[0]] b .Loop_tail_neon .align 4 .L64_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 veor $c0,$c0,$t2 veor $d0,$d0,$t3 vst1.8 {$a0-$b0},[r14]! vst1.8 {$c0-$d0},[r14]! beq .Ldone_neon add @t[0],sp,#4*(8) vst1.8 {$a1-$b1},[sp] add @t[2],sp,#4*(0) vst1.8 {$c1-$d1},[@t[0]] sub @t[3],@t[3],#64*1 @ len-=64*1 b .Loop_tail_neon .align 4 .L128_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 veor $b1,$b1,$t1 vst1.8 {$a0-$b0},[r14]! veor $c1,$c1,$t2 vst1.8 {$c0-$d0},[r14]! veor $d1,$d1,$t3 vst1.8 {$a1-$b1},[r14]! vst1.8 {$c1-$d1},[r14]! beq .Ldone_neon add @t[0],sp,#4*(8) vst1.8 {$a2-$b2},[sp] add @t[2],sp,#4*(0) vst1.8 {$c2-$d2},[@t[0]] sub @t[3],@t[3],#64*2 @ len-=64*2 b .Loop_tail_neon .align 4 .L192_or_more_neon: vld1.8 {$t0-$t1},[r12]! vld1.8 {$t2-$t3},[r12]! veor $a0,$a0,$t0 veor $b0,$b0,$t1 vld1.8 {$t0-$t1},[r12]! veor $c0,$c0,$t2 veor $d0,$d0,$t3 vld1.8 {$t2-$t3},[r12]! veor $a1,$a1,$t0 veor $b1,$b1,$t1 vld1.8 {$t0-$t1},[r12]! veor $c1,$c1,$t2 vst1.8 {$a0-$b0},[r14]! veor $d1,$d1,$t3 vld1.8 {$t2-$t3},[r12]! veor $a2,$a2,$t0 vst1.8 {$c0-$d0},[r14]! veor $b2,$b2,$t1 vst1.8 {$a1-$b1},[r14]! veor $c2,$c2,$t2 vst1.8 {$c1-$d1},[r14]! veor $d2,$d2,$t3 vst1.8 {$a2-$b2},[r14]! vst1.8 {$c2-$d2},[r14]! beq .Ldone_neon ldmia sp,{@t[0]-@t[3]} @ load key material add @x[0],@x[0],@t[0] @ accumulate key material add @t[0],sp,#4*(4) add @x[1],@x[1],@t[1] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material add @x[4],@t[0],@x[4],ror#13 @ accumulate key material add @t[0],sp,#4*(8) add @x[5],@t[1],@x[5],ror#13 add @x[6],@t[2],@x[6],ror#13 add @x[7],@t[3],@x[7],ror#13 ldmia @t[0],{@t[0]-@t[3]} @ load key material # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif stmia sp,{@x[0]-@x[7]} add @x[0],sp,#4*(16+8) ldmia @x[0],{@x[0]-@x[7]} @ load second half add @x[0],@x[0],@t[0] @ accumulate key material add @t[0],sp,#4*(12) add @x[1],@x[1],@t[1] add @x[2],@x[2],@t[2] add @x[3],@x[3],@t[3] ldmia @t[0],{@t[0]-@t[3]} @ load key material add @x[4],@t[0],@x[4],ror#24 @ accumulate key material add @t[0],sp,#4*(8) add @x[5],@t[1],@x[5],ror#24 add @x[4],@x[4],#3 @ counter+3 add @x[6],@t[2],@x[6],ror#24 add @x[7],@t[3],@x[7],ror#24 ldr @t[3],[sp,#4*(32+2)] @ re-load len # ifdef __ARMEB__ rev @x[0],@x[0] rev @x[1],@x[1] rev @x[2],@x[2] rev @x[3],@x[3] rev @x[4],@x[4] rev @x[5],@x[5] rev @x[6],@x[6] rev @x[7],@x[7] # endif stmia @t[0],{@x[0]-@x[7]} add @t[2],sp,#4*(0) sub @t[3],@t[3],#64*3 @ len-=64*3 .Loop_tail_neon: ldrb @t[0],[@t[2]],#1 @ read buffer on stack ldrb @t[1],[r12],#1 @ read input subs @t[3],@t[3],#1 eor @t[0],@t[0],@t[1] strb @t[0],[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: add sp,sp,#4*(32+4) vldmia sp,{d8-d15} add sp,sp,#4*(16+3) ldmia sp!,{r4-r11,pc} .size ChaCha20_neon,.-ChaCha20_neon # ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 # endif #endif ___ }}} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; print $_,"\n"; } close STDOUT;