#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv4.
#
# June 2017.
#
# Non-NEON code is KECCAK_1X variant (see sha/keccak1600.c) with bit
# interleaving. How does it compare to Keccak Code Package? It's as
# fast, but several times smaller, and is endian- and ISA-neutral. ISA
# neutrality means that minimum ISA requirement is ARMv4, yet it can
# be assembled even as Thumb-2. NEON code path is KECCAK_1X_ALT with
# register layout taken from Keccak Code Package. It's also as fast,
# in fact faster by 10-15% on some processors, and endian-neutral.
#
# August 2017.
#
# Switch to KECCAK_2X variant for non-NEON code and merge almost 1/2
# of rotate instructions with logical ones. This resulted in ~10%
# improvement on most processors. Switch to KECCAK_2X effectively
# minimizes re-loads from temporary storage, and merged rotates just
# eliminate corresponding instructions. As for the latter: when
# examining the code you'll notice commented-out ror instructions.
# These are the eliminated ones, and you should trace the destination
# register below to see what's going on. Just in case, here is why not
# all rotates are eliminated. The trouble is that you have operations
# that require both inputs to be rotated, e.g. 'eor a,b>>>x,c>>>y'.
# This conundrum is resolved by using 'eor a,b,c>>>(x-y)' and then
# merge-rotating 'a' in the next operation that takes 'a' as input.
# And the thing is that this next operation can be in the next round.
# It's totally possible to "carry" rotate "factors" to the next round,
# but it makes the code more complex. And the last word is the keyword,
# i.e. "almost 1/2" is kind of a complexity cap [for the time being]...
#
# Reduce per-round instruction count in Thumb-2 case by 16%. This is
# achieved by folding ldr/str pairs to their double-word counterparts.
# Theoretically this should have improved performance on single-issue
# cores, such as Cortex-A5/A7, by 19%. Reality is a bit different, as
# usual...
#
########################################################################
# Numbers are cycles per processed byte. Non-NEON results account even
# for input bit interleaving.
#
#		r=1088(*)   Thumb-2(**) NEON
#
# ARM11xx	82/+150%
# Cortex-A5	88/+160%,   86,         36
# Cortex-A7	78/+160%,   68,         34
# Cortex-A8	51/+230%,   57,         30
# Cortex-A9	53/+210%,   51,         26
# Cortex-A15	42/+160%,   38,         18
# Snapdragon S4	43/+210%,   38,         24
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	over compiler-generated KECCAK_2X reference code.
# (**)	Thumb-2 results for Cortex-A5/A7 are likely to apply even to
#	Cortex-Mx, x>=3. Otherwise, non-NEON results for NEON-capable
#	processors are presented mostly for reference purposes.

# Command line is "[flavour] [output]". If the first argument looks like
# a file name (word characters followed by an extension) it is taken as
# the output file and there is no flavour; otherwise it is the flavour
# and the remaining arguments are scanned for the first file-like one.
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Directory part of this script's own path (with trailing separator),
    # or an empty string when $0 has no directory component. The match is
    # tested explicitly so that $1 is never read after a failed match.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Pipe everything printed to STDOUT through the translator. Fail
    # loudly instead of silently discarding the generated code.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} elsif (defined($output) && $output ne "") {
    # "void" or no flavour: write the untranslated code to the file.
    open STDOUT,">",$output or die "can't open $output: $!";
}
# With no output file at all STDOUT is left untouched; re-opening it on
# an empty name would fail and leave the handle closed.

# Register allocation used throughout the integer code path: @C names
# r0-r9 and @E names r10-r12 and r14.
my @C = map("r$_",(0..9));
my @E = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
# +200->+-----------------------+
#       | uint64_t D[5]         |
#       | ...                   |
# +240->+-----------------------+
#       | uint64_t T[5][5]      |
#       | ...                   |
# +440->+-----------------------+
#       | saved lr              |
# +444->+-----------------------+
#       | loop counter          |
# +448->+-----------------------+
#       | ...
# Byte offsets, relative to sp, of the 64-bit lanes kept on the stack:
#   $A[i][j] = 8*(5*i+j)	(0..192)
#   $D[i]    = 8*(25+i)		(200..232)
#   $T[i][j] = 8*(30+5*i+j)	(240..432)
# They are interpolated into the assembly below as [sp,#...] operands;
# a "+4" suffix addresses the second 32-bit word of a lane.
my @A = map { my $row = $_; [ map { 8*($row+$_) } (0..4) ] } (0,5,10,15,20);
my @D = map { 8*$_ } (25..29);
my @T = map { my $row = $_; [ map { 8*($row+$_) } (0..4) ] } (30,35,40,45,50);

# Round-constant table and the entry points of the internal permutation.
# KeccakF1600_int preloads A[4][2..4] and the pointers to A[0][0]/A[1][0];
# KeccakF1600_enter saves lr at [sp,#440] and zeroes the counter at
# [sp,#444] before falling into the .Lround2x loop body.
$code.=<<___;
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	iotas32, %object
.align	5
iotas32:
	.long	0x00000001, 0x00000000
	.long	0x00000000, 0x00000089
	.long	0x00000000, 0x8000008b
	.long	0x00000000, 0x80008080
	.long	0x00000001, 0x0000008b
	.long	0x00000001, 0x00008000
	.long	0x00000001, 0x80008088
	.long	0x00000001, 0x80000082
	.long	0x00000000, 0x0000000b
	.long	0x00000000, 0x0000000a
	.long	0x00000001, 0x00008082
	.long	0x00000000, 0x00008003
	.long	0x00000001, 0x0000808b
	.long	0x00000001, 0x8000000b
	.long	0x00000001, 0x8000008a
	.long	0x00000001, 0x80000081
	.long	0x00000000, 0x80000081
	.long	0x00000000, 0x80000008
	.long	0x00000000, 0x00000083
	.long	0x00000000, 0x80008003
	.long	0x00000001, 0x80008088
	.long	0x00000000, 0x80000088
	.long	0x00000001, 0x00008000
	.long	0x00000000, 0x80008082
.size	iotas32,.-iotas32

.type	KeccakF1600_int, %function
.align	5
KeccakF1600_int:
	add	@C[9],sp,#$A[4][2]
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	ldmia	@C[9],{@C[4]-@C[9]}		@ A[4][2..4]
KeccakF1600_enter:
	str	lr,[sp,#440]
	eor	@E[1],@E[1],@E[1]
	str	@E[1],[sp,#444]
	b	.Lround2x

.align	4
.Lround2x:
___

# Round(@source_rows, @result_rows) appends the code for one round to
# $code. It takes ten array references: the first five are the rows of
# lane offsets the round reads from (bound to the lexical @A below, which
# shadows the file-level @A), the last five are the rows it stores to
# (@R). Only the round whose source is the file-level @T buffer advances
# the byte counter at [sp,#444] by 16 and compares it with 192; those
# flags are what the "blo .Lround2x" emitted after the second call tests.
sub Round {
    my @A = @_[0..4];
    my @R = @_[5..$#_];

    # Column parities, D[] values, and the first lanes of row 0.
    $code.=<<___;
	ldmia	@E[2],{@C[0]-@C[3]}		@ A[0][0..1]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][0..1]
#ifdef	__thumb2__
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[1][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[1][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[2][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[2][4]]
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][0]]
	eor	@C[9],@C[9],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][1]]
	eor	@C[0],@C[0],@E[0]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][2]]
	eor	@C[3],@C[3],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[3][3]]
	eor	@C[4],@C[4],@E[0]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[3][4]]
	eor	@C[7],@C[7],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[4][0]]
	eor	@C[8],@C[8],@E[0]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[4][1]]
	eor	@C[1],@C[1],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][2]]
	eor	@C[2],@C[2],@E[0]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	ldrd	@E[0],@E[1],[sp,#$A[0][3]]
	eor	@C[5],@C[5],@E[3]
	ldrd	@E[2],@E[3],[sp,#$A[0][4]]
#else
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[1][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[1][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[1][4]..A[2][0]
	eor	@C[8],@C[8],@E[0]
	add	@E[0],sp,#$A[2][1]
	eor	@C[9],@C[9],@E[1]
	eor	@C[0],@C[0],@E[2]
	eor	@C[1],@C[1],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][1..2]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[2][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[2][3..4]
	eor	@C[6],@C[6],@E[0]
	add	@E[0],sp,#$A[3][0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][0..1]
	eor	@C[0],@C[0],@E[0]
	add	@E[0],sp,#$A[3][2]
	eor	@C[1],@C[1],@E[1]
	eor	@C[2],@C[2],@E[2]
	eor	@C[3],@C[3],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][2..3]
	eor	@C[4],@C[4],@E[0]
	add	@E[0],sp,#$A[3][4]
	eor	@C[5],@C[5],@E[1]
	eor	@C[6],@C[6],@E[2]
	eor	@C[7],@C[7],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[3][4]..A[4][0]
	eor	@C[8],@C[8],@E[0]
	ldr	@E[0],[sp,#$A[4][1]]		@ A[4][1]
	eor	@C[9],@C[9],@E[1]
	ldr	@E[1],[sp,#$A[4][1]+4]
	eor	@C[0],@C[0],@E[2]
	ldr	@E[2],[sp,#$A[0][2]]		@ A[0][2]
	eor	@C[1],@C[1],@E[3]
	ldr	@E[3],[sp,#$A[0][2]+4]
	eor	@C[2],@C[2],@E[0]
	add	@E[0],sp,#$A[0][3]
	eor	@C[3],@C[3],@E[1]
	eor	@C[4],@C[4],@E[2]
	eor	@C[5],@C[5],@E[3]
	ldmia	@E[0],{@E[0]-@E[2],@E[3]}	@ A[0][3..4]
#endif
	eor	@C[6],@C[6],@E[0]
	eor	@C[7],@C[7],@E[1]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]

	eor	@E[0],@C[0],@C[5],ror#32-1	@ E[0] = ROL64(C[2], 1) ^ C[0];
	str.l	@E[0],[sp,#$D[1]]		@ D[1] = E[0]
	eor	@E[1],@C[1],@C[4]
	str.h	@E[1],[sp,#$D[1]+4]
	eor	@E[2],@C[6],@C[1],ror#32-1	@ E[1] = ROL64(C[0], 1) ^ C[3];
	eor	@E[3],@C[7],@C[0]
	str.l	@E[2],[sp,#$D[4]]		@ D[4] = E[1]
	eor	@C[0],@C[8],@C[3],ror#32-1	@ C[0] = ROL64(C[1], 1) ^ C[4];
	str.h	@E[3],[sp,#$D[4]+4]
	eor	@C[1],@C[9],@C[2]
	str.l	@C[0],[sp,#$D[0]]		@ D[0] = C[0]
	eor	@C[2],@C[2],@C[7],ror#32-1	@ C[1] = ROL64(C[3], 1) ^ C[1];
	ldr.l	@C[7],[sp,#$A[3][3]]
	eor	@C[3],@C[3],@C[6]
	str.h	@C[1],[sp,#$D[0]+4]
	ldr.h	@C[6],[sp,#$A[3][3]+4]
	str.l	@C[2],[sp,#$D[2]]		@ D[2] = C[1]
	eor	@C[4],@C[4],@C[9],ror#32-1	@ C[2] = ROL64(C[4], 1) ^ C[2];
	str.h	@C[3],[sp,#$D[2]+4]
	eor	@C[5],@C[5],@C[8]

	ldr.l	@C[8],[sp,#$A[4][4]]
	ldr.h	@C[9],[sp,#$A[4][4]+4]
	str.l	@C[4],[sp,#$D[3]]		@ D[3] = C[2]
	eor	@C[7],@C[7],@C[4]
	str.h	@C[5],[sp,#$D[3]+4]
	eor	@C[6],@C[6],@C[5]
	ldr.l	@C[4],[sp,#$A[0][0]]
	@ ror	@C[7],@C[7],#32-10		@ C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
	@ ror	@C[6],@C[6],#32-11
	ldr.h	@C[5],[sp,#$A[0][0]+4]
	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	ldr.l	@E[2],[sp,#$A[2][2]]
	eor	@C[0],@C[0],@C[4]
	ldr.h	@E[3],[sp,#$A[2][2]+4]
	@ ror	@C[8],@C[8],#32-7		@ C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
	@ ror	@C[9],@C[9],#32-7
	eor	@C[1],@C[1],@C[5]		@ C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
	eor	@E[2],@E[2],@C[2]
	ldr.l	@C[2],[sp,#$A[1][1]]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@C[3],[sp,#$A[1][1]+4]
	ror	@C[5],@E[2],#32-21		@ C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
	ldr	@E[2],[sp,#444]			@ load counter
	eor	@C[2],@C[2],@E[0]
	adr	@E[0],iotas32
	ror	@C[4],@E[3],#32-22
	add	@E[3],@E[0],@E[2]
	eor	@C[3],@C[3],@E[1]
___

    # Fetch this round's constant. A round that reads from the T buffer
    # is the second of a pair: it uses the next table entry (+8/+12) and
    # is the one that advances and tests the counter.
    if ($A[0][0] != $T[0][0]) {
	$code.=<<___;
	ldmia	@E[3],{@E[0],@E[1]}		@ iotas[i]
___
    } else {
	$code.=<<___;
	ldr.l	@E[0],[@E[3],#8]		@ iotas[i].lo
	add	@E[2],@E[2],#16
	ldr.h	@E[1],[@E[3],#12]		@ iotas[i].hi
	cmp	@E[2],#192
	str	@E[2],[sp,#444]			@ store counter
___
    }

    # Remaining work of the round: result rows R[0] through R[4].
    $code.=<<___;
	bic	@E[2],@C[4],@C[2],ror#32-22
	bic	@E[3],@C[5],@C[3],ror#32-22
	ror	@C[2],@C[2],#32-22		@ C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
	ror	@C[3],@C[3],#32-22
	eor	@E[2],@E[2],@C[0]
	eor	@E[3],@E[3],@C[1]
	eor	@E[0],@E[0],@E[2]
	eor	@E[1],@E[1],@E[3]
	str.l	@E[0],[sp,#$R[0][0]]		@ R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
	bic	@E[2],@C[6],@C[4],ror#11
	str.h	@E[1],[sp,#$R[0][0]+4]
	bic	@E[3],@C[7],@C[5],ror#10
	bic	@E[0],@C[8],@C[6],ror#32-(11-7)
	bic	@E[1],@C[9],@C[7],ror#32-(10-7)
	eor	@E[2],@C[2],@E[2],ror#32-11
	str.l	@E[2],[sp,#$R[0][1]]		@ R[0][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@C[3],@E[3],ror#32-10
	str.h	@E[3],[sp,#$R[0][1]+4]
	eor	@E[0],@C[4],@E[0],ror#32-7
	eor	@E[1],@C[5],@E[1],ror#32-7
	str.l	@E[0],[sp,#$R[0][2]]		@ R[0][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8],ror#32-7
	str.h	@E[1],[sp,#$R[0][2]+4]
	bic	@E[3],@C[1],@C[9],ror#32-7
	eor	@E[2],@E[2],@C[6],ror#32-11
	str.l	@E[2],[sp,#$R[0][3]]		@ R[0][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@E[3],@C[7],ror#32-10
	str.h	@E[3],[sp,#$R[0][3]+4]
	bic	@E[0],@C[2],@C[0]
	add	@E[3],sp,#$D[3]
	ldr.l	@C[0],[sp,#$A[0][3]]		@ A[0][3]
	bic	@E[1],@C[3],@C[1]
	ldr.h	@C[1],[sp,#$A[0][3]+4]
	eor	@E[0],@E[0],@C[8],ror#32-7
	eor	@E[1],@E[1],@C[9],ror#32-7
	str.l	@E[0],[sp,#$R[0][4]]		@ R[0][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[0]
	str.h	@E[1],[sp,#$R[0][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[3..4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[0..1]

	ldr.l	@C[2],[sp,#$A[1][4]]		@ A[1][4]
	eor	@C[0],@C[0],@E[0]
	ldr.h	@C[3],[sp,#$A[1][4]+4]
	eor	@C[1],@C[1],@E[1]
	@ ror	@C[0],@C[0],#32-14		@ C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
	ldr.l	@E[0],[sp,#$A[3][1]]		@ A[3][1]
	@ ror	@C[1],@C[1],#32-14
	ldr.h	@E[1],[sp,#$A[3][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@C[4],[sp,#$A[2][0]]		@ A[2][0]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@C[5],[sp,#$A[2][0]+4]
	@ ror	@C[2],@C[2],#32-10		@ C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
	@ ror	@C[3],@C[3],#32-10

	eor	@C[6],@C[6],@C[4]
	ldr.l	@E[2],[sp,#$D[2]]		@ D[2]
	eor	@C[7],@C[7],@C[5]
	ldr.h	@E[3],[sp,#$D[2]+4]
	ror	@C[5],@C[6],#32-1		@ C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
	ror	@C[4],@C[7],#32-2

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][2]]		@ A[4][2]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][2]+4]
	ror	@C[7],@E[0],#32-22		@ C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
	ror	@C[6],@E[1],#32-23

	bic	@E[0],@C[4],@C[2],ror#32-10
	bic	@E[1],@C[5],@C[3],ror#32-10
	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[9],@E[2],#32-30		@ C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
	ror	@C[8],@E[3],#32-31
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-14
	str.l	@E[0],[sp,#$R[1][0]]		@ R[1][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[1][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-10
	str.l	@E[2],[sp,#$R[1][1]]		@ R[1][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-10
	str.h	@E[3],[sp,#$R[1][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#14
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[1][2]]		@ R[1][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#32-(14-10)
	str.h	@E[1],[sp,#$R[1][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#32-(14-10)
	str.l	@E[2],[sp,#$R[1][3]]		@ R[1][3] = C[3] ^ (~C[4] & C[0]);
	eor	@E[3],@C[7],@E[3],ror#32-14
	str.h	@E[3],[sp,#$R[1][3]+4]
	add	@E[2],sp,#$D[1]
	ldr.l	@C[1],[sp,#$A[0][1]]		@ A[0][1]
	eor	@E[0],@C[8],@C[2],ror#32-10
	ldr.h	@C[0],[sp,#$A[0][1]+4]
	eor	@E[1],@C[9],@E[1],ror#32-10
	str.l	@E[0],[sp,#$R[1][4]]		@ R[1][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[1][4]+4]

	add	@C[9],sp,#$D[3]
	ldmia	@E[2],{@E[0]-@E[2],@E[3]}	@ D[1..2]
	ldr.l	@C[2],[sp,#$A[1][2]]		@ A[1][2]
	ldr.h	@C[3],[sp,#$A[1][2]+4]
	ldmia	@C[9],{@C[6]-@C[9]}		@ D[3..4]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[4],[sp,#$A[2][3]]		@ A[2][3]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[5],[sp,#$A[2][3]+4]
	ror	@C[0],@C[0],#32-1		@ C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][4]]		@ A[3][4]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][4]+4]
	@ ror	@C[2],@C[2],#32-3		@ C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	@ ror	@C[3],@C[3],#32-3
	ldr.h	@E[3],[sp,#$D[0]+4]

	eor	@C[4],@C[4],@C[6]
	eor	@C[5],@C[5],@C[7]
	@ ror	@C[5],@C[6],#32-12		@ C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
	@ ror	@C[4],@C[7],#32-13		@ [track reverse order below]

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][0]]		@ A[4][0]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][0]+4]
	ror	@C[6],@E[0],#32-4		@ C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
	ror	@C[7],@E[1],#32-4

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-9		@ C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
	ror	@C[9],@E[3],#32-9

	bic	@E[0],@C[5],@C[2],ror#13-3
	bic	@E[1],@C[4],@C[3],ror#12-3
	bic	@E[2],@C[6],@C[5],ror#32-13
	bic	@E[3],@C[7],@C[4],ror#32-12
	eor	@E[0],@C[0],@E[0],ror#32-13
	eor	@E[1],@C[1],@E[1],ror#32-12
	str.l	@E[0],[sp,#$R[2][0]]		@ R[2][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[2],@E[2],@C[2],ror#32-3
	str.h	@E[1],[sp,#$R[2][0]+4]
	eor	@E[3],@E[3],@C[3],ror#32-3
	str.l	@E[2],[sp,#$R[2][1]]		@ R[2][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	str.h	@E[3],[sp,#$R[2][1]+4]
	eor	@E[0],@E[0],@C[5],ror#32-13
	eor	@E[1],@E[1],@C[4],ror#32-12
	str.l	@E[0],[sp,#$R[2][2]]		@ R[2][2] = C[2] ^ (~C[3] & C[4]);
	bic	@E[2],@C[0],@C[8]
	str.h	@E[1],[sp,#$R[2][2]+4]
	bic	@E[3],@C[1],@C[9]
	eor	@E[2],@E[2],@C[6]
	eor	@E[3],@E[3],@C[7]
	str.l	@E[2],[sp,#$R[2][3]]		@ R[2][3] = C[3] ^ (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#3
	str.h	@E[3],[sp,#$R[2][3]+4]
	bic	@E[1],@C[3],@C[1],ror#3
	ldr.l	@C[1],[sp,#$A[0][4]]		@ A[0][4] [in reverse order]
	eor	@E[0],@C[8],@E[0],ror#32-3
	ldr.h	@C[0],[sp,#$A[0][4]+4]
	eor	@E[1],@C[9],@E[1],ror#32-3
	str.l	@E[0],[sp,#$R[2][4]]		@ R[2][4] = C[4] ^ (~C[0] & C[1]);
	add	@C[9],sp,#$D[1]
	str.h	@E[1],[sp,#$R[2][4]+4]

	ldr.l	@E[0],[sp,#$D[4]]		@ D[4]
	ldr.h	@E[1],[sp,#$D[4]+4]
	ldr.l	@E[2],[sp,#$D[0]]		@ D[0]
	ldr.h	@E[3],[sp,#$D[0]+4]

	ldmia	@C[9],{@C[6]-@C[9]}		@ D[1..2]

	eor	@C[1],@C[1],@E[0]
	ldr.l	@C[2],[sp,#$A[1][0]]		@ A[1][0]
	eor	@C[0],@C[0],@E[1]
	ldr.h	@C[3],[sp,#$A[1][0]+4]
	@ ror	@C[1],@E[0],#32-13		@ C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
	ldr.l	@C[4],[sp,#$A[2][1]]		@ A[2][1]
	@ ror	@C[0],@E[1],#32-14		@ [was loaded in reverse order]
	ldr.h	@C[5],[sp,#$A[2][1]+4]

	eor	@C[2],@C[2],@E[2]
	ldr.l	@E[0],[sp,#$A[3][2]]		@ A[3][2]
	eor	@C[3],@C[3],@E[3]
	ldr.h	@E[1],[sp,#$A[3][2]+4]
	@ ror	@C[2],@C[2],#32-18		@ C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
	ldr.l	@E[2],[sp,#$D[3]]		@ D[3]
	@ ror	@C[3],@C[3],#32-18
	ldr.h	@E[3],[sp,#$D[3]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[4],@C[6],#32-5		@ C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
	ror	@C[5],@C[7],#32-5

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][3]]		@ A[4][3]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][3]+4]
	ror	@C[7],@E[0],#32-7		@ C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
	ror	@C[6],@E[1],#32-8

	eor	@E[2],@E[2],@C[8]
	eor	@E[3],@E[3],@C[9]
	ror	@C[8],@E[2],#32-28		@ C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
	ror	@C[9],@E[3],#32-28

	bic	@E[0],@C[4],@C[2],ror#32-18
	bic	@E[1],@C[5],@C[3],ror#32-18
	eor	@E[0],@E[0],@C[0],ror#32-14
	eor	@E[1],@E[1],@C[1],ror#32-13
	str.l	@E[0],[sp,#$R[3][0]]		@ R[3][0] = C[0] ^ (~C[1] & C[2])
	bic	@E[2],@C[6],@C[4]
	str.h	@E[1],[sp,#$R[3][0]+4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2],ror#32-18
	str.l	@E[2],[sp,#$R[3][1]]		@ R[3][1] = C[1] ^ (~C[2] & C[3]);
	eor	@E[3],@E[3],@C[3],ror#32-18
	str.h	@E[3],[sp,#$R[3][1]+4]
	bic	@E[0],@C[8],@C[6]
	bic	@E[1],@C[9],@C[7]
	bic	@E[2],@C[0],@C[8],ror#14
	bic	@E[3],@C[1],@C[9],ror#13
	eor	@E[0],@E[0],@C[4]
	eor	@E[1],@E[1],@C[5]
	str.l	@E[0],[sp,#$R[3][2]]		@ R[3][2] = C[2] ^ (~C[3] & C[4]);
	bic	@C[2],@C[2],@C[0],ror#18-14
	str.h	@E[1],[sp,#$R[3][2]+4]
	eor	@E[2],@C[6],@E[2],ror#32-14
	bic	@E[1],@C[3],@C[1],ror#18-13
	eor	@E[3],@C[7],@E[3],ror#32-13
	str.l	@E[2],[sp,#$R[3][3]]		@ R[3][3] = C[3] ^ (~C[4] & C[0]);
	str.h	@E[3],[sp,#$R[3][3]+4]
	add	@E[3],sp,#$D[2]
	ldr.l	@C[0],[sp,#$A[0][2]]		@ A[0][2]
	eor	@E[0],@C[8],@C[2],ror#32-18
	ldr.h	@C[1],[sp,#$A[0][2]+4]
	eor	@E[1],@C[9],@E[1],ror#32-18
	str.l	@E[0],[sp,#$R[3][4]]		@ R[3][4] = C[4] ^ (~C[0] & C[1]);
	str.h	@E[1],[sp,#$R[3][4]+4]

	ldmia	@E[3],{@E[0]-@E[2],@E[3]}	@ D[2..3]
	ldr.l	@C[2],[sp,#$A[1][3]]		@ A[1][3]
	ldr.h	@C[3],[sp,#$A[1][3]+4]
	ldr.l	@C[6],[sp,#$D[4]]		@ D[4]
	ldr.h	@C[7],[sp,#$D[4]+4]

	eor	@C[0],@C[0],@E[0]
	ldr.l	@C[4],[sp,#$A[2][4]]		@ A[2][4]
	eor	@C[1],@C[1],@E[1]
	ldr.h	@C[5],[sp,#$A[2][4]+4]
	@ ror	@C[0],@C[0],#32-31		@ C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
	ldr.l	@C[8],[sp,#$D[0]]		@ D[0]
	@ ror	@C[1],@C[1],#32-31
	ldr.h	@C[9],[sp,#$D[0]+4]

	eor	@E[2],@E[2],@C[2]
	ldr.l	@E[0],[sp,#$A[3][0]]		@ A[3][0]
	eor	@E[3],@E[3],@C[3]
	ldr.h	@E[1],[sp,#$A[3][0]+4]
	ror	@C[3],@E[2],#32-27		@ C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
	ldr.l	@E[2],[sp,#$D[1]]		@ D[1]
	ror	@C[2],@E[3],#32-28
	ldr.h	@E[3],[sp,#$D[1]+4]

	eor	@C[6],@C[6],@C[4]
	eor	@C[7],@C[7],@C[5]
	ror	@C[5],@C[6],#32-19		@ C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
	ror	@C[4],@C[7],#32-20

	eor	@E[0],@E[0],@C[8]
	ldr.l	@C[8],[sp,#$A[4][1]]		@ A[4][1]
	eor	@E[1],@E[1],@C[9]
	ldr.h	@C[9],[sp,#$A[4][1]+4]
	ror	@C[7],@E[0],#32-20		@ C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
	ror	@C[6],@E[1],#32-21

	eor	@C[8],@C[8],@E[2]
	eor	@C[9],@C[9],@E[3]
	@ ror	@C[8],@C[2],#32-1		@ C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
	@ ror	@C[9],@C[3],#32-1

	bic	@E[0],@C[4],@C[2]
	bic	@E[1],@C[5],@C[3]
	eor	@E[0],@E[0],@C[0],ror#32-31
	str.l	@E[0],[sp,#$R[4][0]]		@ R[4][0] = C[0] ^ (~C[1] & C[2])
	eor	@E[1],@E[1],@C[1],ror#32-31
	str.h	@E[1],[sp,#$R[4][0]+4]
	bic	@E[2],@C[6],@C[4]
	bic	@E[3],@C[7],@C[5]
	eor	@E[2],@E[2],@C[2]
	eor	@E[3],@E[3],@C[3]
	str.l	@E[2],[sp,#$R[4][1]]		@ R[4][1] = C[1] ^ (~C[2] & C[3]);
	bic	@E[0],@C[8],@C[6],ror#1
	str.h	@E[3],[sp,#$R[4][1]+4]
	bic	@E[1],@C[9],@C[7],ror#1
	bic	@E[2],@C[0],@C[8],ror#31-1
	bic	@E[3],@C[1],@C[9],ror#31-1
	eor	@C[4],@C[4],@E[0],ror#32-1
	str.l	@C[4],[sp,#$R[4][2]]		@ R[4][2] = C[2] ^= (~C[3] & C[4]);
	eor	@C[5],@C[5],@E[1],ror#32-1
	str.h	@C[5],[sp,#$R[4][2]+4]
	eor	@C[6],@C[6],@E[2],ror#32-31
	eor	@C[7],@C[7],@E[3],ror#32-31
	str.l	@C[6],[sp,#$R[4][3]]		@ R[4][3] = C[3] ^= (~C[4] & C[0]);
	bic	@E[0],@C[2],@C[0],ror#32-31
	str.h	@C[7],[sp,#$R[4][3]+4]
	bic	@E[1],@C[3],@C[1],ror#32-31
	add	@E[2],sp,#$R[0][0]
	eor	@C[8],@E[0],@C[8],ror#32-1
	add	@E[0],sp,#$R[1][0]
	eor	@C[9],@E[1],@C[9],ror#32-1
	str.l	@C[8],[sp,#$R[4][4]]		@ R[4][4] = C[4] ^= (~C[0] & C[1]);
	str.h	@C[9],[sp,#$R[4][4]+4]
___
}

# Body of .Lround2x: one round reading A and writing T, then one round
# reading T and writing A.
Round(@A,@T);
Round(@T,@A);

# Loop tail and return of KeccakF1600_int, followed by KeccakF1600, which
# copies the caller's state onto the stack, runs the permutation through
# KeccakF1600_enter and copies the result back.
$code.=<<___;
	blo	.Lround2x

#if __ARM_ARCH__>=5
	ldr	pc,[sp,#440]
#else
	ldr	lr,[sp,#440]
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600, %function
.align	5
KeccakF1600:
	stmdb	sp!,{r0,r4-r11,lr}
	sub	sp,sp,#440+16			@ space for A[5][5],D[5],T[5][5],...

	add	@E[0],r0,#$A[1][0]
	add	@E[1],sp,#$A[1][0]
	ldmia	r0, {@C[0]-@C[9]}		@ copy A[5][5] to stack
	stmia	sp, {@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	add	@E[2],sp,#$A[0][0]
	add	@E[0],sp,#$A[1][0]
	stmia	@E[1], {@C[0]-@C[9]}

	bl	KeccakF1600_enter

	ldr	@E[1], [sp,#440+16]		@ restore pointer to A
	ldmia	sp, {@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}		@ return A[5][5]
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0]!,{@C[0]-@C[9]}
	stmia	@E[1]!,{@C[0]-@C[9]}
	ldmia	@E[0], {@C[0]-@C[9]}
	stmia	@E[1], {@C[0]-@C[9]}

	add	sp,sp,#440+20
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	KeccakF1600,.-KeccakF1600
___
{ my ($A_flat,$inp,$len,$bsz) = map("r$_",(10..12,14));

########################################################################
# Stack layout
# ----->+-----------------------+
#       | uint64_t A[5][5]      |
#       | ...                   |
#       | ...                   |
# +456->+-----------------------+
#       | 0x55555555            |
# +460->+-----------------------+
#       | 0x33333333            |
# +464->+-----------------------+
#       | 0x0f0f0f0f            |
# +468->+-----------------------+
#       | 0x00ff00ff            |
# +472->+-----------------------+
#       | uint64_t *A           |
# +476->+-----------------------+
#       | const void *inp       |
# +480->+-----------------------+
#       | size_t len            |
# +484->+-----------------------+
#       | size_t bs             |
# +488->+-----------------------+
#       | ....
$code.=<<___; .global SHA3_absorb .type SHA3_absorb,%function .align 5 SHA3_absorb: stmdb sp!,{r0-r12,lr} sub sp,sp,#456+16 add $A_flat,r0,#$A[1][0] @ mov $inp,r1 mov $len,r2 mov $bsz,r3 cmp r2,r3 blo .Labsorb_abort add $inp,sp,#0 ldmia r0, {@C[0]-@C[9]} @ copy A[5][5] to stack stmia $inp!, {@C[0]-@C[9]} ldmia $A_flat!,{@C[0]-@C[9]} stmia $inp!, {@C[0]-@C[9]} ldmia $A_flat!,{@C[0]-@C[9]} stmia $inp!, {@C[0]-@C[9]} ldmia $A_flat!,{@C[0]-@C[9]} stmia $inp!, {@C[0]-@C[9]} ldmia $A_flat!,{@C[0]-@C[9]} stmia $inp, {@C[0]-@C[9]} ldr $inp,[sp,#476] @ restore $inp #ifdef __thumb2__ mov r9,#0x00ff00ff mov r8,#0x0f0f0f0f mov r7,#0x33333333 mov r6,#0x55555555 #else mov r6,#0x11 @ compose constants mov r8,#0x0f mov r9,#0xff orr r6,r6,r6,lsl#8 orr r8,r8,r8,lsl#8 orr r6,r6,r6,lsl#16 @ 0x11111111 orr r9,r9,r9,lsl#16 @ 0x00ff00ff orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f orr r7,r6,r6,lsl#1 @ 0x33333333 orr r6,r6,r6,lsl#2 @ 0x55555555 #endif str r9,[sp,#468] str r8,[sp,#464] str r7,[sp,#460] str r6,[sp,#456] b .Loop_absorb .align 4 .Loop_absorb: subs r0,$len,$bsz blo .Labsorbed add $A_flat,sp,#0 str r0,[sp,#480] @ save len - bsz .align 4 .Loop_block: ldrb r0,[$inp],#1 ldrb r1,[$inp],#1 ldrb r2,[$inp],#1 ldrb r3,[$inp],#1 ldrb r4,[$inp],#1 orr r0,r0,r1,lsl#8 ldrb r1,[$inp],#1 orr r0,r0,r2,lsl#16 ldrb r2,[$inp],#1 orr r0,r0,r3,lsl#24 @ lo ldrb r3,[$inp],#1 orr r1,r4,r1,lsl#8 orr r1,r1,r2,lsl#16 orr r1,r1,r3,lsl#24 @ hi and r2,r0,r6 @ &=0x55555555 and r0,r0,r6,lsl#1 @ &=0xaaaaaaaa and r3,r1,r6 @ &=0x55555555 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa orr r2,r2,r2,lsr#1 orr r0,r0,r0,lsl#1 orr r3,r3,r3,lsr#1 orr r1,r1,r1,lsl#1 and r2,r2,r7 @ &=0x33333333 and r0,r0,r7,lsl#2 @ &=0xcccccccc and r3,r3,r7 @ &=0x33333333 and r1,r1,r7,lsl#2 @ &=0xcccccccc orr r2,r2,r2,lsr#2 orr r0,r0,r0,lsl#2 orr r3,r3,r3,lsr#2 orr r1,r1,r1,lsl#2 and r2,r2,r8 @ &=0x0f0f0f0f and r0,r0,r8,lsl#4 @ &=0xf0f0f0f0 and r3,r3,r8 @ &=0x0f0f0f0f and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 ldmia $A_flat,{r4-r5} @ A_flat[i] orr r2,r2,r2,lsr#4 orr 
r0,r0,r0,lsl#4 orr r3,r3,r3,lsr#4 orr r1,r1,r1,lsl#4 and r2,r2,r9 @ &=0x00ff00ff and r0,r0,r9,lsl#8 @ &=0xff00ff00 and r3,r3,r9 @ &=0x00ff00ff and r1,r1,r9,lsl#8 @ &=0xff00ff00 orr r2,r2,r2,lsr#8 orr r0,r0,r0,lsl#8 orr r3,r3,r3,lsr#8 orr r1,r1,r1,lsl#8 lsl r2,r2,#16 lsr r1,r1,#16 eor r4,r4,r3,lsl#16 eor r5,r5,r0,lsr#16 eor r4,r4,r2,lsr#16 eor r5,r5,r1,lsl#16 stmia $A_flat!,{r4-r5} @ A_flat[i++] ^= BitInterleave(inp[0..7]) subs $bsz,$bsz,#8 bhi .Loop_block str $inp,[sp,#476] bl KeccakF1600_int add r14,sp,#456 ldmia r14,{r6-r12,r14} @ restore constants and variables b .Loop_absorb .align 4 .Labsorbed: add $inp,sp,#$A[1][0] ldmia sp, {@C[0]-@C[9]} stmia $A_flat!,{@C[0]-@C[9]} @ return A[5][5] ldmia $inp!, {@C[0]-@C[9]} stmia $A_flat!,{@C[0]-@C[9]} ldmia $inp!, {@C[0]-@C[9]} stmia $A_flat!,{@C[0]-@C[9]} ldmia $inp!, {@C[0]-@C[9]} stmia $A_flat!,{@C[0]-@C[9]} ldmia $inp, {@C[0]-@C[9]} stmia $A_flat, {@C[0]-@C[9]} .Labsorb_abort: add sp,sp,#456+32 mov r0,$len @ return value #if __ARM_ARCH__>=5 ldmia sp!,{r4-r12,pc} #else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) #endif .size SHA3_absorb,.-SHA3_absorb ___ } { my ($out,$len,$A_flat,$bsz) = map("r$_", (4,5,10,12)); $code.=<<___; .global SHA3_squeeze .type SHA3_squeeze,%function .align 5 SHA3_squeeze: stmdb sp!,{r0,r3-r10,lr} mov $A_flat,r0 mov $out,r1 mov $len,r2 mov $bsz,r3 #ifdef __thumb2__ mov r9,#0x00ff00ff mov r8,#0x0f0f0f0f mov r7,#0x33333333 mov r6,#0x55555555 #else mov r6,#0x11 @ compose constants mov r8,#0x0f mov r9,#0xff orr r6,r6,r6,lsl#8 orr r8,r8,r8,lsl#8 orr r6,r6,r6,lsl#16 @ 0x11111111 orr r9,r9,r9,lsl#16 @ 0x00ff00ff orr r8,r8,r8,lsl#16 @ 0x0f0f0f0f orr r7,r6,r6,lsl#1 @ 0x33333333 orr r6,r6,r6,lsl#2 @ 0x55555555 #endif stmdb sp!,{r6-r9} mov r14,$A_flat b .Loop_squeeze .align 4 .Loop_squeeze: ldmia $A_flat!,{r0,r1} @ A_flat[i++] lsl r2,r0,#16 lsl r3,r1,#16 @ r3 = r1 << 16 lsr r2,r2,#16 @ r2 = r0 & 0x0000ffff lsr r1,r1,#16 lsr 
r0,r0,#16 @ r0 = r0 >> 16 lsl r1,r1,#16 @ r1 = r1 & 0xffff0000 orr r2,r2,r2,lsl#8 orr r3,r3,r3,lsr#8 orr r0,r0,r0,lsl#8 orr r1,r1,r1,lsr#8 and r2,r2,r9 @ &=0x00ff00ff and r3,r3,r9,lsl#8 @ &=0xff00ff00 and r0,r0,r9 @ &=0x00ff00ff and r1,r1,r9,lsl#8 @ &=0xff00ff00 orr r2,r2,r2,lsl#4 orr r3,r3,r3,lsr#4 orr r0,r0,r0,lsl#4 orr r1,r1,r1,lsr#4 and r2,r2,r8 @ &=0x0f0f0f0f and r3,r3,r8,lsl#4 @ &=0xf0f0f0f0 and r0,r0,r8 @ &=0x0f0f0f0f and r1,r1,r8,lsl#4 @ &=0xf0f0f0f0 orr r2,r2,r2,lsl#2 orr r3,r3,r3,lsr#2 orr r0,r0,r0,lsl#2 orr r1,r1,r1,lsr#2 and r2,r2,r7 @ &=0x33333333 and r3,r3,r7,lsl#2 @ &=0xcccccccc and r0,r0,r7 @ &=0x33333333 and r1,r1,r7,lsl#2 @ &=0xcccccccc orr r2,r2,r2,lsl#1 orr r3,r3,r3,lsr#1 orr r0,r0,r0,lsl#1 orr r1,r1,r1,lsr#1 and r2,r2,r6 @ &=0x55555555 and r3,r3,r6,lsl#1 @ &=0xaaaaaaaa and r0,r0,r6 @ &=0x55555555 and r1,r1,r6,lsl#1 @ &=0xaaaaaaaa orr r2,r2,r3 orr r0,r0,r1 cmp $len,#8 blo .Lsqueeze_tail lsr r1,r2,#8 strb r2,[$out],#1 lsr r3,r2,#16 strb r1,[$out],#1 lsr r2,r2,#24 strb r3,[$out],#1 strb r2,[$out],#1 lsr r1,r0,#8 strb r0,[$out],#1 lsr r3,r0,#16 strb r1,[$out],#1 lsr r0,r0,#24 strb r3,[$out],#1 strb r0,[$out],#1 subs $len,$len,#8 beq .Lsqueeze_done subs $bsz,$bsz,#8 @ bsz -= 8 bhi .Loop_squeeze mov r0,r14 @ original $A_flat bl KeccakF1600 ldmia sp,{r6-r10,r12} @ restore constants and variables mov r14,$A_flat b .Loop_squeeze .align 4 .Lsqueeze_tail: strb r2,[$out],#1 lsr r2,r2,#8 subs $len,$len,#1 beq .Lsqueeze_done strb r2,[$out],#1 lsr r2,r2,#8 subs $len,$len,#1 beq .Lsqueeze_done strb r2,[$out],#1 lsr r2,r2,#8 subs $len,$len,#1 beq .Lsqueeze_done strb r2,[$out],#1 subs $len,$len,#1 beq .Lsqueeze_done strb r0,[$out],#1 lsr r0,r0,#8 subs $len,$len,#1 beq .Lsqueeze_done strb r0,[$out],#1 lsr r0,r0,#8 subs $len,$len,#1 beq .Lsqueeze_done strb r0,[$out] b .Lsqueeze_done .align 4 .Lsqueeze_done: add sp,sp,#24 #if __ARM_ARCH__>=5 ldmia sp!,{r4-r10,pc} #else ldmia sp!,{r4-r10,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ 
interoperable with Thumb ISA:-) #endif .size SHA3_squeeze,.-SHA3_squeeze ___ } $code.=<<___; #if __ARM_MAX_ARCH__>=7 .fpu neon .type iotas64, %object .align 5 iotas64: .quad 0x0000000000000001 .quad 0x0000000000008082 .quad 0x800000000000808a .quad 0x8000000080008000 .quad 0x000000000000808b .quad 0x0000000080000001 .quad 0x8000000080008081 .quad 0x8000000000008009 .quad 0x000000000000008a .quad 0x0000000000000088 .quad 0x0000000080008009 .quad 0x000000008000000a .quad 0x000000008000808b .quad 0x800000000000008b .quad 0x8000000000008089 .quad 0x8000000000008003 .quad 0x8000000000008002 .quad 0x8000000000000080 .quad 0x000000000000800a .quad 0x800000008000000a .quad 0x8000000080008081 .quad 0x8000000000008080 .quad 0x0000000080000001 .quad 0x8000000080008008 .size iotas64,.-iotas64 .type KeccakF1600_neon, %function .align 5 KeccakF1600_neon: add r1, r0, #16 adr r2, iotas64 mov r3, #24 @ loop counter b .Loop_neon .align 4 .Loop_neon: @ Theta vst1.64 {q4}, [r0,:64] @ offload A[0..1][4] veor q13, q0, q5 @ A[0..1][0]^A[2..3][0] vst1.64 {d18}, [r1,:64] @ offload A[2][4] veor q14, q1, q6 @ A[0..1][1]^A[2..3][1] veor q15, q2, q7 @ A[0..1][2]^A[2..3][2] veor d26, d26, d27 @ C[0]=A[0][0]^A[1][0]^A[2][0]^A[3][0] veor d27, d28, d29 @ C[1]=A[0][1]^A[1][1]^A[2][1]^A[3][1] veor q14, q3, q8 @ A[0..1][3]^A[2..3][3] veor q4, q4, q9 @ A[0..1][4]^A[2..3][4] veor d30, d30, d31 @ C[2]=A[0][2]^A[1][2]^A[2][2]^A[3][2] veor d31, d28, d29 @ C[3]=A[0][3]^A[1][3]^A[2][3]^A[3][3] veor d25, d8, d9 @ C[4]=A[0][4]^A[1][4]^A[2][4]^A[3][4] veor q13, q13, q10 @ C[0..1]^=A[4][0..1] veor q14, q15, q11 @ C[2..3]^=A[4][2..3] veor d25, d25, d24 @ C[4]^=A[4][4] vadd.u64 q4, q13, q13 @ C[0..1]<<1 vadd.u64 q15, q14, q14 @ C[2..3]<<1 vadd.u64 d18, d25, d25 @ C[4]<<1 vsri.u64 q4, q13, #63 @ ROL64(C[0..1],1) vsri.u64 q15, q14, #63 @ ROL64(C[2..3],1) vsri.u64 d18, d25, #63 @ ROL64(C[4],1) veor d25, d25, d9 @ D[0] = C[4] ^= ROL64(C[1],1) veor q13, q13, q15 @ D[1..2] = C[0..1] ^ ROL64(C[2..3],1) veor d28, d28, 
d18 @ D[3] = C[2] ^= ROL64(C[4],1) veor d29, d29, d8 @ D[4] = C[3] ^= ROL64(C[0],1) veor d0, d0, d25 @ A[0][0] ^= C[4] veor d1, d1, d25 @ A[1][0] ^= C[4] veor d10, d10, d25 @ A[2][0] ^= C[4] veor d11, d11, d25 @ A[3][0] ^= C[4] veor d20, d20, d25 @ A[4][0] ^= C[4] veor d2, d2, d26 @ A[0][1] ^= D[1] veor d3, d3, d26 @ A[1][1] ^= D[1] veor d12, d12, d26 @ A[2][1] ^= D[1] veor d13, d13, d26 @ A[3][1] ^= D[1] veor d21, d21, d26 @ A[4][1] ^= D[1] vmov d26, d27 veor d6, d6, d28 @ A[0][3] ^= C[2] veor d7, d7, d28 @ A[1][3] ^= C[2] veor d16, d16, d28 @ A[2][3] ^= C[2] veor d17, d17, d28 @ A[3][3] ^= C[2] veor d23, d23, d28 @ A[4][3] ^= C[2] vld1.64 {q4}, [r0,:64] @ restore A[0..1][4] vmov d28, d29 vld1.64 {d18}, [r1,:64] @ restore A[2][4] veor q2, q2, q13 @ A[0..1][2] ^= D[2] veor q7, q7, q13 @ A[2..3][2] ^= D[2] veor d22, d22, d27 @ A[4][2] ^= D[2] veor q4, q4, q14 @ A[0..1][4] ^= C[3] veor q9, q9, q14 @ A[2..3][4] ^= C[3] veor d24, d24, d29 @ A[4][4] ^= C[3] @ Rho + Pi vmov d26, d2 @ C[1] = A[0][1] vshl.u64 d2, d3, #44 vmov d27, d4 @ C[2] = A[0][2] vshl.u64 d4, d14, #43 vmov d28, d6 @ C[3] = A[0][3] vshl.u64 d6, d17, #21 vmov d29, d8 @ C[4] = A[0][4] vshl.u64 d8, d24, #14 vsri.u64 d2, d3, #64-44 @ A[0][1] = ROL64(A[1][1], rhotates[1][1]) vsri.u64 d4, d14, #64-43 @ A[0][2] = ROL64(A[2][2], rhotates[2][2]) vsri.u64 d6, d17, #64-21 @ A[0][3] = ROL64(A[3][3], rhotates[3][3]) vsri.u64 d8, d24, #64-14 @ A[0][4] = ROL64(A[4][4], rhotates[4][4]) vshl.u64 d3, d9, #20 vshl.u64 d14, d16, #25 vshl.u64 d17, d15, #15 vshl.u64 d24, d21, #2 vsri.u64 d3, d9, #64-20 @ A[1][1] = ROL64(A[1][4], rhotates[1][4]) vsri.u64 d14, d16, #64-25 @ A[2][2] = ROL64(A[2][3], rhotates[2][3]) vsri.u64 d17, d15, #64-15 @ A[3][3] = ROL64(A[3][2], rhotates[3][2]) vsri.u64 d24, d21, #64-2 @ A[4][4] = ROL64(A[4][1], rhotates[4][1]) vshl.u64 d9, d22, #61 @ vshl.u64 d16, d19, #8 vshl.u64 d15, d12, #10 vshl.u64 d21, d7, #55 vsri.u64 d9, d22, #64-61 @ A[1][4] = ROL64(A[4][2], rhotates[4][2]) vext.8 d16, d19, d19, 
#8-1 @ A[2][3] = ROL64(A[3][4], rhotates[3][4]) vsri.u64 d15, d12, #64-10 @ A[3][2] = ROL64(A[2][1], rhotates[2][1]) vsri.u64 d21, d7, #64-55 @ A[4][1] = ROL64(A[1][3], rhotates[1][3]) vshl.u64 d22, d18, #39 @ vshl.u64 d19, d23, #56 vshl.u64 d12, d5, #6 vshl.u64 d7, d13, #45 vsri.u64 d22, d18, #64-39 @ A[4][2] = ROL64(A[2][4], rhotates[2][4]) vext.8 d19, d23, d23, #8-7 @ A[3][4] = ROL64(A[4][3], rhotates[4][3]) vsri.u64 d12, d5, #64-6 @ A[2][1] = ROL64(A[1][2], rhotates[1][2]) vsri.u64 d7, d13, #64-45 @ A[1][3] = ROL64(A[3][1], rhotates[3][1]) vshl.u64 d18, d20, #18 vshl.u64 d23, d11, #41 vshl.u64 d5, d10, #3 vshl.u64 d13, d1, #36 vsri.u64 d18, d20, #64-18 @ A[2][4] = ROL64(A[4][0], rhotates[4][0]) vsri.u64 d23, d11, #64-41 @ A[4][3] = ROL64(A[3][0], rhotates[3][0]) vsri.u64 d5, d10, #64-3 @ A[1][2] = ROL64(A[2][0], rhotates[2][0]) vsri.u64 d13, d1, #64-36 @ A[3][1] = ROL64(A[1][0], rhotates[1][0]) vshl.u64 d1, d28, #28 vshl.u64 d10, d26, #1 vshl.u64 d11, d29, #27 vshl.u64 d20, d27, #62 vsri.u64 d1, d28, #64-28 @ A[1][0] = ROL64(C[3], rhotates[0][3]) vsri.u64 d10, d26, #64-1 @ A[2][0] = ROL64(C[1], rhotates[0][1]) vsri.u64 d11, d29, #64-27 @ A[3][0] = ROL64(C[4], rhotates[0][4]) vsri.u64 d20, d27, #64-62 @ A[4][0] = ROL64(C[2], rhotates[0][2]) @ Chi + Iota vbic q13, q2, q1 vbic q14, q3, q2 vbic q15, q4, q3 veor q13, q13, q0 @ A[0..1][0] ^ (~A[0..1][1] & A[0..1][2]) veor q14, q14, q1 @ A[0..1][1] ^ (~A[0..1][2] & A[0..1][3]) veor q2, q2, q15 @ A[0..1][2] ^= (~A[0..1][3] & A[0..1][4]) vst1.64 {q13}, [r0,:64] @ offload A[0..1][0] vbic q13, q0, q4 vbic q15, q1, q0 vmov q1, q14 @ A[0..1][1] veor q3, q3, q13 @ A[0..1][3] ^= (~A[0..1][4] & A[0..1][0]) veor q4, q4, q15 @ A[0..1][4] ^= (~A[0..1][0] & A[0..1][1]) vbic q13, q7, q6 vmov q0, q5 @ A[2..3][0] vbic q14, q8, q7 vmov q15, q6 @ A[2..3][1] veor q5, q5, q13 @ A[2..3][0] ^= (~A[2..3][1] & A[2..3][2]) vbic q13, q9, q8 veor q6, q6, q14 @ A[2..3][1] ^= (~A[2..3][2] & A[2..3][3]) vbic q14, q0, q9 veor q7, q7, q13 @ 
A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
	vbic q13, q15, q0
	veor q8, q8, q14	@ A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
	vmov q14, q10	@ A[4][0..1]
	veor q9, q9, q13	@ A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])

	vld1.64 d25, [r2,:64]!	@ Iota[i++]
	vbic d26, d22, d21
	vbic d27, d23, d22
	vld1.64 {q0}, [r0,:64]	@ restore A[0..1][0]
	veor d20, d20, d26	@ A[4][0] ^= (~A[4][1] & A[4][2])
	vbic d26, d24, d23
	veor d21, d21, d27	@ A[4][1] ^= (~A[4][2] & A[4][3])
	vbic d27, d28, d24
	veor d22, d22, d26	@ A[4][2] ^= (~A[4][3] & A[4][4])
	vbic d26, d29, d28
	veor d23, d23, d27	@ A[4][3] ^= (~A[4][4] & A[4][0])
	veor d0, d0, d25	@ A[0][0] ^= Iota[i]
	veor d24, d24, d26	@ A[4][4] ^= (~A[4][0] & A[4][1])

	subs r3, r3, #1
	bne .Loop_neon

	ret
.size KeccakF1600_neon,.-KeccakF1600_neon

@ ----------------------------------------------------------------------
@ SHA3_absorb_neon
@   In:  r0 = pointer to the 25-lane (25 x 64-bit) state; the :64
@             qualifier on every state access asks for 64-bit alignment
@        r1 = inp, r2 = len, r3 = bsz (block size in bytes)
@   Out: r0 = number of input bytes left over once no whole bsz-byte
@             block remains
@ While len >= bsz, one block of input is XORed lane by lane into the
@ state held in d0-d24 and KeccakF1600_neon is called. The state is
@ written back to memory only once, at .Labsorbed_neon.
@ ----------------------------------------------------------------------
.global SHA3_absorb_neon
.type SHA3_absorb_neon, %function
.align 5
SHA3_absorb_neon:
	stmdb sp!, {r4-r6,lr}
	vstmdb sp!, {d8-d15}	@ preserved for the caller, reloaded before returning

	@ Arguments are parked in r4-r6 because the permutation loop
	@ advances r2 (Iota pointer) and counts r3 down.
	mov r4, r1	@ inp
	mov r5, r2	@ len
	mov r6, r3	@ bsz

	vld1.32 {d0}, [r0,:64]!	@ A[0][0]
	vld1.32 {d2}, [r0,:64]!	@ A[0][1]
	vld1.32 {d4}, [r0,:64]!	@ A[0][2]
	vld1.32 {d6}, [r0,:64]!	@ A[0][3]
	vld1.32 {d8}, [r0,:64]!	@ A[0][4]

	vld1.32 {d1}, [r0,:64]!	@ A[1][0]
	vld1.32 {d3}, [r0,:64]!	@ A[1][1]
	vld1.32 {d5}, [r0,:64]!	@ A[1][2]
	vld1.32 {d7}, [r0,:64]!	@ A[1][3]
	vld1.32 {d9}, [r0,:64]!	@ A[1][4]

	vld1.32 {d10}, [r0,:64]!	@ A[2][0]
	vld1.32 {d12}, [r0,:64]!	@ A[2][1]
	vld1.32 {d14}, [r0,:64]!	@ A[2][2]
	vld1.32 {d16}, [r0,:64]!	@ A[2][3]
	vld1.32 {d18}, [r0,:64]!	@ A[2][4]

	vld1.32 {d11}, [r0,:64]!	@ A[3][0]
	vld1.32 {d13}, [r0,:64]!	@ A[3][1]
	vld1.32 {d15}, [r0,:64]!	@ A[3][2]
	vld1.32 {d17}, [r0,:64]!	@ A[3][3]
	vld1.32 {d19}, [r0,:64]!	@ A[3][4]

	vld1.32 {d20-d23}, [r0,:64]!	@ A[4][0..3]
	vld1.32 {d24}, [r0,:64]	@ A[4][4]
	sub r0, r0, #24*8	@ rewind
	b .Loop_absorb_neon

.align 4
.Loop_absorb_neon:
	subs r12, r5, r6	@ len - bsz
	blo .Labsorbed_neon	@ less than one whole block left
	mov r5, r12

	@ The first 9 lanes are absorbed unconditionally. After that the
	@ cmp/blo/beq ladder stops at 9, 13, 17, 18 or 21 lanes. The
	@ vld1/veor sitting between a cmp and its branches are relied
	@ upon to leave the condition flags untouched.
	@ NOTE(review): presumably bsz is always one of 72, 104, 136, 144
	@ or 168 bytes; other values are not told apart here - confirm
	@ against the callers.
	vld1.8 {d31}, [r4]!	@ endian-neutral loads...
	veor d0, d0, d31	@ A[0][0] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d2, d2, d31	@ A[0][1] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d4, d4, d31	@ A[0][2] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d6, d6, d31	@ A[0][3] ^= *inp++
	vld1.8 {d31},[r4]!
	veor d8, d8, d31	@ A[0][4] ^= *inp++

	vld1.8 {d31}, [r4]!
	veor d1, d1, d31	@ A[1][0] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d3, d3, d31	@ A[1][1] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d5, d5, d31	@ A[1][2] ^= *inp++
	vld1.8 {d31}, [r4]!
	cmp r6, #8*13
	veor d7, d7, d31	@ A[1][3] ^= *inp++
	blo .Lprocess_neon	@ bsz below 8*13: 9 lanes absorbed
	vld1.8 {d31}, [r4]!
	veor d9, d9, d31	@ A[1][4] ^= *inp++

	vld1.8 {d31}, [r4]!
	veor d10, d10, d31	@ A[2][0] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d12, d12, d31	@ A[2][1] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d14, d14, d31	@ A[2][2] ^= *inp++
	beq .Lprocess_neon	@ bsz equal to 8*13: 13 lanes absorbed
	vld1.8 {d31}, [r4]!
	veor d16, d16, d31	@ A[2][3] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d18, d18, d31	@ A[2][4] ^= *inp++

	vld1.8 {d31}, [r4]!
	veor d11, d11, d31	@ A[3][0] ^= *inp++
	vld1.8 {d31}, [r4]!
	cmp r6, #8*18
	veor d13, d13, d31	@ A[3][1] ^= *inp++
	blo .Lprocess_neon	@ bsz below 8*18: 17 lanes absorbed
	vld1.8 {d31}, [r4]!
	veor d15, d15, d31	@ A[3][2] ^= *inp++
	beq .Lprocess_neon	@ bsz equal to 8*18: 18 lanes absorbed
	vld1.8 {d31}, [r4]!
	veor d17, d17, d31	@ A[3][3] ^= *inp++
	vld1.8 {d31}, [r4]!
	veor d19, d19, d31	@ A[3][4] ^= *inp++

	vld1.8 {d31}, [r4]!
	veor d20, d20, d31	@ A[4][0] ^= *inp++	(21st and last lane)

.Lprocess_neon:
	bl KeccakF1600_neon
	b .Loop_absorb_neon

.align 4
.Labsorbed_neon:
	vst1.32 {d0}, [r0,:64]!	@ A[0][0..4]
	vst1.32 {d2}, [r0,:64]!
	vst1.32 {d4}, [r0,:64]!
	vst1.32 {d6}, [r0,:64]!
	vst1.32 {d8}, [r0,:64]!

	vst1.32 {d1}, [r0,:64]!	@ A[1][0..4]
	vst1.32 {d3}, [r0,:64]!
	vst1.32 {d5}, [r0,:64]!
	vst1.32 {d7}, [r0,:64]!
	vst1.32 {d9}, [r0,:64]!

	vst1.32 {d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32 {d12}, [r0,:64]!
	vst1.32 {d14}, [r0,:64]!
	vst1.32 {d16}, [r0,:64]!
	vst1.32 {d18}, [r0,:64]!

	vst1.32 {d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32 {d13}, [r0,:64]!
	vst1.32 {d15}, [r0,:64]!
	vst1.32 {d17}, [r0,:64]!
	vst1.32 {d19}, [r0,:64]!

	vst1.32 {d20-d23}, [r0,:64]!	@ A[4][0..4]
	vst1.32 {d24}, [r0,:64]

	mov r0, r5	@ return value
	vldmia sp!, {d8-d15}
	ldmia sp!, {r4-r6,pc}
.size SHA3_absorb_neon,.-SHA3_absorb_neon

@ ----------------------------------------------------------------------
@ SHA3_squeeze_neon
@   In:  r0 = A_flat (pointer to the state), r1 = out, r2 = len,
@        r3 = bsz (block size in bytes)
@ Copies the state to out 8 bytes at a time. r12 is the read cursor
@ into the state and r14 counts the bytes still available in the
@ current block; when r14 runs out the state is loaded into d0-d24,
@ permuted with KeccakF1600_neon, stored back, and both r12 and r14
@ are reset. A final run of fewer than 8 bytes is handled at
@ .Lsqueeze_neon_tail.
@ ----------------------------------------------------------------------
.global SHA3_squeeze_neon
.type SHA3_squeeze_neon, %function
.align 5
SHA3_squeeze_neon:
	stmdb sp!, {r4-r6,lr}

	mov r4, r1	@ out
	mov r5, r2	@ len
	mov r6, r3	@ bsz
	mov r12, r0	@ A_flat
	mov r14, r3	@ bsz
	b .Loop_squeeze_neon

.align 4
.Loop_squeeze_neon:
	cmp r5, #8
	blo .Lsqueeze_neon_tail	@ fewer than 8 bytes requested
	vld1.32 {d0}, [r12]!
	vst1.8 {d0}, [r4]!	@ endian-neutral store

	subs r5, r5, #8	@ len -= 8
	beq .Lsqueeze_neon_done

	subs r14, r14, #8	@ bsz -= 8
	bhi .Loop_squeeze_neon	@ current block still has bytes to give

	@ Block exhausted: d8-d15 are saved only around the permutation,
	@ the copy loop above touches d0 alone.
	vstmdb sp!, {d8-d15}

	vld1.32 {d0}, [r0,:64]!	@ A[0][0..4]
	vld1.32 {d2}, [r0,:64]!
	vld1.32 {d4}, [r0,:64]!
	vld1.32 {d6}, [r0,:64]!
	vld1.32 {d8}, [r0,:64]!

	vld1.32 {d1}, [r0,:64]!	@ A[1][0..4]
	vld1.32 {d3}, [r0,:64]!
	vld1.32 {d5}, [r0,:64]!
	vld1.32 {d7}, [r0,:64]!
	vld1.32 {d9}, [r0,:64]!

	vld1.32 {d10}, [r0,:64]!	@ A[2][0..4]
	vld1.32 {d12}, [r0,:64]!
	vld1.32 {d14}, [r0,:64]!
	vld1.32 {d16}, [r0,:64]!
	vld1.32 {d18}, [r0,:64]!

	vld1.32 {d11}, [r0,:64]!	@ A[3][0..4]
	vld1.32 {d13}, [r0,:64]!
	vld1.32 {d15}, [r0,:64]!
	vld1.32 {d17}, [r0,:64]!
	vld1.32 {d19}, [r0,:64]!

	vld1.32 {d20-d23}, [r0,:64]!	@ A[4][0..4]
	vld1.32 {d24}, [r0,:64]
	sub r0, r0, #24*8	@ rewind

	bl KeccakF1600_neon

	mov r12, r0	@ A_flat
	vst1.32 {d0}, [r0,:64]!	@ A[0][0..4]
	vst1.32 {d2}, [r0,:64]!
	vst1.32 {d4}, [r0,:64]!
	vst1.32 {d6}, [r0,:64]!
	vst1.32 {d8}, [r0,:64]!

	vst1.32 {d1}, [r0,:64]!	@ A[1][0..4]
	vst1.32 {d3}, [r0,:64]!
	vst1.32 {d5}, [r0,:64]!
	vst1.32 {d7}, [r0,:64]!
	vst1.32 {d9}, [r0,:64]!

	vst1.32 {d10}, [r0,:64]!	@ A[2][0..4]
	vst1.32 {d12}, [r0,:64]!
	vst1.32 {d14}, [r0,:64]!
	vst1.32 {d16}, [r0,:64]!
	vst1.32 {d18}, [r0,:64]!

	vst1.32 {d11}, [r0,:64]!	@ A[3][0..4]
	vst1.32 {d13}, [r0,:64]!
	vst1.32 {d15}, [r0,:64]!
	vst1.32 {d17}, [r0,:64]!
	vst1.32 {d19}, [r0,:64]!

	vst1.32 {d20-d23}, [r0,:64]!
@ A[4][0..4]
	mov r14, r6	@ bsz
	vst1.32 {d24}, [r0,:64]
	mov r0, r12	@ rewind

	vldmia sp!, {d8-d15}
	b .Loop_squeeze_neon

.align 4
.Lsqueeze_neon_tail:
	@ Fewer than 8 bytes are wanted: fetch the next two state words and
	@ store them byte by byte, r2 first and then r3, stopping as soon as
	@ len bytes have been written.
	@ NOTE(review): the first strb executes before any length test, so
	@ entering with len equal to 0 would still write one byte - confirm
	@ that callers never request zero bytes.
	ldmia r12, {r2,r3}
	cmp r5, #2
	strb r2, [r4],#1	@ endian-neutral store
	lsr r2, r2, #8
	blo .Lsqueeze_neon_done
	strb r2, [r4], #1
	lsr r2, r2, #8
	beq .Lsqueeze_neon_done
	strb r2, [r4], #1
	lsr r2, r2, #8
	cmp r5, #4
	blo .Lsqueeze_neon_done
	strb r2, [r4], #1
	beq .Lsqueeze_neon_done

	strb r3, [r4], #1
	lsr r3, r3, #8
	cmp r5, #6
	blo .Lsqueeze_neon_done
	strb r3, [r4], #1
	lsr r3, r3, #8
	beq .Lsqueeze_neon_done
	strb r3, [r4], #1

.Lsqueeze_neon_done:
	ldmia sp!, {r4-r6,pc}
.size SHA3_squeeze_neon,.-SHA3_squeeze_neon
#endif
.asciz "Keccak-1600 absorb and squeeze for ARMv4/NEON, CRYPTOGAMS by "
.align 2
___

{
    # Remembered ".l" operand (register and address text) of a split
    # double-word access, one slot per mnemonic.  The parentheses matter:
    # "my %ldr, %str;" declares only %ldr and silently leaves %str a
    # package global.
    my (%ldr, %str);

    # ldrd(MNEMONIC, HALF, REG, EA) - expand one ldr.l / ldr.h / str.l /
    # str.h pseudo-instruction.
    #   MNEMONIC  "ldr" or "str"
    #   HALF      "l" or "h"
    #   REG, EA   register and effective-address text taken from the line
    # The "l" half becomes a single-word access compiled only when
    # __thumb2__ is not defined, and its REG/EA are remembered.  The "h"
    # half emits its own single-word access for that same case and, under
    # __thumb2__, one double-word MNEMONICd built from the remembered "l"
    # register and address plus this REG.
    # Returns the replacement text, which spans several lines.
    # NOTE: an "h" half with no preceding "l" half of the same mnemonic
    # would reuse stale or undefined values; the generator relies on the
    # halves being written in l-then-h order.
    sub ldrd {
        my ($mnemonic, $half, $reg, $ea) = @_;
        my $op = $mnemonic eq "ldr" ? \%ldr : \%str;

        if ($half eq "l") {
            $$op{reg} = $reg;
            $$op{ea}  = $ea;
            return sprintf "#ifndef __thumb2__\n" .
                           " %s\t%s,%s\n" .
                           "#endif", $mnemonic,$reg,$ea;
        }

        return sprintf "#ifndef __thumb2__\n" .
                       " %s\t%s,%s\n" .
                       "#else\n" .
                       " %sd\t%s,%s,%s\n" .
                       "#endif", $mnemonic,$reg,$ea,
                                 $mnemonic,$$op{reg},$reg,$$op{ea};
    }
}

# Post-process the generated assembly line by line and print it.
foreach (split($/,$code)) {
    # Evaluate `...` spans as Perl expressions.
    s/\`([^\`]*)\`/eval $1/ge;

    # At most one of the following rewrites is applied per line:
    #   ldr.l/ldr.h/str.l/str.h  -> ldrd() expansion
    #   ror/lsr/lsl rX,...,#n    -> mov with the shift as operand modifier
    #   ret                      -> bx lr
    #   bx lr                    -> its raw encoding
    s/^\s+(ldr|str)\.([lh])\s+(r[0-9]+),\s*(\[.*)/ldrd($1,$2,$3,$4)/ge or
    s/\b(ror|ls[rl])\s+(r[0-9]+.*)#/mov $2$1#/g or
    s/\bret\b/bx lr/g or
    s/\bbx\s+lr\b/.word\t0xe12fff1e/g;  # make it possible to compile with -march=armv4

    print $_,"\n";
}

# enforce flush; STDOUT may be a pipe to the translator script, so a failed
# close means the output is incomplete and must not pass silently.
close STDOUT or die "error closing STDOUT: $!";