#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is a straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt a SIMD/NEON implementation for the following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# Cortex-A76	7.9
# Cortex-X2	6.1 (***)
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
# Apple A10	6.1
# Apple A12	4.4
# Apple A14/M1	3.5 (**)
# ThunderX2	9.7
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...
# (**)	The result is for hardware-assisted implementation below.
# (***)	Hardware-assisted code is significantly slower, 11.3,
#	apparently because the processor can issue just one SHA3
#	instruction per cycle.

# Command line: <flavour> [<output>].  Both values are handed on to
# arm-xlate.pl verbatim.  With no flavour, or with flavour "void", the
# translator is bypassed and the generated text is written as is.
$flavour = shift;
$output  = shift;

if ($flavour && $flavour ne "void") {
    # Look for arm-xlate.pl next to this script first, then in
    # ../../perlasm relative to it.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Everything printed from here on goes through the translator.  Fail
    # loudly if it cannot be started rather than silently emitting nothing.
    open STDOUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
} else {
    # NOTE(review): this open is deliberately left unchecked; callers
    # presumably may omit <output> and rely on the inherited STDOUT.
    # Confirm before turning a failure here into a die.
    open STDOUT,">$output";
}

# Rotate-left amounts for the Rho step.  $rhotates[i][j] belongs to lane
# A[i][j]; the generated code applies it as "ror ...,#64-n" (scalar path)
# or as the immediate of "xar" (vector path).
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# True when the flavour string contains "+sha3".  In that case the SHA3
# mnemonics are emitted as written; otherwise the output loop at the end of
# this file replaces them with hand-encoded .inst words.
my $sha3ops = ($flavour =~ /\+sha3/);

# The .arch directive is only emitted when native SHA3 mnemonics are used.
$code.=<<___	if ($sha3ops);
.arch	armv8.2-a+sha3
___
# Round-constant table: 8 zero quads of padding followed by the 24 Iota
# constants.  The round loops post-increment a pointer into this table and
# stop when its low 8 bits become zero (tst ...,#255), which is why the
# alignment and the padding in front matter.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
.Liotas12:
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
								{{{
# ---- Scalar (general-purpose register) implementation ----
# The 25 state lanes A[i][j] are kept in x0..x24, five consecutive registers
# per row, except that A[3][3] uses x25 in place of x18.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

# Four scratch registers.  On entry to KeccakF1600_int $C[2] carries the
# address of the iotas table (see the "adr $C[2],iotas" at the call sites).
my @C = map("x$_", (26,27,28,30));

# Entry of KeccakF1600_int and the first column-parity XORs of Theta.  The
# iotas pointer and the link register are parked in the caller-provided
# stack area, and A[0][4]/A[1][4] are spilled so that their registers can
# be borrowed as extra temporaries.
$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	.inst	0xd503233f			// paciasp
	stp	c#$C[2],c30,[csp,#16]		// stack is pre-allocated
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] have just been stored to the stack, so their
	# registers double as C[4] and C[5] until they are reloaded.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
# Rest of the column parities, then the Theta XORs for columns 1, 2 and 0
# (the ones that do not need the two spilled lanes).
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers are about to be reloaded with A[0][4] and
	# A[1][4]; drop the aliases so that only $C[0..3] remain in use.
	$C[4]=undef;
	$C[5]=undef;
# This single heredoc emits, in order:
#  - the remainder of a KeccakF1600_int round (reload of the spilled lanes,
#    Theta for columns 3 and 4, Rho+Pi, Chi+Iota) and the loop exit, which
#    is taken when the post-incremented iotas pointer has zero low 8 bits;
#  - KeccakF1600: saves x19..x28, loads the 25 lanes from the state pointed
#    to by the first argument, calls KeccakF1600_int and stores them back;
#  - the prologue of SHA3_absorb and the top of its block loop, up to the
#    "cmp bsz,#104" whose flags are consumed by the branches emitted after
#    the load_n_xor() calls that follow.
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]	// ?
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]	// ?

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]	// ?
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]	// ?
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]	// ?

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]	// ?
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]	// ?

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]	// ?
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]	// ?
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]	// ?

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	c#$C[1],[csp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	c#$C[1],[csp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	c30,[csp,#16+__SIZEOF_POINTER__]
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	.inst	0xd503233f			// paciasp
	stp	c29,c30,[csp,#-16*__SIZEOF_POINTER__]!
	add	c29,csp,#0
	stp	c19,c20,[csp,#2*__SIZEOF_POINTER__]
	stp	c21,c22,[csp,#4*__SIZEOF_POINTER__]
	stp	c23,c24,[csp,#6*__SIZEOF_POINTER__]
	stp	c25,c26,[csp,#8*__SIZEOF_POINTER__]
	stp	c27,c28,[csp,#10*__SIZEOF_POINTER__]
	sub	csp,csp,#16+4*__SIZEOF_POINTER__

	str	c0,[csp,#16+2*__SIZEOF_POINTER__]	// offload argument
	mov	c#$C[0],c0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	adr	$C[2],iotas
	bl	KeccakF1600_int

	ldr	c#$C[0],[csp,#16+2*__SIZEOF_POINTER__]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	c19,c20,[c29,#2*__SIZEOF_POINTER__]
	add	csp,csp,#16+4*__SIZEOF_POINTER__
	ldp	c21,c22,[c29,#4*__SIZEOF_POINTER__]
	ldp	c23,c24,[c29,#6*__SIZEOF_POINTER__]
	ldp	c25,c26,[c29,#8*__SIZEOF_POINTER__]
	ldp	c27,c28,[c29,#10*__SIZEOF_POINTER__]
	ldp	c29,c30,[csp],#16*__SIZEOF_POINTER__
	.inst	0xd50323bf			// autiasp
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	.inst	0xd503233f			// paciasp
	stp	c29,c30,[csp,#-16*__SIZEOF_POINTER__]!
	add	c29,csp,#0
	stp	c19,c20,[csp,#2*__SIZEOF_POINTER__]
	stp	c21,c22,[csp,#4*__SIZEOF_POINTER__]
	stp	c23,c24,[csp,#6*__SIZEOF_POINTER__]
	stp	c25,c26,[csp,#8*__SIZEOF_POINTER__]
	stp	c27,c28,[csp,#10*__SIZEOF_POINTER__]
	sub	csp,csp,#16+4*__SIZEOF_POINTER__+16

	stp	c0,c1,[csp,#16+2*__SIZEOF_POINTER__]	// offload arguments
	stp	x2,x3,[csp,#16+4*__SIZEOF_POINTER__]

	mov	c#$C[0],c0			// uint64_t A[5][5]
	mov	c#$C[1],c1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[csp,#16+4*__SIZEOF_POINTER__]	// save len - bsz
	cmp	$C[3],#104
___
# Append code to $code that XORs input lanes $from..$to (inclusive) into
# the state.  Lane $i is fetched from byte offset 8*$i off the input
# pointer in $C[1], byte-swapped when __AARCH64EB__ is defined, and folded
# into A[$i/5][$i%5].  $C[0] is used as the temporary.
sub load_n_xor {
    my ($from,$to) = @_;

    foreach my $i ($from..$to) {
	# Register that holds the destination lane.
	my $lane = $A[$i/5][$i%5];

	$code.=<<___;
	ldr	$C[0],[$C[1],#`8*$i`]		// A[`$i/5`][`$i%5`] ^= *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$lane,$lane,$C[0]
___
    }
}
# Body of the SHA3_absorb block loop.  Input lanes are absorbed in stages
# and each stage is followed by a branch on the flags of the most recent
# "cmp bsz,#..."; the code relies on the ldr/rev/eor sequences emitted by
# load_n_xor() leaving those flags intact.
#
#   lanes  0..8   always              (72 bytes)
#   lanes  9..12  unless bsz < 104    (104 bytes)
#   lanes 13..16  unless bsz == 104   (136 bytes)
#   lane  17      unless bsz < 144    (144 bytes)
#   lanes 18..20  unless bsz == 144   (168 bytes)
load_n_xor(0,8);
$code.=<<___;
	blo	.Lprocess_block

___
load_n_xor(9,12);
$code.=<<___;
	beq	.Lprocess_block

	cmp	$C[3],#144
___
load_n_xor(13,16);
$code.=<<___;
	blo	.Lprocess_block

___
load_n_xor(17,17);
$code.=<<___;
	beq	.Lprocess_block

___
load_n_xor(18,20);
# Advance and save the input pointer, run the permutation, restore the
# loop state (inp, len - bsz, bsz) from the stack and iterate.  Once fewer
# than bsz bytes remain, the state is written back and the remaining length
# is returned in x0.
# The scalar elements $C[1]/$C[3] are spelled as such here; the emitted
# text is the same as with the one-element slices used previously.
$code.=<<___;

.Lprocess_block:
	add	c#$C[1],c#$C[1],$C[3]
	str	c#$C[1],[csp,#16+3*__SIZEOF_POINTER__]	// save inp

	adr	$C[2],iotas
	bl	KeccakF1600_int

	ldr	c#$C[1],[csp,#16+3*__SIZEOF_POINTER__]	// restore arguments
	ldp	$C[2],$C[3],[csp,#16+4*__SIZEOF_POINTER__]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	c#$C[1],[sp,#16+2*__SIZEOF_POINTER__]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	c19,c20,[c29,#2*__SIZEOF_POINTER__]
	add	csp,csp,#16+4*__SIZEOF_POINTER__+16
	ldp	c21,c22,[c29,#4*__SIZEOF_POINTER__]
	ldp	c23,c24,[c29,#6*__SIZEOF_POINTER__]
	ldp	c25,c26,[c29,#8*__SIZEOF_POINTER__]
	ldp	c27,c28,[c29,#10*__SIZEOF_POINTER__]
	ldp	c29,c30,[csp],#16*__SIZEOF_POINTER__
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
# SHA3_squeeze(A, out, len, bsz): copies len bytes of output from the state,
# 8 bytes at a time, calling KeccakF1600 each time bsz bytes have been
# produced and more are still needed.  A trailing partial lane (1..7 bytes)
# is stored byte by byte, least significant byte first.
{
# The arguments are kept in x19..x22, which this routine saves in its
# prologue and which KeccakF1600 restores before returning.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	.inst	0xd503233f			// paciasp
	stp	c29,c30,[csp,#-6*__SIZEOF_POINTER__]!
	add	c29,csp,#0
	stp	c19,c20,[csp,#2*__SIZEOF_POINTER__]
	stp	c21,c22,[csp,#4*__SIZEOF_POINTER__]

	cmov	$A_flat,x0			// put aside arguments
	cmov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	cmov	x0,$A_flat
	bl	KeccakF1600
	cmov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	c19,c20,[csp,#2*__SIZEOF_POINTER__]
	ldp	c21,c22,[csp,#4*__SIZEOF_POINTER__]
	ldp	c29,c30,[csp],#6*__SIZEOF_POINTER__
	.inst	0xd50323bf			// autiasp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
# ---- SHA3-extension (vector register) implementation ----
# The 25 state lanes A[i][j] live in v0..v24, five consecutive registers
# per row.  Operands are written with the .16b arrangement; the output loop
# at the end of this file rewrites it to .2d on ld1r/rax1/xar lines.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

# v25..v31 are temporaries.  @D names the registers that receive the Theta
# "D" values from the rax1 instructions: D[0..4] = C[4],C[5],C[6],C[2],C[3].
my @C = map("v$_.16b", (25..31));
my @D = @C[4,5,6,2,3];

# KeccakF1600_ce: the round loop.  It expects the state in v0..v24 and x10
# pointing at iotas, and loops until the post-incremented x10 has zero low
# 8 bits.  Several lanes end a round's Rho+Pi step in a register other than
# their own; the "X=Y" notes in the assembly comments record which register
# temporarily stands for which lane.
# The heredoc continues with the prologue of KeccakF1600_cext, which saves
# d8..d15 before the state is loaded.
$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
.Loop_ce:
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[4][0],$A[3][0],$A[2][0]
	eor3	$C[1],$A[4][1],$A[3][1],$A[2][1]
	eor3	$C[2],$A[4][2],$A[3][2],$A[2][2]
	eor3	$C[3],$A[4][3],$A[3][3],$A[2][3]
	eor3	$C[4],$A[4][4],$A[3][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[1][0],$A[0][0]
	eor3	$C[1],$C[1],   $A[1][1],$A[0][1]
	eor3	$C[2],$C[2],   $A[1][2],$A[0][2]
	eor3	$C[3],$C[3],   $A[1][3],$A[0][3]
	eor3	$C[4],$C[4],   $A[1][4],$A[0][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[0][1],$D[1],#64-$rhotates[0][1] // C[0]=A[2][0]

	xar	$A[0][1],$A[1][1],$D[1],#64-$rhotates[1][1]
	xar	$A[1][1],$A[1][4],$D[4],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$D[2],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$D[4],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$D[0],#64-$rhotates[4][0]

	xar	$C[1],   $A[0][2],$D[2],#64-$rhotates[0][2] // C[1]=A[4][0]

	xar	$A[0][2],$A[2][2],$D[2],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$D[3],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$D[4],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$D[3],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$D[0],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$D[4],#64-$rhotates[0][4]

	xar	$D[4],   $A[4][4],$D[4],#64-$rhotates[4][4] // D[4]=A[0][4]
	xar	$A[4][4],$A[4][1],$D[1],#64-$rhotates[4][1]
	xar	$A[1][3],$A[1][3],$D[3],#64-$rhotates[1][3] // A[1][3]=A[4][1]
	xar	$A[0][4],$A[3][1],$D[1],#64-$rhotates[3][1] // A[0][4]=A[1][3]
	xar	$A[3][1],$A[1][0],$D[0],#64-$rhotates[1][0]

	xar	$A[1][0],$A[0][3],$D[3],#64-$rhotates[0][3]

	eor	$A[0][0],$A[0][0],$D[0]

	xar	$D[3],   $A[3][3],$D[3],#64-$rhotates[3][3] // D[3]=A[0][3]
	xar	$A[0][3],$A[3][2],$D[2],#64-$rhotates[3][2] // A[0][3]=A[3][3]
	xar	$D[1],   $A[2][1],$D[1],#64-$rhotates[2][1] // D[1]=A[3][2]
	xar	$D[2],   $A[1][2],$D[2],#64-$rhotates[1][2] // D[2]=A[2][1]
	xar	$D[0],   $A[2][0],$D[0],#64-$rhotates[2][0] // D[0]=A[1][2]

	////////////////////////////////////////////////// Chi+Iota
	bcax	$A[4][0],$C[1],   $A[4][2],$A[1][3]	// A[1][3]=A[4][1]
	bcax	$A[4][1],$A[1][3],$A[4][3],$A[4][2]	// A[1][3]=A[4][1]
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$C[1],   $A[4][4]
	bcax	$A[4][4],$A[4][4],$A[1][3],$C[1]	// A[1][3]=A[4][1]

	ld1r	{$C[1]},[x10],#8

	bcax	$A[3][2],$D[1],   $A[3][4],$A[0][3]	// A[0][3]=A[3][3]
	bcax	$A[3][3],$A[0][3],$A[3][0],$A[3][4]	// A[0][3]=A[3][3]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]
	bcax	$A[3][0],$A[3][0],$D[1],   $A[3][1]
	bcax	$A[3][1],$A[3][1],$A[0][3],$D[1]	// A[0][3]=A[3][3]

	bcax	$A[2][0],$C[0],   $A[2][2],$D[2]
	bcax	$A[2][1],$D[2],   $A[2][3],$A[2][2]
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$C[0],   $A[2][4]
	bcax	$A[2][4],$A[2][4],$D[2],   $C[0]

	bcax	$A[1][2],$D[0],   $A[1][4],$A[0][4]	// A[0][4]=A[1][3]
	bcax	$A[1][3],$A[0][4],$A[1][0],$A[1][4]	// A[0][4]=A[1][3]
	bcax	$A[1][4],$A[1][4],$A[1][1],$A[1][0]
	bcax	$A[1][0],$A[1][0],$D[0],   $A[1][1]
	bcax	$A[1][1],$A[1][1],$A[0][4],$D[0]	// A[0][4]=A[1][3]

	bcax	$A[0][3],$D[3],   $A[0][0],$D[4]
	bcax	$A[0][4],$D[4],   $A[0][1],$A[0][0]
	bcax	$A[0][0],$A[0][0],$A[0][2],$A[0][1]
	bcax	$A[0][1],$A[0][1],$D[3],   $A[0][2]
	bcax	$A[0][2],$A[0][2],$D[4],   $D[3]

	eor	$A[0][0],$A[0][0],$C[1]

	tst	x10,#255
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	.inst	0xd503233f		// paciasp
	stp	c29,c30,[csp,#-2*__SIZEOF_POINTER__-64]!
	add	c29,csp,#0
	stp	d8,d9,[csp,#2*__SIZEOF_POINTER__+0]	// per ABI requirement
	stp	d10,d11,[csp,#2*__SIZEOF_POINTER__+16]
	stp	d12,d13,[csp,#2*__SIZEOF_POINTER__+32]
	stp	d14,d15,[csp,#2*__SIZEOF_POINTER__+48]
___
# Body of KeccakF1600_cext: load the 25 lanes from [x0] into d0..d24, run
# KeccakF1600_ce, store them back and restore d8..d15.
#
# Lanes 0..23 are moved in pairs by the loops.  $i is not declared with
# "my", so it still holds 24 after each loop and the heredoc that follows
# uses it ("#8*$i") to address the 25th lane, d24.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
# The link register is reloaded from the frame right after the call
# because "bl" overwrote it.
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	adr	x10,iotas
	bl	KeccakF1600_ce
	ldr	c30,[csp,#__SIZEOF_POINTER__]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
# Only c29 is popped in the epilogue; c30 was already reloaded above.
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[csp,#2*__SIZEOF_POINTER__+0]
	ldp	d10,d11,[csp,#2*__SIZEOF_POINTER__+16]
	ldp	d12,d13,[csp,#2*__SIZEOF_POINTER__+32]
	ldp	d14,d15,[csp,#2*__SIZEOF_POINTER__+48]
	ldr	c29,[csp],#2*__SIZEOF_POINTER__+64
	.inst	0xd50323bf		// autiasp
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

# SHA3_absorb_cext(ctx, inp, len, bsz): vector-register counterpart of
# SHA3_absorb.  The arguments stay in x0..x3 for the whole routine.
{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

# Prologue: set up the frame and save d8..d15.
$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	.inst	0xd503233f		// paciasp
	stp	c29,c30,[csp,#-2*__SIZEOF_POINTER__-64]!
	add	c29,csp,#0
	stp	d8,d9,[csp,#2*__SIZEOF_POINTER__+0]	// per ABI requirement
	stp	d10,d11,[csp,#2*__SIZEOF_POINTER__+16]
	stp	d12,d13,[csp,#2*__SIZEOF_POINTER__+32]
	stp	d14,d15,[csp,#2*__SIZEOF_POINTER__+48]
___
# Load lanes 0..23 in pairs; $i is a package variable and is left at 24,
# which the next heredoc uses to load the 25th lane into d24.
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
# Top of the block loop: len is decremented by bsz up front and the loop is
# left once that underflows.  The flags of "cmp bsz,#104" are consumed by
# the branches emitted after the load_n_xor_ce() calls.
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce

	cmp	$bsz,#104
___
# Append code to $code that XORs input lanes $from..$to (inclusive) into
# the vector state, post-incrementing the input pointer $inp.  Lanes are
# fetched four at a time into v27..v30 while at least four remain, then
# three at a time into v28..v30, and any leftovers one by one into v31.
sub load_n_xor_ce {
    my ($from,$to) = @_;
    my $left = $to-$from+1;

    # Returns the register that holds lane $from and steps $from past it.
    my $take = sub {
	my $reg = $A[$from/5][$from%5];
	$from++;
	return $reg;
    };

    for (; $left>=4; $left-=4) {
	my ($r0,$r1,$r2,$r3) = map { $take->() } (0..3);
$code.=<<___;
	ld1	{v27.8b-v30.8b},[$inp],#32
	eor 	$r0,$r0,v27.16b
	eor 	$r1,$r1,v28.16b
	eor 	$r2,$r2,v29.16b
	eor 	$r3,$r3,v30.16b
___
    }
    for (; $left>=3; $left-=3) {
	my ($r0,$r1,$r2) = map { $take->() } (0..2);
$code.=<<___;
	ld1	{v28.8b-v30.8b},[$inp],#24
	eor 	$r0,$r0,v28.16b
	eor 	$r1,$r1,v29.16b
	eor 	$r2,$r2,v30.16b
___
    }
    while ($from<=$to) {
	my $idx = $from;		# lane number for the comment text
	my $reg = $take->();
$code.=<<___;
	ld1	{v31.8b},[$inp],#8	// A[`$idx/5`][`$idx%5`] ^= *inp++
	eor	$reg,$reg,v31.16b
___
    }
}
# Staged absorption, mirroring the scalar SHA3_absorb: each stage is
# followed by a branch on the flags of the latest "cmp bsz,#...".
#   lanes  0..8   always              (72 bytes)
#   lanes  9..12  unless bsz < 104    (104 bytes)
#   lanes 13..16  unless bsz == 104   (136 bytes)
#   lane  17      unless bsz < 144    (144 bytes)
#   lanes 18..20  unless bsz == 144   (168 bytes)
load_n_xor_ce(0,8);
$code.=<<___;
	blo	.Lprocess_block_ce

___
load_n_xor_ce(9,12);
$code.=<<___;
	beq	.Lprocess_block_ce

	cmp	$bsz,#144
___
load_n_xor_ce(13,16);
$code.=<<___;
	blo	.Lprocess_block_ce

___
load_n_xor_ce(17,17);
$code.=<<___;
	beq	.Lprocess_block_ce

___
load_n_xor_ce(18,20);
# Run the permutation on the absorbed block and loop.
$code.=<<___;

.Lprocess_block_ce:
	adr	x10,iotas
	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
# Write the state back: lanes 0..23 in pairs, then d24 via the leftover
# value of the package variable $i (24).
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
# len was decremented by bsz before the loop exit was taken, so bsz is
# added back to form the returned number of unprocessed bytes.
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[csp,#2*__SIZEOF_POINTER__+0]
	ldp	d10,d11,[csp,#2*__SIZEOF_POINTER__+16]
	ldp	d12,d13,[csp,#2*__SIZEOF_POINTER__+32]
	ldp	d14,d15,[csp,#2*__SIZEOF_POINTER__+48]
	ldp	c29,c30,[csp],#2*__SIZEOF_POINTER__+64
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
# SHA3_squeeze_cext(ctx, out, len, bsz): same output loop as SHA3_squeeze,
# but the arguments stay in x0..x3, with x9 as the read pointer into the
# state and x10 as the count of bytes left in the current block.  The
# permutation is done by KeccakF1600_cext, after which the link register is
# reloaded from the frame and x9/x10 are re-initialised.
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	.inst	0xd503233f		// paciasp
	stp	c29,c30,[csp,#-2*__SIZEOF_POINTER__]!
	add	c29,csp,#0
	cmov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	c30,[csp,#__SIZEOF_POINTER__]
	cmov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	c29,[csp],#2*__SIZEOF_POINTER__
	.inst	0xd50323bf		// autiasp
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
# Identification string embedded in the generated object.
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by \@dot-asm"
___

{   # Base encodings of the four SHA3-extension instructions used above.
    my  %opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # Hand-assemble one SHA3-extension instruction for assemblers that do
    # not know the mnemonics.
    #
    #   $mnemonic  one of the keys of %opcode
    #   $arg       operand text: "vD...,vN...,vM..." optionally followed by
    #              a fourth register or a "#<expr>" immediate, where <expr>
    #              consists of digits and '-' only (e.g. "#64-44")
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <arg>".  The first three
    # register numbers go to bits 0, 5 and 16, and the fourth operand
    # (register number or evaluated immediate, 0 when absent) to bit 10.
    #
    # Dies on an unknown mnemonic or an unparseable operand list.  Returning
    # an empty string instead would make the caller's s///e silently drop
    # the instruction from the generated code.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	exists $opcode{$mnemonic}
	    or die "unsha3: unknown mnemonic '$mnemonic'";

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	    or die "unsha3: can't parse operands '$arg' of $mnemonic";

	my ($rd,$rn,$rm,$last) = ($1,$2,$3,$4);
	$rm = 0 unless defined $rm;

	# The pattern restricts $last to digits and '-', so the string eval
	# can only perform the subtraction written in the source.
	$last = defined $last ? eval($last) : 0;
	$last = 0 unless defined $last;

	return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16)|($last<<10),
			$mnemonic,$arg;
    }
}

# Post-process the accumulated assembly line by line and print it.
foreach(split("\n",$code)) {
	# Integer arithmetic for the back-ticked expressions, e.g. `$i/5`.
	use integer;

	# Replace every `expr` with its value.
	s/\`([^\`]*)\`/eval($1)/ge;

	# ld1r/rax1/xar take the .2d arrangement; the generator writes .16b
	# everywhere for uniformity.
	m/\b(ld1r|rax1|xar)\b/ and s/\.16b/.2d/g;
	# Without "+sha3" in the flavour, emit raw encodings instead of the
	# SHA3 mnemonics.
	$sha3ops or s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
	# "c#x27" / "w#x27" -> "c27" / "w27": lets the templates derive another
	# register view from an "xN" name held in a Perl variable.
	s/([cw])#x([0-9]+)/$1$2/g;

	print $_,"\n";
}

# STDOUT may be a pipe into arm-xlate.pl; close() is where a failing
# translator or a deferred write error becomes visible, so check it.  For a
# pipe whose child exited non-zero $! is 0 and the status is in $?.
close STDOUT
    or die "error closing STDOUT: ".($! ? $! : "exit status $?");