#!/usr/bin/env perl
#
# Why ILASM? Key to Keccak-1600 performance on 32-bit platforms is bit
# interleaving, which allows to simplify rotate operations. But it's
# less suitable for 64-bit platforms, by 30-40%. Ideally one wants both
# code paths to be available in .NET assembly for JIT compiler to choose
# from (yes, and even eliminate unused;-), depending on whether or not
# the module is *currently* executed on 32- or 64-bit platform. C++/CLI
# doesn't do, because it appears to insist on taking this decision at
# compile time, so that JIT would be presented with only one code path,
# not two. Not to mention that it drags along sizeable chunks of C RTL,
# which seems redundant. C# on the other hand doesn't give you enough
# control over inlining, and it's absolutely essential for performance
# that rotates get inlined. Raw CIL code allows to alleviate all these
# questions and ensure optimal all-round performance. Lane complementing
# KECCAK_1X variant was chosen for implementation. Below numbers are
# cycles per processed byte out of large buffer for r=1088, which
# corresponds to SHA3-256.
#
#		mono 4.2	.NET 4.7	[scalar] asm
#
# Cortex-A15	170		-		42
# Pentium M	151		-		59.5(*)
# Goldmont/32	150		87.1		52.0(*)
# Haswell/32	107		51.7		33.3(*)
#
# Cortex-A53	44.5		-		13
# Goldmont	28.7		25.5		15.8
# Haswell	20.5		17.0		9.6
#
# (*)	gcc-5.x-generated code, no [scalar] assembly;
#
# CONSTRAINTS.
#
# There is implicit dependency on byte order with rationale that all
# known .NET platforms are little-endian.
#
# It's expected that SHA3.CIL class is wrapped into something more
# presentable for .NET programmer. One can wonder if it makes sense
# to provide "unsafe" interface with byte * instead of byte[]...

my $class = "SHA3.CIL";

my @A = map([ "A$_"."0", "A$_"."1", "A$_"."2", "A$_"."3", "A$_"."4" ],
            (0, 1, 2, 3, 4));
my @T = map([ "T$_"."0", "T$_"."1", "T$_"."2", "T$_"."3", "T$_"."4" ],
            (0, 1));
my @C = map("C$_", (0..4));
my @D = map("D$_", (0..4));

my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

sub ROL32
{ my ($val, $shift) = @_;
  my $snippet;
  my $ldval = ($val =~ m/^A/) ? "ldarg.0\n        " .
                                "ldfld unsigned int64 ${class}::$val"
                              : "ldloc $val";
    $snippet="$ldval\n";

    if ($shift == 1) {
    $snippet.=<<___;
        ldc.i4  32
        shl

        $ldval
        ldc.i4  32
        shr.un
        conv.u4
        dup
        stloc   AT
        ldc.i4.1
        shl
        ldloc   AT
        ldc.i4  31
        shr.un
        or              // $val.hi <<<= 1
        conv.u8

        or
___
    } elsif ($shift & 1) {
    my $lshift = $shift >> 1;
    my $rshift = 32-$lshift;
    $snippet.=<<___;
        conv.u4
        dup
        stloc   AT
        ldc.i4  $lshift
        shl
        ldloc   AT
        ldc.i4  $rshift
        shr.un
        or              // $val.lo <<<= $lshift
        conv.u8

        ldc.i4  32
        shl
___
    $lshift++;
    $rshift--;
    $snippet.=<<___;
        $ldval
        ldc.i4  32
        shr.un
        conv.u4
        dup
        stloc   AT
        ldc.i4  $lshift
        shl
        ldloc   AT
        ldc.i4  $rshift
        shr.un
        or              // $val.hi <<<= $lshift+1
        conv.u8

        or
___
    } elsif ($shift != 0) {
    my $lshift = $shift >> 1;
    my $rshift = 32 - $lshift;
    $snippet.=<<___;
        conv.u4
        dup
        stloc   AT
        ldc.i4  $lshift
        shl
        ldloc   AT
        ldc.i4  $rshift
        shr.un
        or              // $val.lo <<<= $lshift
        conv.u8

        $ldval
        ldc.i4  32
        shr.un
        conv.u4
        dup
        stloc   AT
        ldc.i4  $lshift
        shl
        ldloc   AT
        ldc.i4  $rshift
        shr.un
        or              // $val.hi <<<= $lshift
        conv.u8

        ldc.i4  32
        shl
        or
___
    }

    return $snippet;
}

sub ROL64
{ my ($val, $lshift) = @_;
  my $rshift = 64 - $lshift;
  my $snippet;
  my $ldval = ($val =~ m/^A/) ? "ldarg.0\n        " .
                                "ldfld   unsigned int64 ${class}::$val"
                              : "ldloc   $val";
    $snippet.="$ldval\n";
    if ($lshift != 0) {
    $snippet.=<<___;
        ldc.i4  $lshift
        shl
        $ldval
        ldc.i4  $rshift
        shr.un
        or              // $val <<<= $lshift
___
    }

    return $snippet;
}

$code.=<<___;
.assembly extern mscorlib { }

.class value private sequential ansi sealed beforefieldinit Keccak1600.Iotas
       extends [mscorlib]System.ValueType
{
    .pack 8
    .size 192
}

.class public sequential ansi beforefieldinit $class
       extends [mscorlib]System.Object
{
___
for (my $i=0; $i<5; $i++) {
  for (my $j=0; $j<5; $j++) {
$code.=<<___;
    .field private unsigned int64 $A[$i][$j]
___
  }
}
$code.=<<___;
    .field private static valuetype Keccak1600.Iotas iotas32 at iotas32
    .field private static valuetype Keccak1600.Iotas iotas64 at iotas64

    .method public hidebysig specialname rtspecialname
            instance void .ctor ()
    {
        ldarg.0
        call    instance void object::.ctor()

        ldarg.0
        call    instance void ${class}::Complement()

        ret
    }

    .method private hidebysig instance
    void Complement()
    {
        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[0][1]
        not
        stfld   unsigned int64 ${class}::$A[0][1]

        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[0][2]
        not
        stfld   unsigned int64 ${class}::$A[0][2]

        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[1][3]
        not
        stfld   unsigned int64 ${class}::$A[1][3]

        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[2][2]
        not
        stfld   unsigned int64 ${class}::$A[2][2]

        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[3][2]
        not
        stfld   unsigned int64 ${class}::$A[3][2]

        ldarg.0
        dup
        ldfld   unsigned int64 ${class}::$A[4][0]
        not
        stfld   unsigned int64 ${class}::$A[4][0]

        ret
    }

    .method private hidebysig static
    unsigned int64 BitInterleave(unsigned int64 v)
    {
        .locals (
            unsigned int32 lo,
            unsigned int32 hi,
            unsigned int32 t0,
            unsigned int32 t1 )

        sizeof  native int
        ldc.i4.8
        blt.s   Lproceed
        ldarg.0
        ret

    Lproceed:                   // JIT-eliminated on 64-bit platform
        ldarg.0
        dup
        conv.u4
        stloc   lo
        ldc.i4  32
        shr.un
        conv.u4
        stloc   hi

        ldloc   lo
        ldc.i4  0x55555555
        and
        dup
        ldc.i4.1
        shr.un
        or
        ldc.i4  0x33333333
        and
        dup
        ldc.i4.2
        shr.un
        or
        ldc.i4  0x0f0f0f0f
        and
        dup
        ldc.i4.4
        shr.un
        or
        ldc.i4  0x00ff00ff
        and
        dup
        ldc.i4.8
        shr.un
        or
        ldc.i4  0x0000ffff
        and
        stloc   t0

        ldloc   hi
        ldc.i4  0x55555555
        and
        dup
        ldc.i4.1
        shr.un
        or
        ldc.i4  0x33333333
        and
        dup
        ldc.i4.2
        shr.un
        or
        ldc.i4  0x0f0f0f0f
        and
        dup
        ldc.i4.4
        shr.un
        or
        ldc.i4  0x00ff00ff
        and
        dup
        ldc.i4.8
        shr.un
        or
        ldc.i4  16
        shl
        stloc   t1

        ldloc   lo
        ldc.i4  0xaaaaaaaa
        and
        dup
        ldc.i4.1
        shl
        or
        ldc.i4  0xcccccccc
        and
        dup
        ldc.i4.2
        shl
        or
        ldc.i4  0xf0f0f0f0
        and
        dup
        ldc.i4.4
        shl
        or
        ldc.i4  0xff00ff00
        and
        dup
        ldc.i4.8
        shl
        or
        ldc.i4  16
        shr.un
        stloc   lo

        ldloc   hi
        ldc.i4  0xaaaaaaaa
        and
        dup
        ldc.i4.1
        shl
        or
        ldc.i4  0xcccccccc
        and
        dup
        ldc.i4.2
        shl
        or
        ldc.i4  0xf0f0f0f0
        and
        dup
        ldc.i4.4
        shl
        or
        ldc.i4  0xff00ff00
        and
        dup
        ldc.i4.8
        shl
        or
        ldc.i4  0xffff0000
        and
        stloc   hi

        ldloc   lo
        ldloc   hi
        or
        conv.u8
        ldc.i4  32
        shl
        ldloc   t0
        ldloc   t1
        or
        conv.u8
        or

        ret
    }

    .method private hidebysig static
    unsigned int64 BitDeinterleave(unsigned int64 v)
    {
        .locals (
            unsigned int32 lo,
            unsigned int32 hi,
            unsigned int32 t0,
            unsigned int32 t1 )

        sizeof  native int
        ldc.i4.8
        blt.s   Lproceed
        ldarg.0
        ret

    Lproceed:                   // JIT-eliminated on 64-bit platform
        ldarg.0
        dup
        conv.u4
        stloc   lo
        ldc.i4  32
        shr.un
        conv.u4
        stloc   hi

        ldloc   lo
        ldc.i4  0x0000ffff
        and
        dup
        ldc.i4.8
        shl
        or
        ldc.i4  0x00ff00ff
        and
        dup
        ldc.i4.4
        shl
        or
        ldc.i4  0x0f0f0f0f
        and
        dup
        ldc.i4.2
        shl
        or
        ldc.i4  0x33333333
        and
        dup
        ldc.i4.1
        shl
        or
        ldc.i4  0x55555555
        and
        stloc   t0

        ldloc   hi
        ldc.i4  16
        shl
        dup
        ldc.i4.8
        shr.un
        or
        ldc.i4  0xff00ff00
        and
        dup
        ldc.i4.4
        shr.un
        or
        ldc.i4  0xf0f0f0f0
        and
        dup
        ldc.i4.2
        shr.un
        or
        ldc.i4  0xcccccccc
        and
        dup
        ldc.i4.1
        shr.un
        or
        ldc.i4  0xaaaaaaaa
        and
        stloc   t1

        ldloc   lo
        ldc.i4  16
        shr.un
        dup
        ldc.i4.8
        shl
        or
        ldc.i4  0x00ff00ff
        and
        dup
        ldc.i4.4
        shl
        or
        ldc.i4  0x0f0f0f0f
        and
        dup
        ldc.i4.2
        shl
        or
        ldc.i4  0x33333333
        and
        dup
        ldc.i4.1
        shl
        or
        ldc.i4  0x55555555
        and
        stloc   lo

        ldloc   hi
        ldc.i4  0xffff0000
        and
        dup
        ldc.i4.8
        shr.un
        or
        ldc.i4  0xff00ff00
        and
        dup
        ldc.i4.4
        shr.un
        or
        ldc.i4  0xf0f0f0f0
        and
        dup
        ldc.i4.2
        shr.un
        or
        ldc.i4  0xcccccccc
        and
        dup
        ldc.i4.1
        shr.un
        or
        ldc.i4  0xaaaaaaaa
        and
        stloc   hi

        ldloc   lo
        ldloc   hi
        or
        conv.u8
        ldc.i4  32
        shl
        ldloc   t0
        ldloc   t1
        or
        conv.u8
        or

        ret
    }
___

sub KeccakF1600 {
my ($bits) = @_;

$code.=<<___;
    .method private hidebysig instance
    void KeccakF1600_${bits}()
    {
        .locals (
            unsigned int64 C0,
            unsigned int64 C1,
            unsigned int64 C2,
            unsigned int64 C3,
            unsigned int64 C4,

            unsigned int64 D0,
            unsigned int64 D1,
            unsigned int64 D2,
            unsigned int64 D3,
            unsigned int64 D4,

            unsigned int64 T00,
            unsigned int64 T01,
            unsigned int64 T02,
            unsigned int64 T03,
            unsigned int64 T04,

            unsigned int64 T10,
            unsigned int64 T11,
            unsigned int64 T12,
            unsigned int64 T13,
            unsigned int64 T14,
            unsigned int64 *iota,
            int32 counter )
___
$code.=<<___    if ($bits == 64);
        sizeof  native int
        ldc.i4.8
        bge.s   Lproceed
        ret

    Lproceed:                   // JIT-eliminated on 32-bit platform
___
$code.=<<___    if ($bits == 32);
        .locals (
            unsigned int32 AT )

        sizeof  native int
        ldc.i4.8
        blt.s   Lproceed
        ret

    Lproceed:                   // JIT-eliminated on 64-bit platform
___
$code.=<<___;
        ldsflda valuetype Keccak1600.Iotas ${class}::iotas${bits}
        stloc   iota
        ldc.i4  24
        stloc   counter

    Loop:
___
for ($i = 0; $i < 5; $i++) {
$code.=<<___;
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][$i]
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][$i]
        xor
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][$i]
        xor
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][$i]
        xor
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][$i]
        xor
        stloc   $C[$i]
___
}
for ($i = 0; $i < 5; $i++) {
$code.=<<___;
        `&ROL${bits}("$C[($i+1)%5]",1)`
        ldloc   $C[($i+4)%5]
        xor
        stloc   $D[$i]
___
}
$code.=<<___;
        // T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][0]
        ldloc   $D[0]
        xor
        stloc   $T[0][0]

        // T[0][1] = A[0][1] ^ D[1];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][1]
        ldloc   $D[1]
        xor
        stloc   $T[0][1]

        // T[0][2] = A[0][2] ^ D[2];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][2]
        ldloc   $D[2]
        xor
        stloc   $T[0][2]

        // T[0][3] = A[0][3] ^ D[3];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][3]
        ldloc   $D[3]
        xor
        stloc   $T[0][3]

        // T[0][4] = A[0][4] ^ D[4];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][4]
        ldloc   $D[4]
        xor
        stloc   $T[0][4]

        // C[0] =       A[0][0] ^ D[0]; /* rotate by 0 */
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[0][0]
        ldloc   $D[0]
        xor
        stloc   $C[0]

        // C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][1]
        ldloc   $D[1]
        xor
        stloc   $C[1]
        `&ROL${bits}("$C[1]",$rhotates[1][1])`
        stloc   $C[1]

        // C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][2]
        ldloc   $D[2]
        xor
        stloc   $C[2]
        `&ROL${bits}("$C[2]",$rhotates[2][2])`
        stloc   $C[2]

        // C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][3]
        ldloc   $D[3]
        xor
        stloc   $C[3]
        `&ROL${bits}("$C[3]",$rhotates[3][3])`
        stloc   $C[3]

        // C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][4]
        ldloc   $D[4]
        xor
        stloc   $C[4]
        `&ROL${bits}("$C[4]",$rhotates[4][4])`
        stloc   $C[4]

        // A[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i++];
        ldarg.0
        ldloc   $C[1]
        ldloc   $C[2]
        or
        ldloc   $C[0]
        xor
        ldloc   iota
        ldind.u8
        xor
        stfld   unsigned int64 ${class}::$A[0][0]
        ldloc   iota
        ldc.i4.8
        add
        stloc   iota

        // A[0][1] = C[1] ^ (~C[2] | C[3]);
        ldarg.0
        ldloc   $C[2]
        not
        ldloc   $C[3]
        or
        ldloc   $C[1]
        xor
        stfld   unsigned int64 ${class}::$A[0][1]

        // A[0][2] = C[2] ^ ( C[3] & C[4]);
        ldarg.0
        ldloc   $C[3]
        ldloc   $C[4]
        and
        ldloc   $C[2]
        xor
        stfld   unsigned int64 ${class}::$A[0][2]

        // A[0][3] = C[3] ^ ( C[4] | C[0]);
        ldarg.0
        ldloc   $C[4]
        ldloc   $C[0]
        or
        ldloc   $C[3]
        xor
        stfld   unsigned int64 ${class}::$A[0][3]

        // A[0][4] = C[4] ^ ( C[0] & C[1]);
        ldarg.0
        ldloc   $C[0]
        ldloc   $C[1]
        and
        ldloc   $C[4]
        xor
        stfld   unsigned int64 ${class}::$A[0][4]

        // T[1][0] = A[1][0] ^ D[0];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][0]
        ldloc   $D[0]
        xor
        stloc   $T[1][0]

        // T[1][1] = A[2][1] ^ D[1]; /* borrow T[1][1] */
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][1]
        ldloc   $D[1]
        xor
        stloc   $T[1][1]

        // T[1][2] = A[1][2] ^ D[2];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][2]
        ldloc   $D[2]
        xor
        stloc   $T[1][2]

        // T[1][3] = A[1][3] ^ D[3];
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][3]
        ldloc   $D[3]
        xor
        stloc   $T[1][3]

        // T[1][4] = A[2][4] ^ D[4]; /* borrow T[1][4] */
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][4]
        ldloc   $D[4]
        xor
        stloc   $T[1][4]

        // C[0] = ROL64(T[0][3],        rhotates[0][3]);
        `&ROL${bits}("$T[0][3]",$rhotates[0][3])`
        stloc   $C[0]

        // C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[1][4]
        ldloc   $D[4]
        xor
        stloc   $C[1]
        `&ROL${bits}("$C[1]",$rhotates[1][4])`
        stloc   $C[1]

        // C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][0]
        ldloc   $D[0]
        xor
        stloc   $C[2]
        `&ROL${bits}("$C[2]",$rhotates[2][0])`
        stloc   $C[2]

        // C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][1]
        ldloc   $D[1]
        xor
        stloc   $C[3]
        `&ROL${bits}("$C[3]",$rhotates[3][1])`
        stloc   $C[3]

        // C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][2]
        ldloc   $D[2]
        xor
        stloc   $C[4]
        `&ROL${bits}("$C[4]",$rhotates[4][2])`
        stloc   $C[4]

        // A[1][0] = C[0] ^ (C[1] |  C[2]);
        ldarg.0
        ldloc   $C[1]
        ldloc   $C[2]
        or
        ldloc   $C[0]
        xor
        stfld   unsigned int64 ${class}::$A[1][0]

        // A[1][1] = C[1] ^ (C[2] &  C[3]);
        ldarg.0
        ldloc   $C[2]
        ldloc   $C[3]
        and
        ldloc   $C[1]
        xor
        stfld   unsigned int64 ${class}::$A[1][1]

        // A[1][2] = C[2] ^ (C[3] | ~C[4]);
        ldarg.0
        ldloc   $C[3]
        ldloc   $C[4]
        not
        or
        ldloc   $C[2]
        xor
        stfld   unsigned int64 ${class}::$A[1][2]

        // A[1][3] = C[3] ^ (C[4] |  C[0]);
        ldarg.0
        ldloc   $C[4]
        ldloc   $C[0]
        or
        ldloc   $C[3]
        xor
        stfld   unsigned int64 ${class}::$A[1][3]

        // A[1][4] = C[4] ^ (C[0] &  C[1]);
        ldarg.0
        ldloc   $C[0]
        ldloc   $C[1]
        and
        ldloc   $C[4]
        xor
        stfld   unsigned int64 ${class}::$A[1][4]

        // C[0] = ROL64(T[0][1],        rhotates[0][1]);
        `&ROL${bits}("$T[0][1]",$rhotates[0][1])`
        stloc   $C[0]

        // C[1] = ROL64(T[1][2],        rhotates[1][2]);
        `&ROL${bits}("$T[1][2]",$rhotates[1][2])`
        stloc   $C[1]

        // C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[2][3]
        ldloc   $D[3]
        xor
        stloc   $C[2]
        `&ROL${bits}("$C[2]",$rhotates[2][3])`
        stloc   $C[2]

        // C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][4]
        ldloc   $D[4]
        xor
        stloc   $C[3]
        `&ROL${bits}("$C[3]",$rhotates[3][4])`
        stloc   $C[3]

        // C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][0]
        ldloc   $D[0]
        xor
        stloc   $C[4]
        `&ROL${bits}("$C[4]",$rhotates[4][0])`
        stloc   $C[4]

        // A[2][0] =  C[0] ^ ( C[1] | C[2]);
        ldarg.0
        ldloc   $C[1]
        ldloc   $C[2]
        or
        ldloc   $C[0]
        xor
        stfld   unsigned int64 ${class}::$A[2][0]

        // A[2][1] =  C[1] ^ ( C[2] & C[3]);
        ldarg.0
        ldloc   $C[2]
        ldloc   $C[3]
        and
        ldloc   $C[1]
        xor
        stfld   unsigned int64 ${class}::$A[2][1]

        // A[2][2] =  C[2] ^ (~C[3] & C[4]);
        ldarg.0
        ldloc   $C[3]
        not
        ldloc   $C[4]
        and
        ldloc   $C[2]
        xor
        stfld   unsigned int64 ${class}::$A[2][2]

        // A[2][3] = ~C[3] ^ ( C[4] | C[0]);
        ldarg.0
        ldloc   $C[4]
        ldloc   $C[0]
        or
        ldloc   $C[3]
        not
        xor
        stfld   unsigned int64 ${class}::$A[2][3]

        // A[2][4] =  C[4] ^ ( C[0] & C[1]);
        ldarg.0
        ldloc   $C[0]
        ldloc   $C[1]
        and
        ldloc   $C[4]
        xor
        stfld   unsigned int64 ${class}::$A[2][4]

        // C[0] = ROL64(T[0][4],        rhotates[0][4]);
        `&ROL${bits}("$T[0][4]",$rhotates[0][4])`
        stloc   $C[0]

        // C[1] = ROL64(T[1][0],        rhotates[1][0]);
        `&ROL${bits}("$T[1][0]",$rhotates[1][0])`
        stloc   $C[1]

        // C[2] = ROL64(T[1][1],        rhotates[2][1]); /* originally A[2][1] */
        `&ROL${bits}("$T[1][1]",$rhotates[2][1])`
        stloc   $C[2]

        // C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[3][2]
        ldloc   $D[2]
        xor
        stloc   $C[3]
        `&ROL${bits}("$C[3]",$rhotates[3][2])`
        stloc   $C[3]

        // C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][3]
        ldloc   $D[3]
        xor
        stloc   $C[4]
        `&ROL${bits}("$C[4]",$rhotates[4][3])`
        stloc   $C[4]

        // A[3][0] =  C[0] ^ ( C[1] & C[2]);
        ldarg.0
        ldloc   $C[1]
        ldloc   $C[2]
        and
        ldloc   $C[0]
        xor
        stfld   unsigned int64 ${class}::$A[3][0]

        // A[3][1] =  C[1] ^ ( C[2] | C[3]);
        ldarg.0
        ldloc   $C[2]
        ldloc   $C[3]
        or
        ldloc   $C[1]
        xor
        stfld   unsigned int64 ${class}::$A[3][1]

        // A[3][2] =  C[2] ^ (~C[3] | C[4]);
        ldarg.0
        ldloc   $C[3]
        not
        ldloc   $C[4]
        or
        ldloc   $C[2]
        xor
        stfld   unsigned int64 ${class}::$A[3][2]

        // A[3][3] = ~C[3] ^ ( C[4] & C[0]);
        ldarg.0
        ldloc   $C[4]
        ldloc   $C[0]
        and
        ldloc   $C[3]
        not
        xor
        stfld   unsigned int64 ${class}::$A[3][3]

        // A[3][4] =  C[4] ^ ( C[0] | C[1]);
        ldarg.0
        ldloc   $C[0]
        ldloc   $C[1]
        or
        ldloc   $C[4]
        xor
        stfld   unsigned int64 ${class}::$A[3][4]

        // C[0] = ROL64(T[0][2],        rhotates[0][2]);
        `&ROL${bits}("$T[0][2]",$rhotates[0][2])`
        stloc   $C[0]

        // C[1] = ROL64(T[1][3],        rhotates[1][3]);
        `&ROL${bits}("$T[1][3]",$rhotates[1][3])`
        stloc   $C[1]

        // C[2] = ROL64(T[1][4],        rhotates[2][4]); /* originally A[2][4] */
        `&ROL${bits}("$T[1][4]",$rhotates[2][4])`
        stloc   $C[2]

        // C[3] = ROL64(T[0][0],        rhotates[3][0]); /* originally A[3][0] */
        `&ROL${bits}("$T[0][0]",$rhotates[3][0])`
        stloc   $C[3]

        // C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
        ldarg.0
        ldfld   unsigned int64 ${class}::$A[4][1]
        ldloc   $D[1]
        xor
        stloc   $C[4]
        `&ROL${bits}("$C[4]",$rhotates[4][1])`
        stloc   $C[4]

        // A[4][0] =  C[0] ^ (~C[1] & C[2]);
        ldarg.0
        ldloc   $C[1]
        not
        ldloc   $C[2]
        and
        ldloc   $C[0]
        xor
        stfld   unsigned int64 ${class}::$A[4][0]

        // A[4][1] = ~C[1] ^ ( C[2] | C[3]);
        ldarg.0
        ldloc   $C[2]
        ldloc   $C[3]
        or
        ldloc   $C[1]
        not
        xor
        stfld   unsigned int64 ${class}::$A[4][1]

        // A[4][2] =  C[2] ^ ( C[3] & C[4]);
        ldarg.0
        ldloc   $C[3]
        ldloc   $C[4]
        and
        ldloc   $C[2]
        xor
        stfld   unsigned int64 ${class}::$A[4][2]

        // A[4][3] =  C[3] ^ ( C[4] | C[0]);
        ldarg.0
        ldloc   $C[4]
        ldloc   $C[0]
        or
        ldloc   $C[3]
        xor
        stfld   unsigned int64 ${class}::$A[4][3]

        // A[4][4] =  C[4] ^ ( C[0] & C[1]);
        ldarg.0
        ldloc   $C[0]
        ldloc   $C[1]
        and
        ldloc   $C[4]
        xor
        stfld   unsigned int64 ${class}::$A[4][4]

        ldloc   counter
        ldc.i4.1
        sub
        dup
        stloc   counter
        brtrue  Loop

        ret
    }

___
}

&KeccakF1600(64);
&KeccakF1600(32);

$code.=<<___;
    .method public hidebysig instance
    int32 Absorb(unsigned int8[] inp, int32 bsz)
    {
        .locals init (
            unsigned int8& pinned pinp,
            unsigned int64& pinned Aij )
        .locals (
            unsigned int64 *ptr,
            native unsigned int len,
            int32 counter )

        ldarg.1
        ldlen
        dup
        stloc   len
        ldarg.2
        blt.un  Ldone64

        ldarg.0
        ldflda  unsigned int64 ${class}::$A[0][0]
        stloc   Aij

        ldarg.1
        ldc.i4.0
        ldelema unsigned int8
        dup
        stloc   pinp
        conv.u
        stloc   ptr

        sizeof  native int
        ldc.i4.8
        blt.s   Lblock32

    Lblock64:
        ldc.i4.0
        stloc   counter         // counter = 0

    Loop64:
        ldloc   Aij
        ldloc   counter
        add
        dup
        ldind.u8
        ldloc   ptr
        unaligned. 1
        ldind.u8
        xor
        stind.i8

        ldloc   ptr
        ldc.i4.8
        add
        stloc   ptr             // ptr += 8
        ldloc   counter
        ldc.i4.8
        add
        dup
        stloc   counter         // counter -= 8
        ldarg.2
        blt.s   Loop64          // counter < bsz?

        ldarg.0
        call    instance void class ${class}::KeccakF1600_64()

        ldloc   len
        ldarg.2
        sub
        dup
        stloc   len             // len -= bsz
        ldarg.2
        bge.un  Loop64          // len >= bsz?

    Ldone64:
        ldloc   len
        conv.i4
        ret

    // 32-bit code path //////////////////////////////////////////////////////
    Lblock32:
        ldc.i4.0
        stloc   counter         // counter = 0

    Loop32:
        ldloc   Aij
        ldloc   counter
        add
        dup
        ldind.u8
        ldloc   ptr
        unaligned. 1
        ldind.u8
        call    unsigned int64 ${class}::BitInterleave(unsigned int64)
        xor
        stind.i8

        ldloc   ptr
        ldc.i4.8
        add
        stloc   ptr             // ptr += 8
        ldloc   counter
        ldc.i4.8
        add
        dup
        stloc   counter         // counter -= 8
        ldarg.2
        blt.s   Loop32          // counter < bsz?

        ldarg.0
        call    instance void class ${class}::KeccakF1600_32()

        ldloc   len
        ldarg.2
        sub
        dup
        stloc   len             // len -= bsz
        ldarg.2
        bge.un  Loop32          // len >= bsz?

    Ldone32:
        ldloc   len
        conv.i4
        ret
    }

    .method public hidebysig instance
    void Squeeze(unsigned int8[] res, int32 bsz)
    {
        .locals init (
            unsigned int8& pinned pres,
            unsigned int64& pinned Aij )
        .locals (
            unsigned int64 *ptr,
            native unsigned int len,
            int32 counter,
            unsigned int64 tail )

        ldarg.1
        ldlen
        dup
        stloc   len
        brfalse Ldone64

        ldarg.0
        ldflda  unsigned int64 ${class}::$A[0][0]
        stloc   Aij

        ldarg.1
        ldc.i4.0
        ldelema unsigned int8
        dup
        stloc   pres
        conv.u
        stloc   ptr

        ldarg.0
        call    instance void ${class}::Complement()

        ldc.i4.0
        stloc   counter

        sizeof  native int
        ldc.i4.8
        blt     Loop32

    Loop64:
        ldloc   len
        ldc.i4.8
        blt.un  Ltail64

        ldloc   ptr
        dup

        ldloc   Aij
        ldloc   counter
        add
        ldind.u8
        unaligned. 1
        stind.i8

        ldc.i4.8
        add
        stloc   ptr             // ptr += 8

        ldloc   len
        ldc.i4.8
        sub
        dup
        stloc   len             // len -= 8
        brfalse Ldone64         // len == 0?

        ldloc   counter
        ldc.i4.8
        add
        dup
        stloc   counter
        ldarg.2
        bne.un  Loop64

        ldarg.0
        call    instance void ${class}::Complement()
        ldarg.0
        call    instance void ${class}::KeccakF1600_64()
        ldarg.0
        call    instance void ${class}::Complement()
        ldc.i4.0
        stloc   counter
        br.s    Loop64

    Ltail64:
        ldloc   Aij
        ldloc   counter
        add
        ldind.u8
        stloc   tail

    Loop_tail64:
        ldloc   ptr
        dup
        ldloc   tail
        conv.i1
        stind.i1
        ldc.i4.1
        add
        stloc   ptr
        ldloc   tail
        ldc.i4.8
        shr
        stloc   tail
        ldloc   len
        ldc.i4.1
        sub
        dup
        stloc   len
        brtrue  Loop_tail64

    Ldone64:
        ret

    // 32-bit code path //////////////////////////////////////////////////////
    Loop32:
        ldloc   len
        ldc.i4.8
        blt.un  Ltail32

        ldloc   ptr
        dup

        ldloc   Aij
        ldloc   counter
        add
        ldind.u8
        call    unsigned int64 ${class}::BitDeinterleave(unsigned int64)
        unaligned. 1
        stind.i8

        ldc.i4.8
        add
        stloc   ptr             // ptr += 8

        ldloc   len
        ldc.i4.8
        sub
        dup
        stloc   len             // len -= 8
        brfalse Ldone32         // len == 0?

        ldloc   counter
        ldc.i4.8
        add
        dup
        stloc   counter
        ldarg.2
        bne.un  Loop32

        ldarg.0
        call    instance void ${class}::Complement()
        ldarg.0
        call    instance void ${class}::KeccakF1600_32()
        ldarg.0
        call    instance void ${class}::Complement()
        ldc.i4.0
        stloc   counter
        br.s    Loop32

    Ltail32:
        ldloc   Aij
        ldloc   counter
        add
        ldind.u8
        call    unsigned int64 ${class}::BitDeinterleave(unsigned int64)
        stloc   tail

    Loop_tail32:
        ldloc   ptr
        dup
        ldloc   tail
        conv.i1
        stind.i1
        ldc.i4.1
        add
        stloc   ptr
        ldloc   tail
        ldc.i4.8
        shr.un
        stloc   tail
        ldloc   len
        ldc.i4.1
        sub
        dup
        stloc   len
        brtrue  Loop_tail32

    Ldone32:
        ret
    }

    .data iotas32 = {
        int64(0x0000000000000001),  int64(0x0000008900000000),
        int64(0x8000008b00000000),  int64(0x8000808000000000),
        int64(0x0000008b00000001),  int64(0x0000800000000001),
        int64(0x8000808800000001),  int64(0x8000008200000001),
        int64(0x0000000b00000000),  int64(0x0000000a00000000),
        int64(0x0000808200000001),  int64(0x0000800300000000),
        int64(0x0000808b00000001),  int64(0x8000000b00000001),
        int64(0x8000008a00000001),  int64(0x8000008100000001),
        int64(0x8000008100000000),  int64(0x8000000800000000),
        int64(0x0000008300000000),  int64(0x8000800300000000),
        int64(0x8000808800000001),  int64(0x8000008800000000),
        int64(0x0000800000000001),  int64(0x8000808200000000)
    }
    .data iotas64 = {
        int64(0x0000000000000001),  int64(0x0000000000008082),
        int64(0x800000000000808a),  int64(0x8000000080008000),
        int64(0x000000000000808b),  int64(0x0000000080000001),
        int64(0x8000000080008081),  int64(0x8000000000008009),
        int64(0x000000000000008a),  int64(0x0000000000000088),
        int64(0x0000000080008009),  int64(0x000000008000000a),
        int64(0x000000008000808b),  int64(0x800000000000008b),
        int64(0x8000000000008089),  int64(0x8000000000008003),
        int64(0x8000000000008002),  int64(0x8000000000000080),
        int64(0x000000000000800a),  int64(0x800000008000000a),
        int64(0x8000000080008081),  int64(0x8000000000008080),
        int64(0x0000000080000001),  int64(0x8000000080008008)
    }
}
___

foreach(split("\n",$code)) {
    s/\`([^\`]*)\`/eval($1)/ge;
    print $_, "\n";
}
close STDOUT;