#! /usr/bin/env perl
# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# This module implements support for Armv8 SM3 instructions

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate the arm-xlate.pl translator relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Message expanding:
#	Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
# Input: s0, s1, s2, s3
#	s0 = w0  | w1  | w2  | w3
#	s1 = w4  | w5  | w6  | w7
#	s2 = w8  | w9  | w10 | w11
#	s3 = w12 | w13 | w14 | w15
# Output: s4
# vtmp1 and vtmp2 are scratch vector registers clobbered by the sequence.
# Appends the generated instructions to $code.
sub msg_exp {
my ($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2) = @_;

$code.=<<___;
	// s4 = w7  | w8  | w9  | w10
	ext	$s4.16b, $s1.16b, $s2.16b, #12
	// vtmp1 = w3  | w4  | w5  | w6
	ext	$vtmp1.16b, $s0.16b, $s1.16b, #12
	// vtmp2 = w10 | w11 | w12 | w13
	ext	$vtmp2.16b, $s2.16b, $s3.16b, #8
	sm3partw1	$s4.4s, $s0.4s, $s3.4s
	sm3partw2	$s4.4s, $vtmp2.4s, $vtmp1.4s
___
}

# A round of compression function
# Input:
#	ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
#	vstate0 - vstate1, store digest status(A - H)
#	vconst0 - vconst1, interleaved used to store Tj <<< j
#	vtmp - temporary register
#	vw - for sm3tt1ab, vw = s0 eor s1
#	s0 - for sm3tt2ab, just be s0
#	i, choose wj' or wj from vw
# Besides the round itself, the emitted shl/sri pair writes vconst0 rotated
# left by one bit into vconst1, ready for the following round.
# Appends the generated instructions to $code.
sub round {
my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp, $vw, $s0, $i) = @_;

$code.=<<___;
	sm3ss1	$vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
	shl	$vconst1.4s, $vconst0.4s, #1
	sri	$vconst1.4s, $vconst0.4s, #31
	sm3tt1$ab	$vstate0.4s, $vtmp.4s, $vw.4s[$i]
	sm3tt2$ab	$vstate1.4s, $vtmp.4s, $s0.4s[$i]
___
}

# Four consecutive rounds consuming the four words held in s0.
# Input:
#	ab - "a" or "b", forwarded to round() to select the sm3tt variant
#	vstate0, vstate1 - digest state registers
#	vconst0, vconst1 - round constant registers, swapped on every round
#	vtmp1, vtmp2 - temporary registers
#	s0 - s3 - message word registers
#	s4 - optional; when given, msg_exp() is emitted first to compute the
#	     next four message words into it
sub qround {
my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp1, $vtmp2,
    $s0, $s1, $s2, $s3, $s4) = @_;

if($s4) {
	&msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
}

$code.=<<___;
	eor	$vtmp1.16b, $s0.16b, $s1.16b
___

# The constant registers alternate roles because each round leaves the
# rotated constant in its second constant argument.
&round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
       $vtmp1, $s0, 0);
&round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
       $vtmp1, $s0, 1);
&round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
       $vtmp1, $s0, 2);
&round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
       $vtmp1, $s0, 3);
}

$code=<<___;
#include "arm_arch.h"
.arch	armv8.2-a
.text
___

{{{
my ($pstate,$pdata,$num)=("x0","x1","w2");
my ($state1,$state2)=("v5","v6");
my ($sconst1, $sconst2)=("s16","s17");
my ($vconst1, $vconst2)=("v16","v17");
my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
my ($bkstate1,$bkstate2)=("v18","v19");
my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
my ($vtmp1,$vtmp2)=("v22","v23");
my $constaddr="x8";
# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
$code.=<<___;
.globl	ossl_hwsm3_block_data_order
.type	ossl_hwsm3_block_data_order,%function
.align	5
ossl_hwsm3_block_data_order:
	// load state
	ld1	{$state1.4s-$state2.4s}, [$pstate]
	rev64	$state1.4s, $state1.4s
	rev64	$state2.4s, $state2.4s
	ext	$state1.16b, $state1.16b, $state1.16b, #8
	ext	$state2.16b, $state2.16b, $state2.16b, #8

	adr	$constaddr, .Tj
	ldp	$sconst1, $sconst2, [$constaddr]

.Loop:
	// load input
	ld1	{$s0.16b-$s3.16b}, [$pdata], #64
	sub	$num, $num, #1

	mov	$bkstate1.16b, $state1.16b
	mov	$bkstate2.16b, $state2.16b

#ifndef __ARMEB__
	rev32	$s0.16b, $s0.16b
	rev32	$s1.16b, $s1.16b
	rev32	$s2.16b, $s2.16b
	rev32	$s3.16b, $s3.16b
#endif

	ext	$vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
___

# 4 "a" quad-rounds, all with message expansion.
&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s0,$s1,$s2,$s3,$s4);
&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s1,$s2,$s3,$s4,$s0);
&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s2,$s3,$s4,$s0,$s1);
&qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s3,$s4,$s0,$s1,$s2);

$code.=<<___;
	ext	$vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
___

# 12 "b" quad-rounds; the last three omit the fifth message register and
# therefore emit no message expansion.
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s4,$s0,$s1,$s2,$s3);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s0,$s1,$s2,$s3,$s4);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s1,$s2,$s3,$s4,$s0);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s2,$s3,$s4,$s0,$s1);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s3,$s4,$s0,$s1,$s2);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s4,$s0,$s1,$s2,$s3);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s0,$s1,$s2,$s3,$s4);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s1,$s2,$s3,$s4,$s0);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s2,$s3,$s4,$s0,$s1);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s3,$s4);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s4,$s0);
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
	$s0,$s1);

$code.=<<___;
	eor	$state1.16b, $state1.16b, $bkstate1.16b
	eor	$state2.16b, $state2.16b, $bkstate2.16b

	// any remained blocks?
	cbnz	$num, .Loop

	// save state
	rev64	$state1.4s, $state1.4s
	rev64	$state2.4s, $state2.4s
	ext	$state1.16b, $state1.16b, $state1.16b, #8
	ext	$state2.16b, $state2.16b, $state2.16b, #8
	st1	{$state1.4s-$state2.4s}, [$pstate]
	ret
.size	ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order

.align	3
.Tj:
.word	0x79cc4519, 0x9d8a7a87
___
}}}

#########################################
# Base opcodes of the SM3 instructions.  The helpers below OR the register
# numbers into these and emit the result as a raw .inst word, so the output
# does not depend on the assembler knowing the SM3 mnemonics.
my %sm3partopcode = (
	"sm3partw1"         =>   0xce60C000,
	"sm3partw2"         =>   0xce60C400);

my %sm3ss1opcode = (
	"sm3ss1"            =>   0xce400000);

my %sm3ttopcode = (
	"sm3tt1a"           =>   0xce408000,
	"sm3tt1b"           =>   0xce408400,
	"sm3tt2a"           =>   0xce408800,
	"sm3tt2b"           =>   0xce408C00);

# Encode a three-register sm3partw1/sm3partw2 instruction.
# Registers go to bit 0, bit 5 and bit 16 in operand order.  Returns the
# ".inst" line, or a false value when $arg does not parse.
sub unsm3part {
	my ($mnemonic,$arg)=@_;

	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
			$mnemonic,$arg;
}

# Encode the four-register sm3ss1 instruction.
# Registers go to bit 0, bit 5, bit 16 and bit 10 in operand order.  Returns
# the ".inst" line, or a false value when $arg does not parse.
sub unsm3ss1 {
	my ($mnemonic,$arg)=@_;

	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
			$mnemonic,$arg;
}

# Encode an sm3tt1a/sm3tt1b/sm3tt2a/sm3tt2b instruction.
# Registers go to bit 0, bit 5 and bit 16 in operand order; the element
# index (0-3) of the last operand goes to bit 12.  Returns the ".inst" line,
# or a false value when $arg does not parse.
sub unsm3tt {
	my ($mnemonic,$arg)=@_;

	$arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
			$mnemonic,$arg;
}

# Copy this script's leading comment block (the license header) into the
# output, turning '#' comments into '//' comments.  Copying stops at the
# first line that is neither a comment nor blank.  The copy is best-effort:
# if the script cannot be reopened the header is simply omitted.
if (open SELF,"<",$0) {
	while(<SELF>) {
		next if (/^#!/);
		last if (!s/^#/\/\// and !/^$/);
		print;
	}
	close SELF;
}

# Post-process the generated code line by line: evaluate `...` expressions
# and replace SM3 mnemonics with their .inst encodings.
foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
	s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
	s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";