#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use with OpenSSL.
# ====================================================================
#
# Poly1305 hash for RISC-V.
#
# February 2019
#
# In the essence it's pretty straightforward transliteration of MIPS
# module [without big-endian option].
#
# 3.9 cycles per byte on U74, ~60% faster than compiler-generated code.
# 1.9 cpb on C910, ~75% improvement. 1.75 cpb on JH7110 (U74 with
# apparently better multiplier), ~120% faster.
#
# June 2024.
#
# Add CHERI support.
#
######################################################################
#
# Symbolic names for the RISC-V integer register file (x0-x31); the
# generated code below refers to registers only through these Perl
# variables, and the CHERI post-processing pass rewrites "xN" to "cN".
($zero,$ra,$sp,$gp,$tp)=map("x$_",(0..4));
($t0,$t1,$t2,$t3,$t4,$t5,$t6)=map("x$_",(5..7,28..31));
($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(10..17));
($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("x$_",(8,9,18..27));
#
######################################################################

# First argument selects the flavour: anything matching /64/ takes the
# 64-bit code path, everything else the 32-bit one; a flavour starting
# with "cheri" additionally enables the capability-mode rewriting in
# the output loop at the bottom. The last argument that looks like a
# file name is the output file.
$flavour = shift || "64";

for (@ARGV) {   $output=$_ if (/\w[\w\-]*\.\w+$/);   }
# NOTE(review): two-arg open without error check, per cryptogams house
# style; kept as is to stay consistent with the rest of the family.
open STDOUT,">$output";

if ($flavour =~ /64/) {{{
######################################################################
# 64-bit code path...
#
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$t0,$t1,$t2);

$code.=<<___;
#if __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
	sd	$zero,0($ctx)
	sd	$zero,8($ctx)
	sd	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__CHERI_PURE_CAPABILITY__
	andi	$tmp0,$inp,7		# $inp % 8
	andi	$inp,$inp,-8		# align $inp
	slli	$tmp0,$tmp0,3		# byte to bit offset
#endif
	ld	$in0,0($inp)
	ld	$in1,8($inp)
#ifndef	__CHERI_PURE_CAPABILITY__
	beqz	$tmp0,.Laligned_key

	ld	$tmp2,16($inp)
	neg	$tmp1,$tmp0		# implicit &63 in sll
	srl	$in0,$in0,$tmp0
	sll	$tmp3,$in1,$tmp1
	srl	$in1,$in1,$tmp0
	sll	$tmp2,$tmp2,$tmp1
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_key:
#endif
	li	$tmp0,1
	slli	$tmp0,$tmp0,32		# 0x0000000100000000
	addi	$tmp0,$tmp0,-63		# 0x00000000ffffffc1
	slli	$tmp0,$tmp0,28		# 0x0ffffffc10000000
	addi	$tmp0,$tmp0,-1		# 0x0ffffffc0fffffff

	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc0ffffffc
	and	$in1,$in1,$tmp0

	sd	$in0,24($ctx)
	srli	$tmp0,$in1,2
	sd	$in1,32($ctx)
	add	$tmp0,$tmp0,$in1	# s1 = r1 + (r1 >> 2)
	sd	$tmp0,40($ctx)

.Lno_key:
	li	$a0,0			# return 0
	ret
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) = ($s0,$s1,$s2,$s3,$t3,$t4,$in0,$in1,$t2);
my ($shr,$shl) = ($t5,$t6);		# used on R6

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Lno_data

	caddi	$sp,$sp,-4*__SIZEOF_POINTER__
	PUSH	$s0,3*__SIZEOF_POINTER__($sp)
	PUSH	$s1,2*__SIZEOF_POINTER__($sp)
	PUSH	$s2,1*__SIZEOF_POINTER__($sp)
	PUSH	$s3,0*__SIZEOF_POINTER__($sp)

#ifndef	__CHERI_PURE_CAPABILITY__
	andi	$shr,$inp,7
	andi	$inp,$inp,-8		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
	neg	$shl,$shr		# implicit &63 in sll
#endif
	ld	$h0,0($ctx)		# load hash value
	ld	$h1,8($ctx)
	ld	$h2,16($ctx)

	ld	$r0,24($ctx)		# load key
	ld	$r1,32($ctx)
	ld	$rs1,40($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	ld	$in0,0($inp)		# load input
	ld	$in1,8($inp)
#ifndef	__CHERI_PURE_CAPABILITY__
	beqz	$shr,.Laligned_inp

	ld	$tmp2,16($inp)
	srl	$in0,$in0,$shr
	sll	$tmp3,$in1,$shl
	srl	$in1,$in1,$shr
	sll	$tmp2,$tmp2,$shl
	or	$in0,$in0,$tmp3
	or	$in1,$in1,$tmp2

.Laligned_inp:
#endif
	caddi	$inp,$inp,16

	andi	$tmp0,$h2,-4		# modulo-scheduled reduction
	srli	$tmp1,$h2,2
	andi	$h2,$h2,3

	add	$d0,$h0,$in0		# accumulate input
	add	$tmp1,$tmp1,$tmp0
	sltu	$tmp0,$d0,$h0
	add	$d0,$d0,$tmp1		# ... and residue
	sltu	$tmp1,$d0,$tmp1
	add	$d1,$h1,$in1
	add	$tmp0,$tmp0,$tmp1
	sltu	$tmp1,$d1,$h1
	add	$d1,$d1,$tmp0
	add	$d2,$h2,$padbit
	sltu	$tmp0,$d1,$tmp0
	mulhu	$h1,$r0,$d0		# h0*r0
	mul	$h0,$r0,$d0
	add	$d2,$d2,$tmp1
	add	$d2,$d2,$tmp0
	mulhu	$tmp1,$rs1,$d1		# h1*5*r1
	mul	$tmp0,$rs1,$d1
	mulhu	$h2,$r1,$d0		# h0*r1
	mul	$tmp2,$r1,$d0
	add	$h0,$h0,$tmp0
	add	$h1,$h1,$tmp1
	sltu	$tmp0,$h0,$tmp0
	add	$h1,$h1,$tmp0
	add	$h1,$h1,$tmp2
	mulhu	$tmp1,$r0,$d1		# h1*r0
	mul	$tmp0,$r0,$d1
	sltu	$tmp2,$h1,$tmp2
	add	$h2,$h2,$tmp2
	mul	$tmp2,$rs1,$d2		# h2*5*r1
	add	$h1,$h1,$tmp0
	add	$h2,$h2,$tmp1
	mul	$tmp3,$r0,$d2		# h2*r0
	sltu	$tmp0,$h1,$tmp0
	add	$h2,$h2,$tmp0
	add	$h1,$h1,$tmp2
	sltu	$tmp2,$h1,$tmp2
	add	$h2,$h2,$tmp2
	add	$h2,$h2,$tmp3

	bne	$inp,$len,.Loop

	sd	$h0,0($ctx)		# store hash value
	sd	$h1,8($ctx)
	sd	$h2,16($ctx)

	POP	$s0,3*__SIZEOF_POINTER__($sp)	# epilogue
	POP	$s1,2*__SIZEOF_POINTER__($sp)
	POP	$s2,1*__SIZEOF_POINTER__($sp)
	POP	$s3,0*__SIZEOF_POINTER__($sp)
	caddi	$sp,$sp,4*__SIZEOF_POINTER__

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce) = ($a0,$a1,$a2);

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
	ld	$tmp2,16($ctx)
	ld	$tmp0,0($ctx)
	ld	$tmp1,8($ctx)

	andi	$in0,$tmp2,-4		# final reduction
	srl	$in1,$tmp2,2
	andi	$tmp2,$tmp2,3
	add	$in0,$in0,$in1
	add	$tmp0,$tmp0,$in0
	sltu	$in1,$tmp0,$in0
	addi	$in0,$tmp0,5		# compare to modulus
	add	$tmp1,$tmp1,$in1
	sltiu	$tmp3,$in0,5
	sltu	$tmp4,$tmp1,$in1
	add	$in1,$tmp1,$tmp3
	add	$tmp2,$tmp2,$tmp4
	sltu	$tmp3,$in1,$tmp3
	add	$tmp2,$tmp2,$tmp3

	srli	$tmp2,$tmp2,2		# see if it carried/borrowed
	neg	$tmp2,$tmp2

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	and	$in0,$in0,$tmp2
	and	$in1,$in1,$tmp2
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1

	lwu	$tmp0,0($nonce)		# load nonce
	lwu	$tmp1,4($nonce)
	lwu	$tmp2,8($nonce)
	lwu	$tmp3,12($nonce)
	slli	$tmp1,$tmp1,32
	slli	$tmp3,$tmp3,32
	or	$tmp0,$tmp0,$tmp1
	or	$tmp2,$tmp2,$tmp3

	add	$in0,$in0,$tmp0		# accumulate nonce
	add	$in1,$in1,$tmp2
	sltu	$tmp0,$in0,$tmp0
	add	$in1,$in1,$tmp0

	srli	$tmp0,$in0,8		# write mac value
	srli	$tmp1,$in0,16
	srli	$tmp2,$in0,24
	sb	$in0,0($mac)
	srli	$tmp3,$in0,32
	sb	$tmp0,1($mac)
	srli	$tmp0,$in0,40
	sb	$tmp1,2($mac)
	srli	$tmp1,$in0,48
	sb	$tmp2,3($mac)
	srli	$tmp2,$in0,56
	sb	$tmp3,4($mac)
	srli	$tmp3,$in1,8
	sb	$tmp0,5($mac)
	srli	$tmp0,$in1,16
	sb	$tmp1,6($mac)
	srli	$tmp1,$in1,24
	sb	$tmp2,7($mac)
	sb	$in1,8($mac)
	srli	$tmp2,$in1,32
	sb	$tmp3,9($mac)
	srli	$tmp3,$in1,40
	sb	$tmp0,10($mac)
	srli	$tmp0,$in1,48
	sb	$tmp1,11($mac)
	srli	$tmp1,$in1,56
	sb	$tmp2,12($mac)
	sb	$tmp3,13($mac)
	sb	$tmp0,14($mac)
	sb	$tmp1,15($mac)

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}} else {{{
######################################################################
# 32-bit code path
#
my ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);
my ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
   ($a4,$a5,$a6,$a7,$t0,$t1,$t2,$t3);

$code.=<<___;
#if __riscv_xlen == 32
# if __SIZEOF_POINTER__ == 8
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sw
#  define POP	lw
# endif
# define MULX(hi,lo,a,b)	mulhu hi,a,b; mul lo,a,b
# define srlw	srl
# define sllw	sll
# define addw	add
# define addiw	addi
# define mulw	mul
#elif __riscv_xlen == 64
# if __SIZEOF_POINTER__ == 16
#  define PUSH	csc
#  define POP	clc
# else
#  define PUSH	sd
#  define POP	ld
# endif
# define MULX(hi,lo,a,b)	slli b,b,32; srli b,b,32; mul hi,a,b; addiw lo,hi,0; srai hi,hi,32
#else
# error "unsupported __riscv_xlen"
#endif

.option	pic
.text

.globl	poly1305_init
.type	poly1305_init,\@function
poly1305_init:
	sw	$zero,0($ctx)
	sw	$zero,4($ctx)
	sw	$zero,8($ctx)
	sw	$zero,12($ctx)
	sw	$zero,16($ctx)

	beqz	$inp,.Lno_key

#ifndef	__CHERI_PURE_CAPABILITY__
	andi	$tmp0,$inp,3		# $inp % 4
	sub	$inp,$inp,$tmp0		# align $inp
	sll	$tmp0,$tmp0,3		# byte to bit offset
#endif
	lw	$in0,0($inp)
	lw	$in1,4($inp)
	lw	$in2,8($inp)
	lw	$in3,12($inp)
#ifndef	__CHERI_PURE_CAPABILITY__
	beqz	$tmp0,.Laligned_key

	lw	$tmp2,16($inp)
	sub	$tmp1,$zero,$tmp0
	srlw	$in0,$in0,$tmp0
	sllw	$tmp3,$in1,$tmp1
	srlw	$in1,$in1,$tmp0
	or	$in0,$in0,$tmp3
	sllw	$tmp3,$in2,$tmp1
	srlw	$in2,$in2,$tmp0
	or	$in1,$in1,$tmp3
	sllw	$tmp3,$in3,$tmp1
	srlw	$in3,$in3,$tmp0
	or	$in2,$in2,$tmp3
	sllw	$tmp2,$tmp2,$tmp1
	or	$in3,$in3,$tmp2

.Laligned_key:
#endif
	lui	$tmp0,0x10000
	addi	$tmp0,$tmp0,-1		# 0x0fffffff
	and	$in0,$in0,$tmp0
	addi	$tmp0,$tmp0,-3		# 0x0ffffffc
	and	$in1,$in1,$tmp0
	and	$in2,$in2,$tmp0
	and	$in3,$in3,$tmp0

	sw	$in0,20($ctx)
	sw	$in1,24($ctx)
	sw	$in2,28($ctx)
	sw	$in3,32($ctx)

	srlw	$tmp1,$in1,2
	srlw	$tmp2,$in2,2
	srlw	$tmp3,$in3,2
	addw	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
	addw	$in2,$in2,$tmp2
	addw	$in3,$in3,$tmp3
	sw	$in1,36($ctx)
	sw	$in2,40($ctx)
	sw	$in3,44($ctx)

.Lno_key:
	li	$a0,0
	ret
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $t0,$t1,$t2);
my ($d0,$d1,$d2,$d3) = ($a4,$a5,$a6,$a7);
my $shr = $ra;		# used on R6

$code.=<<___;
.globl	poly1305_blocks
.type	poly1305_blocks,\@function
poly1305_blocks:
	andi	$len,$len,-16		# complete blocks only
	beqz	$len,.Labort

	caddi	$sp,$sp,-__SIZEOF_POINTER__*12
	PUSH	$ra, __SIZEOF_POINTER__*11($sp)
	PUSH	$s0, __SIZEOF_POINTER__*10($sp)
	PUSH	$s1, __SIZEOF_POINTER__*9($sp)
	PUSH	$s2, __SIZEOF_POINTER__*8($sp)
	PUSH	$s3, __SIZEOF_POINTER__*7($sp)
	PUSH	$s4, __SIZEOF_POINTER__*6($sp)
	PUSH	$s5, __SIZEOF_POINTER__*5($sp)
	PUSH	$s6, __SIZEOF_POINTER__*4($sp)
	PUSH	$s7, __SIZEOF_POINTER__*3($sp)
	PUSH	$s8, __SIZEOF_POINTER__*2($sp)

#ifndef	__CHERI_PURE_CAPABILITY__
	andi	$shr,$inp,3
	andi	$inp,$inp,-4		# align $inp
	slli	$shr,$shr,3		# byte to bit offset
#endif
	lw	$h0,0($ctx)		# load hash value
	lw	$h1,4($ctx)
	lw	$h2,8($ctx)
	lw	$h3,12($ctx)
	lw	$h4,16($ctx)

	lw	$r0,20($ctx)		# load key
	lw	$r1,24($ctx)
	lw	$r2,28($ctx)
	lw	$r3,32($ctx)
	lw	$rs1,36($ctx)
	lw	$rs2,40($ctx)
	lw	$rs3,44($ctx)

	add	$len,$len,$inp		# end of buffer

.Loop:
	lw	$d0,0($inp)		# load input
	lw	$d1,4($inp)
	lw	$d2,8($inp)
	lw	$d3,12($inp)
#ifndef	__CHERI_PURE_CAPABILITY__
	beqz	$shr,.Laligned_inp

	lw	$t4,16($inp)
	sub	$t5,$zero,$shr
	srlw	$d0,$d0,$shr
	sllw	$t3,$d1,$t5
	srlw	$d1,$d1,$shr
	or	$d0,$d0,$t3
	sllw	$t3,$d2,$t5
	srlw	$d2,$d2,$shr
	or	$d1,$d1,$t3
	sllw	$t3,$d3,$t5
	srlw	$d3,$d3,$shr
	or	$d2,$d2,$t3
	sllw	$t4,$t4,$t5
	or	$d3,$d3,$t4

.Laligned_inp:
#endif
	srli	$t3,$h4,2		# modulo-scheduled reduction
	andi	$t4,$h4,-4
	andi	$h4,$h4,3

	addw	$d0,$d0,$h0		# accumulate input
	addw	$t4,$t4,$t3
	sltu	$h0,$d0,$h0
	addw	$d0,$d0,$t4		# ... and residue
	sltu	$t4,$d0,$t4
	addw	$d1,$d1,$h1
	addw	$h0,$h0,$t4		# carry
	sltu	$h1,$d1,$h1
	addw	$d1,$d1,$h0
	sltu	$h0,$d1,$h0
	addw	$d2,$d2,$h2
	addw	$h1,$h1,$h0		# carry
	sltu	$h2,$d2,$h2
	addw	$d2,$d2,$h1
	sltu	$h1,$d2,$h1
	addw	$d3,$d3,$h3
	addw	$h2,$h2,$h1		# carry
	sltu	$h3,$d3,$h3
	addw	$d3,$d3,$h2
	MULX	($h1,$h0,$r0,$d0)	# d0*r0
	sltu	$h2,$d3,$h2
	addw	$h3,$h3,$h2		# carry
	MULX	($t4,$t3,$rs3,$d1)	# d1*s3
	addw	$h4,$h4,$padbit
	caddi	$inp,$inp,16
	addw	$h4,$h4,$h3
	MULX	($t6,$a3,$rs2,$d2)	# d2*s2
	addw	$h0,$h0,$t3
	addw	$h1,$h1,$t4
	sltu	$t3,$h0,$t3
	addw	$h1,$h1,$t3
	MULX	($t4,$t3,$rs1,$d3)	# d3*s1
	addw	$h0,$h0,$a3
	addw	$h1,$h1,$t6
	sltu	$a3,$h0,$a3
	addw	$h1,$h1,$a3
	MULX	($h2,$a3,$r1,$d0)	# d0*r1
	addw	$h0,$h0,$t3
	addw	$h1,$h1,$t4
	sltu	$t3,$h0,$t3
	addw	$h1,$h1,$t3
	MULX	($t4,$t3,$r0,$d1)	# d1*r0
	addw	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3
	MULX	($t6,$a3,$rs3,$d2)	# d2*s3
	addw	$h1,$h1,$t3
	addw	$h2,$h2,$t4
	sltu	$t3,$h1,$t3
	addw	$h2,$h2,$t3
	MULX	($t4,$t3,$rs2,$d3)	# d3*s2
	addw	$h1,$h1,$a3
	addw	$h2,$h2,$t6
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3
	mulw	$a3,$rs1,$h4		# h4*s1
	addw	$h1,$h1,$t3
	addw	$h2,$h2,$t4
	sltu	$t3,$h1,$t3
	addw	$h2,$h2,$t3
	MULX	($h3,$t3,$r2,$d0)	# d0*r2
	addw	$h1,$h1,$a3
	sltu	$a3,$h1,$a3
	addw	$h2,$h2,$a3
	MULX	($t6,$a3,$r1,$d1)	# d1*r1
	addw	$h2,$h2,$t3
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3
	MULX	($t4,$t3,$r0,$d2)	# d2*r0
	addw	$h2,$h2,$a3
	addw	$h3,$h3,$t6
	sltu	$a3,$h2,$a3
	addw	$h3,$h3,$a3
	MULX	($t6,$a3,$rs3,$d3)	# d3*s3
	addw	$h2,$h2,$t3
	addw	$h3,$h3,$t4
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3
	mulw	$t3,$rs2,$h4		# h4*s2
	addw	$h2,$h2,$a3
	addw	$h3,$h3,$t6
	sltu	$a3,$h2,$a3
	addw	$h3,$h3,$a3
	MULX	($t6,$a3,$r3,$d0)	# d0*r3
	addw	$h2,$h2,$t3
	sltu	$t3,$h2,$t3
	addw	$h3,$h3,$t3
	MULX	($t4,$t3,$r2,$d1)	# d1*r2
	addw	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addw	$t6,$t6,$a3
	MULX	($a3,$d3,$r0,$d3)	# d3*r0
	addw	$h3,$h3,$t3
	addw	$t6,$t6,$t4
	sltu	$t3,$h3,$t3
	addw	$t6,$t6,$t3
	MULX	($t4,$t3,$r1,$d2)	# d2*r1
	addw	$h3,$h3,$d3
	addw	$t6,$t6,$a3
	sltu	$d3,$h3,$d3
	addw	$t6,$t6,$d3
	mulw	$a3,$rs3,$h4		# h4*s3
	addw	$h3,$h3,$t3
	addw	$t6,$t6,$t4
	sltu	$t3,$h3,$t3
	addw	$t6,$t6,$t3
	mulw	$h4,$r0,$h4		# h4*r0
	addw	$h3,$h3,$a3
	sltu	$a3,$h3,$a3
	addw	$t6,$t6,$a3
	addw	$h4,$t6,$h4

	li	$padbit,1		# if we loop, padbit is 1

	bne	$inp,$len,.Loop

	sw	$h0,0($ctx)		# store hash value
	sw	$h1,4($ctx)
	sw	$h2,8($ctx)
	sw	$h3,12($ctx)
	sw	$h4,16($ctx)

	POP	$ra, __SIZEOF_POINTER__*11($sp)
	POP	$s0, __SIZEOF_POINTER__*10($sp)
	POP	$s1, __SIZEOF_POINTER__*9($sp)
	POP	$s2, __SIZEOF_POINTER__*8($sp)
	POP	$s3, __SIZEOF_POINTER__*7($sp)
	POP	$s4, __SIZEOF_POINTER__*6($sp)
	POP	$s5, __SIZEOF_POINTER__*5($sp)
	POP	$s6, __SIZEOF_POINTER__*4($sp)
	POP	$s7, __SIZEOF_POINTER__*3($sp)
	POP	$s8, __SIZEOF_POINTER__*2($sp)
	caddi	$sp,$sp,__SIZEOF_POINTER__*12

.Labort:
	ret
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);

$code.=<<___;
.globl	poly1305_emit
.type	poly1305_emit,\@function
poly1305_emit:
	lw	$tmp4,16($ctx)
	lw	$tmp0,0($ctx)
	lw	$tmp1,4($ctx)
	lw	$tmp2,8($ctx)
	lw	$tmp3,12($ctx)

	srli	$ctx,$tmp4,2		# final reduction
	andi	$in0,$tmp4,-4
	andi	$tmp4,$tmp4,3
	addw	$ctx,$ctx,$in0

	addw	$tmp0,$tmp0,$ctx
	sltu	$ctx,$tmp0,$ctx
	addiw	$in0,$tmp0,5		# compare to modulus
	addw	$tmp1,$tmp1,$ctx
	sltiu	$in1,$in0,5
	sltu	$ctx,$tmp1,$ctx
	addw	$in1,$in1,$tmp1
	addw	$tmp2,$tmp2,$ctx
	sltu	$in2,$in1,$tmp1
	sltu	$ctx,$tmp2,$ctx
	addw	$in2,$in2,$tmp2
	addw	$tmp3,$tmp3,$ctx
	sltu	$in3,$in2,$tmp2
	sltu	$ctx,$tmp3,$ctx
	addw	$in3,$in3,$tmp3
	addw	$tmp4,$tmp4,$ctx
	sltu	$ctx,$in3,$tmp3
	addw	$ctx,$ctx,$tmp4

	srl	$ctx,$ctx,2		# see if it carried/borrowed
	sub	$ctx,$zero,$ctx

	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3
	and	$in0,$in0,$ctx
	and	$in1,$in1,$ctx
	and	$in2,$in2,$ctx
	and	$in3,$in3,$ctx
	xor	$in0,$in0,$tmp0
	xor	$in1,$in1,$tmp1
	xor	$in2,$in2,$tmp2
	xor	$in3,$in3,$tmp3

	lw	$tmp0,0($nonce)		# load nonce
	lw	$tmp1,4($nonce)
	lw	$tmp2,8($nonce)
	lw	$tmp3,12($nonce)

	addw	$in0,$in0,$tmp0		# accumulate nonce
	sltu	$ctx,$in0,$tmp0

	addw	$in1,$in1,$tmp1
	sltu	$tmp1,$in1,$tmp1
	addw	$in1,$in1,$ctx
	sltu	$ctx,$in1,$ctx
	addw	$ctx,$ctx,$tmp1

	addw	$in2,$in2,$tmp2
	sltu	$tmp2,$in2,$tmp2
	addw	$in2,$in2,$ctx
	sltu	$ctx,$in2,$ctx
	addw	$ctx,$ctx,$tmp2

	addw	$in3,$in3,$tmp3
	addw	$in3,$in3,$ctx

	srl	$tmp0,$in0,8		# write mac value
	srl	$tmp1,$in0,16
	srl	$tmp2,$in0,24
	sb	$in0, 0($mac)
	sb	$tmp0,1($mac)
	srl	$tmp0,$in1,8
	sb	$tmp1,2($mac)
	srl	$tmp1,$in1,16
	sb	$tmp2,3($mac)
	srl	$tmp2,$in1,24
	sb	$in1, 4($mac)
	sb	$tmp0,5($mac)
	srl	$tmp0,$in2,8
	sb	$tmp1,6($mac)
	srl	$tmp1,$in2,16
	sb	$tmp2,7($mac)
	srl	$tmp2,$in2,24
	sb	$in2, 8($mac)
	sb	$tmp0,9($mac)
	srl	$tmp0,$in3,8
	sb	$tmp1,10($mac)
	srl	$tmp1,$in3,16
	sb	$tmp2,11($mac)
	srl	$tmp2,$in3,24
	sb	$in3, 12($mac)
	sb	$tmp0,13($mac)
	sb	$tmp1,14($mac)
	sb	$tmp2,15($mac)

	ret
.size	poly1305_emit,.-poly1305_emit
.string	"Poly1305 for RISC-V, CRYPTOGAMS by \@dot-asm"
___
}
}}}

# Post-process the accumulated assembly a line at a time. For "cheri*"
# flavours, integer mnemonics and register references are rewritten to
# their capability-mode counterparts (loads/stores gain a "c" prefix,
# PUSH/POP and ret/jal operate on capability registers, and caddi/cadd
# become cincoffset on "cN" registers). For plain flavours the cadd/
# caddi/cmove placeholders fall back to ordinary add/mv.
foreach (split("\n", $code)) {
    if ($flavour =~ /^cheri/) {
	s/\(x([0-9]+)\)/(c$1)/ and s/\b([ls][bhwd]u?)\b/c$1/;
	s/\b(PUSH|POP)(\s+)x([0-9]+)/$1$2c$3/ or
	s/\b(ret|jal)\b/c$1/;
	s/\bcaddi?\b/cincoffset/ and s/\bx([0-9]+,)/c$1/g or
	m/\bcmove\b/ and s/\bx([0-9]+)/c$1/g;
    } else {
	s/\bcaddi?\b/add/ or
	s/\bcmove\b/mv/;
    }
    print $_, "\n";
}

close STDOUT;