/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2020, Linaro Limited

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


*/

#include "blis.h"

/*
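   o 8x8 double-precision micro-kernel.
   o Runs on ARMv8-A with SVE at a 256-bit vector length; compiled with
     aarch64 GCC.
   o Tested on qemu-aarch64 and armie for SVE.

   Preconditions:
    - The SVE vector length must be exactly 256 bits. The kernel loads two
      Z registers per column of A and advances the A pointer by a fixed
      64 bytes, so each Z register has to hold exactly four doubles.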

   April 2020.
*/
void bli_dgemm_armsve256_asm_8x8
     (
       dim_t               k0,
       double*    restrict alpha,
       double*    restrict a,
       double*    restrict b,
       double*    restrict beta,
       double*    restrict c, inc_t rs_c0, inc_t cs_c0,
       auxinfo_t* restrict data,
       cntx_t*    restrict cntx
     )
{
	void* a_next = bli_auxinfo_next_a( data );
	void* b_next = bli_auxinfo_next_b( data );

	// Typecast local copies of integers in case dim_t and inc_t are a
	// different size than is expected by load instructions.
	uint64_t k_iter = k0 / 4;
	uint64_t k_left = k0 % 4;
	uint64_t rs_c   = rs_c0;
	uint64_t cs_c   = cs_c0;

__asm__ volatile
(
"                                            \n\t" 
" ldr x0,%[aaddr]                            \n\t" // Load address of A 
" ldr x1,%[baddr]                            \n\t" // Load address of B
" ldr x2,%[caddr]                            \n\t" // Load address of C
"                                            \n\t"
" ldr x3,%[a_next]                           \n\t" // Move pointer
" ldr x4,%[b_next]                           \n\t" // Move pointer
"                                            \n\t"
" ldr x5,%[k_iter]                           \n\t" // Init guard (k_iter)
" ldr x6,%[k_left]                           \n\t" // Init guard (k_iter)
"                                            \n\t" 
" ldr x7,%[alpha]                            \n\t" // Alpha address      
" ldr x8,%[beta]                             \n\t" // Beta address      
"                                            \n\t" 
" ldr x9,%[cs_c]                             \n\t" // Load cs_c
" lsl x10,x9,#3                              \n\t" // cs_c * sizeof(double)
"                                            \n\t"
" ldr x13,%[rs_c]                            \n\t" // Load rs_c.
" lsl x14,x13,#3                             \n\t" // rs_c * sizeof(double). 
"                                            \n\t"
" add x20,x2,x10                             \n\t" //Load address Column 1 of C
" add x21,x20,x10                            \n\t" //Load address Column 2 of C
" add x22,x21,x10                            \n\t" //Load address Column 3 of C
" add x23,x22,x10                            \n\t" //Load address Column 4 of C
" add x24,x23,x10                            \n\t" //Load address Column 5 of C
" add x25,x24,x10                            \n\t" //Load address Column 6 of C
" add x26,x25,x10                            \n\t" //Load address Column 7 of C
"                                            \n\t"
" prfm pldl1keep,[x2]                        \n\t" // Prefetch c.
" prfm pldl1keep,[x20]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x21]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x22]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x23]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x24]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x25]                       \n\t" // Prefetch c.
" prfm pldl1keep,[x26]                       \n\t" // Prefetch c.
"                                            \n\t"
" ldr  z0, [x0]                              \n\t" // Load a
" ldr  z1, [x0, #1, MUL VL]                  \n\t"
"                                            \n\t"
" ptrue   p0.d, all                          \n\t"
" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t" // PRFM, the following prefetch on [x1] and [x0]
"                                            \n\t" //   is for b rows 4..7 and a columns 4..7.
"                                            \n\t" //   both of them will be used in next iteration
"                                            \n\t" //   of k_iter (unrolled per 4 loops)
"                                            \n\t"
" dup  z16.d, #0                             \n\t" // Vector for accummulating column 0
" prfm    PLDL1KEEP, [x1, #256]              \n\t" // prefetch b row no.4
" dup  z17.d, #0                             \n\t" // Vector for accummulating column 0
" prfm    PLDL1KEEP, [x1, #320]              \n\t" // prefetch b row no.5
" dup  z18.d, #0                             \n\t" // Vector for accummulating column 1
" prfm    PLDL1KEEP, [x1, #384]              \n\t" // prefetch b row no.6
" dup  z19.d, #0                             \n\t" // Vector for accummulating column 1
" prfm    PLDL1KEEP, [x1, #448]              \n\t" // preftech b row no.7
" dup  z20.d, #0                             \n\t" // Vector for accummulating column 2 
" dup  z21.d, #0                             \n\t" // Vector for accummulating column 2
"                                            \n\t"
" dup  z22.d, #0                             \n\t" // Vector for accummulating column 3
" prfm    PLDL1KEEP, [x0, #256]              \n\t" // prefetch a col. no.4
" dup  z23.d, #0                             \n\t" // Vector for accummulating column 3
" prfm    PLDL1KEEP, [x0, #320]              \n\t" // prefetch a col. no.5
" dup  z24.d, #0                             \n\t" // Vector for accummulating column 4
" prfm    PLDL1KEEP, [x0, #384]              \n\t" // prefetch a col. no.6
" dup  z25.d, #0                             \n\t" // Vector for accummulating column 4
" prfm    PLDL1KEEP, [x0, #448]              \n\t" // prefetch a col. no.7
" dup  z26.d, #0                             \n\t" // Vector for accummulating column 5 
" dup  z27.d, #0                             \n\t" // Vector for accummulating column 5
"                                            \n\t"
" dup  z28.d, #0                             \n\t" // Vector for accummulating column 6
" dup  z29.d, #0                             \n\t" // Vector for accummulating column 6
" dup  z30.d, #0                             \n\t" // Vector for accummulating column 7
" dup  z31.d, #0                             \n\t" // Vector for accummulating column 7
"                                            \n\t"
"                                            \n\t"
" cmp x5,#0                                  \n\t" // If k_iter == 0, jump to k_left.
" beq .DCONSIDERKLEFT                        \n\t"
"                                            \n\t"
" add x0, x0, #64                            \n\t" //update address of A
" add x1, x1, #64                            \n\t" //update address of B
"                                            \n\t"
" cmp x5,1                                   \n\t" // If there is just one k_iter, jump to that one. 
" beq .DLASTITER                             \n\t" // (as loop is do-while-like).
"                                            \n\t"
" DLOOP:                                     \n\t" // Body
"                                            \n\t"
" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" prfm    PLDL1KEEP, [x1, #448]              \n\t" // prefetch b row no.8, 512-64=448
" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" prfm    PLDL1KEEP, [x1, #512]              \n\t" // prefetch b row no.9
" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" prfm    PLDL1KEEP, [x1, #576]              \n\t" // prefetch b row no.10
"                                            \n\t"
" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" ldr  z6, [x0]                              \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" ldr  z7, [x0, #1, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"                  // End it 1
"                                            \n\t"
" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" prfm    PLDL1KEEP, [x1, #640]              \n\t" // prefetch b row no.11
" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" prfm    PLDL1KEEP, [x0, #448]              \n\t" // prefetch a col. no.8
" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" prfm    PLDL1KEEP, [x0, #512]              \n\t" // prefetch a col. no.9
"                                            \n\t"
" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" ldr  z0, [x0, #2, MUL VL]                  \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" ldr  z1, [x0, #3, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"
"                                            \n\t"                  //End it 2
"                                            \n\t"
" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" prfm    PLDL1KEEP, [x0, #576]              \n\t" // prefetch a col. no.10
" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" prfm    PLDL1KEEP, [x0, #640]              \n\t" // prefetch a col. no.11
"                                            \n\t"
" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
"                                            \n\t"
" add x1, x1, #128                           \n\t" // because immediate in 'ldr1rqd' must be
"                                            \n\t" //   in range -128 to 112
"                                            \n\t"
" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" ldr  z6, [x0, #4, MUL VL]                  \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" ldr  z7, [x0, #5, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1, #0]             \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"                  // End it 3
"                                            \n\t"
" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" ldr  z0, [x0, #6, MUL VL]                  \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" ldr  z1, [x0, #7, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"                  //End it 4
" add x0, x0, #256                           \n\t"
" add x1, x1, #128                           \n\t"
"                                            \n\t"
" sub x5,x5,1                                \n\t" // i-=1
" cmp x5,1                                   \n\t" // Iterate again if we are not in k_iter == 1.
" bne DLOOP                                  \n\t"
"                                            \n\t"
".DLASTITER:                                 \n\t"
"                                            \n\t"
" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" ldr  z6, [x0]                              \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" ldr  z7, [x0, #1, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"                  // End it 1
"                                            \n\t"
" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" ldr  z0, [x0, #2, MUL VL]                  \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" ldr  z1, [x0, #3, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1, #64]            \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #80]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #96]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #112]           \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"
"                                            \n\t"                  //End it 2
"                                            \n\t"
" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" ldr  z6, [x0, #4, MUL VL]                  \n\t" // Load a( 0:3,l )
"                                            \n\t"
" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
" ldr  z7, [x0, #5, MUL VL]                  \n\t" // load a( 4:7,l )
"                                            \n\t"
" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" add x1, x1, #128                           \n\t" // because immediate in 'ldr1rqd' must be
"                                            \n\t" //   in range -128 to 112
" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" ld1rqd  {z2.d}, p0/z, [x1, #0]             \n\t" // load b( l,0:1 )
"                                            \n\t"
" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
"                                            \n\t"
" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
"                                            \n\t"
" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
"                                            \n\t"
"                                            \n\t"                  // End it 3
"                                            \n\t"
" fmla z16.d, z6.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z7.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
"                                            \n\t"
" fmla z18.d, z6.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" fmla z19.d, z7.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
"                                            \n\t"
" fmla z20.d, z6.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z7.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
"                                            \n\t"
" fmla z22.d, z6.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" fmla z23.d, z7.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
"                                            \n\t"
" fmla z24.d, z6.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" fmla z25.d, z7.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
"                                            \n\t"
" fmla z26.d, z6.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z7.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
" add x1, x1, #64                            \n\t"
"                                            \n\t"
" fmla z28.d, z6.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z7.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
"                                            \n\t"
" fmla z30.d, z6.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z7.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
"                                            \n\t"
"                                            \n\t"                  //End it 4
" add x0, x0, #192                           \n\t"
"                                            \n\t"
" .DCONSIDERKLEFT:                           \n\t" 
" cmp x6,0                                   \n\t" // If k_left == 0, we are done.
" beq .DPOSTACCUM                            \n\t" // else, we enter the k_left loop.
"                                            \n\t"
".DLOOPKLEFT:                                \n\t"
"                                            \n\t"
" ldr  z0, [x0]                              \n\t" // Load a
" ldr  z1, [x0, #1, MUL VL]                  \n\t"
" add x0, x0, #64                            \n\t"
"                                            \n\t"
" ld1rqd  {z2.d}, p0/z, [x1]                 \n\t" // load b( l,0:1 )
" ld1rqd  {z3.d}, p0/z, [x1, #16]            \n\t" // load b( l,2:3 )
" ld1rqd  {z4.d}, p0/z, [x1, #32]            \n\t" // load b( l,4:5 )
" ld1rqd  {z5.d}, p0/z, [x1, #48]            \n\t" // load b( l,6:7 )
" add x1, x1, #64                            \n\t"
"                                            \n\t"
" sub x6,x6,1                                \n\t"
"                                            \n\t"
" fmla z16.d, z0.d, z2.d[0]                  \n\t" // Accummulate  c(0:3,0)+=a(0:3,l)*b(l,0)
" fmla z17.d, z1.d, z2.d[0]                  \n\t" // Accummulate  c(4:7,0)+=a(4:7,l)*b(l,0)
"                                            \n\t"
" fmla z18.d, z0.d, z2.d[1]                  \n\t" // Accummulate  c(0:3,1)+=a(0:3,l)*b(l,1)
" fmla z19.d, z1.d, z2.d[1]                  \n\t" // Accummulate  c(4:7,1)+=a(4:7,l)*b(l,1)
"                                            \n\t"
" fmla z20.d, z0.d, z3.d[0]                  \n\t" // Accummulate  c(0:3,2)+=a(0:3,l)*b(l,2)
" fmla z21.d, z1.d, z3.d[0]                  \n\t" // Accummulate  c(4:7,2)+=a(4:7,l)*b(l,2)
"                                            \n\t"
" fmla z22.d, z0.d, z3.d[1]                  \n\t" // Accummulate  c(0:3,3)+=a(0:3,l)*b(l,3)
" fmla z23.d, z1.d, z3.d[1]                  \n\t" // Accummulate  c(4:7,3)+=a(4:7,l)*b(l,3)
"                                            \n\t"
" fmla z24.d, z0.d, z4.d[0]                  \n\t" // Accummulate  c(0:3,4)+=a(0:3,l)*b(l,4)
" fmla z25.d, z1.d, z4.d[0]                  \n\t" // Accummulate  c(4:7,4)+=a(4:7,l)*b(l,4)
"                                            \n\t"
" fmla z26.d, z0.d, z4.d[1]                  \n\t" // Accummulate  c(0:3,5)+=a(0:3,l)*b(l,5)
" fmla z27.d, z1.d, z4.d[1]                  \n\t" // Accummulate  c(4:7,5)+=a(0:3,l)*b(l,5)
"                                            \n\t"
" fmla z28.d, z0.d, z5.d[0]                  \n\t" // Accummulate  c(0:3,6)+=a(0:3,l)*b(l,6)
" fmla z29.d, z1.d, z5.d[0]                  \n\t" // Accummulate  c(4:7,6)+=a(0:3,l)*b(l,6)
"                                            \n\t"
" fmla z30.d, z0.d, z5.d[1]                  \n\t" // Accummulate  c(0:3,7)+=a(0:3,l)*b(l,7)
" fmla z31.d, z1.d, z5.d[1]                  \n\t" // Accummulate  c(4:7,7)+=a(0:3,l)*b(l,7)
"                                            \n\t"
" cmp x6,0                                   \n\t" // Iterate again.
" bne .DLOOPKLEFT                            \n\t" // if i!=0.
"                                            \n\t"
" .DPOSTACCUM:                               \n\t"
"                                            \n\t"
" ld1rd {z6.d}, p0/z, [x7]                   \n\t" // Load alpha.
" ld1rd {z7.d}, p0/z, [x8]                   \n\t" // Load beta
"                                            \n\t"
" cmp x13,#1                                 \n\t" // If rs_c != 1 (column-major)
" bne .DGENSTORED                            \n\t"
"                                            \n\t"
" .DCOLSTORED:                               \n\t" // C is column-major.
"                                            \n\t"
" dup  z0.d, #0                              \n\t"
" dup  z1.d, #0                              \n\t"
" dup  z2.d, #0                              \n\t"
" dup  z3.d, #0                              \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROCOLSTOREDS1                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
" ldr z0, [x2]                               \n\t" //Load column 0 of C
" ldr z1, [x2, #1, MUL VL]                   \n\t"
"                                            \n\t"
" ldr z2, [x20]                              \n\t" //Load column 1 of C
" ldr z3, [x20, #1, MUL VL]                  \n\t"
"                                            \n\t"
" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROCOLSTOREDS1:                     \n\t"
"                                            \n\t"
" fmla z0.d, z16.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z1.d, z17.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z2.d, z18.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z3.d, z19.d, z6.d[0]                  \n\t" // Scale by alpha
"                                            \n\t"
" str z0, [x2]                               \n\t" //Store column 0 of C
" str z1, [x2, #1, MUL VL]                   \n\t"
"                                            \n\t"
" str z2, [x20]                              \n\t" //Store column 1 of C
" str z3, [x20, #1, MUL VL]                  \n\t"
"                                            \n\t"
" dup  z8.d,  #0                             \n\t"
" dup  z9.d,  #0                             \n\t"
" dup  z10.d, #0                             \n\t"
" dup  z11.d, #0                             \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROCOLSTOREDS2                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
" ldr z8, [x21]                              \n\t" //Load column 2 of C
" ldr z9, [x21, #1, MUL VL]                  \n\t"
"                                            \n\t"
" ldr z10, [x22]                             \n\t" //Load column 3 of C
" ldr z11, [x22, #1, MUL VL]                 \n\t"
"                                            \n\t"
" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROCOLSTOREDS2:                     \n\t"
"                                            \n\t"
" fmla z8.d,  z20.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z9.d,  z21.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z10.d, z22.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z11.d, z23.d, z6.d[0]                 \n\t" // Scale by alpha
"                                            \n\t"
" str z8, [x21]                              \n\t" //Store column 2 of C
" str z9, [x21, #1, MUL VL]                  \n\t"
"                                            \n\t"
" str z10, [x22]                             \n\t" //Store column 3 of C
" str z11, [x22, #1, MUL VL]                 \n\t"
"                                            \n\t"
" dup  z0.d, #0                              \n\t"
" dup  z1.d, #0                              \n\t"
" dup  z2.d, #0                              \n\t"
" dup  z3.d, #0                              \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROCOLSTOREDS3                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
" ldr z0, [x23]                              \n\t" //Load column 4 of C
" ldr z1, [x23, #1, MUL VL]                  \n\t"
"                                            \n\t"
" ldr z2, [x24]                              \n\t" //Load column 5 of C
" ldr z3, [x24, #1, MUL VL]                  \n\t"
"                                            \n\t"
" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROCOLSTOREDS3:                     \n\t"
"                                            \n\t"
" fmla z0.d, z24.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z1.d, z25.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z2.d, z26.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z3.d, z27.d, z6.d[0]                  \n\t" // Scale by alpha
"                                            \n\t"
" str z0, [x23]                              \n\t" //Store column 4 of C
" str z1, [x23, #1, MUL VL]                  \n\t"
"                                            \n\t"
" str z2, [x24]                              \n\t" //Store column 5 of C
" str z3, [x24, #1, MUL VL]                  \n\t"
"                                            \n\t"
" dup  z8.d,  #0                             \n\t"
" dup  z9.d,  #0                             \n\t"
" dup  z10.d, #0                             \n\t"
" dup  z11.d, #0                             \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROCOLSTOREDS4                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
" ldr z8, [x25]                              \n\t" //Load column 6 of C
" ldr z9, [x25, #1, MUL VL]                  \n\t"
"                                            \n\t"
" ldr z10, [x26]                             \n\t" //Load column 7 of C
" ldr z11, [x26, #1, MUL VL]                 \n\t"
"                                            \n\t"
" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROCOLSTOREDS4:                     \n\t"
"                                            \n\t"
" prfm pldl2keep,[x3]                        \n\t"
" prfm pldl2keep,[x4]                        \n\t"
"                                            \n\t"
" fmla z8.d,  z28.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z9.d,  z29.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z10.d, z30.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z11.d, z31.d, z6.d[0]                 \n\t" // Scale by alpha
"                                            \n\t"
" str z8, [x25]                              \n\t" //Store column 6 of C
" str z9, [x25, #1, MUL VL]                  \n\t"
"                                            \n\t"
" str z10, [x26]                             \n\t" //Store column 7 of C
" str z11, [x26, #1, MUL VL]                 \n\t"
"                                            \n\t"
" b .DEND                                    \n\t"
"                                            \n\t"
" .DGENSTORED:                               \n\t" // C is general-stride stored.
"                                            \n\t"
"                                            \n\t" // x14 is row-stride in number of bytes.
" lsl x15,x14,#2                             \n\t" // x15 is 4-row-stride, which is the address offset 
"                                            \n\t" //     btw c(4,*) and c(0,*)
" index z4.d, xzr, x14                       \n\t" // z4  is address offsets of four contiguous elements
"                                            \n\t" //     in a column. such as c( 0:3,* ).
"                                            \n\t" //     z4 is used as vector index for gather/scatter
"                                            \n\t" //     loading/storing from column of *c
"                                            \n\t"
"                                            \n\t" // C's each column's address:
"                                            \n\t" //     x2, x20, x21, x22, x23, x24, x25, x26: are addresses of c(0,0:7)
"                                            \n\t" //     x5, x6,  x7,  x8,  x16, x17, x18, x19: are addresses of c(4,0:7)
" add  x5,  x15, x2                          \n\t" // x5  is address of c(4,0)
" add  x6,  x15, x20                         \n\t" // x6  is address of c(4,1)
" add  x7,  x15, x21                         \n\t" // x7  is address of c(4,2)
" add  x8,  x15, x22                         \n\t" // x8  is address of c(4,3)
" add  x16, x15, x23                         \n\t" // x16 is address of c(4,4)
" add  x17, x15, x24                         \n\t" // x17 is address of c(4,5)
" add  x18, x15, x25                         \n\t" // x18 is address of c(4,6)
" add  x19, x15, x26                         \n\t" // x19 is address of c(4,7)
"                                            \n\t"
" dup  z0.d, #0                              \n\t" // C column 0, 1
" dup  z1.d, #0                              \n\t"
" dup  z2.d, #0                              \n\t"
" dup  z3.d, #0                              \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROGENSTOREDS1                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
"                                            \n\t" // x2  is address of c(0,0)
"                                            \n\t" // x5  is address of c(4,0)
"                                            \n\t" // x20 is address of c(0,1)
"                                            \n\t" // x6  is address of c(4,1)
" ld1d {z0.d}, p0/z, [x2, z4.d]              \n\t" // Load c( 0:3,0 ) into z0
" ld1d {z1.d}, p0/z, [x5, z4.d]              \n\t" // Load c( 4:7,0 ) into z1
" ld1d {z2.d}, p0/z, [x20, z4.d]             \n\t" // Load c( 0:3,1 ) into z2
" ld1d {z3.d}, p0/z, [x6 , z4.d]             \n\t" // Load c( 4:7,1 ) into z3
"                                            \n\t"
" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROGENSTOREDS1:                     \n\t"
"                                            \n\t"
" fmla z0.d, z16.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z1.d, z17.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z2.d, z18.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z3.d, z19.d, z6.d[0]                  \n\t" // Scale by alpha
"                                            \n\t"
" st1d {z0.d}, p0, [x2 , z4.d]               \n\t" // Store c( 0:3,0 ) <- z0
" st1d {z1.d}, p0, [x5 , z4.d]               \n\t" // Store c( 4:7,0 ) <- z1
" st1d {z2.d}, p0, [x20, z4.d]               \n\t" // Store c( 0:3,1 ) <- z2
" st1d {z3.d}, p0, [x6 , z4.d]               \n\t" // Store c( 4:7,1 ) <- z3
"                                            \n\t"
"                                            \n\t"
"                                            \n\t"
" dup  z8.d, #0                              \n\t" // C column 2, 3
" dup  z9.d, #0                              \n\t"
" dup  z10.d, #0                             \n\t"
" dup  z11.d, #0                             \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROGENSTOREDS2                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
"                                            \n\t" // x21 is address of c(0,2)
"                                            \n\t" // x7  is address of c(4,2)
"                                            \n\t" // x22 is address of c(0,3)
"                                            \n\t" // x8  is address of c(4,3)
" ld1d {z8.d},  p0/z, [x21, z4.d]            \n\t" // Load c( 0:3,2 ) into z8
" ld1d {z9.d},  p0/z, [x7 , z4.d]            \n\t" // Load c( 4:7,2 ) into z9
" ld1d {z10.d}, p0/z, [x22, z4.d]            \n\t" // Load c( 0:3,3 ) into z10
" ld1d {z11.d}, p0/z, [x8 , z4.d]            \n\t" // Load c( 4:7,3 ) into z11
"                                            \n\t"
" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROGENSTOREDS2:                     \n\t"
"                                            \n\t"
" fmla z8.d,  z20.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z9.d,  z21.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z10.d, z22.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z11.d, z23.d, z6.d[0]                 \n\t" // Scale by alpha
"                                            \n\t"
" st1d {z8.d},  p0, [x21, z4.d]              \n\t" // Store c( 0:3,2 ) <- z8
" st1d {z9.d},  p0, [x7 , z4.d]              \n\t" // Store c( 4:7,2 ) <- z9
" st1d {z10.d}, p0, [x22, z4.d]              \n\t" // Store c( 0:3,3 ) <- z10
" st1d {z11.d}, p0, [x8 , z4.d]              \n\t" // Store c( 4:7,3 ) <- z11
"                                            \n\t"
" dup  z0.d, #0                              \n\t" // C column 4, 5
" dup  z1.d, #0                              \n\t"
" dup  z2.d, #0                              \n\t"
" dup  z3.d, #0                              \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROGENSTOREDS3                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
"                                            \n\t" // x23 is address of c(0,4)
"                                            \n\t" // x16 is address of c(4,4)
"                                            \n\t" // x24 is address of c(0,5)
"                                            \n\t" // x17 is address of c(4,5)
" ld1d {z0.d}, p0/z, [x23, z4.d]             \n\t" // Load c( 0:3,4 ) into z0
" ld1d {z1.d}, p0/z, [x16, z4.d]             \n\t" // Load c( 4:7,4 ) into z1
" ld1d {z2.d}, p0/z, [x24, z4.d]             \n\t" // Load c( 0:3,5 ) into z2
" ld1d {z3.d}, p0/z, [x17, z4.d]             \n\t" // Load c( 4:7,5 ) into z3
"                                            \n\t"
" fmul z0.d, z0.d, z7.d                      \n\t" // Scale by beta
" fmul z1.d, z1.d, z7.d                      \n\t" // Scale by beta
" fmul z2.d, z2.d, z7.d                      \n\t" // Scale by beta
" fmul z3.d, z3.d, z7.d                      \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROGENSTOREDS3:                     \n\t"
"                                            \n\t"
" fmla z0.d, z24.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z1.d, z25.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z2.d, z26.d, z6.d[0]                  \n\t" // Scale by alpha
" fmla z3.d, z27.d, z6.d[0]                  \n\t" // Scale by alpha
"                                            \n\t"
" st1d {z0.d}, p0, [x23, z4.d]               \n\t" // Store c( 0:3,4 ) <- z0
" st1d {z1.d}, p0, [x16, z4.d]               \n\t" // Store c( 4:7,4 ) <- z1
" st1d {z2.d}, p0, [x24, z4.d]               \n\t" // Store c( 0:3,5 ) <- z2
" st1d {z3.d}, p0, [x17, z4.d]               \n\t" // Store c( 4:7,5 ) <- z3
"                                            \n\t"
" dup  z8.d, #0                              \n\t" // C column 6, 7
" dup  z9.d, #0                              \n\t"
" dup  z10.d, #0                             \n\t"
" dup  z11.d, #0                             \n\t"
"                                            \n\t"
" fcmp d7,#0.0                               \n\t"
" beq .DBETAZEROGENSTOREDS4                  \n\t" // Taking care of the beta==0 case.
"                                            \n\t"
"                                            \n\t" // x25 is address of c(0,6)
"                                            \n\t" // x18 is address of c(4,6)
"                                            \n\t" // x26 is address of c(0,7)
"                                            \n\t" // x19 is address of c(4,7)
" ld1d {z8.d},  p0/z, [x25, z4.d]            \n\t" // Load c( 0:3,6 ) into z8
" ld1d {z9.d},  p0/z, [x18, z4.d]            \n\t" // Load c( 4:7,6 ) into z9
" ld1d {z10.d}, p0/z, [x26, z4.d]            \n\t" // Load c( 0:3,7 ) into z10
" ld1d {z11.d}, p0/z, [x19, z4.d]            \n\t" // Load c( 4:7,7 ) into z11
"                                            \n\t"
" fmul z8.d,  z8.d,  z7.d                    \n\t" // Scale by beta
" fmul z9.d,  z9.d,  z7.d                    \n\t" // Scale by beta
" fmul z10.d, z10.d, z7.d                    \n\t" // Scale by beta
" fmul z11.d, z11.d, z7.d                    \n\t" // Scale by beta
"                                            \n\t"
" .DBETAZEROGENSTOREDS4:                     \n\t"
"                                            \n\t"
" fmla z8.d,  z28.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z9.d,  z29.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z10.d, z30.d, z6.d[0]                 \n\t" // Scale by alpha
" fmla z11.d, z31.d, z6.d[0]                 \n\t" // Scale by alpha
"                                            \n\t"
" st1d {z8.d},  p0, [x25, z4.d]              \n\t" // Store c( 0:3,6 ) <- z8
" st1d {z9.d},  p0, [x18, z4.d]              \n\t" // Store c( 4:7,6 ) <- z9
" st1d {z10.d}, p0, [x26, z4.d]              \n\t" // Store c( 0:3,7 ) <- z10
" st1d {z11.d}, p0, [x19, z4.d]              \n\t" // Store c( 4:7,7 ) <- z11
"                                            \n\t"
" .DEND:                                     \n\t" // Done!
"                                            \n\t"
:// output operands (none)
:// input operands
 [aaddr]  "m" (a),      // 0
 [baddr]  "m" (b),      // 1
 [caddr]  "m" (c),      // 2
 [k_iter] "m" (k_iter), // 3
 [k_left] "m" (k_left), // 4
 [alpha]  "m" (alpha),  // 5
 [beta]   "m" (beta),   // 6
 [rs_c]   "m" (rs_c),   // 7
 [cs_c]   "m" (cs_c),   // 8
 [a_next] "m" (a_next), // 9
 [b_next] "m" (b_next)  // 10
:// Register clobber list
 "x0","x1","x2","x3",
 "x4","x5","x6",
 "x7","x8","x9",
 "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19",
 "x20","x21","x22","x23","x24","x25","x26",
 "x27",       
 "v0","v1","v2",
 "v3","v4","v5",
 "v6","v7","v8",
 "v9","v10","v11",
 "v12","v13","v14",
 "v15","v16","v17","v18","v19",
 "v20","v21","v22","v23",
 "v24","v25","v26","v27",
 "v28","v29","v30","v31"
);

}