/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2023, SiFive, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// clang-format off
#include "blis.h"
#include "../riscv_cmul_macros_asm.h"
#include <math.h>
#include <riscv_vector.h>
#include <stdbool.h>
#include <stddef.h>

// byte-size of the floating point type
#define FLT_SIZE 4
#define FLT_LOAD "flw "
#define VLE "vle32.v "
#define VLSE "vlse32.v "
#define VSE "vse32.v "
#define VSSE "vsse32.v "
#define PACKMR 8
#define PACKNR 64

void bli_sgemm_7m4
    (
        dim_t N,
        dim_t K,
        const float* restrict alpha,
        const float* restrict a,
        const float* restrict b,
        const float* restrict beta,
        float* restrict c,
        inc_t rsc,
        inc_t csc
    )
{
    // 7 x N x K sgemm, 0 < N <= 64 = vlmax, K > 0
    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));

    bool first = true;
    // compute a*b
    for (dim_t k = 0; k < K; ++k) {
        __asm__(VLE "v28, (%0)" : : "r"(b));
        if (first) {
            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
            __asm__("vfmul.vf v0, v28, ft0");
            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
            __asm__("vfmul.vf v4, v28, ft1");
            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
            __asm__("vfmul.vf v8, v28, ft2");
            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
            __asm__("vfmul.vf v12, v28, ft3");
            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
            __asm__("vfmul.vf v16, v28, ft4");
            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
            __asm__("vfmul.vf v20, v28, ft5");
            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
            __asm__("vfmul.vf v24, v28, ft6");
            first = false;
        }
        else {
            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
            __asm__("vfmacc.vf v0, ft0, v28");
            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
            __asm__("vfmacc.vf v4, ft1, v28");
            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
            __asm__("vfmacc.vf v8, ft2, v28");
            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
            __asm__("vfmacc.vf v12, ft3, v28");
            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
            __asm__("vfmacc.vf v16, ft4, v28");
            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 *
FLT_SIZE)); __asm__("vfmacc.vf v20, ft5, v28"); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__("vfmacc.vf v24, ft6, v28"); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); } rsc *= FLT_SIZE; csc *= FLT_SIZE; __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // compute alpha*a*b + beta*c if (*beta == 0.f) { __asm__("vfmul.vf v0, v0, ft10"); __asm__("vfmul.vf v4, v4, ft10"); __asm__("vfmul.vf v8, v8, ft10"); __asm__("vfmul.vf v12, v12, ft10"); __asm__("vfmul.vf v16, v16, ft10"); __asm__("vfmul.vf v20, v20, ft10"); __asm__("vfmul.vf v24, v24, ft10"); } else { // beta != 0.f __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); float *c_tmp = c; if (csc == FLT_SIZE) { // c unit column stride __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v0, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v24, v24, ft10"); __asm__("vfmacc.vf v24, ft11, v28"); } // end c unit column stride else { // c non-unit column stride __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v0, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v24, v24, ft10"); __asm__("vfmacc.vf v24, ft11, v28"); } // end c non-unit column stride } // end beta != 0.f // store c if (csc == FLT_SIZE) { __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSE "v4, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSE "v8, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSE "v12, (%0)" : : "r"(c)); __asm__("add 
%0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v16, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v20, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v24, (%0)" : : "r"(c));
    }
    else {
        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
    }
    return;
}

void bli_sgemm_7m4_cleanup
    (
        dim_t M,
        dim_t N,
        dim_t K,
        const float* restrict alpha,
        const float* restrict a,
        const float* restrict b,
        const float* restrict beta,
        float* restrict c,
        inc_t rsc,
        inc_t csc
    )
{
    // M x N x K sgemm, 0 < M < 7, 0 < N <= 64 = vlmax, K > 0
    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));

    bool first = true;
    // compute a*b
    for (dim_t k = 0; k < K; ++k) {
        __asm__(VLE "v28, (%0)" : : "r"(b));
        if (first) {
            switch (M) {
            case 6:
                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
                __asm__("vfmul.vf v20, v28, ft5");
            case 5:
                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
                __asm__("vfmul.vf v16, v28, ft4");
            case 4:
                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
                __asm__("vfmul.vf v12, v28, ft3");
            case 3:
                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
                __asm__("vfmul.vf v8, v28, ft2");
            case 2:
                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
                __asm__("vfmul.vf v4, v28, ft1");
            case 1:
                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
                __asm__("vfmul.vf v0, v28, ft0");
            }
            first = false;
        }
        else {
            switch (M) {
            case 6:
                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
                __asm__("vfmacc.vf v20, ft5, v28");
            case 5:
                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
                __asm__("vfmacc.vf v16, ft4, v28");
            case 4:
                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
                __asm__("vfmacc.vf v12, ft3, v28");
            case 3:
                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
                __asm__("vfmacc.vf v8, ft2, v28");
            case 2:
                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
                __asm__("vfmacc.vf v4, ft1, v28");
            case 1:
                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
                __asm__("vfmacc.vf v0, ft0, v28");
            }
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
    }

    c += (M - 1) * rsc;
    rsc *= FLT_SIZE;
    csc *= FLT_SIZE;

    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
    // compute alpha*a*b + beta*c
    if (*beta == 0.f) {
        switch (M) {
        case 6: __asm__("vfmul.vf v20, v20, ft10");
        case 5: __asm__("vfmul.vf v16, v16, ft10");
        case 4: __asm__("vfmul.vf v12, v12, ft10");
        case 3: __asm__("vfmul.vf v8, v8, ft10");
        case 2: __asm__("vfmul.vf v4, v4, ft10");
        case 1: __asm__("vfmul.vf v0, v0, ft10");
        }
    }
    else { // beta != 0.f
        __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta));
        float *c_tmp = c;
        if (csc == FLT_SIZE) {
            switch (M) {
            case 6:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v20, v20, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v20, ft11, v28");
            case 5:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v16, v16, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v16, ft11, v28");
            case 4:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v12, v12, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v12, ft11, v28");
            case 3:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v8, v8, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v8, ft11, v28");
            case 2:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v4, v4, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v4, ft11, v28");
            case 1:
                __asm__(VLE "v28, (%0)" : : "r"(c_tmp));
                __asm__("vfmul.vf v0, v0, ft10");
                __asm__("vfmacc.vf v0, ft11, v28");
            }
        } // end c unit column stride
        else { // c non-unit column stride
            switch (M) {
            case 6:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v20, v20, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v20, ft11, v28");
            case 5:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v16, v16, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v16, ft11, v28");
            case 4:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v12, v12, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v12, ft11, v28");
            case 3:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v8, v8, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v8, ft11, v28");
            case 2:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v4, v4, ft10");
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                __asm__("vfmacc.vf v4, ft11, v28");
            case 1:
                __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("vfmul.vf v0, v0, ft10");
                __asm__("vfmacc.vf v0, ft11, v28");
            }
        } // end c non-unit column stride
    } // end beta != 0.f

    // store c
    if (csc == FLT_SIZE) {
        switch (M) {
        case 6:
            __asm__(VSE "v20, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 5:
            __asm__(VSE "v16, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSE "v12, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSE "v8, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSE "v4, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSE "v0, (%0)" : : "r"(c));
        }
    }
    else {
        switch (M) {
        case 6:
            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 5:
            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
        }
    }
    return;
}

void bli_sgemm_7m4_k0
    (
        dim_t M,
        dim_t N,
        const float* restrict beta,
        float* restrict c,
        inc_t rsc,
        inc_t csc
    )
{
    // 0 < M <= 7, 0 < N <= 64 = vlmax, K = 0
    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
__asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); c += (M - 1) * rsc; rsc *= FLT_SIZE; csc *= FLT_SIZE; if (*beta == 0.f) { // set c to 0 __asm__("vmv.v.i v0, 0"); if (csc == FLT_SIZE) { // c unit column stride switch (M) { case 7: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VSE "v0, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VSE "v0, (%0)" : : "r"(c)); } } // end c unit column stride else { // c non-unit column stride switch (M) { case 7: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); } } // end c non-unit column stride } // end beta == 0.f else { // beta != 0.f __asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); if (csc == FLT_SIZE) { // c unit column stride switch (M) { case 7: __asm__(VLE "v24, (%0)" : : "r"(c)); __asm__("vfmul.vf v24, v24, ft0"); __asm__(VSE "v24, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VLE "v20, (%0)" : : "r"(c)); __asm__("vfmul.vf v20, v20, ft0"); __asm__(VSE "v20, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VLE "v16, (%0)" : : "r"(c)); __asm__("vfmul.vf v16, v16, ft0"); __asm__(VSE "v16, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VLE "v12, (%0)" : : "r"(c)); __asm__("vfmul.vf v12, v12, ft0"); __asm__(VSE "v12, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VLE "v8, (%0)" : : "r"(c)); __asm__("vfmul.vf v8, v8, ft0"); __asm__(VSE "v8, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VLE "v4, (%0)" : : "r"(c)); __asm__("vfmul.vf v4, v4, ft0"); __asm__(VSE "v4, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VLE "v0, (%0)" : : "r"(c)); __asm__("vfmul.vf v0, v0, ft0"); __asm__(VSE "v0, (%0)" : : "r"(c)); } } // end c unit column stride else { // c non-unit column stride switch (M) { case 7: __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v24, v24, ft0"); __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v20, v20, ft0"); __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v16, v16, ft0"); __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, 
%1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v12, v12, ft0"); __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v8, v8, ft0"); __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v4, v4, ft0"); __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v0, v0, ft0"); __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); } } // end c non-unit column stride } // end beta != 0.f return; } void bli_sgemm_sifive_x280_asm_7m4 ( dim_t M, dim_t N, dim_t K, const void* restrict alpha_, const void* restrict a_, const void* restrict b_, const void* restrict beta_, void* restrict c_, inc_t rsc, inc_t csc, auxinfo_t* restrict data, const cntx_t* restrict cntx ) { (void) data; (void) cntx; const float* restrict alpha = alpha_; const float* restrict beta = beta_; const float* restrict a = a_; const float* restrict b = b_; float* restrict c = c_; // M x N x K sgemm if (M <= 0 || N <= 0 || K < 0) return; else if (K == 0) bli_sgemm_7m4_k0(M, N, beta, c, rsc, csc); else if (M == 7) bli_sgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); else bli_sgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); return; } #undef FLT_SIZE #undef FLT_LOAD #undef VLE #undef VLSE #undef VSE #undef VSSE #undef PACKMR #undef PACKNR // byte-size of the floating point type #define FLT_SIZE 8 #define FLT_LOAD "fld " #define VLE "vle64.v " #define VLSE "vlse64.v " #define VSE "vse64.v " #define VSSE "vsse64.v " #define PACKMR 8 #define PACKNR 32 void bli_dgemm_7m4 ( dim_t N, dim_t K, const double* restrict alpha, const double* restrict a, const double* restrict b, const double* restrict beta, double* restrict c, inc_t rsc, inc_t csc ) { // 7 x N x K dgemm, 0 < N <= 64 = vlmax, K > 0 __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); bool first = true; // compute a*b for (dim_t k = 0; k < K; ++k) { __asm__(VLE "v28, (%0)" : : "r"(b)); if (first) { __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__("vfmul.vf v0, v28, ft0"); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); __asm__("vfmul.vf v4, v28, ft1"); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__("vfmul.vf v8, v28, ft2"); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); __asm__("vfmul.vf v12, v28, ft3"); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__("vfmul.vf v16, v28, ft4"); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); __asm__("vfmul.vf v20, v28, ft5"); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__("vfmul.vf v24, v28, ft6"); first = false; } else { __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__("vfmacc.vf v0, ft0, v28"); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); __asm__("vfmacc.vf v4, ft1, v28"); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__("vfmacc.vf v8, ft2, v28"); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); __asm__("vfmacc.vf v12, ft3, v28"); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__("vfmacc.vf v16, ft4, v28"); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); 
__asm__("vfmacc.vf v20, ft5, v28"); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__("vfmacc.vf v24, ft6, v28"); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE)); } rsc *= FLT_SIZE; csc *= FLT_SIZE; __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha)); // compute alpha*a*b + beta*c if (*beta == 0.) { __asm__("vfmul.vf v0, v0, ft10"); __asm__("vfmul.vf v4, v4, ft10"); __asm__("vfmul.vf v8, v8, ft10"); __asm__("vfmul.vf v12, v12, ft10"); __asm__("vfmul.vf v16, v16, ft10"); __asm__("vfmul.vf v20, v20, ft10"); __asm__("vfmul.vf v24, v24, ft10"); } else { // beta != 0. __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); double *c_tmp = c; if (csc == FLT_SIZE) { // c unit column stride __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v0, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v24, v24, ft10"); __asm__("vfmacc.vf v24, ft11, v28"); } // end c unit column stride else { // c non-unit column stride __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v0, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v24, v24, ft10"); __asm__("vfmacc.vf v24, ft11, v28"); } // end c non-unit column stride } // end beta != 0. 
    // store c
    if (csc == FLT_SIZE) {
        __asm__(VSE "v0, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v4, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v8, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v12, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v16, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v20, (%0)" : : "r"(c));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSE "v24, (%0)" : : "r"(c));
    }
    else {
        __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
        __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc));
        __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc));
    }
    return;
}

void bli_dgemm_7m4_cleanup
    (
        dim_t M,
        dim_t N,
        dim_t K,
        const double* restrict alpha,
        const double* restrict a,
        const double* restrict b,
        const double* restrict beta,
        double* restrict c,
        inc_t rsc,
        inc_t csc
    )
{
    // M x N x K dgemm, 0 < M < 7, 0 < N <= 64 = vlmax, K > 0
    __asm__("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));

    bool first = true;
    // compute a*b
    for (dim_t k = 0; k < K; ++k) {
        __asm__(VLE "v28, (%0)" : : "r"(b));
        if (first) {
            switch (M) {
            case 6:
                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
                __asm__("vfmul.vf v20, v28, ft5");
            case 5:
                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
                __asm__("vfmul.vf v16, v28, ft4");
            case 4:
                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
                __asm__("vfmul.vf v12, v28, ft3");
            case 3:
                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
                __asm__("vfmul.vf v8, v28, ft2");
            case 2:
                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
                __asm__("vfmul.vf v4, v28, ft1");
            case 1:
                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
                __asm__("vfmul.vf v0, v28, ft0");
            }
            first = false;
        }
        else {
            switch (M) {
            case 6:
                __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
                __asm__("vfmacc.vf v20, ft5, v28");
            case 5:
                __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
                __asm__("vfmacc.vf v16, ft4, v28");
            case 4:
                __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
                __asm__("vfmacc.vf v12, ft3, v28");
            case 3:
                __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
                __asm__("vfmacc.vf v8, ft2, v28");
            case 2:
                __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
                __asm__("vfmacc.vf v4, ft1, v28");
            case 1:
                __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
                __asm__("vfmacc.vf v0, ft0, v28");
            }
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * FLT_SIZE));
        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * FLT_SIZE));
    }

    c += (M - 1) * rsc;
    rsc *= FLT_SIZE;
    csc *= FLT_SIZE;

    __asm__(FLT_LOAD "ft10, (%0)" : : "r"(alpha));
    // compute alpha*a*b + beta*c
    if (*beta == 0.)
{ switch (M) { case 6: __asm__("vfmul.vf v20, v20, ft10"); case 5: __asm__("vfmul.vf v16, v16, ft10"); case 4: __asm__("vfmul.vf v12, v12, ft10"); case 3: __asm__("vfmul.vf v8, v8, ft10"); case 2: __asm__("vfmul.vf v4, v4, ft10"); case 1: __asm__("vfmul.vf v0, v0, ft10"); } } else { // beta != 0. __asm__(FLT_LOAD "ft11, (%0)" : : "r"(beta)); double *c_tmp = c; if (csc == FLT_SIZE) { switch (M) { case 6: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); case 5: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); case 4: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); case 3: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); case 2: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); case 1: __asm__(VLE "v28, (%0)" : : "r"(c_tmp)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("vfmacc.vf v0, ft11, v28"); } } // end c unit column stride else { // c non-unit column stride switch (M) { case 6: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v20, v20, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v20, ft11, v28"); case 5: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v16, v16, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v16, ft11, v28"); case 4: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v12, v12, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v12, ft11, v28"); case 3: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v8, v8, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v8, ft11, v28"); case 2: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v4, v4, ft10"); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__("vfmacc.vf v4, ft11, v28"); case 1: __asm__(VLSE "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("vfmul.vf v0, v0, ft10"); __asm__("vfmacc.vf v0, ft11, v28"); } } // end c non-unit column stride } // end beta != 0. 
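    // Note: the switch statements above and the stores below fall through on
    // purpose: c was advanced to row M-1 before the switch, so an M-row edge
    // case enters at case M and walks back toward row 0, stepping the pointer
    // by -rsc at each case label.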
    // store c
    if (csc == FLT_SIZE) {
        switch (M) {
        case 6:
            __asm__(VSE "v20, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 5:
            __asm__(VSE "v16, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSE "v12, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSE "v8, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSE "v4, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSE "v0, (%0)" : : "r"(c));
        }
    }
    else {
        switch (M) {
        case 6:
            __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 5:
            __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
        }
    }
    return;
}

void bli_dgemm_7m4_k0
    (
        dim_t M,
        dim_t N,
        const double* restrict beta,
        double* restrict c,
        inc_t rsc,
        inc_t csc
    )
{
    // 0 < M <= 7, 0 < N <= 64 = vlmax, K = 0
    // This may not produce the same result as the reference kernel if alpha is infinite or NaN.
    __asm__ volatile("vsetvli zero, %0, e%1, m4, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));

    c += (M - 1) * rsc;
    rsc *= FLT_SIZE;
    csc *= FLT_SIZE;

    if (*beta == 0.) {
        // set c to 0
        __asm__("vmv.v.i v0, 0");
        if (csc == FLT_SIZE) { // c unit column stride
            switch (M) {
            case 7:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 6:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 5:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 4:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 3:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 2:
                __asm__(VSE "v0, (%0)" : : "r"(c));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 1:
                __asm__(VSE "v0, (%0)" : : "r"(c));
            }
        } // end c unit column stride
        else { // c non-unit column stride
            switch (M) {
            case 7:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 6:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 5:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 4:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 3:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 2:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
            case 1:
                __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc));
            }
        } // end c non-unit column stride
    } // end beta == 0.
    else { // beta != 0.
__asm__(FLT_LOAD "ft0, (%0)" : : "r"(beta)); if (csc == FLT_SIZE) { // c unit column stride switch (M) { case 7: __asm__(VLE "v24, (%0)" : : "r"(c)); __asm__("vfmul.vf v24, v24, ft0"); __asm__(VSE "v24, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VLE "v20, (%0)" : : "r"(c)); __asm__("vfmul.vf v20, v20, ft0"); __asm__(VSE "v20, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VLE "v16, (%0)" : : "r"(c)); __asm__("vfmul.vf v16, v16, ft0"); __asm__(VSE "v16, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VLE "v12, (%0)" : : "r"(c)); __asm__("vfmul.vf v12, v12, ft0"); __asm__(VSE "v12, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VLE "v8, (%0)" : : "r"(c)); __asm__("vfmul.vf v8, v8, ft0"); __asm__(VSE "v8, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VLE "v4, (%0)" : : "r"(c)); __asm__("vfmul.vf v4, v4, ft0"); __asm__(VSE "v4, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VLE "v0, (%0)" : : "r"(c)); __asm__("vfmul.vf v0, v0, ft0"); __asm__(VSE "v0, (%0)" : : "r"(c)); } } // end c unit column stride else { // c non-unit column stride switch (M) { case 7: __asm__(VLSE "v24, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v24, v24, ft0"); __asm__(VSSE "v24, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 6: __asm__(VLSE "v20, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v20, v20, ft0"); __asm__(VSSE "v20, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 5: __asm__(VLSE "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v16, v16, ft0"); __asm__(VSSE "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VLSE "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v12, v12, ft0"); __asm__(VSSE "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VLSE "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v8, v8, ft0"); __asm__(VSSE "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VLSE "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v4, v4, ft0"); __asm__(VSSE "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VLSE "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("vfmul.vf v0, v0, ft0"); __asm__(VSSE "v0, (%0), %1" : : "r"(c), "r"(csc)); } } // end c non-unit column stride } // end beta != 0. 
return; } void bli_dgemm_sifive_x280_asm_7m4 ( dim_t M, dim_t N, dim_t K, const void* restrict alpha_, const void* restrict a_, const void* restrict b_, const void* restrict beta_, void* restrict c_, inc_t rsc, inc_t csc, auxinfo_t* restrict data, const cntx_t* restrict cntx ) { (void) data; (void) cntx; const double* restrict alpha = alpha_; const double* restrict beta = beta_; const double* restrict a = a_; const double* restrict b = b_; double* restrict c = c_; // M x N x K dgemm if (M <= 0 || N <= 0 || K < 0) return; else if (K == 0) bli_dgemm_7m4_k0(M, N, beta, c, rsc, csc); else if (M == 7) bli_dgemm_7m4(N, K, alpha, a, b, beta, c, rsc, csc); else bli_dgemm_7m4_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); return; } #undef FLT_SIZE #undef FLT_LOAD #undef VLE #undef VLSE #undef VSE #undef VSSE #undef PACKMR #undef PACKNR // byte-size of underlying floating point type #define FLT_SIZE 4 #define FLT_LOAD "flw " #define VLSEG2 "vlseg2e32.v " #define VLSSEG2 "vlsseg2e32.v " #define VSSEG2 "vsseg2e32.v " #define VSSSEG2 "vssseg2e32.v " #define PACKMR 8 #define PACKNR 32 void bli_cgemm_6m2 ( dim_t N, dim_t K, const scomplex* restrict alpha, const scomplex* restrict a, const scomplex* restrict b, const scomplex* restrict beta, scomplex* restrict c, inc_t rsc, inc_t csc ) { // 6 x N x K cgemm, N <= 32 = vlmax, K > 0 // pairs of register groups hold the real and imag. parts of rows of c and b __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmul_vf(v0, v2, v24, v26, ft0, ft1); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmul_vf(v4, v6, v24, v26, ft2, ft3); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmul_vf(v8, v10, v24, v26, ft4, ft5); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmul_vf(v12, v14, v24, v26, ft6, ft7); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmul_vf(v16, v18, v24, v26, ft8, ft9); __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); vcmul_vf(v20, v22, v24, v26, ft10, ft11); K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); while (K > 0) { __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v28, v30); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v28, v30); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v28, v30); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); 
vcmacc_vf(v12, v14, ft6, ft7, v28, v30); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v28, v30); __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); vcmacc_vf(v20, v22, ft10, ft11, v28, v30); K -= 1; if (K == 0) { break; } if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v24, v26); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v24, v26); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v24, v26); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v24, v26); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v24, v26); __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); vcmacc_vf(v20, v22, ft10, ft11, v24, v26); K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); } rsc *= 2 * FLT_SIZE; csc *= 2 * FLT_SIZE; __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); __asm__("vfmul.vf v24, v2, ft1"); __asm__("vfmul.vf v26, v0, ft1"); __asm__("vfmul.vf v28, v6, ft1"); __asm__("vfmul.vf v30, v4, ft1"); __asm__("vfmsub.vf v0, ft0, v24"); __asm__("vfmadd.vf v2, ft0, v26"); __asm__("vfmsub.vf v4, ft0, v28"); __asm__("vfmadd.vf v6, ft0, v30"); __asm__("vfmul.vf v24, v10, ft1"); __asm__("vfmul.vf v26, v8, ft1"); __asm__("vfmul.vf v28, v14, ft1"); __asm__("vfmul.vf v30, v12, ft1"); __asm__("vfmsub.vf v8, ft0, v24"); __asm__("vfmadd.vf v10, ft0, v26"); __asm__("vfmsub.vf v12, ft0, v28"); __asm__("vfmadd.vf v14, ft0, v30"); __asm__("vfmul.vf v24, v18, ft1"); __asm__("vfmul.vf v26, v16, ft1"); __asm__("vfmul.vf v28, v22, ft1"); __asm__("vfmul.vf v30, v20, ft1"); __asm__("vfmsub.vf v16, ft0, v24"); __asm__("vfmadd.vf v18, ft0, v26"); __asm__("vfmsub.vf v20, ft0, v28"); __asm__("vfmadd.vf v22, ft0, v30"); scomplex beta_cast = *beta; if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { if (csc == 2 * FLT_SIZE) { scomplex *c_tmp = c; __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); __asm__(VLSEG2 "v24, (%0)" : : 
"r"(c_tmp)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); } else { scomplex *c_tmp = c; __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); vcmacc_vf2(v20, v22, beta_cast.real, beta_cast.imag, v28, v30); } } if (csc == 2 * FLT_SIZE) { __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); } else { __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); } return; } void bli_cgemm_6m2_cleanup ( dim_t M, dim_t N, dim_t K, const scomplex* restrict alpha, const scomplex* restrict a, const scomplex* restrict b, const scomplex* restrict beta, scomplex* restrict c, inc_t rsc, inc_t csc ) { // M x N x K cgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 // pairs of register groups hold the real and imag. 
parts of rows of c and b __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } switch (M) { case 5: __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmul_vf(v16, v18, v24, v26, ft8, ft9); case 4: __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmul_vf(v12, v14, v24, v26, ft6, ft7); case 3: __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmul_vf(v8, v10, v24, v26, ft4, ft5); case 2: __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmul_vf(v4, v6, v24, v26, ft2, ft3); case 1: __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmul_vf(v0, v2, v24, v26, ft0, ft1); } K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); while (K > 0) { switch (M) { case 5: __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v28, v30); case 4: __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v28, v30); case 3: __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v28, v30); case 2: __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v28, v30); case 1: __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v28, v30); } K -= 1; if (K == 0) { break; } if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); switch (M) { case 5: __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v24, v26); case 4: __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v24, v26); case 3: __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v24, v26); case 2: __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v24, v26); case 1: __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v24, v26); } K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } 
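        // The rows of b are double-buffered: v24/v26 and v28/v30 alternate
        // between holding the current row (real/imag parts) and the next one
        // being loaded, so each vlseg2 load overlaps with the vcmacc_vf work
        // consuming the other buffer.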
__asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); } c += (M - 1) * rsc; rsc *= 2 * FLT_SIZE; csc *= 2 * FLT_SIZE; __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE)); switch (M) { case 5: __asm__("vfmul.vf v24, v18, ft1"); __asm__("vfmul.vf v26, v16, ft1"); __asm__("vfmsub.vf v16, ft0, v24"); __asm__("vfmadd.vf v18, ft0, v26"); case 4: __asm__("vfmul.vf v28, v14, ft1"); __asm__("vfmul.vf v30, v12, ft1"); __asm__("vfmsub.vf v12, ft0, v28"); __asm__("vfmadd.vf v14, ft0, v30"); case 3: __asm__("vfmul.vf v24, v10, ft1"); __asm__("vfmul.vf v26, v8, ft1"); __asm__("vfmsub.vf v8, ft0, v24"); __asm__("vfmadd.vf v10, ft0, v26"); case 2: __asm__("vfmul.vf v28, v6, ft1"); __asm__("vfmul.vf v30, v4, ft1"); __asm__("vfmsub.vf v4, ft0, v28"); __asm__("vfmadd.vf v6, ft0, v30"); case 1: __asm__("vfmul.vf v24, v2, ft1"); __asm__("vfmul.vf v26, v0, ft1"); __asm__("vfmsub.vf v0, ft0, v24"); __asm__("vfmadd.vf v2, ft0, v26"); } scomplex beta_cast = *beta; if (beta_cast.real != 0.f || beta_cast.imag != 0.f) { if (csc == 2 * FLT_SIZE) { scomplex *c_tmp = c; switch (M) { case 5: __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); case 4: __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); case 3: __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); case 2: __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); case 1: __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp)); vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); } } else { scomplex *c_tmp = c; switch (M) { case 5: __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26); case 4: __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30); case 3: __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26); case 2: __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc)); vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30); case 1: __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc)); vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26); } } } if (csc == 2 * FLT_SIZE) { switch (M) { case 5: __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 3: __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); } } else { switch (M) { case 5: __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 4: __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : 
"+r"(c) : "r"(rsc)); case 3: __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 2: __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc)); case 1: __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); } } return; } void bli_cgemm_6m2_k0 ( dim_t M, dim_t N, const scomplex* restrict beta, scomplex* restrict c, inc_t rsc, inc_t csc ) { // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0 // This may not produce the same result as the reference kernel if alpha is infinite or NaN. __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); csc *= 2 * FLT_SIZE; scomplex beta_cast = *beta; if (beta_cast.real == 0.f && beta_cast.imag == 0.f) { // set c to 0 __asm__("vmv.v.i v0, 0"); __asm__("vmv.v.i v2, 0"); for (size_t i = 0; i < M; ++i) { if (csc == 2 * FLT_SIZE) __asm__(VSSEG2 "v0, (%0)" : : "r"(c)); else __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); c += rsc; } } else { // scale c by beta for (size_t i = 0; i < M; ++i) { if (csc == 2 * FLT_SIZE) { __asm__(VLSEG2 "v0, (%0)" : : "r"(c)); vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); } else { __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag); __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); } c += rsc; } } return; } void bli_cgemm_sifive_x280_asm_6m2 ( dim_t M, dim_t N, dim_t K, const void* restrict alpha_, const void* restrict a_, const void* restrict b_, const void* restrict beta_, void* restrict c_, inc_t rsc, inc_t csc, auxinfo_t* restrict data, const cntx_t* restrict cntx ) { // M x N x K cgemm (void) data; (void) cntx; const scomplex* restrict alpha = alpha_; const scomplex* restrict beta = beta_; const scomplex* restrict a = a_; const scomplex* restrict b = b_; scomplex* restrict c = c_; if (M <= 0 || N <= 0 || K < 0) return; else if (K == 0) bli_cgemm_6m2_k0(M, N, beta, c, rsc, csc); else if (M == 6) bli_cgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc); else bli_cgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc); return; } #undef FLT_SIZE #undef FLT_LOAD #undef VLSEG2 #undef VLSSEG2 #undef VSSEG2 #undef VSSSEG2 #undef PACKMR #undef PACKNR // byte-size of underlying floating point type #define FLT_SIZE 8 #define FLT_LOAD "fld " #define VLSEG2 "vlseg2e64.v " #define VLSSEG2 "vlsseg2e64.v " #define VSSEG2 "vsseg2e64.v " #define VSSSEG2 "vssseg2e64.v " #define PACKMR 8 #define PACKNR 16 void bli_zgemm_6m2 ( dim_t N, dim_t K, const dcomplex* restrict alpha, const dcomplex* restrict a, const dcomplex* restrict b, const dcomplex* restrict beta, dcomplex* restrict c, inc_t rsc, inc_t csc ) { // 6 x N x K zgemm, N <= 32 = vlmax, K > 0 // pairs of register groups hold the real and imag. 
parts of rows of c and b __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmul_vf(v0, v2, v24, v26, ft0, ft1); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmul_vf(v4, v6, v24, v26, ft2, ft3); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmul_vf(v8, v10, v24, v26, ft4, ft5); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmul_vf(v12, v14, v24, v26, ft6, ft7); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmul_vf(v16, v18, v24, v26, ft8, ft9); __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); vcmul_vf(v20, v22, v24, v26, ft10, ft11); K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); while (K > 0) { __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v28, v30); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v28, v30); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v28, v30); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v28, v30); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v28, v30); __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE)); __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE)); vcmacc_vf(v20, v22, ft10, ft11, v28, v30); K -= 1; if (K == 0) { break; } if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmacc_vf(v0, v2, ft0, ft1, v24, v26); __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v24, v26); __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v24, v26); __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v24, v26); __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); 
    while (K > 0) {
        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
        vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
        vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
        vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
        vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
        vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
        vcmacc_vf(v20, v22, ft10, ft11, v28, v30);
        K -= 1;
        if (K == 0) { break; }
        if (K >= 2) {
            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));

        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
        vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
        vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
        vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
        vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
        vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
        __asm__(FLT_LOAD "ft10, %1(%0)" : : "r"(a), "I"(10 * FLT_SIZE));
        __asm__(FLT_LOAD "ft11, %1(%0)" : : "r"(a), "I"(11 * FLT_SIZE));
        vcmacc_vf(v20, v22, ft10, ft11, v24, v26);
        K -= 1;
        if (K >= 2) {
            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
    }

    rsc *= 2 * FLT_SIZE;
    csc *= 2 * FLT_SIZE;

    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
    __asm__("vfmul.vf v24, v2, ft1");
    __asm__("vfmul.vf v26, v0, ft1");
    __asm__("vfmul.vf v28, v6, ft1");
    __asm__("vfmul.vf v30, v4, ft1");
    __asm__("vfmsub.vf v0, ft0, v24");
    __asm__("vfmadd.vf v2, ft0, v26");
    __asm__("vfmsub.vf v4, ft0, v28");
    __asm__("vfmadd.vf v6, ft0, v30");
    __asm__("vfmul.vf v24, v10, ft1");
    __asm__("vfmul.vf v26, v8, ft1");
    __asm__("vfmul.vf v28, v14, ft1");
    __asm__("vfmul.vf v30, v12, ft1");
    __asm__("vfmsub.vf v8, ft0, v24");
    __asm__("vfmadd.vf v10, ft0, v26");
    __asm__("vfmsub.vf v12, ft0, v28");
    __asm__("vfmadd.vf v14, ft0, v30");
    __asm__("vfmul.vf v24, v18, ft1");
    __asm__("vfmul.vf v26, v16, ft1");
    __asm__("vfmul.vf v28, v22, ft1");
    __asm__("vfmul.vf v30, v20, ft1");
    __asm__("vfmsub.vf v16, ft0, v24");
    __asm__("vfmadd.vf v18, ft0, v26");
    __asm__("vfmsub.vf v20, ft0, v28");
    __asm__("vfmadd.vf v22, ft0, v30");
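    // The block above scales the accumulators in place by the complex scalar
    // alpha = ft0 + i*ft1. It first stages t_r = acc_i * alpha_i and
    // t_i = acc_r * alpha_i in v24-v30, then
    //
    //     vfmsub: acc_r = acc_r * alpha_r - t_r
    //     vfmadd: acc_i = acc_i * alpha_r + t_i
    //
    // which is exactly (acc_r + i*acc_i) * (alpha_r + i*alpha_i). Staging in
    // v24-v30 is safe here since b is no longer needed once the k-loop ends.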
"r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v4, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v8, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v12, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v16, (%0)" : : "r"(c)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSEG2 "v20, (%0)" : : "r"(c)); } else { __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc)); __asm__("add %0, %0, %1" : "+r"(c) : "r"(rsc)); __asm__(VSSSEG2 "v20, (%0), %1" : : "r"(c), "r"(csc)); } return; } void bli_zgemm_6m2_cleanup ( dim_t M, dim_t N, dim_t K, const dcomplex* restrict alpha, const dcomplex* restrict a, const dcomplex* restrict b, const dcomplex* restrict beta, dcomplex* restrict c, inc_t rsc, inc_t csc ) { // M x N x K zgemm, 0 < M < 6, N <= 32 = vlmax, K > 0 // pairs of register groups hold the real and imag. parts of rows of c and b __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE)); __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); if (K >= 2) { __asm__(VLSEG2 "v28, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } switch (M) { case 5: __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmul_vf(v16, v18, v24, v26, ft8, ft9); case 4: __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmul_vf(v12, v14, v24, v26, ft6, ft7); case 3: __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmul_vf(v8, v10, v24, v26, ft4, ft5); case 2: __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmul_vf(v4, v6, v24, v26, ft2, ft3); case 1: __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE)); __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE)); vcmul_vf(v0, v2, v24, v26, ft0, ft1); } K -= 1; if (K >= 2) { __asm__(VLSEG2 "v24, (%0)" : : "r"(b)); __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE)); } __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE)); while (K > 0) { switch (M) { case 5: __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE)); __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE)); vcmacc_vf(v16, v18, ft8, ft9, v28, v30); case 4: __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE)); __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE)); vcmacc_vf(v12, v14, ft6, ft7, v28, v30); case 3: __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE)); __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE)); vcmacc_vf(v8, v10, ft4, ft5, v28, v30); case 2: __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE)); __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE)); vcmacc_vf(v4, v6, ft2, ft3, v28, v30); case 1: __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 
    __asm__("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));

    __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
    __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
    if (K >= 2) {
        __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
    }

    switch (M) {
    case 5:
        __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
        __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
        vcmul_vf(v16, v18, v24, v26, ft8, ft9);
    case 4:
        __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
        __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
        vcmul_vf(v12, v14, v24, v26, ft6, ft7);
    case 3:
        __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
        __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
        vcmul_vf(v8, v10, v24, v26, ft4, ft5);
    case 2:
        __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
        __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
        vcmul_vf(v4, v6, v24, v26, ft2, ft3);
    case 1:
        __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
        __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
        vcmul_vf(v0, v2, v24, v26, ft0, ft1);
    }
    K -= 1;

    if (K >= 2) {
        __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
        __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
    }
    __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));

    while (K > 0) {
        switch (M) {
        case 5:
            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
            vcmacc_vf(v16, v18, ft8, ft9, v28, v30);
        case 4:
            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
            vcmacc_vf(v12, v14, ft6, ft7, v28, v30);
        case 3:
            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
            vcmacc_vf(v8, v10, ft4, ft5, v28, v30);
        case 2:
            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
            vcmacc_vf(v4, v6, ft2, ft3, v28, v30);
        case 1:
            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
            vcmacc_vf(v0, v2, ft0, ft1, v28, v30);
        }
        K -= 1;
        if (K == 0) { break; }
        if (K >= 2) {
            __asm__(VLSEG2 "v28, (%0)" : : "r"(b));
            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));

        switch (M) {
        case 5:
            __asm__(FLT_LOAD "ft8, %1(%0)" : : "r"(a), "I"(8 * FLT_SIZE));
            __asm__(FLT_LOAD "ft9, %1(%0)" : : "r"(a), "I"(9 * FLT_SIZE));
            vcmacc_vf(v16, v18, ft8, ft9, v24, v26);
        case 4:
            __asm__(FLT_LOAD "ft6, %1(%0)" : : "r"(a), "I"(6 * FLT_SIZE));
            __asm__(FLT_LOAD "ft7, %1(%0)" : : "r"(a), "I"(7 * FLT_SIZE));
            vcmacc_vf(v12, v14, ft6, ft7, v24, v26);
        case 3:
            __asm__(FLT_LOAD "ft4, %1(%0)" : : "r"(a), "I"(4 * FLT_SIZE));
            __asm__(FLT_LOAD "ft5, %1(%0)" : : "r"(a), "I"(5 * FLT_SIZE));
            vcmacc_vf(v8, v10, ft4, ft5, v24, v26);
        case 2:
            __asm__(FLT_LOAD "ft2, %1(%0)" : : "r"(a), "I"(2 * FLT_SIZE));
            __asm__(FLT_LOAD "ft3, %1(%0)" : : "r"(a), "I"(3 * FLT_SIZE));
            vcmacc_vf(v4, v6, ft2, ft3, v24, v26);
        case 1:
            __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(a), "I"(0 * FLT_SIZE));
            __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(a), "I"(1 * FLT_SIZE));
            vcmacc_vf(v0, v2, ft0, ft1, v24, v26);
        }
        K -= 1;
        if (K >= 2) {
            __asm__(VLSEG2 "v24, (%0)" : : "r"(b));
            __asm__("addi %0, %0, %1" : "+r"(b) : "I"(PACKNR * 2 * FLT_SIZE));
        }
        __asm__("addi %0, %0, %1" : "+r"(a) : "I"(PACKMR * 2 * FLT_SIZE));
    }

    c += (M - 1) * rsc;  // advance to the last row of c while rsc is still in elements
    rsc *= 2 * FLT_SIZE; // convert both strides to bytes for the asm address math
    csc *= 2 * FLT_SIZE;

    __asm__(FLT_LOAD "ft0, %1(%0)" : : "r"(alpha), "I"(0 * FLT_SIZE));
    __asm__(FLT_LOAD "ft1, %1(%0)" : : "r"(alpha), "I"(1 * FLT_SIZE));
    switch (M) {
    case 5:
        __asm__("vfmul.vf v24, v18, ft1");
        __asm__("vfmul.vf v26, v16, ft1");
        __asm__("vfmsub.vf v16, ft0, v24");
        __asm__("vfmadd.vf v18, ft0, v26");
    case 4:
        __asm__("vfmul.vf v28, v14, ft1");
        __asm__("vfmul.vf v30, v12, ft1");
        __asm__("vfmsub.vf v12, ft0, v28");
        __asm__("vfmadd.vf v14, ft0, v30");
    case 3:
        __asm__("vfmul.vf v24, v10, ft1");
        __asm__("vfmul.vf v26, v8, ft1");
        __asm__("vfmsub.vf v8, ft0, v24");
        __asm__("vfmadd.vf v10, ft0, v26");
    case 2:
        __asm__("vfmul.vf v28, v6, ft1");
        __asm__("vfmul.vf v30, v4, ft1");
        __asm__("vfmsub.vf v4, ft0, v28");
        __asm__("vfmadd.vf v6, ft0, v30");
    case 1:
        __asm__("vfmul.vf v24, v2, ft1");
        __asm__("vfmul.vf v26, v0, ft1");
        __asm__("vfmsub.vf v0, ft0, v24");
        __asm__("vfmadd.vf v2, ft0, v26");
    }
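    // From here on, rows of c are visited in reverse: c was advanced to the
    // microtile's last row above, and the "sub" address updates below step
    // upward one row at a time. This lets the same fall-through switch order
    // (case M-1 first) be reused for the beta update and the final stores.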
    dcomplex beta_cast = *beta;
    if (beta_cast.real != 0. || beta_cast.imag != 0.) {
        if (csc == 2 * FLT_SIZE) {
            dcomplex *c_tmp = c;
            switch (M) {
            case 5:
                __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
            case 4:
                __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
            case 3:
                __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
            case 2:
                __asm__(VLSEG2 "v28, (%0)" : : "r"(c_tmp));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
            case 1:
                __asm__(VLSEG2 "v24, (%0)" : : "r"(c_tmp));
                vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
            }
        }
        else {
            dcomplex *c_tmp = c;
            switch (M) {
            case 5:
                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v16, v18, beta_cast.real, beta_cast.imag, v24, v26);
            case 4:
                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v12, v14, beta_cast.real, beta_cast.imag, v28, v30);
            case 3:
                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v8, v10, beta_cast.real, beta_cast.imag, v24, v26);
            case 2:
                __asm__(VLSSEG2 "v28, (%0), %1" : : "r"(c_tmp), "r"(csc));
                __asm__("sub %0, %0, %1" : "+r"(c_tmp) : "r"(rsc));
                vcmacc_vf2(v4, v6, beta_cast.real, beta_cast.imag, v28, v30);
            case 1:
                __asm__(VLSSEG2 "v24, (%0), %1" : : "r"(c_tmp), "r"(csc));
                vcmacc_vf2(v0, v2, beta_cast.real, beta_cast.imag, v24, v26);
            }
        }
    }

    if (csc == 2 * FLT_SIZE) {
        switch (M) {
        case 5:
            __asm__(VSSEG2 "v16, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSSEG2 "v12, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSSEG2 "v8, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
        }
    }
    else {
        switch (M) {
        case 5:
            __asm__(VSSSEG2 "v16, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 4:
            __asm__(VSSSEG2 "v12, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 3:
            __asm__(VSSSEG2 "v8, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 2:
            __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
            __asm__("sub %0, %0, %1" : "+r"(c) : "r"(rsc));
        case 1:
            __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
        }
    }
    return;
}

void bli_zgemm_6m2_k0
     (
             dim_t M,
             dim_t N,
       const dcomplex* restrict beta,
             dcomplex* restrict c,
             inc_t rsc,
             inc_t csc
     )
{
    // 0 < M <= 6, 0 < N <= 32 = vlmax, K = 0
    // This may not produce the same result as the reference kernel
    // if alpha is infinite or NaN.
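    // (With K = 0 this kernel computes C := beta*C directly. The reference
    // kernel would still form alpha*(A*B) with an empty product, i.e.
    // alpha*0, so an infinite or NaN alpha would inject NaNs into C there;
    // skipping that multiply is the source of the difference noted above.)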
    __asm__ volatile("vsetvli zero, %0, e%1, m2, ta, ma" : : "r"(N), "i"(8 * FLT_SIZE));
    // csc is converted to bytes for the segment store instructions; rsc stays
    // in units of dcomplex elements since it is only used for C pointer
    // arithmetic below.
    csc *= 2 * FLT_SIZE;

    dcomplex beta_cast = *beta;
    if (beta_cast.real == 0. && beta_cast.imag == 0.) {
        // set c to 0
        __asm__("vmv.v.i v0, 0");
        __asm__("vmv.v.i v2, 0");
        for (size_t i = 0; i < M; ++i) {
            if (csc == 2 * FLT_SIZE)
                __asm__(VSSEG2 "v0, (%0)" : : "r"(c));
            else
                __asm__(VSSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
            c += rsc;
        }
    }
    else {
        // scale c by beta
        for (size_t i = 0; i < M; ++i) {
            if (csc == 2 * FLT_SIZE) {
                __asm__(VLSEG2 "v0, (%0)" : : "r"(c));
                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
                __asm__(VSSEG2 "v4, (%0)" : : "r"(c));
            }
            else {
                __asm__(VLSSEG2 "v0, (%0), %1" : : "r"(c), "r"(csc));
                vcmul_vf2(v4, v6, v0, v2, beta_cast.real, beta_cast.imag);
                __asm__(VSSSEG2 "v4, (%0), %1" : : "r"(c), "r"(csc));
            }
            c += rsc;
        }
    }
    return;
}

void bli_zgemm_sifive_x280_asm_6m2
     (
             dim_t M,
             dim_t N,
             dim_t K,
       const void* restrict alpha_,
       const void* restrict a_,
       const void* restrict b_,
       const void* restrict beta_,
             void* restrict c_,
             inc_t rsc,
             inc_t csc,
             auxinfo_t* restrict data,
       const cntx_t* restrict cntx
     )
{
    // M x N x K zgemm
    (void) data;
    (void) cntx;
    const dcomplex* restrict alpha = alpha_;
    const dcomplex* restrict beta = beta_;
    const dcomplex* restrict a = a_;
    const dcomplex* restrict b = b_;
    dcomplex* restrict c = c_;

    // dispatch on the shape of the microtile
    if (M <= 0 || N <= 0 || K < 0)
        return;
    else if (K == 0)
        bli_zgemm_6m2_k0(M, N, beta, c, rsc, csc);
    else if (M == 6)
        bli_zgemm_6m2(N, K, alpha, a, b, beta, c, rsc, csc);
    else
        bli_zgemm_6m2_cleanup(M, N, K, alpha, a, b, beta, c, rsc, csc);
    return;
}

#undef FLT_SIZE
#undef FLT_LOAD
#undef VLSEG2
#undef VLSSEG2
#undef VSSEG2
#undef VSSSEG2
#undef PACKMR
#undef PACKNR
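// Overall, bli_zgemm_sifive_x280_asm_6m2 implements the BLIS zgemm
// microkernel contract C := alpha*A*B + beta*C on an M x N microtile
// (M <= 6, N <= 32), with the A and B panels packed at leading dimensions
// PACKMR and PACKNR. A hypothetical direct call, for illustration only
// (BLIS normally reaches this kernel through the context registered for
// the x280), might look like:
//
//     dcomplex alpha = {1.0, 0.0}, beta = {0.0, 0.0};
//     // a: 6 x k panel packed with row stride 1, column stride PACKMR
//     // b: k x n panel packed with column stride 1, row stride PACKNR
//     bli_zgemm_sifive_x280_asm_6m2(6, n, k, &alpha, a, b, &beta,
//                                   c, rsc, csc, NULL, NULL);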