/***************************************************************************** Copyright (c) 2011-2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #define ASSEMBLER #include "common.h" #define old_bm %rdi #define old_bn %rsi #define old_bk %rdx #define bm %r13 #define bn %r14 #define bk %r15 #define ALPHA %xmm0 #define ba %rcx #define bb %r8 #define C %r9 #define ldc %r10 #define i %r11 #define k %rax #define ptrba %rdi #define ptrbb %rsi #define C0 %rbx #define C1 %rbp #define prebb %r12 #ifndef WINDOWS_ABI #define STACKSIZE 128 #define old_ldc 8+STACKSIZE(%rsp) #define old_offset 16+STACKSIZE(%rsp) #define MEMALPHA_R 48(%rsp) #define MEMALPHA_I 56(%rsp) #define j 64(%rsp) #define OFFSET 72(%rsp) #define kk 80(%rsp) #define kkk 88(%rsp) #else #define STACKSIZE 512 #define OLD_ALPHA_I 40 + STACKSIZE(%rsp) #define OLD_A 48 + STACKSIZE(%rsp) #define OLD_B 56 + STACKSIZE(%rsp) #define OLD_C 64 + STACKSIZE(%rsp) #define old_ldc 72 + STACKSIZE(%rsp) #define old_offset 80 + STACKSIZE(%rsp) #define MEMALPHA_R 224(%rsp) #define MEMALPHA_I 232(%rsp) #define j 240(%rsp) #define OFFSET 248(%rsp) #define kk 256(%rsp) #define kkk 264(%rsp) #endif #define PREFETCH0 prefetcht0 #define PREFETCH1 prefetcht0 #define PREFETCH2 prefetcht0 #define PRESIZE 64 #define xvec0 %xmm0 #define xvec1 %xmm1 #define xvec2 %xmm2 #define xvec3 %xmm3 #define xvec4 %xmm4 #define xvec5 %xmm5 #define xvec6 %xmm6 #define xvec7 %xmm7 #define xvec8 %xmm8 #define xvec9 %xmm9 #define xvec10 %xmm10 #define xvec11 %xmm11 #define xvec12 %xmm12 #define xvec13 %xmm13 #define xvec14 %xmm14 #define xvec15 %xmm15 #define yvec0 %ymm0 #define yvec1 %ymm1 #define yvec2 %ymm2 #define yvec3 %ymm3 #define yvec4 %ymm4 #define yvec5 %ymm5 #define yvec6 %ymm6 #define yvec7 %ymm7 #define yvec8 %ymm8 #define yvec9 %ymm9 #define yvec10 %ymm10 #define yvec11 %ymm11 #define yvec12 %ymm12 #define yvec13 %ymm13 #define yvec14 %ymm14 #define yvec15 %ymm15 #define LEAQ leaq #define ADDQ addq #define MULQ imulq #define SARQ sarq #define SALQ salq #define ANDQ andq #define SUBQ subq 
#define DECQ decq #define JG jg #define JLE jle #define TEST testq #define OR orq #define JNE jne #define JMP jmp #define NOP #define XOR xorpd #undef MOVQ #define MOVQ movq #define XOR_SY vxorps #define XOR_SX vxorps #define LD_SY vmovaps #define LD_SX vmovaps #define LDL_SX vmovlps #define LDL_SY vmovlps #define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps #define ST_SX vmovaps #define STL_SX vmovlps #define STL_SY vmovlps #define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup #define EDUP_SX vmovsldup #define ODUP_SX vmovshdup #define ADD_SY vaddps #define ADD_SX vaddps #define SUB_SY vsubps #define SUB_SX vsubps #define ADDSUB_SY vaddsubps #define ADDSUB_SX vaddsubps #define MUL_SY vmulps #define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps #define BROAD_SY vbroadcastss #define BROAD_SX vbroadcastss #define MOV_SY vmovaps #define MOV_SX vmovaps #define REVS_SY vshufps #define REVS_SX vshufps #define EXTRA_SY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_SY ADD_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX ADD_SX #define ADD2_SX ADDSUB_SX #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define ADD1_SY SUB_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX SUB_SX #define ADD2_SX ADDSUB_SX #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define ADD1_SY SUB_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX SUB_SX #define ADD2_SX ADDSUB_SX #else #define ADD1_SY ADD_SY #define ADD2_SY ADDSUB_SY #define ADD1_SX ADD_SX #define ADD2_SX ADDSUB_SX #endif PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, old_bm movq ARG2, old_bn movq ARG3, old_bk movq OLD_A, ba movq OLD_B, bb movq OLD_C, C movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11 #endif movaps %xmm3, %xmm0 movsd OLD_ALPHA_I, %xmm1 #else movq old_ldc, ldc #ifdef TRMMKERNEL movq old_offset, %r11; #endif #endif vzeroupper vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm movq old_bn, bn movq old_bk, bk salq $ZBASE_SHIFT, ldc #ifdef TRMMKERNEL movq %r11, OFFSET #ifndef LEFT negq %r11; #endif movq %r11, kk; #endif MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C,C0; LEAQ (C,ldc,2),C1; MOVQ bk, k; SALQ $5, k; LEAQ (bb, k, 1), prebb; # Rn=4, SIZE=4 COMPLEX=2 MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif # Initial results register PREFETCH0 0*SIZE(prebb); XOR_SY yvec15, yvec15, yvec15; PREFETCH0 16*SIZE(prebb); ADDQ $32*SIZE, prebb; XOR_SY yvec14, yvec14, yvec14; PREFETCH2 3*SIZE(C0); XOR_SY yvec13, yvec13, yvec13; PREFETCH2 3*SIZE(C0, ldc, 1); XOR_SY yvec12, yvec12, yvec12; PREFETCH2 3*SIZE(C1); 
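######## Complex multiply-accumulate scheme (comment only) ########
# The 8x4 inner kernel below keeps eight ymm accumulators (yvec8-yvec15).
# Per k step, EDUP_SY/ODUP_SY duplicate the real/imaginary parts of four
# packed B values, VPERMILP_SY $0xb1 swaps the re/im lanes of A, and the
# ADD1_*/ADD2_* macros (vaddps/vsubps/vaddsubps, selected by the
# NN/NR/RN/RR #ifdef block above) fold in the conjugation signs.
# A rough scalar sketch of the net update per element for the plain NN
# case, given only as an illustration of what the vector code computes:
#
#   /* c, a, b single-precision complex */
#   c.re += a.re*b.re - a.im*b.im;
#   c.im += a.im*b.re + a.re*b.im;
#
# The ADD1 pass contributes the a.*b.re terms (EDUP of B); the ADD2 pass
# (vaddsubps in the plain case) contributes the a.*b.im terms with the
# sign flip on the real lane.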
EDUP_SY 0*SIZE(ptrbb), yvec2; # Br0, Br1, Br2, Br3 PREFETCH2 3*SIZE(C1, ldc, 1); XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; LD_SY 0*SIZE(ptrba), yvec0; # Ar0, Ai0, Ar1, Ai1.. XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; VPERMILP_SY $0x4e, yvec2, yvec3; # Br2, Br3, Br0, Br1 #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; ALIGN_5; .L2_bodyB:; # Computing kernel ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba); LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 16*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 2 ################## PREFETCH0 (PRESIZE+16)*SIZE(ptrba); LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 16*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 32*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 3 ################## PREFETCH0 (PRESIZE+32)*SIZE(ptrba); LD_SY 40*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 16*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 24*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 48*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 4 ################## PREFETCH0 (PRESIZE+48)*SIZE(ptrba); LD_SY 56*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADDQ $64*SIZE, ptrba; ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ODUP_SY 24*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADDQ $32*SIZE, ptrbb; ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 MUL_SY yvec1, yvec5, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec1, yvec3, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; LD_SY 0*SIZE(ptrba), yvec0; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L3_loopE; ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; EDUP_SY 8*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; LD_SY 16*SIZE(ptrba), yvec0; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; ######### Unroll 2 ################## PREFETCH0 (PRESIZE+16)*SIZE(ptrba) LD_SY 24*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADDQ $32*SIZE, ptrba SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 8*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $16*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. 
ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD2_SY yvec6, yvec14, yvec14; ADD2_SY yvec7, yvec12, yvec12; EDUP_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; LD_SY 0*SIZE(ptrba), yvec0; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L3_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L4_loopE; ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) LD_SY 8*SIZE(ptrba), yvec1; # Ar4, Ai4, Ar5, Ai5.. MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; ADDQ $16*SIZE, ptrba; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD1_SY yvec6, yvec15, yvec15; ADD1_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; MUL_SY yvec1, yvec3, yvec7; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 ADD1_SY yvec6, yvec14, yvec14; ADD1_SY yvec7, yvec12, yvec12; ODUP_SY 0*SIZE(ptrbb), yvec2; # Bi0, Bi1, Bi2, Bi3 MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; ADDQ $8*SIZE, ptrbb; VPERMILP_SY $0x4e, yvec2, yvec3; # Bi2, Bi3, Bi0, Bi1 ADD1_SY yvec6, yvec11, yvec11; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; VPERMILP_SY $0xb1, yvec0, yvec0; # Ai0, Ar0, Ai1, Ar1.. ADD1_SY yvec6, yvec10, yvec10; ADD1_SY yvec7, yvec8, yvec8; VPERMILP_SY $0xb1, yvec1, yvec1; MUL_SY yvec0, yvec2, yvec6; MUL_SY yvec0, yvec3, yvec7; SHUF_SY $0x03, yvec2, yvec2, yvec4; # Br1, Br0, Br3, Br2 ADD2_SY yvec6, yvec15, yvec15; ADD2_SY yvec7, yvec13, yvec13; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec14, yvec14; SHUF_SY $0x03, yvec3, yvec3, yvec5; # Br3, Br2, Br1, Br0 MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec12, yvec12; MUL_SY yvec0, yvec4, yvec6; MUL_SY yvec0, yvec5, yvec7; VPERMILP_SY $0x4e, yvec2, yvec3; ADD2_SY yvec6, yvec11, yvec11; ADD2_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec4, yvec6; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec6, yvec10, yvec10; ADD2_SY yvec7, yvec8, yvec8; .L4_loopE:; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec12, yvec7, yvec12; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec10, yvec7, yvec10; ADDSUB_SY yvec9, yvec7, yvec9; ADDSUB_SY yvec8, yvec7, yvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec14, yvec7, yvec14; SUB_SY yvec13, yvec7, yvec13; SUB_SY yvec12, yvec7, yvec12; SUB_SY yvec11, yvec7, yvec11; SUB_SY yvec10, yvec7, yvec10; SUB_SY yvec9, yvec7, yvec9; SUB_SY yvec8, yvec7, yvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec12, yvec12; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec10, yvec10; VPERMILP_SY $0xb1, yvec9, yvec9; VPERMILP_SY $0xb1, yvec8, yvec8; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY 
yvec12, yvec7, yvec12; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec10, yvec7, yvec10; ADDSUB_SY yvec9, yvec7, yvec9; ADDSUB_SY yvec8, yvec7, yvec8; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec12, yvec12; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec10, yvec10; VPERMILP_SY $0xb1, yvec9, yvec9; VPERMILP_SY $0xb1, yvec8, yvec8; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY $0xb1,yvec14, yvec4; MUL_SY yvec14, yvec7, yvec14; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec14, yvec14; VPERMILP_SY $0xb1,yvec13, yvec3; MUL_SY yvec13, yvec7, yvec13; MUL_SY yvec3, yvec6, yvec3; ADDSUB_SY yvec3, yvec13, yvec13; VPERMILP_SY $0xb1,yvec12, yvec2; MUL_SY yvec12, yvec7, yvec12; MUL_SY yvec2, yvec6, yvec2; ADDSUB_SY yvec2, yvec12, yvec12; VPERMILP_SY $0xb1,yvec11, yvec1; MUL_SY yvec11, yvec7, yvec11; MUL_SY yvec1, yvec6, yvec1; ADDSUB_SY yvec1, yvec11, yvec11; VPERMILP_SY $0xb1,yvec10, yvec0; MUL_SY yvec10, yvec7, yvec10; MUL_SY yvec0, yvec6, yvec0; ADDSUB_SY yvec0, yvec10, yvec10; VPERMILP_SY $0xb1,yvec9, yvec5; MUL_SY yvec9, yvec7, yvec9; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec9, yvec9; VPERMILP_SY $0xb1,yvec8, yvec4; MUL_SY yvec8, yvec7, yvec8; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec8, yvec8; #### Shuffle Results #### MOV_SY yvec15,yvec7; REVS_SY $0xe4,yvec13,yvec15,yvec15; REVS_SY $0xe4,yvec7,yvec13,yvec13; MOV_SY yvec14,yvec7; REVS_SY $0xe4,yvec12,yvec14,yvec14; REVS_SY $0xe4,yvec7,yvec12,yvec12; MOV_SY yvec11,yvec7; REVS_SY $0xe4,yvec9,yvec11,yvec11; REVS_SY $0xe4,yvec7,yvec9,yvec9; MOV_SY yvec10,yvec7; REVS_SY $0xe4,yvec8,yvec10,yvec10; REVS_SY $0xe4,yvec7,yvec8,yvec8; #### Store Back #### #### Testing alignment #### MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; EXTRA_SY $1,yvec12,xvec4; EXTRA_SY $1,yvec11,xvec3; EXTRA_SY $1,yvec10,xvec2; EXTRA_SY $1,yvec9,xvec1; EXTRA_SY $1,yvec8,xvec0; #ifndef TRMMKERNEL ADD_SY 0*SIZE(C0),xvec15, xvec15; ADD_SY 4*SIZE(C1),xvec7, xvec7; ADD_SY 8*SIZE(C0),xvec14, xvec14; ADD_SY 12*SIZE(C1),xvec6, xvec6; ADD_SY 0*SIZE(C0,ldc,1),xvec13, xvec13; ADD_SY 4*SIZE(C1,ldc,1),xvec5, xvec5; ADD_SY 8*SIZE(C0,ldc,1),xvec12, xvec12; ADD_SY 12*SIZE(C1,ldc,1),xvec4, xvec4; ADD_SY 0*SIZE(C1),xvec11, xvec11; ADD_SY 4*SIZE(C0),xvec3, xvec3; ADD_SY 8*SIZE(C1),xvec10, xvec10; ADD_SY 12*SIZE(C0),xvec2, xvec2; ADD_SY 0*SIZE(C1,ldc,1),xvec9, xvec9; ADD_SY 4*SIZE(C0,ldc,1),xvec1, xvec1; ADD_SY 8*SIZE(C1,ldc,1),xvec8, xvec8; ADD_SY 12*SIZE(C0,ldc,1),xvec0, xvec0; #endif ST_SY xvec15,0*SIZE(C0); ST_SY xvec7,4*SIZE(C1); ST_SY xvec14,8*SIZE(C0); ST_SY xvec6,12*SIZE(C1); ST_SY xvec13,0*SIZE(C0,ldc,1); ST_SY xvec5,4*SIZE(C1,ldc,1); ST_SY xvec12,8*SIZE(C0,ldc,1); ST_SY xvec4,12*SIZE(C1,ldc,1); ST_SY xvec11,0*SIZE(C1); ST_SY xvec3,4*SIZE(C0); ST_SY xvec10,8*SIZE(C1); ST_SY xvec2,12*SIZE(C0); ST_SY xvec9,0*SIZE(C1,ldc,1); ST_SY xvec1,4*SIZE(C0,ldc,1); ST_SY xvec8,8*SIZE(C1,ldc,1); ST_SY xvec0,12*SIZE(C0,ldc,1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && 
defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE,C0; ADDQ $16*SIZE,C1; .L1_bodyE:; DECQ i; JG .L1_bodyB; JMP .L1_loopE; ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0), xvec6, xvec6; LDH_SY 2*SIZE(C0), xvec6, xvec6; ADD_SY xvec6, xvec15, xvec15; #endif STL_SY xvec15, 0*SIZE(C0); STH_SY xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1), xvec5, xvec5; LDH_SY 6*SIZE(C1), xvec5, xvec5; ADD_SY xvec5, xvec7, xvec7; #endif STL_SY xvec7, 4*SIZE(C1); STH_SY xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C0), xvec5, xvec5; LDH_SY 10*SIZE(C0), xvec5, xvec5; ADD_SY xvec5, xvec14, xvec14; #endif STL_SY xvec14, 8*SIZE(C0); STH_SY xvec14, 10*SIZE(C0); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C1), xvec4, xvec4; LDH_SY 14*SIZE(C1), xvec4, xvec4; ADD_SY xvec4, xvec6, xvec6; #endif STL_SY xvec6, 12*SIZE(C1); STH_SY xvec6, 14*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C0, ldc, 1), xvec4, xvec4; LDH_SY 2*SIZE(C0, ldc, 1), xvec4, xvec4; ADD_SY xvec4, xvec13, xvec13; #endif STL_SY xvec13, 0*SIZE(C0, ldc, 1); STH_SY xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_SY 6*SIZE(C1, ldc, 1), xvec3, xvec3; ADD_SY xvec3, xvec5, xvec5; #endif STL_SY xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec12, xvec4; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C0, ldc, 1), xvec3, xvec3; LDH_SY 10*SIZE(C0, ldc, 1), xvec3, xvec3; ADD_SY xvec3, xvec12, xvec12; #endif STL_SY xvec12, 8*SIZE(C0, ldc, 1); STH_SY xvec12, 10*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_SY 14*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_SY xvec2, xvec4, xvec4; #endif STL_SY xvec4, 12*SIZE(C1, ldc, 1); STH_SY xvec4, 14*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1), xvec2, xvec2; LDH_SY 2*SIZE(C1), xvec2, xvec2; ADD_SY xvec2, xvec11, xvec11; #endif STL_SY xvec11, 0*SIZE(C1); STH_SY xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0), xvec1, xvec1; LDH_SY 6*SIZE(C0), xvec1, xvec1; ADD_SY xvec1, xvec3, xvec3; #endif STL_SY xvec3, 4*SIZE(C0); STH_SY xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec10, xvec2; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C1), xvec1, xvec1; LDH_SY 10*SIZE(C1), xvec1, xvec1; ADD_SY xvec1, xvec10, xvec10; #endif STL_SY xvec10, 8*SIZE(C1); STH_SY xvec10, 10*SIZE(C1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C0), xvec0, xvec0; LDH_SY 14*SIZE(C0), xvec0, xvec0; ADD_SY xvec0, xvec2, xvec2; #endif STL_SY xvec2, 12*SIZE(C0); STH_SY xvec2, 14*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL LDL_SY 0*SIZE(C1, ldc, 1), xvec7, xvec7; LDH_SY 2*SIZE(C1, ldc, 1), xvec7, xvec7; ADD_SY xvec7, xvec9, xvec9; #endif STL_SY xvec9, 0*SIZE(C1, ldc, 1); STH_SY xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SY 4*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_SY 6*SIZE(C0, ldc, 1), xvec6, xvec6; ADD_SY xvec6, xvec1, xvec1; #endif STL_SY xvec1, 4*SIZE(C0, ldc, 1); STH_SY xvec1, 6*SIZE(C0, ldc, 1); EXTRA_SY $1, yvec8, xvec0; #ifndef TRMMKERNEL LDL_SY 8*SIZE(C1, ldc, 1), xvec6, xvec6; LDH_SY 10*SIZE(C1, ldc, 1), xvec6, xvec6; ADD_SY xvec6, xvec8, xvec8; #endif STL_SY xvec8, 8*SIZE(C1, ldc, 1); STH_SY xvec8, 10*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SY 12*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_SY 14*SIZE(C0, ldc, 1), xvec5, xvec5; ADD_SY xvec5, xvec0, xvec0; #endif STL_SY xvec0, 12*SIZE(C0, ldc, 1); STH_SY xvec0, 14*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; 
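# TRMM pointer fix-up (comment only): %rax was just loaded with bk; the
# next instructions form bk - kkk, the k iterations this triangular block
# did not consume, scale it by the complex element size (ZBASE_SHIFT),
# and advance the packed-panel pointers, roughly
#
#   ptrba += (bk - kkk) * 8;   /* 8 = M unroll, in complex elements */
#   ptrbb += (bk - kkk) * 4;   /* 4 = N unroll, in complex elements */
#
# so the packed A/B pointers end up past the data this block skipped.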
SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec9, yvec9, yvec9; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 2 #### LD_SY 8*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 8*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 8*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 3 #### LD_SY 16*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 16*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 16*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 3 #### LD_SY 24*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; 
EDUP_SY 24*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 24*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L9_loopE; ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; #### Unroll time 2 #### LD_SY 8*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 8*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 8*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L9_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L10_loopE; ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; VPERMILP_SY $0xb1, yvec0, yvec1; EDUP_SY 0*SIZE(ptrbb), yvec2; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec0, yvec3, yvec7; ADD1_SY yvec7, yvec13, yvec13; ODUP_SY 0*SIZE(ptrbb), yvec2; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec0, yvec4, yvec6; ADD1_SY yvec6, yvec11, yvec11; VPERMILP_SY $0x4e, yvec2, yvec3; MUL_SY yvec0, yvec5, yvec7; ADD1_SY yvec7, yvec9, yvec9; MUL_SY yvec1, yvec2, yvec6; ADD2_SY yvec6, yvec15, yvec15; SHUF_SY $0x03, yvec2, yvec2, yvec4; MUL_SY yvec1, yvec3, yvec7; ADD2_SY yvec7, yvec13, yvec13; SHUF_SY $0x03, yvec3, yvec3, yvec5; MUL_SY yvec1, yvec4, yvec6; ADD2_SY yvec6, yvec11, yvec11; MUL_SY yvec1, yvec5, yvec7; ADD2_SY yvec7, yvec9, yvec9; ADDQ $8*SIZE, 
ptrba; ADDQ $8*SIZE, ptrbb; .L10_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec9, yvec7, yvec9; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec13, yvec7, yvec13; SUB_SY yvec11, yvec7, yvec11; SUB_SY yvec9, yvec7, yvec9; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec9, yvec9; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec13, yvec7, yvec13; ADDSUB_SY yvec11, yvec7, yvec11; ADDSUB_SY yvec9, yvec7, yvec9; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec13, yvec13; VPERMILP_SY $0xb1, yvec11, yvec11; VPERMILP_SY $0xb1, yvec9, yvec9; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY $0xb1,yvec13, yvec3; MUL_SY yvec13, yvec7, yvec13; MUL_SY yvec3, yvec6, yvec3; ADDSUB_SY yvec3, yvec13, yvec13; VPERMILP_SY $0xb1,yvec11, yvec1; MUL_SY yvec11, yvec7, yvec11; MUL_SY yvec1, yvec6, yvec1; ADDSUB_SY yvec1, yvec11, yvec11; VPERMILP_SY $0xb1,yvec9, yvec5; MUL_SY yvec9, yvec7, yvec9; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec9, yvec9; #### Writing back #### #### Shuffle Results #### MOV_SY yvec15,yvec7; REVS_SY $0xe4,yvec13,yvec15,yvec15; REVS_SY $0xe4,yvec7,yvec13,yvec13; MOV_SY yvec11,yvec7; REVS_SY $0xe4,yvec9,yvec11,yvec11; REVS_SY $0xe4,yvec7,yvec9,yvec9; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec6, xvec6; LDH_SX 2*SIZE(C0), xvec6, xvec6; ADD_SX xvec6, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C1), xvec4, xvec4; LDH_SX 6*SIZE(C1), xvec4, xvec4; ADD_SX xvec4, xvec7, xvec7; #endif STL_SX xvec7, 4*SIZE(C1); STH_SX xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; ADD_SX xvec4, xvec13, xvec13; #endif STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_SX xvec2, xvec5, xvec5; #endif STL_SX xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C1), xvec2, xvec2; ADD_SX xvec2, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C0), xvec0, xvec0; LDH_SX 6*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec3, xvec3; #endif STL_SX xvec3, 4*SIZE(C0); STH_SX xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; ADD_SX xvec0, xvec9, xvec9; #endif STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; ADD_SX xvec6, xvec1, xvec1; #endif STL_SX xvec1, 4*SIZE(C0, ldc, 1); STH_SX xvec1, 6*SIZE(C0, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, 
%rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #### Initial Results Register #### XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 24*SIZE(ptrbb), 
xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L12_loopE; ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L12_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L13_loopE; ALIGN_5 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec13, xvec13; MUL_SX xvec0, xvec5, xvec5; ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; MUL_SX xvec1, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec13, xvec13; MUL_SX xvec1, xvec5, xvec5; ADD2_SX 
xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec6, xvec3, xvec3; ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec6, xvec2, xvec2; ADDSUB_SX xvec2, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; LDH_SX 2*SIZE(C0), xvec1, xvec1; LDL_SX 0*SIZE(C1), xvec2, xvec2; LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; LDH_SX 2*SIZE(C1), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0, ldc, 1); STL_SX xvec14, 0*SIZE(C0, ldc, 1); STH_SX xvec14, 2*SIZE(C0); STL_SX xvec13, 0*SIZE(C1); STH_SX xvec13, 2*SIZE(C1, ldc, 1); STL_SX xvec12, 0*SIZE(C1, ldc, 1); STH_SX xvec12, 2*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; 
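# 1x4 tail (comment only): one row of A remains, so each k step broadcasts
# a.re and a.im (BROAD_SX) against four packed complex B values, with
# SHUF_SX $0xb1 swapping B's re/im lanes; xvec15/xvec14 accumulate the
# four results.  As an illustrative scalar sketch of the plain case:
#
#   for (n = 0; n < 4; n++) {
#       c[n].re += a.re*b[n].re - a.im*b[n].im;
#       c[n].im += a.re*b[n].im + a.im*b[n].re;
#   }
#
# ADD1_SX/ADD2_SX again fold the conjugation signs chosen by the
# NN/NR/RN/RR macro block at the top of the file.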
#ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $4, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 5*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 7*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L15_loopE; ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; .L15_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L16_loopE; ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; MUL_SX xvec0, xvec4, xvec4; ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; MUL_SX xvec1, xvec5, xvec5; ADD2_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, 
ptrbb; .L16_loopE: #### Handle #### #if defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; LDL_SX 0*SIZE(C1), xvec1, xvec1; LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 1); STL_SX xvec14, 0*SIZE(C1); STH_SX xvec14, 0*SIZE(C1, ldc, 1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 4), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L7_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $4, kk; #endif MOVQ bk,k; SALQ $5,k; ADDQ k,bb; LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; LEAQ (C, ldc, 1), C1; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec13, yvec13, yvec13; XOR_SY yvec12, yvec12, yvec12; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; XOR_SY yvec9, yvec9, yvec9; XOR_SY yvec8, yvec8, yvec8; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, 
xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 32*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 36*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 40*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, 
xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 44*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 48*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 52*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 56*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 60*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L212_loopE; ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; 
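#### EDUP_SX/ODUP_SX appear to duplicate the even (real) and odd (imaginary) lanes of the
#### two packed B columns into xvec4/xvec5; the $0x4e shuffles below build the column-swapped
#### copies in xvec6/xvec7, so each 4-float vector of A is multiplied against both columns
#### before the ADD1_SX/ADD2_SX accumulation.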
SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L212_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L213_loopE; ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec13, xvec13; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec12, xvec12; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec8, xvec8; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb .L213_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, 
yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; #endif #### Multiply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writing back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; MUL_SX xvec7, xvec13, xvec13; MUL_SX xvec6, xvec3, xvec3; ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; MUL_SX xvec7, xvec12, xvec12; MUL_SX xvec6, xvec2, xvec2; ADDSUB_SX xvec2, xvec12, xvec12; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec6, xvec0, xvec0; ADDSUB_SX xvec0, xvec10, xvec10; VPERMILP_SX $0xb1,xvec9, xvec5; MUL_SX xvec7, xvec9, xvec9; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec9, xvec9; VPERMILP_SX $0xb1,xvec8, xvec4; MUL_SX xvec7, xvec8, xvec8; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec8, xvec8; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; LDL_SX 8*SIZE(C0), xvec2, xvec2; LDH_SX 10*SIZE(C1), xvec2, xvec2; LDL_SX 12*SIZE(C0), xvec3, xvec3; LDH_SX 14*SIZE(C1), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14,
xvec14; ADD_SX xvec2, xvec13, xvec13; ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); STL_SX xvec13, 8*SIZE(C0); STH_SX xvec13, 10*SIZE(C1); STL_SX xvec12, 12*SIZE(C0); STH_SX xvec12, 14*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; LDL_SX 4*SIZE(C1), xvec5, xvec5; LDH_SX 6*SIZE(C0), xvec5, xvec5; LDL_SX 8*SIZE(C1), xvec6, xvec6; LDH_SX 10*SIZE(C0), xvec6, xvec6; LDL_SX 12*SIZE(C1), xvec7, xvec7; LDH_SX 14*SIZE(C0), xvec7, xvec7; ADD_SX xvec4, xvec11, xvec11; ADD_SX xvec5, xvec10, xvec10; ADD_SX xvec6, xvec9, xvec9; ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); STL_SX xvec10, 4*SIZE(C1); STH_SX xvec10, 6*SIZE(C0); STL_SX xvec9, 8*SIZE(C1); STH_SX xvec9, 10*SIZE(C0); STL_SX xvec8, 12*SIZE(C1); STH_SX xvec8, 14*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; XOR_SY yvec11, yvec11, yvec11; XOR_SY yvec10, yvec10, yvec10; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, 
xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L222_loopE; ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; 
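#### End of the k%2 tail for the 4x2 block; the k%1 remainder, the sign handling for the
#### conjugate variants, the alpha scaling and the split write-back to C0/C1 follow,
#### mirroring the 8x2 block above.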
.L222_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L223_loopE; ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec14, xvec14; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec10, xvec10; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; #endif #### Multiply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writing back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; MUL_SX xvec7, xvec14, xvec14; MUL_SX xvec6, xvec4, xvec4; ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; MUL_SX xvec7, xvec10, xvec10; MUL_SX xvec6, xvec0, xvec0; ADDSUB_SX xvec0, xvec10, xvec10; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C1), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; LDL_SX 4*SIZE(C1), xvec5, xvec5; LDH_SX 6*SIZE(C0), xvec5, xvec5; ADD_SX xvec4, xvec11, xvec11; ADD_SX xvec5, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C1);
STH_SX xvec11, 2*SIZE(C0); STL_SX xvec10, 4*SIZE(C1); STH_SX xvec10, 6*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec11, yvec11, yvec11; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L232_loopE; ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; 
ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L233_loopE; ALIGN_5 .L233_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x4e, xvec4, xvec6; SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; MUL_SX xvec4, xvec0, xvec0; ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; MUL_SX xvec6, xvec1, xvec1; ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; MUL_SX xvec5, xvec2, xvec2; ADD2_SX xvec2, xvec15, xvec15; MUL_SX xvec7, xvec3, xvec3; ADD2_SX xvec3, xvec11, xvec11; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L233_loopE: #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; #endif #### Multiply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writing back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec11, xvec1; MUL_SX xvec7, xvec11, xvec11; MUL_SX xvec6, xvec1, xvec1; ADDSUB_SX xvec1, xvec11, xvec11; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C1), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); #ifndef TRMMKERNEL LDL_SX 0*SIZE(C1), xvec4, xvec4; LDH_SX 2*SIZE(C0), xvec4, xvec4; ADD_SX xvec4, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif
(defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $2, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 5*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 7*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L242_loopE; ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L242_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L243_loopE; ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(NR) || defined(NC) || defined(TR) || defined(TC) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif ##### Load Alpha #### BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 0*SIZE(C1), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C1); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; LEAQ (ptrbb, %rax, 2), ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; .L24_loopE: #if 
defined(TRMMKERNEL) && !defined(LEFT) ADDQ $2, kk; #endif MOVQ bk, k; SALQ $4, k; ADDQ k, bb; LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; MOVQ %rax, kk; #endif MOVQ C, C0; MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; XOR_SY yvec14, yvec14, yvec14; MOVQ bk, k; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $8, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 2 #### LD_SY 16*SIZE(ptrba), yvec0; LD_SY 24*SIZE(ptrba), yvec1; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 3 #### LD_SY 32*SIZE(ptrba), yvec0; LD_SY 40*SIZE(ptrba), yvec1; BROAD_SY 4*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 5*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 4 #### LD_SY 48*SIZE(ptrba), yvec0; LD_SY 56*SIZE(ptrba), yvec1; BROAD_SY 6*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 7*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L312_loopE; ALIGN_5 .L312_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; #### Unroll 2 #### LD_SY 16*SIZE(ptrba), yvec0; LD_SY 24*SIZE(ptrba), yvec1; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, 
yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L312_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L313_loopE; ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; LD_SY 8*SIZE(ptrba), yvec1; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; MUL_SY yvec1, yvec2, yvec7; ADD1_SY yvec7, yvec14, yvec14; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; VPERMILP_SY $0xb1, yvec1, yvec5; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; MUL_SY yvec5, yvec3, yvec7; ADD2_SY yvec7, yvec14, yvec14; ADDQ $16*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; SUB_SY yvec14, yvec7, yvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; ADDSUB_SY yvec15, yvec7, yvec15; ADDSUB_SY yvec14, yvec7, yvec14; VPERMILP_SY $0xb1, yvec15, yvec15; VPERMILP_SY $0xb1, yvec14, yvec14; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; VPERMILP_SY $0xb1,yvec14, yvec4; MUL_SY yvec14, yvec7, yvec14; MUL_SY yvec4, yvec6, yvec4; ADDSUB_SY yvec4, yvec14, yvec14; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C0), xvec1, xvec1; LDL_SX 8*SIZE(C0), xvec2, xvec2; LDH_SX 10*SIZE(C0), xvec2, xvec2; LDL_SX 12*SIZE(C0), xvec3, xvec3; LDH_SX 14*SIZE(C0), xvec3, xvec3; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec7, xvec7; ADD_SX xvec2, xvec14, xvec14; ADD_SX xvec3, xvec6, xvec6; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); STL_SX xvec7, 4*SIZE(C0); STH_SX xvec7, 6*SIZE(C0); STL_SX xvec14, 8*SIZE(C0); STH_SX xvec14, 10*SIZE(C0); STL_SX xvec6, 12*SIZE(C0); STH_SX xvec6, 14*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 8), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $8, kk; #endif ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; ALIGN_5 .L31_loopE: TEST $4, bm; JLE .L32_loopE; ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $4, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; ALIGN_5 
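#### 4x1 block: each unrolled step of .L321 loads one ymm register (four complex elements)
#### of A and what appear to be broadcast real/imaginary parts of the single B column,
#### accumulating into yvec15; the main loop is unrolled four times.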
.L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 2 #### LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 3 #### LD_SY 16*SIZE(ptrba), yvec0; BROAD_SY 4*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 5*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 4 #### LD_SY 24*SIZE(ptrba), yvec0; BROAD_SY 6*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 7*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L322_loopE; ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; #### Unroll 2 #### LD_SY 8*SIZE(ptrba), yvec0; BROAD_SY 2*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 3*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L322_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L323_loopE; ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec2; MUL_SY yvec0, yvec2, yvec6; ADD1_SY yvec6, yvec15, yvec15; BROAD_SY 1*SIZE(ptrbb), yvec3; VPERMILP_SY $0xb1, yvec0, yvec4; MUL_SY yvec4, yvec3, yvec6; ADD2_SY yvec6, yvec15, yvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SY yvec15, yvec7, yvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SY yvec15, yvec7, yvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) VPERMILP_SY $0xb1, yvec15, yvec15; ADDSUB_SY yvec15, yvec7, yvec15; VPERMILP_SY $0xb1, yvec15, yvec15; #endif ##### Load Alpha #### BROAD_SY MEMALPHA_R,yvec7; BROAD_SY MEMALPHA_I,yvec6; ##### Multiply Alpha #### VPERMILP_SY $0xb1,yvec15, yvec5; MUL_SY yvec15, yvec7, yvec15; MUL_SY yvec5, yvec6, yvec5; ADDSUB_SY yvec5, yvec15, yvec15; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; LDL_SX 4*SIZE(C0), xvec1, xvec1; LDH_SX 6*SIZE(C0), xvec1, xvec1; ADD_SX xvec0, xvec15, xvec15; ADD_SX xvec1, xvec7, xvec7; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); STL_SX xvec7, 4*SIZE(C0); STH_SX xvec7, 6*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 4), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $4, kk; #endif ADDQ $8*SIZE, C0; 
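#### End of the 4x1 block. The 2x1 and 1x1 remainders below use the same broadcast/shuffle
#### scheme on xmm registers, followed by the final B/C column-pointer updates and the
#### register-restoring epilogue.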
.L32_loopE: TEST $2, bm; JLE .L33_loopE; ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $2, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 3 #### LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 4*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 5*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 4 #### LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 6*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 7*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L332_loopE; ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L332_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L333_loopE; ALIGN_5 .L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; MUL_SX xvec0, xvec2, xvec2; ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; MUL_SX xvec1, xvec3, xvec3; ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif #### Multiply Alpha #### BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writing back #### VPERMILP_SX $0xb1,xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; #ifndef TRMMKERNEL
LDL_SX 0*SIZE(C0), xvec0, xvec0; LDH_SX 2*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; LEAQ (ptrba, %rax, 2), ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $2, kk; #endif ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; #else MOVQ bb, ptrbb; MOVQ kk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif XOR_SY yvec15, yvec15, yvec15; #ifndef TRMMKERNEL MOVQ bk,k; #elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) MOVQ bk, %rax; SUBQ kk, %rax; MOVQ %rax, kkk; #else MOVQ kk, %rax; #ifdef LEFT ADDQ $1, %rax; #else ADDQ $1, %rax; #endif MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; #else TEST $2, kkk; #endif JLE .L342_loopE; ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L342_loopE: #ifndef TRMMKERNEL TEST $1, bk; #else TEST $1, kkk; #endif JLE .L343_loopE; ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; LDL_SX 0*SIZE(ptrba), xvec0, xvec0; LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; SHUF_SX $0xe0, xvec2, xvec3; MUL_SX xvec0, xvec3, xvec3; ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xe1, xvec0, xvec1; SHUF_SX $0xe5, xvec2, xvec4; MUL_SX xvec1, xvec4, xvec4; ADD2_SX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L343_loopE: #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; VPERMILP_SX $0xb1, xvec15, xvec5; MUL_SX xvec7, xvec15, xvec15; MUL_SX xvec6, xvec5, xvec5; ADDSUB_SX xvec5, xvec15, xvec15; SHUF_SX $0x44, xvec15, xvec14; SHUF_SX $0xee, xvec15, xvec13; ADD_SX xvec13, xvec14, xvec14; #ifndef TRMMKERNEL LDL_SX 0*SIZE(C0), xvec0, xvec0; ADD_SX xvec0, xvec14, xvec14; #endif STL_SX xvec14, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; SALQ $ZBASE_SHIFT, %rax; ADDQ %rax, ptrba; ADDQ %rax, ptrbb; #endif #if defined(TRMMKERNEL) && defined(LEFT) ADDQ $1, kk; #endif ADDQ $2*SIZE, C0; .L34_loopE: #if defined(TRMMKERNEL) && !defined(LEFT) ADDQ $1, kk; #endif MOVQ bk, k; SALQ $3, k; ADDQ k, bb; ADDQ ldc, C; .L30_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; vzeroupper #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp; ret EPILOGUE