/**************************************************************************** Copyright (c) 2013, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the OpenBLAS project nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ // Register blocking = 6x4; the k loop is unrolled by 4. // Use FMA3 on Piledriver. // TODO: 1) deal with the edge cases. 2) Add Windows ABI support. 
/*
 * Register aliases, stack slots and instruction wrapper macros.
 *
 * Register and stack aliases
 *   oldbk_i / oldbk_j / oldbk_l  = %rdi / %rsi / %rdx
 *   _bk_i   / _bk_j   / _bk_l    = %r13 / %r14 / %r15
 *   ALPHA = %xmm0, _ptr_A = %rcx, _ptr_B = %r8, _ptr_C = %r9, LDC = %r10,
 *   i = %r11, k = %rax, _pre_B = %r12
 *   _ptr__A_0 = %rdi and _ptr__B_0 = %rsi: these are the same registers as
 *   oldbk_i and oldbk_j, so writing one alias overwrites the other.
 *   _ptr__C_0 = %rbx, _ptr__C_1 = %rbp
 *   old_ldc, alpha and j are memory operands addressed from %rsp
 *   (8+STACKSIZE, 48 and 56 respectively); STACKSIZE is 128.
 *   NOTE(review): i, j, k and alpha are plain preprocessor names, so any
 *   later token spelled exactly that way is substituted as well.
 *
 * Integer wrappers (suffix 2560 and 1280)
 *   Each one expands to the plain 64-bit instruction with its arguments
 *   passed through in the order given (MOVQ -> movq, LEAQ -> leaq, SARQ ->
 *   sarq, ADDQ -> addq, SUBQ -> subq, DIVQ -> divq, MULQ -> mulq, DECQ ->
 *   decq, NEGQ -> negq, TESTQ -> testq, SALQ -> salq). The two families
 *   expand identically; CMPQ (-> cmpq) is only defined with the 1280 suffix.
 *   JG and JLE are straight renames of jg and jle.
 *
 * Vector wrappers (suffix 2560/2561 and 1280/1282/1281)
 *   VLD / VST          -> vmovapd (alignment-checked move)
 *   VLDU / VSTU 1282   -> vmovupd
 *   VMUL / VADD        -> vmulpd / vaddpd, arguments in the order given
 *   MVMUL / MVADD      -> same instruction with the first two arguments swapped
 *   VSHUF -> vpermilpd, VSHUF2F -> vperm2f128, REVS -> vshufpd,
 *   EXTR -> vextractf128, MOVRR -> vmovapd, VXOR -> vxorpd
 *   LDL / LDH          -> vmovlpd / vmovhpd addr,reg,reg (merge into reg)
 *   STL / STH          -> vmovlpd / vmovhpd reg,addr
 *   BROAD2560          -> vbroadcastsd; BROAD1280 / BROAD1282 -> vmovddup
 *   VMA2560 / MVMA2560 -> vfmaddpd d,a,b,c (four-operand form; both macros
 *                         expand to the same text)
 *   PREFETCH0 / PREFETCH2 / PREFETCHW / PREFETCHN
 *                      -> prefetcht0 / prefetcht2 / prefetchw / prefetchnta;
 *                         the second macro argument is ignored.
 *   NOTE(review): PREFETCH1 also expands to prefetcht0, not prefetcht1 -
 *   confirm whether that is intentional tuning.
 *
 *   The 1281 variants of VLD, VST, VLDU, VSTU and BROAD expand to the
 *   non-VEX movsd / movddup, while VMUL1281, VADD1281 and VXOR1281 still
 *   expand to the packed vmulpd / vaddpd / vxorpd.
 *
 *   NOTE(review): the numeric suffix presumably encodes the vector width
 *   (256 or 128 bits) plus a variant digit - TODO confirm against the tool
 *   that generated these macros.
 */
#define ASSEMBLER #include "common.h" #define STACKSIZE 128 #define oldbk_i %rdi #define oldbk_j %rsi #define oldbk_l %rdx #define _bk_i %r13 #define _bk_j %r14 #define _bk_l %r15 #define ALPHA %xmm0 #define _ptr_A %rcx #define _ptr_B %r8 #define _ptr_C %r9 #define LDC %r10 #define i %r11 #define k %rax #define _pre_B %r12 #define _ptr__A_0 %rdi #define _ptr__B_0 %rsi #define _ptr__C_0 %rbx #define _ptr__C_1 %rbp #define old_ldc 8+STACKSIZE(%rsp) #define alpha 48(%rsp) #define j 56(%rsp) #define MOVQ2560(s,d) movq s,d #define LEAQ2560(s,d) leaq s,d #define SARQ2560(imm,n) sarq imm,n #define ADDQ2560(off,addr) addq off,addr #define SUBQ2560(off,addr) subq off,addr #define DIVQ2560(off,addr) divq off,addr #define MULQ2560(s,d) mulq s,d #define DECQ2560(addr) decq addr #define NEGQ2560(s) negq s #define TESTQ2560(n,addr) testq n,addr #define SALQ2560(imm,n) salq imm,n #define MOVQ1280(s,d) movq s,d #define LEAQ1280(s,d) leaq s,d #define SARQ1280(imm,n) sarq imm,n #define ADDQ1280(off,addr) addq off,addr #define SUBQ1280(off,addr) subq off,addr #define DIVQ1280(off,addr) divq off,addr #define CMPQ1280(off,addr) cmpq off,addr #define MULQ1280(s,d) mulq s,d #define DECQ1280(addr) decq addr #define NEGQ1280(s) negq s #define TESTQ1280(n,addr) testq n,addr #define SALQ1280(imm,n) salq imm,n #define JG jg #define JLE jle #define VLD2560(addr,reg) vmovapd addr,reg #define VST2560(reg,addr) vmovapd reg,addr #define VMUL2560(a,b,c) vmulpd a,b,c #define MVMUL2560(a,b,c) vmulpd b,a,c #define VADD2560(a,b,c) vaddpd a,b,c #define MVADD2560(a,b,c) vaddpd b,a,c #define VSHUF2560(imm,s,d) vpermilpd imm,s,d #define VSHUF2F2560(imm,s1,s2,d) vperm2f128 imm,s1,s2,d #define BROAD2560(addr,reg) vbroadcastsd addr,reg #define MOVRR2560(a,b) vmovapd a,b #define REVS2560(imm,s1,s2,d) vshufpd imm,s1,s2,d #define EXTR2561(imm,a,b) vextractf128 imm,a,b #define LDL2561(addr,reg) vmovlpd addr,reg,reg #define LDH2561(addr,reg) vmovhpd addr,reg,reg #define STL2561(reg,addr) vmovlpd reg,addr #define 
STH2561(reg,addr) vmovhpd reg,addr #define VADD2561(a,b,c) vaddpd a,b,c #define VXOR2560(a,b,c) vxorpd a,b,c #define PREFETCH02560(addr,b) prefetcht0 addr #define PREFETCH12560(addr,b) prefetcht0 addr #define PREFETCH22560(addr,b) prefetcht2 addr #define PREFETCHW2560(addr,b) prefetchw addr #define PREFETCHN2560(addr,b) prefetchnta addr #define VMA2560(a,b,c,d) vfmaddpd d,a,b,c #define MVMA2560(a,b,c,d) vfmaddpd d,a,b,c #define VLD1280(addr,reg) vmovapd addr,reg #define VLD1282(addr,reg) vmovapd addr,reg #define VLD1281(addr,reg) movsd addr,reg #define VST1280(reg,addr) vmovapd reg,addr #define VST1282(reg,addr) vmovapd reg,addr #define VST1281(reg,addr) movsd reg,addr #define VLDU1282(addr,reg) vmovupd addr,reg #define VLDU1281(addr,reg) movsd addr,reg #define VSTU1282(reg,addr) vmovupd reg,addr #define VSTU1281(reg,addr) movsd reg,addr #define VMUL1280(a,b,c) vmulpd a,b,c #define VMUL1282(a,b,c) vmulpd a,b,c #define VMUL1281(a,b,c) vmulpd a,b,c #define MVMUL1280(a,b,c) vmulpd b,a,c #define VADD1280(a,b,c) vaddpd a,b,c #define MVADD1280(a,b,c) vaddpd b,a,c #define VSHUF1280(imm,s,d) vpermilpd imm,s,d #define VSHUF2F1280(imm,s1,s2,d) vperm2f128 imm,s1,s2,d #define BROAD1280(addr,reg) vmovddup addr,reg #define BROAD1282(addr,reg) vmovddup addr,reg #define BROAD1281(addr,reg) movddup addr,reg #define MOVRR1280(a,b) vmovapd a,b #define REVS1280(imm,s1,s2,d) vshufpd imm,s1,s2,d #define EXTR1281(imm,a,b) vextractf128 imm,a,b #define LDL1281(addr,reg) vmovlpd addr,reg,reg #define LDH1281(addr,reg) vmovhpd addr,reg,reg #define STL1281(reg,addr) vmovlpd reg,addr #define STH1281(reg,addr) vmovhpd reg,addr #define VADD1281(a,b,c) vaddpd a,b,c #define VXOR1280(a,b,c) vxorpd a,b,c #define VXOR1282(a,b,c) vxorpd a,b,c #define VXOR1281(a,b,c) vxorpd a,b,c #define PREFETCH01280(addr,b) prefetcht0 addr #define PREFETCH11280(addr,b) prefetcht0 addr #define PREFETCH21280(addr,b) prefetcht2 addr #define PREFETCHW1280(addr,b) prefetchw addr #define PREFETCHN1280(addr,b) prefetchnta 
addr #define VMA1280(a,b,c,d) vfmaddpd d,a,b,c #define VMA1282(a,b,c,d) vfmadd231pd a,b,c #define VMA1281(a,b,c,d) vfmadd231pd a,b,c #define VMA21282(a,b,c,d) vfmadd231pd a,b,c #define VMA21281(a,b,c,d) vfmadd231pd a,b,c //#define VMA1282(a,b,c,d) nop //#define VMA1281(a,b,c,d) nop //#define VMA21282(a,b,c,d) nop //#define VMA21281(a,b,c,d) nop #define MVMA1280(a,b,c,d) vfmaddpd d,a,b,c #define imm1 $0x05 #define imm3 $0x05 #define imm100 $0x05 #define imm200 $0x0a #define XMM0 %xmm0 #define XMM1 %xmm1 #define XMM2 %xmm2 #define XMM3 %xmm3 #define XMM4 %xmm4 #define XMM5 %xmm5 #define XMM6 %xmm6 #define XMM7 %xmm7 #define XMM8 %xmm8 #define XMM9 %xmm9 #define XMM10 %xmm10 #define XMM11 %xmm11 #define XMM12 %xmm12 #define XMM13 %xmm13 #define XMM14 %xmm14 #define XMM15 %xmm15 #define YMM0 %ymm0 #define YMM1 %ymm1 #define YMM2 %ymm2 #define YMM3 %ymm3 #define YMM4 %ymm4 #define YMM5 %ymm5 #define YMM6 %ymm6 #define YMM7 %ymm7 #define YMM8 %ymm8 #define YMM9 %ymm9 #define YMM10 %ymm10 #define YMM11 %ymm11 #define YMM12 %ymm12 #define YMM13 %ymm13 #define YMM14 %ymm14 #define YMM15 %ymm15 PROLOGUE subq $STACKSIZE, %rsp; movq %rbx, 0(%rsp); movq %rbp, 8(%rsp); movq %r12, 16(%rsp); movq %r13, 24(%rsp); movq %r14, 32(%rsp); movq %r15, 40(%rsp); vzeroupper movl old_ldc, %eax movq %rax, LDC movlps ALPHA, alpha movq oldbk_i, _bk_i movq oldbk_j, _bk_j movq oldbk_l, _bk_l leaq (, LDC, SIZE), LDC MOVQ1280(_bk_j,j); SARQ1280($2,j); JLE ._L_0_loopE; ALIGN_4; ._L_0_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); LEAQ1280((_ptr_C,LDC,2),_ptr__C_1); MOVQ1280(_bk_l,%rax); SALQ1280($5,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_1_loopE; ._L_1_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); VXOR1282(XMM6,XMM6,XMM6); VXOR1282(XMM7,XMM7,XMM7); VXOR1282(XMM8,XMM8,XMM8); VXOR1282(XMM9,XMM9,XMM9); 
VXOR1282(XMM10,XMM10,XMM10); VXOR1282(XMM11,XMM11,XMM11); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_2_loopE; ALIGN_4; ._L_2_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); 
VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_2_bodyE:; DECQ1280(k); JG ._L_2_bodyB; ALIGN_4; ._L_2_loopE:; TESTQ1280($2,_bk_l); JLE ._L_3_loopE; ALIGN_4; ._L_3_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); 
VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($12*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_3_loopE:; TESTQ1280($1,_bk_l); JLE ._L_4_loopE; ALIGN_4; ._L_4_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM6,XMM6); VMA1282(XMM13,XMM15,XMM7,XMM7); VMA1282(XMM14,XMM15,XMM8,XMM8); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM9,XMM9); VMA1282(XMM13,XMM15,XMM10,XMM10); VMA1282(XMM14,XMM15,XMM11,XMM11); ADDQ1280($6*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_4_loopE:; BROAD1282(alpha,XMM12); VLDU1282(0*SIZE(_ptr__C_0),XMM13); VMA21282(XMM12,XMM0,XMM13,XMM0); VSTU1282(XMM13,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM14); VMA21282(XMM12,XMM1,XMM14,XMM1); VSTU1282(XMM14,2*SIZE(_ptr__C_0)); VLDU1282(4*SIZE(_ptr__C_0),XMM15); VMA21282(XMM12,XMM2,XMM15,XMM2); VSTU1282(XMM15,4*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM13); VMA21282(XMM12,XMM3,XMM13,XMM3); VSTU1282(XMM13,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM14); VMA21282(XMM12,XMM4,XMM14,XMM4); 
VSTU1282(XMM14,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM15); VMA21282(XMM12,XMM5,XMM15,XMM5); VSTU1282(XMM15,4*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM13); VMA21282(XMM12,XMM6,XMM13,XMM6); VSTU1282(XMM13,0*SIZE(_ptr__C_1)); VLDU1282(2*SIZE(_ptr__C_1),XMM14); VMA21282(XMM12,XMM7,XMM14,XMM7); VSTU1282(XMM14,2*SIZE(_ptr__C_1)); VLDU1282(4*SIZE(_ptr__C_1),XMM15); VMA21282(XMM12,XMM8,XMM15,XMM8); VSTU1282(XMM15,4*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM13); VMA21282(XMM12,XMM9,XMM13,XMM9); VSTU1282(XMM13,0*SIZE(_ptr__C_1,LDC,1)); VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM14); VMA21282(XMM12,XMM10,XMM14,XMM10); VSTU1282(XMM14,2*SIZE(_ptr__C_1,LDC,1)); VLDU1282(4*SIZE(_ptr__C_1,LDC,1),XMM15); VMA21282(XMM12,XMM11,XMM15,XMM11); VSTU1282(XMM15,4*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($6*SIZE,_ptr__C_0); ADDQ1280($6*SIZE,_ptr__C_1); ._L_1_bodyE:; SUBQ1280($6,i); JG ._L_1_bodyB; ALIGN_4; ._L_1_loopE:; TESTQ1280($4,i); JLE ._L_5_loopE; ALIGN_4; ._L_5_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); VXOR1282(XMM6,XMM6,XMM6); VXOR1282(XMM7,XMM7,XMM7); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_6_loopE; ALIGN_4; ._L_6_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); 
PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_6_bodyE:; DECQ1280(k); JG ._L_6_bodyB; ALIGN_4; ._L_6_loopE:; TESTQ1280($2,_bk_l); JLE ._L_7_loopE; ALIGN_4; ._L_7_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); 
VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_7_loopE:; TESTQ1280($1,_bk_l); JLE ._L_8_loopE; ALIGN_4; ._L_8_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM6,XMM6); VMA1282(XMM14,XMM15,XMM7,XMM7); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_8_loopE:; BROAD1282(alpha,XMM8); VLDU1282(0*SIZE(_ptr__C_0),XMM9); VMA21282(XMM8,XMM0,XMM9,XMM0); VSTU1282(XMM9,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM10); VMA21282(XMM8,XMM1,XMM10,XMM1); VSTU1282(XMM10,2*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM11); VMA21282(XMM8,XMM2,XMM11,XMM2); VSTU1282(XMM11,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM12); VMA21282(XMM8,XMM3,XMM12,XMM3); VSTU1282(XMM12,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM13); VMA21282(XMM8,XMM4,XMM13,XMM4); 
VSTU1282(XMM13,0*SIZE(_ptr__C_1)); VLDU1282(2*SIZE(_ptr__C_1),XMM14); VMA21282(XMM8,XMM5,XMM14,XMM5); VSTU1282(XMM14,2*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM15); VMA21282(XMM8,XMM6,XMM15,XMM6); VSTU1282(XMM15,0*SIZE(_ptr__C_1,LDC,1)); VLDU1282(2*SIZE(_ptr__C_1,LDC,1),XMM9); VMA21282(XMM8,XMM7,XMM9,XMM7); VSTU1282(XMM9,2*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_5_loopE:; TESTQ1280($2,i); JLE ._L_9_loopE; ALIGN_4; ._L_9_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_10_loopE; ALIGN_4; ._L_10_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(8*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(9*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(10*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(11*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(12*SIZE(_ptr__B_0),XMM15); 
VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(13*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(14*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(15*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_10_bodyE:; DECQ1280(k); JG ._L_10_bodyB; ALIGN_4; ._L_10_loopE:; TESTQ1280($2,_bk_l); JLE ._L_11_loopE; ALIGN_4; ._L_11_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_11_loopE:; TESTQ1280($1,_bk_l); JLE ._L_12_loopE; ALIGN_4; ._L_12_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_12_loopE:; BROAD1282(alpha,XMM4); VLDU1282(0*SIZE(_ptr__C_0),XMM5); VMA21282(XMM4,XMM0,XMM5,XMM0); VSTU1282(XMM5,0*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM6); VMA21282(XMM4,XMM1,XMM6,XMM1); VSTU1282(XMM6,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(0*SIZE(_ptr__C_1),XMM7); 
VMA21282(XMM4,XMM2,XMM7,XMM2); VSTU1282(XMM7,0*SIZE(_ptr__C_1)); VLDU1282(0*SIZE(_ptr__C_1,LDC,1),XMM8); VMA21282(XMM4,XMM3,XMM8,XMM3); VSTU1282(XMM8,0*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_9_loopE:; TESTQ1280($1,i); JLE ._L_13_loopE; ALIGN_4; ._L_13_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); VXOR1281(XMM1,XMM1,XMM1); VXOR1281(XMM2,XMM2,XMM2); VXOR1281(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_14_loopE; ALIGN_4; ._L_14_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(8*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(9*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(10*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(11*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(12*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(13*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(14*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); 
BROAD1281(15*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($16*SIZE,_ptr__B_0); ._L_14_bodyE:; DECQ1280(k); JG ._L_14_bodyB; ALIGN_4; ._L_14_loopE:; TESTQ1280($2,_bk_l); JLE ._L_15_loopE; ALIGN_4; ._L_15_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_15_loopE:; TESTQ1280($1,_bk_l); JLE ._L_16_loopE; ALIGN_4; ._L_16_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM2,XMM2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM3,XMM3); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_16_loopE:; BROAD1281(alpha,XMM4); VLDU1281(0*SIZE(_ptr__C_0),XMM5); VMA21281(XMM4,XMM0,XMM5,XMM0); VSTU1281(XMM5,0*SIZE(_ptr__C_0)); VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM6); VMA21281(XMM4,XMM1,XMM6,XMM1); VSTU1281(XMM6,0*SIZE(_ptr__C_0,LDC,1)); VLDU1281(0*SIZE(_ptr__C_1),XMM7); VMA21281(XMM4,XMM2,XMM7,XMM2); VSTU1281(XMM7,0*SIZE(_ptr__C_1)); VLDU1281(0*SIZE(_ptr__C_1,LDC,1),XMM8); VMA21281(XMM4,XMM3,XMM8,XMM3); VSTU1281(XMM8,0*SIZE(_ptr__C_1,LDC,1)); ADDQ1280($1*SIZE,_ptr__C_0); 
ADDQ1280($1*SIZE,_ptr__C_1); ._L_13_loopE:; MOVQ1280(LDC,%rax); SALQ1280($2,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($5,%rax); ADDQ1280(%rax,_ptr_B); ._L_0_bodyE:; DECQ1280(j); JG ._L_0_bodyB; ALIGN_4; ._L_0_loopE:; TESTQ1280($2,_bk_j); JLE ._L_17_loopE; ALIGN_4; ._L_17_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); LEAQ1280((_ptr_C,LDC,1),_ptr__C_1); MOVQ1280(_bk_l,%rax); SALQ1280($4,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_18_loopE; ._L_18_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); VXOR1282(XMM4,XMM4,XMM4); VXOR1282(XMM5,XMM5,XMM5); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_19_loopE; ALIGN_4; ._L_19_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); 
VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_19_bodyE:; DECQ1280(k); JG ._L_19_bodyB; ALIGN_4; ._L_19_loopE:; TESTQ1280($2,_bk_l); JLE ._L_20_loopE; ALIGN_4; ._L_20_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($12*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_20_loopE:; TESTQ1280($1,_bk_l); JLE ._L_21_loopE; ALIGN_4; ._L_21_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); 
BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM12,XMM15,XMM3,XMM3); VMA1282(XMM13,XMM15,XMM4,XMM4); VMA1282(XMM14,XMM15,XMM5,XMM5); ADDQ1280($6*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_21_loopE:; BROAD1282(alpha,XMM6); VLDU1282(0*SIZE(_ptr__C_0),XMM7); VMA21282(XMM6,XMM0,XMM7,XMM0); VSTU1282(XMM7,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM8); VMA21282(XMM6,XMM1,XMM8,XMM1); VSTU1282(XMM8,2*SIZE(_ptr__C_0)); VLDU1282(4*SIZE(_ptr__C_0),XMM9); VMA21282(XMM6,XMM2,XMM9,XMM2); VSTU1282(XMM9,4*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM10); VMA21282(XMM6,XMM3,XMM10,XMM3); VSTU1282(XMM10,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM11); VMA21282(XMM6,XMM4,XMM11,XMM4); VSTU1282(XMM11,2*SIZE(_ptr__C_0,LDC,1)); VLDU1282(4*SIZE(_ptr__C_0,LDC,1),XMM12); VMA21282(XMM6,XMM5,XMM12,XMM5); VSTU1282(XMM12,4*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($6*SIZE,_ptr__C_0); ADDQ1280($6*SIZE,_ptr__C_1); ._L_18_bodyE:; SUBQ1280($6,i); JG ._L_18_bodyB; ALIGN_4; ._L_18_loopE:; TESTQ1280($4,i); JLE ._L_22_loopE; ALIGN_4; ._L_22_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); VXOR1282(XMM3,XMM3,XMM3); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_23_loopE; ALIGN_4; ._L_23_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(3*SIZE(_ptr__B_0),XMM15); 
VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_23_bodyE:; DECQ1280(k); JG ._L_23_bodyB; ALIGN_4; ._L_23_loopE:; TESTQ1280($2,_bk_l); JLE ._L_24_loopE; ALIGN_4; ._L_24_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_24_loopE:; TESTQ1280($1,_bk_l); JLE ._L_25_loopE; ALIGN_4; ._L_25_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM13,XMM15,XMM2,XMM2); VMA1282(XMM14,XMM15,XMM3,XMM3); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_25_loopE:; 
BROAD1282(alpha,XMM4); VLDU1282(0*SIZE(_ptr__C_0),XMM5); VMA21282(XMM4,XMM0,XMM5,XMM0); VSTU1282(XMM5,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM6); VMA21282(XMM4,XMM1,XMM6,XMM1); VSTU1282(XMM6,2*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM7); VMA21282(XMM4,XMM2,XMM7,XMM2); VSTU1282(XMM7,0*SIZE(_ptr__C_0,LDC,1)); VLDU1282(2*SIZE(_ptr__C_0,LDC,1),XMM8); VMA21282(XMM4,XMM3,XMM8,XMM3); VSTU1282(XMM8,2*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_22_loopE:; TESTQ1280($2,i); JLE ._L_26_loopE; ALIGN_4; ._L_26_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_27_loopE; ALIGN_4; ._L_27_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(4*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(5*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(6*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(7*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_27_bodyE:; DECQ1280(k); JG ._L_27_bodyB; ALIGN_4; ._L_27_loopE:; TESTQ1280($2,_bk_l); JLE ._L_28_loopE; ALIGN_4; ._L_28_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); 
VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_28_loopE:; TESTQ1280($1,_bk_l); JLE ._L_29_loopE; ALIGN_4; ._L_29_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_29_loopE:; BROAD1282(alpha,XMM2); VLDU1282(0*SIZE(_ptr__C_0),XMM3); VMA21282(XMM2,XMM0,XMM3,XMM0); VSTU1282(XMM3,0*SIZE(_ptr__C_0)); VLDU1282(0*SIZE(_ptr__C_0,LDC,1),XMM4); VMA21282(XMM2,XMM1,XMM4,XMM1); VSTU1282(XMM4,0*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_26_loopE:; TESTQ1280($1,i); JLE ._L_30_loopE; ALIGN_4; ._L_30_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); VXOR1281(XMM1,XMM1,XMM1); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_31_loopE; ALIGN_4; ._L_31_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(4*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); 
BROAD1281(5*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(6*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(7*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($8*SIZE,_ptr__B_0); ._L_31_bodyE:; DECQ1280(k); JG ._L_31_bodyB; ALIGN_4; ._L_31_loopE:; TESTQ1280($2,_bk_l); JLE ._L_32_loopE; ALIGN_4; ._L_32_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_32_loopE:; TESTQ1280($1,_bk_l); JLE ._L_33_loopE; ALIGN_4; ._L_33_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VMA1281(XMM14,XMM15,XMM1,XMM1); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_33_loopE:; BROAD1281(alpha,XMM2); VLDU1281(0*SIZE(_ptr__C_0),XMM3); VMA21281(XMM2,XMM0,XMM3,XMM0); VSTU1281(XMM3,0*SIZE(_ptr__C_0)); VLDU1281(0*SIZE(_ptr__C_0,LDC,1),XMM4); VMA21281(XMM2,XMM1,XMM4,XMM1); VSTU1281(XMM4,0*SIZE(_ptr__C_0,LDC,1)); ADDQ1280($1*SIZE,_ptr__C_0); ADDQ1280($1*SIZE,_ptr__C_1); ._L_30_loopE:; MOVQ1280(LDC,%rax); SALQ1280($1,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($4,%rax); ADDQ1280(%rax,_ptr_B); ._L_17_loopE:; TESTQ1280($1,_bk_j); JLE ._L_34_loopE; ALIGN_4; ._L_34_bodyB:; MOVQ1280(_ptr_A,_ptr__A_0); MOVQ1280(_ptr_C,_ptr__C_0); MOVQ1280(_bk_l,%rax); SALQ1280($3,%rax); ADDQ1280(%rax,_pre_B); MOVQ1280(_bk_i,i); CMPQ1280($6,i); JL ._L_35_loopE; ._L_35_bodyB:; 
MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); VXOR1282(XMM1,XMM1,XMM1); VXOR1282(XMM2,XMM2,XMM2); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_36_loopE; ALIGN_4; ._L_36_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(16*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VLD1282(18*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(20*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(22*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); ADDQ1280($24*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_36_bodyE:; DECQ1280(k); JG ._L_36_bodyB; ALIGN_4; ._L_36_loopE:; TESTQ1280($2,_bk_l); JLE ._L_37_loopE; ALIGN_4; ._L_37_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM12); VMA1282(XMM12,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM1,XMM1); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM2,XMM2); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); 
/* ---- N=1 panel, 6-row tile: second step of the (k & 2) tail ---------- */
VLD1282(6*SIZE(_ptr__A_0),XMM12);
VMA1282(XMM12,XMM15,XMM0,XMM0);
VLD1282(8*SIZE(_ptr__A_0),XMM13);
VMA1282(XMM13,XMM15,XMM1,XMM1);
VLD1282(10*SIZE(_ptr__A_0),XMM14);
VMA1282(XMM14,XMM15,XMM2,XMM2);
ADDQ1280($12*SIZE,_ptr__A_0);
ADDQ1280($2*SIZE,_ptr__B_0);
._L_37_loopE:;
/* (k & 1) tail: one more k step when the depth _bk_l is odd. */
TESTQ1280($1,_bk_l);
JLE ._L_38_loopE;
ALIGN_4;
._L_38_bodyB:;
PREFETCH01280(160*SIZE(_ptr__A_0),0);
BROAD1282(0*SIZE(_ptr__B_0),XMM15);
VLD1282(0*SIZE(_ptr__A_0),XMM12);
VMA1282(XMM12,XMM15,XMM0,XMM0);
VLD1282(2*SIZE(_ptr__A_0),XMM13);
VMA1282(XMM13,XMM15,XMM1,XMM1);
VLD1282(4*SIZE(_ptr__A_0),XMM14);
VMA1282(XMM14,XMM15,XMM2,XMM2);
ADDQ1280($6*SIZE,_ptr__A_0);
ADDQ1280($1*SIZE,_ptr__B_0);
._L_38_loopE:;
/* Write-back of the 6x1 tile at _ptr__C_0.  The register stored is the   */
/* third VMA21282 operand, so the macro presumably folds alpha*acc into   */
/* the loaded C value (macro body is outside this span -- confirm).       */
BROAD1282(alpha,XMM3);
VLDU1282(0*SIZE(_ptr__C_0),XMM4);
VMA21282(XMM3,XMM0,XMM4,XMM0);
VSTU1282(XMM4,0*SIZE(_ptr__C_0));
VLDU1282(2*SIZE(_ptr__C_0),XMM5);
VMA21282(XMM3,XMM1,XMM5,XMM1);
VSTU1282(XMM5,2*SIZE(_ptr__C_0));
VLDU1282(4*SIZE(_ptr__C_0),XMM6);
VMA21282(XMM3,XMM2,XMM6,XMM2);
VSTU1282(XMM6,4*SIZE(_ptr__C_0));
ADDQ1280($6*SIZE,_ptr__C_0);
ADDQ1280($6*SIZE,_ptr__C_1);
._L_35_bodyE:;
/* Back-edge of the 6-row M loop (entered only when i >= 6).  Take        */
/* another 6-row tile only while at least 6 rows remain.  The previous    */
/* "sub $6 / jg" looped whenever ANY rows remained, so a remainder of     */
/* 1..5 ran one more full 6-row tile (out-of-range A reads / C writes)    */
/* and left i negative for the 4/2/1 bit tests below.  For i that is a    */
/* multiple of 6 the trip count is unchanged.                             */
SUBQ1280($6,i);
CMPQ1280($6,i);
jge ._L_35_bodyB;
ALIGN_4;
._L_35_loopE:;
/* i is now the row remainder (0..5).  TEST clears OF and the masked      */
/* result is non-negative, so JLE is taken exactly when the bit is clear. */
TESTQ1280($4,i);
JLE ._L_39_loopE;
ALIGN_4;
._L_39_bodyB:;
/* ---- N=1 panel, 4-row tile: clear accumulators, restart B ------------ */
MOVQ1280(_ptr_B,_ptr__B_0);
VXOR1282(XMM0,XMM0,XMM0);
VXOR1282(XMM1,XMM1,XMM1);
PREFETCHN1280(3*SIZE(_ptr__C_0),N);
PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N);
PREFETCHN1280(3*SIZE(_ptr__C_1),N);
PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N);
/* k = _bk_l >> 2: number of 4-step unrolled iterations. */
MOVQ1280(_bk_l,k);
SARQ1280($2,k);
JLE ._L_40_loopE;
ALIGN_4;
._L_40_bodyB:;
/* Unrolled k loop, steps 0 and 1 plus the B load for step 2 (continues   */
/* past this span).                                                       */
PREFETCH01280(160*SIZE(_ptr__A_0),0);
BROAD1282(0*SIZE(_ptr__B_0),XMM15);
VLD1282(0*SIZE(_ptr__A_0),XMM13);
VMA1282(XMM13,XMM15,XMM0,XMM0);
VLD1282(2*SIZE(_ptr__A_0),XMM14);
VMA1282(XMM14,XMM15,XMM1,XMM1);
PREFETCH21280(168*SIZE(_ptr__A_0),2);
BROAD1282(1*SIZE(_ptr__B_0),XMM15);
VLD1282(4*SIZE(_ptr__A_0),XMM13);
VMA1282(XMM13,XMM15,XMM0,XMM0);
VLD1282(6*SIZE(_ptr__A_0),XMM14);
VMA1282(XMM14,XMM15,XMM1,XMM1);
PREFETCH01280(176*SIZE(_ptr__A_0),0);
BROAD1282(2*SIZE(_ptr__B_0),XMM15);
VLD1282(8*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(10*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VLD1282(12*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(14*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($16*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_40_bodyE:; DECQ1280(k); JG ._L_40_bodyB; ALIGN_4; ._L_40_loopE:; TESTQ1280($2,_bk_l); JLE ._L_41_loopE; ALIGN_4; ._L_41_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_41_loopE:; TESTQ1280($1,_bk_l); JLE ._L_42_loopE; ALIGN_4; ._L_42_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM13); VMA1282(XMM13,XMM15,XMM0,XMM0); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM1,XMM1); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_42_loopE:; BROAD1282(alpha,XMM2); VLDU1282(0*SIZE(_ptr__C_0),XMM3); VMA21282(XMM2,XMM0,XMM3,XMM0); VSTU1282(XMM3,0*SIZE(_ptr__C_0)); VLDU1282(2*SIZE(_ptr__C_0),XMM4); VMA21282(XMM2,XMM1,XMM4,XMM1); VSTU1282(XMM4,2*SIZE(_ptr__C_0)); ADDQ1280($4*SIZE,_ptr__C_0); ADDQ1280($4*SIZE,_ptr__C_1); ._L_39_loopE:; TESTQ1280($2,i); JLE ._L_43_loopE; ALIGN_4; ._L_43_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1282(XMM0,XMM0,XMM0); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_44_loopE; ALIGN_4; 
._L_44_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1282(2*SIZE(_ptr__B_0),XMM15); VLD1282(4*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1282(3*SIZE(_ptr__B_0),XMM15); VLD1282(6*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($8*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_44_bodyE:; DECQ1280(k); JG ._L_44_bodyB; ALIGN_4; ._L_44_loopE:; TESTQ1280($2,_bk_l); JLE ._L_45_loopE; ALIGN_4; ._L_45_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1282(1*SIZE(_ptr__B_0),XMM15); VLD1282(2*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_45_loopE:; TESTQ1280($1,_bk_l); JLE ._L_46_loopE; ALIGN_4; ._L_46_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1282(0*SIZE(_ptr__B_0),XMM15); VLD1282(0*SIZE(_ptr__A_0),XMM14); VMA1282(XMM14,XMM15,XMM0,XMM0); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_46_loopE:; BROAD1282(alpha,XMM1); VLDU1282(0*SIZE(_ptr__C_0),XMM2); VMA21282(XMM1,XMM0,XMM2,XMM0); VSTU1282(XMM2,0*SIZE(_ptr__C_0)); ADDQ1280($2*SIZE,_ptr__C_0); ADDQ1280($2*SIZE,_ptr__C_1); ._L_43_loopE:; TESTQ1280($1,i); JLE ._L_47_loopE; ALIGN_4; ._L_47_bodyB:; MOVQ1280(_ptr_B,_ptr__B_0); VXOR1281(XMM0,XMM0,XMM0); PREFETCHN1280(3*SIZE(_ptr__C_0),N); PREFETCHN1280(11*SIZE(_ptr__C_0,LDC,1),N); PREFETCHN1280(3*SIZE(_ptr__C_1),N); PREFETCHN1280(11*SIZE(_ptr__C_1,LDC,1),N); MOVQ1280(_bk_l,k); SARQ1280($2,k); JLE ._L_48_loopE; ALIGN_4; ._L_48_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); 
BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH01280(176*SIZE(_ptr__A_0),0); BROAD1281(2*SIZE(_ptr__B_0),XMM15); VLD1281(2*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(184*SIZE(_ptr__A_0),2); BROAD1281(3*SIZE(_ptr__B_0),XMM15); VLD1281(3*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($4*SIZE,_ptr__A_0); ADDQ1280($4*SIZE,_ptr__B_0); ._L_48_bodyE:; DECQ1280(k); JG ._L_48_bodyB; ALIGN_4; ._L_48_loopE:; TESTQ1280($2,_bk_l); JLE ._L_49_loopE; ALIGN_4; ._L_49_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); PREFETCH21280(168*SIZE(_ptr__A_0),2); BROAD1281(1*SIZE(_ptr__B_0),XMM15); VLD1281(1*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($2*SIZE,_ptr__A_0); ADDQ1280($2*SIZE,_ptr__B_0); ._L_49_loopE:; TESTQ1280($1,_bk_l); JLE ._L_50_loopE; ALIGN_4; ._L_50_bodyB:; PREFETCH01280(160*SIZE(_ptr__A_0),0); BROAD1281(0*SIZE(_ptr__B_0),XMM15); VLD1281(0*SIZE(_ptr__A_0),XMM14); VMA1281(XMM14,XMM15,XMM0,XMM0); ADDQ1280($1*SIZE,_ptr__A_0); ADDQ1280($1*SIZE,_ptr__B_0); ._L_50_loopE:; BROAD1281(alpha,XMM1); VLDU1281(0*SIZE(_ptr__C_0),XMM2); VMA21281(XMM1,XMM0,XMM2,XMM0); VSTU1281(XMM2,0*SIZE(_ptr__C_0)); ADDQ1280($1*SIZE,_ptr__C_0); ADDQ1280($1*SIZE,_ptr__C_1); ._L_47_loopE:; MOVQ1280(LDC,%rax); ADDQ1280(%rax,_ptr_C); MOVQ1280(_bk_l,%rax); SALQ1280($3,%rax); ADDQ1280(%rax,_ptr_B); ._L_34_loopE:; vzeroupper movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; addq $STACKSIZE, %rsp; ret EPILOGUE