/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/*********************************************************************
*
* 2014/06/28 Saar
*        BLASTEST               : OK
*        CTEST                  : OK
*        TEST                   : OK
*
*
* 2013/10/30 Saar
*
* Parameter:
*        UNROLL_M        2
*        UNROLL_N        2
*        ZGEMM_P         384
*        ZGEMM_Q         168
*        A_PR1           512
*        B_PR1           256
*
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
*
*  3456x3456    82.4 GFLOPS with 8 threads on 4 modules (ACML:  76.3 ) (BULLDOZER:  81.0 )
*  3456x3456    79.9 GFLOPS with 4 threads on 4 modules (ACML:  69.9 ) (BULLDOZER:  74.6 )
*  3456x3456    40.4 GFLOPS with 2 threads on 2 modules (ACML:  35.8 ) (BULLDOZER:  37.9 )
*  3456x3456    20.3 GFLOPS with 1 thread  on 1 module  (ACML:  18.1 ) (BULLDOZER:  19.2 )
*
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
*
*  6912x6912   227.5 GFLOPS with 32 threads on 16 modules (ACML: 166.3 ) (BULLDOZER: 228.5 )
*  6912x6912   211.6 GFLOPS with 16 threads on 16 modules (ACML: 169.5 ) (BULLDOZER: 204.3 )
*  6912x6912   123.5 GFLOPS with  8 threads on  8 modules (ACML:  92.7 ) (BULLDOZER: 117.0 )
*  3456x3456    64.1 GFLOPS with  4 threads on  4 modules (ACML:  49.1 ) (BULLDOZER:  61.7 )
*  3456x3456    33.4 GFLOPS with  2 threads on  2 modules (ACML:  28.1 ) (BULLDOZER:  30.9 )
*  3456x3456    17.0 GFLOPS with  1 thread  on  1 module  (ACML:  15.2 ) (BULLDOZER:  15.7 )
*
*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M		%rdi
#define OLD_N		%rsi
#define M		%r13
#define J		%r14
#define OLD_K		%rdx

#define A		%rcx
#define B		%r8
#define C		%r9
#define LDC		%r10

#define I		%r11
#define AO		%rdi
#define BO		%rsi
#define CO1		%r15
#define K		%r12
#define BI		%rbp
#define SP		%rbx

#define BO1		%rdi
#define BO2		%r15

#ifndef WINDOWS_ABI

#define STACKSIZE 96

#else

#define STACKSIZE 320

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + 
STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 256*8*4 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA_R 48(%rsp) #define ALPHA_I 56(%rsp) #define OFFSET 64(%rsp) #define KK 72(%rsp) #define KKK 80(%rsp) #define BUFFER1 128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define VFMADD_R vfmaddpd #define VFMADD_I vfmaddpd #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) #define VFMADD_R vfnmaddpd #define VFMADD_I vfmaddpd #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) #define VFMADD_R vfmaddpd #define VFMADD_I vfnmaddpd #else #define VFMADD_R vfnmaddpd #define VFMADD_I vfnmaddpd #endif #define A_PR1 512 #define B_PR1 256 #define KERNEL2x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ #define KERNEL2x2_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup 7 * 
SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $16, BI ;\ addq $16, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ VFMADD_R %xmm14,%xmm6,%xmm1,%xmm14 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ VFMADD_I %xmm15,%xmm7,%xmm1,%xmm15 ;\ addq $4, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ #define KERNEL1x2_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 5 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup 6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup 7 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $16, BI ;\ addq $8 , %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -8 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -7 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ vmovddup -6 * SIZE(BO, BI, SIZE), %xmm6 ;\ VFMADD_R %xmm10,%xmm6,%xmm0,%xmm10 ;\ vmovddup -5 * SIZE(BO, BI, SIZE), %xmm7 ;\ VFMADD_I %xmm11,%xmm7,%xmm0,%xmm11 ;\ addq $4, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ #define KERNEL2x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_2(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R 
%xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 2 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ #define KERNEL2x1_4(xx) \ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups 6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm1 ;\ VFMADD_R %xmm12,%xmm4,%xmm1,%xmm12 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ VFMADD_I %xmm13,%xmm5,%xmm1,%xmm13 ;\ addq $2, BI ;\ addq $4, %rax ;\ /************************************************************************************************/ #define KERNEL1x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_2(xx) \ vmovups -6 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_3(xx) \ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 0 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 1 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ #define KERNEL1x1_4(xx) \ vmovups -2 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup 2 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup 3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vmovddup -4 * SIZE(BO, BI, SIZE), %xmm4 ;\ VFMADD_R %xmm8,%xmm4,%xmm0,%xmm8 ;\ vmovddup -3 * SIZE(BO, BI, SIZE), %xmm5 ;\ VFMADD_I %xmm9,%xmm5,%xmm0,%xmm9 ;\ addq $2, BI ;\ addq $2, %rax ;\ /************************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) vmovups %xmm6, 64(%rsp) vmovups %xmm7, 80(%rsp) vmovups %xmm8, 96(%rsp) vmovups %xmm9, 112(%rsp) vmovups %xmm10, 128(%rsp) vmovups %xmm11, 144(%rsp) vmovups %xmm12, 160(%rsp) vmovups %xmm13, 176(%rsp) vmovups %xmm14, 192(%rsp) vmovups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL vmovsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 vmovsd OLD_ALPHA_I, %xmm1 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL vmovsd 
STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA_R vmovsd %xmm1, ALPHA_I salq $ZBASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 2 movq %rax, Ndiv6 // N / 2 movq %rdx, Nmod6 // N % 2 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif .L2_0: movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovups (BO1), %xmm0 vmovups 2 * SIZE(BO1), %xmm1 vmovups %xmm0, (BO) vmovups %xmm1, 2 * SIZE(BO) addq $4*SIZE,BO1 addq $4*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L2_40 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL2x2_SUB(xxx) jl .L2_17 ALIGN_4 .L2_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 
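// ---------------------------------------------------------------------------
// Note on the combine step running through here (descriptive comment only;
// the register roles are read off the KERNEL2x2_* macros above, and
// acc_r/acc_i/c_re/c_im below are just illustrative names):
//   xmm8/xmm10/xmm12/xmm14 accumulate products of A with the duplicated real
//   parts of B, i.e. [ sum(a_r*b_r) | sum(a_i*b_r) ] per 2x2 tile entry,
//   while xmm9/xmm11/xmm13/xmm15 hold the products with the duplicated
//   imaginary parts, [ sum(a_r*b_i) | sum(a_i*b_i) ].
//   The vshufpd $0x01 above swapped the halves of the imaginary-product
//   registers, so each vaddsubpd in this group produces, for the
//   non-conjugated (NN-type) cases:
//       low  lane: sum(a_r*b_r) - sum(a_i*b_i)   (real part)
//       high lane: sum(a_i*b_r) + sum(a_r*b_i)   (imaginary part)
//   A rough scalar model of the same step:
//       c_re = acc_r[0] - acc_i[1];   // acc_i indexed before the swap
//       c_im = acc_r[1] + acc_i[0];
//   The shuffle / vmulpd / vaddsubpd sequence further below repeats this
//   pattern to apply the complex alpha = (ALPHA_R + i*ALPHA_I).
// ---------------------------------------------------------------------------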
vaddsubpd %xmm15,%xmm14, %xmm14 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 vshufpd $0x01, %xmm12, %xmm12, %xmm13 vshufpd $0x01, %xmm14, %xmm14, %xmm15 #else vaddsubpd %xmm8, %xmm9 ,%xmm9 vaddsubpd %xmm10, %xmm11,%xmm11 vaddsubpd %xmm12, %xmm13,%xmm13 vaddsubpd %xmm14, %xmm15,%xmm15 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 vmovapd %xmm13, %xmm12 vmovapd %xmm15, %xmm14 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 vshufpd $0x01, %xmm13, %xmm13, %xmm13 vshufpd $0x01, %xmm15, %xmm15, %xmm15 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 vmulpd %xmm12, %xmm0, %xmm12 vmulpd %xmm14, %xmm0, %xmm14 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vmulpd %xmm13, %xmm1, %xmm13 vmulpd %xmm15, %xmm1, %xmm15 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vaddsubpd %xmm13,%xmm12, %xmm12 vaddsubpd %xmm15,%xmm14, %xmm14 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 vaddpd (CO1, LDC), %xmm10, %xmm10 vaddpd 2 * SIZE(CO1, LDC), %xmm14, %xmm14 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) vmovups %xmm10 , (CO1, LDC) vmovups %xmm14 , 2 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $8 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_46 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) prefetcht0 B_PR1+64(BO,BI,SIZE) KERNEL1x2_3(xxx) 
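// ---------------------------------------------------------------------------
// Loop-control note (descriptive only, inferred from the addq/je pairing):
// BI and %rax were scaled and negated before entering .L2_42, so they count
// upward toward zero as the KERNEL1x2_* steps walk through the packed panels.
// The trailing  addq $8, %rax  inside KERNEL1x2_4 just below sets the zero
// flag when the unrolled-by-8 portion of K is consumed, and the  je .L2_46
// that follows every second group of four kernel steps uses that flag as the
// loop exit, so no separate compare/decrement is needed.  Rough C shape:
//     kk = -(K & ~7);                       // kept negative, scaled to
//     do {                                  // packed doubles in the code
//         eight_kernel_steps(); kk += 8; if (kk == 0) break;
//         eight_kernel_steps(); kk += 8;
//     } while (kk != 0);
// The other unrolled loops (.L2_12, .L1_12, .L1_42) use the same idiom.
// ---------------------------------------------------------------------------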
KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) jl .L2_47 ALIGN_4 .L2_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm10, %xmm10, %xmm11 #else vaddsubpd %xmm8, %xmm9, %xmm9 vaddsubpd %xmm10,%xmm11, %xmm11 vmovapd %xmm9, %xmm8 vmovapd %xmm11, %xmm10 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm11, %xmm11, %xmm11 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm10, %xmm0, %xmm10 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm11, %xmm1, %xmm11 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm11,%xmm10, %xmm10 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd (CO1, LDC), %xmm10, %xmm10 #endif vmovups %xmm8 , (CO1) vmovups %xmm10 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,4), BI // BI = BI * 4 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovups (BO1), %xmm0 vmovups %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $8 * SIZE, AO movq M, I sarq $1, I // i = (m >> 1) je .L1_40 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K 
= K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL2x1_SUB(xxx) jl .L1_17 ALIGN_4 .L1_19: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13,%xmm12 , %xmm12 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 vshufpd $0x01, %xmm12, %xmm12, %xmm13 #else vaddsubpd %xmm8, %xmm9 , %xmm9 vaddsubpd %xmm12,%xmm13, %xmm13 vmovapd %xmm9, %xmm8 vmovapd %xmm13, %xmm12 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 vshufpd $0x01, %xmm13, %xmm13, %xmm13 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 vmulpd %xmm12, %xmm0, %xmm12 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vmulpd %xmm13, %xmm1, %xmm13 vaddsubpd %xmm9, %xmm8 , %xmm8 vaddsubpd %xmm13, %xmm12, %xmm12 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 vaddpd 2 * SIZE(CO1), %xmm12, %xmm12 #endif vmovups %xmm8 , (CO1) vmovups %xmm12 , 2 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $4 * SIZE, CO1 # coffset += 4 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je 
.L1_46 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) prefetcht0 B_PR1(BO,BI,SIZE) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) jl .L1_47 ALIGN_4 .L1_49: vmovddup ALPHA_R, %xmm0 vmovddup ALPHA_I, %xmm1 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ defined(NR) || defined(NC) || defined(TR) || defined(TC) vaddsubpd %xmm9, %xmm8, %xmm8 vshufpd $0x01, %xmm8 , %xmm8, %xmm9 #else vaddsubpd %xmm8, %xmm9, %xmm9 vmovapd %xmm9, %xmm8 // swap high and low 64 bytes vshufpd $0x01, %xmm9 , %xmm9, %xmm9 #endif // multiply with ALPHA_R vmulpd %xmm8 , %xmm0, %xmm8 // multiply with ALPHA_I vmulpd %xmm9 , %xmm1, %xmm9 vaddsubpd %xmm9 ,%xmm8, %xmm8 #ifndef TRMMKERNEL vaddpd (CO1), %xmm8 , %xmm8 #endif vmovups %xmm8 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq ( ,BI,2), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L999: vzeroupper movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi vmovups 64(%rsp), %xmm6 vmovups 80(%rsp), %xmm7 vmovups 96(%rsp), %xmm8 vmovups 112(%rsp), %xmm9 vmovups 128(%rsp), %xmm10 vmovups 144(%rsp), %xmm11 vmovups 160(%rsp), %xmm12 vmovups 176(%rsp), %xmm13 vmovups 192(%rsp), %xmm14 vmovups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE
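// ---------------------------------------------------------------------------
// For reference, a rough C model of the computation performed above.  This is
// an illustrative sketch only: the function name and prototype here are
// assumptions (the authoritative declaration is whatever OpenBLAS generates
// for ZGEMM_KERNEL / ZTRMM_KERNEL), and the real kernel reads A and B in
// packed panel layout -- B is repacked into the on-stack BUFFER1 by the
// .L2_01/.L1_01 copy loops -- not the plain column-major layout used below.
//
//     // m x k times k x n complex-double multiply, C += alpha * A * B.
//     // (A TRMMKERNEL build overwrites C instead of accumulating and applies
//     //  the KK/OFFSET bookkeeping seen in the code above.)
//     void zgemm_kernel_2x2_model(long m, long n, long k,
//                                 double alpha_r, double alpha_i,
//                                 const double *A, const double *B,
//                                 double *C, long ldc)
//     {
//         for (long j = 0; j < n; j++)
//             for (long i = 0; i < m; i++) {
//                 double cr = 0.0, ci = 0.0;
//                 for (long l = 0; l < k; l++) {
//                     double ar = A[2*(i + l*m)],  ai = A[2*(i + l*m) + 1];
//                     double br = B[2*(l + j*k)],  bi = B[2*(l + j*k) + 1];
//                     cr += ar*br - ai*bi;   // NN case; the conjugated cases
//                     ci += ar*bi + ai*br;   // flip signs via VFMADD_R/_I
//                 }
//                 C[2*(i + j*ldc)]     += alpha_r*cr - alpha_i*ci;
//                 C[2*(i + j*ldc) + 1] += alpha_r*ci + alpha_i*cr;
//             }
//     }
// ---------------------------------------------------------------------------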