/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in
   the documentation and/or other materials provided with the distribution.

3. Neither the name of the OpenBLAS project nor the names of
   its contributors may be used to endorse or promote products
   derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/*********************************************************************
*
* 2013/10/18 Saar
*        BLASTEST               : OK
*        CTEST                  : OK
*        TEST                   : OK
*
*
* 2013/10/29 Saar
*
* Parameter:
*        UNROLL_M       16
*        UNROLL_N       2
*        SGEMM_P        768
*        SGEMM_Q        192
*        SGEMM_R        12288
*        A_PR1          384
*        B_PR1          192
*
* Performance at m x n on AMD 8320 (ACML-Version: 5.3.1):
*
*        6144x6144      168.2 GFLOPS with 8 threads on 4 modules (ACML: 158.0 ) (BULLDOZER: 167.4 )
*        6144x6144      162.7 GFLOPS with 4 threads on 4 modules (ACML: 157.6 ) (BULLDOZER: 159.0 )
*        6144x6144       82.0 GFLOPS with 2 threads on 2 modules (ACML:  81.4 ) (BULLDOZER:  80.3 )
*        6144x6144       41.3 GFLOPS with 1 threads on 1 modules (ACML:  41.1 ) (BULLDOZER:  40.4 )
*
* Performance at m x n on AMD 6380 (ACML-Version: 5.3.1):
*
*        12288x12288    469.5 GFLOPS with 32 threads on 16 modules (ACML: 375.3 ) (BULLDOZER: 445.5 )
*        12288x12288    442.9 GFLOPS with 16 threads on 16 modules (ACML: 378.5 ) (BULLDOZER: 416.3 )
*        12288x12288    265.1 GFLOPS with  8 threads on  8 modules (ACML: 218.5 ) (BULLDOZER: 261.5 )
*         6144x6144     139.7 GFLOPS with  4 threads on  4 modules (ACML: 116.0 ) (BULLDOZER: 137.7 )
*         6144x6144      70.9 GFLOPS with  2 threads on  2 modules (ACML:  67.4 ) (BULLDOZER:  69.5 )
*         6144x6144      35.6 GFLOPS with  1 threads on  1 modules (ACML:  36.1 ) (BULLDOZER:  35.1 )
*
*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M   %rdi
#define OLD_N   %rsi
#define M       %r13
#define J       %r14
#define OLD_K   %rdx
#define A       %rcx
#define B       %r8
#define C       %r9
#define LDC     %r10

#define I       %r11
#define AO      %rdi
#define BO      %rsi
#define CO1     %r15
#define K       %r12
#define BI      %rbp
#define SP      %rbx

#define BO1     %rdi
#define BO2     %r15

#ifndef WINDOWS_ABI

#define STACKSIZE 96

#else

#define STACKSIZE 256

#define OLD_A           40 + STACKSIZE(%rsp)
#define OLD_B           48 + STACKSIZE(%rsp)
#define OLD_C           56 + STACKSIZE(%rsp)
#define OLD_LDC         64 + STACKSIZE(%rsp)
#define OLD_OFFSET      72 +
STACKSIZE(%rsp) #endif #define L_BUFFER_SIZE 8192 #define LB2_OFFSET 4096 #define Ndiv6 24(%rsp) #define Nmod6 32(%rsp) #define N 40(%rsp) #define ALPHA 48(%rsp) #define OFFSET 56(%rsp) #define KK 64(%rsp) #define KKK 72(%rsp) #define BUFFER1 128(%rsp) #define BUFFER2 LB2_OFFSET+128(%rsp) #if defined(OS_WINDOWS) #if L_BUFFER_SIZE > 16384 #define STACK_TOUCH \ movl $0, 4096 * 4(%rsp);\ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 12288 #define STACK_TOUCH \ movl $0, 4096 * 3(%rsp);\ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 8192 #define STACK_TOUCH \ movl $0, 4096 * 2(%rsp);\ movl $0, 4096 * 1(%rsp); #elif L_BUFFER_SIZE > 4096 #define STACK_TOUCH \ movl $0, 4096 * 1(%rsp); #else #define STACK_TOUCH #endif #else #define STACK_TOUCH #endif #define A_PR1 384 #define B_PR1 192 /******************************************************************************************* * 3 lines of N *******************************************************************************************/ #define KERNEL16x3_1(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_2(xx) \ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_3(xx) \ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vbroadcastss 3 * SIZE(BO, BI, 
SIZE), %xmm1 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_4(xx) \ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ addq $12, BI ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $64, %rax ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ #define KERNEL16x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vfmaddps %xmm12,%xmm3,%xmm0,%xmm12 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ vfmaddps %xmm15,%xmm3,%xmm0,%xmm15 ;\ /*******************************************************************************************/ #define KERNEL8x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ #define KERNEL8x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ 
nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ addq $12, BI ;\ addq $32, %rax ;\ #define KERNEL8x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ nop ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vfmaddps %xmm9,%xmm3,%xmm0,%xmm9 ;\ /*******************************************************************************************/ #define KERNEL4x3_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_2(xx) \ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL4x3_4(xx) \ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $16, %rax ;\ #define KERNEL4x3_SUB(xx) \ vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddps %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ #define KERNEL2x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 
* SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ #define KERNEL2x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ addq $12, BI ;\ addq $8, %rax ;\ #define KERNEL2x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ vfmaddss %xmm12,%xmm3,%xmm0,%xmm12 ;\ /*******************************************************************************************/ #define KERNEL1x3_1(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_2(xx) \ vmovss -3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -2 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 2 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ #define KERNEL1x3_4(xx) \ vmovss 3 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 4 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss 5 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ addq $12, BI ;\ addq $4, %rax ;\ #define KERNEL1x3_SUB(xx) \ vmovss -6 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -5 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -4 * SIZE(BO, BI, SIZE), %xmm3 ;\ vfmaddss %xmm6,%xmm3,%xmm0,%xmm6 ;\ /*******************************************************************************************/ /******************************************************************************************* * 2 lines of N *******************************************************************************************/ #define KERNEL16x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), 
%xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ #define KERNEL16x2_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ addq $8, BI ;\ addq $64, %rax ;\ #define KERNEL16x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vfmaddps %xmm11,%xmm2,%xmm0,%xmm11 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ vfmaddps %xmm14,%xmm2,%xmm0,%xmm14 ;\ /*******************************************************************************************/ #define KERNEL8x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps 
%xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ #define KERNEL8x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ addq $8, BI ;\ addq $32, %rax ;\ #define KERNEL8x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vfmaddps %xmm8,%xmm2,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL4x2_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_2(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL4x2_4(xx) \ vbroadcastss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $16, %rax ;\ #define KERNEL4x2_SUB(xx) \ vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddps %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ #define KERNEL2x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * 
SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ #define KERNEL2x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ addq $8, BI ;\ addq $8, %rax ;\ #define KERNEL2x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ vfmaddss %xmm10,%xmm2,%xmm0,%xmm10 ;\ /*******************************************************************************************/ #define KERNEL1x2_1(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_2(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 1 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ #define KERNEL1x2_4(xx) \ vmovss 2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss 3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ addq $8, BI ;\ addq $4, %rax ;\ #define KERNEL1x2_SUB(xx) \ vmovss -4 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -3 * SIZE(BO, BI, SIZE), %xmm2 ;\ vfmaddss %xmm5,%xmm2,%xmm0,%xmm5 ;\ /*******************************************************************************************/ /******************************************************************************************* * 1 line of N *******************************************************************************************/ #define KERNEL16x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_2(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_3(xx) \ prefetcht0 A_PR1+128(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 0 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps 
%xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ #define KERNEL16x1_4(xx) \ prefetcht0 A_PR1+192(AO,%rax,SIZE) ;\ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups 16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups 20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups 24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups 28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ addq $4, BI ;\ addq $64, %rax ;\ #define KERNEL16x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm10,%xmm1,%xmm0,%xmm10 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm13,%xmm1,%xmm0,%xmm13 ;\ /*******************************************************************************************/ #define KERNEL8x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_3(xx) \ prefetcht0 A_PR1+64(AO,%rax,SIZE) ;\ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -12 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ #define KERNEL8x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -8 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -4 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ addq $4, BI ;\ addq $32, %rax ;\ #define KERNEL8x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm7,%xmm1,%xmm0,%xmm7 ;\ /*******************************************************************************************/ #define KERNEL4x1_1(xx) \ prefetcht0 A_PR1(AO,%rax,SIZE) ;\ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_2(xx) \ vbroadcastss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_3(xx) \ vbroadcastss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -24 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL4x1_4(xx) \ vbroadcastss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -20 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $16, %rax ;\ #define KERNEL4x1_SUB(xx) \ vbroadcastss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovups -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddps %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #define KERNEL2x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, 
SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -28 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -27 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ #define KERNEL2x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -26 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -25 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ addq $4, BI ;\ addq $8, %rax ;\ #define KERNEL2x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm8,%xmm1,%xmm0,%xmm8 ;\ /*******************************************************************************************/ #define KERNEL1x1_1(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_2(xx) \ vmovss -1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -31 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_3(xx) \ vmovss 0 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -30 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ #define KERNEL1x1_4(xx) \ vmovss 1 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -29 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ addq $4, BI ;\ addq $4, %rax ;\ #define KERNEL1x1_SUB(xx) \ vmovss -2 * SIZE(BO, BI, SIZE), %xmm1 ;\ vmovss -32 * SIZE(AO, %rax, SIZE), %xmm0 ;\ vfmaddss %xmm4,%xmm1,%xmm0,%xmm4 ;\ /*******************************************************************************************/ #if !defined(TRMMKERNEL) PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $6, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 movq Ndiv6, J cmpq $0, J je .L2_0 ALIGN_4 .L6_01: // copy to sub buffer movq K, %rax salq $1,%rax // K * 2 ; read 2 values movq B, BO1 leaq (B,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER1, BO // first buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_01a_2 ALIGN_4 .L6_01a_1: prefetcht0 512(BO1) prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * 
SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO1), %xmm0 vmovsd 2 * SIZE(BO1), %xmm2 vmovsd 4 * SIZE(BO1), %xmm4 vmovsd 6 * SIZE(BO1), %xmm6 vmovss 0 * SIZE(BO2), %xmm1 vmovss 2 * SIZE(BO2), %xmm3 vmovss 4 * SIZE(BO2), %xmm5 vmovss 6 * SIZE(BO2), %xmm7 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm1, 2*SIZE(BO) vmovsd %xmm2, 3*SIZE(BO) vmovss %xmm3, 5*SIZE(BO) vmovsd %xmm4, 6*SIZE(BO) vmovss %xmm5, 8*SIZE(BO) vmovsd %xmm6, 9*SIZE(BO) vmovss %xmm7,11*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_01a_1 .L6_01a_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_02c ALIGN_4 .L6_02b: vmovsd 0 * SIZE(BO1), %xmm0 vmovss 0 * SIZE(BO2), %xmm2 vmovsd %xmm0, 0*SIZE(BO) vmovss %xmm2, 2*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_02b .L6_02c: movq K, %rax salq $1,%rax // K * 2 leaq (B,%rax, SIZE), BO1 // next offset to BO1 leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 leaq BUFFER2, BO // second buffer to BO movq K, %rax sarq $3 , %rax // K / 8 jz .L6_02c_2 ALIGN_4 .L6_02c_1: prefetcht0 512(BO2) prefetchw 512(BO) vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO vmovsd 0 * SIZE(BO2), %xmm0 vmovsd 2 * SIZE(BO2), %xmm2 vmovsd 4 * SIZE(BO2), %xmm4 vmovsd 6 * SIZE(BO2), %xmm6 vmovss 1 * SIZE(BO1), %xmm1 vmovss 3 * SIZE(BO1), %xmm3 vmovss 5 * SIZE(BO1), %xmm5 vmovss 7 * SIZE(BO1), %xmm7 vmovss %xmm1, 0*SIZE(BO) vmovsd %xmm0, 1*SIZE(BO) vmovss %xmm3, 3*SIZE(BO) vmovsd %xmm2, 4*SIZE(BO) vmovss %xmm5, 6*SIZE(BO) vmovsd %xmm4, 7*SIZE(BO) vmovss %xmm7, 9*SIZE(BO) vmovsd %xmm6,10*SIZE(BO) addq $8*SIZE,BO1 addq $8*SIZE,BO2 addq $12*SIZE,BO decq %rax jnz .L6_02c_1 .L6_02c_2: movq K, %rax andq $7, %rax // K % 8 jz .L6_03c ALIGN_4 .L6_03b: vmovss 1*SIZE(BO1), %xmm0 vmovsd 0*SIZE(BO2), %xmm1 vmovss %xmm0, 0*SIZE(BO) vmovsd %xmm1, 1*SIZE(BO) addq $2*SIZE,BO1 addq $2*SIZE,BO2 addq $3*SIZE,BO decq %rax jnz .L6_03b .L6_03c: movq BO2, B // next offset of B .L6_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L6_20 ALIGN_4 .L6_11: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L6_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) 
KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L6_16 jmp .L6_12 ALIGN_4 .L6_16: movq K, %rax andq $7, %rax # if (k & 1) je .L6_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L6_17 ALIGN_4 .L6_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L6_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L6_20: // Test rest of M testq $15, M jz .L7_10 // to next 3 lines of N testq $8, M jz .L6_21pre ALIGN_4 /**************************************************************************/ .L6_20_1: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L6_20_6 jmp .L6_20_2 ALIGN_4 .L6_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L6_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L6_20_7 ALIGN_4 .L6_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L6_21pre: testq $4, M jz .L6_30 ALIGN_4 .L6_21: leaq BUFFER1, BO // first 
buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L6_26 jmp .L6_22 ALIGN_4 .L6_26: movq K, %rax andq $7, %rax # if (k & 1) je .L6_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L6_27 ALIGN_4 .L6_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L6_30: testq $2, M jz .L6_40 ALIGN_4 .L6_31: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L6_36 jmp .L6_32 ALIGN_4 .L6_36: movq K, %rax andq $7, %rax # if (k & 1) je .L6_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L6_37 ALIGN_4 .L6_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L6_40: testq $1, M jz .L7_10 // to next 3 lines of N ALIGN_4 .L6_41: leaq BUFFER1, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L6_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L6_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je 
.L6_46 jmp .L6_42 ALIGN_4 .L6_46: movq K, %rax andq $7, %rax # if (k & 1) je .L6_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L6_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L6_47 ALIGN_4 .L6_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 /***************************************************************************************************************/ .L7_10: movq C, CO1 leaq (C, LDC, 2), C leaq (C, LDC, 1), C // c += 3 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L7_20 ALIGN_4 .L7_11: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L7_16 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) KERNEL16x3_1(xxx) KERNEL16x3_2(xxx) KERNEL16x3_3(xxx) KERNEL16x3_4(xxx) je .L7_16 jmp .L7_12 ALIGN_4 .L7_16: movq K, %rax andq $7, %rax # if (k & 1) je .L7_19 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_17: KERNEL16x3_SUB(xxx) addq $3, BI addq $16, %rax jl .L7_17 ALIGN_4 .L7_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vfmaddps 8 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vfmaddps 12 * SIZE(CO1, LDC, 2),%xmm0, %xmm15,%xmm15 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) vmovups %xmm12, 8 * SIZE(CO1, LDC, 2) vmovups %xmm15,12 * SIZE(CO1, LDC, 2) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L7_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L7_20: // Test rest of M testq $15, M jz .L7_60 // to next 3 lines of N testq $8, M jz .L7_21pre ALIGN_4 /**************************************************************************/ .L7_20_1: leaq BUFFER2, BO // first buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_20_6 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; 
number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) KERNEL8x3_1(xxx) KERNEL8x3_2(xxx) KERNEL8x3_3(xxx) KERNEL8x3_4(xxx) je .L7_20_6 jmp .L7_20_2 ALIGN_4 .L7_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L7_20_9 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_20_7: KERNEL8x3_SUB(xxx) addq $3, BI addq $8, %rax jl .L7_20_7 ALIGN_4 .L7_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddps 4 * SIZE(CO1, LDC, 2),%xmm0, %xmm9,%xmm9 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) vmovups %xmm9 , 4 * SIZE(CO1, LDC, 2) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L7_21pre: testq $4, M jz .L7_30 ALIGN_4 .L7_21: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_26 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) prefetcht0 B_PR1+16(BO,BI, SIZE) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) prefetcht0 B_PR1+32(BO,BI, SIZE) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) KERNEL4x3_1(xxx) KERNEL4x3_2(xxx) KERNEL4x3_3(xxx) KERNEL4x3_4(xxx) je .L7_26 jmp .L7_22 ALIGN_4 .L7_26: movq K, %rax andq $7, %rax # if (k & 1) je .L7_29 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_27: KERNEL4x3_SUB(xxx) addq $3, BI addq $4, %rax jl .L7_27 ALIGN_4 .L7_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps (CO1, LDC, 2),%xmm0, %xmm6 ,%xmm6 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm6 , (CO1, LDC, 2) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L7_30: testq $2, M jz .L7_40 ALIGN_4 .L7_31: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_36 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) prefetcht0 B_PR1+16(BO,BI,SIZE) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) prefetcht0 B_PR1+32(BO,BI,SIZE) KERNEL2x3_3(xxx) 
KERNEL2x3_4(xxx) KERNEL2x3_1(xxx) KERNEL2x3_2(xxx) KERNEL2x3_3(xxx) KERNEL2x3_4(xxx) je .L7_36 jmp .L7_32 ALIGN_4 .L7_36: movq K, %rax andq $7, %rax # if (k & 1) je .L7_39 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_37: KERNEL2x3_SUB(xxx) addq $3, BI addq $2, %rax jl .L7_37 ALIGN_4 .L7_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vfmaddss 1 * SIZE(CO1, LDC, 2),%xmm0, %xmm12,%xmm12 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) vmovss %xmm12, 1 * SIZE(CO1, LDC, 2) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L7_40: testq $1, M jz .L7_60 // to next 3 lines of N ALIGN_4 .L7_41: leaq BUFFER2, BO // second buffer to BO addq $6 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L7_46 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_42: KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) KERNEL1x3_1(xxx) KERNEL1x3_2(xxx) KERNEL1x3_3(xxx) KERNEL1x3_4(xxx) je .L7_46 jmp .L7_42 ALIGN_4 .L7_46: movq K, %rax andq $7, %rax # if (k & 1) je .L7_49 movq %rax, BI // Index for BO leaq (BI,BI,2), BI // BI = BI * 3 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L7_47: KERNEL1x3_SUB(xxx) addq $3, BI addq $1, %rax jl .L7_47 ALIGN_4 .L7_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss (CO1, LDC, 2),%xmm0, %xmm6,%xmm6 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm6 , (CO1, LDC, 2) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L7_60: decq J // j -- jg .L6_01 .L2_0: cmpq $0, Nmod6 // N % 6 == 0 je .L999 /************************************************************************************************ * Loop for Nmod6 / 2 > 0 *************************************************************************************************/ movq Nmod6, J sarq $1, J // j = j / 2 je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) 
KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: movq K, %rax andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 
B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: movq K, %rax andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: movq K, %rax andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: movq K, %rax andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ 
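/* N is odd, so one column of C remains. The copy loop below packs a single
   value of B per k step into BUFFER1, then M is swept with the
   16x1 / 8x1 / 4x1 / 2x1 / 1x1 kernels. */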
movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: movq K, %rax andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: movq K, %rax andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, 
%rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: movq K, %rax andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vmovups %xmm4 , (CO1) addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: movq K, %rax andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO vzeroall movq K, %rax andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: movq K, %rax andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vmovss %xmm4 , (CO1) addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 movups 96(%rsp), %xmm8 movups 112(%rsp), %xmm9 movups 128(%rsp), %xmm10 movups 144(%rsp), %xmm11 movups 160(%rsp), %xmm12 movups 176(%rsp), %xmm13 movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif addq $STACKSIZE, %rsp ret EPILOGUE #else /************************************************************************************* * TRMM Kernel 
*************************************************************************************/ PROLOGUE PROFCODE subq $STACKSIZE, %rsp movq %rbx, (%rsp) movq %rbp, 8(%rsp) movq %r12, 16(%rsp) movq %r13, 24(%rsp) movq %r14, 32(%rsp) movq %r15, 40(%rsp) vzeroupper #ifdef WINDOWS_ABI movq %rdi, 48(%rsp) movq %rsi, 56(%rsp) movups %xmm6, 64(%rsp) movups %xmm7, 80(%rsp) movups %xmm8, 96(%rsp) movups %xmm9, 112(%rsp) movups %xmm10, 128(%rsp) movups %xmm11, 144(%rsp) movups %xmm12, 160(%rsp) movups %xmm13, 176(%rsp) movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) movq ARG1, OLD_M movq ARG2, OLD_N movq ARG3, OLD_K movq OLD_A, A movq OLD_B, B movq OLD_C, C movq OLD_LDC, LDC #ifdef TRMMKERNEL movsd OLD_OFFSET, %xmm12 #endif vmovaps %xmm3, %xmm0 #else movq STACKSIZE + 8(%rsp), LDC #ifdef TRMMKERNEL movsd STACKSIZE + 16(%rsp), %xmm12 #endif #endif movq %rsp, SP # save old stack subq $128 + L_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCH cmpq $0, OLD_M je .L999 cmpq $0, OLD_N je .L999 cmpq $0, OLD_K je .L999 movq OLD_M, M movq OLD_N, N movq OLD_K, K vmovsd %xmm0, ALPHA salq $BASE_SHIFT, LDC movq N, %rax xorq %rdx, %rdx movq $2, %rdi divq %rdi // N / 6 movq %rax, Ndiv6 // N / 6 movq %rdx, Nmod6 // N % 6 #ifdef TRMMKERNEL vmovsd %xmm12, OFFSET vmovsd %xmm12, KK #ifndef LEFT negq KK #endif #endif movq Ndiv6, J cmpq $0, J je .L1_0 ALIGN_4 .L2_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L2_02b: vmovsd (BO1), %xmm0 vmovsd %xmm0, (BO) addq $2*SIZE,BO1 addq $2*SIZE,BO decq %rax jnz .L2_02b .L2_02c: movq BO1, B // next offset of B .L2_10: movq C, CO1 leaq (C, LDC, 2), C // c += 2 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L2_20 ALIGN_4 .L2_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L2_16 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) KERNEL16x2_1(xxx) KERNEL16x2_2(xxx) KERNEL16x2_3(xxx) KERNEL16x2_4(xxx) je .L2_16 jmp .L2_12 ALIGN_4 .L2_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_19 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO 
leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_17: KERNEL16x2_SUB(xxx) addq $2, BI addq $16, %rax jl .L2_17 ALIGN_4 .L2_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 vfmaddps 8 * SIZE(CO1, LDC),%xmm0, %xmm11,%xmm11 vfmaddps 12 * SIZE(CO1, LDC),%xmm0, %xmm14,%xmm14 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 vmulps %xmm0, %xmm11,%xmm11 vmulps %xmm0, %xmm14,%xmm14 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) vmovups %xmm11, 8 * SIZE(CO1, LDC) vmovups %xmm14,12 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L2_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L2_20: // Test rest of M testq $15, M jz .L2_60 // to next 3 lines of N testq $8, M jz .L2_21pre ALIGN_4 /**************************************************************************/ .L2_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_20_6 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) KERNEL8x2_1(xxx) KERNEL8x2_2(xxx) KERNEL8x2_3(xxx) KERNEL8x2_4(xxx) je .L2_20_6 jmp .L2_20_2 ALIGN_4 .L2_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_20_9 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax 
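// K & 7 remainder for the 8x2 tile: one KERNEL8x2_SUB per leftover k iteration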
ALIGN_4 .L2_20_7: KERNEL8x2_SUB(xxx) addq $2, BI addq $8, %rax jl .L2_20_7 ALIGN_4 .L2_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddps 4 * SIZE(CO1, LDC),%xmm0, %xmm8,%xmm8 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm5,%xmm5 vmulps %xmm0, %xmm8,%xmm8 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm5 , (CO1, LDC) vmovups %xmm8 , 4 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L2_21pre: testq $4, M jz .L2_30 ALIGN_4 .L2_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_26 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 1 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) KERNEL4x2_1(xxx) KERNEL4x2_2(xxx) KERNEL4x2_3(xxx) KERNEL4x2_4(xxx) je .L2_26 jmp .L2_22 ALIGN_4 .L2_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_29 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_27: KERNEL4x2_SUB(xxx) addq $2, BI addq $4, %rax jl .L2_27 ALIGN_4 .L2_29: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm5,%xmm5 #endif vmovups %xmm4 , (CO1) vmovups %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, 
CO1 # coffset += 4 ALIGN_4 .L2_30: testq $2, M jz .L2_40 ALIGN_4 .L2_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L2_36 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) KERNEL2x2_1(xxx) KERNEL2x2_2(xxx) KERNEL2x2_3(xxx) KERNEL2x2_4(xxx) je .L2_36 jmp .L2_32 ALIGN_4 .L2_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_39 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_37: KERNEL2x2_SUB(xxx) addq $2, BI addq $2, %rax jl .L2_37 ALIGN_4 .L2_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 vfmaddss 1 * SIZE(CO1, LDC),%xmm0, %xmm10,%xmm10 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 vmulss %xmm0, %xmm5,%xmm5 vmulss %xmm0, %xmm10,%xmm10 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) vmovss %xmm5 , (CO1, LDC) vmovss %xmm10, 1 * SIZE(CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L2_40: testq $1, M jz .L2_60 // to next 2 lines of N ALIGN_4 .L2_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $4 * SIZE, BO movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $2, %rax // number of values in BO #endif movq %rax, KKK #endif 
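// 1x2 tile: K runs in unrolled blocks of 8 (.L2_42); the K & 7 remainder is handled at .L2_46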
andq $-8, %rax je .L2_46 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_42: KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) KERNEL1x2_1(xxx) KERNEL1x2_2(xxx) KERNEL1x2_3(xxx) KERNEL1x2_4(xxx) je .L2_46 jmp .L2_42 ALIGN_4 .L2_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L2_49 movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L2_47: KERNEL1x2_SUB(xxx) addq $2, BI addq $1, %rax jl .L2_47 ALIGN_4 .L2_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss (CO1, LDC),%xmm0, %xmm5,%xmm5 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm5,%xmm5 #endif vmovss %xmm4 , (CO1) vmovss %xmm5 , (CO1, LDC) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BI,BI,1), BI // BI = BI * 2 ; number of values leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L2_60: #if defined(TRMMKERNEL) && !defined(LEFT) addq $2, KK #endif decq J // j -- jg .L2_01 // next 2 lines of N .L1_0: /************************************************************************************************ * Loop for Nmod6 % 2 > 0 *************************************************************************************************/ movq Nmod6, J andq $1, J // j % 2 je .L999 ALIGN_4 .L1_01: // copy to sub buffer movq B, BO1 leaq BUFFER1, BO // first buffer to BO movq K, %rax ALIGN_4 .L1_02b: vmovss (BO1), %xmm0 vmovss %xmm0, (BO) addq $1*SIZE,BO1 addq $1*SIZE,BO decq %rax jnz .L1_02b .L1_02c: movq BO1, B // next offset of B .L1_10: movq C, CO1 leaq (C, LDC, 1), C // c += 1 * ldc #if defined(TRMMKERNEL) && defined(LEFT) movq OFFSET, %rax movq %rax, KK #endif movq A, AO // aoffset = a addq $32 * SIZE, AO movq M, I sarq $4, I // i = (m >> 4) je .L1_20 ALIGN_4 .L1_11: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $16, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax // K = K - ( K % 8 ) je .L1_16 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_12: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) 
KERNEL16x1_4(xxx) KERNEL16x1_1(xxx) KERNEL16x1_2(xxx) KERNEL16x1_3(xxx) KERNEL16x1_4(xxx) je .L1_16 jmp .L1_12 ALIGN_4 .L1_16: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_19 movq %rax, BI // Index for BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_17: KERNEL16x1_SUB(xxx) addq $1, BI addq $16, %rax jl .L1_17 ALIGN_4 .L1_19: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 vfmaddps 8 * SIZE(CO1),%xmm0, %xmm10,%xmm10 vfmaddps 12 * SIZE(CO1),%xmm0, %xmm13,%xmm13 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 vmulps %xmm0, %xmm10,%xmm10 vmulps %xmm0, %xmm13,%xmm13 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) vmovups %xmm10, 8 * SIZE(CO1) vmovups %xmm13,12 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $4, %rax // rax = rax * 16 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $16, KK #endif addq $16 * SIZE, CO1 # coffset += 16 decq I # i -- jg .L1_11 ALIGN_4 /************************************************************************** * Rest of M ***************************************************************************/ .L1_20: // Test rest of M testq $15, M jz .L999 testq $8, M jz .L1_21pre ALIGN_4 /**************************************************************************/ .L1_20_1: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $8, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_20_6 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_2: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) KERNEL8x1_1(xxx) KERNEL8x1_2(xxx) KERNEL8x1_3(xxx) KERNEL8x1_4(xxx) je .L1_20_6 jmp .L1_20_2 ALIGN_4 .L1_20_6: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_20_9 movq %rax, BI // Index for BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_20_7: KERNEL8x1_SUB(xxx) addq $1, BI addq $8, %rax jl .L1_20_7 ALIGN_4 .L1_20_9: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 vfmaddps 4 * SIZE(CO1),%xmm0, %xmm7,%xmm7 #else vmulps %xmm0, %xmm4,%xmm4 vmulps %xmm0, %xmm7,%xmm7 #endif vmovups %xmm4 , (CO1) vmovups %xmm7 , 4 * SIZE(CO1) #if (defined(TRMMKERNEL) && 
defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $3, %rax // rax = rax * 8 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $8, KK #endif addq $8 * SIZE, CO1 # coffset += 8 ALIGN_4 /**************************************************************************/ .L1_21pre: testq $4, M jz .L1_30 ALIGN_4 .L1_21: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $4, %rax // number of values in A #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_26 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_22: prefetcht0 B_PR1(BO,BI, SIZE) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) KERNEL4x1_1(xxx) KERNEL4x1_2(xxx) KERNEL4x1_3(xxx) KERNEL4x1_4(xxx) je .L1_26 jmp .L1_22 ALIGN_4 .L1_26: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_29 movq %rax, BI // Index for BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_27: KERNEL4x1_SUB(xxx) addq $1, BI addq $4, %rax jl .L1_27 ALIGN_4 .L1_29: vbroadcastss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddps (CO1),%xmm0, %xmm4,%xmm4 #else vmulps %xmm0, %xmm4,%xmm4 #endif vmovups %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $2, %rax // rax = rax * 4 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $4, KK #endif addq $4 * SIZE, CO1 # coffset += 4 ALIGN_4 .L1_30: testq $2, M jz .L1_40 ALIGN_4 .L1_31: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $2, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_36 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq 
(BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_32: prefetcht0 B_PR1(BO,BI,SIZE) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) KERNEL2x1_1(xxx) KERNEL2x1_2(xxx) KERNEL2x1_3(xxx) KERNEL2x1_4(xxx) je .L1_36 jmp .L1_32 ALIGN_4 .L1_36: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_39 movq %rax, BI // Index for BO salq $1, %rax // rax = rax *2 ; number of values leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_37: KERNEL2x1_SUB(xxx) addq $1, BI addq $2, %rax jl .L1_37 ALIGN_4 .L1_39: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 vfmaddss 1 * SIZE(CO1),%xmm0, %xmm8,%xmm8 #else vmulss %xmm0, %xmm4,%xmm4 vmulss %xmm0, %xmm8,%xmm8 #endif vmovss %xmm4 , (CO1) vmovss %xmm8 , 1 * SIZE(CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO salq $1, %rax // rax = rax * 2 ; number of values leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $2, KK #endif addq $2 * SIZE, CO1 # coffset += 2 ALIGN_4 .L1_40: testq $1, M jz .L999 ALIGN_4 .L1_41: #if !defined(TRMMKERNEL) || \ (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO #else movq KK, %rax leaq BUFFER1, BO // first buffer to BO addq $2 * SIZE, BO movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif vzeroall #ifndef TRMMKERNEL movq K, %rax #elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) movq K, %rax subq KK, %rax movq %rax, KKK #else movq KK, %rax #ifdef LEFT addq $1, %rax // number of values in AO #else addq $1, %rax // number of values in BO #endif movq %rax, KKK #endif andq $-8, %rax je .L1_46 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_42: KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) KERNEL1x1_1(xxx) KERNEL1x1_2(xxx) KERNEL1x1_3(xxx) KERNEL1x1_4(xxx) je .L1_46 jmp .L1_42 ALIGN_4 .L1_46: #ifndef TRMMKERNEL movq K, %rax #else movq KKK, %rax #endif andq $7, %rax # if (k & 1) je .L1_49 movq %rax, BI // Index for BO leaq (AO, %rax, SIZE), AO leaq (BO, BI, SIZE), BO negq BI negq %rax ALIGN_4 .L1_47: KERNEL1x1_SUB(xxx) addq $1, BI addq $1, %rax jl .L1_47 ALIGN_4 .L1_49: vmovss ALPHA, %xmm0 #ifndef TRMMKERNEL vfmaddss (CO1),%xmm0, %xmm4,%xmm4 #else vmulss %xmm0, %xmm4,%xmm4 #endif vmovss %xmm4 , (CO1) #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) movq K, %rax subq KKK, %rax movq %rax, BI // Index for BO leaq (BO, BI, SIZE), BO leaq (AO, %rax, SIZE), AO #endif #if defined(TRMMKERNEL) && defined(LEFT) addq $1, KK #endif addq $1 * SIZE, CO1 # coffset += 1 ALIGN_4 .L999: movq SP, %rsp movq (%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 movq 24(%rsp), %r13 movq 32(%rsp), %r14 movq 40(%rsp), %r15 #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi movups 64(%rsp), %xmm6 movups 80(%rsp), %xmm7 
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE

#endif