/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/*   1. Redistributions of source code must retain the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer. */
/* */
/*   2. Redistributions in binary form must reproduce the above */
/*      copyright notice, this list of conditions and the following */
/*      disclaimer in the documentation and/or other materials */
/*      provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
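/*
 * This appears to be an SSE2 kernel for a double-precision complex
 * symmetric matrix-vector update, y := y + alpha * A * x, processing
 * two columns of A per outer pass.  alpha * x (and, when the caller's
 * INCY is not 2 * SIZE, y as well) is first packed into an aligned
 * BUFFER.
 *
 * Presumed C-level prototype, reconstructed from the argument
 * #defines and stack offsets below; the function name and the exact
 * integer type are assumptions, not taken from this file:
 *
 *   int kernel(BLASLONG m, BLASLONG n,
 *              double alpha_r, double alpha_i,
 *              double *a, BLASLONG lda,
 *              double *x, BLASLONG incx,
 *              double *y, BLASLONG incy,
 *              double *buffer);
 */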
#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 28)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 12)
#define movsd		movlpd
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_LDA		48 + STACKSIZE(%rsp)
#define OLD_X		56 + STACKSIZE(%rsp)
#define OLD_INCX	64 + STACKSIZE(%rsp)
#define OLD_Y		72 + STACKSIZE(%rsp)
#define OLD_INCY	80 + STACKSIZE(%rsp)
#define OLD_BUFFER	88 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA_R	%xmm0
#define ALPHA_I	%xmm1

#define xtemp1	%xmm0
#define xtemp2	%xmm1
#define xtemp3	%xmm2
#define xtemp4	%xmm3

#define atemp1	%xmm4
#define atemp2	%xmm5
#define atemp3	%xmm6
#define atemp4	%xmm7

#define xsum1	%xmm8
#define xsum2	%xmm9

#define yy1	%xmm10
#define yy2	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14

#define xt1	%xmm15

#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define MOVDDUP(a, b, c)	movddup	a(b), c
#define MOVDDUP2(a, b, c)	movddup	a##b, c
#else
#define MOVDDUP(a, b, c)	movlpd	a(b), c;movhpd	a(b), c
#define MOVDDUP2(a, b, c)	movlpd	a##b, c;movhpd	a##b, c
#endif

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	salq	$ZBASE_SHIFT, INCX
	salq	$ZBASE_SHIFT, INCY
	salq	$ZBASE_SHIFT, LDA

	testq	M, M
	jle	.L999

	/* xmm2 = { -0.0, 0.0 }: sign mask for the low double only */
	pcmpeqb	%xmm2, %xmm2
	xorpd	%xmm3, %xmm3
	psllq	$63,   %xmm2
	unpcklpd %xmm3, %xmm2

	/* ALPHA_R = { alpha_r, alpha_i }, ALPHA_I = { -alpha_i, alpha_r } */
	unpcklpd ALPHA_I, ALPHA_R
	unpcklpd ALPHA_R, ALPHA_I
	xorpd	%xmm2, ALPHA_I

	movq	BUFFER, XX

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L02
	ALIGN_3
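/* .L01: pack alpha * x into BUFFER, four complex elements per pass.
   Each element is stored twice: first as { re, im }, then as the
   companion { -im, re }, so that the loops below can form complex
   products with plain mulpd/addpd pairs. */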
.L01:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	movapd	%xmm5, 4 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 6 * SIZE(XX)

	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X
	MOVDDUP(0 * SIZE, X, %xmm5)
	MOVDDUP(1 * SIZE, X, %xmm6)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	mulpd	ALPHA_R, %xmm5
	mulpd	ALPHA_I, %xmm6

	addpd	%xmm4, %xmm3
	addpd	%xmm6, %xmm5

	movapd	%xmm3,  8 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 10 * SIZE(XX)

	movapd	%xmm5, 12 * SIZE(XX)
	SHUFPD_1 %xmm5, %xmm5
	pxor	%xmm2, %xmm5
	movapd	%xmm5, 14 * SIZE(XX)

	subq	$-16 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

/* .L02/.L03: pack the remaining (M & 3) elements of alpha * x */
.L02:
	movq	M, %rax
	andq	$3, %rax
	jle	.L05
	ALIGN_3

.L03:
	MOVDDUP(0 * SIZE, X, %xmm3)
	MOVDDUP(1 * SIZE, X, %xmm4)
	addq	INCX, X

	mulpd	ALPHA_R, %xmm3
	mulpd	ALPHA_I, %xmm4
	addpd	%xmm4, %xmm3

	movapd	%xmm3, 0 * SIZE(XX)
	SHUFPD_1 %xmm3, %xmm3
	pxor	%xmm2, %xmm3
	movapd	%xmm3, 2 * SIZE(XX)

	addq	$4 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	/* advance XX to the next 512-byte boundary; this area receives a
	   contiguous copy of y when INCY is not 2 * SIZE */
	addq	$512,  XX
	andq	$-512, XX

	cmpq	$2 * SIZE, INCY
	je	.L10

	/* INCY != 2 * SIZE: copy y into the aligned buffer so it can be
	   addressed contiguously */
	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L07
	ALIGN_3

.L06:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm1
	movhpd	1 * SIZE(YY), %xmm1
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm2
	movhpd	1 * SIZE(YY), %xmm2
	addq	INCY, YY
	movsd	0 * SIZE(YY), %xmm3
	movhpd	1 * SIZE(YY), %xmm3
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)
	movapd	%xmm1, 2 * SIZE(XX)
	movapd	%xmm2, 4 * SIZE(XX)
	movapd	%xmm3, 6 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M, %rax
	andq	$3, %rax
	jle	.L10
	ALIGN_3

.L08:
	movsd	0 * SIZE(YY), %xmm0
	movhpd	1 * SIZE(YY), %xmm0
	addq	INCY, YY

	movapd	%xmm0, 0 * SIZE(XX)

	addq	$2 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$2, N
	jl	.L20
	ALIGN_3

/* .L11: process columns IS and IS + 1; fold the 2x2 diagonal block
   into xsum1/xsum2, then stream the rest of both columns */
.L11:
	movq	A, A1
	leaq	(A, LDA, 1), A2
	leaq	4 * SIZE(A, LDA, 2), A

	leaq	(, IS, SIZE), I

	leaq	0 * SIZE(NEW_X, I, 4), XX
	leaq	4 * SIZE(NEW_Y, I, 2), YY

	movapd	0 * SIZE(XX), atemp1
	movapd	2 * SIZE(XX), atemp2
	movapd	4 * SIZE(XX), atemp3
	movapd	6 * SIZE(XX), atemp4

	MOVDDUP(0 * SIZE, A1, xsum1)
	MOVDDUP(2 * SIZE, A1, xsum2)

	mulpd	atemp1, xsum1
	mulpd	atemp1, xsum2

	MOVDDUP(1 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A1, a2)

	mulpd	atemp2, a1
	mulpd	atemp2, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(2 * SIZE, A1, a1)
	MOVDDUP(2 * SIZE, A2, a2)

	mulpd	atemp3, a1
	mulpd	atemp3, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(3 * SIZE, A1, a1)
	MOVDDUP(3 * SIZE, A2, a2)

	mulpd	atemp4, a1
	mulpd	atemp4, a2
	addpd	a1, xsum1
	addpd	a2, xsum2

	MOVDDUP(4 * SIZE, A1, a1)
	MOVDDUP(6 * SIZE, A2, a2)

	movsd	0 * SIZE(YY), yy1
	movhpd	1 * SIZE(YY), yy1
	movsd	2 * SIZE(YY), yy2
	movhpd	3 * SIZE(YY), yy2

	movapd	 8 * SIZE(XX), xtemp1
	movapd	10 * SIZE(XX), xtemp2
	movapd	12 * SIZE(XX), xtemp3
	movapd	14 * SIZE(XX), xtemp4

	addq	$8 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M,  I
	subq	IS, I
	subq	$2, I
	sarq	$2, I
	jle	.L15
	ALIGN_3

/* .L12: main loop, four complex elements per iteration; xsum1/xsum2
   collect the dot-product terms while yy1/yy2 accumulate the column
   updates */
.L12:
	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(1 * SIZE, A1, a1)

	PREFETCH	PREFETCHSIZE(A1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(0 * SIZE, A2, a2)

	PREFETCH	PREFETCHSIZE(XX)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1
	MOVDDUP(6 * SIZE, A2, a2)

	PREFETCH	PREFETCHSIZE(A2)

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(5 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(7 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(6 * SIZE, A1, a1)

	PREFETCHW	PREFETCHSIZE(YY)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(4 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	20 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP(7 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	16 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1
	MOVDDUP(5 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	22 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP( 8 * SIZE, A1, a1)

	movlpd	yy2,  6 * SIZE(YY)
	movhpd	yy2,  7 * SIZE(YY)
	movsd	10 * SIZE(YY), yy2
	movhpd	11 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	18 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1
	MOVDDUP(10 * SIZE, A2, a2)

	movlpd	yy1, 4 * SIZE(YY)
	movhpd	yy1, 5 * SIZE(YY)
	movsd	8 * SIZE(YY), yy1
	movhpd	9 * SIZE(YY), yy1

	subq	$-16 * SIZE, XX
	addq	$ 8 * SIZE, YY
	addq	$ 8 * SIZE, A1
	addq	$ 8 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

/* .L15: two remaining rows of the column pair */
.L15:
	movq	M,  I
	subq	IS, I
	subq	$2, I
	testq	$2, I
	jle	.L16

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(1 * SIZE, A1, a1)

	movapd	xtemp3, xt1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(3 * SIZE, A2, a2)

	movapd	xtemp2, xt1
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(2 * SIZE, A1, a1)

	movapd	xtemp4, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy2
	MOVDDUP(0 * SIZE, A2, a2)

	movapd	xtemp3, xt1
	movapd	12 * SIZE(XX), xtemp3
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP(3 * SIZE, A1, a1)

	movapd	xtemp1, xt1
	movapd	 8 * SIZE(XX), xtemp1
	mulpd	a2, xt1
	mulpd	atemp3, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp4, xt1
	movapd	14 * SIZE(XX), xtemp4
	mulpd	a1, xt1
	mulpd	atemp2, a1
	addpd	xt1, xsum1
	addpd	a1,  yy2
	MOVDDUP(4 * SIZE, A1, a1)

	movlpd	yy2, 2 * SIZE(YY)
	movhpd	yy2, 3 * SIZE(YY)
	movsd	6 * SIZE(YY), yy2
	movhpd	7 * SIZE(YY), yy2

	movapd	xtemp2, xt1
	movapd	10 * SIZE(XX), xtemp2
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhpd	5 * SIZE(YY), yy1

	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

/* .L16: last row when M is odd */
.L16:
	testq	$1, M
	jle	.L18

	MOVDDUP(1 * SIZE, A1, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp1, a1
	addpd	xt1, xsum1
	addpd	a1,  yy1
	MOVDDUP(0 * SIZE, A2, a1)

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp2, a2
	addpd	xt1, xsum1
	addpd	a2,  yy1
	MOVDDUP(1 * SIZE, A2, a2)

	movapd	xtemp1, xt1
	mulpd	a1, xt1
	mulpd	atemp3, a1
	addpd	xt1, xsum2
	addpd	a1,  yy1

	movapd	xtemp2, xt1
	mulpd	a2, xt1
	mulpd	atemp4, a2
	addpd	xt1, xsum2
	addpd	a2,  yy1

	movlpd	yy1, 0 * SIZE(YY)
	movhpd	yy1, 1 * SIZE(YY)
	ALIGN_3
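/* .L18 finishes the current column pair by adding the accumulators
   xsum1/xsum2 into y, advances IS by two, and loops back to .L11
   while at least two columns remain.  .L20 handles the final single
   column when N is odd (including N == 1).  .L990 copies y back out
   of the buffer with the caller's stride when INCY is not 2 * SIZE. */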
.L18:
	leaq	(, IS, SIZE), I

	movsd	0 * SIZE(NEW_Y, I, 2), yy1
	movhpd	1 * SIZE(NEW_Y, I, 2), yy1
	movsd	2 * SIZE(NEW_Y, I, 2), yy2
	movhpd	3 * SIZE(NEW_Y, I, 2), yy2

	addpd	xsum1, yy1
	addpd	xsum2, yy2

	movlpd	yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd	yy1, 1 * SIZE(NEW_Y, I, 2)
	movlpd	yy2, 2 * SIZE(NEW_Y, I, 2)
	movhpd	yy2, 3 * SIZE(NEW_Y, I, 2)

	addq	$2, IS

	movq	IS, I
	addq	$2, I
	cmpq	M,  I
	jle	.L11
	ALIGN_3

.L20:
	testq	$1, N
	jle	.L990

	leaq	(, IS, SIZE), I

	movapd	0 * SIZE(NEW_X, I, 4), atemp1
	movapd	2 * SIZE(NEW_X, I, 4), atemp2

	movsd	0 * SIZE(NEW_Y, I, 2), yy1
	movhpd	1 * SIZE(NEW_Y, I, 2), yy1

	MOVDDUP(0 * SIZE, A, a1)
	MOVDDUP(1 * SIZE, A, a2)

	mulpd	atemp1, a1
	mulpd	atemp2, a2

	addpd	a1, yy1
	addpd	a2, yy1

	movlpd	yy1, 0 * SIZE(NEW_Y, I, 2)
	movhpd	yy1, 1 * SIZE(NEW_Y, I, 2)
	ALIGN_3

.L990:
	cmpq	$2 * SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$2, %rax
	jle	.L997
	ALIGN_3

.L996:
	movapd	0 * SIZE(NEW_Y), %xmm0
	movapd	2 * SIZE(NEW_Y), %xmm1
	movapd	4 * SIZE(NEW_Y), %xmm2
	movapd	6 * SIZE(NEW_Y), %xmm3

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm1, 0 * SIZE(Y)
	movhpd	%xmm1, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm2, 0 * SIZE(Y)
	movhpd	%xmm2, 1 * SIZE(Y)
	addq	INCY, Y
	movsd	%xmm3, 0 * SIZE(Y)
	movhpd	%xmm3, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M, %rax
	andq	$3, %rax
	jle	.L999
	ALIGN_3

.L998:
	movapd	0 * SIZE(NEW_Y), %xmm0

	movsd	%xmm0, 0 * SIZE(Y)
	movhpd	%xmm0, 1 * SIZE(Y)
	addq	INCY, Y

	addq	$2 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE