/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
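/* Reference sketch (editorial note, not part of the original source): this
   kernel appears to compute the single-precision SYMV update
   y += alpha * A * x for a symmetric matrix of which only one triangle is
   stored; the access pattern below (diagonal block first, then the rows
   below it) suggests the lower triangle, processed in four-column panels.
   A plain-C equivalent, assuming column-major storage and unit strides
   (names here are hypothetical, for illustration only), would look roughly
   like:

       static void ssymv_lower_ref(int m, float alpha, const float *a,
                                   int lda, const float *x, float *y) {
           for (int j = 0; j < m; j++) {
               float temp1 = alpha * x[j];          // broadcast term for column j
               float temp2 = 0.0f;                  // dot product for row j
               y[j] += temp1 * a[j + j * lda];      // diagonal element
               for (int i = j + 1; i < m; i++) {
                   y[i]  += temp1 * a[i + j * lda]; // column update
                   temp2 += a[i + j * lda] * x[i];  // symmetric row update
               }
               y[j] += alpha * temp2;
           }
       }

   The assembly below performs the same two updates per panel: the "atemp"
   products update y, the "xsum" accumulators collect the dot products, and
   an alpha-scaled copy of x is kept in BUFFER so the inner loop never has
   to multiply by alpha. */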
#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert increments and the leading dimension from elements to bytes. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	shufps	$0, ALPHA, ALPHA

	movq	BUFFER, XX

	/* Copy x (stride INCX) into BUFFER, scaled by alpha, eight elements per pass. */
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remaining M % 8 elements of the scaled copy of x. */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512,  XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	/* INCY != 1: gather y into an aligned region of BUFFER and use it as NEW_Y. */
	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	/* Process a panel of four columns: first the 4x4 diagonal block,
	   then the remaining rows below it. */
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movaps	0 * SIZE(XX), atemp4

	movsd	0 * SIZE(A1), xsum1
	movhps	2 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A1, LDA, 1), a3
	movss	3 * SIZE(A1, LDA, 1), a4
	unpcklps a3, xsum2
	unpcklps a4, a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	movss	2 * SIZE(A1), xsum3
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A2), a3
	movss	3 * SIZE(A2), a4
	unpcklps a3, xsum3
	unpcklps a4, a2
	unpcklps a2, xsum3
	mulps	atemp4, xsum3

	movss	3 * SIZE(A1), xsum4
	movss	3 * SIZE(A1, LDA, 1), a2
	movss	3 * SIZE(A2), a3
	movss	3 * SIZE(A2, LDA, 1), a4
	unpcklps a3, xsum4
	unpcklps a4, a2
	unpcklps a2, xsum4
	mulps	atemp4, xsum4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	movaps	4 * SIZE(XX), xtemp1
	movaps	8 * SIZE(XX), xtemp2

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M, I
	subq	IS, I
	subq	$4, I
	sarq	$4, I
	jle	.L14
	ALIGN_3

.L12:
	/* Inner loop: 16 rows per iteration, updating y and accumulating xsum1..xsum4. */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4
	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 12 * SIZE(YY)
	movhps	yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	$16 * SIZE, XX
	addq	$16 * SIZE, YY
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

.L14:
	/* Tail of the panel: handle 8, 4, 2, then 1 remaining row(s). */
	movq	M, I
	subq	IS, I
	subq	$4, I

	test	$8, I
	jle	.L15

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3
	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	ALIGN_3

.L15:
	test	$4, I
	jle	.L17

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movsd	4 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L17:
	testq	$2, M
	jle	.L18

	pxor	xtemp2, xtemp2

	movlhps	xtemp2, a1
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movss	2 * SIZE(A1), a1

	movlhps	xtemp2, a2
	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movss	2 * SIZE(A1, LDA, 1), a2

	movlhps	xtemp2, a3
	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movss	2 * SIZE(A2), a3

	movlhps	xtemp2, a4
	movaps	xtemp1, xt1
	movss	2 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movss	2 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movss	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3

.L18:
	testq	$1, M
	jle	.L19

	movss	0 * SIZE(XX), xtemp1
	movss	0 * SIZE(YY), yy1

	movss	0 * SIZE(A1), a1
	movss	0 * SIZE(A1, LDA, 1), a2
	movss	0 * SIZE(A2), a3
	movss	0 * SIZE(A2, LDA, 1), a4

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addss	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movaps	xtemp1, xt1
	mulss	a3, xt1
	mulss	atemp3, a3
	addss	xt1, xsum3
	addss	a3, yy1

	movaps	xtemp1, xt1
	mulss	a4, xt1
	mulss	atemp4, a4
	addss	xt1, xsum4
	addss	a4, yy1

	movss	yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
	/* Horizontal reduction of xsum1..xsum4, then add to y[is..is+3]. */
#ifndef HAVE_SSE3
	movaps	 xsum1, xtemp1
	unpcklps xsum3, xsum1
	unpckhps xsum3, xtemp1

	movaps	 xsum2, xtemp2
	unpcklps xsum4, xsum2
	unpckhps xsum4, xtemp2

	movaps	 xsum1, xsum3
	unpcklps xsum2, xsum1
	unpckhps xsum2, xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	xsum3,  xsum1
	addps	xtemp1, xsum4
	addps	xsum4,  xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3
	haddps	xsum3, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhps	yy1, 2 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	N, I
	jle	.L11
	ALIGN_3

.L20:
	/* N % 4 >= 2: handle two remaining columns. */
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	pxor	xsum1, xsum1
#endif

	movsd	0 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	movss	2 * SIZE(A1), a1
	movss	2 * SIZE(A1, LDA, 1), a2

	movss	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addps	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movss	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2
	addps	 xsum2, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum1, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3

.L30:
	/* N odd: the last column reduces to the single diagonal element. */
	testq	$1, N
	jle	.L990

	movss	0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	0 * SIZE(A), xsum1
	addss	0 * SIZE(NEW_Y, IS, SIZE), xsum1

	movss	xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L990:
	/* If INCY != 1, copy the contiguous result from the buffer back to y. */
	cmpq	$SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	0 * SIZE(NEW_Y), %xmm0
	movss	1 * SIZE(NEW_Y), %xmm1
	movss	2 * SIZE(NEW_Y), %xmm2
	movss	3 * SIZE(NEW_Y), %xmm3
	movss	4 * SIZE(NEW_Y), %xmm4
	movss	5 * SIZE(NEW_Y), %xmm5
	movss	6 * SIZE(NEW_Y), %xmm6
	movss	7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE