/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
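/* Reference sketch (editorial note, not part of the original source): this
   kernel appears to compute the single-precision SYMV update
   y += alpha * A * x for a symmetric matrix of which only one triangle is
   stored; the access pattern below (diagonal block first, then the rows
   below it) suggests the lower triangle, processed in four-column panels.
   A plain-C equivalent, assuming column-major storage and unit strides
   (names here are hypothetical, for illustration only), would look roughly
   like:

       static void ssymv_lower_ref(int m, float alpha, const float *a,
                                   int lda, const float *x, float *y) {
           for (int j = 0; j < m; j++) {
               float temp1 = alpha * x[j];          // broadcast term for column j
               float temp2 = 0.0f;                  // dot product for row j
               y[j] += temp1 * a[j + j * lda];      // diagonal element
               for (int i = j + 1; i < m; i++) {
                   y[i]  += temp1 * a[i + j * lda]; // column update
                   temp2 += a[i + j * lda] * x[i];  // symmetric row update
               }
               y[j] += alpha * temp2;
           }
       }

   The assembly below performs the same two updates per panel: the "atemp"
   products update y, the "xsum" accumulators collect the dot products, and
   an alpha-scaled copy of x is kept in BUFFER so the inner loop never has
   to multiply by alpha. */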
#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef CORE2
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 12)
#endif

#ifdef PENTIUM4
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifdef OPTERON
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 8)
#define movsd		movlps
#endif

#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHW	prefetchw
#define PREFETCHSIZE	(16 * 16)
#endif

#ifdef NANO
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 24)
#endif

#ifdef GENERIC
#define PREFETCH	prefetcht0
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(16 * 20)
#endif

#ifndef WINDOWS_ABI

#define STACKSIZE	80

#define OLD_Y		 8 + STACKSIZE(%rsp)
#define OLD_INCY	16 + STACKSIZE(%rsp)
#define OLD_BUFFER	24 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG3
#define LDA	ARG4
#define X	ARG5
#define INCX	ARG6

#else

#define STACKSIZE	256

#define OLD_LDA		40 + STACKSIZE(%rsp)
#define OLD_X		48 + STACKSIZE(%rsp)
#define OLD_INCX	56 + STACKSIZE(%rsp)
#define OLD_Y		64 + STACKSIZE(%rsp)
#define OLD_INCY	72 + STACKSIZE(%rsp)
#define OLD_BUFFER	80 + STACKSIZE(%rsp)

#define M	ARG1
#define N	ARG2
#define A	ARG4
#define LDA	ARG3
#define X	%rdi
#define INCX	%rsi

#endif

#define Y	%r10
#define INCY	%r11
#define BUFFER	%r12

#define TEMP	%rax
#define I	%rax
#define A1	%rbx
#define A2	%rbp
#define XX	%r13
#define YY	%r14
#define IS	%r15

#define NEW_X	BUFFER
#define NEW_Y	X

#define ALPHA	%xmm0

#define atemp1	%xmm0
#define atemp2	%xmm1
#define atemp3	%xmm2
#define atemp4	%xmm3

#define xsum1	%xmm4
#define xsum2	%xmm5
#define xsum3	%xmm6
#define xsum4	%xmm7

#define xtemp1	%xmm8
#define xtemp2	%xmm9
#define yy1	%xmm10
#define xt1	%xmm11

#define a1	%xmm12
#define a2	%xmm13
#define a3	%xmm14
#define a4	%xmm15

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	OLD_LDA,  LDA
	movq	OLD_X,    X
	movq	OLD_INCX, INCX

	movaps	%xmm2, %xmm0
#endif

	movq	OLD_Y,      Y
	movq	OLD_INCY,   INCY
	movq	OLD_BUFFER, BUFFER

	/* Convert increments and the leading dimension from elements to bytes. */
	leaq	(,INCX, SIZE), INCX
	leaq	(,INCY, SIZE), INCY
	leaq	(,LDA,  SIZE), LDA

	testq	M, M
	jle	.L999

	shufps	$0, ALPHA, ALPHA

	movq	BUFFER, XX

	/* Copy x (stride INCX) into BUFFER, scaled by alpha, eight elements per pass. */
	movq	M,  %rax
	sarq	$3, %rax
	jle	.L02
	ALIGN_3

.L01:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	movss	0 * SIZE(X), %xmm2
	addq	INCX, X
	movss	0 * SIZE(X), %xmm3
	addq	INCX, X
	movss	0 * SIZE(X), %xmm4
	addq	INCX, X
	movss	0 * SIZE(X), %xmm5
	addq	INCX, X
	movss	0 * SIZE(X), %xmm6
	addq	INCX, X
	movss	0 * SIZE(X), %xmm7
	addq	INCX, X
	movss	0 * SIZE(X), %xmm8
	addq	INCX, X

	mulss	ALPHA, %xmm1
	mulss	ALPHA, %xmm2
	mulss	ALPHA, %xmm3
	mulss	ALPHA, %xmm4
	mulss	ALPHA, %xmm5
	mulss	ALPHA, %xmm6
	mulss	ALPHA, %xmm7
	mulss	ALPHA, %xmm8

	movss	%xmm1, 0 * SIZE(XX)
	movss	%xmm2, 1 * SIZE(XX)
	movss	%xmm3, 2 * SIZE(XX)
	movss	%xmm4, 3 * SIZE(XX)
	movss	%xmm5, 4 * SIZE(XX)
	movss	%xmm6, 5 * SIZE(XX)
	movss	%xmm7, 6 * SIZE(XX)
	movss	%xmm8, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L01
	ALIGN_3

.L02:
	/* Remaining M % 8 elements of the scaled copy of x. */
	movq	M,  %rax
	andq	$7, %rax
	jle	.L05
	ALIGN_3

.L03:
	movss	0 * SIZE(X), %xmm1
	addq	INCX, X
	mulss	ALPHA, %xmm1
	movss	%xmm1, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L03
	ALIGN_3

.L05:
	/* now we don't need original X */
	movq	Y, NEW_Y

	addq	$512,  XX
	andq	$-512, XX

	cmpq	$SIZE, INCY
	je	.L10

	/* INCY != 1: gather y into an aligned region of BUFFER and use it as NEW_Y. */
	movq	Y,  YY
	movq	XX, NEW_Y

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L07
	ALIGN_3

.L06:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm1
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm2
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm3
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm4
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm5
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	movss	0 * SIZE(YY), %xmm7
	addq	INCY, YY

	movss	%xmm0, 0 * SIZE(XX)
	movss	%xmm1, 1 * SIZE(XX)
	movss	%xmm2, 2 * SIZE(XX)
	movss	%xmm3, 3 * SIZE(XX)
	movss	%xmm4, 4 * SIZE(XX)
	movss	%xmm5, 5 * SIZE(XX)
	movss	%xmm6, 6 * SIZE(XX)
	movss	%xmm7, 7 * SIZE(XX)

	addq	$8 * SIZE, XX
	decq	%rax
	jg	.L06
	ALIGN_3

.L07:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L10
	ALIGN_3

.L08:
	movss	0 * SIZE(YY), %xmm0
	addq	INCY, YY
	movss	%xmm0, 0 * SIZE(XX)
	addq	$1 * SIZE, XX
	decq	%rax
	jg	.L08
	ALIGN_3

.L10:
	xorq	IS, IS		# is = 0

	cmpq	$4, N
	jl	.L20
	ALIGN_3

.L11:
	/* Process a panel of four columns: first the 4x4 diagonal block,
	   then the remaining rows below it. */
	movq	A, A1
	leaq	(A, LDA, 2), A2
	leaq	4 * SIZE(A, LDA, 4), A

	leaq	(NEW_X, IS, SIZE), XX
	leaq	4 * SIZE(NEW_Y, IS, SIZE), YY

	movaps	0 * SIZE(XX), atemp4

	movsd	0 * SIZE(A1), xsum1
	movhps	2 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A1, LDA, 1), a3
	movss	3 * SIZE(A1, LDA, 1), a4
	unpcklps a3, xsum2
	unpcklps a4, a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	movss	2 * SIZE(A1), xsum3
	movss	2 * SIZE(A1, LDA, 1), a2
	movss	2 * SIZE(A2), a3
	movss	3 * SIZE(A2), a4
	unpcklps a3, xsum3
	unpcklps a4, a2
	unpcklps a2, xsum3
	mulps	atemp4, xsum3

	movss	3 * SIZE(A1), xsum4
	movss	3 * SIZE(A1, LDA, 1), a2
	movss	3 * SIZE(A2), a3
	movss	3 * SIZE(A2, LDA, 1), a4
	unpcklps a3, xsum4
	unpcklps a4, a2
	unpcklps a2, xsum4
	mulps	atemp4, xsum4

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2
	pshufd	$0xaa, atemp4, atemp3
	pshufd	$0xff, atemp4, atemp4

	movaps	4 * SIZE(XX), xtemp1
	movaps	8 * SIZE(XX), xtemp2

	movsd	0 * SIZE(YY), yy1
	movhps	2 * SIZE(YY), yy1

	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2

	movq	M, I
	subq	IS, I
	subq	$4, I
	sarq	$4, I
	jle	.L14
	ALIGN_3

.L12:
	/* Inner loop: 16 rows per iteration, updating y and accumulating xsum1..xsum4. */
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCH	PREFETCHSIZE(XX)
#endif

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4
	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A1, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	12 * SIZE(A1), a1
	movhps	14 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2)

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	12 * SIZE(A1, LDA, 1), a2
	movhps	14 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	12 * SIZE(A2), a3
	movhps	14 * SIZE(A2), a3

#if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
	PREFETCHW	PREFETCHSIZE(YY)
#endif

	movaps	xtemp1, xt1
	movaps	16 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	12 * SIZE(A2, LDA, 1), a4
	movhps	14 * SIZE(A2, LDA, 1), a4

	movlps	yy1,  8 * SIZE(YY)
	movhps	yy1, 10 * SIZE(YY)
	movsd	12 * SIZE(YY), yy1
	movhps	14 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	16 * SIZE(A1), a1
	movhps	18 * SIZE(A1), a1

	PREFETCH	PREFETCHSIZE(A2, LDA, 1)

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	16 * SIZE(A1, LDA, 1), a2
	movhps	18 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	16 * SIZE(A2), a3
	movhps	18 * SIZE(A2), a3

	movaps	xtemp2, xt1
	movaps	20 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	16 * SIZE(A2, LDA, 1), a4
	movhps	18 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 12 * SIZE(YY)
	movhps	yy1, 14 * SIZE(YY)
	movsd	16 * SIZE(YY), yy1
	movhps	18 * SIZE(YY), yy1

	addq	$16 * SIZE, XX
	addq	$16 * SIZE, YY
	addq	$16 * SIZE, A1
	addq	$16 * SIZE, A2

	decq	I
	jg	.L12
	ALIGN_3

.L14:
	/* Tail of the panel: handle 8, 4, 2, then 1 remaining row(s). */
	movq	M, I
	subq	IS, I
	subq	$4, I

	test	$8, I
	jle	.L15

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1
	movhps	6 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2
	movhps	6 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3
	movhps	6 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movaps	8 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4
	movhps	6 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1
	movhps	6 * SIZE(YY), yy1

	movaps	xtemp2, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	 8 * SIZE(A1), a1
	movhps	10 * SIZE(A1), a1

	movaps	xtemp2, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	 8 * SIZE(A1, LDA, 1), a2
	movhps	10 * SIZE(A1, LDA, 1), a2

	movaps	xtemp2, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	 8 * SIZE(A2), a3
	movhps	10 * SIZE(A2), a3
	movaps	xtemp2, xt1
	movaps	12 * SIZE(XX), xtemp2
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	 8 * SIZE(A2, LDA, 1), a4
	movhps	10 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 4 * SIZE(YY)
	movhps	yy1, 6 * SIZE(YY)
	movsd	 8 * SIZE(YY), yy1
	movhps	10 * SIZE(YY), yy1

	addq	$8 * SIZE, XX
	addq	$8 * SIZE, YY
	addq	$8 * SIZE, A1
	addq	$8 * SIZE, A2
	ALIGN_3

.L15:
	test	$4, I
	jle	.L17

	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movsd	4 * SIZE(A1), a1

	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movsd	4 * SIZE(A1, LDA, 1), a2

	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movsd	4 * SIZE(A2), a3

	movaps	xtemp1, xt1
	movsd	4 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movsd	4 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movhps	yy1, 2 * SIZE(YY)
	movsd	4 * SIZE(YY), yy1

	addq	$4 * SIZE, XX
	addq	$4 * SIZE, YY
	addq	$4 * SIZE, A1
	addq	$4 * SIZE, A2
	ALIGN_3

.L17:
	testq	$2, M
	jle	.L18

	pxor	xtemp2, xtemp2

	movlhps	xtemp2, a1
	movaps	xtemp1, xt1
	mulps	a1, xt1
	mulps	atemp1, a1
	addps	xt1, xsum1
	addps	a1, yy1
	movss	2 * SIZE(A1), a1

	movlhps	xtemp2, a2
	movaps	xtemp1, xt1
	mulps	a2, xt1
	mulps	atemp2, a2
	addps	xt1, xsum2
	addps	a2, yy1
	movss	2 * SIZE(A1, LDA, 1), a2

	movlhps	xtemp2, a3
	movaps	xtemp1, xt1
	mulps	a3, xt1
	mulps	atemp3, a3
	addps	xt1, xsum3
	addps	a3, yy1
	movss	2 * SIZE(A2), a3

	movlhps	xtemp2, a4
	movaps	xtemp1, xt1
	movss	2 * SIZE(XX), xtemp1
	mulps	a4, xt1
	mulps	atemp4, a4
	addps	xt1, xsum4
	addps	a4, yy1
	movss	2 * SIZE(A2, LDA, 1), a4

	movlps	yy1, 0 * SIZE(YY)
	movss	2 * SIZE(YY), yy1

	addq	$2 * SIZE, XX
	addq	$2 * SIZE, YY
	addq	$2 * SIZE, A1
	addq	$2 * SIZE, A2
	ALIGN_3

.L18:
	testq	$1, M
	jle	.L19

	movss	0 * SIZE(XX), xtemp1
	movss	0 * SIZE(YY), yy1

	movss	0 * SIZE(A1), a1
	movss	0 * SIZE(A1, LDA, 1), a2
	movss	0 * SIZE(A2), a3
	movss	0 * SIZE(A2, LDA, 1), a4

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addss	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movaps	xtemp1, xt1
	mulss	a3, xt1
	mulss	atemp3, a3
	addss	xt1, xsum3
	addss	a3, yy1

	movaps	xtemp1, xt1
	mulss	a4, xt1
	mulss	atemp4, a4
	addss	xt1, xsum4
	addss	a4, yy1

	movss	yy1, 0 * SIZE(YY)
	ALIGN_3

.L19:
	/* Horizontal reduction of xsum1..xsum4, then add to y[is..is+3]. */
#ifndef HAVE_SSE3
	movaps	 xsum1, xtemp1
	unpcklps xsum3, xsum1
	unpckhps xsum3, xtemp1

	movaps	 xsum2, xtemp2
	unpcklps xsum4, xsum2
	unpckhps xsum4, xtemp2

	movaps	 xsum1, xsum3
	unpcklps xsum2, xsum1
	unpckhps xsum2, xsum3

	movaps	 xtemp1, xsum4
	unpcklps xtemp2, xtemp1
	unpckhps xtemp2, xsum4

	addps	xsum3,  xsum1
	addps	xtemp1, xsum4
	addps	xsum4,  xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum4, xsum3
	haddps	xsum3, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1
	movhps	2 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movsd	yy1, 0 * SIZE(NEW_Y, IS, SIZE)
	movhps	yy1, 2 * SIZE(NEW_Y, IS, SIZE)

	addq	$4, IS

	movq	IS, I
	addq	$4, I
	cmpq	N, I
	jle	.L11
	ALIGN_3

.L20:
	/* N % 4 >= 2: handle two remaining columns. */
	testq	$2, N
	jle	.L30

	movq	A, A1
	leaq	2 * SIZE(A, LDA, 2), A

	movaps	0 * SIZE(NEW_X, IS, SIZE), atemp4

#if defined(OPTERON)
	pxor	xsum1, xsum1
#endif

	movsd	0 * SIZE(A1), xsum1
	mulps	atemp4, xsum1

	movss	1 * SIZE(A1), xsum2
	movss	1 * SIZE(A1, LDA, 1), a2
	unpcklps a2, xsum2
	mulps	atemp4, xsum2

	pshufd	$0x00, atemp4, atemp1
	pshufd	$0x55, atemp4, atemp2

	testq	$1, M
	jle	.L29

	movss	2 * SIZE(A1), a1
	movss	2 * SIZE(A1, LDA, 1), a2

	movss	2 * SIZE(NEW_X, IS, SIZE), xtemp1
	movss	2 * SIZE(NEW_Y, IS, SIZE), yy1

	movaps	xtemp1, xt1
	mulss	a1, xt1
	mulss	atemp1, a1
	addss	xt1, xsum1
	addps	a1, yy1

	movaps	xtemp1, xt1
	mulss	a2, xt1
	mulss	atemp2, a2
	addss	xt1, xsum2
	addss	a2, yy1

	movss	yy1, 2 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L29:
#ifndef HAVE_SSE3
	unpcklps xsum2, xsum1
	movhlps	 xsum1, xsum2
	addps	 xsum2, xsum1
#else
	haddps	xsum2, xsum1
	haddps	xsum1, xsum1
#endif

	movsd	0 * SIZE(NEW_Y, IS, SIZE), yy1

	addps	xsum1, yy1

	movlps	yy1, 0 * SIZE(NEW_Y, IS, SIZE)

	addq	$2, IS
	ALIGN_3

.L30:
	/* N odd: the last column reduces to the single diagonal element. */
	testq	$1, N
	jle	.L990

	movss	0 * SIZE(NEW_X, IS, SIZE), xsum1
	mulss	0 * SIZE(A), xsum1
	addss	0 * SIZE(NEW_Y, IS, SIZE), xsum1

	movss	xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
	ALIGN_3

.L990:
	/* If INCY != 1, copy the contiguous result from the buffer back to y. */
	cmpq	$SIZE, INCY
	je	.L999

	movq	M,  %rax
	sarq	$3, %rax
	jle	.L997
	ALIGN_3

.L996:
	movss	0 * SIZE(NEW_Y), %xmm0
	movss	1 * SIZE(NEW_Y), %xmm1
	movss	2 * SIZE(NEW_Y), %xmm2
	movss	3 * SIZE(NEW_Y), %xmm3
	movss	4 * SIZE(NEW_Y), %xmm4
	movss	5 * SIZE(NEW_Y), %xmm5
	movss	6 * SIZE(NEW_Y), %xmm6
	movss	7 * SIZE(NEW_Y), %xmm7

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm4, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm5, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm6, 0 * SIZE(Y)
	addq	INCY, Y
	movss	%xmm7, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$8 * SIZE, NEW_Y
	decq	%rax
	jg	.L996
	ALIGN_3

.L997:
	movq	M,  %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L998:
	movss	0 * SIZE(NEW_Y), %xmm0

	movss	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y

	addq	$1 * SIZE, NEW_Y
	decq	%rax
	jg	.L998
	ALIGN_3

.L999:
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret
	EPILOGUE