/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*
 * Complex rank-1 update kernel for PowerPC.
 *
 * For every column j of A and every row i:
 *
 *     A(i, j) += (alpha * y(j)) * x(i)          without CONJ
 *     A(i, j) += (alpha * conj(y(j))) * x(i)    with CONJ
 *
 * Inputs, as read by the code below:
 *   M, N       row / column counts (nothing is done when either is <= 0)
 *   f1, f2     real and imaginary part of alpha
 *   X, INCX    vector x and its stride in complex elements
 *   Y, INCY    vector y and its stride in complex elements
 *   A, LDA     matrix and its column stride in complex elements
 *   BUFFER     scratch area; receives a packed copy of x when x is not
 *              already contiguous
 *
 * The integer registers that carry these arguments depend on the OS and
 * word size (see the #define groups below); arguments that do not arrive
 * in registers are fetched from the caller frame through FRAMESLOT().
 *
 * Returns 0 in r3.
 */

#define ASSEMBLER
#include "common.h"

#ifndef NEEDPARAM
#ifndef DOUBLE
#include "cparam.h"
#else
#include "zparam.h"
#endif
#endif

/* Argument registers: Linux. */
#ifdef linux
#ifndef __64BIT__
#define M	r3
#define N	r4
#define X	r6
#define INCX	r7
#define Y	r8
#define INCY	r9
#define A	r10
#define LDA	r5
#else
#define M	r3
#define N	r4
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#define A	r6
#define LDA	r7
#endif
#endif

/* Argument registers: AIX and Darwin. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define M	r3
#define N	r4
#define X	r10
#define INCX	r5
#define Y	r6
#define INCY	r7
#define A	r8
#define LDA	r9
#else
#define M	r3
#define N	r4
#define X	r8
#define INCX	r9
#define Y	r10
#define INCY	r5
#define A	r6
#define LDA	r7
#endif
#endif

/* Loop counters. */
#define I	r11
#define J	r12

/* Column pointers into A (only AO1 and AO2 are used by this kernel). */
#define AO1	r14
#define AO2	r15
#define AO3	r16
#define AO4	r17
#define AO5	r18
#define AO6	r19
#define AO7	r20
#define AO8	r21

#define X1	r22	/* running pointer into the contiguous x data   */
#define PREA	r23	/* prefetch distance (bytes) for the A columns  */
#define PREC	r24	/* prefetch distance (bytes) for the x stream   */
#define XX	r25	/* start of the contiguous x data               */
#define BUFFER	r26	/* scratch buffer for the packed copy of x      */

/* y01..y08 hold four complex elements of x (the names are historical). */
#define y01	f0
#define y02	f1
#define y03	f2
#define y04	f3
#define y05	f4
#define y06	f5
#define y07	f6
#define y08	f7

/* alpha * y(j) for the two columns currently being updated. */
#define alpha1_r	f8
#define alpha1_i	f9
#define alpha2_r	f10
#define alpha2_i	f11

/* a1..a8: elements of column AO1; a9..a16: elements of column AO2. */
#define a1	f12
#define a2	f13
#define a3	f14
#define a4	f15
#define a5	f16
#define a6	f17
#define a7	f18
#define a8	f19
#define a9	f20
#define a10	f21
#define a11	f22
#define a12	f23
#define a13	f24
#define a14	f25
#define a15	f26
#define a16	f27

#define alpha_r	f30
#define alpha_i	f31

/*
 * FMA1/FMA2 finish the complex product alpha * y(j); swapping them under
 * CONJ turns it into alpha * conj(y(j)).
 */
#ifndef CONJ
#define FMA1	FNMSUB
#define FMA2	FMADD
#else
#define FMA1	FMADD
#define FMA2	FNMSUB
#endif

#if defined(PPC440) || defined(PPC440FP2)
#define PREFETCHSIZE_A  24
#define PREFETCHSIZE_C  16
#endif

#ifdef PPC970
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef POWER4
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

#ifdef POWER5
#define PREFETCHSIZE_A  16
#define PREFETCHSIZE_C  16
#endif

/*
 * Fallback so that targets not listed above still assemble; prefetch
 * distance only affects performance, never results.
 */
#ifndef PREFETCHSIZE_A
#define PREFETCHSIZE_A  16
#endif

#ifndef PREFETCHSIZE_C
#define PREFETCHSIZE_C  16
#endif

#ifndef NEEDPARAM

#ifndef __64BIT__
#define STACKSIZE 224
#else
#define STACKSIZE 280
#endif

	PROLOGUE
	PROFCODE

	/* Open a frame and save the FP/integer registers spilled below. */
	addi	SP, SP, -STACKSIZE

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)
	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)
	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)
	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)
	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

#ifdef __64BIT__
	std	r14,  144(SP)
	std	r15,  152(SP)
	std	r16,  160(SP)
	std	r17,  168(SP)
	std	r18,  176(SP)
	std	r19,  184(SP)
	std	r20,  192(SP)
	std	r21,  200(SP)
	std	r22,  208(SP)
	std	r23,  216(SP)
	std	r24,  224(SP)
	std	r25,  232(SP)
	std	r26,  240(SP)
	std	r27,  248(SP)
#else
	stw	r14,  144(SP)
	stw	r15,  148(SP)
	stw	r16,  152(SP)
	stw	r17,  156(SP)
	stw	r18,  160(SP)
	stw	r19,  164(SP)
	stw	r20,  168(SP)
	stw	r21,  172(SP)
	stw	r22,  176(SP)
	stw	r23,  180(SP)
	stw	r24,  184(SP)
	stw	r25,  188(SP)
	stw	r26,  192(SP)
	stw	r27,  196(SP)
#endif

	/* Fetch the arguments that were passed on the caller stack. */
#ifdef linux
#ifndef __64BIT__
	lwz	LDA,     FRAMESLOT(0) + STACKSIZE(SP)
	lwz	BUFFER,  FRAMESLOT(1) + STACKSIZE(SP)
#else
	ld	INCY,    FRAMESLOT(0) + STACKSIZE(SP)
	ld	A,       FRAMESLOT(1) + STACKSIZE(SP)
	ld	LDA,     FRAMESLOT(2) + STACKSIZE(SP)
	ld	BUFFER,  FRAMESLOT(3) + STACKSIZE(SP)
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifndef __64BIT__
#ifdef DOUBLE
	lwz	INCX,    FRAMESLOT(0) + STACKSIZE(SP)
	lwz	Y,       FRAMESLOT(1) + STACKSIZE(SP)
	lwz	INCY,    FRAMESLOT(2) + STACKSIZE(SP)
	lwz	A,       FRAMESLOT(3) + STACKSIZE(SP)
	lwz	LDA,     FRAMESLOT(4) + STACKSIZE(SP)
	lwz	BUFFER,  FRAMESLOT(5) + STACKSIZE(SP)
#else
	lwz	INCY,    FRAMESLOT(0) + STACKSIZE(SP)
	lwz	A,       FRAMESLOT(1) + STACKSIZE(SP)
	lwz	LDA,     FRAMESLOT(2) + STACKSIZE(SP)
	lwz	BUFFER,  FRAMESLOT(3) + STACKSIZE(SP)
#endif
#else
	ld	INCY,    FRAMESLOT(0) + STACKSIZE(SP)
	ld	A,       FRAMESLOT(1) + STACKSIZE(SP)
	ld	LDA,     FRAMESLOT(2) + STACKSIZE(SP)
	ld	BUFFER,  FRAMESLOT(3) + STACKSIZE(SP)
#endif
#endif

	/* f1/f2 are reused as y02/y03 later, so move alpha out of the way. */
	fmr	alpha_r, f1
	fmr	alpha_i, f2

	/*
	 * Convert element strides to byte strides.
	 * NOTE(review): slwi only keeps the low 32 bits of the result;
	 * presumably all strides fit in 32 bits on 64-bit builds - confirm.
	 */
	slwi	LDA,  LDA,  ZBASE_SHIFT
	slwi	INCX, INCX, ZBASE_SHIFT
	slwi	INCY, INCY, ZBASE_SHIFT

	li	PREA, PREFETCHSIZE_A * SIZE
	li	PREC, PREFETCHSIZE_C * SIZE

	/* Nothing to do for an empty matrix. */
	cmpwi	cr0, M, 0
	ble-	LL(999)
	cmpwi	cr0, N, 0
	ble-	LL(999)

	/* Use x in place when it is already contiguous ... */
	mr	XX, X

	cmpi	cr0, 0, INCX, 2 * SIZE
	beq	LL(10)

	/* ... otherwise pack it into BUFFER, four elements per pass. */
	mr	XX, BUFFER
	mr	X1, BUFFER

	srawi.	r0, M, 2
	mtspr	CTR, r0
	ble	LL(05)
	.align 4

LL(01):
	LFD	a1,  0 * SIZE(X)
	LFD	a2,  1 * SIZE(X)
	add	X, X, INCX
	LFD	a3,  0 * SIZE(X)
	LFD	a4,  1 * SIZE(X)
	add	X, X, INCX
	LFD	a5,  0 * SIZE(X)
	LFD	a6,  1 * SIZE(X)
	add	X, X, INCX
	LFD	a7,  0 * SIZE(X)
	LFD	a8,  1 * SIZE(X)
	add	X, X, INCX

	STFD	a1,  0 * SIZE(X1)
	STFD	a2,  1 * SIZE(X1)
	STFD	a3,  2 * SIZE(X1)
	STFD	a4,  3 * SIZE(X1)
	STFD	a5,  4 * SIZE(X1)
	STFD	a6,  5 * SIZE(X1)
	STFD	a7,  6 * SIZE(X1)
	STFD	a8,  7 * SIZE(X1)

	addi	X1, X1, 8 * SIZE
	bdnz+	LL(01)
	.align 4

LL(05):
	/*
	 * Remainder of the 4-way unrolled copy: M mod 4 elements.  A mask
	 * of 7 here would copy up to four elements too many, reading past
	 * the end of x and writing past the M packed elements.
	 */
	andi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(10)
	.align 4

LL(06):
	LFD	a1,  0 * SIZE(X)
	LFD	a2,  1 * SIZE(X)
	STFD	a1,  0 * SIZE(X1)
	STFD	a2,  1 * SIZE(X1)

	add	X, X, INCX
	addi	X1, X1, 2 * SIZE
	bdnz+	LL(06)
	.align 4

/* ---- Column pairs: J = N / 2 passes, two columns of A per pass ---- */
LL(10):
	srawi.	J, N, 1
	ble	LL(20)
	.align 4

LL(11):
	/* Load y(j), y(j+1) and scale both by alpha. */
	LFD	alpha1_r, 0 * SIZE(Y)
	LFD	alpha1_i, 1 * SIZE(Y)
	add	Y, Y, INCY
	LFD	alpha2_r, 0 * SIZE(Y)
	LFD	alpha2_i, 1 * SIZE(Y)
	add	Y, Y, INCY

	FMUL	a1, alpha_r, alpha1_r
	FMUL	a2, alpha_i, alpha1_r
	FMUL	a3, alpha_r, alpha2_r
	FMUL	a4, alpha_i, alpha2_r

	FMA1	alpha1_r, alpha_i, alpha1_i, a1
	FMA2	alpha1_i, alpha_r, alpha1_i, a2
	FMA1	alpha2_r, alpha_i, alpha2_i, a3
	FMA2	alpha2_i, alpha_r, alpha2_i, a4

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	mr	X1, XX

	/* Main body handles eight rows per CTR count, software pipelined. */
	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(15)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)
	LFD	a5,   4 * SIZE(AO1)
	LFD	a6,   5 * SIZE(AO1)
	LFD	a7,   6 * SIZE(AO1)
	LFD	a8,   7 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)
	LFD	y05,  4 * SIZE(X1)
	LFD	y06,  5 * SIZE(X1)
	LFD	y07,  6 * SIZE(X1)
	LFD	y08,  7 * SIZE(X1)

	LFD	a9,   0 * SIZE(AO2)
	LFD	a10,  1 * SIZE(AO2)
	LFD	a11,  2 * SIZE(AO2)
	LFD	a12,  3 * SIZE(AO2)
	LFD	a13,  4 * SIZE(AO2)
	LFD	a14,  5 * SIZE(AO2)
	LFD	a15,  6 * SIZE(AO2)
	LFD	a16,  7 * SIZE(AO2)

	bdz	LL(13)
	.align 4

LL(12):
	/* Rows 0..3 of this group. */
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FMADD	a9,  alpha2_r, y01, a9
	FMADD	a10, alpha2_r, y02, a10
	FMADD	a11, alpha2_r, y03, a11
	FMADD	a12, alpha2_r, y04, a12
	FMADD	a13, alpha2_r, y05, a13
	FMADD	a14, alpha2_r, y06, a14
	FMADD	a15, alpha2_r, y07, a15
	FMADD	a16, alpha2_r, y08, a16

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	LFD	a1,   8 * SIZE(AO1)
	LFD	a2,   9 * SIZE(AO1)
	LFD	a3,  10 * SIZE(AO1)
	LFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	LFD	a5,  12 * SIZE(AO1)
	LFD	a6,  13 * SIZE(AO1)
	LFD	a7,  14 * SIZE(AO1)
	LFD	a8,  15 * SIZE(AO1)

	FNMSUB	a9,  alpha2_i, y02, a9
	FMADD	a10, alpha2_i, y01, a10
	FNMSUB	a11, alpha2_i, y04, a11
	FMADD	a12, alpha2_i, y03, a12

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	STFD	a9,   0 * SIZE(AO2)
	STFD	a10,  1 * SIZE(AO2)
	STFD	a11,  2 * SIZE(AO2)
	STFD	a12,  3 * SIZE(AO2)

	LFD	a9,   8 * SIZE(AO2)
	LFD	a10,  9 * SIZE(AO2)
	LFD	a11, 10 * SIZE(AO2)
	LFD	a12, 11 * SIZE(AO2)

	FNMSUB	a13, alpha2_i, y06, a13
	FMADD	a14, alpha2_i, y05, a14
	FNMSUB	a15, alpha2_i, y08, a15
	FMADD	a16, alpha2_i, y07, a16

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	STFD	a13,  4 * SIZE(AO2)
	STFD	a14,  5 * SIZE(AO2)
	STFD	a15,  6 * SIZE(AO2)
	STFD	a16,  7 * SIZE(AO2)

	LFD	a13, 12 * SIZE(AO2)
	LFD	a14, 13 * SIZE(AO2)
	LFD	a15, 14 * SIZE(AO2)
	LFD	a16, 15 * SIZE(AO2)

	/* Rows 4..7 of this group; loads fetch the next group. */
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FMADD	a9,  alpha2_r, y01, a9
	FMADD	a10, alpha2_r, y02, a10
	FMADD	a11, alpha2_r, y03, a11
	FMADD	a12, alpha2_r, y04, a12
	FMADD	a13, alpha2_r, y05, a13
	FMADD	a14, alpha2_r, y06, a14
	FMADD	a15, alpha2_r, y07, a15
	FMADD	a16, alpha2_r, y08, a16

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   8 * SIZE(AO1)
	STFD	a2,   9 * SIZE(AO1)
	STFD	a3,  10 * SIZE(AO1)
	STFD	a4,  11 * SIZE(AO1)

	LFD	a1,  16 * SIZE(AO1)
	LFD	a2,  17 * SIZE(AO1)
	LFD	a3,  18 * SIZE(AO1)
	LFD	a4,  19 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,  12 * SIZE(AO1)
	STFD	a6,  13 * SIZE(AO1)
	STFD	a7,  14 * SIZE(AO1)
	STFD	a8,  15 * SIZE(AO1)

	LFD	a5,  20 * SIZE(AO1)
	LFD	a6,  21 * SIZE(AO1)
	LFD	a7,  22 * SIZE(AO1)
	LFD	a8,  23 * SIZE(AO1)

	FNMSUB	a9,  alpha2_i, y02, a9
	FMADD	a10, alpha2_i, y01, a10
	FNMSUB	a11, alpha2_i, y04, a11
	FMADD	a12, alpha2_i, y03, a12

	LFD	y01, 16 * SIZE(X1)
	LFD	y02, 17 * SIZE(X1)
	LFD	y03, 18 * SIZE(X1)
	LFD	y04, 19 * SIZE(X1)

	STFD	a9,   8 * SIZE(AO2)
	STFD	a10,  9 * SIZE(AO2)
	STFD	a11, 10 * SIZE(AO2)
	STFD	a12, 11 * SIZE(AO2)

	LFD	a9,  16 * SIZE(AO2)
	LFD	a10, 17 * SIZE(AO2)
	LFD	a11, 18 * SIZE(AO2)
	LFD	a12, 19 * SIZE(AO2)

	FNMSUB	a13, alpha2_i, y06, a13
	FMADD	a14, alpha2_i, y05, a14
	FNMSUB	a15, alpha2_i, y08, a15
	FMADD	a16, alpha2_i, y07, a16

	LFD	y05, 20 * SIZE(X1)
	LFD	y06, 21 * SIZE(X1)
	LFD	y07, 22 * SIZE(X1)
	LFD	y08, 23 * SIZE(X1)

	STFD	a13, 12 * SIZE(AO2)
	STFD	a14, 13 * SIZE(AO2)
	STFD	a15, 14 * SIZE(AO2)
	STFD	a16, 15 * SIZE(AO2)

	LFD	a13, 20 * SIZE(AO2)
	LFD	a14, 21 * SIZE(AO2)
	LFD	a15, 22 * SIZE(AO2)
	LFD	a16, 23 * SIZE(AO2)

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	addi	X1,  X1,  16 * SIZE

	DCBT(AO1, PREA)
	DCBT(AO2, PREA)
	/*
	 * Previously named Y1/PREY, which this file never defines.
	 * NOTE(review): X1/PREC is the only matching stream/distance pair
	 * defined here - confirm this is the intended prefetch target.
	 */
	DCBT(X1, PREC)

	bdnz+	LL(12)
	.align 4

LL(13):
	/* Pipeline drain: last eight rows, no loads beyond them. */
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FMADD	a9,  alpha2_r, y01, a9
	FMADD	a10, alpha2_r, y02, a10
	FMADD	a11, alpha2_r, y03, a11
	FMADD	a12, alpha2_r, y04, a12
	FMADD	a13, alpha2_r, y05, a13
	FMADD	a14, alpha2_r, y06, a14
	FMADD	a15, alpha2_r, y07, a15
	FMADD	a16, alpha2_r, y08, a16

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	LFD	a1,   8 * SIZE(AO1)
	LFD	a2,   9 * SIZE(AO1)
	LFD	a3,  10 * SIZE(AO1)
	LFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	LFD	a5,  12 * SIZE(AO1)
	LFD	a6,  13 * SIZE(AO1)
	LFD	a7,  14 * SIZE(AO1)
	LFD	a8,  15 * SIZE(AO1)

	FNMSUB	a9,  alpha2_i, y02, a9
	FMADD	a10, alpha2_i, y01, a10
	FNMSUB	a11, alpha2_i, y04, a11
	FMADD	a12, alpha2_i, y03, a12

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)

	STFD	a9,   0 * SIZE(AO2)
	STFD	a10,  1 * SIZE(AO2)
	STFD	a11,  2 * SIZE(AO2)
	STFD	a12,  3 * SIZE(AO2)

	LFD	a9,   8 * SIZE(AO2)
	LFD	a10,  9 * SIZE(AO2)
	LFD	a11, 10 * SIZE(AO2)
	LFD	a12, 11 * SIZE(AO2)

	FNMSUB	a13, alpha2_i, y06, a13
	FMADD	a14, alpha2_i, y05, a14
	FNMSUB	a15, alpha2_i, y08, a15
	FMADD	a16, alpha2_i, y07, a16

	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	STFD	a13,  4 * SIZE(AO2)
	STFD	a14,  5 * SIZE(AO2)
	STFD	a15,  6 * SIZE(AO2)
	STFD	a16,  7 * SIZE(AO2)

	LFD	a13, 12 * SIZE(AO2)
	LFD	a14, 13 * SIZE(AO2)
	LFD	a15, 14 * SIZE(AO2)
	LFD	a16, 15 * SIZE(AO2)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FMADD	a9,  alpha2_r, y01, a9
	FMADD	a10, alpha2_r, y02, a10
	FMADD	a11, alpha2_r, y03, a11
	FMADD	a12, alpha2_r, y04, a12
	FMADD	a13, alpha2_r, y05, a13
	FMADD	a14, alpha2_r, y06, a14
	FMADD	a15, alpha2_r, y07, a15
	FMADD	a16, alpha2_r, y08, a16

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   8 * SIZE(AO1)
	STFD	a2,   9 * SIZE(AO1)
	STFD	a3,  10 * SIZE(AO1)
	STFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,  12 * SIZE(AO1)
	STFD	a6,  13 * SIZE(AO1)
	STFD	a7,  14 * SIZE(AO1)
	STFD	a8,  15 * SIZE(AO1)

	FNMSUB	a9,  alpha2_i, y02, a9
	FMADD	a10, alpha2_i, y01, a10
	FNMSUB	a11, alpha2_i, y04, a11
	FMADD	a12, alpha2_i, y03, a12

	STFD	a9,   8 * SIZE(AO2)
	STFD	a10,  9 * SIZE(AO2)
	STFD	a11, 10 * SIZE(AO2)
	STFD	a12, 11 * SIZE(AO2)

	FNMSUB	a13, alpha2_i, y06, a13
	FMADD	a14, alpha2_i, y05, a14
	FNMSUB	a15, alpha2_i, y08, a15
	FMADD	a16, alpha2_i, y07, a16

	STFD	a13, 12 * SIZE(AO2)
	STFD	a14, 13 * SIZE(AO2)
	STFD	a15, 14 * SIZE(AO2)
	STFD	a16, 15 * SIZE(AO2)

	addi	AO1, AO1, 16 * SIZE
	addi	AO2, AO2, 16 * SIZE
	addi	X1,  X1,  16 * SIZE
	.align 4

LL(15):
	/* Row remainder (M mod 8), handled as 4 + 2 + 1. */
	andi.	r0, M, 7
	ble	LL(19)

	andi.	r0, M, 4
	ble	LL(17)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)
	LFD	a5,   4 * SIZE(AO1)
	LFD	a6,   5 * SIZE(AO1)
	LFD	a7,   6 * SIZE(AO1)
	LFD	a8,   7 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)
	LFD	y05,  4 * SIZE(X1)
	LFD	y06,  5 * SIZE(X1)
	LFD	y07,  6 * SIZE(X1)
	LFD	y08,  7 * SIZE(X1)

	LFD	a9,   0 * SIZE(AO2)
	LFD	a10,  1 * SIZE(AO2)
	LFD	a11,  2 * SIZE(AO2)
	LFD	a12,  3 * SIZE(AO2)
	LFD	a13,  4 * SIZE(AO2)
	LFD	a14,  5 * SIZE(AO2)
	LFD	a15,  6 * SIZE(AO2)
	LFD	a16,  7 * SIZE(AO2)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FMADD	a9,  alpha2_r, y01, a9
	FMADD	a10, alpha2_r, y02, a10
	FMADD	a11, alpha2_r, y03, a11
	FMADD	a12, alpha2_r, y04, a12
	FMADD	a13, alpha2_r, y05, a13
	FMADD	a14, alpha2_r, y06, a14
	FMADD	a15, alpha2_r, y07, a15
	FMADD	a16, alpha2_r, y08, a16

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4
	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	FNMSUB	a9,  alpha2_i, y02, a9
	FMADD	a10, alpha2_i, y01, a10
	FNMSUB	a11, alpha2_i, y04, a11
	FMADD	a12, alpha2_i, y03, a12
	FNMSUB	a13, alpha2_i, y06, a13
	FMADD	a14, alpha2_i, y05, a14
	FNMSUB	a15, alpha2_i, y08, a15
	FMADD	a16, alpha2_i, y07, a16

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)
	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	STFD	a9,   0 * SIZE(AO2)
	STFD	a10,  1 * SIZE(AO2)
	STFD	a11,  2 * SIZE(AO2)
	STFD	a12,  3 * SIZE(AO2)
	STFD	a13,  4 * SIZE(AO2)
	STFD	a14,  5 * SIZE(AO2)
	STFD	a15,  6 * SIZE(AO2)
	STFD	a16,  7 * SIZE(AO2)

	addi	AO1, AO1, 8 * SIZE
	addi	AO2, AO2, 8 * SIZE
	addi	X1,  X1,  8 * SIZE
	.align 4

LL(17):
	andi.	r0, M, 2
	ble	LL(18)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)

	LFD	a5,   0 * SIZE(AO2)
	LFD	a6,   1 * SIZE(AO2)
	LFD	a7,   2 * SIZE(AO2)
	LFD	a8,   3 * SIZE(AO2)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4

	FMADD	a5,  alpha2_r, y01, a5
	FMADD	a6,  alpha2_r, y02, a6
	FMADD	a7,  alpha2_r, y03, a7
	FMADD	a8,  alpha2_r, y04, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	FNMSUB	a5,  alpha2_i, y02, a5
	FMADD	a6,  alpha2_i, y01, a6
	FNMSUB	a7,  alpha2_i, y04, a7
	FMADD	a8,  alpha2_i, y03, a8

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	STFD	a5,   0 * SIZE(AO2)
	STFD	a6,   1 * SIZE(AO2)
	STFD	a7,   2 * SIZE(AO2)
	STFD	a8,   3 * SIZE(AO2)

	addi	AO1, AO1, 4 * SIZE
	addi	AO2, AO2, 4 * SIZE
	addi	X1,  X1,  4 * SIZE
	.align 4

LL(18):
	andi.	r0, M, 1
	ble	LL(19)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   0 * SIZE(AO2)
	LFD	a4,   1 * SIZE(AO2)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha2_r, y01, a3
	FMADD	a4,  alpha2_r, y02, a4

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha2_i, y02, a3
	FMADD	a4,  alpha2_i, y01, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   0 * SIZE(AO2)
	STFD	a4,   1 * SIZE(AO2)
	.align 4

LL(19):
	addi	J, J, -1
	cmpi	cr0, 0, J, 0
	bgt	LL(11)
	.align 4

/* ---- Last column when N is odd ---- */
LL(20):
	andi.	J, N, 1
	ble	LL(999)

	LFD	alpha1_r, 0 * SIZE(Y)
	LFD	alpha1_i, 1 * SIZE(Y)

	FMUL	a1, alpha_r, alpha1_r
	FMUL	a2, alpha_i, alpha1_r

	FMA1	alpha1_r, alpha_i, alpha1_i, a1
	FMA2	alpha1_i, alpha_r, alpha1_i, a2

	mr	AO1, A
	mr	X1, XX

	srawi.	r0, M, 3
	mtspr	CTR, r0
	ble	LL(25)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)
	LFD	a5,   4 * SIZE(AO1)
	LFD	a6,   5 * SIZE(AO1)
	LFD	a7,   6 * SIZE(AO1)
	LFD	a8,   7 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)
	LFD	y05,  4 * SIZE(X1)
	LFD	y06,  5 * SIZE(X1)
	LFD	y07,  6 * SIZE(X1)
	LFD	y08,  7 * SIZE(X1)

	bdz	LL(23)
	.align 4

LL(22):
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	LFD	a1,   8 * SIZE(AO1)
	LFD	a2,   9 * SIZE(AO1)
	LFD	a3,  10 * SIZE(AO1)
	LFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	LFD	a5,  12 * SIZE(AO1)
	LFD	a6,  13 * SIZE(AO1)
	LFD	a7,  14 * SIZE(AO1)
	LFD	a8,  15 * SIZE(AO1)

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)
	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	/*
	 * Only one column is live on this path, so there is no second
	 * column (a9..a16 / alpha2) work here.
	 */
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   8 * SIZE(AO1)
	STFD	a2,   9 * SIZE(AO1)
	STFD	a3,  10 * SIZE(AO1)
	STFD	a4,  11 * SIZE(AO1)

	LFD	a1,  16 * SIZE(AO1)
	LFD	a2,  17 * SIZE(AO1)
	LFD	a3,  18 * SIZE(AO1)
	LFD	a4,  19 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,  12 * SIZE(AO1)
	STFD	a6,  13 * SIZE(AO1)
	STFD	a7,  14 * SIZE(AO1)
	STFD	a8,  15 * SIZE(AO1)

	LFD	a5,  20 * SIZE(AO1)
	LFD	a6,  21 * SIZE(AO1)
	LFD	a7,  22 * SIZE(AO1)
	LFD	a8,  23 * SIZE(AO1)

	LFD	y01, 16 * SIZE(X1)
	LFD	y02, 17 * SIZE(X1)
	LFD	y03, 18 * SIZE(X1)
	LFD	y04, 19 * SIZE(X1)
	LFD	y05, 20 * SIZE(X1)
	LFD	y06, 21 * SIZE(X1)
	LFD	y07, 22 * SIZE(X1)
	LFD	y08, 23 * SIZE(X1)

	addi	AO1, AO1, 16 * SIZE
	addi	X1,  X1,  16 * SIZE

	DCBT(AO1, PREA)
	/* See the note at the matching prefetch in LL(12). */
	DCBT(X1, PREC)

	bdnz+	LL(22)
	.align 4

LL(23):
	/* Pipeline drain for the single-column loop. */
	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	LFD	a1,   8 * SIZE(AO1)
	LFD	a2,   9 * SIZE(AO1)
	LFD	a3,  10 * SIZE(AO1)
	LFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	LFD	a5,  12 * SIZE(AO1)
	LFD	a6,  13 * SIZE(AO1)
	LFD	a7,  14 * SIZE(AO1)
	LFD	a8,  15 * SIZE(AO1)

	LFD	y01,  8 * SIZE(X1)
	LFD	y02,  9 * SIZE(X1)
	LFD	y03, 10 * SIZE(X1)
	LFD	y04, 11 * SIZE(X1)
	LFD	y05, 12 * SIZE(X1)
	LFD	y06, 13 * SIZE(X1)
	LFD	y07, 14 * SIZE(X1)
	LFD	y08, 15 * SIZE(X1)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   8 * SIZE(AO1)
	STFD	a2,   9 * SIZE(AO1)
	STFD	a3,  10 * SIZE(AO1)
	STFD	a4,  11 * SIZE(AO1)

	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a5,  12 * SIZE(AO1)
	STFD	a6,  13 * SIZE(AO1)
	STFD	a7,  14 * SIZE(AO1)
	STFD	a8,  15 * SIZE(AO1)

	addi	AO1, AO1, 16 * SIZE
	addi	X1,  X1,  16 * SIZE
	.align 4

LL(25):
	/* Row remainder (M mod 8) for the single column: 4 + 2 + 1. */
	andi.	r0, M, 7
	ble	LL(999)

	andi.	r0, M, 4
	ble	LL(27)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)
	LFD	a5,   4 * SIZE(AO1)
	LFD	a6,   5 * SIZE(AO1)
	LFD	a7,   6 * SIZE(AO1)
	LFD	a8,   7 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)
	LFD	y05,  4 * SIZE(X1)
	LFD	y06,  5 * SIZE(X1)
	LFD	y07,  6 * SIZE(X1)
	LFD	y08,  7 * SIZE(X1)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4
	FMADD	a5,  alpha1_r, y05, a5
	FMADD	a6,  alpha1_r, y06, a6
	FMADD	a7,  alpha1_r, y07, a7
	FMADD	a8,  alpha1_r, y08, a8

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4
	FNMSUB	a5,  alpha1_i, y06, a5
	FMADD	a6,  alpha1_i, y05, a6
	FNMSUB	a7,  alpha1_i, y08, a7
	FMADD	a8,  alpha1_i, y07, a8

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)
	STFD	a5,   4 * SIZE(AO1)
	STFD	a6,   5 * SIZE(AO1)
	STFD	a7,   6 * SIZE(AO1)
	STFD	a8,   7 * SIZE(AO1)

	addi	AO1, AO1, 8 * SIZE
	addi	X1,  X1,  8 * SIZE
	.align 4

LL(27):
	andi.	r0, M, 2
	ble	LL(28)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)
	LFD	a3,   2 * SIZE(AO1)
	LFD	a4,   3 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)
	LFD	y03,  2 * SIZE(X1)
	LFD	y04,  3 * SIZE(X1)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2
	FMADD	a3,  alpha1_r, y03, a3
	FMADD	a4,  alpha1_r, y04, a4

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2
	FNMSUB	a3,  alpha1_i, y04, a3
	FMADD	a4,  alpha1_i, y03, a4

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	STFD	a3,   2 * SIZE(AO1)
	STFD	a4,   3 * SIZE(AO1)

	addi	AO1, AO1, 4 * SIZE
	addi	X1,  X1,  4 * SIZE
	.align 4

LL(28):
	andi.	r0, M, 1
	ble	LL(999)

	LFD	a1,   0 * SIZE(AO1)
	LFD	a2,   1 * SIZE(AO1)

	LFD	y01,  0 * SIZE(X1)
	LFD	y02,  1 * SIZE(X1)

	FMADD	a1,  alpha1_r, y01, a1
	FMADD	a2,  alpha1_r, y02, a2

	FNMSUB	a1,  alpha1_i, y02, a1
	FMADD	a2,  alpha1_i, y01, a2

	STFD	a1,   0 * SIZE(AO1)
	STFD	a2,   1 * SIZE(AO1)
	.align 4

/* ---- Epilogue: return 0 and restore the saved registers ---- */
LL(999):
	li	r3, 0

	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)
	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)
	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)
	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)
	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

#ifdef __64BIT__
	ld	r14,  144(SP)
	ld	r15,  152(SP)
	ld	r16,  160(SP)
	ld	r17,  168(SP)
	ld	r18,  176(SP)
	ld	r19,  184(SP)
	ld	r20,  192(SP)
	ld	r21,  200(SP)
	ld	r22,  208(SP)
	ld	r23,  216(SP)
	ld	r24,  224(SP)
	ld	r25,  232(SP)
	ld	r26,  240(SP)
	ld	r27,  248(SP)
#else
	lwz	r14,  144(SP)
	lwz	r15,  148(SP)
	lwz	r16,  152(SP)
	lwz	r17,  156(SP)
	lwz	r18,  160(SP)
	lwz	r19,  164(SP)
	lwz	r20,  168(SP)
	lwz	r21,  172(SP)
	lwz	r22,  176(SP)
	lwz	r23,  180(SP)
	lwz	r24,  184(SP)
	lwz	r25,  188(SP)
	lwz	r26,  192(SP)
	lwz	r27,  196(SP)
#endif

	addi	SP, SP, STACKSIZE

	blr

	EPILOGUE
#endif