/*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ /* without modification, are permitted provided that the following */ /* conditions are met: */ /* */ /* 1. Redistributions of source code must retain the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer. */ /* */ /* 2. Redistributions in binary form must reproduce the above */ /* copyright notice, this list of conditions and the following */ /* disclaimer in the documentation and/or other materials */ /* provided with the distribution. */ /* */ /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /* POSSIBILITY OF SUCH DAMAGE. */ /* */ /* The views and conclusions contained in the software and */ /* documentation are those of the authors and should not be */ /* interpreted as representing official policies, either expressed */ /* or implied, of The University of Texas at Austin. 
*/
/*********************************************************************/

/*
 * Sum-of-absolute-values kernel, PowerPC assembly run through the C
 * preprocessor.
 *
 * Inputs, using the register names defined below:
 *   N    (r3)  number of elements
 *   X    (r4)  address of the first element
 *   INCX (r5)  stride between consecutive elements; it is shifted left
 *              by ZBASE_SHIFT before use, presumably converting an
 *              element count into a byte offset (ZBASE_SHIFT and SIZE
 *              come from common.h - confirm there)
 *
 * Each element contributes two adjacent floating-point values, one at
 * the element address and one SIZE bytes after it.  Presumably these are
 * the real and imaginary parts of a complex vector, which would make
 * this the complex ASUM kernel - the file name is not visible here, so
 * confirm against the build rules.
 *
 * Result: the sum of fabs() over all 2 * N loaded values, left in f1.
 * When N <= 0 or INCX <= 0 nothing is loaded and the result is 0.
 *
 * The sum is kept in four independent partial accumulators (f0..f3) that
 * are only combined at LL(999), so the rounding order differs from a
 * plain left-to-right loop.
 *
 * The instruction order is hand scheduled (loads and fabs run ahead of
 * the adds that consume them).  The nop instructions are presumably
 * issue-slot padding; keep them in place unless a change is measured.
 */

#define ASSEMBLER
#include "common.h"

/* Integer argument registers. */
#define N r3
#define X r4
#define INCX r5

/* PREA: byte distance handed to dcbt; INCXM1: INCX - SIZE (set below). */
#define PREA r8
#define INCXM1 r9

/* f0 is first loaded with 0.0 and then doubles as accumulator 0. */
#define FZERO f0

/* Bytes of stack scratch; only the word at 0(SP) is used. */
#define STACKSIZE 16

	PROLOGUE
	PROFCODE

	/* Build 0.0 in f0: store an integer zero word, reload it as a float. */
	addi	SP, SP, -STACKSIZE
	li	r0, 0
	stw	r0, 0(SP)

#ifdef F_INTERFACE
	/* With F_INTERFACE the registers hold addresses of N and INCX
	   (presumably Fortran pass-by-reference); fetch the values. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	lfs	FZERO, 0(SP)

	/* Scale the stride, clear accumulators f1..f3, set the prefetch
	   distance and derive INCXM1 = INCX - SIZE.
	   NOTE(review): slwi, cmpwi and srawi. are 32-bit forms; confirm
	   that N and INCX always fit in 32 bits on 64-bit-integer builds
	   (or that common.h remaps these mnemonics). */
	slwi	INCX, INCX, ZBASE_SHIFT
	fmr	f1, FZERO
	li	PREA, 8 * 16 * SIZE
	fmr	f2, FZERO
	subi	INCXM1, INCX, SIZE

	/* N <= 0: return 0 (f3 is cleared before the branch is taken). */
	cmpwi	cr0, N, 0
	fmr	f3, FZERO
	ble-	LL(999)

	/* INCX <= 0: return 0. */
	cmpwi	cr0, INCX, 0
	ble-	LL(999)

	/* A scaled stride other than 2 * SIZE takes the strided path. */
	cmpwi	cr0, INCX, SIZE * 2
	bne-	cr0, LL(20)

	/* ---- Contiguous path (stride == 2 * SIZE) ---- */

	/* CTR = N >> 3: number of 8-element (16-value) blocks. */
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	cr0, LL(15)
	.align 4

	/* Prime the pipeline: fabs of values 0..3 in f4..f7, value 4
	   already loaded into f8. */
	LFD	f8, 0 * SIZE(X)
	LFD	f9, 1 * SIZE(X)
	fabs	f4, f8
	LFD	f10, 2 * SIZE(X)
	fabs	f5, f9
	LFD	f11, 3 * SIZE(X)
	fabs	f6, f10
	LFD	f8, 4 * SIZE(X)
	fabs	f7, f11
	/* Only one block: skip the steady-state loop and drain it. */
	bdz	LL(13)
	.align 4

/* Steady state: each trip adds 16 values while loading and taking fabs
   of the values that follow, reaching into the next block. */
LL(12):
	FADD	f0, f0, f4
	/* Cache-touch hint for the address X + PREA. */
	dcbt	X, PREA
	fabs	f4, f8
	LFD	f9, 5 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 6 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 7 * SIZE(X)

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFD	f8, 8 * SIZE(X)

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 9 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 10 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 11 * SIZE(X)

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFD	f8, 12 * SIZE(X)

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 13 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 14 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 15 * SIZE(X)

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFD	f8, 16 * SIZE(X)

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 17 * SIZE(X)

	FADD	f1, f1, f5
	/* X moves to the next block here, so the remaining loads of this
	   trip use offsets 2..4 relative to the new X. */
	addi	X, X, 16 * SIZE
	fabs	f5, f9
	LFD	f10, 2 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 3 * SIZE(X)

	FADD	f3, f3, f7
	LFD	f8, 4 * SIZE(X)
	fabs	f7, f11
	bdnz	LL(12)
	.align 4

/* Drain: finish the last full block; loads stop at value 15 of it. */
LL(13):
	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 5 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 6 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 7 * SIZE(X)

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFD	f8, 8 * SIZE(X)

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 9 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 10 * SIZE(X)

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFD	f11, 11 * SIZE(X)

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFD	f8, 12 * SIZE(X)

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 13 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 14 * SIZE(X)

	FADD	f2, f2, f6
	/* X already points past the block, hence the -1 offset for its
	   last value. */
	addi	X, X, 16 * SIZE
	fabs	f6, f10
	LFD	f11, -1 * SIZE(X)

	FADD	f3, f3, f7
	fabs	f7, f11

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* Contiguous remainder: the low three bits of N, handled as 4 + 2 + 1
   elements. */
LL(15):
	andi.	r0, N, 7
	beq	LL(999)

	/* Four elements (8 values). */
	andi.	r0, N, 4
	beq	LL(16)

	LFD	f8, 0 * SIZE(X)
	LFD	f9, 1 * SIZE(X)
	fabs	f4, f8
	LFD	f10, 2 * SIZE(X)
	fabs	f5, f9
	LFD	f11, 3 * SIZE(X)
	fabs	f6, f10
	LFD	f8, 4 * SIZE(X)
	fabs	f7, f11

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFD	f9, 5 * SIZE(X)

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFD	f10, 6 * SIZE(X)

	FADD	f2, f2, f6
	addi	X, X, 8 * SIZE
	fabs	f6, f10
	LFD	f11, -1 * SIZE(X)

	FADD	f3, f3, f7
	fabs	f7, f11

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* Two elements (4 values). */
LL(16):
	andi.	r0, N, 2
	beq	LL(17)

	LFD	f8, 0 * SIZE(X)
	LFD	f9, 1 * SIZE(X)
	fabs	f4, f8
	LFD	f10, 2 * SIZE(X)
	fabs	f5, f9
	LFD	f11, 3 * SIZE(X)
	fabs	f6, f10
	addi	X, X, 4 * SIZE
	fabs	f7, f11
	nop

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* One element (2 values). */
LL(17):
	andi.	r0, N, 1
	beq	LL(999)

	LFD	f8, 0 * SIZE(X)
	LFD	f9, 1 * SIZE(X)
	fabs	f4, f8
	fabs	f5, f9
	FADD	f0, f0, f4
	addi	X, X, 2 * SIZE
	FADD	f1, f1, f5
	/* Jump over the strided path to the final reduction. */
	b	LL(999)
	.align 4

	/* ---- Strided path ---- */

/* Bias X by -(INCX - SIZE).  Assuming LFDX / LFDUX expand to the indexed
   and indexed-with-update FP loads (see common.h), "LFDX .., X, INCXM1"
   then reads the first value of the next element, and
   "LFDUX .., X, INCX" reads its second value while stepping X forward by
   one stride. */
LL(20):
	sub	X, X, INCXM1

	/* CTR = N >> 3, as in the contiguous path. */
	srawi.	r0, N, 3
	mtspr	CTR, r0
	beq-	cr0, LL(25)

	/* Prime the pipeline: fabs of values 0..3 in f4..f7, value 4 in f8. */
	LFDX	f8, X, INCXM1
	LFDUX	f9, X, INCX
	fabs	f4, f8
	LFDX	f10, X, INCXM1
	fabs	f5, f9
	LFDUX	f11, X, INCX
	fabs	f6, f10
	LFDX	f8, X, INCXM1
	fabs	f7, f11
	/* Only one block: go straight to the drain code. */
	bdz	LL(23)
	.align 4

/* Steady state, strided: 16 values added per trip, 16 loaded ahead. */
LL(22):
	FADD	f0, f0, f4
	/* Cache-touch hint for the address X + PREA. */
	dcbt	X, PREA
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFDX	f8, X, INCXM1

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFDX	f8, X, INCXM1

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFDX	f8, X, INCXM1

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	LFDX	f8, X, INCXM1
	fabs	f7, f11
	bdnz	LL(22)
	.align 4

/* Drain, strided: finish the last full block without loading beyond it. */
LL(23):
	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFDX	f8, X, INCXM1

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	nop
	fabs	f7, f11
	LFDX	f8, X, INCXM1

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	nop
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	fabs	f7, f11

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* Strided remainder: the low three bits of N, handled as 4 + 2 + 1
   elements. */
LL(25):
	andi.	r0, N, 7
	beq	LL(999)

	/* Four elements (8 values). */
	andi.	r0, N, 4
	beq	LL(26)

	LFDX	f8, X, INCXM1
	LFDUX	f9, X, INCX
	fabs	f4, f8
	LFDX	f10, X, INCXM1
	fabs	f5, f9
	LFDUX	f11, X, INCX
	fabs	f6, f10
	LFDX	f8, X, INCXM1
	fabs	f7, f11

	FADD	f0, f0, f4
	nop
	fabs	f4, f8
	LFDUX	f9, X, INCX

	FADD	f1, f1, f5
	nop
	fabs	f5, f9
	LFDX	f10, X, INCXM1

	FADD	f2, f2, f6
	fabs	f6, f10
	LFDUX	f11, X, INCX

	FADD	f3, f3, f7
	fabs	f7, f11

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* Two elements (4 values). */
LL(26):
	andi.	r0, N, 2
	beq	LL(27)

	LFDX	f8, X, INCXM1
	LFDUX	f9, X, INCX
	fabs	f4, f8
	LFDX	f10, X, INCXM1
	fabs	f5, f9
	LFDUX	f11, X, INCX
	fabs	f6, f10
	fabs	f7, f11

	FADD	f0, f0, f4
	FADD	f1, f1, f5
	FADD	f2, f2, f6
	FADD	f3, f3, f7
	.align 4

/* One element (2 values); falls through into the final reduction. */
LL(27):
	andi.	r0, N, 1
	beq	LL(999)

	LFDX	f8, X, INCXM1
	LFDUX	f9, X, INCX
	fabs	f4, f8
	fabs	f5, f9
	FADD	f0, f0, f4
	FADD	f1, f1, f5
	.align 4

/* Common exit: f1 = (f0 + f1) + (f2 + f3), release the stack scratch
   and return to the caller. */
LL(999):
	FADD	f0, f0, f1
	FADD	f2, f2, f3
	FADD	f1, f0, f2
	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE