#include "common.h"
#include <stdbool.h>

static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline));

/*
 * dtrmm_kernel_4x8 - compute one 4x8 tile with AVX2/FMA inline assembly.
 *
 * The assembly treats every buffer as 64-bit doubles (packed-double
 * instructions, 32-byte vector loads and stores).
 *
 *   n      number of loop iterations; iteration k reads the 4 doubles
 *          a[4k..4k+3] and the 8 doubles b[8k..8k+7].  The loop exits when
 *          the running index equals n*8, so n is expected to be >= 0.
 *   alpha  pointer to the scalar that multiplies every accumulated sum.
 *   a, b   input panels, read with unaligned loads.
 *   C0..C7 eight destinations; 4 doubles are stored to each with unaligned
 *          stores.
 *
 * Result, for r = 0..3 and j = 0..7:
 *
 *   Cj[r] = alpha * sum_{k=0}^{n-1} a[4k + r] * b[8k + j]
 *
 * The previous contents of C0..C7 are overwritten, not accumulated into.
 * When n == 0 the loop is skipped and alpha times the zeroed accumulators
 * is stored.
 *
 * How the tile is built: each iteration multiplies the b vectors by four
 * different lane orderings of the a vector, so ymm4..ymm7 (b[0..3]) and
 * ymm8..ymm11 (b[4..7]) hold the products in a rotated layout.  The
 * permute/blend sequence after the loop moves the lanes so that each
 * register holds the 4 results for a single j before it is stored.
 */
static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7)
{
    /* i advances by 8 per iteration; it is scaled by 4 to address a
       (32 bytes per step) and by 8 to address b (64 bytes per step). */
    BLASLONG i = 0;
    BLASLONG temp1 = n * 8;

    __asm__ __volatile__
    (
    /* Clear the eight accumulators. */
    "   vxorpd      %%ymm4 , %%ymm4 , %%ymm4    \n\t"
    "   vxorpd      %%ymm5 , %%ymm5 , %%ymm5    \n\t"
    "   vxorpd      %%ymm6 , %%ymm6 , %%ymm6    \n\t"
    "   vxorpd      %%ymm7 , %%ymm7 , %%ymm7    \n\t"
    "   vxorpd      %%ymm8 , %%ymm8 , %%ymm8    \n\t"
    "   vxorpd      %%ymm9 , %%ymm9 , %%ymm9    \n\t"
    "   vxorpd      %%ymm10, %%ymm10, %%ymm10   \n\t"
    "   vxorpd      %%ymm11, %%ymm11, %%ymm11   \n\t"

    /* Skip the accumulation loop entirely when n * 8 == 0. */
    "   cmp         $0, %1                      \n\t"
    "   jz          2f                          \n\t"

    "   .align 16                               \n\t"
    "1:                                         \n\t"
    "   vmovups     (%2,%0,4)   , %%ymm0        \n\t"
    "   vmovups     (%3,%0,8)   , %%ymm1        \n\t"
    "   vmovups     32(%3,%0,8) , %%ymm2        \n\t"

    "   vfmadd231pd %%ymm0 , %%ymm1 , %%ymm4    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm2 , %%ymm8    \n\t"

    "   vpermpd     $0xb1  , %%ymm0 , %%ymm0    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm1 , %%ymm5    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm2 , %%ymm9    \n\t"

    "   vpermpd     $0x1b  , %%ymm0 , %%ymm0    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm1 , %%ymm6    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm2 , %%ymm10   \n\t"

    "   vpermpd     $0xb1  , %%ymm0 , %%ymm0    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm1 , %%ymm7    \n\t"
    "   vfmadd231pd %%ymm0 , %%ymm2 , %%ymm11   \n\t"

    "   addq        $8 , %0                     \n\t"
    "   cmp         %0 , %1                     \n\t"
    "   jne         1b                          \n\t"

    "2:                                         \n\t"

    /* Scale all accumulators by *alpha. */
    "   vbroadcastsd (%4), %%ymm0               \n\t"

    "   vmulpd      %%ymm0 , %%ymm4 , %%ymm4    \n\t"
    "   vmulpd      %%ymm0 , %%ymm5 , %%ymm5    \n\t"
    "   vmulpd      %%ymm0 , %%ymm6 , %%ymm6    \n\t"
    "   vmulpd      %%ymm0 , %%ymm7 , %%ymm7    \n\t"
    "   vmulpd      %%ymm0 , %%ymm8 , %%ymm8    \n\t"
    "   vmulpd      %%ymm0 , %%ymm9 , %%ymm9    \n\t"
    "   vmulpd      %%ymm0 , %%ymm10, %%ymm10   \n\t"
    "   vmulpd      %%ymm0 , %%ymm11, %%ymm11   \n\t"

    /* Un-rotate ymm4..ymm7 into the results for b[0..3]; store to C0..C3. */
    "   vpermpd     $0xb1 , %%ymm5 , %%ymm5     \n\t"
    "   vpermpd     $0xb1 , %%ymm7 , %%ymm7     \n\t"

    "   vblendpd    $0x0a , %%ymm5 , %%ymm4 , %%ymm0    \n\t"
    "   vblendpd    $0x05 , %%ymm5 , %%ymm4 , %%ymm1    \n\t"
    "   vblendpd    $0x0a , %%ymm7 , %%ymm6 , %%ymm2    \n\t"
    "   vblendpd    $0x05 , %%ymm7 , %%ymm6 , %%ymm3    \n\t"

    "   vpermpd     $0x1b , %%ymm2 , %%ymm2     \n\t"
    "   vpermpd     $0x1b , %%ymm3 , %%ymm3     \n\t"
    "   vpermpd     $0xb1 , %%ymm2 , %%ymm2     \n\t"
    "   vpermpd     $0xb1 , %%ymm3 , %%ymm3     \n\t"

    "   vblendpd    $0x03 , %%ymm0 , %%ymm2 , %%ymm4    \n\t"
    "   vblendpd    $0x03 , %%ymm1 , %%ymm3 , %%ymm5    \n\t"
    "   vblendpd    $0x03 , %%ymm2 , %%ymm0 , %%ymm6    \n\t"
    "   vblendpd    $0x03 , %%ymm3 , %%ymm1 , %%ymm7    \n\t"

    "   vmovups     %%ymm4 , (%5)               \n\t"
    "   vmovups     %%ymm5 , (%6)               \n\t"
    "   vmovups     %%ymm6 , (%7)               \n\t"
    "   vmovups     %%ymm7 , (%8)               \n\t"

    /* Same un-rotation for ymm8..ymm11 (b[4..7]); store to C4..C7. */
    "   vpermpd     $0xb1 , %%ymm9 , %%ymm9     \n\t"
    "   vpermpd     $0xb1 , %%ymm11, %%ymm11    \n\t"

    "   vblendpd    $0x0a , %%ymm9 , %%ymm8 , %%ymm0    \n\t"
    "   vblendpd    $0x05 , %%ymm9 , %%ymm8 , %%ymm1    \n\t"
    "   vblendpd    $0x0a , %%ymm11, %%ymm10, %%ymm2    \n\t"
    "   vblendpd    $0x05 , %%ymm11, %%ymm10, %%ymm3    \n\t"

    "   vpermpd     $0x1b , %%ymm2 , %%ymm2     \n\t"
    "   vpermpd     $0x1b , %%ymm3 , %%ymm3     \n\t"
    "   vpermpd     $0xb1 , %%ymm2 , %%ymm2     \n\t"
    "   vpermpd     $0xb1 , %%ymm3 , %%ymm3     \n\t"

    "   vblendpd    $0x03 , %%ymm0 , %%ymm2 , %%ymm4    \n\t"
    "   vblendpd    $0x03 , %%ymm1 , %%ymm3 , %%ymm5    \n\t"
    "   vblendpd    $0x03 , %%ymm2 , %%ymm0 , %%ymm6    \n\t"
    "   vblendpd    $0x03 , %%ymm3 , %%ymm1 , %%ymm7    \n\t"

    "   vmovups     %%ymm4 , (%9)               \n\t"
    "   vmovups     %%ymm5 , (%10)              \n\t"
    "   vmovups     %%ymm6 , (%11)              \n\t"
    "   vmovups     %%ymm7 , (%12)              \n\t"

    :
      /* The loop increments %0, so it must be a read-write operand;
         an input-only operand may not be modified by the asm. */
      "+a" (i)      // 0
    :
      "r" (temp1),  // 1
      "S" (a),      // 2
      "D" (b),      // 3
      "r" (alpha),  // 4
      "r" (C0),     // 5
      "r" (C1),     // 6
      "r" (C2),     // 7
      "r" (C3),     // 8
      "r" (C4),     // 9
      "r" (C5),     // 10
      "r" (C6),     // 11
      "r" (C7)      // 12
    : "cc",
      "%xmm0", "%xmm1", "%xmm2", "%xmm3",
      "%xmm4", "%xmm5", "%xmm6", "%xmm7",
      "%xmm8", "%xmm9", "%xmm10", "%xmm11",
      "%xmm12", "%xmm13", "%xmm14", "%xmm15",
      "memory"
    );
}

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) { BLASLONG i,j,k; FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb; FLOAT res0_0; FLOAT res0_1; FLOAT res0_2; FLOAT res0_3; FLOAT
res1_0; FLOAT res1_1; FLOAT res1_2; FLOAT res1_3; FLOAT res2_0; FLOAT res2_1; FLOAT res2_2; FLOAT res2_3; FLOAT res3_0; FLOAT res3_1; FLOAT res3_2; FLOAT res3_3; FLOAT res4_0; FLOAT res4_1; FLOAT res4_2; FLOAT res4_3; FLOAT res5_0; FLOAT res5_1; FLOAT res5_2; FLOAT res5_3; FLOAT res6_0; FLOAT res6_1; FLOAT res6_2; FLOAT res6_3; FLOAT res7_0; FLOAT res7_1; FLOAT res7_2; FLOAT res7_3; FLOAT a0; FLOAT a1; FLOAT b0; FLOAT b1; FLOAT b2; FLOAT b3; FLOAT b4; FLOAT b5; FLOAT b6; FLOAT b7; BLASLONG off, temp ; bool left; bool transposed; bool backwards; #ifdef LEFT left = true; #else left = false; #endif #ifdef TRANSA transposed = true; #else transposed = false; #endif backwards = left != transposed; if (!left) { off = -offset; } for (j=0; j