{% comment %}
// vim: set syntax=asm :

/* mmm 64 x 1

    ymm0
    ymm1
    ...
    ymm7

    Accumulator tile: 64 rows x 1 column of f32, held in the eight 8-lane
    registers ymm0..ymm7 (every loop in this file iterates over registers 0..7).
    Vector scratch registers used by this file: ymm9, ymm12, ymm14, ymm15.

    Each section below reads its operands from the structure pointed to by rdi
    (fields at rdi + 8, rdi + 16, rdi + 24) and finishes by jumping to the
    non_linear_loop label, which is not defined in this file (presumably it
    comes from the included preamble; TODO confirm).

    NOTE(review): rbx is callee-saved under both ABIs listed below and is
    overwritten here without being saved, and the Windows ABI does not pass
    arguments in rdi. The included preamble/postamble are presumably
    responsible for both; confirm against those templates.

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G %}

// ---------------------------------------------------------------------------
// clear: zero every vector register, hence the whole accumulator tile.
// ---------------------------------------------------------------------------
{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// add_mat_mul: accumulate k steps of the product into ymm0..ymm7.
//   rax = A, rcx = B, rbx = k (remaining step count)
// An odd k is handled by one single-step body first, so that the main loop
// can always retire two steps per iteration.
// ---------------------------------------------------------------------------
{{L}}add_mat_mul:
    mov     rcx,    [rdi + 24]          // B
    mov     rax,    [rdi + 16]          // A

    mov     rbx,    [rdi + 8]           // k
    test    rbx,    rbx
    jz      {{L}}non_linear_loop        // k == 0: nothing to accumulate

    test    rbx,    1
    jz      {{L}}main_loop_packed_packed // k even: go straight to the unrolled loop

    {% include "8x1/packed_packed_loop1/avx.tmpli" %}

    dec     rbx
    jz      {{L}}non_linear_loop        // k was 1

{{align}} 16
{{L}}main_loop_packed_packed:
    {% include "8x1/packed_packed_loop1/avx-unroll.tmpli" %}

    sub     rbx,    2                   // rbx is even and non-zero on entry
    jnz     {{L}}main_loop_packed_packed

    jmp     {{L}}non_linear_loop

{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %}
{% include "fma_mmm_f32_per_rows.tmpliq" mr:64, from:0, to:7, type:"f32" %}
{% include "fma_mmm_f32_per_cols.tmpliq" mr:64, from:0, to:7, type:"f32" %}

// ---------------------------------------------------------------------------
// add_unicast: add 64 f32 values read from memory to the accumulators.
//   r10 = source pointer, rsi = row stride
// A stride of 4 (one f32, so presumably expressed in bytes) means the values
// are contiguous and can be added with eight plain 32-byte memory operands.
// ---------------------------------------------------------------------------
{{L}}add_unicast:
    mov     r10,    [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}add_unicast_generic

{% for row in (0..7) %}
    vaddps  ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ]
{% endfor %}

    jmp     {{L}}non_linear_loop

// Strided variant: gather 8 values at a time.
// ymm14 receives the offsets { 0, s, 2s, ..., 7s } with s = esi. The offsets
// are built with 32-bit arithmetic, so this assumes 7 * stride fits in a
// signed 32-bit value (TODO confirm with callers).
// VEX-encoded vpinsrd is used instead of legacy pinsrd to avoid SSE/AVX
// transition penalties while the upper ymm halves are live; only the low
// 128 bits of ymm14 and ymm15 are consumed by vperm2f128 below.
{{L}}add_unicast_generic:
    xor     eax,    eax
{% for i in (0..3) %}
    vpinsrd xmm14,  xmm14,  eax,    {{i}}
    add     eax,    esi
{% endfor %}
{% for i in (0..3) %}
    vpinsrd xmm15,  xmm15,  eax,    {{i}}
    add     eax,    esi
{% endfor %}

    vperm2f128      ymm14,  ymm14,  ymm15,  32  // ymm14 <- xmm14::xmm15

// The gather clears its mask register when it completes, so the all-ones mask
// in ymm15 is rebuilt before each gather. r10 then advances by 8 rows.
{% for i in (0..7) %}
    vpcmpeqd        ymm15,  ymm15,  ymm15
    vgatherdps      ymm12,  [ r10 + ymm14 ],    ymm15

    vaddps          ymm{{i}},   ymm{{i}},   ymm12
    lea             r10,    [ r10 + rsi * 8 ]
{% endfor %}

    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// add_row_col_products: acc[i] += v[i] * c for the 64 lanes.
//   rax = pointer to 64 contiguous f32 (read with unaligned loads)
//   rbx = pointer to one f32, broadcast to every lane of ymm14
// ---------------------------------------------------------------------------
{{L}}add_row_col_products:
    mov             rax,    [ rdi + 8 ]
    mov             rbx,    [ rdi + 16 ]

    vbroadcastss    ymm14,  dword ptr [rbx]

{% for i in (0..7) %}
    vmovups         ymm12,  [rax + {{i|times:32}}]
    vfmadd231ps     ymm{{i}},   ymm12,  ymm14
{% endfor %}

    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// store: write the 64 accumulated f32 values to memory.
//   r8 = destination pointer, rsi = row stride
// A stride of 4 selects the contiguous path (eight unaligned 32-byte stores).
// ---------------------------------------------------------------------------
{{L}}store:
    mov     r8,     [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}store_generic

{% for row in (0..7) %}
    vmovups [r8 + {{row|times:32}}], ymm{{row}}
{% endfor %}

    jmp     {{L}}non_linear_loop

// Strided variant: each accumulator is split into its two 128-bit halves,
// staged in xmm9, and written out one lane at a time, with r8 advanced by the
// row stride after every element.
// vmovaps (VEX) replaces legacy movaps to avoid SSE/AVX transition penalties;
// only xmm9 is read afterwards, so the stored values are identical.
{{L}}store_generic:
{% for vec in (0..7) %}
    {% for half in (0..1) %}
        {% if half == 0 %}
            vmovaps     xmm9,   xmm{{vec}}
        {% else %}
            vperm2f128  ymm9,   ymm{{vec}}, ymm{{vec}}, 1   // xmm9 <- high half
        {% endif %}
        {% for row in (0..3) %}
            vextractps  dword ptr [r8], xmm9,   {{row}}
            add         r8,     rsi
        {% endfor %}
    {% endfor %}
{% endfor %}

    jmp     {{L}}non_linear_loop

{% include "postamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G, L:L %}