{% comment %}
// vim: set syntax=asm :

/* mmm 128 x 1

    Accumulator tile: 128 rows x 1 column of f32, held as eight 16-lane
    vectors:

    zmm0
    zmm1
    ...
    zmm7

    Scratch vector registers used by this file: xmm8, zmm12, zmm14
    (the included templates may use others).

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" size:"128x1", suffix:suffix, G:G, arch:"avx512" %}

// Every section below reads its arguments from the record at rdi and then
// jumps back to {{L}}non_linear_loop, which is defined outside this file
// (presumably by the preamble -- confirm there).

// ---------------------------------------------------------------------------
// clear: zero the accumulators (vzeroall clears zmm0-zmm15).
// ---------------------------------------------------------------------------
{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// add_mat_mul: accumulate k steps of the packed A x packed B product.
//   [rdi +  8]  k  (iteration count)
//   [rdi + 16]  A
//   [rdi + 24]  B
// rbx is callee-saved in both ABIs; this code assumes the preamble saved it
// (TODO confirm in preamble.tmpliq).
// ---------------------------------------------------------------------------
{{L}}add_mat_mul:
    mov     rcx,    [rdi + 24]   // B
    mov     rax,    [rdi + 16]   // A

    mov     rbx,    [rdi + 8]    // k
    test    rbx,    rbx
    jz      {{L}}non_linear_loop // k == 0: nothing to accumulate

{{align}} 16
{{L}}main_loop_packed_packed:
    {% include "8x1/packed_packed_loop1/avx-512.tmpli" %}

    sub     rbx,    1
    jnz     {{L}}main_loop_packed_packed

    jmp     {{L}}non_linear_loop

{% include "f32_scalars.tmpliq" from:0, to:7 %}
{% include "f32_per_rows.tmpliq" mr:128, from:0, to:7 %}
{% include "f32_per_cols.tmpliq" mr:128, from:0, to:7 %}

// ---------------------------------------------------------------------------
// add_unicast: accumulators += the 128x1 tile found at the c pointer.
//   [rdi +  8]  c ptr
//   [rdi + 16]  row stride (a stride of 4 is treated as contiguous f32 rows,
//               the same convention the store section uses)
// Contiguous tiles are added straight from memory; any other stride goes
// through the element-by-element path below.
// ---------------------------------------------------------------------------
{{L}}add_unicast:
    mov     r10,    [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}add_unicast_noncontiguous

    {% for row in (0..7) %}
        vaddps  zmm{{row}}, zmm{{row}}, [ r10 + {{row|times:64}} ]
    {% endfor %}

    jmp     {{L}}non_linear_loop

// Strided variant. Walks r10 forward by rsi after every element, in the same
// order store_noncontiguous writes them. Four elements are gathered into xmm8,
// four such quarters are assembled into zmm12, and the result is added to the
// matching accumulator. All four quarters of zmm12 are overwritten before each
// vaddps, so its previous content never reaches the result.
{{L}}add_unicast_noncontiguous:
    {% for r in (0..7) %}
        {% for quarter in (0..3) %}
            vmovss      xmm8, dword ptr [r10]
            add         r10, rsi
            {% for lane in (1..3) %}
                vinsertps   xmm8, xmm8, dword ptr [r10], {{lane|times:16}}
                add         r10, rsi
            {% endfor %}
            vinsertf32x4    zmm12, zmm12, xmm8, {{quarter}}
        {% endfor %}
        vaddps  zmm{{r}}, zmm{{r}}, zmm12
    {% endfor %}

    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// add_row_col_products: accumulators += rows[0..128] * cols[0].
//   [rdi +  8]  pointer to 128 contiguous f32 (loaded 16 at a time)
//   [rdi + 16]  pointer to the single f32 that is broadcast to every lane
// ---------------------------------------------------------------------------
{{L}}add_row_col_products:
    mov     rax,    [ rdi + 8 ]
    mov     rbx,    [ rdi + 16 ]

    vbroadcastss    zmm14, dword ptr [rbx]

    {% for i in (0..7) %}
        vmovups         zmm12,  [rax + {{i|times:64}}]
        vfmadd231ps     zmm{{i}}, zmm12, zmm14
    {% endfor %}

    jmp     {{L}}non_linear_loop

// ---------------------------------------------------------------------------
// store: write the accumulators to the tile at the c pointer.
//   [rdi +  8]  c ptr
//   [rdi + 16]  row stride
// Three paths: contiguous and 64-byte aligned (vmovaps), contiguous and
// unaligned (vmovups), and strided (one element at a time).
// ---------------------------------------------------------------------------
{{L}}store:
    mov     r8,     [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}store_noncontiguous

    test    r8,     63                  // low six bits set => not 64-byte aligned
    jnz     {{L}}store_unaligned

    {% for row in (0..7) %}
        vmovaps [r8 + {{row|times:64}}], zmm{{row}}
    {% endfor %}

    jmp     {{L}}non_linear_loop

{{L}}store_unaligned:
    {% for row in (0..7) %}
        vmovups [r8 + {{row|times:64}}], zmm{{row}}
    {% endfor %}

    jmp     {{L}}non_linear_loop

// Strided store: peel each accumulator into four 128-bit quarters (xmm8), then
// write the four lanes of each quarter individually, advancing r8 by the row
// stride after every element.
{{L}}store_noncontiguous:
    {% for r in (0..7) %}
        {% for quarter in (0..3) %}
            vextractf32x4   xmm8, zmm{{r}}, {{quarter}}
            {% for row in (0..3) %}
                vextractps  dword ptr [r8], xmm8, {{row}}
                add         r8, rsi
            {% endfor %}
        {% endfor %}
    {% endfor %}

    jmp     {{L}}non_linear_loop

{% include "postamble.tmpliq" size:"128x1", suffix:suffix, G:G, L:L, arch:"avx512" %}