{% comment %}
// vim: set syntax=asm :

/* mmm 16 x 1

    zmm0

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5,
             and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" size:"16x1", suffix:suffix, G:G, arch:"avx512" %}

{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

{{L}}add_mat_mul:
    mov     rcx,    [rdi + 24]          // B
    mov     rax,    [rdi + 16]          // A

    mov     rbx,    [rdi + 8]           // k
    test    rbx,    rbx
    jz      {{L}}non_linear_loop

    cmp     rbx,    8
    jl      {{L}}main_loop_packed_packed_tail

{{align}} 16
// 4x unrolled k loop: partial sums are kept in zmm0-zmm3 and folded into zmm0 below.
{{L}}main_loop_packed_packed:
    {% include "1x1/packed_packed_loop1/unroll-4.tmpli" %}

    sub     rbx,    4
    cmp     rbx,    4
    jge     {{L}}main_loop_packed_packed

    {% for r in (1..3) %}
        vaddps  zmm0, zmm0, zmm{{r}}
    {% endfor %}

    test    rbx,    rbx
    jz      {{L}}non_linear_loop

{{align}} 16
// Tail loop: one k step per iteration, accumulating into zmm0.
{{L}}main_loop_packed_packed_tail:
    {% include "1x1/packed_packed_loop1/avx-512.tmpli" %}

    sub     rbx,    1
    jnz     {{L}}main_loop_packed_packed_tail

    jmp     {{L}}non_linear_loop

{% include "f32_scalars.tmpliq" from:0, to:0 %}
{% include "f32_per_rows.tmpliq" mr:16, from:0, to:0 %}
{% include "f32_per_cols.tmpliq" mr:16, from:0, to:0 %}

{{L}}add_unicast:
    mov     r10,    [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}add_unicast_generic

    vaddps  zmm0,   zmm0,   [r10]

    jmp     {{L}}non_linear_loop

{{L}}add_unicast_generic:
    // Generic (non-contiguous) row stride is not implemented: load from
    // address 0 so this path faults if ever reached. The gather-based
    // implementation below is left commented out.
    mov     r8,     [0]
//     mov     eax,    0
// {% for i in (0..3) %}
//     pinsrd  xmm14,  eax, {{i}}
//     add     eax,    esi
// {% endfor %}
// {% for i in (0..3) %}
//     pinsrd  xmm15,  eax, {{i}}
//     add     eax,    esi
// {% endfor %}
//
//     vperm2f128      zmm14,  zmm14, zmm15, 32    // zmm14 <- xmm14::xmm15
//
// {% for i in (0..7) %}
//     vpcmpeqd        zmm15,  zmm15,  zmm15
//     vgatherdps      zmm12,  [ r10 + zmm14 ],    zmm15
//
//     vaddps          zmm{{i}},   zmm{{i}},   zmm12
//     lea             r10,    [ r10 + rsi * 8 ]
// {% endfor %}
//
    jmp     {{L}}non_linear_loop

{{L}}add_row_col_products:
    mov     rax,    [ rdi + 8 ]         // row ptr
    mov     rbx,    [ rdi + 16 ]        // col ptr

    // Rank-1 update: broadcast the single column value, FMA with 16 row values.
    vbroadcastss    zmm14,  dword ptr [rbx]

{% for i in (0..0) %}
    vmovups         zmm12,  [rax + {{i|times:64}}]
    vfmadd231ps     zmm{{i}}, zmm12, zmm14
{% endfor %}
    jmp     {{L}}non_linear_loop

{{L}}store:
    mov     r8,     [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}store_noncontiguous

    test    r8,     63
    jnz     {{L}}store_unaligned

    vmovaps [r8],   zmm0

    jmp     {{L}}non_linear_loop

{{L}}store_unaligned:
    vmovups [r8],   zmm0

    jmp     {{L}}non_linear_loop

// Strided store: extract each 128-bit lane, then scatter its 4 floats one row at a time.
{{L}}store_noncontiguous:
{% for quarter in (0..3) %}
    vextractf32x4   xmm8,   zmm0,   {{quarter}}
    {% for row in (0..3) %}
        vextractps  dword ptr [r8], xmm8, {{row}}
        add         r8,     rsi
    {% endfor %}
{% endfor %}

    jmp     {{L}}non_linear_loop

{% include "postamble.tmpliq" size:"16x1", suffix:suffix, G:G, L:L, arch:"avx512" %}