{% comment %}
// vim: set syntax=asm :

/* mmm 8 x 8, f16 in memory, f32 in registers.

    Accumulator tile: one ymm register per column of C, lane index = row.

    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7

    Scratch vector registers used directly in this file: ymm8, ymm12, ymm14.
    (The included templates may use more.)

    Every section below reads its arguments from the record pointed to by rdi
    and ends by jumping back to the non_linear_loop dispatcher, which is
    expected to come from the preamble include (not visible in this file).

    All vector instructions in this file are VEX-encoded on purpose: mixing
    legacy SSE encodings (pinsrw / pextrw) with 256-bit AVX code while the
    upper ymm halves are live costs state-transition penalties or false
    dependencies, depending on the micro-architecture.

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" type:"f16", size:"8x8", suffix:suffix, G:G %}

{% comment %} clear: zero every ymm register, accumulators included. {% endcomment %}
{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

{% comment %}
add_mat_mul: accumulate k steps of the packed A x packed B product.
Does nothing when k is zero.
{% endcomment %}
{{L}}add_mat_mul:
    mov     rbx,    [rdi + 24]   // B
    mov     rax,    [rdi + 16]   // A

    mov     rcx,    [rdi + 8]    // k
    test    rcx,    rcx
    jz      {{L}}non_linear_loop

{{L}}main_loop_packed_packed:
    {% include "8x8/packed_packed_loop1/f16-avx.tmpli" %}

    // advance both panels by 16 bytes, i.e. 8 f16 values, per k step
    add     rax,    16
    add     rbx,    16
    dec     rcx
    jnz     {{L}}main_loop_packed_packed

    jmp     {{L}}non_linear_loop

// NON LINEAR / ADDC

{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f16" %}
{% include "fma_mmm_f32_per_rows.tmpliq" mr:8, from:0, to:7, type:"f16" %}
{% include "fma_mmm_f32_per_cols.tmpliq" mr:8, from:0, to:7, type:"f16" %}

{% comment %}
add_unicast: add an 8x8 f16 tile read from memory to the accumulators.
Each column is gathered one f16 at a time (the row stride is arbitrary),
widened to f32 and added to the matching accumulator.
{% endcomment %}
{{L}}add_unicast:
    mov     r8,     [rdi + 8]    // c ptr
    mov     rsi,    [rdi + 16]   // row stride
    mov     rdx,    [rdi + 24]   // col stride

{% for col in (0..7) %}
    mov     r10,    r8           // r10 walks down the current column
    {% for row in (0..7) %}
        vpinsrw     xmm8, xmm8, word ptr [r10], {{row}}
        add         r10, rsi
    {% endfor %}
    vcvtph2ps   ymm8, xmm8       // 8 x f16 -> 8 x f32
    vaddps      ymm{{col}}, ymm{{col}}, ymm8
    add         r8, rdx          // next column
{% endfor %}

    jmp     {{L}}non_linear_loop

{% comment %}
add_row_col_products: rank-1 update of the accumulators.
The 8 f16 values at the first pointer are multiplied, column by column,
by each of the 8 f16 values at the second pointer.
{% endcomment %}
{{L}}add_row_col_products:
    mov     rax,    [ rdi + 8 ]
    mov     rbx,    [ rdi + 16 ]

    vmovups     xmm12, [rax]     // 8 x f16, one per row
    vcvtph2ps   ymm12, xmm12

{% for i in (0..7) %}
    // only lane 0 matters here: it is the one that gets broadcast
    vpinsrw         xmm14, xmm14, word ptr [rbx + {{i|times:2}} ], 0
    vcvtph2ps       ymm14, xmm14
    vbroadcastss    ymm14, xmm14
    vfmadd231ps     ymm{{i}}, ymm12, ymm14
{% endfor %}
    jmp     {{L}}non_linear_loop

{% comment %}
store: narrow each accumulator column back to f16 and scatter it to memory,
one element at a time, using the row and column strides.
{% endcomment %}
{{L}}store:
    mov     r8,     [rdi + 8]    // c ptr
    mov     rsi,    [rdi + 16]   // row stride
    mov     rdx,    [rdi + 24]   // col stride

{% for col in (0..7) %}
    mov     r10,    r8           // r10 walks down the current column
    vcvtps2ph   xmm8, ymm{{col}}, 0  // imm8 = 0: round to nearest even
    {% for row in (0..7) %}
        vpextrw     word ptr [r10], xmm8, {{row}}
        add         r10, rsi
    {% endfor %}
    add         r8, rdx          // next column
{% endfor %}

    jmp     {{L}}non_linear_loop

{% include "postamble.tmpliq" type:"f16", size:"8x8", suffix:suffix, G:G, L:L %}