{% comment %}
// vim: set syntax=asm :

/* mmm 64 x 1

    ymm0
    ymm1
    ...
    ymm7

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G %}

// clear: reset the vector register file, then return to the dispatcher.
// vzeroall zeroes every ymm register, which includes the ymm0..ymm7
// registers that the other sections of this kernel accumulate into.
{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

// add_mat_mul: run the packed A x packed B inner-product loop k times.
//   [rdi + 8]  -> rbx : k, the remaining iteration count
//   [rdi + 16] -> rax : A
//   [rdi + 24] -> rcx : B
// The loop bodies come from the included templates; presumably they read
// rax/rcx and accumulate into ymm0..ymm7 - confirm against the includes.
// NOTE(review): rbx is listed as callee-saved in the ABI notes at the top of
// this file; presumably the preamble saves it - confirm.
{{L}}add_mat_mul:
    mov     rcx,    [rdi + 24]   // rcx = B
    mov     rax,    [rdi + 16]   // rax = A

    mov     rbx,    [rdi + 8]    // rbx = k
    test    rbx,    rbx
    jz      {{L}}non_linear_loop // k == 0: nothing to accumulate

	// Odd k: peel one single-step iteration so that the count left for the
	// main loop is even (the main loop decrements rbx by 2 per pass).
	test rbx, 1
	jz {{L}}main_loop_packed_packed
	{% include "8x1/packed_packed_loop1/avx.tmpli" %}

    dec             rbx
    jz              {{L}}non_linear_loop // k was exactly 1

{{align}} 16
{{L}}main_loop_packed_packed:
	// Invariant on entry: rbx is even and non-zero.
	// Presumably the unrolled include performs two k steps - confirm.
	{% include "8x1/packed_packed_loop1/avx-unroll.tmpli" %}

    sub             rbx, 2
    jnz             {{L}}main_loop_packed_packed

    jmp             {{L}}non_linear_loop

{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:7, type:"f32" %}
{% include "fma_mmm_f32_per_rows.tmpliq" mr:64, from:0, to:7, type:"f32" %}
{% include "fma_mmm_f32_per_cols.tmpliq" mr:64, from:0, to:7, type:"f32" %}

// add_unicast: add a 64-element f32 column read from memory to the
// accumulators ymm0..ymm7.
//   [rdi + 8]  -> r10 : pointer to the first element
//   [rdi + 16] -> rsi : row stride (byte distance between consecutive rows)
// A stride of 4 means the elements are contiguous f32 values, so each
// accumulator takes one 32-byte memory operand. Any other stride is handled
// by add_unicast_generic, which expects r10 and rsi as loaded here.
{{L}}add_unicast:
    mov     r10,    [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride

    cmp     rsi,    4
    jne     {{L}}add_unicast_generic

    // Contiguous case: ymm{row} += 8 floats at r10 + 32 * row.
    {% for row in (0..7) %}
        vaddps ymm{{row}}, ymm{{row}}, [ r10 + {{row|times:32}} ]
    {% endfor %}
    jmp    {{L}}non_linear_loop

// add_unicast_generic: strided variant of add_unicast.
// In:  r10 = pointer to the first element, rsi = row stride in bytes.
// Uses eax, xmm14/ymm14 (gather offsets), ymm15 (gather mask) and ymm12
// (gathered values) as scratch.
//
// The offset vector is built with VEX-encoded vpinsrd rather than legacy
// pinsrd: the ymm accumulators have live upper halves here, and legacy SSE
// encodings in that state cost an SSE/AVX transition. vpinsrd zeroes the
// upper half of its destination ymm, which is harmless because vperm2f128
// below only consumes the low 128 bits of ymm14 and ymm15.
{{L}}add_unicast_generic:
    xor     eax,    eax                 // running byte offset = 0
{% for i in (0..3) %}
    vpinsrd xmm14,  xmm14, eax, {{i}}   // offsets for rows 0..3
    add     eax,    esi
{% endfor %}
{% for i in (0..3) %}
    vpinsrd xmm15,  xmm15, eax, {{i}}   // offsets for rows 4..7
    add     eax,    esi
{% endfor %}

    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15

{% for i in (0..7) %}
    // The gather clears its mask register as it completes, so the all-ones
    // mask has to be rebuilt before every gather.
    vpcmpeqd        ymm15,  ymm15, ymm15
    vgatherdps      ymm12,  [ r10 + ymm14 ], ymm15

    vaddps          ymm{{i}},   ymm{{i}},   ymm12
    lea             r10, [ r10 + rsi * 8 ]   // advance by 8 rows
{% endfor %}

    jmp    {{L}}non_linear_loop

// add_row_col_products: multiply a 64-float vector by one broadcast scalar
// and accumulate into ymm0..ymm7.
//   [rdi + 16] -> rbx : address of the single f32 that is broadcast
//   [rdi + 8]  -> rax : address of the 64 floats (read unaligned, 8 at a time)
{{L}}add_row_col_products:
    mov             rbx, [ rdi + 16 ]
    mov             rax, [ rdi + 8 ]

    vbroadcastss    ymm14, dword ptr [rbx]  // ymm14 = scalar in all 8 lanes

{% for acc in (0..7) %}
    // ymm{{acc}} += vector[8 * {{acc}} .. 8 * {{acc}} + 7] * scalar
    vmovups         ymm12,  [rax + {{acc|times:32}}]
    vfmadd231ps     ymm{{acc}}, ymm12, ymm14
{% endfor %}
    jmp    {{L}}non_linear_loop

// store: write the accumulators ymm0..ymm7 out to memory.
//   [rdi + 16] -> rsi : row stride (byte distance between consecutive rows)
//   [rdi + 8]  -> r8  : destination pointer
// With a stride of 4 the 64 floats are contiguous and go out as eight
// unaligned 32-byte stores; any other stride goes to store_generic, which
// relies on r8 and rsi as loaded here.
{{L}}store:
    mov     rsi,    [rdi + 16]          // row stride
    mov     r8,     [rdi + 8]           // c ptr

    cmp     rsi,    4
    jne     {{L}}store_generic

    {% for acc in (0..7) %}
        vmovups [r8 + {{acc|times:32}}], ymm{{acc}}
    {% endfor %}

    jmp     {{L}}non_linear_loop

// store_generic: strided variant of store.
// In:  r8 = destination pointer, rsi = row stride in bytes.
// Each accumulator is written one float at a time: its low 128-bit half is
// copied into xmm9, the four lanes are stored with r8 advancing by rsi after
// each one, and the same is then done for the high half.
//
// The low-half copy uses VEX-encoded vmovaps rather than legacy movaps: the
// ymm accumulators have live upper halves here, and legacy SSE encodings in
// that state cost an SSE/AVX transition. Only xmm9 is read afterwards, so
// the upper half of ymm9 being zeroed is harmless.
{{L}}store_generic:

    {% for vec in (0..7) %}
        {% for half in (0..1) %}
            {% if half == 0 %}
                vmovaps xmm9, xmm{{vec}}
            {% else %}
                // xmm9 <- high 128 bits of ymm{{vec}}
                vperm2f128 ymm9, ymm{{vec}}, ymm{{vec}}, 1
            {% endif %}
            {% for row in (0..3) %}
                vextractps  dword ptr [r8], xmm9, {{row}}
                add         r8, rsi
            {% endfor %}
        {% endfor %}
    {% endfor %}

    jmp    {{L}}non_linear_loop


{% include "postamble.tmpliq" type:"f32", size:"64x1", suffix:suffix, G:G, L:L %}