// vim: ft=arm

// C tile regs: v8 to v31 (6 columns x 4 registers of 8 f16 lanes each)

// v0-v7 and v16-v31 need no preservation
// v8..v15 are callee-preserved (low 64 bits): saved in the prologue, restored at .return
// packed A (32 values per k step): v0, v1, v2, v3
// packed B (6 values per k step): v7, lanes 0 to 5

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
// Kernel entry point. The handlers below all read their operands through x0,
// as pairs of 8-byte fields at offsets 8 and 24.
// NOTE(review): the layout behind x0 is defined by the caller and by
// dispatcher.tmpliq, neither of which is visible in this template.
.global {{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_32x6_{{core}}_{{suffix}}:

    // Prologue: seven 16-byte pushes (112 bytes), so sp stays 16-byte aligned.
    // x20..x25 are callee-saved. The code visible in this template never
    // touches them; presumably the included templates do -- confirm.
    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!

    // d8..d15 are the callee-saved low halves of v8..v15, which this kernel
    // uses as accumulators.
    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

// No branch follows the prologue: whatever dispatcher.tmpliq expands to runs next.
{% include "dispatcher.tmpliq" %}

// Accumulate the product of packed A and packed B into the v8..v31 tile.
// The accumulator for row group r (0..3, 8 rows each) and column c (0..5)
// is register v(8 + 4*c + r).
.add_mat_mul:
    ldp         x2, x4, [x0, #24]   // b, packing (x4 is not used in this block)
    ldp         x3, x1, [x0, #8]    // k, a

    // k == 0: nothing to accumulate
    cmp         x3, #0
    beq         .non_linear_loop

.p2align 4
.packed_packed_loop_1:
    // One k step. v7 receives 8 halfwords of B but only lanes 0..5 are used,
    // and x2 advances by 12 bytes (6 f16). v0..v3 receive 32 f16 of A.
    // NOTE(review): the 16-byte load reads 4 bytes beyond the 12 consumed, so
    // the last k step touches memory past the final 6 values of B --
    // presumably the packed buffer is padded; confirm.
    ld1         { v7.8h }, [ x2 ]
    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
    add         x2, x2, 12

// For every row group and column: acc += A[row group] * B[column] (by lane).
// The C-style block comment inside the loop holds disabled code: its template
// loop still expands, but only into assembler comment text.
{% for row in (0..3) %}
    {% for col in (0..5) %}
        fmla        v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
    {% endfor %}
    /*
    {% for col in (0..1) %}
        fmla        v{{ col|plus:4|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v6.h[{{col}}]
    {% endfor %}
    */
{% endfor %}

    // x3 counts the remaining k steps
    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

// Expand the scalar, per-row and per-column handlers from their templates.
// from:8 / to:31 is the same v8..v31 register range used by the fmla
// accumulators in this file; mr:32 matches the 32-row tile height.
// NOTE(review): parameter meaning is inferred from the values passed; confirm
// against the included templates.
{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:8, to:31%}
{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:32, from:8, to:31%}
{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:32, from:8, to:31%}

// Add a strided f16 matrix to the tile, gathering one element at a time.
// x5 = base pointer of the current column, x6 = byte step between consecutive
// rows, x7 = byte step between consecutive columns. x8 is loaded but not used
// in this block.
.add_unicast:
    ldp         x5, x6, [x0, #8]
    ldp         x7, x8, [x0, #24]

    // Per column: walk 32 rows with x4, filling v0 eight lanes at a time and
    // adding it to the matching accumulator, then step x5 to the next column.
    {% for col in (0..5) %}
        mov x4, x5
        {% for reg in (0..3) %}
            {% for lane in (0..7) %}
                ld1 {v0.h}[{{lane}}], [ x4 ], x6
            {% endfor %}
            fadd v{{col | times:4 | plus: 8| plus: reg}}.8h, v{{col | times:4 | plus: 8 | plus: reg}}.8h, v0.8h
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Loads 64 f16 from x5 into v0..v7 and adds them to v24..v31 only, that is
// the accumulators of columns 4 and 5 (v0..v3 to column 4, v4..v7 to column 5).
// NOTE(review): x5 is not set in this block, v8..v23 are left untouched, and
// no branch to this label is visible in this template. This looks like
// leftover code from a kernel with a different tile layout -- confirm that it
// is unused before relying on it.
.do_per_row_add:
    ld1     {v0.8h-v3.8h}, [x5], #64
    ld1     {v4.8h-v7.8h}, [x5], #64

    {% for r in (0..7) %}
        fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h
    {% endfor %}

    b           .non_linear_loop

// Rank-1 update of the tile: acc(row, col) += rows[row] * cols[col].
// x2 points to 32 f16 (one per tile row), x3 to 6 f16 (one per tile column).
.add_row_col_products:
    ldp         x2, x3, [x0, #8]

    // Load exactly 12 bytes into v7: 8 bytes into halfword lanes 0..3, then
    // 4 bytes into lanes 4..5. Lanes 6..7 keep their old content and are
    // never read by the fmla below.
    ld1         { v7.d }[0], [ x3 ], #8
    ld1         { v7.s }[2], [ x3 ], #4
    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [ x2 ], #64

// Same accumulator mapping as the matmul loop: register v(8 + 4*col + row).
{% for row in (0..3) %}
    {% for col in (0..5) %}
        fmla        v{{ col|times:4|plus:8|plus:row}}.8h, v{{row}}.8h, v7.h[{{col}}]
    {% endfor %}
{% endfor %}

    b           .non_linear_loop

// Write the v8..v31 tile to memory.
// x8 (item_size) is loaded but not used in this block.
.store:
    ldp         x5, x6, [x0, #8]                // c base ptr, rsc
    ldp         x7, x8, [x0, #24]               // csc, item_size

    // A row stride of 2 bytes means the f16 values of a column are adjacent
    // in memory, so whole registers can be stored at once.
    cmp         x6, #2
    beq         .store_strides_contig

    // Generic strides: scatter one f16 lane at a time. x4 walks down a column
    // by x6 bytes per element, x5 moves to the next column by x7 bytes.
    {% for col in (0..5) %}
        mov x4, x5
        {% for reg in (0..3) %}
            {% for lane in (0..7) %}
                st1 { v{{col | times:4 | plus: 8 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}
    b           .non_linear_loop

// Store path for a row stride of 2 bytes: each 32-element column of the tile
// is 64 consecutive bytes in memory.
// x5 = base pointer of the current column, x7 = byte stride between columns.
.store_strides_contig:

    {% for col in (0..5) %}
        mov x4, x5
        // The four registers of a column are numbered consecutively, so a
        // single multi-register st1 writes them back to back and leaves x4
        // advanced by 64 bytes.
        st1 { v{{col | times:4 | plus: 8}}.8h, v{{col | times:4 | plus: 9}}.8h, v{{col | times:4 | plus: 10}}.8h, v{{col | times:4 | plus: 11}}.8h }, [ x4 ], #64
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Epilogue: pop the registers saved at entry in exact reverse order of the
// prologue pushes, then return to the caller.
.return:

    // d8..d15 were pushed last, so they come off the stack first
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret