// vim: ft=arm

// x20..x27 are used, callee-preserved

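// C tile regs: v16 to v31, (scratch)
// Each register holds 8 f16 lanes. One tile column is an even/odd register
// pair: the even register carries rows 0..7, the odd register rows 8..15.
//
//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
//      v16[1] v18[1]
//      v16[2] v18[2]
//      v16[3] v18[3]
//      ...    ...      (lanes 4..7 continue the same pattern)
//
//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
//      v17[1] v19[1]
//      v17[2] v19[2]
//      v17[3] v19[3]
//      ...    ...      (lanes 4..7 continue the same pattern)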

// v8 is used, d8 (lower half) must be preserved
// v0-v7 (scratch registers)
//  packed A buffering (2x8 values): alternating v0, v1 with v2, v3
//  packed B buffering (2x8 values): alternating v4, v5 with v6, v7

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
// Kernel entry point. The operator handlers further down read their fields
// through x0 (for instance [x0, #8] and [x0, #24]).
.global {{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}:

    // Push x20..x27 as four pairs. Each pre-indexed store moves sp down by
    // 16 bytes; the .return block pops them in the reverse order.
    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    // Save all 128 bits of v8 in a 16-byte slot of its own.
    str         q8, [sp, #-16]!

// Dispatch code comes from this include.
// NOTE(review): .non_linear_loop, the label every handler in this file
// branches back to, is not defined here and is presumably provided by this
// include -- confirm against dispatcher.tmpliq.
{% include "dispatcher.tmpliq" %}

// Matrix-product accumulation over k steps of packed A and packed B.
//   x3 = k         (loaded from [x0, #8])
//   x1 = packed A  (loaded from [x0, #16]); the initial load fetches 32 bytes
//   x2 = packed B  (loaded from [x0, #24]); the initial load fetches 16 bytes
// The per-step arithmetic lives in the included loop bodies, not in this file.
.add_mat_mul:
    ldr         x2, [x0, #24]       // b
    ldp         x3, x1, [x0, #8]    // k, a

    cbz         x3, .non_linear_loop    // k == 0: nothing to accumulate

.packed_packed:
    // Initial fetch: 16 halfwords of A into v0/v1 and 8 halfwords of B into v4,
    // post-incrementing both pointers. The .8h arrangement matches the halfword
    // lanes used for the C tile everywhere else in this file (the bytes loaded
    // are the same as with .4s on a little-endian target).
    ld1         { v0.8h, v1.8h }, [ x1 ], #32
    ld1         { v4.8h }, [ x2 ], #16

{% capture packed_packed_loop1 %}
    {% include "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli" %}
{% endcapture %}

{% capture packed_packed_loop2 %}
    {% if core == "a55" %}
        {% include "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli" %}
    {% else %}
        {{ packed_packed_loop1 }}
        {{ packed_packed_loop1 }}
    {% endif %}
{% endcapture %}

    // Fewer than four steps left: only the single-step loop runs.
    cmp         x3, #4
    blt         .packed_packed_loop_1

// Main loop: two copies of the two-step body, so x3 drops by four per pass.
.p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub x3, x3, #4
    cmp x3, #4
    bge .packed_packed_loop_4

    cbz x3, .non_linear_loop            // no remainder steps

// Remainder loop: one step per pass until x3 reaches zero.
.p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}
    subs        x3, x3, #1
    bne .packed_packed_loop_1

    b .non_linear_loop

// Shared f16 operator templates, instantiated over the accumulator registers
// v16..v31 (from:16, to:31); mr:16 matches the 16-row tile.
// NOTE(review): judging by the include names these emit the scalar, per-row
// and per-column operator handlers -- confirm against the templates.
{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:16, from:16, to:31 %}
{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:16, from:16, to:31 %}

// Element-wise add of a strided external f16 matrix into the C tile.
//   x5 = source pointer for the current tile column   (from [x0, #8])
//   x6 = byte step applied after every element load   (from [x0, #16])
//   x7 = byte step applied after every tile column    (from [x0, #24])
//   x8 = loaded from [x0, #32], not used by this handler
// Each group of 8 elements is gathered lane by lane into v0, then added to
// one accumulator register.
.add_unicast:
    ldp         x5, x6, [x0, #8]
    ldp         x7, x8, [x0, #24]

    {% for pair in (0..7) %}
        {% assign unicast_even = pair | times: 2 | plus: 16 %}
        mov x4, x5
        {% for half in (0..1) %}
            {% assign unicast_acc = unicast_even | plus: half %}
            {% for h in (0..7) %}
                ld1 {v0.h}[{{h}}], [ x4 ], x6
            {% endfor %}
            fadd v{{unicast_acc}}.8h, v{{unicast_acc}}.8h, v0.8h
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Rank-1 update of the C tile: C[r][c] += rows[r] * cols[c].
//   x2 = pointer to 16 f16 row factors    (from [x0, #8])  -> v0 (rows 0..7), v1 (rows 8..15)
//   x3 = pointer to 8 f16 column factors  (from [x0, #16]) -> v4, one lane per tile column
.add_row_col_products:
    ldr     x2, [x0, #8]
    ldr     x3, [x0, #16]

    // Load with the .8h arrangement so the lanes line up with the halfword
    // fmla operands below (same bytes as a .4s load on a little-endian target).
    ld1         { v0.8h, v1.8h }, [ x2 ], #32
    ld1         { v4.8h }, [ x3 ], #16

    // Tile column c lives in the register pair v(16 + 2c), v(17 + 2c) and is
    // scaled by lane c of v4.
    {% for col in (0..7) %}
    fmla        v{{col | times: 2 | plus: 16}}.8h, v0.8h, v4.h[{{col}}]
    fmla        v{{col | times: 2 | plus: 17}}.8h, v1.8h, v4.h[{{col}}]
    {% endfor %}

    b           .non_linear_loop

// Write the C tile (v16..v31) to memory.
//   x5 = destination pointer for the current tile column
//   x6 = row stride in bytes (rsc)
//   x7 = column stride in bytes (csc)
//   x8 = item_size, loaded but not used by this handler
// When rsc is 2 bytes, a column is 16 consecutive halfwords and goes out as
// two 128-bit stores; otherwise every lane is stored on its own.
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    ldp         x7, x8, [x0, #24]           // csc, item_size

    cmp         x6, #2
    bne         .store_strides_generic

    {% for pair in (0..7) %}
        {% assign store_lo = pair | times: 2 | plus: 16 %}
        {% assign store_hi = store_lo | plus: 1 %}
        str q{{store_lo}}, [ x5 ]
        str q{{store_hi}}, [ x5, #16 ]
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Generic strides: x4 walks down one column, advancing by rsc after each lane.
.store_strides_generic:

    {% for pair in (0..7) %}
        {% assign store_even = pair | times: 2 | plus: 16 %}
        mov x4, x5
        {% for half in (0..1) %}
            {% assign store_src = store_even | plus: half %}
            {% for h in (0..7) %}
                st1 { v{{store_src}}.h }[{{h}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Epilogue: pop what the prologue pushed, in reverse order, then return.
.return:
    ldr         q8, [sp], #16               // v8 was pushed last, so it is popped first

    // x20..x27 come back pair by pair; each post-indexed load moves sp up by 16 bytes.
    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret