// vim: ft=arm

// C tile regs: v16 to v31, no need to preserve
// no preservation either for v0-v7...
// v8..v15 are callee-preserved
// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
// packed B buffering (2x8 values): alternating v4, v5 with v6, v7

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
.global {{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}:

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

.add_mat_mul:
    ldr         x2, [x0, #24]       // b
    ldp         x3, x1, [x0, #8]    // k, a

    cmp         x3, #0
    beq         .non_linear_loop

    sub         x3, x3, #1
    ld1         { v8.h }[0], [ x2 ], #2
    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
    cmp         x3, #0
    beq         .packed_packed_loop_1_last

    cmp         x3, #4
    blt         .packed_packed_loop_1

{% capture packed_packed_loop1 %}
    {% include "arm64fp16_mmm_f16_128x1/loop1/naive.tmpli" %}
{% endcapture %}

{% capture packed_packed_loop2 %}
    {% include "arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli" %}
{% endcapture %}

    .p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cmp         x3, #0
    beq         .packed_packed_loop_1_last

    .p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

// last loop can't read beyond actual input as it's likely not packed and padded
.packed_packed_loop_1_last:
    ld1         { v9.8h, v10.8h, v11.8h, v12.8h }, [x1], #64
    ld1         { v13.8h, v14.8h, v15.8h }, [x1], #48

    fmla        v16.8h, v0.8h, v8.h[0]
    fmla        v17.8h, v1.8h, v8.h[0]
    ld1         { v0.8h }, [ x1 ]
    fmla        v18.8h, v2.8h, v8.h[0]
    fmla        v19.8h, v3.8h, v8.h[0]

    fmla        v20.8h, v4.8h, v8.h[0]
    fmla        v21.8h, v5.8h, v8.h[0]
    fmla        v22.8h, v6.8h, v8.h[0]
    fmla        v23.8h, v7.8h, v8.h[0]

    fmla        v24.8h, v9.8h, v8.h[0]
    fmla        v25.8h, v10.8h, v8.h[0]
    fmla        v26.8h, v11.8h, v8.h[0]
    fmla        v27.8h, v12.8h, v8.h[0]

    fmla        v28.8h, v13.8h, v8.h[0]
    fmla        v29.8h, v14.8h, v8.h[0]
    fmla        v30.8h, v15.8h, v8.h[0]
    fmla        v31.8h, v0.8h, v8.h[0]

    b           .non_linear_loop

{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:128, from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:128, from:16, to:31%}

.add_unicast:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    cmp         x6, #2
    beq         .do_per_row_add

    {% for reg in (16..31) %}
        {% for lane in (0..7) %}
            ld1 {v0.h}[{{lane}}], [ x5 ], x6
        {% endfor %}
        fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h
    {% endfor %}

    b           .non_linear_loop

.do_per_row_add:
    ld1         {v0.8h-v3.8h}, [x5], #64
    ld1         {v4.8h-v7.8h}, [x5], #64
    ld1         {v8.8h-v11.8h}, [x5], #64
    ld1         {v12.8h-v15.8h}, [x5], #64

    {% for r in (0..15) %}
        fadd v{{r | plus: 16}}.8h, v{{r | plus: 16}}.8h, v{{r}}.8h
    {% endfor %}

    b           .non_linear_loop

.add_row_col_products:
    ldr         x3, [x0, #16]
    ldr         x2, [x0, #8]

    ld1         {v8.h}[0], [ x3 ]

    {% for r in (0..7) %}
        ldr q{{r}}, [x2], #16
    {% endfor %}

    fmla        v16.8h, v0.8h, v8.h[0]
    ldr         q0, [x2], #16
    fmla        v17.8h, v1.8h, v8.h[0]
    ldr         q1, [x2], #16
    fmla        v18.8h, v2.8h, v8.h[0]
    ldr         q2, [x2], #16
    fmla        v19.8h, v3.8h, v8.h[0]
    ldr         q3, [x2], #16
    fmla        v20.8h, v4.8h, v8.h[0]
    ldr         q4, [x2], #16
    fmla        v21.8h, v5.8h, v8.h[0]
    ldr         q5, [x2], #16
    fmla        v22.8h, v6.8h, v8.h[0]
    ldr         q6, [x2], #16
    fmla        v23.8h, v7.8h, v8.h[0]
    ldr         q7, [x2], #16

    fmla        v24.8h, v0.8h, v8.h[0]
    fmla        v25.8h, v1.8h, v8.h[0]
    fmla        v26.8h, v2.8h, v8.h[0]
    fmla        v27.8h, v3.8h, v8.h[0]
    fmla        v28.8h, v4.8h, v8.h[0]
    fmla        v29.8h, v5.8h, v8.h[0]
    fmla        v30.8h, v6.8h, v8.h[0]
    fmla        v31.8h, v7.8h, v8.h[0]

    b           .non_linear_loop
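
// Store of the 128x1 f16 accumulator tile (v16..v31) back to C. Judging from the
// comparison below, x6 holds the row stride in bytes: when it is not 2, lanes are
// written one half-precision element at a time stepping by the stride; when it is
// 2 (contiguous f16 column), whole .8h vectors are stored instead.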
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    cmp         x6, #2
    beq         .store_strides_contig

    {% for reg in (16..31) %}
        {% for lane in (0..7) %}
            st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store_strides_contig:

    {% for reg in (16..31) %}
        st1 { v{{reg}}.8h }, [ x5 ], #16
    {% endfor %}

    b           .non_linear_loop

.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret