// vim: ft=arm

// Callee-saved registers:
// - x19-x29 to preserve (but x19, x28, x29 not used)
// - d8..d15 to preserve
// - v16 to v31: no need to preserve
//
// C tile regs (12 rows x 8 cols; 3 vector registers of 4 lanes per column):
//
//      v8  v11 v14 v17 v20 v23 v26 v29
//      v9  v12 v15 v18 v21 v24 v27 v30
//      v10 v13 v16 v19 v22 v25 v28 v31
//
// No preservation needed for v0-v7:
// - packed A buffering (2x8 values): rotating over v0..v3
// - packed B buffering (2x8 values): alternating v4, v5 with v6, v7

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}
{{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}:

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

// op dispatcher: defines .non_linear_loop and branches to the handlers below
{% include "dispatcher.tmpliq" %}

// C += A * B: each k step consumes 12 packed A floats and 8 packed B floats
.add_mat_mul:
    ldr         x2, [x0, #24]           // b
    ldp         x3, x1, [x0, #8]        // k, a

    cmp         x3, #0
    beq         .non_linear_loop

    // preload the first A panel (12 values) and B panel (8 values)
    ld1         { v0.4s, v1.4s, v2.4s }, [ x1 ], #48
    ld1         { v4.4s, v5.4s }, [ x2 ], #32

    {% capture packed_packed_loop1 %}
        {% if core == "a53" %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli" %}
        {% else %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli" %}
        {% endif %}
    {% endcapture %}

    {% capture packed_packed_loop2 %}
        {% if core == "a55" %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli" %}
        {% else %}
            {{ packed_packed_loop1 }}
            {{ packed_packed_loop1 }}
        {% endif %}
    {% endcapture %}

    cmp         x3, #4
    blt         .packed_packed_loop_1

// main loop, unrolled over 4 k steps
    .p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cmp         x3, #0
    beq         .non_linear_loop

// remainder loop, one k step at a time
    .p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31 %}
{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:12, from:8, to:31 %}
{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:12, from:8, to:31 %}

// add an arbitrarily strided f32 tensor to the C tile, lane by lane
.add_unicast:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    {% for col in (0..7) %}
        mov x4, x5
        {% for reg in (0..2) %}
            {% for lane in (0..3) %}
                ld1 { v0.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
            fadd v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{col | times:3 | plus: 8 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// rank-1 update: C[i,j] += rows[i] * cols[j]
// 12-value row vector in v0-v2, 8-value column vector in v4-v5
.add_row_col_products:
    ldr         x2, [x0, #8]
    ldr         x3, [x0, #16]

    ld1         { v0.4s, v1.4s, v2.4s }, [ x2 ]
    ld1         { v4.4s, v5.4s }, [ x3 ]

    {% for col in (0..7) %}
        {% for reg in (0..2) %}
            fmla v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{reg}}.4s, v{{col | divided_by: 4 | plus: 4}}.s[{{col | modulo: 4}}]
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    // fast path: rsc == 4 bytes means each 12-float column is contiguous
    // and can be written as three q registers
    cmp         x6, #4
    bne         .store_strides_generic

    {% for col in (0..7) %}
        str q{{col | times:3 | plus: 8}}, [ x5 ]
        str q{{col | times:3 | plus: 9}}, [ x5, #16 ]
        str q{{col | times:3 | plus: 10}}, [ x5, #32 ]
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// generic path: store lane by lane, honoring both rsc and csc
.store_strides_generic:

    {% for col in (0..7) %}
        mov x4, x5
        {% for reg in (0..2) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:3 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret
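
// ---------------------------------------------------------------------------
// Template expansion examples (documentation only, nothing here is assembled).
// A sketch of how the Liquid filters above pick registers: column `col` (0..7)
// and row-register `reg` (0..2) always map to accumulator v(8 + 3*col + reg),
// matching the C tile grid in the header. For instance, the col=2, reg=1
// iteration of .add_row_col_products expands to:
//
//      fmla v15.4s, v1.4s, v4.s[2]
//
// v15 = 8 + 3*2 + 1; the multiplier is lane col%4 = 2 of v(col/4 + 4) = v4,
// i.e. the B value for column 2. Similarly, the col=1 iteration of the
// contiguous .store fast path expands to:
//
//      str q11, [ x5 ]
//      str q12, [ x5, #16 ]
//      str q13, [ x5, #32 ]
//      add x5, x5, x7
//
// writing the 12 rows of column 1 (v11, v12, v13) before stepping x5 by the
// column stride csc.
// ---------------------------------------------------------------------------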