// vim: ft=arm

// C tile regs:
// - x19-x29 to preserve (but x19, x28, x29 not used) 
// - d8..d15 to preserve
// - v16 to v31, no need to preserve
// 
//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
//      v16[1] v18[1] 
//      v16[2] v18[2] 
//      v16[3] v18[3]
//                     
//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
//      v17[1] v19[1] 
//      v17[2] v19[2] 
//      v17[3] v19[3] 

// no preservation either for v0-v7...
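// packed A (8 values per k step): v0, v1 in the i32 loop, v0 alone in the i8i8 loop
// packed B (8 values per k step): v4, v5 in the i32 loop, v4 alone in the i8i8 loop
// (the loops in this file do not double-buffer: v2, v3, v6, v7 are not used by them)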

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_i32_8x8_{{suffix}}
{{G}}arm64simd_mmm_i32_8x8_{{suffix}}:

/*
    prfm        pldl1keep, [x1]
    prfm        pldl1keep, [x2]
*/
    // Save the callee-saved registers in a single 128-byte frame.
    // Slot layout relative to the new sp (identical to what eight
    // successive 16-byte pre-decrement pushes would produce):
    //   #112 x20,x21   #96 x22,x23   #80 x24,x25   #64 x26,x27
    //   #48  d8,d9     #32 d10,d11   #16 d12,d13   #0  d14,d15
    // sp stays 16-byte aligned.
    sub         sp, sp, #128

    stp         x20, x21, [sp, #112]
    stp         x22, x23, [sp, #96]
    stp         x24, x25, [sp, #80]
    stp         x26, x27, [sp, #64]

    stp         d8, d9, [sp, #48]
    stp         d10, d11, [sp, #32]
    stp         d12, d13, [sp, #16]
    stp         d14, d15, [sp]

{% include "dispatcher.tmpliq" %}

// Matrix-product step: adds k rank-1 updates (A column times B row) to the
// 8x8 i32 accumulator tile held in v16-v31.
// Reads from the record at x0: k at #8, a at #16, b at #24, packing at #32.
// NOTE(review): x0 is presumably the current operation record handed over by
// the dispatcher template - confirm in dispatcher.tmpliq.
.add_mat_mul:
    ldp         x2, x4, [x0, #24]   // b, packing
    ldp         x3, x1, [x0, #8]    // k, a

    // k == 0: nothing to accumulate, go back to the dispatcher loop
    cmp         x3, #0
    beq         .non_linear_loop

    // packing == 1 selects the variant that reads A and B as 8-bit values
    cmp         x4, #1
    beq         .packed_packed_loop_1_i8i8

// One k step per iteration with 32-bit inputs:
//   v0, v1 = 8 values of A (32 bytes, pointer post-incremented)
//   v4, v5 = 8 values of B (32 bytes, pointer post-incremented)
// For column j and half r: v(16 + 2*j + r) += A[4*r .. 4*r+3] * B[j]
.packed_packed_loop_1:

    ld1	        { v0.4s, v1.4s }, [ x1 ], #32
    ld1	        { v4.4s, v5.4s }, [ x2 ], #32

    mla         v16.4s, v0.4s, v4.s[0]
    mla         v17.4s, v1.4s, v4.s[0]
    mla         v18.4s, v0.4s, v4.s[1]
    mla         v19.4s, v1.4s, v4.s[1]

    mla         v20.4s, v0.4s, v4.s[2]
    mla         v21.4s, v1.4s, v4.s[2]
    mla         v22.4s, v0.4s, v4.s[3]
    mla         v23.4s, v1.4s, v4.s[3]

    mla         v24.4s, v0.4s, v5.s[0]
    mla         v25.4s, v1.4s, v5.s[0]
    mla         v26.4s, v0.4s, v5.s[1]
    mla         v27.4s, v1.4s, v5.s[1]

    mla         v28.4s, v0.4s, v5.s[2]
    mla         v29.4s, v1.4s, v5.s[2]
    mla         v30.4s, v0.4s, v5.s[3]
    mla         v31.4s, v1.4s, v5.s[3]

    // x3 = remaining k steps; loop until it reaches zero
    subs        x3, x3, #1
    bne .packed_packed_loop_1

    b .non_linear_loop

// One k step per iteration with 8-bit inputs (entered from .add_mat_mul with
// x1 = a, x2 = b, x3 = k, k != 0):
//   v0 = 8 signed bytes of A, sign-extended to 8 halfwords
//   v4 = 8 signed bytes of B, sign-extended to 8 halfwords
// smlal handles A rows 0-3 (low half of v0), smlal2 handles A rows 4-7
// (high half of v0); each product is widened to 32 bits before accumulation.
.packed_packed_loop_1_i8i8:

    ld1	        { v0.8b }, [ x1 ], #8
    sshll       v0.8h, v0.8b, 0     // shift by 0: plain i8 -> i16 sign extension
    ld1         { v4.8b }, [ x2 ], #8
    sshll        v4.8h, v4.8b, 0    // shift by 0: plain i8 -> i16 sign extension

    smlal        v16.4s, v0.4h, v4.h[0]
    smlal2       v17.4s, v0.8h, v4.h[0]
    smlal        v18.4s, v0.4h, v4.h[1]
    smlal2       v19.4s, v0.8h, v4.h[1]
    smlal        v20.4s, v0.4h, v4.h[2]
    smlal2       v21.4s, v0.8h, v4.h[2]
    smlal        v22.4s, v0.4h, v4.h[3]
    smlal2       v23.4s, v0.8h, v4.h[3]

    smlal        v24.4s, v0.4h, v4.h[4]
    smlal2       v25.4s, v0.8h, v4.h[4]
    smlal        v26.4s, v0.4h, v4.h[5]
    smlal2       v27.4s, v0.8h, v4.h[5]
    smlal        v28.4s, v0.4h, v4.h[6]
    smlal2       v29.4s, v0.8h, v4.h[6]
    smlal        v30.4s, v0.4h, v4.h[7]
    smlal2       v31.4s, v0.8h, v4.h[7]

    // x3 = remaining k steps; loop until it reaches zero
    subs        x3, x3, #1
    bne .packed_packed_loop_1_i8i8

    b .non_linear_loop

{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31%}
{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:8, from:16, to:31%}
{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:8, from:16, to:31%}

// Adds an 8x8 matrix read from memory, element by element, to the tile.
// Reads four 64-bit fields from the record at x0:
//   x5 (#8)  = address of the first element
//   x6 (#16) = byte step between consecutive rows of one column
//   x7 (#24) = byte step between consecutive columns
//   x8 (#32) = compared against 4 to select 32-bit elements; any other value
//              takes the byte path below
// NOTE(review): field order presumably mirrors the .store record
// (ptr, rsc, csc, item_size) - confirm against the caller.
.add_unicast:
    ldp         x5, x6, [x0, #8]
    ldp         x7, x8, [x0, #24]

    cmp         x8, #4
    beq         non_linear_addc_i32

    // Byte path. The template unrolls 8 columns; col*2 + reg maps to v16-v31.
    // For each half column: gather 4 bytes into v0.b[0..3] (x4 walks down the
    // column by x6), sign-extend i8 -> i16 -> i32, then add to the accumulator.
    // Only the low 4 lanes of v0 are meaningful after the second widening.
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                ld1 {v0.b}[{{lane}}], [ x4 ], x6
            {% endfor %}
            sshll v0.8h, v0.8b, 0
            sshll v0.4s, v0.4h, 0
            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// 32-bit variant of .add_unicast, entered with x5 = address of the first
// element, x6 = byte step between rows, x7 = byte step between columns.
// The template unrolls 8 columns; col*2 + reg maps to v16-v31. For each half
// column, 4 words are gathered into v0.s[0..3] (x4 walks down the column by
// x6) and added to the accumulator; x5 then moves to the next column.
non_linear_addc_i32:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                ld1 {v0.s}[{{lane}}], [ x4 ], x6
            {% endfor %}
            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Adds the outer product of a row vector and a column vector to the tile:
//   tile[i][j] += rows[i] * cols[j]      for i, j in 0..7
// Reads two pointers from the record at x0:
//   x2 (#8)  = address of 8 x i32 row factors    -> v0 (rows 0-3), v1 (rows 4-7)
//   x3 (#16) = address of 8 x i32 column factors -> v4 (cols 0-3), v5 (cols 4-7)
// The multiply is done on the full 32-bit lanes (result wraps modulo 2^32,
// like the mla in .packed_packed_loop_1). Narrowing the factors to 16 bits
// first would silently drop the upper half of any factor outside the i16
// range; for factors that do fit in i16 the result is unchanged.
.add_row_col_products:
    ldr     x2, [x0, #8]
    ldr     x3, [x0, #16]

    ld1         { v0.4s, v1.4s }, [ x2 ]
    ld1         { v4.4s, v5.4s }, [ x3 ]

    mla         v16.4s, v0.4s, v4.s[0]
    mla         v17.4s, v1.4s, v4.s[0]
    mla         v18.4s, v0.4s, v4.s[1]
    mla         v19.4s, v1.4s, v4.s[1]
    mla         v20.4s, v0.4s, v4.s[2]
    mla         v21.4s, v1.4s, v4.s[2]
    mla         v22.4s, v0.4s, v4.s[3]
    mla         v23.4s, v1.4s, v4.s[3]

    mla         v24.4s, v0.4s, v5.s[0]
    mla         v25.4s, v1.4s, v5.s[0]
    mla         v26.4s, v0.4s, v5.s[1]
    mla         v27.4s, v1.4s, v5.s[1]
    mla         v28.4s, v0.4s, v5.s[2]
    mla         v29.4s, v1.4s, v5.s[2]
    mla         v30.4s, v0.4s, v5.s[3]
    mla         v31.4s, v1.4s, v5.s[3]

    b           .non_linear_loop

    {% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}

// Writes the 8x8 tile to memory, one element at a time.
// item_size == 4 branches to the 32-bit store; any other value falls through
// to the byte store below.
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    ldp         x7, x8, [x0, #24]           // csc, item_size

    cmp         x8, #4
    beq         .store_strides_i32

    // Byte path. The template unrolls 8 columns; col*2 + reg maps to v16-v31.
    // Byte lanes 0, 4, 8, 12 are the lowest-addressed byte of each 32-bit
    // accumulator lane, so each element is stored as its low 8 bits; no
    // saturation or rounding is applied here.
    // x4 walks down the column by rsc (x6); x5 advances by csc (x7) per column.
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:2 | plus: reg}}.b }[{{lane|times:4}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// 32-bit variant of .store, entered with x5 = c base ptr, x6 = rsc, x7 = csc.
// The template unrolls 8 columns; col*2 + reg maps to v16-v31. Each 32-bit
// accumulator lane is stored whole; x4 walks down the column by rsc and
// x5 advances by csc after each column.
.store_strides_i32:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Epilogue: reload the callee-saved registers from the 128-byte frame built
// at function entry, release the frame in one step, and return.
// Slot layout relative to sp:
//   #0  d14,d15   #16 d12,d13   #32 d10,d11   #48  d8,d9
//   #64 x26,x27   #80 x24,x25   #96 x22,x23   #112 x20,x21
.return:
    ldp         d14, d15, [sp]
    ldp         d12, d13, [sp, #16]
    ldp         d10, d11, [sp, #32]
    ldp         d8, d9, [sp, #48]

    ldp         x26, x27, [sp, #64]
    ldp         x24, x25, [sp, #80]
    ldp         x22, x23, [sp, #96]
    ldp         x20, x21, [sp, #112]

    add         sp, sp, #128

    ret