// vim: ft=arm

// C tile regs: v16 to v31 (16 regs x 4 lanes = 64 i32 accumulators), no need to preserve
//  - x19-x29 to preserve (but x19, x28, x29 not used)
//  - d8..d15 to preserve
//  - no preservation either for v0-v7...
//
// packed A: v0-v7, 64 values per k step, loaded in two rounds of 8 registers
// packed B: one value per k step, held in v9

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_i32_64x1_{{suffix}}
{{G}}arm64simd_mmm_i32_64x1_{{suffix}}:

/*
    prfm        pldl1keep, [x1]
    prfm        pldl1keep, [x2]
*/

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

.add_mat_mul:
    ldp         x2, x4, [x0, #24]       // b, packing
    ldp         x3, x1, [x0, #8]        // k, a

    cmp         x3, #0
    beq         .non_linear_loop        // k == 0: nothing to accumulate

    cmp         x4, #1
    beq         .packed_packed_loop_1_i8i8

// i32 * i32: 64 rows of A times one B value per k step
.packed_packed_loop_1:
    ld1         {v9.s}[0], [ x2 ], #4

    ld1         { v0.4s-v3.4s }, [ x1 ], #64
    ld1         { v4.4s-v7.4s }, [ x1 ], #64

    {% for reg in (0..3) %}
        mla         v{{reg | times: 2 | plus: 16 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
        mla         v{{reg | times: 2 | plus: 17 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
    {% endfor %}

    ld1         { v0.4s-v3.4s }, [ x1 ], #64
    ld1         { v4.4s-v7.4s }, [ x1 ], #64

    {% for reg in (0..3) %}
        mla         v{{reg | times: 2 | plus: 24 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
        mla         v{{reg | times: 2 | plus: 25 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
    {% endfor %}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

// i8 * i8: sign-extend to 16 bits, accumulate in i32
.packed_packed_loop_1_i8i8:
    ld1         {v9.b}[0], [ x2 ], #1
    sshll       v9.8h, v9.8b, #0

    ld1         { v0.8b-v3.8b }, [ x1 ], #32
    ld1         { v4.8b-v7.8b }, [ x1 ], #32

    {% for reg in (0..7) %}
        sshll       v10.8h, v{{reg}}.8b, #0
        smlal       v{{reg | times: 2 | plus: 16 }}.4s, v10.4h, v9.h[0]
        smlal2      v{{reg | times: 2 | plus: 17 }}.4s, v10.8h, v9.h[0]
    {% endfor %}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1_i8i8

    b           .non_linear_loop

.add_unicast:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    cmp         x8, #4
    beq         .non_linear_addc_i32

    // item_size != 4: read i8 values, sign-extend, accumulate
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            ld1 {v0.b}[{{lane}}], [ x5 ], x6
        {% endfor %}
        sshll       v0.8h, v0.8b, #0
        sshll       v0.4s, v0.4h, #0
        add         v{{reg}}.4s, v{{reg}}.4s, v0.4s
    {% endfor %}

    b           .non_linear_loop

.non_linear_addc_i32:
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            ld1 {v0.s}[{{lane}}], [ x5 ], x6
        {% endfor %}
        add         v{{reg}}.4s, v{{reg}}.4s, v0.4s
    {% endfor %}

    b           .non_linear_loop

.add_row_col_products:
    ldr         x2, [x0, #8]            // row values (64)
    ldr         x3, [x0, #16]           // col value (1)

    ld1         { v15.s }[0], [ x3 ]
    xtn         v15.4h, v15.4s

    ld1         { v0.4s-v3.4s }, [ x2 ], #64
    ld1         { v4.4s-v7.4s }, [ x2 ], #64

    {% for reg in (0..7) %}
        xtn         v{{reg}}.4h, v{{reg}}.4s
        smlal       v{{reg|plus: 16}}.4s, v{{reg}}.4h, v15.h[0]
    {% endfor %}

    ld1         { v0.4s-v3.4s }, [ x2 ], #64
    ld1         { v4.4s-v7.4s }, [ x2 ], #64

    {% for reg in (0..7) %}
        xtn         v{{reg}}.4h, v{{reg}}.4s
        smlal       v{{reg|plus: 24}}.4s, v{{reg}}.4h, v15.h[0]
    {% endfor %}

    b           .non_linear_loop

{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31 %}
{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:64, from:16, to:31 %}
{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:64, from:16, to:31 %}
{% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}

.store:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    cmp         x8, #4
    beq         .store_strides_i32

    // item_size != 4: store the low byte of each i32 lane
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            st1 { v{{reg}}.b }[{{lane | times: 4}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store_strides_i32:
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop
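
// ---------------------------------------------------------------------------
// Comment-only sketch (not assembled): an illustrative plain-C view of the
// .store_strides_i32 path above, assuming x5 walks the single output column
// by a byte stride of rsc (x6). The function and variable names below are
// hypothetical and exist only to document the kernel.
//
//   #include <stddef.h>
//   #include <stdint.h>
//   #include <string.h>
//
//   static void store_col_i32_ref(const int32_t acc[64], void *c, ptrdiff_t rsc) {
//       char *p = (char *)c;
//       for (int r = 0; r < 64; r++) {            // 16 regs (v16..v31) x 4 lanes
//           memcpy(p, &acc[r], sizeof(int32_t));  // st1 {vN.s}[lane], [x5], x6
//           p += rsc;                             // x6: row stride in bytes
//       }
//   }
// ---------------------------------------------------------------------------
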
.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret
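
// ---------------------------------------------------------------------------
// Comment-only sketch (not assembled): an illustrative plain-C equivalent of
// the .packed_packed_loop_1 path, assuming A is packed as k consecutive
// panels of 64 i32 values and B is one column of k i32 values. Names below
// (mmm_i32_64x1_ref, a_packed, b, c) are hypothetical and exist only to
// document what the kernel computes.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   // c[64] += A(64 x k, packed by panels of 64) * b[k x 1]
//   static void mmm_i32_64x1_ref(int32_t c[64], const int32_t *a_packed,
//                                const int32_t *b, size_t k) {
//       for (size_t i = 0; i < k; i++) {
//           int32_t bi = b[i];                       // ld1 {v9.s}[0], [x2], #4
//           for (size_t r = 0; r < 64; r++)          // v16..v31: 16 regs x 4 lanes
//               c[r] += a_packed[i * 64 + r] * bi;   // mla vN.4s, vM.4s, v9.s[0]
//       }
//   }
// ---------------------------------------------------------------------------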