// vim: ft=arm

// C tile regs: v16 to v31 (16 regs x 4 lanes = 64 i32 accumulators), no need to preserve
//  - x19-x29 to preserve (but x19, x28, x29 not used)
//  - d8..d15 to preserve
//  - no preservation either for v0-v7...
//
// packed A: v0-v7, 64 values per k step, loaded in two rounds of 8 registers
// packed B: one value per k step, held in v9

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_i32_64x1_{{suffix}}
{{G}}arm64simd_mmm_i32_64x1_{{suffix}}:

/*
    prfm        pldl1keep, [x1]
    prfm        pldl1keep, [x2]
*/

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

.add_mat_mul:
    ldp         x2, x4, [x0, #24]       // b, packing
    ldp         x3, x1, [x0, #8]        // k, a

    cmp         x3, #0
    beq         .non_linear_loop        // k == 0: nothing to accumulate

    cmp         x4, #1
    beq         .packed_packed_loop_1_i8i8

// i32 * i32: 64 rows of A times one B value per k step
.packed_packed_loop_1:
    ld1         {v9.s}[0], [ x2 ], #4

    ld1         { v0.4s-v3.4s }, [ x1 ], #64
    ld1         { v4.4s-v7.4s }, [ x1 ], #64

    {% for reg in (0..3) %}
        mla         v{{reg | times: 2 | plus: 16 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
        mla         v{{reg | times: 2 | plus: 17 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
    {% endfor %}

    ld1         { v0.4s-v3.4s }, [ x1 ], #64
    ld1         { v4.4s-v7.4s }, [ x1 ], #64

    {% for reg in (0..3) %}
        mla         v{{reg | times: 2 | plus: 24 }}.4s, v{{reg | times:2}}.4s, v9.s[0]
        mla         v{{reg | times: 2 | plus: 25 }}.4s, v{{reg | times:2 | plus:1}}.4s, v9.s[0]
    {% endfor %}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

// i8 * i8: sign-extend to 16 bits, accumulate in i32
.packed_packed_loop_1_i8i8:
    ld1         {v9.b}[0], [ x2 ], #1
    sshll       v9.8h, v9.8b, #0

    ld1         { v0.8b-v3.8b }, [ x1 ], #32
    ld1         { v4.8b-v7.8b }, [ x1 ], #32

    {% for reg in (0..7) %}
        sshll       v10.8h, v{{reg}}.8b, #0
        smlal       v{{reg | times: 2 | plus: 16 }}.4s, v10.4h, v9.h[0]
        smlal2      v{{reg | times: 2 | plus: 17 }}.4s, v10.8h, v9.h[0]
    {% endfor %}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1_i8i8

    b           .non_linear_loop

.add_unicast:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    cmp         x8, #4
    beq         .non_linear_addc_i32

    // item_size != 4: read i8 values, sign-extend, accumulate
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            ld1 {v0.b}[{{lane}}], [ x5 ], x6
        {% endfor %}
        sshll       v0.8h, v0.8b, #0
        sshll       v0.4s, v0.4h, #0
        add         v{{reg}}.4s, v{{reg}}.4s, v0.4s
    {% endfor %}

    b           .non_linear_loop

.non_linear_addc_i32:
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            ld1 {v0.s}[{{lane}}], [ x5 ], x6
        {% endfor %}
        add         v{{reg}}.4s, v{{reg}}.4s, v0.4s
    {% endfor %}

    b           .non_linear_loop

.add_row_col_products:
    ldr         x2, [x0, #8]            // row values (64)
    ldr         x3, [x0, #16]           // col value (1)

    ld1         { v15.s }[0], [ x3 ]
    xtn         v15.4h, v15.4s

    ld1         { v0.4s-v3.4s }, [ x2 ], #64
    ld1         { v4.4s-v7.4s }, [ x2 ], #64

    {% for reg in (0..7) %}
        xtn         v{{reg}}.4h, v{{reg}}.4s
        smlal       v{{reg|plus: 16}}.4s, v{{reg}}.4h, v15.h[0]
    {% endfor %}

    ld1         { v0.4s-v3.4s }, [ x2 ], #64
    ld1         { v4.4s-v7.4s }, [ x2 ], #64

    {% for reg in (0..7) %}
        xtn         v{{reg}}.4h, v{{reg}}.4s
        smlal       v{{reg|plus: 24}}.4s, v{{reg}}.4h, v15.h[0]
    {% endfor %}

    b           .non_linear_loop

{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31 %}
{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:64, from:16, to:31 %}
{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:64, from:16, to:31 %}
{% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}

.store:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    cmp         x8, #4
    beq         .store_strides_i32

    // item_size != 4: store the low byte of each i32 lane
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            st1 { v{{reg}}.b }[{{lane | times: 4}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store_strides_i32:
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            st1 { v{{reg}}.s }[{{lane}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop
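
// ---------------------------------------------------------------------------
// Comment-only sketch (not assembled): an illustrative plain-C view of the
// .store_strides_i32 path above, assuming x5 walks the single output column
// by a byte stride of rsc (x6). The function and variable names below are
// hypothetical and exist only to document the kernel.
//
//   #include <stddef.h>
//   #include <stdint.h>
//   #include <string.h>
//
//   static void store_col_i32_ref(const int32_t acc[64], void *c, ptrdiff_t rsc) {
//       char *p = (char *)c;
//       for (int r = 0; r < 64; r++) {            // 16 regs (v16..v31) x 4 lanes
//           memcpy(p, &acc[r], sizeof(int32_t));  // st1 {vN.s}[lane], [x5], x6
//           p += rsc;                             // x6: row stride in bytes
//       }
//   }
// ---------------------------------------------------------------------------
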
.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret
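
// ---------------------------------------------------------------------------
// Comment-only sketch (not assembled): an illustrative plain-C equivalent of
// the .packed_packed_loop_1 path, assuming A is packed as k consecutive
// panels of 64 i32 values and B is one column of k i32 values. Names below
// (mmm_i32_64x1_ref, a_packed, b, c) are hypothetical and exist only to
// document what the kernel computes.
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   // c[64] += A(64 x k, packed by panels of 64) * b[k x 1]
//   static void mmm_i32_64x1_ref(int32_t c[64], const int32_t *a_packed,
//                                const int32_t *b, size_t k) {
//       for (size_t i = 0; i < k; i++) {
//           int32_t bi = b[i];                       // ld1 {v9.s}[0], [x2], #4
//           for (size_t r = 0; r < 64; r++)          // v16..v31: 16 regs x 4 lanes
//               c[r] += a_packed[i * 64 + r] * bi;   // mla vN.4s, vM.4s, v9.s[0]
//       }
//   }
// ---------------------------------------------------------------------------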