// vim: ft=arm

// 8x8 i32 matrix-multiply micro-kernel, AArch64 NEON, GNU syntax.
// This file is a template: the symbol prefix, the symbol suffix, the op
// dispatcher and several element-wise ops are spliced in at render time.
//
// Register usage:
// - x19-x29 to preserve (but x19, x28, x29 not used)
// - d8..d15 to preserve
// - v16 to v31, no need to preserve
// - x20-x27 and d8-d15 are saved in the prologue and restored at .return
//   (none of them is touched by the code visible in this file, the included
//   templates may use them)
//
// C tile regs: 16 accumulators of 4 x i32, two per tile column
// (even register = rows 0-3, odd register = rows 4-7)
//
//      v16[0] v18[0] v20[0] v22[0] v24[0] v26[0] v28[0] v30[0]
//      v16[1] v18[1]
//      v16[2] v18[2]
//      v16[3] v18[3]
//
//      v17[0] v19[0] v21[0] v23[0] v25[0] v27[0] v29[0] v31[0]
//      v17[1] v19[1]
//      v17[2] v19[2]
//      v17[3] v19[3]
//
// no preservation either for v0-v7...
// packed A (8 values per k step): v0, v1 as i32, or v0 alone widened from i8
// packed B (8 values per k step): v4, v5 as i32, or v4 alone widened from i8
//
// Every op handler below reads its arguments relative to x0 and finishes by
// branching back to .non_linear_loop, which is not defined in this file
// (presumably it comes from dispatcher.tmpliq, as does the setup of x0).

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_i32_8x8_{{suffix}}
{{G}}arm64simd_mmm_i32_8x8_{{suffix}}:

/*
    prfm        pldl1keep, [x1]
    prfm        pldl1keep, [x2]
*/

    // Prologue: eight 16-byte pushes (128 bytes), sp stays 16-byte aligned.
    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

// ---------------------------------------------------------------------------
// add_mat_mul: C tile += A panel * B panel over k steps.
//   [x0, #8]  -> x3 = k        [x0, #16] -> x1 = a
//   [x0, #24] -> x2 = b        [x0, #32] -> x4 = packing
// packing == 1 selects the i8 x i8 loop, anything else the i32 x i32 loop.
// ---------------------------------------------------------------------------
.add_mat_mul:
    ldp         x2, x4, [x0, #24]       // b, packing
    ldp         x3, x1, [x0, #8]        // k, a

    // Both loops test k after the body, so k == 0 has to be skipped here.
    cmp         x3, #0
    beq         .non_linear_loop

    cmp         x4, #1
    beq         .packed_packed_loop_1_i8i8

// i32 operands: each k step consumes 8 x i32 (32 bytes) from A and from B.
// Column j of the tile accumulates A[0..7] * B[j].
.packed_packed_loop_1:
    ld1         { v0.4s, v1.4s }, [ x1 ], #32
    ld1         { v4.4s, v5.4s }, [ x2 ], #32

    mla         v16.4s, v0.4s, v4.s[0]
    mla         v17.4s, v1.4s, v4.s[0]
    mla         v18.4s, v0.4s, v4.s[1]
    mla         v19.4s, v1.4s, v4.s[1]
    mla         v20.4s, v0.4s, v4.s[2]
    mla         v21.4s, v1.4s, v4.s[2]
    mla         v22.4s, v0.4s, v4.s[3]
    mla         v23.4s, v1.4s, v4.s[3]

    mla         v24.4s, v0.4s, v5.s[0]
    mla         v25.4s, v1.4s, v5.s[0]
    mla         v26.4s, v0.4s, v5.s[1]
    mla         v27.4s, v1.4s, v5.s[1]
    mla         v28.4s, v0.4s, v5.s[2]
    mla         v29.4s, v1.4s, v5.s[2]
    mla         v30.4s, v0.4s, v5.s[3]
    mla         v31.4s, v1.4s, v5.s[3]

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

// i8 operands: each k step consumes 8 bytes from A and from B. Both are
// sign-extended to i16, then multiplied with widening accumulation into the
// i32 tile (smlal = rows 0-3, smlal2 = rows 4-7).
.packed_packed_loop_1_i8i8:
    ld1         { v0.8b }, [ x1 ], #8
    sshll       v0.8h, v0.8b, 0
    ld1         { v4.8b }, [ x2 ], #8
    sshll       v4.8h, v4.8b, 0

    smlal       v16.4s, v0.4h, v4.h[0]
    smlal2      v17.4s, v0.8h, v4.h[0]
    smlal       v18.4s, v0.4h, v4.h[1]
    smlal2      v19.4s, v0.8h, v4.h[1]
    smlal       v20.4s, v0.4h, v4.h[2]
    smlal2      v21.4s, v0.8h, v4.h[2]
    smlal       v22.4s, v0.4h, v4.h[3]
    smlal2      v23.4s, v0.8h, v4.h[3]

    smlal       v24.4s, v0.4h, v4.h[4]
    smlal2      v25.4s, v0.8h, v4.h[4]
    smlal       v26.4s, v0.4h, v4.h[5]
    smlal2      v27.4s, v0.8h, v4.h[5]
    smlal       v28.4s, v0.4h, v4.h[6]
    smlal2      v29.4s, v0.8h, v4.h[6]
    smlal       v30.4s, v0.4h, v4.h[7]
    smlal2      v31.4s, v0.8h, v4.h[7]

    subs        x3, x3, #1
    bne         .packed_packed_loop_1_i8i8

    b           .non_linear_loop

{% include "arm64simd_mmm_i32_scalars.tmpliq" from:16, to:31%}
{% include "arm64simd_mmm_i32_per_rows.tmpliq" mr:8, from:16, to:31%}
{% include "arm64simd_mmm_i32_per_cols.tmpliq" mr:8, from:16, to:31%}

// ---------------------------------------------------------------------------
// add_unicast: C tile += an 8x8 strided matrix read from memory.
//   [x0, #8]  -> x5 = base pointer
//   [x0, #16] -> x6 = byte step between consecutive rows of one column
//   [x0, #24] -> x7 = byte step between consecutive columns
//   [x0, #32] -> x8 = element size, 4 selects the i32 path, anything else
//                     is handled as one signed byte per element
// x4 walks down one column, v0 is scratch.
// ---------------------------------------------------------------------------
.add_unicast:
    ldp         x5, x6, [x0, #8]
    ldp         x7, x8, [x0, #24]

    cmp         x8, #4
    beq         non_linear_addc_i32

    // Byte elements: gather 4 bytes into v0, sign-extend i8 -> i16 -> i32,
    // then add to the matching accumulator.
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                ld1 {v0.b}[{{lane}}], [ x4 ], x6
            {% endfor %}
            sshll v0.8h, v0.8b, 0
            sshll v0.4s, v0.4h, 0
            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// i32 elements: gather 4 words into v0 and add them directly.
non_linear_addc_i32:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                ld1 {v0.s}[{{lane}}], [ x4 ], x6
            {% endfor %}
            add v{{col | times:2 | plus: reg}}.4s, v{{col | times:2 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// ---------------------------------------------------------------------------
// add_row_col_products: C[i][j] += r[i] * c[j], with
//   [x0, #8]  -> x2 = pointer to r, 8 x i32, one value per tile row
//   [x0, #16] -> x3 = pointer to c, 8 x i32, one value per tile column
//
// The multiply-accumulate is done on full 32-bit lanes. Narrowing the
// operands to 16 bits before a widening multiply would silently truncate any
// operand outside the i16 range. The 32-bit mla yields the same result for
// operands that fit in i16 and the wrapping i32 product otherwise, and it
// matches what .packed_packed_loop_1 does for i32 operands.
// ---------------------------------------------------------------------------
.add_row_col_products:
    ldr         x2, [x0, #8]
    ldr         x3, [x0, #16]

    ld1         { v0.4s, v1.4s }, [ x2 ]
    ld1         { v4.4s, v5.4s }, [ x3 ]

    mla         v16.4s, v0.4s, v4.s[0]
    mla         v17.4s, v1.4s, v4.s[0]
    mla         v18.4s, v0.4s, v4.s[1]
    mla         v19.4s, v1.4s, v4.s[1]
    mla         v20.4s, v0.4s, v4.s[2]
    mla         v21.4s, v1.4s, v4.s[2]
    mla         v22.4s, v0.4s, v4.s[3]
    mla         v23.4s, v1.4s, v4.s[3]

    mla         v24.4s, v0.4s, v5.s[0]
    mla         v25.4s, v1.4s, v5.s[0]
    mla         v26.4s, v0.4s, v5.s[1]
    mla         v27.4s, v1.4s, v5.s[1]
    mla         v28.4s, v0.4s, v5.s[2]
    mla         v29.4s, v1.4s, v5.s[2]
    mla         v30.4s, v0.4s, v5.s[3]
    mla         v31.4s, v1.4s, v5.s[3]

    b           .non_linear_loop

{% include "arm64simd_mmm_i32_scale_q16_q31.tmpliq" %}

// ---------------------------------------------------------------------------
// store: write the C tile to strided memory.
//   x5 = c base ptr, x6 = rsc (byte step between rows),
//   x7 = csc (byte step between columns), x8 = item_size.
// item_size == 4 stores full i32 lanes. Any other value stores only the
// lowest byte of each 32-bit lane (byte indices 0, 4, 8, 12), with no
// saturation performed here.
// ---------------------------------------------------------------------------
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    ldp         x7, x8, [x0, #24]           // csc, item_size

    cmp         x8, #4
    beq         .store_strides_i32

    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:2 | plus: reg}}.b }[{{lane|times:4}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

.store_strides_i32:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:2 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Epilogue: pop in the exact reverse order of the prologue pushes.
.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret