// vim: ft=arm

// AArch64, GNU as syntax (line comments start with two slashes), expanded
// through Liquid before being assembled.
//
// f32 matrix-multiply kernel working on a 64x1 tile of C.
//
// Register usage visible in this file:
// - x0: base address from which every operation below reads its arguments
//       (offsets #8, #16, #24). Presumably set up by dispatcher.tmpliq, which
//       is not visible here -- confirm there.
// - x19-x29 to preserve (but x19, x28, x29 not used): x20-x27 are pushed on
//   entry and popped at .return
// - d8..d15 to preserve: pushed on entry and popped at .return
// - v16 to v31, no need to preserve: they hold the C tile
//
// C tile layout (one column, four consecutive rows per vector register, as
// written out by .store):
//
//      v16[0] v16[1] v16[2] v16[3]     rows  0..3
//      v17[0] v17[1] v17[2] v17[3]     rows  4..7
//      ...
//      v31[0] v31[1] v31[2] v31[3]     rows 60..63
//
// Operand buffering as set up by .add_mat_mul:
// - A: the first 32 values of a k step are preloaded in v0..v7; the final step
//      loads the other 32 into v9..v15 and v0
// - B: a single value per k step, held in v8.s[0]

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}}
{{G}}arm64simd_mmm_f32_64x1_{{core}}_{{suffix}}:

    // Save the callee-saved registers; .return pops them in reverse order.
    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

// C tile += A * B over k steps.
// x1 = A pointer, x2 = B pointer, x3 = number of k steps still to run.
.add_mat_mul:
    ldr         x2, [x0, #24]       // b
    ldp         x3, x1, [x0, #8]    // k, a

    cmp         x3, #0
    beq         .non_linear_loop    // k == 0: nothing to accumulate
    sub         x3, x3, #1          // the final k step is peeled off, see .packed_packed_loop_1_last

    // Preload for the first k step: one B value and the first 32 A values.
    ld1         { v8.s }[0], [ x2 ], #4
    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64

    cmp         x3, #0
    beq         .packed_packed_loop_1_last  // k == 1: only the peeled step remains

    cmp         x3, #4
    blt         .packed_packed_loop_1       // fewer than 4 steps left: skip the unrolled loop

// The two captures below emit nothing by themselves: they only define the loop
// bodies that are pasted into the loops further down. The bodies come from
// per-core include files that are not visible from this file.
{% capture packed_packed_loop1 %}
    {% if core == "a53" %}
        {% include "arm64simd_mmm_f32_64x1/loop1/cortex_a53.tmpli" %}
    {% else %}
        {% include "arm64simd_mmm_f32_64x1/loop1/naive.tmpli" %}
    {% endif %}
{% endcapture %}

{% capture packed_packed_loop2 %}
    {% if core == "a53" %}
        {{ packed_packed_loop1 }}
        {{ packed_packed_loop1 }}
    {% elsif core == "a55" %}
        {% include "arm64simd_mmm_f32_64x1/loop2/cortex_a55.tmpli" %}
    {% else %}
        {% include "arm64simd_mmm_f32_64x1/loop2/naive.tmpli" %}
    {% endif %}
{% endcapture %}

// Unrolled loop, entered only with x3 >= 4. Each pass pastes the two-step body
// twice and takes 4 off the counter (presumably two k steps per pasted body --
// confirm against the loop2 include files).
.p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cmp         x3, #0
    beq         .packed_packed_loop_1_last

// Remainder loop: one pasted single-step body per pass until x3 reaches 0.
.p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

// last loop can't read beyond actual input as it's likely not packed and padded
//
// Final k step: v0..v7 and v8.s[0] are already loaded. Only the remaining
// 32 A values are fetched (the last ld1 does not post-increment x1) and no
// further B value is read.
.packed_packed_loop_1_last:
    ld1         { v9.4s, v10.4s, v11.4s, v12.4s }, [x1], #64
    ld1         { v13.4s, v14.4s, v15.4s }, [x1], #48

    fmla        v16.4s, v0.4s, v8.s[0]
    fmla        v17.4s, v1.4s, v8.s[0]
    ld1         { v0.4s }, [ x1 ]           // v0 was consumed by the v16 fmla above: reuse it for rows 60..63
    fmla        v18.4s, v2.4s, v8.s[0]
    fmla        v19.4s, v3.4s, v8.s[0]
    fmla        v20.4s, v4.4s, v8.s[0]
    fmla        v21.4s, v5.4s, v8.s[0]
    fmla        v22.4s, v6.4s, v8.s[0]
    fmla        v23.4s, v7.4s, v8.s[0]

    fmla        v24.4s, v9.4s, v8.s[0]
    fmla        v25.4s, v10.4s, v8.s[0]
    fmla        v26.4s, v11.4s, v8.s[0]
    fmla        v27.4s, v12.4s, v8.s[0]
    fmla        v28.4s, v13.4s, v8.s[0]
    fmla        v29.4s, v14.4s, v8.s[0]
    fmla        v30.4s, v15.4s, v8.s[0]
    fmla        v31.4s, v0.4s, v8.s[0]

    b           .non_linear_loop

// Operations shared with the other f32 kernels, instantiated for the 64-row
// tile held in v16..v31.
{% include "arm64simd_mmm_f32_scalars.tmpliq" from:16, to:31%}
{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:64, from:16, to:31%}
{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:64, from:16, to:31%}

// C tile += 64 f32 values read from memory.
// x5 = source pointer, x6 = byte stride between consecutive rows.
.add_unicast:
    ldp         x5, x6, [x0, #8]    // c base ptr, rsc

    cmp         x6, #4
    beq         .do_per_row_add     // stride of 4 bytes: rows are contiguous f32, use vector loads

    // Strided rows: gather four values lane by lane into v0, then add them to
    // one tile register. Repeated for each of v16..v31.
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            ld1         {v0.s}[{{lane}}], [ x5 ], x6
        {% endfor %}
        fadd        v{{reg}}.4s, v{{reg}}.4s, v0.4s
    {% endfor %}

    b           .non_linear_loop

// Contiguous rows: load all 64 values into v0..v15 and add them register-wise
// to v16..v31. v8..v15 are overwritten here; their low halves were saved on
// entry and are restored at .return.
.do_per_row_add:
    ld1         {v0.4s-v3.4s}, [x5], #64
    ld1         {v4.4s-v7.4s}, [x5], #64
    ld1         {v8.4s-v11.4s}, [x5], #64
    ld1         {v12.4s-v15.4s}, [x5], #64

    {% for r in (0..15) %}
        fadd        v{{r| plus: 16}}.4s, v{{r | plus: 16}}.4s, v{{r}}.4s
    {% endfor %}

    b           .non_linear_loop

// C tile += (64 f32 values read through the pointer at [x0, #8])
//           * (one f32 value read through the pointer at [x0, #16]).
.add_row_col_products:
    ldr         x3, [x0, #16]
    ldr         x2, [x0, #8]

    ld1         {v8.s}[0], [ x3 ]           // the single multiplier, kept in v8.s[0]

    // First 32 values go to v0..v7.
    {% for r in (0..7) %}
        ldr         q{{r}}, [x2], #16
    {% endfor %}

    // Each of v0..v7 is reloaded with a value from the second half right after
    // the fmla that consumed its first-half value.
    fmla        v16.4s, v0.4s, v8.s[0]
    ldr         q0, [x2], #16
    fmla        v17.4s, v1.4s, v8.s[0]
    ldr         q1, [x2], #16
    fmla        v18.4s, v2.4s, v8.s[0]
    ldr         q2, [x2], #16
    fmla        v19.4s, v3.4s, v8.s[0]
    ldr         q3, [x2], #16
    fmla        v20.4s, v4.4s, v8.s[0]
    ldr         q4, [x2], #16
    fmla        v21.4s, v5.4s, v8.s[0]
    ldr         q5, [x2], #16
    fmla        v22.4s, v6.4s, v8.s[0]
    ldr         q6, [x2], #16
    fmla        v23.4s, v7.4s, v8.s[0]
    ldr         q7, [x2], #16

    fmla        v24.4s, v0.4s, v8.s[0]
    fmla        v25.4s, v1.4s, v8.s[0]
    fmla        v26.4s, v2.4s, v8.s[0]
    fmla        v27.4s, v3.4s, v8.s[0]
    fmla        v28.4s, v4.4s, v8.s[0]
    fmla        v29.4s, v5.4s, v8.s[0]
    fmla        v30.4s, v6.4s, v8.s[0]
    fmla        v31.4s, v7.4s, v8.s[0]

    b           .non_linear_loop

// Write the C tile to memory.
// x5 = destination pointer, x6 = byte stride between consecutive rows.
.store:
    ldp         x5, x6, [x0, #8]    // c base ptr, rsc

    cmp         x6, #4
    beq         .store_strides_contig   // stride of 4 bytes: rows are contiguous f32, use vector stores

    // Strided rows: scatter one lane at a time, advancing by the row stride.
    {% for reg in (16..31) %}
        {% for lane in (0..3) %}
            st1         { v{{reg}}.s }[{{lane}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}
    b           .non_linear_loop

// Contiguous rows: sixteen 16-byte stores, v16 first.
.store_strides_contig:
    {% for reg in (16..31) %}
        st1         { v{{reg}}.4s }, [ x5 ], #16
    {% endfor %}
    b           .non_linear_loop

// Epilogue: pop the callee-saved registers in the reverse order of the entry
// pushes, then return to the caller.
.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret