// vim: ft=arm

// C tile regs: v16 to v31, no need to preserve
// no preservation either for v0-v7...
// v8..v15 are callee-preserved
// packed A buffering (2x8 values): alternating v0, v1 with v2, v3
// packed B buffering (2x8 values): alternating v4, v5 with v6, v7

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
.global {{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_128x1_{{core}}_{{suffix}}:

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

.add_mat_mul:
    ldr         x2, [x0, #24]       // b
    ldp         x3, x1, [x0, #8]    // k, a

    cmp         x3, #0
    beq         .non_linear_loop

    sub         x3, x3, #1
    ld1         { v8.h }[0], [ x2 ], #2
    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [ x1 ], #64
    ld1         { v4.4s, v5.4s, v6.4s, v7.4s }, [ x1 ], #64
    cmp         x3, #0
    beq         .packed_packed_loop_1_last

    cmp         x3, #4
    blt         .packed_packed_loop_1

{% capture packed_packed_loop1 %}
    {% include "arm64fp16_mmm_f16_128x1/loop1/naive.tmpli" %}
{% endcapture %}

{% capture packed_packed_loop2 %}
    {% include "arm64fp16_mmm_f16_128x1/loop2/cortex_a55.tmpli" %}
{% endcapture %}

    .p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cmp         x3, #0
    beq         .packed_packed_loop_1_last

    .p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

// last loop can't read beyond actual input as it's likely not packed and padded
.packed_packed_loop_1_last:
    ld1         { v9.8h, v10.8h, v11.8h, v12.8h }, [x1], #64
    ld1         { v13.8h, v14.8h, v15.8h }, [x1], #48

    fmla        v16.8h, v0.8h, v8.h[0]
    fmla        v17.8h, v1.8h, v8.h[0]
    ld1         { v0.8h }, [ x1 ]
    fmla        v18.8h, v2.8h, v8.h[0]
    fmla        v19.8h, v3.8h, v8.h[0]

    fmla        v20.8h, v4.8h, v8.h[0]
    fmla        v21.8h, v5.8h, v8.h[0]
    fmla        v22.8h, v6.8h, v8.h[0]
    fmla        v23.8h, v7.8h, v8.h[0]

    fmla        v24.8h, v9.8h, v8.h[0]
    fmla        v25.8h, v10.8h, v8.h[0]
    fmla        v26.8h, v11.8h, v8.h[0]
    fmla        v27.8h, v12.8h, v8.h[0]

    fmla        v28.8h, v13.8h, v8.h[0]
    fmla        v29.8h, v14.8h, v8.h[0]
    fmla        v30.8h, v15.8h, v8.h[0]
    fmla        v31.8h, v0.8h, v8.h[0]

    b           .non_linear_loop

{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:128, from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:128, from:16, to:31%}

.add_unicast:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    cmp         x6, #2
    beq         .do_per_row_add

    {% for reg in (16..31) %}
        {% for lane in (0..7) %}
            ld1 {v0.h}[{{lane}}], [ x5 ], x6
        {% endfor %}
        fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h
    {% endfor %}

    b           .non_linear_loop

.do_per_row_add:
    ld1         {v0.8h-v3.8h}, [x5], #64
    ld1         {v4.8h-v7.8h}, [x5], #64
    ld1         {v8.8h-v11.8h}, [x5], #64
    ld1         {v12.8h-v15.8h}, [x5], #64

    {% for r in (0..15) %}
        fadd v{{r | plus: 16}}.8h, v{{r | plus: 16}}.8h, v{{r}}.8h
    {% endfor %}

    b           .non_linear_loop

.add_row_col_products:
    ldr         x3, [x0, #16]
    ldr         x2, [x0, #8]

    ld1         {v8.h}[0], [ x3 ]

    {% for r in (0..7) %}
        ldr q{{r}}, [x2], #16
    {% endfor %}

    fmla        v16.8h, v0.8h, v8.h[0]
    ldr         q0, [x2], #16
    fmla        v17.8h, v1.8h, v8.h[0]
    ldr         q1, [x2], #16
    fmla        v18.8h, v2.8h, v8.h[0]
    ldr         q2, [x2], #16
    fmla        v19.8h, v3.8h, v8.h[0]
    ldr         q3, [x2], #16
    fmla        v20.8h, v4.8h, v8.h[0]
    ldr         q4, [x2], #16
    fmla        v21.8h, v5.8h, v8.h[0]
    ldr         q5, [x2], #16
    fmla        v22.8h, v6.8h, v8.h[0]
    ldr         q6, [x2], #16
    fmla        v23.8h, v7.8h, v8.h[0]
    ldr         q7, [x2], #16

    fmla        v24.8h, v0.8h, v8.h[0]
    fmla        v25.8h, v1.8h, v8.h[0]
    fmla        v26.8h, v2.8h, v8.h[0]
    fmla        v27.8h, v3.8h, v8.h[0]
    fmla        v28.8h, v4.8h, v8.h[0]
    fmla        v29.8h, v5.8h, v8.h[0]
    fmla        v30.8h, v6.8h, v8.h[0]
    fmla        v31.8h, v7.8h, v8.h[0]

    b           .non_linear_loop
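
// Store of the 128x1 f16 accumulator tile (v16..v31) back to C. Judging from the
// comparison below, x6 holds the row stride in bytes: when it is not 2, lanes are
// written one half-precision element at a time stepping by the stride; when it is
// 2 (contiguous f16 column), whole .8h vectors are stored instead.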
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    cmp         x6, #2
    beq         .store_strides_contig

    {% for reg in (16..31) %}
        {% for lane in (0..7) %}
            st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store_strides_contig:

    {% for reg in (16..31) %}
        st1 { v{{reg}}.8h }, [ x5 ], #16
    {% endfor %}

    b           .non_linear_loop

.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret