// vim: ft=arm

// Target: AArch64 (AAPCS64), GNU assembler syntax, rendered through Liquid.
//
// fp16 matrix-multiply micro-kernel operating on a 16-row x 8-column tile.
//
// Register usage visible in this file:
//
//   x0        pointer to the current operation record; the fields read here
//             live at offsets #8, #16, #24 and #32. x0 is presumably advanced
//             by the included dispatcher -- confirm in dispatcher.tmpliq.
//   x1..x8    scratch (operands, pointers, strides)
//   x20..x27  saved in the prologue and restored in .return (callee-saved).
//             Not referenced directly in this file; presumably used by the
//             included templates.
//   v0..v7    scratch. Here v0/v1 receive 16 packed A values and v4 receives
//             8 packed B values. NOTE(review): the original header mentioned
//             double-buffering A in v0,v1 / v2,v3 and B in v4,v5 / v6,v7;
//             whether that happens depends on the included loop bodies.
//   v8        q8 is saved/restored in full; AAPCS64 only requires d8 (the
//             lower half) to be preserved.
//   v16..v31  C tile accumulators (caller-saved scratch), 8 fp16 lanes each.
//
// Accumulator layout -- column c (0..7) lives in the pair v(16+2c), v(17+2c):
//
//               col0  col1  col2  col3  col4  col5  col6  col7
//   rows 0-7    v16   v18   v20   v22   v24   v26   v28   v30
//   rows 8-15   v17   v19   v21   v23   v25   v27   v29   v31

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
.global {{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_16x8_{{core}}_{{suffix}}:

    // Prologue: 5 x 16 bytes pushed, sp stays 16-byte aligned.
    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!
    str         q8, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

// ---------------------------------------------------------------------------
// .add_mat_mul: accumulate k steps of packed A (16 values per step) times
// packed B (8 values per step) into v16..v31.
//   [x0, #8] = k, [x0, #16] = a, [x0, #24] = b
// ---------------------------------------------------------------------------
.add_mat_mul:
    ldr         x2, [x0, #24]               // x2 = b
    ldp         x3, x1, [x0, #8]            // x3 = k, x1 = a

    cbz         x3, .non_linear_loop        // k == 0: nothing to accumulate

.packed_packed:
    // Preload the first A block (v0, v1) and the first B block (v4).
    ld1         { v0.4s, v1.4s }, [ x1 ], #32
    ld1         { v4.4s }, [ x2 ], #16

{% capture packed_packed_loop1 %}
    {% include "arm64fp16_mmm_f16_16x8/loop1/naive.tmpli" %}
{% endcapture %}

{% capture packed_packed_loop2 %}
    {% if core == "a55" %}
        {% include "arm64fp16_mmm_f16_16x8/loop2/cortex_a55.tmpli" %}
    {% else %}
        {{ packed_packed_loop1 }}
        {{ packed_packed_loop1 }}
    {% endif %}
{% endcapture %}

    cmp         x3, #4
    blt         .packed_packed_loop_1       // fewer than 4 steps: tail loop only

// Main loop: two loop2 bodies per pass and x3 decremented by 4, i.e. four
// k steps per pass. (The A55 loop2 template is expected to cover two k steps
// like the generic one -- confirm in cortex_a55.tmpli.)
.p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cbz         x3, .non_linear_loop        // k was a multiple of 4: no tail

// Tail loop: one k step per pass; x3 is in 1..3 when entered from above.
.p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}
    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

{% include "arm64fp16_mmm_f16_scalars.tmpliq" from:16, to:31%}
{% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:16, from:16, to:31 %}
{% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:16, from:16, to:31 %}

// ---------------------------------------------------------------------------
// .add_unicast: add a 16x8 fp16 tile read from memory to the accumulators.
//   [x0, #8]  = base pointer        [x0, #16] = row stride in bytes
//   [x0, #24] = column stride in bytes
//   [x0, #32] = loaded into x8 but not used here (called item_size in .store)
// Only v0 is used as a temporary on both paths.
// ---------------------------------------------------------------------------
.add_unicast:
    ldp         x5, x6, [x0, #8]            // x5 = base ptr, x6 = row stride
    ldp         x7, x8, [x0, #24]           // x7 = column stride, x8 unused

    // Row stride of 2 bytes: each column is 16 contiguous fp16 values, so it
    // can be fetched with two 128-bit loads (same test as in .store).
    cmp         x6, #2
    bne         .add_unicast_strides_generic

    {% for col in (8..15) %}
        ldr q0, [ x5 ]
        fadd v{{col | times:2 }}.8h, v{{col | times:2 }}.8h, v0.8h
        ldr q0, [ x5, #16 ]
        fadd v{{col | times:2 | plus: 1}}.8h, v{{col | times:2 | plus: 1}}.8h, v0.8h
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Generic strides: gather each column one fp16 lane at a time. x4 walks down
// the 16 rows of the column, x5 steps from one column to the next.
.add_unicast_strides_generic:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..7) %}
                ld1 {v0.h}[{{lane}}], [ x4 ], x6
            {% endfor %}
            fadd v{{col | times:2 | plus: reg}}.8h, v{{col | times:2 | plus: reg}}.8h, v0.8h
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// ---------------------------------------------------------------------------
// .add_row_col_products: rank-1 update, C[r][c] += row_vec[r] * col_vec[c].
//   [x0, #8]  = pointer to 16 fp16 values (one per tile row)    -> v0, v1
//   [x0, #16] = pointer to 8 fp16 values (one per tile column)  -> v4
// ---------------------------------------------------------------------------
.add_row_col_products:
    ldr         x2, [x0, #8]
    ldr         x3, [x0, #16]

    ld1         { v0.4s, v1.4s }, [ x2 ], #32
    ld1         { v4.4s }, [ x3 ], #16

    fmla        v16.8h, v0.8h, v4.h[0]
    fmla        v17.8h, v1.8h, v4.h[0]
    fmla        v18.8h, v0.8h, v4.h[1]
    fmla        v19.8h, v1.8h, v4.h[1]
    fmla        v20.8h, v0.8h, v4.h[2]
    fmla        v21.8h, v1.8h, v4.h[2]
    fmla        v22.8h, v0.8h, v4.h[3]
    fmla        v23.8h, v1.8h, v4.h[3]
    fmla        v24.8h, v0.8h, v4.h[4]
    fmla        v25.8h, v1.8h, v4.h[4]
    fmla        v26.8h, v0.8h, v4.h[5]
    fmla        v27.8h, v1.8h, v4.h[5]
    fmla        v28.8h, v0.8h, v4.h[6]
    fmla        v29.8h, v1.8h, v4.h[6]
    fmla        v30.8h, v0.8h, v4.h[7]
    fmla        v31.8h, v1.8h, v4.h[7]

    b           .non_linear_loop

// ---------------------------------------------------------------------------
// .store: write the 16x8 accumulator tile to memory.
//   [x0, #8]  = c base ptr          [x0, #16] = rsc (row stride, bytes)
//   [x0, #24] = csc (column stride) [x0, #32] = item_size (loaded, unused)
// ---------------------------------------------------------------------------
.store:
    ldp         x5, x6, [x0, #8]            // c base ptr, rsc
    ldp         x7, x8, [x0, #24]           // csc, item_size

    // rsc == 2: a column is 32 contiguous bytes, store it as two q registers.
    cmp         x6, #2
    bne         .store_strides_generic

    {% for col in (8..15) %}
        str q{{col | times:2 }}, [ x5 ]
        str q{{col | times:2 | plus: 1}}, [ x5, #16 ]
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// Generic strides: scatter each column one fp16 lane at a time.
.store_strides_generic:
    {% for col in (8..15) %}
        mov x4, x5
        {% for reg in (0..1) %}
            {% for lane in (0..7) %}
                st1 { v{{col | times:2 | plus: reg}}.h }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// ---------------------------------------------------------------------------
// .return: epilogue, pops in the reverse order of the prologue pushes.
// ---------------------------------------------------------------------------
.return:
    ldr         q8, [sp], #16
    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret