// vim: ft=arm

// Callee-saved registers:
// - x19-x29 to preserve (but x19, x28, x29 not used)
// - d8..d15 to preserve
// - v16 to v31: no need to preserve
//
// C tile regs (12 rows x 8 cols; 3 vector registers of 4 lanes per column):
//
//      v8  v11 v14 v17 v20 v23 v26 v29
//      v9  v12 v15 v18 v21 v24 v27 v30
//      v10 v13 v16 v19 v22 v25 v28 v31
//
// No preservation needed for v0-v7:
// - packed A buffering (2x8 values): rotating over v0..v3
// - packed B buffering (2x8 values): alternating v4, v5 with v6, v7

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}
{{G}}arm64simd_mmm_f32_12x8_{{core}}_{{suffix}}:

    stp         x20, x21, [sp, #-16]!
    stp         x22, x23, [sp, #-16]!
    stp         x24, x25, [sp, #-16]!
    stp         x26, x27, [sp, #-16]!

    stp         d8, d9, [sp, #-16]!
    stp         d10, d11, [sp, #-16]!
    stp         d12, d13, [sp, #-16]!
    stp         d14, d15, [sp, #-16]!

// op dispatcher: defines .non_linear_loop and branches to the handlers below
{% include "dispatcher.tmpliq" %}

// C += A * B: each k step consumes 12 packed A floats and 8 packed B floats
.add_mat_mul:
    ldr         x2, [x0, #24]           // b
    ldp         x3, x1, [x0, #8]        // k, a

    cmp         x3, #0
    beq         .non_linear_loop

    // preload the first A panel (12 values) and B panel (8 values)
    ld1         { v0.4s, v1.4s, v2.4s }, [ x1 ], #48
    ld1         { v4.4s, v5.4s }, [ x2 ], #32

    {% capture packed_packed_loop1 %}
        {% if core == "a53" %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/ldr_x_preload.tmpli" %}
        {% else %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop1/naive.tmpli" %}
        {% endif %}
    {% endcapture %}

    {% capture packed_packed_loop2 %}
        {% if core == "a55" %}
            {% include "arm64simd_mmm_f32_12x8/packed_packed_loop2/cortex_a55.tmpli" %}
        {% else %}
            {{ packed_packed_loop1 }}
            {{ packed_packed_loop1 }}
        {% endif %}
    {% endcapture %}

    cmp         x3, #4
    blt         .packed_packed_loop_1

// main loop, unrolled over 4 k steps
    .p2align 4
.packed_packed_loop_4:
    {{ packed_packed_loop2 }}
    {{ packed_packed_loop2 }}

    sub         x3, x3, #4
    cmp         x3, #4
    bge         .packed_packed_loop_4

    cmp         x3, #0
    beq         .non_linear_loop

// remainder loop, one k step at a time
    .p2align 4
.packed_packed_loop_1:
    {{ packed_packed_loop1 }}

    subs        x3, x3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

{% include "arm64simd_mmm_f32_scalars.tmpliq" from:8, to:31 %}
{% include "arm64simd_mmm_f32_per_rows.tmpliq" mr:12, from:8, to:31 %}
{% include "arm64simd_mmm_f32_per_cols.tmpliq" mr:12, from:8, to:31 %}

// add an arbitrarily strided f32 tensor to the C tile, lane by lane
.add_unicast:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    {% for col in (0..7) %}
        mov x4, x5
        {% for reg in (0..2) %}
            {% for lane in (0..3) %}
                ld1 { v0.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
            fadd v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{col | times:3 | plus: 8 | plus: reg}}.4s, v0.4s
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// rank-1 update: C[i,j] += rows[i] * cols[j]
// 12-value row vector in v0-v2, 8-value column vector in v4-v5
.add_row_col_products:
    ldr         x2, [x0, #8]
    ldr         x3, [x0, #16]

    ld1         { v0.4s, v1.4s, v2.4s }, [ x2 ]
    ld1         { v4.4s, v5.4s }, [ x3 ]

    {% for col in (0..7) %}
        {% for reg in (0..2) %}
            fmla v{{col | times:3 | plus: 8 | plus: reg}}.4s, v{{reg}}.4s, v{{col | divided_by: 4 | plus: 4}}.s[{{col | modulo: 4}}]
        {% endfor %}
    {% endfor %}

    b           .non_linear_loop

.store:
    ldp         x5, x6, [x0, #8]        // c base ptr, rsc
    ldp         x7, x8, [x0, #24]       // csc, item_size

    // fast path: rsc == 4 bytes means each 12-float column is contiguous
    // and can be written as three q registers
    cmp         x6, #4
    bne         .store_strides_generic

    {% for col in (0..7) %}
        str q{{col | times:3 | plus: 8}}, [ x5 ]
        str q{{col | times:3 | plus: 9}}, [ x5, #16 ]
        str q{{col | times:3 | plus: 10}}, [ x5, #32 ]
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

// generic path: store lane by lane, honoring both rsc and csc
.store_strides_generic:

    {% for col in (0..7) %}
        mov x4, x5
        {% for reg in (0..2) %}
            {% for lane in (0..3) %}
                st1 { v{{col | times:3 | plus: 8 | plus: reg}}.s }[{{lane}}], [ x4 ], x6
            {% endfor %}
        {% endfor %}
        add x5, x5, x7
    {% endfor %}

    b           .non_linear_loop

.return:
    ldp         d14, d15, [sp], #16
    ldp         d12, d13, [sp], #16
    ldp         d10, d11, [sp], #16
    ldp         d8, d9, [sp], #16

    ldp         x26, x27, [sp], #16
    ldp         x24, x25, [sp], #16
    ldp         x22, x23, [sp], #16
    ldp         x20, x21, [sp], #16

    ret
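
// ---------------------------------------------------------------------------
// Template expansion examples (documentation only, nothing here is assembled).
// A sketch of how the Liquid filters above pick registers: column `col` (0..7)
// and row-register `reg` (0..2) always map to accumulator v(8 + 3*col + reg),
// matching the C tile grid in the header. For instance, the col=2, reg=1
// iteration of .add_row_col_products expands to:
//
//      fmla v15.4s, v1.4s, v4.s[2]
//
// v15 = 8 + 3*2 + 1; the multiplier is lane col%4 = 2 of v(col/4 + 4) = v4,
// i.e. the B value for column 2. Similarly, the col=1 iteration of the
// contiguous .store fast path expands to:
//
//      str q11, [ x5 ]
//      str q12, [ x5, #16 ]
//      str q13, [ x5, #32 ]
//      add x5, x5, x7
//
// writing the 12 rows of column 1 (v11, v12, v13) before stepping x5 by the
// column stride csc.
// ---------------------------------------------------------------------------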