// vim: ft=arm

// C tile regs: v16 to v31, no need to preserve
// no preservation either for v0-v7...
// v8..v15 are callee-preserved
// packed A values: v0..v7 (64 f16 per k step)
// packed B value: a single f16 per k step, broadcast from v8.h[0]

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}

.global {{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}}
{{G}}arm64fp16_mmm_f16_64x1_{{core}}_{{suffix}}:

    stp x20, x21, [sp, #-16]!
    stp x22, x23, [sp, #-16]!
    stp x24, x25, [sp, #-16]!

    stp d8, d9, [sp, #-16]!
    stp d10, d11, [sp, #-16]!
    stp d12, d13, [sp, #-16]!
    stp d14, d15, [sp, #-16]!

{% include "dispatcher.tmpliq" %}

.add_mat_mul:
    ldp x2, x4, [x0, #24]       // b, packing
    ldp x3, x1, [x0, #8]        // k, a

    cmp x3, #0
    beq .non_linear_loop

    cmp x4, #1
    beq .q4f16se                // packing 1: q4 weights, scales after each 32-k block
    cmp x4, #2
    beq .q4f16                  // packing 2: q4 weights, scales before each 32-k block

.p2align 4
.packed_packed_loop_1:
    ld1 { v8.h }[0], [ x2 ], #2
    ld1 { v0.8h, v1.8h, v2.8h, v3.8h }, [ x1 ], #64
    ld1 { v4.8h, v5.8h, v6.8h, v7.8h }, [ x1 ], #64

    fmla v24.8h, v0.8h, v8.h[0]
    fmla v25.8h, v1.8h, v8.h[0]
    fmla v26.8h, v2.8h, v8.h[0]
    fmla v27.8h, v3.8h, v8.h[0]
    fmla v28.8h, v4.8h, v8.h[0]
    fmla v29.8h, v5.8h, v8.h[0]
    fmla v30.8h, v6.8h, v8.h[0]
    fmla v31.8h, v7.8h, v8.h[0]

    subs x3, x3, #1
    bne .packed_packed_loop_1

    b .non_linear_loop

.p2align 8
.q40f16_const:
    // high bytes of the f16 values for the 16 nibble codes: nibble n maps to f16(n - 8)
    .byte 0xc8, 0xc7, 0xc6, 0xc5, 0xc4, 0xc2, 0xc0, 0xbc
    .byte 0x00, 0x3c, 0x40, 0x42, 0x44, 0x45, 0x46, 0x47

.q4f16se:
    adr x4, .q40f16_const
    movi v15.16b, 15                    // low-nibble mask
    ld1 {v13.16b}, [ x4 ]               // dequant lookup table
    eor v12.16b, v12.16b, v12.16b       // zero, supplies f16 low bytes in the zips

.q4f16se_outerloop:
    // clear v16..v23, the unscaled accumulators for this 32-k block
    {% for i in (0..7) %}
        eor v{{i|plus:16}}.16b, v{{i|plus:16}}.16b, v{{i|plus:16}}.16b
    {% endfor %}

    mov x4, #32

.p2align 4
.q4f16se_innerloop:
    ld1 { v9.16b-v10.16b }, [x1], #32
    ld1 { v8.h }[0], [ x2 ], #2

    and v0.16b, v9.16b, v15.16b
    ushr v2.16b, v9.16b, 4
    and v4.16b, v10.16b, v15.16b
    ushr v6.16b, v10.16b, 4

    tbl v0.16b, { v13.16b }, v0.16b
    tbl v2.16b, { v13.16b }, v2.16b
    tbl v4.16b, { v13.16b }, v4.16b
    tbl v6.16b, { v13.16b }, v6.16b

    zip2 v1.16b, v12.16b, v0.16b
    zip2 v3.16b, v12.16b, v2.16b
    zip2 v5.16b, v12.16b, v4.16b
    zip2 v7.16b, v12.16b, v6.16b

    zip1 v0.16b, v12.16b, v0.16b
    zip1 v2.16b, v12.16b, v2.16b
    zip1 v4.16b, v12.16b, v4.16b
    zip1 v6.16b, v12.16b, v6.16b

    {% for i in (0..7) %}
        fmla v{{ i|plus: 16 }}.8h, v{{i}}.8h, v8.h[0]
    {% endfor %}

    subs x4, x4, #1
    bne .q4f16se_innerloop

    // scales
    ld1 { v0.8h-v3.8h }, [ x1 ], #64
    ld1 { v4.8h-v7.8h }, [ x1 ], #64

    // apply the block scales while accumulating into the C regs
    {% for i in (0..7) %}
        fmla v{{i|plus:24}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h
    {% endfor %}

    subs x3, x3, #32
    bne .q4f16se_outerloop

    b .non_linear_loop

.q4f16:
    adr x4, .q40f16_const
    movi v15.16b, 15
    ld1 {v13.16b}, [ x4 ]
    eor v12.16b, v12.16b, v12.16b

.q4f16_outerloop:
    // scales for this 32-k block, loaded into v16..v23 up front
    ld1 { v16.8h-v19.8h }, [ x1 ], #64
    ld1 { v20.8h-v23.8h }, [ x1 ], #64

    mov x4, #32

.p2align 4
.q4f16_innerloop:
    ld1 { v9.16b-v10.16b }, [x1], #32
    ld1 { v8.h }[0], [ x2 ], #2

    and v0.16b, v9.16b, v15.16b
    ushr v2.16b, v9.16b, 4
    and v4.16b, v10.16b, v15.16b
    ushr v6.16b, v10.16b, 4

    tbl v0.16b, { v13.16b }, v0.16b
    tbl v2.16b, { v13.16b }, v2.16b
    tbl v4.16b, { v13.16b }, v4.16b
    tbl v6.16b, { v13.16b }, v6.16b

    zip2 v1.16b, v12.16b, v0.16b
    zip2 v3.16b, v12.16b, v2.16b
    zip2 v5.16b, v12.16b, v4.16b
    zip2 v7.16b, v12.16b, v6.16b

    zip1 v0.16b, v12.16b, v0.16b
    zip1 v2.16b, v12.16b, v2.16b
    zip1 v4.16b, v12.16b, v4.16b
    zip1 v6.16b, v12.16b, v6.16b

    {% for i in (0..7) %}
        fmul v{{i}}.8h, v{{i}}.8h, v{{i|plus:16}}.8h
    {% endfor %}

    {% for i in (0..7) %}
        fmla v{{ i|plus: 24 }}.8h, v{{i}}.8h, v8.h[0]
    {% endfor %}

    subs x4, x4, #1
    bne .q4f16_innerloop

    subs x3, x3, #32
    bne .q4f16_outerloop

    b .non_linear_loop
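// Worked example of the tbl/zip dequantization above (a sketch of what
// the instruction sequence computes, not extra code that runs): a packed
// byte 0x1f holds two q4 nibbles, 0xf (low) and 0x1 (high). .q40f16_const
// maps nibble n to the high byte of f16(n - 8), so tbl yields 0x47 (7.0)
// for 0xf and 0xc7 (-7.0) for 0x1. zip1/zip2 against the zeroed v12 then
// interleave a 0x00 low byte under each table byte, producing the complete
// little-endian f16 values 0x4700 and 0xc700 without any int-to-float
// conversion instruction.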
include "arm64fp16_mmm_f16_scalars.tmpliq" from:24, to:31%} {% include "arm64fp16_mmm_f16_per_rows.tmpliq" mr:64, from:24, to:31%} {% include "arm64fp16_mmm_f16_per_cols.tmpliq" mr:64, from:24, to:31%} .add_unicast: ldp x5, x6, [x0, #8] // c base ptr, rsc cmp x6, #2 beq .do_per_row_add {% for reg in (24..31) %} {% for lane in (0..7) %} ld1 {v0.h}[{{lane}}], [ x5 ], x6 {% endfor %} fadd v{{reg}}.8h, v{{reg}}.8h, v0.8h {% endfor %} b .non_linear_loop .do_per_row_add: ld1 {v0.8h-v3.8h}, [x5], #64 ld1 {v4.8h-v7.8h}, [x5], #64 {% for r in (0..7) %} fadd v{{r| plus: 24}}.8h, v{{r | plus: 24}}.8h, v{{r}}.8h {% endfor %} b .non_linear_loop .add_row_col_products: ldr x3, [x0, #16] ldr x2, [x0, #8] ld1 {v8.h}[0], [ x3 ] {% for r in (0..7) %} ldr q{{r}}, [x2], #16 {% endfor %} fmla v24.8h, v0.8h, v8.h[0] fmla v25.8h, v1.8h, v8.h[0] fmla v26.8h, v2.8h, v8.h[0] fmla v27.8h, v3.8h, v8.h[0] fmla v28.8h, v4.8h, v8.h[0] fmla v29.8h, v5.8h, v8.h[0] fmla v30.8h, v6.8h, v8.h[0] fmla v31.8h, v7.8h, v8.h[0] b .non_linear_loop .store: ldp x5, x6, [x0, #8] // c base ptr, rsc$ cmp x6, #2 beq .store_strides_contig {% for reg in (24..31) %} {% for lane in (0..7) %} st1 { v{{reg}}.h }[{{lane}}], [ x5 ], x6 {% endfor %} {% endfor %} b .non_linear_loop .store_strides_contig: {% for reg in (24..31) %} st1 { v{{reg}}.8h }, [ x5 ], #16 {% endfor %} b .non_linear_loop .return: ldp d14, d15, [sp], #16 ldp d12, d13, [sp], #16 ldp d10, d11, [sp], #16 ldp d8, d9, [sp], #16 ldp x24, x25, [sp], #16 ldp x22, x23, [sp], #16 ldp x20, x21, [sp], #16 ret