// vim: ft=arm

// 32x1 i32 matrix-multiply micro-kernel for ARMv7 NEON (Liquid template).
//
// C tile regs: q8..q15 (same storage as d16..d31), 32 i32 accumulators.
//   Row order follows register order: q8 holds rows 0..3, q9 rows 4..7,
//   and so on up to q15 with rows 28..31 (see the .store section).
// Scratch:     r1, q0..q7 (q4-q7 are saved and restored around the body).
//
// The dispatch loop comes from dispatcher.tmpliq, which is not visible in
// this file. NOTE(review): it presumably reads the operation list through
// r0, loads the operands of each operation into r3..r6 and jumps to one of
// the labels below. Every section here finishes by branching back to
// .non_linear_loop, which is defined by an included template.

    .arm
    .text
    .global armv7neon_mmm_i32_32x1_{{suffix}}
    .type armv7neon_mmm_i32_32x1_{{suffix}}, %function

armv7neon_mmm_i32_32x1_{{suffix}}:

    pld     [r0]
    push    { r4-r12 }
    vpush   { q4-q7 }

{% include "dispatcher.tmpliq" %}

// ---------------------------------------------------------------------------
// C += A * B over k steps.
.add_mat_mul:
    // r3 r4 r5 r6
    // k  a  b  packing

    cmp     r3, #0
    beq     .non_linear_loop            // k == 0: nothing to accumulate

    mov     r1, r4                      // packed A ptr
    // Prefetch the two panels that the loops below stream through.
    // (r3 holds the counter k, not an address, so it is not a useful
    // prefetch target.)
    pld     [r1]
    pld     [r5]

    cmp     r6, #1
    beq     .packed_packed_i8i8         // packing == 1: A and B hold i8 values

    // i32 x i32 path: each iteration consumes 32 i32 of A and one i32 of B.
.packed_packed:
.packed_packed_loop_1:
    vldmia      r1!, { q4-q7 }          // A rows 0..15
    vld1.32     { d0[0] }, [ r5 ]!      // B scalar
    vmla.s32    q8, q4, d0[0]           // consume q4 before it is reloaded
    vldmia      r1!, { q1-q4 }          // A rows 16..31
    vmla.s32    q9, q5, d0[0]
    vmla.s32    q10, q6, d0[0]
    vmla.s32    q11, q7, d0[0]
    vmla.s32    q12, q1, d0[0]
    vmla.s32    q13, q2, d0[0]
    vmla.s32    q14, q3, d0[0]
    vmla.s32    q15, q4, d0[0]

    subs        r3, r3, #1
    bne         .packed_packed_loop_1

    b           .non_linear_loop

    // i8 x i8 path: each iteration consumes 32 bytes of A and one byte of B.
    // Operands are sign-extended to s16, then multiplied into i32 lanes.
.packed_packed_i8i8:
.packed_packed_loop_i8i8_1:
    vldmia      r1!, { q4-q5 }          // 32 x i8 of A in d8..d11
    vld1.8      { d0[0] }, [ r5 ]!      // B byte
    vmovl.s8    q0, d0                  // B value is now s16 in d0[0]

    vmovl.s8    q1, d8                  // A rows 0..7 as s16
    vmlal.s16   q8, d2, d0[0]
    vmlal.s16   q9, d3, d0[0]

    vmovl.s8    q1, d9                  // A rows 8..15
    vmlal.s16   q10, d2, d0[0]
    vmlal.s16   q11, d3, d0[0]

    vmovl.s8    q1, d10                 // A rows 16..23
    vmlal.s16   q12, d2, d0[0]
    vmlal.s16   q13, d3, d0[0]

    vmovl.s8    q1, d11                 // A rows 24..31
    vmlal.s16   q14, d2, d0[0]
    vmlal.s16   q15, d3, d0[0]

    subs        r3, r3, #1
    bne         .packed_packed_loop_i8i8_1

    b           .non_linear_loop

{% include "armv7neon_mmm_i32_scalars.tmpliq" from:8, to:15 %}
{% include "armv7neon_mmm_i32_per_rows.tmpliq" mr:32, from:8, to:15 %}
{% include "armv7neon_mmm_i32_per_cols.tmpliq" mr:32, from:8, to:15 %}

// ---------------------------------------------------------------------------
// C += strided external tile. Elements are read one at a time, advancing the
// pointer by the row stride after each load. The column stride in r5 is not
// used by this section.
.add_unicast:
    // r3, r4, r5, r6 <- ptr, rsc, csc, size

    cmp     r6, #4
    beq     .non_linear_addc_i32        // size == 4: elements are i32

    // Otherwise elements are read as i8, two rows per d register, and
    // sign-extended s8 -> s16 -> s32 before the add.
    {% for reg in (16..31) %}
        vld1.s8     d0[0], [ r3 ], r4
        vld1.s8     d0[1], [ r3 ], r4
        vmovl.s8    q0, d0
        vmovl.s16   q0, d0
        vadd.i32    d{{reg}}, d0
    {% endfor %}

    b       .non_linear_loop

.non_linear_addc_i32:
    {% for reg in (16..31) %}
        vld1.s32    d0[0], [ r3 ], r4
        vld1.s32    d0[1], [ r3 ], r4
        vadd.i32    d{{reg}}, d0
    {% endfor %}

    b       .non_linear_loop

// ---------------------------------------------------------------------------
// C += v * s, where v is a contiguous vector of 32 i32 read through r3 and
// s is a single 32-bit value read through r4.
.add_row_col_products:
    vldm        r4, { s0 }              // s0 is the low half of d0, i.e. d0[0]

    vldmia      r3!, { q4-q7 }          // v[0..15]
    vmla.s32    q8, q4, d0[0]
    vmla.s32    q9, q5, d0[0]
    vmla.s32    q10, q6, d0[0]
    vmla.s32    q11, q7, d0[0]

    vldmia      r3!, { q4-q7 }          // v[16..31]
    vmla.s32    q12, q4, d0[0]
    vmla.s32    q13, q5, d0[0]
    vmla.s32    q14, q6, d0[0]
    vmla.s32    q15, q7, d0[0]

    b           .non_linear_loop

{% include "armv7neon_mmm_i32_scale_q8_q15.tmpliq" %}

// ---------------------------------------------------------------------------
// Write the tile to memory, one element at a time, advancing the pointer by
// the row stride after each store. The column stride in r5 is not used by
// this section.
.store:
    // r3, r4, r5, r6 <- ptr, rsc, csc, size

    cmp     r6, #4
    beq     .store_strides_i32          // size == 4: store i32 as is

    // Otherwise narrow each accumulator register in place, i32 -> i16 -> i8.
    // vmovn keeps the low bits (no saturation). After both steps the four
    // results of a q register sit in byte lanes 0..3 of its low d register.
    // The accumulators are overwritten by this narrowing.
    {% for reg in (8..15) %}
        vmovn.s32 d{{reg | times: 2}}, q{{reg}}
        vmovn.s16 d{{reg | times: 2}}, q{{reg}}
    {% endfor %}
    {% for reg in (8..15) %}
        {%capture d%}{{reg | times: 2 }}{%endcapture%}
        vst1.s8     d{{d}}[0], [ r3 ], r4
        vst1.s8     d{{d}}[1], [ r3 ], r4
        vst1.s8     d{{d}}[2], [ r3 ], r4
        vst1.s8     d{{d}}[3], [ r3 ], r4
    {% endfor %}

    b       .non_linear_loop

.store_strides_i32:
    // Each q register is stored as four i32 lanes: low d register first,
    // then the high one.
    {% for reg in (8..15) %}
        {%capture d%}{{reg | times: 2}}{%endcapture%}
        vst1.s32    d{{d}}[0], [ r3 ], r4
        vst1.s32    d{{d}}[1], [ r3 ], r4
        vst1.s32    d{{d|plus:1}}[0], [ r3 ], r4
        vst1.s32    d{{d|plus:1}}[1], [ r3 ], r4
    {% endfor %}

    b       .non_linear_loop

// ---------------------------------------------------------------------------
// Epilogue: restore the registers saved at entry, in reverse order.
.return:
    vpop    { q4-q7 }
    pop     { r4-r12 }

    bx      lr