// vim: ft=arm

// 8x6 f32 matrix-multiply micro-kernel for ARMv7 NEON (Liquid template).
//
// Accumulator tile, as established by the vmla sequences below:
//   column c (0..5) is held in q(4+2c) for rows 0-3 and q(5+2c) for rows 4-7,
//   i.e. q4-q15 overall, or d(8+4c) .. d(11+4c) when viewed as d registers.
//
// Scratch registers: q0-q1 carry 8 values of A, d4-d6 carry 6 values of B.
//
// Every section ends by branching to .non_linear_loop, which is not defined
// in this file; it is presumably provided by the included dispatcher
// template (TODO confirm), together with the code that decides which of the
// labelled sections runs next.

    .arm
    .text
    .global armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}
    .type armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}, %function

armv7neon_mmm_f32_8x6_{{core}}_{{suffix}}:

    pld     [r0]
    // Save r4-r12 and q4-q7; they are restored in reverse order at .return.
    push    { r4-r12 }
    vpush   { q4-q7 }

{% include "dispatcher.tmpliq" %}

// Accumulate r3 rank-1 updates of packed A (8 floats per step) by packed B
// (6 floats per step) into the q4-q15 tile.
//   r3: iteration count (counted down to zero by the loop)
//   r4: packed A pointer (copied to r1, which is the pointer that advances)
//   r5: packed B pointer (advanced in place)
.add_mat_mul:
    cmp     r3, #0
    beq     .non_linear_loop            // zero iterations: nothing to add

    mov     r1, r4                      // packed A ptr
    // Warm up both panels. r3 holds the iteration count rather than an
    // address, so the A panel is prefetched through r1.
    pld     [r1]
    pld     [r5]

    .packed_packed_loop_1:

    {% if core == "cortexa7" %}
        // Loads at fixed offsets; r1 and r5 are bumped at the end of the
        // iteration instead of using write-back addressing.
        vldr            d0, [r1]
        vldr            d1, [r1, #8]
        vldr            d2, [r1, #16]
        vldr            d3, [r1, #24]
        vldr            d4, [r5]
        vldr            d5, [r5, #8]
        vldr            d6, [r5, #16]
    {% elsif core == "cortexa9" %}
        vld1.64         {d0-d3}, [r1]!
        vld1.64         {d4, d5, d6}, [r5]!
    {% else %}
        vldmia          r1!, {q0-q1}
        vldmia          r5!, {d4-d6}
    {% endif %}

    {% if core != "generic" %}
        // Prefetch 512 bytes ahead of each panel pointer.
        pld             [r1, #512]
        pld             [r5, #512]
    {% endif %}

    // One B lane per tile column; q0 feeds rows 0-3, q1 feeds rows 4-7.
    vmla.f32        q4, q0, d4[0]
    vmla.f32        q5, q1, d4[0]

    vmla.f32        q6, q0, d4[1]
    vmla.f32        q7, q1, d4[1]

    vmla.f32        q8, q0, d5[0]
    vmla.f32        q9, q1, d5[0]

    vmla.f32        q10, q0, d5[1]
    vmla.f32        q11, q1, d5[1]

    vmla.f32        q12, q0, d6[0]
    vmla.f32        q13, q1, d6[0]

    vmla.f32        q14, q0, d6[1]
    vmla.f32        q15, q1, d6[1]

    {% if core == "cortexa7" %}
        add             r1, #32         // 8 floats of A consumed
        add             r5, #24         // 6 floats of B consumed
    {% endif %}

    subs            r3, r3, #1
    bne             .packed_packed_loop_1
    b               .non_linear_loop

// Shared helper sections; from/to presumably select the q4-q15 accumulator
// range and mr the tile height (TODO confirm against the included templates).
{% include "armv7neon_mmm_f32_scalars.tmpliq" from:4, to:15 %}
{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:4, to:15 %}
{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:4, to:15 %}

// Add a strided 8x6 matrix from memory to the accumulator tile.
// For each column, r2 walks down the rows with a post-increment of r4 bytes,
// two floats are gathered into d0 and added to one accumulator d register;
// r3 then moves to the next column by r5 bytes. r6 is not read here.
.add_unicast:
    // r3, r4, r5, r6 <- ptr, rsc, csc, size

    {% for col in (0..5) %}
        mov         r2, r3
        {% for reg in (0..3) %}
            vld1.f32    d0[0], [ r2 ], r4
            vld1.f32    d0[1], [ r2 ], r4
            vadd.f32    d{{col | times: 4 | plus: reg | plus : 8}}, d0
        {% endfor %}
        add         r3, r3, r5
    {% endfor %}

    b .non_linear_loop

// Single rank-1 update: 8 floats loaded from r3 (into q0-q1) times 6 floats
// loaded from r4 (into d4-d6), accumulated into the tile. Both pointers are
// written back past the data they consumed.
.add_row_col_products:
    vldmia          r3!, { q0, q1 }
    vldmia          r4!, { d4, d5, d6 }

    vmla.f32        q4, q0, d4[0]
    vmla.f32        q5, q1, d4[0]

    vmla.f32        q6, q0, d4[1]
    vmla.f32        q7, q1, d4[1]

    vmla.f32        q8, q0, d5[0]
    vmla.f32        q9, q1, d5[0]

    vmla.f32        q10, q0, d5[1]
    vmla.f32        q11, q1, d5[1]

    vmla.f32        q12, q0, d6[0]
    vmla.f32        q13, q1, d6[0]

    vmla.f32        q14, q0, d6[1]
    vmla.f32        q15, q1, d6[1]

    b .non_linear_loop

// Write the 8x6 accumulator tile (d8-d31) to memory.
// r4 is used as a byte post-increment between consecutive rows and r5 as the
// byte distance between consecutive columns.
.store:
    // r3, r4, r5 <- ptr, rsc, csc

    // With a 4-byte row stride each column is 8 contiguous floats, so whole
    // d registers can be stored back to back; any other stride takes the
    // lane-by-lane path.
    cmp     r4, #4
    bne     .store_generic

    {% for col in (0..5) %}
        mov         r8, r3                  // r8 walks down the current column
        {% for reg in (0..3) %}
            {% assign dreg = col | times: 4 | plus: 8 | plus: reg %}
            vst1.64     d{{dreg}}, [ r8 ]!
        {% endfor %}
        {% if col < 5 %}
            add         r3, r3, r5          // step to the next column
        {% endif %}
    {% endfor %}

    b       .non_linear_loop

// Strided store: each d register holds two vertically adjacent elements,
// written one lane at a time with r8 advancing by r4 after every element.
.store_generic:

    {% for col in (0..5) %}
        mov         r8, r3
        {% for reg in (0..3) %}
            {% assign dreg = col | times: 4 | plus: reg | plus: 8 %}
            vst1.f32    d{{dreg}}[0], [ r8 ], r4
            vst1.f32    d{{dreg}}[1], [ r8 ], r4
        {% endfor %}
        {% if col < 5 %}
            add         r3, r3, r5
        {% endif %}
    {% endfor %}

    b       .non_linear_loop

// Epilogue: undo the prologue saves in reverse order and return to the caller.
.return:
    vpop    { q4-q7 }
    pop     { r4-r12 }

    bx      lr