// vim: ft=arm

    .arm
    .text
    .global armvfpv2_mmm_f32_4x4_{{suffix}}
    .type armvfpv2_mmm_f32_4x4_{{suffix}}, %function

// C tile:
//
//      s16 s20 s24 s28
//      s17 s21 s25 s29
//      s18 s22 s26 s30
//      s19 s23 s27 s31

// packed A: (2x4) alternating between (s0-s3) and (s4-s7)
// packed B: (2x4) alternating between (s8-s11) and (s12-s15)

// all vfp registers in use.

armvfpv2_mmm_f32_4x4_{{suffix}}:

/*
    pld     [r1]
    pld     [r1, #8]
    pld     [r2]
    pld     [r2, #8]
*/

    push    { r4-r12 }                  // no lr (we're a leaf), no fp. #24 bytes

    ldr     r8, [sp, #28]
    ldr     r9, [sp, #24]               // r8=rsc, r9=csc

    vmrs    r6, FPSCR
    bic     r6, r6, #0x00370000         // clear FPSCR vector length/stride bits: scalar VFP mode
    vmsr    FPSCR, r6

    vpush   { s16-s31 }

{% include "dispatcher.tmpliq" %}

.clear:
    eor     r6, r6
    vmov    s16, r6
    vmov.f32 s17, s16
    vmov.f32 s18, s16
    vmov.f32 s19, s16
    vmov.f32 s20, s16
    vmov.f32 s21, s16
    vmov.f32 s22, s16
    vmov.f32 s23, s16
    vmov.f32 s24, s16
    vmov.f32 s25, s16
    vmov.f32 s26, s16
    vmov.f32 s27, s16
    vmov.f32 s28, s16
    vmov.f32 s29, s16
    vmov.f32 s30, s16
    vmov.f32 s31, s16

    b       .non_linear_loop

.add_mat_mul:
    // r3 <- k, r4 <- a, r5 <- b
    cmp     r3, #0
    beq     .non_linear_loop

    mov     r1, r4                      // packed A ptr
    pld     [r1]                        // prefetch packed A
    pld     [r5]                        // prefetch packed B

.packed_packed:
    cmp     r3, #4
    blt     .packed_packed_loop_1

.packed_packed_loop_4:
    // 1
    vldmia  r1!, { s0, s1 }
    vldmia  r5!, { s8, s9 }
    vmla.f32 s16, s0, s8
    vldmia  r1!, { s2, s3 }
    vmla.f32 s17, s1, s8
    vldmia  r5!, { s10, s11 }
    vmla.f32 s18, s2, s8
    vmla.f32 s19, s3, s8

    vmla.f32 s20, s0, s9
    vmla.f32 s21, s1, s9
    vmla.f32 s22, s2, s9
    vmla.f32 s23, s3, s9

    vldmia  r1!, { s4-s7 }

    vmla.f32 s24, s0, s10
    vmla.f32 s25, s1, s10
    vmla.f32 s26, s2, s10
    vmla.f32 s27, s3, s10

    vldmia  r5!, { s12-s15 }

    vmla.f32 s28, s0, s11
    vmla.f32 s29, s1, s11
    vmla.f32 s30, s2, s11
    vmla.f32 s31, s3, s11

    // 2
    vmla.f32 s16, s4, s12
    vmla.f32 s17, s5, s12
    vmla.f32 s18, s6, s12
    vmla.f32 s19, s7, s12

    vldmia  r1!, { s0-s3 }

    vmla.f32 s20, s4, s13
    vmla.f32 s21, s5, s13
    vmla.f32 s22, s6, s13
    vmla.f32 s23, s7, s13

    vldmia  r5!, { s8-s11 }

    vmla.f32 s24, s4, s14
    vmla.f32 s25, s5, s14
    vmla.f32 s26, s6, s14
    vmla.f32 s27, s7, s14

    vmla.f32 s28, s4, s15
    vmla.f32 s29, s5, s15
    vmla.f32 s30, s6, s15
    vmla.f32 s31, s7, s15

    // 3
    vmla.f32 s16, s0, s8
    vmla.f32 s17, s1, s8
    vmla.f32 s18, s2, s8
    vmla.f32 s19, s3, s8

    vldmia  r1!, { s4-s7 }

    vmla.f32 s20, s0, s9
    vmla.f32 s21, s1, s9
    vmla.f32 s22, s2, s9
    vmla.f32 s23, s3, s9

    vldmia  r5!, { s12-s15 }

    vmla.f32 s24, s0, s10
    vmla.f32 s25, s1, s10
    vmla.f32 s26, s2, s10
    vmla.f32 s27, s3, s10

    pld     [r1]

    vmla.f32 s28, s0, s11
    vmla.f32 s29, s1, s11
    vmla.f32 s30, s2, s11
    vmla.f32 s31, s3, s11

    pld     [r5]

    // 4
    vmla.f32 s16, s4, s12
    vmla.f32 s17, s5, s12
    vmla.f32 s18, s6, s12
    vmla.f32 s19, s7, s12

    vmla.f32 s20, s4, s13
    vmla.f32 s21, s5, s13
    vmla.f32 s22, s6, s13
    vmla.f32 s23, s7, s13

    vmla.f32 s24, s4, s14
    vmla.f32 s25, s5, s14
    vmla.f32 s26, s6, s14
    vmla.f32 s27, s7, s14

    vmla.f32 s28, s4, s15
    vmla.f32 s29, s5, s15
    vmla.f32 s30, s6, s15
    vmla.f32 s31, s7, s15

    sub     r3, r3, #4
    cmp     r3, #4
    bge     .packed_packed_loop_4

    cmp     r3, #0
    beq     .non_linear_loop

.packed_packed_loop_1:
    vldmia  r1!, { s0, s1 }
    vldmia  r5!, { s8, s9 }
    vmla.f32 s16, s0, s8
    vldmia  r1!, { s2, s3 }
    vmla.f32 s17, s1, s8
    vldmia  r5!, { s10, s11 }
    vmla.f32 s18, s2, s8
    vmla.f32 s19, s3, s8

    vmla.f32 s20, s0, s9
    vmla.f32 s21, s1, s9
    vmla.f32 s22, s2, s9
    vmla.f32 s23, s3, s9

    vmla.f32 s24, s0, s10
    vmla.f32 s25, s1, s10
    vmla.f32 s26, s2, s10
    vmla.f32 s27, s3, s10

    vmla.f32 s28, s0, s11
    vmla.f32 s29, s1, s11
    vmla.f32 s30, s2, s11
    vmla.f32 s31, s3, s11

    subs    r3, r3, #1
    bne     .packed_packed_loop_1

    b       .non_linear_loop
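/*
   For reference: each pass of .packed_packed_loop_1 above performs one rank-1
   update of the 4x4 accumulator tile, and .packed_packed_loop_4 is the same
   update unrolled four times with the loads software-pipelined ahead of the
   multiply-accumulates. A minimal C sketch of the computation, assuming `a`
   and `b` point at the packed A and B panels and `acc` is the column-major
   tile held in s16-s31 (these names are illustrative, not part of the
   kernel's interface):

   for (int k = 0; k < K; k++)
       for (int col = 0; col < 4; col++)
           for (int row = 0; row < 4; row++)
               acc[col][row] += a[4 * k + row] * b[4 * k + col];
*/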
.add_unicast:
{% for col in (0..3) %}
    mov     r8, r3
  {% for reg in (0..3) %}
    vldr    s0, [ r8 ]
    vadd.f32 s{{col|times:4|plus:reg|plus:16}}, s{{col|times:4|plus:reg|plus:16}}, s0
    {% if reg < 3 %}
    add     r8, r8, r4
    {% endif %}
  {% endfor %}
  {% if col < 3 %}
    add     r3, r3, r5
  {% endif %}
{% endfor %}

    b       .non_linear_loop

.scalar_min:
    vmov    s0, r3
{% for reg in (16..31) %}
    vcmp.f32 s{{reg}}, s0
    vmrs    apsr_nzcv, fpscr
    vmovge  s{{reg}}, s0
{% endfor %}

    b       .non_linear_loop

.scalar_max:
    vmov    s0, r3
{% for reg in (16..31) %}
    vcmp.f32 s{{reg}}, s0
    vmrs    apsr_nzcv, fpscr
    vmovle  s{{reg}}, s0
{% endfor %}

    b       .non_linear_loop

.scalar_add:
    vmov    s0, r3
{% for s in (16..31) %}
    vadd.f32 s{{s}}, s{{s}}, s0
{% endfor %}

    b       .non_linear_loop

.scalar_mul:
    vmov    s0, r3
{% for s in (16..31) %}
    vmul.f32 s{{s}}, s{{s}}, s0
{% endfor %}

    b       .non_linear_loop

.scalar_sub:
    vmov    s0, r3
{% for s in (16..31) %}
    vsub.f32 s{{s}}, s0, s{{s}}
{% endfor %}

    b       .non_linear_loop

.scalar_sub_flipped:
    vmov    s0, r3
{% for s in (16..31) %}
    vsub.f32 s{{s}}, s{{s}}, s0
{% endfor %}

    b       .non_linear_loop

.leaky_relu:
    vmov    s0, r3
{% for reg in (16..31) %}
    vmul.f32 s1, s0, s{{reg}}
    vcmp.f32 s{{reg}}, #0
    vmrs    apsr_nzcv, fpscr
    vmovlt  s{{reg}}, s1
{% endfor %}

    b       .non_linear_loop

.per_row_min:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
    vcmp.f32 {{s}}, s{{row}}
    vmrs    apsr_nzcv, fpscr
    vmovge  {{s}}, s{{row}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_row_max:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
    vcmp.f32 {{s}}, s{{row}}
    vmrs    apsr_nzcv, fpscr
    vmovlt  {{s}}, s{{row}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_row_add:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vadd.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_row_mul:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vmul.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_row_sub:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{row}}, s{{col|times:4|plus:row|plus:16}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_row_sub_flipped:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{row}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop
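/*
   Register indexing used by the templated per-row ops above and per-column
   ops below: the tile is stored column-major, so element (row, col) lives in
   s(16 + 4*col + row), which is exactly what the template expression
   col*4 + row + 16 computes. A minimal C sketch of the per-column add,
   assuming `v` points at the four values loaded from r3 and `acc` is the
   tile (illustrative names only):

   for (int col = 0; col < 4; col++)
       for (int row = 0; row < 4; row++)
           acc[col][row] += v[col];
*/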
.per_col_min:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
    vcmp.f32 {{s}}, s{{col}}
    vmrs    apsr_nzcv, fpscr
    vmovge  {{s}}, s{{col}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_col_max:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    {%capture s%}s{{col|times:4|plus:row|plus:16}}{%endcapture%}
    vcmp.f32 {{s}}, s{{col}}
    vmrs    apsr_nzcv, fpscr
    vmovlt  {{s}}, s{{col}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_col_add:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vadd.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_col_mul:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vmul.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_col_sub:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col}}, s{{col|times:4|plus:row|plus:16}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.per_col_sub_flipped:
    vldm    r3, {s0, s1, s2, s3}
{% for row in (0..3) %}
  {% for col in (0..3) %}
    vsub.f32 s{{col|times:4|plus:row|plus:16}}, s{{col|times:4|plus:row|plus:16}}, s{{col}}
  {% endfor %}
{% endfor %}

    b       .non_linear_loop

.add_row_col_products:
    vldmia  r3!, { s0, s1 }
    vldmia  r4!, { s8, s9 }
    vmla.f32 s16, s0, s8
    vldmia  r3!, { s2, s3 }
    vmla.f32 s17, s1, s8
    vldmia  r4!, { s10, s11 }
    vmla.f32 s18, s2, s8
    vmla.f32 s19, s3, s8

    vmla.f32 s20, s0, s9
    vmla.f32 s21, s1, s9
    vmla.f32 s22, s2, s9
    vmla.f32 s23, s3, s9

    vmla.f32 s24, s0, s10
    vmla.f32 s25, s1, s10
    vmla.f32 s26, s2, s10
    vmla.f32 s27, s3, s10

    vmla.f32 s28, s0, s11
    vmla.f32 s29, s1, s11
    vmla.f32 s30, s2, s11
    vmla.f32 s31, s3, s11

    b       .non_linear_loop

.store:
{% for col in (0..3) %}
    mov     r8, r3
  {% for reg in (0..3) %}
    fsts    s{{col|times:4|plus:reg|plus:16}}, [ r8 ]
    {% if reg < 3 %}
    add     r8, r8, r4
    {% endif %}
  {% endfor %}
  {% if col < 3 %}
    add     r3, r3, r5
  {% endif %}
{% endfor %}

    mov     r0, #0
    b       .return

.q_scale:
.q_shl:
.q_shr:
    b       .unsupported

.return:
    vpop    { s16-s31 }
    pop     { r4-r12 }

    bx      lr
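/*
   Note on .store above: it walks the destination tile with two strides,
   advancing r8 by r4 between rows of a column and r3 by r5 between columns,
   so element (row, col) is written at c + row * rsc + col * csc. A minimal
   C sketch, under the assumption that r3 carries the tile pointer and r4/r5
   the row and column strides in bytes (as the pointer arithmetic above
   suggests; `c`, `rsc`, `csc` and `acc` are illustrative names):

   for (int col = 0; col < 4; col++)
       for (int row = 0; row < 4; row++)
           *(float *)((char *)c + row * rsc + col * csc) = acc[col][row];
*/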