// vim: ft=arm

.q_scale:
    ldm r0, { r4, r5, r6, r7 }  // fixme params are already loaded by disp.

    vdup.s32 q0, r7             // q0 <- multiplier

    mov r3, #1
    vdup.s32 q1, r3             // q1 <- ones
    vmovl.s32 q1, d2            // widen the ones to two s64 lanes

    add r5, #32
    neg r5, r5
    vdup.s32 q2, r5             // q2 <- -(shift + 32)
    vmovl.s32 q2, d4            // widen the shift to two s64 lanes

    cmp r6, #1
    beq .q_scale_rounding_zero
    cmp r6, #2
    beq .q_scale_rounding_away
    cmp r6, #3
    beq .q_scale_rounding_minus_inf
    cmp r6, #4
    beq .q_scale_rounding_plus_inf
    cmp r6, #5
    beq .q_scale_rounding_even
    cmp r6, #6
    beq .q_scale_rounding_odd

    b .unsupported

.q_scale_rounding_zero:
    // return signum(x) * ((2 * abs(x) * multiplier - 1) >>r (shift + 32))
    {% for q in (8..15) %}
        vclt.s32 q7, q{{q}}, #0                             // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}                             // Compute their abs

        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // 2 * abs(x) * multiplier, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // 2 * abs(x) * multiplier, high half (s64)

        vsub.s64 q5, q1                                     // Subtract 1 so that ties round down
        vsub.s64 q6, q1

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6

        vneg.s32 q5, q{{q}}                                 // Compute the negated result
        vbit.s32 q{{q}}, q5, q7                             // Restore sign of x with bit mask
    {% endfor %}

    b .non_linear_loop

.q_scale_rounding_away:
    // return signum(x) * ((2 * abs(x) * multiplier) >>r (shift + 32))
    {% for q in (8..15) %}
        vclt.s32 q7, q{{q}}, #0                             // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}                             // Compute their abs

        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // 2 * abs(x) * multiplier, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // 2 * abs(x) * multiplier, high half (s64)

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6

        vneg.s32 q5, q{{q}}                                 // Compute the negated result
        vbit.s32 q{{q}}, q5, q7                             // Restore sign of x with bit mask
    {% endfor %}

    b .non_linear_loop

.q_scale_rounding_minus_inf:
    // return (2 * x * multiplier - 1) >>r (shift + 32)
    {% for q in (8..15) %}
        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // 2 * x * multiplier, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // 2 * x * multiplier, high half (s64)

        vsub.s64 q5, q1                                     // Subtract 1 so that ties round towards -inf
        vsub.s64 q6, q1

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6
    {% endfor %}

    b .non_linear_loop

.q_scale_rounding_plus_inf:
    // return (2 * x * multiplier) >>r (shift + 32)
    {% for q in (8..15) %}
        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // 2 * x * multiplier, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // 2 * x * multiplier, high half (s64)

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6
    {% endfor %}

    b .non_linear_loop

.q_scale_rounding_even:
    // With p = 2 * abs(x) * multiplier:
    // If (p >> (shift + 32)) is odd  -> (p - 0) >>r (shift + 32)
    // If (p >> (shift + 32)) is even -> (p - 1) >>r (shift + 32)
    // then restore the sign of x.
    {% for q in (8..15) %}
        vclt.s32 q7, q{{q}}, #0                             // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}                             // Compute their abs

        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // p, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // p, high half (s64)

        vqshl.s64 q3, q5, q2                                // Truncate shift (0.5 -> 0)
        vqshl.s64 q4, q6, q2

        vand q3, q3, q1                                     // Store whether (p >> (shift + 32)) is odd
        vand q4, q4, q1

        vsub.s64 q3, q3, q1                                 // If odd 0 else -1
        vsub.s64 q4, q4, q1

        vadd.s64 q5, q3                                     // If odd (p - 0) else (p - 1)
        vadd.s64 q6, q4

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6

        vneg.s32 q5, q{{q}}                                 // Compute the negated result
        vbit.s32 q{{q}}, q5, q7                             // Restore sign of x with bit mask
    {% endfor %}

    b .non_linear_loop

.q_scale_rounding_odd:
    // With p = 2 * abs(x) * multiplier:
    // If (p >> (shift + 32)) is even -> (p - 0) >>r (shift + 32)
    // If (p >> (shift + 32)) is odd  -> (p - 1) >>r (shift + 32)
    // then restore the sign of x.
    {% for q in (8..15) %}
        vclt.s32 q7, q{{q}}, #0                             // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}                             // Compute their abs

        vqdmull.s32 q5, d{{q | times:2}}, d0[0]             // p, low half (s64)
        vqdmull.s32 q6, d{{q | times:2 | plus:1}}, d0[0]    // p, high half (s64)

        vqshl.s64 q3, q5, q2                                // Truncate shift (0.5 -> 0)
        vqshl.s64 q4, q6, q2

        vand q3, q3, q1                                     // Store whether (p >> (shift + 32)) is odd
        vand q4, q4, q1

        vsub.s64 q5, q3                                     // If odd (p - 1) else (p - 0)
        vsub.s64 q6, q4

        vqrshl.s64 q5, q2                                   // Rounding shift right by (shift + 32)
        vqrshl.s64 q6, q2

        vmovn.s64 d{{q | times:2}}, q5                      // Narrow back to s32
        vmovn.s64 d{{q | times:2 | plus:1}}, q6

        vneg.s32 q5, q{{q}}                                 // Compute the negated result
        vbit.s32 q{{q}}, q5, q7                             // Restore sign of x with bit mask
    {% endfor %}

    b .non_linear_loop
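
// Scalar reference for the .q_scale paths above (a sketch only; it assumes the
// multiplier in r7 is a Q0.31 fixed-point value, which is consistent with the
// doubling multiply but is not stated here):
//
//   p = 2 * (int64)x * (int64)multiplier      // vqdmull.s32
//   y = round(p / 2^(shift + 32))             // vqrshl.s64 by -(shift + 32)
//     ~ round(x * multiplier / 2^shift)       // with multiplier read as Q0.31
//
// The tie-breaking policy is selected by r6: 1 towards zero, 2 away from zero,
// 3 towards -inf, 4 towards +inf, 5 to nearest even, 6 to nearest odd.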

.q_shl:
    ldm r0, { r4, r5 }          // fixme params are already loaded by disp.
    vdup.s32 q2, r5             // q2 <- shift
    {% for q in (8..15) %}
        vqrshl.s32 q{{q}}, q2   // Saturating rounding shift
    {% endfor %}
    b .non_linear_loop

.q_shr:
    ldm r0, { r4, r5, r6 }      // fixme params are already loaded by disp.

    mov r3, #1
    vdup.s32 q1, r3             // q1 <- ones

    neg r5, r5
    vdup.s32 q2, r5             // q2 <- -shift (right shift for vqrshl/vqshl)

    cmp r6, #1
    beq .q_shr_rounding_zero
    cmp r6, #2
    beq .q_shr_rounding_away
    cmp r6, #3
    beq .q_shr_rounding_minus_inf
    cmp r6, #4
    beq .q_shr_rounding_plus_inf
    cmp r6, #5
    beq .q_shr_rounding_even
    cmp r6, #6
    beq .q_shr_rounding_odd

    b .unsupported

.q_shr_rounding_zero:
    // return signum(x) * ((abs(x) - 1) >>r shift)
    {% for q in (8..15) %}
        vclt.s32 q3, q{{q}}, #0     // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}     // Compute their abs
        vsub.s32 q{{q}}, q1         // Subtract 1 from abs(x)
        vqrshl.s32 q{{q}}, q2       // Rounding shift (0.5 -> 1)
        vneg.s32 q4, q{{q}}         // Compute -((abs(x) - 1) >>r shift)
        vbit.s32 q{{q}}, q4, q3     // Restore sign of x with bit mask
    {% endfor %}
    b .non_linear_loop

.q_shr_rounding_away:
    // return signum(x) * (abs(x) >>r shift)
    {% for q in (8..15) %}
        vclt.s32 q3, q{{q}}, #0     // Store the sign of the value
        vabs.s32 q{{q}}, q{{q}}     // Compute their abs
        vqrshl.s32 q{{q}}, q2       // Rounding shift (0.5 -> 1)
        vneg.s32 q4, q{{q}}         // Compute -(abs(x) >>r shift)
        vbit.s32 q{{q}}, q4, q3     // Restore sign of x with bit mask
    {% endfor %}
    b .non_linear_loop

.q_shr_rounding_minus_inf:
    // return -(-x >>r shift)
    {% for q in (8..15) %}
        vneg.s32 q3, q{{q}}         // Compute -x
        vqrshl.s32 q3, q2           // Rounding shift (0.5 -> 1)
        vneg.s32 q{{q}}, q3         // Compute -(-x >>r shift)
    {% endfor %}
    b .non_linear_loop

.q_shr_rounding_plus_inf:
    // return x >>r shift
    {% for q in (8..15) %}
        vqrshl.s32 q{{q}}, q2       // Rounding shift (0.5 -> 1)
    {% endfor %}
    b .non_linear_loop

.q_shr_rounding_even:
    // If (x >> shift) is odd  -> (x - 0) >>r shift
    // If (x >> shift) is even -> (x - 1) >>r shift
    {% for q in (8..15) %}
        vqshl.s32 q3, q{{q}}, q2    // Truncate shift (0.5 -> 0)
        vand.s32 q4, q3, q1         // Store whether (x >> shift) is odd
        vsub.s32 q5, q4, q1         // If (x >> shift) is odd 0 else -1
        vadd.s32 q{{q}}, q{{q}}, q5 // If (x >> shift) is odd (x - 0) else (x - 1)
        vqrshl.s32 q{{q}}, q2       // Rounding shift (0.5 -> 1)
    {% endfor %}
    b .non_linear_loop

.q_shr_rounding_odd:
    // If (x >> shift) is even -> (x - 0) >>r shift
    // If (x >> shift) is odd  -> (x - 1) >>r shift
    {% for q in (8..15) %}
        vqshl.s32 q3, q{{q}}, q2    // Truncate shift (0.5 -> 0)
        vand.s32 q4, q3, q1         // Store whether (x >> shift) is odd
        vneg.s32 q5, q4             // If (x >> shift) is odd -1 else 0
        vadd.s32 q{{q}}, q{{q}}, q5 // If (x >> shift) is odd (x - 1) else (x - 0)
        vqrshl.s32 q{{q}}, q2       // Rounding shift (0.5 -> 1)
    {% endfor %}
    b .non_linear_loop
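
// Tie-breaking in the *_rounding_even / *_rounding_odd paths, as a sketch for a
// non-negative value v (x for .q_shr, the 64-bit product p for .q_scale):
//
//   t = v >> shift                               // truncating shift (vqshl)
//   even: (v - 1) >>r shift if t is even, else v >>r shift
//   odd:  (v - 1) >>r shift if t is odd,  else v >>r shift
//
// vqrshl rounds an exact half up, so the -1 nudge pulls a half down when the
// truncated result already has the target parity: a tie always lands on the
// even (resp. odd) neighbour, while non-tie values are unaffected.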