// vim: ft=arm

// Fixed-point rescaling steps applied in place to the sixteen vector registers
// v16..v31, each holding four signed 32-bit lanes.
//
// Parameters are read from the record x0 points at:
//   [x0, #8]   shift (64-bit load)
//   [x0, #16]  rounding policy, 1..6 (only read by .q_scale and .q_shr)
//   [x0, #24]  multiplier, 32-bit (only read by .q_scale)
//
// Policy values as dispatched below:
//   1 zero, 2 away, 3 minus_inf, 4 plus_inf, 5 even, 6 odd; anything else
//   branches to .unsupported.
//
// Every path ends with a branch to .non_linear_loop.
//
// Registers written here: x2, x3, x5, x6, v0, v1, v2, v3, v4, v8, v9, flags.
// NOTE(review): v8/v9 are callee-saved (low 64 bits) under AAPCS64; presumably
// the enclosing kernel saves and restores them -- confirm.
// NOTE(review): `abs` does not saturate, so a lane equal to INT32_MIN stays
// negative after it -- confirm such inputs cannot reach these paths.
//
// Building blocks (negative shift count == shift right):
//   sqrshl  rounding shift: adds half an output unit before shifting, so an
//           exact tie moves toward +inf
//   sqshl   truncating shift

// ---------------------------------------------------------------------------
// .q_scale: lane <- (lane * multiplier) scaled down by (shift + 31) bits.
// sqdmull yields 2 * lane * multiplier as a 64-bit value, hence the shift
// count of (shift + 32).
// ---------------------------------------------------------------------------
.q_scale:
    ldp         x5, x6, [x0, #8]                // x5 <- shift, x6 <- policy
    add         x2, x0, #24
    ld1r        { v2.4s }, [x2]                 // v2.4s <- multiplier in every lane

    mov         w3, #1                          // also clears the upper half of x3
    dup         v4.2d, x3                       // v4.2d <- 1

    add         x5, x5, #32
    neg         x5, x5
    dup         v1.2d, x5                       // v1.2d <- -(shift + 32)

    cmp         x6, #1
    b.eq        .q_scale_rounding_zero
    cmp         x6, #2
    b.eq        .q_scale_rounding_away
    cmp         x6, #3
    b.eq        .q_scale_rounding_minus_inf
    cmp         x6, #4
    b.eq        .q_scale_rounding_plus_inf
    cmp         x6, #5
    b.eq        .q_scale_rounding_even
    cmp         x6, #6
    b.eq        .q_scale_rounding_odd
    b           .unsupported

.q_scale_rounding_zero:
    // signum * ((2 * abs * multiplier - 1) >>r (shift + 32))
    // Subtracting 1 before the rounding shift makes an exact tie fall downward
    // on the magnitude.
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product
    sub         v8.2d, v8.2d, v4.2d
    sub         v9.2d, v9.2d, v4.2d
    sqrshl      v8.2d, v8.2d, v1.2d
    sqrshl      v9.2d, v9.2d, v1.2d
    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

.q_scale_rounding_away:
    // signum * ((2 * abs * multiplier) >>r (shift + 32))
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product
    sqrshl      v8.2d, v8.2d, v1.2d
    sqrshl      v9.2d, v9.2d, v1.2d
    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

.q_scale_rounding_minus_inf:
    // (2 * val * multiplier - 1) >>r (shift + 32), on the signed value
{% for acc in (16..31) %}
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product
    sub         v8.2d, v8.2d, v4.2d
    sub         v9.2d, v9.2d, v4.2d
    sqrshl      v8.2d, v8.2d, v1.2d
    sqrshl      v9.2d, v9.2d, v1.2d
    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
{% endfor %}
    b           .non_linear_loop

.q_scale_rounding_plus_inf:
    // (2 * val * multiplier) >>r (shift + 32), on the signed value
{% for acc in (16..31) %}
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product
    sqrshl      v8.2d, v8.2d, v1.2d
    sqrshl      v9.2d, v9.2d, v1.2d
    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
{% endfor %}
    b           .non_linear_loop

.q_scale_rounding_even:
    // Works on the magnitude. nudge = (truncated result & 1) - 1:
    //   truncated result odd  -> nudge  0, a tie moves up to the even neighbour
    //   truncated result even -> nudge -1, a tie stays on the even value
    // v3 is reused as the nudge for both halves, so the two chains stay sequential.
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product

    sqshl       v3.2d, v8.2d, v1.2d             // truncated result, lanes 0-1
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    sub         v3.2d, v3.2d, v4.2d             // nudge
    add         v8.2d, v8.2d, v3.2d
    sqrshl      v8.2d, v8.2d, v1.2d

    sqshl       v3.2d, v9.2d, v1.2d             // truncated result, lanes 2-3
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    sub         v3.2d, v3.2d, v4.2d             // nudge
    add         v9.2d, v9.2d, v3.2d
    sqrshl      v9.2d, v9.2d, v1.2d

    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

.q_scale_rounding_odd:
    // Works on the magnitude. The parity bit of the truncated result is
    // subtracted before the rounding shift:
    //   truncated result odd  -> subtract 1, a tie stays on the odd value
    //   truncated result even -> subtract 0, a tie moves up to the odd neighbour
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sqdmull     v8.2d, v{{acc}}.2s, v2.2s       // lanes 0-1: unshifted 64-bit product
    sqdmull2    v9.2d, v{{acc}}.4s, v2.4s       // lanes 2-3: unshifted 64-bit product

    sqshl       v3.2d, v8.2d, v1.2d             // truncated result, lanes 0-1
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    sub         v8.2d, v8.2d, v3.2d
    sqrshl      v8.2d, v8.2d, v1.2d

    sqshl       v3.2d, v9.2d, v1.2d             // truncated result, lanes 2-3
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    sub         v9.2d, v9.2d, v3.2d
    sqrshl      v9.2d, v9.2d, v1.2d

    uzp1        v{{acc}}.4s, v8.4s, v9.4s       // gather the low 32 bits of each result
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

// ---------------------------------------------------------------------------
// .q_shl: lane <- sqrshl(lane, shift). The shift count is used as loaded.
// ---------------------------------------------------------------------------
.q_shl:
    ldr         x5, [x0, #8]                    // x5 <- shift
    dup         v1.4s, w5                       // v1.4s <- shift (low 32 bits of x5)
{% for acc in (16..31) %}
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
{% endfor %}
    b           .non_linear_loop

// ---------------------------------------------------------------------------
// .q_shr: lane <- lane shifted right by `shift`, ties resolved per policy.
// NOTE(review): the "subtract 1" / nudge variants presumably expect shift > 0;
// with a zero count the adjustment is not shifted out -- confirm with callers.
// ---------------------------------------------------------------------------
.q_shr:
    ldp         x5, x6, [x0, #8]                // x5 <- shift, x6 <- policy
    mov         w3, #1
    dup         v4.4s, w3                       // v4.4s <- 1

    neg         w5, w5
    dup         v1.4s, w5                       // v1.4s <- -shift

    cmp         x6, #1
    b.eq        .q_shr_rounding_zero
    cmp         x6, #2
    b.eq        .q_shr_rounding_away
    cmp         x6, #3
    b.eq        .q_shr_rounding_minus_inf
    cmp         x6, #4
    b.eq        .q_shr_rounding_plus_inf
    cmp         x6, #5
    b.eq        .q_shr_rounding_even
    cmp         x6, #6
    b.eq        .q_shr_rounding_odd
    b           .unsupported

.q_shr_rounding_zero:
    // signum * ((abs - 1) >>r shift)
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sub         v{{acc}}.4s, v{{acc}}.4s, v4.4s
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

.q_shr_rounding_away:
    // signum * (abs >>r shift)
{% for acc in (16..31) %}
    cmlt        v0.4s, v{{acc}}.4s, #0          // v0 <- mask of negative lanes
    abs         v{{acc}}.4s, v{{acc}}.4s
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
    neg         v3.4s, v{{acc}}.4s
    bit         v{{acc}}.16b, v3.16b, v0.16b    // take the negated value where the mask is set
{% endfor %}
    b           .non_linear_loop

.q_shr_rounding_minus_inf:
    // -((-x) >>r shift): saturating negate, rounding shift, saturating negate back
{% for acc in (16..31) %}
    sqneg       v{{acc}}.4s, v{{acc}}.4s
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
    sqneg       v{{acc}}.4s, v{{acc}}.4s
{% endfor %}
    b           .non_linear_loop

.q_shr_rounding_plus_inf:
    // x >>r shift
{% for acc in (16..31) %}
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
{% endfor %}
    b           .non_linear_loop

.q_shr_rounding_even:
    // sqrshl sends ties toward +inf, sqshl truncates.
    // Look at the parity of the truncated result:
    //   odd  -> nothing to adjust, a tie goes up to the even neighbour
    //   even -> add -1 first so a tie stays on the even value
    // nudge  = ((x >>l shift) & 1) - 1        (>>l is sqshl)
    // result = (x + nudge) >>r shift          (>>r is sqrshl)
{% for acc in (16..31) %}
    sqshl       v3.4s, v{{acc}}.4s, v1.4s       // truncated result
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    sub         v3.4s, v3.4s, v4.4s             // nudge
    add         v{{acc}}.4s, v{{acc}}.4s, v3.4s
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
{% endfor %}
    b           .non_linear_loop

.q_shr_rounding_odd:
    // nudge  = -((x >>l shift) & 1)
    // result = (x + nudge) >>r shift
{% for acc in (16..31) %}
    sqshl       v3.4s, v{{acc}}.4s, v1.4s       // truncated result
    and         v3.16b, v3.16b, v4.16b          // its parity bit
    neg         v3.4s, v3.4s                    // nudge
    add         v{{acc}}.4s, v{{acc}}.4s, v3.4s
    sqrshl      v{{acc}}.4s, v{{acc}}.4s, v1.4s
{% endfor %}
    b           .non_linear_loop