// vim: ft=arm

.q_scale:
    // Entry point for the multiply-and-shift operation: loads the operands,
    // builds the constants used by every .q_scale_rounding_* variant and
    // branches to the variant selected by the rounding policy.
    //
    // Reads (relative to x0):
    //   [x0, #8]   shift       64-bit, into x5
    //   [x0, #16]  policy      64-bit, into x6
    //   [x0, #24]  multiplier  32-bit, replicated into the four lanes of v2
    // Leaves:
    //   v1.2d = -(shift + 32)  per-lane amount for sqshl / sqrshl
    //   v2.4s = multiplier
    //   v4.2d = 1
    // Overwrites x2, x3, x5, x6 and the condition flags.
    ldp     x5, x6, [x0, #8]
    add     x2, x0, #24
    ld1r    { v2.4s }, [x2]

    // sqshl / sqrshl shift right when the per-lane amount is negative, so the
    // amount is stored negated. 32 is added to the requested shift first.
    add     x5, x5, #32
    neg     x5, x5
    dup     v1.2d, x5                   // both 64-bit lanes <- -(shift + 32)

    mov     w3, #1                      // writing w3 clears the top half of x3
    dup     v4.2d, x3                   // both 64-bit lanes <- 1

    // Policy dispatch: values 1..6 select a variant, anything else is rejected.
    cmp     x6, #1
    b.eq    .q_scale_rounding_zero
    cmp     x6, #2
    b.eq    .q_scale_rounding_away
    cmp     x6, #3
    b.eq    .q_scale_rounding_minus_inf
    cmp     x6, #4
    b.eq    .q_scale_rounding_plus_inf
    cmp     x6, #5
    b.eq    .q_scale_rounding_even
    cmp     x6, #6
    b.eq    .q_scale_rounding_odd

    b       .unsupported

.q_scale_rounding_zero:
    // Multiply-and-shift on every 32-bit lane of v16..v31, ties rounded down
    // on the absolute value of the input, then the input sign is re-applied.
    //
    // Per lane:
    //   p   = sqdmull(|x|, multiplier)   64-bit, doubled product
    //   out = (p - 1) >>r n              n is the amount held in v1.2d
    //   out = -out where x was negative
    // sqrshl alone sends an exact tie upwards; removing 1 beforehand places
    // the tie just under the halfway point so that it goes down instead.
    //
    // Expects v1.2d, v2.4s and v4.2d as prepared by .q_scale.
    // Scratch: v0 (sign mask), v3, v8, v9.

    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        sub         v8.2d, v8.2d, v4.2d
        sub         v9.2d, v9.2d, v4.2d
        sqrshl      v8.2d, v8.2d, v1.2d
        sqrshl      v9.2d, v9.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}

    b       .non_linear_loop

.q_scale_rounding_away:
    // Multiply-and-shift on every 32-bit lane of v16..v31, ties rounded up on
    // the absolute value of the input, then the input sign is re-applied.
    //
    // Per lane:
    //   p   = sqdmull(|x|, multiplier)   64-bit, doubled product
    //   out = p >>r n                    n is the amount held in v1.2d
    //   out = -out where x was negative
    //
    // Expects v1.2d and v2.4s as prepared by .q_scale.
    // Scratch: v0 (sign mask), v3, v8, v9.

    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        sqrshl      v9.2d, v9.2d, v1.2d
        sqrshl      v8.2d, v8.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}

    b       .non_linear_loop

.q_scale_rounding_minus_inf:
    // Multiply-and-shift on every 32-bit lane of v16..v31, working directly
    // on the signed value; ties go to the lower neighbour.
    //
    // Per lane:
    //   p   = sqdmull(x, multiplier)     64-bit, doubled product
    //   out = (p - 1) >>r n              n is the amount held in v1.2d
    // sqrshl alone sends an exact tie upwards; removing 1 beforehand makes
    // the tie go down instead.
    //
    // Expects v1.2d, v2.4s and v4.2d as prepared by .q_scale.
    // Scratch: v8, v9.

    {% for reg in (16..31) %}
        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        sub         v8.2d, v8.2d, v4.2d
        sub         v9.2d, v9.2d, v4.2d
        sqrshl      v8.2d, v8.2d, v1.2d
        sqrshl      v9.2d, v9.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result
    {% endfor %}

    b       .non_linear_loop

.q_scale_rounding_plus_inf:
    // Multiply-and-shift on every 32-bit lane of v16..v31, working directly
    // on the signed value; ties go to the upper neighbour, which is what the
    // rounding shift sqrshl does without any adjustment.
    //
    // Per lane:
    //   p   = sqdmull(x, multiplier)     64-bit, doubled product
    //   out = p >>r n                    n is the amount held in v1.2d
    //
    // Expects v1.2d and v2.4s as prepared by .q_scale.
    // Scratch: v8, v9.

    {% for reg in (16..31) %}
        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        sqrshl      v9.2d, v9.2d, v1.2d
        sqrshl      v8.2d, v8.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result
    {% endfor %}

    b       .non_linear_loop

.q_scale_rounding_even:
    // Multiply-and-shift on every 32-bit lane of v16..v31, ties rounded to
    // the even neighbour on the absolute value of the input, then the input
    // sign is re-applied.
    //
    // Per lane, with p = sqdmull(|x|, multiplier) and n the amount in v1.2d:
    //   t     = p >> n, truncating (sqshl)
    //   nudge = (t & 1) - 1          0 when t is odd, -1 when t is even
    //   out   = (p + nudge) >>r n    rounding shift (sqrshl), ties up
    // With t odd the tie goes up to t + 1, which is even; with t even the -1
    // keeps the tie down at t.
    //
    // Expects v1.2d, v2.4s and v4.2d as prepared by .q_scale.
    // Scratch: v0 (sign mask), v3, v8, v9. v3 serves both halves in turn, so
    // the two nudge computations must not be interleaved.

    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        // lower half: adjust p by the nudge
        sqshl       v3.2d, v8.2d, v1.2d             // t
        and         v3.16b, v3.16b, v4.16b          // t & 1
        sub         v3.2d, v3.2d, v4.2d             // nudge
        add         v8.2d, v8.2d, v3.2d

        // upper half: same steps
        sqshl       v3.2d, v9.2d, v1.2d
        and         v3.16b, v3.16b, v4.16b
        sub         v3.2d, v3.2d, v4.2d
        add         v9.2d, v9.2d, v3.2d

        sqrshl      v8.2d, v8.2d, v1.2d
        sqrshl      v9.2d, v9.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}

    b       .non_linear_loop

.q_scale_rounding_odd:
    // Multiply-and-shift on every 32-bit lane of v16..v31, ties rounded to
    // the odd neighbour on the absolute value of the input, then the input
    // sign is re-applied.
    //
    // Per lane, with p = sqdmull(|x|, multiplier) and n the amount in v1.2d:
    //   t     = p >> n, truncating (sqshl)
    //   nudge = t & 1                1 when t is odd, 0 when t is even
    //   out   = (p - nudge) >>r n    rounding shift (sqrshl), ties up
    // With t even the tie goes up to t + 1, which is odd; with t odd the
    // subtraction keeps the tie down at t.
    //
    // Expects v1.2d, v2.4s and v4.2d as prepared by .q_scale.
    // Scratch: v0 (sign mask), v3, v8, v9. v3 serves both halves in turn, so
    // the two nudge computations must not be interleaved.

    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sqdmull2    v9.2d, v{{reg}}.4s, v2.4s       // lanes 2 and 3, widened to 64 bits
        sqdmull     v8.2d, v{{reg}}.2s, v2.2s       // lanes 0 and 1, widened to 64 bits

        // lower half: adjust p by the nudge
        sqshl       v3.2d, v8.2d, v1.2d             // t
        and         v3.16b, v3.16b, v4.16b          // nudge = t & 1
        sub         v8.2d, v8.2d, v3.2d

        // upper half: same steps
        sqshl       v3.2d, v9.2d, v1.2d
        and         v3.16b, v3.16b, v4.16b
        sub         v9.2d, v9.2d, v3.2d

        sqrshl      v8.2d, v8.2d, v1.2d
        sqrshl      v9.2d, v9.2d, v1.2d

        uzp1        v{{reg}}.4s, v8.4s, v9.4s       // low 32 bits of each 64-bit result

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}

    b       .non_linear_loop

.q_shl:
    // Applies sqrshl to every 32-bit lane of v16..v31 with the amount read
    // from [x0, #8]. For a positive amount sqrshl is a left shift with signed
    // saturation.
    // Overwrites x5 and v1.
    ldr     x5, [x0, #8]
    dup     v1.4s, w5                   // low 32 bits of the amount, in all four lanes

    {% for reg in (16..31) %}
        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s
    {% endfor %}
    b       .non_linear_loop

.q_shr:
    // Entry point for the right-shift operation: loads the operands, builds
    // the constants used by every .q_shr_rounding_* variant and branches to
    // the variant selected by the rounding policy.
    //
    // Reads (relative to x0):
    //   [x0, #8]   shift    into x5 (only the low 32 bits are used)
    //   [x0, #16]  policy   into x6
    // Leaves:
    //   v1.4s = -shift      negative amount: sqshl / sqrshl shift right
    //   v4.4s = 1
    // Overwrites x3, x5, x6 and the condition flags.
    ldp     x5, x6, [x0, #8]

    neg     w5, w5
    dup     v1.4s, w5                   // four 32-bit lanes <- -shift

    mov     w3, #1
    dup     v4.4s, w3                   // four 32-bit lanes <- 1

    // Policy dispatch: values 1..6 select a variant, anything else is rejected.
    cmp     x6, #1
    b.eq    .q_shr_rounding_zero
    cmp     x6, #2
    b.eq    .q_shr_rounding_away
    cmp     x6, #3
    b.eq    .q_shr_rounding_minus_inf
    cmp     x6, #4
    b.eq    .q_shr_rounding_plus_inf
    cmp     x6, #5
    b.eq    .q_shr_rounding_even
    cmp     x6, #6
    b.eq    .q_shr_rounding_odd

    b       .unsupported

.q_shr_rounding_zero:
    // Right shift of every 32-bit lane of v16..v31, ties rounded toward zero:
    //   out = sign(x) * ((|x| - 1) >>r shift)
    // where >>r is the rounding shift done by sqrshl (ties up). Removing 1
    // from the absolute value makes an exact tie go down instead.
    //
    // Expects v1.4s (negated shift) and v4.4s (ones) as prepared by .q_shr.
    // Scratch: v0 (sign mask), v3.
    //
    // NOTE(review): the -1 is applied whatever the shift amount is, so a
    // shift of 0 would alter the value. Presumably callers only pass
    // shift >= 1 - confirm.
    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sub         v{{reg}}.4s, v{{reg}}.4s, v4.4s
        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}
    b       .non_linear_loop

.q_shr_rounding_away:
    // Right shift of every 32-bit lane of v16..v31, ties rounded away from
    // zero:
    //   out = sign(x) * (|x| >>r shift)
    // where >>r is the rounding shift done by sqrshl (ties up).
    //
    // Expects v1.4s (negated shift) as prepared by .q_shr.
    // Scratch: v0 (sign mask), v3.
    //
    // NOTE(review): abs does not saturate, so a lane holding INT32_MIN stays
    // negative after it. Confirm that value cannot reach this path, or that
    // the resulting output is acceptable.
    {% for reg in (16..31) %}
        cmlt        v0.4s, v{{reg}}.4s, #0          // all-ones in the negative lanes
        abs         v{{reg}}.4s, v{{reg}}.4s

        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s

        neg         v3.4s, v{{reg}}.4s
        bit         v{{reg}}.16b, v3.16b, v0.16b    // take the negated value under the mask
    {% endfor %}
    b       .non_linear_loop

.q_shr_rounding_minus_inf:
    // Right shift of every 32-bit lane of v16..v31, ties rounded to the lower
    // neighbour:
    //   out = -((-x) >>r shift)
    // sqrshl sends ties up; doing it on the negated value and negating back
    // turns that into ties down. sqneg saturates, so INT32_MIN negates to
    // INT32_MAX.
    //
    // Expects v1.4s (negated shift) as prepared by .q_shr.
    {% for reg in (16..31) %}
        sqneg       v{{reg}}.4s, v{{reg}}.4s
        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s
        sqneg       v{{reg}}.4s, v{{reg}}.4s
    {% endfor %}
    b       .non_linear_loop

.q_shr_rounding_plus_inf:
    // Right shift of every 32-bit lane of v16..v31, ties rounded to the upper
    // neighbour. This is the native behaviour of the rounding shift sqrshl,
    // so no adjustment is needed.
    //
    // Expects v1.4s (negated shift) as prepared by .q_shr.
    {% for reg in (16..31) %}
        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s
    {% endfor %}
    b       .non_linear_loop

.q_shr_rounding_even:
    // Right shift of every 32-bit lane of v16..v31, ties rounded to the even
    // neighbour.
    //
    // sqrshl rounds (ties up) while sqshl truncates. The parity of the
    // truncated result t decides what to do with a tie:
    //   t odd  -> nothing to do, the tie goes up to t + 1 which is even
    //   t even -> add -1 first so that the tie stays down at t
    // Hence:
    //   nudge = (t & 1) - 1
    //   out   = (x + nudge) >>r shift
    //
    // Expects v1.4s (negated shift) and v4.4s (ones) as prepared by .q_shr.
    // Scratch: v3.
    {% for reg in (16..31) %}
        sqshl       v3.4s, v{{reg}}.4s, v1.4s       // t
        and         v3.16b, v3.16b, v4.16b          // t & 1
        sub         v3.4s, v3.4s, v4.4s             // nudge
        add         v{{reg}}.4s, v{{reg}}.4s, v3.4s

        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s
    {% endfor %}
    b       .non_linear_loop

.q_shr_rounding_odd:
    // Right shift of every 32-bit lane of v16..v31, ties rounded to the odd
    // neighbour.
    //
    // With t the truncated result (sqshl):
    //   t even -> nothing to do, the tie goes up to t + 1 which is odd
    //   t odd  -> add -1 first so that the tie stays down at t
    // Hence:
    //   nudge = -(t & 1)
    //   out   = (x + nudge) >>r shift     (sqrshl, ties up)
    //
    // Expects v1.4s (negated shift) and v4.4s (ones) as prepared by .q_shr.
    // Scratch: v3.
    {% for reg in (16..31) %}
        sqshl       v3.4s, v{{reg}}.4s, v1.4s       // t
        and         v3.16b, v3.16b, v4.16b          // t & 1
        neg         v3.4s, v3.4s                    // nudge
        add         v{{reg}}.4s, v{{reg}}.4s, v3.4s

        sqrshl      v{{reg}}.4s, v{{reg}}.4s, v1.4s
    {% endfor %}
    b       .non_linear_loop