// vim: ft=arm

// v0-v7 and v16-v31 are caller-saved, so no vector register is saved or restored here

.text
.align 4

{% if needs_pragma == true %}
.cpu generic+fp+simd+fp16
{% endif %}
.global {{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}}
//-----------------------------------------------------------------------
// In-place leaky ReLU over a buffer of half-precision floats:
//     buf[i] = (buf[i] >= 0) ? buf[i] : buf[i] * alpha
//
// In:    x0 = buf, read and written in place
//        x1 = number of f16 elements. 0 returns immediately; any other
//             value must be a multiple of 8, because the tail loop only
//             stops when the count reaches exactly 0.
//        w2 = alpha, as an f16 bit pattern in the low 16 bits
// Out:   nothing (buffer updated in place)
// Clobb: x0, x1, x2, v0-v3, v16-v27, v31, flags
//-----------------------------------------------------------------------
{{G}}arm64fp16_leaky_relu_f16_8n_{{suffix}}:

    cbz         x1, .return

    dup         v31.8h, w2                      // v31 = alpha in all 8 lanes
    mov         x2, x0                          // x2 = read-ahead pointer, x0 = store pointer

    // The block loop always loads one 32-element block ahead of the one it
    // is computing, so it needs at least 64 elements left to stay in bounds.
    // The count is a length: compare unsigned.
    cmp         x1, #64
    b.lo        .loop

    ld1         { v16.8h, v17.8h, v18.8h, v19.8h }, [x2], #64
.loop4:
    // Invariant: x1 >= 64, v16-v19 hold the 32 elements at [x0, x0 + 64).

    ld1         { v0.8h, v1.8h, v2.8h, v3.8h }, [x2], #64      // next block

    fmul        v20.8h, v16.8h, v31.8h          // x * alpha
    fmul        v21.8h, v17.8h, v31.8h
    fmul        v22.8h, v18.8h, v31.8h
    fmul        v23.8h, v19.8h, v31.8h

    fcmge       v24.8h, v16.8h, #0.0            // lane mask: all ones where x >= 0
    fcmge       v25.8h, v17.8h, #0.0
    fcmge       v26.8h, v18.8h, #0.0
    fcmge       v27.8h, v19.8h, #0.0

    bsl         v24.16b, v16.16b, v20.16b       // mask ? x : x * alpha
    bsl         v25.16b, v17.16b, v21.16b
    bsl         v26.16b, v18.16b, v22.16b
    bsl         v27.16b, v19.16b, v23.16b

    st1         { v24.8h, v25.8h, v26.8h, v27.8h }, [x0], #64

    mov         v16.16b, v0.16b                 // next block becomes current
    mov         v17.16b, v1.16b
    mov         v18.16b, v2.16b
    mov         v19.16b, v3.16b

    sub         x1, x1, #32
    cmp         x1, #64
    b.hs        .loop4

    // Here 32 <= x1 < 64, so the count cannot be zero. The block already
    // sitting in v16-v19 is dropped; the tail loop reloads it through x0.

.loop:
    ld1         { v16.8h }, [x0]

    fmul        v17.8h, v16.8h, v31.8h          // x * alpha
    fcmge       v18.8h, v16.8h, #0.0            // lane mask: all ones where x >= 0
    bsl         v18.16b, v16.16b, v17.16b       // mask ? x : x * alpha

    st1         { v18.8h }, [x0], #16

    subs        x1, x1, #8
    b.ne        .loop

.return:
    ret