// vim: ft=arm

// ---------------------------------------------------------------------------
// arm64simd_tanh_f32_4n_{{suffix}}: in-place tanh approximation on f32 data.
//
// Every element x is clamped to [low, high] = [-8.9, 8.9] and replaced by
//
//     x * P(x^2) / Q(x^2)
//
// where P has 7 coefficients (alpha_13 .. alpha_1) and Q has 4 coefficients
// (beta_6 .. beta_0), both evaluated with Horner steps on x^2. All constants
// live in the .coeffs_num table at the end of this file.
//
// Syntax: GNU as, AArch64.
//
// In:    x0 = address of the f32 buffer (read and overwritten in place)
//        x1 = number of f32 elements; expected to be a multiple of 4
//             (presumably a size_t coming from the caller -- TODO confirm)
// Out:   nothing
// Clobb: x0, x1, x2, v0-v3, v5, v6, v16-v31, condition flags
//
// Only v0-v7 and v16-v31 are used as vector scratch; under AAPCS64 these are
// caller-saved, so nothing is preserved or restored here.
//
// Structure:
//   .loop4  handles 16 elements (4 vectors) per iteration, with the four
//           independent Horner chains interleaved,
//   .loop   handles the remainder 4 elements (1 vector) at a time.
//
// If x1 is not a multiple of 4, the last partial group is still processed as
// a full 4-lane vector (up to 3 floats past the nominal end are read and
// written), but the loop terminates instead of wrapping the counter.
//
// Register roles while looping:
//   v0-v3    coefficient table, see .coeffs_num
//   v5, v6   low / high clamp bounds, broadcast to all lanes
//   v16-v19  x, later the numerator, finally the result
//   v20-v23  x^2
//   v24-v31  Horner accumulators, used in ping-pong fashion
// ---------------------------------------------------------------------------

.text
.align 4

.cpu generic+fp+simd
.global {{G}}arm64simd_tanh_f32_4n_{{suffix}}
{{G}}arm64simd_tanh_f32_4n_{{suffix}}:

    cbz         x1, .return                         // nothing to do for an empty buffer

    adr         x2, .coeffs_num
    ld1         { v0.4s, v1.4s, v2.4s, v3.4s }, [x2]
    dup         v5.4s, v0.s[0]                      // v5 <- low, broadcasted
    dup         v6.4s, v0.s[1]                      // v6 <- high, broadcasted

    cmp         x1, #16
    b.lo        .loop                               // fewer than 16 elements: single-vector loop only

.loop4:
    ld1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0]

    // clamp to [low, high]
    fmax        v16.4s, v16.4s, v5.4s
    fmax        v17.4s, v17.4s, v5.4s
    fmax        v18.4s, v18.4s, v5.4s
    fmax        v19.4s, v19.4s, v5.4s

    fmin        v16.4s, v16.4s, v6.4s
    fmin        v17.4s, v17.4s, v6.4s
    fmin        v18.4s, v18.4s, v6.4s
    fmin        v19.4s, v19.4s, v6.4s               // v16-v19 <- x

    fmul        v20.4s, v16.4s, v16.4s
    fmul        v21.4s, v17.4s, v17.4s
    fmul        v22.4s, v18.4s, v18.4s
    fmul        v23.4s, v19.4s, v19.4s              // v20-v23 <- x2

    // numerator, Horner on x2: acc = alpha_11 + x2 * alpha_13
    dup         v24.4s, v0.s[3]
    fmla        v24.4s, v20.4s, v0.s[2]
    dup         v25.4s, v0.s[3]
    fmla        v25.4s, v21.4s, v0.s[2]
    dup         v26.4s, v0.s[3]
    fmla        v26.4s, v22.4s, v0.s[2]
    dup         v27.4s, v0.s[3]
    fmla        v27.4s, v23.4s, v0.s[2]

    // acc = alpha_9 + x2 * acc
    dup         v28.4s, v1.s[0]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v29.4s, v1.s[0]
    fmla        v29.4s, v21.4s, v25.4s
    dup         v30.4s, v1.s[0]
    fmla        v30.4s, v22.4s, v26.4s
    dup         v31.4s, v1.s[0]
    fmla        v31.4s, v23.4s, v27.4s

    // acc = alpha_7 + x2 * acc
    dup         v24.4s, v1.s[1]
    fmla        v24.4s, v20.4s, v28.4s
    dup         v25.4s, v1.s[1]
    fmla        v25.4s, v21.4s, v29.4s
    dup         v26.4s, v1.s[1]
    fmla        v26.4s, v22.4s, v30.4s
    dup         v27.4s, v1.s[1]
    fmla        v27.4s, v23.4s, v31.4s

    // acc = alpha_5 + x2 * acc
    dup         v28.4s, v1.s[2]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v29.4s, v1.s[2]
    fmla        v29.4s, v21.4s, v25.4s
    dup         v30.4s, v1.s[2]
    fmla        v30.4s, v22.4s, v26.4s
    dup         v31.4s, v1.s[2]
    fmla        v31.4s, v23.4s, v27.4s

    // acc = alpha_3 + x2 * acc
    dup         v24.4s, v1.s[3]
    fmla        v24.4s, v20.4s, v28.4s
    dup         v25.4s, v1.s[3]
    fmla        v25.4s, v21.4s, v29.4s
    dup         v26.4s, v1.s[3]
    fmla        v26.4s, v22.4s, v30.4s
    dup         v27.4s, v1.s[3]
    fmla        v27.4s, v23.4s, v31.4s

    // acc = alpha_1 + x2 * acc
    dup         v28.4s, v2.s[0]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v29.4s, v2.s[0]
    fmla        v29.4s, v21.4s, v25.4s
    dup         v30.4s, v2.s[0]
    fmla        v30.4s, v22.4s, v26.4s
    dup         v31.4s, v2.s[0]
    fmla        v31.4s, v23.4s, v27.4s

    fmul        v16.4s, v16.4s, v28.4s
    fmul        v17.4s, v17.4s, v29.4s
    fmul        v18.4s, v18.4s, v30.4s
    fmul        v19.4s, v19.4s, v31.4s              // v16-v19 <- numerator

    // denominator, Horner on x2: acc = beta_4 + x2 * beta_6
    dup         v24.4s, v2.s[2]
    fmla        v24.4s, v20.4s, v2.s[1]
    dup         v25.4s, v2.s[2]
    fmla        v25.4s, v21.4s, v2.s[1]
    dup         v26.4s, v2.s[2]
    fmla        v26.4s, v22.4s, v2.s[1]
    dup         v27.4s, v2.s[2]
    fmla        v27.4s, v23.4s, v2.s[1]

    // acc = beta_2 + x2 * acc
    dup         v28.4s, v2.s[3]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v29.4s, v2.s[3]
    fmla        v29.4s, v21.4s, v25.4s
    dup         v30.4s, v2.s[3]
    fmla        v30.4s, v22.4s, v26.4s
    dup         v31.4s, v2.s[3]
    fmla        v31.4s, v23.4s, v27.4s

    // acc = beta_0 + x2 * acc
    dup         v24.4s, v3.s[0]
    fmla        v24.4s, v20.4s, v28.4s
    dup         v25.4s, v3.s[0]
    fmla        v25.4s, v21.4s, v29.4s
    dup         v26.4s, v3.s[0]
    fmla        v26.4s, v22.4s, v30.4s
    dup         v27.4s, v3.s[0]
    fmla        v27.4s, v23.4s, v31.4s              // v24-v27 <- denominator

    fdiv        v16.4s, v16.4s, v24.4s
    fdiv        v17.4s, v17.4s, v25.4s
    fdiv        v18.4s, v18.4s, v26.4s
    fdiv        v19.4s, v19.4s, v27.4s

    st1         { v16.4s, v17.4s, v18.4s, v19.4s }, [x0], #64

    sub         x1, x1, #16                         // flags come from the cmp below
    cmp         x1, #16
    b.hs        .loop4                              // unsigned: x1 is an element count
    cbz         x1, .return

.loop:
    ld1         { v16.4s }, [x0]

    fmax        v16.4s, v16.4s, v5.4s
    fmin        v16.4s, v16.4s, v6.4s               // v16 <- x
    fmul        v20.4s, v16.4s, v16.4s              // v20 <- x2

    // numerator: same Horner chain as in .loop4, one vector only
    dup         v24.4s, v0.s[3]
    fmla        v24.4s, v20.4s, v0.s[2]
    dup         v28.4s, v1.s[0]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v24.4s, v1.s[1]
    fmla        v24.4s, v20.4s, v28.4s
    dup         v28.4s, v1.s[2]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v24.4s, v1.s[3]
    fmla        v24.4s, v20.4s, v28.4s
    dup         v28.4s, v2.s[0]
    fmla        v28.4s, v20.4s, v24.4s
    fmul        v16.4s, v16.4s, v28.4s              // v16 <- numerator

    // denominator
    dup         v24.4s, v2.s[2]
    fmla        v24.4s, v20.4s, v2.s[1]
    dup         v28.4s, v2.s[3]
    fmla        v28.4s, v20.4s, v24.4s
    dup         v24.4s, v3.s[0]
    fmla        v24.4s, v20.4s, v28.4s              // v24 <- denominator

    fdiv        v16.4s, v16.4s, v24.4s

    st1         { v16.4s }, [x0], #16

    // Continue only while more than 4 elements were left before this
    // subtraction. Unlike a plain "not zero" test, this also stops when the
    // subtraction borrows, i.e. when x1 was not a multiple of 4.
    subs        x1, x1, #4
    b.hi        .loop

.return:
    ret

// Constant table, loaded as four 4-lane vectors into v0-v3.
// Kept in .text so it stays within reach of the adr above.
.coeffs_num:
    .float      -8.9                                // v0.s[0]  low
    .float      8.9                                 // v0.s[1]  high

    .float      -8.488492677e-14                    // v0.s[2]  alpha_13
    .float      5.277853000e-11                     // v0.s[3]  alpha_11
    .float      -2.022500419e-8                     // v1.s[0]  alpha_9
    .float      0.00001115424833                    // v1.s[1]  alpha_7
    .float      0.003103950131                      // v1.s[2]  alpha_5
    .float      0.1308400453                        // v1.s[3]  alpha_3
    .float      0.9999999934                        // v2.s[0]  alpha_1

    .float      0.0002546136580                     // v2.s[1]  beta_6
    .float      0.02449515379                       // v2.s[2]  beta_4
    .float      0.4641733162                        // v2.s[3]  beta_2
    .float      1.0                                 // v3.s[0]  beta_0

    .float      0                                   // v3.s[1]  padding
    .float      0                                   // v3.s[2]  padding
    .float      0                                   // v3.s[3]  padding