// Two software-pipelined multiply-accumulate steps that ping-pong between two
// operand register sets:
//   step 1: multiply with a = v0..v3, b = v4  while loading a -> v5..v8, b -> v9
//   step 2: multiply with a = v5..v8, b = v9  while loading a -> v0..v3, b -> v4
//
// Accumulators (8 x fp16 lanes each), updated by both steps:
//   v16..v19 += a[0..3] * b.h[0]
//   v20..v23 += a[0..3] * b.h[1]
//   v24..v27 += a[0..3] * b.h[2]
//   v28..v31 += a[0..3] * b.h[3]
//
// Pointers: x1 is the "a" stream (post-incremented by 8 bytes eight times per
// step, i.e. 64 bytes per step); x2 is the "b" stream (post-incremented by
// 8 bytes once per step). x5..x8 are overwritten as scratch.
//
// Each 128-bit "a" vector is built from two 64-bit loads: `ldr dN` fills the
// low half, `ldr xM` + `ins vN.d[1], xM` fills the high half.
// NOTE(review): presumably split this way so the loads/inserts can issue
// alongside the fmla stream on in-order cores -- confirm the target core
// before reordering anything here.
// NOTE(review): d8/d9 are callee-saved under AAPCS64 and are overwritten
// below; presumably saved/restored outside this view -- verify.

    // ---- step 1 ----
    // mul  a: v0, v1, v2, v3   b: v4
    // load a: v5 (d5/x5), v6 (d6/x6), v7 (d7/x7), v8 (d8/x8)
    // load b: v9, low 64 bits only (as d9)
    fmla v16.8h, v0.8h, v4.h[0]
    ldr d5, [x1], #8                // v5 low half
    fmla v17.8h, v1.8h, v4.h[0]
    ldr d9, [x2], #8                // next b: only h[0]..h[3] are consumed
    fmla v18.8h, v2.8h, v4.h[0]
    ldr x5, [x1], #8                // v5 high half, staged in x5
    fmla v19.8h, v3.8h, v4.h[0]
    fmla v20.8h, v0.8h, v4.h[1]
    ldr d6, [x1], #8                // v6 low half
    fmla v21.8h, v1.8h, v4.h[1]
    ldr x6, [x1], #8                // v6 high half, staged in x6
    fmla v22.8h, v2.8h, v4.h[1]
    ldr d7, [x1], #8                // v7 low half
    fmla v23.8h, v3.8h, v4.h[1]
    ldr x7, [x1], #8                // v7 high half, staged in x7
    fmla v24.8h, v0.8h, v4.h[2]
    ldr d8, [x1], #8                // v8 low half
    fmla v25.8h, v1.8h, v4.h[2]
    ldr x8, [x1], #8                // v8 high half, staged in x8
    fmla v26.8h, v2.8h, v4.h[2]
    ins v5.d[1], x5                 // complete v5
    fmla v27.8h, v3.8h, v4.h[2]
    ins v6.d[1], x6                 // complete v6
    fmla v28.8h, v0.8h, v4.h[3]
    ins v7.d[1], x7                 // complete v7
    fmla v29.8h, v1.8h, v4.h[3]
    ins v8.d[1], x8                 // complete v8
    fmla v30.8h, v2.8h, v4.h[3]
    // NOTE(review): x9 is not written anywhere in this section, and step 2
    // reads only v9.h[0]..h[3], so this insert appears to have no effect on
    // the results here -- confirm whether it is intentional.
    ins v9.d[1], x9
    fmla v31.8h, v3.8h, v4.h[3]

    // ---- step 2 ----
    // mul  a: v5, v6, v7, v8   b: v9
    // load a: v0 (d0/x5), v1 (d1/x6), v2 (d2/x7), v3 (d3/x8)
    // load b: v4, low 64 bits only (as d4)
    fmla v16.8h, v5.8h, v9.h[0]
    ldr d0, [x1], #8                // v0 low half
    fmla v17.8h, v6.8h, v9.h[0]
    ldr d4, [x2], #8                // next b: only h[0]..h[3] are consumed
    fmla v18.8h, v7.8h, v9.h[0]
    ldr x5, [x1], #8                // v0 high half, staged in x5
    fmla v19.8h, v8.8h, v9.h[0]
    fmla v20.8h, v5.8h, v9.h[1]
    ldr d1, [x1], #8                // v1 low half
    fmla v21.8h, v6.8h, v9.h[1]
    ldr x6, [x1], #8                // v1 high half, staged in x6
    fmla v22.8h, v7.8h, v9.h[1]
    ldr d2, [x1], #8                // v2 low half
    fmla v23.8h, v8.8h, v9.h[1]
    ldr x7, [x1], #8                // v2 high half, staged in x7
    fmla v24.8h, v5.8h, v9.h[2]
    ldr d3, [x1], #8                // v3 low half
    fmla v25.8h, v6.8h, v9.h[2]
    ldr x8, [x1], #8                // v3 high half, staged in x8
    fmla v26.8h, v7.8h, v9.h[2]
    ins v0.d[1], x5                 // complete v0
    fmla v27.8h, v8.8h, v9.h[2]
    ins v1.d[1], x6                 // complete v1
    fmla v28.8h, v5.8h, v9.h[3]
    ins v2.d[1], x7                 // complete v2
    fmla v29.8h, v6.8h, v9.h[3]
    ins v3.d[1], x8                 // complete v3
    fmla v30.8h, v7.8h, v9.h[3]
    // NOTE(review): same as in step 1 -- x9 is never loaded in this section
    // and only v4.h[0]..h[3] are read by the multiplies; confirm intent.
    ins v4.d[1], x9
    fmla v31.8h, v8.8h, v9.h[3]