// Two back-to-back multiply-accumulate steps over the sixteen accumulators
// v16-v31, written as a register ping-pong: while one operand set is being
// multiplied, the other set is being loaded.
//
// Accumulator mapping (both halves), for i = 0..3 and j = 0..3:
//     v(16 + 4*j + i).4s += a_i.4s * b.s[j]
// where a_0..a_3 / b are v0..v3 / v4 in the first half and v5..v8 / v9 in
// the second half.
//
// Memory traffic per half: 64 bytes read through x1 (post-incremented by 8
// ten times across both pointers' loads: eight via x1, two via x2), i.e.
// x1 advances by 64 and x2 by 16 per half; 128 and 32 for the whole fragment.
//
// Each 128-bit operand is assembled from two 64-bit loads: `ldr dN` fills the
// low half of vN, `ldr xM` fetches the next 8 bytes into a general-purpose
// register, and a later `ins vN.d[1], xM` places them in the high half.
// x5-x9 are clobbered as scratch for this purpose.
// NOTE(review): the split 64-bit loads interleaved one-per-fmla look like
// scheduling for an in-order core that can pair a 64-bit load with an FMLA
// -- confirm the target CPU before changing the instruction order.
// NOTE(review): on entry v0-v4 must already hold valid operands; that
// preload is not visible in this fragment.

    // ---- first half -------------------------------------------------------
    // mul  a: v0, v1, v2, v3   b: v4
    // load a: v5(d5/x5), v6(d6/x6), v7(d7/x7), v8(d8/x8)
    // load b: v9(d9/x9)
    fmla v16.4s, v0.4s, v4.s[0]
    ldr d5, [x1], #8            // low half of next a_0
    fmla v17.4s, v1.4s, v4.s[0]
    ldr d9, [x2], #8            // low half of next b
    fmla v18.4s, v2.4s, v4.s[0]
    ldr x5, [x1], #8            // high half of next a_0 (staged in x5)
    fmla v19.4s, v3.4s, v4.s[0]
    ldr x9, [x2], #8            // high half of next b (staged in x9)
    fmla v20.4s, v0.4s, v4.s[1]
    ldr d6, [x1], #8
    fmla v21.4s, v1.4s, v4.s[1]
    ldr x6, [x1], #8
    fmla v22.4s, v2.4s, v4.s[1]
    ldr d7, [x1], #8
    fmla v23.4s, v3.4s, v4.s[1]
    ldr x7, [x1], #8
    fmla v24.4s, v0.4s, v4.s[2]
    ldr d8, [x1], #8
    fmla v25.4s, v1.4s, v4.s[2]
    ldr x8, [x1], #8
    fmla v26.4s, v2.4s, v4.s[2]
    ins v5.d[1], x5             // complete v5..v9 from the staged halves
    fmla v27.4s, v3.4s, v4.s[2]
    ins v6.d[1], x6
    fmla v28.4s, v0.4s, v4.s[3]
    ins v7.d[1], x7
    fmla v29.4s, v1.4s, v4.s[3]
    ins v8.d[1], x8
    fmla v30.4s, v2.4s, v4.s[3]
    ins v9.d[1], x9
    fmla v31.4s, v3.4s, v4.s[3]

    // ---- second half ------------------------------------------------------
    // mul  a: v5, v6, v7, v8   b: v9
    // load a: v0(d0/x5), v1(d1/x6), v2(d2/x7), v3(d3/x8)
    // load b: v4(d4/x9)
    fmla v16.4s, v5.4s, v9.s[0]
    ldr d0, [x1], #8            // low half of next a_0
    fmla v17.4s, v6.4s, v9.s[0]
    ldr d4, [x2], #8            // low half of next b
    fmla v18.4s, v7.4s, v9.s[0]
    ldr x5, [x1], #8            // high half of next a_0 (staged in x5)
    fmla v19.4s, v8.4s, v9.s[0]
    ldr x9, [x2], #8            // high half of next b (staged in x9)
    fmla v20.4s, v5.4s, v9.s[1]
    ldr d1, [x1], #8
    fmla v21.4s, v6.4s, v9.s[1]
    ldr x6, [x1], #8
    fmla v22.4s, v7.4s, v9.s[1]
    ldr d2, [x1], #8
    fmla v23.4s, v8.4s, v9.s[1]
    ldr x7, [x1], #8
    fmla v24.4s, v5.4s, v9.s[2]
    ldr d3, [x1], #8
    fmla v25.4s, v6.4s, v9.s[2]
    ldr x8, [x1], #8
    fmla v26.4s, v7.4s, v9.s[2]
    ins v0.d[1], x5             // complete v0..v4 from the staged halves
    fmla v27.4s, v8.4s, v9.s[2]
    ins v1.d[1], x6
    fmla v28.4s, v5.4s, v9.s[3]
    ins v2.d[1], x7
    fmla v29.4s, v6.4s, v9.s[3]
    ins v3.d[1], x8
    fmla v30.4s, v7.4s, v9.s[3]
    ins v4.d[1], x9
    fmla v31.4s, v8.4s, v9.s[3]