// vim: ft=arm

// C tile regs
//
//      q8[0]    q10[0]   q12[0]    q14[0]
//      q8[1]    q10[1]   q12[1]    q14[1]
//      q8[2]    q10[2]   q12[2]    q14[2]
//      q8[3]    q10[3]   q12[3]    q14[3]
//
//      q9[0]    q11[0]   q13[0]    q15[0]
//      q9[1]    q11[1]   q13[1]    q15[1]
//      q9[2]    q11[2]   q13[2]    q15[2]
//      q9[3]    q11[3]   q13[3]    q15[3]

// packed A buffering (2x8 values): alternating q0, q1 with q2, q3
// packed B buffering (2x4 values): alternating q4 with q5

    .arm
    .text
    .global armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}
    .type armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}, %function

armv7neon_mmm_f32_8x4_{{core}}_{{suffix}}:
    pld     [r0]
    push    { r4-r12 }
    vpush   { q4-q7 }

{% include "dispatcher.tmpliq" %}

.add_mat_mul:

    cmp     r3, #0
    beq     .non_linear_loop

    mov     r1, r4 // packed A ptr
    pld     [r3]
    pld     [r5]

    .packed_packed:
    pld     [r5]                           // packed B ptr
    .packed_packed_loop_1:

{% if core == "cortexa7" %}
    vldr            d0, [r1]
    vldr            d1, [r1, #8]
    vldr            d2, [r1, #16]
    vldr            d3, [r1, #24]
    vldr            d4, [r5]
    vldr            d5, [r5, #8]
{% elsif core == "cortexa9" %}
    vld1.64         {d0-d3}, [r1]!
    vld1.64         {d4, d5}, [r5]!
{% else %}
    vldmia          r1!, { q0, q1}
    vldmia          r5!, { q2 }
{% endif %}

{% if core != "generic" %}
    pld             [r1, #512]
    pld             [r5, #512]
{% endif %}

    vmla.f32        q8, q0, d4[0]
    vmla.f32        q9, q1, d4[0]

    vmla.f32        q10, q0, d4[1]
    vmla.f32        q11, q1, d4[1]

    vmla.f32        q12, q0, d5[0]
    vmla.f32        q13, q1, d5[0]

    vmla.f32        q14, q0, d5[1]
    vmla.f32        q15, q1, d5[1]

{% if core == "cortexa7" %}
    add             r1, #32
    add             r5, #16
{% endif %}

    subs r3, r3, #1
    bne .packed_packed_loop_1
    b   .non_linear_loop

{% include "armv7neon_mmm_f32_scalars.tmpliq" from:8, to:15 %}
{% include "armv7neon_mmm_f32_per_rows.tmpliq" mr:8, from:8, to:15 %}
{% include "armv7neon_mmm_f32_per_cols.tmpliq" mr:8, from:8, to:15 %}

.add_unicast:
    // r3, r4, r5 <- ptr, rsc, csc
    {% for col in (0..3) %}
        mov         r2, r3
        {% for reg in (0..3) %}
            vld1.f32    d0[0], [ r2 ], r4
            vld1.f32    d0[1], [ r2 ], r4
            vadd.f32    d{{col | times: 4 | plus: reg | plus : 16}}, d0
        {% endfor %}
        add r3, r3, r5
    {% endfor %}

    b .non_linear_loop

.add_row_col_products:
    vldmia          r3!, { q0, q1 }
    vldmia          r4!, { q4 }

    vmla.f32        q8, q0, d8[0]
    vmla.f32        q9, q1, d8[0]

    vmla.f32        q10, q0, d8[1]
    vmla.f32        q11, q1, d8[1]

    vmla.f32        q12, q0, d9[0]
    vmla.f32        q13, q1, d9[0]

    vmla.f32        q14, q0, d9[1]
    vmla.f32        q15, q1, d9[1]

    b .non_linear_loop

.store:
    // r3,r4,r5 are c,rsc,csc
    {% for col in (0..3) %}
        mov         r8, r3
        {% for reg in (0..3) %}
            vst1.f32    d{{col | times: 4 | plus: reg | plus : 16}}[0], [ r8 ], r4
            vst1.f32    d{{col | times: 4 | plus: reg | plus : 16}}[1], [ r8 ], r4
        {% endfor %}
        {% if col < 3 %}
            add r3, r3, r5
        {% endif %}
    {% endfor %}
    b .non_linear_loop

.return:
    vpop        { q4-q7 }
    pop         { r4-r12 }

    bx          lr