// vim: set syntax=asm :
//
// Liquid template: apply a three-operand vector instruction between a block of
// zmm registers and a sequence of broadcast 32-bit values read through rax.
//
// Template variables supplied by the including template:
//   L, label : concatenated to form the entry label of this block
//   mr       : divided by 16 to get how many consecutive zmm registers share
//              one broadcast value
//   from, to : first and last zmm register index operated on (inclusive)
//   op       : mnemonic of the instruction to emit
//   flipped  : when truthy the operands are "reg, reg, scratch",
//              otherwise "reg, scratch, reg"
//
// zmm(to + 1) is used as the scratch register holding the broadcast value.
// NOTE(review): assumes to + 1 is a valid zmm index that is free at this
// point -- confirm at the include sites.
// rax is advanced by 4 bytes after every broadcast.

{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture zmm_per_col %}{{ mr | divided_by: 16 }}{% endcapture %}
{% capture last_row %}{{ zmm_per_col | minus: 1 }}{% endcapture %}

{% capture bcast %}{{ to | plus: 1 }}{% endcapture %}

{% capture ncols %}{{ to | plus: 1 | minus: from | divided_by: zmm_per_col }}{% endcapture %}
{% capture last_col %}{{ ncols | minus: 1 }}{% endcapture %}

// {{to|minus:from|plus:1}} cols:{{ncols}}

{% for col in (0..last_col) %}
    vbroadcastss    zmm{{bcast}}, dword ptr [ rax ]
    add             rax, 4

    {% for row in (0..last_row) %}
        {% capture reg %}{{ col | times: zmm_per_col | plus: from | plus: row }}{% endcapture %}
        {% if flipped %}
            {% capture operands %}zmm{{reg}}, zmm{{reg}}, zmm{{bcast}}{% endcapture %}
        {% else %}
            {% capture operands %}zmm{{reg}}, zmm{{bcast}}, zmm{{reg}}{% endcapture %}
        {% endif %}
        {{op}} {{operands}}
    {% endfor %}
{% endfor %}

    jmp    {{L}}non_linear_loop