// vim: set syntax=asm :

// Per-column scalar operation over the accumulator registers ymm<from>..ymm<to>.
//
// rax is loaded from [rdi + 8] and then walked as a pointer to consecutive
// scalars. For each column, one scalar is read, broadcast to every lane of the
// scratch register ymm<to + 1>, and combined with each of the mr/8 accumulator
// registers of that column using the three-operand instruction given as `op`.
// Control then jumps back to the <L>non_linear_loop label.
//
// Template variables read here: L, label, mr, from, to, type, op, flipped.
//   type == "f16": scalars are 16 bits wide; each one is inserted into lane 0
//                  of the scratch register, converted to single precision and
//                  then broadcast (rax advances by 2).
//   otherwise:     scalars are 32 bits wide and are broadcast straight from
//                  memory (rax advances by 4).
//   flipped:       selects the order of the two source operands of `op`
//                  (accumulator first when set, broadcast scalar first when
//                  not). Presumably needed for non-commutative operations.
//
// The f16 load uses the VEX-encoded vpinsrw rather than legacy pinsrw so that
// no legacy-SSE instruction is mixed into the surrounding 256-bit code. The
// upper half of the scratch register is rewritten by vcvtph2ps right after,
// so zeroing it instead of preserving it changes no result.

{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture mr_over_8 %}{{ mr | divided_by: 8}}{%endcapture%}
{% capture mr_over_8_min_1 %}{{ mr | divided_by: 8 | minus: 1}}{%endcapture%}

{%capture tmp%}{{to | plus: 1 }}{%endcapture%}

{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_8}}{%endcapture%}
{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_8|minus:1}}{%endcapture%}

{% for right in (0..cols_min_1) %}
    {% if type == "f16" %}
        vpinsrw         xmm{{tmp}}, xmm{{tmp}}, word ptr [ rax ], 0
        add             rax, 2
        vcvtph2ps       ymm{{tmp}}, xmm{{tmp}}
        vbroadcastss    ymm{{tmp}}, xmm{{tmp}}
    {% else %}
        vbroadcastss    ymm{{tmp}}, dword ptr [ rax ]
        add             rax, 4
    {% endif %}
    {% for down in (0..mr_over_8_min_1) %}
        {%capture acc%}{{mr_over_8|times:right|plus:from|plus:down}}{%endcapture%}
        {% if flipped %}
            {{op}} ymm{{acc}}, ymm{{acc}}, ymm{{tmp}}
        {% else %}
            {{op}} ymm{{acc}}, ymm{{tmp}}, ymm{{acc}}
        {% endif %}
    {% endfor %}
{% endfor %}

    jmp {{L}}non_linear_loop