// vim: set syntax=asm :

// Template parameters read below: L, label, mr, type, from, to, op, flipped.
//
// Emitted code, in order:
//   1. read the qword stored at [rdi + 8] into rax
//   2. load mr/8 vectors of 8 f32 lanes from [rax] into the ymm registers
//      numbered to+1 and up (f16 input is loaded as 16 bytes per vector and
//      widened to f32 with vcvtph2ps)
//   3. apply op to every accumulator ymm register in from..to, pairing
//      accumulator number r with loaded vector number (r modulo mr/8)
//   4. jump to the non_linear_loop label
//
// NOTE(review): rax is presumably the address of a vector of mr operand
// values supplied by the caller through the structure at rdi; confirm
// against the dispatcher that includes this template.

{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture vecs_per_col %}{{ mr | divided_by: 8 }}{% endcapture %}
{% capture last_vec %}{{ mr | divided_by: 8 | minus: 1 }}{% endcapture %}

// Load the operand vectors into the registers just above the accumulators.
{% for slot in (0..last_vec) %}
    {% if type == "f16" %}
    vmovups         xmm{{ to | plus: 1 | plus: slot }}, [rax + {{ slot | times: 16 }}]
    {% else %}
    vmovups         ymm{{ to | plus: 1 | plus: slot }}, [rax + {{ slot | times: 32 }}]
    {% endif %}
{% endfor %}

{% if type == "f16" %}
// Widen each 8 x f16 load to 8 x f32, in place in the same register number.
{% for slot in (0..last_vec) %}
    vcvtph2ps       ymm{{ to | plus: 1 | plus: slot }}, xmm{{ to | plus: 1 | plus: slot }}
{% endfor %}
{% endif %}

// Combine each accumulator with its operand vector. When flipped is set the
// accumulator is the first source operand, otherwise it is the second.
{% for reg in (from..to) %}
    {% capture operand %}ymm{{ reg | modulo: vecs_per_col | plus: to | plus: 1 }}{% endcapture %}
    {% if flipped %}
    {{op}} ymm{{reg}}, ymm{{reg}}, {{operand}}
    {% else %}
    {{op}} ymm{{reg}}, {{operand}}, ymm{{reg}}
    {% endif %}
{% endfor %}

    jmp    {{L}}non_linear_loop