// vim: set syntax=asm :

// Template inputs read below: L, label, mr, from, to, op, flipped.
// Fetches a pointer from [rdi + 8], loads mr/16 vectors through it into the
// registers that follow zmm(to), combines every accumulator zmm(from)..zmm(to)
// with one of those vectors using the instruction named by op, then jumps to
// the non_linear_loop label.
// NOTE(review): mr/16 presumably means 16 elements per zmm register - confirm.

{{L}}{{label}}:
    mov             rax, [ rdi + 8 ]

{% capture mr_over_16 %}{{ mr | divided_by: 16 }}{% endcapture %}
{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1 }}{% endcapture %}

// Operand vectors: consecutive 64-byte loads into zmm(to + 1) and upwards.
{% for slot in (0..mr_over_16_min_1) %}
    vmovups         zmm{{ to | plus: 1 | plus: slot }}, [rax + {{ slot | times: 64 }}]
{% endfor %}

// Accumulator n is paired with operand register zmm(to + 1 + n mod mr/16).
// With flipped set the accumulator is the first source operand, otherwise
// it is the second one.
{% for reg in (from..to) %}
{% if flipped %}
    {{op}} zmm{{reg}}, zmm{{reg}}, zmm{{ reg | modulo: mr_over_16 | plus: to | plus: 1 }}
{% else %}
    {{op}} zmm{{reg}}, zmm{{ reg | modulo: mr_over_16 | plus: to | plus: 1 }}, zmm{{reg}}
{% endif %}
{% endfor %}

    jmp {{L}}non_linear_loop