{% if msvc %} _text segment {{arch}}_mmm_f32_{{size}}_{{suffix}} proc {% else %} .intel_syntax noprefix .text .p2align 5 .globl {{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}} {{G}}{{arch}}_mmm_f32_{{size}}_{{suffix}}: .cfi_startproc {% endif %} push rbp mov rbp, rsp {% if family == "windows" %} // https://www.agner.org/optimize/calling_conventions.pdf xmm6-15 are not scratch // https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers and rsp,-16 lea rsp,[rsp-160] vmovaps [rsp], xmm6 vmovaps [rsp+16*1],xmm7 vmovaps [rsp+16*2],xmm8 vmovaps [rsp+16*3],xmm9 vmovaps [rsp+16*4],xmm10 vmovaps [rsp+16*5],xmm11 vmovaps [rsp+16*6],xmm12 vmovaps [rsp+16*7],xmm13 vmovaps [rsp+16*8],xmm14 vmovaps [rsp+16*9],xmm15 push rdi push rsi mov rdi, rcx {% endif %} push rbx push r12 push r13 push r14 push r15 sub rsp, 8 {% if family == "unix" %} .cfi_def_cfa_offset 64 {% endif %} stmxcsr [rsp + 4] {% if msvc %} mov rax, 1FC0h {% else %} mov rax, 0x1FC0 {% endif %} mov [rsp], eax ldmxcsr [rsp] {% include "dispatcher.tmpliq" %}