{% comment %}
// vim: set syntax=asm :

/* mmm 40 x 2:

    ymm0 ymm5
    ymm1 ymm6
    ymm2 ymm7
    ymm3 ymm8
    ymm4 ymm9

    Accumulator tile: ymm(col * 5 + row) holds the 8 consecutive f32 rows
    row * 8 .. row * 8 + 7 of column col (5 registers per column, 2 columns).
    ymm10 .. ymm15 are used as temporaries by the sections below.

    Syntax: Intel operand order (op dst, src), expanded through Liquid.
    Every section ends by jumping back to the non_linear_loop label, which is
    not defined in this file (presumably provided by the preamble include).

    NOTE(review): rbx and r12 are written below and are callee-saved in both
    ABIs; presumably the preamble/postamble includes save and restore them.
    Confirm before changing register usage.

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% include "preamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G %}

// Reset the whole accumulator tile (vzeroall zeroes every ymm register).
{{L}}clear:
    vzeroall
    jmp     {{L}}non_linear_loop

// Accumulate k steps of the packed A * packed B product into the tile.
// Nothing is done when k == 0.
{{L}}add_mat_mul:
    mov     rcx,    [rdi + 24]          // B
    mov     rax,    [rdi + 16]          // A

    mov     rbx,    [rdi + 8]           // k
    test    rbx,    rbx
    jz      {{L}}non_linear_loop

{{L}}main_loop_packed_packed:
    {% include "5x2/packed_packed_loop1/avx.tmpli" %}

    dec     rbx
    jnz     {{L}}main_loop_packed_packed

    jmp     {{L}}non_linear_loop

// NON LINEAR / ADDC

{% include "fma_mmm_f32_scalars.tmpliq" from:0, to:9, type:"f32" %}
{% include "fma_mmm_f32_per_rows.tmpliq" mr:40, from:0, to:9, type:"f32" %}
{% include "fma_mmm_f32_per_cols.tmpliq" mr:40, from:0, to:9, type:"f32" %}

// tile += C, element-wise. Strides are byte strides: a row stride of 4 means
// the 40 f32 of a column are contiguous and can be read with plain vector
// loads; any other row stride goes through the gather path.
{{L}}add_unicast:
    mov     r8,     [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride
    mov     rbx,    [rdi + 24]          // col stride

    cmp     rsi,    4
    jne     {{L}}unicast_generic

    // Contiguous rows: one pointer per column, r8 = column 0, r9 = column 1.
    lea     r9,     [ r8 + rbx ]

    {% for col in (0..1) %}
        {% for row in (0..4) %}
            vmovups ymm12,  [ r{{col|plus:8}} ]
            add     r{{col|plus:8}}, 32
            vaddps  ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12
        {% endfor %}
    {% endfor %}

    jmp     {{L}}non_linear_loop

// Strided rows. ymm14 receives the eight byte offsets 0, s, 2s, .. 7s where
// s is the row stride. Offsets are built with 32-bit arithmetic, so only the
// low 32 bits of the row stride take part.
{{L}}unicast_generic:
    xor     eax,    eax
{% for i in (0..3) %}
    vpinsrd xmm14,  xmm14, eax, {{i}}
    add     eax,    esi
{% endfor %}
{% for i in (0..3) %}
    vpinsrd xmm15,  xmm15, eax, {{i}}
    add     eax,    esi
{% endfor %}

    vperm2f128      ymm14,  ymm14, ymm15,         32 // ymm14 <- xmm14::xmm15

    // One base pointer per 8-row tile of the current column: r8 .. r12.
    lea     r9,     [ r8  + rsi * 8 ]
    lea     r10,    [ r9  + rsi * 8 ]
    lea     r11,    [ r10 + rsi * 8 ]
    lea     r12,    [ r11 + rsi * 8 ]

    {% for col in (0..1) %}
        {% for row in (0..4) %}
            // The gather clears its mask register, so rebuild the all-ones mask each time.
            vpcmpeqd        ymm15,  ymm15, ymm15
            vgatherdps      ymm12,  [ r{{row|plus:8}} + ymm14 ], ymm15
            // Step this row-tile pointer to the next column.
            add     r{{row|plus:8}}, rbx
            vaddps  ymm{{col|times:5|plus:row}}, ymm{{col|times:5|plus:row}}, ymm12
        {% endfor %}
    {% endfor %}

    jmp     {{L}}non_linear_loop

// tile += rows * cols (outer product): [rdi + 8] points to 40 f32, one per
// tile row; [rdi + 16] points to 2 f32, one per tile column.
{{L}}add_row_col_products:
    mov     rax,    [ rdi + 8 ]
    mov     rbx,    [ rdi + 16 ]

    vbroadcastss    ymm10,  dword ptr [rbx]         // column 0 factor
    vbroadcastss    ymm11,  dword ptr [rbx + 4]     // column 1 factor

    {% for i in (0..4) %}
        vmovups         ymm12,  [rax + {{i|times:32}}]
        vfmadd231ps     ymm{{0|plus:i}}, ymm12, ymm10
        vfmadd231ps     ymm{{5|plus:i}}, ymm12, ymm11
    {% endfor %}

    jmp     {{L}}non_linear_loop

// Write the tile to C. Same stride convention as add_unicast: a row stride
// of 4 bytes selects the contiguous vector-store path.
{{L}}store:
    mov     r8,     [rdi + 8]           // c ptr
    mov     rsi,    [rdi + 16]          // row stride
    mov     rbx,    [rdi + 24]          // col stride

    // One pointer per column, r8 = column 0, r9 = column 1.
    lea     r9,     [ r8 + rbx ]

    cmp     rsi,    4
    jne     {{L}}store_strides_generic

    {% for col in (0..1) %}
        {% for row in (0..4) %}
            vmovups ymmword ptr [r{{col|plus:8}}], ymm{{col|times:5|plus:row}}
            add     r{{col|plus:8}}, 32
        {% endfor %}
    {% endfor %}

    jmp     {{L}}non_linear_loop

// Strided store, one f32 at a time. vextractps only reaches the low 128 bits,
// so after the first four lanes the two halves of the register are swapped
// and the next four lanes are written. Each accumulator is left with its
// 128-bit halves swapped once this section has run.
{{L}}store_strides_generic:
    {% for col in (0..1) %}
        {% for row in (0..4) %}
            {% for i in (0..3) %}
                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:5 | plus:row}}, {{i}}
                add         r{{col | plus: 8}}, rsi
            {% endfor %}
            vperm2f128  ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, ymm{{col | times:5 | plus:row}}, 1
            {% for i in (0..3) %}
                vextractps  dword ptr [r{{col | plus: 8}}], xmm{{col | times:5 | plus:row}}, {{i}}
                add         r{{col | plus: 8}}, rsi
            {% endfor %}
        {% endfor %}
    {% endfor %}

    jmp     {{L}}non_linear_loop

{% include "postamble.tmpliq" type:"f32", size:"40x2", suffix:suffix, G:G, L:L %}