{% comment %}
// vim: set syntax=asm :

/* mmm 8x8:

    ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm7

System V ABI:
    args: rdi, rsi, rdx, rcx, r8, r9
    preserve: rbx, rsp, rbp, r12, r13, r14, r15
    scratch: rax, rdi, rsi, rdx, rcx, r8, r9, r10, r11
    return: rax (+rdx)

Windows ABI:
    args: RCX, RDX, R8, R9
    preserve: RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15, and XMM6-15
    scratch: RAX, RCX, RDX, R8, R9, R10, R11, XMM0-5, and the upper portions of YMM0-15 and ZMM0-15
    return: rax (+rdx)
*/
{% endcomment %}

{% if msvc %}

_text segment
avx2_mmm_i32_8x8_{{suffix}} proc

{% else %}

.intel_syntax noprefix
.text
.p2align 5
.globl {{G}}avx2_mmm_i32_8x8_{{suffix}}
{{G}}avx2_mmm_i32_8x8_{{suffix}}:
.cfi_startproc

{% endif %}

    push        rbp
    mov         rbp, rsp

{% if family == "windows" %}
// https://www.agner.org/optimize/calling_conventions.pdf
// xmm6-15 are not scratch
// https://stackoverflow.com/questions/43358429/save-value-of-xmm-registers
    and         rsp, -16
    lea         rsp, [rsp - 160]
    vmovaps     [rsp], xmm6
    vmovaps     [rsp + 16*1], xmm7
    vmovaps     [rsp + 16*2], xmm8
    vmovaps     [rsp + 16*3], xmm9
    vmovaps     [rsp + 16*4], xmm10
    vmovaps     [rsp + 16*5], xmm11
    vmovaps     [rsp + 16*6], xmm12
    vmovaps     [rsp + 16*7], xmm13
    vmovaps     [rsp + 16*8], xmm14
    vmovaps     [rsp + 16*9], xmm15

    push        rdi
    push        rsi

    mov         rdi, rcx
{% endif %}

    push        rbx
    push        r12
    push        r13
    push        r14
    push        r15

    sub         rsp, 8
{% if family == "unix" %}
.cfi_def_cfa_offset 64
{% endif %}

    stmxcsr     [rsp + 4]
{% if msvc %}
    mov         rax, 1FC0h
{% else %}
    mov         rax, 0x1FC0
{% endif %}
    mov         [rsp], eax
    ldmxcsr     [rsp]

{% include "dispatcher.tmpliq" %}

{{L}}clear:
    vzeroall
    jmp         {{L}}non_linear_loop

{{L}}add_mat_mul:
    mov         r12, [rdi + 32]             // packing
    mov         rbx, [rdi + 24]             // B
    mov         rax, [rdi + 16]             // A
    mov         rcx, [rdi + 8]              // k
    test        rcx, rcx
    jz          {{L}}non_linear_loop

    cmp         r12, 1
    je          {{L}}main_loop_packed_packed_i8i8

{{L}}main_loop_packed_packed:
    vmovaps     ymm12, [rax]

{% for i in (0..7) %}
    vbroadcastss    ymm14, dword ptr [rbx + {{i}} * 4]
    vpmulld         ymm13, ymm12, ymm14
    vpaddd          ymm{{i}}, ymm{{i}}, ymm13
{% endfor %}

    add         rax, 32
    add         rbx, 32
    dec         rcx
    jnz         {{L}}main_loop_packed_packed
    jmp         {{L}}non_linear_loop

{{L}}main_loop_packed_packed_i8i8:
    movq            xmm8, qword ptr [rax]       // read 8 bytes
    vpmovsxbw       ymm8, xmm8                  // sign extend i8 to i16

    vpbroadcastb    ymm9, byte ptr [rbx]        // broadcast 1 byte from B
    vpbroadcastb    ymm10, byte ptr [rbx + 1]   // broadcast 1 byte from B
    vpbroadcastb    ymm11, byte ptr [rbx + 2]   // broadcast 1 byte from B
    vpbroadcastb    ymm12, byte ptr [rbx + 3]   // broadcast 1 byte from B

    vpmovsxbw       ymm9, xmm9                  // sign extend i8 to i16
    vpmovsxbw       ymm10, xmm10                // sign extend i8 to i16
    vpmovsxbw       ymm11, xmm11                // sign extend i8 to i16
    vpmovsxbw       ymm12, xmm12                // sign extend i8 to i16

    vpmullw         ymm9, ymm9, ymm8
    vpmullw         ymm10, ymm10, ymm8
    vpmullw         ymm11, ymm11, ymm8
    vpmullw         ymm12, ymm12, ymm8

    vpmovsxwd       ymm9, xmm9                  // sign extend i16 to i32x8
    vpmovsxwd       ymm10, xmm10                // sign extend i16 to i32x8
    vpmovsxwd       ymm11, xmm11                // sign extend i16 to i32x8
    vpmovsxwd       ymm12, xmm12                // sign extend i16 to i32x8

    vpaddd          ymm0, ymm0, ymm9
    vpaddd          ymm1, ymm1, ymm10
    vpaddd          ymm2, ymm2, ymm11
    vpaddd          ymm3, ymm3, ymm12

    vpbroadcastb    ymm9, byte ptr [rbx + 4]
    vpbroadcastb    ymm10, byte ptr [rbx + 5]
    vpbroadcastb    ymm11, byte ptr [rbx + 6]
    vpbroadcastb    ymm12, byte ptr [rbx + 7]

    vpmovsxbw       ymm9, xmm9
    vpmovsxbw       ymm10, xmm10
    vpmovsxbw       ymm11, xmm11
    vpmovsxbw       ymm12, xmm12

    vpmullw         ymm9, ymm9, ymm8
    vpmullw         ymm10, ymm10, ymm8
    vpmullw         ymm11, ymm11, ymm8
    vpmullw         ymm12, ymm12, ymm8

    vpmovsxwd       ymm9, xmm9                  // sign extend i16 to i32x8
    vpmovsxwd       ymm10, xmm10                // sign extend i16 to i32x8
    vpmovsxwd       ymm11, xmm11                // sign extend i16 to i32x8
    vpmovsxwd       ymm12, xmm12                // sign extend i16 to i32x8
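    // accumulate the second half of the tile (columns 4..7) into ymm4..ymm7.
    // note: an i8*i8 product is at most 128*128 = 16384 in magnitude, so it always
    // fits in i16 and the vpmullw above is exact before widening to i32 via vpmovsxwd.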
    vpaddd          ymm4, ymm4, ymm9
    vpaddd          ymm5, ymm5, ymm10
    vpaddd          ymm6, ymm6, ymm11
    vpaddd          ymm7, ymm7, ymm12

    add             rbx, 8
    add             rax, 8
    dec             rcx
    jnz             {{L}}main_loop_packed_packed_i8i8
    jmp             {{L}}non_linear_loop

{% include "fma_mmm_i32_scalars.tmpliq" from:0, to:7 %}
{% include "fma_mmm_i32_per_rows.tmpliq" mr:8,from:0, to:7 %}
{% include "fma_mmm_i32_per_cols.tmpliq" mr:8,from:0, to:7 %}

{{L}}add_unicast:
    mov         r10, [rdi + 8]              // c ptr
    mov         rsi, [rdi + 16]             // row stride
    mov         rbx, [rdi + 24]             // col stride
    mov         r8, [rdi + 32]              // item size

    cmp         r8, 4
    je          {{L}}non_linear_addc_i32

{% comment %}
    // This is not great as vgatherdps reads 32-bits values and goes beyond our buffer. Probably harmless though.
    // Commented and replaced with the "mov al" loop below to pacify valgrind.
    // ymm14 and ymm15 are the same as in the non_linear_addc_i32 case (compute them before the test right above here).
    // {% for i in (0..7) %}
    //     vpcmpeqd    ymm15, ymm15, ymm15
    //     vgatherdps  ymm12, [ r10 + ymm14 ], ymm15   // 0xxx 1xxx 2xxx 3xxx 4xxx 5xxx 6xxx 7xxx
    //
    //     // we need to go through vpmovsxbd, shuffling naively erases signs
    //     vpshufb     ymm12, ymm12, ymm10             // 0123 0123 0123 0123 4567 4567 4567 4567
    //
    //     vpermd      ymm12, ymm11, ymm12             // 0123 4567
    //     vpmovsxbd   ymm12, xmm12                    // sign extend
    //
    //     vpaddd      ymm{{i}}, ymm{{i}}, ymm12
    //     add         r10, rbx
    // {% endfor %}
{% endcomment %}

{% for col in (0..7) %}
    mov         r8, r10
{% for half in (0..1) %}
{% for lane in (0..3) %}
    mov         al, [ r8 ]
    add         r8, rsi
    movsx       eax, al
    pinsrd      xmm10, eax, {{lane}}
{% endfor %}
    vperm2f128  ymm10, ymm10, ymm10, 1
{% endfor %}
    vpaddd      ymm{{col}}, ymm{{col}}, ymm10
    add         r10, rbx
{% endfor %}

    jmp         {{L}}non_linear_loop

{{L}}non_linear_addc_i32:
    mov         eax, 0
{% for i in (0..3) %}
    pinsrd      xmm14, eax, {{i}}
    add         eax, esi
{% endfor %}
    vpermq      ymm14, ymm14, 78            // 0b01001110
{% for i in (0..3) %}
    pinsrd      xmm14, eax, {{i}}
    add         eax, esi
{% endfor %}
    vpermq      ymm14, ymm14, 78            // 0b01001110

{% if msvc %}
    vpbroadcastd    ymm10, dword ptr [ offset byte_shuffle ]
    vmovups         ymm11, dword ptr [ offset i128_shuffle ]
{% else %}
    vpbroadcastd    ymm10, [ rip + {{L}}byte_shuffle ]
    vmovups         ymm11, [ rip + {{L}}i128_shuffle ]
{% endif %}

{% for i in (0..7) %}
    vpcmpeqd    ymm15, ymm15, ymm15
    vgatherdps  ymm12, [ r10 + ymm14 ], ymm15
    vpaddd      ymm{{i}}, ymm{{i}}, ymm12
    add         r10, rbx
{% endfor %}

    jmp         {{L}}non_linear_loop

{% if msvc %}
.data
byte_shuffle dd 201851904 // 0x0c080400
i128_shuffle dd 0, 4
.code
{% else %}
{{L}}byte_shuffle:
    .int 201851904 // 0x0c080400
{{L}}i128_shuffle:
    .int 0, 4
{% endif %}

{{L}}add_row_col_products:
    mov         rax, [ rdi + 8 ]
    mov         rbx, [ rdi + 16 ]

    vmovups     ymm12, [rax]

{% for i in (0..7) %}
    vbroadcastss    ymm14, dword ptr [rbx + {{i|times:4}} ]
    vpmulld         ymm15, ymm12, ymm14
    vpaddd          ymm{{i}}, ymm{{i}}, ymm15
{% endfor %}

    jmp         {{L}}non_linear_loop

{{L}}q_scale:
    mov         r8, [ rdi + 16 ]            // policy
    vbroadcastss    ymm8, dword ptr [rdi + 24]  // multi

    mov         rax, 1
    movq        xmm9, rax
    vpbroadcastq    ymm9, xmm9              // ymm9 <- 1

    mov         rax, [ rdi + 8 ]            // xmm10 <- shift + 31
    add         rax, 31
    movq        xmm10, rax
    vpbroadcastq    ymm10, xmm10

    mov         rax, 1
    movq        xmm11, rax
    vpsubq      ymm12, ymm10, ymm9          // shift + 31 - 1
    vpsllq      ymm11, ymm9, xmm12          // ymm11 <- 1 << (shift + 31 - 1)

    cmp         r8, 1
    je          {{L}}q_scale_rounding_zero
    cmp         r8, 2
    je          {{L}}q_scale_rounding_away
    cmp         r8, 3
    je          {{L}}q_scale_rounding_minus_inf
    cmp         r8, 4
    je          {{L}}q_scale_rounding_plus_inf
    cmp         r8, 5
    je          {{L}}q_scale_rounding_even
    cmp         r8, 6
    je          {{L}}q_scale_rounding_odd

    jmp         {{L}}unsupported

{{L}}q_scale_rounding_zero:
    // signum * ( (abs + nudge) >> shift )
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
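    // vpmuldq below multiplies only the even (low) 32-bit lane of each 64-bit element,
    // producing full i64 products; hence the even/odd split: ymm14 carries lanes
    // 0, 2, 4, 6 and ymm15 (the copy shifted right by one lane) carries lanes 1, 3, 5, 7.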
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11
    vpsubq      ymm14, ymm14, ymm9
    vpsubq      ymm15, ymm15, ymm9

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_scale_rounding_away:
    // signum * ( (abs + nudge) >> shift )
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_scale_rounding_minus_inf:
    // signum * ( (abs << 32 + 1<<30+shift) >> shift )
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}

    // sign extract for nudging in the right direction
    vpxor       ymm13, ymm13, ymm13
    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, .., s7 (signums, as all ones or all zeros)
    vpsrld      ymm13, ymm13, 31            // then just 0 or 1

    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11

    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
    vpxor       ymm12, ymm12, ymm12
    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
    vpsubq      ymm14, ymm14, ymm12

    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
    vpxor       ymm12, ymm12, ymm12
    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
    vpsubq      ymm15, ymm15, ymm12

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_scale_rounding_plus_inf:
    // signum * ( (abs << 32 + 1<<30+shift) >> shift )
    vpbroadcastd    ymm9, xmm9
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}

    // sign extract for nudging in the right direction
    vpxor       ymm13, ymm13, ymm13
    vpcmpgtd    ymm13, ymm{{i}}, ymm13      // ymm13 <- s0, s1, .., s7 (signums, as all ones or all zeros)
    vpaddd      ymm13, ymm13, ymm9          // if val >= 0 { 0i32 } else { 1i32 }

    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11

    // reinterpret ymm13=s0i32..s7 as i64 and blend with zero to pick the even ones as i64
    vpxor       ymm12, ymm12, ymm12
    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
    vpsubq      ymm14, ymm14, ymm12

    vpsrldq     ymm13, ymm13, 4             // ymm13 <- s1, s2, .., s7, 0
    vpxor       ymm12, ymm12, ymm12
    vpblendd    ymm12, ymm12, ymm13, 85     // 0x55
    vpsubq      ymm15, ymm15, ymm12

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_scale_rounding_even:
    // signum * ( (abs + nudge) >> shift )
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c
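    // tie-breaking toward even (reading of the nudge below): nudge = (LSB of the
    // truncated result) - 1, so after adding "half" and shifting, exact .5 ties go to
    // the even neighbour while every other value rounds to nearest as usual.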
    vpsrlq      ymm12, ymm14, xmm10
    vpand       ymm12, ymm12, ymm9
    vpaddq      ymm14, ymm14, ymm12
    vpsubq      ymm14, ymm14, ymm9

    vpsrlq      ymm12, ymm15, xmm10
    vpand       ymm12, ymm12, ymm9
    vpaddq      ymm15, ymm15, ymm12
    vpsubq      ymm15, ymm15, ymm9

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_scale_rounding_odd:
    // signum * ( (abs + nudge) >> shift )
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsrldq     ymm15, ymm14, 4             // ymm15 <- a1, a2, a3, a4, a5, a6, a7, 0
    vpmuldq     ymm14, ymm14, ymm8          // ymm14 <- a0*c, a2*c, a4*c, a6*c
    vpmuldq     ymm15, ymm15, ymm8          // ymm15 <- a1*c, a3*c, a5*c, a7*c

    vpsrlq      ymm12, ymm14, xmm10
    vpand       ymm12, ymm12, ymm9
    vpsubq      ymm14, ymm14, ymm12

    vpsrlq      ymm12, ymm15, xmm10
    vpand       ymm12, ymm12, ymm9
    vpsubq      ymm15, ymm15, ymm12

    vpaddq      ymm14, ymm14, ymm11
    vpaddq      ymm15, ymm15, ymm11

    vpsrlq      ymm14, ymm14, xmm10
    vpsrlq      ymm15, ymm15, xmm10

    vpslldq     ymm15, ymm15, 4
    vpblendd    ymm14, ymm15, ymm14, 85     // 0x55
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shl:
    mov         eax, [ rdi + 8 ]            // xmm10 <- -shift (8 times)
    movd        xmm10, eax
    vpbroadcastd    ymm10, xmm10
{% for i in (0..7) %}
    vpsllvd     ymm{{i}}, ymm{{i}}, ymm10
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr:
    mov         r8, [ rdi + 16 ]            // policy

    mov         eax, 1
    movd        xmm9, eax
    vpbroadcastd    ymm9, xmm9              // ymm9 <- 1u32 (8 times)

    mov         eax, [ rdi + 8 ]            // xmm10 <- shift (8 times)
    movd        xmm10, eax
    vpbroadcastd    ymm10, xmm10

    mov         ebx, 1
    mov         cl, al
    sub         cl, 1                       // rcx <- shift - 1
    sal         ebx, cl                     // rbx <- (1 << (shift - 1))
    movd        xmm11, ebx
    vpbroadcastd    ymm11, xmm11            // ymm11 <- "half"

    vpxor       ymm12, ymm12, ymm12         // ymm12 <- zeroes

    cmp         r8, 1
    je          {{L}}q_shr_rounding_zero
    cmp         r8, 2
    je          {{L}}q_shr_rounding_away
    cmp         r8, 3
    je          {{L}}q_shr_rounding_minus_inf
    cmp         r8, 4
    je          {{L}}q_shr_rounding_plus_inf
    cmp         r8, 5
    je          {{L}}q_shr_rounding_even
    cmp         r8, 6
    je          {{L}}q_shr_rounding_odd

    jmp         {{L}}unsupported

{{L}}q_shr_rounding_zero:
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsubd      ymm14, ymm14, ymm9
    vpaddd      ymm14, ymm14, ymm11
    vpsravd     ymm14, ymm14, ymm10
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr_rounding_away:
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpaddd      ymm14, ymm14, ymm11
    vpsravd     ymm14, ymm14, ymm10
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr_rounding_minus_inf:
{% for i in (0..7) %}
    vpsubd      ymm{{i}}, ymm{{i}}, ymm9
    vpaddd      ymm{{i}}, ymm{{i}}, ymm11
    vpsravd     ymm{{i}}, ymm{{i}}, ymm10
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr_rounding_plus_inf:
{% for i in (0..7) %}
    vpaddd      ymm{{i}}, ymm{{i}}, ymm11
    vpsravd     ymm{{i}}, ymm{{i}}, ymm10
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr_rounding_even:
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsravd     ymm13, ymm14, ymm10
    vpand       ymm13, ymm13, ymm9
    vpsubd      ymm13, ymm13, ymm9          // nudge = ((abs >>l shift) & 0x01) - 1
    vpaddd      ymm14, ymm14, ymm13         // add nudge
    vpaddd      ymm14, ymm14, ymm11         // add half
    vpsravd     ymm14, ymm14, ymm10
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop

{{L}}q_shr_rounding_odd:
{% for i in (0..7) %}
    vpabsd      ymm14, ymm{{i}}
    vpsravd     ymm13, ymm14, ymm10
    vpand       ymm13, ymm13, ymm9
    vpsubd      ymm13, ymm12, ymm13         // nudge = - ((abs >>l shift) & 0x01)
    vpaddd      ymm14, ymm14, ymm13         // add nudge
    vpaddd      ymm14, ymm14, ymm11         // add half
    vpsravd     ymm14, ymm14, ymm10
    vpsignd     ymm{{i}}, ymm14, ymm{{i}}
{% endfor %}
    jmp         {{L}}non_linear_loop
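// store the 8x8 i32 tile to C. item size 4 jumps to the i32 path
// ({{L}}store_strides_i32); any other size falls through to the byte-wise store
// below, which keeps only the low byte of each lane.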
{{L}}store:
    mov         r8, [rdi + 8]               // c ptr
    mov         rsi, [rdi + 16]             // row stride
    mov         rdx, [rdi + 24]             // col stride
    mov         rcx, [rdi + 32]             // item size

    cmp         rcx, 4
    je          {{L}}store_strides_i32

{% for col in (0..7) %}
    mov         r10, r8
{% for row in (0..3) %}
    extractps   ebx, xmm{{col}}, {{row}}
    mov         byte ptr [r10], bl
    add         r10, rsi
{% endfor %}
    vperm2f128  ymm{{col}}, ymm{{col}}, ymm{{col}}, 1
{% for row in (0..3) %}
    extractps   ebx, xmm{{col}}, {{row}}
    mov         byte ptr [r10], bl
    add         r10, rsi
{% endfor %}
    add         r8, rdx
{% endfor %}

    jmp         {{L}}non_linear_loop

{{L}}store_strides_i32:
{% for col in (0..7) %}
    mov         r10, r8
{% for row in (0..3) %}
    extractps   ebx, xmm{{col}}, {{row}}
    mov         dword ptr [r10], ebx
    add         r10, rsi
{% endfor %}
    vperm2f128  ymm{{col}}, ymm{{col}}, ymm{{col}}, 1
{% for row in (0..3) %}
    extractps   ebx, xmm{{col}}, {{row}}
    mov         dword ptr [r10], ebx
    add         r10, rsi
{% endfor %}
    add         r8, rdx
{% endfor %}

    jmp         {{L}}non_linear_loop

{{L}}return:
    ldmxcsr     [rsp + 4]

    add         rsp, 8

    pop         r15
    pop         r14
    pop         r13
    pop         r12
    pop         rbx

{% if family == "windows" %}
    pop         rsi
    pop         rdi

    vmovaps     xmm15, [rsp + 16*9]
    vmovaps     xmm14, [rsp + 16*8]
    vmovaps     xmm13, [rsp + 16*7]
    vmovaps     xmm12, [rsp + 16*6]
    vmovaps     xmm11, [rsp + 16*5]
    vmovaps     xmm10, [rsp + 16*4]
    vmovaps     xmm9, [rsp + 16*3]
    vmovaps     xmm8, [rsp + 16*2]
    vmovaps     xmm7, [rsp + 16*1]
    vmovaps     xmm6, [rsp]
{% endif %}

    mov         rsp, rbp
    pop         rbp
    ret

{{L}}one_32bit:
{% if msvc %}
    dd 1
{% else %}
    .int 1
{% endif %}

{% if msvc %}
avx2_mmm_i32_8x8_{{suffix}} endp
_text ends
end
{% else %}
.cfi_endproc
{% endif %}