/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020      Evan Nemerson
 *   2020      Christopher Moore
 */

#if !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H)
#define SIMDE_X86_AVX512_PERMUTEXVAR_H

#include "types.h"
#include "and.h"
#include "andnot.h"
#include "blend.h"
#include "mov.h"
#include "or.h"
#include "set1.h"
#include "slli.h"
#include "srli.h"
#include "test.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

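/* Where no native word-granularity permute exists, the fallbacks below
 * emulate one with a byte shuffle (SSSE3 pshufb, NEON tbl, AltiVec
 * vec_perm, WASM swizzle).  Each 16-bit lane index e is expanded to the
 * byte-index pair (2e, 2e+1) by computing e * 0x0202 + 0x0100.  For
 * example, e = 5 gives 0x0A0A + 0x0100 = 0x0B0A, i.e. bytes 10 and 11,
 * which are exactly the two bytes of lane 5. */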
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_permutexvar_epi16 (simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_permutexvar_epi16(idx, a);
  #elif defined(SIMDE_X86_SSSE3_NATIVE)
    simde__m128i mask16 = simde_mm_set1_epi16(0x0007);
    simde__m128i shift16 = simde_mm_set1_epi16(0x0202);
    simde__m128i byte_index16 = simde_mm_set1_epi16(0x0100);
    simde__m128i index16 = simde_mm_and_si128(idx, mask16);
    index16 = simde_mm_mullo_epi16(index16, shift16);
    index16 = simde_mm_add_epi16(index16, byte_index16);
    return simde_mm_shuffle_epi8(a, index16);
  #else
    simde__m128i_private
      idx_ = simde__m128i_to_private(idx),
      a_ = simde__m128i_to_private(a),
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint16x8_t mask16 = vdupq_n_u16(0x0007);
      uint16x8_t byte_index16 = vdupq_n_u16(0x0100);
      uint16x8_t index16 = vandq_u16(idx_.neon_u16, mask16);
      index16 = vmulq_n_u16(index16, 0x0202);
      index16 = vaddq_u16(index16, byte_index16);
      r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, vreinterpretq_u8_u16(index16));
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16;
      index16 = vec_and(idx_.altivec_u16, vec_splat_u16(7));
      index16 = vec_mladd(index16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)), vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100)));
      r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16));
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      const v128_t mask16 = wasm_i16x8_splat(0x0007);
      const v128_t shift16 = wasm_i16x8_splat(0x0202);
      const v128_t byte_index16 = wasm_i16x8_splat(0x0100);
      v128_t index16 = wasm_v128_and(idx_.wasm_v128, mask16);
      index16 = wasm_i16x8_mul(index16, shift16);
      index16 = wasm_i16x8_add(index16, byte_index16);
      r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index16);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[idx_.i16[i] & 0x07];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_permutexvar_epi16
  #define _mm_permutexvar_epi16(idx, a) simde_mm_permutexvar_epi16(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_permutexvar_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_mask_permutexvar_epi16(src, k, idx, a);
  #else
    return simde_mm_mask_mov_epi16(src, k, simde_mm_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_permutexvar_epi16
  #define _mm_mask_permutexvar_epi16(src, k, idx, a) simde_mm_mask_permutexvar_epi16(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_permutexvar_epi16 (simde__mmask8 k, simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_maskz_permutexvar_epi16(k, idx, a);
  #else
    return simde_mm_maskz_mov_epi16(k, simde_mm_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_permutexvar_epi16
  #define _mm_maskz_permutexvar_epi16(k, idx, a) simde_mm_maskz_permutexvar_epi16(k, idx, a)
#endif

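/* Byte-granularity permutes are simpler: pshufb/tbl/swizzle already
 * index individual bytes, so the index only has to be masked down to
 * the number of lanes (16 here). */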
SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_permutexvar_epi8 (simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_permutexvar_epi8(idx, a);
  #elif defined(SIMDE_X86_SSSE3_NATIVE)
    simde__m128i mask = simde_mm_set1_epi8(0x0F);
    simde__m128i index = simde_mm_and_si128(idx, mask);
    return simde_mm_shuffle_epi8(a, index);
  #else
    simde__m128i_private
      idx_ = simde__m128i_to_private(idx),
      a_ = simde__m128i_to_private(a),
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16_t mask = vdupq_n_u8(0x0F);
      uint8x16_t index = vandq_u8(idx_.neon_u8, mask);
      r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, index);
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, idx_.altivec_u8);
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      const v128_t mask = wasm_i8x16_splat(0x0F);
      v128_t index = wasm_v128_and(idx_.wasm_v128, mask);
      r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index);
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[idx_.i8[i] & 0x0F];
      }
    #endif

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_permutexvar_epi8
  #define _mm_permutexvar_epi8(idx, a) simde_mm_permutexvar_epi8(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_permutexvar_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_mask_permutexvar_epi8(src, k, idx, a);
  #else
    return simde_mm_mask_mov_epi8(src, k, simde_mm_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_permutexvar_epi8
  #define _mm_mask_permutexvar_epi8(src, k, idx, a) simde_mm_mask_permutexvar_epi8(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_permutexvar_epi8 (simde__mmask16 k, simde__m128i idx, simde__m128i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_maskz_permutexvar_epi8(k, idx, a);
  #else
    return simde_mm_maskz_mov_epi8(k, simde_mm_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_permutexvar_epi8
  #define _mm_maskz_permutexvar_epi8(k, idx, a) simde_mm_maskz_permutexvar_epi8(k, idx, a)
#endif

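/* AVX2 has no lane-crossing byte shuffle, so the fallback duplicates
 * each 128-bit half of a into both lanes ("lo" and "hi") and shuffles
 * both copies with the same byte indices.  Bit 4 of each index byte
 * records which half the element lives in; shifting it up to the
 * byte's sign bit lets _mm256_blendv_epi8 pick the right copy. */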
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi16 (simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_epi16(idx, a);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i mask16 = simde_mm256_set1_epi16(0x001F);
    simde__m256i shift16 = simde_mm256_set1_epi16(0x0202);
    simde__m256i byte_index16 = simde_mm256_set1_epi16(0x0100);
    simde__m256i index16 = simde_mm256_and_si256(idx, mask16);
    index16 = simde_mm256_mullo_epi16(index16, shift16);
    simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
    simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
    simde__m256i select = simde_mm256_slli_epi64(index16, 3);
    index16 = simde_mm256_add_epi16(index16, byte_index16);
    lo = simde_mm256_shuffle_epi8(lo, index16);
    hi = simde_mm256_shuffle_epi8(hi, index16);
    return simde_mm256_blendv_epi8(lo, hi, select);
  #else
    simde__m256i_private
      idx_ = simde__m256i_to_private(idx),
      a_ = simde__m256i_to_private(a),
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, a_.m128i_private[1].neon_u8 } };
      uint16x8_t mask16 = vdupq_n_u16(0x000F);
      uint16x8_t byte_index16 = vdupq_n_u16(0x0100);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16);
        index16 = vmulq_n_u16(index16, 0x0202);
        index16 = vaddq_u16(index16, byte_index16);
        r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u16(index16));
      }
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16;

      mask16 = vec_splat_u16(0x000F);
      shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202));
      byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100));

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16);
        index16 = vec_mladd(index16, shift16, byte_index16);
        r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16));
      }
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t index, index16, r, t;
      const v128_t mask16 = wasm_i16x8_splat(0x000F);
      const v128_t shift16 = wasm_i16x8_splat(0x0202);
      const v128_t byte_index16 = wasm_i16x8_splat(0x0100);
      const v128_t sixteen = wasm_i8x16_splat(16);
      const v128_t a0 = a_.m128i_private[0].wasm_v128;
      const v128_t a1 = a_.m128i_private[1].wasm_v128;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index16 = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask16);
        index16 = wasm_i16x8_mul(index16, shift16);
        index = wasm_i16x8_add(index16, byte_index16);
        r = wasm_i8x16_swizzle(a0, index);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a1, index);
        r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
      }
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[idx_.i16[i] & 0x0F];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_epi16
  #define _mm256_permutexvar_epi16(idx, a) simde_mm256_permutexvar_epi16(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_epi16(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_epi16(src, k, simde_mm256_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_epi16
  #define _mm256_mask_permutexvar_epi16(src, k, idx, a) simde_mm256_mask_permutexvar_epi16(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi16 (simde__mmask16 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_epi16(k, idx, a);
  #else
    return simde_mm256_maskz_mov_epi16(k, simde_mm256_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_epi16
  #define _mm256_maskz_permutexvar_epi16(k, idx, a) simde_mm256_maskz_permutexvar_epi16(k, idx, a)
#endif

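/* 32-bit lane permutes map directly onto AVX2's lane-crossing
 * _mm256_permutevar8x32_epi32.  On AArch64 the two halves of a form a
 * two-register table for vqtbl2q_u8, with byte indices built as
 * e * 0x04040404 + 0x03020100.  (In the plain loops below,
 * SIMDE_VECTORIZE is suppressed for ICC; this appears to be a
 * workaround for an ICC vectorizer problem with this loop shape.) */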
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi32 (simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_epi32(idx, a);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    return simde_mm256_permutevar8x32_epi32(a, idx);
  #else
    simde__m256i_private
      idx_ = simde__m256i_to_private(idx),
      a_ = simde__m256i_to_private(a),
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, a_.m128i_private[1].neon_u8 } };
      uint32x4_t mask32 = vdupq_n_u32(0x00000007);
      uint32x4_t byte_index32 = vdupq_n_u32(0x03020100);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32);
        index32 = vmulq_n_u32(index32, 0x04040404);
        index32 = vaddq_u32(index32, byte_index32);
        r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u32(index32));
      }
    #else
      #if !defined(__INTEL_COMPILER)
        SIMDE_VECTORIZE
      #endif
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[idx_.i32[i] & 0x07];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_epi32
  #define _mm256_permutexvar_epi32(idx, a) simde_mm256_permutexvar_epi32(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_epi32(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_epi32(src, k, simde_mm256_permutexvar_epi32(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_epi32
  #define _mm256_mask_permutexvar_epi32(src, k, idx, a) simde_mm256_mask_permutexvar_epi32(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi32 (simde__mmask8 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_epi32(k, idx, a);
  #else
    return simde_mm256_maskz_mov_epi32(k, simde_mm256_permutexvar_epi32(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_epi32
  #define _mm256_maskz_permutexvar_epi32(k, idx, a) simde_mm256_maskz_permutexvar_epi32(k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi64 (simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_epi64(idx, a);
  #else
    simde__m256i_private
      idx_ = simde__m256i_to_private(idx),
      a_ = simde__m256i_to_private(a),
      r_;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[idx_.i64[i] & 3];
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_epi64
  #define _mm256_permutexvar_epi64(idx, a) simde_mm256_permutexvar_epi64(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_epi64(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_epi64(src, k, simde_mm256_permutexvar_epi64(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_epi64
  #define _mm256_mask_permutexvar_epi64(src, k, idx, a) simde_mm256_mask_permutexvar_epi64(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_epi64(k, idx, a);
  #else
    return simde_mm256_maskz_mov_epi64(k, simde_mm256_permutexvar_epi64(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_epi64
  #define _mm256_maskz_permutexvar_epi64(k, idx, a) simde_mm256_maskz_permutexvar_epi64(k, idx, a)
#endif

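/* Same half-duplication trick as simde_mm256_permutexvar_epi16 above,
 * but bytes need no index scaling: idx bit 4, shifted up to the sign
 * bit, selects between the shuffled low and high halves. */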
SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_permutexvar_epi8 (simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_epi8(idx, a);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i mask = simde_mm256_set1_epi8(0x0F);
    simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0));
    simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0));
    simde__m256i index = simde_mm256_and_si256(idx, mask);
    simde__m256i select = simde_mm256_slli_epi64(idx, 3);
    lo = simde_mm256_shuffle_epi8(lo, index);
    hi = simde_mm256_shuffle_epi8(hi, index);
    return simde_mm256_blendv_epi8(lo, hi, select);
  #else
    simde__m256i_private
      idx_ = simde__m256i_to_private(idx),
      a_ = simde__m256i_to_private(a),
      r_;

    #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, a_.m128i_private[1].neon_u8 } };
      uint8x16_t mask = vdupq_n_u8(0x1F);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask));
      }
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8);
      }
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t index, r, t;
      const v128_t mask = wasm_i8x16_splat(0x1F);
      const v128_t sixteen = wasm_i8x16_splat(16);
      const v128_t a0 = a_.m128i_private[0].wasm_v128;
      const v128_t a1 = a_.m128i_private[1].wasm_v128;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
        r = wasm_i8x16_swizzle(a0, index);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a1, index);
        r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
      }
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[idx_.i8[i] & 0x1F];
      }
    #endif

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_epi8
  #define _mm256_permutexvar_epi8(idx, a) simde_mm256_permutexvar_epi8(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_permutexvar_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_epi8(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_epi8(src, k, simde_mm256_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_epi8
  #define _mm256_mask_permutexvar_epi8(src, k, idx, a) simde_mm256_mask_permutexvar_epi8(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_permutexvar_epi8 (simde__mmask32 k, simde__m256i idx, simde__m256i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_epi8(k, idx, a);
  #else
    return simde_mm256_maskz_mov_epi8(k, simde_mm256_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_epi8
  #define _mm256_maskz_permutexvar_epi8(k, idx, a) simde_mm256_maskz_permutexvar_epi8(k, idx, a)
#endif

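/* The floating-point variants reuse the integer permutes through
 * bit-preserving casts: only the lane arrangement matters, not how the
 * bits are interpreted. */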
SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_permutexvar_pd (simde__m256i idx, simde__m256d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_pd(idx, a);
  #else
    return simde_mm256_castsi256_pd(simde_mm256_permutexvar_epi64(idx, simde_mm256_castpd_si256(a)));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_pd
  #define _mm256_permutexvar_pd(idx, a) simde_mm256_permutexvar_pd(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_mask_permutexvar_pd (simde__m256d src, simde__mmask8 k, simde__m256i idx, simde__m256d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_pd(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_pd(src, k, simde_mm256_permutexvar_pd(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_pd
  #define _mm256_mask_permutexvar_pd(src, k, idx, a) simde_mm256_mask_permutexvar_pd(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256d
simde_mm256_maskz_permutexvar_pd (simde__mmask8 k, simde__m256i idx, simde__m256d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_pd(k, idx, a);
  #else
    return simde_mm256_maskz_mov_pd(k, simde_mm256_permutexvar_pd(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_pd
  #define _mm256_maskz_permutexvar_pd(k, idx, a) simde_mm256_maskz_permutexvar_pd(k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_permutexvar_ps (simde__m256i idx, simde__m256 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_permutexvar_ps(idx, a);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    return simde_mm256_permutevar8x32_ps(a, idx);
  #else
    return simde_mm256_castsi256_ps(simde_mm256_permutexvar_epi32(idx, simde_mm256_castps_si256(a)));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_permutexvar_ps
  #define _mm256_permutexvar_ps(idx, a) simde_mm256_permutexvar_ps(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_mask_permutexvar_ps (simde__m256 src, simde__mmask8 k, simde__m256i idx, simde__m256 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_permutexvar_ps(src, k, idx, a);
  #else
    return simde_mm256_mask_mov_ps(src, k, simde_mm256_permutexvar_ps(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_permutexvar_ps
  #define _mm256_mask_permutexvar_ps(src, k, idx, a) simde_mm256_mask_permutexvar_ps(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256
simde_mm256_maskz_permutexvar_ps (simde__mmask8 k, simde__m256i idx, simde__m256 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_permutexvar_ps(k, idx, a);
  #else
    return simde_mm256_maskz_mov_ps(k, simde_mm256_permutexvar_ps(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_permutexvar_ps
  #define _mm256_maskz_permutexvar_ps(k, idx, a) simde_mm256_maskz_permutexvar_ps(k, idx, a)
#endif

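/* For 512-bit word permutes on AVX2, each 128-bit quarter of a is
 * broadcast into both lanes of its own 256-bit register, all four are
 * shuffled with the same byte indices, and the results are combined in
 * two blendv stages: index-byte bit 4 selects within a pair (a0/a1,
 * a2/a3) and bit 5 selects between the pairs, each shifted up to the
 * sign bit. */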
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi16 (simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    return _mm512_permutexvar_epi16(idx, a);
  #else
    simde__m512i_private
      idx_ = simde__m512i_to_private(idx),
      a_ = simde__m512i_to_private(a),
      r_;

    #if defined(SIMDE_X86_AVX2_NATIVE)
      simde__m256i t0, t1, index, select, a01, a23;
      simde__m256i mask = simde_mm256_set1_epi16(0x001F);
      simde__m256i shift = simde_mm256_set1_epi16(0x0202);
      simde__m256i byte_index = simde_mm256_set1_epi16(0x0100);
      simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]);
      simde__m256i a1 = simde_mm256_broadcastsi128_si256(a_.m128i[1]);
      simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]);
      simde__m256i a3 = simde_mm256_broadcastsi128_si256(a_.m128i[3]);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
        index = idx_.m256i[i];
        index = simde_mm256_and_si256(index, mask);
        index = simde_mm256_mullo_epi16(index, shift);
        index = simde_mm256_add_epi16(index, byte_index);
        t0 = simde_mm256_shuffle_epi8(a0, index);
        t1 = simde_mm256_shuffle_epi8(a1, index);
        select = simde_mm256_slli_epi64(index, 3);
        a01 = simde_mm256_blendv_epi8(t0, t1, select);
        t0 = simde_mm256_shuffle_epi8(a2, index);
        t1 = simde_mm256_shuffle_epi8(a3, index);
        a23 = simde_mm256_blendv_epi8(t0, t1, select);
        select = simde_mm256_slli_epi64(index, 2);
        r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select);
      }
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x4_t table = { {
        a_.m128i_private[0].neon_u8,
        a_.m128i_private[1].neon_u8,
        a_.m128i_private[2].neon_u8,
        a_.m128i_private[3].neon_u8 } };
      uint16x8_t mask16 = vdupq_n_u16(0x001F);
      uint16x8_t byte_index16 = vdupq_n_u16(0x0100);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16);
        index16 = vmulq_n_u16(index16, 0x0202);
        index16 = vaddq_u16(index16, byte_index16);
        r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u16(index16));
      }
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16;
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23;

      mask16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x001F));
      shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202));
      byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100));
      test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16);
        index16 = vec_mladd(index16, shift16, byte_index16);
        index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16);
        r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index);
        r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index);
        r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test));
      }
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t index, r, t;
      const v128_t mask = wasm_i16x8_splat(0x001F);
      const v128_t shift = wasm_i16x8_splat(0x0202);
      const v128_t byte_index = wasm_i16x8_splat(0x0100);
      const v128_t sixteen = wasm_i8x16_splat(16);
      const v128_t a0 = a_.m128i_private[0].wasm_v128;
      const v128_t a1 = a_.m128i_private[1].wasm_v128;
      const v128_t a2 = a_.m128i_private[2].wasm_v128;
      const v128_t a3 = a_.m128i_private[3].wasm_v128;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
        index = wasm_i16x8_mul(index, shift);
        index = wasm_i16x8_add(index, byte_index);
        r = wasm_i8x16_swizzle(a0, index);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a1, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a2, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a3, index);
        r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
      }
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) {
        r_.i16[i] = a_.i16[idx_.i16[i] & 0x1F];
      }
    #endif

    return simde__m512i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_epi16
  #define _mm512_permutexvar_epi16(idx, a) simde_mm512_permutexvar_epi16(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    return _mm512_mask_permutexvar_epi16(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_epi16(src, k, simde_mm512_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_epi16
  #define _mm512_mask_permutexvar_epi16(src, k, idx, a) simde_mm512_mask_permutexvar_epi16(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi16 (simde__mmask32 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    return _mm512_maskz_permutexvar_epi16(k, idx, a);
  #else
    return simde_mm512_maskz_mov_epi16(k, simde_mm512_permutexvar_epi16(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_epi16
  #define _mm512_maskz_permutexvar_epi16(k, idx, a) simde_mm512_maskz_permutexvar_epi16(k, idx, a)
#endif

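/* 512-bit dword permutes on AVX2 use one _mm256_permutevar8x32_epi32
 * per 256-bit half of a; idx bit 3, shifted up to bit 31, drives
 * blendv_ps to choose between the two results. */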
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi32 (simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_permutexvar_epi32(idx, a);
  #else
    simde__m512i_private
      idx_ = simde__m512i_to_private(idx),
      a_ = simde__m512i_to_private(a),
      r_;

    #if defined(SIMDE_X86_AVX2_NATIVE)
      simde__m256i index, r0, r1, select;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
        index = idx_.m256i[i];
        r0 = simde_mm256_permutevar8x32_epi32(a_.m256i[0], index);
        r1 = simde_mm256_permutevar8x32_epi32(a_.m256i[1], index);
        select = simde_mm256_slli_epi32(index, 28);
        r_.m256i[i] = simde_mm256_castps_si256(simde_mm256_blendv_ps(simde_mm256_castsi256_ps(r0), simde_mm256_castsi256_ps(r1), simde_mm256_castsi256_ps(select)));
      }
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x4_t table = { {
        a_.m128i_private[0].neon_u8,
        a_.m128i_private[1].neon_u8,
        a_.m128i_private[2].neon_u8,
        a_.m128i_private[3].neon_u8 } };
      uint32x4_t mask32 = vdupq_n_u32(0x0000000F);
      uint32x4_t byte_index32 = vdupq_n_u32(0x03020100);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32);
        index32 = vmulq_n_u32(index32, 0x04040404);
        index32 = vaddq_u32(index32, byte_index32);
        r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u32(index32));
      }
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) index32, mask32, byte_index32, temp32, sixteen;
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) zero, shift;
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23;

      mask32 = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x0000000F));
      byte_index32 = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x03020100));
      zero = vec_splat_u16(0);
      shift = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0404));
      sixteen = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 16));
      test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index32 = vec_and(idx_.m128i_private[i].altivec_u32, mask32);

        /* Multiply index32 by 0x04040404; unfortunately vec_mul isn't available so (mis)use 16-bit vec_mladd */
        temp32 = vec_sl(index32, sixteen);
        index32 = vec_add(index32, temp32);
        index32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_mladd(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), index32), shift, zero));

        index32 = vec_add(index32, byte_index32);
        index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index32);
        r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index);
        r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index);
        r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test));
      }
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t index, r, t;
      const v128_t mask = wasm_i32x4_splat(0x0000000F);
      const v128_t shift = wasm_i32x4_splat(0x04040404);
      const v128_t byte_index = wasm_i32x4_splat(0x03020100);
      const v128_t sixteen = wasm_i8x16_splat(16);
      const v128_t a0 = a_.m128i_private[0].wasm_v128;
      const v128_t a1 = a_.m128i_private[1].wasm_v128;
      const v128_t a2 = a_.m128i_private[2].wasm_v128;
      const v128_t a3 = a_.m128i_private[3].wasm_v128;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
        index = wasm_i32x4_mul(index, shift);
        index = wasm_i32x4_add(index, byte_index);
        r = wasm_i8x16_swizzle(a0, index);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a1, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a2, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a3, index);
        r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
      }
    #else
      #if !defined(__INTEL_COMPILER)
        SIMDE_VECTORIZE
      #endif
      for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) {
        r_.i32[i] = a_.i32[idx_.i32[i] & 0x0F];
      }
    #endif

    return simde__m512i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_epi32
  #define _mm512_permutexvar_epi32(idx, a) simde_mm512_permutexvar_epi32(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_mask_permutexvar_epi32(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_epi32(src, k, simde_mm512_permutexvar_epi32(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_epi32
  #define _mm512_mask_permutexvar_epi32(src, k, idx, a) simde_mm512_mask_permutexvar_epi32(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi32 (simde__mmask16 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_maskz_permutexvar_epi32(k, idx, a);
  #else
    return simde_mm512_maskz_mov_epi32(k, simde_mm512_permutexvar_epi32(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_epi32
  #define _mm512_maskz_permutexvar_epi32(k, idx, a) simde_mm512_maskz_permutexvar_epi32(k, idx, a)
#endif

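/* 64-bit lane permutes have no shuffle fast path here: a scalar
 * indexed copy per lane is cheap enough on all targets. */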
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi64 (simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_permutexvar_epi64(idx, a);
  #else
    simde__m512i_private
      idx_ = simde__m512i_to_private(idx),
      a_ = simde__m512i_to_private(a),
      r_;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) {
      r_.i64[i] = a_.i64[idx_.i64[i] & 7];
    }

    return simde__m512i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_epi64
  #define _mm512_permutexvar_epi64(idx, a) simde_mm512_permutexvar_epi64(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_mask_permutexvar_epi64(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_epi64(src, k, simde_mm512_permutexvar_epi64(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_epi64
  #define _mm512_mask_permutexvar_epi64(src, k, idx, a) simde_mm512_mask_permutexvar_epi64(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_maskz_permutexvar_epi64(k, idx, a);
  #else
    return simde_mm512_maskz_mov_epi64(k, simde_mm512_permutexvar_epi64(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_epi64
  #define _mm512_maskz_permutexvar_epi64(k, idx, a) simde_mm512_maskz_permutexvar_epi64(k, idx, a)
#endif

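/* Without VBMI, a 64-lane byte permute is synthesized from the 32-lane
 * word permute: each byte index e = 2w + b splits into a word index
 * w = e >> 1 and a byte-within-word bit b.  Two 16-bit permutes gather
 * the words addressed by the even and odd idx bytes (idx >> 1 and
 * idx >> 9 within each 16-bit lane), their low/high bytes are repacked
 * into "lo"/"hi", and b (tested via idx & 1) picks between them. */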
SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_permutexvar_epi8 (simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE)
    return _mm512_permutexvar_epi8(idx, a);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__m512i hilo, hi, lo, hi2, lo2, idx2;
    simde__m512i ones = simde_mm512_set1_epi8(1);
    simde__m512i low_bytes = simde_mm512_set1_epi16(0x00FF);

    idx2 = simde_mm512_srli_epi16(idx, 1);
    hilo = simde_mm512_permutexvar_epi16(idx2, a);
    simde__mmask64 mask = simde_mm512_test_epi8_mask(idx, ones);
    lo = simde_mm512_and_si512(hilo, low_bytes);
    hi = simde_mm512_srli_epi16(hilo, 8);

    idx2 = simde_mm512_srli_epi16(idx, 9);
    hilo = simde_mm512_permutexvar_epi16(idx2, a);
    lo2 = simde_mm512_slli_epi16(hilo, 8);
    hi2 = simde_mm512_andnot_si512(low_bytes, hilo);

    lo = simde_mm512_or_si512(lo, lo2);
    hi = simde_mm512_or_si512(hi, hi2);

    return simde_mm512_mask_blend_epi8(mask, lo, hi);
  #else
    simde__m512i_private
      idx_ = simde__m512i_to_private(idx),
      a_ = simde__m512i_to_private(a),
      r_;

    #if defined(SIMDE_X86_AVX2_NATIVE)
      simde__m256i t0, t1, index, select, a01, a23;
      simde__m256i mask = simde_mm256_set1_epi8(0x3F);
      simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]);
      simde__m256i a1 = simde_mm256_broadcastsi128_si256(a_.m128i[1]);
      simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]);
      simde__m256i a3 = simde_mm256_broadcastsi128_si256(a_.m128i[3]);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) {
        index = idx_.m256i[i];
        index = simde_mm256_and_si256(index, mask);
        select = simde_mm256_slli_epi64(index, 3);
        t0 = simde_mm256_shuffle_epi8(a0, index);
        t1 = simde_mm256_shuffle_epi8(a1, index);
        a01 = simde_mm256_blendv_epi8(t0, t1, select);
        t0 = simde_mm256_shuffle_epi8(a2, index);
        t1 = simde_mm256_shuffle_epi8(a3, index);
        a23 = simde_mm256_blendv_epi8(t0, t1, select);
        select = simde_mm256_slli_epi64(index, 2);
        r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select);
      }
    #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16x4_t table = { {
        a_.m128i_private[0].neon_u8,
        a_.m128i_private[1].neon_u8,
        a_.m128i_private[2].neon_u8,
        a_.m128i_private[3].neon_u8 } };
      uint8x16_t mask = vdupq_n_u8(0x3F);

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask));
      }
    #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
      SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) test, r01, r23;
      test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20));

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8);
        r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, idx_.m128i_private[i].altivec_u8);
        r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(idx_.m128i_private[i].altivec_u8, test), test));
      }
    #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      v128_t index, r, t;
      const v128_t mask = wasm_i8x16_splat(0x3F);
      const v128_t sixteen = wasm_i8x16_splat(16);
      const v128_t a0 = a_.m128i_private[0].wasm_v128;
      const v128_t a1 = a_.m128i_private[1].wasm_v128;
      const v128_t a2 = a_.m128i_private[2].wasm_v128;
      const v128_t a3 = a_.m128i_private[3].wasm_v128;

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) {
        index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask);
        r = wasm_i8x16_swizzle(a0, index);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a1, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a2, index);
        r = wasm_v128_or(r, t);

        index = wasm_i8x16_sub(index, sixteen);
        t = wasm_i8x16_swizzle(a3, index);
        r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t);
      }
    #else
      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) {
        r_.i8[i] = a_.i8[idx_.i8[i] & 0x3F];
      }
    #endif

    return simde__m512i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_epi8
  #define _mm512_permutexvar_epi8(idx, a) simde_mm512_permutexvar_epi8(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_permutexvar_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE)
    return _mm512_mask_permutexvar_epi8(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_epi8(src, k, simde_mm512_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_epi8
  #define _mm512_mask_permutexvar_epi8(src, k, idx, a) simde_mm512_mask_permutexvar_epi8(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_permutexvar_epi8 (simde__mmask64 k, simde__m512i idx, simde__m512i a) {
  #if defined(SIMDE_X86_AVX512VBMI_NATIVE)
    return _mm512_maskz_permutexvar_epi8(k, idx, a);
  #else
    return simde_mm512_maskz_mov_epi8(k, simde_mm512_permutexvar_epi8(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_epi8
  #define _mm512_maskz_permutexvar_epi8(k, idx, a) simde_mm512_maskz_permutexvar_epi8(k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_permutexvar_pd (simde__m512i idx, simde__m512d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_permutexvar_pd(idx, a);
  #else
    return simde_mm512_castsi512_pd(simde_mm512_permutexvar_epi64(idx, simde_mm512_castpd_si512(a)));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_pd
  #define _mm512_permutexvar_pd(idx, a) simde_mm512_permutexvar_pd(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_mask_permutexvar_pd (simde__m512d src, simde__mmask8 k, simde__m512i idx, simde__m512d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_mask_permutexvar_pd(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_pd(src, k, simde_mm512_permutexvar_pd(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_pd
  #define _mm512_mask_permutexvar_pd(src, k, idx, a) simde_mm512_mask_permutexvar_pd(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512d
simde_mm512_maskz_permutexvar_pd (simde__mmask8 k, simde__m512i idx, simde__m512d a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_maskz_permutexvar_pd(k, idx, a);
  #else
    return simde_mm512_maskz_mov_pd(k, simde_mm512_permutexvar_pd(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_pd
  #define _mm512_maskz_permutexvar_pd(k, idx, a) simde_mm512_maskz_permutexvar_pd(k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_permutexvar_ps (simde__m512i idx, simde__m512 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_permutexvar_ps(idx, a);
  #else
    return simde_mm512_castsi512_ps(simde_mm512_permutexvar_epi32(idx, simde_mm512_castps_si512(a)));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_permutexvar_ps
  #define _mm512_permutexvar_ps(idx, a) simde_mm512_permutexvar_ps(idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_mask_permutexvar_ps (simde__m512 src, simde__mmask16 k, simde__m512i idx, simde__m512 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_mask_permutexvar_ps(src, k, idx, a);
  #else
    return simde_mm512_mask_mov_ps(src, k, simde_mm512_permutexvar_ps(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_permutexvar_ps
  #define _mm512_mask_permutexvar_ps(src, k, idx, a) simde_mm512_mask_permutexvar_ps(src, k, idx, a)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512
simde_mm512_maskz_permutexvar_ps (simde__mmask16 k, simde__m512i idx, simde__m512 a) {
  #if defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_maskz_permutexvar_ps(k, idx, a);
  #else
    return simde_mm512_maskz_mov_ps(k, simde_mm512_permutexvar_ps(idx, a));
  #endif
}
#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_permutexvar_ps
  #define _mm512_maskz_permutexvar_ps(k, idx, a) simde_mm512_maskz_permutexvar_ps(k, idx, a)
#endif

SIMDE_END_DECLS_
HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H) */