// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit vectors and AVX2 instructions, plus some AVX512-VL operations when
// compiling for that target.
// External include guard in highway.h - see comment there.

// WARNING: most operations do not cross 128-bit block boundaries. In
// particular, "Broadcast", pack and zip behavior may be surprising.

// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_CLANGCL
#include "hwy/base.h"

// Avoid uninitialized warnings in GCC's avx512fintrin.h - see
// https://github.com/google/highway/issues/710
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

// Must come before HWY_COMPILER_CLANGCL
#include <immintrin.h>  // AVX2+

#if HWY_COMPILER_CLANGCL
// Including <immintrin.h> should be enough, but Clang's headers helpfully skip
// including these headers when _MSC_VER is defined, like when using clang-cl.
// Include these directly here.
#include <avxintrin.h>
// avxintrin defines __m256i and must come before avx2intrin.
#include <avx2intrin.h>
#include <bmi2intrin.h>  // _pext_u64
#include <f16cintrin.h>
#include <fmaintrin.h>
#include <smmintrin.h>
#endif  // HWY_COMPILER_CLANGCL

// For half-width vectors. Already includes base.h.
#include "hwy/ops/shared-inl.h"
// Already included by shared-inl, but do it again to avoid IDE warnings.
#include "hwy/ops/x86_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// NOTE(review): the template parameter/argument lists in this region were
// lost in extraction and are reconstructed from the surviving tokens; confirm
// against the pristine file.

// Maps a lane type to the raw intrinsic register type. Integer lanes (the
// primary template) use __m256i; floating-point lanes are specialized below.
template <typename T>
struct Raw256 {
  using type = __m256i;
};
#if HWY_HAVE_FLOAT16
template <>
struct Raw256<float16_t> {
  using type = __m256h;
};
#endif  // HWY_HAVE_FLOAT16
template <>
struct Raw256<float> {
  using type = __m256;
};
template <>
struct Raw256<double> {
  using type = __m256d;
};

}  // namespace detail

// Thin wrapper around a 256-bit intrinsic register holding lanes of type T.
template <typename T>
class Vec256 {
  using Raw = typename detail::Raw256<T>::type;

 public:
  using PrivateT = T;                                  // only for DFromV
  static constexpr size_t kPrivateN = 32 / sizeof(T);  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator%=(const Vec256 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

#if HWY_TARGET <= HWY_AVX3

namespace detail {

// Selects the mask register type with one bit per lane.
// Template arg: sizeof(lane type)
template <size_t size>
struct RawMask256 {};
template <>
struct RawMask256<1> {
  using type = __mmask32;
};
template <>
struct RawMask256<2> {
  using type = __mmask16;
};
template <>
struct RawMask256<4> {
  using type = __mmask8;
};
template <>
struct RawMask256<8> {
  using type = __mmask8;
};

}  // namespace detail

// AVX3 mask: a bitfield stored in a mask register type.
template <typename T>
struct Mask256 {
  using Raw = typename detail::RawMask256<sizeof(T)>::type;

  // Builds a mask from an integer; mask_bits is narrowed to Raw.
  static Mask256<T> FromBits(uint64_t mask_bits) {
    return Mask256<T>{static_cast<Raw>(mask_bits)};
  }

  Raw raw;
};

#else  // AVX2
// FF..FF or 0. template struct Mask256 { typename detail::Raw256::type raw; }; #endif // AVX2 #if HWY_TARGET <= HWY_AVX3 namespace detail { // Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { return mask.raw; } } // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template using Full256 = Simd; // ------------------------------ BitCast namespace detail { HWY_INLINE __m256i BitCastToInteger(__m256i v) { return v; } #if HWY_HAVE_FLOAT16 HWY_INLINE __m256i BitCastToInteger(__m256h v) { return _mm256_castph_si256(v); } #endif // HWY_HAVE_FLOAT16 HWY_INLINE __m256i BitCastToInteger(__m256 v) { return _mm256_castps_si256(v); } HWY_INLINE __m256i BitCastToInteger(__m256d v) { return _mm256_castpd_si256(v); } template HWY_INLINE Vec256 BitCastToByte(Vec256 v) { return Vec256{BitCastToInteger(v.raw)}; } // Cannot rely on function overloading because return types differ. template struct BitCastFromInteger256 { HWY_INLINE __m256i operator()(__m256i v) { return v; } }; #if HWY_HAVE_FLOAT16 template <> struct BitCastFromInteger256 { HWY_INLINE __m256h operator()(__m256i v) { return _mm256_castsi256_ph(v); } }; #endif // HWY_HAVE_FLOAT16 template <> struct BitCastFromInteger256 { HWY_INLINE __m256 operator()(__m256i v) { return _mm256_castsi256_ps(v); } }; template <> struct BitCastFromInteger256 { HWY_INLINE __m256d operator()(__m256i v) { return _mm256_castsi256_pd(v); } }; template HWY_INLINE VFromD BitCastFromByte(D /* tag */, Vec256 v) { return VFromD{BitCastFromInteger256>()(v.raw)}; } } // namespace detail template HWY_API VFromD BitCast(D d, Vec256 v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ Zero // Cannot use VFromD here because it is defined in terms of Zero. 
template HWY_API Vec256> Zero(D /* tag */) { return Vec256>{_mm256_setzero_si256()}; } template HWY_API Vec256 Zero(D /* tag */) { return Vec256{_mm256_setzero_si256()}; } template HWY_API Vec256 Zero(D /* tag */) { #if HWY_HAVE_FLOAT16 return Vec256{_mm256_setzero_ph()}; #else return Vec256{_mm256_setzero_si256()}; #endif // HWY_HAVE_FLOAT16 } template HWY_API Vec256 Zero(D /* tag */) { return Vec256{_mm256_setzero_ps()}; } template HWY_API Vec256 Zero(D /* tag */) { return Vec256{_mm256_setzero_pd()}; } // ------------------------------ Set template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm256_set1_epi8(static_cast(t))}; // NOLINT } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm256_set1_epi16(static_cast(t))}; // NOLINT } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm256_set1_epi32(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm256_set1_epi64x(static_cast(t))}; // NOLINT } // bfloat16_t is handled by x86_128-inl.h. #if HWY_HAVE_FLOAT16 template HWY_API Vec256 Set(D /* tag */, float16_t t) { return Vec256{_mm256_set1_ph(t)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec256 Set(D /* tag */, float t) { return Vec256{_mm256_set1_ps(t)}; } template HWY_API Vec256 Set(D /* tag */, double t) { return Vec256{_mm256_set1_pd(t)}; } HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") // Returns a vector with uninitialized elements. template HWY_API VFromD Undefined(D /* tag */) { // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC // generate an XOR instruction. 
return VFromD{_mm256_undefined_si256()}; } template HWY_API Vec256 Undefined(D /* tag */) { return Vec256{_mm256_undefined_si256()}; } template HWY_API Vec256 Undefined(D /* tag */) { #if HWY_HAVE_FLOAT16 return Vec256{_mm256_undefined_ph()}; #else return Vec256{_mm256_undefined_si256()}; #endif } template HWY_API Vec256 Undefined(D /* tag */) { return Vec256{_mm256_undefined_ps()}; } template HWY_API Vec256 Undefined(D /* tag */) { return Vec256{_mm256_undefined_pd()}; } HWY_DIAGNOSTICS(pop) // ------------------------------ ResizeBitCast // 32-byte vector to 32-byte vector (or 64-byte vector to 64-byte vector on // AVX3) template ))> HWY_API VFromD ResizeBitCast(D d, FromV v) { return BitCast(d, v); } // 32-byte vector to 16-byte vector (or 64-byte vector to 32-byte vector on // AVX3) template )) / 2)> HWY_API VFromD ResizeBitCast(D d, FromV v) { const DFromV d_from; const Half dh_from; return BitCast(d, LowerHalf(dh_from, v)); } // 32-byte vector (or 64-byte vector on AVX3) to <= 8-byte vector template HWY_API VFromD ResizeBitCast(D /*d*/, FromV v) { return VFromD{ResizeBitCast(Full128>(), v).raw}; } // <= 16-byte vector to 32-byte vector template HWY_API VFromD ResizeBitCast(D d, FromV v) { return BitCast(d, Vec256{_mm256_castsi128_si256( ResizeBitCast(Full128(), v).raw)}); } // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { return VFromD{_mm256_setr_epi8( static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t8), static_cast(t9), static_cast(t10), static_cast(t11), static_cast(t12), static_cast(t13), static_cast(t14), static_cast(t15), static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), 
static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t8), static_cast(t9), static_cast(t10), static_cast(t11), static_cast(t12), static_cast(t13), static_cast(t14), static_cast(t15))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{ _mm256_setr_epi16(static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7))}; } #if HWY_HAVE_FLOAT16 template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{_mm256_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)}; } #endif template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{ _mm256_setr_epi32(static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{_mm256_setr_ps(t0, t1, t2, t3, t0, t1, t2, t3)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{ _mm256_setr_epi64x(static_cast(t0), static_cast(t1), static_cast(t0), static_cast(t1))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{_mm256_setr_pd(t0, t1, t0, t1)}; } // ================================================== LOGICAL // ------------------------------ And template HWY_API Vec256 And(Vec256 a, Vec256 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{_mm256_and_si256(BitCast(du, a).raw, BitCast(du, b).raw)}); } HWY_API Vec256 And(Vec256 a, Vec256 b) { 
return Vec256{_mm256_and_ps(a.raw, b.raw)}; } HWY_API Vec256 And(Vec256 a, Vec256 b) { return Vec256{_mm256_and_pd(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. template HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{_mm256_andnot_si256( BitCast(du, not_mask).raw, BitCast(du, mask).raw)}); } HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { return Vec256{_mm256_andnot_ps(not_mask.raw, mask.raw)}; } HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { return Vec256{_mm256_andnot_pd(not_mask.raw, mask.raw)}; } // ------------------------------ Or template HWY_API Vec256 Or(Vec256 a, Vec256 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{_mm256_or_si256(BitCast(du, a).raw, BitCast(du, b).raw)}); } HWY_API Vec256 Or(Vec256 a, Vec256 b) { return Vec256{_mm256_or_ps(a.raw, b.raw)}; } HWY_API Vec256 Or(Vec256 a, Vec256 b) { return Vec256{_mm256_or_pd(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec256 Xor(Vec256 a, Vec256 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{_mm256_xor_si256(BitCast(du, a).raw, BitCast(du, b).raw)}); } HWY_API Vec256 Xor(Vec256 a, Vec256 b) { return Vec256{_mm256_xor_ps(a.raw, b.raw)}; } HWY_API Vec256 Xor(Vec256 a, Vec256 b) { return Vec256{_mm256_xor_pd(a.raw, b.raw)}; } // ------------------------------ Not template HWY_API Vec256 Not(const Vec256 v) { const DFromV d; using TU = MakeUnsigned; #if HWY_TARGET <= HWY_AVX3 const __m256i vu = BitCast(RebindToUnsigned(), v).raw; return BitCast(d, Vec256{_mm256_ternarylogic_epi32(vu, vu, vu, 0x55)}); #else return Xor(v, BitCast(d, Vec256{_mm256_set1_epi32(-1)})); #endif } // ------------------------------ Xor3 template HWY_API Vec256 Xor3(Vec256 x1, Vec256 x2, Vec256 x3) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = 
VFromD; const __m256i ret = _mm256_ternarylogic_epi64( BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); return BitCast(d, VU{ret}); #else return Xor(x1, Xor(x2, x3)); #endif } // ------------------------------ Or3 template HWY_API Vec256 Or3(Vec256 o1, Vec256 o2, Vec256 o3) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; const __m256i ret = _mm256_ternarylogic_epi64( BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); return BitCast(d, VU{ret}); #else return Or(o1, Or(o2, o3)); #endif } // ------------------------------ OrAnd template HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; const __m256i ret = _mm256_ternarylogic_epi64( BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); return BitCast(d, VU{ret}); #else return Or(o, And(a1, a2)); #endif } // ------------------------------ IfVecThenElse template HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, BitCast(du, no).raw, 0xCA)}); #else return IfThenElse(MaskFromVec(mask), yes, no); #endif } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { return And(a, b); } template HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { return Or(a, b); } template HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { return Xor(a, b); } // ------------------------------ PopulationCount // 8/16 require BITALG, 32/64 require VPOPCNTDQ. 
#if HWY_TARGET <= HWY_AVX3_DL #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif namespace detail { template HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<1> /* tag */, Vec256 v) { return Vec256{_mm256_popcnt_epi8(v.raw)}; } template HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<2> /* tag */, Vec256 v) { return Vec256{_mm256_popcnt_epi16(v.raw)}; } template HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<4> /* tag */, Vec256 v) { return Vec256{_mm256_popcnt_epi32(v.raw)}; } template HWY_INLINE Vec256 PopulationCount(hwy::SizeTag<8> /* tag */, Vec256 v) { return Vec256{_mm256_popcnt_epi64(v.raw)}; } } // namespace detail template HWY_API Vec256 PopulationCount(Vec256 v) { return detail::PopulationCount(hwy::SizeTag(), v); } #endif // HWY_TARGET <= HWY_AVX3_DL // ================================================== MASK #if HWY_TARGET <= HWY_AVX3 // ------------------------------ IfThenElse // Returns mask ? b : a. namespace detail { // Templates for signed/unsigned integer of a particular size. 
template HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<1> /* tag */, Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<2> /* tag */, Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<4> /* tag */, Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElse(hwy::SizeTag<8> /* tag */, Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; } } // namespace detail template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); } #if HWY_HAVE_FLOAT16 HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_ph(mask.raw, no.raw, yes.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_ps(mask.raw, no.raw, yes.raw)}; } HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_mask_blend_pd(mask.raw, no.raw, yes.raw)}; } namespace detail { template HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_epi8(mask.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_epi16(mask.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_epi32(mask.raw, yes.raw)}; } template HWY_INLINE Vec256 IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_epi64(mask.raw, yes.raw)}; } } // namespace detail template HWY_API Vec256 
IfThenElseZero(Mask256 mask, Vec256 yes) { return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); } HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_ps(mask.raw, yes.raw)}; } HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { return Vec256{_mm256_maskz_mov_pd(mask.raw, yes.raw)}; } namespace detail { template HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask256 mask, Vec256 no) { // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. return Vec256{_mm256_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask256 mask, Vec256 no) { return Vec256{_mm256_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask256 mask, Vec256 no) { return Vec256{_mm256_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec256 IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask256 mask, Vec256 no) { return Vec256{_mm256_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; } } // namespace detail template HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); } HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { return Vec256{_mm256_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; } HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { return Vec256{_mm256_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_API Vec256 ZeroIfNegative(const Vec256 v) { static_assert(IsSigned(), "Only for float"); // AVX3 MaskFromVec only looks at the MSB return IfThenZeroElse(MaskFromVec(v), v); } // ------------------------------ Mask logical namespace detail { template HWY_INLINE Mask256 And(hwy::SizeTag<1> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kand_mask32(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask32>(a.raw & 
b.raw)}; #endif } template HWY_INLINE Mask256 And(hwy::SizeTag<2> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kand_mask16(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask16>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 And(hwy::SizeTag<4> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kand_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 And(hwy::SizeTag<8> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kand_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 AndNot(hwy::SizeTag<1> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kandn_mask32(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask32>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 AndNot(hwy::SizeTag<2> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kandn_mask16(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask16>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 AndNot(hwy::SizeTag<4> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kandn_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 AndNot(hwy::SizeTag<8> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kandn_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask256 Or(hwy::SizeTag<1> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kor_mask32(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask32>(a.raw | b.raw)}; #endif } template HWY_INLINE 
Mask256 Or(hwy::SizeTag<2> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kor_mask16(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask16>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask256 Or(hwy::SizeTag<4> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kor_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask256 Or(hwy::SizeTag<8> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kor_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask256 Xor(hwy::SizeTag<1> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxor_mask32(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask32>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask256 Xor(hwy::SizeTag<2> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxor_mask16(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask16>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask256 Xor(hwy::SizeTag<4> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxor_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask256 Xor(hwy::SizeTag<8> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxor_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxnor_mask32(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask32>(~(a.raw ^ b.raw) & 0xFFFFFFFF)}; #endif } template HWY_INLINE Mask256 
ExclusiveNeither(hwy::SizeTag<2> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxnor_mask16(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; #endif } template HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{_kxnor_mask8(a.raw, b.raw)}; #else return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; #endif } template HWY_INLINE Mask256 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, const Mask256 a, const Mask256 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; #else return Mask256{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; #endif } // UnmaskedNot returns ~m.raw without zeroing out any invalid bits template HWY_INLINE Mask256 UnmaskedNot(const Mask256 m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{static_cast<__mmask32>(_knot_mask32(m.raw))}; #else return Mask256{static_cast<__mmask32>(~m.raw)}; #endif } template HWY_INLINE Mask256 UnmaskedNot(const Mask256 m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{static_cast<__mmask16>(_knot_mask16(m.raw))}; #else return Mask256{static_cast<__mmask16>(~m.raw)}; #endif } template HWY_INLINE Mask256 UnmaskedNot(const Mask256 m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask256{static_cast<__mmask8>(_knot_mask8(m.raw))}; #else return Mask256{static_cast<__mmask8>(~m.raw)}; #endif } template HWY_INLINE Mask256 Not(hwy::SizeTag<1> /*tag*/, const Mask256 m) { // sizeof(T) == 1: simply return ~m as all 32 bits of m are valid return UnmaskedNot(m); } template HWY_INLINE Mask256 Not(hwy::SizeTag<2> /*tag*/, const Mask256 m) { // sizeof(T) == 2: simply return ~m as all 16 bits of m are valid return UnmaskedNot(m); } template HWY_INLINE Mask256 Not(hwy::SizeTag<4> /*tag*/, const Mask256 m) { // sizeof(T) == 4: simply return ~m as all 8 bits of m are 
valid return UnmaskedNot(m); } template HWY_INLINE Mask256 Not(hwy::SizeTag<8> /*tag*/, const Mask256 m) { // sizeof(T) == 8: need to zero out the upper 4 bits of ~m as only the lower // 4 bits of m are valid // Return (~m) & 0x0F return AndNot(hwy::SizeTag<8>(), m, Mask256::FromBits(uint64_t{0x0F})); } } // namespace detail template HWY_API Mask256 And(const Mask256 a, Mask256 b) { return detail::And(hwy::SizeTag(), a, b); } template HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { return detail::AndNot(hwy::SizeTag(), a, b); } template HWY_API Mask256 Or(const Mask256 a, Mask256 b) { return detail::Or(hwy::SizeTag(), a, b); } template HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { return detail::Xor(hwy::SizeTag(), a, b); } template HWY_API Mask256 Not(const Mask256 m) { // Flip only the valid bits. return detail::Not(hwy::SizeTag(), m); } template HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { return detail::ExclusiveNeither(hwy::SizeTag(), a, b); } template HWY_API MFromD CombineMasks(D /*d*/, MFromD> hi, MFromD> lo) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const __mmask32 combined_mask = _mm512_kunpackw( static_cast<__mmask32>(hi.raw), static_cast<__mmask32>(lo.raw)); #else const auto combined_mask = ((static_cast(hi.raw) << 16) | (lo.raw & 0xFFFFu)); #endif return MFromD{static_cast().raw)>(combined_mask)}; } template HWY_API MFromD UpperHalfOfMask(D /*d*/, MFromD> m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const auto shifted_mask = _kshiftri_mask32(static_cast<__mmask32>(m.raw), 16); #else const auto shifted_mask = static_cast(m.raw) >> 16; #endif return MFromD{static_cast().raw)>(shifted_mask)}; } #else // AVX2 // ------------------------------ Mask // Mask and Vec are the same (true = FF..FF). template HWY_API Mask256 MaskFromVec(const Vec256 v) { return Mask256{v.raw}; } template HWY_API Vec256 VecFromMask(const Mask256 v) { return Vec256{v.raw}; } // ------------------------------ IfThenElse // mask ? 
yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_blendv_epi8(no.raw, yes.raw, mask.raw)}; } HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_blendv_ps(no.raw, yes.raw, mask.raw)}; } HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{_mm256_blendv_pd(no.raw, yes.raw, mask.raw)}; } // mask ? yes : 0 template HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { const DFromV d; return yes & VecFromMask(d, mask); } // mask ? 0 : no template HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { const DFromV d; return AndNot(VecFromMask(d, mask), no); } template HWY_API Vec256 ZeroIfNegative(Vec256 v) { static_assert(IsSigned(), "Only for float"); const DFromV d; const auto zero = Zero(d); // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes return IfThenElse(MaskFromVec(v), zero, v); } // ------------------------------ Mask logical template HWY_API Mask256 Not(const Mask256 m) { const Full256 d; return MaskFromVec(Not(VecFromMask(d, m))); } template HWY_API Mask256 And(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Or(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 ExclusiveNeither(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } #endif // HWY_TARGET <= HWY_AVX3 // ================================================== COMPARE #if HWY_TARGET <= HWY_AVX3 // Comparisons set a mask bit to 1 if the condition is true, 
// else 0.

// Converts mask `m` to the mask type associated with tag `DTo`. The raw mask
// value is forwarded unchanged; the static_assert rejects lane types whose
// sizes differ.
template
HWY_API MFromD RebindMask(DTo /*tag*/, Mask256 m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size");
  return MFromD{m.raw};
}

namespace detail {

// TestBit helpers: one overload per lane size in bytes, selected through the
// hwy::SizeTag argument. Each forwards `v` and `bit` to the
// _mm256_test_epi{8,16,32,64}_mask intrinsic of the matching lane width.
template
HWY_INLINE Mask256 TestBit(hwy::SizeTag<1> /*tag*/, const Vec256 v,
                           const Vec256 bit) {
  return Mask256{_mm256_test_epi8_mask(v.raw, bit.raw)};
}
template
HWY_INLINE Mask256 TestBit(hwy::SizeTag<2> /*tag*/, const Vec256 v,
                           const Vec256 bit) {
  return Mask256{_mm256_test_epi16_mask(v.raw, bit.raw)};
}
template
HWY_INLINE Mask256 TestBit(hwy::SizeTag<4> /*tag*/, const Vec256 v,
                           const Vec256 bit) {
  return Mask256{_mm256_test_epi32_mask(v.raw, bit.raw)};
}
template
HWY_INLINE Mask256 TestBit(hwy::SizeTag<8> /*tag*/, const Vec256 v,
                           const Vec256 bit) {
  return Mask256{_mm256_test_epi64_mask(v.raw, bit.raw)};
}

}  // namespace detail

// Public entry point: dispatches to the detail::TestBit overload for the lane
// size. Only integer lane types are accepted (static_assert).
template
HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) {
  static_assert(!hwy::IsFloat(), "Only integer vectors supported");
  return detail::TestBit(hwy::SizeTag(), v, bit);
}

// ------------------------------ Equality

// Integer lanes: one overload per lane width (8/16/32/64-bit, matching the
// intrinsic suffix). The result wraps the mask returned by the intrinsic.
template
HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpeq_epi8_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpeq_epi16_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpeq_epi32_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpeq_epi64_mask(a.raw, b.raw)};
}

// Floating-point lanes (ph/ps/pd intrinsics) use the _CMP_EQ_OQ predicate.
#if HWY_HAVE_FLOAT16
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)};
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)};
}

// ------------------------------ Inequality

// Integer lanes: one overload per lane width, using the cmpneq mask intrinsics.
template
HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpneq_epi8_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpneq_epi16_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpneq_epi32_mask(a.raw, b.raw)};
}
template
HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpneq_epi64_mask(a.raw, b.raw)};
}

// Floating-point lanes use the _CMP_NEQ_OQ predicate.
// NOTE(review): _CMP_NEQ_OQ is the *ordered* not-equal predicate, so a lane
// holding NaN presumably compares false here (unlike C++ `!=`); confirm that
// this is the intended behavior.
#if HWY_HAVE_FLOAT16
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)};
}

// ------------------------------ Strict inequality

// Signed lanes (epi8/16/32/64 intrinsics).
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi8_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi16_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi32_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi64_mask(a.raw, b.raw)};
}

// Unsigned lanes (epu8/16/32/64 intrinsics).
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epu8_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epu16_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epu32_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epu64_mask(a.raw, b.raw)};
}

// Floating-point lanes use the _CMP_GT_OQ predicate.
#if HWY_HAVE_FLOAT16
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)};
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)};
}
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)};
}

// ------------------------------ Weak inequality

// Floating-point lanes use the _CMP_GE_OQ predicate.
#if HWY_HAVE_FLOAT16
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  // Work around warnings in the intrinsic definitions (passing -1 as a mask).
  HWY_DIAGNOSTICS(push)
  HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion")
  return Mask256{_mm256_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)};
  HWY_DIAGNOSTICS(pop)
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)};
}
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)};
}

// Signed lanes (epi8/16/32/64 intrinsics).
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpge_epi8_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpge_epi16_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpge_epi32_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpge_epi64_mask(a.raw, b.raw)};
}

// Unsigned lanes (epu8/16/32/64 intrinsics).
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpge_epu8_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpge_epu16_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpge_epu32_mask(a.raw, b.raw)};
}
HWY_API Mask256 operator>=(const Vec256 a, const Vec256 b) {
  return Mask256{_mm256_cmpge_epu64_mask(a.raw, b.raw)};
}

// ------------------------------ Mask

namespace detail {

// Vector -> mask, one overload per lane size in bytes (hwy::SizeTag), each
// forwarding to the _mm256_movepi{8,16,32,64}_mask intrinsic.
template
HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec256 v) {
  return Mask256{_mm256_movepi8_mask(v.raw)};
}
template
HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec256 v) {
  return Mask256{_mm256_movepi16_mask(v.raw)};
}
template
HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec256 v) {
  return Mask256{_mm256_movepi32_mask(v.raw)};
}
template
HWY_INLINE Mask256 MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec256 v) {
  return Mask256{_mm256_movepi64_mask(v.raw)};
}

}  // namespace detail

// Dispatches to the detail::MaskFromVec overload for the lane size.
template
HWY_API Mask256 MaskFromVec(const Vec256 v) {
  return detail::MaskFromVec(hwy::SizeTag(), v);
}
// There do not seem to be native floating-point versions of these instructions.
// Hence this overload bit-casts to the signed-integer tag, converts that
// vector to a mask and re-wraps the raw mask value.
template
HWY_API Mask256 MaskFromVec(const Vec256 v) {
  const RebindToSigned> di;
  return Mask256{MaskFromVec(BitCast(di, v)).raw};
}

// Mask -> vector, one overload per lane width via _mm256_movm_epi{8,16,32,64}.
template
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_movm_epi8(v.raw)};
}
template
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_movm_epi16(v.raw)};
}
template
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_movm_epi32(v.raw)};
}
template
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_movm_epi64(v.raw)};
}

// Floating-point results: expand as integer lanes of the same width, then
// reinterpret the register via the _mm256_castsi256_{ph,ps,pd} casts.
#if HWY_HAVE_FLOAT16
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_castsi256_ph(_mm256_movm_epi16(v.raw))};
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_castsi256_ps(_mm256_movm_epi32(v.raw))};
}
HWY_API Vec256 VecFromMask(const Mask256 v) {
  return Vec256{_mm256_castsi256_pd(_mm256_movm_epi64(v.raw))};
}

#else  // AVX2

// Comparisons fill a lane with 1-bits if the condition is true, else 0.
// Converts mask `m` to the mask type of `d_to` by round-tripping through
// vectors: mask -> vector (VecFromMask), BitCast to the target tag, then
// vector -> mask (MaskFromVec). Lane sizes must match (static_assert).
template
HWY_API MFromD RebindMask(DTo d_to, Mask256 m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size");
  const Full256 dfrom;
  return MaskFromVec(BitCast(d_to, VecFromMask(dfrom, m)));
}

// True for lanes where every bit set in `bit` is also set in `v`
// (computed as (v & bit) == bit). Integer lanes only.
template
HWY_API Mask256 TestBit(const Vec256 v, const Vec256 bit) {
  static_assert(!hwy::IsFloat(), "Only integer vectors supported");
  return (v & bit) == bit;
}

// ------------------------------ Equality

// Integer lanes: one overload per lane width (8/16/32/64-bit, matching the
// intrinsic suffix).
template
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpeq_epi8(a.raw, b.raw)};
}

template
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpeq_epi16(a.raw, b.raw)};
}

template
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpeq_epi32(a.raw, b.raw)};
}

template
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpeq_epi64(a.raw, b.raw)};
}

// Floating-point lanes (ps/pd) use the _CMP_EQ_OQ predicate.
HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_EQ_OQ)};
}

HWY_API Mask256 operator==(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_EQ_OQ)};
}

// ------------------------------ Inequality

// Generic overload: the negation of operator==.
template
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  return Not(a == b);
}
// Floating-point lanes use the _CMP_NEQ_OQ predicate.
// NOTE(review): _CMP_NEQ_OQ is the *ordered* not-equal predicate, so a lane
// holding NaN presumably compares false here, which differs from
// Not(a == b) above; confirm that this is intended.
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_NEQ_OQ)};
}
HWY_API Mask256 operator!=(Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_NEQ_OQ)};
}

// ------------------------------ Strict inequality

// Tag dispatch instead of SFINAE for MSVC 2017 compatibility
namespace detail {

// Pre-9.3 GCC immintrin.h uses char, which may be unsigned, causing cmpgt_epi8
// to perform an unsigned comparison instead of the intended signed. Workaround
// is to cast to an explicitly signed type. See https://godbolt.org/z/PL7Ujy
#if HWY_COMPILER_GCC_ACTUAL != 0 && HWY_COMPILER_GCC_ACTUAL < 903
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 1
#else
#define HWY_AVX2_GCC_CMPGT8_WORKAROUND 0
#endif

// Signed 8-bit lanes. With the workaround enabled, the comparison is done on
// GCC vector-extension values of `signed char` instead of via the intrinsic.
HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, Vec256 b) {
#if HWY_AVX2_GCC_CMPGT8_WORKAROUND
  using i8x32 = signed char __attribute__((__vector_size__(32)));
  return Mask256{static_cast<__m256i>(reinterpret_cast(a.raw) >
                                      reinterpret_cast(b.raw))};
#else
  return Mask256{_mm256_cmpgt_epi8(a.raw, b.raw)};
#endif
}
// Signed 16/32/64-bit lanes.
HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi16(a.raw, b.raw)};
}
HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi32(a.raw, b.raw)};
}
HWY_API Mask256 Gt(hwy::SignedTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmpgt_epi64(a.raw, b.raw)};
}

// Unsigned lanes: flip the most-significant bit of both operands (Xor with
// `msb`, which is (LimitsMax() >> 1) + 1), compare the results as signed
// lanes, then rebind the resulting mask to the unsigned tag.
template
HWY_INLINE Mask256 Gt(hwy::UnsignedTag /*tag*/, Vec256 a, Vec256 b) {
  const Full256 du;
  const RebindToSigned di;
  const Vec256 msb = Set(du, (LimitsMax() >> 1) + 1);
  return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb)));
}

// Floating-point lanes use the _CMP_GT_OQ predicate.
HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GT_OQ)};
}
HWY_API Mask256 Gt(hwy::FloatTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GT_OQ)};
}

}  // namespace detail

// Public entry point: dispatches on the lane type's tag (hwy::TypeTag).
template
HWY_API Mask256 operator>(Vec256 a, Vec256 b) {
  return detail::Gt(hwy::TypeTag(), a, b);
}

// ------------------------------ Weak inequality

namespace detail {

// Integer lanes: a >= b is computed as Not(b > a).
template
HWY_INLINE Mask256 Ge(hwy::SignedTag tag, Vec256 a, Vec256 b) {
  return Not(Gt(tag, b, a));
}

template
HWY_INLINE Mask256 Ge(hwy::UnsignedTag tag, Vec256 a, Vec256 b) {
  return Not(Gt(tag, b, a));
}

// Floating-point lanes use the _CMP_GE_OQ predicate directly.
HWY_INLINE Mask256 Ge(hwy::FloatTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_ps(a.raw, b.raw, _CMP_GE_OQ)};
}
HWY_INLINE Mask256 Ge(hwy::FloatTag /*tag*/, Vec256 a, Vec256 b) {
  return Mask256{_mm256_cmp_pd(a.raw, b.raw, _CMP_GE_OQ)};
}

}  // namespace detail

// Public entry point: dispatches on the lane type's tag (hwy::TypeTag).
template
HWY_API Mask256 operator>=(Vec256 a, Vec256 b) {
  return detail::Ge(hwy::TypeTag(), a, b);
}

#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ Reversed comparisons

// a < b and a <= b are expressed by swapping the operands of > and >=.
template
HWY_API Mask256 operator<(const Vec256 a, const Vec256 b) {
  return b > a;
}

template
HWY_API Mask256 operator<=(const Vec256 a, const Vec256 b) {
  return b >= a;
}

// ------------------------------ Min (Gt, IfThenElse)

// Unsigned
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epu8(a.raw, b.raw)};
}
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epu16(a.raw, b.raw)};
}
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epu32(a.raw, b.raw)};
}
// Unsigned 64-bit: native intrinsic on AVX3; otherwise flip bit 63 of both
// inputs, compare as signed lanes and select `b` where a > b.
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_min_epu64(a.raw, b.raw)};
#else
  const Full256 du;
  const Full256 di;
  const auto msb = Set(du, 1ull << 63);
  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
  return IfThenElse(gt, b, a);
#endif
}

// Signed
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epi8(a.raw, b.raw)};
}
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epi16(a.raw, b.raw)};
}
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_epi32(a.raw, b.raw)};
}
// Signed 64-bit: native intrinsic on AVX3, otherwise compare-and-select.
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_min_epi64(a.raw, b.raw)};
#else
  return IfThenElse(a < b, a, b);
#endif
}

// Float
#if HWY_HAVE_FLOAT16
HWY_API Vec256 Min(Vec256 a, Vec256 b) {
  return Vec256{_mm256_min_ph(a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_ps(a.raw, b.raw)};
}
HWY_API Vec256 Min(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_min_pd(a.raw, b.raw)};
}

// ------------------------------ Max (Gt, IfThenElse)

// Unsigned
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epu8(a.raw, b.raw)};
}
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epu16(a.raw, b.raw)};
}
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epu32(a.raw, b.raw)};
}
// Unsigned 64-bit: native intrinsic on AVX3; otherwise flip bit 63 of both
// inputs, compare as signed lanes and select `a` where a > b.
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_max_epu64(a.raw, b.raw)};
#else
  const Full256 du;
  const Full256 di;
  const auto msb = Set(du, 1ull << 63);
  const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb));
  return IfThenElse(gt, a, b);
#endif
}

// Signed
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epi8(a.raw, b.raw)};
}
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epi16(a.raw, b.raw)};
}
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_epi32(a.raw, b.raw)};
}
// Signed 64-bit: native intrinsic on AVX3, otherwise compare-and-select.
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_max_epi64(a.raw, b.raw)};
#else
  return IfThenElse(a < b, b, a);
#endif
}

// Float
#if HWY_HAVE_FLOAT16
HWY_API Vec256 Max(Vec256 a, Vec256 b) {
  return Vec256{_mm256_max_ph(a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_ps(a.raw, b.raw)};
}
HWY_API Vec256 Max(const Vec256 a, const Vec256 b) {
  return Vec256{_mm256_max_pd(a.raw, b.raw)};
}

// ------------------------------ Iota

namespace detail {

// Iota0 overloads build a constant vector from the values N-1 .. 0. The
// arguments to each _mm256_set_* call are written from the largest value down
// to 0 (presumably highest lane first, per the Intel `set` convention).

// 8-bit lanes: 31 .. 0.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{_mm256_set_epi8(
      static_cast(31), static_cast(30), static_cast(29), static_cast(28),
      static_cast(27), static_cast(26), static_cast(25), static_cast(24),
      static_cast(23), static_cast(22), static_cast(21), static_cast(20),
      static_cast(19), static_cast(18), static_cast(17), static_cast(16),
      static_cast(15), static_cast(14), static_cast(13), static_cast(12),
      static_cast(11), static_cast(10), static_cast(9), static_cast(8),
      static_cast(7), static_cast(6), static_cast(5), static_cast(4),
      static_cast(3), static_cast(2), static_cast(1), static_cast(0))};
}

// 16-bit integer lanes: 15 .. 0.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{_mm256_set_epi16(
      int16_t{15}, int16_t{14}, int16_t{13}, int16_t{12}, int16_t{11},
      int16_t{10}, int16_t{9}, int16_t{8}, int16_t{7}, int16_t{6}, int16_t{5},
      int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})};
}

// float16 lanes: 15 .. 0.
#if HWY_HAVE_FLOAT16
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{
      _mm256_set_ph(float16_t{15}, float16_t{14}, float16_t{13}, float16_t{12},
                    float16_t{11}, float16_t{10}, float16_t{9}, float16_t{8},
                    float16_t{7}, float16_t{6}, float16_t{5}, float16_t{4},
                    float16_t{3}, float16_t{2}, float16_t{1}, float16_t{0})};
}
#endif  // HWY_HAVE_FLOAT16

// 32-bit integer lanes: 7 .. 0.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{_mm256_set_epi32(int32_t{7}, int32_t{6}, int32_t{5},
                                 int32_t{4}, int32_t{3}, int32_t{2},
                                 int32_t{1}, int32_t{0})};
}

// 64-bit integer lanes: 3 .. 0.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{
      _mm256_set_epi64x(int64_t{3}, int64_t{2}, int64_t{1}, int64_t{0})};
}

// float lanes: 7.0f .. 0.0f.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{
      _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f)};
}

// double lanes: 3.0 .. 0.0.
template
HWY_INLINE VFromD Iota0(D /*d*/) {
  return VFromD{_mm256_set_pd(3.0, 2.0, 1.0, 0.0)};
}

}  // namespace detail

// Returns Iota0(d) with `first` (converted via ConvertScalarTo) added to every
// lane.
template
HWY_API VFromD Iota(D d, const T2 first) {
  return detail::Iota0(d) + Set(d, ConvertScalarTo>(first));
}

// ------------------------------ FirstN (Iota, Lt)

// Returns a mask whose first min(n, kN) lanes are set, where kN is
// MaxLanes(d).
//  - AVX3: builds a value with the low kN bits set and keeps its low `n` bits
//    via bzhi (64-bit variant on x86-64, 32-bit variant otherwise).
//  - Otherwise: compares Iota0 against `n` using the signed tag and rebinds
//    the resulting mask to `d`.
template >
HWY_API M FirstN(const D d, size_t n) {
  constexpr size_t kN = MaxLanes(d);
  // For AVX3, this ensures `n` <= 255 as required by bzhi, which only looks
  // at the lower 8 bits; for AVX2 and below, this ensures `n` fits in TI.
  n = HWY_MIN(n, kN);

#if HWY_TARGET <= HWY_AVX3
#if HWY_ARCH_X86_64
  const uint64_t all = (1ull << kN) - 1;
  return M::FromBits(_bzhi_u64(all, n));
#else
  const uint32_t all = static_cast((1ull << kN) - 1);
  return M::FromBits(_bzhi_u32(all, static_cast(n)));
#endif  // HWY_ARCH_X86_64
#else
  const RebindToSigned di;  // Signed comparisons are cheaper.
  using TI = TFromD;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(n)));
#endif
}

// ================================================== ARITHMETIC

// ------------------------------ Addition

// Unsigned
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi8(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi32(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi64(a.raw, b.raw)};
}

// Signed
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi8(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi32(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_epi64(a.raw, b.raw)};
}

// Float
#if HWY_HAVE_FLOAT16
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_ph(a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_ps(a.raw, b.raw)};
}
HWY_API Vec256 operator+(Vec256 a, Vec256 b) {
  return Vec256{_mm256_add_pd(a.raw, b.raw)};
}

// ------------------------------ Subtraction

// Unsigned
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi8(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi32(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi64(a.raw, b.raw)};
}

// Signed
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi8(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi32(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_epi64(a.raw, b.raw)};
}

// Float
#if HWY_HAVE_FLOAT16
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_ph(a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_ps(a.raw, b.raw)};
}
HWY_API Vec256 operator-(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sub_pd(a.raw, b.raw)};
}

// ------------------------------ AddSub

// Thin wrappers over the _mm256_addsub_{ps,pd} intrinsics.
HWY_API Vec256 AddSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_addsub_ps(a.raw, b.raw)};
}
HWY_API Vec256 AddSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_addsub_pd(a.raw, b.raw)};
}

// ------------------------------ SumsOf8

// Uses _mm256_sad_epu8 against an all-zero vector.
HWY_API Vec256 SumsOf8(Vec256 v) {
  return Vec256{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())};
}

// Uses _mm256_sad_epu8 on `a` and `b` directly.
HWY_API Vec256 SumsOf8AbsDiff(Vec256 a, Vec256 b) {
  return Vec256{_mm256_sad_epu8(a.raw, b.raw)};
}

// ------------------------------ SumsOf4
#if HWY_TARGET <= HWY_AVX3
namespace detail {

// Overload selected by hwy::UnsignedTag and a lane size of 1 byte.
HWY_INLINE Vec256 SumsOf4(hwy::UnsignedTag /*type_tag*/,
                          hwy::SizeTag<1> /*lane_size_tag*/, Vec256 v) {
  const DFromV d;

  // _mm256_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be
  // zeroed out and the sums of the 4 consecutive lanes are already in the
  // even uint16_t lanes of the _mm256_maskz_dbsad_epu8 result.
  // The 0x5555 write-mask has only the even bits set.
  return Vec256{_mm256_maskz_dbsad_epu8(
      static_cast<__mmask16>(0x5555), v.raw, Zero(d).raw, 0)};
}

// detail::SumsOf4 for Vec256 on AVX3 is implemented in x86_512-inl.h

}  // namespace detail
#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ SumsOfAdjQuadAbsDiff

// Wraps _mm256_mpsadbw_epu8. The immediate places kBOffset in bits [1:0] and
// [4:3], and kAOffset in bits 2 and 5, i.e. the same offsets are encoded twice
// (presumably once per 128-bit block; confirm against the intrinsic's
// documentation). The static_asserts restrict kAOffset to [0, 1] and kBOffset
// to [0, 3].
template
static Vec256 SumsOfAdjQuadAbsDiff(Vec256 a, Vec256 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  return Vec256{_mm256_mpsadbw_epu8(
      a.raw, b.raw,
      (kAOffset << 5) | (kBOffset << 3) | (kAOffset << 2) | kBOffset)};
}

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if HWY_TARGET <= HWY_AVX3
// Wraps _mm256_dbsad_epu8. Note the operands are passed as (b, a), and the
// shuffle immediate is assembled with _MM_SHUFFLE from the four indices, each
// of which must lie in [0, 3].
template
static Vec256 SumsOfShuffledQuadAbsDiff(Vec256 a, Vec256 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");
  return Vec256{
      _mm256_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))};
}
#endif

// ------------------------------ SaturatedAdd

// Returns a + b clamped to the destination range.
// Unsigned HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { return Vec256{_mm256_adds_epu8(a.raw, b.raw)}; } HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { return Vec256{_mm256_adds_epu16(a.raw, b.raw)}; } // Signed HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { return Vec256{_mm256_adds_epi8(a.raw, b.raw)}; } HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { return Vec256{_mm256_adds_epi16(a.raw, b.raw)}; } #if HWY_TARGET <= HWY_AVX3 HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { const DFromV d; const auto sum = a + b; const auto overflow_mask = MaskFromVec( Vec256{_mm256_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); const auto i32_max = Set(d, LimitsMax()); const Vec256 overflow_result{_mm256_mask_ternarylogic_epi32( i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, sum); } HWY_API Vec256 SaturatedAdd(Vec256 a, Vec256 b) { const DFromV d; const auto sum = a + b; const auto overflow_mask = MaskFromVec( Vec256{_mm256_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); const auto i64_max = Set(d, LimitsMax()); const Vec256 overflow_result{_mm256_mask_ternarylogic_epi64( i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, sum); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. 
// Unsigned
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_subs_epu8(a.raw, b.raw)};
}
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_subs_epu16(a.raw, b.raw)};
}

// Signed
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_subs_epi8(a.raw, b.raw)};
}
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  return Vec256{_mm256_subs_epi16(a.raw, b.raw)};
}

#if HWY_TARGET <= HWY_AVX3
// 32-bit lanes (epi32 intrinsics), built from a wrapping subtract plus a
// fix-up:
//  - overflow_mask: ternary-logic immediate 0x18 has only truth-table entries
//    3 (a=0,b=1,diff=1) and 4 (a=1,b=0,diff=0) set, i.e. a bit is produced
//    where `a` and `b` differ and `diff` differs from `a`; MaskFromVec then
//    turns that vector into a per-lane mask (presumably from the sign bit).
//  - overflow_result: starts from i32_max; in lanes selected by
//    MaskFromVec(a), immediate 0x55 (NOT of the last operand) replaces it with
//    ~i32_max.
//  - Lanes flagged by overflow_mask take overflow_result, the others `diff`.
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  const DFromV d;
  const auto diff = a - b;
  const auto overflow_mask = MaskFromVec(
      Vec256{_mm256_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)});
  const auto i32_max = Set(d, LimitsMax());
  const Vec256 overflow_result{_mm256_mask_ternarylogic_epi32(
      i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);
}

// 64-bit lanes (epi64 intrinsics): same scheme as the 32-bit overload above.
HWY_API Vec256 SaturatedSub(Vec256 a, Vec256 b) {
  const DFromV d;
  const auto diff = a - b;
  const auto overflow_mask = MaskFromVec(
      Vec256{_mm256_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)});
  const auto i64_max = Set(d, LimitsMax());
  const Vec256 overflow_result{_mm256_mask_ternarylogic_epi64(
      i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)};
  return IfThenElse(overflow_mask, overflow_result, diff);
}
#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ Average

// Returns (a + b + 1) / 2

// Unsigned
HWY_API Vec256 AverageRound(Vec256 a, Vec256 b) {
  return Vec256{_mm256_avg_epu8(a.raw, b.raw)};
}
HWY_API Vec256 AverageRound(Vec256 a, Vec256 b) {
  return Vec256{_mm256_avg_epu16(a.raw, b.raw)};
}

// ------------------------------ Abs (Sub)

// Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1.

// 8-bit lanes. Under MSVC the result is computed as max(v, 0 - v) instead of
// calling _mm256_abs_epi8.
HWY_API Vec256 Abs(Vec256 v) {
#if HWY_COMPILER_MSVC
  // Workaround for incorrect codegen? (wrong result)
  const DFromV d;
  const auto zero = Zero(d);
  return Vec256{_mm256_max_epi8(v.raw, (zero - v).raw)};
#else
  return Vec256{_mm256_abs_epi8(v.raw)};
#endif
}
// 16- and 32-bit lanes.
HWY_API Vec256 Abs(const Vec256 v) {
  return Vec256{_mm256_abs_epi16(v.raw)};
}
HWY_API Vec256 Abs(const Vec256 v) {
  return Vec256{_mm256_abs_epi32(v.raw)};
}

// 64-bit lanes: this overload is only defined for AVX3 targets.
#if HWY_TARGET <= HWY_AVX3
HWY_API Vec256 Abs(const Vec256 v) {
  return Vec256{_mm256_abs_epi64(v.raw)};
}
#endif

// ------------------------------ Integer multiplication

// Only 16- and 32-bit overloads (mullo intrinsics) are defined in this
// section.

// Unsigned
HWY_API Vec256 operator*(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mullo_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator*(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mullo_epi32(a.raw, b.raw)};
}

// Signed
HWY_API Vec256 operator*(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mullo_epi16(a.raw, b.raw)};
}
HWY_API Vec256 operator*(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mullo_epi32(a.raw, b.raw)};
}

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mulhi_epu16(a.raw, b.raw)};
}
HWY_API Vec256 MulHigh(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mulhi_epi16(a.raw, b.raw)};
}

// Thin wrapper over _mm256_mulhrs_epi16.
HWY_API Vec256 MulFixedPoint15(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mulhrs_epi16(a.raw, b.raw)};
}

// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Signed variant (_mm256_mul_epi32).
HWY_API Vec256 MulEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mul_epi32(a.raw, b.raw)};
}
// Unsigned variant (_mm256_mul_epu32).
HWY_API Vec256 MulEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_mul_epu32(a.raw, b.raw)};
}

// ------------------------------ ShiftLeft

#if HWY_TARGET <= HWY_AVX3_DL
namespace detail {
// Forwards to _mm256_gf2p8affine_epi64_epi8 with an immediate of 0.
template
HWY_API Vec256 GaloisAffine(Vec256 v, Vec256 matrix) {
  return Vec256{_mm256_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)};
}
}  // namespace detail
#endif  // HWY_TARGET <= HWY_AVX3_DL

// Compile-time shift counts (kBits) for 16/32/64-bit lanes. Two sets of
// overloads use the same slli intrinsics (presumably unsigned then signed
// lane types; the lane type is not visible in these signatures).
template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi16(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi32(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi64(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi16(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi32(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftLeft(Vec256 v) {
  return Vec256{_mm256_slli_epi64(v.raw, kBits)};
}

#if HWY_TARGET > HWY_AVX3_DL

// 8-bit lanes, emulated: shift as 16-bit lanes, then AND with
// (0xFF << kBits) & 0xFF to clear the bits that crossed over from the
// neighboring byte. A shift by one is expressed as v + v instead.
template
HWY_API Vec256 ShiftLeft(const Vec256 v) {
  const Full256 d8;
  const RepartitionToWide d16;
  const auto shifted = BitCast(d8, ShiftLeft(BitCast(d16, v)));
  return kBits == 1
             ? (v + v)
             : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF)));
}

#endif  // HWY_TARGET > HWY_AVX3_DL

// ------------------------------ ShiftRight

// Logical right shifts (srli) for 16/32/64-bit lanes.
template
HWY_API Vec256 ShiftRight(Vec256 v) {
  return Vec256{_mm256_srli_epi16(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftRight(Vec256 v) {
  return Vec256{_mm256_srli_epi32(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftRight(Vec256 v) {
  return Vec256{_mm256_srli_epi64(v.raw, kBits)};
}

// Arithmetic right shifts (srai) for 16/32-bit lanes.
template
HWY_API Vec256 ShiftRight(Vec256 v) {
  return Vec256{_mm256_srai_epi16(v.raw, kBits)};
}

template
HWY_API Vec256 ShiftRight(Vec256 v) {
  return Vec256{_mm256_srai_epi32(v.raw, kBits)};
}

#if HWY_TARGET > HWY_AVX3_DL

// 8-bit lanes, emulated: shift via a wider ShiftRight overload, then AND with
// 0xFF >> kBits to clear the bits that crossed over from the neighboring byte.
template
HWY_API Vec256 ShiftRight(Vec256 v) {
  const Full256 d8;
  // Use raw instead of BitCast to support N=1.
  const Vec256 shifted{ShiftRight(Vec256{v.raw}).raw};
  return shifted & Set(d8, 0xFF >> kBits);
}

// 8-bit lanes with sign extension: shift using the `du` tag, then restore the
// sign with (x ^ s) - s, where s = 0x80 >> kBits is the position the original
// sign bit was shifted to.
template
HWY_API Vec256 ShiftRight(Vec256 v) {
  const Full256 di;
  const Full256 du;
  const auto shifted = BitCast(di, ShiftRight(BitCast(du, v)));
  const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits));
  return (shifted ^ shifted_sign) - shifted_sign;
}

#endif  // HWY_TARGET > HWY_AVX3_DL

// i64 is implemented after BroadcastSignBit.

// ------------------------------ RotateRight

// 8/16-bit lanes: composed from a right shift OR'd with a left shift; a
// rotation by zero returns `v` unchanged.
template
HWY_API Vec256 RotateRight(const Vec256 v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  // AVX3 does not support 8/16-bit.
  return Or(ShiftRight(v), ShiftLeft(v));
}

// 32-bit lanes: native _mm256_ror_epi32 on AVX3, otherwise shift-and-Or.
template
HWY_API Vec256 RotateRight(const Vec256 v) {
  static_assert(0 <= kBits && kBits < 32, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_ror_epi32(v.raw, kBits)};
#else
  if (kBits == 0) return v;
  return Or(ShiftRight(v), ShiftLeft(v));
#endif
}

// 64-bit lanes: native _mm256_ror_epi64 on AVX3, otherwise shift-and-Or.
template
HWY_API Vec256 RotateRight(const Vec256 v) {
  static_assert(0 <= kBits && kBits < 64, "Invalid shift count");
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_ror_epi64(v.raw, kBits)};
#else
  if (kBits == 0) return v;
  return Or(ShiftRight(v), ShiftLeft(v));
#endif
}

// ------------------------------ BroadcastSignBit (ShiftRight, compare, mask)

// Via comparison against zero: VecFromMask(v < 0).
HWY_API Vec256 BroadcastSignBit(const Vec256 v) {
  const DFromV d;
  return VecFromMask(v < Zero(d));
}

// Via ShiftRight by 15.
HWY_API Vec256 BroadcastSignBit(const Vec256 v) {
  return ShiftRight<15>(v);
}

// Via ShiftRight by 31.
HWY_API Vec256 BroadcastSignBit(const Vec256 v) {
  return ShiftRight<31>(v);
}

// 64-bit lanes: comparison against zero when the target is exactly HWY_AVX2,
// otherwise _mm256_srai_epi64 by 63.
HWY_API Vec256 BroadcastSignBit(const Vec256 v) {
#if HWY_TARGET == HWY_AVX2
  const DFromV d;
  return VecFromMask(v < Zero(d));
#else
  return Vec256{_mm256_srai_epi64(v.raw, 63)};
#endif
}

// Arithmetic right shift for 64-bit lanes. AVX3 uses _mm256_srai_epi64;
// otherwise the result is the shift done via the `du` tag, OR'd with the
// broadcast sign shifted left by 64 - kBits to fill the vacated upper bits.
template
HWY_API Vec256 ShiftRight(const Vec256 v) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{
      _mm256_srai_epi64(v.raw, static_cast(kBits))};
#else
  const Full256 di;
  const Full256 du;
  const auto right = BitCast(di, ShiftRight(BitCast(du, v)));
  const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v));
  return right | sign;
#endif
}

// ------------------------------ IfNegativeThenElse (BroadcastSignBit)

// Selects `yes` or `no` per lane using the mask obtained from
// MaskFromVec(v).
HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) {
  // int8: AVX2 IfThenElse only looks at the MSB.
  return IfThenElse(MaskFromVec(v), yes, no);
}

// 16-bit lanes (per the comment in the non-AVX3 branch); signed/float lane
// types only (static_assert).
template
HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) {
  static_assert(IsSigned(), "Only works for signed/float");

#if HWY_TARGET <= HWY_AVX3
  const auto mask = MaskFromVec(v);
#else
  // 16-bit: no native blendv on AVX2, so copy sign to lower byte's MSB.
  const DFromV d;
  const RebindToSigned di;
  const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
#endif

  return IfThenElse(mask, yes, no);
}

// 32/64-bit lanes (per the comment in the non-AVX3 branch); signed/float lane
// types only (static_assert).
template
HWY_API Vec256 IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) {
  static_assert(IsSigned(), "Only works for signed/float");

#if HWY_TARGET <= HWY_AVX3
  // No need to cast to float on AVX3 as IfThenElse only looks at the MSB on
  // AVX3
  return IfThenElse(MaskFromVec(v), yes, no);
#else
  const DFromV d;
  const RebindToFloat df;
  // 32/64-bit: use float IfThenElse, which only looks at the MSB.
  const MFromD msb = MaskFromVec(BitCast(df, v));
  return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no)));
#endif
}

// ------------------------------ IfNegativeThenNegOrUndefIfZero

// Thin wrappers over _mm256_sign_epi{8,16,32}. Note the argument order: `v` is
// passed first and `mask` second.
HWY_API Vec256 IfNegativeThenNegOrUndefIfZero(Vec256 mask, Vec256 v) {
  return Vec256{_mm256_sign_epi8(v.raw, mask.raw)};
}

HWY_API Vec256 IfNegativeThenNegOrUndefIfZero(Vec256 mask, Vec256 v) {
  return Vec256{_mm256_sign_epi16(v.raw, mask.raw)};
}

HWY_API Vec256 IfNegativeThenNegOrUndefIfZero(Vec256 mask, Vec256 v) {
  return Vec256{_mm256_sign_epi32(v.raw, mask.raw)};
}

// ------------------------------ ShiftLeftSame

// Runtime shift count shared by all lanes. When compiling with GCC-compatible
// compilers and `bits` is a compile-time constant, the immediate (slli) form
// is used; otherwise the count is moved into a 128-bit register with
// _mm_cvtsi32_si128 and the sll form is used. Two sets of 16/32/64-bit
// overloads use the same intrinsics (presumably unsigned then signed lanes).
HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi16(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi32(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}

HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi64(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}

HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi16(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}

HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi32(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}

HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_slli_epi64(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}

// 8-bit lanes, emulated: shift as 16-bit lanes, then AND with
// (0xFF << bits) & 0xFF to clear the bits that crossed over from the
// neighboring byte.
template
HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) {
  const Full256 d8;
  const RepartitionToWide d16;
  const auto shifted = BitCast(d8, ShiftLeftSame(BitCast(d16, v), bits));
  return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF));
}

// ------------------------------ ShiftRightSame (BroadcastSignBit)

// Logical right shifts by a runtime count for 16/32/64-bit lanes; same
// constant-vs-variable selection as ShiftLeftSame (srli vs srl).
HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_srli_epi16(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))};
}
HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_srli_epi32(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))};
}

HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) {
#if HWY_COMPILER_GCC
  if (__builtin_constant_p(bits)) {
    return Vec256{_mm256_srli_epi64(v.raw, bits)};
  }
#endif
  return Vec256{_mm256_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))};
}

// 8-bit lanes, emulated: shift as 16-bit lanes, then AND with 0xFF >> bits to
// clear the bits that crossed over from the neighboring byte.
HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) {
  const Full256 d8;
  const RepartitionToWide d16;
  const auto shifted = BitCast(d8, ShiftRightSame(BitCast(d16, v), bits));
  return shifted & Set(d8, static_cast(0xFF >> bits));
}

HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) {
#if
HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec256{_mm256_srai_epi16(v.raw, bits)}; } #endif return Vec256{_mm256_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec256{_mm256_srai_epi32(v.raw, bits)}; } #endif return Vec256{_mm256_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { #if HWY_TARGET <= HWY_AVX3 #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec256{ _mm256_srai_epi64(v.raw, static_cast(bits))}; } #endif return Vec256{_mm256_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; #else const Full256 di; const Full256 du; const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); return right | sign; #endif } HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { const Full256 di; const Full256 du; const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto shifted_sign = BitCast(di, Set(du, static_cast(0x80 >> bits))); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ Neg (Xor, Sub) // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec256 Neg(hwy::FloatTag /*tag*/, const Vec256 v) { const DFromV d; return Xor(v, SignBit(d)); } template HWY_INLINE Vec256 Neg(hwy::SpecialTag /*tag*/, const Vec256 v) { const DFromV d; return Xor(v, SignBit(d)); } // Not floating-point template HWY_INLINE Vec256 Neg(hwy::SignedTag /*tag*/, const Vec256 v) { const DFromV d; return Zero(d) - v; } } // namespace detail template HWY_API Vec256 Neg(const Vec256 v) { return detail::Neg(hwy::TypeTag(), v); } // ------------------------------ Floating-point mul / div #if HWY_HAVE_FLOAT16 HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{_mm256_mul_ph(a.raw, b.raw)}; } #endif // 
HWY_HAVE_FLOAT16 HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{_mm256_mul_ps(a.raw, b.raw)}; } HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{_mm256_mul_pd(a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 HWY_API Vec256 operator/(Vec256 a, Vec256 b) { return Vec256{_mm256_div_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 operator/(Vec256 a, Vec256 b) { return Vec256{_mm256_div_ps(a.raw, b.raw)}; } HWY_API Vec256 operator/(Vec256 a, Vec256 b) { return Vec256{_mm256_div_pd(a.raw, b.raw)}; } // Approximate reciprocal #if HWY_HAVE_FLOAT16 HWY_API Vec256 ApproximateReciprocal(Vec256 v) { return Vec256{_mm256_rcp_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 ApproximateReciprocal(Vec256 v) { return Vec256{_mm256_rcp_ps(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 HWY_API Vec256 ApproximateReciprocal(Vec256 v) { return Vec256{_mm256_rcp14_pd(v.raw)}; } #endif // ------------------------------ MaskedMinOr #if HWY_TARGET <= HWY_AVX3 template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epu64(no.raw, m.raw, 
a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec256 MaskedMinOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_min_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedMaxOr template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return 
Vec256{_mm256_mask_max_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec256 MaskedMaxOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_max_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedAddOr template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec256 MaskedAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_add_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedSubOr template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_epi32(no.raw, m.raw, 
a.raw, b.raw)}; } template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec256 MaskedSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedMulOr HWY_API Vec256 MaskedMulOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)}; } HWY_API Vec256 MaskedMulOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 HWY_API Vec256 MaskedMulOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedDivOr HWY_API Vec256 MaskedDivOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_div_ps(no.raw, m.raw, a.raw, b.raw)}; } HWY_API Vec256 MaskedDivOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_div_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 HWY_API Vec256 MaskedDivOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_div_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedSatAddOr template HWY_API Vec256 MaskedSatAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_adds_epu8(no.raw, m.raw, 
a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatAddOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)}; } // ------------------------------ MaskedSatSubOr template HWY_API Vec256 MaskedSatSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec256 MaskedSatSubOr(Vec256 no, Mask256 m, Vec256 a, Vec256 b) { return Vec256{_mm256_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Floating-point multiply-add variants #if HWY_HAVE_FLOAT16 HWY_API Vec256 MulAdd(Vec256 mul, Vec256 x, Vec256 add) { return Vec256{_mm256_fmadd_ph(mul.raw, x.raw, add.raw)}; } HWY_API Vec256 NegMulAdd(Vec256 mul, Vec256 x, Vec256 add) { return Vec256{_mm256_fnmadd_ph(mul.raw, x.raw, add.raw)}; } HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, Vec256 sub) { return Vec256{_mm256_fmsub_ph(mul.raw, x.raw, sub.raw)}; } HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, Vec256 sub) { return Vec256{_mm256_fnmsub_ph(mul.raw, x.raw, sub.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 MulAdd(Vec256 mul, Vec256 x, Vec256 add) { #ifdef HWY_DISABLE_BMI2_FMA return mul * x + add; #else return Vec256{_mm256_fmadd_ps(mul.raw, x.raw, add.raw)}; #endif } HWY_API Vec256 MulAdd(Vec256 mul, Vec256 x, Vec256 add) { #ifdef HWY_DISABLE_BMI2_FMA return mul * x + add; #else return Vec256{_mm256_fmadd_pd(mul.raw, x.raw, add.raw)}; #endif } HWY_API Vec256 
NegMulAdd(Vec256 mul, Vec256 x, Vec256 add) { #ifdef HWY_DISABLE_BMI2_FMA return add - mul * x; #else return Vec256{_mm256_fnmadd_ps(mul.raw, x.raw, add.raw)}; #endif } HWY_API Vec256 NegMulAdd(Vec256 mul, Vec256 x, Vec256 add) { #ifdef HWY_DISABLE_BMI2_FMA return add - mul * x; #else return Vec256{_mm256_fnmadd_pd(mul.raw, x.raw, add.raw)}; #endif } HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, Vec256 sub) { #ifdef HWY_DISABLE_BMI2_FMA return mul * x - sub; #else return Vec256{_mm256_fmsub_ps(mul.raw, x.raw, sub.raw)}; #endif } HWY_API Vec256 MulSub(Vec256 mul, Vec256 x, Vec256 sub) { #ifdef HWY_DISABLE_BMI2_FMA return mul * x - sub; #else return Vec256{_mm256_fmsub_pd(mul.raw, x.raw, sub.raw)}; #endif } HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, Vec256 sub) { #ifdef HWY_DISABLE_BMI2_FMA return Neg(mul * x) - sub; #else return Vec256{_mm256_fnmsub_ps(mul.raw, x.raw, sub.raw)}; #endif } HWY_API Vec256 NegMulSub(Vec256 mul, Vec256 x, Vec256 sub) { #ifdef HWY_DISABLE_BMI2_FMA return Neg(mul * x) - sub; #else return Vec256{_mm256_fnmsub_pd(mul.raw, x.raw, sub.raw)}; #endif } #if HWY_HAVE_FLOAT16 HWY_API Vec256 MulAddSub(Vec256 mul, Vec256 x, Vec256 sub_or_add) { return Vec256{_mm256_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 MulAddSub(Vec256 mul, Vec256 x, Vec256 sub_or_add) { #ifdef HWY_DISABLE_BMI2_FMA return AddSub(mul * x, sub_or_add); #else return Vec256{_mm256_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)}; #endif } HWY_API Vec256 MulAddSub(Vec256 mul, Vec256 x, Vec256 sub_or_add) { #ifdef HWY_DISABLE_BMI2_FMA return AddSub(mul * x, sub_or_add); #else return Vec256{_mm256_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)}; #endif } // ------------------------------ Floating-point square root // Full precision square root #if HWY_HAVE_FLOAT16 HWY_API Vec256 Sqrt(Vec256 v) { return Vec256{_mm256_sqrt_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 Sqrt(Vec256 v) { return Vec256{_mm256_sqrt_ps(v.raw)}; } HWY_API 
Vec256 Sqrt(Vec256 v) { return Vec256{_mm256_sqrt_pd(v.raw)}; } // Approximate reciprocal square root #if HWY_HAVE_FLOAT16 HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { return Vec256{_mm256_rsqrt_ph(v.raw)}; } #endif HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { return Vec256{_mm256_rsqrt_ps(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 HWY_API Vec256 ApproximateReciprocalSqrt(Vec256 v) { #if HWY_COMPILER_MSVC const DFromV d; return Vec256{_mm256_mask_rsqrt14_pd( Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; #else return Vec256{_mm256_rsqrt14_pd(v.raw)}; #endif } #endif // ------------------------------ Floating-point rounding // Toward nearest integer, tie to even #if HWY_HAVE_FLOAT16 HWY_API Vec256 Round(Vec256 v) { return Vec256{_mm256_roundscale_ph( v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 Round(Vec256 v) { return Vec256{ _mm256_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } HWY_API Vec256 Round(Vec256 v) { return Vec256{ _mm256_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } // Toward zero, aka truncate #if HWY_HAVE_FLOAT16 HWY_API Vec256 Trunc(Vec256 v) { return Vec256{ _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 Trunc(Vec256 v) { return Vec256{ _mm256_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } HWY_API Vec256 Trunc(Vec256 v) { return Vec256{ _mm256_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } // Toward +infinity, aka ceiling #if HWY_HAVE_FLOAT16 HWY_API Vec256 Ceil(Vec256 v) { return Vec256{ _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 Ceil(Vec256 v) { return Vec256{ _mm256_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } HWY_API Vec256 Ceil(Vec256 v) { return Vec256{ _mm256_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } // Toward -infinity, aka 
// floor

// NOTE(review): template parameter lists and template arguments are missing
// from this text (bare `template`, `Vec256`, `Mask256`, `reinterpret_cast(`).
// The lane type of each overload below can therefore only be inferred from the
// intrinsic suffix (_ph / _ps / _pd / _si256). Restore them from the upstream
// file before compiling.
#if HWY_HAVE_FLOAT16
// _ph overload: roundscale with _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC.
HWY_API Vec256 Floor(Vec256 v) {
  return Vec256{
      _mm256_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}
#endif  // HWY_HAVE_FLOAT16
// _ps overload: same rounding flags, via _mm256_round_ps.
HWY_API Vec256 Floor(Vec256 v) {
  return Vec256{
      _mm256_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}
// _pd overload: same rounding flags, via _mm256_round_pd.
HWY_API Vec256 Floor(Vec256 v) {
  return Vec256{
      _mm256_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)};
}

// ------------------------------ Floating-point classification

#if HWY_HAVE_FLOAT16 || HWY_IDE

// _ph overload: mask of lanes classified as signaling or quiet NaN.
HWY_API Mask256 IsNaN(Vec256 v) {
  return Mask256{_mm256_fpclass_ph_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
}

// _ph overload: mask of lanes classified as negative or positive infinity.
HWY_API Mask256 IsInf(Vec256 v) {
  return Mask256{_mm256_fpclass_ph_mask(
      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
}

// _ph overload: complement of the "NaN or infinity" classification.
HWY_API Mask256 IsFinite(Vec256 v) {
  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
  // and negate the mask.
  return Not(Mask256{_mm256_fpclass_ph_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
}

#endif  // HWY_HAVE_FLOAT16

// _ps overload. AVX3 targets classify SNAN/QNAN with fpclass; other targets
// compare v against itself with the unordered predicate _CMP_UNORD_Q.
HWY_API Mask256 IsNaN(Vec256 v) {
#if HWY_TARGET <= HWY_AVX3
  return Mask256{_mm256_fpclass_ps_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
#else
  return Mask256{_mm256_cmp_ps(v.raw, v.raw, _CMP_UNORD_Q)};
#endif
}

// _pd overload; same two code paths as the _ps overload above.
HWY_API Mask256 IsNaN(Vec256 v) {
#if HWY_TARGET <= HWY_AVX3
  return Mask256{_mm256_fpclass_pd_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)};
#else
  return Mask256{_mm256_cmp_pd(v.raw, v.raw, _CMP_UNORD_Q)};
#endif
}

// The fpclass-based IsInf/IsFinite overloads below are only defined for AVX3
// targets in this file section.
#if HWY_TARGET <= HWY_AVX3

// _ps overload: mask of lanes classified as negative or positive infinity.
HWY_API Mask256 IsInf(Vec256 v) {
  return Mask256{_mm256_fpclass_ps_mask(
      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
}

// _pd overload of the same check.
HWY_API Mask256 IsInf(Vec256 v) {
  return Mask256{_mm256_fpclass_pd_mask(
      v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)};
}

// _ps overload: complement of the "NaN or infinity" classification.
HWY_API Mask256 IsFinite(Vec256 v) {
  // fpclass doesn't have a flag for positive, so we have to check for inf/NaN
  // and negate the mask.
  return Not(Mask256{_mm256_fpclass_ps_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
}

// _pd overload; same approach as the _ps overload above.
HWY_API Mask256 IsFinite(Vec256 v) {
  return Not(Mask256{_mm256_fpclass_pd_mask(
      v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN |
                 HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)});
}

#endif  // HWY_TARGET <= HWY_AVX3

// ================================================== MEMORY

// ------------------------------ Load

// Load through _mm256_load_si256. The pointer parameter is named `aligned`;
// it must meet that intrinsic's alignment requirement.
template
HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) {
  return VFromD{
      _mm256_load_si256(reinterpret_cast(aligned))};
}
// bfloat16_t is handled by x86_128-inl.h.
#if HWY_HAVE_FLOAT16
// float16_t overload, via _mm256_load_ph.
template
HWY_API Vec256 Load(D /* tag */, const float16_t* HWY_RESTRICT aligned) {
  return Vec256{_mm256_load_ph(aligned)};
}
#endif
// float overload, via _mm256_load_ps.
template
HWY_API Vec256 Load(D /* tag */, const float* HWY_RESTRICT aligned) {
  return Vec256{_mm256_load_ps(aligned)};
}
// double overload, via _mm256_load_pd.
template
HWY_API Vec256 Load(D /* tag */, const double* HWY_RESTRICT aligned) {
  return Vec256{_mm256_load_pd(aligned)};
}

// Unaligned counterpart of Load, via _mm256_loadu_si256.
template
HWY_API VFromD LoadU(D /* tag */, const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_loadu_si256(reinterpret_cast(p))};
}
// bfloat16_t is handled by x86_128-inl.h.
// NOTE(review): template parameter lists and template arguments are missing
// from this text (bare `template`, `VFromD`, `RebindToUnsigned du`,
// `Full128> d128`, `reinterpret_cast(p)`). Which lane types/sizes each overload
// is constrained to is therefore not visible here; comments below rely only on
// the intrinsic names. Restore from the upstream file before compiling.
#if HWY_HAVE_FLOAT16
// float16_t overload of the unaligned load, via _mm256_loadu_ph.
template
HWY_API Vec256 LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return Vec256{_mm256_loadu_ph(p)};
}
#endif
// float overload, via _mm256_loadu_ps.
template
HWY_API Vec256 LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec256{_mm256_loadu_ps(p)};
}
// double overload, via _mm256_loadu_pd.
template
HWY_API Vec256 LoadU(D /* tag */, const double* HWY_RESTRICT p) {
  return Vec256{_mm256_loadu_pd(p)};
}

// ------------------------------ MaskedLoad

#if HWY_TARGET <= HWY_AVX3

// AVX3 path: each overload forwards the mask to a _mm256_maskz_loadu_*
// intrinsic (epi8 / epi16 / epi32 / epi64 / ps / pd).
template
HWY_API VFromD MaskedLoad(MFromD m, D /* tag */,
                          const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_maskz_loadu_epi8(m.raw, p)};
}

// NOTE(review): `du` looks unused in the visible text; presumably it appeared
// inside the stripped template argument of VFromD. Confirm against upstream.
template
HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) {
  const RebindToUnsigned du;  // for float16_t
  return BitCast(d, VFromD{_mm256_maskz_loadu_epi16(m.raw, p)});
}

template
HWY_API VFromD MaskedLoad(MFromD m, D /* tag */,
                          const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_maskz_loadu_epi32(m.raw, p)};
}

template
HWY_API VFromD MaskedLoad(MFromD m, D /* tag */,
                          const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_maskz_loadu_epi64(m.raw, p)};
}

template
HWY_API Vec256 MaskedLoad(Mask256 m, D /* tag */,
                          const float* HWY_RESTRICT p) {
  return Vec256{_mm256_maskz_loadu_ps(m.raw, p)};
}

template
HWY_API Vec256 MaskedLoad(Mask256 m, D /* tag */,
                          const double* HWY_RESTRICT p) {
  return Vec256{_mm256_maskz_loadu_pd(m.raw, p)};
}

// MaskedLoadOr: same as above but using the _mm256_mask_loadu_* intrinsics,
// with `v` passed as their first (source) operand.
template
HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */,
                            const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_mask_loadu_epi8(v.raw, m.raw, p)};
}

// 16-bit variant: `v` is bit-cast to the unsigned tag `du` before the
// intrinsic and the result is cast back to `d`.
template
HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d,
                            const TFromD* HWY_RESTRICT p) {
  const RebindToUnsigned du;  // for float16_t
  return BitCast(d, VFromD{
                        _mm256_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)});
}

template
HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */,
                            const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_mask_loadu_epi32(v.raw, m.raw, p)};
}

template
HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */,
                            const TFromD* HWY_RESTRICT p) {
  return VFromD{_mm256_mask_loadu_epi64(v.raw, m.raw, p)};
}

template
HWY_API Vec256 MaskedLoadOr(VFromD v, Mask256 m, D /* tag */,
                            const float* HWY_RESTRICT p) {
  return Vec256{_mm256_mask_loadu_ps(v.raw, m.raw, p)};
}

template
HWY_API Vec256 MaskedLoadOr(VFromD v, Mask256 m, D /* tag */,
                            const double* HWY_RESTRICT p) {
  return Vec256{_mm256_mask_loadu_pd(v.raw, m.raw, p)};
}

#else  // AVX2

// There is no maskload_epi8/16, so blend instead.
// (This overload performs a full LoadU and then zeroes unselected lanes.)
template
HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) {
  return IfThenElseZero(m, LoadU(d, p));
}

// AVX2 maskload for the epi32 intrinsic; the pointer is reinterpreted first.
template
HWY_API VFromD MaskedLoad(MFromD m, D /* tag */,
                          const TFromD* HWY_RESTRICT p) {
  auto pi = reinterpret_cast(p);  // NOLINT
  return VFromD{_mm256_maskload_epi32(pi, m.raw)};
}

// AVX2 maskload for the epi64 intrinsic.
template
HWY_API VFromD MaskedLoad(MFromD m, D /* tag */,
                          const TFromD* HWY_RESTRICT p) {
  auto pi = reinterpret_cast(p);  // NOLINT
  return VFromD{_mm256_maskload_epi64(pi, m.raw)};
}

// float overload: the mask is converted to a vector and bit-cast to the
// signed-integer tag because _mm256_maskload_ps takes its mask as a vector.
template
HWY_API Vec256 MaskedLoad(Mask256 m, D d, const float* HWY_RESTRICT p) {
  const Vec256 mi =
      BitCast(RebindToSigned(), VecFromMask(d, m));
  return Vec256{_mm256_maskload_ps(p, mi.raw)};
}

// double overload; same mask conversion as the float overload.
template
HWY_API Vec256 MaskedLoad(Mask256 m, D d, const double* HWY_RESTRICT p) {
  const Vec256 mi =
      BitCast(RebindToSigned(), VecFromMask(d, m));
  return Vec256{_mm256_maskload_pd(p, mi.raw)};
}

#endif

// ------------------------------ LoadDup128

// Loads 128 bit and duplicates into both 128-bit halves. This avoids the
// 3-cycle cost of moving data between 128-bit halves and avoids port 5.
template
HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) {
  const RebindToUnsigned du;
  const Full128> d128;
  const RebindToUnsigned du128;
  const __m128i v128 = BitCast(du128, LoadU(d128, p)).raw;
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  // Workaround for incorrect results with _mm256_broadcastsi128_si256. Note
  // that MSVC also lacks _mm256_zextsi128_si256, but cast (which leaves the
  // upper half undefined) is fine because we're overwriting that anyway.
  // This workaround seems in turn to generate incorrect code in MSVC 2022
  // (19.31), so use broadcastsi128 there.
  return BitCast(d, VFromD{_mm256_inserti128_si256(
                        _mm256_castsi128_si256(v128), v128, 1)});
#else
  // The preferred path. This is perhaps surprising, because vbroadcasti128
  // with xmm input has 7 cycle latency on Intel, but Clang >= 7 is able to
  // pattern-match this to vbroadcastf128 with a memory operand as desired.
  return BitCast(d, VFromD{_mm256_broadcastsi128_si256(v128)});
#endif
}

// float overload: older MSVC (< 1931) loads 128 bits and inserts them into the
// upper half; other compilers use _mm256_broadcast_ps directly on `p`.
template
HWY_API Vec256 LoadDup128(D /* tag */, const float* HWY_RESTRICT p) {
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  const Full128 d128;
  const __m128 v128 = LoadU(d128, p).raw;
  return Vec256{
      _mm256_insertf128_ps(_mm256_castps128_ps256(v128), v128, 1)};
#else
  return Vec256{_mm256_broadcast_ps(reinterpret_cast(p))};
#endif
}

// double overload; same two code paths as the float overload.
template
HWY_API Vec256 LoadDup128(D /* tag */, const double* HWY_RESTRICT p) {
#if HWY_COMPILER_MSVC && HWY_COMPILER_MSVC < 1931
  const Full128 d128;
  const __m128d v128 = LoadU(d128, p).raw;
  return Vec256{
      _mm256_insertf128_pd(_mm256_castpd128_pd256(v128), v128, 1)};
#else
  return Vec256{
      _mm256_broadcast_pd(reinterpret_cast(p))};
#endif
}

// ------------------------------ Store

// Store through _mm256_store_si256; the pointer parameter is named `aligned`
// and must meet that intrinsic's alignment requirement.
template
HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) {
  _mm256_store_si256(reinterpret_cast<__m256i*>(aligned), v.raw);
}
#if HWY_HAVE_FLOAT16
// float16_t overload, via _mm256_store_ph.
template
HWY_API void Store(Vec256 v, D /* tag */,
                   float16_t* HWY_RESTRICT aligned) {
  _mm256_store_ph(aligned, v.raw);
}
#endif  // HWY_HAVE_FLOAT16
// float overload, via _mm256_store_ps.
template
HWY_API void Store(Vec256 v, D /* tag */, float* HWY_RESTRICT aligned) {
  _mm256_store_ps(aligned, v.raw);
}
// double overload, via _mm256_store_pd.
template
HWY_API void Store(Vec256 v, D /* tag */,
                   double* HWY_RESTRICT aligned) {
  _mm256_store_pd(aligned, v.raw);
}

// Unaligned counterpart of Store, via _mm256_storeu_si256.
template
HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) {
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), v.raw);
}
#if HWY_HAVE_FLOAT16
// float16_t overload, via _mm256_storeu_ph.
template
HWY_API void StoreU(Vec256 v, D /* tag */,
                    float16_t* HWY_RESTRICT p) {
  _mm256_storeu_ph(p, v.raw);
}
#endif
// float overload, via _mm256_storeu_ps.
template
HWY_API void StoreU(Vec256 v, D /* tag */, float* HWY_RESTRICT p) {
  _mm256_storeu_ps(p, v.raw);
}
// double overload, via _mm256_storeu_pd.
template
HWY_API void StoreU(Vec256 v, D /* tag */, double* HWY_RESTRICT p) {
  _mm256_storeu_pd(p, v.raw);
}

// ------------------------------ BlendedStore

#if HWY_TARGET <= HWY_AVX3

// AVX3 path: each overload forwards to a _mm256_mask_storeu_* intrinsic
// (epi8 / epi16 / epi32 / epi64 / ps / pd).
template
HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */,
                          TFromD* HWY_RESTRICT p) {
  _mm256_mask_storeu_epi8(p, m.raw, v.raw);
}

// 16-bit variant: mask and vector are rebound/bit-cast to the unsigned tag.
template
HWY_API void BlendedStore(VFromD v, MFromD m, D d,
                          TFromD* HWY_RESTRICT p) {
  const RebindToUnsigned du;  // for float16_t
  _mm256_mask_storeu_epi16(reinterpret_cast(p),
                           RebindMask(du, m).raw, BitCast(du, v).raw);
}

template
HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */,
                          TFromD* HWY_RESTRICT p) {
  _mm256_mask_storeu_epi32(p, m.raw, v.raw);
}

template
HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */,
                          TFromD* HWY_RESTRICT p) {
  _mm256_mask_storeu_epi64(p, m.raw, v.raw);
}

template
HWY_API void BlendedStore(Vec256 v, Mask256 m, D /* tag */,
                          float* HWY_RESTRICT p) {
  _mm256_mask_storeu_ps(p, m.raw, v.raw);
}

template
HWY_API void BlendedStore(Vec256 v, Mask256 m, D /* tag */,
                          double* HWY_RESTRICT p) {
  _mm256_mask_storeu_pd(p, m.raw, v.raw);
}

#else  // AVX2

// Intel SDM says "No AC# reported for any mask bit combinations". However, AMD
// allows AC# if "Alignment checking enabled and: 256-bit memory operand not
// 32-byte aligned". Fortunately AC# is not enabled by default and requires both
// OS support (CR0) and the application to set rflags.AC. We assume these remain
// disabled because x86/x64 code and compiler output often contain misaligned
// scalar accesses, which would also fault.
//
// Caveat: these are slow on AMD Jaguar/Bulldozer.

template
HWY_API void BlendedStore(VFromD v, MFromD m, D d,
                          TFromD* HWY_RESTRICT p) {
  // There is no maskload_epi8/16. Blending is also unsafe because loading a
  // full vector that crosses the array end causes asan faults. Resort to scalar
  // code; the caller should instead use memcpy, assuming m is FirstN(d, n).
  const RebindToUnsigned du;
  using TU = TFromD;
  // Spill both the data and the mask (as a vector) to aligned stack buffers.
  alignas(32) TU buf[MaxLanes(d)];
  alignas(32) TU mask[MaxLanes(d)];
  Store(BitCast(du, v), du, buf);
  Store(BitCast(du, VecFromMask(d, m)), du, mask);
  // Copy only the lanes whose spilled mask value is non-zero.
  for (size_t i = 0; i < MaxLanes(d); ++i) {
    if (mask[i]) {
      CopySameSize(buf + i, p + i);
    }
  }
}

// AVX2 maskstore for the epi32 intrinsic; the pointer is reinterpreted first.
template
HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */,
                          TFromD* HWY_RESTRICT p) {
  auto pi = reinterpret_cast(p);  // NOLINT
  _mm256_maskstore_epi32(pi, m.raw, v.raw);
}

// AVX2 maskstore for the epi64 intrinsic.
template
HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */,
                          TFromD* HWY_RESTRICT p) {
  auto pi = reinterpret_cast(p);  // NOLINT
  _mm256_maskstore_epi64(pi, m.raw, v.raw);
}

// float overload: the mask is converted to a vector and bit-cast to the
// signed-integer tag because _mm256_maskstore_ps takes its mask as a vector.
template
HWY_API void BlendedStore(Vec256 v, Mask256 m, D d,
                          float* HWY_RESTRICT p) {
  const Vec256 mi =
      BitCast(RebindToSigned(), VecFromMask(d, m));
  _mm256_maskstore_ps(p, mi.raw, v.raw);
}

// double overload; same mask conversion as the float overload.
template
HWY_API void BlendedStore(Vec256 v, Mask256 m, D d,
                          double* HWY_RESTRICT p) {
  const Vec256 mi =
      BitCast(RebindToSigned(), VecFromMask(d, m));
  _mm256_maskstore_pd(p, mi.raw, v.raw);
}

#endif

// ------------------------------ Non-temporal stores

// Streams via _mm256_stream_si256 after bit-casting `v` to the unsigned tag.
template
HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) {
  const RebindToUnsigned du;  // for float16_t
  _mm256_stream_si256(reinterpret_cast<__m256i*>(aligned), BitCast(du, v).raw);
}
// float overload, via _mm256_stream_ps.
template
HWY_API void Stream(Vec256 v, D /* tag */, float* HWY_RESTRICT aligned) {
  _mm256_stream_ps(aligned, v.raw);
}
// double overload, via _mm256_stream_pd.
template
HWY_API void Stream(Vec256 v, D /* tag */,
                    double* HWY_RESTRICT aligned) {
  _mm256_stream_pd(aligned, v.raw);
}

// ------------------------------ ScatterOffset

// Work around warnings in the intrinsic definitions (passing -1 as a mask).
HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") #if HWY_TARGET <= HWY_AVX3 template HWY_API void ScatterOffset(VFromD v, D /* tag */, TFromD* HWY_RESTRICT base, Vec256 offset) { _mm256_i32scatter_epi32(base, offset.raw, v.raw, 1); } template HWY_API void ScatterOffset(VFromD v, D /* tag */, TFromD* HWY_RESTRICT base, Vec256 offset) { _mm256_i64scatter_epi64(base, offset.raw, v.raw, 1); } template HWY_API void ScatterOffset(VFromD v, D /* tag */, float* HWY_RESTRICT base, const Vec256 offset) { _mm256_i32scatter_ps(base, offset.raw, v.raw, 1); } template HWY_API void ScatterOffset(VFromD v, D /* tag */, double* HWY_RESTRICT base, const Vec256 offset) { _mm256_i64scatter_pd(base, offset.raw, v.raw, 1); } // ------------------------------ ScatterIndex template HWY_API void ScatterIndex(VFromD v, D /* tag */, TFromD* HWY_RESTRICT base, VFromD> index) { _mm256_i32scatter_epi32(base, index.raw, v.raw, 4); } template HWY_API void ScatterIndex(VFromD v, D /* tag */, TFromD* HWY_RESTRICT base, VFromD> index) { _mm256_i64scatter_epi64(base, index.raw, v.raw, 8); } template HWY_API void ScatterIndex(VFromD v, D /* tag */, float* HWY_RESTRICT base, VFromD> index) { _mm256_i32scatter_ps(base, index.raw, v.raw, 4); } template HWY_API void ScatterIndex(VFromD v, D /* tag */, double* HWY_RESTRICT base, VFromD> index) { _mm256_i64scatter_pd(base, index.raw, v.raw, 8); } // ------------------------------ MaskedScatterIndex template HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, TFromD* HWY_RESTRICT base, VFromD> index) { _mm256_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, 4); } template HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, TFromD* HWY_RESTRICT base, VFromD> index) { _mm256_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, 8); } template HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, float* HWY_RESTRICT base, VFromD> index) { _mm256_mask_i32scatter_ps(base, m.raw, 
index.raw, v.raw, 4); } template HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D /* tag */, double* HWY_RESTRICT base, VFromD> index) { _mm256_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, 8); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Gather namespace detail { template HWY_INLINE Vec256 NativeGather256(const T* HWY_RESTRICT base, Vec256 indices) { return Vec256{_mm256_i32gather_epi32( reinterpret_cast(base), indices.raw, kScale)}; } template HWY_INLINE Vec256 NativeGather256(const T* HWY_RESTRICT base, Vec256 indices) { return Vec256{_mm256_i64gather_epi64( reinterpret_cast(base), indices.raw, kScale)}; } template HWY_API Vec256 NativeGather256(const float* HWY_RESTRICT base, Vec256 indices) { return Vec256{_mm256_i32gather_ps(base, indices.raw, kScale)}; } template HWY_API Vec256 NativeGather256(const double* HWY_RESTRICT base, Vec256 indices) { return Vec256{_mm256_i64gather_pd(base, indices.raw, kScale)}; } } // namespace detail template HWY_API VFromD GatherOffset(D d, const TFromD* HWY_RESTRICT base, VFromD> offsets) { const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di)))); return detail::NativeGather256<1>(base, offsets); } template HWY_API VFromD GatherIndex(D d, const TFromD* HWY_RESTRICT base, VFromD> indices) { const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); return detail::NativeGather256)>(base, indices); } // ------------------------------ MaskedGatherIndexOr namespace detail { template HWY_INLINE Vec256 NativeMaskedGatherOr256(Vec256 no, Mask256 m, const T* HWY_RESTRICT base, Vec256 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_mmask_i32gather_epi32( no.raw, m.raw, indices.raw, reinterpret_cast(base), kScale)}; #else return Vec256{_mm256_mask_i32gather_epi32( no.raw, reinterpret_cast(base), indices.raw, m.raw, kScale)}; #endif } template HWY_INLINE Vec256 NativeMaskedGatherOr256(Vec256 no, Mask256 m, 
const T* HWY_RESTRICT base, Vec256 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_mmask_i64gather_epi64( no.raw, m.raw, indices.raw, reinterpret_cast(base), kScale)}; #else // For reasons unknown, _mm256_mask_i64gather_epi64 returns all-zeros. const Full256 d; const Full256 dd; return BitCast(d, Vec256{_mm256_mask_i64gather_pd( BitCast(dd, no).raw, reinterpret_cast(base), indices.raw, RebindMask(dd, m).raw, kScale)}); #endif } template HWY_API Vec256 NativeMaskedGatherOr256(Vec256 no, Mask256 m, const float* HWY_RESTRICT base, Vec256 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec256{ _mm256_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)}; #else return Vec256{ _mm256_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)}; #endif } template HWY_API Vec256 NativeMaskedGatherOr256(Vec256 no, Mask256 m, const double* HWY_RESTRICT base, Vec256 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec256{ _mm256_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)}; #else return Vec256{ _mm256_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)}; #endif } } // namespace detail template HWY_API VFromD MaskedGatherIndexOr(VFromD no, MFromD m, D d, const TFromD* HWY_RESTRICT base, VFromD> indices) { const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); return detail::NativeMaskedGatherOr256)>(no, m, base, indices); } HWY_DIAGNOSTICS(pop) // ================================================== SWIZZLE // ------------------------------ LowerHalf template HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { return VFromD{_mm256_castsi256_si128(v.raw)}; } template HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { return Vec128{_mm256_castsi256_si128(v.raw)}; } template HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { #if HWY_HAVE_FLOAT16 return Vec128{_mm256_castph256_ph128(v.raw)}; #else return Vec128{_mm256_castsi256_si128(v.raw)}; #endif // HWY_HAVE_FLOAT16 } template HWY_API Vec128 
LowerHalf(D /* tag */, Vec256 v) { return Vec128{_mm256_castps256_ps128(v.raw)}; } template HWY_API Vec128 LowerHalf(D /* tag */, Vec256 v) { return Vec128{_mm256_castpd256_pd128(v.raw)}; } template HWY_API Vec128 LowerHalf(Vec256 v) { const Full128 dh; return LowerHalf(dh, v); } // ------------------------------ UpperHalf template HWY_API VFromD UpperHalf(D d, VFromD> v) { const RebindToUnsigned du; // for float16_t const Twice dut; return BitCast(d, VFromD{ _mm256_extracti128_si256(BitCast(dut, v).raw, 1)}); } template HWY_API VFromD UpperHalf(D /* tag */, Vec256 v) { return VFromD{_mm256_extractf128_ps(v.raw, 1)}; } template HWY_API VFromD UpperHalf(D /* tag */, Vec256 v) { return VFromD{_mm256_extractf128_pd(v.raw, 1)}; } // ------------------------------ ExtractLane (Store) template HWY_API T ExtractLane(const Vec256 v, size_t i) { const DFromV d; HWY_DASSERT(i < Lanes(d)); #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang constexpr size_t kLanesPerBlock = 16 / sizeof(T); if (__builtin_constant_p(i < kLanesPerBlock) && (i < kLanesPerBlock)) { return ExtractLane(LowerHalf(Half(), v), i); } #endif alignas(32) T lanes[32 / sizeof(T)]; Store(v, d, lanes); return lanes[i]; } // ------------------------------ InsertLane (Store) template HWY_API Vec256 InsertLane(const Vec256 v, size_t i, T t) { return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } // ------------------------------ GetLane (LowerHalf) template HWY_API T GetLane(const Vec256 v) { return GetLane(LowerHalf(v)); } // ------------------------------ ExtractBlock (LowerHalf, UpperHalf) template HWY_API Vec128 ExtractBlock(Vec256 v) { static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index"); const Half> dh; return (kBlockIdx == 0) ? LowerHalf(dh, v) : UpperHalf(dh, v); } // ------------------------------ ZeroExtendVector // Unfortunately the initial _mm256_castsi128_si256 intrinsic leaves the upper // bits undefined. 
Although it makes sense for them to be zero (VEX encoded // 128-bit instructions zero the upper lanes to avoid large penalties), a // compiler could decide to optimize out code that relies on this. // // The newer _mm256_zextsi128_si256 intrinsic fixes this by specifying the // zeroing, but it is not available on MSVC until 1920 nor GCC until 10.1. // Unfortunately as of 2023-08 it still seems to cause internal compiler errors // on MSVC, so we consider it unavailable there. // // Without zext we can still possibly obtain the desired code thanks to pattern // recognition; note that the expensive insert instruction might not actually be // generated, see https://gcc.godbolt.org/z/1MKGaP. #if !defined(HWY_HAVE_ZEXT) #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG >= 500) || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL >= 1000) #define HWY_HAVE_ZEXT 1 #else #define HWY_HAVE_ZEXT 0 #endif #endif // defined(HWY_HAVE_ZEXT) template HWY_API VFromD ZeroExtendVector(D /* tag */, VFromD> lo) { #if HWY_HAVE_ZEXT return VFromD{_mm256_zextsi128_si256(lo.raw)}; #elif HWY_COMPILER_MSVC // Workaround: _mm256_inserti128_si256 does not actually zero the hi part. 
return VFromD{_mm256_set_m128i(_mm_setzero_si128(), lo.raw)}; #else return VFromD{_mm256_inserti128_si256(_mm256_setzero_si256(), lo.raw, 0)}; #endif } #if HWY_HAVE_FLOAT16 template HWY_API Vec256 ZeroExtendVector(D d, Vec128 lo) { #if HWY_HAVE_ZEXT (void)d; return Vec256{_mm256_zextph128_ph256(lo.raw)}; #else const RebindToUnsigned du; return BitCast(d, ZeroExtendVector(du, BitCast(du, lo))); #endif // HWY_HAVE_ZEXT } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec256 ZeroExtendVector(D /* tag */, Vec128 lo) { #if HWY_HAVE_ZEXT return Vec256{_mm256_zextps128_ps256(lo.raw)}; #else return Vec256{_mm256_insertf128_ps(_mm256_setzero_ps(), lo.raw, 0)}; #endif } template HWY_API Vec256 ZeroExtendVector(D /* tag */, Vec128 lo) { #if HWY_HAVE_ZEXT return Vec256{_mm256_zextpd128_pd256(lo.raw)}; #else return Vec256{_mm256_insertf128_pd(_mm256_setzero_pd(), lo.raw, 0)}; #endif } // ------------------------------ ZeroExtendResizeBitCast namespace detail { template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag<8> /* from_size_tag */, hwy::SizeTag<32> /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Twice dt_from; const Twice dq_from; return BitCast(d_to, ZeroExtendVector(dq_from, ZeroExtendVector(dt_from, v))); } } // namespace detail // ------------------------------ Combine template HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) { const RebindToUnsigned du; // for float16_t const Half dh_u; const auto lo256 = ZeroExtendVector(du, BitCast(dh_u, lo)); return BitCast(d, VFromD{_mm256_inserti128_si256( lo256.raw, BitCast(dh_u, hi).raw, 1)}); } template HWY_API Vec256 Combine(D d, Vec128 hi, Vec128 lo) { const auto lo256 = ZeroExtendVector(d, lo); return Vec256{_mm256_insertf128_ps(lo256.raw, hi.raw, 1)}; } template HWY_API Vec256 Combine(D d, Vec128 hi, Vec128 lo) { const auto lo256 = ZeroExtendVector(d, lo); return Vec256{_mm256_insertf128_pd(lo256.raw, hi.raw, 1)}; } // ------------------------------ ShiftLeftBytes template HWY_API VFromD 
ShiftLeftBytes(D /* tag */, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); // This is the same operation as _mm256_bslli_epi128. return VFromD{_mm256_slli_si256(v.raw, kBytes)}; } // ------------------------------ ShiftRightBytes template HWY_API VFromD ShiftRightBytes(D /* tag */, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); // This is the same operation as _mm256_bsrli_epi128. return VFromD{_mm256_srli_si256(v.raw, kBytes)}; } // ------------------------------ CombineShiftRightBytes template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { const Repartition d8; return BitCast(d, Vec256{_mm256_alignr_epi8( BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); } // ------------------------------ Broadcast template HWY_API Vec256 Broadcast(const Vec256 v) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); // for float16_t static_assert(0 <= kLane && kLane < 8, "Invalid lane"); if (kLane < 4) { const __m256i lo = _mm256_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); return BitCast(d, VU{_mm256_unpacklo_epi64(lo, lo)}); } else { const __m256i hi = _mm256_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); return BitCast(d, VU{_mm256_unpackhi_epi64(hi, hi)}); } } template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < 4, "Invalid lane"); return Vec256{_mm256_shuffle_epi32(v.raw, 0x55 * kLane)}; } template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < 2, "Invalid lane"); return Vec256{_mm256_shuffle_epi32(v.raw, kLane ? 
0xEE : 0x44)}; } template HWY_API Vec256 Broadcast(Vec256 v) { static_assert(0 <= kLane && kLane < 4, "Invalid lane"); return Vec256{_mm256_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; } template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < 2, "Invalid lane"); return Vec256{_mm256_shuffle_pd(v.raw, v.raw, 15 * kLane)}; } // ------------------------------ BroadcastBlock template HWY_API Vec256 BroadcastBlock(Vec256 v) { static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index"); const DFromV d; return (kBlockIdx == 0) ? ConcatLowerLower(d, v, v) : ConcatUpperUpper(d, v, v); } // ------------------------------ BroadcastLane namespace detail { template HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const Half> dh; return Vec256{_mm256_broadcastb_epi8(LowerHalf(dh, v).raw)}; } template HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const DFromV d; const RebindToUnsigned du; // for float16_t const Half dh; const RebindToUnsigned dh_u; return BitCast(d, VFromD{_mm256_broadcastw_epi16( BitCast(dh_u, LowerHalf(dh, v)).raw)}); } template HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const Half> dh; return Vec256{_mm256_broadcastd_epi32(LowerHalf(dh, v).raw)}; } template HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const Half> dh; return Vec256{_mm256_broadcastq_epi64(LowerHalf(dh, v).raw)}; } HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const Half> dh; return Vec256{_mm256_broadcastss_ps(LowerHalf(dh, v).raw)}; } HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag<0> /* lane_idx_tag */, Vec256 v) { const Half> dh; return Vec256{_mm256_broadcastsd_pd(LowerHalf(dh, v).raw)}; } template * = nullptr, HWY_IF_NOT_T_SIZE(T, 8)> HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag /* lane_idx_tag */, Vec256 v) { constexpr size_t kLanesPerBlock = 16 / sizeof(T); constexpr int kBlockIdx = 
static_cast(kLaneIdx / kLanesPerBlock); constexpr int kLaneInBlkIdx = static_cast(kLaneIdx) & (kLanesPerBlock - 1); return Broadcast(BroadcastBlock(v)); } template * = nullptr, HWY_IF_UI64(T)> HWY_INLINE Vec256 BroadcastLane(hwy::SizeTag /* lane_idx_tag */, Vec256 v) { static_assert(kLaneIdx <= 3, "Invalid lane"); return Vec256{ _mm256_permute4x64_epi64(v.raw, static_cast(0x55 * kLaneIdx))}; } template * = nullptr> HWY_INLINE Vec256 BroadcastLane( hwy::SizeTag /* lane_idx_tag */, Vec256 v) { static_assert(kLaneIdx <= 3, "Invalid lane"); return Vec256{ _mm256_permute4x64_pd(v.raw, static_cast(0x55 * kLaneIdx))}; } } // namespace detail template HWY_API Vec256 BroadcastLane(Vec256 v) { static_assert(kLaneIdx >= 0, "Invalid lane"); return detail::BroadcastLane(hwy::SizeTag(kLaneIdx)>(), v); } // ------------------------------ Hard-coded shuffles // Notation: let Vec256 have lanes 7,6,5,4,3,2,1,0 (0 is // least-significant). Shuffle0321 rotates four-lane blocks one lane to the // right (the previous least-significant lane is now most-significant => // 47650321). These could also be implemented via CombineShiftRightBytes but // the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. 
// Integer lanes (immediate 0xB1 selects dwords 1,0,3,2 per block).
template
HWY_API Vec256 Shuffle2301(const Vec256 v) {
  const __m256i swapped = _mm256_shuffle_epi32(v.raw, 0xB1);
  return Vec256{swapped};
}
// Single-precision lanes.
HWY_API Vec256 Shuffle2301(const Vec256 v) {
  const __m256 swapped = _mm256_shuffle_ps(v.raw, v.raw, 0xB1);
  return Vec256{swapped};
}

// Used by generic_ops-inl.h
namespace detail {

// Two-input shuffles implemented in the float domain: both inputs are
// reinterpreted as float, combined with _mm256_shuffle_ps using the immediate
// named by the function suffix, and cast back.

template
HWY_API Vec256 ShuffleTwo2301(const Vec256 a, const Vec256 b) {
  const DFromV d;
  const RebindToFloat df;
  constexpr int kImm = _MM_SHUFFLE(2, 3, 0, 1);
  return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw,
                                             BitCast(df, b).raw, kImm)});
}

template
HWY_API Vec256 ShuffleTwo1230(const Vec256 a, const Vec256 b) {
  const DFromV d;
  const RebindToFloat df;
  constexpr int kImm = _MM_SHUFFLE(1, 2, 3, 0);
  return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw,
                                             BitCast(df, b).raw, kImm)});
}

template
HWY_API Vec256 ShuffleTwo3012(const Vec256 a, const Vec256 b) {
  const DFromV d;
  const RebindToFloat df;
  constexpr int kImm = _MM_SHUFFLE(3, 0, 1, 2);
  return BitCast(d, Vec256{_mm256_shuffle_ps(BitCast(df, a).raw,
                                             BitCast(df, b).raw, kImm)});
}

}  // namespace detail

// Swap 64-bit halves
HWY_API Vec256 Shuffle1032(const Vec256 v) {
  const __m256i swapped = _mm256_shuffle_epi32(v.raw, 0x4E);
  return Vec256{swapped};
}
HWY_API Vec256 Shuffle1032(const Vec256 v) {
  const __m256i swapped = _mm256_shuffle_epi32(v.raw, 0x4E);
  return Vec256{swapped};
}
HWY_API Vec256 Shuffle1032(const Vec256 v) {
  // Shorter encoding than _mm256_permute_ps.
  const __m256 swapped = _mm256_shuffle_ps(v.raw, v.raw, 0x4E);
  return Vec256{swapped};
}
// Same swap, named for vectors viewed as two 64-bit lanes per block.
HWY_API Vec256 Shuffle01(const Vec256 v) {
  const __m256i swapped = _mm256_shuffle_epi32(v.raw, 0x4E);
  return Vec256{swapped};
}
HWY_API Vec256 Shuffle01(const Vec256 v) {
  const __m256i swapped = _mm256_shuffle_epi32(v.raw, 0x4E);
  return Vec256{swapped};
}
HWY_API Vec256 Shuffle01(const Vec256 v) {
  // Shorter encoding than _mm256_permute_pd.
  const __m256d swapped = _mm256_shuffle_pd(v.raw, v.raw, 5);
  return Vec256{swapped};
}

// Rotate right 32 bits
HWY_API Vec256 Shuffle0321(const Vec256 v) {
  const __m256i rotated = _mm256_shuffle_epi32(v.raw, 0x39);
  return Vec256{rotated};
}
HWY_API Vec256 Shuffle0321(const Vec256 v) {
  const __m256i rotated = _mm256_shuffle_epi32(v.raw, 0x39);
  return Vec256{rotated};
}
HWY_API Vec256 Shuffle0321(const Vec256 v) {
  const __m256 rotated = _mm256_shuffle_ps(v.raw, v.raw, 0x39);
  return Vec256{rotated};
}

// Rotate left 32 bits
HWY_API Vec256 Shuffle2103(const Vec256 v) {
  const __m256i rotated = _mm256_shuffle_epi32(v.raw, 0x93);
  return Vec256{rotated};
}
HWY_API Vec256 Shuffle2103(const Vec256 v) {
  const __m256i rotated = _mm256_shuffle_epi32(v.raw, 0x93);
  return Vec256{rotated};
}
HWY_API Vec256 Shuffle2103(const Vec256 v) {
  const __m256 rotated = _mm256_shuffle_ps(v.raw, v.raw, 0x93);
  return Vec256{rotated};
}

// Reverse
HWY_API Vec256 Shuffle0123(const Vec256 v) {
  const __m256i reversed = _mm256_shuffle_epi32(v.raw, 0x1B);
  return Vec256{reversed};
}
HWY_API Vec256 Shuffle0123(const Vec256 v) {
  const __m256i reversed = _mm256_shuffle_epi32(v.raw, 0x1B);
  return Vec256{reversed};
}
HWY_API Vec256 Shuffle0123(const Vec256 v) {
  const __m256 reversed = _mm256_shuffle_ps(v.raw, v.raw, 0x1B);
  return Vec256{reversed};
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes.
template struct Indices256 { __m256i raw; }; // 8-bit lanes: indices remain unchanged template HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Full256 di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); #endif return Indices256>{vec.raw}; } // 16-bit lanes: convert indices to 32x8 unless AVX3 is available template HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); const Full256 di; #if HWY_IS_DEBUG_BUILD HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); #endif #if HWY_TARGET <= HWY_AVX3 (void)di; return Indices256>{vec.raw}; #else const Repartition d8; using V8 = VFromD; alignas(32) static constexpr uint8_t kByteOffsets[32] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; // Broadcast each lane index to all 2 bytes of T alignas(32) static constexpr uint8_t kBroadcastLaneBytes[32] = { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); // Shift to bytes const Repartition d16; const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); return Indices256>{Add(byte_indices, Load(d8, kByteOffsets)).raw}; #endif // HWY_TARGET <= HWY_AVX3 } // Native 8x32 instruction: indices remain unchanged template HWY_API Indices256> IndicesFromVec(D /* tag */, Vec256 vec) { static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Full256 di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(2 * Lanes(di)))))); #endif return Indices256>{vec.raw}; } // 64-bit lanes: convert indices to 
8x32 unless AVX3 is available template HWY_API Indices256> IndicesFromVec(D d, Vec256 idx64) { static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); const Rebind di; (void)di; // potentially unused #if HWY_IS_DEBUG_BUILD HWY_DASSERT(AllFalse(di, Lt(idx64, Zero(di))) && AllTrue(di, Lt(idx64, Set(di, static_cast(2 * Lanes(di)))))); #endif #if HWY_TARGET <= HWY_AVX3 (void)d; return Indices256>{idx64.raw}; #else const Repartition df; // 32-bit! // Replicate 64-bit index into upper 32 bits const Vec256 dup = BitCast(di, Vec256{_mm256_moveldup_ps(BitCast(df, idx64).raw)}); // For each idx64 i, idx32 are 2*i and 2*i+1. const Vec256 idx32 = dup + dup + Set(di, TI(1) << 32); return Indices256>{idx32.raw}; #endif } template HWY_API Indices256> SetTableIndices(D d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_permutexvar_epi8(idx.raw, v.raw)}; #else const Vec256 idx_vec{idx.raw}; const DFromV d; const Repartition du16; const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); const auto a = ConcatLowerLower(d, v, v); const auto b = ConcatUpperUpper(d, v, v); const auto lo_lookup_result = TableLookupBytes(a, idx_vec); #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_mask_shuffle_epi8( lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; #else const auto hi_lookup_result = TableLookupBytes(b, idx_vec); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #endif // HWY_TARGET <= HWY_AVX3 #endif // HWY_TARGET <= HWY_AVX3_DL } template HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutexvar_epi16(idx.raw, v.raw)}; #else const DFromV d; const Repartition du8; return BitCast( d, TableLookupLanes(BitCast(du8, v), Indices256{idx.raw})); #endif } #if HWY_HAVE_FLOAT16 HWY_API Vec256 
TableLookupLanes(Vec256 v, Indices256 idx) { return Vec256{_mm256_permutexvar_ph(idx.raw, v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; } template HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutexvar_epi64(idx.raw, v.raw)}; #else return Vec256{_mm256_permutevar8x32_epi32(v.raw, idx.raw)}; #endif } HWY_API Vec256 TableLookupLanes(const Vec256 v, const Indices256 idx) { return Vec256{_mm256_permutevar8x32_ps(v.raw, idx.raw)}; } HWY_API Vec256 TableLookupLanes(const Vec256 v, const Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutexvar_pd(idx.raw, v.raw)}; #else const Full256 df; const Full256 du; return BitCast(df, Vec256{_mm256_permutevar8x32_epi32( BitCast(du, v).raw, idx.raw)}); #endif } template HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_permutex2var_epi8(a.raw, idx.raw, b.raw)}; #else const DFromV d; const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<2>(Vec256{idx.raw}))); const auto lo_lookup_result = TableLookupLanes(a, idx); const auto hi_lookup_result = TableLookupLanes(b, idx); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #endif } template HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutex2var_epi16(a.raw, idx.raw, b.raw)}; #else const DFromV d; const Repartition du8; return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), Indices256{idx.raw})); #endif } template HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutex2var_epi32(a.raw, idx.raw, b.raw)}; #else const DFromV d; const RebindToFloat df; const Vec256 idx_vec{idx.raw}; const auto sel_hi_mask = 
MaskFromVec(BitCast(df, ShiftLeft<28>(idx_vec))); const auto lo_lookup_result = BitCast(df, TableLookupLanes(a, idx)); const auto hi_lookup_result = BitCast(df, TableLookupLanes(b, idx)); return BitCast(d, IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); #endif } #if HWY_HAVE_FLOAT16 HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { return Vec256{_mm256_permutex2var_ph(a.raw, idx.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutex2var_ps(a.raw, idx.raw, b.raw)}; #else const DFromV d; const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<28>(Vec256{idx.raw}))); const auto lo_lookup_result = TableLookupLanes(a, idx); const auto hi_lookup_result = TableLookupLanes(b, idx); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #endif } template HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutex2var_epi64(a.raw, idx.raw, b.raw)}; #else const DFromV d; const Repartition du32; return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), Indices256{idx.raw})); #endif } HWY_API Vec256 TwoTablesLookupLanes(Vec256 a, Vec256 b, Indices256 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_permutex2var_pd(a.raw, idx.raw, b.raw)}; #else const DFromV d; const Repartition du32; return BitCast(d, TwoTablesLookupLanes(BitCast(du32, a), BitCast(du32, b), Indices256{idx.raw})); #endif } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { const DFromV d; const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm256_permute4x64_epi64( BitCast(du, v).raw, _MM_SHUFFLE(1, 0, 3, 2))}); } HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { return Vec256{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; } HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { // 
Assume no domain-crossing penalty between float/double (true on SKX). const DFromV d; const RepartitionToWide dw; return BitCast(d, SwapAdjacentBlocks(BitCast(dw, v))); } // ------------------------------ Reverse (RotateRight) template HWY_API VFromD Reverse(D d, const VFromD v) { alignas(32) static constexpr int32_t kReverse[8] = {7, 6, 5, 4, 3, 2, 1, 0}; return TableLookupLanes(v, SetTableIndices(d, kReverse)); } template HWY_API VFromD Reverse(D d, const VFromD v) { alignas(32) static constexpr int64_t kReverse[4] = {3, 2, 1, 0}; return TableLookupLanes(v, SetTableIndices(d, kReverse)); } template HWY_API VFromD Reverse(D d, const VFromD v) { #if HWY_TARGET <= HWY_AVX3 const RebindToSigned di; alignas(32) static constexpr int16_t kReverse[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; const Vec256 idx = Load(di, kReverse); return BitCast(d, Vec256{ _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); #else const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); const auto rev128 = TableLookupBytes(v, shuffle); return VFromD{ _mm256_permute4x64_epi64(rev128.raw, _MM_SHUFFLE(1, 0, 3, 2))}; #endif } template HWY_API VFromD Reverse(D d, const VFromD v) { #if HWY_TARGET <= HWY_AVX3_DL alignas(32) static constexpr TFromD kReverse[32] = { 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; return TableLookupLanes(v, SetTableIndices(d, kReverse)); #else // First reverse bytes within blocks via PSHUFB, then swap blocks. 
alignas(32) static constexpr TFromD kReverse[32] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; return SwapAdjacentBlocks(TableLookupBytes(v, Load(d, kReverse))); #endif } // ------------------------------ Reverse2 (in x86_128) // ------------------------------ Reverse4 (SwapAdjacentBlocks) template HWY_API VFromD Reverse4(D d, const VFromD v) { const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908); return BitCast(d, TableLookupBytes(v, shuffle)); } // 32 bit Reverse4 defined in x86_128. template HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { // Could also use _mm256_permute4x64_epi64. return SwapAdjacentBlocks(Shuffle01(v)); } // ------------------------------ Reverse8 template HWY_API VFromD Reverse8(D d, const VFromD v) { const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); return BitCast(d, TableLookupBytes(v, shuffle)); } template HWY_API VFromD Reverse8(D d, const VFromD v) { return Reverse(d, v); } template HWY_API VFromD Reverse8(D /* tag */, const VFromD /* v */) { HWY_ASSERT(0); // AVX2 does not have 8 64-bit lanes } // ------------------------------ ReverseBits in x86_512 // ------------------------------ InterleaveLower // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides // the least-significant lane) and "b". To concatenate two half-width integers // into one, use ZipLower/Upper instead (also works with scalar). 
template HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{_mm256_unpacklo_epi8(a.raw, b.raw)}; } template HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; // for float16_t return BitCast( d, VU{_mm256_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{_mm256_unpacklo_epi32(a.raw, b.raw)}; } template HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{_mm256_unpacklo_epi64(a.raw, b.raw)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{_mm256_unpacklo_ps(a.raw, b.raw)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{_mm256_unpacklo_pd(a.raw, b.raw)}; } // ------------------------------ InterleaveUpper template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm256_unpackhi_epi8(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const RebindToUnsigned du; using VU = VFromD; // for float16_t return BitCast( d, VU{_mm256_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm256_unpackhi_epi32(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm256_unpackhi_epi64(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm256_unpackhi_ps(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm256_unpackhi_pd(a.raw, b.raw)}; } // ------------------------------ Blocks (LowerHalf, ZeroExtendVector) // _mm256_broadcastsi128_si256 has 7 cycle latency on ICL. // _mm256_permute2x128_si256 is slow on Zen1 (8 uops), so we avoid it (at no // extra cost) for LowerLower and UpperLower. 
// hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const RebindToUnsigned du; // for float16_t const Half d2; const RebindToUnsigned du2; // for float16_t return BitCast( d, VFromD{_mm256_inserti128_si256( BitCast(du, lo).raw, BitCast(du2, LowerHalf(d2, hi)).raw, 1)}); } template HWY_API Vec256 ConcatLowerLower(D d, Vec256 hi, Vec256 lo) { const Half d2; return Vec256{_mm256_insertf128_ps(lo.raw, LowerHalf(d2, hi).raw, 1)}; } template HWY_API Vec256 ConcatLowerLower(D d, Vec256 hi, Vec256 lo) { const Half d2; return Vec256{_mm256_insertf128_pd(lo.raw, LowerHalf(d2, hi).raw, 1)}; } // hiH,hiL loH,loL |-> hiL,loH (= inner halves / swap blocks) template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { const RebindToUnsigned du; return BitCast(d, VFromD{_mm256_permute2x128_si256( BitCast(du, lo).raw, BitCast(du, hi).raw, 0x21)}); } template HWY_API Vec256 ConcatLowerUpper(D /* tag */, Vec256 hi, Vec256 lo) { return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x21)}; } template HWY_API Vec256 ConcatLowerUpper(D /* tag */, Vec256 hi, Vec256 lo) { return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x21)}; } // hiH,hiL loH,loL |-> hiH,loL (= outer halves) template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm256_blend_epi32( BitCast(du, hi).raw, BitCast(du, lo).raw, 0x0F)}); } template HWY_API Vec256 ConcatUpperLower(D /* tag */, Vec256 hi, Vec256 lo) { return Vec256{_mm256_blend_ps(hi.raw, lo.raw, 0x0F)}; } template HWY_API Vec256 ConcatUpperLower(D /* tag */, Vec256 hi, Vec256 lo) { return Vec256{_mm256_blend_pd(hi.raw, lo.raw, 3)}; } // hiH,hiL loH,loL |-> hiH,loH (= upper halves) template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm256_permute2x128_si256( BitCast(du, lo).raw, BitCast(du, hi).raw, 0x31)}); } 
// ConcatUpperUpper, floating-point overloads: 128-bit permute of (lo, hi) with
// control 0x31 (permute2f128_ps / permute2f128_pd).
// NOTE(review): throughout this section the overload constraints are not
// visible in this text; lane widths named in the comments are inferred from
// the intrinsics and index-table element types.
template HWY_API Vec256 ConcatUpperUpper(D /* tag */, Vec256 hi, Vec256 lo) {
  return Vec256{_mm256_permute2f128_ps(lo.raw, hi.raw, 0x31)};
}
template HWY_API Vec256 ConcatUpperUpper(D /* tag */, Vec256 hi, Vec256 lo) {
  return Vec256{_mm256_permute2f128_pd(lo.raw, hi.raw, 0x31)};
}

// ---------------------------- InsertBlock (ConcatLowerLower, ConcatUpperLower)
// Returns `v` with 128-bit block kBlockIdx (0 or 1, enforced by static_assert)
// replaced by `blk_to_insert`. The block is first resized to a 256-bit vector;
// index 0 keeps v's upper half, index 1 keeps v's lower half.
template HWY_API Vec256 InsertBlock(Vec256 v, Vec128 blk_to_insert) {
  static_assert(kBlockIdx == 0 || kBlockIdx == 1, "Invalid block index");

  const DFromV d;
  const auto vec_to_insert = ResizeBitCast(d, blk_to_insert);
  return (kBlockIdx == 0) ? ConcatUpperLower(d, v, vec_to_insert)
                          : ConcatLowerLower(d, vec_to_insert, v);
}

// ------------------------------ ConcatOdd

// On the AVX3 paths, each overload loads an index table of odd numbers and
// performs a two-source permute with `lo` as the first source and `hi` as the
// second. On the AVX2 paths, the odd lanes are gathered per input and the
// result is then reordered with a 64-bit permute (3, 1, 2, 0).
// NOTE(review): the 8/16/32-bit tables here use alignas(32) whereas the
// corresponding ConcatEven tables use alignas(64); confirm whether the
// difference is intentional.

// 8-bit lanes. The two-source byte permute is only used for HWY_AVX3_DL.
template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(32) static constexpr uint8_t kIdx[32] = {
      1,  3,  5,  7,  9,  11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
      33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi8(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RepartitionToWide dw;
  // Unsigned 8-bit shift so we can pack.
  const Vec256 uH = ShiftRight<8>(BitCast(dw, hi));
  const Vec256 uL = ShiftRight<8>(BitCast(dw, lo));
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return VFromD{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// 16-bit lanes.
template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(32) static constexpr uint16_t kIdx[16] = {
      1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi16(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RepartitionToWide dw;
  // Unsigned 16-bit shift so we can pack.
  const Vec256 uH = ShiftRight<16>(BitCast(dw, hi));
  const Vec256 uL = ShiftRight<16>(BitCast(dw, lo));
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return BitCast(d, VFromD{_mm256_permute4x64_epi64(
                        u16, _MM_SHUFFLE(3, 1, 2, 0))});
#endif
}

// 32-bit integer lanes. The AVX2 path borrows the float shuffle (3, 1, 3, 1).
template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi32(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RebindToFloat df;
  const Vec256 v3131{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))};
  return VFromD{_mm256_permute4x64_epi64(BitCast(du, v3131).raw,
                                         _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// Single-precision lanes.
template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(32) static constexpr uint32_t kIdx[8] = {1, 3, 5, 7, 9, 11, 13, 15};
  return VFromD{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
#else
  const VFromD v3131{
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 1, 3, 1))};
  return BitCast(d, Vec256{_mm256_permute4x64_epi64(
                        BitCast(du, v3131).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif
}

// 64-bit integer lanes. The AVX2 path borrows the double shuffle (mask 15).
template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi64(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RebindToFloat df;
  const Vec256 v31{
      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 15)};
  return VFromD{
      _mm256_permute4x64_epi64(BitCast(du, v31).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// Double-precision lanes. `d` is only needed on the AVX3 path, hence the
// (void)d in the AVX2 branch.
template HWY_API Vec256 ConcatOdd(D d, Vec256 hi, Vec256 lo) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned du;
  alignas(64) static constexpr uint64_t kIdx[4] = {1, 3, 5, 7};
  return Vec256{
      _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
#else
  (void)d;
  const Vec256 v31{_mm256_shuffle_pd(lo.raw, hi.raw, 15)};
  return Vec256{
      _mm256_permute4x64_pd(v31.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// ------------------------------ ConcatEven

// Mirrors ConcatOdd with even index tables. The AVX2 integer paths mask off
// the upper half of each wide lane (instead of shifting) before packing.

// 8-bit lanes.
template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3_DL
  alignas(64) static constexpr uint8_t kIdx[32] = {
      0,  2,  4,  6,  8,  10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
      32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi8(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RepartitionToWide dw;
  // Isolate lower 8 bits per u16 so we can pack.
  const Vec256 mask = Set(dw, 0x00FF);
  const Vec256 uH = And(BitCast(dw, hi), mask);
  const Vec256 uL = And(BitCast(dw, lo), mask);
  const __m256i u8 = _mm256_packus_epi16(uL.raw, uH.raw);
  return VFromD{_mm256_permute4x64_epi64(u8, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// 16-bit lanes.
template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) static constexpr uint16_t kIdx[16] = {
      0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi16(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RepartitionToWide dw;
  // Isolate lower 16 bits per u32 so we can pack.
  const Vec256 mask = Set(dw, 0x0000FFFF);
  const Vec256 uH = And(BitCast(dw, hi), mask);
  const Vec256 uL = And(BitCast(dw, lo), mask);
  const __m256i u16 = _mm256_packus_epi32(uL.raw, uH.raw);
  return BitCast(d, VFromD{_mm256_permute4x64_epi64(
                        u16, _MM_SHUFFLE(3, 1, 2, 0))});
#endif
}

// 32-bit integer lanes.
template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi32(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RebindToFloat df;
  const Vec256 v2020{_mm256_shuffle_ps(
      BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))};
  return VFromD{_mm256_permute4x64_epi64(BitCast(du, v2020).raw,
                                         _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// Single-precision lanes.
template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) static constexpr uint32_t kIdx[8] = {0, 2, 4, 6, 8, 10, 12, 14};
  return VFromD{_mm256_permutex2var_ps(lo.raw, Load(du, kIdx).raw, hi.raw)};
#else
  const VFromD v2020{
      _mm256_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))};
  return BitCast(d, Vec256{_mm256_permute4x64_epi64(
                        BitCast(du, v2020).raw, _MM_SHUFFLE(3, 1, 2, 0))});
#endif
}

// 64-bit integer lanes.
template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) {
  const RebindToUnsigned du;
#if HWY_TARGET <= HWY_AVX3
  alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
  return BitCast(
      d, Vec256{_mm256_permutex2var_epi64(
             BitCast(du, lo).raw, Load(du, kIdx).raw, BitCast(du, hi).raw)});
#else
  const RebindToFloat df;
  const Vec256 v20{
      _mm256_shuffle_pd(BitCast(df, lo).raw, BitCast(df, hi).raw, 0)};
  return VFromD{
      _mm256_permute4x64_epi64(BitCast(du, v20).raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// Double-precision lanes.
template HWY_API Vec256 ConcatEven(D d, Vec256 hi, Vec256 lo) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned du;
  alignas(64) static constexpr uint64_t kIdx[4] = {0, 2, 4, 6};
  return Vec256{
      _mm256_permutex2var_pd(lo.raw, Load(du, kIdx).raw, hi.raw)};
#else
  (void)d;
  const Vec256 v20{_mm256_shuffle_pd(lo.raw, hi.raw, 0)};
  return Vec256{
      _mm256_permute4x64_pd(v20.raw, _MM_SHUFFLE(3, 1, 2, 0))};
#endif
}

// ------------------------------ InterleaveWholeLower

// The index tables alternate lane i of `a` with lane i of `b` (b's indices are
// offset by the lane count) for the lower half of the lanes, i.e. the
// interleave spans the whole vector rather than each 128-bit block.
#if HWY_TARGET <= HWY_AVX3
// 8-bit lanes: two-source byte permute on HWY_AVX3_DL, otherwise combine the
// lower blocks of the per-block InterleaveLower/InterleaveUpper results.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
#if HWY_TARGET <= HWY_AVX3_DL
  const RebindToUnsigned du;
  alignas(32) static constexpr uint8_t kIdx[32] = {
      0, 32, 1, 33, 2,  34, 3,  35, 4,  36, 5,  37, 6,  38, 7,  39,
      8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47};
  return VFromD{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
#else
  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
#endif
}

// 16-bit lanes.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint16_t kIdx[16] = {0, 16, 1, 17, 2, 18, 3, 19,
                                                    4, 20, 5, 21, 6, 22, 7, 23};
  return BitCast(
      d, VFromD{_mm256_permutex2var_epi16(
             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
}

// 32-bit integer lanes.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
  return VFromD{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
}

// Single-precision lanes.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint32_t kIdx[8] = {0, 8, 1, 9, 2, 10, 3, 11};
  return VFromD{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
}

// 64-bit integer lanes.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
  return VFromD{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
}

// Double-precision lanes.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint64_t kIdx[4] = {0, 4, 1, 5};
  return VFromD{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
}
#else  // AVX2
// Single generic overload: lower blocks of the two per-block interleaves.
template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) {
  return ConcatLowerLower(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}
#endif

// ------------------------------ InterleaveWholeUpper

// Same as InterleaveWholeLower, but the index tables start at the middle lane,
// and the fallback combines the upper blocks via ConcatUpperUpper.
#if HWY_TARGET <= HWY_AVX3
// 8-bit lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
#if HWY_TARGET <= HWY_AVX3_DL
  const RebindToUnsigned du;
  alignas(32) static constexpr uint8_t kIdx[32] = {
      16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
      24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63};
  return VFromD{_mm256_permutex2var_epi8(a.raw, Load(du, kIdx).raw, b.raw)};
#else
  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
#endif
}

// 16-bit lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint16_t kIdx[16] = {
      8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31};
  return BitCast(
      d, VFromD{_mm256_permutex2var_epi16(
             BitCast(du, a).raw, Load(du, kIdx).raw, BitCast(du, b).raw)});
}

// 32-bit integer lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
  return VFromD{_mm256_permutex2var_epi32(a.raw, Load(du, kIdx).raw, b.raw)};
}

// Single-precision lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint32_t kIdx[8] = {4, 12, 5, 13, 6, 14, 7, 15};
  return VFromD{_mm256_permutex2var_ps(a.raw, Load(du, kIdx).raw, b.raw)};
}

// 64-bit integer lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
  return VFromD{_mm256_permutex2var_epi64(a.raw, Load(du, kIdx).raw, b.raw)};
}

// Double-precision lanes.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  const RebindToUnsigned du;
  alignas(32) static constexpr uint64_t kIdx[4] = {2, 6, 3, 7};
  return VFromD{_mm256_permutex2var_pd(a.raw, Load(du, kIdx).raw, b.raw)};
}
#else  // AVX2
// Single generic overload: upper blocks of the two per-block interleaves.
template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) {
  return ConcatUpperUpper(d, InterleaveUpper(d, a, b), InterleaveLower(a, b));
}
#endif

// ------------------------------ DupEven (InterleaveLower)

// 32-bit integer lanes: shuffle pattern (2, 2, 0, 0) writes lanes 0 and 2 of
// each group of four into both positions of their pair.
template HWY_API Vec256 DupEven(Vec256 v) {
  return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}
// Single-precision: same pattern via shuffle_ps with `v` as both sources.
HWY_API Vec256 DupEven(Vec256 v) {
  return Vec256{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))};
}

// Remaining overload: interleaving `v` with itself.
// NOTE(review): this passes a descriptor as first argument to InterleaveLower,
// whereas the InterleaveLower overloads visible earlier take only (a, b);
// presumably a tag-taking overload exists elsewhere - confirm.
template HWY_API Vec256 DupEven(const Vec256 v) {
  const DFromV d;
  return InterleaveLower(d, v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

// 32-bit integer lanes: shuffle pattern (3, 3, 1, 1).
template HWY_API Vec256 DupOdd(Vec256 v) {
  return Vec256{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}
// Single-precision: same pattern via shuffle_ps.
HWY_API Vec256 DupOdd(Vec256 v) {
  return Vec256{
      _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))};
}

// Remaining overload: upper interleave of `v` with itself.
template HWY_API Vec256 DupOdd(const Vec256 v) {
  const DFromV d;
  return InterleaveUpper(d, v, v);
}

// ------------------------------ OddEven

// In every overload the selection picks `b` at even lane indices and `a` at
// odd lane indices (blend masks 0x55 / 0x5555 / 5 have the even bits set; the
// 64-bit integer overload uses 0x33 because it blends at 32-bit granularity).

// 8-bit lanes: builds a byte mask with 0xFF at even positions and selects
// with IfThenElse.
template HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) {
  const DFromV d;
  const Full256 d8;
  const VFromD mask =
      Dup128VecFromValues(d8, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF,
                          0, 0xFF, 0, 0xFF, 0);
  return IfThenElse(MaskFromVec(BitCast(d, mask)), b, a);
}

// 16-bit lanes via blend_epi16 on the unsigned representation.
template HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) {
  const DFromV d;
  const RebindToUnsigned du;  // for float16_t
  return BitCast(d, VFromD{_mm256_blend_epi16(
                        BitCast(du, a).raw, BitCast(du, b).raw, 0x55)});
}

#if HWY_HAVE_FLOAT16
// Native float16 vectors: masked blend with a 16-bit lane mask.
HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) {
  return Vec256{
      _mm256_mask_blend_ph(static_cast<__mmask16>(0x5555), a.raw, b.raw)};
}
#endif  // HWY_HAVE_FLOAT16

// 32-bit integer lanes.
template HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x55)};
}

// 64-bit integer lanes (two 32-bit blend bits per lane).
template HWY_INLINE Vec256 OddEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_blend_epi32(a.raw, b.raw, 0x33)};
}

// Single-precision lanes.
HWY_API Vec256 OddEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_blend_ps(a.raw, b.raw, 0x55)};
}

// Double-precision lanes.
HWY_API Vec256 OddEven(Vec256 a, Vec256 b) {
  return Vec256{_mm256_blend_pd(a.raw, b.raw, 5)};
}

// ------------------------------ OddEvenBlocks

// Block-granularity counterpart of OddEven: blend masks 0xF / 0x3 cover the
// lower 128 bits, so the lower block comes from `even` and the upper block
// from `odd`.
// NOTE(review): unlike the two overloads that follow, this one carries no
// HWY_API; confirm whether that is intentional.
template Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) {
  const DFromV d;
  const RebindToUnsigned du;
  return BitCast(d, VFromD{_mm256_blend_epi32(
                        BitCast(du, odd).raw, BitCast(du, even).raw, 0xFu)});
}

// Single-precision lanes.
HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) {
  return Vec256{_mm256_blend_ps(odd.raw, even.raw, 0xFu)};
}

// Double-precision lanes.
HWY_API Vec256 OddEvenBlocks(Vec256 odd, Vec256 even) {
  return Vec256{_mm256_blend_pd(odd.raw, even.raw, 0x3u)};
}

// ------------------------------ ReverseBlocks (SwapAdjacentBlocks)

// Forwards to SwapAdjacentBlocks; the descriptor argument is unused.
template HWY_API VFromD ReverseBlocks(D /*d*/, VFromD v) {
  return SwapAdjacentBlocks(v);
}

// ------------------------------ TableLookupBytes (ZeroExtendVector)

// Both full
// Byte shuffle (shuffle_epi8) of `bytes` using `from` as indices; the result
// is cast to the descriptor derived from `from`.
template HWY_API Vec256 TableLookupBytes(Vec256 bytes, Vec256 from) {
  const DFromV d;
  return BitCast(d, Vec256{_mm256_shuffle_epi8(
                        BitCast(Full256(), bytes).raw,
                        BitCast(Full256(), from).raw)});
}

// Partial index vector
template HWY_API Vec128 TableLookupBytes(Vec256 bytes, Vec128 from) {
  const Full256 di;
  const Half dih;
  // First expand to full 128, then 256.
  const auto from_256 = ZeroExtendVector(di, Vec128{from.raw});
  const auto tbl_full = TableLookupBytes(bytes, from_256);
  // Shrink to 128, then partial.
  return Vec128{LowerHalf(dih, tbl_full).raw};
}

// Partial table vector
template HWY_API Vec256 TableLookupBytes(Vec128 bytes, Vec256 from) {
  const Full256 d;
  // First expand to full 128, then 256.
  const auto bytes_256 = ZeroExtendVector(d, Vec128{bytes.raw});
  return TableLookupBytes(bytes_256, from);
}

// Partial both are handled by x86_128.
// ------------------------------ I8/U8 Broadcast (TableLookupBytes)

// Broadcasts byte lane kLane (static_assert restricts it to [0, 16)) by using
// a vector with every byte set to kLane as the index for TableLookupBytes.
// NOTE(review): throughout this section the overload constraints and explicit
// template arguments are not visible in this text; lane types named in the
// comments are inferred from the intrinsics used.
template HWY_API Vec256 Broadcast(const Vec256 v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return TableLookupBytes(v, Set(Full256(), static_cast(kLane)));
}

// ------------------------------ Per4LaneBlockShuffle

namespace detail {

// Builds a vector from four 32-bit values, repeated in both 128-bit halves
// (x0 is the lowest lane of each half), and casts it to `d`.
template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                      const uint32_t x2,
                                                      const uint32_t x1,
                                                      const uint32_t x0) {
  return BitCast(d, Vec256{_mm256_set_epi32(
                        static_cast(x3), static_cast(x2),
                        static_cast(x1), static_cast(x0),
                        static_cast(x3), static_cast(x2),
                        static_cast(x1), static_cast(x0))});
}

// Per4LaneBlockShuffle overloads are selected by the three tag arguments
// (index pattern, lane size in bytes, vector size in bytes). The low 8 bits of
// kIdx3210 are passed as the shuffle immediate.

// 4-byte lanes, integer shuffle.
template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/,
                                              hwy::SizeTag<4> /*lane_size_tag*/,
                                              hwy::SizeTag<32> /*vect_size_tag*/,
                                              V v) {
  return V{_mm256_shuffle_epi32(v.raw, static_cast(kIdx3210 & 0xFF))};
}

// 4-byte lanes, float shuffle.
template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/,
                                              hwy::SizeTag<4> /*lane_size_tag*/,
                                              hwy::SizeTag<32> /*vect_size_tag*/,
                                              V v) {
  return V{_mm256_shuffle_ps(v.raw, v.raw, static_cast(kIdx3210 & 0xFF))};
}

// 8-byte lanes, pattern 0x44: duplicate the lower block.
template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/,
                                           hwy::SizeTag<8> /*lane_size_tag*/,
                                           hwy::SizeTag<32> /*vect_size_tag*/,
                                           V v) {
  const DFromV d;
  return ConcatLowerLower(d, v, v);
}

// 8-byte lanes, pattern 0xEE: duplicate the upper block.
template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/,
                                           hwy::SizeTag<8> /*lane_size_tag*/,
                                           hwy::SizeTag<32> /*vect_size_tag*/,
                                           V v) {
  const DFromV d;
  return ConcatUpperUpper(d, v, v);
}

// 8-byte lanes, general pattern, integer permute.
template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/,
                                              hwy::SizeTag<8> /*lane_size_tag*/,
                                              hwy::SizeTag<32> /*vect_size_tag*/,
                                              V v) {
  return V{_mm256_permute4x64_epi64(v.raw, static_cast(kIdx3210 & 0xFF))};
}

// 8-byte lanes, general pattern, double permute.
template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/,
                                              hwy::SizeTag<8> /*lane_size_tag*/,
                                              hwy::SizeTag<32> /*vect_size_tag*/,
                                              V v) {
  return V{_mm256_permute4x64_pd(v.raw, static_cast(kIdx3210 & 0xFF))};
}

}  // namespace detail

// ------------------------------ SlideUpLanes

namespace detail {

#if HWY_TARGET <= HWY_AVX3
// Wraps _mm256_alignr_epi32 on (hi, lo) with shift count kI32Lanes; operands
// are repartitioned to 32-bit lanes and the result cast back to V's type.
template HWY_INLINE V CombineShiftRightI32Lanes(V hi, V lo) {
  const DFromV d;
  const Repartition du32;
  return BitCast(d,
                 Vec256{_mm256_alignr_epi32(
                     BitCast(du32, hi).raw, BitCast(du32, lo).raw, kI32Lanes)});
}

// 64-bit-lane counterpart using _mm256_alignr_epi64.
template HWY_INLINE V CombineShiftRightI64Lanes(V hi, V lo) {
  const DFromV d;
  const Repartition du64;
  return BitCast(d,
                 Vec256{_mm256_alignr_epi64(
                     BitCast(du64, hi).raw, BitCast(du64, lo).raw, kI64Lanes)});
}

// Slides `v` up by kI64Lanes 64-bit lanes: a combined right shift by
// (4 - kI64Lanes) with `v` as the high input and zero as the low input.
template HWY_INLINE V SlideUpI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  const DFromV d;
  return CombineShiftRightI64Lanes<4 - kI64Lanes>(v, Zero(d));
}
#else   // AVX2
// AVX2 integer version: rotate the four 64-bit lanes up by kI64Lanes with
// permute4x64, then blend zero into the lowest kI64Lanes lanes. The blend
// works at 32-bit granularity, hence two mask bits per 64-bit lane.
template )> HWY_INLINE V SlideUpI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  constexpr int kIdx0 = (-kI64Lanes) & 3;
  constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
  constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
  constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
  constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
  constexpr int kBlendMask = (1 << (kI64Lanes * 2)) - 1;

  const DFromV d;
  return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
                              Zero(d).raw, kBlendMask)};
}

// AVX2 floating-point version: same rotation via permute4x64_pd; blend_pd
// uses one mask bit per 64-bit lane.
template )> HWY_INLINE V SlideUpI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  constexpr int kIdx0 = (-kI64Lanes) & 3;
  constexpr int kIdx1 = (-kI64Lanes + 1) & 3;
  constexpr int kIdx2 = (-kI64Lanes + 2) & 3;
  constexpr int kIdx3 = (-kI64Lanes + 3) & 3;
  constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0);
  constexpr int kBlendMask = (1 << kI64Lanes) - 1;

  const DFromV d;
  const Repartition dd;
  return BitCast(d, Vec256{_mm256_blend_pd(
                        _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210),
                        Zero(dd).raw, kBlendMask)});
}
#endif  // HWY_TARGET <= HWY_AVX3

// Runtime-amount slide-up using byte indices: the index vector starts at
// -(amt * lane size) and increases by one per byte. On HWY_AVX3_DL a second
// table of zeros is supplied via TwoTablesLookupLanes; otherwise a single-table
// lookup is used.
template HWY_AVX3) ? (1 << 2) : 0))> HWY_INLINE VFromD TableLookupSlideUpLanes(
    D d, VFromD v, size_t amt) {
  const Repartition du8;

  const auto idx_vec =
      Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromD)));
  const Indices256> idx{idx_vec.raw};

#if HWY_TARGET <= HWY_AVX3_DL
  return TwoTablesLookupLanes(v, Zero(d), idx);
#else
  return TableLookupLanes(v, idx);
#endif
}

// Runtime-amount slide-up using lane indices starting at -amt.
// AVX3: indices are masked to 2 * MaxLanes(d) - 1 and looked up across
// (v, zero). Otherwise: indices are masked to MaxLanes(d) - 1 and lanes whose
// index was changed by the masking are zeroed.
template HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) {
  const RebindToUnsigned du;
  using TU = TFromD;

  const auto idx = Iota(du, static_cast(size_t{0} - amt));
#if HWY_TARGET <= HWY_AVX3
  const auto masked_idx =
      And(idx, Set(du, static_cast(MaxLanes(d) * 2 - 1)));
  return TwoTablesLookupLanes(v, Zero(d), IndicesFromVec(d, masked_idx));
#else
  const auto masked_idx = And(idx, Set(du, static_cast(MaxLanes(d) - 1)));
  return IfThenElseZero(RebindMask(d, idx == masked_idx),
                        TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
#endif
}

#if HWY_TARGET > HWY_AVX3
// AVX2-only overload: reinterprets as lanes of half the width and doubles the
// slide amount accordingly.
template HWY_INLINE VFromD TableLookupSlideUpLanes(D d, VFromD v, size_t amt) {
  const RepartitionToNarrow dn;
  return BitCast(d, TableLookupSlideUpLanes(dn, BitCast(dn, v), amt * 2));
}
#endif  // HWY_TARGET > HWY_AVX3

}  // namespace detail

// Slides up by kBlocks 128-bit blocks (0 or 1). For 1, v's lower block moves
// to the upper half and the lower half becomes zero.
template HWY_API VFromD SlideUpBlocks(D d, VFromD v) {
  static_assert(0 <= kBlocks && kBlocks <= 1,
                "kBlocks must be between 0 and 1");
  return (kBlocks == 1) ? ConcatLowerLower(d, v, Zero(d)) : v;
}

// Slides `v` up by `amt` lanes. In non-debug GCC/clang builds, if the compiler
// reports `amt` as a constant (__builtin_constant_p), the byte offset selects
// a fixed-shift implementation; offsets not listed in the switch, and all
// other cases, fall through to the table-lookup implementation.
template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD);
  if (__builtin_constant_p(amt)) {
    // v's lower block in the upper half, zero in the lower half.
    const auto v_lo = ConcatLowerLower(d, v, Zero(d));
    switch (amt * sizeof(TFromD)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<15>(d, v, v_lo);
      case 2:
        return CombineShiftRightBytes<14>(d, v, v_lo);
      case 3:
        return CombineShiftRightBytes<13>(d, v, v_lo);
      case 4:
#if HWY_TARGET <= HWY_AVX3
        return detail::CombineShiftRightI32Lanes<7>(v, Zero(d));
#else
        return CombineShiftRightBytes<12>(d, v, v_lo);
#endif
      case 5:
        return CombineShiftRightBytes<11>(d, v, v_lo);
      case 6:
        return CombineShiftRightBytes<10>(d, v, v_lo);
      case 7:
        return CombineShiftRightBytes<9>(d, v, v_lo);
      case 8:
        return detail::SlideUpI64Lanes<1>(v);
      case 9:
        return CombineShiftRightBytes<7>(d, v, v_lo);
      case 10:
        return CombineShiftRightBytes<6>(d, v, v_lo);
      case 11:
        return CombineShiftRightBytes<5>(d, v, v_lo);
      case 12:
#if HWY_TARGET <= HWY_AVX3
        return detail::CombineShiftRightI32Lanes<5>(v, Zero(d));
#else
        return CombineShiftRightBytes<4>(d, v, v_lo);
#endif
      case 13:
        return CombineShiftRightBytes<3>(d, v, v_lo);
      case 14:
        return CombineShiftRightBytes<2>(d, v, v_lo);
      case 15:
        return CombineShiftRightBytes<1>(d, v, v_lo);
      case 16:
        return ConcatLowerLower(d, v, Zero(d));
#if HWY_TARGET <= HWY_AVX3
      case 20:
        return detail::CombineShiftRightI32Lanes<3>(v, Zero(d));
#endif
      case 24:
        return detail::SlideUpI64Lanes<3>(v);
#if HWY_TARGET <= HWY_AVX3
      case 28:
        return detail::CombineShiftRightI32Lanes<1>(v, Zero(d));
#endif
    }
  }

  // Known to be at least one block: slide the lower half by the remainder and
  // place it above a zero lower half.
  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    const Half dh;
    return Combine(d, SlideUpLanes(dh, LowerHalf(dh, v), amt - kLanesPerBlock),
                   Zero(dh));
  }
#endif

  return detail::TableLookupSlideUpLanes(d, v, amt);
}

// ------------------------------ Slide1Up

// Slide1Up overloads use the same primitives as the corresponding one-lane
// cases of SlideUpLanes (byte offsets 1, 2, 4 and 8 respectively).
template HWY_API VFromD Slide1Up(D d, VFromD v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<15>(d, v, v_lo);
}

template HWY_API VFromD Slide1Up(D d, VFromD v) {
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<14>(d, v, v_lo);
}

template HWY_API VFromD Slide1Up(D d, VFromD v) {
#if HWY_TARGET <= HWY_AVX3
  return detail::CombineShiftRightI32Lanes<7>(v, Zero(d));
#else
  const auto v_lo = ConcatLowerLower(d, v, Zero(d));
  return CombineShiftRightBytes<12>(d, v, v_lo);
#endif
}

template HWY_API VFromD Slide1Up(D /*d*/, VFromD v) {
  return detail::SlideUpI64Lanes<1>(v);
}

// ------------------------------ SlideDownLanes

namespace detail {

#if HWY_TARGET <= HWY_AVX3
// Slides `v` down by kI64Lanes 64-bit lanes: combined right shift with zero as
// the high input and `v` as the low input.
template HWY_INLINE V SlideDownI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  const DFromV d;
  return CombineShiftRightI64Lanes(Zero(d), v);
}
#else   // AVX2
// AVX2 integer version: rotate the 64-bit lanes down by kI64Lanes, then blend
// zero into the top kI64Lanes lanes (two 32-bit mask bits per lane).
template )> HWY_INLINE V SlideDownI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  constexpr int kIdx1 = (kI64Lanes + 1) & 3;
  constexpr int kIdx2 = (kI64Lanes + 2) & 3;
  constexpr int kIdx3 = (kI64Lanes + 3) & 3;
  constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
  constexpr int kBlendMask =
      static_cast((0xFFu << ((4 - kI64Lanes) * 2)) & 0xFFu);

  const DFromV d;
  return V{_mm256_blend_epi32(_mm256_permute4x64_epi64(v.raw, kIdx3210),
                              Zero(d).raw, kBlendMask)};
}

// AVX2 floating-point version (one blend_pd mask bit per 64-bit lane).
template )> HWY_INLINE V SlideDownI64Lanes(V v) {
  static_assert(0 <= kI64Lanes && kI64Lanes <= 3,
                "kI64Lanes must be between 0 and 3");
  constexpr int kIdx1 = (kI64Lanes + 1) & 3;
  constexpr int kIdx2 = (kI64Lanes + 2) & 3;
  constexpr int kIdx3 = (kI64Lanes + 3) & 3;
  constexpr int kIdx3210 = _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kI64Lanes);
  constexpr int kBlendMask = (0x0F << (4 - kI64Lanes)) & 0x0F;

  const DFromV d;
  const Repartition dd;
  return BitCast(d, Vec256{_mm256_blend_pd(
                        _mm256_permute4x64_pd(BitCast(dd, v).raw, kIdx3210),
                        Zero(dd).raw, kBlendMask)});
}
#endif  // HWY_TARGET <= HWY_AVX3

// Runtime-amount slide-down using byte indices starting at amt * lane size.
// HWY_AVX3_DL: a zeroing masked byte permute keeps only lanes whose index is
// below 32. Otherwise: indices that compare greater than 31 as signed bytes
// are OR-ed with all-ones before the table lookup.
template HWY_AVX3) ? (1 << 2) : 0))> HWY_INLINE VFromD TableLookupSlideDownLanes(
    D d, VFromD v, size_t amt) {
  const Repartition du8;

  auto idx_vec = Iota(du8, static_cast(amt * sizeof(TFromD)));

#if HWY_TARGET <= HWY_AVX3_DL
  const auto result_mask = idx_vec < Set(du8, uint8_t{32});
  return VFromD{
      _mm256_maskz_permutexvar_epi8(result_mask.raw, idx_vec.raw, v.raw)};
#else
  const RebindToSigned di8;
  idx_vec =
      Or(idx_vec, BitCast(du8, VecFromMask(di8, BitCast(di8, idx_vec) >
                                                    Set(di8, int8_t{31}))));
  return TableLookupLanes(v, Indices256>{idx_vec.raw});
#endif
}

// Runtime-amount slide-down using lane indices starting at amt; lanes whose
// index is changed by masking to MaxLanes(d) - 1 are zeroed.
template HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v,
                                                     size_t amt) {
  const RebindToUnsigned du;
  using TU = TFromD;

  const auto idx = Iota(du, static_cast(amt));
  const auto masked_idx = And(idx, Set(du, static_cast(MaxLanes(d) - 1)));

  return IfThenElseZero(RebindMask(d, idx == masked_idx),
                        TableLookupLanes(v, IndicesFromVec(d, masked_idx)));
}

#if HWY_TARGET > HWY_AVX3
// AVX2-only overload: reinterprets as lanes of half the width and doubles the
// slide amount accordingly.
template HWY_INLINE VFromD TableLookupSlideDownLanes(D d, VFromD v,
                                                     size_t amt) {
  const RepartitionToNarrow dn;
  return BitCast(d, TableLookupSlideDownLanes(dn, BitCast(dn, v), amt * 2));
}
#endif  // HWY_TARGET > HWY_AVX3

}  // namespace detail

// Slides down by kBlocks 128-bit blocks (0 or 1). For 1, the upper half of `v`
// is zero-extended into a full vector.
template HWY_API VFromD SlideDownBlocks(D d, VFromD v) {
  static_assert(0 <= kBlocks && kBlocks <= 1,
                "kBlocks must be between 0 and 1");
  const Half dh;
  return (kBlocks == 1) ? ZeroExtendVector(d, UpperHalf(dh, v)) : v;
}

// Slides `v` down by `amt` lanes. Mirrors SlideUpLanes: constant amounts (as
// reported by __builtin_constant_p in non-debug GCC/clang builds) dispatch on
// the byte offset; everything else uses the table-lookup implementation.
template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD);
  const Half dh;
  if (__builtin_constant_p(amt)) {
    // v's upper block in the lower half, zero in the upper half.
    const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
    switch (amt * sizeof(TFromD)) {
      case 0:
        return v;
      case 1:
        return CombineShiftRightBytes<1>(d, v_hi, v);
      case 2:
        return CombineShiftRightBytes<2>(d, v_hi, v);
      case 3:
        return CombineShiftRightBytes<3>(d, v_hi, v);
      case 4:
#if HWY_TARGET <= HWY_AVX3
        return detail::CombineShiftRightI32Lanes<1>(Zero(d), v);
#else
        return CombineShiftRightBytes<4>(d, v_hi, v);
#endif
      case 5:
        return CombineShiftRightBytes<5>(d, v_hi, v);
      case 6:
        return CombineShiftRightBytes<6>(d, v_hi, v);
      case 7:
        return CombineShiftRightBytes<7>(d, v_hi, v);
      case 8:
        return detail::SlideDownI64Lanes<1>(v);
      case 9:
        return CombineShiftRightBytes<9>(d, v_hi, v);
      case 10:
        return CombineShiftRightBytes<10>(d, v_hi, v);
      case 11:
        return CombineShiftRightBytes<11>(d, v_hi, v);
      case 12:
#if HWY_TARGET <= HWY_AVX3
        return detail::CombineShiftRightI32Lanes<3>(Zero(d), v);
#else
        return CombineShiftRightBytes<12>(d, v_hi, v);
#endif
      case 13:
        return CombineShiftRightBytes<13>(d, v_hi, v);
      case 14:
        return CombineShiftRightBytes<14>(d, v_hi, v);
      case 15:
        return CombineShiftRightBytes<15>(d, v_hi, v);
      case 16:
        return v_hi;
#if HWY_TARGET <= HWY_AVX3
      case 20:
        return detail::CombineShiftRightI32Lanes<5>(Zero(d), v);
#endif
      case 24:
        return detail::SlideDownI64Lanes<3>(v);
#if HWY_TARGET <= HWY_AVX3
      case 28:
        return detail::CombineShiftRightI32Lanes<7>(Zero(d), v);
#endif
    }
  }

  // Known to be at least one block: slide the upper half by the remainder and
  // zero-extend the result.
  if (__builtin_constant_p(amt >= kLanesPerBlock) && amt >= kLanesPerBlock) {
    return ZeroExtendVector(
        d, SlideDownLanes(dh, UpperHalf(dh, v), amt - kLanesPerBlock));
  }
#endif

  return detail::TableLookupSlideDownLanes(d, v, amt);
}

// ------------------------------ Slide1Down

// Slide1Down overloads use the same primitives as the corresponding one-lane
// cases of SlideDownLanes (byte offsets 1, 2, 4 and 8 respectively).
template HWY_API VFromD Slide1Down(D d, VFromD v) {
  const Half dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<1>(d, v_hi, v);
}

template HWY_API VFromD Slide1Down(D d, VFromD v) {
  const Half dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<2>(d, v_hi, v);
}

template HWY_API VFromD Slide1Down(D d, VFromD v) {
#if HWY_TARGET <= HWY_AVX3
  return detail::CombineShiftRightI32Lanes<1>(Zero(d), v);
#else
  const Half dh;
  const auto v_hi = ZeroExtendVector(d, UpperHalf(dh, v));
  return CombineShiftRightBytes<4>(d, v_hi, v);
#endif
}

template HWY_API VFromD Slide1Down(D /*d*/, VFromD v) {
  return detail::SlideDownI64Lanes<1>(v);
}

// ------------------------------ Shl (Mul, ZipLower)

namespace detail {

#if HWY_TARGET > HWY_AVX3 && !HWY_IDE  // AVX2 or older
// Per-lane variable left shift of 16-bit lanes without a native instruction:
// promote each half of `v` and `bits` to 32-bit lanes, shift there, then keep
// the even 16-bit lanes of the two results via ConcatEven.
template HWY_INLINE V AVX2ShlU16Vec256(V v, V bits) {
  const DFromV d;
  const Half dh;
  const Rebind du32;

  const auto lo_shl_result = PromoteTo(du32, LowerHalf(dh, v))
                             << PromoteTo(du32, LowerHalf(dh, bits));
  const auto hi_shl_result = PromoteTo(du32, UpperHalf(dh, v))
                             << PromoteTo(du32, UpperHalf(dh, bits));
  return ConcatEven(d, BitCast(d, hi_shl_result), BitCast(d, lo_shl_result));
}
#endif

// Unsigned 16-bit lanes: native sllv_epi16 on AVX3 (or when HWY_IDE is set),
// otherwise the promote-and-shift helper above.
HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, Vec256 bits) {
#if HWY_TARGET <= HWY_AVX3 || HWY_IDE
  return Vec256{_mm256_sllv_epi16(v.raw, bits.raw)};
#else
  return AVX2ShlU16Vec256(v, bits);
#endif
}

// 8-bit: may use the Shl overload for uint16_t.
// Per-lane variable left shift of 8-bit lanes.
// HWY_AVX3_DL: masks off the bits that would be shifted out (table lookup of
// 0xFF >> bits), then multiplies by 1 << bits using the GF(2^8) multiply
// intrinsic. Table entries for indices 8..15 are zero.
// Otherwise: shifts even and odd bytes separately as 16-bit lanes and merges
// the results with OddEven.
// NOTE(review): explicit template arguments are not visible in this text; the
// lane type is inferred from the intrinsics and comments.
HWY_API Vec256 Shl(hwy::UnsignedTag tag, Vec256 v, Vec256 bits) {
  const DFromV d;
#if HWY_TARGET <= HWY_AVX3_DL
  (void)tag;
  // masks[i] = 0xFF >> i
  const VFromD masks =
      Dup128VecFromValues(d, 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0,
                          0, 0, 0, 0, 0, 0, 0);
  // shl[i] = 1 << i
  const VFromD shl = Dup128VecFromValues(
      d, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0, 0, 0, 0, 0, 0, 0, 0);
  v = And(v, TableLookupBytes(masks, bits));
  const VFromD mul = TableLookupBytes(shl, bits);
  return VFromD{_mm256_gf2p8mul_epi8(v.raw, mul.raw)};
#else
  const Repartition dw;
  using VW = VFromD;
  const VW even_mask = Set(dw, 0x00FF);
  const VW odd_mask = Set(dw, 0xFF00);
  const VW vw = BitCast(dw, v);
  const VW bits16 = BitCast(dw, bits);
  // Shift even lanes in-place
  const VW evens = Shl(tag, vw, And(bits16, even_mask));
  // Odd bytes: clear the even byte first so nothing shifts into the odd byte;
  // the shift count is the upper byte of each 16-bit lane of `bits`.
  const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16));
  return OddEven(BitCast(d, odds), BitCast(d, evens));
#endif
}

// 32-bit lanes: native per-lane variable shift.
HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, Vec256 bits) {
  return Vec256{_mm256_sllv_epi32(v.raw, bits.raw)};
}

// 64-bit lanes: native per-lane variable shift.
HWY_INLINE Vec256 Shl(hwy::UnsignedTag /*tag*/, Vec256 v, Vec256 bits) {
  return Vec256{_mm256_sllv_epi64(v.raw, bits.raw)};
}

// Signed lanes: forwards to the unsigned overloads on bit-cast inputs and
// casts the result back.
template HWY_INLINE Vec256 Shl(hwy::SignedTag /*tag*/, Vec256 v, Vec256 bits) {
  // Signed left shifts are the same as unsigned.
  const Full256 di;
  const Full256> du;
  return BitCast(di,
                 Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits)));
}

}  // namespace detail

// Public per-lane variable left shift: dispatches to detail::Shl by type tag.
template HWY_API Vec256 operator<<(Vec256 v, Vec256 bits) {
  return detail::Shl(hwy::TypeTag(), v, bits);
}

// ------------------------------ Shr (MulHigh, IfThenElse, Not)

#if HWY_TARGET > HWY_AVX3  // AVX2
namespace detail {

// Per-lane variable logical right shift of 16-bit lanes without a native
// instruction: promote both halves to 32-bit lanes, shift, then demote the two
// results back to 16-bit lanes in order (lower half first).
template HWY_INLINE V AVX2ShrU16Vec256(V v, V bits) {
  const DFromV d;
  const Half dh;
  const Rebind di32;
  const Rebind du32;

  const auto lo_shr_result = PromoteTo(du32, LowerHalf(dh, v)) >>
                             PromoteTo(du32, LowerHalf(dh, bits));
  const auto hi_shr_result = PromoteTo(du32, UpperHalf(dh, v)) >>
                             PromoteTo(du32, UpperHalf(dh, bits));
  return OrderedDemote2To(d, BitCast(di32, lo_shr_result),
                          BitCast(di32, hi_shr_result));
}

}  // namespace detail
#endif

// 16-bit lanes, logical shift: native srlv_epi16 on AVX3, helper otherwise.
HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) {
#if HWY_TARGET <= HWY_AVX3
  return Vec256{_mm256_srlv_epi16(v.raw, bits.raw)};
#else
  return detail::AVX2ShrU16Vec256(v, bits);
#endif
}

// 8-bit uses 16-bit shifts.
// NOTE(review): the line below appears to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. It is left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - operator>> for 8-bit lanes, implemented with 16-bit shifts: for the even
//    bytes both the value and the count are masked with 0x00FF before the
//    shift; the odd bytes are shifted in place by ShiftRight<8>(bits16);
//    OddEven merges the two results.
//  - operator>> via _mm256_srlv_epi32 and via _mm256_srlv_epi64.
//  - detail::AVX2ShrI16Vec256 (HWY_TARGET > HWY_AVX3): promotes the lower and
//    upper halves via `di32`, shifts right, and narrows with OrderedDemote2To.
//  - operator>>: _mm256_srav_epi16 when HWY_TARGET <= HWY_AVX3, otherwise
//    detail::AVX2ShrI16Vec256.
HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { const DFromV d; const RepartitionToWide dw; using VW = VFromD; const VW mask = Set(dw, 0x00FF); const VW vw = BitCast(dw, v); const VW bits16 = BitCast(dw, bits); const VW evens = And(vw, mask) >> And(bits16, mask); // Shift odd lanes in-place const VW odds = vw >> ShiftRight<8>(bits16); return OddEven(BitCast(d, odds), BitCast(d, evens)); } HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { return Vec256{_mm256_srlv_epi32(v.raw, bits.raw)}; } HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { return Vec256{_mm256_srlv_epi64(v.raw, bits.raw)}; } #if HWY_TARGET > HWY_AVX3 // AVX2 namespace detail { template HWY_INLINE V AVX2ShrI16Vec256(V v, V bits) { const DFromV d; const Half dh; const Rebind di32; const auto lo_shr_result = PromoteTo(di32, LowerHalf(dh, v)) >> PromoteTo(di32, LowerHalf(dh, bits)); const auto hi_shr_result = PromoteTo(di32, UpperHalf(dh, v)) >> PromoteTo(di32, UpperHalf(dh, bits)); return OrderedDemote2To(d, lo_shr_result, hi_shr_result); } } // namespace detail #endif HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_srav_epi16(v.raw, bits.raw)}; #else return detail::AVX2ShrI16Vec256(v, bits); #endif } // 8-bit uses 16-bit shifts. 
// NOTE(review): the lines below appear to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. They are left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - operator>> for 8-bit lanes, implemented with 16-bit shifts: the even
//    bytes are moved up and back down with ShiftRight<8>(ShiftLeft<8>(vw))
//    before being shifted by `bits16 & 0x00FF`; the odd bytes are shifted in
//    place, with their count obtained by ShiftRight<8> on the `dw_u` view of
//    `bits16`; OddEven merges the two results.
//  - operator>> via _mm256_srav_epi32; and via _mm256_srav_epi64 when
//    HWY_TARGET <= HWY_AVX3, otherwise detail::SignedShr(d, v, bits).
//  - MulEven / MulOdd: assemble a double-word product from four MulEven calls
//    on the `du32` views (a32*b32, aH*b32, a32*bH, aH*bH), where aH/bH are
//    Shuffle2301 of the inputs, propagating carries through `t2`, `t` and `k`
//    (Knuth double-word multiplication per the in-line comment). MulEven
//    returns InterleaveLower(mulL, mulH); MulOdd returns
//    InterleaveUpper(du64, mulL, mulH).
//  - WidenMulPairwiseAdd: _mm256_madd_epi16.
//  - SatWidenMulPairwiseAdd: _mm256_maddubs_epi16.
//  - ReorderWidenMulAccumulate: _mm256_dpwssd_epi32(sum0, a, b) when
//    HWY_TARGET <= HWY_AVX3_DL, otherwise sum0 + WidenMulPairwiseAdd(d, a, b).
//    The `sum1` parameter is unused.
HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { const DFromV d; const RepartitionToWide dw; const RebindToUnsigned dw_u; using VW = VFromD; const VW mask = Set(dw, 0x00FF); const VW vw = BitCast(dw, v); const VW bits16 = BitCast(dw, bits); const VW evens = ShiftRight<8>(ShiftLeft<8>(vw)) >> And(bits16, mask); // Shift odd lanes in-place const VW odds = vw >> BitCast(dw, ShiftRight<8>(BitCast(dw_u, bits16))); return OddEven(BitCast(d, odds), BitCast(d, evens)); } HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { return Vec256{_mm256_srav_epi32(v.raw, bits.raw)}; } HWY_API Vec256 operator>>(Vec256 v, Vec256 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec256{_mm256_srav_epi64(v.raw, bits.raw)}; #else const DFromV d; return detail::SignedShr(d, v, bits); #endif } HWY_INLINE Vec256 MulEven(const Vec256 a, const Vec256 b) { const Full256 du64; const RepartitionToNarrow du32; const auto maskL = Set(du64, 0xFFFFFFFFULL); const auto a32 = BitCast(du32, a); const auto b32 = BitCast(du32, b); // Inputs for MulEven: we only need the lower 32 bits const auto aH = Shuffle2301(a32); const auto bH = Shuffle2301(b32); // Knuth double-word multiplication. We use 32x32 = 64 MulEven and only need // the even (lower 64 bits of every 128-bit block) results. 
See // https://github.com/hcs0/Hackers-Delight/blob/master/muldwu.c.tat const auto aLbL = MulEven(a32, b32); const auto w3 = aLbL & maskL; const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); const auto w2 = t2 & maskL; const auto w1 = ShiftRight<32>(t2); const auto t = MulEven(a32, bH) + w2; const auto k = ShiftRight<32>(t); const auto mulH = MulEven(aH, bH) + w1 + k; const auto mulL = ShiftLeft<32>(t) + w3; return InterleaveLower(mulL, mulH); } HWY_INLINE Vec256 MulOdd(const Vec256 a, const Vec256 b) { const Full256 du64; const RepartitionToNarrow du32; const auto maskL = Set(du64, 0xFFFFFFFFULL); const auto a32 = BitCast(du32, a); const auto b32 = BitCast(du32, b); // Inputs for MulEven: we only need bits [95:64] (= upper half of input) const auto aH = Shuffle2301(a32); const auto bH = Shuffle2301(b32); // Same as above, but we're using the odd results (upper 64 bits per block). const auto aLbL = MulEven(a32, b32); const auto w3 = aLbL & maskL; const auto t2 = MulEven(aH, b32) + ShiftRight<32>(aLbL); const auto w2 = t2 & maskL; const auto w1 = ShiftRight<32>(t2); const auto t = MulEven(a32, bH) + w2; const auto k = ShiftRight<32>(t); const auto mulH = MulEven(aH, bH) + w1 + k; const auto mulL = ShiftLeft<32>(t) + w3; return InterleaveUpper(du64, mulL, mulH); } // ------------------------------ WidenMulPairwiseAdd template HWY_API VFromD WidenMulPairwiseAdd(D /*d32*/, Vec256 a, Vec256 b) { return VFromD{_mm256_madd_epi16(a.raw, b.raw)}; } // ------------------------------ SatWidenMulPairwiseAdd template HWY_API VFromD SatWidenMulPairwiseAdd( DI16 /* tag */, VFromD> a, VFromD> b) { return VFromD{_mm256_maddubs_epi16(a.raw, b.raw)}; } // ------------------------------ ReorderWidenMulAccumulate template HWY_API VFromD ReorderWidenMulAccumulate(D d, Vec256 a, Vec256 b, const VFromD sum0, VFromD& /*sum1*/) { (void)d; #if HWY_TARGET <= HWY_AVX3_DL return VFromD{_mm256_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; #else return sum0 + WidenMulPairwiseAdd(d, a, b); #endif } 
// NOTE(review): the line below appears to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. It is left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - RearrangeToOddPlusEven (two overloads): returns `sum0` unchanged and
//    ignores `sum1`.
//  - SumOfMulQuadAccumulate (HWY_TARGET <= HWY_AVX3_DL only):
//    _mm256_dpbusd_epi32(sum, a_u, b_i).
//  - PromoteTo to double: _mm256_cvtps_pd, _mm256_cvtepi32_pd, and, when
//    HWY_TARGET <= HWY_AVX3, _mm256_cvtepu32_pd.
//  - PromoteTo, unsigned (zero-extending per the in-line comment):
//    _mm256_cvtepu8_epi16, _mm256_cvtepu8_epi32, _mm256_cvtepu16_epi32,
//    _mm256_cvtepu32_epi64, _mm256_cvtepu16_epi64, _mm256_cvtepu8_epi64.
// ------------------------------ RearrangeToOddPlusEven HWY_API Vec256 RearrangeToOddPlusEven(const Vec256 sum0, Vec256 /*sum1*/) { return sum0; // invariant already holds } HWY_API Vec256 RearrangeToOddPlusEven(const Vec256 sum0, Vec256 /*sum1*/) { return sum0; // invariant already holds } // ------------------------------ SumOfMulQuadAccumulate #if HWY_TARGET <= HWY_AVX3_DL template HWY_API VFromD SumOfMulQuadAccumulate( DI32 /*di32*/, VFromD> a_u, VFromD> b_i, VFromD sum) { return VFromD{_mm256_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; } #endif // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtps_pd(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepi32_pd(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API Vec256 PromoteTo(D /* tag */, Vec128 v) { return Vec256{_mm256_cvtepu32_pd(v.raw)}; } #endif // Unsigned: zero-extend. // Note: these have 3 cycle latency; if inputs are already split across the // 128 bit blocks (in their upper/lower halves), then Zip* would be faster. template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepu8_epi16(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepu8_epi32(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepu16_epi32(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepu32_epi64(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { return VFromD{_mm256_cvtepu16_epi64(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec32 v) { return VFromD{_mm256_cvtepu8_epi64(v.raw)}; } // Signed: replicate sign bit. 
// NOTE(review): the line below appears to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. It is left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - PromoteTo, signed: _mm256_cvtepi8_epi16, _mm256_cvtepi8_epi32,
//    _mm256_cvtepi16_epi32, _mm256_cvtepi32_epi64, _mm256_cvtepi16_epi64,
//    _mm256_cvtepi8_epi64.
//  - (HWY_TARGET <= HWY_AVX3) PromoteTo from float to 64-bit integers:
//    _mm256_cvttps_epi64 passed through detail::FixConversionOverflow, and
//    _mm256_maskz_cvttps_epu64 zero-masked by
//    detail::UnmaskedNot(MaskFromVec(v)).
//  - (HWY_TARGET > HWY_AVX3) detail::PromoteEvenTo / detail::PromoteOddTo:
//    combine `v` with BroadcastSignBit(v) using OddEven plus DupEven
//    (resp. DupOdd), then bit-cast to `d_to`.
// Note: these have 3 cycle latency; if inputs are already split across the // 128 bit blocks (in their upper/lower halves), then ZipUpper/lo followed by // signed shift would be faster. template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepi8_epi16(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepi8_epi32(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepi16_epi32(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec128 v) { return VFromD{_mm256_cvtepi32_epi64(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec64 v) { return VFromD{_mm256_cvtepi16_epi64(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, Vec32 v) { return VFromD{_mm256_cvtepi8_epi64(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { const Rebind df32; const RebindToFloat df64; const RebindToSigned di32; return detail::FixConversionOverflow( di64, BitCast(df64, PromoteTo(di64, BitCast(di32, v))), VFromD{_mm256_cvttps_epi64(v.raw)}); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{_mm256_maskz_cvttps_epu64( detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ PromoteEvenTo/PromoteOddTo #if HWY_TARGET > HWY_AVX3 namespace detail { // I32->I64 PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, Vec256 v) { return BitCast(d_to, OddEven(DupEven(BroadcastSignBit(v)), v)); } template HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, Vec256 v) { return BitCast(d_to, OddEven(BroadcastSignBit(v), DupOdd(v))); } } // namespace detail #endif // ------------------------------ Demotions (full -> part w/ narrow lanes) 
// NOTE(review): the lines below appear to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. They are left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order (all are DemoteTo overloads):
//  - Pack-based overloads: `v` is packed with itself using
//    _mm256_packus_epi32, _mm256_packs_epi32, _mm256_packus_epi16 or
//    _mm256_packs_epi16, then _mm256_permute4x64_epi64(..., 0x88) concatenates
//    the lower 64 bits of each 128-bit block and the low 128 bits are
//    returned. Two overloads add a second 128-bit step (_mm_packus_epi16 or
//    _mm_packs_epi16) on the narrowed result.
//  - Clamping overloads: Min(v, 0x7FFFFFFF) or Min(v, 0x7FFF), bit-cast to
//    the signed type, then reuse another DemoteTo overload. One of them uses
//    _mm256_cvtusepi32_epi8 instead when HWY_TARGET <= HWY_AVX3.
//  - (HWY_TARGET <= HWY_AVX3) 64-bit sources: _mm256_cvtsepi64_epi32/16/8;
//    _mm256_maskz_cvtusepi64_epi32/16/8 zero-masked by `non_neg_mask`
//    (detail::UnmaskedNot(MaskFromVec(v))); and _mm256_cvtusepi64_epi32/16/8.
template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __m256i u16 = _mm256_packus_epi32(v.raw, v.raw); // Concatenating lower halves of both 128-bit blocks afterward is more // efficient than an extra input with low block = high block of v. return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u16, 0x88))}; } template HWY_API VFromD DemoteTo(D dn, Vec256 v) { const DFromV d; const RebindToSigned di; return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __m256i i16 = _mm256_packs_epi32(v.raw, v.raw); return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); // Concatenate lower 64 bits of each 128-bit block const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); const __m128i i16 = _mm256_castsi256_si128(i16_concat); return VFromD{_mm_packus_epi16(i16, i16)}; } template HWY_API VFromD DemoteTo(D dn, Vec256 v) { #if HWY_TARGET <= HWY_AVX3 (void)dn; return VFromD{_mm256_cvtusepi32_epi8(v.raw)}; #else const DFromV d; const RebindToSigned di; return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFFFFFu)))); #endif } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __m256i u8 = _mm256_packus_epi16(v.raw, v.raw); return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))}; } template HWY_API VFromD DemoteTo(D dn, Vec256 v) { const DFromV d; const RebindToSigned di; return DemoteTo(dn, BitCast(di, Min(v, Set(d, 0x7FFFu)))); } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); // Concatenate lower 64 bits of each 128-bit block const __m256i i16_concat = _mm256_permute4x64_epi64(i16_blocks, 0x88); const __m128i i16 = _mm256_castsi256_si128(i16_concat); return VFromD{_mm_packs_epi16(i16, i16)}; } template HWY_API VFromD DemoteTo(D /* tag */, 
Vec256 v) { const __m256i i8 = _mm256_packs_epi16(v.raw, v.raw); return VFromD{_mm256_castsi256_si128(_mm256_permute4x64_epi64(i8, 0x88))}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtsepi64_epi32(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtsepi64_epi16(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtsepi64_epi8(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm256_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm256_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm256_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtusepi64_epi32(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtusepi64_epi16(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtusepi64_epi8(v.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 #ifndef HWY_DISABLE_F16C // Avoid "value of intrinsic immediate argument '8' is out of range '0 - 7'". // 8 is the correct value of _MM_FROUND_NO_EXC, which is allowed here. 
// NOTE(review): the lines below appear to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. They are left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - DemoteTo to f16: _mm256_cvtps_ph(v, _MM_FROUND_NO_EXC), compiled unless
//    HWY_DISABLE_F16C and wrapped in a diagnostics push/pop; and
//    _mm256_cvtpd_ph when HWY_HAVE_FLOAT16.
//  - DemoteTo to bf16: ShiftRight<16> on the `du32` view of `v`, then
//    DemoteTo(du16, ...) and a bit-cast to `dbf16`.
//  - ReorderDemote2To to bf16: OddEven of `a` and ShiftRight<16>(b), both
//    viewed through `du16`.
//  - ReorderDemote2To via _mm256_packs_epi32, _mm256_packus_epi32,
//    _mm256_packs_epi16, _mm256_packus_epi16, plus two wrappers that first
//    clamp with Min(…, 0x7FFFFFFF) / Min(…, 0x7FFF) and bit-cast to signed.
//  - (HWY_TARGET > HWY_AVX3) three 64-bit ReorderDemote2To overloads built on
//    detail::DemoteFromU64Saturate; results for `a` and `b` are merged with
//    _mm256_shuffle_ps(..., _MM_SHUFFLE(2, 0, 2, 0)). The first XORs with the
//    broadcast sign bit before and after saturating; the second clears lanes
//    with AndNot(BroadcastSignBit(x), x) first.
//  - OrderedDemote2To: ReorderDemote2To followed by
//    _mm256_permute4x64_epi64(..., _MM_SHUFFLE(3, 1, 2, 0)).
//  - DemoteTo from double: _mm256_cvtpd_ps; _mm256_cvttpd_epi32 after
//    detail::ClampF64ToI32Max; and an unsigned variant that uses
//    _mm256_maskz_cvttpd_epu32 when HWY_TARGET <= HWY_AVX3, otherwise clamps
//    to [0, 4294967295], subtracts 2147483648.0 where the value is >= it,
//    converts with _mm256_cvttpd_epi32, and ORs bit 31 back in.
//  - (HWY_TARGET <= HWY_AVX3) DemoteTo via _mm256_cvtepi64_ps and
//    _mm256_cvtepu64_ps.
//  - U8FromU32: TableLookupBytes with `k8From32`, then ORs the lower and upper
//    halves and returns the lower half of that.
//  - detail::LookupAndConcatHalves: _mm256_permutexvar_epi8 with `kMap` when
//    HWY_TARGET <= HWY_AVX3_DL, otherwise TableLookupBytes followed by
//    _mm256_permute4x64_epi64(..., 0xCC); returns the low 128 bits.
HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wsign-conversion") template HWY_API VFromD DemoteTo(D df16, Vec256 v) { const RebindToUnsigned du16; return BitCast( df16, VFromD{_mm256_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}); } HWY_DIAGNOSTICS(pop) #endif // HWY_DISABLE_F16C #if HWY_HAVE_FLOAT16 template HWY_API VFromD DemoteTo(D /*df16*/, Vec256 v) { return VFromD{_mm256_cvtpd_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD DemoteTo(D dbf16, Vec256 v) { // TODO(janwas): _mm256_cvtneps_pbh once we have avx512bf16. const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } template HWY_API VFromD ReorderDemote2To(D dbf16, Vec256 a, Vec256 b) { // TODO(janwas): _mm256_cvtne2ps_pbh once we have avx512bf16. const RebindToUnsigned du16; const Repartition du32; const Vec256 b_in_even = ShiftRight<16>(BitCast(du32, b)); return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); } template HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, Vec256 b) { return VFromD{_mm256_packs_epi32(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, Vec256 b) { return VFromD{_mm256_packus_epi32(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec256 a, Vec256 b) { const DFromV d; const RebindToSigned di; const auto max_i32 = Set(d, 0x7FFFFFFFu); return ReorderDemote2To(dn, BitCast(di, Min(a, max_i32)), BitCast(di, Min(b, max_i32))); } template HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, Vec256 b) { return VFromD{_mm256_packs_epi16(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D /*d16*/, Vec256 a, Vec256 b) { return VFromD{_mm256_packus_epi16(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec256 a, Vec256 b) { const DFromV d; const RebindToSigned di; const auto max_i16 = Set(d, 0x7FFFu); return 
ReorderDemote2To(dn, BitCast(di, Min(a, max_i16)), BitCast(di, Min(b, max_i16))); } #if HWY_TARGET > HWY_AVX3 template HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, Vec256 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; const Repartition dn_f; // Negative values are saturated by first saturating their bitwise inverse // and then inverting the saturation result const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); const auto saturated_a = Xor( invert_mask_a, detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); const auto saturated_b = Xor( invert_mask_b, detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); return BitCast(dn, Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, BitCast(dn_f, saturated_b).raw, _MM_SHUFFLE(2, 0, 2, 0))}); } template HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, Vec256 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; const Repartition dn_f; const auto saturated_a = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); const auto saturated_b = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); return BitCast(dn, Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, BitCast(dn_f, saturated_b).raw, _MM_SHUFFLE(2, 0, 2, 0))}); } template HWY_API Vec256 ReorderDemote2To(D dn, Vec256 a, Vec256 b) { const Half dnh; const Repartition dn_f; const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); return BitCast(dn, Vec256{_mm256_shuffle_ps(BitCast(dn_f, saturated_a).raw, BitCast(dn_f, saturated_b).raw, _MM_SHUFFLE(2, 0, 2, 0))}); } #endif // HWY_TARGET > HWY_AVX3 template ), HWY_IF_V_SIZE_D(D, 32), HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2), HWY_IF_T_SIZE_ONE_OF_V(V, (1 
<< 1) | (1 << 2) | (1 << 4) | ((HWY_TARGET > HWY_AVX3) ? (1 << 8) : 0))> HWY_API VFromD OrderedDemote2To(D d, V a, V b) { return VFromD{_mm256_permute4x64_epi64(ReorderDemote2To(d, a, b).raw, _MM_SHUFFLE(3, 1, 2, 0))}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtpd_ps(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, Vec256 v) { const Full256 d64; const auto clamped = detail::ClampF64ToI32Max(d64, v); return VFromD{_mm256_cvttpd_epi32(clamped.raw)}; } template HWY_API VFromD DemoteTo(D du32, Vec256 v) { #if HWY_TARGET <= HWY_AVX3 (void)du32; return VFromD{_mm256_maskz_cvttpd_epu32( detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; #else // AVX2 const Rebind df64; const RebindToUnsigned du64; // Clamp v[i] to a value between 0 and 4294967295 const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0)); const auto k2_31 = Set(df64, 2147483648.0); const auto clamped_is_ge_k2_31 = (clamped >= k2_31); const auto clamped_lo31_f64 = clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31); const VFromD clamped_lo31_u32{_mm256_cvttpd_epi32(clamped_lo31_f64.raw)}; const auto clamped_u32_msb = ShiftLeft<31>( TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31)))); return Or(clamped_lo31_u32, clamped_u32_msb); #endif } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm256_cvtepi64_ps(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm256_cvtepu64_ps(v.raw)}; } #endif // For already range-limited input [0, 255]. HWY_API Vec128 U8FromU32(const Vec256 v) { const Full256 d32; const Full64 d8; alignas(32) static constexpr uint32_t k8From32[8] = { 0x0C080400u, ~0u, ~0u, ~0u, ~0u, 0x0C080400u, ~0u, ~0u}; // Place first four bytes in lo[0], remaining 4 in hi[1]. const auto quad = TableLookupBytes(v, Load(d32, k8From32)); // Interleave both quadruplets - OR instead of unpack reduces port5 pressure. 
const auto lo = LowerHalf(quad); const auto hi = UpperHalf(Half(), quad); return BitCast(d8, LowerHalf(lo | hi)); } // ------------------------------ Truncations namespace detail { // LO and HI each hold four indices of bytes within a 128-bit block. template HWY_INLINE Vec128 LookupAndConcatHalves(Vec256 v) { const Full256 d32; #if HWY_TARGET <= HWY_AVX3_DL alignas(32) static constexpr uint32_t kMap[8] = { LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0}; const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); #else alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u, ~0u, ~0u, LO, HI}; const auto quad = TableLookupBytes(v, Load(d32, kMap)); const auto result = _mm256_permute4x64_epi64(quad.raw, 0xCC); // Possible alternative: // const auto lo = LowerHalf(quad); // const auto hi = UpperHalf(Half(), quad); // const auto result = lo | hi; #endif return Vec128{_mm256_castsi256_si128(result)}; } // LO and HI each hold two indices of bytes within a 128-bit block. 
// NOTE(review): the lines below appear to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. They are left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - detail::LookupAndConcatQuarters: _mm256_permutexvar_epi8 with `kMap` when
//    HWY_TARGET <= HWY_AVX3_DL; otherwise TableLookupBytes,
//    _mm256_permute4x64_epi64(..., 0xCC) and _mm_packus_epi32. Returns the
//    lower half of the resulting 128-bit vector.
//  - TruncateTo overloads, in order:
//      * byte-permute with kMap[0] = 0x18100800 on AVX3_DL, else
//        TableLookupBytes and OR of the two halves; result narrowed with
//        repeated LowerHalf;
//      * detail::LookupAndConcatQuarters<0x100, 0x908>;
//      * TableLookupLanes with indices {0, 2, 4, 6, 0, 2, 4, 6} on the `d32`
//        view, then LowerHalf;
//      * detail::LookupAndConcatQuarters<0x400, 0xC08>;
//      * detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>;
//      * detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>.
//  - ConvertTo, integer to floating point: _mm256_cvtepu16_ph and
//    _mm256_cvtepi16_ph (HWY_HAVE_FLOAT16), _mm256_cvtepi32_ps, and for
//    HWY_TARGET <= HWY_AVX3 _mm256_cvtepu32_ps, _mm256_cvtepi64_pd,
//    _mm256_cvtepu64_pd.
template HWY_INLINE Vec128 LookupAndConcatQuarters(Vec256 v) { const Full256 d16; #if HWY_TARGET <= HWY_AVX3_DL alignas(32) static constexpr uint16_t kMap[16] = { LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw); return LowerHalf(Vec128{_mm256_castsi256_si128(result)}); #else constexpr uint16_t ff = static_cast(~0u); alignas(32) static constexpr uint16_t kMap[16] = { LO, ff, HI, ff, ff, ff, ff, ff, ff, ff, ff, ff, LO, ff, HI, ff}; const auto quad = TableLookupBytes(v, Load(d16, kMap)); const auto mixed = _mm256_permute4x64_epi64(quad.raw, 0xCC); const auto half = _mm256_castsi256_si128(mixed); return LowerHalf(Vec128{_mm_packus_epi32(half, half)}); #endif } } // namespace detail template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const Full256 d32; #if HWY_TARGET <= HWY_AVX3_DL alignas(32) static constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0, 0, 0, 0, 0}; const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw); return LowerHalf(LowerHalf(LowerHalf(Vec256{result}))); #else alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u, 0x0800FFFFu, ~0u, ~0u, ~0u}; const auto quad = TableLookupBytes(v, Load(d32, kMap)); const auto lo = LowerHalf(quad); const auto hi = UpperHalf(Half(), quad); const auto result = lo | hi; return LowerHalf(LowerHalf(Vec128{result.raw})); #endif } template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const auto result = detail::LookupAndConcatQuarters<0x100, 0x908>(v); return VFromD{result.raw}; } template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const Full256 d32; alignas(32) static constexpr uint32_t kEven[8] = {0, 2, 4, 6, 0, 2, 4, 6}; const auto v32 = TableLookupLanes(BitCast(d32, v), SetTableIndices(d32, kEven)); return LowerHalf(Vec256{v32.raw}); } template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const auto full = detail::LookupAndConcatQuarters<0x400, 0xC08>(v); return 
VFromD{full.raw}; } template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const auto full = detail::LookupAndConcatHalves<0x05040100, 0x0D0C0908>(v); return VFromD{full.raw}; } template HWY_API VFromD TruncateTo(D /* tag */, Vec256 v) { const auto full = detail::LookupAndConcatHalves<0x06040200, 0x0E0C0A08>(v); return VFromD{full.raw}; } // ------------------------------ Integer <=> fp (ShiftRight, OddEven) #if HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtepu16_ph(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtepi16_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D /* tag */, Vec256 v) { return VFromD{_mm256_cvtepi32_ps(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD ConvertTo(D /*df*/, Vec256 v) { return VFromD{_mm256_cvtepu32_ps(v.raw)}; } template HWY_API VFromD ConvertTo(D /*dd*/, Vec256 v) { return VFromD{_mm256_cvtepi64_pd(v.raw)}; } template HWY_API VFromD ConvertTo(D /*dd*/, Vec256 v) { return VFromD{_mm256_cvtepu64_pd(v.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 // Truncates (rounds toward zero). 
// NOTE(review): the lines below appear to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. They are left untouched apart
// from two existing comments that referred to a parameter `p`; the parameter
// is named `bits`. The summary reads the text as the sequence of definitions
// it evidently was.
//
// Contents, in order:
//  - ConvertTo, floating point to integer (truncating per the preceding
//    comment): _mm256_cvttph_epi16 and _mm256_maskz_cvttph_epu16
//    (HWY_HAVE_FLOAT16); _mm256_cvttps_epi32; for HWY_TARGET <= HWY_AVX3 also
//    _mm256_cvttpd_epi64, _mm256_maskz_cvttps_epu32 and
//    _mm256_maskz_cvttpd_epu64. Signed results pass through
//    detail::FixConversionOverflow; unsigned ones are zero-masked by
//    detail::UnmaskedNot(MaskFromVec(v)).
//  - (AVX2) ConvertTo float -> unsigned 32-bit: zeroes negative inputs,
//    derives `exp_diff` = 158 - (bits >> 23), adjusts the exponent field where
//    exp_diff == 0, converts with _mm256_cvttps_epi32, adds the converted
//    value to itself for those adjusted lanes, and ORs in
//    BroadcastSignBit(exp_diff).
//  - NearestInt: _mm256_cvtps_epi32 passed through
//    detail::FixConversionOverflow.
//  - PromoteTo from f16: _mm256_cvtph_ps (unless HWY_DISABLE_F16C) and
//    _mm256_cvtph_pd (HWY_HAVE_FLOAT16). PromoteTo from bf16: PromoteTo to
//    `di32` followed by ShiftLeft<16>.
//  - CRYPTO (compiled unless HWY_DISABLE_PCLMUL_AES): AESRound, AESLastRound,
//    AESRoundInv, AESLastRoundInv, CLMulLower, CLMulUpper use the 256-bit
//    intrinsics when HWY_TARGET <= HWY_AVX3_DL, otherwise Combine the results
//    of the same operation applied to the upper and lower halves.
//    AESInvMixColumns uses AESRoundInv(AESLastRound(state, 0), 0) on AVX3_DL.
//    AESKeyGenAssist uses DupOdd, AESLastRound with `rconXorMask`, and a byte
//    shuffle with `rotWordShuffle` on AVX3_DL.
//  - (HWY_TARGET <= HWY_AVX3) LoadMaskBits: copies kNumBytes from `bits`
//    into a zeroed uint64_t, masks it when kN < 8, returns FromBits.
//    StoreMaskBits: copies kNumBytes of mask.raw to `bits`, clears the
//    unused upper bits of bits[0] when kN < 8, returns kNumBytes.
//    CountTrue: PopCount of mask.raw. FindKnownFirstTrue /
//    FindKnownLastTrue: Num0BitsBelowLS1Bit_Nonzero32 and
//    31 - Num0BitsAboveMS1Bit_Nonzero32. FindFirstTrue / FindLastTrue:
//    the same, or -1 when mask.raw is zero.
#if HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D d, Vec256 v) { return detail::FixConversionOverflow(d, v, VFromD{_mm256_cvttph_epi16(v.raw)}); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{_mm256_maskz_cvttph_epu16( detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D d, Vec256 v) { return detail::FixConversionOverflow(d, v, VFromD{_mm256_cvttps_epi32(v.raw)}); } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD ConvertTo(D di, Vec256 v) { return detail::FixConversionOverflow(di, v, VFromD{_mm256_cvttpd_epi64(v.raw)}); } template HWY_API VFromD ConvertTo(DU /*du*/, VFromD> v) { return VFromD{_mm256_maskz_cvttps_epu32( detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } template HWY_API VFromD ConvertTo(DU /*du*/, VFromD> v) { return VFromD{_mm256_maskz_cvttpd_epu64( detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #else // AVX2 template HWY_API VFromD ConvertTo(DU32 du32, VFromD> v) { const RebindToSigned di32; const RebindToFloat df32; const auto non_neg_v = ZeroIfNegative(v); const auto exp_diff = Set(di32, int32_t{158}) - BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v))); const auto scale_down_f32_val_mask = BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32)))); const auto v_scaled = BitCast( df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask)); const VFromD f32_to_u32_result{ _mm256_cvttps_epi32(v_scaled.raw)}; return Or( BitCast(du32, BroadcastSignBit(exp_diff)), f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask)); } #endif // HWY_TARGET <= HWY_AVX3 HWY_API Vec256 NearestInt(const Vec256 v) { const Full256 di; return detail::FixConversionOverflow( di, v, Vec256{_mm256_cvtps_epi32(v.raw)}); } #ifndef HWY_DISABLE_F16C template HWY_API VFromD PromoteTo(D df32, Vec128 v) { (void)df32; #if HWY_HAVE_FLOAT16 const RebindToUnsigned> du16; return VFromD{_mm256_cvtph_ps(BitCast(du16, v).raw)}; #else return 
VFromD{_mm256_cvtph_ps(v.raw)}; #endif // HWY_HAVE_FLOAT16 } #endif // HWY_DISABLE_F16C #if HWY_HAVE_FLOAT16 template HWY_INLINE VFromD PromoteTo(D /*tag*/, Vec64 v) { return VFromD{_mm256_cvtph_pd(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD PromoteTo(D df32, Vec128 v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } // ================================================== CRYPTO #if !defined(HWY_DISABLE_PCLMUL_AES) HWY_API Vec256 AESRound(Vec256 state, Vec256 round_key) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_aesenc_epi128(state.raw, round_key.raw)}; #else const Full256 d; const Half d2; return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), AESRound(LowerHalf(state), LowerHalf(round_key))); #endif } HWY_API Vec256 AESLastRound(Vec256 state, Vec256 round_key) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_aesenclast_epi128(state.raw, round_key.raw)}; #else const Full256 d; const Half d2; return Combine(d, AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), AESLastRound(LowerHalf(state), LowerHalf(round_key))); #endif } HWY_API Vec256 AESRoundInv(Vec256 state, Vec256 round_key) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_aesdec_epi128(state.raw, round_key.raw)}; #else const Full256 d; const Half d2; return Combine(d, AESRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), AESRoundInv(LowerHalf(state), LowerHalf(round_key))); #endif } HWY_API Vec256 AESLastRoundInv(Vec256 state, Vec256 round_key) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_aesdeclast_epi128(state.raw, round_key.raw)}; #else const Full256 d; const Half d2; return Combine( d, AESLastRoundInv(UpperHalf(d2, state), UpperHalf(d2, round_key)), AESLastRoundInv(LowerHalf(state), LowerHalf(round_key))); #endif } template )> HWY_API V AESInvMixColumns(V state) { const DFromV d; #if HWY_TARGET <= HWY_AVX3_DL // On AVX3_DL, it is more efficient to do an 
InvMixColumns operation for a // 256-bit or 512-bit vector by doing a AESLastRound operation // (_mm256_aesenclast_epi128/_mm512_aesenclast_epi128) followed by a // AESRoundInv operation (_mm256_aesdec_epi128/_mm512_aesdec_epi128) than to // split the vector into 128-bit vectors, carrying out multiple // _mm_aesimc_si128 operations, and then combining the _mm_aesimc_si128 // results back into a 256-bit or 512-bit vector. const auto zero = Zero(d); return AESRoundInv(AESLastRound(state, zero), zero); #else const Half dh; return Combine(d, AESInvMixColumns(UpperHalf(dh, state)), AESInvMixColumns(LowerHalf(dh, state))); #endif } template HWY_API Vec256 AESKeyGenAssist(Vec256 v) { const Full256 d; #if HWY_TARGET <= HWY_AVX3_DL const VFromD rconXorMask = Dup128VecFromValues( d, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0); const VFromD rotWordShuffle = Dup128VecFromValues( d, 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12); const Repartition du32; const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); const auto sub_word_result = AESLastRound(w13, rconXorMask); return TableLookupBytes(sub_word_result, rotWordShuffle); #else const Half d2; return Combine(d, AESKeyGenAssist(UpperHalf(d2, v)), AESKeyGenAssist(LowerHalf(v))); #endif } HWY_API Vec256 CLMulLower(Vec256 a, Vec256 b) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)}; #else const Full256 d; const Half d2; return Combine(d, CLMulLower(UpperHalf(d2, a), UpperHalf(d2, b)), CLMulLower(LowerHalf(a), LowerHalf(b))); #endif } HWY_API Vec256 CLMulUpper(Vec256 a, Vec256 b) { #if HWY_TARGET <= HWY_AVX3_DL return Vec256{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x11)}; #else const Full256 d; const Half d2; return Combine(d, CLMulUpper(UpperHalf(d2, a), UpperHalf(d2, b)), CLMulUpper(LowerHalf(a), LowerHalf(b))); #endif } #endif // HWY_DISABLE_PCLMUL_AES // ================================================== MISC #if HWY_TARGET <= HWY_AVX3 // 
------------------------------ LoadMaskBits // `bits` points to at least 8 readable bytes, not all of which need be valid. template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { constexpr size_t kN = MaxLanes(d); constexpr size_t kNumBytes = (kN + 7) / 8; uint64_t mask_bits = 0; CopyBytes(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } return MFromD::FromBits(mask_bits); } // ------------------------------ StoreMaskBits // `bits` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kN = MaxLanes(d); constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(&mask.raw, bits); // Non-full byte, need to clear the undefined upper bits. if (kN < 8) { const int mask_bits = static_cast((1ull << kN) - 1); bits[0] = static_cast(bits[0] & mask_bits); } return kNumBytes; } // ------------------------------ Mask testing template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { return PopCount(static_cast(mask.raw)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero32(mask.raw); } template HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { return mask.raw ? static_cast(FindKnownFirstTrue(d, mask)) : intptr_t{-1}; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask.raw); } template HWY_API intptr_t FindLastTrue(D d, MFromD mask) { return mask.raw ? static_cast(FindKnownLastTrue(d, mask)) : intptr_t{-1}; } // Beware: the suffix indicates the number of mask bits, not lane size! 
// NOTE(review): the line below appears to be an extraction-damaged copy: line
// breaks are gone (so every `//` comment swallows the code after it) and the
// `<...>` template-argument lists are missing. It is left untouched; the
// summary reads the text as the sequence of definitions it evidently was.
//
// Contents, in order:
//  - detail::AllFalse for SizeTag<1>, <2>, <4>: _kortestz_mask32_u8 /
//    _kortestz_mask16_u8 / _kortestz_mask8_u8 of the mask with itself when
//    HWY_COMPILER_HAS_MASK_INTRINSICS, otherwise `mask.raw == 0`.
//    SizeTag<8>: tests only the low four bits, (mask.raw & 0xF) == 0.
//  - AllFalse: dispatches to detail::AllFalse via hwy::SizeTag.
//  - detail::AllTrue for SizeTag<1>, <2>, <4>: _kortestc_mask32_u8 /
//    _kortestc_mask16_u8 / _kortestc_mask8_u8 when
//    HWY_COMPILER_HAS_MASK_INTRINSICS, otherwise comparison with 0xFFFFFFFF /
//    0xFFFF / 0xFF. SizeTag<8>: `mask.raw == 0xF`, since fewer than 8 mask
//    bits are in use (see the in-line comment).
//  - AllTrue: dispatches to detail::AllTrue via hwy::SizeTag.
namespace detail { template HWY_INLINE bool AllFalse(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestz_mask32_u8(mask.raw, mask.raw); #else return mask.raw == 0; #endif } template HWY_INLINE bool AllFalse(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestz_mask16_u8(mask.raw, mask.raw); #else return mask.raw == 0; #endif } template HWY_INLINE bool AllFalse(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestz_mask8_u8(mask.raw, mask.raw); #else return mask.raw == 0; #endif } template HWY_INLINE bool AllFalse(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { return (uint64_t{mask.raw} & 0xF) == 0; } } // namespace detail template HWY_API bool AllFalse(D /* tag */, MFromD mask) { return detail::AllFalse(hwy::SizeTag)>(), mask); } namespace detail { template HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestc_mask32_u8(mask.raw, mask.raw); #else return mask.raw == 0xFFFFFFFFu; #endif } template HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestc_mask16_u8(mask.raw, mask.raw); #else return mask.raw == 0xFFFFu; #endif } template HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return _kortestc_mask8_u8(mask.raw, mask.raw); #else return mask.raw == 0xFFu; #endif } template HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask256 mask) { // Cannot use _kortestc because we have less than 8 mask bits. return mask.raw == 0xFu; } } // namespace detail template HWY_API bool AllTrue(D /* tag */, const MFromD mask) { return detail::AllTrue(hwy::SizeTag)>(), mask); } // ------------------------------ Compress // 16-bit is defined in x86_512 so we can use 512-bit vectors. 
// NOTE(review): in this text the template parameter lists and `<...>`
// arguments are absent (e.g. `template` is directly followed by `HWY_API`);
// confirm against upstream before relying on overload selection.

// Compress via the zero-masking 32-bit integer compress intrinsic.
template HWY_API Vec256 Compress(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_compress_epi32(mask.raw, v.raw)};
}

// Compress via the zero-masking single-precision compress intrinsic.
HWY_API Vec256 Compress(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_compress_ps(mask.raw, v.raw)};
}

// Table-driven Compress: mask.raw indexes a 16-entry table whose entries pack
// one 4-bit lane index per output lane.
template HWY_API Vec256 Compress(Vec256 v, Mask256 mask) {
  // See CompressIsPartition.
  alignas(16) static constexpr uint64_t packed_array[16] = {
      // PrintCompress64x4NibbleTables
      0x00003210, 0x00003210, 0x00003201, 0x00003210, 0x00003102, 0x00003120,
      0x00003021, 0x00003210, 0x00002103, 0x00002130, 0x00002031, 0x00002310,
      0x00001032, 0x00001320, 0x00000321, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
  // _mm256_permutexvar_epi64 will ignore the upper bits.
  const DFromV d;
  const RebindToUnsigned du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  // NOTE(review): alignas(64) here, whereas the same array in CompressNot
  // below uses alignas(32).
  alignas(64) static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
  const auto indices = Indices256{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// ------------------------------ CompressNot (Compress)

// Implemented in x86_512 for lane size != 8.

// Same scheme as the table-driven Compress above, using a different table.
template HWY_API Vec256 CompressNot(Vec256 v, Mask256 mask) {
  // See CompressIsPartition.
  alignas(16) static constexpr uint64_t packed_array[16] = {
      // PrintCompressNot64x4NibbleTables
      0x00003210, 0x00000321, 0x00001320, 0x00001032, 0x00002310, 0x00002031,
      0x00002130, 0x00002103, 0x00003210, 0x00003021, 0x00003120, 0x00003102,
      0x00003210, 0x00003201, 0x00003210, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2) -
  // _mm256_permutexvar_epi64 will ignore the upper bits.
  const DFromV d;
  const RebindToUnsigned du64;
  const auto packed = Set(du64, packed_array[mask.raw]);
  alignas(32) static constexpr uint64_t shifts[4] = {0, 4, 8, 12};
  const auto indices = Indices256{(packed >> Load(du64, shifts)).raw};
  return TableLookupLanes(v, indices);
}

// ------------------------------ CompressStore (defined in x86_512)
// ------------------------------ CompressBlendedStore (defined in x86_512)
// ------------------------------ CompressBitsStore (defined in x86_512)

#else  // AVX2

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// 256 suffix avoids ambiguity with x86_128 without needing HWY_IF_V_SIZE.

// Byte-lane variant: broadcasts the bits as 32-bit lanes, replicates each
// source byte across eight byte lanes, then tests one bit per lane.
template HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) {
  const Full256 d;
  const RebindToUnsigned du;
  const Repartition du32;
  const auto vbits = BitCast(du, Set(du32, static_cast(mask_bits)));

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  const Repartition du64;
  alignas(32) static constexpr uint64_t kRep8[4] = {
      0x0000000000000000ull, 0x0101010101010101ull, 0x0202020202020202ull,
      0x0303030303030303ull};
  const auto rep8 = TableLookupBytes(vbits, BitCast(du, Load(du64, kRep8)));

  const VFromD bit = Dup128VecFromValues(
      du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
  return RebindMask(d, TestBit(rep8, bit));
}

// 16-lane variant: broadcast the bits and test lane i against 1 << i.
template HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) {
  const Full256 d;
  const RebindToUnsigned du;
  alignas(32) static constexpr uint16_t kBit[16] = {
      1,     2,     4,     8,     16,     32,     64,     128,
      0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000};
  const auto vmask_bits = Set(du, static_cast(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

// 8-lane variant: broadcast the bits and test lane i against 1 << i.
template HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) {
  const Full256 d;
  const RebindToUnsigned du;
  alignas(32) static constexpr uint32_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}
// NOTE(review): in this text the template parameter lists and `<...>`
// arguments are absent (e.g. `detail::LoadMaskBits256>(mask_bits)` below);
// confirm against upstream before relying on overload selection.

// Variant using uint64_t bit constants: broadcast the bits and test lane i
// against 1 << i.
template HWY_INLINE Mask256 LoadMaskBits256(uint64_t mask_bits) {
  const Full256 d;
  const RebindToUnsigned du;
  // Only the first four of the eight entries are given initializers; the
  // remaining entries are zero.
  alignas(32) static constexpr uint64_t kBit[8] = {1, 2, 4, 8};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail

// Reads packed mask bits from `bits` and expands them into a vector mask.
// `bits` points to at least 8 readable bytes, not all of which need be valid.
template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t kN = MaxLanes(d);
  // Bytes required to hold one bit per lane (rounded up).
  constexpr size_t kNumBytes = (kN + 7) / 8;

  // Zero-initialized so that any bytes CopyBytes does not write remain 0.
  uint64_t mask_bits = 0;
  CopyBytes(bits, &mask_bits);

  // With fewer than 8 lanes, discard the bits at or above kN.
  if (kN < 8) {
    mask_bits &= (1ull << kN) - 1;
  }

  return detail::LoadMaskBits256>(mask_bits);
}

// ------------------------------ StoreMaskBits

namespace detail {

// BitsFromMask overloads: convert a vector mask to packed bits.

// Uses the per-byte movemask.
template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) {
  const Full256 d;
  const Full256 d8;
  const auto sign_bits = BitCast(d8, VecFromMask(d, mask)).raw;
  // Prevent sign-extension of 32-bit masks because the intrinsic returns int.
  return static_cast(_mm256_movemask_epi8(sign_bits));
}

// 16-bit lanes: either per-byte bits followed by pext, or packs + movemask.
template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) {
#if !defined(HWY_DISABLE_BMI2_FMA) && !defined(HWY_DISABLE_PEXT_ON_AVX2)
  const Full256 d;
  const Full256 d8;
  const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
  const uint64_t sign_bits8 = BitsFromMask(mask8);
  // Skip the bits from the lower byte of each u16 (better not to use the
  // same packs_epi16 as SSE4, because that requires an extra swizzle here).
  return _pext_u32(static_cast(sign_bits8), 0xAAAAAAAAu);
#else
  // Slow workaround for when BMI2 is disabled
  // Remove useless lower half of each u16 while preserving the sign bit.
  // Bytes [0, 8) and [16, 24) have the same sign bits as the input lanes.
  const auto sign_bits = _mm256_packs_epi16(mask.raw, _mm256_setzero_si256());
  // Move odd qwords (value zero) to top so they don't affect the mask value.
  const auto compressed = _mm256_castsi256_si128(
      _mm256_permute4x64_epi64(sign_bits, _MM_SHUFFLE(3, 1, 2, 0)));
  return static_cast(_mm_movemask_epi8(compressed));
#endif  // !HWY_DISABLE_BMI2_FMA && !HWY_DISABLE_PEXT_ON_AVX2
}

// Uses the single-precision movemask.
template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) {
  const Full256 d;
  const Full256 df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast(_mm256_movemask_ps(sign_bits));
}

// Uses the double-precision movemask.
template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) {
  const Full256 d;
  const Full256 df;
  const auto sign_bits = BitCast(df, VecFromMask(d, mask)).raw;
  return static_cast(_mm256_movemask_pd(sign_bits));
}

}  // namespace detail

// Writes the packed mask bits to `bits` and returns kNumBytes = (N + 7) / 8.
// `bits` points to at least 8 writable bytes.
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) {
  constexpr size_t N = Lanes(d);
  constexpr size_t kNumBytes = (N + 7) / 8;

  const uint64_t mask_bits = detail::BitsFromMask(mask);
  CopyBytes(&mask_bits, bits);
  return kNumBytes;
}

// ------------------------------ Mask testing

// Specialize for 16-bit lanes to avoid unnecessary pext. This assumes each mask
// lane is 0 or ~0.
template HWY_API bool AllFalse(D d, MFromD mask) {
  const Repartition d8;
  const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
  return detail::BitsFromMask(mask8) == 0;
}

template HWY_API bool AllFalse(D /* tag */, MFromD mask) {
  // Cheaper than PTEST, which is 2 uop / 3L.
  return detail::BitsFromMask(mask) == 0;
}

// Counterpart of the 16-bit AllFalse above: the mask is viewed through d8 and
// compared against 32 set bits.
template HWY_API bool AllTrue(D d, MFromD mask) {
  const Repartition d8;
  const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
  return detail::BitsFromMask(mask8) == (1ull << 32) - 1;
}
template HWY_API bool AllTrue(D d, MFromD mask) {
  // One set bit per lane.
  constexpr uint64_t kAllBits = (1ull << Lanes(d)) - 1;
  return detail::BitsFromMask(mask) == kAllBits;
}

// Counts bits of the mask viewed through d8 and halves the result; this
// relies on each mask lane being 0 or ~0, as stated for AllFalse above.
template HWY_API size_t CountTrue(D d, MFromD mask) {
  const Repartition d8;
  const Mask256 mask8 = MaskFromVec(BitCast(d8, VecFromMask(d, mask)));
  return PopCount(detail::BitsFromMask(mask8)) >> 1;
}
template HWY_API size_t CountTrue(D /* tag */, MFromD mask) {
  return PopCount(detail::BitsFromMask(mask));
}

// Precondition implied by the helper's name: the mask bits are nonzero.
template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) {
  const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask));
  return Num0BitsBelowLS1Bit_Nonzero32(mask_bits);
}

// Returns -1 if no mask bit is set.
template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) {
  const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask));
  return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1;
}

// Precondition implied by the helper's name: the mask bits are nonzero.
template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) {
  const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask));
  return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits);
}

// Returns -1 if no mask bit is set.
template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) {
  const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask));
  return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits))
                   : -1;
}

// ------------------------------ Compress, CompressBits

namespace detail {

// Returns a vector of 32-bit lanes whose low bits are the source-lane indices
// for Compress. `mask_bits` indexes a 256-entry table directly, so it must be
// below 256.
template HWY_INLINE Vec256 IndicesFromBits256(uint64_t mask_bits) {
  const Full256 d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
  // and unavailable in 32-bit builds. We instead compress each index into 4
  // bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompress32x8Tables
      0x76543210, 0x76543218, 0x76543209, 0x76543298, 0x7654310a, 0x765431a8, 0x765430a9, 0x76543a98,
      0x7654210b, 0x765421b8, 0x765420b9, 0x76542b98, 0x765410ba, 0x76541ba8, 0x76540ba9, 0x7654ba98,
      0x7653210c, 0x765321c8, 0x765320c9, 0x76532c98, 0x765310ca, 0x76531ca8, 0x76530ca9, 0x7653ca98,
      0x765210cb, 0x76521cb8, 0x76520cb9, 0x7652cb98, 0x76510cba, 0x7651cba8, 0x7650cba9, 0x765cba98,
      0x7643210d, 0x764321d8, 0x764320d9, 0x76432d98, 0x764310da, 0x76431da8, 0x76430da9, 0x7643da98,
      0x764210db, 0x76421db8, 0x76420db9, 0x7642db98, 0x76410dba, 0x7641dba8, 0x7640dba9, 0x764dba98,
      0x763210dc, 0x76321dc8, 0x76320dc9, 0x7632dc98, 0x76310dca, 0x7631dca8, 0x7630dca9, 0x763dca98,
      0x76210dcb, 0x7621dcb8, 0x7620dcb9, 0x762dcb98, 0x7610dcba, 0x761dcba8, 0x760dcba9, 0x76dcba98,
      0x7543210e, 0x754321e8, 0x754320e9, 0x75432e98, 0x754310ea, 0x75431ea8, 0x75430ea9, 0x7543ea98,
      0x754210eb, 0x75421eb8, 0x75420eb9, 0x7542eb98, 0x75410eba, 0x7541eba8, 0x7540eba9, 0x754eba98,
      0x753210ec, 0x75321ec8, 0x75320ec9, 0x7532ec98, 0x75310eca, 0x7531eca8, 0x7530eca9, 0x753eca98,
      0x75210ecb, 0x7521ecb8, 0x7520ecb9, 0x752ecb98, 0x7510ecba, 0x751ecba8, 0x750ecba9, 0x75ecba98,
      0x743210ed, 0x74321ed8, 0x74320ed9, 0x7432ed98, 0x74310eda, 0x7431eda8, 0x7430eda9, 0x743eda98,
      0x74210edb, 0x7421edb8, 0x7420edb9, 0x742edb98, 0x7410edba, 0x741edba8, 0x740edba9, 0x74edba98,
      0x73210edc, 0x7321edc8, 0x7320edc9, 0x732edc98, 0x7310edca, 0x731edca8, 0x730edca9, 0x73edca98,
      0x7210edcb, 0x721edcb8, 0x720edcb9, 0x72edcb98, 0x710edcba, 0x71edcba8, 0x70edcba9, 0x7edcba98,
      0x6543210f, 0x654321f8, 0x654320f9, 0x65432f98, 0x654310fa, 0x65431fa8, 0x65430fa9, 0x6543fa98,
      0x654210fb, 0x65421fb8, 0x65420fb9, 0x6542fb98, 0x65410fba, 0x6541fba8, 0x6540fba9, 0x654fba98,
      0x653210fc, 0x65321fc8, 0x65320fc9, 0x6532fc98, 0x65310fca, 0x6531fca8, 0x6530fca9, 0x653fca98,
      0x65210fcb, 0x6521fcb8, 0x6520fcb9, 0x652fcb98, 0x6510fcba, 0x651fcba8, 0x650fcba9, 0x65fcba98,
      0x643210fd, 0x64321fd8, 0x64320fd9, 0x6432fd98, 0x64310fda, 0x6431fda8, 0x6430fda9, 0x643fda98,
      0x64210fdb, 0x6421fdb8, 0x6420fdb9, 0x642fdb98, 0x6410fdba, 0x641fdba8, 0x640fdba9, 0x64fdba98,
      0x63210fdc, 0x6321fdc8, 0x6320fdc9, 0x632fdc98, 0x6310fdca, 0x631fdca8, 0x630fdca9, 0x63fdca98,
      0x6210fdcb, 0x621fdcb8, 0x620fdcb9, 0x62fdcb98, 0x610fdcba, 0x61fdcba8, 0x60fdcba9, 0x6fdcba98,
      0x543210fe, 0x54321fe8, 0x54320fe9, 0x5432fe98, 0x54310fea, 0x5431fea8, 0x5430fea9, 0x543fea98,
      0x54210feb, 0x5421feb8, 0x5420feb9, 0x542feb98, 0x5410feba, 0x541feba8, 0x540feba9, 0x54feba98,
      0x53210fec, 0x5321fec8, 0x5320fec9, 0x532fec98, 0x5310feca, 0x531feca8, 0x530feca9, 0x53feca98,
      0x5210fecb, 0x521fecb8, 0x520fecb9, 0x52fecb98, 0x510fecba, 0x51fecba8, 0x50fecba9, 0x5fecba98,
      0x43210fed, 0x4321fed8, 0x4320fed9, 0x432fed98, 0x4310feda, 0x431feda8, 0x430feda9, 0x43feda98,
      0x4210fedb, 0x421fedb8, 0x420fedb9, 0x42fedb98, 0x410fedba, 0x41fedba8, 0x40fedba9, 0x4fedba98,
      0x3210fedc, 0x321fedc8, 0x320fedc9, 0x32fedc98, 0x310fedca, 0x31fedca8, 0x30fedca9, 0x3fedca98,
      0x210fedcb, 0x21fedcb8, 0x20fedcb9, 0x2fedcb98, 0x10fedcba, 0x1fedcba8, 0x0fedcba9, 0xfedcba98};

  // No need to mask because _mm256_permutevar8x32_epi32 ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
  // latency, it may be faster to use LoadDup128 and PSHUFB.
  const auto packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}

// 64-bit-lane variant: loads eight precomputed 32-bit indices per mask value.
// `mask_bits` must be below 16 because the table holds 16 rows of 8 entries.
template HWY_INLINE Vec256 IndicesFromBits256(uint64_t mask_bits) {
  const Full256 d32;

  // For 64-bit, we still need 32-bit indices because there is no 64-bit
  // permutevar, but there are only 4 lanes, so we can afford to skip the
  // unpacking and load the entire index vector directly.
  alignas(32) static constexpr uint32_t u32_indices[128] = {
      // PrintCompress64x4PairTables
      0, 1, 2, 3, 4, 5, 6, 7,
      8, 9, 2, 3, 4, 5, 6, 7,
      10, 11, 0, 1, 4, 5, 6, 7,
      8, 9, 10, 11, 4, 5, 6, 7,
      12, 13, 0, 1, 2, 3, 6, 7,
      8, 9, 12, 13, 2, 3, 6, 7,
      10, 11, 12, 13, 0, 1, 6, 7,
      8, 9, 10, 11, 12, 13, 6, 7,
      14, 15, 0, 1, 2, 3, 4, 5,
      8, 9, 14, 15, 2, 3, 4, 5,
      10, 11, 14, 15, 0, 1, 4, 5,
      8, 9, 10, 11, 14, 15, 4, 5,
      12, 13, 14, 15, 0, 1, 2, 3,
      8, 9, 12, 13, 14, 15, 2, 3,
      10, 11, 12, 13, 14, 15, 0, 1,
      8, 9, 10, 11, 12, 13, 14, 15};
  return Load(d32, u32_indices + 8 * mask_bits);
}

// CompressNot counterpart of the 256-entry IndicesFromBits256 above.
template HWY_INLINE Vec256 IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256 d32;
  // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
  // of SetTableIndices would require 8 KiB, a large part of L1D. The other
  // alternative is _pext_u64, but this is extremely slow on Zen2 (18 cycles)
  // and unavailable in 32-bit builds. We instead compress each index into 4
  // bits, for a total of 1 KiB.
  alignas(16) static constexpr uint32_t packed_array[256] = {
      // PrintCompressNot32x8Tables
      0xfedcba98, 0x8fedcba9, 0x9fedcba8, 0x98fedcba, 0xafedcb98, 0xa8fedcb9, 0xa9fedcb8, 0xa98fedcb,
      0xbfedca98, 0xb8fedca9, 0xb9fedca8, 0xb98fedca, 0xbafedc98, 0xba8fedc9, 0xba9fedc8, 0xba98fedc,
      0xcfedba98, 0xc8fedba9, 0xc9fedba8, 0xc98fedba, 0xcafedb98, 0xca8fedb9, 0xca9fedb8, 0xca98fedb,
      0xcbfeda98, 0xcb8feda9, 0xcb9feda8, 0xcb98feda, 0xcbafed98, 0xcba8fed9, 0xcba9fed8, 0xcba98fed,
      0xdfecba98, 0xd8fecba9, 0xd9fecba8, 0xd98fecba, 0xdafecb98, 0xda8fecb9, 0xda9fecb8, 0xda98fecb,
      0xdbfeca98, 0xdb8feca9, 0xdb9feca8, 0xdb98feca, 0xdbafec98, 0xdba8fec9, 0xdba9fec8, 0xdba98fec,
      0xdcfeba98, 0xdc8feba9, 0xdc9feba8, 0xdc98feba, 0xdcafeb98, 0xdca8feb9, 0xdca9feb8, 0xdca98feb,
      0xdcbfea98, 0xdcb8fea9, 0xdcb9fea8, 0xdcb98fea, 0xdcbafe98, 0xdcba8fe9, 0xdcba9fe8, 0xdcba98fe,
      0xefdcba98, 0xe8fdcba9, 0xe9fdcba8, 0xe98fdcba, 0xeafdcb98, 0xea8fdcb9, 0xea9fdcb8, 0xea98fdcb,
      0xebfdca98, 0xeb8fdca9, 0xeb9fdca8, 0xeb98fdca, 0xebafdc98, 0xeba8fdc9, 0xeba9fdc8, 0xeba98fdc,
      0xecfdba98, 0xec8fdba9, 0xec9fdba8, 0xec98fdba, 0xecafdb98, 0xeca8fdb9, 0xeca9fdb8, 0xeca98fdb,
      0xecbfda98, 0xecb8fda9, 0xecb9fda8, 0xecb98fda, 0xecbafd98, 0xecba8fd9, 0xecba9fd8, 0xecba98fd,
      0xedfcba98, 0xed8fcba9, 0xed9fcba8, 0xed98fcba, 0xedafcb98, 0xeda8fcb9, 0xeda9fcb8, 0xeda98fcb,
      0xedbfca98, 0xedb8fca9, 0xedb9fca8, 0xedb98fca, 0xedbafc98, 0xedba8fc9, 0xedba9fc8, 0xedba98fc,
      0xedcfba98, 0xedc8fba9, 0xedc9fba8, 0xedc98fba, 0xedcafb98, 0xedca8fb9, 0xedca9fb8, 0xedca98fb,
      0xedcbfa98, 0xedcb8fa9, 0xedcb9fa8, 0xedcb98fa, 0xedcbaf98, 0xedcba8f9, 0xedcba9f8, 0xedcba98f,
      0xfedcba98, 0xf8edcba9, 0xf9edcba8, 0xf98edcba, 0xfaedcb98, 0xfa8edcb9, 0xfa9edcb8, 0xfa98edcb,
      0xfbedca98, 0xfb8edca9, 0xfb9edca8, 0xfb98edca, 0xfbaedc98, 0xfba8edc9, 0xfba9edc8, 0xfba98edc,
      0xfcedba98, 0xfc8edba9, 0xfc9edba8, 0xfc98edba, 0xfcaedb98, 0xfca8edb9, 0xfca9edb8, 0xfca98edb,
      0xfcbeda98, 0xfcb8eda9, 0xfcb9eda8, 0xfcb98eda, 0xfcbaed98, 0xfcba8ed9, 0xfcba9ed8, 0xfcba98ed,
      0xfdecba98, 0xfd8ecba9, 0xfd9ecba8, 0xfd98ecba, 0xfdaecb98, 0xfda8ecb9, 0xfda9ecb8, 0xfda98ecb,
      0xfdbeca98, 0xfdb8eca9, 0xfdb9eca8, 0xfdb98eca, 0xfdbaec98, 0xfdba8ec9, 0xfdba9ec8, 0xfdba98ec,
      0xfdceba98, 0xfdc8eba9, 0xfdc9eba8, 0xfdc98eba, 0xfdcaeb98, 0xfdca8eb9, 0xfdca9eb8, 0xfdca98eb,
      0xfdcbea98, 0xfdcb8ea9, 0xfdcb9ea8, 0xfdcb98ea, 0xfdcbae98, 0xfdcba8e9, 0xfdcba9e8, 0xfdcba98e,
      0xfedcba98, 0xfe8dcba9, 0xfe9dcba8, 0xfe98dcba, 0xfeadcb98, 0xfea8dcb9, 0xfea9dcb8, 0xfea98dcb,
      0xfebdca98, 0xfeb8dca9, 0xfeb9dca8, 0xfeb98dca, 0xfebadc98, 0xfeba8dc9, 0xfeba9dc8, 0xfeba98dc,
      0xfecdba98, 0xfec8dba9, 0xfec9dba8, 0xfec98dba, 0xfecadb98, 0xfeca8db9, 0xfeca9db8, 0xfeca98db,
      0xfecbda98, 0xfecb8da9, 0xfecb9da8, 0xfecb98da, 0xfecbad98, 0xfecba8d9, 0xfecba9d8, 0xfecba98d,
      0xfedcba98, 0xfed8cba9, 0xfed9cba8, 0xfed98cba, 0xfedacb98, 0xfeda8cb9, 0xfeda9cb8, 0xfeda98cb,
      0xfedbca98, 0xfedb8ca9, 0xfedb9ca8, 0xfedb98ca, 0xfedbac98, 0xfedba8c9, 0xfedba9c8, 0xfedba98c,
      0xfedcba98, 0xfedc8ba9, 0xfedc9ba8, 0xfedc98ba, 0xfedcab98, 0xfedca8b9, 0xfedca9b8, 0xfedca98b,
      0xfedcba98, 0xfedcb8a9, 0xfedcb9a8, 0xfedcb98a, 0xfedcba98, 0xfedcba89, 0xfedcba98, 0xfedcba98};

  // No need to mask because <_mm256_permutevar8x32_epi32> ignores bits 3..31.
  // Just shift each copy of the 32 bit LUT to extract its 4-bit fields.
  // If broadcasting 32-bit from memory incurs the 3-cycle block-crossing
  // latency, it may be faster to use LoadDup128 and PSHUFB.
  const Vec256 packed = Set(d32, packed_array[mask_bits]);
  alignas(32) static constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  return packed >> Load(d32, shifts);
}

// CompressNot counterpart of the 128-entry IndicesFromBits256 above.
template HWY_INLINE Vec256 IndicesFromNotBits256(uint64_t mask_bits) {
  const Full256 d32;

  // For 64-bit, we still need 32-bit indices because there is no 64-bit
  // permutevar, but there are only 4 lanes, so we can afford to skip the
  // unpacking and load the entire index vector directly.
  alignas(32) static constexpr uint32_t u32_indices[128] = {
      // PrintCompressNot64x4PairTables
      8, 9, 10, 11, 12, 13, 14, 15,
      10, 11, 12, 13, 14, 15, 8, 9,
      8, 9, 12, 13, 14, 15, 10, 11,
      12, 13, 14, 15, 8, 9, 10, 11,
      8, 9, 10, 11, 14, 15, 12, 13,
      10, 11, 14, 15, 8, 9, 12, 13,
      8, 9, 14, 15, 10, 11, 12, 13,
      14, 15, 8, 9, 10, 11, 12, 13,
      8, 9, 10, 11, 12, 13, 14, 15,
      10, 11, 12, 13, 8, 9, 14, 15,
      8, 9, 12, 13, 10, 11, 14, 15,
      12, 13, 8, 9, 10, 11, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15,
      10, 11, 8, 9, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15};
  return Load(d32, u32_indices + 8 * mask_bits);
}

// Compress via a 32-bit lane permutation driven by IndicesFromBits256.
template HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) {
  const DFromV d;
  const Repartition du32;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
  // no instruction for 4x64).
  const Indices256 indices{IndicesFromBits256(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}

// LUTs are infeasible for 2^16 possible masks, so splice together two
// half-vector Compress.
template HWY_INLINE Vec256 Compress(Vec256 v, const uint64_t mask_bits) {
  const DFromV d;
  const RebindToUnsigned du;
  const auto vu16 = BitCast(du, v);  // (required for float16_t inputs)
  const Half duh;
  const auto half0 = LowerHalf(duh, vu16);
  const auto half1 = UpperHalf(duh, vu16);

  // Low 8 mask bits govern the lower half, the next bits the upper half.
  const uint64_t mask_bits0 = mask_bits & 0xFF;
  const uint64_t mask_bits1 = mask_bits >> 8;
  const auto compressed0 = detail::CompressBits(half0, mask_bits0);
  const auto compressed1 = detail::CompressBits(half1, mask_bits1);

  alignas(32) uint16_t all_true[16] = {};
  // Store mask=true lanes, left to right.
  const size_t num_true0 = PopCount(mask_bits0);
  Store(compressed0, duh, all_true);
  StoreU(compressed1, duh, all_true + num_true0);

  if (hwy::HWY_NAMESPACE::CompressIsPartition::value) {
    // Store mask=false lanes, right to left. The second vector fills the upper
    // half with right-aligned false lanes. The first vector is shifted
    // rightwards to overwrite the true lanes of the second.
    alignas(32) uint16_t all_false[16] = {};
    const size_t num_true1 = PopCount(mask_bits1);
    Store(compressed1, duh, all_false + 8);
    StoreU(compressed0, duh, all_false + num_true1);

    const auto mask = FirstN(du, num_true0 + num_true1);
    return BitCast(d,
                   IfThenElse(mask, Load(du, all_true), Load(du, all_false)));
  } else {
    // Only care about the mask=true lanes.
    return BitCast(d, Load(du, all_true));
  }
}

// CompressNot via a 32-bit lane permutation driven by IndicesFromNotBits256.
template HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) {
  const DFromV d;
  const Repartition du32;

  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
  // no instruction for 4x64).
  const Indices256 indices{IndicesFromNotBits256(mask_bits).raw};
  return BitCast(d, TableLookupLanes(BitCast(du32, v), indices));
}

// LUTs are infeasible for 2^16 possible masks, so splice together two
// half-vector Compress.
template HWY_INLINE Vec256 CompressNot(Vec256 v, const uint64_t mask_bits) {
  // Compress ensures only the lower 16 bits are set, so flip those.
  return Compress(v, mask_bits ^ 0xFFFF);
}

}  // namespace detail

// Public Compress: converts the mask to bits and forwards to detail.
template HWY_API Vec256 Compress(Vec256 v, Mask256 m) {
  return detail::Compress(v, detail::BitsFromMask(m));
}

// Public CompressNot: converts the mask to bits and forwards to detail.
template HWY_API Vec256 CompressNot(Vec256 v, Mask256 m) {
  return detail::CompressNot(v, detail::BitsFromMask(m));
}

// Forwards to CompressNot.
HWY_API Vec256 CompressBlocksNot(Vec256 v, Mask256 mask) {
  return CompressNot(v, mask);
}

// Compress using mask bits read from `bits` rather than a vector mask.
template HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) {
  constexpr size_t N = 32 / sizeof(T);
  constexpr size_t kNumBytes = (N + 7) / 8;

  // Zero-initialized so that any bytes CopyBytes does not write remain 0.
  uint64_t mask_bits = 0;
  CopyBytes(bits, &mask_bits);

  // With fewer than 8 lanes, discard the bits at or above N.
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}

// ------------------------------ CompressStore, CompressBitsStore

// Stores the whole compressed vector with StoreU and returns the number of
// set mask bits.
template HWY_API size_t CompressStore(VFromD v, MFromD m, D d,
                                      TFromD* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Compresses and stores through BlendedStore, with the store mask derived
// from the index table itself. Returns the number of set mask bits.
template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d,
                                             TFromD* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);

  const RebindToUnsigned du;
  const Repartition du32;
  HWY_DASSERT(mask_bits < (1ull << Lanes(d)));
  // 32-bit indices because we only have _mm256_permutevar8x32_epi32 (there is
  // no instruction for 4x64). Nibble MSB encodes FirstN.
  const Vec256 idx_mask = detail::IndicesFromBits256>(mask_bits);
  // Shift nibble MSB into MSB
  const Mask256 mask32 = MaskFromVec(ShiftLeft<28>(idx_mask));
  // First cast to unsigned (RebindMask cannot change lane size)
  const MFromD mask_u{mask32.raw};
  const MFromD mask = RebindMask(d, mask_u);
  const VFromD compressed = BitCast(
      d, TableLookupLanes(BitCast(du32, v), Indices256{idx_mask.raw}));

  BlendedStore(compressed, mask, d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

// Variant that compresses first and then writes only the first `count` lanes.
template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d,
                                             TFromD* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  const VFromD compressed = detail::Compress(v, mask_bits);

#if HWY_MEM_OPS_MIGHT_FAULT  // true if HWY_IS_MSAN
  // BlendedStore tests mask for each lane, but we know that the mask is
  // FirstN, so we can just copy.
  alignas(32) TFromD buf[16];
  Store(compressed, d, buf);
  CopyBytes(buf, unaligned, count * sizeof(TFromD));
#else
  BlendedStore(compressed, FirstN(d, count), d, unaligned);
#endif
  return count;
}

// CompressStore using mask bits read from `bits` rather than a vector mask.
template HWY_API size_t CompressBitsStore(VFromD v,
                                          const uint8_t* HWY_RESTRICT bits, D d,
                                          TFromD* HWY_RESTRICT unaligned) {
  constexpr size_t N = Lanes(d);
  constexpr size_t kNumBytes = (N + 7) / 8;

  // Zero-initialized so that any bytes CopyBytes does not write remain 0.
  uint64_t mask_bits = 0;
  CopyBytes(bits, &mask_bits);

  // With fewer than 8 lanes, discard the bits at or above N.
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }
  const size_t count = PopCount(mask_bits);

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  detail::MaybeUnpoison(unaligned, count);
  return count;
}

#endif  // HWY_TARGET <= HWY_AVX3

// ------------------------------ Dup128MaskFromMaskBits

// Generic for all vector lengths >= 32 bytes
// Builds the half-width mask from `mask_bits` and combines it with itself.
template HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  const Half dh;
  const auto mh = Dup128MaskFromMaskBits(dh, mask_bits);
  return CombineMasks(d, mh, mh);
}

// ------------------------------ Expand

// Always define Expand/LoadExpand because generic_ops only does so for Vec128.
// NOTE(review): in this text the template parameter lists and `<...>`
// arguments are absent (e.g. `template` is directly followed by `HWY_API`);
// confirm against upstream before relying on overload selection.

namespace detail {

#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE  // VBMI2

// Thin wrappers over the zero-masking 8- and 16-bit expand intrinsics.
HWY_INLINE Vec256 NativeExpand(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_expand_epi8(mask.raw, v.raw)};
}

HWY_INLINE Vec256 NativeExpand(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_expand_epi16(mask.raw, v.raw)};
}

// Thin wrappers over the zero-masking 8- and 16-bit expand-load intrinsics.
template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */,
                                            const uint8_t* HWY_RESTRICT unaligned) {
  return VFromD{_mm256_maskz_expandloadu_epi8(mask.raw, unaligned)};
}

template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */,
                                            const uint16_t* HWY_RESTRICT unaligned) {
  return VFromD{_mm256_maskz_expandloadu_epi16(mask.raw, unaligned)};
}

#endif  // HWY_TARGET <= HWY_AVX3_DL

#if HWY_TARGET <= HWY_AVX3 || HWY_IDE

// Thin wrappers over the zero-masking 32- and 64-bit expand intrinsics.
HWY_INLINE Vec256 NativeExpand(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_expand_epi32(mask.raw, v.raw)};
}

HWY_INLINE Vec256 NativeExpand(Vec256 v, Mask256 mask) {
  return Vec256{_mm256_maskz_expand_epi64(mask.raw, v.raw)};
}

// Thin wrappers over the zero-masking 32- and 64-bit expand-load intrinsics.
template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */,
                                            const uint32_t* HWY_RESTRICT unaligned) {
  return VFromD{_mm256_maskz_expandloadu_epi32(mask.raw, unaligned)};
}

template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */,
                                            const uint64_t* HWY_RESTRICT unaligned) {
  return VFromD{_mm256_maskz_expandloadu_epi64(mask.raw, unaligned)};
}

#endif  // HWY_TARGET <= HWY_AVX3

}  // namespace detail

// Expand: native on AVX3_DL, otherwise two half-vector Expands. The upper
// half reads its input starting countL lanes into a copy of `v`.
template HWY_API Vec256 Expand(Vec256 v, Mask256 mask) {
  const DFromV d;
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  const RebindToUnsigned du;
  const MFromD mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
#else
  // LUTs are infeasible for so many mask combinations, so Combine two
  // half-vector Expand.
  const Half dh;
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  constexpr size_t N = 32 / sizeof(T);
  // Number of set mask bits among the lower N/2 lanes.
  const size_t countL = PopCount(mask_bits & ((1 << (N / 2)) - 1));
  const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128 expandL = Expand(LowerHalf(v), maskL);
  // We have to shift the input by a variable number of bytes, but there isn't
  // a table-driven option for that until VBMI, and CPUs with that likely also
  // have VBMI2 and thus native Expand.
  alignas(32) T lanes[N];
  Store(v, d, lanes);
  const Mask128 maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128 expandH = Expand(LoadU(dh, lanes + countL), maskH);
  return Combine(d, expandH, expandL);
#endif
}

// If AVX3, this is already implemented by x86_512.
#if HWY_TARGET != HWY_AVX3
// Expand: native on AVX3_DL, otherwise two half-vector Expands as above, with
// the offset taken from CountTrue of the lower-half mask.
template HWY_API Vec256 Expand(Vec256 v, Mask256 mask) {
  const Full256 d;
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  const RebindToUnsigned du;
  return BitCast(d, detail::NativeExpand(BitCast(du, v), RebindMask(du, mask)));
#else  // AVX2
  // LUTs are infeasible for 2^16 possible masks, so splice together two
  // half-vector Expand.
  const Half dh;
  const Mask128 maskL = MaskFromVec(LowerHalf(VecFromMask(d, mask)));
  const Vec128 expandL = Expand(LowerHalf(v), maskL);
  // We have to shift the input by a variable number of u16. permutevar_epi16
  // requires AVX3 and if we had that, we'd use native u32 Expand. The only
  // alternative is re-loading, which incurs a store to load forwarding stall.
  alignas(32) T lanes[32 / sizeof(T)];
  Store(v, d, lanes);
  const Vec128 vH = LoadU(dh, lanes + CountTrue(dh, maskL));
  const Mask128 maskH = MaskFromVec(UpperHalf(dh, VecFromMask(d, mask)));
  const Vec128 expandH = Expand(vH, maskH);
  return Combine(d, expandH, expandL);
#endif  // AVX2
}
#endif  // HWY_TARGET != HWY_AVX3

// Expand: native on AVX3, otherwise a table lookup. mask_bits indexes a
// 256-entry table that packs one 4-bit source index per output lane; lanes
// whose mask is false are zeroed afterwards.
template HWY_API Vec256 Expand(Vec256 v, Mask256 mask) {
  const Full256 d;
#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned du;
  const MFromD mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
#else
  const RebindToUnsigned du;
  const uint64_t mask_bits = detail::BitsFromMask(mask);

  alignas(16) constexpr uint32_t packed_array[256] = {
      // PrintExpand32x8Nibble.
      0xffffffff, 0xfffffff0, 0xffffff0f, 0xffffff10, 0xfffff0ff, 0xfffff1f0, 0xfffff10f, 0xfffff210,
      0xffff0fff, 0xffff1ff0, 0xffff1f0f, 0xffff2f10, 0xffff10ff, 0xffff21f0, 0xffff210f, 0xffff3210,
      0xfff0ffff, 0xfff1fff0, 0xfff1ff0f, 0xfff2ff10, 0xfff1f0ff, 0xfff2f1f0, 0xfff2f10f, 0xfff3f210,
      0xfff10fff, 0xfff21ff0, 0xfff21f0f, 0xfff32f10, 0xfff210ff, 0xfff321f0, 0xfff3210f, 0xfff43210,
      0xff0fffff, 0xff1ffff0, 0xff1fff0f, 0xff2fff10, 0xff1ff0ff, 0xff2ff1f0, 0xff2ff10f, 0xff3ff210,
      0xff1f0fff, 0xff2f1ff0, 0xff2f1f0f, 0xff3f2f10, 0xff2f10ff, 0xff3f21f0, 0xff3f210f, 0xff4f3210,
      0xff10ffff, 0xff21fff0, 0xff21ff0f, 0xff32ff10, 0xff21f0ff, 0xff32f1f0, 0xff32f10f, 0xff43f210,
      0xff210fff, 0xff321ff0, 0xff321f0f, 0xff432f10, 0xff3210ff, 0xff4321f0, 0xff43210f, 0xff543210,
      0xf0ffffff, 0xf1fffff0, 0xf1ffff0f, 0xf2ffff10, 0xf1fff0ff, 0xf2fff1f0, 0xf2fff10f, 0xf3fff210,
      0xf1ff0fff, 0xf2ff1ff0, 0xf2ff1f0f, 0xf3ff2f10, 0xf2ff10ff, 0xf3ff21f0, 0xf3ff210f, 0xf4ff3210,
      0xf1f0ffff, 0xf2f1fff0, 0xf2f1ff0f, 0xf3f2ff10, 0xf2f1f0ff, 0xf3f2f1f0, 0xf3f2f10f, 0xf4f3f210,
      0xf2f10fff, 0xf3f21ff0, 0xf3f21f0f, 0xf4f32f10, 0xf3f210ff, 0xf4f321f0, 0xf4f3210f, 0xf5f43210,
      0xf10fffff, 0xf21ffff0, 0xf21fff0f, 0xf32fff10, 0xf21ff0ff, 0xf32ff1f0, 0xf32ff10f, 0xf43ff210,
      0xf21f0fff, 0xf32f1ff0, 0xf32f1f0f, 0xf43f2f10, 0xf32f10ff, 0xf43f21f0, 0xf43f210f, 0xf54f3210,
      0xf210ffff, 0xf321fff0, 0xf321ff0f, 0xf432ff10, 0xf321f0ff, 0xf432f1f0, 0xf432f10f, 0xf543f210,
      0xf3210fff, 0xf4321ff0, 0xf4321f0f, 0xf5432f10, 0xf43210ff, 0xf54321f0, 0xf543210f, 0xf6543210,
      0x0fffffff, 0x1ffffff0, 0x1fffff0f, 0x2fffff10, 0x1ffff0ff, 0x2ffff1f0, 0x2ffff10f, 0x3ffff210,
      0x1fff0fff, 0x2fff1ff0, 0x2fff1f0f, 0x3fff2f10, 0x2fff10ff, 0x3fff21f0, 0x3fff210f, 0x4fff3210,
      0x1ff0ffff, 0x2ff1fff0, 0x2ff1ff0f, 0x3ff2ff10, 0x2ff1f0ff, 0x3ff2f1f0, 0x3ff2f10f, 0x4ff3f210,
      0x2ff10fff, 0x3ff21ff0, 0x3ff21f0f, 0x4ff32f10, 0x3ff210ff, 0x4ff321f0, 0x4ff3210f, 0x5ff43210,
      0x1f0fffff, 0x2f1ffff0, 0x2f1fff0f, 0x3f2fff10, 0x2f1ff0ff, 0x3f2ff1f0, 0x3f2ff10f, 0x4f3ff210,
      0x2f1f0fff, 0x3f2f1ff0, 0x3f2f1f0f, 0x4f3f2f10, 0x3f2f10ff, 0x4f3f21f0, 0x4f3f210f, 0x5f4f3210,
      0x2f10ffff, 0x3f21fff0, 0x3f21ff0f, 0x4f32ff10, 0x3f21f0ff, 0x4f32f1f0, 0x4f32f10f, 0x5f43f210,
      0x3f210fff, 0x4f321ff0, 0x4f321f0f, 0x5f432f10, 0x4f3210ff, 0x5f4321f0, 0x5f43210f, 0x6f543210,
      0x10ffffff, 0x21fffff0, 0x21ffff0f, 0x32ffff10, 0x21fff0ff, 0x32fff1f0, 0x32fff10f, 0x43fff210,
      0x21ff0fff, 0x32ff1ff0, 0x32ff1f0f, 0x43ff2f10, 0x32ff10ff, 0x43ff21f0, 0x43ff210f, 0x54ff3210,
      0x21f0ffff, 0x32f1fff0, 0x32f1ff0f, 0x43f2ff10, 0x32f1f0ff, 0x43f2f1f0, 0x43f2f10f, 0x54f3f210,
      0x32f10fff, 0x43f21ff0, 0x43f21f0f, 0x54f32f10, 0x43f210ff, 0x54f321f0, 0x54f3210f, 0x65f43210,
      0x210fffff, 0x321ffff0, 0x321fff0f, 0x432fff10, 0x321ff0ff, 0x432ff1f0, 0x432ff10f, 0x543ff210,
      0x321f0fff, 0x432f1ff0, 0x432f1f0f, 0x543f2f10, 0x432f10ff, 0x543f21f0, 0x543f210f, 0x654f3210,
      0x3210ffff, 0x4321fff0, 0x4321ff0f, 0x5432ff10, 0x4321f0ff, 0x5432f1f0, 0x5432f10f, 0x6543f210,
      0x43210fff, 0x54321ff0, 0x54321f0f, 0x65432f10, 0x543210ff, 0x654321f0, 0x6543210f, 0x76543210,
  };

  // For lane i, shift the i-th 4-bit index down to bits [0, 3).
  const Vec256 packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint32_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
  const Indices256 indices{(packed >> Load(du, shifts)).raw};
  const Vec256 expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
#endif
}

// Expand: native on AVX3, otherwise a table lookup using a 16-entry table of
// packed 4-bit indices; lanes whose mask is false are zeroed afterwards.
template HWY_API Vec256 Expand(Vec256 v, Mask256 mask) {
  const Full256 d;
#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned du;
  const MFromD mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeExpand(BitCast(du, v), mu));
#else
  const RebindToUnsigned du;
  const uint64_t mask_bits = detail::BitsFromMask(mask);

  alignas(16) constexpr uint64_t packed_array[16] = {
      // PrintExpand64x4Nibble.
      0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0,
      0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10,
      0x000010ff, 0x000021f0, 0x0000210f, 0x00003210};

  // For lane i, shift the i-th 4-bit index down to bits [0, 2).
  const Vec256 packed = Set(du, packed_array[mask_bits]);
  alignas(32) constexpr uint64_t shifts[8] = {0, 4, 8, 12, 16, 20, 24, 28};
  // NOTE(review): this nested condition repeats the enclosing
  // `#if HWY_TARGET <= HWY_AVX3` from inside its `#else`, so the first branch
  // below cannot be selected as written.
#if HWY_TARGET <= HWY_AVX3  // native 64-bit TableLookupLanes
  // TableLookupLanes ignores upper bits; avoid bounds-check in IndicesFromVec.
  const Indices256 indices{(packed >> Load(du, shifts)).raw};
#else
  // 64-bit TableLookupLanes on AVX2 requires IndicesFromVec, which checks
  // bounds, so clear the upper bits.
  const Vec256 masked = And(packed >> Load(du, shifts), Set(du, 3));
  const Indices256 indices = IndicesFromVec(du, masked);
#endif
  const Vec256 expand = TableLookupLanes(BitCast(du, v), indices);
  // TableLookupLanes cannot also zero masked-off lanes, so do that now.
  return IfThenElseZero(mask, BitCast(d, expand));
#endif
}

// ------------------------------ LoadExpand

// LoadExpand: native expand-load on AVX3_DL, otherwise LoadU followed by
// Expand.
template HWY_API VFromD LoadExpand(MFromD mask, D d,
                                   const TFromD* HWY_RESTRICT unaligned) {
#if HWY_TARGET <= HWY_AVX3_DL  // VBMI2
  const RebindToUnsigned du;
  using TU = TFromD;
  const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned);
  const MFromD mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
#else
  return Expand(LoadU(d, unaligned), mask);
#endif
}

// LoadExpand: native expand-load on AVX3, otherwise LoadU followed by Expand.
template HWY_API VFromD LoadExpand(MFromD mask, D d,
                                   const TFromD* HWY_RESTRICT unaligned) {
#if HWY_TARGET <= HWY_AVX3
  const RebindToUnsigned du;
  using TU = TFromD;
  const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned);
  const MFromD mu = RebindMask(du, mask);
  return BitCast(d, detail::NativeLoadExpand(mu, du, pu));
#else
  return Expand(LoadU(d, unaligned), mask);
#endif
}

// ------------------------------ LoadInterleaved3/4

// Implemented in generic_ops, we just overload LoadTransposedBlocks3/4.

namespace detail {
// Input:
// 1 0 (<- first block of unaligned)
// 3 2
// 5 4
// Output:
// 3 0
// 4 1
// 5 2
// Loads three consecutive vectors starting at `unaligned` and recombines
// their halves into A, B and C as shown above.
template HWY_API void LoadTransposedBlocks3(D d,
                                            const TFromD* HWY_RESTRICT unaligned,
                                            VFromD& A, VFromD& B, VFromD& C) {
  constexpr size_t N = Lanes(d);
  const VFromD v10 = LoadU(d, unaligned + 0 * N);  // 1 0
  const VFromD v32 = LoadU(d, unaligned + 1 * N);
  const VFromD v54 = LoadU(d, unaligned + 2 * N);

  A = ConcatUpperLower(d, v32, v10);
  B = ConcatLowerUpper(d, v54, v10);
  C = ConcatUpperLower(d, v54, v32);
}

// Input (128-bit blocks):
// 1 0 (first block of unaligned)
// 3 2
// 5 4
// 7 6
// Output:
// 4 0 (LSB of vA)
// 5 1
// 6 2
// 7 3
template HWY_API void LoadTransposedBlocks4(D d,
                                            const TFromD* HWY_RESTRICT unaligned,
                                            VFromD& vA, VFromD& vB, VFromD& vC, VFromD& vD) {
  constexpr size_t N = Lanes(d);
  const VFromD v10 = LoadU(d, unaligned + 0 * N);
  const VFromD v32 = LoadU(d, unaligned + 1 * N);
  const VFromD v54 = LoadU(d, unaligned + 2 * N);
  const VFromD v76 = LoadU(d,
unaligned + 3 * N); vA = ConcatLowerLower(d, v54, v10); vB = ConcatUpperUpper(d, v54, v10); vC = ConcatLowerLower(d, v76, v32); vD = ConcatUpperUpper(d, v76, v32); } } // namespace detail // ------------------------------ StoreInterleaved2/3/4 (ConcatUpperLower) // Implemented in generic_ops, we just overload StoreTransposedBlocks2/3/4. namespace detail { // Input (128-bit blocks): // 2 0 (LSB of i) // 3 1 // Output: // 1 0 // 3 2 template HWY_API void StoreTransposedBlocks2(VFromD i, VFromD j, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t N = Lanes(d); const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatUpperUpper(d, j, i); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); } // Input (128-bit blocks): // 3 0 (LSB of i) // 4 1 // 5 2 // Output: // 1 0 // 3 2 // 5 4 template HWY_API void StoreTransposedBlocks3(VFromD i, VFromD j, VFromD k, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t N = Lanes(d); const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatUpperLower(d, i, k); const auto out2 = ConcatUpperUpper(d, k, j); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); StoreU(out2, d, unaligned + 2 * N); } // Input (128-bit blocks): // 4 0 (LSB of i) // 5 1 // 6 2 // 7 3 // Output: // 1 0 // 3 2 // 5 4 // 7 6 template HWY_API void StoreTransposedBlocks4(VFromD i, VFromD j, VFromD k, VFromD l, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t N = Lanes(d); // Write lower halves, then upper. 
const auto out0 = ConcatLowerLower(d, j, i); const auto out1 = ConcatLowerLower(d, l, k); StoreU(out0, d, unaligned + 0 * N); StoreU(out1, d, unaligned + 1 * N); const auto out2 = ConcatUpperUpper(d, j, i); const auto out3 = ConcatUpperUpper(d, l, k); StoreU(out2, d, unaligned + 2 * N); StoreU(out3, d, unaligned + 3 * N); } } // namespace detail // ------------------------------ Additional mask logical operations #if HWY_TARGET <= HWY_AVX3 template HWY_API Mask256 SetAtOrAfterFirst(Mask256 mask) { constexpr size_t N = Lanes(Full256()); constexpr uint32_t kActiveElemMask = static_cast((uint64_t{1} << N) - 1); return Mask256{static_cast::Raw>( (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; } template HWY_API Mask256 SetBeforeFirst(Mask256 mask) { constexpr size_t N = Lanes(Full256()); constexpr uint32_t kActiveElemMask = static_cast((uint64_t{1} << N) - 1); return Mask256{static_cast::Raw>( (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; } template HWY_API Mask256 SetAtOrBeforeFirst(Mask256 mask) { constexpr size_t N = Lanes(Full256()); constexpr uint32_t kActiveElemMask = static_cast((uint64_t{1} << N) - 1); return Mask256{static_cast::Raw>( detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; } template HWY_API Mask256 SetOnlyFirst(Mask256 mask) { return Mask256{ static_cast::Raw>(detail::AVX3Blsi(mask.raw))}; } #else // AVX2 template HWY_API Mask256 SetAtOrAfterFirst(Mask256 mask) { const Full256 d; const Repartition di64; const Repartition df32; const Repartition di32; const Half dh_i64; const Half dh_i32; using VF32 = VFromD; auto vmask = BitCast(di64, VecFromMask(d, mask)); vmask = Or(vmask, Neg(vmask)); // Copy the sign bit of the even int64_t lanes to the odd int64_t lanes const auto vmask2 = BitCast( di32, VF32{_mm256_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, _MM_SHUFFLE(1, 1, 0, 0))}); vmask = Or(vmask, BitCast(di64, BroadcastSignBit(vmask2))); // Copy the sign bit of the lower 128-bit half to the upper 128-bit half const auto vmask3 
= BroadcastSignBit(Broadcast<3>(BitCast(dh_i32, LowerHalf(dh_i64, vmask)))); vmask = Or(vmask, BitCast(di64, Combine(di32, vmask3, Zero(dh_i32)))); return MaskFromVec(BitCast(d, vmask)); } template HWY_API Mask256 SetBeforeFirst(Mask256 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask256 SetOnlyFirst(Mask256 mask) { const Full256 d; const RebindToSigned di; const Repartition di64; const Half dh_i64; const auto zero = Zero(di64); const auto vmask = BitCast(di64, VecFromMask(d, mask)); const auto vmask_eq_0 = VecFromMask(di64, vmask == zero); auto vmask2_lo = LowerHalf(dh_i64, vmask_eq_0); auto vmask2_hi = UpperHalf(dh_i64, vmask_eq_0); vmask2_lo = And(vmask2_lo, InterleaveLower(vmask2_lo, vmask2_lo)); vmask2_hi = And(ConcatLowerUpper(dh_i64, vmask2_hi, vmask2_lo), InterleaveUpper(dh_i64, vmask2_lo, vmask2_lo)); vmask2_lo = InterleaveLower(Set(dh_i64, int64_t{-1}), vmask2_lo); const auto vmask2 = Combine(di64, vmask2_hi, vmask2_lo); const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); } template HWY_API Mask256 SetAtOrBeforeFirst(Mask256 mask) { const Full256 d; constexpr size_t kLanesPerBlock = MaxLanes(d) / 2; const auto vmask = VecFromMask(d, mask); const auto vmask_lo = ConcatLowerLower(d, vmask, Zero(d)); return SetBeforeFirst( MaskFromVec(CombineShiftRightBytes<(kLanesPerBlock - 1) * sizeof(T)>( d, vmask, vmask_lo))); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Reductions in generic_ops // ------------------------------ LeadingZeroCount #if HWY_TARGET <= HWY_AVX3 template ), HWY_IF_V_SIZE_V(V, 32)> HWY_API V LeadingZeroCount(V v) { return V{_mm256_lzcnt_epi32(v.raw)}; } template ), HWY_IF_V_SIZE_V(V, 32)> HWY_API V LeadingZeroCount(V v) { return V{_mm256_lzcnt_epi64(v.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy 
// Counterpart of the HWY_BEFORE_NAMESPACE() call that precedes `namespace hwy`.
HWY_AFTER_NAMESPACE();

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
// Hence the HWY_DIAGNOSTICS(push) near the top of this file is only popped
// here, after all of the code above.
HWY_DIAGNOSTICS(pop)