// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors and SSE4 instructions, plus some AVX2 and AVX512-VL
// operations when compiling for those targets.
// External include guard in highway.h - see comment there.

// Must come before HWY_DIAGNOSTICS and HWY_COMPILER_GCC_ACTUAL
#include "hwy/base.h"

// Avoid uninitialized warnings in GCC's emmintrin.h - see
// https://github.com/google/highway/issues/710 and pull/902
HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
HWY_DIAGNOSTICS_OFF(disable : 4701 4703 6001 26494,
                    ignored "-Wmaybe-uninitialized")
#endif

#include <emmintrin.h>
#include <stdio.h>
#if HWY_TARGET == HWY_SSSE3
#include <tmmintrin.h>  // SSSE3
#elif HWY_TARGET <= HWY_SSE4
#include <smmintrin.h>  // SSE4
#ifndef HWY_DISABLE_PCLMUL_AES
#include <wmmintrin.h>  // CLMUL
#endif
#endif

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

// Enable generic functions for whichever of (f16, bf16) are not supported.
#if !HWY_HAVE_FLOAT16
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#else
#define HWY_X86_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#endif

template <typename T>
struct Raw128 {
  using type = __m128i;
};
#if HWY_HAVE_FLOAT16
template <>
struct Raw128<float16_t> {
  using type = __m128h;
};
#endif  // HWY_HAVE_FLOAT16
template <>
struct Raw128<float> {
  using type = __m128;
};
template <>
struct Raw128<double> {
  using type = __m128d;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
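  // A minimal usage sketch of compound assignment (illustrative only; Set and
  // Full128 are declared elsewhere in Highway, and the lane values shown are
  // hypothetical):
  //   const Full128<float> d;
  //   auto v = Set(d, 2.0f);
  //   v += Set(d, 1.0f);  // forwards to the non-member operator+
  //   v *= v;             // now 9.0f in every lane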
HWY_INLINE Vec128& operator*=(const Vec128 other) { return *this = (*this * other); } HWY_INLINE Vec128& operator/=(const Vec128 other) { return *this = (*this / other); } HWY_INLINE Vec128& operator+=(const Vec128 other) { return *this = (*this + other); } HWY_INLINE Vec128& operator-=(const Vec128 other) { return *this = (*this - other); } HWY_INLINE Vec128& operator%=(const Vec128 other) { return *this = (*this % other); } HWY_INLINE Vec128& operator&=(const Vec128 other) { return *this = (*this & other); } HWY_INLINE Vec128& operator|=(const Vec128 other) { return *this = (*this | other); } HWY_INLINE Vec128& operator^=(const Vec128 other) { return *this = (*this ^ other); } Raw raw; }; template using Vec64 = Vec128; template using Vec32 = Vec128; template using Vec16 = Vec128; #if HWY_TARGET <= HWY_AVX3 namespace detail { // Template arg: sizeof(lane type) template struct RawMask128 {}; template <> struct RawMask128<1> { using type = __mmask16; }; template <> struct RawMask128<2> { using type = __mmask8; }; template <> struct RawMask128<4> { using type = __mmask8; }; template <> struct RawMask128<8> { using type = __mmask8; }; } // namespace detail template struct Mask128 { using Raw = typename detail::RawMask128::type; static Mask128 FromBits(uint64_t mask_bits) { return Mask128{static_cast(mask_bits)}; } Raw raw; }; #else // AVX2 or below // FF..FF or 0. template struct Mask128 { typename detail::Raw128::type raw; }; #endif // AVX2 or below namespace detail { // Returns the lowest N of the _mm_movemask* bits. template constexpr uint64_t OnlyActive(uint64_t mask_bits) { return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); } } // namespace detail #if HWY_TARGET <= HWY_AVX3 namespace detail { // Used by Expand() emulation, which is required for both AVX3 and AVX2. template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { return OnlyActive(mask.raw); } } // namespace detail #endif // HWY_TARGET <= HWY_AVX3 template using DFromV = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ Zero // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Zero(D /* tag */) { return Vec128{_mm_setzero_ph()}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Zero(D /* tag */) { return Vec128{_mm_setzero_ps()}; } template HWY_API Vec128 Zero(D /* tag */) { return Vec128{_mm_setzero_pd()}; } template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{_mm_setzero_si128()}; } // Using the existing Zero function instead of a dedicated function for // deduction avoids having to forward-declare Vec256 here. template using VFromD = decltype(Zero(D())); // ------------------------------ Tuple (VFromD) #include "hwy/ops/tuple-inl.h" // ------------------------------ BitCast namespace detail { HWY_INLINE __m128i BitCastToInteger(__m128i v) { return v; } #if HWY_HAVE_FLOAT16 HWY_INLINE __m128i BitCastToInteger(__m128h v) { return _mm_castph_si128(v); } #endif // HWY_HAVE_FLOAT16 HWY_INLINE __m128i BitCastToInteger(__m128 v) { return _mm_castps_si128(v); } HWY_INLINE __m128i BitCastToInteger(__m128d v) { return _mm_castpd_si128(v); } template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return Vec128{BitCastToInteger(v.raw)}; } // Cannot rely on function overloading because return types differ. 
template struct BitCastFromInteger128 { HWY_INLINE __m128i operator()(__m128i v) { return v; } }; #if HWY_HAVE_FLOAT16 template <> struct BitCastFromInteger128 { HWY_INLINE __m128h operator()(__m128i v) { return _mm_castsi128_ph(v); } }; #endif // HWY_HAVE_FLOAT16 template <> struct BitCastFromInteger128 { HWY_INLINE __m128 operator()(__m128i v) { return _mm_castsi128_ps(v); } }; template <> struct BitCastFromInteger128 { HWY_INLINE __m128d operator()(__m128i v) { return _mm_castsi128_pd(v); } }; template HWY_INLINE VFromD BitCastFromByte(D /* tag */, Vec128 v) { return VFromD{BitCastFromInteger128>()(v.raw)}; } } // namespace detail template HWY_API VFromD BitCast(D d, Vec128().MaxLanes()> v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ Set template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm_set1_epi8(static_cast(t))}; // NOLINT } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm_set1_epi16(static_cast(t))}; // NOLINT } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm_set1_epi32(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{_mm_set1_epi64x(static_cast(t))}; // NOLINT } #if HWY_HAVE_FLOAT16 template HWY_API VFromD Set(D /* tag */, float16_t t) { return VFromD{_mm_set1_ph(t)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD Set(D /* tag */, float t) { return VFromD{_mm_set1_ps(t)}; } template HWY_API VFromD Set(D /* tag */, double t) { return VFromD{_mm_set1_pd(t)}; } // Generic for all vector lengths. template HWY_API VFromD Set(D df, TFromD t) { const RebindToUnsigned du; static_assert(sizeof(TFromD) == 2, "Expecting [b]f16"); uint16_t bits; CopyBytes<2>(&t, &bits); return BitCast(df, Set(du, bits)); } // ------------------------------ Undefined HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") // Returns a vector with uninitialized elements. template HWY_API VFromD Undefined(D /* tag */) { // Available on Clang 6.0, GCC 6.2, ICC 16.03, MSVC 19.14. All but ICC // generate an XOR instruction. 
return VFromD{_mm_undefined_si128()}; } #if HWY_HAVE_FLOAT16 template HWY_API VFromD Undefined(D /* tag */) { return VFromD{_mm_undefined_ph()}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD Undefined(D /* tag */) { return VFromD{_mm_undefined_ps()}; } template HWY_API VFromD Undefined(D /* tag */) { return VFromD{_mm_undefined_pd()}; } template HWY_API VFromD Undefined(D /* tag */) { return VFromD{_mm_undefined_si128()}; } HWY_DIAGNOSTICS(pop) // ------------------------------ GetLane template HWY_API T GetLane(const Vec128 v) { return static_cast(_mm_cvtsi128_si32(v.raw) & 0xFF); } template HWY_API T GetLane(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const uint16_t bits = static_cast(_mm_cvtsi128_si32(BitCast(du, v).raw) & 0xFFFF); return BitCastScalar(bits); } template HWY_API T GetLane(const Vec128 v) { return static_cast(_mm_cvtsi128_si32(v.raw)); } template HWY_API float GetLane(const Vec128 v) { return _mm_cvtss_f32(v.raw); } template HWY_API T GetLane(const Vec128 v) { #if HWY_ARCH_X86_32 const DFromV d; alignas(16) T lanes[2]; Store(v, d, lanes); return lanes[0]; #else return static_cast(_mm_cvtsi128_si64(v.raw)); #endif } template HWY_API double GetLane(const Vec128 v) { return _mm_cvtsd_f64(v.raw); } // ------------------------------ ResizeBitCast template HWY_API VFromD ResizeBitCast(D d, FromV v) { const Repartition du8; return BitCast(d, VFromD{detail::BitCastToInteger(v.raw)}); } // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { return VFromD{_mm_setr_epi8( static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t8), static_cast(t9), static_cast(t10), static_cast(t11), static_cast(t12), static_cast(t13), static_cast(t14), static_cast(t15))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{ _mm_setr_epi16(static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7))}; } // Generic for all vector lengths template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } #if HWY_HAVE_FLOAT16 template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{_mm_setr_ph(t0, t1, t2, t3, t4, t5, t6, t7)}; } #else // Generic for all vector lengths if HWY_HAVE_FLOAT16 is not true template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return 
VFromD{ _mm_setr_epi32(static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{_mm_setr_ps(t0, t1, t2, t3)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { // Need to use _mm_set_epi64x as there is no _mm_setr_epi64x intrinsic // available return VFromD{ _mm_set_epi64x(static_cast(t1), static_cast(t0))}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{_mm_setr_pd(t0, t1)}; } // ================================================== LOGICAL // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{ _mm_and_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API Vec128 And(Vec128 a, Vec128 b) { return Vec128{_mm_and_ps(a.raw, b.raw)}; } template HWY_API Vec128 And(Vec128 a, Vec128 b) { return Vec128{_mm_and_pd(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{_mm_andnot_si128( BitCast(du, not_mask).raw, BitCast(du, mask).raw)}); } template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { return Vec128{_mm_andnot_ps(not_mask.raw, mask.raw)}; } template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { return Vec128{_mm_andnot_pd(not_mask.raw, mask.raw)}; } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{ _mm_or_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API Vec128 Or(Vec128 a, Vec128 b) { return Vec128{_mm_or_ps(a.raw, b.raw)}; } template HWY_API Vec128 Or(Vec128 a, Vec128 b) { return Vec128{_mm_or_pd(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { const DFromV d; // for float16_t const RebindToUnsigned du; return BitCast(d, VFromD{ _mm_xor_si128(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { return Vec128{_mm_xor_ps(a.raw, b.raw)}; } template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { return Vec128{_mm_xor_pd(a.raw, b.raw)}; } // ------------------------------ Not template HWY_API Vec128 Not(const Vec128 v) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; #if HWY_TARGET <= HWY_AVX3 const __m128i vu = BitCast(du, v).raw; return BitCast(d, VU{_mm_ternarylogic_epi32(vu, vu, vu, 0x55)}); #else return Xor(v, BitCast(d, VU{_mm_set1_epi32(-1)})); #endif } // ------------------------------ Xor3 template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; const __m128i ret = _mm_ternarylogic_epi64( BitCast(du, x1).raw, BitCast(du, x2).raw, BitCast(du, x3).raw, 0x96); return BitCast(d, VU{ret}); #else return Xor(x1, Xor(x2, x3)); #endif } // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; const __m128i ret = _mm_ternarylogic_epi64( BitCast(du, o1).raw, BitCast(du, o2).raw, BitCast(du, o3).raw, 0xFE); return BitCast(d, VU{ret}); #else return Or(o1, Or(o2, o3)); #endif } // ------------------------------ 
OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; const __m128i ret = _mm_ternarylogic_epi64( BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); return BitCast(d, VU{ret}); #else return Or(o, And(a1, a2)); #endif } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { #if HWY_TARGET <= HWY_AVX3 const DFromV d; const RebindToUnsigned du; using VU = VFromD; return BitCast( d, VU{_mm_ternarylogic_epi64(BitCast(du, mask).raw, BitCast(du, yes).raw, BitCast(du, no).raw, 0xCA)}); #else return IfThenElse(MaskFromVec(mask), yes, no); #endif } // ------------------------------ BitwiseIfThenElse #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return IfVecThenElse(mask, yes, no); } #endif // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ PopulationCount // 8/16 require BITALG, 32/64 require VPOPCNTDQ. #if HWY_TARGET <= HWY_AVX3_DL #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif namespace detail { template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { return Vec128{_mm_popcnt_epi8(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { return Vec128{_mm_popcnt_epi16(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { return Vec128{_mm_popcnt_epi32(v.raw)}; } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { return Vec128{_mm_popcnt_epi64(v.raw)}; } } // namespace detail template HWY_API Vec128 PopulationCount(Vec128 v) { return detail::PopulationCount(hwy::SizeTag(), v); } #endif // HWY_TARGET <= HWY_AVX3_DL // ================================================== SIGN // ------------------------------ Neg // Tag dispatch instead of SFINAE for MSVC 2017 compatibility namespace detail { template HWY_INLINE Vec128 Neg(hwy::FloatTag /*tag*/, const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_INLINE Vec128 Neg(hwy::SpecialTag /*tag*/, const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_INLINE Vec128 Neg(hwy::SignedTag /*tag*/, const Vec128 v) { return Zero(DFromV()) - v; } } // namespace detail template HWY_INLINE Vec128 Neg(const Vec128 v) { return detail::Neg(hwy::TypeTag(), v); } // ------------------------------ Floating-point Abs // Generic for all vector lengths template )> HWY_API V Abs(V v) { const DFromV d; const RebindToSigned di; using TI = TFromD; return v & BitCast(d, Set(di, static_cast(~SignMask()))); } // ------------------------------ CopySign // Generic for all vector lengths. template HWY_API V CopySign(const V magn, const V sign) { static_assert(IsFloat>(), "Only makes sense for floating-point"); const DFromV d; const auto msb = SignBit(d); // Truth table for msb, magn, sign | bitwise msb ? 
sign : mag // 0 0 0 | 0 // 0 0 1 | 0 // 0 1 0 | 1 // 0 1 1 | 1 // 1 0 0 | 0 // 1 0 1 | 1 // 1 1 0 | 0 // 1 1 1 | 1 return BitwiseIfThenElse(msb, sign, magn); } // ------------------------------ CopySignToAbs // Generic for all vector lengths. template HWY_API V CopySignToAbs(const V abs, const V sign) { const DFromV d; return OrAnd(abs, SignBit(d), sign); } // ================================================== MASK #if HWY_TARGET <= HWY_AVX3 // ------------------------------ MaskFromVec namespace detail { template HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<1> /*tag*/, const Vec128 v) { return Mask128{_mm_movepi8_mask(v.raw)}; } template HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<2> /*tag*/, const Vec128 v) { return Mask128{_mm_movepi16_mask(v.raw)}; } template HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<4> /*tag*/, const Vec128 v) { return Mask128{_mm_movepi32_mask(v.raw)}; } template HWY_INLINE Mask128 MaskFromVec(hwy::SizeTag<8> /*tag*/, const Vec128 v) { return Mask128{_mm_movepi64_mask(v.raw)}; } } // namespace detail template HWY_API Mask128 MaskFromVec(const Vec128 v) { return detail::MaskFromVec(hwy::SizeTag(), v); } // There do not seem to be native floating-point versions of these instructions. #if HWY_HAVE_FLOAT16 template HWY_API Mask128 MaskFromVec(const Vec128 v) { const RebindToSigned> di; return Mask128{MaskFromVec(BitCast(di, v)).raw}; } #endif template HWY_API Mask128 MaskFromVec(const Vec128 v) { const RebindToSigned> di; return Mask128{MaskFromVec(BitCast(di, v)).raw}; } template HWY_API Mask128 MaskFromVec(const Vec128 v) { const RebindToSigned> di; return Mask128{MaskFromVec(BitCast(di, v)).raw}; } template using MFromD = decltype(MaskFromVec(VFromD())); // ------------------------------ MaskFalse (MFromD) #ifdef HWY_NATIVE_MASK_FALSE #undef HWY_NATIVE_MASK_FALSE #else #define HWY_NATIVE_MASK_FALSE #endif // Generic for all vector lengths template HWY_API MFromD MaskFalse(D /*d*/) { return MFromD{static_cast().raw)>(0)}; } // ------------------------------ PromoteMaskTo (MFromD) #ifdef HWY_NATIVE_PROMOTE_MASK_TO #undef HWY_NATIVE_PROMOTE_MASK_TO #else #define HWY_NATIVE_PROMOTE_MASK_TO #endif // AVX3 PromoteMaskTo is generic for all vector lengths template )), class DFrom_2 = Rebind, DTo>, hwy::EnableIf, MFromD>()>* = nullptr> HWY_API MFromD PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, MFromD m) { return MFromD{static_cast().raw)>(m.raw)}; } // ------------------------------ DemoteMaskTo (MFromD) #ifdef HWY_NATIVE_DEMOTE_MASK_TO #undef HWY_NATIVE_DEMOTE_MASK_TO #else #define HWY_NATIVE_DEMOTE_MASK_TO #endif // AVX3 DemoteMaskTo is generic for all vector lengths template ) - 1), class DFrom_2 = Rebind, DTo>, hwy::EnableIf, MFromD>()>* = nullptr> HWY_API MFromD DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/, MFromD m) { return MFromD{static_cast().raw)>(m.raw)}; } // ------------------------------ CombineMasks (MFromD) #ifdef HWY_NATIVE_COMBINE_MASKS #undef HWY_NATIVE_COMBINE_MASKS #else #define HWY_NATIVE_COMBINE_MASKS #endif template HWY_API MFromD CombineMasks(D /*d*/, MFromD> hi, MFromD> lo) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const __mmask8 combined_mask = _kor_mask8( _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 1), _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(1))); #else const auto combined_mask = (static_cast(hi.raw) << 1) | (lo.raw & 1); #endif return MFromD{static_cast().raw)>(combined_mask)}; } template HWY_API MFromD CombineMasks(D /*d*/, MFromD> hi, MFromD> lo) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const __mmask8 combined_mask = 
_kor_mask8( _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 2), _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(3))); #else const auto combined_mask = (static_cast(hi.raw) << 2) | (lo.raw & 3); #endif return MFromD{static_cast().raw)>(combined_mask)}; } template HWY_API MFromD CombineMasks(D /*d*/, MFromD> hi, MFromD> lo) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const __mmask8 combined_mask = _kor_mask8( _kshiftli_mask8(static_cast<__mmask8>(hi.raw), 4), _kand_mask8(static_cast<__mmask8>(lo.raw), static_cast<__mmask8>(15))); #else const auto combined_mask = (static_cast(hi.raw) << 4) | (lo.raw & 15u); #endif return MFromD{static_cast().raw)>(combined_mask)}; } template HWY_API MFromD CombineMasks(D /*d*/, MFromD> hi, MFromD> lo) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const __mmask16 combined_mask = _mm512_kunpackb( static_cast<__mmask16>(hi.raw), static_cast<__mmask16>(lo.raw)); #else const auto combined_mask = ((static_cast(hi.raw) << 8) | (lo.raw & 0xFFu)); #endif return MFromD{static_cast().raw)>(combined_mask)}; } // ------------------------------ LowerHalfOfMask (MFromD) #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif // Generic for all vector lengths template HWY_API MFromD LowerHalfOfMask(D d, MFromD> m) { using RawM = decltype(MFromD().raw); constexpr size_t kN = MaxLanes(d); constexpr size_t kNumOfBitsInRawMask = sizeof(RawM) * 8; MFromD result_mask{static_cast(m.raw)}; if (kN < kNumOfBitsInRawMask) { result_mask = And(result_mask, MFromD{static_cast((1ULL << kN) - 1)}); } return result_mask; } // ------------------------------ UpperHalfOfMask (MFromD) #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK #undef HWY_NATIVE_UPPER_HALF_OF_MASK #else #define HWY_NATIVE_UPPER_HALF_OF_MASK #endif template HWY_API MFromD UpperHalfOfMask(D /*d*/, MFromD> m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 1); #else const auto shifted_mask = static_cast(m.raw) >> 1; #endif return MFromD{static_cast().raw)>(shifted_mask)}; } template HWY_API MFromD UpperHalfOfMask(D /*d*/, MFromD> m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 2); #else const auto shifted_mask = static_cast(m.raw) >> 2; #endif return MFromD{static_cast().raw)>(shifted_mask)}; } template HWY_API MFromD UpperHalfOfMask(D /*d*/, MFromD> m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const auto shifted_mask = _kshiftri_mask8(static_cast<__mmask8>(m.raw), 4); #else const auto shifted_mask = static_cast(m.raw) >> 4; #endif return MFromD{static_cast().raw)>(shifted_mask)}; } template HWY_API MFromD UpperHalfOfMask(D /*d*/, MFromD> m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS const auto shifted_mask = _kshiftri_mask16(static_cast<__mmask16>(m.raw), 8); #else const auto shifted_mask = static_cast(m.raw) >> 8; #endif return MFromD{static_cast().raw)>(shifted_mask)}; } // ------------------------------ OrderedDemote2MasksTo (MFromD, CombineMasks) #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #else #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #endif // Generic for all vector lengths template ) / 2), class DTo_2 = Repartition, DFrom>, hwy::EnableIf, MFromD>()>* = nullptr> HWY_API MFromD OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/, MFromD a, MFromD b) { using MH = MFromD>; using RawMH = decltype(MH().raw); return CombineMasks(d_to, MH{static_cast(b.raw)}, MH{static_cast(a.raw)}); } // 
------------------------------ VecFromMask template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_movm_epi8(v.raw)}; } template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_movm_epi16(v.raw)}; } template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_movm_epi32(v.raw)}; } template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_movm_epi64(v.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_castsi128_ph(_mm_movm_epi16(v.raw))}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_castsi128_ps(_mm_movm_epi32(v.raw))}; } template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{_mm_castsi128_pd(_mm_movm_epi64(v.raw))}; } // Generic for all vector lengths. template HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { return VecFromMask(v); } // ------------------------------ RebindMask (MaskFromVec) template HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); return MFromD{m.raw}; } // ------------------------------ IfThenElse namespace detail { template HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<1> /* tag */, Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_epi8(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<2> /* tag */, Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_epi16(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<4> /* tag */, Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_epi32(mask.raw, no.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElse(hwy::SizeTag<8> /* tag */, Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_epi64(mask.raw, no.raw, yes.raw)}; } } // namespace detail template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return detail::IfThenElse(hwy::SizeTag(), mask, yes, no); } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_ph(mask.raw, no.raw, yes.raw)}; } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths. 
template , HWY_X86_IF_EMULATED_D(D)> HWY_API V IfThenElse(MFromD mask, V yes, V no) { const RebindToUnsigned du; return BitCast( D(), IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no))); } template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_ps(mask.raw, no.raw, yes.raw)}; } template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_mask_blend_pd(mask.raw, no.raw, yes.raw)}; } namespace detail { template HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<1> /* tag */, Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_epi8(mask.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<2> /* tag */, Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_epi16(mask.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<4> /* tag */, Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_epi32(mask.raw, yes.raw)}; } template HWY_INLINE Vec128 IfThenElseZero(hwy::SizeTag<8> /* tag */, Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_epi64(mask.raw, yes.raw)}; } } // namespace detail template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return detail::IfThenElseZero(hwy::SizeTag(), mask, yes); } template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_ps(mask.raw, yes.raw)}; } template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return Vec128{_mm_maskz_mov_pd(mask.raw, yes.raw)}; } // Generic for all vector lengths. template , HWY_IF_SPECIAL_FLOAT_D(D)> HWY_API V IfThenElseZero(MFromD mask, V yes) { const RebindToUnsigned du; return BitCast(D(), IfThenElseZero(RebindMask(du, mask), BitCast(du, yes))); } namespace detail { template HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<1> /* tag */, Mask128 mask, Vec128 no) { // xor_epi8/16 are missing, but we have sub, which is just as fast for u8/16. return Vec128{_mm_mask_sub_epi8(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<2> /* tag */, Mask128 mask, Vec128 no) { return Vec128{_mm_mask_sub_epi16(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<4> /* tag */, Mask128 mask, Vec128 no) { return Vec128{_mm_mask_xor_epi32(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_INLINE Vec128 IfThenZeroElse(hwy::SizeTag<8> /* tag */, Mask128 mask, Vec128 no) { return Vec128{_mm_mask_xor_epi64(no.raw, mask.raw, no.raw, no.raw)}; } } // namespace detail template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return detail::IfThenZeroElse(hwy::SizeTag(), mask, no); } template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return Vec128{_mm_mask_xor_ps(no.raw, mask.raw, no.raw, no.raw)}; } template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return Vec128{_mm_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; } // Generic for all vector lengths. template , HWY_IF_SPECIAL_FLOAT_D(D)> HWY_API V IfThenZeroElse(MFromD mask, V no) { const RebindToUnsigned du; return BitCast(D(), IfThenZeroElse(RebindMask(du, mask), BitCast(du, no))); } // ------------------------------ Mask logical // For Clang and GCC, mask intrinsics (KORTEST) weren't added until recently. 
#if !defined(HWY_COMPILER_HAS_MASK_INTRINSICS) #if HWY_COMPILER_MSVC != 0 || HWY_COMPILER_GCC_ACTUAL >= 700 || \ HWY_COMPILER_CLANG >= 800 #define HWY_COMPILER_HAS_MASK_INTRINSICS 1 #else #define HWY_COMPILER_HAS_MASK_INTRINSICS 0 #endif #endif // HWY_COMPILER_HAS_MASK_INTRINSICS namespace detail { template HWY_INLINE Mask128 And(hwy::SizeTag<1> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kand_mask16(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask16>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 And(hwy::SizeTag<2> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kand_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 And(hwy::SizeTag<4> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kand_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 And(hwy::SizeTag<8> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kand_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 AndNot(hwy::SizeTag<1> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kandn_mask16(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask16>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 AndNot(hwy::SizeTag<2> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kandn_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 AndNot(hwy::SizeTag<4> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kandn_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 AndNot(hwy::SizeTag<8> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kandn_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(~a.raw & b.raw)}; #endif } template HWY_INLINE Mask128 Or(hwy::SizeTag<1> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kor_mask16(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask16>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask128 Or(hwy::SizeTag<2> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask128 Or(hwy::SizeTag<4> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask128 Or(hwy::SizeTag<8> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw | b.raw)}; #endif } template HWY_INLINE Mask128 Xor(hwy::SizeTag<1> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxor_mask16(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask16>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask128 Xor(hwy::SizeTag<2> /*tag*/, const Mask128 a, const Mask128 
b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask128 Xor(hwy::SizeTag<4> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask128 Xor(hwy::SizeTag<8> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(a.raw ^ b.raw)}; #endif } template HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<1> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxnor_mask16(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask16>(~(a.raw ^ b.raw) & 0xFFFF)}; #endif } template HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<2> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{_kxnor_mask8(a.raw, b.raw)}; #else return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xFF)}; #endif } template HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<4> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0xF)}; #else return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0xF)}; #endif } template HWY_INLINE Mask128 ExclusiveNeither(hwy::SizeTag<8> /*tag*/, const Mask128 a, const Mask128 b) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{static_cast<__mmask8>(_kxnor_mask8(a.raw, b.raw) & 0x3)}; #else return Mask128{static_cast<__mmask8>(~(a.raw ^ b.raw) & 0x3)}; #endif } // UnmaskedNot returns ~m.raw without zeroing out any invalid bits template HWY_INLINE Mask128 UnmaskedNot(const Mask128 m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{static_cast<__mmask16>(_knot_mask16(m.raw))}; #else return Mask128{static_cast<__mmask16>(~m.raw)}; #endif } template HWY_INLINE Mask128 UnmaskedNot(const Mask128 m) { #if HWY_COMPILER_HAS_MASK_INTRINSICS return Mask128{static_cast<__mmask8>(_knot_mask8(m.raw))}; #else return Mask128{static_cast<__mmask8>(~m.raw)}; #endif } template HWY_INLINE Mask128 Not(hwy::SizeTag<1> /*tag*/, const Mask128 m) { // sizeof(T) == 1 and N == 16: simply return ~m as all 16 bits of m are valid return UnmaskedNot(m); } template HWY_INLINE Mask128 Not(hwy::SizeTag<1> /*tag*/, const Mask128 m) { // sizeof(T) == 1 and N <= 8: need to zero out the upper bits of ~m as there // are fewer than 16 valid bits in m // Return (~m) & ((1ull << N) - 1) return AndNot(hwy::SizeTag<1>(), m, Mask128::FromBits((1ull << N) - 1)); } template HWY_INLINE Mask128 Not(hwy::SizeTag<2> /*tag*/, const Mask128 m) { // sizeof(T) == 2 and N == 8: simply return ~m as all 8 bits of m are valid return UnmaskedNot(m); } template HWY_INLINE Mask128 Not(hwy::SizeTag<2> /*tag*/, const Mask128 m) { // sizeof(T) == 2 and N <= 4: need to zero out the upper bits of ~m as there // are fewer than 8 valid bits in m // Return (~m) & ((1ull << N) - 1) return AndNot(hwy::SizeTag<2>(), m, Mask128::FromBits((1ull << N) - 1)); } template HWY_INLINE Mask128 Not(hwy::SizeTag<4> /*tag*/, const Mask128 m) { // sizeof(T) == 4: need to zero out the upper bits of ~m as there are at most // 4 valid bits in m // Return (~m) & ((1ull << N) - 1) return AndNot(hwy::SizeTag<4>(), m, Mask128::FromBits((1ull << N) - 1)); } template HWY_INLINE Mask128 Not(hwy::SizeTag<8> 
/*tag*/, const Mask128 m) { // sizeof(T) == 8: need to zero out the upper bits of ~m as there are at most // 2 valid bits in m // Return (~m) & ((1ull << N) - 1) return AndNot(hwy::SizeTag<8>(), m, Mask128::FromBits((1ull << N) - 1)); } } // namespace detail template HWY_API Mask128 And(const Mask128 a, Mask128 b) { return detail::And(hwy::SizeTag(), a, b); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { return detail::AndNot(hwy::SizeTag(), a, b); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { return detail::Or(hwy::SizeTag(), a, b); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { return detail::Xor(hwy::SizeTag(), a, b); } template HWY_API Mask128 Not(const Mask128 m) { // Flip only the valid bits return detail::Not(hwy::SizeTag(), m); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { return detail::ExclusiveNeither(hwy::SizeTag(), a, b); } #else // AVX2 or below // ------------------------------ Mask // Mask and Vec are the same (true = FF..FF). template HWY_API Mask128 MaskFromVec(const Vec128 v) { return Mask128{v.raw}; } template using MFromD = decltype(MaskFromVec(VFromD())); template HWY_API Vec128 VecFromMask(const Mask128 v) { return Vec128{v.raw}; } // Generic for all vector lengths. template HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { return VecFromMask(v); } #if HWY_TARGET >= HWY_SSSE3 // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { const auto vmask = VecFromMask(DFromV(), mask); return Or(And(vmask, yes), AndNot(vmask, no)); } #else // HWY_TARGET < HWY_SSSE3 // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_blendv_epi8(no.raw, yes.raw, mask.raw)}; } template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_blendv_ps(no.raw, yes.raw, mask.raw)}; } template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{_mm_blendv_pd(no.raw, yes.raw, mask.raw)}; } #endif // HWY_TARGET >= HWY_SSSE3 // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { const Simd d; return MaskFromVec(Not(VecFromMask(d, m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ ShiftLeft template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi16(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi32(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi64(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi16(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi32(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{_mm_slli_epi64(v.raw, kBits)}; } #if HWY_TARGET <= HWY_AVX3_DL namespace detail { template HWY_API Vec128 GaloisAffine( Vec128 v, VFromD>> matrix) { return Vec128{_mm_gf2p8affine_epi64_epi8(v.raw, matrix.raw, 0)}; } } // namespace detail #else // HWY_TARGET > HWY_AVX3_DL template HWY_API Vec128 ShiftLeft(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; return kBits == 1 ? (v + v) : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); } #endif // HWY_TARGET > HWY_AVX3_DL // ------------------------------ ShiftRight template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srli_epi16(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srli_epi32(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srli_epi64(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srai_epi16(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{_mm_srai_epi32(v.raw, kBits)}; } #if HWY_TARGET > HWY_AVX3_DL template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftRight(Vec128{v.raw}).raw}; return shifted & Set(d8, 0xFF >> kBits); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); return (shifted ^ shifted_sign) - shifted_sign; } #endif // HWY_TARGET > HWY_AVX3_DL // i64 is implemented after BroadcastSignBit. 
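// Usage sketch for the compile-time shifts above (illustrative only; the lane
// values are hypothetical):
//   const Full128<uint32_t> d;
//   const auto v = Set(d, 0x80u);
//   const auto left = ShiftLeft<3>(v);    // 0x400 in every lane
//   const auto right = ShiftRight<4>(v);  // 0x08 in every lane
// For 8-bit lanes there is no native shift instruction, hence the 16-bit
// shift plus mask emulation above.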
// ================================================== MEMORY (1) // Clang static analysis claims the memory immediately after a partial vector // store is uninitialized, and also flags the input to partial loads (at least // for loadl_pd) as "garbage". This is a false alarm because msan does not // raise errors. We work around this by using CopyBytes instead of intrinsics, // but only for the analyzer to avoid potentially bad code generation. // Unfortunately __clang_analyzer__ was not defined for clang-tidy prior to v7. #ifndef HWY_SAFE_PARTIAL_LOAD_STORE #if defined(__clang_analyzer__) || \ (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700) #define HWY_SAFE_PARTIAL_LOAD_STORE 1 #else #define HWY_SAFE_PARTIAL_LOAD_STORE 0 #endif #endif // HWY_SAFE_PARTIAL_LOAD_STORE // ------------------------------ Load template HWY_API VFromD Load(D /* tag */, const TFromD* HWY_RESTRICT aligned) { return VFromD{_mm_load_si128(reinterpret_cast(aligned))}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Load(D, const float16_t* HWY_RESTRICT aligned) { return Vec128{_mm_load_ph(aligned)}; } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths greater than or equal to 16 bytes. template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT aligned) { const RebindToUnsigned du; return BitCast(d, Load(du, detail::U16LanePointer(aligned))); } template HWY_API Vec128 Load(D /* tag */, const float* HWY_RESTRICT aligned) { return Vec128{_mm_load_ps(aligned)}; } template HWY_API Vec128 Load(D /* tag */, const double* HWY_RESTRICT aligned) { return Vec128{_mm_load_pd(aligned)}; } template HWY_API VFromD LoadU(D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_loadu_si128(reinterpret_cast(p))}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 LoadU(D, const float16_t* HWY_RESTRICT p) { return Vec128{_mm_loadu_ph(p)}; } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths greater than or equal to 16 bytes. 
template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; return BitCast(d, LoadU(du, detail::U16LanePointer(p))); } template HWY_API Vec128 LoadU(D /* tag */, const float* HWY_RESTRICT p) { return Vec128{_mm_loadu_ps(p)}; } template HWY_API Vec128 LoadU(D /* tag */, const double* HWY_RESTRICT p) { return Vec128{_mm_loadu_pd(p)}; } template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t #if HWY_SAFE_PARTIAL_LOAD_STORE __m128i v = _mm_setzero_si128(); CopyBytes<8>(p, &v); // not same size #else const __m128i v = _mm_loadl_epi64(reinterpret_cast(p)); #endif return BitCast(d, VFromD{v}); } template HWY_API Vec64 Load(D /* tag */, const float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128 v = _mm_setzero_ps(); CopyBytes<8>(p, &v); // not same size return Vec64{v}; #else const __m128 hi = _mm_setzero_ps(); return Vec64{_mm_loadl_pi(hi, reinterpret_cast(p))}; #endif } template HWY_API Vec64 Load(D /* tag */, const double* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128d v = _mm_setzero_pd(); CopyBytes<8>(p, &v); // not same size return Vec64{v}; #else return Vec64{_mm_load_sd(p)}; #endif } template HWY_API Vec32 Load(D /* tag */, const float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE __m128 v = _mm_setzero_ps(); CopyBytes<4>(p, &v); // not same size return Vec32{v}; #else return Vec32{_mm_load_ss(p)}; #endif } // Any <= 32 bit except template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t // Clang ArgumentPromotionPass seems to break this code. We can unpoison // before SetTableIndices -> LoadU -> Load and the memory is poisoned again. detail::MaybeUnpoison(p, Lanes(d)); #if HWY_SAFE_PARTIAL_LOAD_STORE __m128i v = Zero(Full128>()).raw; CopyBytes(p, &v); // not same size as VFromD #else int32_t bits = 0; CopyBytes(p, &bits); // not same size as VFromD const __m128i v = _mm_cvtsi32_si128(bits); #endif return BitCast(d, VFromD{v}); } // For < 128 bit, LoadU == Load. template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. template HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { return LoadU(d, p); } // ------------------------------ Store template HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { _mm_store_si128(reinterpret_cast<__m128i*>(aligned), v.raw); } #if HWY_HAVE_FLOAT16 template HWY_API void Store(Vec128 v, D, float16_t* HWY_RESTRICT aligned) { _mm_store_ph(aligned, v.raw); } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths greater than or equal to 16 bytes. template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { const RebindToUnsigned du; Store(BitCast(du, v), du, reinterpret_cast(aligned)); } template HWY_API void Store(Vec128 v, D /* tag */, float* HWY_RESTRICT aligned) { _mm_store_ps(aligned, v.raw); } template HWY_API void Store(Vec128 v, D /* tag */, double* HWY_RESTRICT aligned) { _mm_store_pd(aligned, v.raw); } template HWY_API void StoreU(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { _mm_storeu_si128(reinterpret_cast<__m128i*>(p), v.raw); } #if HWY_HAVE_FLOAT16 template HWY_API void StoreU(Vec128 v, D, float16_t* HWY_RESTRICT p) { _mm_storeu_ph(p, v.raw); } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths greater than or equal to 16 bytes. 
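// Usage sketch for the Load/Store ops in this section (illustrative only; the
// buffers shown here are hypothetical and not part of this header):
//   const Full128<int32_t> d;
//   alignas(16) int32_t aligned_in[4] = {1, 2, 3, 4};
//   int32_t out[4];
//   const auto v = Load(d, aligned_in);  // Load/Store require alignment
//   StoreU(v, d, out);                   // LoadU/StoreU accept any pointer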
template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; StoreU(BitCast(du, v), du, reinterpret_cast(p)); } template HWY_API void StoreU(Vec128 v, D /* tag */, float* HWY_RESTRICT p) { _mm_storeu_ps(p, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, double* HWY_RESTRICT p) { _mm_storeu_pd(p, v.raw); } template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE (void)d; CopyBytes<8>(&v, p); // not same size #else const RebindToUnsigned du; // for float16_t _mm_storel_epi64(reinterpret_cast<__m128i*>(p), BitCast(du, v).raw); #endif } template HWY_API void Store(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<8>(&v, p); // not same size #else _mm_storel_pi(reinterpret_cast<__m64*>(p), v.raw); #endif } template HWY_API void Store(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<8>(&v, p); // not same size #else _mm_storel_pd(p, v.raw); #endif } // Any <= 32 bit except template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { CopyBytes(&v, p); // not same size } template HWY_API void Store(Vec32 v, D /* tag */, float* HWY_RESTRICT p) { #if HWY_SAFE_PARTIAL_LOAD_STORE CopyBytes<4>(&v, p); // not same size #else _mm_store_ss(p, v.raw); #endif } // For < 128 bit, StoreU == Store. template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { Store(v, d, p); } // ================================================== SWIZZLE (1) // ------------------------------ TableLookupBytes template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { const DFromV d; const Repartition du8; const DFromV d_bytes; const Repartition du8_bytes; #if HWY_TARGET == HWY_SSE2 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) typedef uint8_t GccU8RawVectType __attribute__((__vector_size__(16))); (void)d; (void)du8; (void)d_bytes; (void)du8_bytes; return Vec128{reinterpret_cast::type>( __builtin_shuffle(reinterpret_cast(bytes.raw), reinterpret_cast(from.raw)))}; #else const Full128 du8_full; alignas(16) uint8_t result_bytes[16]; alignas(16) uint8_t u8_bytes[16]; alignas(16) uint8_t from_bytes[16]; Store(Vec128{BitCast(du8_bytes, bytes).raw}, du8_full, u8_bytes); Store(Vec128{BitCast(du8, from).raw}, du8_full, from_bytes); for (int i = 0; i < 16; i++) { result_bytes[i] = u8_bytes[from_bytes[i] & 15]; } return BitCast(d, VFromD{Load(du8_full, result_bytes).raw}); #endif #else // SSSE3 or newer return BitCast( d, VFromD{_mm_shuffle_epi8(BitCast(du8_bytes, bytes).raw, BitCast(du8, from).raw)}); #endif } // ------------------------------ TableLookupBytesOr0 // For all vector widths; x86 anyway zeroes if >= 0x80 on SSSE3/SSE4/AVX2/AVX3 template HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { #if HWY_TARGET == HWY_SSE2 const DFromV d; const Repartition di8; const auto di8_from = BitCast(di8, from); return BitCast(d, IfThenZeroElse(di8_from < Zero(di8), TableLookupBytes(bytes, di8_from))); #else return TableLookupBytes(bytes, from); #endif } // ------------------------------ Shuffles (ShiftRight, TableLookupBytes) // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. 
template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{_mm_shuffle_epi32(v.raw, 0xB1)}; } template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0xB1)}; } // These are used by generic_ops-inl to implement LoadInterleaved3. As with // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output // comes from the first argument. namespace detail { template HWY_API Vec32 ShuffleTwo2301(const Vec32 a, const Vec32 b) { const DFromV d; const Twice d2; const auto ba = Combine(d2, b, a); #if HWY_TARGET == HWY_SSE2 Vec32 ba_shuffled{ _mm_shufflelo_epi16(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; return BitCast(d, Or(ShiftLeft<8>(ba_shuffled), ShiftRight<8>(ba_shuffled))); #else const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast(d2, Dup128VecFromValues(d2_u, 1, 0, 7, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); return Vec32{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec64 ShuffleTwo2301(const Vec64 a, const Vec64 b) { const DFromV d; const Twice d2; const auto ba = Combine(d2, b, a); #if HWY_TARGET == HWY_SSE2 Vec64 ba_shuffled{ _mm_shuffle_epi32(ba.raw, _MM_SHUFFLE(3, 0, 3, 0))}; return Vec64{ _mm_shufflelo_epi16(ba_shuffled.raw, _MM_SHUFFLE(2, 3, 0, 1))}; #else const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast( d2, Dup128VecFromValues(d2_u, 0x0302, 0x0100, 0x0f0e, 0x0d0c, 0, 0, 0, 0)); return Vec64{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToFloat df; constexpr int m = _MM_SHUFFLE(2, 3, 0, 1); return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, m)}); } template HWY_API Vec32 ShuffleTwo1230(const Vec32 a, const Vec32 b) { const DFromV d; #if HWY_TARGET == HWY_SSE2 const auto zero = Zero(d); const Rebind di16; const Vec32 a_shuffled{_mm_shufflelo_epi16( _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; const Vec32 b_shuffled{_mm_shufflelo_epi16( _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); return Vec32{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; #else const Twice d2; const auto ba = Combine(d2, b, a); const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast(d2, Dup128VecFromValues(d2_u, 0, 3, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); return Vec32{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec64 ShuffleTwo1230(const Vec64 a, const Vec64 b) { const DFromV d; #if HWY_TARGET == HWY_SSE2 const Vec32 a_shuffled{ _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(3, 0, 3, 0))}; const Vec32 b_shuffled{ _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(1, 2, 1, 2))}; return Combine(d, b_shuffled, a_shuffled); #else const Twice d2; const auto ba = Combine(d2, b, a); const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast( d2, Dup128VecFromValues(d2_u, 0x0100, 0x0706, 0x0d0c, 0x0b0a, 0, 0, 0, 0)); return Vec64{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToFloat df; constexpr int m = _MM_SHUFFLE(1, 2, 3, 0); return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, m)}); } template HWY_API Vec32 ShuffleTwo3012(const Vec32 a, const Vec32 b) { const 
DFromV d; #if HWY_TARGET == HWY_SSE2 const auto zero = Zero(d); const Rebind di16; const Vec32 a_shuffled{_mm_shufflelo_epi16( _mm_unpacklo_epi8(a.raw, zero.raw), _MM_SHUFFLE(1, 2, 1, 2))}; const Vec32 b_shuffled{_mm_shufflelo_epi16( _mm_unpacklo_epi8(b.raw, zero.raw), _MM_SHUFFLE(3, 0, 3, 0))}; const auto ba_shuffled = Combine(di16, b_shuffled, a_shuffled); return Vec32{_mm_packus_epi16(ba_shuffled.raw, ba_shuffled.raw)}; #else const Twice d2; const auto ba = Combine(d2, b, a); const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast(d2, Dup128VecFromValues(d2_u, 2, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); return Vec32{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec64 ShuffleTwo3012(const Vec64 a, const Vec64 b) { const DFromV d; #if HWY_TARGET == HWY_SSE2 const Vec32 a_shuffled{ _mm_shufflelo_epi16(a.raw, _MM_SHUFFLE(1, 2, 1, 2))}; const Vec32 b_shuffled{ _mm_shufflelo_epi16(b.raw, _MM_SHUFFLE(3, 0, 3, 0))}; return Combine(d, b_shuffled, a_shuffled); #else const Twice d2; const auto ba = Combine(d2, b, a); const RebindToUnsigned d2_u; const auto shuffle_idx = BitCast( d2, Dup128VecFromValues(d2_u, 0x0504, 0x0302, 0x0908, 0x0f0e, 0, 0, 0, 0)); return Vec64{TableLookupBytes(ba, shuffle_idx).raw}; #endif } template HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToFloat df; constexpr int m = _MM_SHUFFLE(3, 0, 1, 2); return BitCast(d, Vec128{_mm_shuffle_ps(BitCast(df, a).raw, BitCast(df, b).raw, m)}); } } // namespace detail // Swap 64-bit halves HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x4E)}; } HWY_API Vec128 Shuffle01(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle01(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x4E)}; } HWY_API Vec128 Shuffle01(const Vec128 v) { return Vec128{_mm_shuffle_pd(v.raw, v.raw, 1)}; } // Rotate right 32 bits HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x39)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x39)}; } // Rotate left 32 bits HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x93)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x93)}; } // Reverse HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, 0x1B)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x1B)}; } // ================================================== COMPARE #if HWY_TARGET <= HWY_AVX3 // Comparisons set a mask bit to 1 if the condition is true, else 0. 
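// Usage sketch (illustrative only; lane values are hypothetical): comparisons
// return a Mask128, which can then drive IfThenElse/IfThenElseZero:
//   const Full128<int32_t> d;
//   const auto v = Dup128VecFromValues(d, -1, 2, -3, 4);
//   const auto m = v > Zero(d);                 // lanes: false,true,false,true
//   const auto clamped = IfThenElseZero(m, v);  // {0, 2, 0, 4}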
// ------------------------------ TestBit namespace detail { template HWY_INLINE Mask128 TestBit(hwy::SizeTag<1> /*tag*/, const Vec128 v, const Vec128 bit) { return Mask128{_mm_test_epi8_mask(v.raw, bit.raw)}; } template HWY_INLINE Mask128 TestBit(hwy::SizeTag<2> /*tag*/, const Vec128 v, const Vec128 bit) { return Mask128{_mm_test_epi16_mask(v.raw, bit.raw)}; } template HWY_INLINE Mask128 TestBit(hwy::SizeTag<4> /*tag*/, const Vec128 v, const Vec128 bit) { return Mask128{_mm_test_epi32_mask(v.raw, bit.raw)}; } template HWY_INLINE Mask128 TestBit(hwy::SizeTag<8> /*tag*/, const Vec128 v, const Vec128 bit) { return Mask128{_mm_test_epi64_mask(v.raw, bit.raw)}; } } // namespace detail template HWY_API Mask128 TestBit(const Vec128 v, const Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return detail::TestBit(hwy::SizeTag(), v, bit); } // ------------------------------ Equality template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpeq_epi8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpeq_epi16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpeq_epi32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpeq_epi64_mask(a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { // Work around warnings in the intrinsic definitions (passing -1 as a mask). HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_EQ_OQ)}; HWY_DIAGNOSTICS(pop) } #endif // HWY_HAVE_FLOAT16 template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_EQ_OQ)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_EQ_OQ)}; } // ------------------------------ Inequality template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpneq_epi8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpneq_epi16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpneq_epi32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{_mm_cmpneq_epi64_mask(a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { // Work around warnings in the intrinsic definitions (passing -1 as a mask). 
HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; HWY_DIAGNOSTICS(pop) } #endif // HWY_HAVE_FLOAT16 template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_NEQ_OQ)}; } // ------------------------------ Strict inequality // Signed/float < template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi64_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epu8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epu16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epu32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epu64_mask(a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { // Work around warnings in the intrinsic definitions (passing -1 as a mask). HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GT_OQ)}; HWY_DIAGNOSTICS(pop) } #endif // HWY_HAVE_FLOAT16 template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GT_OQ)}; } template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GT_OQ)}; } // ------------------------------ Weak inequality #if HWY_HAVE_FLOAT16 template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { // Work around warnings in the intrinsic definitions (passing -1 as a mask). 
HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") return Mask128{_mm_cmp_ph_mask(a.raw, b.raw, _CMP_GE_OQ)}; HWY_DIAGNOSTICS(pop) } #endif // HWY_HAVE_FLOAT16 template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_ps_mask(a.raw, b.raw, _CMP_GE_OQ)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmp_pd_mask(a.raw, b.raw, _CMP_GE_OQ)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epi8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epi16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epi32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epi64_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epu8_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epu16_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epu32_mask(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_epu64_mask(a.raw, b.raw)}; } #else // AVX2 or below // Comparisons fill a lane with 1-bits if the condition is true, else 0. template HWY_API MFromD RebindMask(DTo dto, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); const Simd d; return MaskFromVec(BitCast(dto, VecFromMask(d, m))); } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 const DFromV d64; const RepartitionToNarrow d32; const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); const auto cmp64 = cmp32 & Shuffle2301(cmp32); return MaskFromVec(BitCast(d64, cmp64)); #else return Mask128{_mm_cmpeq_epi64(a.raw, b.raw)}; #endif } // Signed template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi8(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi16(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_epi32(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { // Same as signed ==; avoid duplicating the SSSE3 version. const DFromV d; RebindToUnsigned du; return RebindMask(d, BitCast(du, a) == BitCast(du, b)); } // Float template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_ps(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{_mm_cmpeq_pd(a.raw, b.raw)}; } // ------------------------------ Inequality // This cannot have T as a template argument, otherwise it is not more // specialized than rewritten operator== in C++20, leading to compile // errors: https://gcc.godbolt.org/z/xsrPhPvPT. 
template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpneq_ps(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Mask128{_mm_cmpneq_pd(a.raw, b.raw)}; } // ------------------------------ Strict inequality namespace detail { template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi8(a.raw, b.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi16(a.raw, b.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_epi32(a.raw, b.raw)}; } template HWY_INLINE Mask128 Gt(hwy::SignedTag /*tag*/, const Vec128 a, const Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 // See https://stackoverflow.com/questions/65166174/: const DFromV d; const RepartitionToNarrow d32; const Vec128 m_eq32{Eq(BitCast(d32, a), BitCast(d32, b)).raw}; const Vec128 m_gt32{Gt(BitCast(d32, a), BitCast(d32, b)).raw}; // If a.upper is greater, upper := true. Otherwise, if a.upper == b.upper: // upper := b-a (unsigned comparison result of lower). Otherwise: upper := 0. const __m128i upper = OrAnd(m_gt32, m_eq32, Sub(b, a)).raw; // Duplicate upper to lower half. 
return Mask128{_mm_shuffle_epi32(upper, _MM_SHUFFLE(3, 3, 1, 1))}; #else return Mask128{_mm_cmpgt_epi64(a.raw, b.raw)}; // SSE4.2 #endif } template HWY_INLINE Mask128 Gt(hwy::UnsignedTag /*tag*/, Vec128 a, Vec128 b) { const DFromV du; const RebindToSigned di; const Vec128 msb = Set(du, (LimitsMax() >> 1) + 1); const auto sa = BitCast(di, Xor(a, msb)); const auto sb = BitCast(di, Xor(b, msb)); return RebindMask(du, Gt(hwy::SignedTag(), sa, sb)); } template HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_ps(a.raw, b.raw)}; } template HWY_INLINE Mask128 Gt(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpgt_pd(a.raw, b.raw)}; } } // namespace detail template HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { return detail::Gt(hwy::TypeTag(), a, b); } // ------------------------------ Weak inequality namespace detail { template HWY_INLINE Mask128 Ge(hwy::SignedTag tag, Vec128 a, Vec128 b) { return Not(Gt(tag, b, a)); } template HWY_INLINE Mask128 Ge(hwy::UnsignedTag tag, Vec128 a, Vec128 b) { return Not(Gt(tag, b, a)); } template HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_ps(a.raw, b.raw)}; } template HWY_INLINE Mask128 Ge(hwy::FloatTag /*tag*/, Vec128 a, Vec128 b) { return Mask128{_mm_cmpge_pd(a.raw, b.raw)}; } } // namespace detail template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return detail::Ge(hwy::TypeTag(), a, b); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Reversed comparisons template HWY_API Mask128 operator<(Vec128 a, Vec128 b) { return b > a; } template HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { return b >= a; } // ------------------------------ Iota (Load) namespace detail { template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_epi8( static_cast(15), static_cast(14), static_cast(13), static_cast(12), static_cast(11), static_cast(10), static_cast(9), static_cast(8), static_cast(7), static_cast(6), static_cast(5), static_cast(4), static_cast(3), static_cast(2), static_cast(1), static_cast(0))}; } template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_epi16(int16_t{7}, int16_t{6}, int16_t{5}, int16_t{4}, int16_t{3}, int16_t{2}, int16_t{1}, int16_t{0})}; } #if HWY_HAVE_FLOAT16 template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_ph(float16_t{7}, float16_t{6}, float16_t{5}, float16_t{4}, float16_t{3}, float16_t{2}, float16_t{1}, float16_t{0})}; } #endif // HWY_HAVE_FLOAT16 template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{ _mm_set_epi32(int32_t{3}, int32_t{2}, int32_t{1}, int32_t{0})}; } template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_epi64x(int64_t{1}, int64_t{0})}; } template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f)}; } template HWY_INLINE VFromD Iota0(D /*d*/) { return VFromD{_mm_set_pd(1.0, 0.0)}; } #if HWY_COMPILER_MSVC template static HWY_INLINE V MaskOutVec128Iota(V v) { const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFF)}; return v & mask_out_mask; } template static HWY_INLINE V MaskOutVec128Iota(V v) { #if HWY_TARGET <= HWY_SSE4 return V{_mm_blend_epi16(v.raw, _mm_setzero_si128(), 0xFE)}; #else const V mask_out_mask{_mm_set_epi32(0, 0, 0, 0xFFFF)}; return v & mask_out_mask; #endif } template static HWY_INLINE V MaskOutVec128Iota(V v) { const DFromV d; const Repartition df; using VF = VFromD; return BitCast(d, VF{_mm_move_ss(_mm_setzero_ps(), BitCast(df, v).raw)}); } template static HWY_INLINE V MaskOutVec128Iota(V v) { const 
DFromV d; const RebindToUnsigned du; using VU = VFromD; return BitCast(d, VU{_mm_move_epi64(BitCast(du, v).raw)}); } template static HWY_INLINE V MaskOutVec128Iota(V v) { return v; } #endif } // namespace detail template HWY_API VFromD Iota(D d, const T2 first) { const auto result_iota = detail::Iota0(d) + Set(d, ConvertScalarTo>(first)); #if HWY_COMPILER_MSVC return detail::MaskOutVec128Iota(result_iota); #else return result_iota; #endif } // ------------------------------ FirstN (Iota, Lt) template , HWY_IF_V_SIZE_LE_D(D, 16)> HWY_API M FirstN(D d, size_t num) { constexpr size_t kN = MaxLanes(d); // For AVX3, this ensures `num` <= 255 as required by bzhi, which only looks // at the lower 8 bits; for AVX2 and below, this ensures `num` fits in TI. num = HWY_MIN(num, kN); #if HWY_TARGET <= HWY_AVX3 #if HWY_ARCH_X86_64 const uint64_t all = (1ull << kN) - 1; return M::FromBits(_bzhi_u64(all, num)); #else const uint32_t all = static_cast((1ull << kN) - 1); return M::FromBits(_bzhi_u32(all, static_cast(num))); #endif // HWY_ARCH_X86_64 #else // HWY_TARGET > HWY_AVX3 const RebindToSigned di; // Signed comparisons are cheaper. using TI = TFromD; return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(num))); #endif // HWY_TARGET <= HWY_AVX3 } // ------------------------------ InterleaveLower // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides // the least-significant lane) and "b". To concatenate two half-width integers // into one, use ZipLower/Upper instead (also works with scalar). template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{_mm_unpacklo_epi8(a.raw, b.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; // for float16_t return BitCast( d, VU{_mm_unpacklo_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{_mm_unpacklo_epi32(a.raw, b.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{_mm_unpacklo_epi64(a.raw, b.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{_mm_unpacklo_ps(a.raw, b.raw)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{_mm_unpacklo_pd(a.raw, b.raw)}; } // Generic for all vector lengths. 
template HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { return InterleaveLower(a, b); } // ================================================== MEMORY (2) // ------------------------------ MaskedLoad #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_maskz_loadu_epi8(m.raw, p)}; } template HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm_maskz_loadu_epi16(m.raw, p)}); } template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_maskz_loadu_epi32(m.raw, p)}; } template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_maskz_loadu_epi64(m.raw, p)}; } template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const float* HWY_RESTRICT p) { return VFromD{_mm_maskz_loadu_ps(m.raw, p)}; } template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const double* HWY_RESTRICT p) { return VFromD{_mm_maskz_loadu_pd(m.raw, p)}; } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_mask_loadu_epi8(v.raw, m.raw, p)}; } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{ _mm_mask_loadu_epi16(BitCast(du, v).raw, m.raw, p)}); } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_mask_loadu_epi32(v.raw, m.raw, p)}; } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { return VFromD{_mm_mask_loadu_epi64(v.raw, m.raw, p)}; } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, const float* HWY_RESTRICT p) { return VFromD{_mm_mask_loadu_ps(v.raw, m.raw, p)}; } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D /* tag */, const double* HWY_RESTRICT p) { return VFromD{_mm_mask_loadu_pd(v.raw, m.raw, p)}; } #elif HWY_TARGET == HWY_AVX2 template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { auto p_p = reinterpret_cast(p); // NOLINT return VFromD{_mm_maskload_epi32(p_p, m.raw)}; } template HWY_API VFromD MaskedLoad(MFromD m, D /* tag */, const TFromD* HWY_RESTRICT p) { auto p_p = reinterpret_cast(p); // NOLINT return VFromD{_mm_maskload_epi64(p_p, m.raw)}; } template HWY_API VFromD MaskedLoad(MFromD m, D d, const float* HWY_RESTRICT p) { const RebindToSigned di; return VFromD{_mm_maskload_ps(p, BitCast(di, VecFromMask(d, m)).raw)}; } template HWY_API VFromD MaskedLoad(MFromD m, D d, const double* HWY_RESTRICT p) { const RebindToSigned di; return VFromD{_mm_maskload_pd(p, BitCast(di, VecFromMask(d, m)).raw)}; } // There is no maskload_epi8/16, so blend instead. template HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) { return IfThenElseZero(m, LoadU(d, p)); } #else // <= SSE4 // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). template HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT p) { return IfThenElseZero(m, LoadU(d, p)); } #endif // ------------------------------ MaskedLoadOr #if HWY_TARGET > HWY_AVX3 // else: native // Generic for all vector lengths. 
template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const TFromD* HWY_RESTRICT p) { return IfThenElse(m, LoadU(d, p), v); } #endif // HWY_TARGET > HWY_AVX3 // ------------------------------ LoadN (InterleaveLower) #if HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT #ifdef HWY_NATIVE_LOAD_N #undef HWY_NATIVE_LOAD_N #else #define HWY_NATIVE_LOAD_N #endif // Generic for all vector lengths. template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> d_full; return ResizeBitCast(d, MaskedLoad(FirstN(d_full, num_lanes), d_full, p)); } // Generic for all vector lengths. template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> d_full; return ResizeBitCast(d, MaskedLoadOr(ResizeBitCast(d_full, no), FirstN(d_full, num_lanes), d_full, p)); } #if HWY_TARGET > HWY_AVX3 namespace detail { // 'Leading' means the part that fits in 32-bit lanes. With 2-byte vectors, // there are none, so return the remainder (v_trailing). template HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingN( VFromD /*load_mask*/, D /*d*/, const TFromD* HWY_RESTRICT /*p*/, VFromD v_trailing) { return v_trailing; } template HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingNOr( VFromD /*no*/, VFromD /*load_mask*/, D /*d*/, const TFromD* HWY_RESTRICT /*p*/, VFromD v_trailing) { return v_trailing; } template HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingN(VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, VFromD v_trailing) { using DI32 = Repartition; const FixedTag di32_full; // ResizeBitCast of load_mask to di32 is okay below if // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past // the first (lowest-index) lanes of load_mask.raw will have already been // zeroed out by FirstN. return ResizeBitCast( d, IfNegativeThenElse( ResizeBitCast(di32_full, load_mask), MaskedLoad(MaskFromVec(ResizeBitCast(di32_full, load_mask)), di32_full, reinterpret_cast(p)), ResizeBitCast(di32_full, v_trailing))); } template HWY_INLINE VFromD AVX2UIF8Or16LoadLeadingNOr(VFromD no, VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, VFromD v_trailing) { using DI32 = Repartition; const FixedTag di32_full; // ResizeBitCast of load_mask to di32 is okay below if // d.MaxBytes() < di32.MaxBytes() is true as any lanes of load_mask.raw past // the first (lowest-index) lanes of load_mask.raw will have already been // zeroed out by FirstN. return ResizeBitCast( d, IfNegativeThenElse( ResizeBitCast(di32_full, load_mask), MaskedLoadOr(ResizeBitCast(di32_full, no), MaskFromVec(ResizeBitCast(di32_full, load_mask)), di32_full, reinterpret_cast(p)), ResizeBitCast(di32_full, v_trailing))); } // Single lane: load or default value. template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD /*load_mask*/, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : Zero(d); } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingNOr( VFromD no, VFromD /*load_mask*/, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : no; } // Two lanes: load 1, 2, or default. template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD /*load_mask*/, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes > 1) { return LoadU(d, p); } else { const FixedTag, 1> d1; return (num_lanes == 1) ? 
ResizeBitCast(d, LoadU(d1, p)) : Zero(d); } } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingNOr( VFromD no, VFromD /*load_mask*/, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes > 1) { return LoadU(d, p); } else { if (num_lanes == 0) return no; // Load one, upper lane is default. const FixedTag, 1> d1; return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no); } } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const size_t trailing_n = num_lanes & 3; if (trailing_n == 0) return Zero(d); VFromD v_trailing = And(load_mask, Set(d, p[num_lanes - 1])); if ((trailing_n & 2) != 0) { const Repartition di16; int16_t i16_bits; CopyBytes(p + num_lanes - trailing_n, &i16_bits); v_trailing = BitCast( d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), BitCast(di16, v_trailing))); } return v_trailing; } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingNOr( VFromD no, VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const size_t trailing_n = num_lanes & 3; if (trailing_n == 0) return no; VFromD v_trailing = IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); if ((trailing_n & 2) != 0) { const Repartition di16; int16_t i16_bits; CopyBytes(p + num_lanes - trailing_n, &i16_bits); v_trailing = BitCast( d, IfNegativeThenElse(BitCast(di16, load_mask), Set(di16, i16_bits), BitCast(di16, v_trailing))); } return v_trailing; } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingN(VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if ((num_lanes & 1) != 0) { return And(load_mask, Set(d, p[num_lanes - 1])); } else { return Zero(d); } } template HWY_INLINE VFromD AVX2UIF8Or16LoadTrailingNOr( VFromD no, VFromD load_mask, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if ((num_lanes & 1) != 0) { return IfVecThenElse(load_mask, Set(d, p[num_lanes - 1]), no); } else { return no; } } } // namespace detail // Generic for all vector lengths. template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t N) { const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> d_full; const VFromD load_mask = ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); const VFromD v_trailing = detail::AVX2UIF8Or16LoadTrailingN(load_mask, d, p, num_lanes); #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD))) && num_lanes < (4 / sizeof(TFromD))) { return v_trailing; } #endif return detail::AVX2UIF8Or16LoadLeadingN(load_mask, d, p, v_trailing); } // Generic for all vector lengths. 
template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t N) { const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> d_full; const VFromD load_mask = ResizeBitCast(d, VecFromMask(d_full, FirstN(d_full, N))); const size_t num_lanes = HWY_MIN(N, HWY_MAX_LANES_D(D)); const VFromD v_trailing = detail::AVX2UIF8Or16LoadTrailingNOr(no, load_mask, d, p, num_lanes); #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(num_lanes < (4 / sizeof(TFromD))) && num_lanes < (4 / sizeof(TFromD))) { return v_trailing; } #endif return detail::AVX2UIF8Or16LoadLeadingNOr(no, load_mask, d, p, v_trailing); } #endif // HWY_TARGET > HWY_AVX3 #endif // HWY_TARGET <= HWY_AVX2 && !HWY_MEM_OPS_MIGHT_FAULT // ------------------------------ BlendedStore namespace detail { // There is no maskload_epi8/16 with which we could safely implement // BlendedStore. Manual blending is also unsafe because loading a full vector // that crosses the array end causes asan faults. Resort to scalar code; the // caller should instead use memcpy, assuming m is FirstN(d, n). template HWY_API void ScalarMaskedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { const RebindToSigned di; // for testing mask if T=bfloat16_t. using TI = TFromD; alignas(16) TI buf[MaxLanes(d)]; alignas(16) TI mask[MaxLanes(d)]; Store(BitCast(di, v), di, buf); Store(BitCast(di, VecFromMask(d, m)), di, mask); for (size_t i = 0; i < MaxLanes(d); ++i) { if (mask[i]) { CopySameSize(buf + i, p + i); } } } } // namespace detail #if HWY_TARGET <= HWY_AVX3 template HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, TFromD* HWY_RESTRICT p) { _mm_mask_storeu_epi8(p, m.raw, v.raw); } template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; // for float16_t _mm_mask_storeu_epi16(reinterpret_cast(p), RebindMask(du, m).raw, BitCast(du, v).raw); } template HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, TFromD* HWY_RESTRICT p) { auto pi = reinterpret_cast(p); // NOLINT _mm_mask_storeu_epi32(pi, m.raw, v.raw); } template HWY_API void BlendedStore(VFromD v, MFromD m, D /* tag */, TFromD* HWY_RESTRICT p) { auto pi = reinterpret_cast(p); // NOLINT _mm_mask_storeu_epi64(pi, m.raw, v.raw); } template HWY_API void BlendedStore(VFromD v, MFromD m, D, float* HWY_RESTRICT p) { _mm_mask_storeu_ps(p, m.raw, v.raw); } template HWY_API void BlendedStore(VFromD v, MFromD m, D, double* HWY_RESTRICT p) { _mm_mask_storeu_pd(p, m.raw, v.raw); } #elif HWY_TARGET == HWY_AVX2 template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { detail::ScalarMaskedStore(v, m, d, p); } namespace detail { template HWY_INLINE void NativeBlendedStore(V v, M m, TFromD* HWY_RESTRICT p) { auto pi = reinterpret_cast(p); // NOLINT _mm_maskstore_epi32(pi, m.raw, v.raw); } template HWY_INLINE void NativeBlendedStore(V v, M m, TFromD* HWY_RESTRICT p) { auto pi = reinterpret_cast(p); // NOLINT _mm_maskstore_epi64(pi, m.raw, v.raw); } template HWY_INLINE void NativeBlendedStore(V v, M m, float* HWY_RESTRICT p) { _mm_maskstore_ps(p, m.raw, v.raw); } template HWY_INLINE void NativeBlendedStore(V v, M m, double* HWY_RESTRICT p) { _mm_maskstore_pd(p, m.raw, v.raw); } } // namespace detail template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { const RebindToSigned di; // For partial vectors, avoid writing other lanes by zeroing their mask. 
if (d.MaxBytes() < 16) { const Full128> dfull; const Mask128> mfull{m.raw}; m = MFromD{And(mfull, FirstN(dfull, MaxLanes(d))).raw}; } // Float/double require, and unsigned ints tolerate, signed int masks. detail::NativeBlendedStore(v, RebindMask(di, m), p); } #else // <= SSE4 template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { // Avoid maskmov* - its nontemporal 'hint' causes it to bypass caches (slow). detail::ScalarMaskedStore(v, m, d, p); } #endif // SSE4 // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi8(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi16(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi32(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi64(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi8(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi16(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi32(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_epi64(a.raw, b.raw)}; } // Float #if HWY_HAVE_FLOAT16 template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_ps(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{_mm_add_pd(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi8(a.raw, b.raw)}; } template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { return Vec128{_mm_sub_epi16(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi32(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi64(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi8(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi16(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi32(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_epi64(a.raw, b.raw)}; } // Float #if HWY_HAVE_FLOAT16 template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_ps(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{_mm_sub_pd(a.raw, b.raw)}; } // ------------------------------ AddSub #if HWY_TARGET <= HWY_SSSE3 template HWY_API Vec128 AddSub(Vec128 a, Vec128 b) { return Vec128{_mm_addsub_ps(a.raw, b.raw)}; } HWY_API Vec128 AddSub(Vec128 a, Vec128 b) { return Vec128{_mm_addsub_pd(a.raw, b.raw)}; } #endif // 
HWY_TARGET <= HWY_SSSE3 // ------------------------------ SumsOf8 template HWY_API Vec128 SumsOf8(const Vec128 v) { return Vec128{_mm_sad_epu8(v.raw, _mm_setzero_si128())}; } // Generic for all vector lengths template )> HWY_API VFromD>> SumsOf8(V v) { const DFromV d; const RebindToUnsigned du; const Repartition di64; // Adjust the values of v to be in the 0..255 range by adding 128 to each lane // of v (which is the same as an bitwise XOR of each i8 lane by 128) and then // bitcasting the Xor result to an u8 vector. const auto v_adj = BitCast(du, Xor(v, SignBit(d))); // Need to add -1024 to each i64 lane of the result of the SumsOf8(v_adj) // operation to account for the adjustment made above. return BitCast(di64, SumsOf8(v_adj)) + Set(di64, int64_t{-1024}); } #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF #endif template HWY_API Vec128 SumsOf8AbsDiff(const Vec128 a, const Vec128 b) { return Vec128{_mm_sad_epu8(a.raw, b.raw)}; } // Generic for all vector lengths template )> HWY_API VFromD>> SumsOf8AbsDiff(V a, V b) { const DFromV d; const RebindToUnsigned du; const RepartitionToWideX3 di64; // Adjust the values of a and b to be in the 0..255 range by adding 128 to // each lane of a and b (which is the same as an bitwise XOR of each i8 lane // by 128) and then bitcasting the results of the Xor operations to u8 // vectors. const auto i8_msb = SignBit(d); const auto a_adj = BitCast(du, Xor(a, i8_msb)); const auto b_adj = BitCast(du, Xor(b, i8_msb)); // The result of SumsOf8AbsDiff(a_adj, b_adj) can simply be bitcasted to an // i64 vector as |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true return BitCast(di64, SumsOf8AbsDiff(a_adj, b_adj)); } // ------------------------------ SumsOf4 #if HWY_TARGET <= HWY_AVX3 namespace detail { template HWY_INLINE Vec128 SumsOf4( hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, Vec128 v) { const DFromV d; // _mm_maskz_dbsad_epu8 is used below as the odd uint16_t lanes need to be // zeroed out and the sums of the 4 consecutive lanes are already in the // even uint16_t lanes of the _mm_maskz_dbsad_epu8 result. return Vec128{ _mm_maskz_dbsad_epu8(static_cast<__mmask8>(0x55), v.raw, Zero(d).raw, 0)}; } // detail::SumsOf4 for Vec128 on AVX3 is implemented in x86_512-inl.h } // namespace detail #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ SumsOfAdjQuadAbsDiff #if HWY_TARGET <= HWY_SSE4 #ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF #endif template HWY_API Vec128 SumsOfAdjQuadAbsDiff( Vec128 a, Vec128 b) { static_assert(0 <= kAOffset && kAOffset <= 1, "kAOffset must be between 0 and 1"); static_assert(0 <= kBOffset && kBOffset <= 3, "kBOffset must be between 0 and 3"); return Vec128{ _mm_mpsadbw_epu8(a.raw, b.raw, (kAOffset << 2) | kBOffset)}; } // Generic for all vector lengths template )> HWY_API VFromD>> SumsOfAdjQuadAbsDiff(V a, V b) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; // Adjust the values of a and b to be in the 0..255 range by adding 128 to // each lane of a and b (which is the same as an bitwise XOR of each i8 lane // by 128) and then bitcasting the results of the Xor operations to u8 // vectors. 
const auto i8_msb = SignBit(d); const auto a_adj = BitCast(du, Xor(a, i8_msb)); const auto b_adj = BitCast(du, Xor(b, i8_msb)); // The result of SumsOfAdjQuadAbsDiff(a_adj, b_adj) can // simply be bitcasted to an i16 vector as // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. return BitCast(dw, SumsOfAdjQuadAbsDiff(a_adj, b_adj)); } #endif // ------------------------------ SumsOfShuffledQuadAbsDiff #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF #endif template HWY_API Vec128 SumsOfShuffledQuadAbsDiff( Vec128 a, Vec128 b) { static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3"); static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3"); static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3"); static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3"); return Vec128{ _mm_dbsad_epu8(b.raw, a.raw, _MM_SHUFFLE(kIdx3, kIdx2, kIdx1, kIdx0))}; } // Generic for all vector lengths template )> HWY_API VFromD>> SumsOfShuffledQuadAbsDiff(V a, V b) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; // Adjust the values of a and b to be in the 0..255 range by adding 128 to // each lane of a and b (which is the same as an bitwise XOR of each i8 lane // by 128) and then bitcasting the results of the Xor operations to u8 // vectors. const auto i8_msb = SignBit(d); const auto a_adj = BitCast(du, Xor(a, i8_msb)); const auto b_adj = BitCast(du, Xor(b, i8_msb)); // The result of // SumsOfShuffledQuadAbsDiff(a_adj, b_adj) can // simply be bitcasted to an i16 vector as // |(a[i] + 128) - (b[i] + 128)| == |a[i] - b[i]| is true. return BitCast( dw, SumsOfShuffledQuadAbsDiff(a_adj, b_adj)); } #endif // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. 
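// Usage sketch (hypothetical helper, not part of the API), assuming uint8_t
// lanes: operator+ would wrap 200 + 100 to 44, whereas SaturatedAdd clamps the
// result to 255.
template <class D, typename T = TFromD<D>>
HWY_INLINE VFromD<D> ExampleSaturatedAddDemo(D d) {
  const VFromD<D> a = Set(d, static_cast<T>(200));
  const VFromD<D> b = Set(d, static_cast<T>(100));
  return SaturatedAdd(a, b);  // all lanes 255 when T is uint8_t
}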
// Unsigned template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{_mm_adds_epu8(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{_mm_adds_epu16(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{_mm_adds_epi8(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{_mm_adds_epi16(a.raw, b.raw)}; } #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif template HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { const DFromV d; const auto sum = a + b; const auto overflow_mask = MaskFromVec( Vec128{_mm_ternarylogic_epi32(a.raw, b.raw, sum.raw, 0x42)}); const auto i32_max = Set(d, LimitsMax()); const Vec128 overflow_result{_mm_mask_ternarylogic_epi32( i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, sum); } template HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { const DFromV d; const auto sum = a + b; const auto overflow_mask = MaskFromVec( Vec128{_mm_ternarylogic_epi64(a.raw, b.raw, sum.raw, 0x42)}); const auto i64_max = Set(d, LimitsMax()); const Vec128 overflow_result{_mm_mask_ternarylogic_epi64( i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, sum); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{_mm_subs_epu8(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{_mm_subs_epu16(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{_mm_subs_epi8(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{_mm_subs_epi16(a.raw, b.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { const DFromV d; const auto diff = a - b; const auto overflow_mask = MaskFromVec( Vec128{_mm_ternarylogic_epi32(a.raw, b.raw, diff.raw, 0x18)}); const auto i32_max = Set(d, LimitsMax()); const Vec128 overflow_result{_mm_mask_ternarylogic_epi32( i32_max.raw, MaskFromVec(a).raw, i32_max.raw, i32_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, diff); } template HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { const DFromV d; const auto diff = a - b; const auto overflow_mask = MaskFromVec( Vec128{_mm_ternarylogic_epi64(a.raw, b.raw, diff.raw, 0x18)}); const auto i64_max = Set(d, LimitsMax()); const Vec128 overflow_result{_mm_mask_ternarylogic_epi64( i64_max.raw, MaskFromVec(a).raw, i64_max.raw, i64_max.raw, 0x55)}; return IfThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ AverageRound // Returns (a + b + 1) / 2 // Unsigned template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{_mm_avg_epu8(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{_mm_avg_epu16(a.raw, b.raw)}; } // ------------------------------ Integer multiplication 
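// Illustrative sketch (hypothetical helper, not part of the API): operator*
// below returns only the lower half of each lane's product. Pair it with
// MulHigh (16-bit lanes), or use the widening MulEven/MulOdd, when the full
// product is required.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> ExampleLowHalfProduct(Vec128<T, N> a, Vec128<T, N> b) {
  return a * b;  // e.g. uint16_t lanes: (a[i] * b[i]) & 0xFFFF
}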
template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{_mm_mullo_epi16(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{_mm_mullo_epi16(a.raw, b.raw)}; } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{_mm_mulhi_epu16(a.raw, b.raw)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { return Vec128{_mm_mulhi_epi16(a.raw, b.raw)}; } // Multiplies even lanes (0, 2 ..) and places the double-wide result into // even and the upper half into its odd neighbor lane. template )> HWY_API VFromD>> MulEven(V a, V b) { const DFromV d; const RepartitionToWide dw; const auto lo8_mask = Set(dw, uint16_t{0x00FF}); return And(ResizeBitCast(dw, a), lo8_mask) * And(ResizeBitCast(dw, b), lo8_mask); } template )> HWY_API VFromD>> MulEven(V a, V b) { const DFromV d; const RepartitionToWide dw; return ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, a))) * ShiftRight<8>(ShiftLeft<8>(ResizeBitCast(dw, b))); } template )> HWY_API VFromD>> MulEven(V a, V b) { const DFromV d; const RepartitionToWide dw; const RepartitionToNarrow dw_as_d16; const auto lo = ResizeBitCast(dw, a * b); const auto hi = ShiftLeft<16>(ResizeBitCast(dw, MulHigh(a, b))); return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { return Vec128{_mm_mul_epu32(a.raw, b.raw)}; } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 const DFromV d; const RepartitionToWide dw; const RebindToUnsigned du; // p[i] = (((a[i] >> 31) * (a[i] >> 31)) << 64) + // (((a[i] >> 31) * b[i]) << 32) + // (((b[i] >> 31) * a[i]) << 32) + // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) // ((a[i] >> 31) * (a[i] >> 31)) << 64 does not need to be computed as the // lower 64 bits of ((a[i] >> 31) * (a[i] >> 31)) << 64 is zero. // (((a[i] >> 31) * b[i]) << 32) + (((b[i] >> 31) * a[i]) << 32) == // -((((a[i] >> 31) & b[i]) + ((b[i] >> 31) & a[i])) << 32) // ((a[i] & int64_t{0xFFFFFFFF}) * (b[i] & int64_t{0xFFFFFFFF})) can be // computed using MulEven(BitCast(du, a), BitCast(du, b)) const auto neg_p_hi = ShiftLeft<32>( ResizeBitCast(dw, And(ShiftRight<31>(a), b) + And(ShiftRight<31>(b), a))); const auto p_lo = BitCast(dw, MulEven(BitCast(du, a), BitCast(du, b))); return p_lo - neg_p_hi; #else return Vec128{_mm_mul_epi32(a.raw, b.raw)}; #endif } template HWY_API VFromD>> MulOdd(V a, V b) { const DFromV d; const RepartitionToWide dw; return ShiftRight<8>(ResizeBitCast(dw, a)) * ShiftRight<8>(ResizeBitCast(dw, b)); } template )> HWY_API VFromD>> MulOdd(V a, V b) { const DFromV d; const RepartitionToWide dw; const RebindToUnsigned dw_u; const RepartitionToNarrow dw_as_d16; const auto lo = ShiftRight<16>(BitCast(dw_u, ResizeBitCast(dw, a * b))); const auto hi = ResizeBitCast(dw, MulHigh(a, b)); return BitCast(dw, OddEven(BitCast(dw_as_d16, hi), BitCast(dw_as_d16, lo))); } template )> HWY_API VFromD>> MulOdd(V a, V b) { return MulEven(DupOdd(a), DupOdd(b)); } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 // Not as inefficient as it looks: _mm_mullo_epi32 has 10 cycle latency. // 64-bit right shift would also work but also needs port 5, so no benefit. // Notation: x=don't care, z=0. 
const __m128i a_x3x1 = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 3, 1, 1)); const auto mullo_x2x0 = MulEven(a, b); const __m128i b_x3x1 = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(3, 3, 1, 1)); const auto mullo_x3x1 = MulEven(Vec128{a_x3x1}, Vec128{b_x3x1}); // We could _mm_slli_epi64 by 32 to get 3z1z and OR with z2z0, but generating // the latter requires one more instruction or a constant. const __m128i mul_20 = _mm_shuffle_epi32(mullo_x2x0.raw, _MM_SHUFFLE(2, 0, 2, 0)); const __m128i mul_31 = _mm_shuffle_epi32(mullo_x3x1.raw, _MM_SHUFFLE(2, 0, 2, 0)); return Vec128{_mm_unpacklo_epi32(mul_20, mul_31)}; #else return Vec128{_mm_mullo_epi32(a.raw, b.raw)}; #endif } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { // Same as unsigned; avoid duplicating the SSSE3 code. const DFromV d; const RebindToUnsigned du; return BitCast(d, BitCast(du, a) * BitCast(du, b)); } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec128 RotateRight(const Vec128 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; // AVX3 does not support 8/16-bit. return Or(ShiftRight(v), ShiftLeft(v)); } template HWY_API Vec128 RotateRight(const Vec128 v) { static_assert(0 <= kBits && kBits < 32, "Invalid shift count"); #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_ror_epi32(v.raw, kBits)}; #else if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); #endif } template HWY_API Vec128 RotateRight(const Vec128 v) { static_assert(0 <= kBits && kBits < 64, "Invalid shift count"); #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_ror_epi64(v.raw, kBits)}; #else if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); #endif } // ------------------------------ BroadcastSignBit (ShiftRight, compare, mask) template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { const DFromV d; return VecFromMask(v < Zero(d)); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight<15>(v); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight<31>(v); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { const DFromV d; #if HWY_TARGET <= HWY_AVX3 (void)d; return Vec128{_mm_srai_epi64(v.raw, 63)}; #elif HWY_TARGET == HWY_AVX2 || HWY_TARGET == HWY_SSE4 return VecFromMask(v < Zero(d)); #else // Efficient Lt() requires SSE4.2 and BLENDVPD requires SSE4.1. 32-bit shift // avoids generating a zero. const RepartitionToNarrow d32; const auto sign = ShiftRight<31>(BitCast(d32, v)); return Vec128{ _mm_shuffle_epi32(sign.raw, _MM_SHUFFLE(3, 3, 1, 1))}; #endif } // ------------------------------ Integer Abs // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 
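// Usage sketch (hypothetical helper, not part of the API), assuming int8_t
// lanes: +128 is not representable, so Abs(-128) wraps back to -128, whereas
// SaturatedAbs (defined below) clamps it to LimitsMax<T>().
template <class D, typename T = TFromD<D>>
HWY_INLINE VFromD<D> ExampleAbsOfLimitsMin(D d) {
  const VFromD<D> v = Set(d, LimitsMin<T>());
  return Abs(v);  // still LimitsMin<T>(); SaturatedAbs(v) would return +127
}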
template HWY_API Vec128 Abs(const Vec128 v) { #if HWY_COMPILER_MSVC || HWY_TARGET == HWY_SSE2 const DFromV d; const RebindToUnsigned du; const auto zero = Zero(du); const auto v_as_u8 = BitCast(du, v); return BitCast(d, Min(v_as_u8, zero - v_as_u8)); #else return Vec128{_mm_abs_epi8(v.raw)}; #endif } template HWY_API Vec128 Abs(const Vec128 v) { #if HWY_TARGET == HWY_SSE2 const auto zero = Zero(DFromV()); return Max(v, zero - v); #else return Vec128{_mm_abs_epi16(v.raw)}; #endif } template HWY_API Vec128 Abs(const Vec128 v) { #if HWY_TARGET <= HWY_SSSE3 return Vec128{_mm_abs_epi32(v.raw)}; #else const auto zero = Zero(DFromV()); return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); #endif } #if HWY_TARGET <= HWY_AVX3 template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{_mm_abs_epi64(v.raw)}; } #else // I64 Abs is generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template )> HWY_API V Abs(V v) { const auto zero = Zero(DFromV()); return IfNegativeThenElse(v, zero - v, v); } #endif #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif // Generic for all vector lengths template )> HWY_API V SaturatedAbs(V v) { const DFromV d; const RebindToUnsigned du; return BitCast(d, Min(BitCast(du, v), BitCast(du, SaturatedSub(Zero(d), v)))); } // Generic for all vector lengths template )> HWY_API V SaturatedAbs(V v) { return Max(v, SaturatedSub(Zero(DFromV()), v)); } // Generic for all vector lengths template )> HWY_API V SaturatedAbs(V v) { const auto abs_v = Abs(v); #if HWY_TARGET <= HWY_SSE4 const DFromV d; const RebindToUnsigned du; return BitCast(d, Min(BitCast(du, abs_v), Set(du, static_cast(LimitsMax())))); #else return Add(abs_v, BroadcastSignBit(abs_v)); #endif } // Generic for all vector lengths template )> HWY_API V SaturatedAbs(V v) { const auto abs_v = Abs(v); return Add(abs_v, BroadcastSignBit(abs_v)); } // GCC <14 and Clang <11 do not follow the Intel documentation for AVX-512VL // srli_epi64: the count should be unsigned int. Note that this is not the same // as the Shift3264Count in x86_512-inl.h (GCC also requires int). #if (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) || \ (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) using Shift64Count = int; #else // Assume documented behavior. Clang 12, GCC 14 and MSVC 14.28.29910 match this. 
using Shift64Count = unsigned int; #endif template HWY_API Vec128 ShiftRight(const Vec128 v) { #if HWY_TARGET <= HWY_AVX3 return Vec128{ _mm_srai_epi64(v.raw, static_cast(kBits))}; #else const DFromV di; const RebindToUnsigned du; const auto right = BitCast(di, ShiftRight(BitCast(du, v))); const auto sign = ShiftLeft<64 - kBits>(BroadcastSignBit(v)); return right | sign; #endif } // ------------------------------ ZeroIfNegative (BroadcastSignBit) template HWY_API Vec128 ZeroIfNegative(Vec128 v) { static_assert(IsFloat(), "Only works for float"); const DFromV d; #if HWY_TARGET >= HWY_SSSE3 const RebindToSigned di; const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); #else const auto mask = MaskFromVec(v); // MSB is sufficient for BLENDVPS #endif return IfThenElse(mask, Zero(d), v); } // ------------------------------ IfNegativeThenElse template HWY_API Vec128 IfNegativeThenElse(const Vec128 v, const Vec128 yes, const Vec128 no) { // int8: IfThenElse only looks at the MSB on SSE4 or newer #if HWY_TARGET <= HWY_SSE4 const auto mask = MaskFromVec(v); #else const DFromV d; const RebindToSigned di; const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); #endif return IfThenElse(mask, yes, no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); // 16-bit: no native blendv on AVX2 or earlier, so copy sign to lower byte's // MSB. #if HWY_TARGET <= HWY_AVX3 const auto mask = MaskFromVec(v); #else const DFromV d; const RebindToSigned di; const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); #endif return IfThenElse(mask, yes, no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE4 // 32/64-bit: use float IfThenElse on SSE4/AVX2, which only looks at the MSB // on SSE4 or later. 
const RebindToFloat df; const auto mask = MaskFromVec(BitCast(df, v)); return BitCast(d, IfThenElse(mask, BitCast(df, yes), BitCast(df, no))); #else // SSE2, SSSE3, or AVX3 #if HWY_TARGET <= HWY_AVX3 // No need to cast to float or broadcast sign bit on AVX3 as IfThenElse only // looks at the MSB on AVX3 (void)d; const auto mask = MaskFromVec(v); #else const RebindToSigned di; const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); #endif return IfThenElse(mask, yes, no); #endif } // ------------------------------ IfNegativeThenNegOrUndefIfZero #if HWY_TARGET <= HWY_SSSE3 #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #else #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #endif template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero(Vec128 mask, Vec128 v) { return Vec128{_mm_sign_epi8(v.raw, mask.raw)}; } template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero( Vec128 mask, Vec128 v) { return Vec128{_mm_sign_epi16(v.raw, mask.raw)}; } template HWY_API Vec128 IfNegativeThenNegOrUndefIfZero( Vec128 mask, Vec128 v) { return Vec128{_mm_sign_epi32(v.raw, mask.raw)}; } // Generic for all vector lengths template )> HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { #if HWY_TARGET <= HWY_AVX3 // MaskedSubOr is more efficient than IfNegativeThenElse on AVX3 const DFromV d; return MaskedSubOr(v, MaskFromVec(mask), Zero(d), v); #else // IfNegativeThenElse is more efficient than MaskedSubOr on SSE4/AVX2 return IfNegativeThenElse(mask, Neg(v), v); #endif } #endif // HWY_TARGET <= HWY_SSSE3 // ------------------------------ ShiftLeftSame template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi16(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi32(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi64(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi16(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi16(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi32(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi32(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_slli_epi64(v.raw, bits)}; } #endif return Vec128{_mm_sll_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. 
const Vec128 shifted{ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); } // ------------------------------ ShiftRightSame (BroadcastSignBit) template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_srli_epi16(v.raw, bits)}; } #endif return Vec128{_mm_srl_epi16(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_srli_epi32(v.raw, bits)}; } #endif return Vec128{_mm_srl_epi32(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_srli_epi64(v.raw, bits)}; } #endif return Vec128{_mm_srl_epi64(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftRightSame(Vec128{v.raw}, bits).raw}; return shifted & Set(d8, static_cast(0xFF >> bits)); } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_srai_epi16(v.raw, bits)}; } #endif return Vec128{_mm_sra_epi16(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{_mm_srai_epi32(v.raw, bits)}; } #endif return Vec128{_mm_sra_epi32(v.raw, _mm_cvtsi32_si128(bits))}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { #if HWY_TARGET <= HWY_AVX3 #if HWY_COMPILER_GCC if (__builtin_constant_p(bits)) { return Vec128{ _mm_srai_epi64(v.raw, static_cast(bits))}; } #endif return Vec128{_mm_sra_epi64(v.raw, _mm_cvtsi32_si128(bits))}; #else const DFromV di; const RebindToUnsigned du; const auto right = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto sign = ShiftLeftSame(BroadcastSignBit(v), 64 - bits); return right | sign; #endif } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto shifted_sign = BitCast(di, Set(du, static_cast(0x80 >> bits))); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ Floating-point mul / div #if HWY_HAVE_FLOAT16 template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{_mm_mul_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{_mm_mul_ps(a.raw, b.raw)}; } HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{_mm_mul_ss(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{_mm_mul_pd(a.raw, b.raw)}; } HWY_API Vec64 operator*(const Vec64 a, const Vec64 b) { return Vec64{_mm_mul_sd(a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{_mm_div_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{_mm_div_ps(a.raw, b.raw)}; } HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{_mm_div_ss(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return 
Vec128{_mm_div_pd(a.raw, b.raw)}; } HWY_API Vec64 operator/(const Vec64 a, const Vec64 b) { return Vec64{_mm_div_sd(a.raw, b.raw)}; } // Approximate reciprocal #if HWY_HAVE_FLOAT16 template HWY_API Vec128 ApproximateReciprocal( const Vec128 v) { return Vec128{_mm_rcp_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { return Vec128{_mm_rcp_ps(v.raw)}; } HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { return Vec128{_mm_rcp_ss(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif HWY_API Vec128 ApproximateReciprocal(Vec128 v) { return Vec128{_mm_rcp14_pd(v.raw)}; } HWY_API Vec64 ApproximateReciprocal(Vec64 v) { return Vec64{_mm_rcp14_sd(v.raw, v.raw)}; } #endif // Generic for all vector lengths. template HWY_API V AbsDiff(V a, V b) { return Abs(a - b); } // ------------------------------ MaskedMinOr #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_MASKED_ARITH #undef HWY_NATIVE_MASKED_ARITH #else #define HWY_NATIVE_MASKED_ARITH #endif template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epu16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epu32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epu64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedMinOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_min_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedMaxOr template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epu16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epu32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 
MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epu64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedMaxOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_max_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedAddOr template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_add_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedSubOr template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_epi32(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_epi64(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_sub_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedMulOr // There are no elementwise integer mask_mul. Generic for all vector lengths. 
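// Illustrative sketch, not part of the Highway API: `MaskedScaleExample` is a
// hypothetical helper showing how the generic integer MaskedMulOr that follows
// is expressed, i.e. an unmasked multiply blended with the fallback value via
// IfThenElse. Lanes not selected by `m` keep their original value. Assumes a
// lane type for which operator* is defined (e.g. 16/32-bit integers).
template <class V, class M>
HWY_INLINE HWY_MAYBE_UNUSED V MaskedScaleExample(M m, V v, TFromV<V> scale) {
  const DFromV<V> d;
  return IfThenElse(m, v * Set(d, scale), v);
}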
template HWY_API V MaskedMulOr(V no, M m, V a, V b) { return IfThenElse(m, a * b, no); } template HWY_API Vec128 MaskedMulOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_mul_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedMulOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_mul_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedMulOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_mul_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // ------------------------------ MaskedDivOr template HWY_API Vec128 MaskedDivOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_div_ps(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedDivOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_div_pd(no.raw, m.raw, a.raw, b.raw)}; } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MaskedDivOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_div_ph(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 // Generic for all vector lengths template HWY_API V MaskedDivOr(V no, MFromD> m, V a, V b) { return IfThenElse(m, Div(a, b), no); } // ------------------------------ MaskedModOr // Generic for all vector lengths template HWY_API V MaskedModOr(V no, MFromD> m, V a, V b) { return IfThenElse(m, Mod(a, b), no); } // ------------------------------ MaskedSatAddOr template HWY_API Vec128 MaskedSatAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_adds_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_adds_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_adds_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatAddOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_adds_epu16(no.raw, m.raw, a.raw, b.raw)}; } // ------------------------------ MaskedSatSubOr template HWY_API Vec128 MaskedSatSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_subs_epi8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_subs_epu8(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_subs_epi16(no.raw, m.raw, a.raw, b.raw)}; } template HWY_API Vec128 MaskedSatSubOr(Vec128 no, Mask128 m, Vec128 a, Vec128 b) { return Vec128{_mm_mask_subs_epu16(no.raw, m.raw, a.raw, b.raw)}; } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Floating-point multiply-add variants #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{_mm_fmadd_ph(mul.raw, x.raw, add.raw)}; } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{_mm_fnmadd_ph(mul.raw, x.raw, add.raw)}; } template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{_mm_fmsub_ph(mul.raw, x.raw, sub.raw)}; } template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{_mm_fnmsub_ph(mul.raw, x.raw, sub.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return mul * x + add; #else return Vec128{_mm_fmadd_ps(mul.raw, x.raw, 
add.raw)}; #endif } template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return mul * x + add; #else return Vec128{_mm_fmadd_pd(mul.raw, x.raw, add.raw)}; #endif } // Returns add - mul * x template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return add - mul * x; #else return Vec128{_mm_fnmadd_ps(mul.raw, x.raw, add.raw)}; #endif } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return add - mul * x; #else return Vec128{_mm_fnmadd_pd(mul.raw, x.raw, add.raw)}; #endif } // Returns mul * x - sub template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return mul * x - sub; #else return Vec128{_mm_fmsub_ps(mul.raw, x.raw, sub.raw)}; #endif } template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return mul * x - sub; #else return Vec128{_mm_fmsub_pd(mul.raw, x.raw, sub.raw)}; #endif } // Returns -mul * x - sub template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return Neg(mul) * x - sub; #else return Vec128{_mm_fnmsub_ps(mul.raw, x.raw, sub.raw)}; #endif } template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return Neg(mul) * x - sub; #else return Vec128{_mm_fnmsub_pd(mul.raw, x.raw, sub.raw)}; #endif } #if HWY_TARGET <= HWY_SSSE3 #if HWY_HAVE_FLOAT16 template HWY_API Vec128 MulAddSub(Vec128 mul, Vec128 x, Vec128 sub_or_add) { return Vec128{_mm_fmaddsub_ph(mul.raw, x.raw, sub_or_add.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 MulAddSub(Vec128 mul, Vec128 x, Vec128 sub_or_add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return AddSub(mul * x, sub_or_add); #else return Vec128{_mm_fmaddsub_ps(mul.raw, x.raw, sub_or_add.raw)}; #endif } HWY_API Vec128 MulAddSub(Vec128 mul, Vec128 x, Vec128 sub_or_add) { #if HWY_TARGET >= HWY_SSE4 || defined(HWY_DISABLE_BMI2_FMA) return AddSub(mul * x, sub_or_add); #else return Vec128{_mm_fmaddsub_pd(mul.raw, x.raw, sub_or_add.raw)}; #endif } #endif // HWY_TARGET <= HWY_SSSE3 // ------------------------------ Floating-point square root // Full precision square root #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{_mm_sqrt_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{_mm_sqrt_ps(v.raw)}; } HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{_mm_sqrt_ss(v.raw)}; } template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{_mm_sqrt_pd(v.raw)}; } HWY_API Vec64 Sqrt(Vec64 v) { return Vec64{_mm_sqrt_sd(_mm_setzero_pd(), v.raw)}; } // Approximate reciprocal square root #if HWY_HAVE_FLOAT16 template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{_mm_rsqrt_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{_mm_rsqrt_ps(v.raw)}; } HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{_mm_rsqrt_ss(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif HWY_API Vec64 ApproximateReciprocalSqrt(Vec64 v) { return Vec64{_mm_rsqrt14_sd(v.raw, v.raw)}; } HWY_API 
Vec128 ApproximateReciprocalSqrt(Vec128 v) { #if HWY_COMPILER_MSVC const DFromV d; return Vec128{_mm_mask_rsqrt14_pd( Undefined(d).raw, static_cast<__mmask8>(0xFF), v.raw)}; #else return Vec128{_mm_rsqrt14_pd(v.raw)}; #endif } #endif // ------------------------------ Min (Gt, IfThenElse) namespace detail { template HWY_INLINE HWY_MAYBE_UNUSED Vec128 MinU(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); return IfThenElse(gt, b, a); } } // namespace detail // Unsigned template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{_mm_min_epu8(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return detail::MinU(a, b); #else return Vec128{_mm_min_epu16(a.raw, b.raw)}; #endif } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return detail::MinU(a, b); #else return Vec128{_mm_min_epu32(a.raw, b.raw)}; #endif } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_min_epu64(a.raw, b.raw)}; #else return detail::MinU(a, b); #endif } // Signed template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return IfThenElse(a < b, a, b); #else return Vec128{_mm_min_epi8(a.raw, b.raw)}; #endif } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{_mm_min_epi16(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return IfThenElse(a < b, a, b); #else return Vec128{_mm_min_epi32(a.raw, b.raw)}; #endif } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_min_epi64(a.raw, b.raw)}; #else return IfThenElse(a < b, a, b); #endif } // Float #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{_mm_min_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{_mm_min_ps(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{_mm_min_pd(a.raw, b.raw)}; } // ------------------------------ Max (Gt, IfThenElse) namespace detail { template HWY_INLINE HWY_MAYBE_UNUSED Vec128 MaxU(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; const auto msb = Set(du, static_cast(T(1) << (sizeof(T) * 8 - 1))); const auto gt = RebindMask(du, BitCast(di, a ^ msb) > BitCast(di, b ^ msb)); return IfThenElse(gt, a, b); } } // namespace detail // Unsigned template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{_mm_max_epu8(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return detail::MaxU(a, b); #else return Vec128{_mm_max_epu16(a.raw, b.raw)}; #endif } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return detail::MaxU(a, b); #else return Vec128{_mm_max_epu32(a.raw, b.raw)}; #endif } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_max_epu64(a.raw, b.raw)}; #else return detail::MaxU(a, b); #endif } // Signed template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 return IfThenElse(a < b, b, a); #else return Vec128{_mm_max_epi8(a.raw, b.raw)}; #endif } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{_mm_max_epi16(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if 
HWY_TARGET >= HWY_SSSE3 return IfThenElse(a < b, b, a); #else return Vec128{_mm_max_epi32(a.raw, b.raw)}; #endif } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_max_epi64(a.raw, b.raw)}; #else return IfThenElse(a < b, b, a); #endif } // Float #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{_mm_max_ph(a.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{_mm_max_ps(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{_mm_max_pd(a.raw, b.raw)}; } // ================================================== MEMORY (3) // ------------------------------ Non-temporal stores // On clang6, we see incorrect code generated for _mm_stream_pi, so // round even partial vectors up to 16 bytes. template HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { const RebindToUnsigned du; // for float16_t _mm_stream_si128(reinterpret_cast<__m128i*>(aligned), BitCast(du, v).raw); } template HWY_API void Stream(VFromD v, D /* tag */, float* HWY_RESTRICT aligned) { _mm_stream_ps(aligned, v.raw); } template HWY_API void Stream(VFromD v, D /* tag */, double* HWY_RESTRICT aligned) { _mm_stream_pd(aligned, v.raw); } // ------------------------------ Scatter // Work around warnings in the intrinsic definitions (passing -1 as a mask). HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") // Unfortunately the GCC/Clang intrinsics do not accept int64_t*. using GatherIndex64 = long long int; // NOLINT(runtime/int) static_assert(sizeof(GatherIndex64) == 8, "Must be 64-bit type"); #if HWY_TARGET <= HWY_AVX3 #ifdef HWY_NATIVE_SCATTER #undef HWY_NATIVE_SCATTER #else #define HWY_NATIVE_SCATTER #endif namespace detail { template HWY_INLINE void NativeScatter128(VFromD v, D d, TFromD* HWY_RESTRICT base, VI index) { if (d.MaxBytes() == 16) { _mm_i32scatter_epi32(base, index.raw, v.raw, kScale); } else { const __mmask8 mask = (1u << MaxLanes(d)) - 1; _mm_mask_i32scatter_epi32(base, mask, index.raw, v.raw, kScale); } } template HWY_INLINE void NativeScatter128(VFromD v, D d, TFromD* HWY_RESTRICT base, VI index) { if (d.MaxBytes() == 16) { _mm_i64scatter_epi64(base, index.raw, v.raw, kScale); } else { const __mmask8 mask = (1u << MaxLanes(d)) - 1; _mm_mask_i64scatter_epi64(base, mask, index.raw, v.raw, kScale); } } template HWY_INLINE void NativeScatter128(VFromD v, D d, float* HWY_RESTRICT base, VI index) { if (d.MaxBytes() == 16) { _mm_i32scatter_ps(base, index.raw, v.raw, kScale); } else { const __mmask8 mask = (1u << MaxLanes(d)) - 1; _mm_mask_i32scatter_ps(base, mask, index.raw, v.raw, kScale); } } template HWY_INLINE void NativeScatter128(VFromD v, D d, double* HWY_RESTRICT base, VI index) { if (d.MaxBytes() == 16) { _mm_i64scatter_pd(base, index.raw, v.raw, kScale); } else { const __mmask8 mask = (1u << MaxLanes(d)) - 1; _mm_mask_i64scatter_pd(base, mask, index.raw, v.raw, kScale); } } template HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT base, VI index) { // For partial vectors, ensure upper mask lanes are zero to prevent faults. if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); _mm_mask_i32scatter_epi32(base, m.raw, index.raw, v.raw, kScale); } template HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT base, VI index) { // For partial vectors, ensure upper mask lanes are zero to prevent faults. 
if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); _mm_mask_i64scatter_epi64(base, m.raw, index.raw, v.raw, kScale); } template HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, float* HWY_RESTRICT base, VI index) { // For partial vectors, ensure upper mask lanes are zero to prevent faults. if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); _mm_mask_i32scatter_ps(base, m.raw, index.raw, v.raw, kScale); } template HWY_INLINE void NativeMaskedScatter128(VFromD v, MFromD m, D d, double* HWY_RESTRICT base, VI index) { // For partial vectors, ensure upper mask lanes are zero to prevent faults. if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); _mm_mask_i64scatter_pd(base, m.raw, index.raw, v.raw, kScale); } } // namespace detail template HWY_API void ScatterOffset(VFromD v, D d, TFromD* HWY_RESTRICT base, VFromD> offset) { return detail::NativeScatter128<1>(v, d, base, offset); } template HWY_API void ScatterIndex(VFromD v, D d, TFromD* HWY_RESTRICT base, VFromD> index) { return detail::NativeScatter128)>(v, d, base, index); } template HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT base, VFromD> index) { return detail::NativeMaskedScatter128)>(v, m, d, base, index); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Gather (Load/Store) #if HWY_TARGET <= HWY_AVX2 #ifdef HWY_NATIVE_GATHER #undef HWY_NATIVE_GATHER #else #define HWY_NATIVE_GATHER #endif namespace detail { template HWY_INLINE Vec128 NativeGather128(const T* HWY_RESTRICT base, Vec128 indices) { return Vec128{_mm_i32gather_epi32( reinterpret_cast(base), indices.raw, kScale)}; } template HWY_INLINE Vec128 NativeGather128(const T* HWY_RESTRICT base, Vec128 indices) { return Vec128{_mm_i64gather_epi64( reinterpret_cast(base), indices.raw, kScale)}; } template HWY_INLINE Vec128 NativeGather128(const float* HWY_RESTRICT base, Vec128 indices) { return Vec128{_mm_i32gather_ps(base, indices.raw, kScale)}; } template HWY_INLINE Vec128 NativeGather128(const double* HWY_RESTRICT base, Vec128 indices) { return Vec128{_mm_i64gather_pd(base, indices.raw, kScale)}; } template HWY_INLINE Vec128 NativeMaskedGatherOr128(Vec128 no, Mask128 m, const T* HWY_RESTRICT base, Vec128 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_mmask_i32gather_epi32( no.raw, m.raw, indices.raw, reinterpret_cast(base), kScale)}; #else return Vec128{ _mm_mask_i32gather_epi32(no.raw, reinterpret_cast(base), indices.raw, m.raw, kScale)}; #endif } template HWY_INLINE Vec128 NativeMaskedGatherOr128(Vec128 no, Mask128 m, const T* HWY_RESTRICT base, Vec128 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_mmask_i64gather_epi64( no.raw, m.raw, indices.raw, reinterpret_cast(base), kScale)}; #else return Vec128{_mm_mask_i64gather_epi64( no.raw, reinterpret_cast(base), indices.raw, m.raw, kScale)}; #endif } template HWY_INLINE Vec128 NativeMaskedGatherOr128( Vec128 no, Mask128 m, const float* HWY_RESTRICT base, Vec128 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec128{ _mm_mmask_i32gather_ps(no.raw, m.raw, indices.raw, base, kScale)}; #else return Vec128{ _mm_mask_i32gather_ps(no.raw, base, indices.raw, m.raw, kScale)}; #endif } template HWY_INLINE Vec128 NativeMaskedGatherOr128( Vec128 no, Mask128 m, const double* HWY_RESTRICT base, Vec128 indices) { #if HWY_TARGET <= HWY_AVX3 return Vec128{ _mm_mmask_i64gather_pd(no.raw, m.raw, indices.raw, base, kScale)}; #else return Vec128{ _mm_mask_i64gather_pd(no.raw, base, indices.raw, m.raw, kScale)}; #endif } } // namespace detail template HWY_API 
VFromD GatherOffset(D d, const TFromD* HWY_RESTRICT base, VFromD> offsets) { const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(offsets, Zero(di)))); return detail::NativeGather128<1>(base, offsets); } template > HWY_API VFromD GatherIndex(D d, const T* HWY_RESTRICT base, VFromD> indices) { const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); return detail::NativeGather128(base, indices); } template > HWY_API VFromD MaskedGatherIndexOr(VFromD no, MFromD m, D d, const T* HWY_RESTRICT base, VFromD> indices) { // For partial vectors, ensure upper mask lanes are zero to prevent faults. if (!detail::IsFull(d)) m = And(m, FirstN(d, Lanes(d))); const RebindToSigned di; (void)di; // for HWY_DASSERT HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); return detail::NativeMaskedGatherOr128(no, m, base, indices); } // Generic for all vector lengths. template HWY_API VFromD MaskedGatherIndex(MFromD m, D d, const TFromD* HWY_RESTRICT base, VFromD> indices) { return MaskedGatherIndexOr(Zero(d), m, d, base, indices); } #endif // HWY_TARGET <= HWY_AVX2 HWY_DIAGNOSTICS(pop) // ================================================== SWIZZLE (2) // ------------------------------ LowerHalf template HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { return VFromD{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return Vec128{v.raw}; } // ------------------------------ ShiftLeftBytes template HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const RebindToUnsigned du; return BitCast( d, VFromD{_mm_slli_si128(BitCast(du, v).raw, kBytes)}); } // Generic for all vector lengths. template HWY_API V ShiftLeftBytes(const V v) { return ShiftLeftBytes(DFromV(), v); } // ------------------------------ ShiftLeftLanes // Generic for all vector lengths. template HWY_API VFromD ShiftLeftLanes(D d, const VFromD v) { const Repartition d8; return BitCast(d, ShiftLeftBytes)>(BitCast(d8, v))); } // Generic for all vector lengths. template HWY_API V ShiftLeftLanes(const V v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes template HWY_API VFromD ShiftRightBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const RebindToUnsigned du; // For partial vectors, clear upper lanes so we shift in zeros. if (d.MaxBytes() != 16) { const Full128> dfull; const VFromD vfull{v.raw}; v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; } return BitCast( d, VFromD{_mm_srli_si128(BitCast(du, v).raw, kBytes)}); } // ------------------------------ ShiftRightLanes // Generic for all vector lengths. template HWY_API VFromD ShiftRightLanes(D d, const VFromD v) { const Repartition d8; constexpr size_t kBytes = kLanes * sizeof(TFromD); return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) // Full input: copy hi into lo (smaller instruction encoding than shifts). 
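// Illustrative sketch, not part of the Highway API: `SumOfHalvesExample` is a
// hypothetical helper showing a typical use of the UpperHalf overloads defined
// below together with LowerHalf, namely adding the upper half onto the lower
// half as the first step of a reduction.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED VFromD<Half<D>> SumOfHalvesExample(D d,
                                                               VFromD<D> v) {
  const Half<decltype(d)> dh;
  return LowerHalf(dh, v) + UpperHalf(dh, v);
}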
template HWY_API VFromD UpperHalf(D d, VFromD> v) { const Twice> dut; using VUT = VFromD; // for float16_t const VUT vut = BitCast(dut, v); return BitCast(d, LowerHalf(VUT{_mm_unpackhi_epi64(vut.raw, vut.raw)})); } template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { return Vec64{_mm_movehl_ps(v.raw, v.raw)}; } template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) { return Vec64{_mm_unpackhi_pd(v.raw, v.raw)}; } // Partial template HWY_API VFromD UpperHalf(D d, VFromD> v) { return LowerHalf(d, ShiftRightBytes(Twice(), v)); } // ------------------------------ ExtractLane (UpperHalf) namespace detail { template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 const int pair = _mm_extract_epi16(v.raw, kLane / 2); constexpr int kShift = kLane & 1 ? 8 : 0; return static_cast((pair >> kShift) & 0xFF); #else return static_cast(_mm_extract_epi8(v.raw, kLane) & 0xFF); #endif } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); const DFromV d; const RebindToUnsigned du; const uint16_t lane = static_cast( _mm_extract_epi16(BitCast(du, v).raw, kLane) & 0xFFFF); return BitCastScalar(lane); } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 return static_cast(_mm_cvtsi128_si32( (kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, kLane))); #else return static_cast(_mm_extract_epi32(v.raw, kLane)); #endif } template HWY_INLINE T ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_ARCH_X86_32 alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[kLane]; #elif HWY_TARGET >= HWY_SSSE3 return static_cast( _mm_cvtsi128_si64((kLane == 0) ? v.raw : _mm_shuffle_epi32(v.raw, 0xEE))); #else return static_cast(_mm_extract_epi64(v.raw, kLane)); #endif } template HWY_INLINE float ExtractLane(const Vec128 v) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 return _mm_cvtss_f32((kLane == 0) ? v.raw : _mm_shuffle_ps(v.raw, v.raw, kLane)); #else // Bug in the intrinsic, returns int but should be float. const int32_t bits = _mm_extract_ps(v.raw, kLane); return BitCastScalar(bits); #endif } // There is no extract_pd; two overloads because there is no UpperHalf for N=1. template HWY_INLINE double ExtractLane(const Vec64 v) { static_assert(kLane == 0, "Lane index out of bounds"); return GetLane(v); } template HWY_INLINE double ExtractLane(const Vec128 v) { static_assert(kLane < 2, "Lane index out of bounds"); const Half> dh; return kLane == 0 ? GetLane(v) : GetLane(UpperHalf(dh, v)); } } // namespace detail // Requires one overload per vector length because ExtractLane<3> may be a // compile error if it calls _mm_extract_epi64. 
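// Illustrative sketch, not part of the Highway API: `LastLaneExample` is a
// hypothetical helper demonstrating the runtime-index ExtractLane overloads
// below. When the index happens to be a compile-time constant under GCC/Clang,
// they dispatch to the detail::ExtractLane<kLane> fast path; otherwise they
// store the vector to a stack buffer and index it.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED TFromD<D> LastLaneExample(D d, VFromD<D> v) {
  return ExtractLane(v, Lanes(d) - 1);
}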
template HWY_API T ExtractLane(const Vec128 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return GetLane(v); } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); } } #endif alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); } } #endif alignas(16) T lanes[4]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); } } #endif alignas(16) T lanes[8]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); case 8: return detail::ExtractLane<8>(v); case 9: return detail::ExtractLane<9>(v); case 10: return detail::ExtractLane<10>(v); case 11: return detail::ExtractLane<11>(v); case 12: return detail::ExtractLane<12>(v); case 13: return detail::ExtractLane<13>(v); case 14: return detail::ExtractLane<14>(v); case 15: return detail::ExtractLane<15>(v); } } #endif alignas(16) T lanes[16]; Store(v, DFromV(), lanes); return lanes[i]; } // ------------------------------ InsertLane (UpperHalf) namespace detail { template HWY_INLINE V InsertLaneUsingBroadcastAndBlend(V v, size_t i, TFromV t) { const DFromV d; #if HWY_TARGET <= HWY_AVX3 using RawMask = decltype(MaskFromVec(VFromD()).raw); const auto mask = MFromD{static_cast(uint64_t{1} << i)}; #else const RebindToUnsigned du; using TU = TFromD; const auto mask = RebindMask(d, Iota(du, 0) == Set(du, static_cast(i))); #endif return IfThenElse(mask, Set(d, t), v); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 return InsertLaneUsingBroadcastAndBlend(v, kLane, t); #else return Vec128{_mm_insert_epi8(v.raw, t, kLane)}; #endif } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); const DFromV d; const RebindToUnsigned du; const uint16_t bits = BitCastScalar(t); return BitCast(d, VFromD{ _mm_insert_epi16(BitCast(du, v).raw, bits, kLane)}); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 return 
InsertLaneUsingBroadcastAndBlend(v, kLane, t); #else const MakeSigned ti = BitCastScalar>(t); return Vec128{_mm_insert_epi32(v.raw, ti, kLane)}; #endif } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 || HWY_ARCH_X86_32 const DFromV d; const RebindToFloat df; const auto vt = BitCast(df, Set(d, t)); if (kLane == 0) { return BitCast( d, Vec128{_mm_shuffle_pd(vt.raw, BitCast(df, v).raw, 2)}); } return BitCast( d, Vec128{_mm_shuffle_pd(BitCast(df, v).raw, vt.raw, 0)}); #else const MakeSigned ti = BitCastScalar>(t); return Vec128{_mm_insert_epi64(v.raw, ti, kLane)}; #endif } template HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { static_assert(kLane < N, "Lane index out of bounds"); #if HWY_TARGET >= HWY_SSSE3 return InsertLaneUsingBroadcastAndBlend(v, kLane, t); #else return Vec128{_mm_insert_ps(v.raw, _mm_set_ss(t), kLane << 4)}; #endif } // There is no insert_pd; two overloads because there is no UpperHalf for N=1. template HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { static_assert(kLane == 0, "Lane index out of bounds"); return Set(DFromV(), t); } template HWY_INLINE Vec128 InsertLane(const Vec128 v, double t) { static_assert(kLane < 2, "Lane index out of bounds"); const DFromV d; const Vec128 vt = Set(d, t); if (kLane == 0) { return Vec128{_mm_shuffle_pd(vt.raw, v.raw, 2)}; } return Vec128{_mm_shuffle_pd(v.raw, vt.raw, 0)}; } } // namespace detail // Requires one overload per vector length because InsertLane<3> may be a // compile error if it calls _mm_insert_epi64. template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { HWY_DASSERT(i == 0); (void)i; return Set(DFromV(), t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return 
detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); case 8: return detail::InsertLane<8>(v, t); case 9: return detail::InsertLane<9>(v, t); case 10: return detail::InsertLane<10>(v, t); case 11: return detail::InsertLane<11>(v, t); case 12: return detail::InsertLane<12>(v, t); case 13: return detail::InsertLane<13>(v, t); case 14: return detail::InsertLane<14>(v, t); case 15: return detail::InsertLane<15>(v, t); } } #endif return detail::InsertLaneUsingBroadcastAndBlend(v, i, t); } // ------------------------------ CombineShiftRightBytes #if HWY_TARGET == HWY_SSE2 template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { static_assert(0 < kBytes && kBytes < 16, "kBytes invalid"); return Or(ShiftRightBytes(d, lo), ShiftLeftBytes<16 - kBytes>(d, hi)); } template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { constexpr size_t kSize = d.MaxBytes(); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Twice dt; return VFromD{ShiftRightBytes(dt, Combine(dt, hi, lo)).raw}; } #else template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { const Repartition d8; return BitCast(d, Vec128{_mm_alignr_epi8( BitCast(d8, hi).raw, BitCast(d8, lo).raw, kBytes)}); } template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { constexpr size_t kSize = d.MaxBytes(); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Repartition d8; using V8 = Vec128; const DFromV dfull8; const Repartition, decltype(dfull8)> dfull; const V8 hi8{BitCast(d8, hi).raw}; // Move into most-significant bytes const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); return VFromD{BitCast(dfull, r).raw}; } #endif // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(const Vec128 v) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); // for float16_t static_assert(0 <= kLane && kLane < N, "Invalid lane"); if (kLane < 4) { const __m128i lo = _mm_shufflelo_epi16(vu.raw, (0x55 * kLane) & 0xFF); return BitCast(d, VU{_mm_unpacklo_epi64(lo, lo)}); } else { const __m128i hi = _mm_shufflehi_epi16(vu.raw, (0x55 * (kLane - 4)) & 0xFF); return BitCast(d, VU{_mm_unpackhi_epi64(hi, hi)}); } } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{_mm_shuffle_epi32(v.raw, 0x55 * kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{_mm_shuffle_epi32(v.raw, kLane ? 0xEE : 0x44)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{_mm_shuffle_ps(v.raw, v.raw, 0x55 * kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{_mm_shuffle_pd(v.raw, v.raw, 3 * kLane)}; } // ------------------------------ TableLookupLanes (Shuffle01) // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. 
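// Illustrative sketch, not part of the Highway API: `RotateLanesLeftExample`
// is a hypothetical helper showing the intended usage pattern: build indices
// once via SetTableIndices (or IndicesFromVec), then reuse them in
// TableLookupLanes. Output lane i receives input lane kIdx[i], so the four
// 32-bit lanes are rotated left by one position.
template <typename T, HWY_IF_T_SIZE(T, 4)>
HWY_INLINE HWY_MAYBE_UNUSED Vec128<T> RotateLanesLeftExample(Vec128<T> v) {
  const Full128<T> d;
  alignas(16) static constexpr MakeSigned<T> kIdx[4] = {1, 2, 3, 0};
  return TableLookupLanes(v, SetTableIndices(d, kIdx));
}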
template struct Indices128 { __m128i raw; }; template , typename TI, size_t kN, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 1)> HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, kN * 2)))); #endif // No change as byte indices are always used for 8-bit lane types (void)d; return Indices128{vec.raw}; } template , typename TI, size_t kN, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 2)> HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, kN * 2)))); #endif #if HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 (void)d; return Indices128{vec.raw}; #else // SSSE3, SSE4, or AVX2 const Repartition d8; using V8 = VFromD; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; // Broadcast each lane index to all 4 bytes of T alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); // Shift to bytes const Repartition d16; const V8 byte_indices = BitCast(d8, ShiftLeft<1>(BitCast(d16, lane_indices))); return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; #endif // HWY_TARGET <= HWY_AVX3 || HWY_TARGET == HWY_SSE2 } template , typename TI, size_t kN, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 4)> HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, kN * 2)))); #endif #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 (void)d; return Indices128{vec.raw}; #else const Repartition d8; using V8 = VFromD; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; // Broadcast each lane index to all 4 bytes of T alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; const V8 lane_indices = TableLookupBytes(vec, Load(d8, kBroadcastLaneBytes)); // Shift to bytes const Repartition d16; const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; #endif } template , typename TI, size_t kN, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_T_SIZE(T, 8)> HWY_API Indices128 IndicesFromVec(D d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(kN * 2))))); #else (void)d; #endif // No change - even without AVX3, we can shuffle+blend. 
return Indices128{vec.raw}; } template HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( D d, const TI* idx) { static_assert(sizeof(TFromD) == sizeof(TI), "Index size must match lane"); const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { return TableLookupBytes(v, Vec128{idx.raw}); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return {_mm_permutexvar_epi16(idx.raw, v.raw)}; #elif HWY_TARGET == HWY_SSE2 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) typedef uint16_t GccU16RawVectType __attribute__((__vector_size__(16))); return Vec128{reinterpret_cast::type>( __builtin_shuffle(reinterpret_cast(v.raw), reinterpret_cast(idx.raw)))}; #else const Full128 d_full; alignas(16) T src_lanes[8]; alignas(16) uint16_t indices[8]; alignas(16) T result_lanes[8]; Store(Vec128{v.raw}, d_full, src_lanes); _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); for (int i = 0; i < 8; i++) { result_lanes[i] = src_lanes[indices[i] & 7u]; } return Vec128{Load(d_full, result_lanes).raw}; #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) #else return TableLookupBytes(v, Vec128{idx.raw}); #endif } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { return {_mm_permutexvar_ph(idx.raw, v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { #if HWY_TARGET <= HWY_AVX2 const DFromV d; const RebindToFloat df; const Vec128 perm{_mm_permutevar_ps(BitCast(df, v).raw, idx.raw)}; return BitCast(d, perm); #elif HWY_TARGET == HWY_SSE2 #if HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); return Vec128{reinterpret_cast::type>( __builtin_shuffle(reinterpret_cast(v.raw), reinterpret_cast(idx.raw)))}; #else const Full128 d_full; alignas(16) T src_lanes[4]; alignas(16) uint32_t indices[4]; alignas(16) T result_lanes[4]; Store(Vec128{v.raw}, d_full, src_lanes); _mm_store_si128(reinterpret_cast<__m128i*>(indices), idx.raw); for (int i = 0; i < 4; i++) { result_lanes[i] = src_lanes[indices[i] & 3u]; } return Vec128{Load(d_full, result_lanes).raw}; #endif // HWY_COMPILER_GCC_ACTUAL && HWY_HAS_BUILTIN(__builtin_shuffle) #else // SSSE3 or SSE4 return TableLookupBytes(v, Vec128{idx.raw}); #endif } #if HWY_TARGET <= HWY_SSSE3 template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { #if HWY_TARGET <= HWY_AVX2 return Vec128{_mm_permutevar_ps(v.raw, idx.raw)}; #else // SSSE3 or SSE4 const DFromV df; const RebindToSigned di; return BitCast(df, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); #endif // HWY_TARGET <= HWY_AVX2 } #endif // HWY_TARGET <= HWY_SSSE3 // Single lane: no change template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 /* idx */) { return v; } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { const DFromV d; Vec128 vidx{idx.raw}; #if HWY_TARGET <= HWY_AVX2 // There is no _mm_permute[x]var_epi64. vidx += vidx; // bit1 is the decider (unusual) const RebindToFloat df; return BitCast( d, Vec128{_mm_permutevar_pd(BitCast(df, v).raw, vidx.raw)}); #else // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 // to obtain an all-zero or all-one mask. 
const RebindToSigned di; const Vec128 same = (vidx ^ Iota(di, 0)) - Set(di, 1); const Mask128 mask_same = RebindMask(d, MaskFromVec(same)); return IfThenElse(mask_same, v, Shuffle01(v)); #endif } HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { Vec128 vidx{idx.raw}; #if HWY_TARGET <= HWY_AVX2 vidx += vidx; // bit1 is the decider (unusual) return Vec128{_mm_permutevar_pd(v.raw, vidx.raw)}; #else // Only 2 lanes: can swap+blend. Choose v if vidx == iota. To avoid a 64-bit // comparison (expensive on SSSE3), just invert the upper lane and subtract 1 // to obtain an all-zero or all-one mask. const DFromV d; const RebindToSigned di; const Vec128 same = (vidx ^ Iota(di, 0)) - Set(di, 1); const Mask128 mask_same = RebindMask(d, MaskFromVec(same)); return IfThenElse(mask_same, v, Shuffle01(v)); #endif } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { return v; } // ------------------------------ Reverse (Shuffle0123, Shuffle2301) // Single lane: no change template HWY_API VFromD Reverse(D /* tag */, VFromD v) { return v; } // 32-bit x2: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return VFromD{Shuffle2301(Vec128>{v.raw}).raw}; } // 64-bit x2: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return Shuffle01(v); } // 32-bit x4: shuffle template HWY_API VFromD Reverse(D /* tag */, const VFromD v) { return Shuffle0123(v); } // 16-bit template HWY_API VFromD Reverse(D d, const VFromD v) { const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); // for float16_t constexpr size_t kN = MaxLanes(d); if (kN == 1) return v; if (kN == 2) { return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 0, 1))}); } if (kN == 4) { return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); } #if HWY_TARGET == HWY_SSE2 const VU rev4{ _mm_shufflehi_epi16(_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3))}; return BitCast(d, VU{_mm_shuffle_epi32(rev4.raw, _MM_SHUFFLE(1, 0, 3, 2))}); #else const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); return BitCast(d, TableLookupBytes(v, shuffle)); #endif } template HWY_API VFromD Reverse(D d, const VFromD v) { constexpr int kN = static_cast(MaxLanes(d)); if (kN == 1) return v; #if HWY_TARGET <= HWY_SSSE3 // NOTE: Lanes with negative shuffle control mask values are set to zero. alignas(16) static constexpr int8_t kReverse[16] = { kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16}; const RebindToSigned di; const VFromD idx = Load(di, kReverse); return VFromD{_mm_shuffle_epi8(BitCast(di, v).raw, idx.raw)}; #else const RepartitionToWide d16; return BitCast(d, Reverse(d16, RotateRight<8>(BitCast(d16, v)))); #endif } // ------------------------------ Reverse2 // Single lane: no change template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return v; } // Generic for all vector lengths (128-bit sufficient if SSE2). 
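// Illustrative sketch, not part of the Highway API: `PairwiseSumsExample` is a
// hypothetical helper using Reverse2 (the 16-bit overload follows directly
// below; 32/64-bit overloads come after it). Adding a vector to its
// pair-swapped form yields the sum of each adjacent lane pair, broadcast to
// both lanes of the pair: {a, b, c, d} -> {a+b, a+b, c+d, c+d}.
template <class D>
HWY_INLINE HWY_MAYBE_UNUSED VFromD<D> PairwiseSumsExample(D d, VFromD<D> v) {
  return v + Reverse2(d, v);
}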
template HWY_API VFromD Reverse2(D d, VFromD v) { #if HWY_TARGET <= HWY_AVX3 const Repartition du32; return BitCast(d, RotateRight<16>(BitCast(du32, v))); #elif HWY_TARGET == HWY_SSE2 const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); // for float16_t constexpr size_t kN = MaxLanes(d); __m128i shuf_result = _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(2, 3, 0, 1)); if (kN > 4) { shuf_result = _mm_shufflehi_epi16(shuf_result, _MM_SHUFFLE(2, 3, 0, 1)); } return BitCast(d, VU{shuf_result}); #else const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0302, 0x0100, 0x0706, 0x0504, 0x0B0A, 0x0908, 0x0F0E, 0x0D0C); return BitCast(d, TableLookupBytes(v, shuffle)); #endif } // Generic for all vector lengths. template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return Shuffle2301(v); } // Generic for all vector lengths. template HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API VFromD Reverse4(D d, VFromD v) { const RebindToUnsigned du; using VU = VFromD; const VU vu = BitCast(du, v); // for float16_t // 4x 16-bit: a single shufflelo suffices. constexpr size_t kN = MaxLanes(d); if (kN <= 4) { return BitCast(d, VU{_mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3))}); } #if HWY_TARGET == HWY_SSE2 return BitCast(d, VU{_mm_shufflehi_epi16( _mm_shufflelo_epi16(vu.raw, _MM_SHUFFLE(0, 1, 2, 3)), _MM_SHUFFLE(0, 1, 2, 3))}); #else const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0706, 0x0504, 0x0302, 0x0100, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908); return BitCast(d, TableLookupBytes(v, shuffle)); #endif } // Generic for all vector lengths. template HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { return Shuffle0123(v); } template HWY_API VFromD Reverse4(D /* tag */, VFromD /* v */) { HWY_ASSERT(0); // don't have 4 u64 lanes } // ------------------------------ Reverse8 template HWY_API VFromD Reverse8(D d, const VFromD v) { #if HWY_TARGET == HWY_SSE2 const RepartitionToWide dw; return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); #else const RebindToSigned di; const VFromD shuffle = Dup128VecFromValues( di, 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); return BitCast(d, TableLookupBytes(v, shuffle)); #endif } template HWY_API VFromD Reverse8(D /* tag */, VFromD /* v */) { HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit } // ------------------------------ ReverseBits in x86_512 // ------------------------------ InterleaveUpper (UpperHalf) // Full template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm_unpackhi_epi8(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; // for float16_t return BitCast( d, VU{_mm_unpackhi_epi16(BitCast(du, a).raw, BitCast(du, b).raw)}); } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm_unpackhi_epi32(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm_unpackhi_epi64(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm_unpackhi_ps(a.raw, b.raw)}; } template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) { return VFromD{_mm_unpackhi_pd(a.raw, b.raw)}; } // Partial template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const Half d2; return InterleaveLower(d, 
VFromD{UpperHalf(d2, a).raw}, VFromD{UpperHalf(d2, b).raw}); } // -------------------------- I8/U8 Broadcast (InterleaveLower, InterleaveUpper) template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); const DFromV d; #if HWY_TARGET == HWY_SSE2 const Full128 d_full; const Vec128 v_full{v.raw}; const auto v_interleaved = (kLane < 8) ? InterleaveLower(d_full, v_full, v_full) : InterleaveUpper(d_full, v_full, v_full); return ResizeBitCast( d, Broadcast(BitCast(Full128(), v_interleaved))); #else return TableLookupBytes(v, Set(d, static_cast(kLane))); #endif } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. // Generic for all vector lengths. template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ------------------------------ Per4LaneBlockShuffle namespace detail { #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { return ResizeBitCast( d, Vec128{_mm_set_epi32( static_cast(x3), static_cast(x2), static_cast(x1), static_cast(x0))}); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, hwy::SizeTag<8> /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm_shufflelo_epi16( BitCast(du, v).raw, static_cast(kIdx3210 & 0xFF))}); } #if HWY_TARGET == HWY_SSE2 template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, hwy::SizeTag<16> /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; // for float16_t constexpr int kShuffle = static_cast(kIdx3210 & 0xFF); return BitCast( d, VFromD{_mm_shufflehi_epi16( _mm_shufflelo_epi16(BitCast(du, v).raw, kShuffle), kShuffle)}); } template * = nullptr> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, hwy::SizeTag<1> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; const Rebind du16; const RebindToSigned di16; const auto vu16 = PromoteTo(du16, BitCast(du, v)); const auto shuf16_result = Per4LaneBlockShuffle( idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag(), vu16); return BitCast(d, DemoteTo(du, BitCast(di16, shuf16_result))); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, hwy::SizeTag<1> /*lane_size_tag*/, hwy::SizeTag<16> /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; const Repartition du16; const RebindToSigned di16; const auto zero = Zero(d); const auto v_lo16 = BitCast(du16, InterleaveLower(d, v, zero)); const auto v_hi16 = BitCast(du16, InterleaveUpper(d, v, zero)); const auto lo_shuf_result = Per4LaneBlockShuffle( idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_lo16); const auto hi_shuf_result = Per4LaneBlockShuffle( idx_3210_tag, hwy::SizeTag<2>(), hwy::SizeTag<16>(), v_hi16); return BitCast(d, OrderedDemote2To(du, 
BitCast(di16, lo_shuf_result), BitCast(di16, hi_shuf_result))); } #endif template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, hwy::SizeTag<16> /*vect_size_tag*/, V v) { return V{_mm_shuffle_epi32(v.raw, static_cast(kIdx3210 & 0xFF))}; } template )> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, hwy::SizeTag<16> /*vect_size_tag*/, V v) { return V{_mm_shuffle_ps(v.raw, v.raw, static_cast(kIdx3210 & 0xFF))}; } } // namespace detail // ------------------------------ SlideUpLanes namespace detail { template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Full64 du64; const auto vu64 = ResizeBitCast(du64, v); return ResizeBitCast( d, ShiftLeftSame(vu64, static_cast(amt * sizeof(TFromV) * 8))); } #if HWY_TARGET <= HWY_SSSE3 template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Repartition du8; const auto idx = Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); } #else template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Repartition di32; const Repartition du64; constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV); const auto vu64 = BitCast(du64, v); const auto v_hi = IfVecThenElse( BitCast(du64, Set(di32, -static_cast(amt >= kNumOfLanesPerU64))), BitCast(du64, ShiftLeftBytes<8>(du64, vu64)), vu64); const auto v_lo = ShiftLeftBytes<8>(du64, v_hi); const int shl_amt = static_cast((amt * sizeof(TFromV) * 8) & 63); return BitCast( d, Or(ShiftLeftSame(v_hi, shl_amt), ShiftRightSame(v_lo, 64 - shl_amt))); } #endif } // namespace detail template HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); case 8: return ShiftLeftLanes<8>(d, v); case 9: return ShiftLeftLanes<9>(d, v); 
case 10: return ShiftLeftLanes<10>(d, v); case 11: return ShiftLeftLanes<11>(d, v); case 12: return ShiftLeftLanes<12>(d, v); case 13: return ShiftLeftLanes<13>(d, v); case 14: return ShiftLeftLanes<14>(d, v); case 15: return ShiftLeftLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } // ------------------------------ SlideDownLanes namespace detail { template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition, decltype(d)> dv; return BitCast(d, ShiftRightSame(BitCast(dv, v), static_cast(amt * sizeof(TFromV) * 8))); } #if HWY_TARGET <= HWY_SSSE3 template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition di8; auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); } #else template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition di32; const Repartition du64; constexpr size_t kNumOfLanesPerU64 = 8 / sizeof(TFromV); const auto vu64 = BitCast(du64, v); const auto v_lo = IfVecThenElse( BitCast(du64, Set(di32, -static_cast(amt >= kNumOfLanesPerU64))), BitCast(du64, ShiftRightBytes<8>(du64, vu64)), vu64); const auto v_hi = ShiftRightBytes<8>(du64, v_lo); const int shr_amt = static_cast((amt * sizeof(TFromV) * 8) & 63); return BitCast( d, Or(ShiftRightSame(v_lo, shr_amt), ShiftLeftSame(v_hi, 64 - shr_amt))); } #endif } // namespace detail template HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); case 8: return ShiftRightLanes<8>(d, v); case 9: return ShiftRightLanes<9>(d, v); case 10: return ShiftRightLanes<10>(d, v); case 11: return ShiftRightLanes<11>(d, v); case 12: return ShiftRightLanes<12>(d, v); case 13: return 
ShiftRightLanes<13>(d, v); case 14: return ShiftRightLanes<14>(d, v); case 15: return ShiftRightLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } // ================================================== MEMORY (4) // ------------------------------ StoreN (ExtractLane) #if HWY_TARGET <= HWY_AVX2 #ifdef HWY_NATIVE_STORE_N #undef HWY_NATIVE_STORE_N #else #define HWY_NATIVE_STORE_N #endif template HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, size_t max_lanes_to_store) { const size_t num_lanes_to_store = HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); #if HWY_COMPILER_MSVC // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore HWY_FENCE; #endif BlendedStore(v, FirstN(d, num_lanes_to_store), d, p); #if HWY_COMPILER_MSVC // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore HWY_FENCE; #endif detail::MaybeUnpoison(p, num_lanes_to_store); } #if HWY_TARGET > HWY_AVX3 template HWY_API void StoreN(VFromD v, D d, TFromD* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 0) { StoreU(v, d, p); } } template HWY_API void StoreN(VFromD v, D /*d*/, TFromD* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store >= 1) { p[static_cast(max_lanes_to_store > 1)] = detail::ExtractLane<1>(v); p[0] = GetLane(v); } } namespace detail { template HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD v_trailing, D /*d*/, TFromD* HWY_RESTRICT p, size_t num_lanes_to_store) { // AVX2UIF8Or16StoreTrailingN should only be called for an I8/U8 vector if // (num_lanes_to_store & 3) != 0 is true const auto v_full128 = ResizeBitCast(Full128>(), v_trailing); if ((num_lanes_to_store & 2) != 0) { const uint16_t u16_bits = GetLane(BitCast(Full128(), v_full128)); p[num_lanes_to_store - 1] = detail::ExtractLane<2>(v_full128); CopyBytes(&u16_bits, p + (num_lanes_to_store & ~size_t{3})); } else { p[num_lanes_to_store - 1] = GetLane(v_full128); } } template HWY_API void AVX2UIF8Or16StoreTrailingN(VFromD v_trailing, D /*d*/, TFromD* p, size_t num_lanes_to_store) { // AVX2UIF8Or16StoreTrailingN should only be called for an I16/U16/F16/BF16 // vector if (num_lanes_to_store & 1) == 1 is true p[num_lanes_to_store - 1] = GetLane(v_trailing); } } // namespace detail template HWY_API void StoreN(VFromD v, D d, TFromD* p, size_t max_lanes_to_store) { const size_t num_lanes_to_store = HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)); const FixedTag, HWY_MAX(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD))> d_full; const RebindToUnsigned du_full; const Repartition di32_full; const auto i32_store_mask = BitCast( di32_full, VecFromMask(du_full, FirstN(du_full, num_lanes_to_store))); const auto vi32 = ResizeBitCast(di32_full, v); #if HWY_COMPILER_MSVC // Work around MSVC compiler bug by using a HWY_FENCE before the BlendedStore HWY_FENCE; #endif BlendedStore(vi32, MaskFromVec(i32_store_mask), di32_full, reinterpret_cast(p)); constexpr size_t kNumOfLanesPerI32 = 4 / sizeof(TFromD); constexpr size_t kTrailingLenMask = kNumOfLanesPerI32 - 1; const size_t trailing_n = (num_lanes_to_store & kTrailingLenMask); if (trailing_n != 0) { const VFromD v_trailing = ResizeBitCast( d, SlideDownLanes(di32_full, vi32, num_lanes_to_store / kNumOfLanesPerI32)); detail::AVX2UIF8Or16StoreTrailingN(v_trailing, d, p, num_lanes_to_store); } #if HWY_COMPILER_MSVC // Work around MSVC compiler bug by using a HWY_FENCE after the BlendedStore HWY_FENCE; #endif detail::MaybeUnpoison(p, num_lanes_to_store); } #endif // HWY_TARGET > HWY_AVX3 #endif // HWY_TARGET <= HWY_AVX2 // 
================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template >> HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { const Half dh; const RebindToUnsigned duh; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; const VU lo{BitCast(duh, lo_half).raw}; const VU hi{BitCast(duh, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const RebindToUnsigned du; const Half duh; return BitCast(d, VFromD{_mm_move_epi64(BitCast(duh, lo).raw)}); } template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const Half dh; return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); } #if HWY_HAVE_FLOAT16 template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const RebindToUnsigned du; const Half duh; return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); } #endif // Generic for all vector lengths. template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const RebindToUnsigned du; const Half duh; return BitCast(d, ZeroExtendVector(du, BitCast(duh, lo))); } // ------------------------------ Concat full (InterleaveLower) // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Repartition d64; return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiH,loH (= upper halves) template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Repartition d64; return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiL,loH (= inner halves) template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { return CombineShiftRightBytes<8>(d, hi, lo); } // hiH,hiL loH,loL |-> hiH,loL (= outer halves) template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const Repartition dd; #if HWY_TARGET >= HWY_SSSE3 return BitCast( d, Vec128{_mm_shuffle_pd(BitCast(dd, lo).raw, BitCast(dd, hi).raw, _MM_SHUFFLE2(1, 0))}); #else // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _pd can do 3/cycle. return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, BitCast(dd, lo).raw, 1)}); #endif } template HWY_API Vec128 ConcatUpperLower(D d, Vec128 hi, Vec128 lo) { #if HWY_TARGET >= HWY_SSSE3 (void)d; return Vec128{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(3, 2, 1, 0))}; #else // _mm_shuffle_ps has throughput 1/cycle on SKX, whereas blend can do 3/cycle. const RepartitionToWide dd; return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, hi).raw, BitCast(dd, lo).raw, 1)}); #endif } template HWY_API Vec128 ConcatUpperLower(D /* tag */, Vec128 hi, Vec128 lo) { #if HWY_TARGET >= HWY_SSSE3 return Vec128{_mm_shuffle_pd(lo.raw, hi.raw, _MM_SHUFFLE2(1, 0))}; #else // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. 
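// Note: _mm_blend_pd copies lane i from its second operand when bit i of the
// immediate is set, so imm=1 below yields {lo[0], hi[1]}, i.e. the outer
// halves required here.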
return Vec128{_mm_blend_pd(hi.raw, lo.raw, 1)}; #endif } // ------------------------------ Concat partial (Combine, LowerHalf) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatLowerUpper(D d, const VFromD hi, const VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ ConcatOdd // 8-bit full template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { const Repartition dw; // Right-shift 8 bits per u16 so we can pack. const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); return VFromD{_mm_packus_epi16(uL.raw, uH.raw)}; } // 8-bit x8 template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 const Repartition dw; // Right-shift 8 bits per u16 so we can pack. const Vec64 uH = ShiftRight<8>(BitCast(dw, hi)); const Vec64 uL = ShiftRight<8>(BitCast(dw, lo)); return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), _MM_SHUFFLE(2, 0, 2, 0))}; #else const Repartition du32; // Don't care about upper half, no need to zero. alignas(16) const uint8_t kCompactOddU8[8] = {1, 3, 5, 7}; const VFromD shuf = BitCast(d, Load(Full64(), kCompactOddU8)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); #endif } // 8-bit x4 template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 const Repartition dw; const Twice dw_2; // Right-shift 8 bits per u16 so we can pack. const Vec32 uH = ShiftRight<8>(BitCast(dw, hi)); const Vec32 uL = ShiftRight<8>(BitCast(dw, lo)); const Vec64 uHL = Combine(dw_2, uH, uL); return VFromD{_mm_packus_epi16(uHL.raw, uHL.raw)}; #else const Repartition du16; // Don't care about upper half, no need to zero. alignas(16) const uint8_t kCompactOddU8[4] = {1, 3}; const VFromD shuf = BitCast(d, Load(Full32(), kCompactOddU8)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); #endif } template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns // 0xFFFF8000, which correctly saturates to 0x8000. const RebindToUnsigned du; const Repartition dw; const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); return BitCast(d, VFromD{_mm_packs_epi32(uL.raw, uH.raw)}); } // 16-bit x4 template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 // Right-shift 16 bits per i32 - a *signed* shift of 0x8000xxxx returns // 0xFFFF8000, which correctly saturates to 0x8000. const Repartition dw; const Vec64 uH = ShiftRight<16>(BitCast(dw, hi)); const Vec64 uL = ShiftRight<16>(BitCast(dw, lo)); return VFromD{_mm_shuffle_epi32(_mm_packs_epi32(uL.raw, uH.raw), _MM_SHUFFLE(2, 0, 2, 0))}; #else const Repartition du32; // Don't care about upper half, no need to zero. 
alignas(16) const uint8_t kCompactOddU16[8] = {2, 3, 6, 7}; const VFromD shuf = BitCast(d, Load(Full64(), kCompactOddU16)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); #endif } // 32-bit full template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { const RebindToFloat df; return BitCast( d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(3, 1, 3, 1))}); } // Any type x2 template HWY_API VFromD ConcatOdd(D d, VFromD hi, VFromD lo) { return InterleaveUpper(d, lo, hi); } // ------------------------------ ConcatEven (InterleaveLower) // 8-bit full template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { const Repartition dw; // Isolate lower 8 bits per u16 so we can pack. const Vec128 mask = Set(dw, 0x00FF); const Vec128 uH = And(BitCast(dw, hi), mask); const Vec128 uL = And(BitCast(dw, lo), mask); return VFromD{_mm_packus_epi16(uL.raw, uH.raw)}; } // 8-bit x8 template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 const Repartition dw; // Isolate lower 8 bits per u16 so we can pack. const Vec64 mask = Set(dw, 0x00FF); const Vec64 uH = And(BitCast(dw, hi), mask); const Vec64 uL = And(BitCast(dw, lo), mask); return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(uL.raw, uH.raw), _MM_SHUFFLE(2, 0, 2, 0))}; #else const Repartition du32; // Don't care about upper half, no need to zero. alignas(16) const uint8_t kCompactEvenU8[8] = {0, 2, 4, 6}; const VFromD shuf = BitCast(d, Load(Full64(), kCompactEvenU8)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); #endif } // 8-bit x4 template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 const Repartition dw; const Twice dw_2; // Isolate lower 8 bits per u16 so we can pack. const Vec32 mask = Set(dw, 0x00FF); const Vec32 uH = And(BitCast(dw, hi), mask); const Vec32 uL = And(BitCast(dw, lo), mask); const Vec64 uHL = Combine(dw_2, uH, uL); return VFromD{_mm_packus_epi16(uHL.raw, uHL.raw)}; #else const Repartition du16; // Don't care about upper half, no need to zero. alignas(16) const uint8_t kCompactEvenU8[4] = {0, 2}; const VFromD shuf = BitCast(d, Load(Full32(), kCompactEvenU8)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du16, BitCast(du16, L), BitCast(du16, H))); #endif } // 16-bit full template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { #if HWY_TARGET <= HWY_SSE4 // Isolate lower 16 bits per u32 so we can pack. const RebindToUnsigned du; // for float16_t const Repartition dw; const Vec128 mask = Set(dw, 0x0000FFFF); const Vec128 uH = And(BitCast(dw, hi), mask); const Vec128 uL = And(BitCast(dw, lo), mask); return BitCast(d, VFromD{_mm_packus_epi32(uL.raw, uH.raw)}); #elif HWY_TARGET == HWY_SSE2 const Repartition dw; return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); #else const RebindToUnsigned du; // packs_epi32 saturates 0x8000 to 0x7FFF. Instead ConcatEven within the two // inputs, then concatenate them. 
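// Each u16 below is a pair of pshufb byte indices: 0x0100 gathers bytes 1:0
// (lane 0), 0x0504 gathers bytes 5:4 (lane 2), etc. Applying it to lo and hi
// moves their even lanes into the lower half; ConcatLowerLower then joins the
// two lower halves.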
alignas(16) const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; const VFromD shuf = BitCast(d, Load(du, kCompactEvenU16)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return ConcatLowerLower(d, H, L); #endif } // 16-bit x4 template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { #if HWY_TARGET == HWY_SSE2 const Repartition dw; return ConcatOdd(d, BitCast(d, ShiftLeft<16>(BitCast(dw, hi))), BitCast(d, ShiftLeft<16>(BitCast(dw, lo)))); #else const Repartition du32; // Don't care about upper half, no need to zero. alignas(16) const uint8_t kCompactEvenU16[8] = {0, 1, 4, 5}; const VFromD shuf = BitCast(d, Load(Full64(), kCompactEvenU16)); const VFromD L = TableLookupBytes(lo, shuf); const VFromD H = TableLookupBytes(hi, shuf); return BitCast(d, InterleaveLower(du32, BitCast(du32, L), BitCast(du32, H))); #endif } // 32-bit full template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { const RebindToFloat df; return BitCast( d, Vec128{_mm_shuffle_ps(BitCast(df, lo).raw, BitCast(df, hi).raw, _MM_SHUFFLE(2, 0, 2, 0))}); } template HWY_API VFromD ConcatEven(D /* d */, VFromD hi, VFromD lo) { return VFromD{_mm_shuffle_ps(lo.raw, hi.raw, _MM_SHUFFLE(2, 0, 2, 0))}; } // Any T x2 template HWY_API VFromD ConcatEven(D d, VFromD hi, VFromD lo) { return InterleaveLower(d, lo, hi); } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(const Vec128 v) { return v; } template HWY_API Vec128 DupEven(const Vec128 v) { return InterleaveLower(DFromV(), v, v); } template HWY_API V DupEven(V v) { const DFromV d; #if HWY_TARGET <= HWY_SSSE3 const RebindToUnsigned du; const VFromD shuffle = Dup128VecFromValues( du, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); return TableLookupBytes(v, BitCast(d, shuffle)); #else const Repartition du16; return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0xFF00})), BitCast(d, ShiftLeft<8>(BitCast(du16, v))), v); #endif } template HWY_API Vec64 DupEven(const Vec64 v) { const DFromV d; const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm_shufflelo_epi16( BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0))}); } // Generic for all vector lengths. 
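// Reminder of the semantics: DupEven copies each even-indexed lane into the
// odd lane above it, e.g. {a, b, c, d} becomes {a, a, c, c}.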
template HWY_API V DupEven(const V v) { const DFromV d; const RebindToUnsigned du; // for float16_t #if HWY_TARGET <= HWY_SSSE3 const VFromD shuffle = Dup128VecFromValues( du, 0x0100, 0x0100, 0x0504, 0x0504, 0x0908, 0x0908, 0x0d0c, 0x0d0c); return TableLookupBytes(v, BitCast(d, shuffle)); #else return BitCast( d, VFromD{_mm_shufflehi_epi16( _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(2, 2, 0, 0)), _MM_SHUFFLE(2, 2, 0, 0))}); #endif } template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; } HWY_API Vec128 DupEven(Vec128 v) { return Vec128{_mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { return v; } template HWY_API V DupOdd(V v) { const DFromV d; #if HWY_TARGET <= HWY_SSSE3 const RebindToUnsigned du; const VFromD shuffle = Dup128VecFromValues( du, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); return TableLookupBytes(v, BitCast(d, shuffle)); #else const Repartition du16; return IfVecThenElse(BitCast(d, Set(du16, uint16_t{0x00FF})), BitCast(d, ShiftRight<8>(BitCast(du16, v))), v); #endif } template HWY_API Vec128 DupOdd(Vec128 v) { const DFromV d; const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm_shufflelo_epi16( BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1))}); } // Generic for all vector lengths. template HWY_API V DupOdd(V v) { const DFromV d; const RebindToUnsigned du; // for float16_t #if HWY_TARGET <= HWY_SSSE3 const VFromD shuffle = Dup128VecFromValues( du, 0x0302, 0x0302, 0x0706, 0x0706, 0x0b0a, 0x0b0a, 0x0f0e, 0x0f0e); return TableLookupBytes(v, BitCast(d, shuffle)); #else return BitCast( d, VFromD{_mm_shufflehi_epi16( _mm_shufflelo_epi16(BitCast(du, v).raw, _MM_SHUFFLE(3, 3, 1, 1)), _MM_SHUFFLE(3, 3, 1, 1))}); #endif } template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{_mm_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; } template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{ _mm_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; } template HWY_API Vec128 DupOdd(const Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ TwoTablesLookupLanes (DupEven) template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Twice dt; // TableLookupLanes currently requires table and index vectors to be the same // size, though a half-length index vector would be sufficient here. #if HWY_IS_MSAN const Vec128 idx_vec{idx.raw}; const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; #else // We only keep LowerHalf of the result, which is valid in idx. 
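// Combine(dt, b, a) places table a in the lower half and b in the upper half
// of a double-width vector, so indices in [0, 2N) can address both tables;
// only the lower half of the double-width lookup result is kept.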
const Indices128 idx2{idx.raw}; #endif return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3_DL return Vec128{_mm_permutex2var_epi8(a.raw, idx.raw, b.raw)}; #else // AVX3 or below const DFromV d; const Vec128 idx_vec{idx.raw}; #if HWY_TARGET <= HWY_SSE4 const Repartition du16; const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<3>(BitCast(du16, idx_vec)))); #else const RebindToSigned di; const auto sel_hi_mask = RebindMask(d, BitCast(di, idx_vec) > Set(di, int8_t{15})); #endif const auto lo_lookup_result = TableLookupBytes(a, idx_vec); #if HWY_TARGET <= HWY_AVX3 const Vec128 lookup_result{_mm_mask_shuffle_epi8( lo_lookup_result.raw, sel_hi_mask.raw, b.raw, idx_vec.raw)}; return lookup_result; #else const auto hi_lookup_result = TableLookupBytes(b, idx_vec); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #endif // HWY_TARGET <= HWY_AVX3 #endif // HWY_TARGET <= HWY_AVX3_DL } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_permutex2var_epi16(a.raw, idx.raw, b.raw)}; #elif HWY_TARGET == HWY_SSE2 const DFromV d; const RebindToSigned di; const Vec128 idx_vec{idx.raw}; const auto sel_hi_mask = RebindMask(d, BitCast(di, idx_vec) > Set(di, int16_t{7})); const auto lo_lookup_result = TableLookupLanes(a, idx); const auto hi_lookup_result = TableLookupLanes(b, idx); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #else const DFromV d; const Repartition du8; return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), Indices128{idx.raw})); #endif } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_permutex2var_epi32(a.raw, idx.raw, b.raw)}; #else // AVX2 or below const DFromV d; #if HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 const Vec128 idx_vec{idx.raw}; #if HWY_TARGET <= HWY_AVX2 const RebindToFloat d_sel; const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<29>(idx_vec))); #else const RebindToSigned d_sel; const auto sel_hi_mask = BitCast(d_sel, idx_vec) > Set(d_sel, int32_t{3}); #endif const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx)); const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx)); return BitCast(d, IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); #else // SSSE3 or SSE4 const Repartition du8; return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), Indices128{idx.raw})); #endif // HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 #endif // HWY_TARGET <= HWY_AVX3 } #if HWY_HAVE_FLOAT16 HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { return Vec128{_mm_permutex2var_ph(a.raw, idx.raw, b.raw)}; } #endif // HWY_HAVE_FLOAT16 HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_permutex2var_ps(a.raw, idx.raw, b.raw)}; #elif HWY_TARGET <= HWY_AVX2 || HWY_TARGET == HWY_SSE2 const DFromV d; #if HWY_TARGET <= HWY_AVX2 const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<29>(Vec128{idx.raw}))); #else const RebindToSigned di; const auto sel_hi_mask = RebindMask(d, Vec128{idx.raw} > Set(di, int32_t{3})); #endif const auto lo_lookup_result = TableLookupLanes(a, idx); const auto hi_lookup_result = TableLookupLanes(b, idx); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #else // 
SSSE3 or SSE4 const DFromV d; const Repartition du8; return BitCast(d, TwoTablesLookupLanes(BitCast(du8, a), BitCast(du8, b), Indices128{idx.raw})); #endif } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_permutex2var_epi64(a.raw, idx.raw, b.raw)}; #else const DFromV d; const Vec128 idx_vec{idx.raw}; const Indices128 idx_mod{And(idx_vec, Set(d, T{1})).raw}; #if HWY_TARGET <= HWY_SSE4 const RebindToFloat d_sel; const auto sel_hi_mask = MaskFromVec(BitCast(d_sel, ShiftLeft<62>(idx_vec))); #else // SSE2 or SSSE3 const Repartition di32; const RebindToSigned d_sel; const auto sel_hi_mask = MaskFromVec( BitCast(d_sel, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > Set(di32, int32_t{1})))); #endif // HWY_TARGET <= HWY_SSE4 const auto lo_lookup_result = BitCast(d_sel, TableLookupLanes(a, idx_mod)); const auto hi_lookup_result = BitCast(d_sel, TableLookupLanes(b, idx_mod)); return BitCast(d, IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result)); #endif // HWY_TARGET <= HWY_AVX3 } HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_permutex2var_pd(a.raw, idx.raw, b.raw)}; #else const DFromV d; const RebindToSigned di; const Vec128 idx_vec{idx.raw}; const Indices128 idx_mod{And(idx_vec, Set(di, int64_t{1})).raw}; #if HWY_TARGET <= HWY_SSE4 const auto sel_hi_mask = MaskFromVec(BitCast(d, ShiftLeft<62>(idx_vec))); #else // SSE2 or SSSE3 const Repartition di32; const auto sel_hi_mask = MaskFromVec(BitCast(d, VecFromMask(di32, DupEven(BitCast(di32, idx_vec)) > Set(di32, int32_t{1})))); #endif // HWY_TARGET <= HWY_SSE4 const auto lo_lookup_result = TableLookupLanes(a, idx_mod); const auto hi_lookup_result = TableLookupLanes(b, idx_mod); return IfThenElse(sel_hi_mask, hi_lookup_result, lo_lookup_result); #endif // HWY_TARGET <= HWY_AVX3 } // ------------------------------ OddEven (IfThenElse) template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d8; alignas(16) static constexpr uint8_t mask[16] = { 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { const DFromV d; #if HWY_TARGET >= HWY_SSSE3 const Repartition d8; alignas(16) static constexpr uint8_t mask[16] = { 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); #else const RebindToUnsigned du; // for float16_t return BitCast(d, VFromD{_mm_blend_epi16( BitCast(du, a).raw, BitCast(du, b).raw, 0x55)}); #endif } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 const __m128i odd = _mm_shuffle_epi32(a.raw, _MM_SHUFFLE(3, 1, 3, 1)); const __m128i even = _mm_shuffle_epi32(b.raw, _MM_SHUFFLE(2, 0, 2, 0)); return Vec128{_mm_unpacklo_epi32(even, odd)}; #else // _mm_blend_epi16 has throughput 1/cycle on SKX, whereas _ps can do 3/cycle. const DFromV d; const RebindToFloat df; return BitCast(d, Vec128{_mm_blend_ps(BitCast(df, a).raw, BitCast(df, b).raw, 5)}); #endif } template HWY_INLINE Vec128 OddEven(const Vec128 a, const Vec128 b) { // Same as ConcatUpperLower for full vectors; do not call that because this // is more efficient for 64x1 vectors. 
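// OddEven(a, b) takes odd-indexed lanes from a and even-indexed lanes from b;
// for two 64-bit lanes the result is {b[0], a[1]}.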
const DFromV d; const RebindToFloat dd; #if HWY_TARGET >= HWY_SSSE3 return BitCast( d, Vec128{_mm_shuffle_pd( BitCast(dd, b).raw, BitCast(dd, a).raw, _MM_SHUFFLE2(1, 0))}); #else // _mm_shuffle_pd has throughput 1/cycle on SKX, whereas blend can do 3/cycle. return BitCast(d, Vec128{_mm_blend_pd(BitCast(dd, a).raw, BitCast(dd, b).raw, 1)}); #endif } template HWY_API Vec128 OddEven(Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 // SHUFPS must fill the lower half of the output from one input, so we // need another shuffle. Unpack avoids another immediate byte. const __m128 odd = _mm_shuffle_ps(a.raw, a.raw, _MM_SHUFFLE(3, 1, 3, 1)); const __m128 even = _mm_shuffle_ps(b.raw, b.raw, _MM_SHUFFLE(2, 0, 2, 0)); return Vec128{_mm_unpacklo_ps(even, odd)}; #else return Vec128{_mm_blend_ps(a.raw, b.raw, 5)}; #endif } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ Shl (ZipLower, Mul) // Use AVX2/3 variable shifts where available, otherwise multiply by powers of // two from loading float exponents, which is considerably faster (according // to LLVM-MCA) than scalar or testing bits: https://gcc.godbolt.org/z/9G7Y9v. namespace detail { #if HWY_TARGET == HWY_AVX2 // Unused for AVX3 - we use sllv directly template HWY_API V AVX2ShlU16Vec128(V v, V bits) { const DFromV d; const Rebind du32; return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); } #elif HWY_TARGET > HWY_AVX2 // Returns 2^v for use as per-lane multipliers to emulate 16-bit shifts. template HWY_INLINE Vec128> Pow2(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; const Rebind df; const auto zero = Zero(d); // Move into exponent (this u16 will become the upper half of an f32) const auto exp = ShiftLeft<23 - 16>(v); const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f // Insert 0 into lower halves for reinterpreting as binary32. const auto f0 = ZipLower(dw, zero, upper); const auto f1 = ZipUpper(dw, zero, upper); // See cvtps comment below. const VFromD bits0{_mm_cvtps_epi32(BitCast(df, f0).raw)}; const VFromD bits1{_mm_cvtps_epi32(BitCast(df, f1).raw)}; #if HWY_TARGET <= HWY_SSE4 return VFromD{_mm_packus_epi32(bits0.raw, bits1.raw)}; #else return ConcatEven(du, BitCast(du, bits1), BitCast(du, bits0)); #endif } template HWY_INLINE Vec128, N> Pow2(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const Twice dt_u; const RepartitionToWide dt_w; const RebindToFloat dt_f; // Move into exponent (this u16 will become the upper half of an f32) const auto exp = ShiftLeft<23 - 16>(v); const auto upper = exp + Set(d, 0x3F80); // upper half of 1.0f // Insert 0 into lower halves for reinterpreting as binary32. const auto f0 = ZipLower(dt_w, Zero(dt_u), ResizeBitCast(dt_u, upper)); // See cvtps comment below. const VFromD bits0{_mm_cvtps_epi32(BitCast(dt_f, f0).raw)}; #if HWY_TARGET <= HWY_SSE4 return VFromD{_mm_packus_epi32(bits0.raw, bits0.raw)}; #elif HWY_TARGET == HWY_SSSE3 alignas(16) const uint16_t kCompactEvenU16[8] = {0x0100, 0x0504, 0x0908, 0x0D0C}; return TableLookupBytes(bits0, Load(du, kCompactEvenU16)); #else const RebindToSigned dt_i32; const auto bits0_i32 = ShiftRight<16>(BitCast(dt_i32, ShiftLeft<16>(bits0))); return VFromD{_mm_packs_epi32(bits0_i32.raw, bits0_i32.raw)}; #endif } // Same, for 32-bit shifts. 
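// How Pow2 works: (v << 23) + 0x3F800000 is the IEEE-754 bit pattern of the
// float 2^v (biased exponent 127 + v, zero mantissa), and cvtps_epi32 turns
// that back into the integer 2^v for use as a per-lane multiplier. E.g. v=5:
// exponent bits 132 -> 32.0f -> 32. v=31 relies on the 0x80000000 overflow
// pattern noted below, whose bits still equal 2^31.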
template HWY_INLINE Vec128, N> Pow2(const Vec128 v) { const DFromV d; const auto exp = ShiftLeft<23>(v); const auto f = exp + Set(d, 0x3F800000); // 1.0f // Do not use ConvertTo because we rely on the native 0x80..00 overflow // behavior. return Vec128, N>{_mm_cvtps_epi32(_mm_castsi128_ps(f.raw))}; } #endif // HWY_TARGET > HWY_AVX2 template HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_sllv_epi16(v.raw, bits.raw)}; #elif HWY_TARGET == HWY_AVX2 return AVX2ShlU16Vec128(v, bits); #else return v * Pow2(bits); #endif } #if HWY_TARGET > HWY_AVX3 HWY_API Vec16 Shl(hwy::UnsignedTag /*tag*/, Vec16 v, Vec16 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; #else const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); #endif return Vec16{_mm_sll_epi16(v.raw, bits16.raw)}; } #endif #if HWY_TARGET <= HWY_AVX3 template HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { const DFromV d; const Rebind du16; return TruncateTo(d, PromoteTo(du16, v) << PromoteTo(du16, bits)); } #elif HWY_TARGET <= HWY_AVX2 template HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { const DFromV d; const Rebind du32; return TruncateTo(d, PromoteTo(du32, v) << PromoteTo(du32, bits)); } template HWY_INLINE V AVX2ShlU8Vec128(V v, V bits) { const DFromV d; const Half dh; const Rebind du16; const Rebind dh_u32; const VFromD lo_shl_result = PromoteTo(dh_u32, LowerHalf(dh, v)) << PromoteTo(dh_u32, LowerHalf(dh, bits)); const VFromD hi_shl_result = PromoteTo(dh_u32, UpperHalf(dh, v)) << PromoteTo(dh_u32, UpperHalf(dh, bits)); const VFromD u16_shl_result = ConcatEven( du16, BitCast(du16, hi_shl_result), BitCast(du16, lo_shl_result)); return TruncateTo(d, u16_shl_result); } #endif // HWY_TARGET <= HWY_AVX3 // 8-bit: may use the Shl overload for uint16_t. 
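// Three strategies below: AVX3_DL first masks v with (0xFF >> i) so the
// GF(2^8) multiply by (1 << i) cannot carry past bit 7 (hence no polynomial
// reduction) and is therefore an exact byte shift. AVX2 widens to u16/u32,
// shifts, and truncates. Otherwise, even and odd byte lanes are shifted as
// u16 lanes and recombined via OddEven.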
template HWY_API Vec128 Shl(hwy::UnsignedTag tag, Vec128 v, Vec128 bits) { const DFromV d; #if HWY_TARGET <= HWY_AVX3_DL (void)tag; // kMask[i] = 0xFF >> i alignas(16) static constexpr uint8_t kMasks[16] = { 0xFF, 0x7F, 0x3F, 0x1F, 0x0F, 0x07, 0x03, 0x01, 0x00}; // kShl[i] = 1 << i alignas(16) static constexpr uint8_t kShl[16] = {1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 0x00}; v = And(v, TableLookupBytes(Load(Full64(), kMasks), bits)); const VFromD mul = TableLookupBytes(Load(Full64(), kShl), bits); return VFromD{_mm_gf2p8mul_epi8(v.raw, mul.raw)}; #elif HWY_TARGET <= HWY_AVX2 (void)tag; (void)d; return AVX2ShlU8Vec128(v, bits); #else const Repartition dw; using VW = VFromD; const VW even_mask = Set(dw, 0x00FF); const VW odd_mask = Set(dw, 0xFF00); const VW vw = BitCast(dw, v); const VW bits16 = BitCast(dw, bits); // Shift even lanes in-place const VW evens = Shl(tag, vw, And(bits16, even_mask)); const VW odds = Shl(tag, And(vw, odd_mask), ShiftRight<8>(bits16)); return OddEven(BitCast(d, odds), BitCast(d, evens)); #endif } HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; #else const Vec16 bits8 = And(Vec16{bits.raw}, Vec16{_mm_set_epi64x(0, 0xFF)}); #endif return Vec128{_mm_sll_epi16(v.raw, bits8.raw)}; } template HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_TARGET >= HWY_SSE4 return v * Pow2(bits); #else return Vec128{_mm_sllv_epi32(v.raw, bits.raw)}; #endif } #if HWY_TARGET >= HWY_SSE4 HWY_API Vec32 Shl(hwy::UnsignedTag /*tag*/, Vec32 v, const Vec32 bits) { #if HWY_TARGET == HWY_SSE4 const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; #else const auto bits32 = Combine(Full64(), Zero(Full32()), bits); #endif return Vec32{_mm_sll_epi32(v.raw, bits32.raw)}; } #endif HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_TARGET >= HWY_SSE4 const DFromV d; // Individual shifts and combine const Vec128 out0{_mm_sll_epi64(v.raw, bits.raw)}; const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); const Vec128 out1{_mm_sll_epi64(v.raw, bits1)}; return ConcatUpperLower(d, out1, out0); #else return Vec128{_mm_sllv_epi64(v.raw, bits.raw)}; #endif } HWY_API Vec64 Shl(hwy::UnsignedTag /*tag*/, Vec64 v, Vec64 bits) { return Vec64{_mm_sll_epi64(v.raw, bits.raw)}; } // Signed left shift is the same as unsigned. template HWY_API Vec128 Shl(hwy::SignedTag /*tag*/, Vec128 v, Vec128 bits) { const DFromV di; const RebindToUnsigned du; return BitCast(di, Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); } } // namespace detail template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return detail::Shl(hwy::TypeTag(), v, bits); } // ------------------------------ Shr (mul, mask, BroadcastSignBit) // Use AVX2+ variable shifts except for SSSE3/SSE4. There, we use // widening multiplication by powers of two obtained by loading float exponents, // followed by a constant right-shift. This is still faster than a scalar or // bit-test approach: https://gcc.godbolt.org/z/9G7Y9v. 
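// For u16 on SSSE3/SSE4, x >> s is computed as MulHigh(x, 2^(16 - s)) because
// (x * 2^(16 - s)) >> 16 == x >> s. The s == 0 case would require 2^16, which
// does not fit in 16 bits, so those lanes are patched back to the input.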
#if HWY_TARGET <= HWY_AVX2 namespace detail { #if HWY_TARGET <= HWY_AVX3 template HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { const DFromV d; const Rebind du16; const RebindToSigned di16; return DemoteTo(d, BitCast(di16, PromoteTo(du16, v) >> PromoteTo(du16, bits))); } #else // AVX2 template HWY_INLINE V AVX2ShrU16Vec128(V v, V bits) { const DFromV d; const Rebind du32; const RebindToSigned di32; return DemoteTo(d, BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); } template HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { const DFromV d; const Rebind du32; const RebindToSigned di32; return DemoteTo(d, BitCast(di32, PromoteTo(du32, v) >> PromoteTo(du32, bits))); } template HWY_INLINE V AVX2ShrU8Vec128(V v, V bits) { const DFromV d; const Half dh; const Rebind di16; const Rebind du16; const Rebind dh_i32; const Rebind dh_u32; const auto lo_shr_result = BitCast(dh_i32, PromoteTo(dh_u32, LowerHalf(dh, v)) >> PromoteTo(dh_u32, LowerHalf(dh, bits))); const auto hi_shr_result = BitCast(dh_i32, PromoteTo(dh_u32, UpperHalf(dh, v)) >> PromoteTo(dh_u32, UpperHalf(dh, bits))); const auto i16_shr_result = BitCast(di16, OrderedDemote2To(du16, lo_shr_result, hi_shr_result)); return DemoteTo(d, i16_shr_result); } #endif // HWY_TARGET <= HWY_AVX3 } // namespace detail #endif // HWY_TARGET <= HWY_AVX2 template HWY_API Vec128 operator>>(Vec128 in, const Vec128 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_srlv_epi16(in.raw, bits.raw)}; #elif HWY_TARGET <= HWY_AVX2 return detail::AVX2ShrU16Vec128(in, bits); #else const DFromV d; // For bits=0, we cannot mul by 2^16, so fix the result later. const auto out = MulHigh(in, detail::Pow2(Set(d, 16) - bits)); // Replace output with input where bits == 0. return IfThenElse(bits == Zero(d), in, out); #endif } #if HWY_TARGET > HWY_AVX3 HWY_API Vec16 operator>>(const Vec16 in, const Vec16 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; #else const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); #endif return Vec16{_mm_srl_epi16(in.raw, bits16.raw)}; } #endif // 8-bit uses 16-bit shifts. template HWY_API Vec128 operator>>(Vec128 in, const Vec128 bits) { #if HWY_TARGET <= HWY_AVX2 return detail::AVX2ShrU8Vec128(in, bits); #else const DFromV d; const Repartition dw; using VW = VFromD; const VW mask = Set(dw, 0x00FF); const VW vw = BitCast(dw, in); const VW bits16 = BitCast(dw, bits); const VW evens = And(vw, mask) >> And(bits16, mask); // Shift odd lanes in-place const VW odds = vw >> ShiftRight<8>(bits16); return OddEven(BitCast(d, odds), BitCast(d, evens)); #endif } HWY_API Vec128 operator>>(const Vec128 in, const Vec128 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 in8{_mm_cvtepu8_epi16(in.raw)}; const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; #else const Vec16 mask{_mm_set_epi64x(0, 0xFF)}; const Vec16 in8 = And(Vec16{in.raw}, mask); const Vec16 bits8 = And(Vec16{bits.raw}, mask); #endif return Vec128{_mm_srl_epi16(in8.raw, bits8.raw)}; } template HWY_API Vec128 operator>>(const Vec128 in, const Vec128 bits) { #if HWY_TARGET >= HWY_SSE4 // 32x32 -> 64 bit mul, then shift right by 32. const DFromV d32; // Move odd lanes into position for the second mul. Shuffle more gracefully // handles N=1 than repartitioning to u64 and shifting 32 bits right. const Vec128 in31{_mm_shuffle_epi32(in.raw, 0x31)}; // For bits=0, we cannot mul by 2^32, so fix the result later. 
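// MulEven forms full 64-bit products of the even lanes; shifting them right
// by 32 yields the even results. For the odd lanes (moved into even position
// by the 0x31 shuffle above), the result already sits in the upper 32 bits of
// the product, i.e. in the odd u32 lane, so OddEven can merge it directly.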
const auto mul = detail::Pow2(Set(d32, 32) - bits); const auto out20 = ShiftRight<32>(MulEven(in, mul)); // z 2 z 0 const Vec128 mul31{_mm_shuffle_epi32(mul.raw, 0x31)}; // No need to shift right, already in the correct position. const auto out31 = BitCast(d32, MulEven(in31, mul31)); // 3 ? 1 ? const Vec128 out = OddEven(out31, BitCast(d32, out20)); // Replace output with input where bits == 0. return IfThenElse(bits == Zero(d32), in, out); #else return Vec128{_mm_srlv_epi32(in.raw, bits.raw)}; #endif } #if HWY_TARGET >= HWY_SSE4 HWY_API Vec128 operator>>(const Vec128 in, const Vec128 bits) { #if HWY_TARGET == HWY_SSE4 const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; #else const auto bits32 = Combine(Full64(), Zero(Full32()), bits); #endif return Vec128{_mm_srl_epi32(in.raw, bits32.raw)}; } #endif HWY_API Vec128 operator>>(const Vec128 v, const Vec128 bits) { #if HWY_TARGET >= HWY_SSE4 const DFromV d; // Individual shifts and combine const Vec128 out0{_mm_srl_epi64(v.raw, bits.raw)}; const __m128i bits1 = _mm_unpackhi_epi64(bits.raw, bits.raw); const Vec128 out1{_mm_srl_epi64(v.raw, bits1)}; return ConcatUpperLower(d, out1, out0); #else return Vec128{_mm_srlv_epi64(v.raw, bits.raw)}; #endif } HWY_API Vec64 operator>>(const Vec64 v, const Vec64 bits) { return Vec64{_mm_srl_epi64(v.raw, bits.raw)}; } namespace detail { #if HWY_TARGET <= HWY_AVX3 template HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { const DFromV d; const Rebind di16; return DemoteTo(d, PromoteTo(di16, v) >> PromoteTo(di16, bits)); } #elif HWY_TARGET <= HWY_AVX2 // AVX2 template HWY_INLINE V AVX2ShrI16Vec128(V v, V bits) { const DFromV d; const Rebind di32; return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); } template HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { const DFromV d; const Rebind di32; return DemoteTo(d, PromoteTo(di32, v) >> PromoteTo(di32, bits)); } template HWY_INLINE V AVX2ShrI8Vec128(V v, V bits) { const DFromV d; const Half dh; const Rebind di16; const Rebind dh_i32; const auto lo_shr_result = PromoteTo(dh_i32, LowerHalf(dh, v)) >> PromoteTo(dh_i32, LowerHalf(dh, bits)); const auto hi_shr_result = PromoteTo(dh_i32, UpperHalf(dh, v)) >> PromoteTo(dh_i32, UpperHalf(dh, bits)); const auto i16_shr_result = OrderedDemote2To(di16, lo_shr_result, hi_shr_result); return DemoteTo(d, i16_shr_result); } #endif #if HWY_TARGET > HWY_AVX3 // Also used in x86_256-inl.h. template HWY_INLINE V SignedShr(const DI di, const V v, const V count_i) { const RebindToUnsigned du; const auto count = BitCast(du, count_i); // same type as value to shift // Clear sign and restore afterwards. This is preferable to shifting the MSB // downwards because Shr is somewhat more expensive than Shl. 
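// For negative v, sign is all-ones, so v ^ sign == ~v, which is nonnegative;
// after the logical shift, xor-ing with sign again yields the arithmetic
// result because ~(~v >> count) == (v >> count) with sign fill. E.g. for
// 8-bit lanes, v=-8 (0xF8), count=2: ~v=0x07, >>2 gives 0x01, ~ gives 0xFE=-2.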
const auto sign = BroadcastSignBit(v); const auto abs = BitCast(du, v ^ sign); // off by one, but fixed below return BitCast(di, abs >> count) ^ sign; } #endif } // namespace detail template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_srav_epi16(v.raw, bits.raw)}; #elif HWY_TARGET <= HWY_AVX2 return detail::AVX2ShrI16Vec128(v, bits); #else const DFromV d; return detail::SignedShr(d, v, bits); #endif } #if HWY_TARGET > HWY_AVX3 HWY_API Vec16 operator>>(Vec16 v, Vec16 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 bits16{_mm_cvtepu16_epi64(bits.raw)}; #else const auto bits16 = And(bits, Vec16{_mm_set_epi64x(0, 0xFFFF)}); #endif return Vec16{_mm_sra_epi16(v.raw, bits16.raw)}; } #endif template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_AVX2 return detail::AVX2ShrI8Vec128(v, bits); #else const DFromV d; return detail::SignedShr(d, v, bits); #endif } HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_SSE4 const Vec16 vi16{_mm_cvtepi8_epi16(v.raw)}; const Vec16 bits8{_mm_cvtepu8_epi64(bits.raw)}; #else const DFromV d; const Rebind di16; const Twice dt; const auto vi16 = ShiftRight<8>(BitCast(di16, Combine(dt, v, v))); const Vec16 bits8 = And(Vec16{bits.raw}, Vec16{_mm_set_epi64x(0, 0xFF)}); #endif return Vec128{_mm_sra_epi16(vi16.raw, bits8.raw)}; } template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_AVX2 return Vec128{_mm_srav_epi32(v.raw, bits.raw)}; #else const DFromV d; return detail::SignedShr(d, v, bits); #endif } #if HWY_TARGET > HWY_AVX2 HWY_API Vec32 operator>>(Vec32 v, Vec32 bits) { #if HWY_TARGET == HWY_SSE4 const Vec32 bits32{_mm_cvtepu32_epi64(bits.raw)}; #else const auto bits32 = Combine(Full64(), Zero(Full32()), bits); #endif return Vec32{_mm_sra_epi32(v.raw, bits32.raw)}; } #endif template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { #if HWY_TARGET <= HWY_AVX3 return Vec128{_mm_srav_epi64(v.raw, bits.raw)}; #else const DFromV d; return detail::SignedShr(d, v, bits); #endif } // ------------------------------ MulEven/Odd 64x64 (UpperHalf) HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { const DFromV d; alignas(16) uint64_t mul[2]; mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); return Load(d, mul); } HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { const DFromV d; const Half d2; alignas(16) uint64_t mul[2]; const uint64_t a1 = GetLane(UpperHalf(d2, a)); const uint64_t b1 = GetLane(UpperHalf(d2, b)); mul[0] = Mul128(a1, b1, &mul[1]); return Load(d, mul); } // ------------------------------ WidenMulPairwiseAdd // Generic for all vector lengths. template >> HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { // TODO(janwas): _mm_dpbf16_ps when available const RebindToUnsigned du32; // Lane order within sum0/1 is undefined, hence we can avoid the // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip // leads to the odd/even order that RearrangeToOddPlusEven prefers. using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); return MulAdd(BitCast(df32, ae), BitCast(df32, be), Mul(BitCast(df32, ao), BitCast(df32, bo))); } // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. 
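// _mm_madd_epi16 computes, per 32-bit output lane, the sum of adjacent signed
// 16-bit products a[2i]*b[2i] + a[2i+1]*b[2i+1], which is exactly the
// pairwise widening multiply-add required here.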
template >> HWY_API VFromD WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { return VFromD{_mm_madd_epi16(a.raw, b.raw)}; } // Generic for all vector lengths. template >> HWY_API VFromD WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { const auto p_lo = a * b; const auto p_hi = MulHigh(a, b); const auto p_hi1_lo0 = BitCast(du32, OddEven(p_hi, p_lo)); const auto p_hi0_lo1 = Or(ShiftLeft<16>(BitCast(du32, p_hi)), ShiftRight<16>(BitCast(du32, p_lo))); return Add(BitCast(du32, p_hi1_lo0), BitCast(du32, p_hi0_lo1)); } // ------------------------------ SatWidenMulPairwiseAdd #if HWY_TARGET <= HWY_SSSE3 #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #else #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #endif // Even if N=1, the input is always at least 2 lanes, hence _mm_maddubs_epi16 // is safe. template HWY_API VFromD SatWidenMulPairwiseAdd( DI16 /* tag */, VFromD> a, VFromD> b) { return VFromD{_mm_maddubs_epi16(a.raw, b.raw)}; } #endif // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ShiftLeft) // Generic for all vector lengths. template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD sum0, VFromD& sum1) { // TODO(janwas): _mm_dpbf16_ps when available const RebindToUnsigned du32; // Lane order within sum0/1 is undefined, hence we can avoid the // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip // leads to the odd/even order that RearrangeToOddPlusEven prefers. using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); } // Even if N=1, the input is always at least 2 lanes, hence madd_epi16 is safe. template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, const VFromD sum0, VFromD& /*sum1*/) { (void)d; #if HWY_TARGET <= HWY_AVX3_DL return VFromD{_mm_dpwssd_epi32(sum0.raw, a.raw, b.raw)}; #else return sum0 + WidenMulPairwiseAdd(d, a, b); #endif } template >> HWY_API VFromD ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, const VFromD sum0, VFromD& /*sum1*/) { (void)d; return sum0 + WidenMulPairwiseAdd(d, a, b); } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven( const Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { return Add(sum0, sum1); } // ------------------------------ SumOfMulQuadAccumulate #if HWY_TARGET <= HWY_AVX3_DL #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DI32 /*di32*/, VFromD> a_u, VFromD> b_i, VFromD sum) { return VFromD{_mm_dpbusd_epi32(sum.raw, a_u.raw, b_i.raw)}; } #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, VFromD> a, VFromD> b, VFromD sum) { // TODO(janwas): AVX-VNNI-INT8 has dpbssd. 
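// Emulate via the u8*i8 dpbusd: a signed byte equals its unsigned bit pattern
// minus 256 when the MSB is set, i.e. a = a_u - 256 * (a_u >> 7), hence
// sum(a*b) = dpbusd(a_u, b) - (dpbusd(a_u >> 7, b) << 8).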
const Repartition du8; const auto a_u = BitCast(du8, a); const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a_u, b, sum); const auto result_sum_1 = ShiftLeft<8>( SumOfMulQuadAccumulate(di32, ShiftRight<7>(a_u), b, Zero(di32))); return result_sum_0 - result_sum_1; } #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DU32 du32, VFromD> a, VFromD> b, VFromD sum) { // TODO(janwas): AVX-VNNI-INT8 has dpbuud. const Repartition du8; const RebindToSigned di8; const RebindToSigned di32; const auto b_i = BitCast(di8, b); const auto result_sum_0 = SumOfMulQuadAccumulate(di32, a, b_i, BitCast(di32, sum)); const auto result_sum_1 = ShiftLeft<8>( SumOfMulQuadAccumulate(di32, a, BroadcastSignBit(b_i), Zero(di32))); return BitCast(du32, result_sum_0 - result_sum_1); } #endif // HWY_TARGET <= HWY_AVX3_DL // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned: zero-extend. template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const __m128i zero = _mm_setzero_si128(); return VFromD{_mm_unpacklo_epi8(v.raw, zero)}; #else return VFromD{_mm_cvtepu8_epi16(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 return VFromD{_mm_unpacklo_epi16(v.raw, _mm_setzero_si128())}; #else return VFromD{_mm_cvtepu16_epi32(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 return VFromD{_mm_unpacklo_epi32(v.raw, _mm_setzero_si128())}; #else return VFromD{_mm_cvtepu32_epi64(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const __m128i zero = _mm_setzero_si128(); const __m128i u16 = _mm_unpacklo_epi8(v.raw, zero); return VFromD{_mm_unpacklo_epi16(u16, zero)}; #else return VFromD{_mm_cvtepu8_epi32(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D d, VFromD> v) { #if HWY_TARGET > HWY_SSSE3 const Rebind du32; return PromoteTo(d, PromoteTo(du32, v)); #elif HWY_TARGET == HWY_SSSE3 alignas(16) static constexpr int8_t kShuffle[16] = { 0, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1}; const Repartition di8; return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); #else (void)d; return VFromD{_mm_cvtepu8_epi64(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D d, VFromD> v) { #if HWY_TARGET > HWY_SSSE3 const Rebind du32; return PromoteTo(d, PromoteTo(du32, v)); #elif HWY_TARGET == HWY_SSSE3 alignas(16) static constexpr int8_t kShuffle[16] = { 0, 1, -1, -1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, -1, -1}; const Repartition di8; return TableLookupBytesOr0(v, BitCast(d, Load(di8, kShuffle))); #else (void)d; return VFromD{_mm_cvtepu16_epi64(v.raw)}; #endif } // Unsigned to signed: same plus cast. template ), sizeof(TFromV)), HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V))> HWY_API VFromD PromoteTo(D di, V v) { const RebindToUnsigned du; return BitCast(di, PromoteTo(du, v)); } // Signed: replicate sign bit. 
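// On SSE2/SSSE3 this is done by interleaving each lane with itself (so the
// value also occupies the upper half of the widened lane) and then shifting
// right *arithmetically* by the source width, e.g. i8 0x80 -> 0x8080 -> 0xFF80.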
template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 return ShiftRight<8>(VFromD{_mm_unpacklo_epi8(v.raw, v.raw)}); #else return VFromD{_mm_cvtepi8_epi16(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 return ShiftRight<16>(VFromD{_mm_unpacklo_epi16(v.raw, v.raw)}); #else return VFromD{_mm_cvtepi16_epi32(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 return ShiftRight<32>(VFromD{_mm_unpacklo_epi32(v.raw, v.raw)}); #else return VFromD{_mm_cvtepi32_epi64(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const __m128i x2 = _mm_unpacklo_epi8(v.raw, v.raw); const __m128i x4 = _mm_unpacklo_epi16(x2, x2); return ShiftRight<24>(VFromD{x4}); #else return VFromD{_mm_cvtepi8_epi32(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D d, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const Repartition di32; const Half dh_i32; const VFromD x4{PromoteTo(dh_i32, v).raw}; const VFromD s4{ _mm_shufflelo_epi16(x4.raw, _MM_SHUFFLE(3, 3, 1, 1))}; return ZipLower(d, x4, s4); #else (void)d; return VFromD{_mm_cvtepi8_epi64(v.raw)}; #endif } template HWY_API VFromD PromoteTo(D d, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const Repartition di32; const Half dh_i32; const VFromD x2{PromoteTo(dh_i32, v).raw}; const VFromD s2{ _mm_shufflelo_epi16(x2.raw, _MM_SHUFFLE(3, 3, 1, 1))}; return ZipLower(d, x2, s2); #else (void)d; return VFromD{_mm_cvtepi16_epi64(v.raw)}; #endif } #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif // Workaround for origin tracking bug in Clang msan prior to 11.0 // (spurious "uninitialized memory" for TestF16 with "ORIGIN: invalid") #if HWY_IS_MSAN && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 1100) #define HWY_INLINE_F16 HWY_NOINLINE #else #define HWY_INLINE_F16 HWY_INLINE #endif template HWY_INLINE_F16 VFromD PromoteTo(D /*tag*/, VFromD> v) { #if HWY_HAVE_FLOAT16 const RebindToUnsigned> du16; return VFromD{_mm_cvtph_ps(BitCast(du16, v).raw)}; #else return VFromD{_mm_cvtph_ps(v.raw)}; #endif } #endif // HWY_NATIVE_F16C #if HWY_HAVE_FLOAT16 #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 #undef HWY_NATIVE_PROMOTE_F16_TO_F64 #else #define HWY_NATIVE_PROMOTE_F16_TO_F64 #endif template HWY_INLINE VFromD PromoteTo(D /*tag*/, VFromD> v) { return VFromD{_mm_cvtph_pd(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD PromoteTo(D df32, VFromD> v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtps_pd(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepi32_pd(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD PromoteTo(D /*df64*/, VFromD> v) { return VFromD{_mm_cvtepu32_pd(v.raw)}; } #else // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD PromoteTo(D df64, VFromD> v) { const Rebind di32; const auto i32_to_f64_result = PromoteTo(df64, BitCast(di32, v)); return i32_to_f64_result + IfNegativeThenElse(i32_to_f64_result, Set(df64, 4294967296.0), Zero(df64)); } #endif // ------------------------------ PromoteEvenTo/PromoteOddTo #if HWY_TARGET > HWY_AVX3 namespace detail { // I32->I64 
PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, Vec64 v) { return PromoteLowerTo(d_to, v); } template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, Vec128 v) { const Repartition d_from; return PromoteLowerTo(d_to, ConcatEven(d_from, v, v)); } template HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, V v) { const Repartition d_from; return PromoteLowerTo(d_to, ConcatOdd(d_from, v, v)); } } // namespace detail #endif // ------------------------------ Demotions (full -> part w/ narrow lanes) template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_packs_epi32(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { #if HWY_TARGET >= HWY_SSSE3 const Rebind di32; const auto zero_if_neg = AndNot(ShiftRight<31>(v), v); const auto too_big = VecFromMask(di32, Gt(v, Set(di32, 0xFFFF))); const auto clamped = Or(zero_if_neg, too_big); #if HWY_TARGET == HWY_SSE2 const Rebind du16; const RebindToSigned di16; return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); #else const Repartition du16; // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. alignas(16) static constexpr uint16_t kLower2Bytes[16] = { 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; const auto lo2 = Load(du16, kLower2Bytes); return VFromD{TableLookupBytes(BitCast(du16, clamped), lo2).raw}; #endif #else return VFromD{_mm_packus_epi32(v.raw, v.raw)}; #endif } template HWY_API VFromD DemoteTo(D du16, VFromD> v) { const DFromV du32; const RebindToSigned di32; #if HWY_TARGET >= HWY_SSSE3 const auto too_big = VecFromMask(di32, Gt(BitCast(di32, ShiftRight<16>(v)), Zero(di32))); const auto clamped = Or(BitCast(di32, v), too_big); #if HWY_TARGET == HWY_SSE2 const RebindToSigned di16; return BitCast(du16, DemoteTo(di16, ShiftRight<16>(ShiftLeft<16>(clamped)))); #else (void)du16; const Repartition du16_full; // Lower 2 bytes from each 32-bit lane; same as return type for fewer casts. 
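// Illustrative detail: _mm_shuffle_epi8 returns zero for byte indices whose
// high bit is set, so the 0x8080 entries clear the upper 64 bits, while
// 0x0100, 0x0504, 0x0908, 0x0D0C select byte pairs {0,1}, {4,5}, {8,9},
// {12,13}, i.e. the low 16 bits of each 32-bit lane.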
alignas(16) static constexpr uint16_t kLower2Bytes[16] = { 0x0100, 0x0504, 0x0908, 0x0D0C, 0x8080, 0x8080, 0x8080, 0x8080}; const auto lo2 = Load(du16_full, kLower2Bytes); return VFromD{TableLookupBytes(BitCast(du16_full, clamped), lo2).raw}; #endif #else return DemoteTo(du16, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); #endif } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); return VFromD{_mm_packus_epi16(i16, i16)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_packus_epi16(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const __m128i i16 = _mm_packs_epi32(v.raw, v.raw); return VFromD{_mm_packs_epi16(i16, i16)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_packs_epi16(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D du8, VFromD> v) { #if HWY_TARGET <= HWY_AVX3 // NOTE: _mm_cvtusepi32_epi8 is a saturated conversion of 32-bit unsigned // integers to 8-bit unsigned integers (void)du8; return VFromD{_mm_cvtusepi32_epi8(v.raw)}; #else const DFromV du32; const RebindToSigned di32; const auto max_i32 = Set(du32, 0x7FFFFFFFu); #if HWY_TARGET >= HWY_SSSE3 // On SSE2/SSSE3, clamp u32 values to an i32 using the u8 Min operation // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. // The u8 Min operation below leaves the lower 24 bits of each 32-bit // lane unchanged. // The u8 Min operation below will leave any values that are less than or // equal to 0x7FFFFFFF unchanged. // For values that are greater than or equal to 0x80000000, the u8 Min // operation below will force the upper 8 bits to 0x7F and leave the lower // 24 bits unchanged. // An u8 Min operation is okay here as any clamped value that is greater than // or equal to 0x80000000 will be clamped to a value between 0x7F000000 and // 0x7FFFFFFF through the u8 Min operation below, which will then be converted // to 0xFF through the i32->u8 demotion. const Repartition du32_as_du8; const auto clamped = BitCast( di32, Min(BitCast(du32_as_du8, v), BitCast(du32_as_du8, max_i32))); #else const auto clamped = BitCast(di32, Min(v, max_i32)); #endif return DemoteTo(du8, clamped); #endif } template HWY_API VFromD DemoteTo(D du8, VFromD> v) { const DFromV du16; const RebindToSigned di16; const auto max_i16 = Set(du16, 0x7FFF); #if HWY_TARGET >= HWY_SSSE3 // On SSE2/SSSE3, clamp u16 values to an i16 using the u8 Min operation // as SSE2/SSSE3 can do an u8 Min operation in a single instruction. // The u8 Min operation below leaves the lower 8 bits of each 16-bit // lane unchanged. // The u8 Min operation below will leave any values that are less than or // equal to 0x7FFF unchanged. // For values that are greater than or equal to 0x8000, the u8 Min // operation below will force the upper 8 bits to 0x7F and leave the lower // 8 bits unchanged. // An u8 Min operation is okay here as any clamped value that is greater than // or equal to 0x8000 will be clamped to a value between 0x7F00 and // 0x7FFF through the u8 Min operation below, which will then be converted // to 0xFF through the i16->u8 demotion. const Repartition du16_as_du8; const auto clamped = BitCast( di16, Min(BitCast(du16_as_du8, v), BitCast(du16_as_du8, max_i16))); #else const auto clamped = BitCast(di16, Min(v, max_i16)); #endif return DemoteTo(du8, clamped); } #if HWY_TARGET < HWY_SSE4 && !defined(HWY_DISABLE_F16C) // HWY_NATIVE_F16C was already toggled above. 
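// Note (assumption about the immediate below): 8 (_MM_FROUND_NO_EXC) has
// rounding-control bits imm[1:0] = 0, so the f32->f16 conversion rounds to
// nearest even while suppressing exceptions.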
// Work around MSVC warning for _mm_cvtps_ph (8 is actually a valid immediate). // clang-cl requires a non-empty string, so we 'ignore' the irrelevant -Wmain. HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4556, ignored "-Wmain") template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const RebindToUnsigned du16; return BitCast( df16, VFromD{_mm_cvtps_ph(v.raw, _MM_FROUND_NO_EXC)}); } HWY_DIAGNOSTICS(pop) #endif // F16C #if HWY_HAVE_FLOAT16 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 #else #define HWY_NATIVE_DEMOTE_F64_TO_F16 #endif template HWY_API VFromD DemoteTo(D /*df16*/, VFromD> v) { return VFromD{_mm_cvtpd_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { // TODO(janwas): _mm_cvtneps_pbh once we have avx512bf16. const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } template >> HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { // TODO(janwas): _mm_cvtne2ps_pbh once we have avx512bf16. const RebindToUnsigned du16; const Repartition du32; const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); } // Specializations for partial vectors because packs_epi32 sets lanes above 2*N. template HWY_API VFromD ReorderDemote2To(D dn, Vec32 a, Vec32 b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, Vec64 b) { return VFromD{_mm_shuffle_epi32(_mm_packs_epi32(a.raw, b.raw), _MM_SHUFFLE(2, 0, 2, 0))}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{_mm_packs_epi32(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec32 a, Vec32 b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D dn, Vec64 a, Vec64 b) { #if HWY_TARGET >= HWY_SSSE3 const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); #else (void)dn; return VFromD{_mm_shuffle_epi32(_mm_packus_epi32(a.raw, b.raw), _MM_SHUFFLE(2, 0, 2, 0))}; #endif } template HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { #if HWY_TARGET >= HWY_SSSE3 const Half dnh; const auto u16_a = DemoteTo(dnh, a); const auto u16_b = DemoteTo(dnh, b); return Combine(dn, u16_b, u16_a); #else (void)dn; return VFromD{_mm_packus_epi32(a.raw, b.raw)}; #endif } template HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV du32; const RebindToSigned di32; const auto max_i32 = Set(du32, 0x7FFFFFFFu); #if HWY_TARGET >= HWY_SSSE3 const Repartition du32_as_du8; // On SSE2/SSSE3, clamp a and b using u8 Min operation const auto clamped_a = BitCast( di32, Min(BitCast(du32_as_du8, a), BitCast(du32_as_du8, max_i32))); const auto clamped_b = BitCast( di32, Min(BitCast(du32_as_du8, b), BitCast(du32_as_du8, max_i32))); #else const auto clamped_a = BitCast(di32, Min(a, max_i32)); const auto clamped_b = BitCast(di32, Min(b, max_i32)); #endif return ReorderDemote2To(dn, clamped_a, clamped_b); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } // Specializations for partial vectors because packs_epi32 sets lanes above 2*N. 
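// For example (illustrative), for two Vec64<int16_t> inputs a and b,
// _mm_packs_epi16(a, b) saturates a's lanes into bytes 0..7 and b's into
// bytes 8..15, of which only bytes 0..3 and 8..11 stem from valid lanes;
// _mm_shuffle_epi32 with _MM_SHUFFLE(2, 0, 2, 0) then gathers 32-bit chunks
// 0 and 2 so that the eight valid bytes are contiguous in the lower half.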
template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, Vec64 b) { return VFromD{_mm_shuffle_epi32(_mm_packs_epi16(a.raw, b.raw), _MM_SHUFFLE(2, 0, 2, 0))}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{_mm_packs_epi16(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec64 a, Vec64 b) { return VFromD{_mm_shuffle_epi32(_mm_packus_epi16(a.raw, b.raw), _MM_SHUFFLE(2, 0, 2, 0))}; } template HWY_API VFromD ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return VFromD{_mm_packus_epi16(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV du16; const RebindToSigned di16; const auto max_i16 = Set(du16, 0x7FFFu); #if HWY_TARGET >= HWY_SSSE3 const Repartition du16_as_du8; // On SSE2/SSSE3, clamp a and b using u8 Min operation const auto clamped_a = BitCast( di16, Min(BitCast(du16_as_du8, a), BitCast(du16_as_du8, max_i16))); const auto clamped_b = BitCast( di16, Min(BitCast(du16_as_du8, b), BitCast(du16_as_du8, max_i16))); #else const auto clamped_a = BitCast(di16, Min(a, max_i16)); const auto clamped_b = BitCast(di16, Min(b, max_i16)); #endif return ReorderDemote2To(dn, clamped_a, clamped_b); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template ), HWY_IF_V_SIZE_LE_D(D, 16), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedDemote2To(D d, V a, V b) { return ReorderDemote2To(d, a, b); } template >> HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { const RebindToUnsigned du16; return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtpd_ps(v.raw)}; } namespace detail { // Generic for all vector lengths. template HWY_INLINE VFromD ClampF64ToI32Max(D d, VFromD v) { // The max can be exactly represented in binary64, so clamping beforehand // prevents x86 conversion from raising an exception and returning 80..00. return Min(v, Set(d, 2147483647.0)); } // For ConvertTo float->int of same size, clamping before conversion would // change the result because the max integer value is not exactly representable. // Instead detect the overflow result after conversion and fix it. // Generic for all vector lengths. template HWY_INLINE VFromD FixConversionOverflow(DI di, VFromD> original, VFromD converted) { // Combinations of original and output sign: // --: normal <0 or -huge_val to 80..00: OK // -+: -0 to 0 : OK // +-: +huge_val to 80..00 : xor with FF..FF to get 7F..FF // ++: normal >0 : OK const VFromD sign_wrong = AndNot(BitCast(di, original), converted); #if HWY_COMPILER_GCC_ACTUAL // Critical GCC 11 compiler bug (possibly also GCC 10): omits the Xor; also // Add() if using that instead. Work around with one more instruction. 
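// Equivalent formulation (illustrative): mask is all-ones exactly in the
// lanes whose sign was flipped by the overflow, max is that value logically
// shifted right by one (i.e. LimitsMax), and IfVecThenElse substitutes max in
// those lanes, matching the Xor path in the #else branch while denying GCC
// the chance to drop the fix-up.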
const RebindToUnsigned du; const VFromD mask = BroadcastSignBit(sign_wrong); const VFromD max = BitCast(di, ShiftRight<1>(BitCast(du, mask))); return IfVecThenElse(mask, max, converted); #else return Xor(converted, BroadcastSignBit(sign_wrong)); #endif } } // namespace detail template > HWY_API VFromD DemoteTo(D /* tag */, VFromD v) { const VFromD clamped = detail::ClampF64ToI32Max(DF(), v); return VFromD{_mm_cvttpd_epi32(clamped.raw)}; } template HWY_API VFromD DemoteTo(D du32, VFromD> v) { #if HWY_TARGET <= HWY_AVX3 (void)du32; return VFromD{ _mm_maskz_cvttpd_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; #else // AVX2 or earlier const Rebind df64; const RebindToUnsigned du64; // Clamp v[i] to a value between 0 and 4294967295 const auto clamped = Min(ZeroIfNegative(v), Set(df64, 4294967295.0)); const auto k2_31 = Set(df64, 2147483648.0); const auto clamped_is_ge_k2_31 = (clamped >= k2_31); const auto clamped_lo31_f64 = clamped - IfThenElseZero(clamped_is_ge_k2_31, k2_31); const VFromD clamped_lo31_u32{_mm_cvttpd_epi32(clamped_lo31_f64.raw)}; const auto clamped_u32_msb = ShiftLeft<31>( TruncateTo(du32, BitCast(du64, VecFromMask(df64, clamped_is_ge_k2_31)))); return Or(clamped_lo31_u32, clamped_u32_msb); #endif } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepi64_ps(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepu64_ps(v.raw)}; } #else // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64_63 = Set(df64, 27670116110564327424.0); const auto f64_hi52 = Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto f64_bits_decrement = And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), f64_sum_is_inexact); const auto adj_f64_val = BitCast( df64, Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64 = Set(df64, 18446744073709551616.0); const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto adj_f64_val = BitCast( df64, Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } #endif // For already range-limited input [0, 255]. 
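// Illustrative sketch of the SSSE3+ path below: k8From32 repeats the byte
// indices {0, 4, 8, 12}, so TableLookupBytes gathers byte 0 of every 32-bit
// lane into each 32-bit chunk, and two LowerHalf calls return the four packed
// bytes. Inputs are assumed to already be in [0, 255]; larger values would
// simply be truncated to their low byte.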
template HWY_API Vec128 U8FromU32(const Vec128 v) { #if HWY_TARGET == HWY_SSE2 const RebindToSigned> di32; const Rebind du8; return DemoteTo(du8, BitCast(di32, v)); #else const DFromV d32; const Repartition d8; alignas(16) static constexpr uint32_t k8From32[4] = { 0x0C080400u, 0x0C080400u, 0x0C080400u, 0x0C080400u}; // Also replicate bytes into all 32 bit lanes for safety. const auto quad = TableLookupBytes(v, Load(d32, k8From32)); return LowerHalf(LowerHalf(BitCast(d8, quad))); #endif } // ------------------------------ F32->UI64 PromoteTo #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { const Rebind df32; const RebindToFloat df64; const Twice dt_f32; return detail::FixConversionOverflow( di64, BitCast(df64, InterleaveLower(ResizeBitCast(dt_f32, v), ResizeBitCast(dt_f32, v))), VFromD{_mm_cvttps_epi64(v.raw)}); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{ _mm_maskz_cvttps_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #else // AVX2 or below // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { const Rebind di32; const RebindToFloat df32; const RebindToUnsigned du32; const Repartition du32_as_du8; const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), BitCast(du32_as_du8, Set(du32, uint32_t{157}))), BitCast(du32_as_du8, Set(du32, uint32_t{32})))); const auto adj_v = BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); const auto f32_to_i32_result = ConvertTo(di32, adj_v); const auto lo64_or_mask = PromoteTo( di64, BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, Set(di32, LimitsMax()))))); return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) << PromoteTo(di64, exponent_adj), lo64_or_mask); } namespace detail { template HWY_INLINE VFromD PromoteF32ToU64OverflowMaskToU64( DU64 du64, VFromD> i32_overflow_mask) { const Rebind di32; const Twice dt_i32; const auto vt_i32_overflow_mask = ResizeBitCast(dt_i32, i32_overflow_mask); return BitCast(du64, InterleaveLower(vt_i32_overflow_mask, vt_i32_overflow_mask)); } template HWY_INLINE VFromD PromoteF32ToU64OverflowMaskToU64( DU64 du64, VFromD> i32_overflow_mask) { const RebindToSigned di64; return BitCast(du64, PromoteTo(di64, i32_overflow_mask)); } } // namespace detail // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD PromoteTo(D du64, VFromD> v) { const Rebind di32; const RebindToFloat df32; const RebindToUnsigned du32; const Repartition du32_as_du8; const auto non_neg_v = ZeroIfNegative(v); const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, non_neg_v))), BitCast(du32_as_du8, Set(du32, uint32_t{157}))), BitCast(du32_as_du8, Set(du32, uint32_t{33})))); const auto adj_v = BitCast(df32, BitCast(du32, non_neg_v) - ShiftLeft<23>(exponent_adj)); const VFromD f32_to_i32_result{_mm_cvttps_epi32(adj_v.raw)}; const auto i32_overflow_mask = BroadcastSignBit(f32_to_i32_result); const auto overflow_result = detail::PromoteF32ToU64OverflowMaskToU64(du64, i32_overflow_mask); return Or(PromoteTo(du64, BitCast(du32, f32_to_i32_result)) << PromoteTo(du64, exponent_adj), overflow_result); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ MulFixedPoint15 #if HWY_TARGET == HWY_SSE2 HWY_API Vec128 MulFixedPoint15(const Vec128 a, const Vec128 b) { const DFromV d; const Repartition di32; auto lo_product = a * b; auto hi_product = MulHigh(a, b); const 
VFromD i32_product_lo{ _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; const VFromD i32_product_hi{ _mm_unpackhi_epi16(lo_product.raw, hi_product.raw)}; const auto round_up_incr = Set(di32, 0x4000); return ReorderDemote2To(d, ShiftRight<15>(i32_product_lo + round_up_incr), ShiftRight<15>(i32_product_hi + round_up_incr)); } template HWY_API Vec128 MulFixedPoint15(const Vec128 a, const Vec128 b) { const DFromV d; const Rebind di32; const auto lo_product = a * b; const auto hi_product = MulHigh(a, b); const VFromD i32_product{ _mm_unpacklo_epi16(lo_product.raw, hi_product.raw)}; return DemoteTo(d, ShiftRight<15>(i32_product + Set(di32, 0x4000))); } #else template HWY_API Vec128 MulFixedPoint15(const Vec128 a, const Vec128 b) { return Vec128{_mm_mulhrs_epi16(a.raw, b.raw)}; } #endif // ------------------------------ Truncations template HWY_API VFromD TruncateTo(DTo /* tag */, Vec128 v) { // BitCast requires the same size; DTo might be u8x1 and v u16x1. const Repartition, DFromV> dto; return VFromD{BitCast(dto, v).raw}; } template HWY_API VFromD TruncateTo(D d, Vec128 v) { #if HWY_TARGET == HWY_SSE2 const Vec128 lo{v.raw}; const Vec128 hi{_mm_unpackhi_epi64(v.raw, v.raw)}; return Combine(d, hi, lo); #else const Repartition> d8; (void)d; alignas(16) static constexpr uint8_t kIdx[16] = {0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8}; const Vec128 v8 = TableLookupBytes(v, Load(d8, kIdx)); return LowerHalf(LowerHalf(LowerHalf(v8))); #endif } template HWY_API VFromD TruncateTo(D d, Vec128 v) { #if HWY_TARGET == HWY_SSE2 const Vec128 lo{v.raw}; const Vec128 hi{_mm_unpackhi_epi64(v.raw, v.raw)}; return Combine(d, hi, lo); #else (void)d; const Repartition> d16; alignas(16) static constexpr uint16_t kIdx[8] = { 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u, 0x100u, 0x908u}; const Vec128 v16 = TableLookupBytes(v, Load(d16, kIdx)); return LowerHalf(LowerHalf(v16)); #endif } template HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { return VFromD{_mm_shuffle_epi32(v.raw, 0x88)}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const DFromV du32; #if HWY_TARGET == HWY_SSE2 const RebindToSigned di32; const Rebind du8; return DemoteTo(du8, BitCast(di32, ShiftRight<24>(ShiftLeft<24>(v)))); #else const Repartition d; alignas(16) static constexpr uint8_t kIdx[16] = { 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu, 0x0u, 0x4u, 0x8u, 0xCu}; return LowerHalf(LowerHalf(TableLookupBytes(v, Load(d, kIdx)))); #endif } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const DFromV du32; #if HWY_TARGET == HWY_SSE2 const RebindToSigned di32; const Rebind du16; const RebindToSigned di16; return BitCast( du16, DemoteTo(di16, ShiftRight<16>(BitCast(di32, ShiftLeft<16>(v))))); #else const Repartition d; return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); #endif } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const DFromV du16; #if HWY_TARGET == HWY_SSE2 const RebindToSigned di16; const Rebind du8; const RebindToSigned di8; return BitCast(du8, DemoteTo(di8, ShiftRight<8>(BitCast(di16, ShiftLeft<8>(v))))); #else const Repartition d; return LowerHalf(ConcatEven(d, BitCast(d, v), BitCast(d, v))); #endif } // ------------------------------ Demotions to/from i64 #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtsepi64_epi32(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtsepi64_epi16(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) 
{ return VFromD{_mm_cvtsepi64_epi8(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm_maskz_cvtusepi64_epi32(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm_maskz_cvtusepi64_epi16(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const __mmask8 non_neg_mask = detail::UnmaskedNot(MaskFromVec(v)).raw; return VFromD{_mm_maskz_cvtusepi64_epi8(non_neg_mask, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtusepi64_epi32(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtusepi64_epi16(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtusepi64_epi8(v.raw)}; } #else // AVX2 or below namespace detail { template HWY_INLINE VFromD> DemoteFromU64MaskOutResult( D /*dn*/, VFromD> v) { return v; } template HWY_INLINE VFromD> DemoteFromU64MaskOutResult( D /*dn*/, VFromD> v) { const DFromV du64; return And(v, Set(du64, static_cast(hwy::HighestValue>()))); } template HWY_INLINE VFromD> DemoteFromU64Saturate( D dn, VFromD> v) { const Rebind du64; const RebindToSigned di64; constexpr int kShiftAmt = static_cast(sizeof(TFromD) * 8) - static_cast(hwy::IsSigned>()); const auto too_big = BitCast( du64, VecFromMask( di64, Gt(BitCast(di64, ShiftRight(v)), Zero(di64)))); return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); } template HWY_INLINE VFromD ReorderDemote2From64To32Combine(D dn, V a, V b) { return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); } } // namespace detail template HWY_API VFromD DemoteTo(D dn, VFromD> v) { const DFromV di64; const RebindToUnsigned du64; const RebindToUnsigned dn_u; // Negative values are saturated by first saturating their bitwise inverse // and then inverting the saturation result const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); const auto saturated_vals = Xor( invert_mask, detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); return BitCast(dn, TruncateTo(dn_u, saturated_vals)); } template HWY_API VFromD DemoteTo(D dn, VFromD> v) { const DFromV di64; const RebindToUnsigned du64; const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); } template HWY_API VFromD DemoteTo(D dn, VFromD> v) { return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); } #endif // HWY_TARGET <= HWY_AVX3 template )> HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } #if HWY_TARGET > HWY_AVX2 template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; // Negative values are saturated by first saturating their bitwise inverse // and then inverting the saturation result const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); const auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); const auto saturated_a = Xor( invert_mask_a, detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); const auto saturated_b = Xor( invert_mask_b, detail::DemoteFromU64Saturate(dnh, 
Xor(invert_mask_b, BitCast(du64, b)))); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; const auto saturated_a = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); const auto saturated_b = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const Half dnh; const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } #endif // HWY_TARGET > HWY_AVX2 // ------------------------------ Integer <=> fp (ShiftRight, OddEven) #if HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepu16_ph(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepi16_ph(v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{_mm_cvtepi32_ps(v.raw)}; } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD ConvertTo(D /*df*/, VFromD> v) { return VFromD{_mm_cvtepu32_ps(v.raw)}; } template HWY_API VFromD ConvertTo(D /*dd*/, VFromD> v) { return VFromD{_mm_cvtepi64_pd(v.raw)}; } template HWY_API VFromD ConvertTo(D /*dd*/, VFromD> v) { return VFromD{_mm_cvtepu64_pd(v.raw)}; } #else // AVX2 or below // Generic for all vector lengths. template HWY_API VFromD ConvertTo(D df, VFromD> v) { // Based on wim's approach (https://stackoverflow.com/questions/34066228/) const RebindToUnsigned du32; const RebindToSigned d32; const auto msk_lo = Set(du32, 0xFFFF); const auto cnst2_16_flt = Set(df, 65536.0f); // 2^16 // Extract the 16 lowest/highest significant bits of v and cast to signed int const auto v_lo = BitCast(d32, And(v, msk_lo)); const auto v_hi = BitCast(d32, ShiftRight<16>(v)); return MulAdd(cnst2_16_flt, ConvertTo(df, v_hi), ConvertTo(df, v_lo)); } // Generic for all vector lengths. template HWY_API VFromD ConvertTo(D dd, VFromD> v) { // Based on wim's approach (https://stackoverflow.com/questions/41144668/) const Repartition d32; const Repartition d64; // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 const auto k84_63 = Set(d64, 0x4530000080000000ULL); const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) const auto k52 = Set(d32, 0x43300000); const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); return (v_upper - k84_63_52) + v_lower; // order matters! } namespace detail { template HWY_INLINE VFromD>> U64ToF64VecFast(VW w) { const DFromV d64; const RebindToFloat dd; const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; } } // namespace detail // Generic for all vector lengths. 
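// Illustrative scalar model (hypothetical helper, for exposition only) of
// detail::U64ToF64VecFast above: for any w < 2^52, OR-ing w into the bit
// pattern of the double 2^52 yields exactly 2^52 + w, so subtracting 2^52
// recovers w without rounding.
namespace detail {
HWY_INLINE double U64ToF64ScalarSketch(uint64_t w) {
  const uint64_t bits = w | 0x4330000000000000ull;  // bit pattern of 2^52 + w
  double result;
  CopyBytes<8>(&bits, &result);        // bit-cast, as BitCast does for vectors
  return result - 4503599627370496.0;  // subtract 2^52
}
}  // namespace detail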
template HWY_API VFromD ConvertTo(D dd, VFromD> v) { // Based on wim's approach (https://stackoverflow.com/questions/41144668/) const RebindToUnsigned d64; using VU = VFromD; const VU msk_lo = Set(d64, 0xFFFFFFFF); const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 // Extract the 32 lowest/highest significant bits of v const VU v_lo = And(v, msk_lo); const VU v_hi = ShiftRight<32>(v); const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); } #endif // HWY_TARGET <= HWY_AVX3 // Truncates (rounds toward zero). #if HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D di, VFromD> v) { return detail::FixConversionOverflow( di, v, VFromD>{_mm_cvttph_epi16(v.raw)}); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{ _mm_maskz_cvttph_epu16(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API VFromD ConvertTo(D di, VFromD> v) { return detail::FixConversionOverflow( di, v, VFromD>{_mm_cvttps_epi32(v.raw)}); } #if HWY_TARGET <= HWY_AVX3 template HWY_API VFromD ConvertTo(DI di, VFromD> v) { return detail::FixConversionOverflow(di, v, VFromD{_mm_cvttpd_epi64(v.raw)}); } template HWY_API VFromD ConvertTo(DU /*du*/, VFromD> v) { return VFromD{ _mm_maskz_cvttps_epu32(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } template HWY_API VFromD ConvertTo(DU /*du*/, VFromD> v) { return VFromD{ _mm_maskz_cvttpd_epu64(detail::UnmaskedNot(MaskFromVec(v)).raw, v.raw)}; } #else // AVX2 or below template HWY_API VFromD ConvertTo(DU32 du32, VFromD> v) { const RebindToSigned di32; const RebindToFloat df32; const auto non_neg_v = ZeroIfNegative(v); const auto exp_diff = Set(di32, int32_t{158}) - BitCast(di32, ShiftRight<23>(BitCast(du32, non_neg_v))); const auto scale_down_f32_val_mask = BitCast(du32, VecFromMask(di32, Eq(exp_diff, Zero(di32)))); const auto v_scaled = BitCast( df32, BitCast(du32, non_neg_v) + ShiftLeft<23>(scale_down_f32_val_mask)); const VFromD f32_to_u32_result{ _mm_cvttps_epi32(v_scaled.raw)}; return Or( BitCast(du32, BroadcastSignBit(exp_diff)), f32_to_u32_result + And(f32_to_u32_result, scale_down_f32_val_mask)); } #if HWY_ARCH_X86_64 template HWY_API VFromD ConvertTo(DI di, Vec64 v) { const Vec64 i0{_mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw))}; return detail::FixConversionOverflow(di, v, i0); } template HWY_API VFromD ConvertTo(DI di, Vec128 v) { const __m128i i0 = _mm_cvtsi64_si128(_mm_cvttsd_si64(v.raw)); const Full64 dd2; const __m128i i1 = _mm_cvtsi64_si128(_mm_cvttsd_si64(UpperHalf(dd2, v).raw)); return detail::FixConversionOverflow( di, v, Vec128{_mm_unpacklo_epi64(i0, i1)}); } #endif // HWY_ARCH_X86_64 #if !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 template HWY_API VFromD ConvertTo(DI di, VFromD> v) { using VI = VFromD; const RebindToUnsigned du; using VU = VFromD; const Repartition du16; const VI k1075 = Set(di, 1075); /* biased exponent of 2^52 */ // Exponent indicates whether the number can be represented as int64_t. const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); #if HWY_TARGET <= HWY_SSE4 const auto in_range = BitCast(di, biased_exp) < Set(di, 1086); #else const Repartition di32; const auto in_range = MaskFromVec(BitCast( di, VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) < Set(di32, 1086)))); #endif // If we were to cap the exponent at 51 and add 2^52, the number would be in // [2^52, 2^53) and mantissa bits could be read out directly. 
We need to // round-to-0 (truncate), but changing rounding mode in MXCSR hits a // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead // manually shift the mantissa into place (we already have many of the // inputs anyway). // Use 16-bit saturated unsigned subtraction to compute shift_mnt and // shift_int since biased_exp[i] is a non-negative integer that is less than // or equal to 2047. // 16-bit saturated unsigned subtraction is also more efficient than a // 64-bit subtraction followed by a 64-bit signed Max operation on // SSE2/SSSE3/SSE4/AVX2. // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be // zero as the upper 48 bits of both k1075 and biased_exp are zero. const VU shift_mnt = BitCast( du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); const VU shift_int = BitCast( du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86 // returning zero in that case. const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; // For inputs larger than 2^53 - 1, insert zeros at the bottom. // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be // shifted out of the left shift result below as shift_int[i] <= 10 is true // for any inputs that are less than 2^63. const VU shifted = int53 << shift_int; // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. const VI sign_mask = BroadcastSignBit(BitCast(di, v)); const VI limit = Set(di, LimitsMax()) - sign_mask; const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); // If the input was negative, negate the integer (two's complement). return (magnitude ^ sign_mask) - sign_mask; } #endif // !HWY_ARCH_X86_64 || HWY_TARGET <= HWY_AVX2 // Generic for all vector lengths on SSE2/SSSE3/SSE4/AVX2 template HWY_API VFromD ConvertTo(DU du, VFromD> v) { const RebindToSigned di; using VU = VFromD; const Repartition du16; const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */ const auto non_neg_v = ZeroIfNegative(v); // Exponent indicates whether the number can be represented as int64_t. const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v)); #if HWY_TARGET <= HWY_SSE4 const VU out_of_range = BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086))); #else const Repartition di32; const VU out_of_range = BitCast( du, VecFromMask(di32, DupEven(BitCast(di32, biased_exp)) > Set(di32, 1086))); #endif // If we were to cap the exponent at 51 and add 2^52, the number would be in // [2^52, 2^53) and mantissa bits could be read out directly. We need to // round-to-0 (truncate), but changing rounding mode in MXCSR hits a // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead // manually shift the mantissa into place (we already have many of the // inputs anyway). // Use 16-bit saturated unsigned subtraction to compute shift_mnt and // shift_int since biased_exp[i] is a non-negative integer that is less than // or equal to 2047. // 16-bit saturated unsigned subtraction is also more efficient than a // 64-bit subtraction followed by a 64-bit signed Max operation on // SSE2/SSSE3/SSE4/AVX2. // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be // zero as the upper 48 bits of both k1075 and biased_exp are zero. 
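// Worked example (illustrative): v = 3.75 has biased exponent 1024 and
// mantissa bits 0xE000000000000, so shift_mnt = 1075 - 1024 = 51 and
// shift_int = 0; int53 = (0xE000000000000 | 2^52) >> 51 = 3 and shifted = 3,
// i.e. the truncated result, without touching MXCSR.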
const VU shift_mnt = BitCast( du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); const VU shift_int = BitCast( du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1); // Include implicit 1-bit. NOTE: the shift count may exceed 63; we rely on x86 // returning zero in that case. const VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; // For inputs larger than 2^53 - 1, insert zeros at the bottom. // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be // shifted out of the left shift result below as shift_int[i] <= 11 is true // for any inputs that are less than 2^64. const VU shifted = int53 << shift_int; return (shifted | out_of_range); } #endif // HWY_TARGET <= HWY_AVX3 template HWY_API Vec128 NearestInt(const Vec128 v) { const RebindToSigned> di; return detail::FixConversionOverflow( di, v, VFromD{_mm_cvtps_epi32(v.raw)}); } // ------------------------------ Floating-point rounding (ConvertTo) #if HWY_TARGET >= HWY_SSSE3 // Toward nearest integer, ties to even template HWY_API Vec128 Round(const Vec128 v) { static_assert(IsFloat(), "Only for float"); // Rely on rounding after addition with a large value such that no mantissa // bits remain (assuming the current mode is nearest-even). We may need a // compiler flag for precise floating-point to prevent "optimizing" this out. const DFromV df; const auto max = Set(df, MantissaEnd()); const auto large = CopySignToAbs(max, v); const auto added = large + v; const auto rounded = added - large; // Keep original if NaN or the magnitude is large (already an int). return IfThenElse(Abs(v) < max, rounded, v); } namespace detail { // Truncating to integer and converting back to float is correct except when the // input magnitude is large, in which case the input was already an integer // (because mantissa >> exponent is zero). template HWY_INLINE Mask128 UseInt(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const DFromV d; return Abs(v) < Set(d, MantissaEnd()); } } // namespace detail // Toward zero, aka truncate template HWY_API Vec128 Trunc(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const DFromV df; const RebindToSigned di; const auto integer = ConvertTo(di, v); // round toward 0 const auto int_f = ConvertTo(df, integer); return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v); } // Toward +infinity, aka ceiling template HWY_API Vec128 Ceil(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const DFromV df; const RebindToSigned di; const auto integer = ConvertTo(di, v); // round toward 0 const auto int_f = ConvertTo(df, integer); // Truncating a positive non-integer ends up smaller; if so, add 1. const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v))); return IfThenElse(detail::UseInt(v), int_f - neg1, v); } // Toward -infinity, aka floor template HWY_API Vec128 Floor(const Vec128 v) { static_assert(IsFloat(), "Only for float"); const DFromV df; const RebindToSigned di; const auto integer = ConvertTo(di, v); // round toward 0 const auto int_f = ConvertTo(df, integer); // Truncating a negative non-integer ends up larger; if so, subtract 1. 
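// E.g. v = -2.5 (illustrative): integer = -2, so int_f = -2.0 > v, the mask
// converts to neg1 = -1.0 and the result is int_f + neg1 = -3.0 = Floor(-2.5).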
const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v))); return IfThenElse(detail::UseInt(v), int_f + neg1, v); } #else // Toward nearest integer, ties to even #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Round(const Vec128 v) { return Vec128{ _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Round(const Vec128 v) { return Vec128{ _mm_round_ps(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } template HWY_API Vec128 Round(const Vec128 v) { return Vec128{ _mm_round_pd(v.raw, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)}; } // Toward zero, aka truncate #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{ _mm_roundscale_ph(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{ _mm_round_ps(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{ _mm_round_pd(v.raw, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)}; } // Toward +infinity, aka ceiling #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{ _mm_roundscale_ph(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{ _mm_round_ps(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{ _mm_round_pd(v.raw, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)}; } // Toward -infinity, aka floor #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{ _mm_roundscale_ph(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; } #endif // HWY_HAVE_FLOAT16 template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{ _mm_round_ps(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; } template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{ _mm_round_pd(v.raw, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)}; } #endif // !HWY_SSSE3 // ------------------------------ Floating-point classification #define HWY_X86_FPCLASS_QNAN 0x01 #define HWY_X86_FPCLASS_POS0 0x02 #define HWY_X86_FPCLASS_NEG0 0x04 #define HWY_X86_FPCLASS_POS_INF 0x08 #define HWY_X86_FPCLASS_NEG_INF 0x10 #define HWY_X86_FPCLASS_SUBNORMAL 0x20 #define HWY_X86_FPCLASS_NEG 0x40 #define HWY_X86_FPCLASS_SNAN 0x80 #if HWY_HAVE_FLOAT16 || HWY_IDE template HWY_API Mask128 IsNaN(const Vec128 v) { return Mask128{ _mm_fpclass_ph_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; } template HWY_API Mask128 IsInf(const Vec128 v) { return Mask128{_mm_fpclass_ph_mask( v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; } template HWY_API Mask128 IsFinite(const Vec128 v) { // fpclass doesn't have a flag for positive, so we have to check for inf/NaN // and negate the mask. 
return Not(Mask128{_mm_fpclass_ph_mask( v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); } #endif // HWY_HAVE_FLOAT16 template HWY_API Mask128 IsNaN(const Vec128 v) { #if HWY_TARGET <= HWY_AVX3 return Mask128{ _mm_fpclass_ps_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; #else return Mask128{_mm_cmpunord_ps(v.raw, v.raw)}; #endif } template HWY_API Mask128 IsNaN(const Vec128 v) { #if HWY_TARGET <= HWY_AVX3 return Mask128{ _mm_fpclass_pd_mask(v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN)}; #else return Mask128{_mm_cmpunord_pd(v.raw, v.raw)}; #endif } #if HWY_TARGET <= HWY_AVX3 // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif template HWY_API Mask128 IsInf(const Vec128 v) { return Mask128{_mm_fpclass_ps_mask( v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; } template HWY_API Mask128 IsInf(const Vec128 v) { return Mask128{_mm_fpclass_pd_mask( v.raw, HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}; } // Returns whether normal/subnormal/zero. template HWY_API Mask128 IsFinite(const Vec128 v) { // fpclass doesn't have a flag for positive, so we have to check for inf/NaN // and negate the mask. return Not(Mask128{_mm_fpclass_ps_mask( v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); } template HWY_API Mask128 IsFinite(const Vec128 v) { return Not(Mask128{_mm_fpclass_pd_mask( v.raw, HWY_X86_FPCLASS_SNAN | HWY_X86_FPCLASS_QNAN | HWY_X86_FPCLASS_NEG_INF | HWY_X86_FPCLASS_POS_INF)}); } #endif // HWY_TARGET <= HWY_AVX3 // ================================================== CRYPTO #if !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 // Per-target flag to prevent generic_ops-inl.h from defining AESRound. #ifdef HWY_NATIVE_AES #undef HWY_NATIVE_AES #else #define HWY_NATIVE_AES #endif HWY_API Vec128 AESRound(Vec128 state, Vec128 round_key) { return Vec128{_mm_aesenc_si128(state.raw, round_key.raw)}; } HWY_API Vec128 AESLastRound(Vec128 state, Vec128 round_key) { return Vec128{_mm_aesenclast_si128(state.raw, round_key.raw)}; } HWY_API Vec128 AESInvMixColumns(Vec128 state) { return Vec128{_mm_aesimc_si128(state.raw)}; } HWY_API Vec128 AESRoundInv(Vec128 state, Vec128 round_key) { return Vec128{_mm_aesdec_si128(state.raw, round_key.raw)}; } HWY_API Vec128 AESLastRoundInv(Vec128 state, Vec128 round_key) { return Vec128{_mm_aesdeclast_si128(state.raw, round_key.raw)}; } template HWY_API Vec128 AESKeyGenAssist(Vec128 v) { return Vec128{_mm_aeskeygenassist_si128(v.raw, kRcon)}; } template HWY_API Vec128 CLMulLower(Vec128 a, Vec128 b) { return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x00)}; } template HWY_API Vec128 CLMulUpper(Vec128 a, Vec128 b) { return Vec128{_mm_clmulepi64_si128(a.raw, b.raw, 0x11)}; } #endif // !defined(HWY_DISABLE_PCLMUL_AES) && HWY_TARGET <= HWY_SSE4 // ================================================== MISC // ------------------------------ LoadMaskBits (TestBit) #if HWY_TARGET > HWY_AVX3 namespace detail { template HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, kN=1. 
const VFromD vbits{_mm_cvtsi32_si128(static_cast(mask_bits))}; #if HWY_TARGET == HWY_SSE2 // {b0, b1, ...} ===> {b0, b0, b1, b1, ...} __m128i unpacked_vbits = _mm_unpacklo_epi8(vbits.raw, vbits.raw); // {b0, b0, b1, b1, ...} ==> {b0, b0, b0, b0, b1, b1, b1, b1, ...} unpacked_vbits = _mm_unpacklo_epi16(unpacked_vbits, unpacked_vbits); // {b0, b0, b0, b0, b1, b1, b1, b1, ...} ==> // {b0, b0, b0, b0, b0, b0, b0, b0, b1, b1, b1, b1, b1, b1, b1, b1} const VFromD rep8{ _mm_unpacklo_epi32(unpacked_vbits, unpacked_vbits)}; #else // Replicate bytes 8x such that each byte contains the bit that governs it. alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); #endif const VFromD bit = Dup128VecFromValues( du, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); return RebindMask(d, TestBit(rep8, bit)); } template HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { const RebindToUnsigned du; alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; const auto vmask_bits = Set(du, static_cast(mask_bits)); return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { const RebindToUnsigned du; alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; const auto vmask_bits = Set(du, static_cast(mask_bits)); return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits128(D d, uint64_t mask_bits) { const RebindToUnsigned du; alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); } } // namespace detail #endif // HWY_TARGET > HWY_AVX3 // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { constexpr size_t kN = MaxLanes(d); #if HWY_TARGET <= HWY_AVX3 (void)d; uint64_t mask_bits = 0; constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } return MFromD::FromBits(mask_bits); #else uint64_t mask_bits = 0; constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } return detail::LoadMaskBits128(d, mask_bits); #endif } // ------------------------------ Dup128MaskFromMaskBits template HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { constexpr size_t kN = MaxLanes(d); if (kN < 8) mask_bits &= (1u << kN) - 1; #if HWY_TARGET <= HWY_AVX3 return MFromD::FromBits(mask_bits); #else return detail::LoadMaskBits128(d, mask_bits); #endif } template struct CompressIsPartition { #if HWY_TARGET <= HWY_AVX3 // AVX3 supports native compress, but a table-based approach allows // 'partitioning' (also moving mask=false lanes to the top), which helps // vqsort. This is only feasible for eight or less lanes, i.e. sizeof(T) == 8 // on AVX3. For simplicity, we only use tables for 64-bit lanes (not AVX3 // u32x8 etc.). enum { value = (sizeof(T) == 8) }; #else // generic_ops-inl does not guarantee IsPartition for 8-bit. enum { value = (sizeof(T) != 1) }; #endif }; #if HWY_TARGET <= HWY_AVX3 // ------------------------------ StoreMaskBits // `p` points to at least 8 writable bytes. 
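// Illustrative: on AVX3, mask.raw is a __mmask8/__mmask16 in which bit i
// corresponds to lane i, so for four int32 lanes with {true, false, true,
// true} active, the single byte 0b00001101 is written once the undefined
// upper bits are cleared below.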
template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kN = MaxLanes(d); constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(&mask.raw, bits); // Non-full byte, need to clear the undefined upper bits. if (kN < 8) { const int mask_bits = (1 << kN) - 1; bits[0] = static_cast(bits[0] & mask_bits); } return kNumBytes; } // ------------------------------ Mask testing // Beware: the suffix indicates the number of mask bits, not lane size! template HWY_API size_t CountTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); return PopCount(mask_bits); } template HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); return Num0BitsBelowLS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); return 31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits); } template HWY_API intptr_t FindLastTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint32_t mask_bits = uint32_t{mask.raw} & ((1u << kN) - 1); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API bool AllFalse(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); return mask_bits == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr size_t kN = MaxLanes(d); const uint64_t mask_bits = uint64_t{mask.raw} & ((1ull << kN) - 1); // Cannot use _kortestc because we may have less than 8 mask bits. return mask_bits == (1ull << kN) - 1; } // ------------------------------ Compress // 8-16 bit Compress, CompressStore defined in x86_512 because they use Vec512. // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { return Vec128{_mm_maskz_compress_ps(mask.raw, v.raw)}; } template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { HWY_DASSERT(mask.raw < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const DFromV d; const Repartition d8; const auto index = Load(d8, u8_indices + 16 * mask.raw); return BitCast(d, TableLookupBytes(BitCast(d8, v), index)); } // ------------------------------ CompressNot (Compress) // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // See CompressIsPartition, PrintCompressNot64x2NibbleTables alignas(16) static constexpr uint64_t packed_array[16] = { 0x00000010, 0x00000001, 0x00000010, 0x00000010}; // For lane i, shift the i-th 4-bit index down to bits [0, 2) - // _mm_permutexvar_epi64 will ignore the upper bits. 
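// Worked example (illustrative): mask.raw = 1 (only lane 0 true) selects
// packed_array[1] = 0x00000001; after the per-lane shifts by {0, 4}, the low
// bits are {1, 0}, so TableLookupLanes returns {v[1], v[0]}, i.e. the false
// lane first as CompressNot requires.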
const DFromV d; const RebindToUnsigned du64; const auto packed = Set(du64, packed_array[mask.raw]); alignas(16) static constexpr uint64_t shifts[2] = {0, 4}; const auto indices = Indices128{(packed >> Load(du64, shifts)).raw}; return TableLookupLanes(v, indices); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } // ------------------------------ CompressStore (defined in x86_512) // ------------------------------ CompressBlendedStore (CompressStore) template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { // AVX-512 already does the blending at no extra cost (latency 11, // rthroughput 2 - same as compress plus store). if (HWY_TARGET == HWY_AVX3_DL || (HWY_TARGET != HWY_AVX3_ZEN4 && sizeof(TFromD) > 2)) { // We're relying on the mask to blend. Clear the undefined upper bits. constexpr size_t kN = MaxLanes(d); if (kN != 16 / sizeof(TFromD)) { m = And(m, FirstN(d, kN)); } return CompressStore(v, m, d, unaligned); } else { const size_t count = CountTrue(d, m); const VFromD compressed = Compress(v, m); #if HWY_MEM_OPS_MIGHT_FAULT // BlendedStore tests mask for each lane, but we know that the mask is // FirstN, so we can just copy. alignas(16) TFromD buf[MaxLanes(d)]; Store(compressed, d, buf); CopyBytes(buf, unaligned, count * sizeof(TFromD)); #else BlendedStore(compressed, FirstN(d, count), d, unaligned); #endif detail::MaybeUnpoison(unaligned, count); return count; } } // ------------------------------ CompressBitsStore (defined in x86_512) #else // AVX2 or below // ------------------------------ StoreMaskBits namespace detail { constexpr HWY_INLINE uint64_t U64FromInt(int mask_bits) { return static_cast(static_cast(mask_bits)); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { const Simd d; const auto sign_bits = BitCast(d, VecFromMask(d, mask)).raw; return U64FromInt(_mm_movemask_epi8(sign_bits)); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const auto sign_bits = _mm_packs_epi16(mask.raw, _mm_setzero_si128()); return U64FromInt(_mm_movemask_epi8(sign_bits)); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); return U64FromInt(_mm_movemask_ps(sign_bits.raw)); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { const Simd d; const Simd df; const auto sign_bits = BitCast(df, VecFromMask(d, mask)); return U64FromInt(_mm_movemask_pd(sign_bits.raw)); } template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) { constexpr size_t kNumBytes = (MaxLanes(d) + 7) / 8; const uint64_t mask_bits = detail::BitsFromMask(mask); CopyBytes(&mask_bits, bits); return kNumBytes; } // ------------------------------ Mask testing template HWY_API bool AllFalse(D /* tag */, MFromD mask) { // Cheaper than PTEST, which is 2 uop / 3L. 
return detail::BitsFromMask(mask) == 0; } template HWY_API bool AllTrue(D d, MFromD mask) { constexpr uint64_t kAllBits = (1ull << MaxLanes(d)) - 1; return detail::BitsFromMask(mask) == kAllBits; } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { return PopCount(detail::BitsFromMask(mask)); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, MFromD mask) { return Num0BitsBelowLS1Bit_Nonzero32( static_cast(detail::BitsFromMask(mask))); } template HWY_API intptr_t FindFirstTrue(D /* tag */, MFromD mask) { const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero32(mask_bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, MFromD mask) { return 31 - Num0BitsAboveMS1Bit_Nonzero32( static_cast(detail::BitsFromMask(mask))); } template HWY_API intptr_t FindLastTrue(D /* tag */, MFromD mask) { const uint32_t mask_bits = static_cast(detail::BitsFromMask(mask)); return mask_bits ? intptr_t(31 - Num0BitsAboveMS1Bit_Nonzero32(mask_bits)) : -1; } // ------------------------------ Compress, CompressBits namespace detail { // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need // byte indices for PSHUFB (one vector's worth for each of 256 combinations of // 8 mask bits). Loading them directly would require 4 KiB. We can instead // store lane indices and convert to byte indices (2*lane + 0..1), with the // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles // is likely more costly than the higher cache footprint from storing bytes. 
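  // Worked example (illustrative): mask_bits == 0b00000101 selects lanes 0
  // and 2. Row 5 of the table below is {0, 4, 2, 6, 8, 10, 12, 14}, i.e. the
  // already-doubled indices of lanes {0, 2, 1, 3, 4, 5, 6, 7}. ZipLower then
  // turns each byte b into the pair (b, b), and adding 0x0100 yields byte
  // indices {0,1, 4,5, 2,3, ...}, so PSHUFB gathers lanes 0 and 2 into the
  // first two output lanes.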
alignas(16) static constexpr uint8_t table[2048] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 8, 
10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 6, 
8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; // compress_epi16 requires VBMI2 and there is no permutevar_epi16, so we need // byte indices for PSHUFB (one vector's worth for each of 256 combinations of // 8 mask bits). Loading them directly would require 4 KiB. We can instead // store lane indices and convert to byte indices (2*lane + 0..1), with the // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles // is likely more costly than the higher cache footprint from storing bytes. 
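  // Worked example (illustrative): mask_bits == 1 means only lane 0 is true,
  // so CompressNot must move lanes 1..7 to the front. Row 1 of the table
  // below is {2, 4, 6, 8, 10, 12, 14, 0}: the doubled indices of lanes 1..7
  // followed by lane 0.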
alignas(16) static constexpr uint8_t table[2048] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 
0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 
12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompress32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. 
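  // Worked example (illustrative): mask_bits == 0b0110 marks lanes 1 and 2 as
  // true, so CompressNot outputs lanes {0, 3, 1, 2}. Row 6 of the byte table
  // below is {0,1,2,3, 12,13,14,15, 4,5,6,7, 8,9,10,11}, i.e. exactly those
  // lanes expressed as byte offsets.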
alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompressNot32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompress64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompressNot64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } template HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } } // namespace detail // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 
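  // In other words (explanatory note, not in the original): maskL broadcasts
  // mask[0] and maskH broadcasts mask[1], so swap = maskH & ~maskL is all-ones
  // exactly when only lane 1 is selected; Compress must then move lane 1 down
  // to index 0, which Shuffle01 (swapping the two 64-bit halves) accomplishes.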
const DFromV d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskL, maskH); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { return detail::CompressBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressNot // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. const DFromV d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskH, maskL); return IfVecThenElse(swap, Shuffle01(v), v); } template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. if (N < 16 / sizeof(T)) { return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); } return detail::CompressNotBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::CompressBits(v, mask_bits); } // ------------------------------ CompressStore, CompressBitsStore template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); BlendedStore(compressed, FirstN(d, count), d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } template HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; uint64_t mask_bits = 0; constexpr size_t kN = MaxLanes(d); constexpr size_t kNumBytes = (kN + 7) / 8; CopyBytes(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } const size_t count = PopCount(mask_bits); // Avoid _mm_maskmoveu_si128 (>500 cycle latency because it bypasses caches). 
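  // Worked example (illustrative): with 4 x u32 lanes and bits[0] == 0x05,
  // mask_bits == 0b0101 and count == 2; IndicesFromBits128 selects lanes 0
  // and 2, the unconditional StoreU below still writes the full vector, and
  // only the first `count` lanes of `unaligned` are meaningful.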
const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); detail::MaybeUnpoison(unaligned, count); return count; } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Expand // Otherwise, use the generic_ops-inl.h fallback. #if HWY_TARGET <= HWY_AVX3 || HWY_IDE // The native instructions for 8/16-bit actually require VBMI2 (HWY_AVX3_DL), // but we still want to override generic_ops-inl's table-based implementation // whenever we have the 32-bit expand provided by AVX3. #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif namespace detail { #if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 template HWY_INLINE Vec128 NativeExpand(Vec128 v, Mask128 mask) { return Vec128{_mm_maskz_expand_epi8(mask.raw, v.raw)}; } template HWY_INLINE Vec128 NativeExpand(Vec128 v, Mask128 mask) { return Vec128{_mm_maskz_expand_epi16(mask.raw, v.raw)}; } template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, const uint8_t* HWY_RESTRICT unaligned) { return VFromD{_mm_maskz_expandloadu_epi8(mask.raw, unaligned)}; } template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, const uint16_t* HWY_RESTRICT unaligned) { return VFromD{_mm_maskz_expandloadu_epi16(mask.raw, unaligned)}; } #endif // HWY_TARGET <= HWY_AVX3_DL template HWY_INLINE Vec128 NativeExpand(Vec128 v, Mask128 mask) { return Vec128{_mm_maskz_expand_epi32(mask.raw, v.raw)}; } template HWY_INLINE Vec128 NativeExpand(Vec128 v, Mask128 mask) { return Vec128{_mm_maskz_expand_epi64(mask.raw, v.raw)}; } template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, const uint32_t* HWY_RESTRICT unaligned) { return VFromD{_mm_maskz_expandloadu_epi32(mask.raw, unaligned)}; } template HWY_INLINE VFromD NativeLoadExpand(MFromD mask, D /* d */, const uint64_t* HWY_RESTRICT unaligned) { return VFromD{_mm_maskz_expandloadu_epi64(mask.raw, unaligned)}; } } // namespace detail // Otherwise, 8/16-bit are implemented in x86_512 using PromoteTo. 
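// Semantics reminder (sketch, not part of the original source): Expand is the
// inverse of Compress. With 4 x u32, v = {10, 20, 30, 40} and a mask of
// {1, 0, 1, 0}, Expand(v, m) yields {10, 0, 20, 0}: consecutive input lanes
// are scattered to the true positions and the false positions are zeroed.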
#if HWY_TARGET <= HWY_AVX3_DL || HWY_IDE // VBMI2 template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const MFromD mu = RebindMask(du, mask); return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); } #endif // HWY_TARGET <= HWY_AVX3_DL template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const MFromD mu = RebindMask(du, mask); return BitCast(d, detail::NativeExpand(BitCast(du, v), mu)); } // ------------------------------ LoadExpand template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { #if HWY_TARGET <= HWY_AVX3_DL // VBMI2 const RebindToUnsigned du; using TU = TFromD; const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); const MFromD mu = RebindMask(du, mask); return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); #else return Expand(LoadU(d, unaligned), mask); #endif } template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { #if HWY_TARGET <= HWY_AVX3 const RebindToUnsigned du; using TU = TFromD; const TU* HWY_RESTRICT pu = reinterpret_cast(unaligned); const MFromD mu = RebindMask(du, mask); return BitCast(d, detail::NativeLoadExpand(mu, du, pu)); #else return Expand(LoadU(d, unaligned), mask); #endif } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ StoreInterleaved2/3/4 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in // generic_ops-inl.h. // ------------------------------ Additional mask logical operations #if HWY_TARGET <= HWY_AVX3 namespace detail { template static HWY_INLINE uint32_t AVX3Blsi(T x) { using TU = MakeUnsigned; const auto u32_val = static_cast(static_cast(x)); #if HWY_COMPILER_CLANGCL return static_cast(u32_val & (0u - u32_val)); #else return static_cast(_blsi_u32(u32_val)); #endif } template static HWY_INLINE uint64_t AVX3Blsi(T x) { const auto u64_val = static_cast(x); #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 return static_cast(u64_val & (0ULL - u64_val)); #else return static_cast(_blsi_u64(u64_val)); #endif } template static HWY_INLINE uint32_t AVX3Blsmsk(T x) { using TU = MakeUnsigned; const auto u32_val = static_cast(static_cast(x)); #if HWY_COMPILER_CLANGCL return static_cast(u32_val ^ (u32_val - 1u)); #else return static_cast(_blsmsk_u32(u32_val)); #endif } template static HWY_INLINE uint64_t AVX3Blsmsk(T x) { const auto u64_val = static_cast(x); #if HWY_COMPILER_CLANGCL || HWY_ARCH_X86_32 return static_cast(u64_val ^ (u64_val - 1ULL)); #else return static_cast(_blsmsk_u64(u64_val)); #endif } } // namespace detail template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; return Mask128{static_cast::Raw>( (0u - detail::AVX3Blsi(mask.raw)) & kActiveElemMask)}; } template HWY_API Mask128 SetBeforeFirst(Mask128 mask) { constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; return Mask128{static_cast::Raw>( (detail::AVX3Blsi(mask.raw) - 1u) & kActiveElemMask)}; } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { constexpr uint32_t kActiveElemMask = (uint32_t{1} << N) - 1; return Mask128{static_cast::Raw>( detail::AVX3Blsmsk(mask.raw) & kActiveElemMask)}; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return Mask128{ static_cast::Raw>(detail::AVX3Blsi(mask.raw))}; } #else // AVX2 or below template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const FixedTag d; const auto vmask = VecFromMask(d, 
mask); return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Simd d; const auto vmask = VecFromMask(d, mask); const auto neg_vmask = ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); return MaskFromVec(Or(vmask, neg_vmask)); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Full128 d; const Repartition di64; const Repartition df32; const Repartition di32; using VF = VFromD; auto vmask = BitCast(di64, VecFromMask(d, mask)); vmask = Or(vmask, Neg(vmask)); // Copy the sign bit of the first int64_t lane to the second int64_t lane const auto vmask2 = BroadcastSignBit( BitCast(di32, VF{_mm_shuffle_ps(Zero(df32).raw, BitCast(df32, vmask).raw, _MM_SHUFFLE(1, 1, 0, 0))})); return MaskFromVec(BitCast(d, Or(vmask, BitCast(di64, vmask2)))); } template HWY_API Mask128 SetBeforeFirst(Mask128 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const FixedTag d; const RebindToSigned di; const auto vmask = BitCast(di, VecFromMask(d, mask)); const auto zero = Zero(di); const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); return MaskFromVec(BitCast(d, And(vmask, vmask2))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Simd d; const RebindToSigned di; const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); const auto only_first_vmask = BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); return MaskFromVec(only_first_vmask); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Full128 d; const RebindToSigned di; const Repartition di64; const auto zero = Zero(di64); const auto vmask = BitCast(di64, VecFromMask(d, mask)); const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { const FixedTag d; const RebindToSigned di; using TI = MakeSigned; return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { const Simd d; return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); } #endif // HWY_TARGET <= HWY_AVX3 // ------------------------------ Reductions // Nothing fully native, generic_ops-inl defines SumOfLanes and ReduceSum. // We provide specializations of u8x8 and u8x16, so exclude those. #undef HWY_IF_SUM_OF_LANES_D #define HWY_IF_SUM_OF_LANES_D(D) \ HWY_IF_LANES_GT_D(D, 1), \ hwy::EnableIf, uint8_t>() || \ (HWY_V_SIZE_D(D) != 8 && HWY_V_SIZE_D(D) != 16)>* = \ nullptr template HWY_API VFromD SumOfLanes(D d, VFromD v) { return Set(d, static_cast(GetLane(SumsOf8(v)) & 0xFF)); } template HWY_API VFromD SumOfLanes(D d, VFromD v) { const Repartition d64; VFromD sums = SumsOf8(v); sums = SumOfLanes(d64, sums); return Broadcast<0>(BitCast(d, sums)); } #if HWY_TARGET <= HWY_SSE4 // We provide specializations of u8x8, u8x16, and u16x8, so exclude those. 
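// Note (explanatory, not in the original): SSE4 only provides a horizontal
// minimum for u16 (_mm_minpos_epu16), so the MaxOfLanes overloads below are
// derived from it via the identity max(v) == LimitsMax - min(LimitsMax - v),
// and the u8 overloads reduce to the u16 case by widening or halving.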
#undef HWY_IF_MINMAX_OF_LANES_D #define HWY_IF_MINMAX_OF_LANES_D(D) \ HWY_IF_LANES_GT_D(D, 1), \ hwy::EnableIf<(!hwy::IsSame, uint8_t>() || \ ((HWY_V_SIZE_D(D) < 8) || (HWY_V_SIZE_D(D) > 16))) && \ (!hwy::IsSame, uint16_t>() || \ (HWY_V_SIZE_D(D) != 16))>* = nullptr template HWY_API Vec128 MinOfLanes(D /* tag */, Vec128 v) { return Broadcast<0>(Vec128{_mm_minpos_epu16(v.raw)}); } template HWY_API Vec128 MaxOfLanes(D d, Vec128 v) { const Vec128 max = Set(d, LimitsMax()); return max - MinOfLanes(d, max - v); } template HWY_API Vec64 MinOfLanes(D d, Vec64 v) { const Rebind d16; return TruncateTo(d, MinOfLanes(d16, PromoteTo(d16, v))); } template HWY_API Vec128 MinOfLanes(D d, Vec128 v) { const Half dh; Vec64 result = Min(MinOfLanes(dh, UpperHalf(dh, v)), MinOfLanes(dh, LowerHalf(dh, v))); return Combine(d, result, result); } template HWY_API Vec64 MaxOfLanes(D d, Vec64 v) { const Vec64 m(Set(d, LimitsMax())); return m - MinOfLanes(d, m - v); } template HWY_API Vec128 MaxOfLanes(D d, Vec128 v) { const Vec128 m(Set(d, LimitsMax())); return m - MinOfLanes(d, m - v); } #endif // HWY_TARGET <= HWY_SSE4 // ------------------------------ Lt128 namespace detail { // Returns vector-mask for Lt128. Generic for all vector lengths. template HWY_INLINE VFromD Lt128Vec(const D d, VFromD a, VFromD b) { // Truth table of Eq and Lt for Hi and Lo u64. // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const auto eqHL = Eq(a, b); const VFromD ltHL = VecFromMask(d, Lt(a, b)); const VFromD ltLX = ShiftLeftLanes<1>(ltHL); const VFromD vecHx = IfThenElse(eqHL, ltLX, ltHL); return InterleaveUpper(d, vecHx, vecHx); } // Returns vector-mask for Eq128. Generic for all vector lengths. template HWY_INLINE VFromD Eq128Vec(D d, VFromD a, VFromD b) { const auto eqHL = VecFromMask(d, Eq(a, b)); const auto eqLH = Reverse2(d, eqHL); return And(eqHL, eqLH); } template HWY_INLINE VFromD Ne128Vec(D d, VFromD a, VFromD b) { const auto neHL = VecFromMask(d, Ne(a, b)); const auto neLH = Reverse2(d, neHL); return Or(neHL, neLH); } template HWY_INLINE VFromD Lt128UpperVec(D d, VFromD a, VFromD b) { // No specialization required for AVX-512: Mask <-> Vec is fast, and // copying mask bits to their neighbor seems infeasible. const VFromD ltHL = VecFromMask(d, Lt(a, b)); return InterleaveUpper(d, ltHL, ltHL); } template HWY_INLINE VFromD Eq128UpperVec(D d, VFromD a, VFromD b) { // No specialization required for AVX-512: Mask <-> Vec is fast, and // copying mask bits to their neighbor seems infeasible. const VFromD eqHL = VecFromMask(d, Eq(a, b)); return InterleaveUpper(d, eqHL, eqHL); } template HWY_INLINE VFromD Ne128UpperVec(D d, VFromD a, VFromD b) { // No specialization required for AVX-512: Mask <-> Vec is fast, and // copying mask bits to their neighbor seems infeasible. 
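  // Worked example (illustrative): treating each 128-bit block as (hi, lo)
  // u64 lanes, for a = (1, 5) and b = (1, 9) the hi lanes are equal, so
  // Lt128Vec falls back to the lo comparison (5 < 9) and broadcasts "true" to
  // both lanes of the block. The *128Upper ops compare only the hi lane and
  // likewise broadcast its result.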
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return InterleaveUpper(d, neHL, neHL);
}

}  // namespace detail

template <class D>
HWY_API MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128Vec(d, a, b));
}

template <class D>
HWY_API MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128Vec(d, a, b));
}

template <class D>
HWY_API MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128Vec(d, a, b));
}

template <class D>
HWY_API MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Lt128UpperVec(d, a, b));
}

template <class D>
HWY_API MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Eq128UpperVec(d, a, b));
}

template <class D>
HWY_API MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return MaskFromVec(detail::Ne128UpperVec(d, a, b));
}

// ------------------------------ Min128, Max128 (Lt128)

// Avoids the extra MaskFromVec in Lt128.
template <class D>
HWY_API VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D>
HWY_API VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D>
HWY_API VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D>
HWY_API VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#if HWY_TARGET <= HWY_AVX3

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_UI32(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi32(v.raw)};
}

template <class V, HWY_IF_UI64(TFromV<V>), HWY_IF_V_SIZE_LE_D(DFromV<V>, 16)>
HWY_API V LeadingZeroCount(V v) {
  return V{_mm_lzcnt_epi64(v.raw)};
}

// HighestSetBitIndex and TrailingZeroCount are implemented in x86_512-inl.h
// for AVX3 targets.
#endif  // HWY_TARGET <= HWY_AVX3

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#undef HWY_X86_IF_EMULATED_D

// Note that the GCC warnings are not suppressed if we only wrap the *intrin.h -
// the warning seems to be issued at the call site of intrinsics, i.e. our code.
HWY_DIAGNOSTICS(pop)