// Copyright 2023 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit vectors for VSX/Z14
// External include guard in highway.h - see comment there.

#if HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
#define HWY_S390X_HAVE_Z14 1
#else
#define HWY_S390X_HAVE_Z14 0
#endif

#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if HWY_S390X_HAVE_Z14
#include <vecintrin.h>
#else
#include <altivec.h>
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include "hwy/ops/shared-inl.h"

// clang's altivec.h gates some intrinsics behind #ifdef __POWER10_VECTOR__, and
// some GCC do the same for _ARCH_PWR10.
// This means we can only use POWER10-specific intrinsics in static dispatch
// mode (where the -mpower10-vector compiler flag is passed). Same for PPC9.
// On other compilers, the usual target check is sufficient.
#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC9 && \
    (defined(_ARCH_PWR9) || defined(__POWER9_VECTOR__))
#define HWY_PPC_HAVE_9 1
#else
#define HWY_PPC_HAVE_9 0
#endif

#if !HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_PPC10 && \
    (defined(_ARCH_PWR10) || defined(__POWER10_VECTOR__))
#define HWY_PPC_HAVE_10 1
#else
#define HWY_PPC_HAVE_10 0
#endif

#if HWY_S390X_HAVE_Z14 && HWY_TARGET <= HWY_Z15 && __ARCH__ >= 13
#define HWY_S390X_HAVE_Z15 1
#else
#define HWY_S390X_HAVE_Z15 0
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {
namespace detail {

template <typename T>
struct Raw128;

// Each Raw128 specialization defines the following typedefs:
// - type:
//     the backing Altivec/VSX raw vector type of the Vec128<T, N> type
// - RawBoolVec:
//     the backing Altivec/VSX raw __bool vector type of the Mask128<T, N> type
// - RawT:
//     the lane type for intrinsics, in particular vec_splat
// - AlignedRawVec:
//     the 128-bit GCC/Clang vector type for aligned loads/stores
// - UnalignedRawVec:
//     the 128-bit GCC/Clang vector type for unaligned loads/stores
#define HWY_VSX_RAW128(LANE_TYPE, RAW_VECT_LANE_TYPE, RAW_BOOL_VECT_LANE_TYPE) \
  template <>                                                                  \
  struct Raw128<LANE_TYPE> {                                                   \
    using type = __vector RAW_VECT_LANE_TYPE;                                  \
    using RawBoolVec = __vector __bool RAW_BOOL_VECT_LANE_TYPE;                \
    using RawT = RAW_VECT_LANE_TYPE;                                           \
    typedef LANE_TYPE AlignedRawVec                                            \
        __attribute__((__vector_size__(16), __aligned__(16), __may_alias__));  \
    typedef LANE_TYPE UnalignedRawVec __attribute__((                          \
        __vector_size__(16), __aligned__(alignof(LANE_TYPE)), __may_alias__)); \
  };

HWY_VSX_RAW128(int8_t, signed char, char)
HWY_VSX_RAW128(uint8_t, unsigned char, char)
HWY_VSX_RAW128(int16_t, signed short, short)     // NOLINT(runtime/int)
HWY_VSX_RAW128(uint16_t, unsigned short, short)  // NOLINT(runtime/int)
HWY_VSX_RAW128(int32_t, signed int, int)
HWY_VSX_RAW128(uint32_t, unsigned int, int)
HWY_VSX_RAW128(int64_t, signed long long, long long)     // NOLINT(runtime/int)
HWY_VSX_RAW128(uint64_t, unsigned long long, long long)  // NOLINT(runtime/int)
HWY_VSX_RAW128(float, float, int)
HWY_VSX_RAW128(double, double, long long)  //
NOLINT(runtime/int) template <> struct Raw128 : public Raw128 {}; template <> struct Raw128 : public Raw128 {}; #undef HWY_VSX_RAW128 } // namespace detail template class Vec128 { using Raw = typename detail::Raw128::type; public: using PrivateT = T; // only for DFromV static constexpr size_t kPrivateN = N; // only for DFromV // Compound assignment. Only usable if there is a corresponding non-member // binary operator overload. For example, only f32 and f64 support division. HWY_INLINE Vec128& operator*=(const Vec128 other) { return *this = (*this * other); } HWY_INLINE Vec128& operator/=(const Vec128 other) { return *this = (*this / other); } HWY_INLINE Vec128& operator+=(const Vec128 other) { return *this = (*this + other); } HWY_INLINE Vec128& operator-=(const Vec128 other) { return *this = (*this - other); } HWY_INLINE Vec128& operator%=(const Vec128 other) { return *this = (*this % other); } HWY_INLINE Vec128& operator&=(const Vec128 other) { return *this = (*this & other); } HWY_INLINE Vec128& operator|=(const Vec128 other) { return *this = (*this | other); } HWY_INLINE Vec128& operator^=(const Vec128 other) { return *this = (*this ^ other); } Raw raw; }; template using Vec64 = Vec128; template using Vec32 = Vec128; template using Vec16 = Vec128; // FF..FF or 0. template struct Mask128 { typename detail::Raw128::RawBoolVec raw; using PrivateT = T; // only for DFromM static constexpr size_t kPrivateN = N; // only for DFromM }; template using DFromV = Simd; template using DFromM = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ Zero // Returns an all-zero vector/part. template > HWY_API Vec128 Zero(D /* tag */) { // There is no vec_splats for 64-bit, so we cannot rely on casting the 0 // argument in order to select the correct overload. We instead cast the // return vector type; see also the comment in BitCast. return Vec128{ reinterpret_cast::type>(vec_splats(0))}; } template using VFromD = decltype(Zero(D())); // ------------------------------ Tuple (VFromD) #include "hwy/ops/tuple-inl.h" // ------------------------------ BitCast template HWY_API VFromD BitCast(D /*d*/, Vec128().MaxLanes()> v) { // C-style casts are not sufficient when compiling with // -fno-lax-vector-conversions, which will be the future default in Clang, // but reinterpret_cast is. return VFromD{ reinterpret_cast>::type>(v.raw)}; } // ------------------------------ ResizeBitCast template HWY_API VFromD ResizeBitCast(D /*d*/, FromV v) { // C-style casts are not sufficient when compiling with // -fno-lax-vector-conversions, which will be the future default in Clang, // but reinterpret_cast is. return VFromD{ reinterpret_cast>::type>(v.raw)}; } // ------------------------------ Set // Returns a vector/part with all lanes set to "t". template )> HWY_API VFromD Set(D /* tag */, TFromD t) { using RawLane = typename detail::Raw128>::RawT; return VFromD{vec_splats(static_cast(t))}; } template )> HWY_API VFromD Set(D d, TFromD t) { const RebindToUnsigned du; return BitCast(d, Set(du, BitCastScalar>(t))); } // Returns a vector with uninitialized elements. template HWY_API VFromD Undefined(D d) { #if HWY_COMPILER_GCC_ACTUAL // Suppressing maybe-uninitialized both here and at the caller does not work, // so initialize. 
return Zero(d); #else HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") typename detail::Raw128>::type raw; return VFromD{raw}; HWY_DIAGNOSTICS(pop) #endif } // ------------------------------ GetLane // Gets the single value stored in a vector/part. template HWY_API T GetLane(Vec128 v) { return static_cast(v.raw[0]); } // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { const typename detail::Raw128>::type raw = { t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15}; return VFromD{raw}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const typename detail::Raw128>::type raw = {t0, t1, t2, t3, t4, t5, t6, t7}; return VFromD{raw}; } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToUnsigned du; return BitCast( d, Dup128VecFromValues( du, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { const typename detail::Raw128>::type raw = {t0, t1, t2, t3}; return VFromD{raw}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { const typename detail::Raw128>::type raw = {t0, t1}; return VFromD{raw}; } // ================================================== LOGICAL // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; #if HWY_S390X_HAVE_Z14 return BitCast(d, VU{BitCast(du, a).raw & BitCast(du, b).raw}); #else return BitCast(d, VU{vec_and(BitCast(du, a).raw, BitCast(du, b).raw)}); #endif } // ------------------------------ AndNot // Returns ~not_mask & mask. 
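// Illustrative sketch (not part of this header; names d/v are assumptions):
// AndNot(not_mask, mask) clears the bits selected by not_mask. With
// d = Full128<uint32_t>:
//   const auto v = Set(d, 0xFFu);
//   const auto low_nibble = Set(d, 0x0Fu);
//   const auto upper = AndNot(low_nibble, v);  // 0xF0u in every lane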
template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; return BitCast( d, VU{vec_andc(BitCast(du, mask).raw, BitCast(du, not_mask).raw)}); } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; #if HWY_S390X_HAVE_Z14 return BitCast(d, VU{BitCast(du, a).raw | BitCast(du, b).raw}); #else return BitCast(d, VU{vec_or(BitCast(du, a).raw, BitCast(du, b).raw)}); #endif } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; #if HWY_S390X_HAVE_Z14 return BitCast(d, VU{BitCast(du, a).raw ^ BitCast(du, b).raw}); #else return BitCast(d, VU{vec_xor(BitCast(du, a).raw, BitCast(du, b).raw)}); #endif } // ------------------------------ Not template HWY_API Vec128 Not(Vec128 v) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; return BitCast(d, VU{vec_nor(BitCast(du, v).raw, BitCast(du, v).raw)}); } // ------------------------------ IsConstantRawAltivecVect namespace detail { template static HWY_INLINE bool IsConstantRawAltivecVect( hwy::SizeTag<1> /* lane_size_tag */, RawV v) { return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]) && __builtin_constant_p(v[8]) && __builtin_constant_p(v[9]) && __builtin_constant_p(v[10]) && __builtin_constant_p(v[11]) && __builtin_constant_p(v[12]) && __builtin_constant_p(v[13]) && __builtin_constant_p(v[14]) && __builtin_constant_p(v[15]); } template static HWY_INLINE bool IsConstantRawAltivecVect( hwy::SizeTag<2> /* lane_size_tag */, RawV v) { return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]) && __builtin_constant_p(v[4]) && __builtin_constant_p(v[5]) && __builtin_constant_p(v[6]) && __builtin_constant_p(v[7]); } template static HWY_INLINE bool IsConstantRawAltivecVect( hwy::SizeTag<4> /* lane_size_tag */, RawV v) { return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]) && __builtin_constant_p(v[2]) && __builtin_constant_p(v[3]); } template static HWY_INLINE bool IsConstantRawAltivecVect( hwy::SizeTag<8> /* lane_size_tag */, RawV v) { return __builtin_constant_p(v[0]) && __builtin_constant_p(v[1]); } template static HWY_INLINE bool IsConstantRawAltivecVect(RawV v) { return IsConstantRawAltivecVect(hwy::SizeTag(), v); } } // namespace detail // ------------------------------ TernaryLogic #if HWY_PPC_HAVE_10 namespace detail { // NOTE: the kTernLogOp bits of the PPC10 TernaryLogic operation are in reverse // order of the kTernLogOp bits of AVX3 // _mm_ternarylogic_epi64(a, b, c, kTernLogOp) template HWY_INLINE V TernaryLogic(V a, V b, V c) { const DFromV d; const RebindToUnsigned du; using VU = VFromD; const auto a_raw = BitCast(du, a).raw; const auto b_raw = BitCast(du, b).raw; const auto c_raw = BitCast(du, c).raw; #if HWY_COMPILER_GCC_ACTUAL // Use inline assembly on GCC to work around GCC compiler bug typename detail::Raw128>::type raw_ternlog_result; __asm__("xxeval %x0,%x1,%x2,%x3,%4" : "=wa"(raw_ternlog_result) : "wa"(a_raw), "wa"(b_raw), "wa"(c_raw), "n"(static_cast(kTernLogOp)) :); #else const auto raw_ternlog_result = vec_ternarylogic(a_raw, b_raw, c_raw, kTernLogOp); #endif return BitCast(d, VU{raw_ternlog_result}); } } // 
namespace detail #endif // HWY_PPC_HAVE_10 // ------------------------------ Xor3 template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { #if HWY_PPC_HAVE_10 #if defined(__OPTIMIZE__) if (static_cast(detail::IsConstantRawAltivecVect(x1.raw)) + static_cast(detail::IsConstantRawAltivecVect(x2.raw)) + static_cast(detail::IsConstantRawAltivecVect(x3.raw)) >= 2) { return Xor(x1, Xor(x2, x3)); } else // NOLINT #endif { return detail::TernaryLogic<0x69>(x1, x2, x3); } #else return Xor(x1, Xor(x2, x3)); #endif } // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { #if HWY_PPC_HAVE_10 #if defined(__OPTIMIZE__) if (static_cast(detail::IsConstantRawAltivecVect(o1.raw)) + static_cast(detail::IsConstantRawAltivecVect(o2.raw)) + static_cast(detail::IsConstantRawAltivecVect(o3.raw)) >= 2) { return Or(o1, Or(o2, o3)); } else // NOLINT #endif { return detail::TernaryLogic<0x7F>(o1, o2, o3); } #else return Or(o1, Or(o2, o3)); #endif } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { #if HWY_PPC_HAVE_10 #if defined(__OPTIMIZE__) if (detail::IsConstantRawAltivecVect(a1.raw) && detail::IsConstantRawAltivecVect(a2.raw)) { return Or(o, And(a1, a2)); } else // NOLINT #endif { return detail::TernaryLogic<0x1F>(o, a1, a2); } #else return Or(o, And(a1, a2)); #endif } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { const DFromV d; const RebindToUnsigned du; return BitCast( d, VFromD{vec_sel(BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, mask).raw)}); } // ------------------------------ BitwiseIfThenElse #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return IfVecThenElse(mask, yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(Vec128 a, Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(Vec128 a, Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(Vec128 a, Vec128 b) { return Xor(a, b); } // ================================================== SIGN // ------------------------------ Neg template HWY_API Vec128 Neg(Vec128 v) { // If T is an signed integer type, use Zero(d) - v instead of vec_neg to // avoid undefined behavior in the case where v[i] == LimitsMin() const DFromV d; return Zero(d) - v; } template HWY_API Vec128 Neg(Vec128 v) { #if HWY_S390X_HAVE_Z14 return Xor(v, SignBit(DFromV())); #else return Vec128{vec_neg(v.raw)}; #endif } template HWY_API Vec128 Neg(const Vec128 v) { return Xor(v, SignBit(DFromV())); } // ------------------------------ Abs // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. template HWY_API Vec128 Abs(Vec128 v) { // If T is a signed integer type, use Max(v, Neg(v)) instead of vec_abs to // avoid undefined behavior in the case where v[i] == LimitsMin(). return Max(v, Neg(v)); } template HWY_API Vec128 Abs(Vec128 v) { return Vec128{vec_abs(v.raw)}; } // ------------------------------ CopySign #if HWY_S390X_HAVE_Z14 template HWY_API V CopySign(const V magn, const V sign) { static_assert(IsFloat>(), "Only makes sense for floating-point"); const DFromV d; const auto msb = SignBit(d); // Truth table for msb, magn, sign | bitwise msb ? 
sign : mag // 0 0 0 | 0 // 0 0 1 | 0 // 0 1 0 | 1 // 0 1 1 | 1 // 1 0 0 | 0 // 1 0 1 | 1 // 1 1 0 | 0 // 1 1 1 | 1 return BitwiseIfThenElse(msb, sign, magn); } #else // VSX template HWY_API Vec128 CopySign(Vec128 magn, Vec128 sign) { // Work around compiler bugs that are there with vec_cpsgn on older versions // of GCC/Clang #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 return Vec128{__builtin_vec_copysign(magn.raw, sign.raw)}; #elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgnsp) return Vec128{__builtin_vsx_xvcpsgnsp(magn.raw, sign.raw)}; #else return Vec128{vec_cpsgn(sign.raw, magn.raw)}; #endif } template HWY_API Vec128 CopySign(Vec128 magn, Vec128 sign) { // Work around compiler bugs that are there with vec_cpsgn on older versions // of GCC/Clang #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 return Vec128{__builtin_vec_copysign(magn.raw, sign.raw)}; #elif HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 && \ HWY_HAS_BUILTIN(__builtin_vsx_xvcpsgndp) return Vec128{__builtin_vsx_xvcpsgndp(magn.raw, sign.raw)}; #else return Vec128{vec_cpsgn(sign.raw, magn.raw)}; #endif } #endif // HWY_S390X_HAVE_Z14 template HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { // PPC8 can also handle abs < 0, so no extra action needed. static_assert(IsFloat(), "Only makes sense for floating-point"); return CopySign(abs, sign); } // ================================================== MEMORY (1) // Note: type punning is safe because the types are tagged with may_alias. // (https://godbolt.org/z/fqrWjfjsP) // ------------------------------ Load template > HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { using LoadRaw = typename detail::Raw128::AlignedRawVec; const LoadRaw* HWY_RESTRICT p = HWY_RCAST_ALIGNED(const LoadRaw*, aligned); using ResultRaw = typename detail::Raw128::type; return Vec128{reinterpret_cast(*p)}; } // Any <= 64 bit template > HWY_API VFromD Load(D d, const T* HWY_RESTRICT p) { using BitsT = UnsignedFromSize; BitsT bits; const Repartition d_bits; CopyBytes(p, &bits); return BitCast(d, Set(d_bits, bits)); } // ================================================== MASK // ------------------------------ Mask // Mask and Vec are both backed by vector types (true = FF..FF). template HWY_API Mask128 MaskFromVec(Vec128 v) { using Raw = typename detail::Raw128::RawBoolVec; return Mask128{reinterpret_cast(v.raw)}; } template using MFromD = decltype(MaskFromVec(VFromD())); template HWY_API Vec128 VecFromMask(Mask128 v) { return Vec128{ reinterpret_cast::type>(v.raw)}; } template HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { return VFromD{ reinterpret_cast>::type>(v.raw)}; } // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { const DFromV d; const RebindToUnsigned du; return BitCast(d, VFromD{vec_sel( BitCast(du, no).raw, BitCast(du, yes).raw, mask.raw)}); } // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } // ------------------------------ Mask logical template HWY_API Mask128 Not(Mask128 m) { return Mask128{vec_nor(m.raw, m.raw)}; } template HWY_API Mask128 And(Mask128 a, Mask128 b) { #if HWY_S390X_HAVE_Z14 return Mask128{a.raw & b.raw}; #else return Mask128{vec_and(a.raw, b.raw)}; #endif } template HWY_API Mask128 AndNot(Mask128 a, Mask128 b) { return Mask128{vec_andc(b.raw, a.raw)}; } template HWY_API Mask128 Or(Mask128 a, Mask128 b) { #if HWY_S390X_HAVE_Z14 return Mask128{a.raw | b.raw}; #else return Mask128{vec_or(a.raw, b.raw)}; #endif } template HWY_API Mask128 Xor(Mask128 a, Mask128 b) { #if HWY_S390X_HAVE_Z14 return Mask128{a.raw ^ b.raw}; #else return Mask128{vec_xor(a.raw, b.raw)}; #endif } template HWY_API Mask128 ExclusiveNeither(Mask128 a, Mask128 b) { return Mask128{vec_nor(a.raw, b.raw)}; } // ------------------------------ ShiftLeftSame template HWY_API Vec128 ShiftLeftSame(Vec128 v, const int bits) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; #if HWY_S390X_HAVE_Z14 return BitCast(d, VFromD{BitCast(du, v).raw << Set(du, static_cast(bits)).raw}); #else // Do an unsigned vec_sl operation to avoid undefined behavior return BitCast( d, VFromD{ vec_sl(BitCast(du, v).raw, Set(du, static_cast(bits)).raw)}); #endif } // ------------------------------ ShiftRightSame template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { using TU = typename detail::Raw128>::RawT; #if HWY_S390X_HAVE_Z14 return Vec128{v.raw >> vec_splats(static_cast(bits))}; #else return Vec128{vec_sr(v.raw, vec_splats(static_cast(bits)))}; #endif } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { #if HWY_S390X_HAVE_Z14 using TI = typename detail::Raw128::RawT; return Vec128{v.raw >> vec_splats(static_cast(bits))}; #else using TU = typename detail::Raw128>::RawT; return Vec128{vec_sra(v.raw, vec_splats(static_cast(bits)))}; #endif } // ------------------------------ ShiftLeft template HWY_API Vec128 ShiftLeft(Vec128 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return ShiftLeftSame(v, kBits); } // ------------------------------ ShiftRight template HWY_API Vec128 ShiftRight(Vec128 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return ShiftRightSame(v, kBits); } // ------------------------------ BroadcastSignBit template HWY_API Vec128 BroadcastSignBit(Vec128 v) { return ShiftRightSame(v, static_cast(sizeof(T) * 8 - 1)); } // ================================================== SWIZZLE (1) // ------------------------------ TableLookupBytes template HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) { const Repartition> du8_from; return Vec128{reinterpret_cast::type>( vec_perm(bytes.raw, bytes.raw, BitCast(du8_from, from).raw))}; } // ------------------------------ TableLookupBytesOr0 // For all vector widths; Altivec/VSX needs zero out template HWY_API VI TableLookupBytesOr0(const V bytes, const VI from) { const DFromV di; Repartition di8; const VI zeroOutMask = BitCast(di, BroadcastSignBit(BitCast(di8, from))); return AndNot(zeroOutMask, TableLookupBytes(bytes, from)); } // ------------------------------ Reverse template , HWY_IF_LANES_GT_D(D, 1)> HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { return Vec128{vec_reve(v.raw)}; } // ------------------------------ Shuffles (Reverse) // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). 
// Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. template HWY_API Vec128 Shuffle2301(Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}; return Vec128{vec_perm(v.raw, v.raw, kShuffle)}; } // These are used by generic_ops-inl to implement LoadInterleaved3. As with // Intel's shuffle* intrinsics and InterleaveLower, the lower half of the output // comes from the first argument. namespace detail { template HWY_API Vec32 ShuffleTwo2301(Vec32 a, Vec32 b) { const __vector unsigned char kShuffle16 = {1, 0, 19, 18}; return Vec32{vec_perm(a.raw, b.raw, kShuffle16)}; } template HWY_API Vec64 ShuffleTwo2301(Vec64 a, Vec64 b) { const __vector unsigned char kShuffle = {2, 3, 0, 1, 22, 23, 20, 21}; return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec128 ShuffleTwo2301(Vec128 a, Vec128 b) { const __vector unsigned char kShuffle = {4, 5, 6, 7, 0, 1, 2, 3, 28, 29, 30, 31, 24, 25, 26, 27}; return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec32 ShuffleTwo1230(Vec32 a, Vec32 b) { const __vector unsigned char kShuffle = {0, 3, 18, 17}; return Vec32{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec64 ShuffleTwo1230(Vec64 a, Vec64 b) { const __vector unsigned char kShuffle = {0, 1, 6, 7, 20, 21, 18, 19}; return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec128 ShuffleTwo1230(Vec128 a, Vec128 b) { const __vector unsigned char kShuffle = {0, 1, 2, 3, 12, 13, 14, 15, 24, 25, 26, 27, 20, 21, 22, 23}; return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec32 ShuffleTwo3012(Vec32 a, Vec32 b) { const __vector unsigned char kShuffle = {2, 1, 16, 19}; return Vec32{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec64 ShuffleTwo3012(Vec64 a, Vec64 b) { const __vector unsigned char kShuffle = {4, 5, 2, 3, 16, 17, 22, 23}; return Vec64{vec_perm(a.raw, b.raw, kShuffle)}; } template HWY_API Vec128 ShuffleTwo3012(Vec128 a, Vec128 b) { const __vector unsigned char kShuffle = {8, 9, 10, 11, 4, 5, 6, 7, 16, 17, 18, 19, 28, 29, 30, 31}; return Vec128{vec_perm(a.raw, b.raw, kShuffle)}; } } // namespace detail // Swap 64-bit halves template HWY_API Vec128 Shuffle1032(Vec128 v) { const Full128 d; const Full128 du64; return BitCast(d, Reverse(du64, BitCast(du64, v))); } template HWY_API Vec128 Shuffle01(Vec128 v) { return Reverse(Full128(), v); } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(Vec128 v) { #if HWY_IS_LITTLE_ENDIAN return Vec128{vec_sld(v.raw, v.raw, 12)}; #else return Vec128{vec_sld(v.raw, v.raw, 4)}; #endif } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(Vec128 v) { #if HWY_IS_LITTLE_ENDIAN return Vec128{vec_sld(v.raw, v.raw, 4)}; #else return Vec128{vec_sld(v.raw, v.raw, 12)}; #endif } template HWY_API Vec128 Shuffle0123(Vec128 v) { return Reverse(Full128(), v); } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. 
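// Illustrative sketch (not part of this header; names d/a/b/m are
// assumptions): comparison results are masks and can drive IfThenElse.
// With d = Full128<int32_t>:
//   const auto a = Iota(d, 0);                 // 0, 1, 2, 3
//   const auto b = Set(d, 2);
//   const auto m = a < b;                      // lanes 0 and 1 are true
//   const auto clamped = IfThenElse(m, b, a);  // 2, 2, 2, 3 (clamped below)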
template HWY_API MFromD RebindMask(DTo /*dto*/, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); return MFromD{m.raw}; } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{vec_cmpeq(a.raw, b.raw)}; } // ------------------------------ Inequality // This cannot have T as a template argument, otherwise it is not more // specialized than rewritten operator== in C++20, leading to compile // errors: https://gcc.godbolt.org/z/xsrPhPvPT. template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_9 return Mask128{vec_cmpne(a.raw, b.raw)}; #else return Not(a == b); #endif } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } template HWY_API Mask128 operator!=(Vec128 a, Vec128 b) { return Not(a == b); } // ------------------------------ Strict inequality template HWY_INLINE Mask128 operator>(Vec128 a, Vec128 b) { return Mask128{vec_cmpgt(a.raw, b.raw)}; } // ------------------------------ Weak inequality template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Mask128{vec_cmpge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return Not(b > a); } // ------------------------------ Reversed comparisons template HWY_API Mask128 operator<(Vec128 a, Vec128 b) { return b > a; } template HWY_API Mask128 operator<=(Vec128 a, Vec128 b) { return b >= a; } // ================================================== MEMORY (2) // ------------------------------ Load template > HWY_API Vec128 LoadU(D /* tag */, const T* HWY_RESTRICT p) { using LoadRaw = typename detail::Raw128::UnalignedRawVec; const LoadRaw* HWY_RESTRICT praw = reinterpret_cast(p); using ResultRaw = typename detail::Raw128::type; return Vec128{reinterpret_cast(*praw)}; } // For < 128 bit, LoadU == Load. template > HWY_API VFromD LoadU(D d, const T* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
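// Illustrative sketch (not part of this header; names d/in/out are
// assumptions): Load expects a 16-byte-aligned pointer, LoadU does not.
// With d = Full128<float> and in pointing to at least 4 valid floats:
//   const auto v = LoadU(d, in);  // unaligned load
//   alignas(16) float out[4];
//   Store(v, d, out);             // aligned store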
template > HWY_API VFromD LoadDup128(D d, const T* HWY_RESTRICT p) { return LoadU(d, p); } #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14 #ifdef HWY_NATIVE_LOAD_N #undef HWY_NATIVE_LOAD_N #else #define HWY_NATIVE_LOAD_N #endif template > HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, size_t max_lanes_to_load) { #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) { return Zero(d); } if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) && max_lanes_to_load >= HWY_MAX_LANES_D(D)) { return LoadU(d, p); } #endif const size_t num_of_bytes_to_load = HWY_MIN(max_lanes_to_load, HWY_MAX_LANES_D(D)) * sizeof(TFromD); const Repartition du8; #if HWY_S390X_HAVE_Z14 return (num_of_bytes_to_load > 0) ? BitCast(d, VFromD{vec_load_len( const_cast( reinterpret_cast(p)), static_cast(num_of_bytes_to_load - 1))}) : Zero(d); #else return BitCast( d, VFromD{vec_xl_len( const_cast(reinterpret_cast(p)), num_of_bytes_to_load)}); #endif } template > HWY_API VFromD LoadNOr(VFromD no, D d, const T* HWY_RESTRICT p, size_t max_lanes_to_load) { #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(max_lanes_to_load) && max_lanes_to_load == 0) { return no; } if (__builtin_constant_p(max_lanes_to_load >= HWY_MAX_LANES_D(D)) && max_lanes_to_load >= HWY_MAX_LANES_D(D)) { return LoadU(d, p); } #endif return IfThenElse(FirstN(d, max_lanes_to_load), LoadN(d, p, max_lanes_to_load), no); } #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14 // Returns a vector with lane i=[0, N) set to "first" + i. namespace detail { template HWY_INLINE VFromD Iota0(D d) { constexpr __vector unsigned char kU8Iota0 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; return BitCast(d, VFromD>{kU8Iota0}); } template HWY_INLINE VFromD Iota0(D d) { constexpr __vector unsigned short kU16Iota0 = {0, 1, 2, 3, 4, 5, 6, 7}; return BitCast(d, VFromD>{kU16Iota0}); } template HWY_INLINE VFromD Iota0(D d) { constexpr __vector unsigned int kU32Iota0 = {0, 1, 2, 3}; return BitCast(d, VFromD>{kU32Iota0}); } template HWY_INLINE VFromD Iota0(D d) { constexpr __vector unsigned long long kU64Iota0 = {0, 1}; return BitCast(d, VFromD>{kU64Iota0}); } template HWY_INLINE VFromD Iota0(D /*d*/) { constexpr __vector float kF32Iota0 = {0.0f, 1.0f, 2.0f, 3.0f}; return VFromD{kF32Iota0}; } template HWY_INLINE VFromD Iota0(D /*d*/) { constexpr __vector double kF64Iota0 = {0.0, 1.0}; return VFromD{kF64Iota0}; } } // namespace detail template HWY_API VFromD Iota(D d, const T2 first) { return detail::Iota0(d) + Set(d, static_cast>(first)); } // ------------------------------ FirstN (Iota, Lt) template HWY_API MFromD FirstN(D d, size_t num) { const RebindToUnsigned du; using TU = TFromD; return RebindMask(d, Iota(du, 0) < Set(du, static_cast(num))); } // ------------------------------ MaskedLoad template > HWY_API VFromD MaskedLoad(MFromD m, D d, const T* HWY_RESTRICT p) { return IfThenElseZero(m, LoadU(d, p)); } // ------------------------------ MaskedLoadOr template > HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const T* HWY_RESTRICT p) { return IfThenElse(m, LoadU(d, p), v); } // ------------------------------ Store template > HWY_API void Store(Vec128 v, D /* tag */, T* HWY_RESTRICT aligned) { using StoreRaw = typename detail::Raw128::AlignedRawVec; *HWY_RCAST_ALIGNED(StoreRaw*, aligned) = reinterpret_cast(v.raw); } template > HWY_API void StoreU(Vec128 v, D /* tag */, T* HWY_RESTRICT p) { using StoreRaw = typename detail::Raw128::UnalignedRawVec; 
*reinterpret_cast(p) = reinterpret_cast(v.raw); } template > HWY_API void Store(VFromD v, D d, T* HWY_RESTRICT p) { using BitsT = UnsignedFromSize; const Repartition d_bits; const BitsT bits = GetLane(BitCast(d_bits, v)); CopyBytes(&bits, p); } // For < 128 bit, StoreU == Store. template > HWY_API void StoreU(VFromD v, D d, T* HWY_RESTRICT p) { Store(v, d, p); } #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14 #ifdef HWY_NATIVE_STORE_N #undef HWY_NATIVE_STORE_N #else #define HWY_NATIVE_STORE_N #endif template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { #if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(max_lanes_to_store) && max_lanes_to_store == 0) { return; } if (__builtin_constant_p(max_lanes_to_store >= HWY_MAX_LANES_D(D)) && max_lanes_to_store >= HWY_MAX_LANES_D(D)) { StoreU(v, d, p); return; } #endif const size_t num_of_bytes_to_store = HWY_MIN(max_lanes_to_store, HWY_MAX_LANES_D(D)) * sizeof(TFromD); const Repartition du8; #if HWY_S390X_HAVE_Z14 if (num_of_bytes_to_store > 0) { vec_store_len(BitCast(du8, v).raw, reinterpret_cast(p), static_cast(num_of_bytes_to_store - 1)); } #else vec_xst_len(BitCast(du8, v).raw, reinterpret_cast(p), num_of_bytes_to_store); #endif } #endif // ------------------------------ BlendedStore template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { const RebindToSigned di; // for testing mask if T=bfloat16_t. using TI = TFromD; alignas(16) TI buf[MaxLanes(d)]; alignas(16) TI mask[MaxLanes(d)]; Store(BitCast(di, v), di, buf); Store(BitCast(di, VecFromMask(d, m)), di, mask); for (size_t i = 0; i < MaxLanes(d); ++i) { if (mask[i]) { CopySameSize(buf + i, p + i); } } } // ================================================== ARITHMETIC namespace detail { // If TFromD is an integer type, detail::RebindToUnsignedIfNotFloat // rebinds D to MakeUnsigned>. // Otherwise, if TFromD is a floating-point type (including F16 and BF16), // detail::RebindToUnsignedIfNotFloat is the same as D. 
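// For example (informal): detail::RebindToUnsignedIfNotFloat<Full128<int32_t>>
// is Full128<uint32_t>, whereas for Full128<float> it remains Full128<float>.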
template using RebindToUnsignedIfNotFloat = hwy::If<(!hwy::IsFloat>() && !hwy::IsSpecialFloat>()), RebindToUnsigned, D>; } // namespace detail // ------------------------------ Addition template HWY_API Vec128 operator+(Vec128 a, Vec128 b) { const DFromV d; const detail::RebindToUnsignedIfNotFloat d_arith; // If T is an integer type, do an unsigned vec_add to avoid undefined behavior #if HWY_S390X_HAVE_Z14 return BitCast(d, VFromD{BitCast(d_arith, a).raw + BitCast(d_arith, b).raw}); #else return BitCast(d, VFromD{vec_add( BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)}); #endif } // ------------------------------ Subtraction template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { const DFromV d; const detail::RebindToUnsignedIfNotFloat d_arith; // If T is an integer type, do an unsigned vec_sub to avoid undefined behavior #if HWY_S390X_HAVE_Z14 return BitCast(d, VFromD{BitCast(d_arith, a).raw - BitCast(d_arith, b).raw}); #else return BitCast(d, VFromD{vec_sub( BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)}); #endif } // ------------------------------ SumsOf8 template )> HWY_API VFromD>> SumsOf8(V v) { return SumsOf2(SumsOf4(v)); } template )> HWY_API VFromD>> SumsOf8(V v) { #if HWY_S390X_HAVE_Z14 const DFromV di8; const RebindToUnsigned du8; const RepartitionToWideX3 di64; return BitCast(di64, SumsOf8(BitCast(du8, Xor(v, SignBit(di8))))) + Set(di64, int64_t{-1024}); #else return SumsOf2(SumsOf4(v)); #endif } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. #if HWY_S390X_HAVE_Z14 // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedAdd instructions unlike most // other integer SIMD instruction sets template HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { return Add(a, Min(b, Not(a))); } template HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } #else // VSX #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB #undef HWY_NATIVE_U32_SATURATED_ADDSUB #else #define HWY_NATIVE_U32_SATURATED_ADDSUB #endif template HWY_API Vec128 SaturatedAdd(Vec128 a, Vec128 b) { return Vec128{vec_adds(a.raw, b.raw)}; } #endif // HWY_S390X_HAVE_Z14 #if HWY_PPC_HAVE_10 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = BroadcastSignBit(detail::TernaryLogic<0x42>(a, b, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } #endif // HWY_PPC_HAVE_10 // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. 
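// Illustrative sketch (not part of this header; names d/a/b/r are
// assumptions): unsigned lanes clamp at zero instead of wrapping.
// With d = Full128<uint8_t>:
//   const auto a = Set(d, uint8_t{10});
//   const auto b = Set(d, uint8_t{20});
//   const auto r = SaturatedSub(a, b);  // 0 in every lane, not 246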
#if HWY_S390X_HAVE_Z14 // Z14/Z15/Z16 does not have I8/U8/I16/U16 SaturatedSub instructions unlike most // other integer SIMD instruction sets template HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { return Sub(a, Min(a, b)); } template HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #else // VSX template HWY_API Vec128 SaturatedSub(Vec128 a, Vec128 b) { return Vec128{vec_subs(a.raw, b.raw)}; } #endif // HWY_S390X_HAVE_Z14 #if HWY_PPC_HAVE_10 template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = BroadcastSignBit(detail::TernaryLogic<0x18>(a, b, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_PPC_HAVE_10 // ------------------------------ AverageRound // Returns (a + b + 1) / 2 template HWY_API Vec128 AverageRound(Vec128 a, Vec128 b) { return Vec128{vec_avg(a.raw, b.raw)}; } // ------------------------------ Multiplication // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { const DFromV d; const detail::RebindToUnsignedIfNotFloat d_arith; // If T is an integer type, do an unsigned vec_mul to avoid undefined behavior #if HWY_S390X_HAVE_Z14 return BitCast(d, VFromD{BitCast(d_arith, a).raw * BitCast(d_arith, b).raw}); #else return BitCast(d, VFromD{vec_mul( BitCast(d_arith, a).raw, BitCast(d_arith, b).raw)}); #endif } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { #if HWY_S390X_HAVE_Z14 return Vec128{vec_mulh(a.raw, b.raw)}; #else const DFromV d; const RepartitionToWide dw; const VFromD p1{vec_mule(a.raw, b.raw)}; const VFromD p2{vec_mulo(a.raw, b.raw)}; #if HWY_IS_LITTLE_ENDIAN const __vector unsigned char kShuffle = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}; #else const __vector unsigned char kShuffle = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29}; #endif return BitCast(d, VFromD{vec_perm(p1.raw, p2.raw, kShuffle)}); #endif } // Multiplies even lanes (0, 2, ..) and places the double-wide result into // even and the upper half into its odd neighbor lane. template HWY_API Vec128, (N + 1) / 2> MulEven(Vec128 a, Vec128 b) { return Vec128, (N + 1) / 2>{vec_mule(a.raw, b.raw)}; } // Multiplies odd lanes (1, 3, ..) and places the double-wide result into // even and the upper half into its odd neighbor lane. 
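// Illustrative sketch (not part of this header; names d32/a/b are
// assumptions): widening 32x32->64-bit multiplies. With d32 = Full128<uint32_t>:
//   const auto a = Iota(d32, 1);      // 1, 2, 3, 4
//   const auto b = Set(d32, 10u);
//   const auto even = MulEven(a, b);  // u64 lanes 10, 30 (from lanes 0 and 2)
//   const auto odd = MulOdd(a, b);    // u64 lanes 20, 40 (from lanes 1 and 3)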
template HWY_API Vec128, (N + 1) / 2> MulOdd(Vec128 a, Vec128 b) { return Vec128, (N + 1) / 2>{vec_mulo(a.raw, b.raw)}; } // ------------------------------ RotateRight template HWY_API Vec128 RotateRight(const Vec128 v) { const DFromV d; const RebindToUnsigned du; constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; // Do an unsigned vec_rl operation to avoid undefined behavior return BitCast(d, VFromD{vec_rl( BitCast(du, v).raw, Set(du, kSizeInBits - kBits).raw)}); } // ------------------------------ ZeroIfNegative (BroadcastSignBit) template HWY_API Vec128 ZeroIfNegative(Vec128 v) { static_assert(IsFloat(), "Only works for float"); const DFromV d; const RebindToSigned di; const auto mask = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); return IfThenElse(mask, Zero(d), v); } // ------------------------------ IfNegativeThenElse template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; #if HWY_PPC_HAVE_10 const RebindToUnsigned du; return BitCast( d, VFromD{vec_blendv( BitCast(du, no).raw, BitCast(du, yes).raw, BitCast(du, v).raw)}); #else const RebindToSigned di; return IfVecThenElse(BitCast(d, BroadcastSignBit(BitCast(di, v))), yes, no); #endif } // generic_ops takes care of integer T. template HWY_API Vec128 AbsDiff(Vec128 a, Vec128 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants // Returns mul * x + add template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Vec128{vec_madd(mul.raw, x.raw, add.raw)}; } // Returns add - mul * x template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { // NOTE: the vec_nmsub operation below computes -(mul * x - add), // which is equivalent to add - mul * x in the round-to-nearest // and round-towards-zero rounding modes return Vec128{vec_nmsub(mul.raw, x.raw, add.raw)}; } // Returns mul * x - sub template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Vec128{vec_msub(mul.raw, x.raw, sub.raw)}; } // Returns -mul * x - sub template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { // NOTE: The vec_nmadd operation below computes -(mul * x + sub), // which is equivalent to -mul * x - sub in the round-to-nearest // and round-towards-zero rounding modes return Vec128{vec_nmadd(mul.raw, x.raw, sub.raw)}; } // ------------------------------ Floating-point div // Approximate reciprocal #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { #if HWY_S390X_HAVE_Z14 return Vec128{a.raw / b.raw}; #else return Vec128{vec_div(a.raw, b.raw)}; #endif } template HWY_API Vec128 ApproximateReciprocal(Vec128 v) { #if HWY_S390X_HAVE_Z14 const DFromV d; return Set(d, T(1.0)) / v; #else return Vec128{vec_re(v.raw)}; #endif } // ------------------------------ Floating-point square root #if HWY_S390X_HAVE_Z14 // Approximate reciprocal square root template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { const DFromV d; const RebindToUnsigned du; const auto half = v * Set(d, 0.5f); // Initial guess based on log2(f) const auto guess = BitCast( d, Set(du, uint32_t{0x5F3759DFu}) - ShiftRight<1>(BitCast(du, v))); // One Newton-Raphson iteration return guess * NegMulAdd(half * guess, guess, Set(d, 1.5f)); } #else // VSX #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef 
HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif // Approximate reciprocal square root template HWY_API Vec128 ApproximateReciprocalSqrt(Vec128 v) { return Vec128{vec_rsqrte(v.raw)}; } #endif // HWY_S390X_HAVE_Z14 // Full precision square root template HWY_API Vec128 Sqrt(Vec128 v) { return Vec128{vec_sqrt(v.raw)}; } // ------------------------------ Min (Gt, IfThenElse) template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{vec_min(a.raw, b.raw)}; } // ------------------------------ Max (Gt, IfThenElse) template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{vec_max(a.raw, b.raw)}; } // ------------------------------- Integer AbsDiff for PPC9/PPC10 #if HWY_PPC_HAVE_9 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif template HWY_API V AbsDiff(const V a, const V b) { return V{vec_absd(a.raw, b.raw)}; } template )> HWY_API V AbsDiff(const V a, const V b) { return Sub(Max(a, b), Min(a, b)); } template HWY_API V AbsDiff(const V a, const V b) { return Sub(Max(a, b), Min(a, b)); } #endif // HWY_PPC_HAVE_9 // ------------------------------ Integer Div for PPC10 #if HWY_PPC_HAVE_10 #ifdef HWY_NATIVE_INT_DIV #undef HWY_NATIVE_INT_DIV #else #define HWY_NATIVE_INT_DIV #endif template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_div for I32 Div on PPC10 to avoid // undefined behavior if b[i] == 0 or // (a[i] == LimitsMin() && b[i] == -1) // Clang will also optimize out I32 vec_div on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector signed int raw_result; __asm__("vdivsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_div for U32 Div on PPC10 to avoid // undefined behavior if b[i] == 0 // Clang will also optimize out U32 vec_div on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector unsigned int raw_result; __asm__("vdivuw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_div for I64 Div on PPC10 to avoid // undefined behavior if b[i] == 0 or // (a[i] == LimitsMin() && b[i] == -1) // Clang will also optimize out I64 vec_div on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector signed long long raw_result; __asm__("vdivsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_div for U64 Div on PPC10 to avoid // undefined behavior if b[i] == 0 // Clang will also optimize out U64 vec_div on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector unsigned long long raw_result; __asm__("vdivud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { const DFromV d; const RepartitionToWide dw; return OrderedDemote2To(d, PromoteLowerTo(dw, a) / PromoteLowerTo(dw, b), PromoteUpperTo(dw, a) / PromoteUpperTo(dw, b)); } 
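// Illustrative sketch (not part of this header; names d/num/den/q are
// assumptions): element-wise integer division uses the native PPC10
// instructions above; on other targets a non-native fallback is used.
// With d = Full128<int32_t>:
//   const auto num = Set(d, 7);
//   const auto den = Set(d, 2);
//   const auto q = num / den;  // 3 in every lane (truncating division)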
template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { const DFromV d; const Rebind, decltype(d)> dw; return DemoteTo(d, PromoteTo(dw, a) / PromoteTo(dw, b)); } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_mod for I32 Mod on PPC10 to avoid // undefined behavior if b[i] == 0 or // (a[i] == LimitsMin() && b[i] == -1) // Clang will also optimize out I32 vec_mod on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector signed int raw_result; __asm__("vmodsw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_mod for U32 Mod on PPC10 to avoid // undefined behavior if b[i] == 0 // Clang will also optimize out U32 vec_mod on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector unsigned int raw_result; __asm__("vmoduw %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_mod for I64 Mod on PPC10 to avoid // undefined behavior if b[i] == 0 or // (a[i] == LimitsMin() && b[i] == -1) // Clang will also optimize out I64 vec_mod on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector signed long long raw_result; __asm__("vmodsd %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { // Inline assembly is used instead of vec_mod for U64 Mod on PPC10 to avoid // undefined behavior if b[i] == 0 // Clang will also optimize out U64 vec_mod on PPC10 if optimizations are // enabled and any of the lanes of b are known to be zero (even in the unused // lanes of a partial vector) __vector unsigned long long raw_result; __asm__("vmodud %0,%1,%2" : "=v"(raw_result) : "v"(a.raw), "v"(b.raw)); return Vec128{raw_result}; } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { const DFromV d; const RepartitionToWide dw; return OrderedDemote2To(d, PromoteLowerTo(dw, a) % PromoteLowerTo(dw, b), PromoteUpperTo(dw, a) % PromoteUpperTo(dw, b)); } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { const DFromV d; const Rebind, decltype(d)> dw; return DemoteTo(d, PromoteTo(dw, a) % PromoteTo(dw, b)); } #endif // ================================================== MEMORY (3) // ------------------------------ Non-temporal stores template HWY_API void Stream(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { __builtin_prefetch(aligned, 1, 0); Store(v, d, aligned); } // ------------------------------ Scatter in generic_ops-inl.h // ------------------------------ Gather in generic_ops-inl.h // ================================================== SWIZZLE (2) // ------------------------------ LowerHalf // Returns upper/lower half of a vector. 
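// Illustrative sketch (not part of this header; names d/dh/v are assumptions):
// splitting a full vector into halves. With d = Full128<uint16_t> and
// dh = Half<decltype(d)>():
//   const auto v = Iota(d, 0);         // lanes 0..7
//   const auto lo = LowerHalf(dh, v);  // lanes 0..3
//   const auto hi = UpperHalf(dh, v);  // lanes 4..7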
template HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { return VFromD{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return Vec128{v.raw}; } // ------------------------------ ShiftLeftBytes // NOTE: The ShiftLeftBytes operation moves the elements of v to the right // by kBytes bytes and zeroes out the first kBytes bytes of v on both // little-endian and big-endian PPC targets // (same behavior as the HWY_EMU128 ShiftLeftBytes operation on both // little-endian and big-endian targets) template HWY_API VFromD ShiftLeftBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); if (kBytes == 0) return v; const auto zeros = Zero(d); #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_sld(v.raw, zeros.raw, kBytes)}; #else return VFromD{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; #endif } template HWY_API Vec128 ShiftLeftBytes(Vec128 v) { return ShiftLeftBytes(DFromV(), v); } // ------------------------------ ShiftLeftLanes // NOTE: The ShiftLeftLanes operation moves the elements of v to the right // by kLanes lanes and zeroes out the first kLanes lanes of v on both // little-endian and big-endian PPC targets // (same behavior as the HWY_EMU128 ShiftLeftLanes operation on both // little-endian and big-endian targets) template > HWY_API VFromD ShiftLeftLanes(D d, VFromD v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec128 ShiftLeftLanes(Vec128 v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes // NOTE: The ShiftRightBytes operation moves the elements of v to the left // by kBytes bytes and zeroes out the last kBytes bytes of v on both // little-endian and big-endian PPC targets // (same behavior as the HWY_EMU128 ShiftRightBytes operation on both // little-endian and big-endian targets) template HWY_API VFromD ShiftRightBytes(D d, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); if (kBytes == 0) return v; // For partial vectors, clear upper lanes so we shift in zeros. if (d.MaxBytes() != 16) { const Full128> dfull; VFromD vfull{v.raw}; v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; } const auto zeros = Zero(d); #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_sld(zeros.raw, v.raw, (-kBytes) & 15)}; #else return VFromD{vec_sld(v.raw, zeros.raw, kBytes)}; #endif } // ------------------------------ ShiftRightLanes // NOTE: The ShiftRightLanes operation moves the elements of v to the left // by kLanes lanes and zeroes out the last kLanes lanes of v on both // little-endian and big-endian PPC targets // (same behavior as the HWY_EMU128 ShiftRightLanes operation on both // little-endian and big-endian targets) template HWY_API VFromD ShiftRightLanes(D d, VFromD v) { const Repartition d8; constexpr size_t kBytes = kLanes * sizeof(TFromD); return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) template HWY_API VFromD UpperHalf(D d, VFromD> v) { return LowerHalf(d, ShiftRightBytes(Twice(), v)); } // ------------------------------ ExtractLane template HWY_API T ExtractLane(Vec128 v, size_t i) { return static_cast(v.raw[i]); } // ------------------------------ InsertLane template HWY_API Vec128 InsertLane(Vec128 v, size_t i, T t) { #if HWY_IS_LITTLE_ENDIAN typename detail::Raw128::type raw_result = v.raw; raw_result[i] = BitCastScalar::RawT>(t); return Vec128{raw_result}; #else // On ppc64be without this, mul_test fails, but swizzle_test passes. 
DFromV d; alignas(16) T lanes[16 / sizeof(T)]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); #endif } // ------------------------------ CombineShiftRightBytes // NOTE: The CombineShiftRightBytes operation below moves the elements of lo to // the left by kBytes bytes and moves the elements of hi right by (d.MaxBytes() // - kBytes) bytes on both little-endian and big-endian PPC targets. template > HWY_API Vec128 CombineShiftRightBytes(D /*d*/, Vec128 hi, Vec128 lo) { constexpr size_t kSize = 16; static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); #if HWY_IS_LITTLE_ENDIAN return Vec128{vec_sld(hi.raw, lo.raw, (-kBytes) & 15)}; #else return Vec128{vec_sld(lo.raw, hi.raw, kBytes)}; #endif } template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { constexpr size_t kSize = d.MaxBytes(); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Repartition d8; using V8 = Vec128; const DFromV dfull8; const Repartition, decltype(dfull8)> dfull; const V8 hi8{BitCast(d8, hi).raw}; // Move into most-significant bytes const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); return VFromD{BitCast(dfull, r).raw}; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{vec_splat(v.raw, kLane)}; } // ------------------------------ TableLookupLanes (Shuffle01) // Returned by SetTableIndices/IndicesFromVec for use by TableLookupLanes. template struct Indices128 { __vector unsigned char raw; }; namespace detail { template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; return Iota(d8, 0); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ constexpr __vector unsigned char kBroadcastLaneBytes = { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; #else constexpr __vector unsigned char kBroadcastLaneBytes = { 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; #endif return VFromD{kBroadcastLaneBytes}; } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ constexpr __vector unsigned char kBroadcastLaneBytes = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; #else constexpr __vector unsigned char kBroadcastLaneBytes = { 3, 3, 3, 3, 7, 7, 7, 7, 11, 11, 11, 11, 15, 15, 15, 15}; #endif return VFromD{kBroadcastLaneBytes}; } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ constexpr __vector unsigned char kBroadcastLaneBytes = { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; #else constexpr __vector unsigned char kBroadcastLaneBytes = { 7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15}; #endif return VFromD{kBroadcastLaneBytes}; } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; return Zero(d8); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; constexpr __vector unsigned char kByteOffsets = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; return VFromD{kByteOffsets}; } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; return VFromD{kByteOffsets}; } 
template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; constexpr __vector unsigned char kByteOffsets = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; return VFromD{kByteOffsets}; } } // namespace detail template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif const Repartition d8; return Indices128, MaxLanes(D())>{BitCast(d8, vec).raw}; } template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif const Repartition d8; using V8 = VFromD; // Broadcast each lane index to all bytes of T and shift to bytes const V8 lane_indices = TableLookupBytes( BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); const V8 byte_indices = ShiftLeft(lane_indices); const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); return Indices128, MaxLanes(D())>{sum.raw}; } template HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( D d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { const DFromV d; const Repartition d8; return BitCast(d, TableLookupBytes(v, VFromD{idx.raw})); } // Single lane: no change template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 /* idx */) { return v; } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Twice dt; const Repartition dt_u8; // TableLookupLanes currently requires table and index vectors to be the same // size, though a half-length index vector would be sufficient here. #if HWY_IS_MSAN const Vec128 idx_vec{idx.raw}; const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; #else // We only keep LowerHalf of the result, which is valid in idx. 
const Indices128 idx2{idx.raw}; #endif return LowerHalf( d, TableLookupBytes(Combine(dt, b, a), BitCast(dt, VFromD{idx2.raw}))); } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { return Vec128{vec_perm(a.raw, b.raw, idx.raw)}; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { return v; } // ------------------------------ Reverse (Shuffle0123, Shuffle2301) // Single lane: no change template , HWY_IF_LANES_D(D, 1)> HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { return v; } // 32-bit x2: shuffle template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec64 Reverse(D /* tag */, Vec64 v) { return Vec64{Shuffle2301(Vec128{v.raw}).raw}; } // 16-bit x4: shuffle template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec64 Reverse(D /* tag */, Vec64 v) { const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9}; return Vec64{vec_perm(v.raw, v.raw, kShuffle)}; } // 16-bit x2: rotate bytes template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec32 Reverse(D d, Vec32 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); } // ------------------------------- ReverseLaneBytes #if (HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14) && \ (HWY_COMPILER_GCC_ACTUAL >= 710 || HWY_COMPILER_CLANG >= 400) // Per-target flag to prevent generic_ops-inl.h defining 8-bit ReverseLaneBytes. #ifdef HWY_NATIVE_REVERSE_LANE_BYTES #undef HWY_NATIVE_REVERSE_LANE_BYTES #else #define HWY_NATIVE_REVERSE_LANE_BYTES #endif template HWY_API V ReverseLaneBytes(V v) { return V{vec_revb(v.raw)}; } // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. #ifdef HWY_NATIVE_REVERSE2_8 #undef HWY_NATIVE_REVERSE2_8 #else #define HWY_NATIVE_REVERSE2_8 #endif template , HWY_IF_T_SIZE(T, 1)> HWY_API VFromD Reverse2(D d, VFromD v) { const Repartition du16; return BitCast(d, ReverseLaneBytes(BitCast(du16, v))); } template , HWY_IF_T_SIZE(T, 1)> HWY_API VFromD Reverse4(D d, VFromD v) { const Repartition du32; return BitCast(d, ReverseLaneBytes(BitCast(du32, v))); } template , HWY_IF_T_SIZE(T, 1)> HWY_API VFromD Reverse8(D d, VFromD v) { const Repartition du64; return BitCast(d, ReverseLaneBytes(BitCast(du64, v))); } #endif // HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec16 Reverse(D d, Vec16 v) { return Reverse2(d, v); } template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 Reverse(D d, Vec32 v) { return Reverse4(d, v); } template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec64 Reverse(D d, Vec64 v) { return Reverse8(d, v); } // ------------------------------ Reverse2 // Single lane: no change template , HWY_IF_LANES_D(D, 1)> HWY_API Vec128 Reverse2(D /* tag */, Vec128 v) { return v; } template , HWY_IF_T_SIZE(T, 2)> HWY_API VFromD Reverse2(D d, VFromD v) { const Repartition du32; return BitCast(d, RotateRight<16>(BitCast(du32, v))); } template , HWY_IF_T_SIZE(T, 4)> HWY_API VFromD Reverse2(D d, VFromD v) { const Repartition du64; return BitCast(d, RotateRight<32>(BitCast(du64, v))); } template , HWY_IF_T_SIZE(T, 8)> HWY_API VFromD Reverse2(D /* tag */, VFromD v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API VFromD Reverse4(D /*d*/, VFromD v) { const __vector unsigned char kShuffle = {6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9}; return VFromD{vec_perm(v.raw, v.raw, kShuffle)}; } template HWY_API VFromD Reverse4(D d, VFromD v) { return Reverse(d, v); } template HWY_API VFromD Reverse4(D /* tag */, VFromD /* 
v */) { HWY_ASSERT(0); // don't have 4 u64 lanes } // ------------------------------ Reverse8 template HWY_API VFromD Reverse8(D d, VFromD v) { return Reverse(d, v); } template HWY_API VFromD Reverse8(D /* tag */, VFromD /* v */) { HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit } // ------------------------------ InterleaveLower // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides // the least-significant lane) and "b". To concatenate two half-width integers // into one, use ZipLower/Upper instead (also works with scalar). template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{vec_mergeh(a.raw, b.raw)}; } // Additional overload for the optional tag template HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // Full template > HWY_API Vec128 InterleaveUpper(D /* tag */, Vec128 a, Vec128 b) { return Vec128{vec_mergel(a.raw, b.raw)}; } // Partial template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const Half d2; return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, VFromD{UpperHalf(d2, b).raw}); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ------------------------------ Per4LaneBlkShufDupSet4xU32 // Used by hwy/ops/generic_ops-inl.h to implement Per4LaneBlockShuffle namespace detail { #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { const __vector unsigned int raw = {x0, x1, x2, x3}; return ResizeBitCast(d, Vec128{raw}); } } // namespace detail // ------------------------------ SlideUpLanes template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { const Repartition du8; using VU8 = VFromD; const auto v_shift_amt = BitCast(Full128(), Set(Full128(), static_cast(amt * sizeof(TFromD) * 8))); #if HWY_S390X_HAVE_Z14 return BitCast(d, VU8{vec_srb(BitCast(du8, v).raw, v_shift_amt.raw)}); #else // VSX #if HWY_IS_LITTLE_ENDIAN return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)}); #else return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)}); #endif // HWY_IS_LITTLE_ENDIAN #endif // HWY_S390X_HAVE_Z14 } // ------------------------------ SlideDownLanes template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { using TU = UnsignedFromSize; const Repartition du; const auto v_shift_amt = Set(du, static_cast(amt * sizeof(TFromD) * 8)); #if HWY_IS_LITTLE_ENDIAN return BitCast(d, BitCast(du, v) >> v_shift_amt); #else return BitCast(d, BitCast(du, v) << v_shift_amt); #endif } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { const Repartition du8; using VU8 = VFromD; const auto v_shift_amt = BitCast(Full128(), Set(Full128(), static_cast(amt * sizeof(TFromD) * 8))); #if HWY_S390X_HAVE_Z14 return BitCast(d, 
VU8{vec_slb(BitCast(du8, v).raw, v_shift_amt.raw)}); #else // VSX #if HWY_IS_LITTLE_ENDIAN return BitCast(d, VU8{vec_sro(BitCast(du8, v).raw, v_shift_amt.raw)}); #else return BitCast(d, VU8{vec_slo(BitCast(du8, v).raw, v_shift_amt.raw)}); #endif // HWY_IS_LITTLE_ENDIAN #endif // HWY_S390X_HAVE_Z14 } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template >> HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { const Half dh; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; using Raw = typename detail::Raw128>::type; const VU lo{reinterpret_cast(lo_half.raw)}; const VU hi{reinterpret_cast(hi_half.raw)}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const Half dh; return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); } // ------------------------------ Concat full (InterleaveLower) // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template > HWY_API Vec128 ConcatLowerLower(D d, Vec128 hi, Vec128 lo) { const Repartition d64; return BitCast(d, InterleaveLower(BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiH,loH (= upper halves) template > HWY_API Vec128 ConcatUpperUpper(D d, Vec128 hi, Vec128 lo) { const Repartition d64; return BitCast(d, InterleaveUpper(d64, BitCast(d64, lo), BitCast(d64, hi))); } // hiH,hiL loH,loL |-> hiL,loH (= inner halves) template > HWY_API Vec128 ConcatLowerUpper(D d, Vec128 hi, Vec128 lo) { return CombineShiftRightBytes<8>(d, hi, lo); } // hiH,hiL loH,loL |-> hiH,loL (= outer halves) template > HWY_API Vec128 ConcatUpperLower(D /*d*/, Vec128 hi, Vec128 lo) { const __vector unsigned char kShuffle = {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}; return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; } // ------------------------------ Concat partial (Combine, LowerHalf) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ TruncateTo template = sizeof(TFromD) * 2)>* = nullptr, HWY_IF_LANES_D(D, 1)> HWY_API VFromD TruncateTo(D /* tag */, Vec128 v) { using Raw = typename detail::Raw128>::type; #if HWY_IS_LITTLE_ENDIAN return VFromD{reinterpret_cast(v.raw)}; #else return VFromD{reinterpret_cast( vec_sld(v.raw, v.raw, sizeof(FromT) - sizeof(TFromD)))}; #endif } namespace detail { template ) * 2), HWY_IF_LANES_GT_D(D, 1)> HWY_API VFromD Truncate2To( D /* tag */, Vec128().MaxLanes()> lo, Vec128().MaxLanes()> hi) { return VFromD{vec_pack(lo.raw, hi.raw)}; } } // namespace detail template ) * 2), HWY_IF_LANES_GT_D(D, 1)> HWY_API VFromD TruncateTo(D /* d */, Vec128().MaxLanes()> v) { return VFromD{vec_pack(v.raw, v.raw)}; } template = sizeof(TFromD) * 4)>* = nullptr, HWY_IF_LANES_GT_D(D, 1)> HWY_API VFromD TruncateTo(D d, Vec128().MaxLanes()> v) { const Rebind, decltype(d)> d2; return TruncateTo(d, TruncateTo(d2, 
v)); } // ------------------------------ ConcatOdd (TruncateTo) // 8-bit full template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { const Repartition dw; const RebindToUnsigned du; #if HWY_IS_LITTLE_ENDIAN // Right-shift 8 bits per u16 so we can pack. const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); #else const Vec128 uH = BitCast(dw, hi); const Vec128 uL = BitCast(dw, lo); #endif return BitCast(d, detail::Truncate2To(du, uL, uH)); } // 8-bit x8 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec64 ConcatOdd(D /*d*/, Vec64 hi, Vec64 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactOddU8 = {1, 3, 5, 7, 17, 19, 21, 23}; return Vec64{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; } // 8-bit x4 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatOdd(D /*d*/, Vec32 hi, Vec32 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactOddU8 = {1, 3, 17, 19}; return Vec32{vec_perm(lo.raw, hi.raw, kCompactOddU8)}; } // 16-bit full template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { const Repartition dw; const RebindToUnsigned du; #if HWY_IS_LITTLE_ENDIAN const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); #else const Vec128 uH = BitCast(dw, hi); const Vec128 uL = BitCast(dw, lo); #endif return BitCast(d, detail::Truncate2To(du, uL, uH)); } // 16-bit x4 template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec64 ConcatOdd(D /*d*/, Vec64 hi, Vec64 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactOddU16 = {2, 3, 6, 7, 18, 19, 22, 23}; return Vec64{vec_perm(lo.raw, hi.raw, kCompactOddU16)}; } // 32-bit full template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { #if HWY_IS_LITTLE_ENDIAN (void)d; const __vector unsigned char kShuffle = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; #else const RebindToUnsigned du; const Repartition dw; return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); #endif } // Any type x2 template , HWY_IF_LANES_D(D, 2)> HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { return InterleaveUpper(d, lo, hi); } // ------------------------------ ConcatEven (TruncateTo) // 8-bit full template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { const Repartition dw; const RebindToUnsigned du; #if HWY_IS_LITTLE_ENDIAN const Vec128 uH = BitCast(dw, hi); const Vec128 uL = BitCast(dw, lo); #else // Right-shift 8 bits per u16 so we can pack. const Vec128 uH = ShiftRight<8>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<8>(BitCast(dw, lo)); #endif return BitCast(d, detail::Truncate2To(du, uL, uH)); } // 8-bit x8 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec64 ConcatEven(D /*d*/, Vec64 hi, Vec64 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactEvenU8 = {0, 2, 4, 6, 16, 18, 20, 22}; return Vec64{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; } // 8-bit x4 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatEven(D /*d*/, Vec32 hi, Vec32 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactEvenU8 = {0, 2, 16, 18}; return Vec32{vec_perm(lo.raw, hi.raw, kCompactEvenU8)}; } // 16-bit full template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { // Isolate lower 16 bits per u32 so we can pack. 
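// On little-endian targets the even u16 lanes already occupy the low half of
// each u32, so no shift is needed; on big-endian they occupy the high half,
// hence the ShiftRight<16> in that branch.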
const Repartition dw; const RebindToUnsigned du; #if HWY_IS_LITTLE_ENDIAN const Vec128 uH = BitCast(dw, hi); const Vec128 uL = BitCast(dw, lo); #else const Vec128 uH = ShiftRight<16>(BitCast(dw, hi)); const Vec128 uL = ShiftRight<16>(BitCast(dw, lo)); #endif return BitCast(d, detail::Truncate2To(du, uL, uH)); } // 16-bit x4 template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec64 ConcatEven(D /*d*/, Vec64 hi, Vec64 lo) { // Don't care about upper half, no need to zero. const __vector unsigned char kCompactEvenU16 = {0, 1, 4, 5, 16, 17, 20, 21}; return Vec64{vec_perm(lo.raw, hi.raw, kCompactEvenU16)}; } // 32-bit full template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { #if HWY_IS_LITTLE_ENDIAN const Repartition dw; const RebindToUnsigned du; return BitCast(d, detail::Truncate2To(du, BitCast(dw, lo), BitCast(dw, hi))); #else (void)d; constexpr __vector unsigned char kShuffle = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27}; return Vec128{vec_perm(lo.raw, hi.raw, kShuffle)}; #endif } // Any T x2 template , HWY_IF_LANES_D(D, 2)> HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { return InterleaveLower(d, lo, hi); } // ------------------------------ OrderedTruncate2To (ConcatEven, ConcatOdd) #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #else #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO #endif template ) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedTruncate2To(D d, V a, V b) { #if HWY_IS_LITTLE_ENDIAN return ConcatEven(d, BitCast(d, b), BitCast(d, a)); #else return ConcatOdd(d, BitCast(d, b), BitCast(d, a)); #endif } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(Vec128 v) { return v; } template HWY_API Vec128 DupEven(Vec128 v) { return InterleaveLower(DFromV(), v, v); } template HWY_API Vec128 DupEven(Vec128 v) { const DFromV d; const Repartition du8; constexpr __vector unsigned char kShuffle = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); } template HWY_API Vec128 DupEven(Vec128 v) { const DFromV d; const Repartition du8; constexpr __vector unsigned char kShuffle = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13}; return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); } template HWY_API Vec128 DupEven(Vec128 v) { #if HWY_S390X_HAVE_Z14 const DFromV d; const Repartition du8; return TableLookupBytes( v, BitCast(d, Dup128VecFromValues(du8, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 8, 9, 10, 11))); #else return Vec128{vec_mergee(v.raw, v.raw)}; #endif } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { const DFromV d; const Repartition du8; constexpr __vector unsigned char kShuffle = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); } template HWY_API Vec128 DupOdd(Vec128 v) { const DFromV d; const Repartition du8; constexpr __vector unsigned char kShuffle = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15}; return TableLookupBytes(v, BitCast(d, VFromD{kShuffle})); } template HWY_API Vec128 DupOdd(Vec128 v) { #if HWY_S390X_HAVE_Z14 const DFromV d; const Repartition du8; return TableLookupBytes( v, BitCast(d, Dup128VecFromValues(du8, 4, 5, 6, 7, 4, 5, 6, 7, 12, 13, 14, 15, 12, 13, 14, 15))); #else return Vec128{vec_mergeo(v.raw, v.raw)}; #endif } template HWY_API Vec128 DupOdd(Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ 
OddEven (IfThenElse) template HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { const DFromV d; const __vector unsigned char mask = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); } template HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { const DFromV d; const __vector unsigned char mask = {0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0}; return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); } template HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { const DFromV d; const __vector unsigned char mask = {0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0}; return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); } template HWY_INLINE Vec128 OddEven(Vec128 a, Vec128 b) { // Same as ConcatUpperLower for full vectors; do not call that because this // is more efficient for 64x1 vectors. const DFromV d; const __vector unsigned char mask = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0, 0, 0, 0, 0, 0, 0, 0}; return IfVecThenElse(BitCast(d, Vec128{mask}), b, a); } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ MulFixedPoint15 (OddEven) #if HWY_S390X_HAVE_Z14 HWY_API Vec16 MulFixedPoint15(Vec16 a, Vec16 b) { const DFromV di16; const RepartitionToWide di32; const auto round_up_incr = Set(di32, 0x4000); const auto i32_product = MulEven(a, b) + round_up_incr; return ResizeBitCast(di16, ShiftLeft<1>(i32_product)); } template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { const DFromV di16; const RepartitionToWide di32; const auto round_up_incr = Set(di32, 0x4000); const auto even_product = MulEven(a, b) + round_up_incr; const auto odd_product = MulOdd(a, b) + round_up_incr; return OddEven(BitCast(di16, ShiftRight<15>(odd_product)), BitCast(di16, ShiftLeft<1>(even_product))); } #else template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { const Vec128 zero = Zero(Full128()); return Vec128{vec_mradds(a.raw, b.raw, zero.raw)}; } #endif // ------------------------------ Shl namespace detail { template HWY_API Vec128 Shl(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_S390X_HAVE_Z14 return Vec128{v.raw << bits.raw}; #else return Vec128{vec_sl(v.raw, bits.raw)}; #endif } // Signed left shift is the same as unsigned. 
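// A left shift produces the same bit pattern regardless of signedness, so the
// signed overload below simply reinterprets to unsigned, shifts, and casts back.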
template HWY_API Vec128 Shl(hwy::SignedTag /*tag*/, Vec128 v, Vec128 bits) { const DFromV di; const RebindToUnsigned du; return BitCast(di, Shl(hwy::UnsignedTag(), BitCast(du, v), BitCast(du, bits))); } } // namespace detail template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) { return detail::Shl(hwy::TypeTag(), v, bits); } // ------------------------------ Shr namespace detail { template HWY_API Vec128 Shr(hwy::UnsignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_S390X_HAVE_Z14 return Vec128{v.raw >> bits.raw}; #else return Vec128{vec_sr(v.raw, bits.raw)}; #endif } template HWY_API Vec128 Shr(hwy::SignedTag /*tag*/, Vec128 v, Vec128 bits) { #if HWY_S390X_HAVE_Z14 return Vec128{v.raw >> bits.raw}; #else const DFromV di; const RebindToUnsigned du; return Vec128{vec_sra(v.raw, BitCast(du, bits).raw)}; #endif } } // namespace detail template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) { return detail::Shr(hwy::TypeTag(), v, bits); } // ------------------------------ MulEven/Odd 64x64 (UpperHalf) HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) using VU64 = __vector unsigned long long; const VU64 mul128_result = reinterpret_cast(vec_mule(a.raw, b.raw)); #if HWY_IS_LITTLE_ENDIAN return Vec128{mul128_result}; #else // Need to swap the two halves of mul128_result on big-endian targets as // the upper 64 bits of the product are in lane 0 of mul128_result and // the lower 64 bits of the product are in lane 1 of mul128_result return Vec128{vec_sld(mul128_result, mul128_result, 8)}; #endif #else alignas(16) uint64_t mul[2]; mul[0] = Mul128(GetLane(a), GetLane(b), &mul[1]); return Load(Full128(), mul); #endif } HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) { #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) using VU64 = __vector unsigned long long; const VU64 mul128_result = reinterpret_cast(vec_mulo(a.raw, b.raw)); #if HWY_IS_LITTLE_ENDIAN return Vec128{mul128_result}; #else // Need to swap the two halves of mul128_result on big-endian targets as // the upper 64 bits of the product are in lane 0 of mul128_result and // the lower 64 bits of the product are in lane 1 of mul128_result return Vec128{vec_sld(mul128_result, mul128_result, 8)}; #endif #else alignas(16) uint64_t mul[2]; const Full64 d2; mul[0] = Mul128(GetLane(UpperHalf(d2, a)), GetLane(UpperHalf(d2, b)), &mul[1]); return Load(Full128(), mul); #endif } // ------------------------------ WidenMulPairwiseAdd template >> HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { const RebindToUnsigned du32; // Lane order within sum0/1 is undefined, hence we can avoid the // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip // leads to the odd/even order that RearrangeToOddPlusEven prefers. using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); return MulAdd(BitCast(df32, ae), BitCast(df32, be), Mul(BitCast(df32, ao), BitCast(df32, bo))); } // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. 
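// vec_msum multiplies corresponding 16-bit lanes, adds each pair of adjacent
// 32-bit products, and then adds the 32-bit accumulator lane, which is exactly
// the widened pairwise multiply-add required here. A minimal usage sketch
// (d32/d16/a/b are illustrative and not part of this header):
//   const ScalableTag<int32_t> d32;
//   const Repartition<int16_t, decltype(d32)> d16;
//   // a and b each point to Lanes(d16) int16_t values.
//   const auto sum = WidenMulPairwiseAdd(d32, LoadU(d16, a), LoadU(d16, b));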
template >> HWY_API VFromD WidenMulPairwiseAdd(D32 d32, V16 a, V16 b) { #if HWY_S390X_HAVE_Z14 (void)d32; return MulEven(a, b) + MulOdd(a, b); #else return VFromD{vec_msum(a.raw, b.raw, Zero(d32).raw)}; #endif } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, VFromD sum0, VFromD& sum1) { const RebindToUnsigned du32; // Lane order within sum0/1 is undefined, hence we can avoid the // longer-latency lane-crossing PromoteTo. Using shift/and instead of Zip // leads to the odd/even order that RearrangeToOddPlusEven prefers. using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); } // Even if N=1, the input is always at least 2 lanes, hence vec_msum is safe. template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 /*d32*/, V16 a, V16 b, VFromD sum0, VFromD& /*sum1*/) { #if HWY_S390X_HAVE_Z14 return MulEven(a, b) + MulOdd(a, b) + sum0; #else return VFromD{vec_msum(a.raw, b.raw, sum0.raw)}; #endif } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven( Vec128 sum0, Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) { return Add(sum0, sum1); } // ------------------------------ SumOfMulQuadAccumulate #if !HWY_S390X_HAVE_Z14 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DU32 /*du32*/, VFromD> a, VFromD> b, VFromD sum) { return VFromD{vec_msum(a.raw, b.raw, sum.raw)}; } #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DI32 /*di32*/, VFromD> a_u, VFromD> b_i, VFromD sum) { return VFromD{vec_msum(b_i.raw, a_u.raw, sum.raw)}; } #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, VFromD> a, VFromD> b, VFromD sum) { const Repartition du8; const auto result_sum_0 = SumOfMulQuadAccumulate(di32, BitCast(du8, a), b, sum); const auto result_sum_1 = ShiftLeft<8>(SumsOf4(And(b, BroadcastSignBit(a)))); return result_sum_0 - result_sum_1; } #endif // !HWY_S390X_HAVE_Z14 // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned to signed/unsigned: zero-extend. template HWY_API VFromD PromoteTo(D /* d */, Vec128().MaxLanes()> v) { // First pretend the input has twice the lanes - the upper half will be // ignored by ZipLower. const Rebind> d2; const VFromD twice{v.raw}; // Then cast to narrow as expected by ZipLower, in case the sign of FromT // differs from that of D. 
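// Zero-extension is then just interleaving with a zero vector so that the zeros
// land in the upper half of each widened lane; the two argument orders below
// account for which interleave position maps to that upper half (the odd narrow
// lane on little-endian, the even one on big-endian).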
const RepartitionToNarrow dn; #if HWY_IS_LITTLE_ENDIAN return ZipLower(BitCast(dn, twice), Zero(dn)); #else return ZipLower(Zero(dn), BitCast(dn, twice)); #endif } // Signed: replicate sign bit. template HWY_API VFromD PromoteTo(D /* d */, Vec128().MaxLanes()> v) { using Raw = typename detail::Raw128>::type; return VFromD{reinterpret_cast(vec_unpackh(v.raw))}; } // 8-bit to 32-bit: First, promote to 16-bit, and then convert to 32-bit. template HWY_API VFromD PromoteTo(D d32, Vec128().MaxLanes()> v) { const DFromV d8; const Rebind, decltype(d8)> d16; return PromoteTo(d32, PromoteTo(d16, v)); } // 8-bit or 16-bit to 64-bit: First, promote to MakeWide, and then // convert to 64-bit. template HWY_API VFromD PromoteTo(D d64, Vec128().MaxLanes()> v) { const Rebind, decltype(d64)> dw; return PromoteTo(d64, PromoteTo(dw, v)); } #if HWY_PPC_HAVE_9 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_INLINE VFromD PromoteTo(D /*tag*/, VFromD> v) { return VFromD{vec_extract_fp32_from_shorth(v.raw)}; } #endif // HWY_PPC_HAVE_9 template HWY_API VFromD PromoteTo(D df32, VFromD> v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { const __vector float raw_v = InterleaveLower(v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif } template HWY_API VFromD PromoteTo(D df64, VFromD> v) { #if HWY_S390X_HAVE_Z14 const RebindToSigned di64; return ConvertTo(df64, PromoteTo(di64, v)); #else // VSX (void)df64; const __vector signed int raw_v = InterleaveLower(v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif #endif // HWY_S390X_HAVE_Z14 } template HWY_API VFromD PromoteTo(D df64, VFromD> v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du64; return ConvertTo(df64, PromoteTo(du64, v)); #else // VSX (void)df64; const __vector unsigned int raw_v = InterleaveLower(v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif #endif // HWY_S390X_HAVE_Z14 } #if !HWY_S390X_HAVE_Z14 namespace detail { template static HWY_INLINE V VsxF2INormalizeSrcVals(V v) { #if !defined(HWY_DISABLE_PPC_VSX_QEMU_F2I_WORKAROUND) // Workaround for QEMU 7/8 VSX float to int conversion bug return IfThenElseZero(v == v, v); #else return v; #endif } } // namespace detail #endif // !HWY_S390X_HAVE_Z14 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) const __vector float raw_v = detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw; return VFromD{__builtin_vsx_xvcvspsxds(raw_v)}; #else const RebindToFloat df64; return ConvertTo(di64, PromoteTo(df64, v)); #endif } template HWY_API VFromD PromoteTo(D du64, VFromD> v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) const __vector float raw_v = detail::VsxF2INormalizeSrcVals(InterleaveLower(v, v)).raw; return VFromD{reinterpret_cast<__vector unsigned long long>( __builtin_vsx_xvcvspuxds(raw_v))}; #else const RebindToFloat df64; return ConvertTo(du64, PromoteTo(df64, v)); #endif } // ------------------------------ PromoteUpperTo #ifdef HWY_NATIVE_PROMOTE_UPPER_TO #undef HWY_NATIVE_PROMOTE_UPPER_TO 
#else #define HWY_NATIVE_PROMOTE_UPPER_TO #endif // Unsigned to signed/unsigned: zero-extend. template HWY_API VFromD PromoteUpperTo(D d, Vec128 v) { const RebindToUnsigned du; const RepartitionToNarrow dn; #if HWY_IS_LITTLE_ENDIAN return BitCast(d, ZipUpper(du, v, Zero(dn))); #else return BitCast(d, ZipUpper(du, Zero(dn), v)); #endif } // Signed: replicate sign bit. template HWY_API VFromD PromoteUpperTo(D /* d */, Vec128 v) { using Raw = typename detail::Raw128>::type; return VFromD{reinterpret_cast(vec_unpackl(v.raw))}; } // F16 to F32 template HWY_API VFromD PromoteUpperTo(D df32, Vec128 v) { #if HWY_PPC_HAVE_9 (void)df32; return VFromD{vec_extract_fp32_from_shortl(v.raw)}; #else const Rebind dh; return PromoteTo(df32, UpperHalf(dh, v)); #endif } // BF16 to F32 template HWY_API VFromD PromoteUpperTo(D df32, Vec128 v) { const Repartition du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); } template HWY_API VFromD PromoteUpperTo(D /*tag*/, Vec128 v) { const __vector float raw_v = InterleaveUpper(Full128(), v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif } template HWY_API VFromD PromoteUpperTo(D df64, Vec128 v) { #if HWY_S390X_HAVE_Z14 const RebindToSigned di64; return ConvertTo(df64, PromoteUpperTo(di64, v)); #else // VSX (void)df64; const __vector signed int raw_v = InterleaveUpper(Full128(), v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif #endif // HWY_S390X_HAVE_Z14 } template HWY_API VFromD PromoteUpperTo(D df64, Vec128 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du64; return ConvertTo(df64, PromoteUpperTo(du64, v)); #else // VSX (void)df64; const __vector unsigned int raw_v = InterleaveUpper(Full128(), v, v).raw; #if HWY_IS_LITTLE_ENDIAN return VFromD{vec_doubleo(raw_v)}; #else return VFromD{vec_doublee(raw_v)}; #endif #endif // HWY_S390X_HAVE_Z14 } template HWY_API VFromD PromoteUpperTo(D di64, Vec128 v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) const __vector float raw_v = detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v)) .raw; return VFromD{__builtin_vsx_xvcvspsxds(raw_v)}; #else const RebindToFloat df64; return ConvertTo(di64, PromoteUpperTo(df64, v)); #endif } template HWY_API VFromD PromoteUpperTo(D du64, Vec128 v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) const __vector float raw_v = detail::VsxF2INormalizeSrcVals(InterleaveUpper(Full128(), v, v)) .raw; return VFromD{reinterpret_cast<__vector unsigned long long>( __builtin_vsx_xvcvspuxds(raw_v))}; #else const RebindToFloat df64; return ConvertTo(du64, PromoteUpperTo(df64, v)); #endif } // Generic version for <=64 bit input/output template HWY_API VFromD PromoteUpperTo(D d, V v) { const Rebind, decltype(d)> dh; return PromoteTo(d, UpperHalf(dh, v)); } // ------------------------------ PromoteEvenTo/PromoteOddTo namespace detail { // Signed to Signed PromoteEvenTo/PromoteOddTo for PPC9/PPC10 #if HWY_PPC_HAVE_9 && \ (HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1200) #if HWY_IS_LITTLE_ENDIAN template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_signexti(v.raw)}; } template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> 
/*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_signextll(v.raw)}; } #else template HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_signexti(v.raw)}; } template HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_signextll(v.raw)}; } #endif // HWY_IS_LITTLE_ENDIAN #endif // HWY_PPC_HAVE_9 // I32/U32/F32->F64 PromoteEvenTo #if HWY_S390X_HAVE_Z14 template HWY_INLINE VFromD PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_doublee(v.raw)}; } template )> HWY_INLINE VFromD PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const Rebind>, decltype(d_to)> dw; return ConvertTo(d_to, PromoteEvenTo(dw, v)); } #else // VSX template HWY_INLINE VFromD PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_doublee(v.raw)}; } #endif // HWY_S390X_HAVE_Z14 // F32->I64 PromoteEvenTo template HWY_INLINE VFromD PromoteEvenTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes // on little-endian PPC, and the vec_sld operation below will shift the even // lanes of normalized_v into the odd lanes. return VFromD{ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))}; #else // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes // on big-endian PPC. return VFromD{__builtin_vsx_xvcvspsxds(normalized_v.raw)}; #endif #else const RebindToFloat df64; return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(), df64, v)); #endif } // F32->U64 PromoteEvenTo template HWY_INLINE VFromD PromoteEvenTo(hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes // on little-endian PPC, and the vec_sld operation below will shift the even // lanes of normalized_v into the odd lanes. return VFromD{ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds( vec_sld(normalized_v.raw, normalized_v.raw, 4)))}; #else // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes // on big-endian PPC. 
return VFromD{reinterpret_cast<__vector unsigned long long>( __builtin_vsx_xvcvspuxds(normalized_v.raw))}; #endif #else const RebindToFloat df64; return ConvertTo(d_to, PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(), df64, v)); #endif } // I32/U32/F32->F64 PromoteOddTo #if HWY_S390X_HAVE_Z14 template HWY_INLINE VFromD PromoteOddTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { return PromoteEvenTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(), d_to, V{vec_sld(v.raw, v.raw, 4)}); } template )> HWY_INLINE VFromD PromoteOddTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const Rebind>, decltype(d_to)> dw; return ConvertTo(d_to, PromoteOddTo(dw, v)); } #else template HWY_INLINE VFromD PromoteOddTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D /*d_to*/, V v) { return VFromD{vec_doubleo(v.raw)}; } #endif // F32->I64 PromoteOddTo template HWY_INLINE VFromD PromoteOddTo(hwy::SignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspsxds)) (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN // __builtin_vsx_xvcvspsxds expects the source values to be in the odd lanes // on little-endian PPC return VFromD{__builtin_vsx_xvcvspsxds(normalized_v.raw)}; #else // __builtin_vsx_xvcvspsxds expects the source values to be in the even lanes // on big-endian PPC, and the vec_sld operation below will shift the odd lanes // of normalized_v into the even lanes. return VFromD{ __builtin_vsx_xvcvspsxds(vec_sld(normalized_v.raw, normalized_v.raw, 4))}; #endif #else const RebindToFloat df64; return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(), df64, v)); #endif } // F32->U64 PromoteOddTo template HWY_INLINE VFromD PromoteOddTo(hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag<8> /*to_lane_size_tag*/, hwy::FloatTag /*from_type_tag*/, D d_to, V v) { #if !HWY_S390X_HAVE_Z14 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvspuxds)) (void)d_to; const auto normalized_v = detail::VsxF2INormalizeSrcVals(v); #if HWY_IS_LITTLE_ENDIAN // __builtin_vsx_xvcvspuxds expects the source values to be in the odd lanes // on little-endian PPC return VFromD{reinterpret_cast<__vector unsigned long long>( __builtin_vsx_xvcvspuxds(normalized_v.raw))}; #else // __builtin_vsx_xvcvspuxds expects the source values to be in the even lanes // on big-endian PPC, and the vec_sld operation below will shift the odd lanes // of normalized_v into the even lanes. 
return VFromD{ reinterpret_cast<__vector unsigned long long>(__builtin_vsx_xvcvspuxds( vec_sld(normalized_v.raw, normalized_v.raw, 4)))}; #endif #else const RebindToFloat df64; return ConvertTo(d_to, PromoteOddTo(hwy::FloatTag(), hwy::SizeTag<8>(), hwy::FloatTag(), df64, v)); #endif } } // namespace detail // ------------------------------ Demotions (full -> part w/ narrow lanes) template ) * 2)> HWY_API VFromD DemoteTo(D /* tag */, Vec128().MaxLanes()> v) { return VFromD{vec_packsu(v.raw, v.raw)}; } template ) * 2)> HWY_API VFromD DemoteTo(D /* tag */, Vec128().MaxLanes()> v) { return VFromD{vec_packs(v.raw, v.raw)}; } template ) * 2)> HWY_API VFromD DemoteTo(D /* tag */, Vec128().MaxLanes()> v) { return VFromD{vec_packs(v.raw, v.raw)}; } template = sizeof(TFromD) * 4)>* = nullptr> HWY_API VFromD DemoteTo(D d, Vec128().MaxLanes()> v) { const Rebind, D> d2; return DemoteTo(d, DemoteTo(d2, v)); } template = sizeof(TFromD) * 4)>* = nullptr> HWY_API VFromD DemoteTo(D d, Vec128().MaxLanes()> v) { const Rebind, D> d2; return DemoteTo(d, DemoteTo(d2, v)); } template = sizeof(TFromD) * 4)>* = nullptr> HWY_API VFromD DemoteTo(D d, Vec128().MaxLanes()> v) { const Rebind>, D> d2; return DemoteTo(d, DemoteTo(d2, v)); } #if HWY_PPC_HAVE_9 && \ (HWY_COMPILER_GCC_ACTUAL || HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp)) // We already toggled HWY_NATIVE_F16C above. template HWY_API VFromD DemoteTo(D df16, VFromD> v) { // Avoid vec_pack_to_short_fp32 on Clang because its implementation is buggy. #if HWY_COMPILER_GCC_ACTUAL (void)df16; return VFromD{vec_pack_to_short_fp32(v.raw, v.raw)}; #elif HWY_HAS_BUILTIN(__builtin_vsx_xvcvsphp) // Work around bug in the clang implementation of vec_pack_to_short_fp32 // by using the __builtin_vsx_xvcvsphp builtin on PPC9/PPC10 targets // if the __builtin_vsx_xvcvsphp intrinsic is available const RebindToUnsigned du16; const Rebind du; const VFromD bits16{ reinterpret_cast<__vector unsigned int>(__builtin_vsx_xvcvsphp(v.raw))}; return BitCast(df16, TruncateTo(du16, bits16)); #else #error "Only define the function if we have a native implementation" #endif } #endif // HWY_PPC_HAVE_9 #if HWY_PPC_HAVE_9 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 #else #define HWY_NATIVE_DEMOTE_F64_TO_F16 #endif namespace detail { // On big-endian PPC9, VsxXscvdphp converts vf64[0] to a F16, returned as an U64 // vector with the resulting F16 bits in the lower 16 bits of U64 lane 0 // On little-endian PPC9, VsxXscvdphp converts vf64[1] to a F16, returned as // an U64 vector with the resulting F16 bits in the lower 16 bits of U64 lane 1 static HWY_INLINE Vec128 VsxXscvdphp(Vec128 vf64) { // Inline assembly is needed for the PPC9 xscvdphp instruction as there is // currently no intrinsic available for the PPC9 xscvdphp instruction __vector unsigned long long raw_result; __asm__("xscvdphp %x0, %x1" : "=wa"(raw_result) : "wa"(vf64.raw)); return Vec128{raw_result}; } } // namespace detail template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const RebindToUnsigned du16; const Rebind du64; const Full128 df64_full; #if HWY_IS_LITTLE_ENDIAN const auto bits16_as_u64 = UpperHalf(du64, detail::VsxXscvdphp(Combine(df64_full, v, v))); #else const auto bits16_as_u64 = LowerHalf(du64, detail::VsxXscvdphp(ResizeBitCast(df64_full, v))); #endif return BitCast(df16, TruncateTo(du16, bits16_as_u64)); } template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const RebindToUnsigned du16; const Rebind du64; const Rebind df64; #if HWY_IS_LITTLE_ENDIAN const auto bits64_as_u64_0 = 
detail::VsxXscvdphp(InterleaveLower(df64, v, v)); const auto bits64_as_u64_1 = detail::VsxXscvdphp(v); const auto bits64_as_u64 = InterleaveUpper(du64, bits64_as_u64_0, bits64_as_u64_1); #else const auto bits64_as_u64_0 = detail::VsxXscvdphp(v); const auto bits64_as_u64_1 = detail::VsxXscvdphp(InterleaveUpper(df64, v, v)); const auto bits64_as_u64 = InterleaveLower(du64, bits64_as_u64_0, bits64_as_u64_1); #endif return BitCast(df16, TruncateTo(du16, bits64_as_u64)); } #elif HWY_S390X_HAVE_Z14 #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 #else #define HWY_NATIVE_DEMOTE_F64_TO_F16 #endif namespace detail { template static HWY_INLINE VFromD DemoteToF32WithRoundToOdd( DF32 df32, VFromD> v) { const Twice dt_f32; __vector float raw_f32_in_even; __asm__("vledb %0,%1,0,3" : "=v"(raw_f32_in_even) : "v"(v.raw)); const VFromD f32_in_even{raw_f32_in_even}; return LowerHalf(df32, ConcatEven(dt_f32, f32_in_even, f32_in_even)); } } // namespace detail template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const Rebind df32; return DemoteTo(df16, detail::DemoteToF32WithRoundToOdd(df32, v)); } #endif // HWY_PPC_HAVE_9 template HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = ShiftRight<16>(BitCast(du32, v)); return BitCast(dbf16, TruncateTo(du16, bits_in_32)); } template >> HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { const RebindToUnsigned du16; const Repartition du32; #if HWY_IS_LITTLE_ENDIAN const auto a_in_odd = a; const auto b_in_even = ShiftRight<16>(BitCast(du32, b)); #else const auto a_in_odd = ShiftRight<16>(BitCast(du32, a)); const auto b_in_even = b; #endif return BitCast(dbf16, OddEven(BitCast(du16, a_in_odd), BitCast(du16, b_in_even))); } // Specializations for partial vectors because vec_packs sets lanes above 2*N. 
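// The overloads below therefore either Combine the two inputs into one vector
// and defer to the single-argument DemoteTo, or run the full-width
// vec_packs/vec_packsu and then discard the unwanted upper lanes via
// ConcatEven + LowerHalf.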
template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const Twice dn_full; const Repartition du32_full; const VFromD v_full{vec_packs(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { return VFromD{vec_packs(a.raw, b.raw)}; } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const Twice dn_full; const Repartition du32_full; const VFromD v_full{vec_packsu(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { return VFromD{vec_packsu(a.raw, b.raw)}; } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const Twice dn_full; const Repartition du32_full; const VFromD v_full{vec_packs(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template ) * 2)> HWY_API VFromD ReorderDemote2To(DN /*dn*/, V a, V b) { return VFromD{vec_packs(a.raw, b.raw)}; } template ), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedDemote2To(D d, V a, V b) { return ReorderDemote2To(d, a, b); } template >> HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { const RebindToUnsigned du16; #if HWY_IS_LITTLE_ENDIAN return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); #else return BitCast(dbf16, ConcatEven(du16, BitCast(du16, b), BitCast(du16, a))); #endif } template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { return Vec32{vec_floate(v.raw)}; } template HWY_API Vec64 DemoteTo(D d, Vec128 v) { #if HWY_S390X_HAVE_Z14 || HWY_IS_LITTLE_ENDIAN const Vec128 f64_to_f32{vec_floate(v.raw)}; #else const Vec128 f64_to_f32{vec_floato(v.raw)}; #endif #if HWY_S390X_HAVE_Z14 const Twice dt; return LowerHalf(d, ConcatEven(dt, f64_to_f32, f64_to_f32)); #else const RebindToUnsigned du; const Rebind du64; return Vec64{ BitCast(d, TruncateTo(du, BitCast(du64, f64_to_f32))).raw}; #endif } template HWY_API Vec32 DemoteTo(D di32, Vec64 v) { #if HWY_S390X_HAVE_Z14 const Rebind di64; return DemoteTo(di32, ConvertTo(di64, v)); #else (void)di32; return Vec32{vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)}; #endif } template HWY_API Vec64 DemoteTo(D di32, Vec128 v) { #if HWY_S390X_HAVE_Z14 const Rebind di64; return DemoteTo(di32, ConvertTo(di64, v)); #else (void)di32; #if HWY_IS_LITTLE_ENDIAN const Vec128 f64_to_i32{ vec_signede(detail::VsxF2INormalizeSrcVals(v).raw)}; #else const Vec128 f64_to_i32{ vec_signedo(detail::VsxF2INormalizeSrcVals(v).raw)}; #endif const Rebind di64; const Vec128 vi64 = BitCast(di64, f64_to_i32); return Vec64{vec_pack(vi64.raw, vi64.raw)}; #endif } template HWY_API Vec32 DemoteTo(D du32, Vec64 v) { #if HWY_S390X_HAVE_Z14 const Rebind du64; return DemoteTo(du32, ConvertTo(du64, v)); #else 
(void)du32; return Vec32{vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)}; #endif } template HWY_API Vec64 DemoteTo(D du32, Vec128 v) { #if HWY_S390X_HAVE_Z14 const Rebind du64; return DemoteTo(du32, ConvertTo(du64, v)); #else (void)du32; #if HWY_IS_LITTLE_ENDIAN const Vec128 f64_to_u32{ vec_unsignede(detail::VsxF2INormalizeSrcVals(v).raw)}; #else const Vec128 f64_to_u32{ vec_unsignedo(detail::VsxF2INormalizeSrcVals(v).raw)}; #endif const Rebind du64; const Vec128 vu64 = BitCast(du64, f64_to_u32); return Vec64{vec_pack(vu64.raw, vu64.raw)}; #endif } #if HWY_S390X_HAVE_Z14 namespace detail { template )> HWY_INLINE VFromD>> ConvToF64WithRoundToOdd(V v) { __vector double raw_result; // Use inline assembly to do a round-to-odd I64->F64 conversion on Z14 __asm__("vcdgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw)); return VFromD>>{raw_result}; } template )> HWY_INLINE VFromD>> ConvToF64WithRoundToOdd(V v) { __vector double raw_result; // Use inline assembly to do a round-to-odd U64->F64 conversion on Z14 __asm__("vcdlgb %0,%1,0,3" : "=v"(raw_result) : "v"(v.raw)); return VFromD>>{raw_result}; } } // namespace detail #endif // HWY_S390X_HAVE_Z14 template HWY_API Vec32 DemoteTo(D df32, Vec64 v) { #if HWY_S390X_HAVE_Z14 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); #else // VSX (void)df32; return Vec32{vec_floate(v.raw)}; #endif } template HWY_API Vec64 DemoteTo(D df32, Vec128 v) { #if HWY_S390X_HAVE_Z14 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); #else // VSX #if HWY_IS_LITTLE_ENDIAN const Vec128 i64_to_f32{vec_floate(v.raw)}; #else const Vec128 i64_to_f32{vec_floato(v.raw)}; #endif const RebindToUnsigned du32; const Rebind du64; return Vec64{ BitCast(df32, TruncateTo(du32, BitCast(du64, i64_to_f32))).raw}; #endif } template HWY_API Vec32 DemoteTo(D df32, Vec64 v) { #if HWY_S390X_HAVE_Z14 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); #else // VSX (void)df32; return Vec32{vec_floate(v.raw)}; #endif } template HWY_API Vec64 DemoteTo(D df32, Vec128 v) { #if HWY_S390X_HAVE_Z14 return DemoteTo(df32, detail::ConvToF64WithRoundToOdd(v)); #else // VSX #if HWY_IS_LITTLE_ENDIAN const Vec128 u64_to_f32{vec_floate(v.raw)}; #else const Vec128 u64_to_f32{vec_floato(v.raw)}; #endif const RebindToUnsigned du; const Rebind du64; return Vec64{ BitCast(df32, TruncateTo(du, BitCast(du64, u64_to_f32))).raw}; #endif } // For already range-limited input [0, 255]. template HWY_API Vec128 U8FromU32(Vec128 v) { const Rebind> du16; const Rebind du8; return TruncateTo(du8, TruncateTo(du16, v)); } // ------------------------------ Integer <=> fp (ShiftRight, OddEven) // Note: altivec.h vec_ct* currently contain C casts which triggers // -Wdeprecate-lax-vec-conv-all warnings, so disable them. 
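// Hence the HWY_DIAGNOSTICS(push)/HWY_DIAGNOSTICS(pop) wrappers around the
// vec_ctf/vec_cts/vec_ctu calls below, with the warning suppressed on Clang.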
#if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15 template HWY_API VFromD ConvertTo(D df32, Vec128().MaxLanes()> v) { const Rebind df64; return DemoteTo(df32, PromoteTo(df64, v)); } template HWY_API VFromD ConvertTo(D df32, Vec128 v) { const RepartitionToWide df64; const VFromD vf32_lo{vec_floate(PromoteLowerTo(df64, v).raw)}; const VFromD vf32_hi{vec_floate(PromoteUpperTo(df64, v).raw)}; return ConcatEven(df32, vf32_hi, vf32_lo); } #else // Z15 or PPC template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { HWY_DIAGNOSTICS(push) #if HWY_COMPILER_CLANG HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") #endif #if HWY_S390X_HAVE_Z15 return VFromD{vec_float(v.raw)}; #else return VFromD{vec_ctf(v.raw, 0)}; #endif HWY_DIAGNOSTICS(pop) } #endif // HWY_TARGET == HWY_Z14 template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { return VFromD{vec_double(v.raw)}; } // Truncates (rounds toward zero). #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15 template HWY_API VFromD ConvertTo(D di32, Vec128().MaxLanes()> v) { const Rebind di64; return DemoteTo(di32, PromoteTo(di64, v)); } template HWY_API VFromD ConvertTo(D di32, Vec128().MaxLanes()> v) { const RepartitionToWide di64; return OrderedDemote2To(di32, PromoteLowerTo(di64, v), PromoteUpperTo(di64, v)); } #else // Z15 or PPC template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { #if defined(__OPTIMIZE__) if (detail::IsConstantRawAltivecVect(v.raw)) { constexpr int32_t kMinI32 = LimitsMin(); constexpr int32_t kMaxI32 = LimitsMax(); return Dup128VecFromValues( D(), (v.raw[0] >= -2147483648.0f) ? ((v.raw[0] < 2147483648.0f) ? static_cast(v.raw[0]) : kMaxI32) : ((v.raw[0] < 0) ? kMinI32 : 0), (v.raw[1] >= -2147483648.0f) ? ((v.raw[1] < 2147483648.0f) ? static_cast(v.raw[1]) : kMaxI32) : ((v.raw[1] < 0) ? kMinI32 : 0), (v.raw[2] >= -2147483648.0f) ? ((v.raw[2] < 2147483648.0f) ? static_cast(v.raw[2]) : kMaxI32) : ((v.raw[2] < 0) ? kMinI32 : 0), (v.raw[3] >= -2147483648.0f) ? ((v.raw[3] < 2147483648.0f) ? static_cast(v.raw[3]) : kMaxI32) : ((v.raw[3] < 0) ? kMinI32 : 0)); } #endif #if HWY_S390X_HAVE_Z15 // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in // the range of an int32_t __vector signed int raw_result; __asm__("vcfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw)); return VFromD{raw_result}; #else HWY_DIAGNOSTICS(push) #if HWY_COMPILER_CLANG HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") #endif return VFromD{vec_cts(v.raw, 0)}; HWY_DIAGNOSTICS(pop) #endif // HWY_S390X_HAVE_Z15 } #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15 template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { #if defined(__OPTIMIZE__) if (detail::IsConstantRawAltivecVect(v.raw)) { constexpr int64_t kMinI64 = LimitsMin(); constexpr int64_t kMaxI64 = LimitsMax(); return Dup128VecFromValues(D(), (v.raw[0] >= -9223372036854775808.0) ? ((v.raw[0] < 9223372036854775808.0) ? static_cast(v.raw[0]) : kMaxI64) : ((v.raw[0] < 0) ? kMinI64 : 0LL), (v.raw[1] >= -9223372036854775808.0) ? ((v.raw[1] < 9223372036854775808.0) ? static_cast(v.raw[1]) : kMaxI64) : ((v.raw[1] < 0) ? 
kMinI64 : 0LL)); } #endif // Use inline assembly to avoid undefined behavior if v[i] is not within the // range of an int64_t __vector signed long long raw_result; #if HWY_S390X_HAVE_Z14 __asm__("vcgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw)); #else __asm__("xvcvdpsxds %x0,%x1" : "=wa"(raw_result) : "wa"(detail::VsxF2INormalizeSrcVals(v).raw)); #endif return VFromD{raw_result}; } #if HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15 template HWY_API VFromD ConvertTo(D du32, Vec128().MaxLanes()> v) { const Rebind du64; return DemoteTo(du32, PromoteTo(du64, v)); } template HWY_API VFromD ConvertTo(D du32, Vec128().MaxLanes()> v) { const RepartitionToWide du64; return OrderedDemote2To(du32, PromoteLowerTo(du64, v), PromoteUpperTo(du64, v)); } #else // Z15 or VSX template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { #if defined(__OPTIMIZE__) if (detail::IsConstantRawAltivecVect(v.raw)) { constexpr uint32_t kMaxU32 = LimitsMax(); return Dup128VecFromValues( D(), (v.raw[0] >= 0.0f) ? ((v.raw[0] < 4294967296.0f) ? static_cast(v.raw[0]) : kMaxU32) : 0, (v.raw[1] >= 0.0f) ? ((v.raw[1] < 4294967296.0f) ? static_cast(v.raw[1]) : kMaxU32) : 0, (v.raw[2] >= 0.0f) ? ((v.raw[2] < 4294967296.0f) ? static_cast(v.raw[2]) : kMaxU32) : 0, (v.raw[3] >= 0.0f) ? ((v.raw[3] < 4294967296.0f) ? static_cast(v.raw[3]) : kMaxU32) : 0); } #endif #if HWY_S390X_HAVE_Z15 // Use inline assembly on Z15 to avoid undefined behavior if v[i] is not in // the range of an uint32_t __vector unsigned int raw_result; __asm__("vclfeb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw)); return VFromD{raw_result}; #else // VSX HWY_DIAGNOSTICS(push) #if HWY_COMPILER_CLANG HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") #endif VFromD result{vec_ctu(v.raw, 0)}; HWY_DIAGNOSTICS(pop) return result; #endif // HWY_S390X_HAVE_Z15 } #endif // HWY_S390X_HAVE_Z14 && !HWY_S390X_HAVE_Z15 template HWY_API VFromD ConvertTo(D /* tag */, Vec128().MaxLanes()> v) { HWY_DIAGNOSTICS(push) #if HWY_COMPILER_CLANG HWY_DIAGNOSTICS_OFF(disable : 5219, ignored "-Wdeprecate-lax-vec-conv-all") #endif #if defined(__OPTIMIZE__) if (detail::IsConstantRawAltivecVect(v.raw)) { constexpr uint64_t kMaxU64 = LimitsMax(); return Dup128VecFromValues( D(), (v.raw[0] >= 0.0) ? ((v.raw[0] < 18446744073709551616.0) ? static_cast(v.raw[0]) : kMaxU64) : 0, (v.raw[1] >= 0.0) ? ((v.raw[1] < 18446744073709551616.0) ? 
static_cast(v.raw[1]) : kMaxU64) : 0); } #endif // Use inline assembly to avoid undefined behavior if v[i] is not within the // range of an uint64_t __vector unsigned long long raw_result; #if HWY_S390X_HAVE_Z14 __asm__("vclgdb %0,%1,0,5" : "=v"(raw_result) : "v"(v.raw)); #else // VSX __asm__("xvcvdpuxds %x0,%x1" : "=wa"(raw_result) : "wa"(detail::VsxF2INormalizeSrcVals(v).raw)); #endif return VFromD{raw_result}; } // ------------------------------ Floating-point rounding (ConvertTo) // Toward nearest integer, ties to even template HWY_API Vec128 Round(Vec128 v) { return Vec128{vec_round(v.raw)}; } template HWY_API Vec128 Round(Vec128 v) { #if HWY_S390X_HAVE_Z14 return Vec128{vec_round(v.raw)}; #else return Vec128{vec_rint(v.raw)}; #endif } template HWY_API Vec128 NearestInt(Vec128 v) { const DFromV d; const RebindToSigned di; return ConvertTo(di, Round(v)); } // Toward zero, aka truncate template HWY_API Vec128 Trunc(Vec128 v) { return Vec128{vec_trunc(v.raw)}; } // Toward +infinity, aka ceiling template HWY_API Vec128 Ceil(Vec128 v) { return Vec128{vec_ceil(v.raw)}; } // Toward -infinity, aka floor template HWY_API Vec128 Floor(Vec128 v) { return Vec128{vec_floor(v.raw)}; } // ------------------------------ Floating-point classification template HWY_API Mask128 IsNaN(Vec128 v) { static_assert(IsFloat(), "Only for float"); return v != v; } template HWY_API Mask128 IsInf(Vec128 v) { static_assert(IsFloat(), "Only for float"); using TU = MakeUnsigned; const DFromV d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask( d, Eq(Add(vu, vu), Set(du, static_cast(hwy::MaxExponentTimes2())))); } // Returns whether normal/subnormal/zero. template HWY_API Mask128 IsFinite(Vec128 v) { static_assert(IsFloat(), "Only for float"); using TU = MakeUnsigned; const DFromV d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent(hwy::MaxExponentTimes2())))); } // ================================================== CRYPTO #if !HWY_S390X_HAVE_Z14 && !defined(HWY_DISABLE_PPC8_CRYPTO) // Per-target flag to prevent generic_ops-inl.h from defining AESRound. 
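// The #ifdef/#undef/#else/#define sequence below follows the usual Highway
// convention: the flag is toggled so it stays in sync with the per-target
// toggle macro each time this header is re-included for another target,
// which is how generic_ops-inl.h detects a native implementation.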
#ifdef HWY_NATIVE_AES #undef HWY_NATIVE_AES #else #define HWY_NATIVE_AES #endif namespace detail { #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1600 using CipherTag = Full128; #else using CipherTag = Full128; #endif // !HWY_COMPILER_CLANG using CipherVec = VFromD; } // namespace detail HWY_API Vec128 AESRound(Vec128 state, Vec128 round_key) { const detail::CipherTag dc; const Full128 du8; #if HWY_IS_LITTLE_ENDIAN return Reverse(du8, BitCast(du8, detail::CipherVec{vec_cipher_be( BitCast(dc, Reverse(du8, state)).raw, BitCast(dc, Reverse(du8, round_key)).raw)})); #else return BitCast(du8, detail::CipherVec{vec_cipher_be( BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); #endif } HWY_API Vec128 AESLastRound(Vec128 state, Vec128 round_key) { const detail::CipherTag dc; const Full128 du8; #if HWY_IS_LITTLE_ENDIAN return Reverse(du8, BitCast(du8, detail::CipherVec{vec_cipherlast_be( BitCast(dc, Reverse(du8, state)).raw, BitCast(dc, Reverse(du8, round_key)).raw)})); #else return BitCast(du8, detail::CipherVec{vec_cipherlast_be( BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); #endif } HWY_API Vec128 AESRoundInv(Vec128 state, Vec128 round_key) { const detail::CipherTag dc; const Full128 du8; #if HWY_IS_LITTLE_ENDIAN return Xor(Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipher_be( BitCast(dc, Reverse(du8, state)).raw, Zero(dc).raw)})), round_key); #else return Xor(BitCast(du8, detail::CipherVec{vec_ncipher_be( BitCast(dc, state).raw, Zero(dc).raw)}), round_key); #endif } HWY_API Vec128 AESLastRoundInv(Vec128 state, Vec128 round_key) { const detail::CipherTag dc; const Full128 du8; #if HWY_IS_LITTLE_ENDIAN return Reverse(du8, BitCast(du8, detail::CipherVec{vec_ncipherlast_be( BitCast(dc, Reverse(du8, state)).raw, BitCast(dc, Reverse(du8, round_key)).raw)})); #else return BitCast(du8, detail::CipherVec{vec_ncipherlast_be( BitCast(dc, state).raw, BitCast(dc, round_key).raw)}); #endif } HWY_API Vec128 AESInvMixColumns(Vec128 state) { const Full128 du8; const auto zero = Zero(du8); // PPC8/PPC9/PPC10 does not have a single instruction for the AES // InvMixColumns operation like ARM Crypto, SVE2 Crypto, or AES-NI do. // The AESInvMixColumns operation can be carried out on PPC8/PPC9/PPC10 // by doing an AESLastRound operation with a zero round_key followed by an // AESRoundInv operation with a zero round_key. return AESRoundInv(AESLastRound(state, zero), zero); } template HWY_API Vec128 AESKeyGenAssist(Vec128 v) { constexpr __vector unsigned char kRconXorMask = {0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0}; constexpr __vector unsigned char kRotWordShuffle = { 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12}; const detail::CipherTag dc; const Full128 du8; const auto sub_word_result = BitCast(du8, detail::CipherVec{vec_sbox_be(BitCast(dc, v).raw)}); const auto rot_word_result = TableLookupBytes(sub_word_result, Vec128{kRotWordShuffle}); return Xor(rot_word_result, Vec128{kRconXorMask}); } template HWY_API Vec128 CLMulLower(Vec128 a, Vec128 b) { // NOTE: Lane 1 of both a and b need to be zeroed out for the // vec_pmsum_be operation below as the vec_pmsum_be operation // does a carryless multiplication of each 64-bit half and then // adds the two halves using an bitwise XOR operation. 
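// InterleaveLower(a, zero) below yields {a[0], 0}, so the XOR of the two
// 64x64-bit carryless products computed by vec_pmsum_be reduces to just the
// product of the lower lanes, which is what CLMulLower must return.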
const DFromV d;
const auto zero = Zero(d);
using VU64 = __vector unsigned long long;
const VU64 pmsum_result = reinterpret_cast(
    vec_pmsum_be(InterleaveLower(a, zero).raw, InterleaveLower(b, zero).raw));
#if HWY_IS_LITTLE_ENDIAN
return Vec128{pmsum_result};
#else
// Need to swap the two halves of pmsum_result on big-endian targets as
// the upper 64 bits of the carryless multiplication result are in lane 0 of
// pmsum_result and the lower 64 bits of the carryless multiplication result
// are in lane 1 of pmsum_result
return Vec128{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}

template HWY_API Vec128 CLMulUpper(Vec128 a, Vec128 b) {
// NOTE: Lane 0 of both a and b need to be zeroed out for the
// vec_pmsum_be operation below as the vec_pmsum_be operation
// does a carryless multiplication of each 64-bit half and then
// adds the two halves using a bitwise XOR operation.
const DFromV d;
const auto zero = Zero(d);
using VU64 = __vector unsigned long long;
const VU64 pmsum_result = reinterpret_cast(
    vec_pmsum_be(vec_mergel(zero.raw, a.raw), vec_mergel(zero.raw, b.raw)));
#if HWY_IS_LITTLE_ENDIAN
return Vec128{pmsum_result};
#else
// Need to swap the two halves of pmsum_result on big-endian targets as
// the upper 64 bits of the carryless multiplication result are in lane 0 of
// pmsum_result and the lower 64 bits of the carryless multiplication result
// are in lane 1 of pmsum_result
return Vec128{vec_sld(pmsum_result, pmsum_result, 8)};
#endif
}
#endif  // !defined(HWY_DISABLE_PPC8_CRYPTO)

// ================================================== MISC

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

template HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) {
#if HWY_PPC_HAVE_10
const Vec128 mask_vec{vec_genbm(mask_bits)};
#if HWY_IS_LITTLE_ENDIAN
return MFromD{MaskFromVec(mask_vec).raw};
#else
return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw};
#endif  // HWY_IS_LITTLE_ENDIAN
#else  // PPC9 or earlier
const Full128 du8;
const Full128 du16;
const Vec128 vbits = BitCast(du8, Set(du16, static_cast(mask_bits)));
// Replicate bytes 8x such that each byte contains the bit that governs it.
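// Example: mask_bits = 0x8001 broadcasts to the byte pair {0x01, 0x80}; kRep8
// copies byte 0 into lanes 0-7 and byte 1 into lanes 8-15, and TestBit with
// {1, 2, 4, ..., 128} then sets exactly lanes 0 and 15.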
#if HWY_IS_LITTLE_ENDIAN const __vector unsigned char kRep8 = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; #else const __vector unsigned char kRep8 = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; #endif // HWY_IS_LITTLE_ENDIAN const Vec128 rep8{vec_perm(vbits.raw, vbits.raw, kRep8)}; const __vector unsigned char kBit = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return MFromD{TestBit(rep8, Vec128{kBit}).raw}; #endif // HWY_PPC_HAVE_10 } template HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { #if HWY_PPC_HAVE_10 const Vec128 mask_vec{vec_genhm(mask_bits)}; #if HWY_IS_LITTLE_ENDIAN return MFromD{MaskFromVec(mask_vec).raw}; #else return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; #endif // HWY_IS_LITTLE_ENDIAN #else // PPC9 or earlier const __vector unsigned short kBit = {1, 2, 4, 8, 16, 32, 64, 128}; const auto vmask_bits = Set(Full128(), static_cast(mask_bits)); return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; #endif // HWY_PPC_HAVE_10 } template HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { #if HWY_PPC_HAVE_10 const Vec128 mask_vec{vec_genwm(mask_bits)}; #if HWY_IS_LITTLE_ENDIAN return MFromD{MaskFromVec(mask_vec).raw}; #else return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; #endif // HWY_IS_LITTLE_ENDIAN #else // PPC9 or earlier const __vector unsigned int kBit = {1, 2, 4, 8}; const auto vmask_bits = Set(Full128(), static_cast(mask_bits)); return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; #endif // HWY_PPC_HAVE_10 } template HWY_INLINE MFromD LoadMaskBits128(D /*d*/, uint64_t mask_bits) { #if HWY_PPC_HAVE_10 const Vec128 mask_vec{vec_gendm(mask_bits)}; #if HWY_IS_LITTLE_ENDIAN return MFromD{MaskFromVec(mask_vec).raw}; #else return MFromD{MaskFromVec(Reverse(Full128(), mask_vec)).raw}; #endif // HWY_IS_LITTLE_ENDIAN #else // PPC9 or earlier const __vector unsigned long long kBit = {1, 2}; const auto vmask_bits = Set(Full128(), static_cast(mask_bits)); return MFromD{TestBit(vmask_bits, Vec128{kBit}).raw}; #endif // HWY_PPC_HAVE_10 } } // namespace detail // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { // If there are 8 or fewer lanes, simply convert bits[0] to a uint64_t uint64_t mask_bits = bits[0]; constexpr size_t kN = MaxLanes(d); if (kN < 8) mask_bits &= (1u << kN) - 1; return detail::LoadMaskBits128(d, mask_bits); } template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { // First, copy the mask bits to a uint16_t as there as there are at most // 16 lanes in a vector. // Copying the mask bits to a uint16_t first will also ensure that the // mask bits are loaded into the lower 16 bits on big-endian PPC targets. uint16_t u16_mask_bits; CopyBytes(bits, &u16_mask_bits); #if HWY_IS_LITTLE_ENDIAN return detail::LoadMaskBits128(d, u16_mask_bits); #else // On big-endian targets, u16_mask_bits need to be byte swapped as bits // contains the mask bits in little-endian byte order // GCC/Clang will optimize the load of u16_mask_bits and byte swap to a // single lhbrx instruction on big-endian PPC targets when optimizations // are enabled. #if HWY_HAS_BUILTIN(__builtin_bswap16) return detail::LoadMaskBits128(d, __builtin_bswap16(u16_mask_bits)); #else return detail::LoadMaskBits128( d, static_cast((u16_mask_bits << 8) | (u16_mask_bits >> 8))); #endif #endif } template struct CompressIsPartition { // generic_ops-inl does not guarantee IsPartition for 8-bit. 
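// Here 'IsPartition' means Compress moves the mask=false lanes after the
// mask=true lanes, so the result is a permutation of the input; the 8-bit
// path does not guarantee this, hence value is false for sizeof(T) == 1.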
enum { value = (sizeof(T) != 1) }; }; // ------------------------------ Dup128MaskFromMaskBits template HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { constexpr size_t kN = MaxLanes(d); if (kN < 8) mask_bits &= (1u << kN) - 1; return detail::LoadMaskBits128(d, mask_bits); } // ------------------------------ StoreMaskBits namespace detail { #if !HWY_S390X_HAVE_Z14 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) // fallback for missing vec_extractm template HWY_INLINE uint64_t ExtractSignBits(Vec128 sign_bits, __vector unsigned char bit_shuffle) { // clang POWER8 and 9 targets appear to differ in their return type of // vec_vbpermq: unsigned or signed, so cast to avoid a warning. using VU64 = detail::Raw128::type; const Vec128 extracted{ reinterpret_cast(vec_vbpermq(sign_bits.raw, bit_shuffle))}; return extracted.raw[HWY_IS_LITTLE_ENDIAN]; } #endif // !HWY_S390X_HAVE_Z14 && !HWY_PPC_HAVE_10 #if HWY_S390X_HAVE_Z14 template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { const DFromM d; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); return ReduceSum( du8, And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128))); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { const DFromM d; const Repartition du8; const Repartition du64; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); const auto mask_bytes = SumsOf8( And(sign_bits, Dup128VecFromValues(du8, 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128))); const Rebind du8_2; const Repartition du16_1; return GetLane( BitCast(du16_1, TruncateTo(du8_2, Reverse2(du64, mask_bytes)))); } #else template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) { const DFromM d; const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN return static_cast(vec_extractm(sign_bits.raw)); #else // PPC8, PPC9, or big-endian PPC10 const __vector unsigned char kBitShuffle = {120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0}; return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN } #endif // HWY_S390X_HAVE_Z14 template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; #if HWY_S390X_HAVE_Z14 const VFromD sign_bits = BitCast(du, VecFromMask(d, mask)); return ReduceSum( du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8, 16, 32, 64, 128))); #else // VSX const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else // PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN const __vector unsigned char kBitShuffle = { 112, 96, 80, 64, 48, 32, 16, 0, 128, 128, 128, 128, 128, 128, 128, 128}; #else const __vector unsigned char kBitShuffle = { 128, 128, 128, 128, 128, 128, 128, 128, 112, 96, 80, 64, 48, 32, 16, 0}; #endif return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 #endif // HWY_S390X_HAVE_Z14 } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; #if HWY_S390X_HAVE_Z14 const VFromD sign_bits = BitCast(du, VecFromMask(d, mask)); return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2, 4, 8))); #else // VSX const Repartition du8; const VFromD 
sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else // PPC8, PPC9, or big-endian PPC10 (void)du; #if HWY_IS_LITTLE_ENDIAN const __vector unsigned char kBitShuffle = {96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; #else const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 96, 64, 32, 0}; #endif return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 #endif // HWY_S390X_HAVE_Z14 } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 mask) { const DFromM d; const RebindToUnsigned du; #if HWY_S390X_HAVE_Z14 const VFromD sign_bits = BitCast(du, VecFromMask(d, mask)); return ReduceSum(du, And(sign_bits, Dup128VecFromValues(du, 1, 2))); #else // VSX const Repartition du8; const VFromD sign_bits = BitCast(du8, VecFromMask(d, mask)); #if HWY_PPC_HAVE_10 && HWY_IS_LITTLE_ENDIAN return static_cast(vec_extractm(BitCast(du, sign_bits).raw)); #else (void)du; #if HWY_IS_LITTLE_ENDIAN const __vector unsigned char kBitShuffle = {64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; #else const __vector unsigned char kBitShuffle = {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 64, 0}; #endif return ExtractSignBits(sign_bits, kBitShuffle); #endif // HWY_PPC_HAVE_10 #endif // HWY_S390X_HAVE_Z14 } // Returns the lowest N of the mask bits. template constexpr uint64_t OnlyActive(uint64_t mask_bits) { return ((N * sizeof(T)) == 16) ? mask_bits : mask_bits & ((1ull << N) - 1); } template HWY_INLINE uint64_t BitsFromMask(Mask128 mask) { return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { // For vectors with 8 or fewer lanes, simply cast the result of BitsFromMask // to an uint8_t and store the result in bits[0]. bits[0] = static_cast(detail::BitsFromMask(mask)); return sizeof(uint8_t); } template HWY_API size_t StoreMaskBits(D /*d*/, MFromD mask, uint8_t* bits) { const auto mask_bits = detail::BitsFromMask(mask); // First convert mask_bits to a uint16_t as we only want to store // the lower 16 bits of mask_bits as there are 16 lanes in mask. // Converting mask_bits to a uint16_t first will also ensure that // the lower 16 bits of mask_bits are stored instead of the upper 16 bits // of mask_bits on big-endian PPC targets. #if HWY_IS_LITTLE_ENDIAN const uint16_t u16_mask_bits = static_cast(mask_bits); #else // On big-endian targets, the bytes of mask_bits need to be swapped // as StoreMaskBits expects the mask bits to be stored in little-endian // byte order. 
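// For example, mask_bits = 0x00A5 (lanes 0, 2, 5, 7 true) must be stored as
// bits[0] = 0xA5 and bits[1] = 0x00, which is what the byte swap ensures.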
// GCC will also optimize the byte swap and CopyBytes operations below // to a single sthbrx instruction when optimizations are enabled on // big-endian PPC targets #if HWY_HAS_BUILTIN(__builtin_bswap16) const uint16_t u16_mask_bits = __builtin_bswap16(static_cast(mask_bits)); #else const uint16_t u16_mask_bits = static_cast( (mask_bits << 8) | (static_cast(mask_bits) >> 8)); #endif #endif CopyBytes(&u16_mask_bits, bits); return sizeof(uint16_t); } // ------------------------------ Mask testing template HWY_API bool AllFalse(D d, MFromD mask) { const RebindToUnsigned du; return static_cast( vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Zero(du).raw)); } template HWY_API bool AllTrue(D d, MFromD mask) { const RebindToUnsigned du; using TU = TFromD; return static_cast(vec_all_eq(VecFromMask(du, RebindMask(du, mask)).raw, Set(du, hwy::LimitsMax()).raw)); } template HWY_API bool AllFalse(D d, MFromD mask) { const Full128> d_full; constexpr size_t kN = MaxLanes(d); return AllFalse(d_full, And(MFromD{mask.raw}, FirstN(d_full, kN))); } template HWY_API bool AllTrue(D d, MFromD mask) { const Full128> d_full; constexpr size_t kN = MaxLanes(d); return AllTrue( d_full, Or(MFromD{mask.raw}, Not(FirstN(d_full, kN)))); } template HWY_API size_t CountTrue(D /* tag */, MFromD mask) { return PopCount(detail::BitsFromMask(mask)); } #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) namespace detail { template static HWY_INLINE size_t VsxCntlzLsbb(V v) { #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \ HWY_IS_LITTLE_ENDIAN // Use inline assembly to work around bug in GCC 11 and earlier on // little-endian PPC9 int idx; __asm__("vctzlsbb %0,%1" : "=r"(idx) : "v"(v.raw)); return static_cast(idx); #else return static_cast(vec_cntlz_lsbb(v.raw)); #endif } template static HWY_INLINE size_t VsxCnttzLsbb(V v) { #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1200 && \ HWY_IS_LITTLE_ENDIAN // Use inline assembly to work around bug in GCC 11 and earlier on // little-endian PPC9 int idx; __asm__("vclzlsbb %0,%1" : "=r"(idx) : "v"(v.raw)); return static_cast(idx); #else return static_cast(vec_cnttz_lsbb(v.raw)); #endif } } // namespace detail #endif template > HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) { // For little-endian PPC10, BitsFromMask is already efficient. #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) if (detail::IsFull(d)) { const Repartition d8; const auto bytes = BitCast(d8, VecFromMask(d, mask)); return detail::VsxCntlzLsbb(bytes) / sizeof(T); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; return Num0BitsBelowLS1Bit_Nonzero64(detail::BitsFromMask(mask)); } template > HWY_API intptr_t FindFirstTrue(D d, MFromD mask) { // For little-endian PPC10, BitsFromMask is already efficient. #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) constexpr size_t kN = 16 / sizeof(T); if (detail::IsFull(d)) { const Repartition d8; const auto bytes = BitCast(d8, VecFromMask(d, mask)); const size_t idx = detail::VsxCntlzLsbb(bytes) / sizeof(T); return idx == kN ? -1 : static_cast(idx); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; const uint64_t mask_bits = detail::BitsFromMask(mask); return mask_bits ? intptr_t(Num0BitsBelowLS1Bit_Nonzero64(mask_bits)) : -1; } template > HWY_API size_t FindKnownLastTrue(D d, MFromD mask) { // For little-endian PPC10, BitsFromMask is already efficient. 
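// On PPC9, and on big-endian PPC10, a full-vector mask can be handled by
// counting the trailing all-false bytes: the last true lane is then
// 16 / sizeof(T) - 1 - (trailing false bytes) / sizeof(T), as below.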
#if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) if (detail::IsFull(d)) { const Repartition d8; const auto bytes = BitCast(d8, VecFromMask(d, mask)); const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T); return 16 / sizeof(T) - 1 - idx; } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; return 63 - Num0BitsAboveMS1Bit_Nonzero64(detail::BitsFromMask(mask)); } template > HWY_API intptr_t FindLastTrue(D d, MFromD mask) { // For little-endian PPC10, BitsFromMask is already efficient. #if HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) constexpr size_t kN = 16 / sizeof(T); if (detail::IsFull(d)) { const Repartition d8; const auto bytes = BitCast(d8, VecFromMask(d, mask)); const size_t idx = detail::VsxCnttzLsbb(bytes) / sizeof(T); return idx == kN ? -1 : static_cast(kN - 1 - idx); } #endif // HWY_PPC_HAVE_9 && (!HWY_PPC_HAVE_10 || HWY_IS_BIG_ENDIAN) (void)d; const uint64_t mask_bits = detail::BitsFromMask(mask); return mask_bits ? intptr_t(63 - Num0BitsAboveMS1Bit_Nonzero64(mask_bits)) : -1; } // ------------------------------ Compress, CompressBits namespace detail { #if HWY_PPC_HAVE_10 template HWY_INLINE VFromD CompressOrExpandIndicesFromMask(D d, MFromD mask) { constexpr unsigned kGenPcvmMode = (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u); // Inline assembly is used instead of the vec_genpcvm intrinsic to work around // compiler bugs on little-endian PPC10 typename detail::Raw128>::type idx; __asm__("xxgenpcvbm %x0, %1, %2" : "=wa"(idx) : "v"(mask.raw), "i"(kGenPcvmMode)); return VFromD{idx}; } template HWY_INLINE VFromD CompressOrExpandIndicesFromMask(D d, MFromD mask) { constexpr unsigned kGenPcvmMode = (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u); // Inline assembly is used instead of the vec_genpcvm intrinsic to work around // compiler bugs on little-endian PPC10 typename detail::Raw128>::type idx; __asm__("xxgenpcvhm %x0, %1, %2" : "=wa"(idx) : "v"(mask.raw), "i"(kGenPcvmMode)); return VFromD{idx}; } template HWY_INLINE VFromD CompressOrExpandIndicesFromMask(D d, MFromD mask) { constexpr unsigned kGenPcvmMode = (kIsCompress ? 1u : 0u) | (HWY_IS_LITTLE_ENDIAN ? 2u : 0u); // Inline assembly is used instead of the vec_genpcvm intrinsic to work around // compiler bugs on little-endian PPC10 typename detail::Raw128>::type idx; __asm__("xxgenpcvwm %x0, %1, %2" : "=wa"(idx) : "v"(mask.raw), "i"(kGenPcvmMode)); return VFromD{idx}; } #endif // Also works for N < 8 because the first 16 4-tuples only reference bytes 0-6. template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; // To reduce cache footprint, store lane indices and convert to byte indices // (2*lane + 0..1), with the doubling baked into the table. It's not clear // that the additional cost of unpacking nibbles is worthwhile. 
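// Example: mask_bits = 5 (lanes 0 and 2 kept) selects table row
// {0, 4, 2, 6, 8, 10, 12, 14}; ZipLower plus the pair increment expands each
// lane offset k into the byte indices {k, k + 1} used by TableLookupBytes.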
alignas(16) static constexpr uint8_t table[2048] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 8, 
10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 6, 
8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); constexpr uint16_t kPairIndexIncrement = HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; return BitCast(d, pairs + Set(du, kPairIndexIncrement)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Rebind d8; const Twice d8t; const RebindToUnsigned du; // To reduce cache footprint, store lane indices and convert to byte indices // (2*lane + 0..1), with the doubling baked into the table. It's not clear // that the additional cost of unpacking nibbles is worthwhile. alignas(16) static constexpr uint8_t table[2048] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 0, 12, 14, 
2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 
2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const VFromD byte_idx{Load(d8, table + mask_bits * 8).raw}; const VFromD pairs = ZipLower(byte_idx, byte_idx); constexpr uint16_t kPairIndexIncrement = HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001; return BitCast(d, pairs + Set(du, kPairIndexIncrement)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. 
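// Example: mask_bits = 5 (lanes 0 and 2 kept) selects the row
// {0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15}, i.e. the byte
// quadruples of the kept 32-bit lanes come first.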
alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompress32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[256] = { // PrintCompressNot32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompress64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE VFromD IndicesFromNotBits128(D d, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. 
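// For the Not variant, row mask_bits lists the lanes whose mask bit is clear
// first: e.g. mask_bits = 1 (lane 0 true) selects {8..15, 0..7}, moving
// lane 1 ahead of lane 0.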
alignas(16) static constexpr uint8_t u8_indices[64] = { // PrintCompressNot64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_API Vec128 CompressBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } template HWY_API Vec128 CompressNotBits(Vec128 v, uint64_t mask_bits) { const DFromV d; const RebindToUnsigned du; HWY_DASSERT(mask_bits < (1ull << N)); const auto indices = BitCast(du, detail::IndicesFromNotBits128(d, mask_bits)); return BitCast(d, TableLookupBytes(BitCast(du, v), indices)); } } // namespace detail // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskL, maskH); return IfVecThenElse(swap, Shuffle01(v), v); } #if HWY_PPC_HAVE_10 #ifdef HWY_NATIVE_COMPRESS8 #undef HWY_NATIVE_COMPRESS8 #else #define HWY_NATIVE_COMPRESS8 #endif // General case, 1 byte template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { const DFromV d; return TableLookupBytes( v, detail::CompressOrExpandIndicesFromMask(d, mask)); } #endif // General case, 2 or 4 bytes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { return detail::CompressBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressNot // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskH, maskL); return IfVecThenElse(swap, Shuffle01(v), v); } #if HWY_PPC_HAVE_10 // General case, 1 byte template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { const DFromV d; return TableLookupBytes( v, detail::CompressOrExpandIndicesFromMask(d, Not(mask))); } #endif // General case, 2 or 4 bytes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
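// With the upper bits cleared, the complement table would treat the missing
// lanes as mask=false and move them to the front, so for partial vectors we
// complement the mask itself and use the regular compress table instead.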
if (N < 16 / sizeof(T)) { return detail::CompressBits(v, detail::BitsFromMask(Not(mask))); } return detail::CompressNotBits(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } #if HWY_PPC_HAVE_10 template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { const DFromV d; return Compress(v, LoadMaskBits(d, bits)); } #endif template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { // As there are at most 8 lanes in v if sizeof(TFromD) > 1, simply // convert bits[0] to a uint64_t uint64_t mask_bits = bits[0]; if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::CompressBits(v, mask_bits); } // ------------------------------ CompressStore, CompressBitsStore #if HWY_PPC_HAVE_10 template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const size_t count = CountTrue(d, m); const auto indices = detail::CompressOrExpandIndicesFromMask(d, m); const auto compressed = TableLookupBytes(v, indices); StoreU(compressed, d, unaligned); return count; } #endif template HWY_API size_t CompressStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); return count; } #if HWY_PPC_HAVE_10 template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const size_t count = CountTrue(d, m); const auto indices = detail::CompressOrExpandIndicesFromMask(d, m); const auto compressed = TableLookupBytes(v, indices); StoreN(compressed, d, unaligned, count); return count; } #endif template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(m); HWY_DASSERT(mask_bits < (1ull << MaxLanes(d))); const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); #if (HWY_PPC_HAVE_9 && HWY_ARCH_PPC_64) || HWY_S390X_HAVE_Z14 StoreN(compressed, d, unaligned, count); #else BlendedStore(compressed, FirstN(d, count), d, unaligned); #endif return count; } #if HWY_PPC_HAVE_10 template HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, D d, TFromD* HWY_RESTRICT unaligned) { return CompressStore(v, LoadMaskBits(d, bits), d, unaligned); } #endif template HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // As there are at most 8 lanes in v if sizeof(TFromD) > 1, simply // convert bits[0] to a uint64_t uint64_t mask_bits = bits[0]; constexpr size_t kN = MaxLanes(d); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } const size_t count = PopCount(mask_bits); const auto indices = BitCast(du, detail::IndicesFromBits128(d, mask_bits)); const auto compressed = BitCast(d, TableLookupBytes(BitCast(du, v), indices)); StoreU(compressed, d, unaligned); return count; } // ------------------------------ Expand #if HWY_PPC_HAVE_10 #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define 
HWY_NATIVE_EXPAND #endif template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const auto idx = detail::CompressOrExpandIndicesFromMask(d, mask); return IfThenElseZero(mask, TableLookupBytes(v, idx)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { // Same as Compress, just zero out the mask=false lanes. return IfThenElseZero(mask, Compress(v, mask)); } // For single-element vectors, this is at least as fast as native. template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { return IfThenElseZero(mask, v); } template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { return Expand(LoadU(d, unaligned), mask); } #endif // HWY_PPC_HAVE_10 // ------------------------------ StoreInterleaved2/3/4 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in // generic_ops-inl.h. // ------------------------------ Additional mask logical operations namespace detail { #if HWY_IS_LITTLE_ENDIAN template HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { return v; } template HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) { return v; } #else template HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { const DFromV d; return Reverse8(d, v); } template HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { const DFromV d; return Reverse4(d, v); } template HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { const DFromV d; return Reverse2(d, v); } template HWY_INLINE V Per64BitBlkRevLanesOnBe(V v) { return v; } template HWY_INLINE V Per128BitBlkRevLanesOnBe(V v) { const DFromV d; return Reverse(d, v); } #endif template HWY_INLINE V I128Subtract(V a, V b) { #if HWY_S390X_HAVE_Z14 const V diff_i128{reinterpret_cast>::type>( vec_sub_u128(reinterpret_cast<__vector unsigned char>(a.raw), reinterpret_cast<__vector unsigned char>(b.raw)))}; #elif defined(__SIZEOF_INT128__) using VU128 = __vector unsigned __int128; const V diff_i128{reinterpret_cast>::type>( vec_sub(reinterpret_cast(a.raw), reinterpret_cast(b.raw)))}; #else const DFromV d; const Repartition du64; const auto u64_a = BitCast(du64, a); const auto u64_b = BitCast(du64, b); const auto diff_u64 = u64_a - u64_b; const auto borrow_u64 = VecFromMask(du64, u64_a < u64_b); #if HWY_IS_LITTLE_ENDIAN const auto borrow_u64_shifted = ShiftLeftBytes<8>(du64, borrow_u64); #else const auto borrow_u64_shifted = ShiftRightBytes<8>(du64, borrow_u64); #endif const auto diff_i128 = BitCast(d, diff_u64 + borrow_u64_shifted); #endif return diff_i128; } } // namespace detail template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const FixedTag d; const auto vmask = VecFromMask(d, mask); return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Simd d; const Full64 d_full64; const auto vmask = VecFromMask(d, mask); const auto vmask_le64 = BitCast(Full64(), detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask))); const auto neg_vmask_le64 = Neg(vmask_le64); const auto neg_vmask = ResizeBitCast( d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64))); return MaskFromVec(Or(vmask, neg_vmask)); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Full128 d; auto vmask = VecFromMask(d, mask); const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask); const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128); const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128); return MaskFromVec(BitCast(d, Or(vmask, neg_vmask))); } template 
HWY_API Mask128 SetBeforeFirst(Mask128 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const FixedTag d; const RebindToSigned di; const auto vmask = BitCast(di, VecFromMask(d, mask)); const auto zero = Zero(di); const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); return MaskFromVec(BitCast(d, And(vmask, vmask2))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Simd d; const Full64 d_full64; const RebindToSigned di; const auto vmask = VecFromMask(d, mask); const auto vmask_le64 = BitCast(Full64(), detail::Per64BitBlkRevLanesOnBe(ResizeBitCast(d_full64, vmask))); const auto neg_vmask_le64 = Neg(vmask_le64); const auto neg_vmask = ResizeBitCast( d, detail::Per64BitBlkRevLanesOnBe(BitCast(d_full64, neg_vmask_le64))); const auto first_vmask = BitCast(di, And(vmask, neg_vmask)); return MaskFromVec(BitCast(d, Or(first_vmask, Neg(first_vmask)))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Full128 d; const RebindToSigned di; const auto vmask = VecFromMask(d, mask); const auto vmask_le128 = detail::Per128BitBlkRevLanesOnBe(vmask); const auto neg_vmask_le128 = detail::I128Subtract(Zero(d), vmask_le128); const auto neg_vmask = detail::Per128BitBlkRevLanesOnBe(neg_vmask_le128); return MaskFromVec(BitCast(d, Neg(BitCast(di, And(vmask, neg_vmask))))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { const FixedTag d; const RebindToSigned di; using TI = MakeSigned; return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { const Simd d; return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); } // ------------------------------ SumsOf2 and SumsOf4 namespace detail { #if !HWY_S390X_HAVE_Z14 // Casts nominally int32_t result to D. template HWY_INLINE VFromD AltivecVsum4sbs(D d, __vector signed char a, __vector signed int b) { const Repartition di32; #ifdef __OPTIMIZE__ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { const int64_t sum0 = static_cast(a[0]) + static_cast(a[1]) + static_cast(a[2]) + static_cast(a[3]) + static_cast(b[0]); const int64_t sum1 = static_cast(a[4]) + static_cast(a[5]) + static_cast(a[6]) + static_cast(a[7]) + static_cast(b[1]); const int64_t sum2 = static_cast(a[8]) + static_cast(a[9]) + static_cast(a[10]) + static_cast(a[11]) + static_cast(b[2]); const int64_t sum3 = static_cast(a[12]) + static_cast(a[13]) + static_cast(a[14]) + static_cast(a[15]) + static_cast(b[3]); const int32_t sign0 = static_cast(sum0 >> 63); const int32_t sign1 = static_cast(sum1 >> 63); const int32_t sign2 = static_cast(sum2 >> 63); const int32_t sign3 = static_cast(sum3 >> 63); using Raw = typename detail::Raw128::type; return BitCast( d, VFromD{Raw{ (sign0 == (sum0 >> 31)) ? static_cast(sum0) : static_cast(sign0 ^ 0x7FFFFFFF), (sign1 == (sum1 >> 31)) ? static_cast(sum1) : static_cast(sign1 ^ 0x7FFFFFFF), (sign2 == (sum2 >> 31)) ? static_cast(sum2) : static_cast(sign2 ^ 0x7FFFFFFF), (sign3 == (sum3 >> 31)) ? static_cast(sum3) : static_cast(sign3 ^ 0x7FFFFFFF)}}); } else // NOLINT #endif { return BitCast(d, VFromD{vec_vsum4sbs(a, b)}); } } // Casts nominally uint32_t result to D. 
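// vec_vsum4ubs adds each group of four bytes of a to the corresponding
// 32-bit lane of b with unsigned saturation; the __OPTIMIZE__ branch below
// reproduces that arithmetic so fully-constant arguments fold at compile time.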
template HWY_INLINE VFromD AltivecVsum4ubs(D d, __vector unsigned char a, __vector unsigned int b) { const Repartition du32; #ifdef __OPTIMIZE__ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { const uint64_t sum0 = static_cast(a[0]) + static_cast(a[1]) + static_cast(a[2]) + static_cast(a[3]) + static_cast(b[0]); const uint64_t sum1 = static_cast(a[4]) + static_cast(a[5]) + static_cast(a[6]) + static_cast(a[7]) + static_cast(b[1]); const uint64_t sum2 = static_cast(a[8]) + static_cast(a[9]) + static_cast(a[10]) + static_cast(a[11]) + static_cast(b[2]); const uint64_t sum3 = static_cast(a[12]) + static_cast(a[13]) + static_cast(a[14]) + static_cast(a[15]) + static_cast(b[3]); return BitCast( d, VFromD{(__vector unsigned int){ static_cast(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu), static_cast(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu), static_cast(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu), static_cast(sum3 <= 0xFFFFFFFFu ? sum3 : 0xFFFFFFFFu)}}); } else // NOLINT #endif { return BitCast(d, VFromD{vec_vsum4ubs(a, b)}); } } // Casts nominally int32_t result to D. template HWY_INLINE VFromD AltivecVsum2sws(D d, __vector signed int a, __vector signed int b) { const Repartition di32; #ifdef __OPTIMIZE__ const Repartition du64; constexpr int kDestLaneOffset = HWY_IS_BIG_ENDIAN; if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset]) && __builtin_constant_p(b[kDestLaneOffset + 2])) { const int64_t sum0 = static_cast(a[0]) + static_cast(a[1]) + static_cast(b[kDestLaneOffset]); const int64_t sum1 = static_cast(a[2]) + static_cast(a[3]) + static_cast(b[kDestLaneOffset + 2]); const int32_t sign0 = static_cast(sum0 >> 63); const int32_t sign1 = static_cast(sum1 >> 63); return BitCast(d, VFromD{(__vector unsigned long long){ (sign0 == (sum0 >> 31)) ? static_cast(sum0) : static_cast(sign0 ^ 0x7FFFFFFF), (sign1 == (sum1 >> 31)) ? static_cast(sum1) : static_cast(sign1 ^ 0x7FFFFFFF)}}); } else // NOLINT #endif { __vector signed int sum; // Inline assembly is used for vsum2sws to avoid unnecessary shuffling // on little-endian PowerPC targets as the result of the vsum2sws // instruction will already be in the correct lanes on little-endian // PowerPC targets. __asm__("vsum2sws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); return BitCast(d, VFromD{sum}); } } // Casts nominally int32_t result to D. template HWY_INLINE VFromD AltivecVsum4shs(D d, __vector signed short a, __vector signed int b) { const Repartition di32; #ifdef __OPTIMIZE__ if (IsConstantRawAltivecVect(a) && IsConstantRawAltivecVect(b)) { const int64_t sum0 = static_cast(a[0]) + static_cast(a[1]) + static_cast(b[0]); const int64_t sum1 = static_cast(a[2]) + static_cast(a[3]) + static_cast(b[1]); const int64_t sum2 = static_cast(a[4]) + static_cast(a[5]) + static_cast(b[2]); const int64_t sum3 = static_cast(a[6]) + static_cast(a[7]) + static_cast(b[3]); const int32_t sign0 = static_cast(sum0 >> 63); const int32_t sign1 = static_cast(sum1 >> 63); const int32_t sign2 = static_cast(sum2 >> 63); const int32_t sign3 = static_cast(sum3 >> 63); using Raw = typename detail::Raw128::type; return BitCast( d, VFromD{Raw{ (sign0 == (sum0 >> 31)) ? static_cast(sum0) : static_cast(sign0 ^ 0x7FFFFFFF), (sign1 == (sum1 >> 31)) ? static_cast(sum1) : static_cast(sign1 ^ 0x7FFFFFFF), (sign2 == (sum2 >> 31)) ? static_cast(sum2) : static_cast(sign2 ^ 0x7FFFFFFF), (sign3 == (sum3 >> 31)) ? 
static_cast(sum3) : static_cast(sign3 ^ 0x7FFFFFFF)}}); } else // NOLINT #endif { return BitCast(d, VFromD{vec_vsum4shs(a, b)}); } } // Casts nominally int32_t result to D. template HWY_INLINE VFromD AltivecVsumsws(D d, __vector signed int a, __vector signed int b) { const Repartition di32; #ifdef __OPTIMIZE__ constexpr int kDestLaneOffset = HWY_IS_LITTLE_ENDIAN ? 0 : 3; if (IsConstantRawAltivecVect(a) && __builtin_constant_p(b[kDestLaneOffset])) { const int64_t sum = static_cast(a[0]) + static_cast(a[1]) + static_cast(a[2]) + static_cast(a[3]) + static_cast(b[kDestLaneOffset]); const int32_t sign = static_cast(sum >> 63); #if HWY_IS_LITTLE_ENDIAN return BitCast( d, VFromD{(__vector signed int){ (sign == (sum >> 31)) ? static_cast(sum) : static_cast(sign ^ 0x7FFFFFFF), 0, 0, 0}}); #else return BitCast(d, VFromD{(__vector signed int){ 0, 0, 0, (sign == (sum >> 31)) ? static_cast(sum) : static_cast(sign ^ 0x7FFFFFFF)}}); #endif } else // NOLINT #endif { __vector signed int sum; // Inline assembly is used for vsumsws to avoid unnecessary shuffling // on little-endian PowerPC targets as the result of the vsumsws // instruction will already be in the correct lanes on little-endian // PowerPC targets. __asm__("vsumsws %0,%1,%2" : "=v"(sum) : "v"(a), "v"(b)); return BitCast(d, VFromD{sum}); } } template HWY_INLINE Vec128 AltivecU16SumsOf2(Vec128 v) { const RebindToSigned> di16; const RepartitionToWide di32; return AltivecVsum4shs(di32, Xor(BitCast(di16, v), Set(di16, -32768)).raw, Set(di32, 65536).raw); } #endif // !HWY_S390X_HAVE_Z14 // U16->U32 SumsOf2 template HWY_INLINE VFromD>> SumsOf2( hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; #if HWY_S390X_HAVE_Z14 return VFromD{vec_sum4(v.raw, Zero(d).raw)}; #else return BitCast(dw, AltivecU16SumsOf2(v)); #endif } // I16->I32 SumsOf2 template HWY_INLINE VFromD>> SumsOf2( hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du; return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), BitCast(du, Xor(v, SignBit(d))))) + Set(dw, int32_t{-65536}); #else return AltivecVsum4shs(dw, v.raw, Zero(dw).raw); #endif } #if HWY_S390X_HAVE_Z14 // U32->U64 SumsOf2 template HWY_INLINE VFromD>> SumsOf2( hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; return VFromD{vec_sum2(v.raw, Zero(d).raw)}; } // I32->I64 SumsOf2 template HWY_INLINE VFromD>> SumsOf2( hwy::SignedTag /*type_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; const RebindToUnsigned du; return BitCast(dw, SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(), BitCast(du, Xor(v, SignBit(d))))) + Set(dw, int64_t{-4294967296LL}); } #endif // U8->U32 SumsOf4 template HWY_INLINE VFromD>> SumsOf4( hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWideX2 dw2; #if HWY_S390X_HAVE_Z14 return VFromD{vec_sum4(v.raw, Zero(d).raw)}; #else return AltivecVsum4ubs(dw2, v.raw, Zero(dw2).raw); #endif } // I8->I32 SumsOf4 template HWY_INLINE VFromD>> SumsOf4( hwy::SignedTag /*type_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWideX2 dw2; #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du; return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), BitCast(du, Xor(v, SignBit(d))))) + Set(dw2, int32_t{-512}); #else return 
AltivecVsum4sbs(dw2, v.raw, Zero(dw2).raw); #endif } // U16->U64 SumsOf4 template HWY_INLINE VFromD>> SumsOf4( hwy::UnsignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; const RepartitionToWide dw2; #if HWY_S390X_HAVE_Z14 return VFromD{vec_sum2(v.raw, Zero(d).raw)}; #else const RebindToSigned dw_i; return AltivecVsum2sws(dw2, BitCast(dw_i, SumsOf2(v)).raw, Zero(dw_i).raw); #endif } // I16->I64 SumsOf4 template HWY_INLINE VFromD>> SumsOf4( hwy::SignedTag /*type_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; const RepartitionToWide dw2; #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du; return BitCast(dw2, SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), BitCast(du, Xor(v, SignBit(d))))) + Set(dw2, int64_t{-131072}); #else // VSX const auto sums_of_4_in_lo32 = AltivecVsum2sws(dw, SumsOf2(v).raw, Zero(dw).raw); #if HWY_IS_LITTLE_ENDIAN return PromoteEvenTo(dw2, sums_of_4_in_lo32); #else return PromoteOddTo(dw2, sums_of_4_in_lo32); #endif // HWY_IS_LITTLE_ENDIAN #endif // HWY_S390X_HAVE_Z14 } } // namespace detail // ------------------------------ SumOfLanes // We define SumOfLanes for 8/16-bit types (and I32/U32/I64/U64 on Z14/Z15/Z16); // enable generic for the rest. #undef HWY_IF_SUM_OF_LANES_D #if HWY_S390X_HAVE_Z14 #define HWY_IF_SUM_OF_LANES_D(D) HWY_IF_LANES_GT_D(D, 1), HWY_IF_FLOAT3264_D(D) #else #define HWY_IF_SUM_OF_LANES_D(D) \ HWY_IF_LANES_GT_D(D, 1), HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8)) #endif #if HWY_S390X_HAVE_Z14 namespace detail { template HWY_INLINE Vec128 SumOfU32OrU64LanesAsU128(Vec128 v) { const DFromV d; const RebindToUnsigned du; return BitCast( d, Vec128{vec_sum_u128(BitCast(du, v).raw, Zero(du).raw)}); } } // namespace detail template HWY_API VFromD SumOfLanes(D /*d64*/, VFromD v) { return Broadcast<1>(detail::SumOfU32OrU64LanesAsU128(v)); } #endif template HWY_API Vec32 SumOfLanes(D du16, Vec32 v) { constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; return Broadcast( BitCast(du16, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<2>(), v))); } template HWY_API Vec64 SumOfLanes(D du16, Vec64 v) { constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; return Broadcast( BitCast(du16, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<2>(), v))); } template HWY_API Vec128 SumOfLanes(D du16, Vec128 v) { constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; #if HWY_S390X_HAVE_Z14 return Broadcast( BitCast(du16, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4( hwy::UnsignedTag(), hwy::SizeTag<2>(), v)))); #else // VSX const auto zero = Zero(Full128()); return Broadcast( detail::AltivecVsumsws(du16, detail::AltivecU16SumsOf2(v).raw, zero.raw)); #endif } template HWY_API Vec32 SumOfLanes(D di16, Vec32 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du16; return BitCast(di16, SumOfLanes(du16, BitCast(du16, v))); #else constexpr int kSumLaneIdx = HWY_IS_BIG_ENDIAN; return Broadcast( BitCast(di16, detail::SumsOf2(hwy::SignedTag(), hwy::SizeTag<2>(), v))); #endif } template HWY_API Vec64 SumOfLanes(D di16, Vec64 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du16; return BitCast(di16, SumOfLanes(du16, BitCast(du16, v))); #else constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 
0 : 3; return Broadcast( BitCast(di16, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<2>(), v))); #endif } template HWY_API Vec128 SumOfLanes(D di16, Vec128 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du16; return BitCast(di16, SumOfLanes(du16, BitCast(du16, v))); #else constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; const Full128 di32; const auto zero = Zero(di32); return Broadcast(detail::AltivecVsumsws( di16, detail::AltivecVsum4shs(di32, v.raw, zero.raw).raw, zero.raw)); #endif } template HWY_API Vec32 SumOfLanes(D du8, Vec32 v) { constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; return Broadcast( BitCast(du8, detail::SumsOf4(hwy::UnsignedTag(), hwy::SizeTag<1>(), v))); } template HWY_API Vec16 SumOfLanes(D du8, Vec16 v) { const Twice dt_u8; return LowerHalf(du8, SumOfLanes(dt_u8, Combine(dt_u8, Zero(du8), v))); } template HWY_API Vec64 SumOfLanes(D du8, Vec64 v) { constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; return Broadcast(BitCast(du8, SumsOf8(v))); } template HWY_API Vec128 SumOfLanes(D du8, Vec128 v) { constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15; #if HWY_S390X_HAVE_Z14 return Broadcast( BitCast(du8, detail::SumOfU32OrU64LanesAsU128(detail::SumsOf4( hwy::UnsignedTag(), hwy::SizeTag<1>(), v)))); #else const Full128 du32; const RebindToSigned di32; const Vec128 zero = Zero(du32); return Broadcast(detail::AltivecVsumsws( du8, detail::AltivecVsum4ubs(di32, v.raw, zero.raw).raw, BitCast(di32, zero).raw)); #endif } template HWY_API Vec32 SumOfLanes(D di8, Vec32 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du8; return BitCast(di8, SumOfLanes(du8, BitCast(du8, v))); #else constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 3; return Broadcast( BitCast(di8, detail::SumsOf4(hwy::SignedTag(), hwy::SizeTag<1>(), v))); #endif } template HWY_API Vec16 SumOfLanes(D di8, Vec16 v) { const Twice dt_i8; return LowerHalf(di8, SumOfLanes(dt_i8, Combine(dt_i8, Zero(di8), v))); } template HWY_API Vec64 SumOfLanes(D di8, Vec64 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du8; return BitCast(di8, SumOfLanes(du8, BitCast(du8, v))); #else constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 7; return Broadcast(BitCast(di8, SumsOf8(v))); #endif } template HWY_API Vec128 SumOfLanes(D di8, Vec128 v) { #if HWY_S390X_HAVE_Z14 const RebindToUnsigned du8; return BitCast(di8, SumOfLanes(du8, BitCast(du8, v))); #else constexpr int kSumLaneIdx = HWY_IS_LITTLE_ENDIAN ? 0 : 15; const Full128 di32; const Vec128 zero = Zero(di32); return Broadcast(detail::AltivecVsumsws( di8, detail::AltivecVsum4sbs(di32, v.raw, zero.raw).raw, zero.raw)); #endif } #if HWY_S390X_HAVE_Z14 template HWY_API VFromD SumOfLanes(D d32, VFromD v) { const RebindToUnsigned du32; return Broadcast<1>( BitCast(d32, detail::SumsOf2(hwy::UnsignedTag(), hwy::SizeTag<4>(), BitCast(du32, v)))); } template HWY_API VFromD SumOfLanes(D /*d32*/, VFromD v) { return Broadcast<3>(detail::SumOfU32OrU64LanesAsU128(v)); } #endif // generic_ops defines MinOfLanes and MaxOfLanes. // ------------------------------ ReduceSum for N=4 I8/U8 // GetLane(SumsOf4(v)) is more efficient on PPC/Z14 than the default N=4 // I8/U8 ReduceSum implementation in generic_ops-inl.h #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 #undef HWY_NATIVE_REDUCE_SUM_4_UI8 #else #define HWY_NATIVE_REDUCE_SUM_4_UI8 #endif template HWY_API TFromD ReduceSum(D /*d*/, VFromD v) { return static_cast>(GetLane(SumsOf4(v))); } // ------------------------------ Lt128 namespace detail { // Returns vector-mask for Lt128. 
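// (Illustrative note, not part of the original sources.) The generic VSX path
// below derives the 128-bit unsigned comparison from two 64-bit lane compares:
// with eqHL = Eq(a, b) and ltHL = Lt(a, b), a block is less-than iff
// aH < bH, or aH == bH and aL < bL, i.e. cH | (=H & cL) from the truth table
// above. A worked example, assuming the upper half lives in the higher lane:
//
//   a = {lo: 5, hi: 7}, b = {lo: 3, hi: 7}
//   eqHL = {5 == 3, 7 == 7} = {false, true}
//   ltHL = {5 <  3, 7 <  7} = {false, false}
//   ltLX = ShiftLeftLanes<1>(ltHL) = {0, false}    // upper lane <- old lower
//   vecHx upper lane = eqHL ? ltLX : ltHL = false  // highs equal, lows not <
//
// InterleaveUpper then broadcasts that upper lane to the whole block, so Lt128
// is all-false: 0x...0007_...0005 is not below 0x...0007_...0003, as expected.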
template > HWY_INLINE V Lt128Vec(D d, V a, V b) { static_assert(IsSame, uint64_t>(), "D must be u64"); #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) (void)d; using VU64 = __vector unsigned long long; using VU128 = __vector unsigned __int128; #if HWY_IS_LITTLE_ENDIAN const VU128 a_u128 = reinterpret_cast(a.raw); const VU128 b_u128 = reinterpret_cast(b.raw); #else // NOTE: Need to swap the halves of both a and b on big-endian targets // as the upper 64 bits of a and b are in lane 1 and the lower 64 bits // of a and b are in lane 0 whereas the vec_cmplt operation below expects // the upper 64 bits in lane 0 and the lower 64 bits in lane 1 on // big-endian PPC targets. const VU128 a_u128 = reinterpret_cast(vec_sld(a.raw, a.raw, 8)); const VU128 b_u128 = reinterpret_cast(vec_sld(b.raw, b.raw, 8)); #endif return V{reinterpret_cast(vec_cmplt(a_u128, b_u128))}; #else // !HWY_PPC_HAVE_10 // Truth table of Eq and Lt for Hi and Lo u64. // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const auto eqHL = Eq(a, b); const V ltHL = VecFromMask(d, Lt(a, b)); const V ltLX = ShiftLeftLanes<1>(ltHL); const V vecHx = IfThenElse(eqHL, ltLX, ltHL); return InterleaveUpper(d, vecHx, vecHx); #endif } // Returns vector-mask for Eq128. template > HWY_INLINE V Eq128Vec(D d, V a, V b) { static_assert(IsSame, uint64_t>(), "D must be u64"); #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) (void)d; using VU64 = __vector unsigned long long; using VU128 = __vector unsigned __int128; return V{reinterpret_cast(vec_cmpeq(reinterpret_cast(a.raw), reinterpret_cast(b.raw)))}; #else const auto eqHL = VecFromMask(d, Eq(a, b)); const auto eqLH = Reverse2(d, eqHL); return And(eqHL, eqLH); #endif } template > HWY_INLINE V Ne128Vec(D d, V a, V b) { static_assert(IsSame, uint64_t>(), "D must be u64"); #if HWY_PPC_HAVE_10 && defined(__SIZEOF_INT128__) (void)d; using VU64 = __vector unsigned long long; using VU128 = __vector unsigned __int128; return V{reinterpret_cast(vec_cmpne(reinterpret_cast(a.raw), reinterpret_cast(b.raw)))}; #else const auto neHL = VecFromMask(d, Ne(a, b)); const auto neLH = Reverse2(d, neHL); return Or(neHL, neLH); #endif } template > HWY_INLINE V Lt128UpperVec(D d, V a, V b) { const V ltHL = VecFromMask(d, Lt(a, b)); return InterleaveUpper(d, ltHL, ltHL); } template > HWY_INLINE V Eq128UpperVec(D d, V a, V b) { const V eqHL = VecFromMask(d, Eq(a, b)); return InterleaveUpper(d, eqHL, eqHL); } template > HWY_INLINE V Ne128UpperVec(D d, V a, V b) { const V neHL = VecFromMask(d, Ne(a, b)); return InterleaveUpper(d, neHL, neHL); } } // namespace detail template > HWY_API MFromD Lt128(D d, V a, V b) { return MaskFromVec(detail::Lt128Vec(d, a, b)); } template > HWY_API MFromD Eq128(D d, V a, V b) { return MaskFromVec(detail::Eq128Vec(d, a, b)); } template > HWY_API MFromD Ne128(D d, V a, V b) { return MaskFromVec(detail::Ne128Vec(d, a, b)); } template > HWY_API MFromD Lt128Upper(D d, V a, V b) { return MaskFromVec(detail::Lt128UpperVec(d, a, b)); } template > HWY_API MFromD Eq128Upper(D d, V a, V b) { return MaskFromVec(detail::Eq128UpperVec(d, a, b)); } template > HWY_API MFromD Ne128Upper(D d, V a, V b) { return MaskFromVec(detail::Ne128UpperVec(d, a, b)); } // ------------------------------ Min128, Max128 (Lt128) // Avoids the extra MaskFromVec in Lt128. 
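// (Illustrative note, not part of the original sources.) Min128/Max128 order
// each 128-bit block as one unsigned integer, while Min128Upper/Max128Upper
// order blocks by the upper (higher-index) u64 lane only and return the whole
// block of the winner. A hedged sketch of the latter for {key, value} pairs
// stored with the value in the lower lane and the key in the upper lane:
//
//   a = {value: 10, key: 3}, b = {value: 20, key: 2}
//   Min128Upper(d, a, b) == b, i.e. {20, 2}: the pair with the smaller key.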
template <class D, class V = VFromD<D>>
HWY_API V Min128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128Vec(d, b, a), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Min128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, a, b), a, b);
}

template <class D, class V = VFromD<D>>
HWY_API V Max128Upper(D d, const V a, const V b) {
  return IfVecThenElse(detail::Lt128UpperVec(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V LeadingZeroCount(V v) {
#if HWY_S390X_HAVE_Z14
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;

#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Workaround for a GCC compiler bug in vec_cntlz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif

  return BitCast(d, VFromD<decltype(du)>{vec_cntlz(BitCast(du, v).raw)});
#else
  return V{vec_cntlz(v.raw)};
#endif
}

template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<V> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

#if HWY_PPC_HAVE_9 || HWY_S390X_HAVE_Z14
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return V{vec_vctz(v.raw)};
#else
#if HWY_S390X_HAVE_Z14
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;

#if HWY_COMPILER_GCC_ACTUAL && defined(__OPTIMIZE__)
  // Workaround for a GCC compiler bug in vec_cnttz on Z14/Z15 if v[i] is a
  // constant
  __asm__("" : "+v"(v.raw));
#endif

  return BitCast(d, VFromD<decltype(du)>{vec_cnttz(BitCast(du, v).raw)});
#else
  return V{vec_cnttz(v.raw)};
#endif  // HWY_S390X_HAVE_Z14
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}
#else
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<V> d;
  const RebindToSigned<decltype(d)> di;
  using TI = TFromD<decltype(di)>;

  const auto vi = BitCast(di, v);
  const auto lowest_bit = And(vi, Neg(vi));
  constexpr TI kNumOfBitsInT{sizeof(TI) * 8};
  const auto bit_idx = HighestSetBitIndex(lowest_bit);
  return BitCast(d, IfThenElse(MaskFromVec(BroadcastSignBit(bit_idx)),
                               Set(di, kNumOfBitsInT), bit_idx));
}
#endif

#undef HWY_PPC_HAVE_9
#undef HWY_PPC_HAVE_10
#undef HWY_S390X_HAVE_Z14
#undef HWY_S390X_HAVE_Z15

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
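// ------------------------------ Illustrative usage sketch (not compiled)

// Not part of the original header: a minimal, hedged example of how the
// 128-bit comparison/selection ops above are typically used from application
// code via hwy/highway.h with static dispatch. Guarded out so it never affects
// this header; the function name Demo128BitOps is hypothetical.
#if 0
#include <cstdint>
#include <cstdio>

#include "hwy/highway.h"

void Demo128BitOps() {
  namespace hn = hwy::HWY_NAMESPACE;
  const hn::Full128<uint64_t> d;
  // Lane 0 holds the low 64 bits, lane 1 the high 64 bits of each block.
  HWY_ALIGN uint64_t lanes_a[2] = {0xFFFFFFFFFFFFFFFFull, 1ull};  // 0x1_FFF...
  HWY_ALIGN uint64_t lanes_b[2] = {0ull, 2ull};                   // 0x2_000...
  const auto a = hn::Load(d, lanes_a);
  const auto b = hn::Load(d, lanes_b);
  const bool a_lt_b = hn::AllTrue(d, hn::Lt128(d, a, b));  // true
  const auto min = hn::Min128(d, a, b);                    // == a
  std::printf("a<b=%d min_lo=%llu\n", static_cast<int>(a_lt_b),
              static_cast<unsigned long long>(hn::GetLane(min)));
}
#endif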