// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Single-element vectors and operations.
// External include guard in highway.h - see comment there.

#include <stdint.h>

#ifndef HWY_NO_LIBCXX
#include <math.h>  // sqrtf
#endif

#include "hwy/ops/shared-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Single instruction, single data.
template <typename T>
using Sisd = Simd<T, 1, 0>;

// (Wrapper class required for overloading comparison operators.)
template <typename T>
struct Vec1 {
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = 1;  // only for DFromV

  HWY_INLINE Vec1() = default;
  Vec1(const Vec1&) = default;
  Vec1& operator=(const Vec1&) = default;
  HWY_INLINE explicit Vec1(const T t) : raw(t) {}

  HWY_INLINE Vec1& operator*=(const Vec1 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec1& operator/=(const Vec1 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec1& operator+=(const Vec1 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec1& operator-=(const Vec1 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec1& operator%=(const Vec1 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec1& operator&=(const Vec1 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec1& operator|=(const Vec1 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec1& operator^=(const Vec1 other) {
    return *this = (*this ^ other);
  }

  T raw;
};
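// Example (illustrative, not part of the original source): Vec1<float> wraps a
// single float lane and Sisd<float> is the matching descriptor, so
// Set(Sisd<float>(), 2.0f) (defined below) yields Vec1<float>(2.0f).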
// 0 or FF..FF, same size as Vec1.
template <typename T>
class Mask1 {
  using Raw = hwy::MakeUnsigned<T>;

 public:
  static HWY_INLINE Mask1<T> FromBool(bool b) {
    Mask1<T> mask;
    mask.bits = b ? static_cast<Raw>(~Raw{0}) : 0;
    return mask;
  }

  Raw bits;
};

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// ------------------------------ BitCast

template <class DTo, typename TTo = TFromD<DTo>, typename TFrom>
HWY_API Vec1<TTo> BitCast(DTo /* tag */, Vec1<TFrom> v) {
  static_assert(sizeof(TTo) <= sizeof(TFrom), "Promoting is undefined");
  TTo to;
  CopyBytes<sizeof(TTo)>(&v.raw, &to);  // not same size - ok to shrink
  return Vec1<TTo>(to);
}

// ------------------------------ Zero

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Zero(D /* tag */) {
  return Vec1<T>(ConvertScalarTo<T>(0));
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Tuple (VFromD)
#include "hwy/ops/tuple-inl.h"

// ------------------------------ Set

template <class D, typename T = TFromD<D>, typename T2>
HWY_API Vec1<T> Set(D /* tag */, const T2 t) {
  return Vec1<T>(static_cast<T>(t));
}

// ------------------------------ Undefined

template <class D, typename T = TFromD<D>>
HWY_API Vec1<T> Undefined(D d) {
  return Zero(d);
}

// ------------------------------ Iota

template <class D, typename T = TFromD<D>, typename T2>
HWY_API Vec1<T> Iota(const D /* tag */, const T2 first) {
  return Vec1<T>(static_cast<T>(first));
}

// ------------------------------ ResizeBitCast

template <class D, class FromV>
HWY_API VFromD<D> ResizeBitCast(D /* tag */, FromV v) {
  using TFrom = TFromV<FromV>;
  using TTo = TFromD<D>;
  constexpr size_t kCopyLen = HWY_MIN(sizeof(TFrom), sizeof(TTo));
  TTo to{};
  CopyBytes<kCopyLen>(&v.raw, &to);
  return VFromD<D>(to);
}

namespace detail {

// ResizeBitCast on the HWY_SCALAR target has zero-extending semantics if
// sizeof(TFromD<DTo>) is greater than sizeof(TFromV<VFromD<DFrom>>)
template <class FromSizeTag, class ToSizeTag, class DTo, class DFrom>
HWY_INLINE VFromD<DTo> ZeroExtendResizeBitCast(FromSizeTag /* from_size_tag */,
                                               ToSizeTag /* to_size_tag */,
                                               DTo d_to, DFrom /*d_from*/,
                                               VFromD<DFrom> v) {
  return ResizeBitCast(d_to, v);
}

}  // namespace detail

// ------------------------------ Dup128VecFromValues

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/,
                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
  return VFromD<D>(t0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
  return VFromD<D>(t0);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
  return VFromD<D>(t0);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D /*d*/, TFromD<D> t0, TFromD<D> /*t1*/) {
  return VFromD<D>(t0);
}

// ================================================== LOGICAL

// ------------------------------ Not

template <typename T>
HWY_API Vec1<T> Not(const Vec1<T> v) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, v).raw)));
}

// ------------------------------ And

template <typename T>
HWY_API Vec1<T> And(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw & BitCast(du, b).raw));
}
template <typename T>
HWY_API Vec1<T> operator&(const Vec1<T> a, const Vec1<T> b) {
  return And(a, b);
}

// ------------------------------ AndNot

template <typename T>
HWY_API Vec1<T> AndNot(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(static_cast<TU>(~BitCast(du, a).raw &
                                                     BitCast(du, b).raw)));
}

// ------------------------------ Or

template <typename T>
HWY_API Vec1<T> Or(const Vec1<T> a, const Vec1<T> b) {
  using TU = MakeUnsigned<T>;
  const Sisd<TU> du;
  return BitCast(Sisd<T>(), Vec1<TU>(BitCast(du, a).raw | BitCast(du, b).raw));
}
template <typename T>
HWY_API Vec1<T> operator|(const Vec1<T> a, const Vec1<T> b) {
  return Or(a, b);
}

// ------------------------------ Xor

template <typename T>
HWY_API Vec1 Xor(const Vec1 a, const Vec1 b) { using TU = MakeUnsigned; const Sisd du; return BitCast(Sisd(), Vec1(BitCast(du, a).raw ^ BitCast(du, b).raw)); } template HWY_API Vec1 operator^(const Vec1 a, const Vec1 b) { return Xor(a, b); } // ------------------------------ Xor3 template HWY_API Vec1 Xor3(Vec1 x1, Vec1 x2, Vec1 x3) { return Xor(x1, Xor(x2, x3)); } // ------------------------------ Or3 template HWY_API Vec1 Or3(Vec1 o1, Vec1 o2, Vec1 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec1 OrAnd(const Vec1 o, const Vec1 a1, const Vec1 a2) { return Or(o, And(a1, a2)); } // ------------------------------ Mask template , typename TFrom> HWY_API Mask1 RebindMask(DTo /*tag*/, Mask1 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask1{m.bits}; } // v must be 0 or FF..FF. template HWY_API Mask1 MaskFromVec(const Vec1 v) { Mask1 mask; CopySameSize(&v, &mask); return mask; } template using MFromD = decltype(MaskFromVec(VFromD())); template Vec1 VecFromMask(const Mask1 mask) { Vec1 v; CopySameSize(&mask, &v); return v; } template > Vec1 VecFromMask(D /* tag */, const Mask1 mask) { Vec1 v; CopySameSize(&mask, &v); return v; } template > HWY_API Mask1 FirstN(D /*tag*/, size_t n) { return Mask1::FromBool(n != 0); } // ------------------------------ IfVecThenElse template HWY_API Vec1 IfVecThenElse(Vec1 mask, Vec1 yes, Vec1 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ CopySign template HWY_API Vec1 CopySign(const Vec1 magn, const Vec1 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return BitwiseIfThenElse(SignBit(d), sign, magn); } // ------------------------------ CopySignToAbs template HWY_API Vec1 CopySignToAbs(const Vec1 abs, const Vec1 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const Sisd d; return OrAnd(abs, SignBit(d), sign); } // ------------------------------ BroadcastSignBit template HWY_API Vec1 BroadcastSignBit(const Vec1 v) { // This is used inside ShiftRight, so we cannot implement in terms of it. return v.raw < 0 ? Vec1(T(-1)) : Vec1(0); } // ------------------------------ PopulationCount #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif template HWY_API Vec1 PopulationCount(Vec1 v) { return Vec1(static_cast(PopCount(v.raw))); } // ------------------------------ IfThenElse // Returns mask ? yes : no. template HWY_API Vec1 IfThenElse(const Mask1 mask, const Vec1 yes, const Vec1 no) { return mask.bits ? yes : no; } template HWY_API Vec1 IfThenElseZero(const Mask1 mask, const Vec1 yes) { return mask.bits ? yes : Vec1(ConvertScalarTo(0)); } template HWY_API Vec1 IfThenZeroElse(const Mask1 mask, const Vec1 no) { return mask.bits ? Vec1(ConvertScalarTo(0)) : no; } template HWY_API Vec1 IfNegativeThenElse(Vec1 v, Vec1 yes, Vec1 no) { const DFromV d; const RebindToSigned di; const auto vi = BitCast(di, v); return vi.raw < 0 ? yes : no; } template HWY_API Vec1 ZeroIfNegative(const Vec1 v) { const DFromV d; const RebindToSigned di; const auto vi = BitCast(di, v); return vi.raw < 0 ? 
Vec1(ConvertScalarTo(0)) : v; } // ------------------------------ Mask logical template HWY_API Mask1 Not(const Mask1 m) { return MaskFromVec(Not(VecFromMask(Sisd(), m))); } template HWY_API Mask1 And(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 AndNot(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 Or(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 Xor(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask1 ExclusiveNeither(const Mask1 a, Mask1 b) { const Sisd d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } template HWY_API Mask1 SetAtOrAfterFirst(Mask1 mask) { return mask; } template HWY_API Mask1 SetBeforeFirst(Mask1 mask) { return Not(mask); } template HWY_API Mask1 SetOnlyFirst(Mask1 mask) { return mask; } template HWY_API Mask1 SetAtOrBeforeFirst(Mask1 /*mask*/) { return Mask1::FromBool(true); } // ------------------------------ LowerHalfOfMask #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif template HWY_API MFromD LowerHalfOfMask(D /*d*/, MFromD m) { return m; } // ================================================== SHIFTS // ------------------------------ ShiftLeft/ShiftRight (BroadcastSignBit) template HWY_API Vec1 ShiftLeft(const Vec1 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); return Vec1( static_cast(static_cast>(v.raw) << kBits)); } template HWY_API Vec1 ShiftRight(const Vec1 v) { static_assert(0 <= kBits && kBits < sizeof(T) * 8, "Invalid shift"); #if __cplusplus >= 202002L // Signed right shift is now guaranteed to be arithmetic (rounding toward // negative infinity, i.e. shifting in the sign bit). return Vec1(static_cast(v.raw >> kBits)); #else if (IsSigned()) { // Emulate arithmetic shift using only logical (unsigned) shifts, because // signed shifts are still implementation-defined. using TU = hwy::MakeUnsigned; const Sisd du; const TU shifted = static_cast(BitCast(du, v).raw >> kBits); const TU sign = BitCast(du, BroadcastSignBit(v)).raw; const size_t sign_shift = static_cast(static_cast(sizeof(TU)) * 8 - 1 - kBits); const TU upper = static_cast(sign << sign_shift); return BitCast(Sisd(), Vec1(shifted | upper)); } else { // T is unsigned return Vec1(static_cast(v.raw >> kBits)); } #endif } // ------------------------------ RotateRight (ShiftRight) template HWY_API Vec1 RotateRight(const Vec1 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ ShiftLeftSame (BroadcastSignBit) template HWY_API Vec1 ShiftLeftSame(const Vec1 v, int bits) { return Vec1( static_cast(static_cast>(v.raw) << bits)); } template HWY_API Vec1 ShiftRightSame(const Vec1 v, int bits) { #if __cplusplus >= 202002L // Signed right shift is now guaranteed to be arithmetic (rounding toward // negative infinity, i.e. shifting in the sign bit). return Vec1(static_cast(v.raw >> bits)); #else if (IsSigned()) { // Emulate arithmetic shift using only logical (unsigned) shifts, because // signed shifts are still implementation-defined. 
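    // Worked example of the emulation below (illustrative values): for
    // T = int8_t, v.raw = -6 (0xFA) and bits = 1, the logical shift gives
    // shifted = 0x7D, sign = 0xFF, sign_shift = 6 and upper = 0xC0, so the
    // result is 0xFD, i.e. -3, matching an arithmetic shift.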
using TU = hwy::MakeUnsigned; const Sisd du; const TU shifted = static_cast(BitCast(du, v).raw >> bits); const TU sign = BitCast(du, BroadcastSignBit(v)).raw; const size_t sign_shift = static_cast(static_cast(sizeof(TU)) * 8 - 1 - bits); const TU upper = static_cast(sign << sign_shift); return BitCast(Sisd(), Vec1(shifted | upper)); } else { // T is unsigned return Vec1(static_cast(v.raw >> bits)); } #endif } // ------------------------------ Shl // Single-lane => same as ShiftLeftSame except for the argument type. template HWY_API Vec1 operator<<(const Vec1 v, const Vec1 bits) { return ShiftLeftSame(v, static_cast(bits.raw)); } template HWY_API Vec1 operator>>(const Vec1 v, const Vec1 bits) { return ShiftRightSame(v, static_cast(bits.raw)); } // ================================================== ARITHMETIC template HWY_API Vec1 operator+(Vec1 a, Vec1 b) { const uint64_t a64 = static_cast(a.raw); const uint64_t b64 = static_cast(b.raw); return Vec1(static_cast((a64 + b64) & static_cast(~T(0)))); } HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { return Vec1(a.raw + b.raw); } HWY_API Vec1 operator+(const Vec1 a, const Vec1 b) { return Vec1(a.raw + b.raw); } template HWY_API Vec1 operator-(Vec1 a, Vec1 b) { const uint64_t a64 = static_cast(a.raw); const uint64_t b64 = static_cast(b.raw); return Vec1(static_cast((a64 - b64) & static_cast(~T(0)))); } HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { return Vec1(a.raw - b.raw); } HWY_API Vec1 operator-(const Vec1 a, const Vec1 b) { return Vec1(a.raw - b.raw); } // ------------------------------ SumsOf8 HWY_API Vec1 SumsOf8(const Vec1 v) { return Vec1(v.raw); } HWY_API Vec1 SumsOf8(const Vec1 v) { return Vec1(v.raw); } // ------------------------------ SumsOf2 template HWY_API Vec1> SumsOf2(const Vec1 v) { const DFromV d; const Rebind, decltype(d)> dw; return PromoteTo(dw, v); } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(0, a.raw + b.raw), 255))); } HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(0, static_cast(a.raw) + b.raw), 65535))); } // Signed HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(-128, a.raw + b.raw), 127))); } HWY_API Vec1 SaturatedAdd(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) + b.raw), 32767))); } // ------------------------------ Saturating subtraction // Returns a - b clamped to the destination range. 
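// For example (values chosen for illustration), SaturatedSub of uint8_t 5 and
// 9 returns 0 rather than wrapping to 252, and SaturatedSub of int8_t -100 and
// 100 returns -128.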
// Unsigned HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(0, a.raw - b.raw), 255))); } HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(0, static_cast(a.raw) - b.raw), 65535))); } // Signed HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1( static_cast(HWY_MIN(HWY_MAX(-128, a.raw - b.raw), 127))); } HWY_API Vec1 SaturatedSub(const Vec1 a, const Vec1 b) { return Vec1(static_cast( HWY_MIN(HWY_MAX(-32768, static_cast(a.raw) - b.raw), 32767))); } // ------------------------------ Average // Returns (a + b + 1) / 2 HWY_API Vec1 AverageRound(const Vec1 a, const Vec1 b) { return Vec1(static_cast((a.raw + b.raw + 1) / 2)); } HWY_API Vec1 AverageRound(const Vec1 a, const Vec1 b) { return Vec1(static_cast((a.raw + b.raw + 1) / 2)); } // ------------------------------ Absolute value template HWY_API Vec1 Abs(const Vec1 a) { return Vec1(ScalarAbs(a.raw)); } // ------------------------------ Min/Max // may be unavailable, so implement our own. template HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { return Vec1(HWY_MIN(a.raw, b.raw)); } template HWY_API Vec1 Min(const Vec1 a, const Vec1 b) { if (isnan(a.raw)) return b; if (isnan(b.raw)) return a; return Vec1(HWY_MIN(a.raw, b.raw)); } template HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { return Vec1(HWY_MAX(a.raw, b.raw)); } template HWY_API Vec1 Max(const Vec1 a, const Vec1 b) { if (isnan(a.raw)) return b; if (isnan(b.raw)) return a; return Vec1(HWY_MAX(a.raw, b.raw)); } // ------------------------------ Floating-point negate template HWY_API Vec1 Neg(const Vec1 v) { return Xor(v, SignBit(Sisd())); } template HWY_API Vec1 Neg(const Vec1 v) { return Zero(Sisd()) - v; } // ------------------------------ mul/div // Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*. #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif template HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { return Vec1(static_cast(double{a.raw} * b.raw)); } template HWY_API Vec1 operator*(const Vec1 a, const Vec1 b) { return Vec1(static_cast(static_cast(a.raw) * static_cast(b.raw))); } template HWY_API Vec1 operator/(const Vec1 a, const Vec1 b) { return Vec1(a.raw / b.raw); } // Returns the upper 16 bits of a * b in each lane. HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { return Vec1(static_cast((a.raw * b.raw) >> 16)); } HWY_API Vec1 MulHigh(const Vec1 a, const Vec1 b) { // Cast to uint32_t first to prevent overflow. Otherwise the result of // uint16_t * uint16_t is in "int" which may overflow. In practice the result // is the same but this way it is also defined. return Vec1(static_cast( (static_cast(a.raw) * static_cast(b.raw)) >> 16)); } HWY_API Vec1 MulFixedPoint15(Vec1 a, Vec1 b) { return Vec1(static_cast((a.raw * b.raw + 16384) >> 15)); } // Multiplies even lanes (0, 2 ..) and returns the double-wide result. template HWY_API Vec1> MulEven(const Vec1 a, const Vec1 b) { using TW = MakeWide; const TW a_wide = a.raw; return Vec1(static_cast(a_wide * b.raw)); } // Approximate reciprocal HWY_API Vec1 ApproximateReciprocal(const Vec1 v) { // Zero inputs are allowed, but callers are responsible for replacing the // return value with something else (typically using IfThenElse). This check // avoids a ubsan error. The return value is arbitrary. 
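  // Caller-side sketch (illustrative, not part of the original source):
  //   const auto recip = ApproximateReciprocal(v);
  //   const auto safe = IfThenElse(v == Zero(d), fallback, recip);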
if (v.raw == 0.0f) return Vec1(0.0f); return Vec1(1.0f / v.raw); } // generic_ops takes care of integer T. template HWY_API Vec1 AbsDiff(const Vec1 a, const Vec1 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants template HWY_API Vec1 MulAdd(const Vec1 mul, const Vec1 x, const Vec1 add) { return mul * x + add; } template HWY_API Vec1 NegMulAdd(const Vec1 mul, const Vec1 x, const Vec1 add) { return add - mul * x; } template HWY_API Vec1 MulSub(const Vec1 mul, const Vec1 x, const Vec1 sub) { return mul * x - sub; } template HWY_API Vec1 NegMulSub(const Vec1 mul, const Vec1 x, const Vec1 sub) { return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root // Approximate reciprocal square root HWY_API Vec1 ApproximateReciprocalSqrt(const Vec1 v) { float f = v.raw; const float half = f * 0.5f; uint32_t bits; CopySameSize(&f, &bits); // Initial guess based on log2(f) bits = 0x5F3759DF - (bits >> 1); CopySameSize(&bits, &f); // One Newton-Raphson iteration return Vec1(f * (1.5f - (half * f * f))); } // Square root HWY_API Vec1 Sqrt(Vec1 v) { #if defined(HWY_NO_LIBCXX) #if HWY_COMPILER_GCC_ACTUAL return Vec1(__builtin_sqrt(v.raw)); #else uint32_t bits; CopyBytes(&v, &bits); // Coarse approximation, letting the exponent LSB leak into the mantissa bits = (1 << 29) + (bits >> 1) - (1 << 22); CopyBytes(&bits, &v); return v; #endif // !HWY_COMPILER_GCC_ACTUAL #else return Vec1(sqrtf(v.raw)); #endif // !HWY_NO_LIBCXX } HWY_API Vec1 Sqrt(Vec1 v) { #if defined(HWY_NO_LIBCXX) #if HWY_COMPILER_GCC_ACTUAL return Vec1(__builtin_sqrt(v.raw)); #else uint64_t bits; CopyBytes(&v, &bits); // Coarse approximation, letting the exponent LSB leak into the mantissa bits = (1ULL << 61) + (bits >> 1) - (1ULL << 51); CopyBytes(&bits, &v); return v; #endif // !HWY_COMPILER_GCC_ACTUAL #else return Vec1(sqrt(v.raw)); #endif // HWY_NO_LIBCXX } // ------------------------------ Floating-point rounding template HWY_API Vec1 Round(const Vec1 v) { using TI = MakeSigned; if (!(Abs(v).raw < MantissaEnd())) { // Huge or NaN return v; } const T k0 = ConvertScalarTo(0); const T bias = ConvertScalarTo(v.raw < k0 ? -0.5 : 0.5); const TI rounded = ConvertScalarTo(v.raw + bias); if (rounded == 0) return CopySignToAbs(Vec1(k0), v); TI offset = 0; // Round to even if ((rounded & 1) && ScalarAbs(ConvertScalarTo(rounded) - v.raw) == ConvertScalarTo(0.5)) { offset = v.raw < k0 ? -1 : 1; } return Vec1(ConvertScalarTo(rounded - offset)); } // Round-to-nearest even. HWY_API Vec1 NearestInt(const Vec1 v) { using T = float; using TI = int32_t; const T abs = Abs(v).raw; const bool is_sign = ScalarSignBit(v.raw); if (!(abs < MantissaEnd())) { // Huge or NaN // Check if too large to cast or NaN if (!(abs <= ConvertScalarTo(LimitsMax()))) { return Vec1(is_sign ? LimitsMin() : LimitsMax()); } return Vec1(ConvertScalarTo(v.raw)); } const T bias = ConvertScalarTo(v.raw < ConvertScalarTo(0.0) ? -0.5 : 0.5); const TI rounded = ConvertScalarTo(v.raw + bias); if (rounded == 0) return Vec1(0); TI offset = 0; // Round to even if ((rounded & 1) && ScalarAbs(ConvertScalarTo(rounded) - v.raw) == ConvertScalarTo(0.5)) { offset = is_sign ? 
-1 : 1; } return Vec1(rounded - offset); } template HWY_API Vec1 Trunc(const Vec1 v) { using TI = MakeSigned; if (!(Abs(v).raw <= MantissaEnd())) { // Huge or NaN return v; } const TI truncated = ConvertScalarTo(v.raw); if (truncated == 0) return CopySignToAbs(Vec1(0), v); return Vec1(ConvertScalarTo(truncated)); } template V Ceiling(const V v) { const Bits kExponentMask = (1ull << kExponentBits) - 1; const Bits kMantissaMask = (1ull << kMantissaBits) - 1; const Bits kBias = kExponentMask / 2; Float f = v.raw; const bool positive = f > Float(0.0); Bits bits; CopySameSize(&v, &bits); const int exponent = static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) return v; // |v| <= 1 => 0 or 1. if (exponent < 0) return positive ? V(1) : V(-0.0); const Bits mantissa_mask = kMantissaMask >> exponent; // Already an integer if ((bits & mantissa_mask) == 0) return v; // Clear fractional bits and round up if (positive) bits += (kMantissaMask + 1) >> exponent; bits &= ~mantissa_mask; CopySameSize(&bits, &f); return V(f); } template V Floor(const V v) { const Bits kExponentMask = (1ull << kExponentBits) - 1; const Bits kMantissaMask = (1ull << kMantissaBits) - 1; const Bits kBias = kExponentMask / 2; Float f = v.raw; const bool negative = f < Float(0.0); Bits bits; CopySameSize(&v, &bits); const int exponent = static_cast(((bits >> kMantissaBits) & kExponentMask) - kBias); // Already an integer. if (exponent >= kMantissaBits) return v; // |v| <= 1 => -1 or 0. if (exponent < 0) return V(negative ? Float(-1.0) : Float(0.0)); const Bits mantissa_mask = kMantissaMask >> exponent; // Already an integer if ((bits & mantissa_mask) == 0) return v; // Clear fractional bits and round down if (negative) bits += (kMantissaMask + 1) >> exponent; bits &= ~mantissa_mask; CopySameSize(&bits, &f); return V(f); } // Toward +infinity, aka ceiling HWY_API Vec1 Ceil(const Vec1 v) { return Ceiling(v); } HWY_API Vec1 Ceil(const Vec1 v) { return Ceiling(v); } // Toward -infinity, aka floor HWY_API Vec1 Floor(const Vec1 v) { return Floor(v); } HWY_API Vec1 Floor(const Vec1 v) { return Floor(v); } // ================================================== COMPARE template HWY_API Mask1 operator==(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw == b.raw); } template HWY_API Mask1 operator!=(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw != b.raw); } template HWY_API Mask1 TestBit(const Vec1 v, const Vec1 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } template HWY_API Mask1 operator<(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw < b.raw); } template HWY_API Mask1 operator>(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw > b.raw); } template HWY_API Mask1 operator<=(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw <= b.raw); } template HWY_API Mask1 operator>=(const Vec1 a, const Vec1 b) { return Mask1::FromBool(a.raw >= b.raw); } // ------------------------------ Floating-point classification (==) template HWY_API Mask1 IsNaN(const Vec1 v) { // std::isnan returns false for 0x7F..FF in clang AVX3 builds, so DIY. MakeUnsigned bits; CopySameSize(&v, &bits); bits += bits; bits >>= 1; // clear sign bit // NaN if all exponent bits are set and the mantissa is not zero. return Mask1::FromBool(bits > ExponentMask()); } // Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite. 
#ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif HWY_API Mask1 IsInf(const Vec1 v) { const Sisd d; const RebindToUnsigned du; const Vec1 vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask(d, (vu + vu) == Set(du, 0xFF000000u)); } HWY_API Mask1 IsInf(const Vec1 v) { const Sisd d; const RebindToUnsigned du; const Vec1 vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask(d, (vu + vu) == Set(du, 0xFFE0000000000000ull)); } HWY_API Mask1 IsFinite(const Vec1 v) { const Vec1 vu = BitCast(Sisd(), v); // Shift left to clear the sign bit, check whether exponent != max value. return Mask1::FromBool((vu.raw << 1) < 0xFF000000u); } HWY_API Mask1 IsFinite(const Vec1 v) { const Vec1 vu = BitCast(Sisd(), v); // Shift left to clear the sign bit, check whether exponent != max value. return Mask1::FromBool((vu.raw << 1) < 0xFFE0000000000000ull); } // ================================================== MEMORY // ------------------------------ Load template > HWY_API Vec1 Load(D /* tag */, const T* HWY_RESTRICT aligned) { T t; CopySameSize(aligned, &t); return Vec1(t); } template > HWY_API Vec1 MaskedLoad(Mask1 m, D d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } template > HWY_API Vec1 MaskedLoadOr(Vec1 v, Mask1 m, D d, const T* HWY_RESTRICT aligned) { return IfThenElse(m, Load(d, aligned), v); } template > HWY_API Vec1 LoadU(D d, const T* HWY_RESTRICT p) { return Load(d, p); } // In some use cases, "load single lane" is sufficient; otherwise avoid this. template > HWY_API Vec1 LoadDup128(D d, const T* HWY_RESTRICT aligned) { return Load(d, aligned); } #ifdef HWY_NATIVE_LOAD_N #undef HWY_NATIVE_LOAD_N #else #define HWY_NATIVE_LOAD_N #endif template > HWY_API VFromD LoadN(D d, const T* HWY_RESTRICT p, size_t max_lanes_to_load) { return (max_lanes_to_load > 0) ? Load(d, p) : Zero(d); } template > HWY_API VFromD LoadNOr(VFromD no, D d, const T* HWY_RESTRICT p, size_t max_lanes_to_load) { return (max_lanes_to_load > 0) ? Load(d, p) : no; } // ------------------------------ Store template > HWY_API void Store(const Vec1 v, D /* tag */, T* HWY_RESTRICT aligned) { CopySameSize(&v.raw, aligned); } template > HWY_API void StoreU(const Vec1 v, D d, T* HWY_RESTRICT p) { return Store(v, d, p); } template > HWY_API void BlendedStore(const Vec1 v, Mask1 m, D d, T* HWY_RESTRICT p) { if (!m.bits) return; StoreU(v, d, p); } #ifdef HWY_NATIVE_STORE_N #undef HWY_NATIVE_STORE_N #else #define HWY_NATIVE_STORE_N #endif template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 0) { Store(v, d, p); } } // ------------------------------ LoadInterleaved2/3/4 // Per-target flag to prevent generic_ops-inl.h from defining StoreInterleaved2. 
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif template > HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, Vec1& v1) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); } template > HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, Vec1& v1, Vec1& v2) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); } template > HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec1& v0, Vec1& v1, Vec1& v2, Vec1& v3) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); v3 = LoadU(d, unaligned + 3); } // ------------------------------ StoreInterleaved2/3/4 template > HWY_API void StoreInterleaved2(const Vec1 v0, const Vec1 v1, D d, T* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); } template > HWY_API void StoreInterleaved3(const Vec1 v0, const Vec1 v1, const Vec1 v2, D d, T* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); StoreU(v2, d, unaligned + 2); } template > HWY_API void StoreInterleaved4(const Vec1 v0, const Vec1 v1, const Vec1 v2, const Vec1 v3, D d, T* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); StoreU(v2, d, unaligned + 2); StoreU(v3, d, unaligned + 3); } // ------------------------------ Stream template > HWY_API void Stream(const Vec1 v, D d, T* HWY_RESTRICT aligned) { return Store(v, d, aligned); } // ------------------------------ Scatter #ifdef HWY_NATIVE_SCATTER #undef HWY_NATIVE_SCATTER #else #define HWY_NATIVE_SCATTER #endif template , typename TI> HWY_API void ScatterOffset(Vec1 v, D d, T* base, Vec1 offset) { static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); const intptr_t addr = reinterpret_cast(base) + static_cast(offset.raw); Store(v, d, reinterpret_cast(addr)); } template , typename TI> HWY_API void ScatterIndex(Vec1 v, D d, T* HWY_RESTRICT base, Vec1 index) { static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); Store(v, d, base + index.raw); } template , typename TI> HWY_API void MaskedScatterIndex(Vec1 v, Mask1 m, D d, T* HWY_RESTRICT base, Vec1 index) { static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); if (m.bits) Store(v, d, base + index.raw); } // ------------------------------ Gather #ifdef HWY_NATIVE_GATHER #undef HWY_NATIVE_GATHER #else #define HWY_NATIVE_GATHER #endif template > HWY_API Vec1 GatherOffset(D d, const T* base, Vec1> offset) { HWY_DASSERT(offset.raw >= 0); const intptr_t addr = reinterpret_cast(base) + static_cast(offset.raw); return Load(d, reinterpret_cast(addr)); } template > HWY_API Vec1 GatherIndex(D d, const T* HWY_RESTRICT base, Vec1> index) { HWY_DASSERT(index.raw >= 0); return Load(d, base + index.raw); } template > HWY_API Vec1 MaskedGatherIndex(Mask1 m, D d, const T* HWY_RESTRICT base, Vec1> index) { HWY_DASSERT(index.raw >= 0); return MaskedLoad(m, d, base + index.raw); } template > HWY_API Vec1 MaskedGatherIndexOr(Vec1 no, Mask1 m, D d, const T* HWY_RESTRICT base, Vec1> index) { HWY_DASSERT(index.raw >= 0); return MaskedLoadOr(no, m, d, base + index.raw); } // ================================================== CONVERT // ConvertTo and DemoteTo with floating-point input and integer output truncate // (rounding toward zero). 
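// For example, float 3.9f converts to int32_t 3 and -3.9f to -3; values
// outside the destination range saturate to LimitsMin()/LimitsMax() of the
// destination type (see detail::CastValueForF2IConv below).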
namespace detail { template HWY_INLINE ToT CastValueForF2IConv(FromT val) { // Prevent ubsan errors when converting float to narrower integer using FromTU = MakeUnsigned; using ToTU = MakeUnsigned; constexpr unsigned kMaxExpField = static_cast(MaxExponentField()); constexpr unsigned kExpBias = kMaxExpField >> 1; constexpr unsigned kMinOutOfRangeExpField = static_cast(HWY_MIN( kExpBias + sizeof(ToT) * 8 - static_cast(IsSigned()), kMaxExpField)); // If ToT is signed, compare only the exponent bits of val against // kMinOutOfRangeExpField. // // Otherwise, if ToT is unsigned, compare the sign bit plus exponent bits of // val against kMinOutOfRangeExpField as a negative value is outside of the // range of an unsigned integer type. const FromT val_to_compare = static_cast(IsSigned() ? ScalarAbs(val) : val); // val is within the range of ToT if // (BitCastScalar(val_to_compare) >> MantissaBits()) is less // than kMinOutOfRangeExpField // // Otherwise, val is either outside of the range of ToT or equal to // LimitsMin() if // (BitCastScalar(val_to_compare) >> MantissaBits()) is greater // than or equal to kMinOutOfRangeExpField. return (static_cast(BitCastScalar(val_to_compare) >> MantissaBits()) < kMinOutOfRangeExpField) ? static_cast(val) : static_cast(static_cast(LimitsMax()) + static_cast(ScalarSignBit(val))); } template HWY_INLINE ToT CastValueForPromoteTo(ToTypeTag /* to_type_tag */, FromT val) { return ConvertScalarTo(val); } template HWY_INLINE ToT CastValueForPromoteTo(hwy::SignedTag /*to_type_tag*/, float val) { return CastValueForF2IConv(val); } template HWY_INLINE ToT CastValueForPromoteTo(hwy::UnsignedTag /*to_type_tag*/, float val) { return CastValueForF2IConv(val); } } // namespace detail #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 #undef HWY_NATIVE_PROMOTE_F16_TO_F64 #else #define HWY_NATIVE_PROMOTE_F16_TO_F64 #endif template , typename TFrom> HWY_API Vec1 PromoteTo(DTo /* tag */, Vec1 from) { static_assert(sizeof(TTo) > sizeof(TFrom), "Not promoting"); // For bits Y > X, floatX->floatY and intX->intY are always representable. return Vec1( detail::CastValueForPromoteTo(hwy::TypeTag(), from.raw)); } // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(TFrom) is here, // so we overload for TFrom=double and TTo={float,int32_t}. template HWY_API Vec1 DemoteTo(D /* tag */, Vec1 from) { // Prevent ubsan errors when converting float to narrower integer/float if (IsInf(from).bits || Abs(from).raw > static_cast(HighestValue())) { return Vec1(ScalarSignBit(from.raw) ? 
LowestValue() : HighestValue()); } return Vec1(static_cast(from.raw)); } template HWY_API VFromD DemoteTo(D /* tag */, Vec1 from) { // Prevent ubsan errors when converting int32_t to narrower integer/int32_t return Vec1>(detail::CastValueForF2IConv>(from.raw)); } template , typename TFrom, HWY_IF_SIGNED(TFrom), HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD)> HWY_API Vec1 DemoteTo(DTo /* tag */, Vec1 from) { static_assert(!IsFloat(), "TFrom=double are handled above"); static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); // Int to int: choose closest value in TTo to `from` (avoids UB) from.raw = HWY_MIN(HWY_MAX(LimitsMin(), from.raw), LimitsMax()); return Vec1(static_cast(from.raw)); } template , typename TFrom, HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED_D(DTo)> HWY_API Vec1 DemoteTo(DTo /* tag */, Vec1 from) { static_assert(!IsFloat(), "TFrom=double are handled above"); static_assert(sizeof(TTo) < sizeof(TFrom), "Not demoting"); // Int to int: choose closest value in TTo to `from` (avoids UB) from.raw = HWY_MIN(from.raw, LimitsMax()); return Vec1(static_cast(from.raw)); } template , typename TFrom, HWY_IF_UI64(TFrom), HWY_IF_F32_D(DTo)> HWY_API Vec1 DemoteTo(DTo /* tag */, Vec1 from) { // int64_t/uint64_t to float: simply cast to TTo return Vec1(static_cast(from.raw)); } // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions; // use this scalar version to verify the vector implementation. #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_API Vec1 PromoteTo(D /* tag */, const Vec1 v) { return Vec1(F32FromF16(v.raw)); } template HWY_API Vec1 PromoteTo(D d, const Vec1 v) { return Set(d, F32FromBF16(v.raw)); } template HWY_API VFromD PromoteEvenTo(DTo d_to, Vec1 v) { return PromoteTo(d_to, v); } template HWY_API Vec1 DemoteTo(D /* tag */, const Vec1 v) { return Vec1(F16FromF32(v.raw)); } template HWY_API Vec1 DemoteTo(D d, const Vec1 v) { return Set(d, BF16FromF32(v.raw)); } template , typename TFrom, HWY_IF_FLOAT(TFrom)> HWY_API Vec1 ConvertTo(DTo /* tag */, Vec1 from) { static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); // float## -> int##: return closest representable value. return Vec1(detail::CastValueForF2IConv(from.raw)); } template , typename TFrom, HWY_IF_NOT_FLOAT(TFrom)> HWY_API Vec1 ConvertTo(DTo /* tag */, Vec1 from) { static_assert(sizeof(TTo) == sizeof(TFrom), "Should have same size"); // int## -> float##: no check needed return Vec1(static_cast(from.raw)); } HWY_API Vec1 U8FromU32(const Vec1 v) { return DemoteTo(Sisd(), v); } // ------------------------------ TruncateTo template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFF)}; } template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFFFF)}; } template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFFFFFFFFu)}; } template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFF)}; } template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFFFF)}; } template HWY_API Vec1 TruncateTo(D /* tag */, Vec1 v) { return Vec1{static_cast(v.raw & 0xFF)}; } // ================================================== COMBINE // UpperHalf, ZeroExtendVector, Combine, Concat* are unsupported. 
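// (A single lane has no upper half, and two single-lane vectors cannot be
// combined into one, so only LowerHalf is provided below.)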
template HWY_API Vec1 LowerHalf(Vec1 v) { return v; } template > HWY_API Vec1 LowerHalf(D /* tag */, Vec1 v) { return v; } // ================================================== SWIZZLE template HWY_API T GetLane(const Vec1 v) { return v.raw; } template HWY_API T ExtractLane(const Vec1 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return v.raw; } template HWY_API Vec1 InsertLane(Vec1 v, size_t i, T t) { HWY_DASSERT(i == 0); (void)i; v.raw = t; return v; } template HWY_API Vec1 DupEven(Vec1 v) { return v; } // DupOdd is unsupported. template HWY_API Vec1 OddEven(Vec1 /* odd */, Vec1 even) { return even; } template HWY_API Vec1 OddEvenBlocks(Vec1 /* odd */, Vec1 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec1 SwapAdjacentBlocks(Vec1 v) { return v; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. template struct Indices1 { MakeSigned raw; }; template , typename TI> HWY_API Indices1 IndicesFromVec(D, Vec1 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane size"); HWY_DASSERT(vec.raw <= 1); return Indices1{static_cast>(vec.raw)}; } template , typename TI> HWY_API Indices1 SetTableIndices(D d, const TI* idx) { return IndicesFromVec(d, LoadU(Sisd(), idx)); } template HWY_API Vec1 TableLookupLanes(const Vec1 v, const Indices1 /* idx */) { return v; } template HWY_API Vec1 TwoTablesLookupLanes(const Vec1 a, const Vec1 b, const Indices1 idx) { return (idx.raw == 0) ? a : b; } // ------------------------------ ReverseBlocks // Single block: no change template > HWY_API Vec1 ReverseBlocks(D /* tag */, const Vec1 v) { return v; } // ------------------------------ Reverse template > HWY_API Vec1 Reverse(D /* tag */, const Vec1 v) { return v; } // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
#ifdef HWY_NATIVE_REVERSE2_8 #undef HWY_NATIVE_REVERSE2_8 #else #define HWY_NATIVE_REVERSE2_8 #endif // Must not be called: template > HWY_API Vec1 Reverse2(D /* tag */, const Vec1 v) { return v; } template > HWY_API Vec1 Reverse4(D /* tag */, const Vec1 v) { return v; } template > HWY_API Vec1 Reverse8(D /* tag */, const Vec1 v) { return v; } // ------------------------------ ReverseLaneBytes #ifdef HWY_NATIVE_REVERSE_LANE_BYTES #undef HWY_NATIVE_REVERSE_LANE_BYTES #else #define HWY_NATIVE_REVERSE_LANE_BYTES #endif HWY_API Vec1 ReverseLaneBytes(Vec1 v) { const uint32_t val{v.raw}; return Vec1( static_cast(((val << 8) & 0xFF00u) | ((val >> 8) & 0x00FFu))); } HWY_API Vec1 ReverseLaneBytes(Vec1 v) { const uint32_t val = v.raw; return Vec1(static_cast( ((val << 24) & 0xFF000000u) | ((val << 8) & 0x00FF0000u) | ((val >> 8) & 0x0000FF00u) | ((val >> 24) & 0x000000FFu))); } HWY_API Vec1 ReverseLaneBytes(Vec1 v) { const uint64_t val = v.raw; return Vec1(static_cast( ((val << 56) & 0xFF00000000000000u) | ((val << 40) & 0x00FF000000000000u) | ((val << 24) & 0x0000FF0000000000u) | ((val << 8) & 0x000000FF00000000u) | ((val >> 8) & 0x00000000FF000000u) | ((val >> 24) & 0x0000000000FF0000u) | ((val >> 40) & 0x000000000000FF00u) | ((val >> 56) & 0x00000000000000FFu))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const RebindToUnsigned du; return BitCast(d, ReverseLaneBytes(BitCast(du, v))); } // ------------------------------ ReverseBits #ifdef HWY_NATIVE_REVERSE_BITS_UI8 #undef HWY_NATIVE_REVERSE_BITS_UI8 #else #define HWY_NATIVE_REVERSE_BITS_UI8 #endif #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #else #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 #endif namespace detail { template HWY_INLINE T ReverseBitsOfEachByte(T val) { using TU = MakeUnsigned; constexpr TU kMaxUnsignedVal{LimitsMax()}; constexpr TU kShrMask1 = static_cast(0x5555555555555555u & kMaxUnsignedVal); constexpr TU kShrMask2 = static_cast(0x3333333333333333u & kMaxUnsignedVal); constexpr TU kShrMask3 = static_cast(0x0F0F0F0F0F0F0F0Fu & kMaxUnsignedVal); constexpr TU kShlMask1 = static_cast(~kShrMask1); constexpr TU kShlMask2 = static_cast(~kShrMask2); constexpr TU kShlMask3 = static_cast(~kShrMask3); TU result = static_cast(val); result = static_cast(((result << 1) & kShlMask1) | ((result >> 1) & kShrMask1)); result = static_cast(((result << 2) & kShlMask2) | ((result >> 2) & kShrMask2)); result = static_cast(((result << 4) & kShlMask3) | ((result >> 4) & kShrMask3)); return static_cast(result); } } // namespace detail template HWY_API V ReverseBits(V v) { return V(detail::ReverseBitsOfEachByte(v.raw)); } template HWY_API V ReverseBits(V v) { return ReverseLaneBytes(V(detail::ReverseBitsOfEachByte(v.raw))); } template HWY_API V ReverseBits(V v) { const DFromV d; const RebindToUnsigned du; return BitCast(d, ReverseBits(BitCast(du, v))); } // ------------------------------ SlideUpLanes template HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } // ------------------------------ SlideDownLanes template HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } // ================================================== BLOCKWISE // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported. 
// ------------------------------ Broadcast/splat any lane template HWY_API Vec1 Broadcast(const Vec1 v) { static_assert(kLane == 0, "Scalar only has one lane"); return v; } // ------------------------------ TableLookupBytes, TableLookupBytesOr0 template HWY_API Vec1 TableLookupBytes(const Vec1 in, const Vec1 indices) { uint8_t in_bytes[sizeof(T)]; uint8_t idx_bytes[sizeof(T)]; uint8_t out_bytes[sizeof(T)]; CopyBytes(&in, &in_bytes); // copy to bytes CopyBytes(&indices, &idx_bytes); for (size_t i = 0; i < sizeof(T); ++i) { out_bytes[i] = in_bytes[idx_bytes[i]]; } TI out; CopyBytes(&out_bytes, &out); return Vec1{out}; } template HWY_API Vec1 TableLookupBytesOr0(const Vec1 in, const Vec1 indices) { uint8_t in_bytes[sizeof(T)]; uint8_t idx_bytes[sizeof(T)]; uint8_t out_bytes[sizeof(T)]; CopyBytes(&in, &in_bytes); // copy to bytes CopyBytes(&indices, &idx_bytes); for (size_t i = 0; i < sizeof(T); ++i) { out_bytes[i] = idx_bytes[i] & 0x80 ? 0 : in_bytes[idx_bytes[i]]; } TI out; CopyBytes(&out_bytes, &out); return Vec1{out}; } // ------------------------------ ZipLower HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1(static_cast((uint32_t{b.raw} << 8) + a.raw)); } HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1((uint32_t{b.raw} << 16) + a.raw); } HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1((uint64_t{b.raw} << 32) + a.raw); } HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1(static_cast((int32_t{b.raw} << 8) + a.raw)); } HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1((int32_t{b.raw} << 16) + a.raw); } HWY_API Vec1 ZipLower(Vec1 a, Vec1 b) { return Vec1((int64_t{b.raw} << 32) + a.raw); } template , typename TN = MakeNarrow> HWY_API Vec1 ZipLower(DW /* tag */, Vec1 a, Vec1 b) { return Vec1(static_cast((TW{b.raw} << (sizeof(TN) * 8)) + a.raw)); } // ================================================== MASK template > HWY_API bool AllFalse(D /* tag */, const Mask1 mask) { return mask.bits == 0; } template > HWY_API bool AllTrue(D /* tag */, const Mask1 mask) { return mask.bits != 0; } // `p` points to at least 8 readable bytes, not all of which need be valid. template > HWY_API Mask1 LoadMaskBits(D /* tag */, const uint8_t* HWY_RESTRICT bits) { return Mask1::FromBool((bits[0] & 1) != 0); } template HWY_API MFromD Dup128MaskFromMaskBits(D /*d*/, unsigned mask_bits) { return MFromD::FromBool((mask_bits & 1) != 0); } // `p` points to at least 8 writable bytes. template > HWY_API size_t StoreMaskBits(D d, const Mask1 mask, uint8_t* bits) { *bits = AllTrue(d, mask); return 1; } template > HWY_API size_t CountTrue(D /* tag */, const Mask1 mask) { return mask.bits == 0 ? 0 : 1; } template > HWY_API intptr_t FindFirstTrue(D /* tag */, const Mask1 mask) { return mask.bits == 0 ? -1 : 0; } template > HWY_API size_t FindKnownFirstTrue(D /* tag */, const Mask1 /* m */) { return 0; // There is only one lane and we know it is true. } template > HWY_API intptr_t FindLastTrue(D /* tag */, const Mask1 mask) { return mask.bits == 0 ? -1 : 0; } template > HWY_API size_t FindKnownLastTrue(D /* tag */, const Mask1 /* m */) { return 0; // There is only one lane and we know it is true. } // ------------------------------ Compress, CompressBits template struct CompressIsPartition { enum { value = 1 }; }; template HWY_API Vec1 Compress(Vec1 v, const Mask1 /* mask */) { // A single lane is already partitioned by definition. return v; } template HWY_API Vec1 CompressNot(Vec1 v, const Mask1 /* mask */) { // A single lane is already partitioned by definition. 
return v; } // ------------------------------ CompressStore template > HWY_API size_t CompressStore(Vec1 v, const Mask1 mask, D d, T* HWY_RESTRICT unaligned) { StoreU(Compress(v, mask), d, unaligned); return CountTrue(d, mask); } // ------------------------------ CompressBlendedStore template > HWY_API size_t CompressBlendedStore(Vec1 v, const Mask1 mask, D d, T* HWY_RESTRICT unaligned) { if (!mask.bits) return 0; StoreU(v, d, unaligned); return 1; } // ------------------------------ CompressBits template HWY_API Vec1 CompressBits(Vec1 v, const uint8_t* HWY_RESTRICT /*bits*/) { return v; } // ------------------------------ CompressBitsStore template > HWY_API size_t CompressBitsStore(Vec1 v, const uint8_t* HWY_RESTRICT bits, D d, T* HWY_RESTRICT unaligned) { const Mask1 mask = LoadMaskBits(d, bits); StoreU(Compress(v, mask), d, unaligned); return CountTrue(d, mask); } // ------------------------------ Expand // generic_ops-inl.h requires Vec64/128, so implement [Load]Expand here. #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif template HWY_API Vec1 Expand(Vec1 v, const Mask1 mask) { return IfThenElseZero(mask, v); } // ------------------------------ LoadExpand template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { return MaskedLoad(mask, d, unaligned); } // ------------------------------ WidenMulPairwiseAdd template HWY_API Vec1 WidenMulPairwiseAdd(D32 /* tag */, Vec1 a, Vec1 b) { return Vec1(F32FromBF16(a.raw)) * Vec1(F32FromBF16(b.raw)); } template HWY_API Vec1 WidenMulPairwiseAdd(D32 /* tag */, Vec1 a, Vec1 b) { return Vec1(a.raw * b.raw); } // ------------------------------ SatWidenMulPairwiseAdd #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #else #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #endif template HWY_API Vec1 SatWidenMulPairwiseAdd(DI16 /* tag */, Vec1 a, Vec1 b) { // Saturation of a.raw * b.raw is not needed on the HWY_SCALAR target as the // input vectors only have 1 lane on the HWY_SCALAR target and as // a.raw * b.raw is between -32640 and 32385, which is already within the // range of an int16_t. // On other targets, a saturated addition of a[0]*b[0] + a[1]*b[1] is needed // as it is possible for the addition of a[0]*b[0] + a[1]*b[1] to overflow if // a[0], a[1], b[0], and b[1] are all non-zero and b[0] and b[1] both have the // same sign. return Vec1(static_cast(a.raw) * static_cast(b.raw)); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) template HWY_API Vec1 ReorderWidenMulAccumulate(D32 /* tag */, Vec1 a, Vec1 b, const Vec1 sum0, Vec1& /* sum1 */) { return MulAdd(Vec1(F32FromBF16(a.raw)), Vec1(F32FromBF16(b.raw)), sum0); } template HWY_API Vec1 ReorderWidenMulAccumulate(D32 /* tag */, Vec1 a, Vec1 b, const Vec1 sum0, Vec1& /* sum1 */) { return Vec1(a.raw * b.raw + sum0.raw); } template HWY_API Vec1 ReorderWidenMulAccumulate(DU32 /* tag */, Vec1 a, Vec1 b, const Vec1 sum0, Vec1& /* sum1 */) { return Vec1(static_cast(a.raw) * b.raw + sum0.raw); } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec1 RearrangeToOddPlusEven(Vec1 sum0, Vec1 /* sum1 */) { return sum0; // invariant already holds } // ================================================== REDUCTIONS // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum. // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();
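// Usage sketch (illustrative only, not part of the original source). On the
// HWY_SCALAR target a "vector" holds exactly one lane, so the usual Highway
// idioms degenerate to plain scalar code, e.g.:
//
//   #include "hwy/highway.h"
//   namespace hn = hwy::HWY_NAMESPACE;
//
//   float SafeReciprocal(float x) {
//     const hn::ScalableTag<float> d;  // exactly 1 lane on HWY_SCALAR
//     const auto v = hn::Set(d, x);
//     const auto recip = hn::ApproximateReciprocal(v);
//     // Zero inputs yield an arbitrary value (see ApproximateReciprocal
//     // above), hence the IfThenElse guard.
//     return hn::GetLane(hn::IfThenElse(hn::Eq(v, hn::Zero(d)),
//                                       hn::Zero(d), recip));
//   }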