// Copyright 2021 Google LLC
// Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the
// return type of functions that do not take a vector argument, or as an
// argument type if the function only has a template argument for D, or for
// explicit type names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template
// argument for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<decltype(d)> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
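// Illustrative note (added; not part of the API contract): for f32 lanes,
// MaxExponentTimes2 corresponds to the bit pattern 0xFF000000, so the
// Set/BitCast below yields 0x7F800000 in every lane, i.e. +infinity. A
// hypothetical caller could write:
//   const ScalableTag<float> d;
//   const Vec<decltype(d)> pos_inf = Inf(d);  // every lane is +inf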
template HWY_API Vec Inf(D d) { const RebindToUnsigned du; using T = TFromD; using TU = TFromD; const TU max_x2 = static_cast(MaxExponentTimes2()); return BitCast(d, Set(du, max_x2 >> 1)); } // ------------------------------ ZeroExtendResizeBitCast // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 // target is in emu128-inl.h, and the implementation of // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR namespace detail { #if HWY_HAVE_SCALABLE template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Repartition d_to_u8; const auto resized = ResizeBitCast(d_to_u8, v); // Zero the upper bytes which were not present/valid in d_from. const size_t num_bytes = Lanes(Repartition()); return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized)); } #else // target that uses fixed-size vectors // Truncating or same-size resizing cast: same as ResizeBitCast template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { return ResizeBitCast(d_to, v); } // Resizing cast to vector that has twice the number of lanes of the source // vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Twice dt_from; return BitCast(d_to, ZeroExtendVector(dt_from, v)); } // Resizing cast to vector that has more than twice the number of lanes of the // source vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { using TFrom = TFromD; constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); const Repartition d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), ResizeBitCast(d_resize_to, v))); } #endif // HWY_HAVE_SCALABLE } // namespace detail #endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR template HWY_API VFromD ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { return detail::ZeroExtendResizeBitCast(hwy::SizeTag(), hwy::SizeTag(), d_to, d_from, v); } // ------------------------------ SafeFillN template > HWY_API void SafeFillN(const size_t num, const T value, D d, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = value; } #else BlendedStore(Set(d, value), FirstN(d, num), d, to); #endif } // ------------------------------ SafeCopyN template > HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = from[i]; } #else const Mask mask = FirstN(d, num); BlendedStore(MaskedLoad(mask, d, from), mask, d, to); #endif } // ------------------------------ MaskFalse #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASK_FALSE #undef HWY_NATIVE_MASK_FALSE #else #define HWY_NATIVE_MASK_FALSE #endif template HWY_API Mask MaskFalse(D d) { return MaskFromVec(Zero(d)); } #endif // HWY_NATIVE_MASK_FALSE // ------------------------------ BitwiseIfThenElse #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define 
HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return Or(And(mask, yes), AndNot(mask, no)); } #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE // ------------------------------ PromoteMaskTo #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_MASK_TO #undef HWY_NATIVE_PROMOTE_MASK_TO #else #define HWY_NATIVE_PROMOTE_MASK_TO #endif template HWY_API Mask PromoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert( sizeof(TFromD) > sizeof(TFromD), "sizeof(TFromD) must be greater than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec(BitCast( d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_PROMOTE_MASK_TO // ------------------------------ DemoteMaskTo #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_MASK_TO #undef HWY_NATIVE_DEMOTE_MASK_TO #else #define HWY_NATIVE_DEMOTE_MASK_TO #endif template HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert(sizeof(TFromD) < sizeof(TFromD), "sizeof(TFromD) must be less than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec( BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_DEMOTE_MASK_TO // ------------------------------ CombineMasks #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMBINE_MASKS #undef HWY_NATIVE_COMBINE_MASKS #else #define HWY_NATIVE_COMBINE_MASKS #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask CombineMasks(D d, Mask> hi, Mask> lo) { const Half dh; return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo))); } #endif #endif // HWY_NATIVE_COMBINE_MASKS // ------------------------------ LowerHalfOfMask #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif template HWY_API Mask LowerHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(LowerHalf(d, VecFromMask(dt, m))); } #endif // HWY_NATIVE_LOWER_HALF_OF_MASK // ------------------------------ UpperHalfOfMask #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK #undef HWY_NATIVE_UPPER_HALF_OF_MASK #else #define HWY_NATIVE_UPPER_HALF_OF_MASK #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask UpperHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(UpperHalf(d, VecFromMask(dt, m))); } #endif #endif // HWY_NATIVE_UPPER_HALF_OF_MASK // ------------------------------ OrderedDemote2MasksTo #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #else #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask a, Mask b) { static_assert( sizeof(TFromD) == sizeof(TFromD) / 2, "sizeof(TFromD) must be equal to sizeof(TFromD) / 2"); static_assert(IsSame, Mask, DFrom>>>(), "Mask must be the same type as " "Mask, DFrom>>>()"); const RebindToSigned di_from; const RebindToSigned di_to; const auto va = 
BitCast(di_from, VecFromMask(d_from, a)); const auto vb = BitCast(di_from, VecFromMask(d_from, b)); return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb))); } #endif #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTERLEAVE_WHOLE #undef HWY_NATIVE_INTERLEAVE_WHOLE #else #define HWY_NATIVE_INTERLEAVE_WHOLE #endif #if HWY_TARGET != HWY_SCALAR template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) { // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if // D().MaxBytes() <= 16 is true return InterleaveLower(d, a, b); } template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) { // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if // D().MaxBytes() <= 16 is true return InterleaveUpper(d, a, b); } // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3 // is implemented in x86_256-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is // implemented in x86_512-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256 // is implemented in wasm_256-inl.h. #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_INTERLEAVE_WHOLE #if HWY_TARGET != HWY_SCALAR // The InterleaveWholeLower without the optional D parameter is generic for all // vector lengths. template HWY_API V InterleaveWholeLower(V a, V b) { return InterleaveWholeLower(DFromV(), a, b); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ AddSub template , 1)> HWY_API V AddSub(V a, V b) { // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b) return Sub(a, b); } // AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on // SSSE3/SSE4/AVX2/AVX3 // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on // AVX2/AVX3 template >()) ? 32 : sizeof(TFromV)))> HWY_API V AddSub(V a, V b) { using D = DFromV; using T = TFromD; using TNegate = If(), MakeSigned, T>; const D d; const Rebind d_negate; // Negate the even lanes of b const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b)))); return Add(a, negated_even_b); } // ------------------------------ MaskedAddOr etc. 
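// Added note: the masked ops below all follow the same pattern: compute the
// full-width result, then keep it only where the mask is true and otherwise
// return `no`. Illustrative sketch of a hypothetical caller (the names below
// are examples, not part of this header):
//   const ScalableTag<int32_t> d;
//   const auto a = Iota(d, 0);               // 0, 1, 2, ...
//   const auto b = Set(d, 10);
//   const auto m = Lt(a, Set(d, 4));         // true for lanes 0..3
//   const auto r = MaskedAddOr(a, m, a, b);  // lanes 0..3: a+b, others: a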
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_ARITH #undef HWY_NATIVE_MASKED_ARITH #else #define HWY_NATIVE_MASKED_ARITH #endif template HWY_API V MaskedMinOr(V no, M m, V a, V b) { return IfThenElse(m, Min(a, b), no); } template HWY_API V MaskedMaxOr(V no, M m, V a, V b) { return IfThenElse(m, Max(a, b), no); } template HWY_API V MaskedAddOr(V no, M m, V a, V b) { return IfThenElse(m, Add(a, b), no); } template HWY_API V MaskedSubOr(V no, M m, V a, V b) { return IfThenElse(m, Sub(a, b), no); } template HWY_API V MaskedMulOr(V no, M m, V a, V b) { return IfThenElse(m, Mul(a, b), no); } template HWY_API V MaskedDivOr(V no, M m, V a, V b) { return IfThenElse(m, Div(a, b), no); } template HWY_API V MaskedModOr(V no, M m, V a, V b) { return IfThenElse(m, Mod(a, b), no); } template HWY_API V MaskedSatAddOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedAdd(a, b), no); } template HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedSub(a, b), no); } #endif // HWY_NATIVE_MASKED_ARITH // ------------------------------ IfNegativeThenNegOrUndefIfZero #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #else #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #endif template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE const auto zero = Zero(DFromV()); return MaskedSubOr(v, Lt(mask, zero), zero, v); #else return IfNegativeThenElse(mask, Neg(v), v); #endif } #endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { return CopySign(v, Xor(mask, v)); } // ------------------------------ SaturatedNeg #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32 #undef HWY_NATIVE_SATURATED_NEG_8_16_32 #else #define HWY_NATIVE_SATURATED_NEG_8_16_32 #endif template HWY_API V SaturatedNeg(V v) { const DFromV d; return SaturatedSub(Zero(d), v); } template )> HWY_API V SaturatedNeg(V v) { const DFromV d; #if HWY_TARGET == HWY_RVV || \ (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES) // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions return SaturatedSub(Zero(d), v); #else // ~v[i] - ((v[i] > LimitsMin()) ? -1 : 0) is equivalent to // (v[i] > LimitsMin) ? (-v[i]) : LimitsMax() since // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and // ~LimitsMin() == LimitsMax(). 
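// Worked example (added) for int8_t: v = -128 equals LimitsMin, so the mask is
// zero and the result is ~(-128) - 0 = 127 = LimitsMax. For v = 5 the mask is
// all-ones (-1), giving ~5 - (-1) = -6 + 1 = -5, the ordinary negation.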
return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin())))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_8_16_32 #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_64 #undef HWY_NATIVE_SATURATED_NEG_64 #else #define HWY_NATIVE_SATURATED_NEG_64 #endif template )> HWY_API V SaturatedNeg(V v) { #if HWY_TARGET == HWY_RVV || \ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES) // RVV/NEON/SVE have native I64 SaturatedSub instructions const DFromV d; return SaturatedSub(Zero(d), v); #else const auto neg_v = Neg(v); return Add(neg_v, BroadcastSignBit(And(v, neg_v))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_64 // ------------------------------ SaturatedAbs #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif template HWY_API V SaturatedAbs(V v) { return Max(v, SaturatedNeg(v)); } #endif // ------------------------------ Reductions // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled, // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set. // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the // SumOfLanes overloads. For the latter group, we here define the remaining // overloads, plus ReduceSum which uses them plus GetLane. #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SCALAR #undef HWY_NATIVE_REDUCE_SCALAR #else #define HWY_NATIVE_REDUCE_SCALAR #endif namespace detail { // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes. struct AddFunc { template V operator()(V a, V b) const { return Add(a, b); } }; struct MinFunc { template V operator()(V a, V b) const { return Min(a, b); } }; struct MaxFunc { template V operator()(V a, V b) const { return Max(a, b); } }; // No-op for vectors of at most one block. template HWY_INLINE VFromD ReduceAcrossBlocks(D, Func, VFromD v) { return v; } // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and // WASM_EMU256. AVX3 has its own overload. template HWY_INLINE VFromD ReduceAcrossBlocks(D /*d*/, Func f, VFromD v) { return f(v, SwapAdjacentBlocks(v)); } // These return the reduction result broadcasted across all lanes. They assume // the caller has already reduced across blocks. template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v10) { return f(v10, Reverse2(d, v10)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v3210) { const VFromD v0123 = Reverse4(d, v3210); const VFromD v03_12_12_03 = f(v3210, v0123); const VFromD v12_03_03_12 = Reverse2(d, v03_12_12_03); return f(v03_12_12_03, v12_03_03_12); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v76543210) { // The upper half is reversed from the lower half; omit for brevity. const VFromD v34_25_16_07 = f(v76543210, Reverse8(d, v76543210)); const VFromD v0347_1625_1625_0347 = f(v34_25_16_07, Reverse4(d, v34_25_16_07)); return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. 
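// Added note: 8-bit lanes are reduced via 16-bit lanes. `even` zero-extends
// the low byte of each u16 and `odd` the high byte; reducing f(even, odd) in
// the 16-bit domain matches the 8-bit reduction (sums wrap modulo 256 after
// the BitCast back), and DupEven/DupOdd then broadcast the low byte of the
// result so every byte of the returned vector holds the reduction.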
const VW even = And(vw, Set(dw, 0xFF)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // Sign-extend // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. const VW even = ShiftRight<8>(ShiftLeft<8>(vw)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } } // namespace detail template HWY_API VFromD SumOfLanes(D d, VFromD v) { const detail::AddFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MinOfLanes(D d, VFromD v) { const detail::MinFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MaxOfLanes(D d, VFromD v) { const detail::MaxFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API TFromD ReduceSum(D d, VFromD v) { return GetLane(SumOfLanes(d, v)); } template HWY_API TFromD ReduceMin(D d, VFromD v) { return GetLane(MinOfLanes(d, v)); } template HWY_API TFromD ReduceMax(D d, VFromD v) { return GetLane(MaxOfLanes(d, v)); } #endif // HWY_NATIVE_REDUCE_SCALAR // Corner cases for both generic and native implementations: // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm) template HWY_API TFromD ReduceSum(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMin(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMax(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { return v; } // N=4 for 8-bit is still less than the minimum native size. // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceSum operations #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 #undef HWY_NATIVE_REDUCE_SUM_4_UI8 #else #define HWY_NATIVE_REDUCE_SUM_4_UI8 #endif template HWY_API TFromD ReduceSum(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceSum(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_SUM_4_UI8 // RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceMin/ReduceMax operations #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #else #define HWY_NATIVE_REDUCE_MINMAX_4_UI8 #endif template HWY_API TFromD ReduceMin(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMin(dw, PromoteTo(dw, v))); } template HWY_API TFromD ReduceMax(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMax(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8 // ------------------------------ IsInf, IsFinite // AVX3 has target-specific implementations of these. 
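// Sketch of the generic approach (added): infinity has all exponent bits set
// and a zero mantissa. Adding vu to itself discards the sign bit, so a lane is
// +/-inf exactly when the doubled bits equal MaxExponentTimes2 (0xFF000000 for
// f32). IsFinite instead isolates the exponent field and requires it to be
// below MaxExponentField.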
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif template > HWY_API MFromD IsInf(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask( d, Eq(Add(vu, vu), Set(du, static_cast>(hwy::MaxExponentTimes2())))); } // Returns whether normal/subnormal/zero. template > HWY_API MFromD IsFinite(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code // for AVX2 if we instead add vu + vu. #if HWY_COMPILER_MSVC const VFromD shl = ShiftLeft<1>(vu); #else const VFromD shl = Add(vu, vu); #endif // Then shift right so we can compare with the max exponent (cannot compare // with MaxExponentTimes2 directly because it is negative and non-negative // floats would be greater). const VFromD exp = BitCast(di, ShiftRight() + 1>(shl)); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } #endif // HWY_NATIVE_ISINF // ------------------------------ LoadInterleaved2 #if HWY_IDE || \ (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { const VFromD A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] const VFromD B = LoadU(d, unaligned + Lanes(d)); v0 = ConcatEven(d, B, A); v1 = ConcatOdd(d, B, A); } template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); } // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) namespace detail { #if HWY_IDE template HWY_INLINE V ShuffleTwo1230(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo2301(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo3012(V a, V /* b */) { return a; } #endif // HWY_IDE // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& A, VFromD& B, VFromD& C) { constexpr size_t kN = MaxLanes(d); A = LoadU(d, unaligned + 0 * kN); B = LoadU(d, unaligned + 1 * kN); C = LoadU(d, unaligned + 2 * kN); } } // namespace detail template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; using VU = VFromD; // Compact notation so these fit on one line: 12 := v1[2]. V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
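// Added note: each output vector gathers its bytes from all three loaded
// blocks A/B/C. TableLookupBytesOr0 zeroes every byte whose index has the MSB
// (Z = 0x80) set, so the three partial results are disjoint and Xor3 merges
// them (equivalent to OR because no byte position is nonzero in more than one
// operand).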
constexpr uint8_t Z = 0x80; const VU idx_v0A = Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v0B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z); const VU idx_v0C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13); const VU idx_v1A = Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v1B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z); const VU idx_v1C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14); const VU idx_v2A = Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v2B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z); const VU idx_v2C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15); const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A)); const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B)); const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C)); const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A)); const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B)); const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C)); const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A)); const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B)); const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 8-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; using VU = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
constexpr uint8_t Z = 0x80; const VU idx_v0A = Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v0B = Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v0C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1A = Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1B = Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2A = Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2B = Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0); const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A)); const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B)); const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C)); const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A)); const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B)); const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C)); const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A)); const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B)); const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 16-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; const Repartition du8; using V = VFromD; using VU8 = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. Same as above, // but each element of the array contains a byte index for a byte of a lane. 
constexpr uint8_t Z = 0x80; const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z); const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B); const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z); const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D); const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z); const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F); const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A)); const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B)); const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C)); const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A)); const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B)); const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C)); const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A)); const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B)); const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { using V = VFromD; V A; // v0[1] v2[0] v1[0] v0[0] V B; // v1[2] v0[2] v2[1] v1[1] V C; // v2[3] v1[3] v0[3] v2[2] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); const V vxx_02_03_xx = OddEven(C, B); v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx); // Shuffle2301 takes the upper/lower halves of the output from one input, so // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use // OddEven because it may have higher throughput than Shuffle. const V vxx_xx_10_11 = OddEven(A, B); const V v12_13_xx_xx = OddEven(B, C); v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx); const V vxx_20_21_xx = OddEven(B, A); v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { VFromD A; // v1[0] v0[0] VFromD B; // v0[1] v2[0] VFromD C; // v2[1] v1[1] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); v0 = OddEven(B, A); v1 = CombineShiftRightBytes)>(d, C, A); v2 = OddEven(C, B); } template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); } // ------------------------------ LoadInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
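// Added note: LoadTransposedBlocks4 only loads four consecutive full vectors;
// the de-interleaving itself is done by the LoadInterleaved4 overloads below
// via a sequence of InterleaveLower/InterleaveUpper steps, i.e. an in-register
// 4 x N transpose.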
template HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& vA, VFromD& vB, VFromD& vC, VFromD& vD) { constexpr size_t kN = MaxLanes(d); vA = LoadU(d, unaligned + 0 * kN); vB = LoadU(d, unaligned + 1 * kN); vC = LoadU(d, unaligned + 2 * kN); vD = LoadU(d, unaligned + 3 * kN); } } // namespace detail template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { const Repartition d64; using V64 = VFromD; using V = VFromD; // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD. // Here int[i] means the four interleaved values of the i-th 4-tuple and // int[3..0] indicates four consecutive 4-tuples (0 = least-significant). V vA; // int[13..10] int[3..0] V vB; // int[17..14] int[7..4] V vC; // int[1b..18] int[b..8] V vD; // int[1f..1c] int[f..c] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); // For brevity, the comments only list the lower block (upper = lower + 0x10) const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0] const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8] const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2] const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a] const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9] const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0] const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8] const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0] const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8] v0 = BitCast(d, InterleaveLower(d64, v10L, v10U)); v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U)); v2 = BitCast(d, InterleaveLower(d64, v32L, v32U)); v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { // In the last step, we interleave by half of the block size, which is usually // 8 bytes but half that for 8-bit x8 vectors. using TW = hwy::UnsignedFromSize; const Repartition dw; using VW = VFromD; // (Comments are for 256-bit vectors.) // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD. 
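// Notation (added): v3210[i] denotes the four interleaved values of the i-th
// 4-tuple, analogous to int[i] in the overload above.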
VFromD vA; // v3210[9]v3210[8] v3210[1]v3210[0] VFromD vB; // v3210[b]v3210[a] v3210[3]v3210[2] VFromD vC; // v3210[d]v3210[c] v3210[5]v3210[4] VFromD vD; // v3210[f]v3210[e] v3210[7]v3210[6] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const VFromD va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0] const VFromD vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4] const VFromD vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1] const VFromD vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5] const VW v10_b830 = // v10[b..8] v10[3..0] BitCast(dw, InterleaveLower(d, va820, vb931)); const VW v10_fc74 = // v10[f..c] v10[7..4] BitCast(dw, InterleaveLower(d, vec64, vfd75)); const VW v32_b830 = // v32[b..8] v32[3..0] BitCast(dw, InterleaveUpper(d, va820, vb931)); const VW v32_fc74 = // v32[f..c] v32[7..4] BitCast(dw, InterleaveUpper(d, vec64, vfd75)); v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74)); v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74)); v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74)); v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { using V = VFromD; V vA; // v3210[4] v3210[0] V vB; // v3210[5] v3210[1] V vC; // v3210[6] v3210[2] V vD; // v3210[7] v3210[3] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0] const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1] const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0] const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1] v0 = InterleaveLower(d, v10e, v10o); v1 = InterleaveUpper(d, v10e, v10o); v2 = InterleaveLower(d, v32e, v32o); v3 = InterleaveUpper(d, v32e, v32o); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { VFromD vA, vB, vC, vD; detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); v0 = InterleaveLower(d, vA, vC); v1 = InterleaveUpper(d, vA, vC); v2 = InterleaveLower(d, vB, vD); v3 = InterleaveUpper(d, vB, vD); } // Any T x1 template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); v3 = LoadU(d, unaligned + 3); } // ------------------------------ StoreInterleaved2 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks2(VFromD A, VFromD B, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); } } // namespace detail // >= 128 bit vector template HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, TFromD* HWY_RESTRICT unaligned) { const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] const auto v10U = InterleaveUpper(d, v0, v1); // .. 
v1[kN/2] v0[kN/2] detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); } // <= 64 bits template HWY_API void StoreInterleaved2(V part0, V part1, D d, TFromD* HWY_RESTRICT unaligned) { const Twice d2; const auto v0 = ZeroExtendVector(d2, part0); const auto v1 = ZeroExtendVector(d2, part1); const auto v10 = InterleaveLower(d2, v0, v1); StoreU(v10, d2, unaligned); } // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, // TableLookupBytes) namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks3(VFromD A, VFromD B, VFromD C, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); StoreU(C, d, unaligned + 2 * kN); } } // namespace detail // >= 128-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; using TU = TFromD; const auto k5 = Set(du, TU{5}); const auto k6 = Set(du, TU{6}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. const VFromD shuf_A0 = Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80, 5); // Cannot reuse shuf_A0 because it contains 5. const VFromD shuf_A1 = Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80); // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. // cannot reuse shuf_A0 (has 5) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const VFromD A = BitCast(d, A0 | A1 | A2); // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const Repartition du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be // filled from other vectors are 0 for blending. 
Note that these are byte // indices for 16-bit lanes. const VFromD shuf_A1 = Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5); const VFromD shuf_A2 = Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80); // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d, A0 | A1 | A2); // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) const VFromD A = BitCast( d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); const VFromD v1_321 = ShiftRightLanes<1>(d, v1); const VFromD v0_32 = ShiftRightLanes<2>(d, v0); const VFromD v21_v11 = OddEven(v2, v1_321); const VFromD v12_v02 = OddEven(v1_321, v0_32); // B: v1[2],v0[2], v2[1],v1[1] const VFromD B = BitCast( d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. const VFromD v23_v13 = OddEven(v2, v1_321); const VFromD v03_v22 = OddEven(v0, v2); // C: v2[3],v1[3],v0[3], v2[2] const VFromD C = BitCast( d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const VFromD A = InterleaveLower(d, v0, v1); const VFromD B = OddEven(v0, v2); const VFromD C = InterleaveUpper(d, v1, v2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // 64-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and first result. constexpr size_t kFullN = 16 / sizeof(TFromD); const Full128 du; const Full128> d_full; const auto k5 = Set(du, uint8_t{5}); const auto k6 = Set(du, uint8_t{6}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 
0x80 so lanes to be // filled from other vectors are 0 for blending. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const auto A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned + 0 * kFullN); // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B{BitCast(d_full, B0 | B1 | B2).raw}; StoreU(B, d, unaligned + 1 * kFullN); } // 64-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D dh, TFromD* HWY_RESTRICT unaligned) { const Twice d_full; const Full128 du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vectors will be named A, B; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. // .2..1..0 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned); // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // ..3. const auto shuf_B1 = shuf_A2 + k3; // .3.. 
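// Added note: adding k2/k3 (two or three lane-widths, in bytes) to a shuffle
// table advances every byte index to the next group of lanes, so the B tables
// are derived from the A tables without extra loads; 0x80 entries stay >= 0x80
// and therefore still select zero bytes.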
const auto shuf_B2 = shuf_A0 + k2; // 3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d_full, B0 | B1 | B2); StoreU(VFromD{B.raw}, dh, unaligned + MaxLanes(d_full)); } // 64-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { // (same code as 128-bit vector, 64-bit lanes) const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); const VFromD v21_v11 = InterleaveUpper(d, v1, v2); constexpr size_t kN = MaxLanes(d); StoreU(v10_v00, d, unaligned + 0 * kN); StoreU(v01_v20, d, unaligned + 1 * kN); StoreU(v21_v11, d, unaligned + 2 * kN); } // 64-bit lanes are handled by the N=1 case below. // <= 32-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. const VFromD A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // 32-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du8; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A2 = // ..1..0.. Load(du8, tbl_v2); const auto shuf_A1 = // ...1..0. CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); const auto shuf_A0 = // ....1..0 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. 
const auto A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // Single-element vector, any lane size: just store directly template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); StoreU(v2, d, unaligned + 2); } // ------------------------------ StoreInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks4(VFromD vA, VFromD vB, VFromD vC, VFromD vD, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(vA, d, unaligned + 0 * kN); StoreU(vB, d, unaligned + 1 * kN); StoreU(vC, d, unaligned + 2 * kN); StoreU(vD, d, unaligned + 3 * kN); } } // namespace detail // >= 128-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32L = ZipLower(dw, v2, v3); const auto v10U = ZipUpper(dw, v0, v1); const auto v32U = ZipUpper(dw, v2, v3); // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 const VFromD vB = BitCast(d, InterleaveUpper(dw, v10L, v32L)); const VFromD vC = BitCast(d, InterleaveLower(dw, v10U, v32U)); const VFromD vD = BitCast(d, InterleaveUpper(dw, v10U, v32U)); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = InterleaveLower(d, v0, v1); // v1[0] v0[0] const VFromD vB = InterleaveLower(d, v2, v3); const VFromD vC = InterleaveUpper(d, v0, v1); const VFromD vD = InterleaveUpper(d, v2, v3); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // 64-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // 64-bit vector, 64-bit lane template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] const auto B = InterleaveLower(d_full, v2, v3); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // <= 32-bit vectors template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. 
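// Added note: the four parts together occupy at most 16 bytes, so they are
// zipped within a single 128-bit vector, stored to an aligned stack buffer,
// and only the valid prefix is copied to `unaligned`, avoiding any write past
// the caller's buffer.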
const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(v3210, d_full, buf); CopyBytes(buf, unaligned); } #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED // ------------------------------ LoadN #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_N #undef HWY_NATIVE_LOAD_N #else #define HWY_NATIVE_LOAD_N #endif #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE namespace detail { template HWY_INLINE VFromD LoadNResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw // past the first (lowest-index) Lanes(d_from) lanes of v.raw if // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true (void)d_from; return ResizeBitCast(d_to, v); #else // On other targets such as PPC/NEON, the contents of any lanes past the first // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true. return ZeroExtendResizeBitCast(d_to, d_from, v); #endif } } // namespace detail template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : Zero(d); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : no; } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 1> d1; if (num_lanes >= 2) return LoadU(d, p); if (num_lanes == 0) return Zero(d); return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 1> d1; if (num_lanes >= 2) return LoadU(d, p); if (num_lanes == 0) return no; return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no); } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 2> d2; const Half d1; if (num_lanes >= 4) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); // Two or three lanes. const VFromD v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p)); return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 2> d2; if (num_lanes >= 4) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); // Two or three lanes. const VFromD v_lo = ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p))); return (num_lanes == 2) ? 
v_lo : InsertLane(v_lo, 2, p[2]); } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 4> d4; const Half d2; const Half d1; if (num_lanes >= 8) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); const size_t leading_len = num_lanes & 4; VFromD v_trailing = Zero(d4); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), v_trailing_lo2); } else { v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); } } else if ((num_lanes & 1) != 0) { v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); } if (leading_len != 0) { return Combine(d, v_trailing, LoadU(d4, p)); } else { return detail::LoadNResizeBitCast(d, d4, v_trailing); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 4> d4; const Half d2; const Half d1; if (num_lanes >= 8) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); const size_t leading_len = num_lanes & 4; VFromD v_trailing = ResizeBitCast(d4, no); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)), ResizeBitCast(d2, no)), v_trailing_lo2); } else { v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no), ResizeBitCast(d4, v_trailing_lo2)); } } else if ((num_lanes & 1) != 0) { v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]); } if (leading_len != 0) { return Combine(d, v_trailing, LoadU(d4, p)); } else { return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing)); } } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (num_lanes >= 16) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); const size_t leading_len = num_lanes & 12; VFromD v_trailing = Zero(d4); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), v_trailing_lo2); } else { v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); } } else if ((num_lanes & 1) != 0) { v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); } if (leading_len != 0) { if (leading_len >= 8) { const VFromD v_hi7 = ((leading_len & 4) != 0) ? 
Combine(d8, v_trailing, LoadU(d4, p + 8)) : detail::LoadNResizeBitCast(d8, d4, v_trailing); return Combine(d, v_hi7, LoadU(d8, p)); } else { return detail::LoadNResizeBitCast(d, d8, Combine(d8, v_trailing, LoadU(d4, p))); } } else { return detail::LoadNResizeBitCast(d, d4, v_trailing); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (num_lanes >= 16) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); const size_t leading_len = num_lanes & 12; VFromD v_trailing = ResizeBitCast(d4, no); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)), ResizeBitCast(d2, no)), v_trailing_lo2); } else { v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no), ResizeBitCast(d4, v_trailing_lo2)); } } else if ((num_lanes & 1) != 0) { v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]); } if (leading_len != 0) { if (leading_len >= 8) { const VFromD v_hi7 = ((leading_len & 4) != 0) ? Combine(d8, v_trailing, LoadU(d4, p + 8)) : ConcatUpperLower(d8, ResizeBitCast(d8, no), ResizeBitCast(d8, v_trailing)); return Combine(d, v_hi7, LoadU(d8, p)); } else { return ConcatUpperLower( d, ResizeBitCast(d, no), ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p)))); } } else { const Repartition du32; // lowest 4 bytes from v_trailing, next 4 from no. const VFromD lo8 = InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no)); return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8)); } } #if HWY_MAX_BYTES >= 32 template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes >= Lanes(d)) return LoadU(d, p); const Half dh; const size_t half_N = Lanes(dh); if (num_lanes <= half_N) { return ZeroExtendVector(d, LoadN(dh, p, num_lanes)); } else { const VFromD v_lo = LoadU(dh, p); const VFromD v_hi = LoadN(dh, p + half_N, num_lanes - half_N); return Combine(d, v_hi, v_lo); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes >= Lanes(d)) return LoadU(d, p); const Half dh; const size_t half_N = Lanes(dh); const VFromD no_h = LowerHalf(no); if (num_lanes <= half_N) { return ConcatUpperLower(d, no, ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes))); } else { const VFromD v_lo = LoadU(dh, p); const VFromD v_hi = LoadNOr(no_h, dh, p + half_N, num_lanes - half_N); return Combine(d, v_hi, v_lo); } } #endif // HWY_MAX_BYTES >= 32 template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const RebindToUnsigned du; return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes)); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const RebindToUnsigned du; return BitCast( d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes)); } #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE // For SVE and non-sanitizer AVX-512; RVV has its own specialization. 
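// Example usage of LoadN/StoreN for a remainder loop (illustrative sketch
// only; `AddScalarToArray` and its arguments are hypothetical and not part of
// the library):
//
//   void AddScalarToArray(float* HWY_RESTRICT p, size_t count, float x) {
//     const ScalableTag<float> d;
//     const size_t N = Lanes(d);
//     const auto vx = Set(d, x);
//     size_t i = 0;
//     for (; i + N <= count; i += N) {
//       StoreU(Add(LoadU(d, p + i), vx), d, p + i);
//     }
//     if (i != count) {  // 1..N-1 lanes remain; avoids reading past the end.
//       const size_t remaining = count - i;
//       StoreN(Add(LoadN(d, p + i, remaining), vx), d, p + i, remaining);
//     }
//   }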
template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { #if HWY_MEM_OPS_MIGHT_FAULT if (num_lanes <= 0) return Zero(d); #endif return MaskedLoad(FirstN(d, num_lanes), d, p); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { #if HWY_MEM_OPS_MIGHT_FAULT if (num_lanes <= 0) return no; #endif return MaskedLoadOr(no, FirstN(d, num_lanes), d, p); } #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE #endif // HWY_NATIVE_LOAD_N // ------------------------------ StoreN #if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_STORE_N #undef HWY_NATIVE_STORE_N #else #define HWY_NATIVE_STORE_N #endif #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE namespace detail { template HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { constexpr size_t kMinShrVectBytes = (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16; const FixedTag d_shift; return ResizeBitCast( dh, ShiftRightBytes(d_shift, ResizeBitCast(d_shift, v))); } template HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { return UpperHalf(dh, v); } } // namespace detail template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 0) { StoreU(v, d, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 1) { StoreU(v, d, p); } else if (max_lanes_to_store == 1) { const FixedTag, 1> d1; StoreU(LowerHalf(d1, v), d1, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 2> d2; const Half d1; if (max_lanes_to_store > 1) { if (max_lanes_to_store >= 4) { StoreU(v, d, p); } else { StoreU(ResizeBitCast(d2, v), d2, p); if (max_lanes_to_store == 3) { StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2); } } } else if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 4> d4; const Half d2; const Half d1; if (max_lanes_to_store <= 1) { if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } else if (max_lanes_to_store >= 8) { StoreU(v, d, p); } else if (max_lanes_to_store >= 4) { StoreU(LowerHalf(d4, v), d4, p); StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4, max_lanes_to_store - 4); } else { StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (max_lanes_to_store <= 1) { if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } else if (max_lanes_to_store >= 16) { StoreU(v, d, p); } else if (max_lanes_to_store >= 8) { StoreU(LowerHalf(d8, v), d8, p); StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8, max_lanes_to_store - 8); } else { StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store); } } #if HWY_MAX_BYTES >= 32 template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const size_t N = Lanes(d); if (max_lanes_to_store >= N) { StoreU(v, d, p); return; } const Half dh; const size_t half_N = Lanes(dh); if (max_lanes_to_store <= half_N) { StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store); } else { StoreU(LowerHalf(dh, v), dh, p); StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N); } } #endif // HWY_MAX_BYTES >= 32 #else // 
!HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const size_t N = Lanes(d); const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N); #if HWY_MEM_OPS_MIGHT_FAULT if (clamped_max_lanes_to_store == 0) return; #endif BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p); detail::MaybeUnpoison(p, clamped_max_lanes_to_store); } #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ Scatter #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SCATTER #undef HWY_NATIVE_SCATTER #else #define HWY_NATIVE_SCATTER #endif template > HWY_API void ScatterOffset(VFromD v, D d, T* HWY_RESTRICT base, VFromD> offset) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI offset_lanes[MaxLanes(d)]; Store(offset, di, offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < MaxLanes(d); ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template > HWY_API void ScatterIndex(VFromD v, D d, T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); for (size_t i = 0; i < MaxLanes(d); ++i) { base[index_lanes[i]] = lanes[i]; } } template > HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D d, T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); for (size_t i = 0; i < MaxLanes(d); ++i) { if (mask_lanes[i]) base[index_lanes[i]] = lanes[i]; } } #endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ Gather #if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_GATHER #undef HWY_NATIVE_GATHER #else #define HWY_NATIVE_GATHER #endif template > HWY_API VFromD GatherOffset(D d, const T* HWY_RESTRICT base, VFromD> offset) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI offset_lanes[MaxLanes(d)]; Store(offset, di, offset_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(offset_lanes[i] >= 0); CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template > HWY_API VFromD GatherIndex(D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } template > HWY_API VFromD MaskedGatherIndex(MFromD m, D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; 
static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(di)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0}; } return Load(d, lanes); } template > HWY_API VFromD MaskedGatherIndexOr(VFromD no, MFromD m, D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(di)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); HWY_ALIGN T no_lanes[MaxLanes(d)]; Store(no, d, no_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i]; } return Load(d, lanes); } #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ ScatterN/GatherN template > HWY_API void ScatterIndexN(VFromD v, D d, T* HWY_RESTRICT base, VFromD> index, const size_t max_lanes_to_store) { MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index); } template > HWY_API VFromD GatherIndexN(D d, const T* HWY_RESTRICT base, VFromD> index, const size_t max_lanes_to_load) { return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index); } // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif template HWY_API V AbsDiff(V a, V b) { return Sub(Max(a, b), Min(a, b)); } #endif // HWY_NATIVE_INTEGER_ABS_DIFF #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF #endif template ), HWY_IF_V_SIZE_GT_D(DFromV, (HWY_TARGET == HWY_SCALAR ? 
0 : 4))> HWY_API Vec>> SumsOf8AbsDiff(V a, V b) { const DFromV d; const RebindToUnsigned du; const RepartitionToWideX3 dw; return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b)))); } #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I64_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB #undef HWY_NATIVE_U32_SATURATED_ADDSUB #else #define HWY_NATIVE_U32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB #undef HWY_NATIVE_U64_SATURATED_ADDSUB #else #define HWY_NATIVE_U64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U64_SATURATED_ADDSUB // ------------------------------ Unsigned to signed demotions template , DN>>, hwy::EnableIf<(sizeof(TFromD) < sizeof(TFromV))>* = nullptr, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD DemoteTo(DN dn, V v) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. 
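// Worked example for a u16 -> i8 demotion (a sketch of the two steps above):
// the input 0xFFFF bitcasts to int16_t -1, which the signed demotion passes
// through as 0xFF; reinterpreted as unsigned, Min(0xFF, 0x7F) then clamps it
// to 127, the correct saturated result. In-range inputs are unaffected.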
const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #if HWY_TARGET != HWY_SCALAR || HWY_IDE template , DN>>, HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #endif // ------------------------------ PromoteLowerTo // There is no codegen advantage for a native version of this. It is provided // only for convenience. template HWY_API VFromD PromoteLowerTo(D d, V v) { // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V // because it cannot be deduced from D (could be either bf16 or f16). const Rebind, decltype(d)> dh; return PromoteTo(d, LowerHalf(dh, v)); } // ------------------------------ PromoteUpperTo #if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_UPPER_TO #undef HWY_NATIVE_PROMOTE_UPPER_TO #else #define HWY_NATIVE_PROMOTE_UPPER_TO #endif // This requires UpperHalf. #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API VFromD PromoteUpperTo(D d, V v) { // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V // because it cannot be deduced from D (could be either bf16 or f16). const Rebind, decltype(d)> dh; return PromoteTo(d, UpperHalf(dh, v)); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_PROMOTE_UPPER_TO // ------------------------------ PromoteEvenTo/PromoteOddTo #if HWY_TARGET != HWY_SCALAR namespace detail { // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as // there are target-specific specializations for some of the // detail::PromoteEvenTo and detail::PromoteOddTo cases on // SVE/PPC/SSE2/SSSE3/SSE4/AVX2. // All targets except HWY_SCALAR use the implementations of // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at // least some of the PromoteEvenTo and PromoteOddTo cases. // Signed to signed PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, need to shift each lane of the bitcasted vector // left by kToLaneSize * 4 bits to get the bits of the even source lanes into // the upper kToLaneSize * 4 bits of even_in_hi. const auto even_in_hi = ShiftLeft(BitCast(d_to, v)); #else // On big-endian targets, the bits of the even source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. 
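// Either way, the arithmetic right shift below then moves the even source
// lane into the lower half of each wide lane and sign-extends it (e.g. for
// i8 -> i16, the even byte ends up in the low byte with its sign bit
// replicated into the high byte).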
const auto even_in_hi = BitCast(d_to, v); #endif // Right-shift even_in_hi by kToLaneSize * 4 bits return ShiftRight(even_in_hi); } template HWY_INLINE VFromD PromoteOddTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, the bits of the odd source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. const auto odd_in_hi = BitCast(d_to, v); #else // On big-endian targets, need to shift each lane of the bitcasted vector left // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the // upper kToLaneSize * 4 bits of odd_in_hi. const auto odd_in_hi = ShiftLeft(BitCast(d_to, v)); #endif // Right-shift odd_in_hi by kToLaneSize * 4 bits return ShiftRight(odd_in_hi); } // Unsigned to unsigned PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo( hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, the bits of the even source lanes are already // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. // Simply need to zero out the upper bits of each lane of the bitcasted // vector. return And(BitCast(d_to, v), Set(d_to, static_cast>(LimitsMax>()))); #else // On big-endian targets, need to shift each lane of the bitcasted vector // right by kToLaneSize * 4 bits to get the bits of the even source lanes into // the lower kToLaneSize * 4 bits of the result. // The right shift below will zero out the upper kToLaneSize * 4 bits of the // result. return ShiftRight(BitCast(d_to, v)); #endif } template HWY_INLINE VFromD PromoteOddTo( hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, need to shift each lane of the bitcasted vector // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into // the lower kToLaneSize * 4 bits of the result. // The right shift below will zero out the upper kToLaneSize * 4 bits of the // result. return ShiftRight(BitCast(d_to, v)); #else // On big-endian targets, the bits of the even source lanes are already // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. // Simply need to zero out the upper bits of each lane of the bitcasted // vector. return And(BitCast(d_to, v), Set(d_to, static_cast>(LimitsMax>()))); #endif } // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo // followed by BitCast to signed template HWY_INLINE VFromD PromoteEvenTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { const RebindToUnsigned du_to; return BitCast(d_to, PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag(), hwy::UnsignedTag(), du_to, v)); } template HWY_INLINE VFromD PromoteOddTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { const RebindToUnsigned du_to; return BitCast(d_to, PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag(), hwy::UnsignedTag(), du_to, v)); } // BF16->F32 PromoteEvenTo // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors. 
// VBF16 is considered to be a bfloat16_t vector if TFromV is the same // type as TFromV>> // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template >, hwy::EnableIf, TFromV>()>* = nullptr> HWY_INLINE VFromD PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, DF32 d_to, VBF16 v) { const RebindToUnsigned du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #else // On big-endian platforms, the even lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector. // Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #endif } // BF16->F32 PromoteOddTo // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors. // VBF16 is considered to be a bfloat16_t vector if TFromV is the same // type as TFromV>> // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template >, hwy::EnableIf, TFromV>()>* = nullptr> HWY_INLINE VFromD PromoteOddTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, DF32 d_to, VBF16 v) { const RebindToUnsigned du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, the odd lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector. // Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #else // On big-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. 
return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #endif } // Default PromoteEvenTo/PromoteOddTo implementations template HWY_INLINE VFromD PromoteEvenTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { return PromoteLowerTo(d_to, v); } template HWY_INLINE VFromD PromoteEvenTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const DFromV d; return PromoteLowerTo(d_to, ConcatEven(d, v, v)); } template HWY_INLINE VFromD PromoteOddTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const DFromV d; return PromoteLowerTo(d_to, ConcatOdd(d, v, v)); } } // namespace detail template )), class V2 = VFromD, D>>, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(V2))> HWY_API VFromD PromoteEvenTo(D d, V v) { return detail::PromoteEvenTo(hwy::TypeTag>(), hwy::SizeTag)>(), hwy::TypeTag>(), d, v); } template )), class V2 = VFromD, D>>, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(V2))> HWY_API VFromD PromoteOddTo(D d, V v) { return detail::PromoteOddTo(hwy::TypeTag>(), hwy::SizeTag)>(), hwy::TypeTag>(), d, v); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ float16_t <-> float #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_API VFromD PromoteTo(D df32, VFromD> v) { const RebindToSigned di32; const RebindToUnsigned du32; const Rebind du16; using VU32 = VFromD; const VU32 bits16 = PromoteTo(du32, BitCast(du16, v)); const VU32 sign = ShiftRight<15>(bits16); const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F)); const VU32 mantissa = And(bits16, Set(du32, 0x3FF)); const VU32 subnormal = BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)), Set(df32, 1.0f / 16384 / 1024))); const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15)); const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa); const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32); const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal); return BitCast(df32, Or(ShiftLeft<31>(sign), bits32)); } template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const RebindToSigned di16; const Rebind di32; const RebindToFloat df32; const RebindToUnsigned du32; // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the // mantissa of a F16 // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as // 2^(-14) is the smallest positive normal F16 value and as we want 13 // mantissa bits (including the implicit 1 bit) to the left of the // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13 // The biased exponent of round_incr[i] needs to be at least 126 as // (-14) + 13 + 127 is equal to 126 // We also want to biased exponent of round_incr[i] to be less than or equal // to 255 (which is equal to MaxExponentField()) // The biased F64 exponent of round_incr is equal to // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126) // hi9_bits[i] is equal to the upper 9 bits of v[i] const auto hi9_bits = ShiftRight<23>(BitCast(du32, v)); const auto k13 = Set(du32, uint32_t{13u}); // Minimum biased F32 exponent of round_incr const auto k126 = Set(du32, uint32_t{126u}); // round_incr_hi9_bits[i] is equivalent to // (hi9_bits[i] & 0x100) | // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126) #if 
HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 const auto k255 = Set(du32, uint32_t{255u}); const auto round_incr_hi9_bits = BitwiseIfThenElse( k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits); #else // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can // be incremented by 13 and clamped to the [13, 255] range without overflowing // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8 // exponent bits in an F32 // U8 Max can be used on targets other than SCALAR and EMU128 to clamp // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign // bit const Repartition du32_as_u8; const auto round_incr_hi9_bits = BitCast( du32, Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)), BitCast(du32_as_u8, k126))); #endif // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and // (round_incr_hi9_bits & 0xFF) is equal to // HWY_MAX(HWY_MIN((round_incr_hi9_bits & 0xFF) + 13, 255), 126) const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits)); // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa // and to move the fractional bits of the resulting non-NaN mantissa down to // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN // value const auto rounded_val = Add(v, round_incr); // rounded_val_bits is the bits of rounded_val as a U32 const auto rounded_val_bits = BitCast(du32, rounded_val); // rounded_val[i] is known to have the same biased exponent as round_incr[i] // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]| // is either a power of 2 that is greater than or equal to 2^-1 or infinity. // If rounded_val[i] is a finite F32 value, then // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is // in the range [0, 2]. // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800, // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the // resulting F16 mantissa, if rounded_v[i] is a finite F32 value. // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if // rounded_val[i] is a non-NaN value // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as // the biased exponent of round_incr[i] is at least 126 and as both v[i] and // round_incr[i] have the same sign bit // The ULP of a F32 value with a biased exponent of 126 is equal to // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to // -24) // The biased exponent (before subtracting by 126) needs to be clamped to the // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest // biased exponent of a F16. 
// The biased exponent of the resulting F16 value is equal to // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) + // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 auto f16_exp_bits = Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)), And(rounded_val_bits, Set(du32, static_cast(uint32_t{0xFFu} << 10)))), Set(du32, static_cast(uint32_t{157u} << 10))); #else auto f16_exp_bits = ShiftLeft<10>(BitCast( du32, Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits), BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))), BitCast(du32_as_u8, Set(du32, uint32_t{157}))))); #endif f16_exp_bits = Sub(f16_exp_bits, Set(du32, static_cast(uint32_t{126u} << 10))); const auto f16_unmasked_mant_bits = BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val)))); const auto f16_exp_mant_bits = OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits, Set(di32, int32_t{0x03FF})); // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17 // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo // operation const auto f16_bits_as_i32 = OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)), Set(di32, static_cast(0xFFFF8000u))); return BitCast(df16, DemoteTo(di16, f16_bits_as_i32)); } #endif // HWY_NATIVE_F16C // ------------------------------ F64->F16 DemoteTo #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 #else #define HWY_NATIVE_DEMOTE_F64_TO_F16 #endif #if HWY_HAVE_FLOAT64 template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const Rebind df64; const Rebind du64; const Rebind df32; // The mantissa bits of v[i] are first rounded using round-to-odd rounding to // the nearest F64 value that has the lower 29 bits zeroed out to ensure that // the result is correctly rounded to a F16. 
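// Round-to-odd is a "sticky" rounding: whenever any of the 29 discarded bits
// is nonzero, the lowest kept mantissa bit is forced to 1. This preserves
// whether the exact value was below, at, or above each halfway point of the
// narrower formats, so the F64->F32->F16 demotion that follows produces the
// same F16 as a single direct rounding would (no double-rounding error).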
const auto vf64_rounded = OrAnd( And(v, BitCast(df64, Set(du64, static_cast(0xFFFFFFFFE0000000u)))), BitCast(df64, Add(BitCast(du64, v), Set(du64, static_cast(0x000000001FFFFFFFu)))), BitCast(df64, Set(du64, static_cast(0x0000000020000000ULL)))); return DemoteTo(df16, DemoteTo(df32, vf64_rounded)); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_DEMOTE_F64_TO_F16 // ------------------------------ F16->F64 PromoteTo #if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 #undef HWY_NATIVE_PROMOTE_F16_TO_F64 #else #define HWY_NATIVE_PROMOTE_F16_TO_F64 #endif #if HWY_HAVE_FLOAT64 template HWY_API VFromD PromoteTo(D df64, VFromD> v) { return PromoteTo(df64, PromoteTo(Rebind(), v)); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_PROMOTE_F16_TO_F64 // ------------------------------ SumsOf2 #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE VFromD>> SumsOf2( TypeTag /*type_tag*/, hwy::SizeTag /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v)); } } // namespace detail template HWY_API VFromD>> SumsOf2(V v) { return detail::SumsOf2(hwy::TypeTag>(), hwy::SizeTag)>(), v); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ SumsOf4 namespace detail { template HWY_INLINE VFromD>> SumsOf4( TypeTag /*type_tag*/, hwy::SizeTag /*lane_size_tag*/, V v) { using hwy::HWY_NAMESPACE::SumsOf2; return SumsOf2(SumsOf2(v)); } } // namespace detail template HWY_API VFromD>> SumsOf4(V v) { return detail::SumsOf4(hwy::TypeTag>(), hwy::SizeTag)>(), v); } // ------------------------------ OrderedTruncate2To #if HWY_IDE || \ (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #else #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR || HWY_IDE template ) * 2), HWY_IF_LANES_D(DFromV>, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedTruncate2To(DN dn, V a, V b) { return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex #if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LEADING_ZERO_COUNT #undef HWY_NATIVE_LEADING_ZERO_COUNT #else #define HWY_NATIVE_LEADING_ZERO_COUNT #endif namespace detail { template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned di; const Repartition di16; // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed // by a unsigned right shift of the uint32_t bit representation of the // floating point values by 23, followed by an int16_t Min // operation as we are only interested in the biased exponent that would // result from a uint32_t to float conversion. // An int32_t to float vector conversion is also much more efficient on // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 // requires multiple instructions whereas an int32_t to float vector // conversion can be carried out using a single instruction on // SSE2/SSSE3/SSE4/AVX2. 
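// The Min with 158 handles inputs that have the sign bit set: bitcast to
// int32_t they convert to a negative float, whose shifted-down sign+exponent
// field exceeds 158. Every uint32_t in [2^31, 2^32) has a biased F32 exponent
// of exactly 158 (= 127 + 31), so clamping to 158 yields the correct result.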
const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v))); return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)), BitCast(di16, Set(d, 158)))); #else const auto f32_bits = BitCast(d, ConvertTo(df, v)); return BitCast(d, ShiftRight<23>(f32_bits)); #endif } template )> HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647. const DFromV d; const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned d_src; #else const RebindToUnsigned d_src; #endif const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v))); return ShiftRight<23>(f32_bits); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return TruncateTo(d, f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di; return BitCast(d, OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32))); #else return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); #endif } #endif // HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return U8FromU32(f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const Repartition du16; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const auto f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32)); return DemoteTo(d, f32_biased_exp_as_i16); #else const auto f32_biased_exp_as_u16 = OrderedTruncate2To( du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); return TruncateTo(d, f32_biased_exp_as_u16); #endif } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Half dq; const Rebind du32; const Repartition du16; const auto lo_half = LowerHalf(dh, v); const auto hi_half = UpperHalf(dh, v); const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const 
auto lo_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), BitCast(di32, f32_biased_exp_as_u32_q1)); const auto hi_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), BitCast(di32, f32_biased_exp_as_u32_q3)); return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, hi_f32_biased_exp_as_i16); #else const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, hi_f32_biased_exp_as_u16); #endif } #endif // HWY_TARGET != HWY_SCALAR #if HWY_TARGET == HWY_SCALAR template using F32ExpLzcntMinMaxRepartition = RebindToUnsigned; #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 template using F32ExpLzcntMinMaxRepartition = Repartition; #else template using F32ExpLzcntMinMaxRepartition = Repartition), 4)>, D>; #endif template using F32ExpLzcntMinMaxCmpV = VFromD>>; template HWY_INLINE F32ExpLzcntMinMaxCmpV F32ExpLzcntMinMaxBitCast(V v) { const DFromV d; const F32ExpLzcntMinMaxRepartition d2; return BitCast(d2, v); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { #if HWY_TARGET == HWY_SCALAR const uint64_t u64_val = GetLane(v); const float f32_val = static_cast(u64_val); const uint32_t f32_bits = BitCastScalar(f32_val); return Set(d, static_cast(f32_bits >> 23)); #else const Repartition du32; const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); const auto f32_biased_exp_adj = IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), BitCast(du32, Set(d, 0x0000002000000000u))); const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); return ShiftRight<32>(BitCast( d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); #endif } template HWY_INLINE V UIntToF32BiasedExp(V v) { const DFromV d; return UIntToF32BiasedExp(d, v); } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { return v; } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { // If v[i] >= 16777216 is true, make sure that the bit at // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact // conversion to single-precision floating point is rounded down. // This zeroing-out can be accomplished through the AndNot operation below. 
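// ShiftRight<24>(v) has a bit set exactly 24 positions below each set bit of
// v at index >= 24, and AndNot clears those bits, in particular the one at
// HighestSetBitIndex(v[i]) - 24. An F32 mantissa holds 24 significant bits,
// so the conversion can then no longer round up to the next power of two and
// overestimate the biased exponent.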
return AndNot(ShiftRight<24>(v), v); } } // namespace detail template HWY_API V HighestSetBitIndex(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); } template HWY_API V LeadingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } template HWY_API V TrailingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; using TU = TFromD; const auto vi = BitCast(di, v); const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } #endif // HWY_NATIVE_LEADING_ZERO_COUNT // ------------------------------ AESRound // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. #if HWY_TARGET != HWY_SCALAR || HWY_IDE // Define for white-box testing, even if native instructions are available. namespace detail { // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with // Vector Permute Instructions" and the accompanying assembly language // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . // // A brute-force 256 byte table lookup can also be made constant-time, and // possibly competitive on NEON, but this is more performance-portable // especially for x86 and large vectors. template // u8 HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL, V affine_tblU) { const DFromV du; const auto mask = Set(du, uint8_t{0xF}); // Change polynomial basis to GF(2^4) { const VFromD basisL = Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA); const VFromD basisU = Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD); const auto sL = And(state, mask); const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto gf4L = TableLookupBytes(basisL, sL); const auto gf4U = TableLookupBytes(basisU, sU); state = Xor(gf4L, gf4U); } // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and // cause TableLookupBytesOr0 to return 0. 
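// (The 0x80 entries at index 0 of the tables below are that infinity marker:
// TableLookupBytesOr0 returns 0 for any lane whose index byte has its most
// significant bit set, which is how the marker becomes a zero result.)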
const VFromD zetaInv = Dup128VecFromValues( du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3); const VFromD tbl = Dup128VecFromValues( du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4); const auto sL = And(state, mask); // L=low nibble, U=upper const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto sX = Xor(sU, sL); const auto invL = TableLookupBytes(zetaInv, sL); const auto invU = TableLookupBytes(tbl, sU); const auto invX = TableLookupBytes(tbl, sX); const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU))); const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX))); const auto affL = TableLookupBytesOr0(affine_tblL, outL); const auto affU = TableLookupBytesOr0(affine_tblU, outU); return Xor(affL, affU); } template // u8 HWY_INLINE V SubBytes(V state) { const DFromV du; // Linear skew (cannot bake 0x63 bias into the table because out* indices // may have the infinity flag set). const VFromD affineL = Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0, 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15); const VFromD affineU = Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF, 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E); return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU), Set(du, uint8_t{0x63})); } template // u8 HWY_INLINE V InvSubBytes(V state) { const DFromV du; const VFromD gF2P4InvToGF2P8InvL = Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13, 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7); const VFromD gF2P4InvToGF2P8InvU = Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12, 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA); // Apply the inverse affine transformation const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)), Or(ShiftLeft<3>(state), ShiftRight<5>(state)), Or(ShiftLeft<6>(state), ShiftRight<2>(state))), Set(du, uint8_t{0x05})); // The GF(2^8) multiplicative inverse is computed as follows: // - Changing the polynomial basis to GF(2^4) // - Computing the GF(2^4) multiplicative inverse // - Converting the GF(2^4) multiplicative inverse to the GF(2^8) // multiplicative inverse through table lookups using the // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL, gF2P4InvToGF2P8InvU); } } // namespace detail #endif // HWY_TARGET != HWY_SCALAR #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_AES #undef HWY_NATIVE_AES #else #define HWY_NATIVE_AES #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR namespace detail { template // u8 HWY_INLINE V ShiftRows(const V state) { const DFromV du; // transposed: state is column major const VFromD shift_row = Dup128VecFromValues( du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11); return TableLookupBytes(state, shift_row); } template // u8 HWY_INLINE V InvShiftRows(const V state) { const DFromV du; // transposed: state is column major const VFromD shift_row = Dup128VecFromValues( du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3); return TableLookupBytes(state, shift_row); } template // u8 HWY_INLINE V GF2P8Mod11BMulBy2(V v) { const DFromV du; const RebindToSigned di; // can only do signed comparisons const auto msb = Lt(BitCast(di, v), Zero(di)); const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). 
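// Worked example: for v = 0x80, Add(v, v) wraps to 0x00 and overflow = 0x1B,
// giving 0x1B; this matches 0x80 * 2 = 0x100 reduced by the AES polynomial
// 0x11B, since 0x100 ^ 0x11B = 0x1B.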
} template // u8 HWY_INLINE V MixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. // 1 2 3 1 // d are on diagonal, no permutation needed. // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). const VFromD v2301 = Dup128VecFromValues( du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); const VFromD v1230 = Dup128VecFromValues( du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). const auto s2301 = TableLookupBytes(state, v2301); const auto d_s2301 = Xor(d, s2301); const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230); return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms } template // u8 HWY_INLINE V InvMixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 14 11 13 9 // 9 14 11 13 // 13 9 14 11 // 11 13 9 14 const VFromD v2301 = Dup128VecFromValues( du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); const VFromD v1230 = Dup128VecFromValues( du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301); return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); } } // namespace detail template // u8 HWY_API V AESRound(V state, const V round_key) { // Intel docs swap the first two steps, but it does not matter because // ShiftRows is a permutation and SubBytes is independent of lane index. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = detail::MixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRound(V state, const V round_key) { // LIke AESRound, but without MixColumns. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template HWY_API V AESInvMixColumns(V state) { return detail::InvMixColumns(state); } template // u8 HWY_API V AESRoundInv(V state, const V round_key) { state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = detail::InvMixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRoundInv(V state, const V round_key) { // Like AESRoundInv, but without InvMixColumns. 
state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template )> HWY_API V AESKeyGenAssist(V v) { const DFromV d; const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0); const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12); const auto sub_word_result = detail::SubBytes(v); const auto rot_word_result = TableLookupBytes(sub_word_result, rotWordShuffle); return Xor(rot_word_result, rconXorMask); } // Constant-time implementation inspired by // https://www.bearssl.org/constanttime.html, but about half the cost because we // use 64x64 multiplies and 128-bit XORs. template HWY_API V CLMulLower(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } template HWY_API V CLMulUpper(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3))); m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } #endif // HWY_NATIVE_AES #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ PopulationCount #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif // This overload requires vectors to be at least 16 bytes, which is the case // for LMUL >= 2. #undef HWY_IF_POPCNT #if HWY_TARGET == HWY_RVV #define HWY_IF_POPCNT(D) \ hwy::EnableIf= 1 && D().MaxLanes() >= 16>* = nullptr #else // Other targets only have these two overloads which are mutually exclusive, so // no further conditions are required. 
#define HWY_IF_POPCNT(D) void* = nullptr #endif // HWY_TARGET == HWY_RVV template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> HWY_API V PopulationCount(V v) { const D d; const V lookup = Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); const auto lo = And(v, Set(d, uint8_t{0xF})); const auto hi = ShiftRight<4>(v); return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); } // RVV has a specialization that avoids the Set(). #if HWY_TARGET != HWY_RVV // Slower fallback for capped vectors. template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API V PopulationCount(V v) { const D d; // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 const V k33 = Set(d, uint8_t{0x33}); v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); } #endif // HWY_TARGET != HWY_RVV template , HWY_IF_U16_D(D)> HWY_API V PopulationCount(V v) { const D d; const Repartition d8; const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); } template , HWY_IF_U32_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d16; auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); } #if HWY_HAVE_INTEGER64 template , HWY_IF_U64_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d32; auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); } #endif #endif // HWY_NATIVE_POPCNT // ------------------------------ 8-bit multiplication #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif // 8 bit and fits in wider reg: promote template HWY_API V operator*(const V a, const V b) { const DFromV d; const Rebind>, decltype(d)> dw; const RebindToUnsigned du; // TruncateTo result const RebindToUnsigned dwu; // TruncateTo input const VFromD mul = PromoteTo(dw, a) * PromoteTo(dw, b); // TruncateTo is cheaper than ConcatEven. 
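// Worked example (illustrative, not part of the original comments): for u8
// lanes a = 200 and b = 3, the promoted 16-bit product is 600; TruncateTo
// keeps only the low byte, and 600 mod 256 = 88 matches the modulo-2^8
// wraparound expected of 8-bit multiplication.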
return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); } // 8 bit full reg: promote halves template HWY_API V operator*(const V a, const V b) { const DFromV d; const Half dh; const Twice> dw; const VFromD a0 = PromoteTo(dw, LowerHalf(dh, a)); const VFromD a1 = PromoteTo(dw, UpperHalf(dh, a)); const VFromD b0 = PromoteTo(dw, LowerHalf(dh, b)); const VFromD b1 = PromoteTo(dw, UpperHalf(dh, b)); const VFromD m0 = a0 * b0; const VFromD m1 = a1 * b1; return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); } #endif // HWY_NATIVE_MUL_8 // ------------------------------ 64-bit multiplication #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif // Single-lane i64 or u64 template HWY_API V operator*(V x, V y) { const DFromV d; using T = TFromD; using TU = MakeUnsigned; const TU xu = static_cast(GetLane(x)); const TU yu = static_cast(GetLane(y)); return Set(d, static_cast(xu * yu)); } template , HWY_IF_U64_D(D64), HWY_IF_V_SIZE_GT_D(D64, 8)> HWY_API V operator*(V x, V y) { RepartitionToNarrow d32; auto x32 = BitCast(d32, x); auto y32 = BitCast(d32, y); auto lolo = BitCast(d32, MulEven(x32, y32)); auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); return BitCast(D64{}, lolo + hi); } template , HWY_IF_I64_D(DI64), HWY_IF_V_SIZE_GT_D(DI64, 8)> HWY_API V operator*(V x, V y) { RebindToUnsigned du64; return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); } #endif // HWY_NATIVE_MUL_64 // ------------------------------ MulAdd / NegMulAdd #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMA #undef HWY_NATIVE_INT_FMA #else #define HWY_NATIVE_INT_FMA #endif #ifdef HWY_NATIVE_INT_FMSUB #undef HWY_NATIVE_INT_FMSUB #else #define HWY_NATIVE_INT_FMSUB #endif template HWY_API V MulAdd(V mul, V x, V add) { return Add(Mul(mul, x), add); } template HWY_API V NegMulAdd(V mul, V x, V add) { return Sub(add, Mul(mul, x)); } template HWY_API V MulSub(V mul, V x, V sub) { return Sub(Mul(mul, x), sub); } #endif // HWY_NATIVE_INT_FMA // ------------------------------ Integer MulSub / NegMulSub #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMSUB #undef HWY_NATIVE_INT_FMSUB #else #define HWY_NATIVE_INT_FMSUB #endif template HWY_API V MulSub(V mul, V x, V sub) { const DFromV d; const RebindToSigned di; return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub)))); } #endif // HWY_NATIVE_INT_FMSUB template HWY_API V NegMulSub(V mul, V x, V sub) { const DFromV d; const RebindToSigned di; return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub)))); } // ------------------------------ MulAddSub // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to // MulSub(mul, x, sub_or_add) template , 1)> HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulSub(mul, x, sub_or_add); } // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and // x86_512-inl.h template , 1), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 && hwy::IsFloat>()) ? 
0 : ((1 << 2) | (1 << 4) | (1 << 8))))> HWY_API V MulAddSub(V mul, V x, V sub_or_add) { using D = DFromV; using T = TFromD; using TNegate = If(), MakeSigned, T>; const D d; const Rebind d_negate; const auto add = OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); return MulAdd(mul, x, add); } // ------------------------------ Integer division #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_DIV #undef HWY_NATIVE_INT_DIV #else #define HWY_NATIVE_INT_DIV #endif namespace detail { template ))> HWY_INLINE Vec IntDivConvFloatToInt(D di, V vf) { return ConvertTo(di, vf); } template ))> HWY_INLINE Vec IntDivConvIntToFloat(D df, V vi) { return ConvertTo(df, vi); } #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 template )> HWY_INLINE Vec IntDivConvFloatToInt(D df, V vi) { return PromoteTo(df, vi); } // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32 // IntDivConvIntToFloat(df, vi) returns an approximation of // static_cast(v[i]) that is within 4 ULP of static_cast(v[i]) template )> HWY_INLINE Vec IntDivConvIntToFloat(D df32, V vi) { const Twice dt_f32; auto vf32 = ConvertTo(dt_f32, BitCast(RebindToSigned(), vi)); #if HWY_IS_LITTLE_ENDIAN const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); #else const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); #endif const RebindToSigned di32; hi_f32 = Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))), Set(df32, 1.0f))); return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); } template )> HWY_INLINE Vec IntDivConvIntToFloat(D df32, V vu) { const Twice dt_f32; auto vf32 = ConvertTo(dt_f32, BitCast(RebindToUnsigned(), vu)); #if HWY_IS_LITTLE_ENDIAN const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); #else const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); #endif return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); } #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 template , kOrigLaneSize)> HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { const DFromV d; const RebindToFloat df; // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the // [LimitsMin>(), // LimitsMax>()] range. // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also // guaranteed to be true if MakeFloat has at least kOrigLaneSize*8 + 1 // mantissa bits (including the implied one bit), where flt_q is equal to // static_cast>(a[i]) / static_cast>(b[i]), // even in the case where the magnitude of an inexact floating point division // result is rounded up. // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true // if (a[i] % b[i]) != 0 is true and MakeFloat has at least // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in // the case where the magnitude of an inexact floating point division result // is rounded up. 
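// Worked example (illustrative, not from the original source): with
// kOrigLaneSize == 2 promoted to T == int32_t, a[i] = -32768 and b[i] = 7
// give flt_q = -4681.142857...; float has 24 mantissa bits >= 2*8 + 1, so
// truncation toward zero yields -4681, matching C++ integer division.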
#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \ !HWY_HAVE_FLOAT64 // On Armv7, do division by multiplying by the ApproximateReciprocal // to avoid unnecessary overhead as F32 Div refines the approximate // reciprocal using 4 Newton-Raphson iterations const RebindToSigned di; const RebindToUnsigned du; const auto flt_b = ConvertTo(df, b); auto flt_recip_b = ApproximateReciprocal(flt_b); if (kOrigLaneSize > 1) { flt_recip_b = Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); } auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b)); const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a)); auto r1 = r0; // Need to negate r1[i] if a[i] < 0 is true if (IsSigned>()) { r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1); } // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i] auto abs_b = BitCast(du, b); if (IsSigned>()) { abs_b = BitCast(du, Abs(BitCast(di, abs_b))); } // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1. // Otherwise, set q1[i] to 0. // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast(r1[i]) >= TU(LimitsMax() + 1) >= abs_b[i] // will be true if r1[i] < 0 is true. auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b))); // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0 // Need to negate q1[i] if r0[i] and b[i] do not have the same sign auto q1_negate_mask = r0; if (IsSigned>()) { q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b)); } q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1); // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? // (((r0[i] ^ b[i]) < 0) ? 1 : -1) // Need to subtract q1[i] from q0[i] to get the final result return Sub(q0, BitCast(d, q1)); #else // On targets other than Armv7 NEON, use F16 or F32 division as most targets // other than Armv7 NEON have native F32 divide instructions return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b))); #endif } template , kOrigLaneSize), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal // multiplication steps are needed as the mantissa of MakeFloat has fewer // than kOrigLaneSize*8 + 1 bits using T = TFromV; #if HWY_HAVE_FLOAT64 using TF = MakeFloat; #else using TF = float; #endif const DFromV d; const RebindToSigned di; const RebindToUnsigned du; const Rebind df; if (!IsSigned()) { // If T is unsigned, set a[i] to (a[i] >= b[i] ? 
1 : 0) and set b[i] to 1 if // b[i] > LimitsMax>() is true const auto one = Set(di, MakeSigned{1}); a = BitCast( d, IfNegativeThenElse(BitCast(di, b), IfThenElseZero(RebindMask(di, Ge(a, b)), one), BitCast(di, a))); b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b))); } // LimitsMin() <= b[i] <= LimitsMax>() is now true const auto flt_b = IntDivConvIntToFloat(df, b); #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \ !HWY_HAVE_FLOAT64 auto flt_recip_b = ApproximateReciprocal(flt_b); flt_recip_b = Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); #else const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b); #endif auto q0 = IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b)); const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a)); auto q1 = IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b)); const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0); auto r3 = r1; #if !HWY_HAVE_FLOAT64 // Need two additional reciprocal multiplication steps for I64/U64 vectors if // HWY_HAVE_FLOAT64 is 0 if (sizeof(T) == 8) { const auto q2 = IntDivConvFloatToInt( di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b)); const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1); const auto q3 = IntDivConvFloatToInt( di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b)); r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2); q0 = Add(q0, BitCast(d, q2)); q1 = Add(q1, q3); } #endif // !HWY_HAVE_FLOAT64 auto r4 = r3; // Need to negate r4[i] if a[i] < 0 is true if (IsSigned>()) { r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4); } // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i] auto abs_b = BitCast(du, b); if (IsSigned>()) { abs_b = BitCast(du, Abs(BitCast(di, abs_b))); } // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1. // Otherwise, set r4[i] to 0. // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast(r4[i]) >= TU(LimitsMax() + 1) >= abs_b[i] // will be true if r4[i] < 0 is true. auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b))); // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0 // Need to negate q4[i] if r3[i] and b[i] do not have the same sign auto q4_negate_mask = r3; if (IsSigned>()) { q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b)); } q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4); // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? // (((r3[i] ^ b[i]) < 0) ? 1 : -1) // The final result is equal to q0[i] + q1[i] - q4[i] return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4)); } template ) == 1) ? 
4 : 2))> HWY_INLINE V IntDiv(V a, V b) { using T = TFromV; // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32 using TW = MakeWide< If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV) == 1), MakeWide, T>>; const DFromV d; const Rebind dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned to avoid // unnecessary overhead const RebindToSigned dw_i; // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned if // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead const If<(kOrigLaneSize < sizeof(T)), RebindToSigned, decltype(d)> d_demote_to; #else // On other targets, promote to TW and demote to T const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif return BitCast( d, DemoteTo(d_demote_to, IntDivUsingFloatDiv( PromoteTo(dw_i, a), PromoteTo(dw_i, b)))); } template HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const RepartitionToWide dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned to avoid // unnecessary overhead const RebindToSigned dw_i; // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned> if // kOrigLaneSize < sizeof(TFromV) to avoid unnecessary overhead const If<(kOrigLaneSize < sizeof(TFromV)), RebindToSigned, decltype(d)> d_demote_to; #else // On other targets, promote to MakeWide> and demote to TFromV const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif return BitCast(d, OrderedDemote2To( d_demote_to, IntDivUsingFloatDiv( PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)), IntDivUsingFloatDiv( PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b)))); } #if !HWY_HAVE_FLOAT16 template ), HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Rebind>, decltype(d)> dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV to avoid unnecessary // overhead const RebindToSigned dw_i; #else // On other targets, demote from MakeWide> to TFromV const decltype(dw) dw_i; #endif return DemoteTo(d, BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b)))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const RepartitionToWide dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV to avoid unnecessary // overhead const RebindToSigned dw_i; #else // On other targets, demote from MakeWide> to TFromV const decltype(dw) dw_i; #endif return OrderedDemote2To( d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))), BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)))); } #endif // !HWY_HAVE_FLOAT16 template HWY_INLINE V IntDiv(V a, V b) { return IntDivUsingFloatDiv(a, b); } #if HWY_HAVE_FLOAT64 template ), HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Rebind df64; return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Half dh; const Repartition df64; return Combine( d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))), DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b)))); } #endif // HWY_HAVE_FLOAT64 template HWY_INLINE V IntMod(V a, V b) { return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv(a, b), b, a); } #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \ HWY_TARGET == HWY_WASM_EMU256 template ), HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntMod(V a, V b) { const DFromV d; const Rebind>, decltype(d)> dw; 
return DemoteTo(d, IntMod(PromoteTo(dw, a), PromoteTo(dw, b))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntMod(V a, V b) { const DFromV d; const RepartitionToWide dw; return OrderedDemote2To( d, IntMod(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)), IntMod(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))); } #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET == // HWY_WASM_EMU256 } // namespace detail #if HWY_TARGET == HWY_SCALAR template HWY_API Vec1 operator/(Vec1 a, Vec1 b) { return detail::IntDiv(a, b); } template HWY_API Vec1 operator%(Vec1 a, Vec1 b) { return detail::IntMod(a, b); } #else // HWY_TARGET != HWY_SCALAR template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { return detail::IntDiv(a, b); } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { return detail::IntMod(a, b); } #if HWY_CAP_GE256 template HWY_API Vec256 operator/(Vec256 a, Vec256 b) { return detail::IntDiv(a, b); } template HWY_API Vec256 operator%(Vec256 a, Vec256 b) { return detail::IntMod(a, b); } #endif #if HWY_CAP_GE512 template HWY_API Vec512 operator/(Vec512 a, Vec512 b) { return detail::IntDiv(a, b); } template HWY_API Vec512 operator%(Vec512 a, Vec512 b) { return detail::IntMod(a, b); } #endif #endif // HWY_TARGET == HWY_SCALAR #endif // HWY_NATIVE_INT_DIV // ------------------------------ SatWidenMulPairwiseAdd #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #else #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #endif template >, HWY_IF_I16_D(DI16), HWY_IF_U8_D(DFromV), HWY_IF_I8_D(DFromV), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VI8)), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VU8_2))> HWY_API Vec SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) { const RebindToUnsigned du16; const auto a0 = BitCast(di16, PromoteEvenTo(du16, a)); const auto b0 = PromoteEvenTo(di16, b); const auto a1 = BitCast(di16, PromoteOddTo(du16, a)); const auto b1 = PromoteOddTo(di16, b); return SaturatedAdd(Mul(a0, b0), Mul(a1, b1)); } #endif // ------------------------------ SumOfMulQuadAccumulate #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, VFromD> a, VFromD> b, VFromD sum) { const Repartition di16; const auto a0 = PromoteEvenTo(di16, a); const auto b0 = PromoteEvenTo(di16, b); const auto a1 = PromoteOddTo(di16, a); const auto b1 = PromoteOddTo(di16, b); return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), WidenMulPairwiseAdd(di32, a1, b1))); } #endif #if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DU32 du32, VFromD> a, VFromD> b, VFromD sum) { const Repartition du16; const RebindToSigned di16; const RebindToSigned di32; const auto lo8_mask = Set(di16, int16_t{0x00FF}); const auto a0 = And(BitCast(di16, a), lo8_mask); const auto b0 = And(BitCast(di16, b), lo8_mask); const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a))); const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b))); return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)), 
BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1)))); } #endif #if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DI32 di32, VFromD> a_u, VFromD> b_i, VFromD sum) { const Repartition di16; const RebindToUnsigned du16; const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF})); const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i))); const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u))); const auto b1 = ShiftRight<8>(BitCast(di16, b_i)); // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in // SumOfMulQuadAccumulate as it is possible for // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0], // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same // sign. return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), WidenMulPairwiseAdd(di32, a1, b1))); } #endif #if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #endif #if HWY_HAVE_INTEGER64 template HWY_API VFromD SumOfMulQuadAccumulate( DI64 di64, VFromD> a, VFromD> b, VFromD sum) { const Repartition di32; // WidenMulPairwiseAdd(di32, a, b) is okay here as // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if // a[0], b[0], a[1], and b[1] are all equal to -32768. const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b); const auto i32_pairwise_sum_overflow = VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin()))); // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of // overflow. 
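// Worked example (illustrative): if a[0] = a[1] = b[0] = b[1] = -32768, each
// product is 2^30 and the pairwise sum 2^31 wraps to LimitsMin<int32_t>().
// When sign-extended to 64 bits below, its upper 32 bits would be all ones;
// zeroing them restores the intended value +2^31.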
const auto hi32_mask = Set(di64, static_cast(~int64_t{0xFFFFFFFF})); const auto p0_zero_out_mask = ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow)); const auto p1_zero_out_mask = And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask); const auto p0 = AndNot(p0_zero_out_mask, ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum)))); const auto p1 = AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum))); return Add(sum, Add(p0, p1)); } #endif // HWY_HAVE_INTEGER64 #endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #endif #if HWY_HAVE_INTEGER64 template HWY_API VFromD SumOfMulQuadAccumulate( DU64 du64, VFromD> a, VFromD> b, VFromD sum) { const auto u32_even_prod = MulEven(a, b); const auto u32_odd_prod = MulOdd(a, b); const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod), PromoteEvenTo(du64, u32_odd_prod)); const auto p1 = Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod)); return Add(sum, Add(p0, p1)); } #endif // HWY_HAVE_INTEGER64 #endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE // ------------------------------ F64 ApproximateReciprocal #if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif #if HWY_HAVE_FLOAT64 template )> HWY_API V ApproximateReciprocal(V v) { const DFromV d; return Div(Set(d, 1.0), v); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_F64_APPROX_RECIP // ------------------------------ F64 ApproximateReciprocalSqrt #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif #if HWY_HAVE_FLOAT64 template )> HWY_API V ApproximateReciprocalSqrt(V v) { const DFromV d; const RebindToUnsigned du; const auto half = Mul(v, Set(d, 0.5)); // Initial guess based on log2(f) const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}), ShiftRight<1>(BitCast(du, v)))); // One Newton-Raphson iteration return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5))); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_F64_APPROX_RSQRT // ------------------------------ Compress* #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMPRESS8 #undef HWY_NATIVE_COMPRESS8 #else #define HWY_NATIVE_COMPRESS8 #endif template HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, T* unaligned) { HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); const Simd d8; T* HWY_RESTRICT pos = unaligned; HWY_ALIGN constexpr T table[2048] = { 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 
2, 4, 3, 5, 6, 7, // 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, // 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // 1, 2, 
4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, // 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, // 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; for (size_t i = 0; i < Lanes(d); i += 8) { // Each byte worth of bits is the index of one of 256 8-byte ranges, and its // population count determines how far to advance the write position. 
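// Example (illustrative): bits8 = 0x05 selects lanes 0 and 2; the table row
// for index 5 is {0, 2, 1, 3, 4, 5, 6, 7}, so TableLookupBytes moves lanes 0
// and 2 to the front of the block, and pos advances by PopCount(0x05) = 2.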
const size_t bits8 = bits[i / 8]; const auto indices = Load(d8, table + bits8 * 8); const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); StoreU(compressed, d8, pos); pos += PopCount(bits8); } return static_cast(pos - unaligned); } template HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; (void)StoreMaskBits(d, mask, bits); return CompressBitsStore(v, bits, d, unaligned); } template HWY_API size_t CompressBlendedStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { HWY_ALIGN T buf[MaxLanes(d)]; const size_t bytes = CompressStore(v, mask, d, buf); BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); return bytes; } // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. template , HWY_IF_T_SIZE(T, 1)> HWY_API V Compress(V v, const M mask) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressStore(v, mask, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressBitsStore(v, bits, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressNot(V v, M mask) { return Compress(v, Not(mask)); } #endif // HWY_NATIVE_COMPRESS8 // ------------------------------ Expand // Note that this generic implementation assumes <= 128 bit fixed vectors; // the SVE and RVV targets provide their own native implementations. #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif namespace detail { #if HWY_IDE template HWY_INLINE uint64_t BitsFromMask(M /* mask */) { return 0; } #endif // HWY_IDE template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); const Simd du8; HWY_DASSERT(mask_bits < 0x100); alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand8x8Tables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 1, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 1, 128, 128, 128, 128, 128, // 128, 0, 1, 128, 128, 128, 128, 128, // 0, 1, 2, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 1, 128, 128, 128, 128, // 128, 0, 128, 1, 128, 128, 128, 128, // 0, 1, 128, 2, 128, 128, 128, 128, // 128, 128, 0, 1, 128, 128, 128, 128, // 0, 128, 1, 2, 128, 128, 128, 128, // 128, 0, 1, 2, 128, 128, 128, 128, // 0, 1, 2, 3, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 1, 128, 128, 128, // 128, 0, 128, 128, 1, 128, 128, 128, // 0, 1, 128, 128, 2, 128, 128, 128, // 128, 128, 0, 128, 1, 128, 128, 128, // 0, 128, 1, 128, 2, 128, 128, 128, // 128, 0, 1, 128, 2, 128, 128, 128, // 0, 1, 2, 128, 3, 128, 128, 128, // 128, 128, 128, 0, 1, 128, 128, 128, // 0, 128, 128, 1, 2, 128, 128, 128, // 128, 0, 128, 1, 2, 128, 128, 128, // 0, 1, 128, 2, 3, 128, 128, 128, // 128, 128, 0, 1, 2, 128, 128, 128, // 0, 128, 1, 2, 3, 128, 128, 128, // 128, 0, 1, 2, 3, 128, 128, 128, // 0, 1, 2, 3, 4, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 1, 128, 128, // 128, 0, 128, 128, 128, 1, 128, 128, // 0, 1, 128, 128, 128, 2, 128, 128, // 128, 128, 0, 128, 128, 1, 128, 128, // 0, 128, 1, 128, 128, 2, 128, 128, // 128, 0, 1, 128, 128, 2, 128, 128, // 0, 1, 2, 128, 
128, 3, 128, 128, // 128, 128, 128, 0, 128, 1, 128, 128, // 0, 128, 128, 1, 128, 2, 128, 128, // 128, 0, 128, 1, 128, 2, 128, 128, // 0, 1, 128, 2, 128, 3, 128, 128, // 128, 128, 0, 1, 128, 2, 128, 128, // 0, 128, 1, 2, 128, 3, 128, 128, // 128, 0, 1, 2, 128, 3, 128, 128, // 0, 1, 2, 3, 128, 4, 128, 128, // 128, 128, 128, 128, 0, 1, 128, 128, // 0, 128, 128, 128, 1, 2, 128, 128, // 128, 0, 128, 128, 1, 2, 128, 128, // 0, 1, 128, 128, 2, 3, 128, 128, // 128, 128, 0, 128, 1, 2, 128, 128, // 0, 128, 1, 128, 2, 3, 128, 128, // 128, 0, 1, 128, 2, 3, 128, 128, // 0, 1, 2, 128, 3, 4, 128, 128, // 128, 128, 128, 0, 1, 2, 128, 128, // 0, 128, 128, 1, 2, 3, 128, 128, // 128, 0, 128, 1, 2, 3, 128, 128, // 0, 1, 128, 2, 3, 4, 128, 128, // 128, 128, 0, 1, 2, 3, 128, 128, // 0, 128, 1, 2, 3, 4, 128, 128, // 128, 0, 1, 2, 3, 4, 128, 128, // 0, 1, 2, 3, 4, 5, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 1, 128, // 128, 0, 128, 128, 128, 128, 1, 128, // 0, 1, 128, 128, 128, 128, 2, 128, // 128, 128, 0, 128, 128, 128, 1, 128, // 0, 128, 1, 128, 128, 128, 2, 128, // 128, 0, 1, 128, 128, 128, 2, 128, // 0, 1, 2, 128, 128, 128, 3, 128, // 128, 128, 128, 0, 128, 128, 1, 128, // 0, 128, 128, 1, 128, 128, 2, 128, // 128, 0, 128, 1, 128, 128, 2, 128, // 0, 1, 128, 2, 128, 128, 3, 128, // 128, 128, 0, 1, 128, 128, 2, 128, // 0, 128, 1, 2, 128, 128, 3, 128, // 128, 0, 1, 2, 128, 128, 3, 128, // 0, 1, 2, 3, 128, 128, 4, 128, // 128, 128, 128, 128, 0, 128, 1, 128, // 0, 128, 128, 128, 1, 128, 2, 128, // 128, 0, 128, 128, 1, 128, 2, 128, // 0, 1, 128, 128, 2, 128, 3, 128, // 128, 128, 0, 128, 1, 128, 2, 128, // 0, 128, 1, 128, 2, 128, 3, 128, // 128, 0, 1, 128, 2, 128, 3, 128, // 0, 1, 2, 128, 3, 128, 4, 128, // 128, 128, 128, 0, 1, 128, 2, 128, // 0, 128, 128, 1, 2, 128, 3, 128, // 128, 0, 128, 1, 2, 128, 3, 128, // 0, 1, 128, 2, 3, 128, 4, 128, // 128, 128, 0, 1, 2, 128, 3, 128, // 0, 128, 1, 2, 3, 128, 4, 128, // 128, 0, 1, 2, 3, 128, 4, 128, // 0, 1, 2, 3, 4, 128, 5, 128, // 128, 128, 128, 128, 128, 0, 1, 128, // 0, 128, 128, 128, 128, 1, 2, 128, // 128, 0, 128, 128, 128, 1, 2, 128, // 0, 1, 128, 128, 128, 2, 3, 128, // 128, 128, 0, 128, 128, 1, 2, 128, // 0, 128, 1, 128, 128, 2, 3, 128, // 128, 0, 1, 128, 128, 2, 3, 128, // 0, 1, 2, 128, 128, 3, 4, 128, // 128, 128, 128, 0, 128, 1, 2, 128, // 0, 128, 128, 1, 128, 2, 3, 128, // 128, 0, 128, 1, 128, 2, 3, 128, // 0, 1, 128, 2, 128, 3, 4, 128, // 128, 128, 0, 1, 128, 2, 3, 128, // 0, 128, 1, 2, 128, 3, 4, 128, // 128, 0, 1, 2, 128, 3, 4, 128, // 0, 1, 2, 3, 128, 4, 5, 128, // 128, 128, 128, 128, 0, 1, 2, 128, // 0, 128, 128, 128, 1, 2, 3, 128, // 128, 0, 128, 128, 1, 2, 3, 128, // 0, 1, 128, 128, 2, 3, 4, 128, // 128, 128, 0, 128, 1, 2, 3, 128, // 0, 128, 1, 128, 2, 3, 4, 128, // 128, 0, 1, 128, 2, 3, 4, 128, // 0, 1, 2, 128, 3, 4, 5, 128, // 128, 128, 128, 0, 1, 2, 3, 128, // 0, 128, 128, 1, 2, 3, 4, 128, // 128, 0, 128, 1, 2, 3, 4, 128, // 0, 1, 128, 2, 3, 4, 5, 128, // 128, 128, 0, 1, 2, 3, 4, 128, // 0, 128, 1, 2, 3, 4, 5, 128, // 128, 0, 1, 2, 3, 4, 5, 128, // 0, 1, 2, 3, 4, 5, 6, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 1, // 128, 0, 128, 128, 128, 128, 128, 1, // 0, 1, 128, 128, 128, 128, 128, 2, // 128, 128, 0, 128, 128, 128, 128, 1, // 0, 128, 1, 128, 128, 128, 128, 2, // 128, 0, 1, 128, 128, 128, 128, 2, // 0, 1, 2, 128, 128, 128, 128, 3, // 128, 128, 128, 0, 128, 128, 128, 1, // 0, 128, 128, 1, 128, 128, 128, 2, // 128, 0, 128, 1, 128, 128, 128, 2, // 0, 1, 128, 2, 128, 128, 128, 
3, // 128, 128, 0, 1, 128, 128, 128, 2, // 0, 128, 1, 2, 128, 128, 128, 3, // 128, 0, 1, 2, 128, 128, 128, 3, // 0, 1, 2, 3, 128, 128, 128, 4, // 128, 128, 128, 128, 0, 128, 128, 1, // 0, 128, 128, 128, 1, 128, 128, 2, // 128, 0, 128, 128, 1, 128, 128, 2, // 0, 1, 128, 128, 2, 128, 128, 3, // 128, 128, 0, 128, 1, 128, 128, 2, // 0, 128, 1, 128, 2, 128, 128, 3, // 128, 0, 1, 128, 2, 128, 128, 3, // 0, 1, 2, 128, 3, 128, 128, 4, // 128, 128, 128, 0, 1, 128, 128, 2, // 0, 128, 128, 1, 2, 128, 128, 3, // 128, 0, 128, 1, 2, 128, 128, 3, // 0, 1, 128, 2, 3, 128, 128, 4, // 128, 128, 0, 1, 2, 128, 128, 3, // 0, 128, 1, 2, 3, 128, 128, 4, // 128, 0, 1, 2, 3, 128, 128, 4, // 0, 1, 2, 3, 4, 128, 128, 5, // 128, 128, 128, 128, 128, 0, 128, 1, // 0, 128, 128, 128, 128, 1, 128, 2, // 128, 0, 128, 128, 128, 1, 128, 2, // 0, 1, 128, 128, 128, 2, 128, 3, // 128, 128, 0, 128, 128, 1, 128, 2, // 0, 128, 1, 128, 128, 2, 128, 3, // 128, 0, 1, 128, 128, 2, 128, 3, // 0, 1, 2, 128, 128, 3, 128, 4, // 128, 128, 128, 0, 128, 1, 128, 2, // 0, 128, 128, 1, 128, 2, 128, 3, // 128, 0, 128, 1, 128, 2, 128, 3, // 0, 1, 128, 2, 128, 3, 128, 4, // 128, 128, 0, 1, 128, 2, 128, 3, // 0, 128, 1, 2, 128, 3, 128, 4, // 128, 0, 1, 2, 128, 3, 128, 4, // 0, 1, 2, 3, 128, 4, 128, 5, // 128, 128, 128, 128, 0, 1, 128, 2, // 0, 128, 128, 128, 1, 2, 128, 3, // 128, 0, 128, 128, 1, 2, 128, 3, // 0, 1, 128, 128, 2, 3, 128, 4, // 128, 128, 0, 128, 1, 2, 128, 3, // 0, 128, 1, 128, 2, 3, 128, 4, // 128, 0, 1, 128, 2, 3, 128, 4, // 0, 1, 2, 128, 3, 4, 128, 5, // 128, 128, 128, 0, 1, 2, 128, 3, // 0, 128, 128, 1, 2, 3, 128, 4, // 128, 0, 128, 1, 2, 3, 128, 4, // 0, 1, 128, 2, 3, 4, 128, 5, // 128, 128, 0, 1, 2, 3, 128, 4, // 0, 128, 1, 2, 3, 4, 128, 5, // 128, 0, 1, 2, 3, 4, 128, 5, // 0, 1, 2, 3, 4, 5, 128, 6, // 128, 128, 128, 128, 128, 128, 0, 1, // 0, 128, 128, 128, 128, 128, 1, 2, // 128, 0, 128, 128, 128, 128, 1, 2, // 0, 1, 128, 128, 128, 128, 2, 3, // 128, 128, 0, 128, 128, 128, 1, 2, // 0, 128, 1, 128, 128, 128, 2, 3, // 128, 0, 1, 128, 128, 128, 2, 3, // 0, 1, 2, 128, 128, 128, 3, 4, // 128, 128, 128, 0, 128, 128, 1, 2, // 0, 128, 128, 1, 128, 128, 2, 3, // 128, 0, 128, 1, 128, 128, 2, 3, // 0, 1, 128, 2, 128, 128, 3, 4, // 128, 128, 0, 1, 128, 128, 2, 3, // 0, 128, 1, 2, 128, 128, 3, 4, // 128, 0, 1, 2, 128, 128, 3, 4, // 0, 1, 2, 3, 128, 128, 4, 5, // 128, 128, 128, 128, 0, 128, 1, 2, // 0, 128, 128, 128, 1, 128, 2, 3, // 128, 0, 128, 128, 1, 128, 2, 3, // 0, 1, 128, 128, 2, 128, 3, 4, // 128, 128, 0, 128, 1, 128, 2, 3, // 0, 128, 1, 128, 2, 128, 3, 4, // 128, 0, 1, 128, 2, 128, 3, 4, // 0, 1, 2, 128, 3, 128, 4, 5, // 128, 128, 128, 0, 1, 128, 2, 3, // 0, 128, 128, 1, 2, 128, 3, 4, // 128, 0, 128, 1, 2, 128, 3, 4, // 0, 1, 128, 2, 3, 128, 4, 5, // 128, 128, 0, 1, 2, 128, 3, 4, // 0, 128, 1, 2, 3, 128, 4, 5, // 128, 0, 1, 2, 3, 128, 4, 5, // 0, 1, 2, 3, 4, 128, 5, 6, // 128, 128, 128, 128, 128, 0, 1, 2, // 0, 128, 128, 128, 128, 1, 2, 3, // 128, 0, 128, 128, 128, 1, 2, 3, // 0, 1, 128, 128, 128, 2, 3, 4, // 128, 128, 0, 128, 128, 1, 2, 3, // 0, 128, 1, 128, 128, 2, 3, 4, // 128, 0, 1, 128, 128, 2, 3, 4, // 0, 1, 2, 128, 128, 3, 4, 5, // 128, 128, 128, 0, 128, 1, 2, 3, // 0, 128, 128, 1, 128, 2, 3, 4, // 128, 0, 128, 1, 128, 2, 3, 4, // 0, 1, 128, 2, 128, 3, 4, 5, // 128, 128, 0, 1, 128, 2, 3, 4, // 0, 128, 1, 2, 128, 3, 4, 5, // 128, 0, 1, 2, 128, 3, 4, 5, // 0, 1, 2, 3, 128, 4, 5, 6, // 128, 128, 128, 128, 0, 1, 2, 3, // 0, 128, 128, 128, 1, 2, 3, 4, // 128, 0, 128, 128, 1, 2, 3, 4, // 0, 1, 128, 128, 2, 3, 4, 5, // 128, 128, 0, 
128, 1, 2, 3, 4, // 0, 128, 1, 128, 2, 3, 4, 5, // 128, 0, 1, 128, 2, 3, 4, 5, // 0, 1, 2, 128, 3, 4, 5, 6, // 128, 128, 128, 0, 1, 2, 3, 4, // 0, 128, 128, 1, 2, 3, 4, 5, // 128, 0, 128, 1, 2, 3, 4, 5, // 0, 1, 128, 2, 3, 4, 5, 6, // 128, 128, 0, 1, 2, 3, 4, 5, // 0, 128, 1, 2, 3, 4, 5, 6, // 128, 0, 1, 2, 3, 4, 5, 6, // 0, 1, 2, 3, 4, 5, 6, 7}; return LoadU(du8, table + mask_bits * 8); } } // namespace detail // Half vector of bytes: one table lookup template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const uint64_t mask_bits = detail::BitsFromMask(mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); } // Full vector of bytes: two table lookups template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Full128 d; const RebindToUnsigned du; const Half duh; const Vec128 vu = BitCast(du, v); const uint64_t mask_bits = detail::BitsFromMask(mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; // We want to skip past the v bytes already consumed by idxL. There is no // instruction for shift-reg by variable bytes. Storing v itself would work // but would involve a store-load forwarding stall. We instead shuffle using // loaded indices. multishift_epi64_epi8 would also help, but if we have that, // we probably also have native 8-bit Expand. alignas(16) static constexpr uint8_t iota[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; const VFromD shift = LoadU(du, iota + PopCount(maskL)); const VFromD vL = LowerHalf(duh, vu); const VFromD vH = LowerHalf(duh, TableLookupBytesOr0(vu, shift)); const VFromD idxL = detail::IndicesForExpandFromBits<8>(maskL); const VFromD idxH = detail::IndicesForExpandFromBits<8>(maskH); const VFromD expandL = TableLookupBytesOr0(vL, idxL); const VFromD expandH = TableLookupBytesOr0(vH, idxH); return BitCast(d, Combine(du, expandH, expandL)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const Rebind du8; const uint64_t mask_bits = detail::BitsFromMask(mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. 
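// Example (illustrative): mask_bits = 0b0101 selects output lanes 0 and 2;
// the corresponding row below is {0, 128, 2, 128, 128, 128, 128, 128}, i.e.
// the byte offsets of the packed source u16 lanes 0 and 1 for the active
// outputs, and 128 so that TableLookupBytesOr0 zeroes the inactive lanes.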
alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand16x8ByteTables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 2, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 2, 128, 128, 128, 128, 128, // 128, 0, 2, 128, 128, 128, 128, 128, // 0, 2, 4, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 2, 128, 128, 128, 128, // 128, 0, 128, 2, 128, 128, 128, 128, // 0, 2, 128, 4, 128, 128, 128, 128, // 128, 128, 0, 2, 128, 128, 128, 128, // 0, 128, 2, 4, 128, 128, 128, 128, // 128, 0, 2, 4, 128, 128, 128, 128, // 0, 2, 4, 6, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 2, 128, 128, 128, // 128, 0, 128, 128, 2, 128, 128, 128, // 0, 2, 128, 128, 4, 128, 128, 128, // 128, 128, 0, 128, 2, 128, 128, 128, // 0, 128, 2, 128, 4, 128, 128, 128, // 128, 0, 2, 128, 4, 128, 128, 128, // 0, 2, 4, 128, 6, 128, 128, 128, // 128, 128, 128, 0, 2, 128, 128, 128, // 0, 128, 128, 2, 4, 128, 128, 128, // 128, 0, 128, 2, 4, 128, 128, 128, // 0, 2, 128, 4, 6, 128, 128, 128, // 128, 128, 0, 2, 4, 128, 128, 128, // 0, 128, 2, 4, 6, 128, 128, 128, // 128, 0, 2, 4, 6, 128, 128, 128, // 0, 2, 4, 6, 8, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 2, 128, 128, // 128, 0, 128, 128, 128, 2, 128, 128, // 0, 2, 128, 128, 128, 4, 128, 128, // 128, 128, 0, 128, 128, 2, 128, 128, // 0, 128, 2, 128, 128, 4, 128, 128, // 128, 0, 2, 128, 128, 4, 128, 128, // 0, 2, 4, 128, 128, 6, 128, 128, // 128, 128, 128, 0, 128, 2, 128, 128, // 0, 128, 128, 2, 128, 4, 128, 128, // 128, 0, 128, 2, 128, 4, 128, 128, // 0, 2, 128, 4, 128, 6, 128, 128, // 128, 128, 0, 2, 128, 4, 128, 128, // 0, 128, 2, 4, 128, 6, 128, 128, // 128, 0, 2, 4, 128, 6, 128, 128, // 0, 2, 4, 6, 128, 8, 128, 128, // 128, 128, 128, 128, 0, 2, 128, 128, // 0, 128, 128, 128, 2, 4, 128, 128, // 128, 0, 128, 128, 2, 4, 128, 128, // 0, 2, 128, 128, 4, 6, 128, 128, // 128, 128, 0, 128, 2, 4, 128, 128, // 0, 128, 2, 128, 4, 6, 128, 128, // 128, 0, 2, 128, 4, 6, 128, 128, // 0, 2, 4, 128, 6, 8, 128, 128, // 128, 128, 128, 0, 2, 4, 128, 128, // 0, 128, 128, 2, 4, 6, 128, 128, // 128, 0, 128, 2, 4, 6, 128, 128, // 0, 2, 128, 4, 6, 8, 128, 128, // 128, 128, 0, 2, 4, 6, 128, 128, // 0, 128, 2, 4, 6, 8, 128, 128, // 128, 0, 2, 4, 6, 8, 128, 128, // 0, 2, 4, 6, 8, 10, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 2, 128, // 128, 0, 128, 128, 128, 128, 2, 128, // 0, 2, 128, 128, 128, 128, 4, 128, // 128, 128, 0, 128, 128, 128, 2, 128, // 0, 128, 2, 128, 128, 128, 4, 128, // 128, 0, 2, 128, 128, 128, 4, 128, // 0, 2, 4, 128, 128, 128, 6, 128, // 128, 128, 128, 0, 128, 128, 2, 128, // 0, 128, 128, 2, 128, 128, 4, 128, // 128, 0, 128, 2, 128, 128, 4, 128, // 0, 2, 128, 4, 128, 128, 6, 128, // 128, 128, 0, 2, 128, 128, 4, 128, // 0, 128, 2, 4, 128, 128, 6, 128, // 128, 0, 2, 4, 128, 128, 6, 128, // 0, 2, 4, 6, 128, 128, 8, 128, // 128, 128, 128, 128, 0, 128, 2, 128, // 0, 128, 128, 128, 2, 128, 4, 128, // 128, 0, 128, 128, 2, 128, 4, 128, // 0, 2, 128, 128, 4, 128, 6, 128, // 128, 128, 0, 128, 2, 128, 4, 128, // 0, 128, 2, 128, 4, 128, 6, 128, // 128, 0, 2, 128, 4, 128, 6, 128, // 0, 2, 4, 128, 6, 128, 8, 128, // 128, 128, 128, 0, 2, 128, 4, 128, // 0, 128, 128, 2, 4, 128, 6, 128, // 128, 0, 128, 2, 4, 128, 6, 128, // 0, 2, 128, 4, 6, 128, 8, 128, // 128, 128, 0, 2, 4, 128, 6, 128, // 0, 128, 2, 4, 6, 128, 8, 128, // 128, 0, 2, 4, 
6, 128, 8, 128, // 0, 2, 4, 6, 8, 128, 10, 128, // 128, 128, 128, 128, 128, 0, 2, 128, // 0, 128, 128, 128, 128, 2, 4, 128, // 128, 0, 128, 128, 128, 2, 4, 128, // 0, 2, 128, 128, 128, 4, 6, 128, // 128, 128, 0, 128, 128, 2, 4, 128, // 0, 128, 2, 128, 128, 4, 6, 128, // 128, 0, 2, 128, 128, 4, 6, 128, // 0, 2, 4, 128, 128, 6, 8, 128, // 128, 128, 128, 0, 128, 2, 4, 128, // 0, 128, 128, 2, 128, 4, 6, 128, // 128, 0, 128, 2, 128, 4, 6, 128, // 0, 2, 128, 4, 128, 6, 8, 128, // 128, 128, 0, 2, 128, 4, 6, 128, // 0, 128, 2, 4, 128, 6, 8, 128, // 128, 0, 2, 4, 128, 6, 8, 128, // 0, 2, 4, 6, 128, 8, 10, 128, // 128, 128, 128, 128, 0, 2, 4, 128, // 0, 128, 128, 128, 2, 4, 6, 128, // 128, 0, 128, 128, 2, 4, 6, 128, // 0, 2, 128, 128, 4, 6, 8, 128, // 128, 128, 0, 128, 2, 4, 6, 128, // 0, 128, 2, 128, 4, 6, 8, 128, // 128, 0, 2, 128, 4, 6, 8, 128, // 0, 2, 4, 128, 6, 8, 10, 128, // 128, 128, 128, 0, 2, 4, 6, 128, // 0, 128, 128, 2, 4, 6, 8, 128, // 128, 0, 128, 2, 4, 6, 8, 128, // 0, 2, 128, 4, 6, 8, 10, 128, // 128, 128, 0, 2, 4, 6, 8, 128, // 0, 128, 2, 4, 6, 8, 10, 128, // 128, 0, 2, 4, 6, 8, 10, 128, // 0, 2, 4, 6, 8, 10, 12, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 2, // 128, 0, 128, 128, 128, 128, 128, 2, // 0, 2, 128, 128, 128, 128, 128, 4, // 128, 128, 0, 128, 128, 128, 128, 2, // 0, 128, 2, 128, 128, 128, 128, 4, // 128, 0, 2, 128, 128, 128, 128, 4, // 0, 2, 4, 128, 128, 128, 128, 6, // 128, 128, 128, 0, 128, 128, 128, 2, // 0, 128, 128, 2, 128, 128, 128, 4, // 128, 0, 128, 2, 128, 128, 128, 4, // 0, 2, 128, 4, 128, 128, 128, 6, // 128, 128, 0, 2, 128, 128, 128, 4, // 0, 128, 2, 4, 128, 128, 128, 6, // 128, 0, 2, 4, 128, 128, 128, 6, // 0, 2, 4, 6, 128, 128, 128, 8, // 128, 128, 128, 128, 0, 128, 128, 2, // 0, 128, 128, 128, 2, 128, 128, 4, // 128, 0, 128, 128, 2, 128, 128, 4, // 0, 2, 128, 128, 4, 128, 128, 6, // 128, 128, 0, 128, 2, 128, 128, 4, // 0, 128, 2, 128, 4, 128, 128, 6, // 128, 0, 2, 128, 4, 128, 128, 6, // 0, 2, 4, 128, 6, 128, 128, 8, // 128, 128, 128, 0, 2, 128, 128, 4, // 0, 128, 128, 2, 4, 128, 128, 6, // 128, 0, 128, 2, 4, 128, 128, 6, // 0, 2, 128, 4, 6, 128, 128, 8, // 128, 128, 0, 2, 4, 128, 128, 6, // 0, 128, 2, 4, 6, 128, 128, 8, // 128, 0, 2, 4, 6, 128, 128, 8, // 0, 2, 4, 6, 8, 128, 128, 10, // 128, 128, 128, 128, 128, 0, 128, 2, // 0, 128, 128, 128, 128, 2, 128, 4, // 128, 0, 128, 128, 128, 2, 128, 4, // 0, 2, 128, 128, 128, 4, 128, 6, // 128, 128, 0, 128, 128, 2, 128, 4, // 0, 128, 2, 128, 128, 4, 128, 6, // 128, 0, 2, 128, 128, 4, 128, 6, // 0, 2, 4, 128, 128, 6, 128, 8, // 128, 128, 128, 0, 128, 2, 128, 4, // 0, 128, 128, 2, 128, 4, 128, 6, // 128, 0, 128, 2, 128, 4, 128, 6, // 0, 2, 128, 4, 128, 6, 128, 8, // 128, 128, 0, 2, 128, 4, 128, 6, // 0, 128, 2, 4, 128, 6, 128, 8, // 128, 0, 2, 4, 128, 6, 128, 8, // 0, 2, 4, 6, 128, 8, 128, 10, // 128, 128, 128, 128, 0, 2, 128, 4, // 0, 128, 128, 128, 2, 4, 128, 6, // 128, 0, 128, 128, 2, 4, 128, 6, // 0, 2, 128, 128, 4, 6, 128, 8, // 128, 128, 0, 128, 2, 4, 128, 6, // 0, 128, 2, 128, 4, 6, 128, 8, // 128, 0, 2, 128, 4, 6, 128, 8, // 0, 2, 4, 128, 6, 8, 128, 10, // 128, 128, 128, 0, 2, 4, 128, 6, // 0, 128, 128, 2, 4, 6, 128, 8, // 128, 0, 128, 2, 4, 6, 128, 8, // 0, 2, 128, 4, 6, 8, 128, 10, // 128, 128, 0, 2, 4, 6, 128, 8, // 0, 128, 2, 4, 6, 8, 128, 10, // 128, 0, 2, 4, 6, 8, 128, 10, // 0, 2, 4, 6, 8, 10, 128, 12, // 128, 128, 128, 128, 128, 128, 0, 2, // 0, 128, 128, 128, 128, 128, 2, 4, // 128, 0, 128, 128, 128, 128, 2, 4, // 0, 2, 128, 128, 128, 128, 4, 6, // 128, 128, 0, 
128, 128, 128, 2, 4, // 0, 128, 2, 128, 128, 128, 4, 6, // 128, 0, 2, 128, 128, 128, 4, 6, // 0, 2, 4, 128, 128, 128, 6, 8, // 128, 128, 128, 0, 128, 128, 2, 4, // 0, 128, 128, 2, 128, 128, 4, 6, // 128, 0, 128, 2, 128, 128, 4, 6, // 0, 2, 128, 4, 128, 128, 6, 8, // 128, 128, 0, 2, 128, 128, 4, 6, // 0, 128, 2, 4, 128, 128, 6, 8, // 128, 0, 2, 4, 128, 128, 6, 8, // 0, 2, 4, 6, 128, 128, 8, 10, // 128, 128, 128, 128, 0, 128, 2, 4, // 0, 128, 128, 128, 2, 128, 4, 6, // 128, 0, 128, 128, 2, 128, 4, 6, // 0, 2, 128, 128, 4, 128, 6, 8, // 128, 128, 0, 128, 2, 128, 4, 6, // 0, 128, 2, 128, 4, 128, 6, 8, // 128, 0, 2, 128, 4, 128, 6, 8, // 0, 2, 4, 128, 6, 128, 8, 10, // 128, 128, 128, 0, 2, 128, 4, 6, // 0, 128, 128, 2, 4, 128, 6, 8, // 128, 0, 128, 2, 4, 128, 6, 8, // 0, 2, 128, 4, 6, 128, 8, 10, // 128, 128, 0, 2, 4, 128, 6, 8, // 0, 128, 2, 4, 6, 128, 8, 10, // 128, 0, 2, 4, 6, 128, 8, 10, // 0, 2, 4, 6, 8, 128, 10, 12, // 128, 128, 128, 128, 128, 0, 2, 4, // 0, 128, 128, 128, 128, 2, 4, 6, // 128, 0, 128, 128, 128, 2, 4, 6, // 0, 2, 128, 128, 128, 4, 6, 8, // 128, 128, 0, 128, 128, 2, 4, 6, // 0, 128, 2, 128, 128, 4, 6, 8, // 128, 0, 2, 128, 128, 4, 6, 8, // 0, 2, 4, 128, 128, 6, 8, 10, // 128, 128, 128, 0, 128, 2, 4, 6, // 0, 128, 128, 2, 128, 4, 6, 8, // 128, 0, 128, 2, 128, 4, 6, 8, // 0, 2, 128, 4, 128, 6, 8, 10, // 128, 128, 0, 2, 128, 4, 6, 8, // 0, 128, 2, 4, 128, 6, 8, 10, // 128, 0, 2, 4, 128, 6, 8, 10, // 0, 2, 4, 6, 128, 8, 10, 12, // 128, 128, 128, 128, 0, 2, 4, 6, // 0, 128, 128, 128, 2, 4, 6, 8, // 128, 0, 128, 128, 2, 4, 6, 8, // 0, 2, 128, 128, 4, 6, 8, 10, // 128, 128, 0, 128, 2, 4, 6, 8, // 0, 128, 2, 128, 4, 6, 8, 10, // 128, 0, 2, 128, 4, 6, 8, 10, // 0, 2, 4, 128, 6, 8, 10, 12, // 128, 128, 128, 0, 2, 4, 6, 8, // 0, 128, 128, 2, 4, 6, 8, 10, // 128, 0, 128, 2, 4, 6, 8, 10, // 0, 2, 128, 4, 6, 8, 10, 12, // 128, 128, 0, 2, 4, 6, 8, 10, // 0, 128, 2, 4, 6, 8, 10, 12, // 128, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14}; // Extend to double length because InterleaveLower will only use the (valid) // lower half, and we want N u16. const Twice du8x2; const Vec128 indices8 = ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); const Vec128 indices16 = BitCast(du, InterleaveLower(du8x2, indices8, indices8)); // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte // indices, add 0 to even and 1 to odd byte lanes. const Vec128 byte_indices = Add( indices16, Set(du, static_cast(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001))); return BitCast(d, TableLookupBytesOr0(v, byte_indices)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; // For lane i, shift the i-th 4-bit index down to bits [0, 2). const Vec128 packed = Set(du, packed_array[mask_bits]); alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; Vec128 indices = packed >> Load(du, shifts); // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec // checks bounds, so clear the upper bits. 
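// Example (illustrative): for mask_bits = 0b0101, packed_array[5] is
// 0x0000f1f0, which decodes to per-lane nibbles {0, f, 1, f}. The And below
// brings the don't-care 0xf entries into range, and IfThenElseZero afterwards
// zeroes those lanes, leaving {v[0], 0, v[1], 0}.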
indices = And(indices, Set(du, N - 1)); const Vec128 expand = TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); // TableLookupLanes cannot also zero masked-off lanes, so do that now. return IfThenElseZero(mask, BitCast(d, expand)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { // Same as Compress, just zero out the mask=false lanes. return IfThenElseZero(mask, Compress(v, mask)); } // For single-element vectors, this is at least as fast as native. template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { return IfThenElseZero(mask, v); } // ------------------------------ LoadExpand template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { return Expand(LoadU(d, unaligned), mask); } #endif // HWY_NATIVE_EXPAND // ------------------------------ TwoTablesLookupLanes template using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned()))); // RVV/SVE have their own implementations of // TwoTablesLookupLanes(D d, VFromD a, VFromD b, IndicesFromD idx) #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \ HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \ HWY_TARGET != HWY_SVE2_128 template HWY_API VFromD TwoTablesLookupLanes(D /*d*/, VFromD a, VFromD b, IndicesFromD idx) { return TwoTablesLookupLanes(a, b, idx); } #endif // ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit) #if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_REVERSE2_8 #undef HWY_NATIVE_REVERSE2_8 #else #define HWY_NATIVE_REVERSE2_8 #endif #undef HWY_PREFER_ROTATE // Platforms on which RotateRight is likely faster than TableLookupBytes. // RVV and SVE anyway have their own implementation of this. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 #define HWY_PREFER_ROTATE 1 #else #define HWY_PREFER_ROTATE 0 #endif template HWY_API VFromD Reverse2(D d, VFromD v) { // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions. 
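// Example (illustrative): for u8 lanes {0, 1, 2, 3, ...}, Reverse2 returns
// {1, 0, 3, 2, ...}. Rotating each u16 lane right by 8 bits swaps its two
// bytes, which is exactly this permutation regardless of byte order.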
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3 const Repartition du16; return BitCast(d, RotateRight<8>(BitCast(du16, v))); #else const VFromD shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); return TableLookupBytes(v, shuffle); #endif } template HWY_API VFromD Reverse4(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du16; return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); #else const Repartition du8; const VFromD shuffle = Dup128VecFromValues( du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); return TableLookupBytes(v, BitCast(d, shuffle)); #endif } template HWY_API VFromD Reverse8(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du32; return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); #else const Repartition du8; const VFromD shuffle = Dup128VecFromValues( du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); return TableLookupBytes(v, BitCast(d, shuffle)); #endif } #endif // HWY_NATIVE_REVERSE2_8 // ------------------------------ ReverseLaneBytes #if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_LANE_BYTES #undef HWY_NATIVE_REVERSE_LANE_BYTES #else #define HWY_NATIVE_REVERSE_LANE_BYTES #endif template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse2(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse4(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse8(du8, BitCast(du8, v))); } #endif // HWY_NATIVE_REVERSE_LANE_BYTES // ------------------------------ ReverseBits // On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit // shifts because those would add extra masking already taken care of by // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to // implement ReverseBits, so this code is not used there. 
#undef HWY_REVERSE_BITS_MIN_BYTES #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) #define HWY_REVERSE_BITS_MIN_BYTES 2 #else #define HWY_REVERSE_BITS_MIN_BYTES 1 #endif #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI8 #undef HWY_NATIVE_REVERSE_BITS_UI8 #else #define HWY_NATIVE_REVERSE_BITS_UI8 #endif namespace detail { template , HWY_REVERSE_BITS_MIN_BYTES - 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { const DFromV d; const RebindToUnsigned du; #if HWY_REVERSE_BITS_MIN_BYTES == 2 const Repartition d_shift; #else const RebindToUnsigned d_shift; #endif const auto v_to_shift = BitCast(d_shift, v); const auto shl_result = BitCast(d, ShiftLeft(v_to_shift)); const auto shr_result = BitCast(d, ShiftRight(v_to_shift)); const auto shr_result_mask = BitCast(d, Set(du, static_cast(kShrResultMask))); return Or(And(shr_result, shr_result_mask), AndNot(shr_result_mask, shl_result)); } #if HWY_REVERSE_BITS_MIN_BYTES == 2 template , 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { return V{UI8ReverseBitsStep(Vec128{v.raw}) .raw}; } #endif } // namespace detail template HWY_API V ReverseBits(V v) { auto result = detail::UI8ReverseBitsStep<1, 0x55>(v); result = detail::UI8ReverseBitsStep<2, 0x33>(result); result = detail::UI8ReverseBitsStep<4, 0x0F>(result); return result; } #endif // HWY_NATIVE_REVERSE_BITS_UI8 #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #else #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 #endif template HWY_API V ReverseBits(V v) { const DFromV d; const Repartition du8; return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))); } #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 // ------------------------------ Per4LaneBlockShuffle #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE Vec Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { #if HWY_TARGET == HWY_RVV constexpr int kPow2 = d.Pow2(); constexpr int kLoadPow2 = HWY_MAX(kPow2, -1); const ScalableTag d_load; #else constexpr size_t kMaxBytes = d.MaxBytes(); #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES constexpr size_t kMinLanesToLoad = 2; #else constexpr size_t kMinLanesToLoad = 4; #endif constexpr size_t kNumToLoad = HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad); const CappedTag d_load; #endif return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3)); } } // namespace detail #endif #endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) { return DupEven(v); } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) { const DFromV d; return Reverse2(d, v); } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) { return v; } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) { return DupOdd(v); } HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { #if HWY_IS_LITTLE_ENDIAN return static_cast((idx3 << 24) | (idx2 
<< 16) | (idx1 << 8) | idx0); #else return static_cast(idx3 | (idx2 << 8) | (idx1 << 16) | (idx0 << 24)); #endif } template HWY_INLINE Vec TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { #if HWY_TARGET == HWY_RVV const AdjustSimdTagToMinVecPow2> du32; #else const Repartition du32; #endif return ResizeBitCast( d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0))); } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \ HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr #else #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8) template HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) { const DFromV d; const Repartition du8; return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx))); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0); const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx3210 + 0x0C0C0C0C), static_cast(idx3210 + 0x08080808), static_cast(idx3210 + 0x04040404), static_cast(idx3210)); return ResizeBitCast(d, v_byte_idx); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; #if HWY_IS_LITTLE_ENDIAN const uint32_t idx10 = static_cast((idx1 << 16) | idx0); const uint32_t idx32 = static_cast((idx3 << 16) | idx2); constexpr uint32_t kLaneByteOffsets{0x01000100}; #else const uint32_t idx10 = static_cast(idx1 | (idx0 << 16)); const uint32_t idx32 = static_cast(idx3 | (idx2 << 16)); constexpr uint32_t kLaneByteOffsets{0x00010001}; #endif constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u}; const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx32 * 0x0202u + kHiLaneByteOffsets), static_cast(idx10 * 0x0202u + kHiLaneByteOffsets), static_cast(idx32 * 0x0202u + kLaneByteOffsets), static_cast(idx10 * 0x0202u + kLaneByteOffsets)); return ResizeBitCast(d, v_byte_idx); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; #if HWY_IS_LITTLE_ENDIAN constexpr uint32_t kLaneByteOffsets{0x03020100}; #else constexpr uint32_t kLaneByteOffsets{0x00010203}; #endif const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx3 * 0x04040404u + kLaneByteOffsets), static_cast(idx2 * 0x04040404u + kLaneByteOffsets), static_cast(idx1 * 0x04040404u + kLaneByteOffsets), static_cast(idx0 * 0x04040404u + kLaneByteOffsets)); return ResizeBitCast(d, v_byte_idx); } #endif template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0); } #if HWY_TARGET == HWY_RVV template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Rebind du8; return PromoteTo(d, TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0)); } #else template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const uint16_t u16_idx0 = static_cast(idx0); const 
uint16_t u16_idx1 = static_cast(idx1); const uint16_t u16_idx2 = static_cast(idx2); const uint16_t u16_idx3 = static_cast(idx3); #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES constexpr size_t kMinLanesToLoad = 4; #else constexpr size_t kMinLanesToLoad = 8; #endif constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad); const CappedTag d_load; return ResizeBitCast( d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3, u16_idx0, u16_idx1, u16_idx2, u16_idx3)); } template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0); } template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const RebindToUnsigned du; const Rebind du32; return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2, idx1, idx0))); } #endif template HWY_INLINE IndicesFromD TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const RebindToUnsigned du; using TU = TFromD; auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0); constexpr size_t kN = HWY_MAX_LANES_D(D); if (kN < 4) { idx_in_blk = And(idx_in_blk, Set(du, static_cast(kN - 1))); } #if HWY_TARGET == HWY_RVV const auto blk_offsets = AndS(Iota0(du), static_cast(~TU{3})); #else const auto blk_offsets = And(Iota(du, TU{0}), Set(du, static_cast(~TU{3}))); #endif return IndicesFromVec(d, Add(idx_in_blk, blk_offsets)); } template )> HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD> idx) { return TableLookupLanes(v, idx); } #undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE template HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) { const DFromV d; const uint32_t idx3 = static_cast((idx3210 >> 6) & 3); const uint32_t idx2 = static_cast((idx3210 >> 4) & 3); const uint32_t idx1 = static_cast((idx3210 >> 2) & 3); const uint32_t idx0 = static_cast(idx3210 & 3); const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0); return Per4LaneBlkShufDoTblLookup(v, idx); } // The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag // and vect_size_tag parameters are only called for vectors that have at // least 4 lanes (or scalable vectors that might possibly have 4 or more lanes) template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { return TblLookupPer4LaneBlkShuf(v, kIdx3210); } #if HWY_HAVE_FLOAT64 template HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) { const DFromV d; const RepartitionToWide dw; return BitCast(dw, v); } #endif template HWY_INLINE VFromD>>> Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */, hwy::SizeTag /* lane_size_tag */, V v) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; return BitCast(dw, v); } template HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( hwy::NonFloatTag /* type_tag */, hwy::SizeTag /* lane_size_tag */, V v) { const DFromV d; const RepartitionToWide dw; return BitCast(dw, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) { const DFromV d; return Reverse4(d, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw 
= Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); return BitCast(d, DupEven(vw)); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw = Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); const DFromV dw; return BitCast(d, Reverse2(dw, vw)); } #if HWY_MAX_BYTES >= 32 template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { return SwapAdjacentBlocks(v); } #endif template , 4), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveLower(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveLower(d, v, v); } template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) { const DFromV d; return ConcatEven(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) { return DupEven(v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) { const DFromV d; return Reverse2(d, v); } template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) { const DFromV d; return ConcatOdd(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) { return v; } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw = Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); return BitCast(d, DupOdd(vw)); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) { return DupOdd(v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveUpper(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, V v) { const DFromV d; return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag)>(), hwy::SizeTag(), v); } } // namespace detail #endif // HWY_TARGET != HWY_SCALAR template , 1)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); return v; } #if HWY_TARGET != HWY_SCALAR template , 2)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1); constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0); constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 
0 : 1); constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0; static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true"); return detail::Per2LaneBlockShuffle(hwy::SizeTag(), v); } template , 2)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); constexpr size_t kIdx3210 = (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0; return detail::Per4LaneBlockShuffle(hwy::SizeTag(), v); } #endif // ------------------------------ Blocks template HWY_API size_t Blocks(D d) { return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD) + 15) / 16); } // ------------------------------ Block insert/extract/broadcast ops #if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT #undef HWY_NATIVE_BLK_INSERT_EXTRACT #else #define HWY_NATIVE_BLK_INSERT_EXTRACT #endif template HWY_API V InsertBlock(V /*v*/, V blk_to_insert) { static_assert(kBlockIdx == 0, "Invalid block index"); return blk_to_insert; } template HWY_API V ExtractBlock(V v) { static_assert(kBlockIdx == 0, "Invalid block index"); return v; } template HWY_API V BroadcastBlock(V v) { static_assert(kBlockIdx == 0, "Invalid block index"); return v; } #endif // HWY_NATIVE_BLK_INSERT_EXTRACT // ------------------------------ BroadcastLane #if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BROADCASTLANE #undef HWY_NATIVE_BROADCASTLANE #else #define HWY_NATIVE_BROADCASTLANE #endif template HWY_API V BroadcastLane(V v) { return Broadcast(v); } #endif // HWY_NATIVE_BROADCASTLANE // ------------------------------ Slide1Up and Slide1Down #if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SLIDE1_UP_DOWN #undef HWY_NATIVE_SLIDE1_UP_DOWN #else #define HWY_NATIVE_SLIDE1_UP_DOWN #endif template HWY_API VFromD Slide1Up(D d, VFromD /*v*/) { return Zero(d); } template HWY_API VFromD Slide1Down(D d, VFromD /*v*/) { return Zero(d); } #if HWY_TARGET != HWY_SCALAR template HWY_API VFromD Slide1Up(D d, VFromD v) { return ShiftLeftLanes<1>(d, v); } template HWY_API VFromD Slide1Down(D d, VFromD v) { return ShiftRightLanes<1>(d, v); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_SLIDE1_UP_DOWN // ------------------------------ SlideUpBlocks template HWY_API VFromD SlideUpBlocks(D /*d*/, VFromD v) { static_assert(kBlocks == 0, "kBlocks == 0 must be true"); return v; } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 template HWY_API VFromD SlideUpBlocks(D d, VFromD v) { static_assert(0 <= kBlocks && static_cast(kBlocks) < d.MaxBlocks(), "kBlocks must be between 0 and d.MaxBlocks() - 1"); constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); return SlideUpLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); } #endif // ------------------------------ SlideDownBlocks template HWY_API VFromD SlideDownBlocks(D /*d*/, VFromD v) { static_assert(kBlocks == 0, "kBlocks == 0 must be true"); return v; } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 template HWY_API VFromD SlideDownBlocks(D d, VFromD v) { static_assert(0 <= kBlocks && static_cast(kBlocks) < d.MaxBlocks(), "kBlocks must be between 0 and d.MaxBlocks() - 1"); constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); return SlideDownLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); } #endif // ------------------------------ SumsOfAdjQuadAbsDiff #if 
(defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.
  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below will resize a to a vector that has
  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10] }
   */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes results are needed for the subsequent AbsDiff
  // operations, and as a01 and a23 need to be the same vector type as b01 and
  // b23 for the AbsDiff operations below.
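  // Added note: once the a01/a23 and b01/b23 vectors below are formed,
  // 16-bit result lane i of each block equals
  //   sum_{j=0..3} |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|,
  // matching x86 MPSADBW semantics: a01/b01 contribute the j=0,1 terms
  // (via AbsDiff + SumsOf2) and a23/b23 the j=2,3 terms.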
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
  // both vA and vB can be bitcasted to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<RepartitionToWideX2<DFromV<V8>>> d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;
  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<V8> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const auto vA = a;
  const auto vB = b;
#endif

  const RebindToUnsigned<decltype(d8)> du8;

  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
  // lanes that are shifted into an adjacent 16-byte block, as any lanes that
  // are shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
  // replaced by the OddEven operation.
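  // Added note: viewing a_shuf as u32 lanes built from bytes {s0, s1, s2, ...},
  // OddEven keeps the even u32 lanes of a_shuf and takes the odd u32 lanes
  // from a copy shifted up by one u16 (two bytes), so a_0123_2345 holds the
  // byte quads starting at offsets 0 and 2 of each 8-byte half; a_1234_3456
  // likewise combines copies shifted by one byte to get offsets 1 and 3.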
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(ShiftLeftBytes<1>(d32, a_shuf),
                          ShiftRightBytes<1>(d32, a_shuf)));
#endif

  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF

// ================================================== Operator wrapper

// SVE* and RVV currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
template <class V>
HWY_API V Mod(V a, V b) {
  return a % b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

#endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
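// Example (added, illustrative only): because Add/Mul/etc. are defined above
// for targets whose vector types support operators, and natively elsewhere,
// generic code stays portable to SVE/RVV by calling the named functions
// instead of the operators:
//
//   template <class D>
//   VFromD<D> MulAddConst(D d, VFromD<D> x) {
//     const auto k2 = Set(d, static_cast<TFromD<D>>(2));
//     const auto k1 = Set(d, static_cast<TFromD<D>>(1));
//     return Add(Mul(x, k2), k1);  // rather than x * 2 + 1
//   }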