// Copyright 2021 Google LLC
// Copyright 2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Target-independent types/functions defined after target-specific ops.

// The "include guards" in this file that check HWY_TARGET_TOGGLE serve to skip
// the generic implementation here if native ops are already defined.

#include "hwy/base.h"

// Define detail::Shuffle1230 etc, but only when viewing the current header;
// normally this is included via highway.h, which includes ops/*.h.
#if HWY_IDE && !defined(HWY_HIGHWAY_INCLUDED)
#include "hwy/detect_targets.h"
#include "hwy/ops/emu128-inl.h"
#endif  // HWY_IDE

// Relies on the external include guard in highway.h.
HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
template <class V>
using LaneType = decltype(GetLane(V()));

// Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the
// return type of functions that do not take a vector argument, or as an
// argument type if the function only has a template argument for D, or for
// explicit type names instead of auto. This may be a built-in type.
template <class D>
using Vec = decltype(Zero(D()));

// Mask type. Useful as the return type of functions that do not take a mask
// argument, or as an argument type if the function only has a template
// argument for D, or for explicit type names instead of auto.
template <class D>
using Mask = decltype(MaskFromVec(Zero(D())));

// Returns the closest value to v within [lo, hi].
template <class V>
HWY_API V Clamp(const V v, const V lo, const V hi) {
  return Min(Max(lo, v), hi);
}

// CombineShiftRightBytes (and -Lanes) are not available for the scalar target,
// and RVV has its own implementation of -Lanes.
#if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV

template <size_t kLanes, class D>
HWY_API VFromD<D> CombineShiftRightLanes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kBytes = kLanes * sizeof(TFromD<D>);
  static_assert(kBytes < 16, "Shift count is per-block");
  return CombineShiftRightBytes<kBytes>(d, hi, lo);
}

#endif

// Returns lanes with the most significant bit set and all other bits zero.
template <class D>
HWY_API Vec<D> SignBit(D d) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Set(du, SignMask<TFromD<D>>()));
}

// Returns quiet NaN.
template <class D>
HWY_API Vec<D> NaN(D d) {
  const RebindToSigned<decltype(d)> di;
  // LimitsMax sets all exponent and mantissa bits to 1. The exponent plus
  // mantissa MSB (to indicate quiet) would be sufficient.
  return BitCast(d, Set(di, LimitsMax<TFromD<decltype(di)>>()));
}

// Returns positive infinity.
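// Illustrative note (added; not part of the API contract): for f32 lanes,
// MaxExponentTimes2 corresponds to the bit pattern 0xFF000000, so the
// Set/BitCast below yields 0x7F800000 in every lane, i.e. +infinity. A
// hypothetical caller could write:
//   const ScalableTag<float> d;
//   const Vec<decltype(d)> pos_inf = Inf(d);  // every lane is +inf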
template HWY_API Vec Inf(D d) { const RebindToUnsigned du; using T = TFromD; using TU = TFromD; const TU max_x2 = static_cast(MaxExponentTimes2()); return BitCast(d, Set(du, max_x2 >> 1)); } // ------------------------------ ZeroExtendResizeBitCast // The implementation of detail::ZeroExtendResizeBitCast for the HWY_EMU128 // target is in emu128-inl.h, and the implementation of // detail::ZeroExtendResizeBitCast for the HWY_SCALAR target is in scalar-inl.h #if HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR namespace detail { #if HWY_HAVE_SCALABLE template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Repartition d_to_u8; const auto resized = ResizeBitCast(d_to_u8, v); // Zero the upper bytes which were not present/valid in d_from. const size_t num_bytes = Lanes(Repartition()); return BitCast(d_to, IfThenElseZero(FirstN(d_to_u8, num_bytes), resized)); } #else // target that uses fixed-size vectors // Truncating or same-size resizing cast: same as ResizeBitCast template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { return ResizeBitCast(d_to, v); } // Resizing cast to vector that has twice the number of lanes of the source // vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom d_from, VFromD v) { const Twice dt_from; return BitCast(d_to, ZeroExtendVector(dt_from, v)); } // Resizing cast to vector that has more than twice the number of lanes of the // source vector template HWY_INLINE VFromD ZeroExtendResizeBitCast( hwy::SizeTag /* from_size_tag */, hwy::SizeTag /* to_size_tag */, DTo d_to, DFrom /*d_from*/, VFromD v) { using TFrom = TFromD; constexpr size_t kNumOfFromLanes = kFromVectSize / sizeof(TFrom); const Repartition d_resize_to; return BitCast(d_to, IfThenElseZero(FirstN(d_resize_to, kNumOfFromLanes), ResizeBitCast(d_resize_to, v))); } #endif // HWY_HAVE_SCALABLE } // namespace detail #endif // HWY_TARGET != HWY_EMU128 && HWY_TARGET != HWY_SCALAR template HWY_API VFromD ZeroExtendResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { return detail::ZeroExtendResizeBitCast(hwy::SizeTag(), hwy::SizeTag(), d_to, d_from, v); } // ------------------------------ SafeFillN template > HWY_API void SafeFillN(const size_t num, const T value, D d, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = value; } #else BlendedStore(Set(d, value), FirstN(d, num), d, to); #endif } // ------------------------------ SafeCopyN template > HWY_API void SafeCopyN(const size_t num, D d, const T* HWY_RESTRICT from, T* HWY_RESTRICT to) { #if HWY_MEM_OPS_MIGHT_FAULT (void)d; for (size_t i = 0; i < num; ++i) { to[i] = from[i]; } #else const Mask mask = FirstN(d, num); BlendedStore(MaskedLoad(mask, d, from), mask, d, to); #endif } // ------------------------------ MaskFalse #if (defined(HWY_NATIVE_MASK_FALSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASK_FALSE #undef HWY_NATIVE_MASK_FALSE #else #define HWY_NATIVE_MASK_FALSE #endif template HWY_API Mask MaskFalse(D d) { return MaskFromVec(Zero(d)); } #endif // HWY_NATIVE_MASK_FALSE // ------------------------------ BitwiseIfThenElse #if (defined(HWY_NATIVE_BITWISE_IF_THEN_ELSE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define 
HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return Or(And(mask, yes), AndNot(mask, no)); } #endif // HWY_NATIVE_BITWISE_IF_THEN_ELSE // ------------------------------ PromoteMaskTo #if (defined(HWY_NATIVE_PROMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_MASK_TO #undef HWY_NATIVE_PROMOTE_MASK_TO #else #define HWY_NATIVE_PROMOTE_MASK_TO #endif template HWY_API Mask PromoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert( sizeof(TFromD) > sizeof(TFromD), "sizeof(TFromD) must be greater than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec(BitCast( d_to, PromoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_PROMOTE_MASK_TO // ------------------------------ DemoteMaskTo #if (defined(HWY_NATIVE_DEMOTE_MASK_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_MASK_TO #undef HWY_NATIVE_DEMOTE_MASK_TO #else #define HWY_NATIVE_DEMOTE_MASK_TO #endif template HWY_API Mask DemoteMaskTo(DTo d_to, DFrom d_from, Mask m) { static_assert(sizeof(TFromD) < sizeof(TFromD), "sizeof(TFromD) must be less than sizeof(TFromD)"); static_assert( IsSame, Mask, DTo>>>(), "Mask must be the same type as Mask, DTo>>"); const RebindToSigned di_to; const RebindToSigned di_from; return MaskFromVec( BitCast(d_to, DemoteTo(di_to, BitCast(di_from, VecFromMask(d_from, m))))); } #endif // HWY_NATIVE_DEMOTE_MASK_TO // ------------------------------ CombineMasks #if (defined(HWY_NATIVE_COMBINE_MASKS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMBINE_MASKS #undef HWY_NATIVE_COMBINE_MASKS #else #define HWY_NATIVE_COMBINE_MASKS #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask CombineMasks(D d, Mask> hi, Mask> lo) { const Half dh; return MaskFromVec(Combine(d, VecFromMask(dh, hi), VecFromMask(dh, lo))); } #endif #endif // HWY_NATIVE_COMBINE_MASKS // ------------------------------ LowerHalfOfMask #if (defined(HWY_NATIVE_LOWER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOWER_HALF_OF_MASK #undef HWY_NATIVE_LOWER_HALF_OF_MASK #else #define HWY_NATIVE_LOWER_HALF_OF_MASK #endif template HWY_API Mask LowerHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(LowerHalf(d, VecFromMask(dt, m))); } #endif // HWY_NATIVE_LOWER_HALF_OF_MASK // ------------------------------ UpperHalfOfMask #if (defined(HWY_NATIVE_UPPER_HALF_OF_MASK) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_UPPER_HALF_OF_MASK #undef HWY_NATIVE_UPPER_HALF_OF_MASK #else #define HWY_NATIVE_UPPER_HALF_OF_MASK #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask UpperHalfOfMask(D d, Mask> m) { const Twice dt; return MaskFromVec(UpperHalf(d, VecFromMask(dt, m))); } #endif #endif // HWY_NATIVE_UPPER_HALF_OF_MASK // ------------------------------ OrderedDemote2MasksTo #if (defined(HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #else #define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO #endif #if HWY_TARGET != HWY_SCALAR template HWY_API Mask OrderedDemote2MasksTo(DTo d_to, DFrom d_from, Mask a, Mask b) { static_assert( sizeof(TFromD) == sizeof(TFromD) / 2, "sizeof(TFromD) must be equal to sizeof(TFromD) / 2"); static_assert(IsSame, Mask, DFrom>>>(), "Mask must be the same type as " "Mask, DFrom>>>()"); const RebindToSigned di_from; const RebindToSigned di_to; const auto va = 
BitCast(di_from, VecFromMask(d_from, a)); const auto vb = BitCast(di_from, VecFromMask(d_from, b)); return MaskFromVec(BitCast(d_to, OrderedDemote2To(di_to, va, vb))); } #endif #endif // HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO // ------------------------------ InterleaveWholeLower/InterleaveWholeUpper #if (defined(HWY_NATIVE_INTERLEAVE_WHOLE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTERLEAVE_WHOLE #undef HWY_NATIVE_INTERLEAVE_WHOLE #else #define HWY_NATIVE_INTERLEAVE_WHOLE #endif #if HWY_TARGET != HWY_SCALAR template HWY_API VFromD InterleaveWholeLower(D d, VFromD a, VFromD b) { // InterleaveWholeLower(d, a, b) is equivalent to InterleaveLower(a, b) if // D().MaxBytes() <= 16 is true return InterleaveLower(d, a, b); } template HWY_API VFromD InterleaveWholeUpper(D d, VFromD a, VFromD b) { // InterleaveWholeUpper(d, a, b) is equivalent to InterleaveUpper(a, b) if // D().MaxBytes() <= 16 is true return InterleaveUpper(d, a, b); } // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on AVX2/AVX3 // is implemented in x86_256-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 64-byte vectors on AVX3 is // implemented in x86_512-inl.h. // InterleaveWholeLower/InterleaveWholeUpper for 32-byte vectors on WASM_EMU256 // is implemented in wasm_256-inl.h. #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_INTERLEAVE_WHOLE #if HWY_TARGET != HWY_SCALAR // The InterleaveWholeLower without the optional D parameter is generic for all // vector lengths. template HWY_API V InterleaveWholeLower(V a, V b) { return InterleaveWholeLower(DFromV(), a, b); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ AddSub template , 1)> HWY_API V AddSub(V a, V b) { // AddSub(a, b) for a one-lane vector is equivalent to Sub(a, b) return Sub(a, b); } // AddSub for F32x2, F32x4, and F64x2 vectors is implemented in x86_128-inl.h on // SSSE3/SSE4/AVX2/AVX3 // AddSub for F32x8 and F64x4 vectors is implemented in x86_256-inl.h on // AVX2/AVX3 template >()) ? 32 : sizeof(TFromV)))> HWY_API V AddSub(V a, V b) { using D = DFromV; using T = TFromD; using TNegate = If(), MakeSigned, T>; const D d; const Rebind d_negate; // Negate the even lanes of b const auto negated_even_b = OddEven(b, BitCast(d, Neg(BitCast(d_negate, b)))); return Add(a, negated_even_b); } // ------------------------------ MaskedAddOr etc. 
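// Added note: the masked ops below all follow the same pattern: compute the
// full-width result, then keep it only where the mask is true and otherwise
// return `no`. Illustrative sketch of a hypothetical caller (the names below
// are examples, not part of this header):
//   const ScalableTag<int32_t> d;
//   const auto a = Iota(d, 0);               // 0, 1, 2, ...
//   const auto b = Set(d, 10);
//   const auto m = Lt(a, Set(d, 4));         // true for lanes 0..3
//   const auto r = MaskedAddOr(a, m, a, b);  // lanes 0..3: a+b, others: a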
#if (defined(HWY_NATIVE_MASKED_ARITH) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_MASKED_ARITH #undef HWY_NATIVE_MASKED_ARITH #else #define HWY_NATIVE_MASKED_ARITH #endif template HWY_API V MaskedMinOr(V no, M m, V a, V b) { return IfThenElse(m, Min(a, b), no); } template HWY_API V MaskedMaxOr(V no, M m, V a, V b) { return IfThenElse(m, Max(a, b), no); } template HWY_API V MaskedAddOr(V no, M m, V a, V b) { return IfThenElse(m, Add(a, b), no); } template HWY_API V MaskedSubOr(V no, M m, V a, V b) { return IfThenElse(m, Sub(a, b), no); } template HWY_API V MaskedMulOr(V no, M m, V a, V b) { return IfThenElse(m, Mul(a, b), no); } template HWY_API V MaskedDivOr(V no, M m, V a, V b) { return IfThenElse(m, Div(a, b), no); } template HWY_API V MaskedModOr(V no, M m, V a, V b) { return IfThenElse(m, Mod(a, b), no); } template HWY_API V MaskedSatAddOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedAdd(a, b), no); } template HWY_API V MaskedSatSubOr(V no, M m, V a, V b) { return IfThenElse(m, SaturatedSub(a, b), no); } #endif // HWY_NATIVE_MASKED_ARITH // ------------------------------ IfNegativeThenNegOrUndefIfZero #if (defined(HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #undef HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #else #define HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG #endif template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || HWY_TARGET == HWY_SVE2_128 // MaskedSubOr is more efficient than IfNegativeThenElse on RVV/SVE const auto zero = Zero(DFromV()); return MaskedSubOr(v, Lt(mask, zero), zero, v); #else return IfNegativeThenElse(mask, Neg(v), v); #endif } #endif // HWY_NATIVE_INTEGER_IF_NEGATIVE_THEN_NEG template HWY_API V IfNegativeThenNegOrUndefIfZero(V mask, V v) { return CopySign(v, Xor(mask, v)); } // ------------------------------ SaturatedNeg #if (defined(HWY_NATIVE_SATURATED_NEG_8_16_32) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32 #undef HWY_NATIVE_SATURATED_NEG_8_16_32 #else #define HWY_NATIVE_SATURATED_NEG_8_16_32 #endif template HWY_API V SaturatedNeg(V v) { const DFromV d; return SaturatedSub(Zero(d), v); } template )> HWY_API V SaturatedNeg(V v) { const DFromV d; #if HWY_TARGET == HWY_RVV || \ (HWY_TARGET >= HWY_PPC10 && HWY_TARGET <= HWY_PPC8) || \ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES) // RVV/NEON/SVE/PPC have native I32 SaturatedSub instructions return SaturatedSub(Zero(d), v); #else // ~v[i] - ((v[i] > LimitsMin()) ? -1 : 0) is equivalent to // (v[i] > LimitsMin) ? (-v[i]) : LimitsMax() since // -v[i] == ~v[i] + 1 == ~v[i] - (-1) and // ~LimitsMin() == LimitsMax(). 
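// Worked example (added) for int8_t: v = -128 equals LimitsMin, so the mask is
// zero and the result is ~(-128) - 0 = 127 = LimitsMax. For v = 5 the mask is
// all-ones (-1), giving ~5 - (-1) = -6 + 1 = -5, the ordinary negation.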
return Sub(Not(v), VecFromMask(d, Gt(v, Set(d, LimitsMin())))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_8_16_32 #if (defined(HWY_NATIVE_SATURATED_NEG_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_NEG_64 #undef HWY_NATIVE_SATURATED_NEG_64 #else #define HWY_NATIVE_SATURATED_NEG_64 #endif template )> HWY_API V SaturatedNeg(V v) { #if HWY_TARGET == HWY_RVV || \ (HWY_TARGET >= HWY_SVE2_128 && HWY_TARGET <= HWY_NEON_WITHOUT_AES) // RVV/NEON/SVE have native I64 SaturatedSub instructions const DFromV d; return SaturatedSub(Zero(d), v); #else const auto neg_v = Neg(v); return Add(neg_v, BroadcastSignBit(And(v, neg_v))); #endif } #endif // HWY_NATIVE_SATURATED_NEG_64 // ------------------------------ SaturatedAbs #if (defined(HWY_NATIVE_SATURATED_ABS) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif template HWY_API V SaturatedAbs(V v) { return Max(v, SaturatedNeg(v)); } #endif // ------------------------------ Reductions // Targets follow one of two strategies. If HWY_NATIVE_REDUCE_SCALAR is toggled, // they (RVV/SVE/Armv8/Emu128) implement ReduceSum and SumOfLanes via Set. // Otherwise, they (Armv7/PPC/scalar/WASM/x86) define zero to most of the // SumOfLanes overloads. For the latter group, we here define the remaining // overloads, plus ReduceSum which uses them plus GetLane. #if (defined(HWY_NATIVE_REDUCE_SCALAR) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SCALAR #undef HWY_NATIVE_REDUCE_SCALAR #else #define HWY_NATIVE_REDUCE_SCALAR #endif namespace detail { // Allows reusing the same shuffle code for SumOfLanes/MinOfLanes/MaxOfLanes. struct AddFunc { template V operator()(V a, V b) const { return Add(a, b); } }; struct MinFunc { template V operator()(V a, V b) const { return Min(a, b); } }; struct MaxFunc { template V operator()(V a, V b) const { return Max(a, b); } }; // No-op for vectors of at most one block. template HWY_INLINE VFromD ReduceAcrossBlocks(D, Func, VFromD v) { return v; } // Reduces a lane with its counterpart in other block(s). Shared by AVX2 and // WASM_EMU256. AVX3 has its own overload. template HWY_INLINE VFromD ReduceAcrossBlocks(D /*d*/, Func f, VFromD v) { return f(v, SwapAdjacentBlocks(v)); } // These return the reduction result broadcasted across all lanes. They assume // the caller has already reduced across blocks. template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v10) { return f(v10, Reverse2(d, v10)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v3210) { const VFromD v0123 = Reverse4(d, v3210); const VFromD v03_12_12_03 = f(v3210, v0123); const VFromD v12_03_03_12 = Reverse2(d, v03_12_12_03); return f(v03_12_12_03, v12_03_03_12); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v76543210) { // The upper half is reversed from the lower half; omit for brevity. const VFromD v34_25_16_07 = f(v76543210, Reverse8(d, v76543210)); const VFromD v0347_1625_1625_0347 = f(v34_25_16_07, Reverse4(d, v34_25_16_07)); return f(v0347_1625_1625_0347, Reverse2(d, v0347_1625_1625_0347)); } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. 
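// Added note: 8-bit lanes are reduced via 16-bit lanes. `even` zero-extends
// the low byte of each u16 and `odd` the high byte; reducing f(even, odd) in
// the 16-bit domain matches the 8-bit reduction (sums wrap modulo 256 after
// the BitCast back), and DupEven/DupOdd then broadcast the low byte of the
// result so every byte of the returned vector holds the reduction.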
const VW even = And(vw, Set(dw, 0xFF)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } template HWY_INLINE VFromD ReduceWithinBlocks(D d, Func f, VFromD v) { const RepartitionToWide dw; using VW = VFromD; const VW vw = BitCast(dw, v); // Sign-extend // f is commutative, so no need to adapt for HWY_IS_LITTLE_ENDIAN. const VW even = ShiftRight<8>(ShiftLeft<8>(vw)); const VW odd = ShiftRight<8>(vw); const VW reduced = ReduceWithinBlocks(dw, f, f(even, odd)); #if HWY_IS_LITTLE_ENDIAN return DupEven(BitCast(d, reduced)); #else return DupOdd(BitCast(d, reduced)); #endif } } // namespace detail template HWY_API VFromD SumOfLanes(D d, VFromD v) { const detail::AddFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MinOfLanes(D d, VFromD v) { const detail::MinFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API VFromD MaxOfLanes(D d, VFromD v) { const detail::MaxFunc f; v = detail::ReduceAcrossBlocks(d, f, v); return detail::ReduceWithinBlocks(d, f, v); } template HWY_API TFromD ReduceSum(D d, VFromD v) { return GetLane(SumOfLanes(d, v)); } template HWY_API TFromD ReduceMin(D d, VFromD v) { return GetLane(MinOfLanes(d, v)); } template HWY_API TFromD ReduceMax(D d, VFromD v) { return GetLane(MaxOfLanes(d, v)); } #endif // HWY_NATIVE_REDUCE_SCALAR // Corner cases for both generic and native implementations: // N=1 (native covers N=2 e.g. for u64x2 and even u32x2 on Arm) template HWY_API TFromD ReduceSum(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMin(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API TFromD ReduceMax(D /*d*/, VFromD v) { return GetLane(v); } template HWY_API VFromD SumOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MinOfLanes(D /* tag */, VFromD v) { return v; } template HWY_API VFromD MaxOfLanes(D /* tag */, VFromD v) { return v; } // N=4 for 8-bit is still less than the minimum native size. // ARMv7 NEON/PPC/RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceSum operations #if (defined(HWY_NATIVE_REDUCE_SUM_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 #undef HWY_NATIVE_REDUCE_SUM_4_UI8 #else #define HWY_NATIVE_REDUCE_SUM_4_UI8 #endif template HWY_API TFromD ReduceSum(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceSum(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_SUM_4_UI8 // RVV/SVE have target-specific implementations of the N=4 I8/U8 // ReduceMin/ReduceMax operations #if (defined(HWY_NATIVE_REDUCE_MINMAX_4_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #undef HWY_NATIVE_REDUCE_MINMAX_4_UI8 #else #define HWY_NATIVE_REDUCE_MINMAX_4_UI8 #endif template HWY_API TFromD ReduceMin(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMin(dw, PromoteTo(dw, v))); } template HWY_API TFromD ReduceMax(D d, VFromD v) { const Twice> dw; return static_cast>(ReduceMax(dw, PromoteTo(dw, v))); } #endif // HWY_NATIVE_REDUCE_MINMAX_4_UI8 // ------------------------------ IsInf, IsFinite // AVX3 has target-specific implementations of these. 
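// Sketch of the generic approach (added): infinity has all exponent bits set
// and a zero mantissa. Adding vu to itself discards the sign bit, so a lane is
// +/-inf exactly when the doubled bits equal MaxExponentTimes2 (0xFF000000 for
// f32). IsFinite instead isolates the exponent field and requires it to be
// below MaxExponentField.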
#if (defined(HWY_NATIVE_ISINF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ISINF #undef HWY_NATIVE_ISINF #else #define HWY_NATIVE_ISINF #endif template > HWY_API MFromD IsInf(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask( d, Eq(Add(vu, vu), Set(du, static_cast>(hwy::MaxExponentTimes2())))); } // Returns whether normal/subnormal/zero. template > HWY_API MFromD IsFinite(const V v) { using T = TFromD; const D d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit. MSVC seems to generate incorrect code // for AVX2 if we instead add vu + vu. #if HWY_COMPILER_MSVC const VFromD shl = ShiftLeft<1>(vu); #else const VFromD shl = Add(vu, vu); #endif // Then shift right so we can compare with the max exponent (cannot compare // with MaxExponentTimes2 directly because it is negative and non-negative // floats would be greater). const VFromD exp = BitCast(di, ShiftRight() + 1>(shl)); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } #endif // HWY_NATIVE_ISINF // ------------------------------ LoadInterleaved2 #if HWY_IDE || \ (defined(HWY_NATIVE_LOAD_STORE_INTERLEAVED) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED #else #define HWY_NATIVE_LOAD_STORE_INTERLEAVED #endif template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { const VFromD A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0] const VFromD B = LoadU(d, unaligned + Lanes(d)); v0 = ConcatEven(d, B, A); v1 = ConcatOdd(d, B, A); } template HWY_API void LoadInterleaved2(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); } // ------------------------------ LoadInterleaved3 (CombineShiftRightBytes) namespace detail { #if HWY_IDE template HWY_INLINE V ShuffleTwo1230(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo2301(V a, V /* b */) { return a; } template HWY_INLINE V ShuffleTwo3012(V a, V /* b */) { return a; } #endif // HWY_IDE // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void LoadTransposedBlocks3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& A, VFromD& B, VFromD& C) { constexpr size_t kN = MaxLanes(d); A = LoadU(d, unaligned + 0 * kN); B = LoadU(d, unaligned + 1 * kN); C = LoadU(d, unaligned + 2 * kN); } } // namespace detail template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; using VU = VFromD; // Compact notation so these fit on one line: 12 := v1[2]. V A; // 05 24 14 04 23 13 03 22 12 02 21 11 01 20 10 00 V B; // 1a 0a 29 19 09 28 18 08 27 17 07 26 16 06 25 15 V C; // 2f 1f 0f 2e 1e 0e 2d 1d 0d 2c 1c 0c 2b 1b 0b 2a detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
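// Added note: each output vector gathers its bytes from all three loaded
// blocks A/B/C. TableLookupBytesOr0 zeroes every byte whose index has the MSB
// (Z = 0x80) set, so the three partial results are disjoint and Xor3 merges
// them (equivalent to OR because no byte position is nonzero in more than one
// operand).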
constexpr uint8_t Z = 0x80; const VU idx_v0A = Dup128VecFromValues(du, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v0B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z); const VU idx_v0C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13); const VU idx_v1A = Dup128VecFromValues(du, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v1B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15, Z, Z, Z, Z, Z); const VU idx_v1C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 2, 5, 8, 11, 14); const VU idx_v2A = Dup128VecFromValues(du, 2, 5, 8, 11, 14, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU idx_v2B = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 10, 13, Z, Z, Z, Z, Z, Z); const VU idx_v2C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0, 3, 6, 9, 12, 15); const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A)); const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B)); const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C)); const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A)); const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B)); const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C)); const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A)); const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B)); const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 8-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; using V = VFromD; using VU = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. 
constexpr uint8_t Z = 0x80; const VU idx_v0A = Dup128VecFromValues(du, 0, 3, 6, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v0B = Dup128VecFromValues(du, Z, Z, Z, 1, 4, 7, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v0C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, Z, 2, 5, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1A = Dup128VecFromValues(du, 1, 4, 7, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1B = Dup128VecFromValues(du, Z, Z, Z, 2, 5, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v1C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 0, 3, 6, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2A = Dup128VecFromValues(du, 2, 5, Z, Z, Z, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2B = Dup128VecFromValues(du, Z, Z, 0, 3, 6, Z, Z, Z, 0, 0, 0, 0, 0, 0, 0, 0); const VU idx_v2C = Dup128VecFromValues(du, Z, Z, Z, Z, Z, 1, 4, 7, 0, 0, 0, 0, 0, 0, 0, 0); const V v0L = BitCast(d, TableLookupBytesOr0(A, idx_v0A)); const V v0M = BitCast(d, TableLookupBytesOr0(B, idx_v0B)); const V v0U = BitCast(d, TableLookupBytesOr0(C, idx_v0C)); const V v1L = BitCast(d, TableLookupBytesOr0(A, idx_v1A)); const V v1M = BitCast(d, TableLookupBytesOr0(B, idx_v1B)); const V v1U = BitCast(d, TableLookupBytesOr0(C, idx_v1C)); const V v2L = BitCast(d, TableLookupBytesOr0(A, idx_v2A)); const V v2M = BitCast(d, TableLookupBytesOr0(B, idx_v2B)); const V v2U = BitCast(d, TableLookupBytesOr0(C, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } // 16-bit lanes x8 template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { const RebindToUnsigned du; const Repartition du8; using V = VFromD; using VU8 = VFromD; V A; // v1[2] v0[2] v2[1] v1[1] v0[1] v2[0] v1[0] v0[0] V B; // v0[5] v2[4] v1[4] v0[4] v2[3] v1[3] v0[3] v2[2] V C; // v2[7] v1[7] v0[7] v2[6] v1[6] v0[6] v2[5] v1[5] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); // Compress all lanes belonging to v0 into consecutive lanes. Same as above, // but each element of the array contains a byte index for a byte of a lane. 
constexpr uint8_t Z = 0x80; const VU8 idx_v0A = Dup128VecFromValues(du8, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v0B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z); const VU8 idx_v0C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B); const VU8 idx_v1A = Dup128VecFromValues(du8, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v1B = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z); const VU8 idx_v1C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D); const VU8 idx_v2A = Dup128VecFromValues(du8, 0x04, 0x05, 0x0A, 0x0B, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z); const VU8 idx_v2B = Dup128VecFromValues(du8, Z, Z, Z, Z, 0x00, 0x01, 0x06, 0x07, 0x0C, 0x0D, Z, Z, Z, Z, Z, Z); const VU8 idx_v2C = Dup128VecFromValues(du8, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 0x02, 0x03, 0x08, 0x09, 0x0E, 0x0F); const V v0L = TableLookupBytesOr0(A, BitCast(d, idx_v0A)); const V v0M = TableLookupBytesOr0(B, BitCast(d, idx_v0B)); const V v0U = TableLookupBytesOr0(C, BitCast(d, idx_v0C)); const V v1L = TableLookupBytesOr0(A, BitCast(d, idx_v1A)); const V v1M = TableLookupBytesOr0(B, BitCast(d, idx_v1B)); const V v1U = TableLookupBytesOr0(C, BitCast(d, idx_v1C)); const V v2L = TableLookupBytesOr0(A, BitCast(d, idx_v2A)); const V v2M = TableLookupBytesOr0(B, BitCast(d, idx_v2B)); const V v2U = TableLookupBytesOr0(C, BitCast(d, idx_v2C)); v0 = Xor3(v0L, v0M, v0U); v1 = Xor3(v1L, v1M, v1U); v2 = Xor3(v2L, v2M, v2U); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { using V = VFromD; V A; // v0[1] v2[0] v1[0] v0[0] V B; // v1[2] v0[2] v2[1] v1[1] V C; // v2[3] v1[3] v0[3] v2[2] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); const V vxx_02_03_xx = OddEven(C, B); v0 = detail::ShuffleTwo1230(A, vxx_02_03_xx); // Shuffle2301 takes the upper/lower halves of the output from one input, so // we cannot just combine 13 and 10 with 12 and 11 (similar to v0/v2). Use // OddEven because it may have higher throughput than Shuffle. const V vxx_xx_10_11 = OddEven(A, B); const V v12_13_xx_xx = OddEven(B, C); v1 = detail::ShuffleTwo2301(vxx_xx_10_11, v12_13_xx_xx); const V vxx_20_21_xx = OddEven(B, A); v2 = detail::ShuffleTwo3012(vxx_20_21_xx, C); } template HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { VFromD A; // v1[0] v0[0] VFromD B; // v0[1] v2[0] VFromD C; // v2[1] v1[1] detail::LoadTransposedBlocks3(d, unaligned, A, B, C); v0 = OddEven(B, A); v1 = CombineShiftRightBytes)>(d, C, A); v2 = OddEven(C, B); } template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); } // ------------------------------ LoadInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. 
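// Added note: LoadTransposedBlocks4 only loads four consecutive full vectors;
// the de-interleaving itself is done by the LoadInterleaved4 overloads below
// via a sequence of InterleaveLower/InterleaveUpper steps, i.e. an in-register
// 4 x N transpose.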
template HWY_INLINE void LoadTransposedBlocks4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& vA, VFromD& vB, VFromD& vC, VFromD& vD) { constexpr size_t kN = MaxLanes(d); vA = LoadU(d, unaligned + 0 * kN); vB = LoadU(d, unaligned + 1 * kN); vC = LoadU(d, unaligned + 2 * kN); vD = LoadU(d, unaligned + 3 * kN); } } // namespace detail template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { const Repartition d64; using V64 = VFromD; using V = VFromD; // 16 lanes per block; the lowest four blocks are at the bottom of vA..vD. // Here int[i] means the four interleaved values of the i-th 4-tuple and // int[3..0] indicates four consecutive 4-tuples (0 = least-significant). V vA; // int[13..10] int[3..0] V vB; // int[17..14] int[7..4] V vC; // int[1b..18] int[b..8] V vD; // int[1f..1c] int[f..c] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); // For brevity, the comments only list the lower block (upper = lower + 0x10) const V v5140 = InterleaveLower(d, vA, vB); // int[5,1,4,0] const V vd9c8 = InterleaveLower(d, vC, vD); // int[d,9,c,8] const V v7362 = InterleaveUpper(d, vA, vB); // int[7,3,6,2] const V vfbea = InterleaveUpper(d, vC, vD); // int[f,b,e,a] const V v6420 = InterleaveLower(d, v5140, v7362); // int[6,4,2,0] const V veca8 = InterleaveLower(d, vd9c8, vfbea); // int[e,c,a,8] const V v7531 = InterleaveUpper(d, v5140, v7362); // int[7,5,3,1] const V vfdb9 = InterleaveUpper(d, vd9c8, vfbea); // int[f,d,b,9] const V64 v10L = BitCast(d64, InterleaveLower(d, v6420, v7531)); // v10[7..0] const V64 v10U = BitCast(d64, InterleaveLower(d, veca8, vfdb9)); // v10[f..8] const V64 v32L = BitCast(d64, InterleaveUpper(d, v6420, v7531)); // v32[7..0] const V64 v32U = BitCast(d64, InterleaveUpper(d, veca8, vfdb9)); // v32[f..8] v0 = BitCast(d, InterleaveLower(d64, v10L, v10U)); v1 = BitCast(d, InterleaveUpper(d64, v10L, v10U)); v2 = BitCast(d, InterleaveLower(d64, v32L, v32U)); v3 = BitCast(d, InterleaveUpper(d64, v32L, v32U)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { // In the last step, we interleave by half of the block size, which is usually // 8 bytes but half that for 8-bit x8 vectors. using TW = hwy::UnsignedFromSize; const Repartition dw; using VW = VFromD; // (Comments are for 256-bit vectors.) // 8 lanes per block; the lowest four blocks are at the bottom of vA..vD. 
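// Notation (added): v3210[i] denotes the four interleaved values of the i-th
// 4-tuple, analogous to int[i] in the overload above.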
VFromD vA; // v3210[9]v3210[8] v3210[1]v3210[0] VFromD vB; // v3210[b]v3210[a] v3210[3]v3210[2] VFromD vC; // v3210[d]v3210[c] v3210[5]v3210[4] VFromD vD; // v3210[f]v3210[e] v3210[7]v3210[6] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const VFromD va820 = InterleaveLower(d, vA, vB); // v3210[a,8] v3210[2,0] const VFromD vec64 = InterleaveLower(d, vC, vD); // v3210[e,c] v3210[6,4] const VFromD vb931 = InterleaveUpper(d, vA, vB); // v3210[b,9] v3210[3,1] const VFromD vfd75 = InterleaveUpper(d, vC, vD); // v3210[f,d] v3210[7,5] const VW v10_b830 = // v10[b..8] v10[3..0] BitCast(dw, InterleaveLower(d, va820, vb931)); const VW v10_fc74 = // v10[f..c] v10[7..4] BitCast(dw, InterleaveLower(d, vec64, vfd75)); const VW v32_b830 = // v32[b..8] v32[3..0] BitCast(dw, InterleaveUpper(d, va820, vb931)); const VW v32_fc74 = // v32[f..c] v32[7..4] BitCast(dw, InterleaveUpper(d, vec64, vfd75)); v0 = BitCast(d, InterleaveLower(dw, v10_b830, v10_fc74)); v1 = BitCast(d, InterleaveUpper(dw, v10_b830, v10_fc74)); v2 = BitCast(d, InterleaveLower(dw, v32_b830, v32_fc74)); v3 = BitCast(d, InterleaveUpper(dw, v32_b830, v32_fc74)); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { using V = VFromD; V vA; // v3210[4] v3210[0] V vB; // v3210[5] v3210[1] V vC; // v3210[6] v3210[2] V vD; // v3210[7] v3210[3] detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); const V v10e = InterleaveLower(d, vA, vC); // v1[6,4] v0[6,4] v1[2,0] v0[2,0] const V v10o = InterleaveLower(d, vB, vD); // v1[7,5] v0[7,5] v1[3,1] v0[3,1] const V v32e = InterleaveUpper(d, vA, vC); // v3[6,4] v2[6,4] v3[2,0] v2[2,0] const V v32o = InterleaveUpper(d, vB, vD); // v3[7,5] v2[7,5] v3[3,1] v2[3,1] v0 = InterleaveLower(d, v10e, v10o); v1 = InterleaveUpper(d, v10e, v10o); v2 = InterleaveLower(d, v32e, v32o); v3 = InterleaveUpper(d, v32e, v32o); } template HWY_API void LoadInterleaved4(D d, const TFromD* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { VFromD vA, vB, vC, vD; detail::LoadTransposedBlocks4(d, unaligned, vA, vB, vC, vD); v0 = InterleaveLower(d, vA, vC); v1 = InterleaveUpper(d, vA, vC); v2 = InterleaveLower(d, vB, vD); v3 = InterleaveUpper(d, vB, vD); } // Any T x1 template , HWY_IF_LANES_D(D, 1)> HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { v0 = LoadU(d, unaligned + 0); v1 = LoadU(d, unaligned + 1); v2 = LoadU(d, unaligned + 2); v3 = LoadU(d, unaligned + 3); } // ------------------------------ StoreInterleaved2 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks2(VFromD A, VFromD B, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); } } // namespace detail // >= 128 bit vector template HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, TFromD* HWY_RESTRICT unaligned) { const auto v10L = InterleaveLower(d, v0, v1); // .. v1[0] v0[0] const auto v10U = InterleaveUpper(d, v0, v1); // .. 
v1[kN/2] v0[kN/2] detail::StoreTransposedBlocks2(v10L, v10U, d, unaligned); } // <= 64 bits template HWY_API void StoreInterleaved2(V part0, V part1, D d, TFromD* HWY_RESTRICT unaligned) { const Twice d2; const auto v0 = ZeroExtendVector(d2, part0); const auto v1 = ZeroExtendVector(d2, part1); const auto v10 = InterleaveLower(d2, v0, v1); StoreU(v10, d2, unaligned); } // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, // TableLookupBytes) namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks3(VFromD A, VFromD B, VFromD C, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(A, d, unaligned + 0 * kN); StoreU(B, d, unaligned + 1 * kN); StoreU(C, d, unaligned + 2 * kN); } } // namespace detail // >= 128-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; using TU = TFromD; const auto k5 = Set(du, TU{5}); const auto k6 = Set(du, TU{6}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v0[5], v2[4],v1[4],v0[4] .. v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. const VFromD shuf_A0 = Dup128VecFromValues(du, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80, 5); // Cannot reuse shuf_A0 because it contains 5. const VFromD shuf_A1 = Dup128VecFromValues(du, 0x80, 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80); // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. // cannot reuse shuf_A0 (has 5) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const VFromD A = BitCast(d, A0 | A1 | A2); // B: v1[10],v0[10], v2[9],v1[9],v0[9] .. , v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // .A..9..8..7..6.. const auto shuf_B1 = shuf_A0 + k5; // A..9..8..7..6..5 const auto shuf_B2 = shuf_A1 + k5; // ..9..8..7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[15],v1[15],v0[15], v2[11],v1[11],v0[11], v2[10] const auto shuf_C0 = shuf_B2 + k6; // ..F..E..D..C..B. const auto shuf_C1 = shuf_B0 + k5; // .F..E..D..C..B.. const auto shuf_C2 = shuf_B1 + k5; // F..E..D..C..B..A const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const Repartition du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 0x80 so lanes to be // filled from other vectors are 0 for blending. 
Note that these are byte // indices for 16-bit lanes. const VFromD shuf_A1 = Dup128VecFromValues(du8, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5); const VFromD shuf_A2 = Dup128VecFromValues(du8, 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80); // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d, A0 | A1 | A2); // B: v0[5] v2[4],v1[4],v0[4], v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // 5..4..3. const auto shuf_B1 = shuf_A2 + k3; // ..4..3.. const auto shuf_B2 = shuf_A0 + k2; // .4..3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d, B0 | B1 | B2); // C: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_C0 = shuf_B1 + k3; // ..7..6.. const auto shuf_C1 = shuf_B2 + k3; // .7..6..5 const auto shuf_C2 = shuf_B0 + k2; // 7..6..5. const auto C0 = TableLookupBytesOr0(v0, shuf_C0); const auto C1 = TableLookupBytesOr0(v1, shuf_C1); const auto C2 = TableLookupBytesOr0(v2, shuf_C2); const VFromD C = BitCast(d, C0 | C1 | C2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); // A: v0[1], v2[0],v1[0],v0[0] (<- lane 0) const VFromD A = BitCast( d, InterleaveLower(dw, BitCast(dw, v10_v00), BitCast(dw, v01_v20))); const VFromD v1_321 = ShiftRightLanes<1>(d, v1); const VFromD v0_32 = ShiftRightLanes<2>(d, v0); const VFromD v21_v11 = OddEven(v2, v1_321); const VFromD v12_v02 = OddEven(v1_321, v0_32); // B: v1[2],v0[2], v2[1],v1[1] const VFromD B = BitCast( d, InterleaveLower(dw, BitCast(dw, v21_v11), BitCast(dw, v12_v02))); // Notation refers to the upper 2 lanes of the vector for InterleaveUpper. const VFromD v23_v13 = OddEven(v2, v1_321); const VFromD v03_v22 = OddEven(v0, v2); // C: v2[3],v1[3],v0[3], v2[2] const VFromD C = BitCast( d, InterleaveUpper(dw, BitCast(dw, v03_v22), BitCast(dw, v23_v13))); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { const VFromD A = InterleaveLower(d, v0, v1); const VFromD B = OddEven(v0, v2); const VFromD C = InterleaveUpper(d, v1, v2); detail::StoreTransposedBlocks3(A, B, C, d, unaligned); } // 64-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and first result. constexpr size_t kFullN = 16 / sizeof(TFromD); const Full128 du; const Full128> d_full; const auto k5 = Set(du, uint8_t{5}); const auto k6 = Set(du, uint8_t{6}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2) to (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. 
0x80 so lanes to be // filled from other vectors are 0 for blending. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; // The interleaved vectors will be named A, B, C; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = Load(du, tbl_v1); // cannot reuse shuf_A0 (5 in MSB) const auto shuf_A2 = CombineShiftRightBytes<15>(du, shuf_A1, shuf_A1); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // 5..4..3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // ..4..3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // .4..3..2..1..0.. const auto A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned + 0 * kFullN); // Second (HALF) vector: v2[7],v1[7],v0[7], v2[6],v1[6],v0[6], v2[5],v1[5] const auto shuf_B0 = shuf_A2 + k6; // ..7..6.. const auto shuf_B1 = shuf_A0 + k5; // .7..6..5 const auto shuf_B2 = shuf_A1 + k5; // 7..6..5. const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B{BitCast(d_full, B0 | B1 | B2).raw}; StoreU(B, d, unaligned + 1 * kFullN); } // 64-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D dh, TFromD* HWY_RESTRICT unaligned) { const Twice d_full; const Full128 du8; const auto k2 = Set(du8, uint8_t{2 * sizeof(TFromD)}); const auto k3 = Set(du8, uint8_t{3 * sizeof(TFromD)}); const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave part (v0,v1,v2) to full (MSB on left, lane 0 on right): // v1[2],v0[2], v2[1],v1[1],v0[1], v2[0],v1[0],v0[0]. We're expanding v0 lanes // to their place, with 0x80 so lanes to be filled from other vectors are 0 // to enable blending by ORing together. alignas(16) static constexpr uint8_t tbl_v1[16] = { 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80, 4, 5}; alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vectors will be named A, B; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A1 = Load(du8, tbl_v1); // 2..1..0. // .2..1..0 const auto shuf_A0 = CombineShiftRightBytes<2>(du8, shuf_A1, shuf_A1); const auto shuf_A2 = Load(du8, tbl_v2); // ..1..0.. const auto A0 = TableLookupBytesOr0(v0, shuf_A0); const auto A1 = TableLookupBytesOr0(v1, shuf_A1); const auto A2 = TableLookupBytesOr0(v2, shuf_A2); const VFromD A = BitCast(d_full, A0 | A1 | A2); StoreU(A, d_full, unaligned); // Second (HALF) vector: v2[3],v1[3],v0[3], v2[2] const auto shuf_B0 = shuf_A1 + k3; // ..3. const auto shuf_B1 = shuf_A2 + k3; // .3.. 
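// Added note: adding k2/k3 (two or three lane-widths, in bytes) to a shuffle
// table advances every byte index to the next group of lanes, so the B tables
// are derived from the A tables without extra loads; 0x80 entries stay >= 0x80
// and therefore still select zero bytes.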
const auto shuf_B2 = shuf_A0 + k2; // 3..2 const auto B0 = TableLookupBytesOr0(v0, shuf_B0); const auto B1 = TableLookupBytesOr0(v1, shuf_B1); const auto B2 = TableLookupBytesOr0(v2, shuf_B2); const VFromD B = BitCast(d_full, B0 | B1 | B2); StoreU(VFromD{B.raw}, dh, unaligned + MaxLanes(d_full)); } // 64-bit vector, 32-bit lanes template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { // (same code as 128-bit vector, 64-bit lanes) const VFromD v10_v00 = InterleaveLower(d, v0, v1); const VFromD v01_v20 = OddEven(v0, v2); const VFromD v21_v11 = InterleaveUpper(d, v1, v2); constexpr size_t kN = MaxLanes(d); StoreU(v10_v00, d, unaligned + 0 * kN); StoreU(v01_v20, d, unaligned + 1 * kN); StoreU(v21_v11, d, unaligned + 2 * kN); } // 64-bit lanes are handled by the N=1 case below. // <= 32-bit vector, 8-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A0 = Load(du, tbl_v0); const auto shuf_A1 = CombineShiftRightBytes<15>(du, shuf_A0, shuf_A0); const auto shuf_A2 = CombineShiftRightBytes<14>(du, shuf_A0, shuf_A0); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ......3..2..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .....3..2..1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // ....3..2..1..0.. const VFromD A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // 32-bit vector, 16-bit lanes template HWY_API void StoreInterleaved3(VFromD part0, VFromD part1, VFromD part2, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 du8; const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; // Interleave (v0,v1,v2). We're expanding v0 lanes to their place, with 0x80 // so lanes to be filled from other vectors are 0 to enable blending by ORing // together. alignas(16) static constexpr uint8_t tbl_v2[16] = { 0x80, 0x80, 0x80, 0x80, 0, 1, 0x80, 0x80, 0x80, 0x80, 2, 3, 0x80, 0x80, 0x80, 0x80}; // The interleaved vector will be named A; temporaries with suffix // 0..2 indicate which input vector's lanes they hold. const auto shuf_A2 = // ..1..0.. Load(du8, tbl_v2); const auto shuf_A1 = // ...1..0. CombineShiftRightBytes<2>(du8, shuf_A2, shuf_A2); const auto shuf_A0 = // ....1..0 CombineShiftRightBytes<4>(du8, shuf_A2, shuf_A2); const auto A0 = TableLookupBytesOr0(v0, shuf_A0); // ..1..0 const auto A1 = TableLookupBytesOr0(v1, shuf_A1); // .1..0. const auto A2 = TableLookupBytesOr0(v2, shuf_A2); // 1..0.. 
const auto A = BitCast(d_full, A0 | A1 | A2); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(A, d_full, buf); CopyBytes(buf, unaligned); } // Single-element vector, any lane size: just store directly template HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, TFromD* HWY_RESTRICT unaligned) { StoreU(v0, d, unaligned + 0); StoreU(v1, d, unaligned + 1); StoreU(v2, d, unaligned + 2); } // ------------------------------ StoreInterleaved4 namespace detail { // Default for <= 128-bit vectors; x86_256 and x86_512 have their own overload. template HWY_INLINE void StoreTransposedBlocks4(VFromD vA, VFromD vB, VFromD vC, VFromD vD, D d, TFromD* HWY_RESTRICT unaligned) { constexpr size_t kN = MaxLanes(d); StoreU(vA, d, unaligned + 0 * kN); StoreU(vB, d, unaligned + 1 * kN); StoreU(vC, d, unaligned + 2 * kN); StoreU(vD, d, unaligned + 3 * kN); } } // namespace detail // >= 128-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { const RepartitionToWide dw; const auto v10L = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32L = ZipLower(dw, v2, v3); const auto v10U = ZipUpper(dw, v0, v1); const auto v32U = ZipUpper(dw, v2, v3); // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = BitCast(d, InterleaveLower(dw, v10L, v32L)); // 3210 const VFromD vB = BitCast(d, InterleaveUpper(dw, v10L, v32L)); const VFromD vC = BitCast(d, InterleaveLower(dw, v10U, v32U)); const VFromD vD = BitCast(d, InterleaveUpper(dw, v10U, v32U)); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // >= 128-bit vector, 64-bit lanes template HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, TFromD* HWY_RESTRICT unaligned) { // The interleaved vectors are vA, vB, vC, vD. const VFromD vA = InterleaveLower(d, v0, v1); // v1[0] v0[0] const VFromD vB = InterleaveLower(d, v2, v3); const VFromD vC = InterleaveUpper(d, v0, v1); const VFromD vD = InterleaveUpper(d, v2, v3); detail::StoreTransposedBlocks4(vA, vB, vC, vD, d, unaligned); } // 64-bit vector, 8..32-bit lanes template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto A = BitCast(d_full, InterleaveLower(dw, v10, v32)); const auto B = BitCast(d_full, InterleaveUpper(dw, v10, v32)); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // 64-bit vector, 64-bit lane template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D /* tag */, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128> d_full; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto A = InterleaveLower(d_full, v0, v1); // v1[0] v0[0] const auto B = InterleaveLower(d_full, v2, v3); StoreU(A, d_full, unaligned); StoreU(B, d_full, unaligned + MaxLanes(d_full)); } // <= 32-bit vectors template HWY_API void StoreInterleaved4(VFromD part0, VFromD part1, VFromD part2, VFromD part3, D d, TFromD* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. 
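// Added note: the four parts together occupy at most 16 bytes, so they are
// zipped within a single 128-bit vector, stored to an aligned stack buffer,
// and only the valid prefix is copied to `unaligned`, avoiding any write past
// the caller's buffer.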
const Full128> d_full; const RepartitionToWide dw; const VFromD v0{part0.raw}; const VFromD v1{part1.raw}; const VFromD v2{part2.raw}; const VFromD v3{part3.raw}; const auto v10 = ZipLower(dw, v0, v1); // .. v1[0] v0[0] const auto v32 = ZipLower(dw, v2, v3); const auto v3210 = BitCast(d_full, InterleaveLower(dw, v10, v32)); alignas(16) TFromD buf[MaxLanes(d_full)]; StoreU(v3210, d_full, buf); CopyBytes(buf, unaligned); } #endif // HWY_NATIVE_LOAD_STORE_INTERLEAVED // ------------------------------ LoadN #if (defined(HWY_NATIVE_LOAD_N) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LOAD_N #undef HWY_NATIVE_LOAD_N #else #define HWY_NATIVE_LOAD_N #endif #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE namespace detail { template HWY_INLINE VFromD LoadNResizeBitCast(DTo d_to, DFrom d_from, VFromD v) { #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4, the LoadU operation will zero out any lanes of v.raw // past the first (lowest-index) Lanes(d_from) lanes of v.raw if // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true (void)d_from; return ResizeBitCast(d_to, v); #else // On other targets such as PPC/NEON, the contents of any lanes past the first // (lowest-index) Lanes(d_from) lanes of v.raw might be non-zero if // sizeof(decltype(v.raw)) > d_from.MaxBytes() is true. return ZeroExtendResizeBitCast(d_to, d_from, v); #endif } } // namespace detail template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : Zero(d); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { return (num_lanes > 0) ? LoadU(d, p) : no; } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 1> d1; if (num_lanes >= 2) return LoadU(d, p); if (num_lanes == 0) return Zero(d); return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 1> d1; if (num_lanes >= 2) return LoadU(d, p); if (num_lanes == 0) return no; return InterleaveLower(ResizeBitCast(d, LoadU(d1, p)), no); } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 2> d2; const Half d1; if (num_lanes >= 4) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); // Two or three lanes. const VFromD v_lo = detail::LoadNResizeBitCast(d, d2, LoadU(d2, p)); return (num_lanes == 2) ? v_lo : InsertLane(v_lo, 2, p[2]); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 2> d2; if (num_lanes >= 4) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); // Two or three lanes. const VFromD v_lo = ConcatUpperLower(d, no, ResizeBitCast(d, LoadU(d2, p))); return (num_lanes == 2) ? 
v_lo : InsertLane(v_lo, 2, p[2]); } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 4> d4; const Half d2; const Half d1; if (num_lanes >= 8) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); const size_t leading_len = num_lanes & 4; VFromD v_trailing = Zero(d4); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), v_trailing_lo2); } else { v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); } } else if ((num_lanes & 1) != 0) { v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); } if (leading_len != 0) { return Combine(d, v_trailing, LoadU(d4, p)); } else { return detail::LoadNResizeBitCast(d, d4, v_trailing); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 4> d4; const Half d2; const Half d1; if (num_lanes >= 8) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); const size_t leading_len = num_lanes & 4; VFromD v_trailing = ResizeBitCast(d4, no); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)), ResizeBitCast(d2, no)), v_trailing_lo2); } else { v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no), ResizeBitCast(d4, v_trailing_lo2)); } } else if ((num_lanes & 1) != 0) { v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]); } if (leading_len != 0) { return Combine(d, v_trailing, LoadU(d4, p)); } else { return ConcatUpperLower(d, no, ResizeBitCast(d, v_trailing)); } } template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (num_lanes >= 16) return LoadU(d, p); if (num_lanes == 0) return Zero(d); if (num_lanes == 1) return detail::LoadNResizeBitCast(d, d1, LoadU(d1, p)); const size_t leading_len = num_lanes & 12; VFromD v_trailing = Zero(d4); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, detail::LoadNResizeBitCast(d2, d1, LoadU(d1, p + leading_len + 2)), v_trailing_lo2); } else { v_trailing = detail::LoadNResizeBitCast(d4, d2, v_trailing_lo2); } } else if ((num_lanes & 1) != 0) { v_trailing = detail::LoadNResizeBitCast(d4, d1, LoadU(d1, p + leading_len)); } if (leading_len != 0) { if (leading_len >= 8) { const VFromD v_hi7 = ((leading_len & 4) != 0) ? 
Combine(d8, v_trailing, LoadU(d4, p + 8)) : detail::LoadNResizeBitCast(d8, d4, v_trailing); return Combine(d, v_hi7, LoadU(d8, p)); } else { return detail::LoadNResizeBitCast(d, d8, Combine(d8, v_trailing, LoadU(d4, p))); } } else { return detail::LoadNResizeBitCast(d, d4, v_trailing); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (num_lanes >= 16) return LoadU(d, p); if (num_lanes == 0) return no; if (num_lanes == 1) return InsertLane(no, 0, p[0]); const size_t leading_len = num_lanes & 12; VFromD v_trailing = ResizeBitCast(d4, no); if ((num_lanes & 2) != 0) { const VFromD v_trailing_lo2 = LoadU(d2, p + leading_len); if ((num_lanes & 1) != 0) { v_trailing = Combine( d4, InterleaveLower(ResizeBitCast(d2, LoadU(d1, p + leading_len + 2)), ResizeBitCast(d2, no)), v_trailing_lo2); } else { v_trailing = ConcatUpperLower(d4, ResizeBitCast(d4, no), ResizeBitCast(d4, v_trailing_lo2)); } } else if ((num_lanes & 1) != 0) { v_trailing = InsertLane(ResizeBitCast(d4, no), 0, p[leading_len]); } if (leading_len != 0) { if (leading_len >= 8) { const VFromD v_hi7 = ((leading_len & 4) != 0) ? Combine(d8, v_trailing, LoadU(d4, p + 8)) : ConcatUpperLower(d8, ResizeBitCast(d8, no), ResizeBitCast(d8, v_trailing)); return Combine(d, v_hi7, LoadU(d8, p)); } else { return ConcatUpperLower( d, ResizeBitCast(d, no), ResizeBitCast(d, Combine(d8, v_trailing, LoadU(d4, p)))); } } else { const Repartition du32; // lowest 4 bytes from v_trailing, next 4 from no. const VFromD lo8 = InterleaveLower(ResizeBitCast(du32, v_trailing), BitCast(du32, no)); return ConcatUpperLower(d, ResizeBitCast(d, no), ResizeBitCast(d, lo8)); } } #if HWY_MAX_BYTES >= 32 template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes >= Lanes(d)) return LoadU(d, p); const Half dh; const size_t half_N = Lanes(dh); if (num_lanes <= half_N) { return ZeroExtendVector(d, LoadN(dh, p, num_lanes)); } else { const VFromD v_lo = LoadU(dh, p); const VFromD v_hi = LoadN(dh, p + half_N, num_lanes - half_N); return Combine(d, v_hi, v_lo); } } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { if (num_lanes >= Lanes(d)) return LoadU(d, p); const Half dh; const size_t half_N = Lanes(dh); const VFromD no_h = LowerHalf(no); if (num_lanes <= half_N) { return ConcatUpperLower(d, no, ResizeBitCast(d, LoadNOr(no_h, dh, p, num_lanes))); } else { const VFromD v_lo = LoadU(dh, p); const VFromD v_hi = LoadNOr(no_h, dh, p + half_N, num_lanes - half_N); return Combine(d, v_hi, v_lo); } } #endif // HWY_MAX_BYTES >= 32 template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const RebindToUnsigned du; return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes)); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { const RebindToUnsigned du; return BitCast( d, LoadNOr(BitCast(du, no), du, detail::U16LanePointer(p), num_lanes)); } #else // !HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE // For SVE and non-sanitizer AVX-512; RVV has its own specialization. 
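// Example usage of LoadN/StoreN for a remainder loop (illustrative sketch
// only; `AddScalarToArray` and its arguments are hypothetical and not part of
// the library):
//
//   void AddScalarToArray(float* HWY_RESTRICT p, size_t count, float x) {
//     const ScalableTag<float> d;
//     const size_t N = Lanes(d);
//     const auto vx = Set(d, x);
//     size_t i = 0;
//     for (; i + N <= count; i += N) {
//       StoreU(Add(LoadU(d, p + i), vx), d, p + i);
//     }
//     if (i != count) {  // 1..N-1 lanes remain; avoids reading past the end.
//       const size_t remaining = count - i;
//       StoreN(Add(LoadN(d, p + i, remaining), vx), d, p + i, remaining);
//     }
//   }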
template HWY_API VFromD LoadN(D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { #if HWY_MEM_OPS_MIGHT_FAULT if (num_lanes <= 0) return Zero(d); #endif return MaskedLoad(FirstN(d, num_lanes), d, p); } template HWY_API VFromD LoadNOr(VFromD no, D d, const TFromD* HWY_RESTRICT p, size_t num_lanes) { #if HWY_MEM_OPS_MIGHT_FAULT if (num_lanes <= 0) return no; #endif return MaskedLoadOr(no, FirstN(d, num_lanes), d, p); } #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE #endif // HWY_NATIVE_LOAD_N // ------------------------------ StoreN #if (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_STORE_N #undef HWY_NATIVE_STORE_N #else #define HWY_NATIVE_STORE_N #endif #if HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE namespace detail { template HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { constexpr size_t kMinShrVectBytes = (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) ? 8 : 16; const FixedTag d_shift; return ResizeBitCast( dh, ShiftRightBytes(d_shift, ResizeBitCast(d_shift, v))); } template HWY_INLINE VFromD StoreNGetUpperHalf(DH dh, VFromD> v) { return UpperHalf(dh, v); } } // namespace detail template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 0) { StoreU(v, d, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { if (max_lanes_to_store > 1) { StoreU(v, d, p); } else if (max_lanes_to_store == 1) { const FixedTag, 1> d1; StoreU(LowerHalf(d1, v), d1, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 2> d2; const Half d1; if (max_lanes_to_store > 1) { if (max_lanes_to_store >= 4) { StoreU(v, d, p); } else { StoreU(ResizeBitCast(d2, v), d2, p); if (max_lanes_to_store == 3) { StoreU(ResizeBitCast(d1, detail::StoreNGetUpperHalf(d2, v)), d1, p + 2); } } } else if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 4> d4; const Half d2; const Half d1; if (max_lanes_to_store <= 1) { if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } else if (max_lanes_to_store >= 8) { StoreU(v, d, p); } else if (max_lanes_to_store >= 4) { StoreU(LowerHalf(d4, v), d4, p); StoreN(detail::StoreNGetUpperHalf(d4, v), d4, p + 4, max_lanes_to_store - 4); } else { StoreN(LowerHalf(d4, v), d4, p, max_lanes_to_store); } } template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const FixedTag, 8> d8; const Half d4; const Half d2; const Half d1; if (max_lanes_to_store <= 1) { if (max_lanes_to_store == 1) { StoreU(ResizeBitCast(d1, v), d1, p); } } else if (max_lanes_to_store >= 16) { StoreU(v, d, p); } else if (max_lanes_to_store >= 8) { StoreU(LowerHalf(d8, v), d8, p); StoreN(detail::StoreNGetUpperHalf(d8, v), d8, p + 8, max_lanes_to_store - 8); } else { StoreN(LowerHalf(d8, v), d8, p, max_lanes_to_store); } } #if HWY_MAX_BYTES >= 32 template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const size_t N = Lanes(d); if (max_lanes_to_store >= N) { StoreU(v, d, p); return; } const Half dh; const size_t half_N = Lanes(dh); if (max_lanes_to_store <= half_N) { StoreN(LowerHalf(dh, v), dh, p, max_lanes_to_store); } else { StoreU(LowerHalf(dh, v), dh, p); StoreN(UpperHalf(dh, v), dh, p + half_N, max_lanes_to_store - half_N); } } #endif // HWY_MAX_BYTES >= 32 #else // 
!HWY_MEM_OPS_MIGHT_FAULT || HWY_HAVE_SCALABLE template > HWY_API void StoreN(VFromD v, D d, T* HWY_RESTRICT p, size_t max_lanes_to_store) { const size_t N = Lanes(d); const size_t clamped_max_lanes_to_store = HWY_MIN(max_lanes_to_store, N); #if HWY_MEM_OPS_MIGHT_FAULT if (clamped_max_lanes_to_store == 0) return; #endif BlendedStore(v, FirstN(d, clamped_max_lanes_to_store), d, p); detail::MaybeUnpoison(p, clamped_max_lanes_to_store); } #endif // HWY_MEM_OPS_MIGHT_FAULT && !HWY_HAVE_SCALABLE #endif // (defined(HWY_NATIVE_STORE_N) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ Scatter #if (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SCATTER #undef HWY_NATIVE_SCATTER #else #define HWY_NATIVE_SCATTER #endif template > HWY_API void ScatterOffset(VFromD v, D d, T* HWY_RESTRICT base, VFromD> offset) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI offset_lanes[MaxLanes(d)]; Store(offset, di, offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < MaxLanes(d); ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template > HWY_API void ScatterIndex(VFromD v, D d, T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); for (size_t i = 0; i < MaxLanes(d); ++i) { base[index_lanes[i]] = lanes[i]; } } template > HWY_API void MaskedScatterIndex(VFromD v, MFromD m, D d, T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); for (size_t i = 0; i < MaxLanes(d); ++i) { if (mask_lanes[i]) base[index_lanes[i]] = lanes[i]; } } #endif // (defined(HWY_NATIVE_SCATTER) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ Gather #if (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_GATHER #undef HWY_NATIVE_GATHER #else #define HWY_NATIVE_GATHER #endif template > HWY_API VFromD GatherOffset(D d, const T* HWY_RESTRICT base, VFromD> offset) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI offset_lanes[MaxLanes(d)]; Store(offset, di, offset_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(offset_lanes[i] >= 0); CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template > HWY_API VFromD GatherIndex(D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(d)]; Store(index, di, index_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } template > HWY_API VFromD MaskedGatherIndex(MFromD m, D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; 
static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(di)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : T{0}; } return Load(d, lanes); } template > HWY_API VFromD MaskedGatherIndexOr(VFromD no, MFromD m, D d, const T* HWY_RESTRICT base, VFromD> index) { const RebindToSigned di; using TI = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index/lane size must match"); HWY_ALIGN TI index_lanes[MaxLanes(di)]; Store(index, di, index_lanes); HWY_ALIGN TI mask_lanes[MaxLanes(di)]; Store(BitCast(di, VecFromMask(d, m)), di, mask_lanes); HWY_ALIGN T no_lanes[MaxLanes(d)]; Store(no, d, no_lanes); HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { HWY_DASSERT(index_lanes[i] >= 0); lanes[i] = mask_lanes[i] ? base[index_lanes[i]] : no_lanes[i]; } return Load(d, lanes); } #endif // (defined(HWY_NATIVE_GATHER) == defined(HWY_TARGET_TOGGLE)) // ------------------------------ ScatterN/GatherN template > HWY_API void ScatterIndexN(VFromD v, D d, T* HWY_RESTRICT base, VFromD> index, const size_t max_lanes_to_store) { MaskedScatterIndex(v, FirstN(d, max_lanes_to_store), d, base, index); } template > HWY_API VFromD GatherIndexN(D d, const T* HWY_RESTRICT base, VFromD> index, const size_t max_lanes_to_load) { return MaskedGatherIndex(FirstN(d, max_lanes_to_load), d, base, index); } // ------------------------------ Integer AbsDiff and SumsOf8AbsDiff #if (defined(HWY_NATIVE_INTEGER_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif template HWY_API V AbsDiff(V a, V b) { return Sub(Max(a, b), Min(a, b)); } #endif // HWY_NATIVE_INTEGER_ABS_DIFF #if (defined(HWY_NATIVE_SUMS_OF_8_ABS_DIFF) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #undef HWY_NATIVE_SUMS_OF_8_ABS_DIFF #else #define HWY_NATIVE_SUMS_OF_8_ABS_DIFF #endif template ), HWY_IF_V_SIZE_GT_D(DFromV, (HWY_TARGET == HWY_SCALAR ? 
0 : 4))> HWY_API Vec>> SumsOf8AbsDiff(V a, V b) { const DFromV d; const RebindToUnsigned du; const RepartitionToWideX3 dw; return BitCast(dw, SumsOf8(BitCast(du, AbsDiff(a, b)))); } #endif // HWY_NATIVE_SUMS_OF_8_ABS_DIFF // ------------------------------ SaturatedAdd/SaturatedSub for UI32/UI64 #if (defined(HWY_NATIVE_I32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB #undef HWY_NATIVE_I32_SATURATED_ADDSUB #else #define HWY_NATIVE_I32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_I64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB #undef HWY_NATIVE_I64_SATURATED_ADDSUB #else #define HWY_NATIVE_I64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { const DFromV d; const auto sum = Add(a, b); const auto overflow_mask = AndNot(Xor(a, b), Xor(a, sum)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, sum); } template )> HWY_API V SaturatedSub(V a, V b) { const DFromV d; const auto diff = Sub(a, b); const auto overflow_mask = And(Xor(a, b), Xor(a, diff)); const auto overflow_result = Xor(BroadcastSignBit(a), Set(d, LimitsMax())); return IfNegativeThenElse(overflow_mask, overflow_result, diff); } #endif // HWY_NATIVE_I64_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U32_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB #undef HWY_NATIVE_U32_SATURATED_ADDSUB #else #define HWY_NATIVE_U32_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U32_SATURATED_ADDSUB #if (defined(HWY_NATIVE_U64_SATURATED_ADDSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB #undef HWY_NATIVE_U64_SATURATED_ADDSUB #else #define HWY_NATIVE_U64_SATURATED_ADDSUB #endif template )> HWY_API V SaturatedAdd(V a, V b) { return Add(a, Min(b, Not(a))); } template )> HWY_API V SaturatedSub(V a, V b) { return Sub(a, Min(a, b)); } #endif // HWY_NATIVE_U64_SATURATED_ADDSUB // ------------------------------ Unsigned to signed demotions template , DN>>, hwy::EnableIf<(sizeof(TFromD) < sizeof(TFromV))>* = nullptr, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD DemoteTo(DN dn, V v) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = DemoteTo(dn, BitCast(di, v)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. 
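// Worked example for a u16 -> i8 demotion (a sketch of the two steps above):
// the input 0xFFFF bitcasts to int16_t -1, which the signed demotion passes
// through as 0xFF; reinterpreted as unsigned, Min(0xFF, 0x7F) then clamps it
// to 127, the correct saturated result. In-range inputs are unaffected.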
const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #if HWY_TARGET != HWY_SCALAR || HWY_IDE template , DN>>, HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_D(DFromV))> HWY_API VFromD ReorderDemote2To(DN dn, V a, V b) { const DFromV d; const RebindToSigned di; const RebindToUnsigned dn_u; // First, do a signed to signed demotion. This will convert any values // that are greater than hwy::HighestValue>>() to a // negative value. const auto i2i_demote_result = ReorderDemote2To(dn, BitCast(di, a), BitCast(di, b)); // Second, convert any negative values to hwy::HighestValue>() // using an unsigned Min operation. const auto max_signed_val = Set(dn, hwy::HighestValue>()); return BitCast( dn, Min(BitCast(dn_u, i2i_demote_result), BitCast(dn_u, max_signed_val))); } #endif // ------------------------------ PromoteLowerTo // There is no codegen advantage for a native version of this. It is provided // only for convenience. template HWY_API VFromD PromoteLowerTo(D d, V v) { // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V // because it cannot be deduced from D (could be either bf16 or f16). const Rebind, decltype(d)> dh; return PromoteTo(d, LowerHalf(dh, v)); } // ------------------------------ PromoteUpperTo #if (defined(HWY_NATIVE_PROMOTE_UPPER_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_UPPER_TO #undef HWY_NATIVE_PROMOTE_UPPER_TO #else #define HWY_NATIVE_PROMOTE_UPPER_TO #endif // This requires UpperHalf. #if HWY_TARGET != HWY_SCALAR || HWY_IDE template HWY_API VFromD PromoteUpperTo(D d, V v) { // Lanes(d) may differ from Lanes(DFromV()). Use the lane type from V // because it cannot be deduced from D (could be either bf16 or f16). const Rebind, decltype(d)> dh; return PromoteTo(d, UpperHalf(dh, v)); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_PROMOTE_UPPER_TO // ------------------------------ PromoteEvenTo/PromoteOddTo #if HWY_TARGET != HWY_SCALAR namespace detail { // Tag dispatch is used in detail::PromoteEvenTo and detail::PromoteOddTo as // there are target-specific specializations for some of the // detail::PromoteEvenTo and detail::PromoteOddTo cases on // SVE/PPC/SSE2/SSSE3/SSE4/AVX2. // All targets except HWY_SCALAR use the implementations of // detail::PromoteEvenTo and detail::PromoteOddTo in generic_ops-inl.h for at // least some of the PromoteEvenTo and PromoteOddTo cases. // Signed to signed PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, need to shift each lane of the bitcasted vector // left by kToLaneSize * 4 bits to get the bits of the even source lanes into // the upper kToLaneSize * 4 bits of even_in_hi. const auto even_in_hi = ShiftLeft(BitCast(d_to, v)); #else // On big-endian targets, the bits of the even source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. 
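// Either way, the arithmetic right shift below then moves the even source
// lane into the lower half of each wide lane and sign-extends it (e.g. for
// i8 -> i16, the even byte ends up in the low byte with its sign bit
// replicated into the high byte).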
const auto even_in_hi = BitCast(d_to, v); #endif // Right-shift even_in_hi by kToLaneSize * 4 bits return ShiftRight(even_in_hi); } template HWY_INLINE VFromD PromoteOddTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::SignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, the bits of the odd source lanes are already in // the upper kToLaneSize * 4 bits of the lanes of the bitcasted vector. const auto odd_in_hi = BitCast(d_to, v); #else // On big-endian targets, need to shift each lane of the bitcasted vector left // by kToLaneSize * 4 bits to get the bits of the odd source lanes into the // upper kToLaneSize * 4 bits of odd_in_hi. const auto odd_in_hi = ShiftLeft(BitCast(d_to, v)); #endif // Right-shift odd_in_hi by kToLaneSize * 4 bits return ShiftRight(odd_in_hi); } // Unsigned to unsigned PromoteEvenTo/PromoteOddTo template HWY_INLINE VFromD PromoteEvenTo( hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, the bits of the even source lanes are already // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. // Simply need to zero out the upper bits of each lane of the bitcasted // vector. return And(BitCast(d_to, v), Set(d_to, static_cast>(LimitsMax>()))); #else // On big-endian targets, need to shift each lane of the bitcasted vector // right by kToLaneSize * 4 bits to get the bits of the even source lanes into // the lower kToLaneSize * 4 bits of the result. // The right shift below will zero out the upper kToLaneSize * 4 bits of the // result. return ShiftRight(BitCast(d_to, v)); #endif } template HWY_INLINE VFromD PromoteOddTo( hwy::UnsignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { #if HWY_IS_LITTLE_ENDIAN // On little-endian targets, need to shift each lane of the bitcasted vector // right by kToLaneSize * 4 bits to get the bits of the odd source lanes into // the lower kToLaneSize * 4 bits of the result. // The right shift below will zero out the upper kToLaneSize * 4 bits of the // result. return ShiftRight(BitCast(d_to, v)); #else // On big-endian targets, the bits of the even source lanes are already // in the lower kToLaneSize * 4 bits of the lanes of the bitcasted vector. // Simply need to zero out the upper bits of each lane of the bitcasted // vector. return And(BitCast(d_to, v), Set(d_to, static_cast>(LimitsMax>()))); #endif } // Unsigned to signed: Same as unsigned->unsigned PromoteEvenTo/PromoteOddTo // followed by BitCast to signed template HWY_INLINE VFromD PromoteEvenTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { const RebindToUnsigned du_to; return BitCast(d_to, PromoteEvenTo(hwy::UnsignedTag(), hwy::SizeTag(), hwy::UnsignedTag(), du_to, v)); } template HWY_INLINE VFromD PromoteOddTo( hwy::SignedTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, hwy::UnsignedTag /*from_type_tag*/, D d_to, V v) { const RebindToUnsigned du_to; return BitCast(d_to, PromoteOddTo(hwy::UnsignedTag(), hwy::SizeTag(), hwy::UnsignedTag(), du_to, v)); } // BF16->F32 PromoteEvenTo // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors. 
// VBF16 is considered to be a bfloat16_t vector if TFromV is the same // type as TFromV>> // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template >, hwy::EnableIf, TFromV>()>* = nullptr> HWY_INLINE VFromD PromoteEvenTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, DF32 d_to, VBF16 v) { const RebindToUnsigned du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #else // On big-endian platforms, the even lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector. // Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #endif } // BF16->F32 PromoteOddTo // NOTE: It is possible for FromTypeTag to be hwy::SignedTag or hwy::UnsignedTag // instead of hwy::FloatTag on targets that use scalable vectors. // VBF16 is considered to be a bfloat16_t vector if TFromV is the same // type as TFromV>> // The BF16->F32 PromoteEvenTo overload is only enabled if VBF16 is considered // to be a bfloat16_t vector. template >, hwy::EnableIf, TFromV>()>* = nullptr> HWY_INLINE VFromD PromoteOddTo(hwy::FloatTag /*to_type_tag*/, hwy::SizeTag<4> /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, DF32 d_to, VBF16 v) { const RebindToUnsigned du_to; #if HWY_IS_LITTLE_ENDIAN // On little-endian platforms, the odd lanes of the source vector are already // in the upper 16 bits of the lanes of the bitcasted vector. // Need to simply zero out the lower 16 bits of each lane of the bitcasted // vector. return BitCast(d_to, And(BitCast(du_to, v), Set(du_to, uint32_t{0xFFFF0000u}))); #else // On big-endian platforms, need to shift left each lane of the bitcasted // vector by 16 bits. 
return BitCast(d_to, ShiftLeft<16>(BitCast(du_to, v))); #endif } // Default PromoteEvenTo/PromoteOddTo implementations template HWY_INLINE VFromD PromoteEvenTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { return PromoteLowerTo(d_to, v); } template HWY_INLINE VFromD PromoteEvenTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const DFromV d; return PromoteLowerTo(d_to, ConcatEven(d, v, v)); } template HWY_INLINE VFromD PromoteOddTo( ToTypeTag /*to_type_tag*/, hwy::SizeTag /*to_lane_size_tag*/, FromTypeTag /*from_type_tag*/, D d_to, V v) { const DFromV d; return PromoteLowerTo(d_to, ConcatOdd(d, v, v)); } } // namespace detail template )), class V2 = VFromD, D>>, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(V2))> HWY_API VFromD PromoteEvenTo(D d, V v) { return detail::PromoteEvenTo(hwy::TypeTag>(), hwy::SizeTag)>(), hwy::TypeTag>(), d, v); } template )), class V2 = VFromD, D>>, HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(V2))> HWY_API VFromD PromoteOddTo(D d, V v) { return detail::PromoteOddTo(hwy::TypeTag>(), hwy::SizeTag)>(), hwy::TypeTag>(), d, v); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ float16_t <-> float #if (defined(HWY_NATIVE_F16C) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_API VFromD PromoteTo(D df32, VFromD> v) { const RebindToSigned di32; const RebindToUnsigned du32; const Rebind du16; using VU32 = VFromD; const VU32 bits16 = PromoteTo(du32, BitCast(du16, v)); const VU32 sign = ShiftRight<15>(bits16); const VU32 biased_exp = And(ShiftRight<10>(bits16), Set(du32, 0x1F)); const VU32 mantissa = And(bits16, Set(du32, 0x3FF)); const VU32 subnormal = BitCast(du32, Mul(ConvertTo(df32, BitCast(di32, mantissa)), Set(df32, 1.0f / 16384 / 1024))); const VU32 biased_exp32 = Add(biased_exp, Set(du32, 127 - 15)); const VU32 mantissa32 = ShiftLeft<23 - 10>(mantissa); const VU32 normal = Or(ShiftLeft<23>(biased_exp32), mantissa32); const VU32 bits32 = IfThenElse(Eq(biased_exp, Zero(du32)), subnormal, normal); return BitCast(df32, Or(ShiftLeft<31>(sign), bits32)); } template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const RebindToSigned di16; const Rebind di32; const RebindToFloat df32; const RebindToUnsigned du32; // There are 23 fractional bits (plus the implied 1 bit) in the mantissa of // a F32, and there are 10 fractional bits (plus the implied 1 bit) in the // mantissa of a F16 // We want the unbiased exponent of round_incr[i] to be at least (-14) + 13 as // 2^(-14) is the smallest positive normal F16 value and as we want 13 // mantissa bits (including the implicit 1 bit) to the left of the // F32 mantissa bits in rounded_val[i] since 23 - 10 is equal to 13 // The biased exponent of round_incr[i] needs to be at least 126 as // (-14) + 13 + 127 is equal to 126 // We also want to biased exponent of round_incr[i] to be less than or equal // to 255 (which is equal to MaxExponentField()) // The biased F64 exponent of round_incr is equal to // HWY_MAX(HWY_MIN(((exp_bits[i] >> 23) & 255) + 13, 255), 126) // hi9_bits[i] is equal to the upper 9 bits of v[i] const auto hi9_bits = ShiftRight<23>(BitCast(du32, v)); const auto k13 = Set(du32, uint32_t{13u}); // Minimum biased F32 exponent of round_incr const auto k126 = Set(du32, uint32_t{126u}); // round_incr_hi9_bits[i] is equivalent to // (hi9_bits[i] & 0x100) | // HWY_MAX(HWY_MIN((hi9_bits[i] & 0xFF) + 13, 255), 126) #if 
HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 const auto k255 = Set(du32, uint32_t{255u}); const auto round_incr_hi9_bits = BitwiseIfThenElse( k255, Max(Min(Add(And(hi9_bits, k255), k13), k255), k126), hi9_bits); #else // On targets other than SCALAR and EMU128, the exponent bits of hi9_bits can // be incremented by 13 and clamped to the [13, 255] range without overflowing // into the sign bit of hi9_bits by using U8 SaturatedAdd as there are 8 // exponent bits in an F32 // U8 Max can be used on targets other than SCALAR and EMU128 to clamp // ((hi9_bits & 0xFF) + 13) to the [126, 255] range without affecting the sign // bit const Repartition du32_as_u8; const auto round_incr_hi9_bits = BitCast( du32, Max(SaturatedAdd(BitCast(du32_as_u8, hi9_bits), BitCast(du32_as_u8, k13)), BitCast(du32_as_u8, k126))); #endif // (round_incr_hi9_bits >> 8) is equal to (hi9_bits >> 8), and // (round_incr_hi9_bits & 0xFF) is equal to // HWY_MAX(HWY_MIN((round_incr_hi9_bits & 0xFF) + 13, 255), 126) const auto round_incr = BitCast(df32, ShiftLeft<23>(round_incr_hi9_bits)); // Add round_incr[i] to v[i] to round the mantissa to the nearest F16 mantissa // and to move the fractional bits of the resulting non-NaN mantissa down to // the lower 10 bits of rounded_val if (v[i] + round_incr[i]) is a non-NaN // value const auto rounded_val = Add(v, round_incr); // rounded_val_bits is the bits of rounded_val as a U32 const auto rounded_val_bits = BitCast(du32, rounded_val); // rounded_val[i] is known to have the same biased exponent as round_incr[i] // as |round_incr[i]| > 2^12*|v[i]| is true if round_incr[i] is a finite // value, round_incr[i] and v[i] both have the same sign, and |round_incr[i]| // is either a power of 2 that is greater than or equal to 2^-1 or infinity. // If rounded_val[i] is a finite F32 value, then // (rounded_val_bits[i] & 0x00000FFF) is the bit representation of the // rounded mantissa of rounded_val[i] as a UQ2.10 fixed point number that is // in the range [0, 2]. // In other words, (rounded_val_bits[i] & 0x00000FFF) is between 0 and 0x0800, // with (rounded_val_bits[i] & 0x000003FF) being the fractional bits of the // resulting F16 mantissa, if rounded_v[i] is a finite F32 value. // (rounded_val_bits[i] & 0x007FF000) == 0 is guaranteed to be true if // rounded_val[i] is a non-NaN value // The biased exponent of rounded_val[i] is guaranteed to be at least 126 as // the biased exponent of round_incr[i] is at least 126 and as both v[i] and // round_incr[i] have the same sign bit // The ULP of a F32 value with a biased exponent of 126 is equal to // 2^(126 - 127 - 23), which is equal to 2^(-24) (which is also the ULP of a // F16 value with a biased exponent of 0 or 1 as (1 - 15 - 10) is equal to // -24) // The biased exponent (before subtracting by 126) needs to be clamped to the // [126, 157] range as 126 + 31 is equal to 157 and as 31 is the largest // biased exponent of a F16. 
// The biased exponent of the resulting F16 value is equal to // HWY_MIN((round_incr_hi9_bits[i] & 0xFF) + // ((rounded_val_bits[i] >> 10) & 0xFF), 157) - 126 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 auto f16_exp_bits = Min(Add(ShiftLeft<10>(And(round_incr_hi9_bits, k255)), And(rounded_val_bits, Set(du32, static_cast(uint32_t{0xFFu} << 10)))), Set(du32, static_cast(uint32_t{157u} << 10))); #else auto f16_exp_bits = ShiftLeft<10>(BitCast( du32, Min(SaturatedAdd(BitCast(du32_as_u8, round_incr_hi9_bits), BitCast(du32_as_u8, ShiftRight<10>(rounded_val_bits))), BitCast(du32_as_u8, Set(du32, uint32_t{157}))))); #endif f16_exp_bits = Sub(f16_exp_bits, Set(du32, static_cast(uint32_t{126u} << 10))); const auto f16_unmasked_mant_bits = BitCast(di32, Or(rounded_val, VecFromMask(df32, IsNaN(rounded_val)))); const auto f16_exp_mant_bits = OrAnd(BitCast(di32, f16_exp_bits), f16_unmasked_mant_bits, Set(di32, int32_t{0x03FF})); // f16_bits_as_i32 is the F16 bits sign-extended to an I32 (with the upper 17 // bits of f16_bits_as_i32[i] set to the sign bit of rounded_val[i]) to allow // efficient truncation of the F16 bits to an I16 using an I32->I16 DemoteTo // operation const auto f16_bits_as_i32 = OrAnd(f16_exp_mant_bits, ShiftRight<16>(BitCast(di32, rounded_val_bits)), Set(di32, static_cast(0xFFFF8000u))); return BitCast(df16, DemoteTo(di16, f16_bits_as_i32)); } #endif // HWY_NATIVE_F16C // ------------------------------ F64->F16 DemoteTo #if (defined(HWY_NATIVE_DEMOTE_F64_TO_F16) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_DEMOTE_F64_TO_F16 #undef HWY_NATIVE_DEMOTE_F64_TO_F16 #else #define HWY_NATIVE_DEMOTE_F64_TO_F16 #endif #if HWY_HAVE_FLOAT64 template HWY_API VFromD DemoteTo(D df16, VFromD> v) { const Rebind df64; const Rebind du64; const Rebind df32; // The mantissa bits of v[i] are first rounded using round-to-odd rounding to // the nearest F64 value that has the lower 29 bits zeroed out to ensure that // the result is correctly rounded to a F16. 
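// Round-to-odd is a "sticky" rounding: whenever any of the 29 discarded bits
// is nonzero, the lowest kept mantissa bit is forced to 1. This preserves
// whether the exact value was below, at, or above each halfway point of the
// narrower formats, so the F64->F32->F16 demotion that follows produces the
// same F16 as a single direct rounding would (no double-rounding error).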
const auto vf64_rounded = OrAnd( And(v, BitCast(df64, Set(du64, static_cast(0xFFFFFFFFE0000000u)))), BitCast(df64, Add(BitCast(du64, v), Set(du64, static_cast(0x000000001FFFFFFFu)))), BitCast(df64, Set(du64, static_cast(0x0000000020000000ULL)))); return DemoteTo(df16, DemoteTo(df32, vf64_rounded)); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_DEMOTE_F64_TO_F16 // ------------------------------ F16->F64 PromoteTo #if (defined(HWY_NATIVE_PROMOTE_F16_TO_F64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PROMOTE_F16_TO_F64 #undef HWY_NATIVE_PROMOTE_F16_TO_F64 #else #define HWY_NATIVE_PROMOTE_F16_TO_F64 #endif #if HWY_HAVE_FLOAT64 template HWY_API VFromD PromoteTo(D df64, VFromD> v) { return PromoteTo(df64, PromoteTo(Rebind(), v)); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_PROMOTE_F16_TO_F64 // ------------------------------ SumsOf2 #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE VFromD>> SumsOf2( TypeTag /*type_tag*/, hwy::SizeTag /*lane_size_tag*/, V v) { const DFromV d; const RepartitionToWide dw; return Add(PromoteEvenTo(dw, v), PromoteOddTo(dw, v)); } } // namespace detail template HWY_API VFromD>> SumsOf2(V v) { return detail::SumsOf2(hwy::TypeTag>(), hwy::SizeTag)>(), v); } #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ SumsOf4 namespace detail { template HWY_INLINE VFromD>> SumsOf4( TypeTag /*type_tag*/, hwy::SizeTag /*lane_size_tag*/, V v) { using hwy::HWY_NAMESPACE::SumsOf2; return SumsOf2(SumsOf2(v)); } } // namespace detail template HWY_API VFromD>> SumsOf4(V v) { return detail::SumsOf4(hwy::TypeTag>(), hwy::SizeTag)>(), v); } // ------------------------------ OrderedTruncate2To #if HWY_IDE || \ (defined(HWY_NATIVE_ORDERED_TRUNCATE_2_TO) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #undef HWY_NATIVE_ORDERED_TRUNCATE_2_TO #else #define HWY_NATIVE_ORDERED_TRUNCATE_2_TO #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR || HWY_IDE template ) * 2), HWY_IF_LANES_D(DFromV>, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedTruncate2To(DN dn, V a, V b) { return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_ORDERED_TRUNCATE_2_TO // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex #if (defined(HWY_NATIVE_LEADING_ZERO_COUNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_LEADING_ZERO_COUNT #undef HWY_NATIVE_LEADING_ZERO_COUNT #else #define HWY_NATIVE_LEADING_ZERO_COUNT #endif namespace detail { template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned di; const Repartition di16; // On SSE2/SSSE3/SSE4/AVX2, do an int32_t to float conversion, followed // by a unsigned right shift of the uint32_t bit representation of the // floating point values by 23, followed by an int16_t Min // operation as we are only interested in the biased exponent that would // result from a uint32_t to float conversion. // An int32_t to float vector conversion is also much more efficient on // SSE2/SSSE3/SSE4/AVX2 than an uint32_t vector to float vector conversion // as an uint32_t vector to float vector conversion on SSE2/SSSE3/SSE4/AVX2 // requires multiple instructions whereas an int32_t to float vector // conversion can be carried out using a single instruction on // SSE2/SSSE3/SSE4/AVX2. 
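// The Min with 158 handles inputs that have the sign bit set: bitcast to
// int32_t they convert to a negative float, whose shifted-down sign+exponent
// field exceeds 158. Every uint32_t in [2^31, 2^32) has a biased F32 exponent
// of exactly 158 (= 127 + 31), so clamping to 158 yields the correct result.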
const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(di, v))); return BitCast(d, Min(BitCast(di16, ShiftRight<23>(f32_bits)), BitCast(di16, Set(d, 158)))); #else const auto f32_bits = BitCast(d, ConvertTo(df, v)); return BitCast(d, ShiftRight<23>(f32_bits)); #endif } template )> HWY_INLINE V I32RangeU32ToF32BiasedExp(V v) { // I32RangeU32ToF32BiasedExp is similar to UIntToF32BiasedExp, but // I32RangeU32ToF32BiasedExp assumes that v[i] is between 0 and 2147483647. const DFromV d; const RebindToFloat df; #if HWY_TARGET > HWY_AVX3 && HWY_TARGET <= HWY_SSE2 const RebindToSigned d_src; #else const RebindToUnsigned d_src; #endif const auto f32_bits = BitCast(d, ConvertTo(df, BitCast(d_src, v))); return ShiftRight<23>(f32_bits); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return TruncateTo(d, f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di; return BitCast(d, OrderedDemote2To(di, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32))); #else return OrderedTruncate2To(d, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); #endif } #endif // HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Rebind du32; const auto f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(PromoteTo(du32, v)); return U8FromU32(f32_biased_exp_as_u32); } #if HWY_TARGET != HWY_SCALAR template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Rebind du32; const Repartition du16; const auto lo_u32 = PromoteTo(du32, LowerHalf(dh, v)); const auto hi_u32 = PromoteTo(du32, UpperHalf(dh, v)); const auto lo_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(lo_u32); const auto hi_f32_biased_exp_as_u32 = I32RangeU32ToF32BiasedExp(hi_u32); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const auto f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, lo_f32_biased_exp_as_u32), BitCast(di32, hi_f32_biased_exp_as_u32)); return DemoteTo(d, f32_biased_exp_as_i16); #else const auto f32_biased_exp_as_u16 = OrderedTruncate2To( du16, lo_f32_biased_exp_as_u32, hi_f32_biased_exp_as_u32); return TruncateTo(d, f32_biased_exp_as_u16); #endif } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { const Half dh; const Half dq; const Rebind du32; const Repartition du16; const auto lo_half = LowerHalf(dh, v); const auto hi_half = UpperHalf(dh, v); const auto u32_q0 = PromoteTo(du32, LowerHalf(dq, lo_half)); const auto u32_q1 = PromoteTo(du32, UpperHalf(dq, lo_half)); const auto u32_q2 = PromoteTo(du32, LowerHalf(dq, hi_half)); const auto u32_q3 = PromoteTo(du32, UpperHalf(dq, hi_half)); const auto f32_biased_exp_as_u32_q0 = I32RangeU32ToF32BiasedExp(u32_q0); const auto f32_biased_exp_as_u32_q1 = I32RangeU32ToF32BiasedExp(u32_q1); const auto f32_biased_exp_as_u32_q2 = I32RangeU32ToF32BiasedExp(u32_q2); const auto f32_biased_exp_as_u32_q3 = I32RangeU32ToF32BiasedExp(u32_q3); #if HWY_TARGET <= HWY_SSE2 const RebindToSigned di32; const RebindToSigned di16; const 
auto lo_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q0), BitCast(di32, f32_biased_exp_as_u32_q1)); const auto hi_f32_biased_exp_as_i16 = OrderedDemote2To(di16, BitCast(di32, f32_biased_exp_as_u32_q2), BitCast(di32, f32_biased_exp_as_u32_q3)); return OrderedDemote2To(d, lo_f32_biased_exp_as_i16, hi_f32_biased_exp_as_i16); #else const auto lo_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q0, f32_biased_exp_as_u32_q1); const auto hi_f32_biased_exp_as_u16 = OrderedTruncate2To( du16, f32_biased_exp_as_u32_q2, f32_biased_exp_as_u32_q3); return OrderedTruncate2To(d, lo_f32_biased_exp_as_u16, hi_f32_biased_exp_as_u16); #endif } #endif // HWY_TARGET != HWY_SCALAR #if HWY_TARGET == HWY_SCALAR template using F32ExpLzcntMinMaxRepartition = RebindToUnsigned; #elif HWY_TARGET >= HWY_SSSE3 && HWY_TARGET <= HWY_SSE2 template using F32ExpLzcntMinMaxRepartition = Repartition; #else template using F32ExpLzcntMinMaxRepartition = Repartition), 4)>, D>; #endif template using F32ExpLzcntMinMaxCmpV = VFromD>>; template HWY_INLINE F32ExpLzcntMinMaxCmpV F32ExpLzcntMinMaxBitCast(V v) { const DFromV d; const F32ExpLzcntMinMaxRepartition d2; return BitCast(d2, v); } template HWY_INLINE VFromD UIntToF32BiasedExp(D d, VFromD v) { #if HWY_TARGET == HWY_SCALAR const uint64_t u64_val = GetLane(v); const float f32_val = static_cast(u64_val); const uint32_t f32_bits = BitCastScalar(f32_val); return Set(d, static_cast(f32_bits >> 23)); #else const Repartition du32; const auto f32_biased_exp = UIntToF32BiasedExp(du32, BitCast(du32, v)); const auto f32_biased_exp_adj = IfThenZeroElse(Eq(f32_biased_exp, Zero(du32)), BitCast(du32, Set(d, 0x0000002000000000u))); const auto adj_f32_biased_exp = Add(f32_biased_exp, f32_biased_exp_adj); return ShiftRight<32>(BitCast( d, Max(F32ExpLzcntMinMaxBitCast(adj_f32_biased_exp), F32ExpLzcntMinMaxBitCast(Reverse2(du32, adj_f32_biased_exp))))); #endif } template HWY_INLINE V UIntToF32BiasedExp(V v) { const DFromV d; return UIntToF32BiasedExp(d, v); } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { return v; } template HWY_INLINE V NormalizeForUIntTruncConvToF32(V v) { // If v[i] >= 16777216 is true, make sure that the bit at // HighestSetBitIndex(v[i]) - 24 is zeroed out to ensure that any inexact // conversion to single-precision floating point is rounded down. // This zeroing-out can be accomplished through the AndNot operation below. 
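// ShiftRight<24>(v) has a bit set exactly 24 positions below each set bit of
// v at index >= 24, and AndNot clears those bits, in particular the one at
// HighestSetBitIndex(v[i]) - 24. An F32 mantissa holds 24 significant bits,
// so the conversion can then no longer round up to the next power of two and
// overestimate the biased exponent.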
return AndNot(ShiftRight<24>(v), v); } } // namespace detail template HWY_API V HighestSetBitIndex(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); return BitCast(d, Sub(f32_biased_exp, Set(du, TU{127}))); } template HWY_API V LeadingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; using TU = TFromD; constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp( detail::NormalizeForUIntTruncConvToF32(BitCast(du, v))); const auto lz_count = Sub(Set(du, TU{kNumOfBitsInT + 126}), f32_biased_exp); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(lz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } template HWY_API V TrailingZeroCount(V v) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; using TU = TFromD; const auto vi = BitCast(di, v); const auto lowest_bit = BitCast(du, And(vi, Neg(vi))); constexpr TU kNumOfBitsInT{sizeof(TU) * 8}; const auto f32_biased_exp = detail::UIntToF32BiasedExp(lowest_bit); const auto tz_count = Sub(f32_biased_exp, Set(du, TU{127})); return BitCast(d, Min(detail::F32ExpLzcntMinMaxBitCast(tz_count), detail::F32ExpLzcntMinMaxBitCast(Set(du, kNumOfBitsInT)))); } #endif // HWY_NATIVE_LEADING_ZERO_COUNT // ------------------------------ AESRound // Cannot implement on scalar: need at least 16 bytes for TableLookupBytes. #if HWY_TARGET != HWY_SCALAR || HWY_IDE // Define for white-box testing, even if native instructions are available. namespace detail { // Constant-time: computes inverse in GF(2^4) based on "Accelerating AES with // Vector Permute Instructions" and the accompanying assembly language // implementation: https://crypto.stanford.edu/vpaes/vpaes.tgz. See also Botan: // https://botan.randombit.net/doxygen/aes__vperm_8cpp_source.html . // // A brute-force 256 byte table lookup can also be made constant-time, and // possibly competitive on NEON, but this is more performance-portable // especially for x86 and large vectors. template // u8 HWY_INLINE V SubBytesMulInverseAndAffineLookup(V state, V affine_tblL, V affine_tblU) { const DFromV du; const auto mask = Set(du, uint8_t{0xF}); // Change polynomial basis to GF(2^4) { const VFromD basisL = Dup128VecFromValues(du, 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2, 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA); const VFromD basisU = Dup128VecFromValues(du, 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C, 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD); const auto sL = And(state, mask); const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto gf4L = TableLookupBytes(basisL, sL); const auto gf4U = TableLookupBytes(basisU, sU); state = Xor(gf4L, gf4U); } // Inversion in GF(2^4). Elements 0 represent "infinity" (division by 0) and // cause TableLookupBytesOr0 to return 0. 
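// (The 0x80 entries at index 0 of the tables below are that infinity marker:
// TableLookupBytesOr0 returns 0 for any lane whose index byte has its most
// significant bit set, which is how the marker becomes a zero result.)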
const VFromD zetaInv = Dup128VecFromValues( du, 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3); const VFromD tbl = Dup128VecFromValues( du, 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4); const auto sL = And(state, mask); // L=low nibble, U=upper const auto sU = ShiftRight<4>(state); // byte shift => upper bits are zero const auto sX = Xor(sU, sL); const auto invL = TableLookupBytes(zetaInv, sL); const auto invU = TableLookupBytes(tbl, sU); const auto invX = TableLookupBytes(tbl, sX); const auto outL = Xor(sX, TableLookupBytesOr0(tbl, Xor(invL, invU))); const auto outU = Xor(sU, TableLookupBytesOr0(tbl, Xor(invL, invX))); const auto affL = TableLookupBytesOr0(affine_tblL, outL); const auto affU = TableLookupBytesOr0(affine_tblU, outU); return Xor(affL, affU); } template // u8 HWY_INLINE V SubBytes(V state) { const DFromV du; // Linear skew (cannot bake 0x63 bias into the table because out* indices // may have the infinity flag set). const VFromD affineL = Dup128VecFromValues(du, 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0, 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15); const VFromD affineU = Dup128VecFromValues(du, 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF, 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E); return Xor(SubBytesMulInverseAndAffineLookup(state, affineL, affineU), Set(du, uint8_t{0x63})); } template // u8 HWY_INLINE V InvSubBytes(V state) { const DFromV du; const VFromD gF2P4InvToGF2P8InvL = Dup128VecFromValues(du, 0x00, 0x40, 0xF9, 0x7E, 0x53, 0xEA, 0x87, 0x13, 0x2D, 0x3E, 0x94, 0xD4, 0xB9, 0x6D, 0xAA, 0xC7); const VFromD gF2P4InvToGF2P8InvU = Dup128VecFromValues(du, 0x00, 0x1D, 0x44, 0x93, 0x0F, 0x56, 0xD7, 0x12, 0x9C, 0x8E, 0xC5, 0xD8, 0x59, 0x81, 0x4B, 0xCA); // Apply the inverse affine transformation const auto b = Xor(Xor3(Or(ShiftLeft<1>(state), ShiftRight<7>(state)), Or(ShiftLeft<3>(state), ShiftRight<5>(state)), Or(ShiftLeft<6>(state), ShiftRight<2>(state))), Set(du, uint8_t{0x05})); // The GF(2^8) multiplicative inverse is computed as follows: // - Changing the polynomial basis to GF(2^4) // - Computing the GF(2^4) multiplicative inverse // - Converting the GF(2^4) multiplicative inverse to the GF(2^8) // multiplicative inverse through table lookups using the // kGF2P4InvToGF2P8InvL and kGF2P4InvToGF2P8InvU tables return SubBytesMulInverseAndAffineLookup(b, gF2P4InvToGF2P8InvL, gF2P4InvToGF2P8InvU); } } // namespace detail #endif // HWY_TARGET != HWY_SCALAR #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_AES #undef HWY_NATIVE_AES #else #define HWY_NATIVE_AES #endif // (Must come after HWY_TARGET_TOGGLE, else we don't reset it for scalar) #if HWY_TARGET != HWY_SCALAR namespace detail { template // u8 HWY_INLINE V ShiftRows(const V state) { const DFromV du; // transposed: state is column major const VFromD shift_row = Dup128VecFromValues( du, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11); return TableLookupBytes(state, shift_row); } template // u8 HWY_INLINE V InvShiftRows(const V state) { const DFromV du; // transposed: state is column major const VFromD shift_row = Dup128VecFromValues( du, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3); return TableLookupBytes(state, shift_row); } template // u8 HWY_INLINE V GF2P8Mod11BMulBy2(V v) { const DFromV du; const RebindToSigned di; // can only do signed comparisons const auto msb = Lt(BitCast(di, v), Zero(di)); const auto overflow = BitCast(du, IfThenElseZero(msb, Set(di, int8_t{0x1B}))); return Xor(Add(v, v), overflow); // = v*2 in GF(2^8). 
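// Worked example: for v = 0x80, Add(v, v) wraps to 0x00 and overflow = 0x1B,
// giving 0x1B; this matches 0x80 * 2 = 0x100 reduced by the AES polynomial
// 0x11B, since 0x100 ^ 0x11B = 0x1B.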
} template // u8 HWY_INLINE V MixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 2 3 1 1 // Let s := state*1, d := state*2, t := state*3. // 1 2 3 1 // d are on diagonal, no permutation needed. // 1 1 2 3 // t1230 indicates column indices of threes for the 4 rows. // 3 1 1 2 // We also need to compute s2301 and s3012 (=1230 o 2301). const VFromD v2301 = Dup128VecFromValues( du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); const VFromD v1230 = Dup128VecFromValues( du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); const auto d = GF2P8Mod11BMulBy2(state); // = state*2 in GF(2^8). const auto s2301 = TableLookupBytes(state, v2301); const auto d_s2301 = Xor(d, s2301); const auto t_s2301 = Xor(state, d_s2301); // t(s*3) = XOR-sum {s, d(s*2)} const auto t1230_s3012 = TableLookupBytes(t_s2301, v1230); return Xor(d_s2301, t1230_s3012); // XOR-sum of 4 terms } template // u8 HWY_INLINE V InvMixColumns(const V state) { const DFromV du; // For each column, the rows are the sum of GF(2^8) matrix multiplication by: // 14 11 13 9 // 9 14 11 13 // 13 9 14 11 // 11 13 9 14 const VFromD v2301 = Dup128VecFromValues( du, 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13); const VFromD v1230 = Dup128VecFromValues( du, 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12); const auto sx2 = GF2P8Mod11BMulBy2(state); /* = state*2 in GF(2^8) */ const auto sx4 = GF2P8Mod11BMulBy2(sx2); /* = state*4 in GF(2^8) */ const auto sx8 = GF2P8Mod11BMulBy2(sx4); /* = state*8 in GF(2^8) */ const auto sx9 = Xor(sx8, state); /* = state*9 in GF(2^8) */ const auto sx11 = Xor(sx9, sx2); /* = state*11 in GF(2^8) */ const auto sx13 = Xor(sx9, sx4); /* = state*13 in GF(2^8) */ const auto sx14 = Xor3(sx8, sx4, sx2); /* = state*14 in GF(2^8) */ const auto sx13_0123_sx9_1230 = Xor(sx13, TableLookupBytes(sx9, v1230)); const auto sx14_0123_sx11_1230 = Xor(sx14, TableLookupBytes(sx11, v1230)); const auto sx13_2301_sx9_3012 = TableLookupBytes(sx13_0123_sx9_1230, v2301); return Xor(sx14_0123_sx11_1230, sx13_2301_sx9_3012); } } // namespace detail template // u8 HWY_API V AESRound(V state, const V round_key) { // Intel docs swap the first two steps, but it does not matter because // ShiftRows is a permutation and SubBytes is independent of lane index. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = detail::MixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRound(V state, const V round_key) { // LIke AESRound, but without MixColumns. state = detail::SubBytes(state); state = detail::ShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template HWY_API V AESInvMixColumns(V state) { return detail::InvMixColumns(state); } template // u8 HWY_API V AESRoundInv(V state, const V round_key) { state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = detail::InvMixColumns(state); state = Xor(state, round_key); // AddRoundKey return state; } template // u8 HWY_API V AESLastRoundInv(V state, const V round_key) { // Like AESRoundInv, but without InvMixColumns. 
state = detail::InvSubBytes(state); state = detail::InvShiftRows(state); state = Xor(state, round_key); // AddRoundKey return state; } template )> HWY_API V AESKeyGenAssist(V v) { const DFromV d; const V rconXorMask = Dup128VecFromValues(d, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0); const V rotWordShuffle = Dup128VecFromValues(d, 4, 5, 6, 7, 5, 6, 7, 4, 12, 13, 14, 15, 13, 14, 15, 12); const auto sub_word_result = detail::SubBytes(v); const auto rot_word_result = TableLookupBytes(sub_word_result, rotWordShuffle); return Xor(rot_word_result, rconXorMask); } // Constant-time implementation inspired by // https://www.bearssl.org/constanttime.html, but about half the cost because we // use 64x64 multiplies and 128-bit XORs. template HWY_API V CLMulLower(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulEven(a0, b0), MulEven(a1, b3)); auto m1 = Xor(MulEven(a0, b1), MulEven(a1, b0)); auto m2 = Xor(MulEven(a0, b2), MulEven(a1, b1)); auto m3 = Xor(MulEven(a0, b3), MulEven(a1, b2)); m0 = Xor(m0, Xor(MulEven(a2, b2), MulEven(a3, b1))); m1 = Xor(m1, Xor(MulEven(a2, b3), MulEven(a3, b2))); m2 = Xor(m2, Xor(MulEven(a2, b0), MulEven(a3, b3))); m3 = Xor(m3, Xor(MulEven(a2, b1), MulEven(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } template HWY_API V CLMulUpper(V a, V b) { const DFromV d; static_assert(IsSame, uint64_t>(), "V must be u64"); const auto k1 = Set(d, 0x1111111111111111ULL); const auto k2 = Set(d, 0x2222222222222222ULL); const auto k4 = Set(d, 0x4444444444444444ULL); const auto k8 = Set(d, 0x8888888888888888ULL); const auto a0 = And(a, k1); const auto a1 = And(a, k2); const auto a2 = And(a, k4); const auto a3 = And(a, k8); const auto b0 = And(b, k1); const auto b1 = And(b, k2); const auto b2 = And(b, k4); const auto b3 = And(b, k8); auto m0 = Xor(MulOdd(a0, b0), MulOdd(a1, b3)); auto m1 = Xor(MulOdd(a0, b1), MulOdd(a1, b0)); auto m2 = Xor(MulOdd(a0, b2), MulOdd(a1, b1)); auto m3 = Xor(MulOdd(a0, b3), MulOdd(a1, b2)); m0 = Xor(m0, Xor(MulOdd(a2, b2), MulOdd(a3, b1))); m1 = Xor(m1, Xor(MulOdd(a2, b3), MulOdd(a3, b2))); m2 = Xor(m2, Xor(MulOdd(a2, b0), MulOdd(a3, b3))); m3 = Xor(m3, Xor(MulOdd(a2, b1), MulOdd(a3, b0))); return Or(Or(And(m0, k1), And(m1, k2)), Or(And(m2, k4), And(m3, k8))); } #endif // HWY_NATIVE_AES #endif // HWY_TARGET != HWY_SCALAR // ------------------------------ PopulationCount #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif // This overload requires vectors to be at least 16 bytes, which is the case // for LMUL >= 2. #undef HWY_IF_POPCNT #if HWY_TARGET == HWY_RVV #define HWY_IF_POPCNT(D) \ hwy::EnableIf= 1 && D().MaxLanes() >= 16>* = nullptr #else // Other targets only have these two overloads which are mutually exclusive, so // no further conditions are required. 
#define HWY_IF_POPCNT(D) void* = nullptr #endif // HWY_TARGET == HWY_RVV template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_GT_D(D, 8), HWY_IF_POPCNT(D)> HWY_API V PopulationCount(V v) { const D d; const V lookup = Dup128VecFromValues(d, 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); const auto lo = And(v, Set(d, uint8_t{0xF})); const auto hi = ShiftRight<4>(v); return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); } // RVV has a specialization that avoids the Set(). #if HWY_TARGET != HWY_RVV // Slower fallback for capped vectors. template , HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API V PopulationCount(V v) { const D d; // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3 const V k33 = Set(d, uint8_t{0x33}); v = Sub(v, And(ShiftRight<1>(v), Set(d, uint8_t{0x55}))); v = Add(And(ShiftRight<2>(v), k33), And(v, k33)); return And(Add(v, ShiftRight<4>(v)), Set(d, uint8_t{0x0F})); } #endif // HWY_TARGET != HWY_RVV template , HWY_IF_U16_D(D)> HWY_API V PopulationCount(V v) { const D d; const Repartition d8; const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); return Add(ShiftRight<8>(vals), And(vals, Set(d, uint16_t{0xFF}))); } template , HWY_IF_U32_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d16; auto vals = BitCast(d, PopulationCount(BitCast(d16, v))); return Add(ShiftRight<16>(vals), And(vals, Set(d, uint32_t{0xFF}))); } #if HWY_HAVE_INTEGER64 template , HWY_IF_U64_D(D)> HWY_API V PopulationCount(V v) { const D d; Repartition d32; auto vals = BitCast(d, PopulationCount(BitCast(d32, v))); return Add(ShiftRight<32>(vals), And(vals, Set(d, 0xFFULL))); } #endif #endif // HWY_NATIVE_POPCNT // ------------------------------ 8-bit multiplication #if (defined(HWY_NATIVE_MUL_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_8 #undef HWY_NATIVE_MUL_8 #else #define HWY_NATIVE_MUL_8 #endif // 8 bit and fits in wider reg: promote template HWY_API V operator*(const V a, const V b) { const DFromV d; const Rebind>, decltype(d)> dw; const RebindToUnsigned du; // TruncateTo result const RebindToUnsigned dwu; // TruncateTo input const VFromD mul = PromoteTo(dw, a) * PromoteTo(dw, b); // TruncateTo is cheaper than ConcatEven. 
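// Worked example (illustrative, not part of the original comments): for u8
// lanes a = 200 and b = 3, the promoted 16-bit product is 600; TruncateTo
// keeps only the low byte, and 600 mod 256 = 88 matches the modulo-2^8
// wraparound expected of 8-bit multiplication.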
return BitCast(d, TruncateTo(du, BitCast(dwu, mul))); } // 8 bit full reg: promote halves template HWY_API V operator*(const V a, const V b) { const DFromV d; const Half dh; const Twice> dw; const VFromD a0 = PromoteTo(dw, LowerHalf(dh, a)); const VFromD a1 = PromoteTo(dw, UpperHalf(dh, a)); const VFromD b0 = PromoteTo(dw, LowerHalf(dh, b)); const VFromD b1 = PromoteTo(dw, UpperHalf(dh, b)); const VFromD m0 = a0 * b0; const VFromD m1 = a1 * b1; return ConcatEven(d, BitCast(d, m1), BitCast(d, m0)); } #endif // HWY_NATIVE_MUL_8 // ------------------------------ 64-bit multiplication #if (defined(HWY_NATIVE_MUL_64) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_MUL_64 #undef HWY_NATIVE_MUL_64 #else #define HWY_NATIVE_MUL_64 #endif // Single-lane i64 or u64 template HWY_API V operator*(V x, V y) { const DFromV d; using T = TFromD; using TU = MakeUnsigned; const TU xu = static_cast(GetLane(x)); const TU yu = static_cast(GetLane(y)); return Set(d, static_cast(xu * yu)); } template , HWY_IF_U64_D(D64), HWY_IF_V_SIZE_GT_D(D64, 8)> HWY_API V operator*(V x, V y) { RepartitionToNarrow d32; auto x32 = BitCast(d32, x); auto y32 = BitCast(d32, y); auto lolo = BitCast(d32, MulEven(x32, y32)); auto lohi = BitCast(d32, MulEven(x32, BitCast(d32, ShiftRight<32>(y)))); auto hilo = BitCast(d32, MulEven(BitCast(d32, ShiftRight<32>(x)), y32)); auto hi = BitCast(d32, ShiftLeft<32>(BitCast(D64{}, lohi + hilo))); return BitCast(D64{}, lolo + hi); } template , HWY_IF_I64_D(DI64), HWY_IF_V_SIZE_GT_D(DI64, 8)> HWY_API V operator*(V x, V y) { RebindToUnsigned du64; return BitCast(DI64{}, BitCast(du64, x) * BitCast(du64, y)); } #endif // HWY_NATIVE_MUL_64 // ------------------------------ MulAdd / NegMulAdd #if (defined(HWY_NATIVE_INT_FMA) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMA #undef HWY_NATIVE_INT_FMA #else #define HWY_NATIVE_INT_FMA #endif #ifdef HWY_NATIVE_INT_FMSUB #undef HWY_NATIVE_INT_FMSUB #else #define HWY_NATIVE_INT_FMSUB #endif template HWY_API V MulAdd(V mul, V x, V add) { return Add(Mul(mul, x), add); } template HWY_API V NegMulAdd(V mul, V x, V add) { return Sub(add, Mul(mul, x)); } template HWY_API V MulSub(V mul, V x, V sub) { return Sub(Mul(mul, x), sub); } #endif // HWY_NATIVE_INT_FMA // ------------------------------ Integer MulSub / NegMulSub #if (defined(HWY_NATIVE_INT_FMSUB) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_FMSUB #undef HWY_NATIVE_INT_FMSUB #else #define HWY_NATIVE_INT_FMSUB #endif template HWY_API V MulSub(V mul, V x, V sub) { const DFromV d; const RebindToSigned di; return MulAdd(mul, x, BitCast(d, Neg(BitCast(di, sub)))); } #endif // HWY_NATIVE_INT_FMSUB template HWY_API V NegMulSub(V mul, V x, V sub) { const DFromV d; const RebindToSigned di; return BitCast(d, Neg(BitCast(di, MulAdd(mul, x, sub)))); } // ------------------------------ MulAddSub // MulAddSub(mul, x, sub_or_add) for a 1-lane vector is equivalent to // MulSub(mul, x, sub_or_add) template , 1)> HWY_API V MulAddSub(V mul, V x, V sub_or_add) { return MulSub(mul, x, sub_or_add); } // MulAddSub for F16/F32/F64 vectors with 2 or more lanes on // SSSE3/SSE4/AVX2/AVX3 is implemented in x86_128-inl.h, x86_256-inl.h, and // x86_512-inl.h template , 1), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | ((HWY_TARGET <= HWY_SSSE3 && hwy::IsFloat>()) ? 
0 : ((1 << 2) | (1 << 4) | (1 << 8))))> HWY_API V MulAddSub(V mul, V x, V sub_or_add) { using D = DFromV; using T = TFromD; using TNegate = If(), MakeSigned, T>; const D d; const Rebind d_negate; const auto add = OddEven(sub_or_add, BitCast(d, Neg(BitCast(d_negate, sub_or_add)))); return MulAdd(mul, x, add); } // ------------------------------ Integer division #if (defined(HWY_NATIVE_INT_DIV) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_INT_DIV #undef HWY_NATIVE_INT_DIV #else #define HWY_NATIVE_INT_DIV #endif namespace detail { template ))> HWY_INLINE Vec IntDivConvFloatToInt(D di, V vf) { return ConvertTo(di, vf); } template ))> HWY_INLINE Vec IntDivConvIntToFloat(D df, V vi) { return ConvertTo(df, vi); } #if !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 template )> HWY_INLINE Vec IntDivConvFloatToInt(D df, V vi) { return PromoteTo(df, vi); } // If !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 is true, then UI64->F32 // IntDivConvIntToFloat(df, vi) returns an approximation of // static_cast(v[i]) that is within 4 ULP of static_cast(v[i]) template )> HWY_INLINE Vec IntDivConvIntToFloat(D df32, V vi) { const Twice dt_f32; auto vf32 = ConvertTo(dt_f32, BitCast(RebindToSigned(), vi)); #if HWY_IS_LITTLE_ENDIAN const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); #else const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); #endif const RebindToSigned di32; hi_f32 = Add(hi_f32, And(BitCast(df32, BroadcastSignBit(BitCast(di32, lo_f32))), Set(df32, 1.0f))); return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); } template )> HWY_INLINE Vec IntDivConvIntToFloat(D df32, V vu) { const Twice dt_f32; auto vf32 = ConvertTo(dt_f32, BitCast(RebindToUnsigned(), vu)); #if HWY_IS_LITTLE_ENDIAN const auto lo_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); const auto hi_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); #else const auto lo_f32 = LowerHalf(df32, ConcatOdd(dt_f32, vf32, vf32)); const auto hi_f32 = LowerHalf(df32, ConcatEven(dt_f32, vf32, vf32)); #endif return hwy::HWY_NAMESPACE::MulAdd(hi_f32, Set(df32, 4294967296.0f), lo_f32); } #endif // !HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 template , kOrigLaneSize)> HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { const DFromV d; const RebindToFloat df; // If kOrigLaneSize < sizeof(T) is true, then a[i] and b[i] are both in the // [LimitsMin>(), // LimitsMax>()] range. // floor(|a[i] / b[i]|) <= |flt_q| < floor(|a[i] / b[i]|) + 1 is also // guaranteed to be true if MakeFloat has at least kOrigLaneSize*8 + 1 // mantissa bits (including the implied one bit), where flt_q is equal to // static_cast>(a[i]) / static_cast>(b[i]), // even in the case where the magnitude of an inexact floating point division // result is rounded up. // In other words, floor(flt_q) < flt_q < ceil(flt_q) is guaranteed to be true // if (a[i] % b[i]) != 0 is true and MakeFloat has at least // kOrigLaneSize*8 + 1 mantissa bits (including the implied one bit), even in // the case where the magnitude of an inexact floating point division result // is rounded up. 
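// Worked example (illustrative, not from the original source): with
// kOrigLaneSize == 2 promoted to T == int32_t, a[i] = -32768 and b[i] = 7
// give flt_q = -4681.142857...; float has 24 mantissa bits >= 2*8 + 1, so
// truncation toward zero yields -4681, matching C++ integer division.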
#if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \ !HWY_HAVE_FLOAT64 // On Armv7, do division by multiplying by the ApproximateReciprocal // to avoid unnecessary overhead as F32 Div refines the approximate // reciprocal using 4 Newton-Raphson iterations const RebindToSigned di; const RebindToUnsigned du; const auto flt_b = ConvertTo(df, b); auto flt_recip_b = ApproximateReciprocal(flt_b); if (kOrigLaneSize > 1) { flt_recip_b = Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); } auto q0 = ConvertTo(d, Mul(ConvertTo(df, a), flt_recip_b)); const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a)); auto r1 = r0; // Need to negate r1[i] if a[i] < 0 is true if (IsSigned>()) { r1 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r1); } // r1[i] is now equal to (a[i] < 0) ? (-r0[i]) : r0[i] auto abs_b = BitCast(du, b); if (IsSigned>()) { abs_b = BitCast(du, Abs(BitCast(di, abs_b))); } // If (r1[i] < 0 || r1[i] >= abs_b[i]) is true, then set q1[i] to -1. // Otherwise, set q1[i] to 0. // (r1[i] < 0 || r1[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast(r1[i]) >= TU(LimitsMax() + 1) >= abs_b[i] // will be true if r1[i] < 0 is true. auto q1 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r1), abs_b))); // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? -1 : 0 // Need to negate q1[i] if r0[i] and b[i] do not have the same sign auto q1_negate_mask = r0; if (IsSigned>()) { q1_negate_mask = Xor(q1_negate_mask, BitCast(di, b)); } q1 = IfNegativeThenElse(q1_negate_mask, Neg(q1), q1); // q1[i] is now equal to (r1[i] < 0 || r1[i] >= abs_b[i]) ? // (((r0[i] ^ b[i]) < 0) ? 1 : -1) // Need to subtract q1[i] from q0[i] to get the final result return Sub(q0, BitCast(d, q1)); #else // On targets other than Armv7 NEON, use F16 or F32 division as most targets // other than Armv7 NEON have native F32 divide instructions return ConvertTo(d, Div(ConvertTo(df, a), ConvertTo(df, b))); #endif } template , kOrigLaneSize), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 4) | (1 << 8))> HWY_INLINE V IntDivUsingFloatDiv(V a, V b) { // If kOrigLaneSize == sizeof(T) is true, at least two reciprocal // multiplication steps are needed as the mantissa of MakeFloat has fewer // than kOrigLaneSize*8 + 1 bits using T = TFromV; #if HWY_HAVE_FLOAT64 using TF = MakeFloat; #else using TF = float; #endif const DFromV d; const RebindToSigned di; const RebindToUnsigned du; const Rebind df; if (!IsSigned()) { // If T is unsigned, set a[i] to (a[i] >= b[i] ? 
1 : 0) and set b[i] to 1 if // b[i] > LimitsMax>() is true const auto one = Set(di, MakeSigned{1}); a = BitCast( d, IfNegativeThenElse(BitCast(di, b), IfThenElseZero(RebindMask(di, Ge(a, b)), one), BitCast(di, a))); b = BitCast(d, IfNegativeThenElse(BitCast(di, b), one, BitCast(di, b))); } // LimitsMin() <= b[i] <= LimitsMax>() is now true const auto flt_b = IntDivConvIntToFloat(df, b); #if (HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES) && \ !HWY_HAVE_FLOAT64 auto flt_recip_b = ApproximateReciprocal(flt_b); flt_recip_b = Mul(flt_recip_b, ReciprocalNewtonRaphsonStep(flt_recip_b, flt_b)); #else const auto flt_recip_b = Div(Set(df, TF(1.0)), flt_b); #endif auto q0 = IntDivConvFloatToInt(d, Mul(IntDivConvIntToFloat(df, a), flt_recip_b)); const auto r0 = BitCast(di, hwy::HWY_NAMESPACE::NegMulAdd(q0, b, a)); auto q1 = IntDivConvFloatToInt(di, Mul(IntDivConvIntToFloat(df, r0), flt_recip_b)); const auto r1 = hwy::HWY_NAMESPACE::NegMulAdd(q1, BitCast(di, b), r0); auto r3 = r1; #if !HWY_HAVE_FLOAT64 // Need two additional reciprocal multiplication steps for I64/U64 vectors if // HWY_HAVE_FLOAT64 is 0 if (sizeof(T) == 8) { const auto q2 = IntDivConvFloatToInt( di, Mul(IntDivConvIntToFloat(df, r1), flt_recip_b)); const auto r2 = hwy::HWY_NAMESPACE::NegMulAdd(q2, BitCast(di, b), r1); const auto q3 = IntDivConvFloatToInt( di, Mul(IntDivConvIntToFloat(df, r2), flt_recip_b)); r3 = hwy::HWY_NAMESPACE::NegMulAdd(q3, BitCast(di, b), r2); q0 = Add(q0, BitCast(d, q2)); q1 = Add(q1, q3); } #endif // !HWY_HAVE_FLOAT64 auto r4 = r3; // Need to negate r4[i] if a[i] < 0 is true if (IsSigned>()) { r4 = IfNegativeThenNegOrUndefIfZero(BitCast(di, a), r4); } // r4[i] is now equal to (a[i] < 0) ? (-r3[i]) : r3[i] auto abs_b = BitCast(du, b); if (IsSigned>()) { abs_b = BitCast(du, Abs(BitCast(di, abs_b))); } // If (r4[i] < 0 || r4[i] >= abs_b[i]) is true, then set q4[i] to -1. // Otherwise, set r4[i] to 0. // (r4[i] < 0 || r4[i] >= abs_b[i]) can be carried out using a single unsigned // comparison as static_cast(r4[i]) >= TU(LimitsMax() + 1) >= abs_b[i] // will be true if r4[i] < 0 is true. auto q4 = BitCast(di, VecFromMask(du, Ge(BitCast(du, r4), abs_b))); // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? -1 : 0 // Need to negate q4[i] if r3[i] and b[i] do not have the same sign auto q4_negate_mask = r3; if (IsSigned>()) { q4_negate_mask = Xor(q4_negate_mask, BitCast(di, b)); } q4 = IfNegativeThenElse(q4_negate_mask, Neg(q4), q4); // q4[i] is now equal to (r4[i] < 0 || r4[i] >= abs_b[i]) ? // (((r3[i] ^ b[i]) < 0) ? 1 : -1) // The final result is equal to q0[i] + q1[i] - q4[i] return Sub(Add(q0, BitCast(d, q1)), BitCast(d, q4)); } template ) == 1) ? 
4 : 2))> HWY_INLINE V IntDiv(V a, V b) { using T = TFromV; // If HWY_HAVE_FLOAT16 is 0, need to promote I8 to I32 and U8 to U32 using TW = MakeWide< If<(!HWY_HAVE_FLOAT16 && sizeof(TFromV) == 1), MakeWide, T>>; const DFromV d; const Rebind dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned to avoid // unnecessary overhead const RebindToSigned dw_i; // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned if // kOrigLaneSize < sizeof(T) to avoid unnecessary overhead const If<(kOrigLaneSize < sizeof(T)), RebindToSigned, decltype(d)> d_demote_to; #else // On other targets, promote to TW and demote to T const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif return BitCast( d, DemoteTo(d_demote_to, IntDivUsingFloatDiv( PromoteTo(dw_i, a), PromoteTo(dw_i, b)))); } template HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const RepartitionToWide dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3/SSE4/AVX2/AVX3, promote to and from MakeSigned to avoid // unnecessary overhead const RebindToSigned dw_i; // On SSE2/SSSE3/SSE4/AVX2/AVX3, demote to MakeSigned> if // kOrigLaneSize < sizeof(TFromV) to avoid unnecessary overhead const If<(kOrigLaneSize < sizeof(TFromV)), RebindToSigned, decltype(d)> d_demote_to; #else // On other targets, promote to MakeWide> and demote to TFromV const decltype(dw) dw_i; const decltype(d) d_demote_to; #endif return BitCast(d, OrderedDemote2To( d_demote_to, IntDivUsingFloatDiv( PromoteLowerTo(dw_i, a), PromoteLowerTo(dw_i, b)), IntDivUsingFloatDiv( PromoteUpperTo(dw_i, a), PromoteUpperTo(dw_i, b)))); } #if !HWY_HAVE_FLOAT16 template ), HWY_IF_V_SIZE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Rebind>, decltype(d)> dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV to avoid unnecessary // overhead const RebindToSigned dw_i; #else // On other targets, demote from MakeWide> to TFromV const decltype(dw) dw_i; #endif return DemoteTo(d, BitCast(dw_i, IntDiv<1>(PromoteTo(dw, a), PromoteTo(dw, b)))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const RepartitionToWide dw; #if HWY_TARGET <= HWY_SSE2 // On SSE2/SSSE3, demote from int16_t to TFromV to avoid unnecessary // overhead const RebindToSigned dw_i; #else // On other targets, demote from MakeWide> to TFromV const decltype(dw) dw_i; #endif return OrderedDemote2To( d, BitCast(dw_i, IntDiv<1>(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b))), BitCast(dw_i, IntDiv<1>(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b)))); } #endif // !HWY_HAVE_FLOAT16 template HWY_INLINE V IntDiv(V a, V b) { return IntDivUsingFloatDiv(a, b); } #if HWY_HAVE_FLOAT64 template ), HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Rebind df64; return DemoteTo(d, Div(PromoteTo(df64, a), PromoteTo(df64, b))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntDiv(V a, V b) { const DFromV d; const Half dh; const Repartition df64; return Combine( d, DemoteTo(dh, Div(PromoteUpperTo(df64, a), PromoteUpperTo(df64, b))), DemoteTo(dh, Div(PromoteLowerTo(df64, a), PromoteLowerTo(df64, b)))); } #endif // HWY_HAVE_FLOAT64 template HWY_INLINE V IntMod(V a, V b) { return hwy::HWY_NAMESPACE::NegMulAdd(IntDiv(a, b), b, a); } #if HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || \ HWY_TARGET == HWY_WASM_EMU256 template ), HWY_IF_V_SIZE_LE_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntMod(V a, V b) { const DFromV d; const Rebind>, decltype(d)> dw; 
return DemoteTo(d, IntMod(PromoteTo(dw, a), PromoteTo(dw, b))); } template ), HWY_IF_V_SIZE_GT_V(V, HWY_MAX_BYTES / 2)> HWY_INLINE V IntMod(V a, V b) { const DFromV d; const RepartitionToWide dw; return OrderedDemote2To( d, IntMod(PromoteLowerTo(dw, a), PromoteLowerTo(dw, b)), IntMod(PromoteUpperTo(dw, a), PromoteUpperTo(dw, b))); } #endif // HWY_TARGET <= HWY_SSE2 || HWY_TARGET == HWY_WASM || HWY_TARGET == // HWY_WASM_EMU256 } // namespace detail #if HWY_TARGET == HWY_SCALAR template HWY_API Vec1 operator/(Vec1 a, Vec1 b) { return detail::IntDiv(a, b); } template HWY_API Vec1 operator%(Vec1 a, Vec1 b) { return detail::IntMod(a, b); } #else // HWY_TARGET != HWY_SCALAR template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { return detail::IntDiv(a, b); } template HWY_API Vec128 operator%(Vec128 a, Vec128 b) { return detail::IntMod(a, b); } #if HWY_CAP_GE256 template HWY_API Vec256 operator/(Vec256 a, Vec256 b) { return detail::IntDiv(a, b); } template HWY_API Vec256 operator%(Vec256 a, Vec256 b) { return detail::IntMod(a, b); } #endif #if HWY_CAP_GE512 template HWY_API Vec512 operator/(Vec512 a, Vec512 b) { return detail::IntDiv(a, b); } template HWY_API Vec512 operator%(Vec512 a, Vec512 b) { return detail::IntMod(a, b); } #endif #endif // HWY_TARGET == HWY_SCALAR #endif // HWY_NATIVE_INT_DIV // ------------------------------ SatWidenMulPairwiseAdd #if (defined(HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #undef HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #else #define HWY_NATIVE_U8_I8_SATWIDENMULPAIRWISEADD #endif template >, HWY_IF_I16_D(DI16), HWY_IF_U8_D(DFromV), HWY_IF_I8_D(DFromV), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VI8)), HWY_IF_LANES_D(DFromV, HWY_MAX_LANES_V(VU8_2))> HWY_API Vec SatWidenMulPairwiseAdd(DI16 di16, VU8 a, VI8 b) { const RebindToUnsigned du16; const auto a0 = BitCast(di16, PromoteEvenTo(du16, a)); const auto b0 = PromoteEvenTo(di16, b); const auto a1 = BitCast(di16, PromoteOddTo(du16, a)); const auto b1 = PromoteOddTo(di16, b); return SaturatedAdd(Mul(a0, b0), Mul(a1, b1)); } #endif // ------------------------------ SumOfMulQuadAccumulate #if (defined(HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate(DI32 di32, VFromD> a, VFromD> b, VFromD sum) { const Repartition di16; const auto a0 = PromoteEvenTo(di16, a); const auto b0 = PromoteEvenTo(di16, b); const auto a1 = PromoteOddTo(di16, a); const auto b1 = PromoteOddTo(di16, b); return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), WidenMulPairwiseAdd(di32, a1, b1))); } #endif #if (defined(HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DU32 du32, VFromD> a, VFromD> b, VFromD sum) { const Repartition du16; const RebindToSigned di16; const RebindToSigned di32; const auto lo8_mask = Set(di16, int16_t{0x00FF}); const auto a0 = And(BitCast(di16, a), lo8_mask); const auto b0 = And(BitCast(di16, b), lo8_mask); const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a))); const auto b1 = BitCast(di16, ShiftRight<8>(BitCast(du16, b))); return Add(sum, Add(BitCast(du32, WidenMulPairwiseAdd(di32, a0, b0)), 
BitCast(du32, WidenMulPairwiseAdd(di32, a1, b1)))); } #endif #if (defined(HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE #endif template HWY_API VFromD SumOfMulQuadAccumulate( DI32 di32, VFromD> a_u, VFromD> b_i, VFromD sum) { const Repartition di16; const RebindToUnsigned du16; const auto a0 = And(BitCast(di16, a_u), Set(di16, int16_t{0x00FF})); const auto b0 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, b_i))); const auto a1 = BitCast(di16, ShiftRight<8>(BitCast(du16, a_u))); const auto b1 = ShiftRight<8>(BitCast(di16, b_i)); // NOTE: SatWidenMulPairwiseAdd(di16, a_u, b_i) cannot be used in // SumOfMulQuadAccumulate as it is possible for // a_u[0]*b_i[0]+a_u[1]*b_i[1] to overflow an int16_t if a_u[0], b_i[0], // a_u[1], and b_i[1] are all non-zero and b_i[0] and b_i[1] have the same // sign. return Add(sum, Add(WidenMulPairwiseAdd(di32, a0, b0), WidenMulPairwiseAdd(di32, a1, b1))); } #endif #if (defined(HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #endif #if HWY_HAVE_INTEGER64 template HWY_API VFromD SumOfMulQuadAccumulate( DI64 di64, VFromD> a, VFromD> b, VFromD sum) { const Repartition di32; // WidenMulPairwiseAdd(di32, a, b) is okay here as // a[0]*b[0]+a[1]*b[1] is between -2147418112 and 2147483648 and as // a[0]*b[0]+a[1]*b[1] can only overflow an int32_t if // a[0], b[0], a[1], and b[1] are all equal to -32768. const auto i32_pairwise_sum = WidenMulPairwiseAdd(di32, a, b); const auto i32_pairwise_sum_overflow = VecFromMask(di32, Eq(i32_pairwise_sum, Set(di32, LimitsMin()))); // The upper 32 bits of sum0 and sum1 need to be zeroed out in the case of // overflow. 
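// Worked example (illustrative): if a[0] = a[1] = b[0] = b[1] = -32768, each
// product is 2^30 and the pairwise sum 2^31 wraps to LimitsMin<int32_t>().
// When sign-extended to 64 bits below, its upper 32 bits would be all ones;
// zeroing them restores the intended value +2^31.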
const auto hi32_mask = Set(di64, static_cast(~int64_t{0xFFFFFFFF})); const auto p0_zero_out_mask = ShiftLeft<32>(BitCast(di64, i32_pairwise_sum_overflow)); const auto p1_zero_out_mask = And(BitCast(di64, i32_pairwise_sum_overflow), hi32_mask); const auto p0 = AndNot(p0_zero_out_mask, ShiftRight<32>(ShiftLeft<32>(BitCast(di64, i32_pairwise_sum)))); const auto p1 = AndNot(p1_zero_out_mask, ShiftRight<32>(BitCast(di64, i32_pairwise_sum))); return Add(sum, Add(p0, p1)); } #endif // HWY_HAVE_INTEGER64 #endif // HWY_NATIVE_I16_I16_SUMOFMULQUADACCUMULATE #if (defined(HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE) == \ defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #undef HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #else #define HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE #endif #if HWY_HAVE_INTEGER64 template HWY_API VFromD SumOfMulQuadAccumulate( DU64 du64, VFromD> a, VFromD> b, VFromD sum) { const auto u32_even_prod = MulEven(a, b); const auto u32_odd_prod = MulOdd(a, b); const auto p0 = Add(PromoteEvenTo(du64, u32_even_prod), PromoteEvenTo(du64, u32_odd_prod)); const auto p1 = Add(PromoteOddTo(du64, u32_even_prod), PromoteOddTo(du64, u32_odd_prod)); return Add(sum, Add(p0, p1)); } #endif // HWY_HAVE_INTEGER64 #endif // HWY_NATIVE_U16_U16_SUMOFMULQUADACCUMULATE // ------------------------------ F64 ApproximateReciprocal #if (defined(HWY_NATIVE_F64_APPROX_RECIP) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif #if HWY_HAVE_FLOAT64 template )> HWY_API V ApproximateReciprocal(V v) { const DFromV d; return Div(Set(d, 1.0), v); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_F64_APPROX_RECIP // ------------------------------ F64 ApproximateReciprocalSqrt #if (defined(HWY_NATIVE_F64_APPROX_RSQRT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif #if HWY_HAVE_FLOAT64 template )> HWY_API V ApproximateReciprocalSqrt(V v) { const DFromV d; const RebindToUnsigned du; const auto half = Mul(v, Set(d, 0.5)); // Initial guess based on log2(f) const auto guess = BitCast(d, Sub(Set(du, uint64_t{0x5FE6EB50C7B537A9u}), ShiftRight<1>(BitCast(du, v)))); // One Newton-Raphson iteration return Mul(guess, NegMulAdd(Mul(half, guess), guess, Set(d, 1.5))); } #endif // HWY_HAVE_FLOAT64 #endif // HWY_NATIVE_F64_APPROX_RSQRT // ------------------------------ Compress* #if (defined(HWY_NATIVE_COMPRESS8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_COMPRESS8 #undef HWY_NATIVE_COMPRESS8 #else #define HWY_NATIVE_COMPRESS8 #endif template HWY_API size_t CompressBitsStore(V v, const uint8_t* HWY_RESTRICT bits, D d, T* unaligned) { HWY_ALIGN T lanes[MaxLanes(d)]; Store(v, d, lanes); const Simd d8; T* HWY_RESTRICT pos = unaligned; HWY_ALIGN constexpr T table[2048] = { 0, 1, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 1, 0, 2, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 2, 0, 1, 3, 4, 5, 6, 7, /**/ 0, 2, 1, 3, 4, 5, 6, 7, // 1, 2, 0, 3, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 3, 0, 1, 2, 4, 5, 6, 7, /**/ 0, 3, 1, 2, 4, 5, 6, 7, // 1, 3, 0, 2, 4, 5, 6, 7, /**/ 0, 1, 3, 2, 4, 5, 6, 7, // 2, 3, 0, 1, 4, 5, 6, 7, /**/ 0, 2, 3, 1, 4, 5, 6, 7, // 1, 2, 3, 0, 4, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 4, 0, 1, 2, 3, 5, 6, 7, /**/ 0, 4, 1, 2, 3, 5, 6, 7, // 1, 4, 0, 2, 3, 5, 6, 7, /**/ 0, 1, 4, 2, 3, 5, 6, 7, // 2, 4, 0, 1, 3, 5, 6, 7, /**/ 0, 2, 4, 1, 3, 5, 6, 7, // 1, 2, 4, 0, 3, 5, 6, 7, /**/ 0, 1, 
2, 4, 3, 5, 6, 7, // 3, 4, 0, 1, 2, 5, 6, 7, /**/ 0, 3, 4, 1, 2, 5, 6, 7, // 1, 3, 4, 0, 2, 5, 6, 7, /**/ 0, 1, 3, 4, 2, 5, 6, 7, // 2, 3, 4, 0, 1, 5, 6, 7, /**/ 0, 2, 3, 4, 1, 5, 6, 7, // 1, 2, 3, 4, 0, 5, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 5, 0, 1, 2, 3, 4, 6, 7, /**/ 0, 5, 1, 2, 3, 4, 6, 7, // 1, 5, 0, 2, 3, 4, 6, 7, /**/ 0, 1, 5, 2, 3, 4, 6, 7, // 2, 5, 0, 1, 3, 4, 6, 7, /**/ 0, 2, 5, 1, 3, 4, 6, 7, // 1, 2, 5, 0, 3, 4, 6, 7, /**/ 0, 1, 2, 5, 3, 4, 6, 7, // 3, 5, 0, 1, 2, 4, 6, 7, /**/ 0, 3, 5, 1, 2, 4, 6, 7, // 1, 3, 5, 0, 2, 4, 6, 7, /**/ 0, 1, 3, 5, 2, 4, 6, 7, // 2, 3, 5, 0, 1, 4, 6, 7, /**/ 0, 2, 3, 5, 1, 4, 6, 7, // 1, 2, 3, 5, 0, 4, 6, 7, /**/ 0, 1, 2, 3, 5, 4, 6, 7, // 4, 5, 0, 1, 2, 3, 6, 7, /**/ 0, 4, 5, 1, 2, 3, 6, 7, // 1, 4, 5, 0, 2, 3, 6, 7, /**/ 0, 1, 4, 5, 2, 3, 6, 7, // 2, 4, 5, 0, 1, 3, 6, 7, /**/ 0, 2, 4, 5, 1, 3, 6, 7, // 1, 2, 4, 5, 0, 3, 6, 7, /**/ 0, 1, 2, 4, 5, 3, 6, 7, // 3, 4, 5, 0, 1, 2, 6, 7, /**/ 0, 3, 4, 5, 1, 2, 6, 7, // 1, 3, 4, 5, 0, 2, 6, 7, /**/ 0, 1, 3, 4, 5, 2, 6, 7, // 2, 3, 4, 5, 0, 1, 6, 7, /**/ 0, 2, 3, 4, 5, 1, 6, 7, // 1, 2, 3, 4, 5, 0, 6, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 6, 0, 1, 2, 3, 4, 5, 7, /**/ 0, 6, 1, 2, 3, 4, 5, 7, // 1, 6, 0, 2, 3, 4, 5, 7, /**/ 0, 1, 6, 2, 3, 4, 5, 7, // 2, 6, 0, 1, 3, 4, 5, 7, /**/ 0, 2, 6, 1, 3, 4, 5, 7, // 1, 2, 6, 0, 3, 4, 5, 7, /**/ 0, 1, 2, 6, 3, 4, 5, 7, // 3, 6, 0, 1, 2, 4, 5, 7, /**/ 0, 3, 6, 1, 2, 4, 5, 7, // 1, 3, 6, 0, 2, 4, 5, 7, /**/ 0, 1, 3, 6, 2, 4, 5, 7, // 2, 3, 6, 0, 1, 4, 5, 7, /**/ 0, 2, 3, 6, 1, 4, 5, 7, // 1, 2, 3, 6, 0, 4, 5, 7, /**/ 0, 1, 2, 3, 6, 4, 5, 7, // 4, 6, 0, 1, 2, 3, 5, 7, /**/ 0, 4, 6, 1, 2, 3, 5, 7, // 1, 4, 6, 0, 2, 3, 5, 7, /**/ 0, 1, 4, 6, 2, 3, 5, 7, // 2, 4, 6, 0, 1, 3, 5, 7, /**/ 0, 2, 4, 6, 1, 3, 5, 7, // 1, 2, 4, 6, 0, 3, 5, 7, /**/ 0, 1, 2, 4, 6, 3, 5, 7, // 3, 4, 6, 0, 1, 2, 5, 7, /**/ 0, 3, 4, 6, 1, 2, 5, 7, // 1, 3, 4, 6, 0, 2, 5, 7, /**/ 0, 1, 3, 4, 6, 2, 5, 7, // 2, 3, 4, 6, 0, 1, 5, 7, /**/ 0, 2, 3, 4, 6, 1, 5, 7, // 1, 2, 3, 4, 6, 0, 5, 7, /**/ 0, 1, 2, 3, 4, 6, 5, 7, // 5, 6, 0, 1, 2, 3, 4, 7, /**/ 0, 5, 6, 1, 2, 3, 4, 7, // 1, 5, 6, 0, 2, 3, 4, 7, /**/ 0, 1, 5, 6, 2, 3, 4, 7, // 2, 5, 6, 0, 1, 3, 4, 7, /**/ 0, 2, 5, 6, 1, 3, 4, 7, // 1, 2, 5, 6, 0, 3, 4, 7, /**/ 0, 1, 2, 5, 6, 3, 4, 7, // 3, 5, 6, 0, 1, 2, 4, 7, /**/ 0, 3, 5, 6, 1, 2, 4, 7, // 1, 3, 5, 6, 0, 2, 4, 7, /**/ 0, 1, 3, 5, 6, 2, 4, 7, // 2, 3, 5, 6, 0, 1, 4, 7, /**/ 0, 2, 3, 5, 6, 1, 4, 7, // 1, 2, 3, 5, 6, 0, 4, 7, /**/ 0, 1, 2, 3, 5, 6, 4, 7, // 4, 5, 6, 0, 1, 2, 3, 7, /**/ 0, 4, 5, 6, 1, 2, 3, 7, // 1, 4, 5, 6, 0, 2, 3, 7, /**/ 0, 1, 4, 5, 6, 2, 3, 7, // 2, 4, 5, 6, 0, 1, 3, 7, /**/ 0, 2, 4, 5, 6, 1, 3, 7, // 1, 2, 4, 5, 6, 0, 3, 7, /**/ 0, 1, 2, 4, 5, 6, 3, 7, // 3, 4, 5, 6, 0, 1, 2, 7, /**/ 0, 3, 4, 5, 6, 1, 2, 7, // 1, 3, 4, 5, 6, 0, 2, 7, /**/ 0, 1, 3, 4, 5, 6, 2, 7, // 2, 3, 4, 5, 6, 0, 1, 7, /**/ 0, 2, 3, 4, 5, 6, 1, 7, // 1, 2, 3, 4, 5, 6, 0, 7, /**/ 0, 1, 2, 3, 4, 5, 6, 7, // 7, 0, 1, 2, 3, 4, 5, 6, /**/ 0, 7, 1, 2, 3, 4, 5, 6, // 1, 7, 0, 2, 3, 4, 5, 6, /**/ 0, 1, 7, 2, 3, 4, 5, 6, // 2, 7, 0, 1, 3, 4, 5, 6, /**/ 0, 2, 7, 1, 3, 4, 5, 6, // 1, 2, 7, 0, 3, 4, 5, 6, /**/ 0, 1, 2, 7, 3, 4, 5, 6, // 3, 7, 0, 1, 2, 4, 5, 6, /**/ 0, 3, 7, 1, 2, 4, 5, 6, // 1, 3, 7, 0, 2, 4, 5, 6, /**/ 0, 1, 3, 7, 2, 4, 5, 6, // 2, 3, 7, 0, 1, 4, 5, 6, /**/ 0, 2, 3, 7, 1, 4, 5, 6, // 1, 2, 3, 7, 0, 4, 5, 6, /**/ 0, 1, 2, 3, 7, 4, 5, 6, // 4, 7, 0, 1, 2, 3, 5, 6, /**/ 0, 4, 7, 1, 2, 3, 5, 6, // 1, 4, 7, 0, 2, 3, 5, 6, /**/ 0, 1, 4, 7, 2, 3, 5, 6, // 2, 4, 7, 0, 1, 3, 5, 6, /**/ 0, 2, 4, 7, 1, 3, 5, 6, // 1, 2, 
4, 7, 0, 3, 5, 6, /**/ 0, 1, 2, 4, 7, 3, 5, 6, // 3, 4, 7, 0, 1, 2, 5, 6, /**/ 0, 3, 4, 7, 1, 2, 5, 6, // 1, 3, 4, 7, 0, 2, 5, 6, /**/ 0, 1, 3, 4, 7, 2, 5, 6, // 2, 3, 4, 7, 0, 1, 5, 6, /**/ 0, 2, 3, 4, 7, 1, 5, 6, // 1, 2, 3, 4, 7, 0, 5, 6, /**/ 0, 1, 2, 3, 4, 7, 5, 6, // 5, 7, 0, 1, 2, 3, 4, 6, /**/ 0, 5, 7, 1, 2, 3, 4, 6, // 1, 5, 7, 0, 2, 3, 4, 6, /**/ 0, 1, 5, 7, 2, 3, 4, 6, // 2, 5, 7, 0, 1, 3, 4, 6, /**/ 0, 2, 5, 7, 1, 3, 4, 6, // 1, 2, 5, 7, 0, 3, 4, 6, /**/ 0, 1, 2, 5, 7, 3, 4, 6, // 3, 5, 7, 0, 1, 2, 4, 6, /**/ 0, 3, 5, 7, 1, 2, 4, 6, // 1, 3, 5, 7, 0, 2, 4, 6, /**/ 0, 1, 3, 5, 7, 2, 4, 6, // 2, 3, 5, 7, 0, 1, 4, 6, /**/ 0, 2, 3, 5, 7, 1, 4, 6, // 1, 2, 3, 5, 7, 0, 4, 6, /**/ 0, 1, 2, 3, 5, 7, 4, 6, // 4, 5, 7, 0, 1, 2, 3, 6, /**/ 0, 4, 5, 7, 1, 2, 3, 6, // 1, 4, 5, 7, 0, 2, 3, 6, /**/ 0, 1, 4, 5, 7, 2, 3, 6, // 2, 4, 5, 7, 0, 1, 3, 6, /**/ 0, 2, 4, 5, 7, 1, 3, 6, // 1, 2, 4, 5, 7, 0, 3, 6, /**/ 0, 1, 2, 4, 5, 7, 3, 6, // 3, 4, 5, 7, 0, 1, 2, 6, /**/ 0, 3, 4, 5, 7, 1, 2, 6, // 1, 3, 4, 5, 7, 0, 2, 6, /**/ 0, 1, 3, 4, 5, 7, 2, 6, // 2, 3, 4, 5, 7, 0, 1, 6, /**/ 0, 2, 3, 4, 5, 7, 1, 6, // 1, 2, 3, 4, 5, 7, 0, 6, /**/ 0, 1, 2, 3, 4, 5, 7, 6, // 6, 7, 0, 1, 2, 3, 4, 5, /**/ 0, 6, 7, 1, 2, 3, 4, 5, // 1, 6, 7, 0, 2, 3, 4, 5, /**/ 0, 1, 6, 7, 2, 3, 4, 5, // 2, 6, 7, 0, 1, 3, 4, 5, /**/ 0, 2, 6, 7, 1, 3, 4, 5, // 1, 2, 6, 7, 0, 3, 4, 5, /**/ 0, 1, 2, 6, 7, 3, 4, 5, // 3, 6, 7, 0, 1, 2, 4, 5, /**/ 0, 3, 6, 7, 1, 2, 4, 5, // 1, 3, 6, 7, 0, 2, 4, 5, /**/ 0, 1, 3, 6, 7, 2, 4, 5, // 2, 3, 6, 7, 0, 1, 4, 5, /**/ 0, 2, 3, 6, 7, 1, 4, 5, // 1, 2, 3, 6, 7, 0, 4, 5, /**/ 0, 1, 2, 3, 6, 7, 4, 5, // 4, 6, 7, 0, 1, 2, 3, 5, /**/ 0, 4, 6, 7, 1, 2, 3, 5, // 1, 4, 6, 7, 0, 2, 3, 5, /**/ 0, 1, 4, 6, 7, 2, 3, 5, // 2, 4, 6, 7, 0, 1, 3, 5, /**/ 0, 2, 4, 6, 7, 1, 3, 5, // 1, 2, 4, 6, 7, 0, 3, 5, /**/ 0, 1, 2, 4, 6, 7, 3, 5, // 3, 4, 6, 7, 0, 1, 2, 5, /**/ 0, 3, 4, 6, 7, 1, 2, 5, // 1, 3, 4, 6, 7, 0, 2, 5, /**/ 0, 1, 3, 4, 6, 7, 2, 5, // 2, 3, 4, 6, 7, 0, 1, 5, /**/ 0, 2, 3, 4, 6, 7, 1, 5, // 1, 2, 3, 4, 6, 7, 0, 5, /**/ 0, 1, 2, 3, 4, 6, 7, 5, // 5, 6, 7, 0, 1, 2, 3, 4, /**/ 0, 5, 6, 7, 1, 2, 3, 4, // 1, 5, 6, 7, 0, 2, 3, 4, /**/ 0, 1, 5, 6, 7, 2, 3, 4, // 2, 5, 6, 7, 0, 1, 3, 4, /**/ 0, 2, 5, 6, 7, 1, 3, 4, // 1, 2, 5, 6, 7, 0, 3, 4, /**/ 0, 1, 2, 5, 6, 7, 3, 4, // 3, 5, 6, 7, 0, 1, 2, 4, /**/ 0, 3, 5, 6, 7, 1, 2, 4, // 1, 3, 5, 6, 7, 0, 2, 4, /**/ 0, 1, 3, 5, 6, 7, 2, 4, // 2, 3, 5, 6, 7, 0, 1, 4, /**/ 0, 2, 3, 5, 6, 7, 1, 4, // 1, 2, 3, 5, 6, 7, 0, 4, /**/ 0, 1, 2, 3, 5, 6, 7, 4, // 4, 5, 6, 7, 0, 1, 2, 3, /**/ 0, 4, 5, 6, 7, 1, 2, 3, // 1, 4, 5, 6, 7, 0, 2, 3, /**/ 0, 1, 4, 5, 6, 7, 2, 3, // 2, 4, 5, 6, 7, 0, 1, 3, /**/ 0, 2, 4, 5, 6, 7, 1, 3, // 1, 2, 4, 5, 6, 7, 0, 3, /**/ 0, 1, 2, 4, 5, 6, 7, 3, // 3, 4, 5, 6, 7, 0, 1, 2, /**/ 0, 3, 4, 5, 6, 7, 1, 2, // 1, 3, 4, 5, 6, 7, 0, 2, /**/ 0, 1, 3, 4, 5, 6, 7, 2, // 2, 3, 4, 5, 6, 7, 0, 1, /**/ 0, 2, 3, 4, 5, 6, 7, 1, // 1, 2, 3, 4, 5, 6, 7, 0, /**/ 0, 1, 2, 3, 4, 5, 6, 7}; for (size_t i = 0; i < Lanes(d); i += 8) { // Each byte worth of bits is the index of one of 256 8-byte ranges, and its // population count determines how far to advance the write position. 
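// Example (illustrative): bits8 = 0x05 selects lanes 0 and 2; the table row
// for index 5 is {0, 2, 1, 3, 4, 5, 6, 7}, so TableLookupBytes moves lanes 0
// and 2 to the front of the block, and pos advances by PopCount(0x05) = 2.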
const size_t bits8 = bits[i / 8]; const auto indices = Load(d8, table + bits8 * 8); const auto compressed = TableLookupBytes(LoadU(d8, lanes + i), indices); StoreU(compressed, d8, pos); pos += PopCount(bits8); } return static_cast(pos - unaligned); } template HWY_API size_t CompressStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { uint8_t bits[HWY_MAX(size_t{8}, MaxLanes(d) / 8)]; (void)StoreMaskBits(d, mask, bits); return CompressBitsStore(v, bits, d, unaligned); } template HWY_API size_t CompressBlendedStore(V v, M mask, D d, T* HWY_RESTRICT unaligned) { HWY_ALIGN T buf[MaxLanes(d)]; const size_t bytes = CompressStore(v, mask, d, buf); BlendedStore(Load(d, buf), FirstN(d, bytes), d, unaligned); return bytes; } // For reasons unknown, HWY_IF_T_SIZE_V is a compile error in SVE. template , HWY_IF_T_SIZE(T, 1)> HWY_API V Compress(V v, const M mask) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressStore(v, mask, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) { const DFromV d; HWY_ALIGN T lanes[MaxLanes(d)]; (void)CompressBitsStore(v, bits, d, lanes); return Load(d, lanes); } template , HWY_IF_T_SIZE(T, 1)> HWY_API V CompressNot(V v, M mask) { return Compress(v, Not(mask)); } #endif // HWY_NATIVE_COMPRESS8 // ------------------------------ Expand // Note that this generic implementation assumes <= 128 bit fixed vectors; // the SVE and RVV targets provide their own native implementations. #if (defined(HWY_NATIVE_EXPAND) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_EXPAND #undef HWY_NATIVE_EXPAND #else #define HWY_NATIVE_EXPAND #endif namespace detail { #if HWY_IDE template HWY_INLINE uint64_t BitsFromMask(M /* mask */) { return 0; } #endif // HWY_IDE template HWY_INLINE Vec128 IndicesForExpandFromBits(uint64_t mask_bits) { static_assert(N <= 8, "Should only be called for half-vectors"); const Simd du8; HWY_DASSERT(mask_bits < 0x100); alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand8x8Tables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 1, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 1, 128, 128, 128, 128, 128, // 128, 0, 1, 128, 128, 128, 128, 128, // 0, 1, 2, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 1, 128, 128, 128, 128, // 128, 0, 128, 1, 128, 128, 128, 128, // 0, 1, 128, 2, 128, 128, 128, 128, // 128, 128, 0, 1, 128, 128, 128, 128, // 0, 128, 1, 2, 128, 128, 128, 128, // 128, 0, 1, 2, 128, 128, 128, 128, // 0, 1, 2, 3, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 1, 128, 128, 128, // 128, 0, 128, 128, 1, 128, 128, 128, // 0, 1, 128, 128, 2, 128, 128, 128, // 128, 128, 0, 128, 1, 128, 128, 128, // 0, 128, 1, 128, 2, 128, 128, 128, // 128, 0, 1, 128, 2, 128, 128, 128, // 0, 1, 2, 128, 3, 128, 128, 128, // 128, 128, 128, 0, 1, 128, 128, 128, // 0, 128, 128, 1, 2, 128, 128, 128, // 128, 0, 128, 1, 2, 128, 128, 128, // 0, 1, 128, 2, 3, 128, 128, 128, // 128, 128, 0, 1, 2, 128, 128, 128, // 0, 128, 1, 2, 3, 128, 128, 128, // 128, 0, 1, 2, 3, 128, 128, 128, // 0, 1, 2, 3, 4, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 1, 128, 128, // 128, 0, 128, 128, 128, 1, 128, 128, // 0, 1, 128, 128, 128, 2, 128, 128, // 128, 128, 0, 128, 128, 1, 128, 128, // 0, 128, 1, 128, 128, 2, 128, 128, // 128, 0, 1, 128, 128, 2, 128, 128, // 0, 1, 2, 128, 
128, 3, 128, 128, // 128, 128, 128, 0, 128, 1, 128, 128, // 0, 128, 128, 1, 128, 2, 128, 128, // 128, 0, 128, 1, 128, 2, 128, 128, // 0, 1, 128, 2, 128, 3, 128, 128, // 128, 128, 0, 1, 128, 2, 128, 128, // 0, 128, 1, 2, 128, 3, 128, 128, // 128, 0, 1, 2, 128, 3, 128, 128, // 0, 1, 2, 3, 128, 4, 128, 128, // 128, 128, 128, 128, 0, 1, 128, 128, // 0, 128, 128, 128, 1, 2, 128, 128, // 128, 0, 128, 128, 1, 2, 128, 128, // 0, 1, 128, 128, 2, 3, 128, 128, // 128, 128, 0, 128, 1, 2, 128, 128, // 0, 128, 1, 128, 2, 3, 128, 128, // 128, 0, 1, 128, 2, 3, 128, 128, // 0, 1, 2, 128, 3, 4, 128, 128, // 128, 128, 128, 0, 1, 2, 128, 128, // 0, 128, 128, 1, 2, 3, 128, 128, // 128, 0, 128, 1, 2, 3, 128, 128, // 0, 1, 128, 2, 3, 4, 128, 128, // 128, 128, 0, 1, 2, 3, 128, 128, // 0, 128, 1, 2, 3, 4, 128, 128, // 128, 0, 1, 2, 3, 4, 128, 128, // 0, 1, 2, 3, 4, 5, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 1, 128, // 128, 0, 128, 128, 128, 128, 1, 128, // 0, 1, 128, 128, 128, 128, 2, 128, // 128, 128, 0, 128, 128, 128, 1, 128, // 0, 128, 1, 128, 128, 128, 2, 128, // 128, 0, 1, 128, 128, 128, 2, 128, // 0, 1, 2, 128, 128, 128, 3, 128, // 128, 128, 128, 0, 128, 128, 1, 128, // 0, 128, 128, 1, 128, 128, 2, 128, // 128, 0, 128, 1, 128, 128, 2, 128, // 0, 1, 128, 2, 128, 128, 3, 128, // 128, 128, 0, 1, 128, 128, 2, 128, // 0, 128, 1, 2, 128, 128, 3, 128, // 128, 0, 1, 2, 128, 128, 3, 128, // 0, 1, 2, 3, 128, 128, 4, 128, // 128, 128, 128, 128, 0, 128, 1, 128, // 0, 128, 128, 128, 1, 128, 2, 128, // 128, 0, 128, 128, 1, 128, 2, 128, // 0, 1, 128, 128, 2, 128, 3, 128, // 128, 128, 0, 128, 1, 128, 2, 128, // 0, 128, 1, 128, 2, 128, 3, 128, // 128, 0, 1, 128, 2, 128, 3, 128, // 0, 1, 2, 128, 3, 128, 4, 128, // 128, 128, 128, 0, 1, 128, 2, 128, // 0, 128, 128, 1, 2, 128, 3, 128, // 128, 0, 128, 1, 2, 128, 3, 128, // 0, 1, 128, 2, 3, 128, 4, 128, // 128, 128, 0, 1, 2, 128, 3, 128, // 0, 128, 1, 2, 3, 128, 4, 128, // 128, 0, 1, 2, 3, 128, 4, 128, // 0, 1, 2, 3, 4, 128, 5, 128, // 128, 128, 128, 128, 128, 0, 1, 128, // 0, 128, 128, 128, 128, 1, 2, 128, // 128, 0, 128, 128, 128, 1, 2, 128, // 0, 1, 128, 128, 128, 2, 3, 128, // 128, 128, 0, 128, 128, 1, 2, 128, // 0, 128, 1, 128, 128, 2, 3, 128, // 128, 0, 1, 128, 128, 2, 3, 128, // 0, 1, 2, 128, 128, 3, 4, 128, // 128, 128, 128, 0, 128, 1, 2, 128, // 0, 128, 128, 1, 128, 2, 3, 128, // 128, 0, 128, 1, 128, 2, 3, 128, // 0, 1, 128, 2, 128, 3, 4, 128, // 128, 128, 0, 1, 128, 2, 3, 128, // 0, 128, 1, 2, 128, 3, 4, 128, // 128, 0, 1, 2, 128, 3, 4, 128, // 0, 1, 2, 3, 128, 4, 5, 128, // 128, 128, 128, 128, 0, 1, 2, 128, // 0, 128, 128, 128, 1, 2, 3, 128, // 128, 0, 128, 128, 1, 2, 3, 128, // 0, 1, 128, 128, 2, 3, 4, 128, // 128, 128, 0, 128, 1, 2, 3, 128, // 0, 128, 1, 128, 2, 3, 4, 128, // 128, 0, 1, 128, 2, 3, 4, 128, // 0, 1, 2, 128, 3, 4, 5, 128, // 128, 128, 128, 0, 1, 2, 3, 128, // 0, 128, 128, 1, 2, 3, 4, 128, // 128, 0, 128, 1, 2, 3, 4, 128, // 0, 1, 128, 2, 3, 4, 5, 128, // 128, 128, 0, 1, 2, 3, 4, 128, // 0, 128, 1, 2, 3, 4, 5, 128, // 128, 0, 1, 2, 3, 4, 5, 128, // 0, 1, 2, 3, 4, 5, 6, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 1, // 128, 0, 128, 128, 128, 128, 128, 1, // 0, 1, 128, 128, 128, 128, 128, 2, // 128, 128, 0, 128, 128, 128, 128, 1, // 0, 128, 1, 128, 128, 128, 128, 2, // 128, 0, 1, 128, 128, 128, 128, 2, // 0, 1, 2, 128, 128, 128, 128, 3, // 128, 128, 128, 0, 128, 128, 128, 1, // 0, 128, 128, 1, 128, 128, 128, 2, // 128, 0, 128, 1, 128, 128, 128, 2, // 0, 1, 128, 2, 128, 128, 128, 
3, // 128, 128, 0, 1, 128, 128, 128, 2, // 0, 128, 1, 2, 128, 128, 128, 3, // 128, 0, 1, 2, 128, 128, 128, 3, // 0, 1, 2, 3, 128, 128, 128, 4, // 128, 128, 128, 128, 0, 128, 128, 1, // 0, 128, 128, 128, 1, 128, 128, 2, // 128, 0, 128, 128, 1, 128, 128, 2, // 0, 1, 128, 128, 2, 128, 128, 3, // 128, 128, 0, 128, 1, 128, 128, 2, // 0, 128, 1, 128, 2, 128, 128, 3, // 128, 0, 1, 128, 2, 128, 128, 3, // 0, 1, 2, 128, 3, 128, 128, 4, // 128, 128, 128, 0, 1, 128, 128, 2, // 0, 128, 128, 1, 2, 128, 128, 3, // 128, 0, 128, 1, 2, 128, 128, 3, // 0, 1, 128, 2, 3, 128, 128, 4, // 128, 128, 0, 1, 2, 128, 128, 3, // 0, 128, 1, 2, 3, 128, 128, 4, // 128, 0, 1, 2, 3, 128, 128, 4, // 0, 1, 2, 3, 4, 128, 128, 5, // 128, 128, 128, 128, 128, 0, 128, 1, // 0, 128, 128, 128, 128, 1, 128, 2, // 128, 0, 128, 128, 128, 1, 128, 2, // 0, 1, 128, 128, 128, 2, 128, 3, // 128, 128, 0, 128, 128, 1, 128, 2, // 0, 128, 1, 128, 128, 2, 128, 3, // 128, 0, 1, 128, 128, 2, 128, 3, // 0, 1, 2, 128, 128, 3, 128, 4, // 128, 128, 128, 0, 128, 1, 128, 2, // 0, 128, 128, 1, 128, 2, 128, 3, // 128, 0, 128, 1, 128, 2, 128, 3, // 0, 1, 128, 2, 128, 3, 128, 4, // 128, 128, 0, 1, 128, 2, 128, 3, // 0, 128, 1, 2, 128, 3, 128, 4, // 128, 0, 1, 2, 128, 3, 128, 4, // 0, 1, 2, 3, 128, 4, 128, 5, // 128, 128, 128, 128, 0, 1, 128, 2, // 0, 128, 128, 128, 1, 2, 128, 3, // 128, 0, 128, 128, 1, 2, 128, 3, // 0, 1, 128, 128, 2, 3, 128, 4, // 128, 128, 0, 128, 1, 2, 128, 3, // 0, 128, 1, 128, 2, 3, 128, 4, // 128, 0, 1, 128, 2, 3, 128, 4, // 0, 1, 2, 128, 3, 4, 128, 5, // 128, 128, 128, 0, 1, 2, 128, 3, // 0, 128, 128, 1, 2, 3, 128, 4, // 128, 0, 128, 1, 2, 3, 128, 4, // 0, 1, 128, 2, 3, 4, 128, 5, // 128, 128, 0, 1, 2, 3, 128, 4, // 0, 128, 1, 2, 3, 4, 128, 5, // 128, 0, 1, 2, 3, 4, 128, 5, // 0, 1, 2, 3, 4, 5, 128, 6, // 128, 128, 128, 128, 128, 128, 0, 1, // 0, 128, 128, 128, 128, 128, 1, 2, // 128, 0, 128, 128, 128, 128, 1, 2, // 0, 1, 128, 128, 128, 128, 2, 3, // 128, 128, 0, 128, 128, 128, 1, 2, // 0, 128, 1, 128, 128, 128, 2, 3, // 128, 0, 1, 128, 128, 128, 2, 3, // 0, 1, 2, 128, 128, 128, 3, 4, // 128, 128, 128, 0, 128, 128, 1, 2, // 0, 128, 128, 1, 128, 128, 2, 3, // 128, 0, 128, 1, 128, 128, 2, 3, // 0, 1, 128, 2, 128, 128, 3, 4, // 128, 128, 0, 1, 128, 128, 2, 3, // 0, 128, 1, 2, 128, 128, 3, 4, // 128, 0, 1, 2, 128, 128, 3, 4, // 0, 1, 2, 3, 128, 128, 4, 5, // 128, 128, 128, 128, 0, 128, 1, 2, // 0, 128, 128, 128, 1, 128, 2, 3, // 128, 0, 128, 128, 1, 128, 2, 3, // 0, 1, 128, 128, 2, 128, 3, 4, // 128, 128, 0, 128, 1, 128, 2, 3, // 0, 128, 1, 128, 2, 128, 3, 4, // 128, 0, 1, 128, 2, 128, 3, 4, // 0, 1, 2, 128, 3, 128, 4, 5, // 128, 128, 128, 0, 1, 128, 2, 3, // 0, 128, 128, 1, 2, 128, 3, 4, // 128, 0, 128, 1, 2, 128, 3, 4, // 0, 1, 128, 2, 3, 128, 4, 5, // 128, 128, 0, 1, 2, 128, 3, 4, // 0, 128, 1, 2, 3, 128, 4, 5, // 128, 0, 1, 2, 3, 128, 4, 5, // 0, 1, 2, 3, 4, 128, 5, 6, // 128, 128, 128, 128, 128, 0, 1, 2, // 0, 128, 128, 128, 128, 1, 2, 3, // 128, 0, 128, 128, 128, 1, 2, 3, // 0, 1, 128, 128, 128, 2, 3, 4, // 128, 128, 0, 128, 128, 1, 2, 3, // 0, 128, 1, 128, 128, 2, 3, 4, // 128, 0, 1, 128, 128, 2, 3, 4, // 0, 1, 2, 128, 128, 3, 4, 5, // 128, 128, 128, 0, 128, 1, 2, 3, // 0, 128, 128, 1, 128, 2, 3, 4, // 128, 0, 128, 1, 128, 2, 3, 4, // 0, 1, 128, 2, 128, 3, 4, 5, // 128, 128, 0, 1, 128, 2, 3, 4, // 0, 128, 1, 2, 128, 3, 4, 5, // 128, 0, 1, 2, 128, 3, 4, 5, // 0, 1, 2, 3, 128, 4, 5, 6, // 128, 128, 128, 128, 0, 1, 2, 3, // 0, 128, 128, 128, 1, 2, 3, 4, // 128, 0, 128, 128, 1, 2, 3, 4, // 0, 1, 128, 128, 2, 3, 4, 5, // 128, 128, 0, 
128, 1, 2, 3, 4, // 0, 128, 1, 128, 2, 3, 4, 5, // 128, 0, 1, 128, 2, 3, 4, 5, // 0, 1, 2, 128, 3, 4, 5, 6, // 128, 128, 128, 0, 1, 2, 3, 4, // 0, 128, 128, 1, 2, 3, 4, 5, // 128, 0, 128, 1, 2, 3, 4, 5, // 0, 1, 128, 2, 3, 4, 5, 6, // 128, 128, 0, 1, 2, 3, 4, 5, // 0, 128, 1, 2, 3, 4, 5, 6, // 128, 0, 1, 2, 3, 4, 5, 6, // 0, 1, 2, 3, 4, 5, 6, 7}; return LoadU(du8, table + mask_bits * 8); } } // namespace detail // Half vector of bytes: one table lookup template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const uint64_t mask_bits = detail::BitsFromMask(mask); const Vec128 indices = detail::IndicesForExpandFromBits(mask_bits); return BitCast(d, TableLookupBytesOr0(v, indices)); } // Full vector of bytes: two table lookups template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const Full128 d; const RebindToUnsigned du; const Half duh; const Vec128 vu = BitCast(du, v); const uint64_t mask_bits = detail::BitsFromMask(mask); const uint64_t maskL = mask_bits & 0xFF; const uint64_t maskH = mask_bits >> 8; // We want to skip past the v bytes already consumed by idxL. There is no // instruction for shift-reg by variable bytes. Storing v itself would work // but would involve a store-load forwarding stall. We instead shuffle using // loaded indices. multishift_epi64_epi8 would also help, but if we have that, // we probably also have native 8-bit Expand. alignas(16) static constexpr uint8_t iota[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128}; const VFromD shift = LoadU(du, iota + PopCount(maskL)); const VFromD vL = LowerHalf(duh, vu); const VFromD vH = LowerHalf(duh, TableLookupBytesOr0(vu, shift)); const VFromD idxL = detail::IndicesForExpandFromBits<8>(maskL); const VFromD idxH = detail::IndicesForExpandFromBits<8>(maskH); const VFromD expandL = TableLookupBytesOr0(vL, idxL); const VFromD expandH = TableLookupBytesOr0(vH, idxH); return BitCast(d, Combine(du, expandH, expandL)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const Rebind du8; const uint64_t mask_bits = detail::BitsFromMask(mask); // Storing as 8-bit reduces table size from 4 KiB to 2 KiB. We cannot apply // the nibble trick used below because not all indices fit within one lane. 
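// Example (illustrative): mask_bits = 0b0101 selects output lanes 0 and 2;
// the corresponding row below is {0, 128, 2, 128, 128, 128, 128, 128}, i.e.
// the byte offsets of the packed source u16 lanes 0 and 1 for the active
// outputs, and 128 so that TableLookupBytesOr0 zeroes the inactive lanes.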
alignas(16) static constexpr uint8_t table[2048] = { // PrintExpand16x8ByteTables 128, 128, 128, 128, 128, 128, 128, 128, // 0, 128, 128, 128, 128, 128, 128, 128, // 128, 0, 128, 128, 128, 128, 128, 128, // 0, 2, 128, 128, 128, 128, 128, 128, // 128, 128, 0, 128, 128, 128, 128, 128, // 0, 128, 2, 128, 128, 128, 128, 128, // 128, 0, 2, 128, 128, 128, 128, 128, // 0, 2, 4, 128, 128, 128, 128, 128, // 128, 128, 128, 0, 128, 128, 128, 128, // 0, 128, 128, 2, 128, 128, 128, 128, // 128, 0, 128, 2, 128, 128, 128, 128, // 0, 2, 128, 4, 128, 128, 128, 128, // 128, 128, 0, 2, 128, 128, 128, 128, // 0, 128, 2, 4, 128, 128, 128, 128, // 128, 0, 2, 4, 128, 128, 128, 128, // 0, 2, 4, 6, 128, 128, 128, 128, // 128, 128, 128, 128, 0, 128, 128, 128, // 0, 128, 128, 128, 2, 128, 128, 128, // 128, 0, 128, 128, 2, 128, 128, 128, // 0, 2, 128, 128, 4, 128, 128, 128, // 128, 128, 0, 128, 2, 128, 128, 128, // 0, 128, 2, 128, 4, 128, 128, 128, // 128, 0, 2, 128, 4, 128, 128, 128, // 0, 2, 4, 128, 6, 128, 128, 128, // 128, 128, 128, 0, 2, 128, 128, 128, // 0, 128, 128, 2, 4, 128, 128, 128, // 128, 0, 128, 2, 4, 128, 128, 128, // 0, 2, 128, 4, 6, 128, 128, 128, // 128, 128, 0, 2, 4, 128, 128, 128, // 0, 128, 2, 4, 6, 128, 128, 128, // 128, 0, 2, 4, 6, 128, 128, 128, // 0, 2, 4, 6, 8, 128, 128, 128, // 128, 128, 128, 128, 128, 0, 128, 128, // 0, 128, 128, 128, 128, 2, 128, 128, // 128, 0, 128, 128, 128, 2, 128, 128, // 0, 2, 128, 128, 128, 4, 128, 128, // 128, 128, 0, 128, 128, 2, 128, 128, // 0, 128, 2, 128, 128, 4, 128, 128, // 128, 0, 2, 128, 128, 4, 128, 128, // 0, 2, 4, 128, 128, 6, 128, 128, // 128, 128, 128, 0, 128, 2, 128, 128, // 0, 128, 128, 2, 128, 4, 128, 128, // 128, 0, 128, 2, 128, 4, 128, 128, // 0, 2, 128, 4, 128, 6, 128, 128, // 128, 128, 0, 2, 128, 4, 128, 128, // 0, 128, 2, 4, 128, 6, 128, 128, // 128, 0, 2, 4, 128, 6, 128, 128, // 0, 2, 4, 6, 128, 8, 128, 128, // 128, 128, 128, 128, 0, 2, 128, 128, // 0, 128, 128, 128, 2, 4, 128, 128, // 128, 0, 128, 128, 2, 4, 128, 128, // 0, 2, 128, 128, 4, 6, 128, 128, // 128, 128, 0, 128, 2, 4, 128, 128, // 0, 128, 2, 128, 4, 6, 128, 128, // 128, 0, 2, 128, 4, 6, 128, 128, // 0, 2, 4, 128, 6, 8, 128, 128, // 128, 128, 128, 0, 2, 4, 128, 128, // 0, 128, 128, 2, 4, 6, 128, 128, // 128, 0, 128, 2, 4, 6, 128, 128, // 0, 2, 128, 4, 6, 8, 128, 128, // 128, 128, 0, 2, 4, 6, 128, 128, // 0, 128, 2, 4, 6, 8, 128, 128, // 128, 0, 2, 4, 6, 8, 128, 128, // 0, 2, 4, 6, 8, 10, 128, 128, // 128, 128, 128, 128, 128, 128, 0, 128, // 0, 128, 128, 128, 128, 128, 2, 128, // 128, 0, 128, 128, 128, 128, 2, 128, // 0, 2, 128, 128, 128, 128, 4, 128, // 128, 128, 0, 128, 128, 128, 2, 128, // 0, 128, 2, 128, 128, 128, 4, 128, // 128, 0, 2, 128, 128, 128, 4, 128, // 0, 2, 4, 128, 128, 128, 6, 128, // 128, 128, 128, 0, 128, 128, 2, 128, // 0, 128, 128, 2, 128, 128, 4, 128, // 128, 0, 128, 2, 128, 128, 4, 128, // 0, 2, 128, 4, 128, 128, 6, 128, // 128, 128, 0, 2, 128, 128, 4, 128, // 0, 128, 2, 4, 128, 128, 6, 128, // 128, 0, 2, 4, 128, 128, 6, 128, // 0, 2, 4, 6, 128, 128, 8, 128, // 128, 128, 128, 128, 0, 128, 2, 128, // 0, 128, 128, 128, 2, 128, 4, 128, // 128, 0, 128, 128, 2, 128, 4, 128, // 0, 2, 128, 128, 4, 128, 6, 128, // 128, 128, 0, 128, 2, 128, 4, 128, // 0, 128, 2, 128, 4, 128, 6, 128, // 128, 0, 2, 128, 4, 128, 6, 128, // 0, 2, 4, 128, 6, 128, 8, 128, // 128, 128, 128, 0, 2, 128, 4, 128, // 0, 128, 128, 2, 4, 128, 6, 128, // 128, 0, 128, 2, 4, 128, 6, 128, // 0, 2, 128, 4, 6, 128, 8, 128, // 128, 128, 0, 2, 4, 128, 6, 128, // 0, 128, 2, 4, 6, 128, 8, 128, // 128, 0, 2, 4, 
6, 128, 8, 128, // 0, 2, 4, 6, 8, 128, 10, 128, // 128, 128, 128, 128, 128, 0, 2, 128, // 0, 128, 128, 128, 128, 2, 4, 128, // 128, 0, 128, 128, 128, 2, 4, 128, // 0, 2, 128, 128, 128, 4, 6, 128, // 128, 128, 0, 128, 128, 2, 4, 128, // 0, 128, 2, 128, 128, 4, 6, 128, // 128, 0, 2, 128, 128, 4, 6, 128, // 0, 2, 4, 128, 128, 6, 8, 128, // 128, 128, 128, 0, 128, 2, 4, 128, // 0, 128, 128, 2, 128, 4, 6, 128, // 128, 0, 128, 2, 128, 4, 6, 128, // 0, 2, 128, 4, 128, 6, 8, 128, // 128, 128, 0, 2, 128, 4, 6, 128, // 0, 128, 2, 4, 128, 6, 8, 128, // 128, 0, 2, 4, 128, 6, 8, 128, // 0, 2, 4, 6, 128, 8, 10, 128, // 128, 128, 128, 128, 0, 2, 4, 128, // 0, 128, 128, 128, 2, 4, 6, 128, // 128, 0, 128, 128, 2, 4, 6, 128, // 0, 2, 128, 128, 4, 6, 8, 128, // 128, 128, 0, 128, 2, 4, 6, 128, // 0, 128, 2, 128, 4, 6, 8, 128, // 128, 0, 2, 128, 4, 6, 8, 128, // 0, 2, 4, 128, 6, 8, 10, 128, // 128, 128, 128, 0, 2, 4, 6, 128, // 0, 128, 128, 2, 4, 6, 8, 128, // 128, 0, 128, 2, 4, 6, 8, 128, // 0, 2, 128, 4, 6, 8, 10, 128, // 128, 128, 0, 2, 4, 6, 8, 128, // 0, 128, 2, 4, 6, 8, 10, 128, // 128, 0, 2, 4, 6, 8, 10, 128, // 0, 2, 4, 6, 8, 10, 12, 128, // 128, 128, 128, 128, 128, 128, 128, 0, // 0, 128, 128, 128, 128, 128, 128, 2, // 128, 0, 128, 128, 128, 128, 128, 2, // 0, 2, 128, 128, 128, 128, 128, 4, // 128, 128, 0, 128, 128, 128, 128, 2, // 0, 128, 2, 128, 128, 128, 128, 4, // 128, 0, 2, 128, 128, 128, 128, 4, // 0, 2, 4, 128, 128, 128, 128, 6, // 128, 128, 128, 0, 128, 128, 128, 2, // 0, 128, 128, 2, 128, 128, 128, 4, // 128, 0, 128, 2, 128, 128, 128, 4, // 0, 2, 128, 4, 128, 128, 128, 6, // 128, 128, 0, 2, 128, 128, 128, 4, // 0, 128, 2, 4, 128, 128, 128, 6, // 128, 0, 2, 4, 128, 128, 128, 6, // 0, 2, 4, 6, 128, 128, 128, 8, // 128, 128, 128, 128, 0, 128, 128, 2, // 0, 128, 128, 128, 2, 128, 128, 4, // 128, 0, 128, 128, 2, 128, 128, 4, // 0, 2, 128, 128, 4, 128, 128, 6, // 128, 128, 0, 128, 2, 128, 128, 4, // 0, 128, 2, 128, 4, 128, 128, 6, // 128, 0, 2, 128, 4, 128, 128, 6, // 0, 2, 4, 128, 6, 128, 128, 8, // 128, 128, 128, 0, 2, 128, 128, 4, // 0, 128, 128, 2, 4, 128, 128, 6, // 128, 0, 128, 2, 4, 128, 128, 6, // 0, 2, 128, 4, 6, 128, 128, 8, // 128, 128, 0, 2, 4, 128, 128, 6, // 0, 128, 2, 4, 6, 128, 128, 8, // 128, 0, 2, 4, 6, 128, 128, 8, // 0, 2, 4, 6, 8, 128, 128, 10, // 128, 128, 128, 128, 128, 0, 128, 2, // 0, 128, 128, 128, 128, 2, 128, 4, // 128, 0, 128, 128, 128, 2, 128, 4, // 0, 2, 128, 128, 128, 4, 128, 6, // 128, 128, 0, 128, 128, 2, 128, 4, // 0, 128, 2, 128, 128, 4, 128, 6, // 128, 0, 2, 128, 128, 4, 128, 6, // 0, 2, 4, 128, 128, 6, 128, 8, // 128, 128, 128, 0, 128, 2, 128, 4, // 0, 128, 128, 2, 128, 4, 128, 6, // 128, 0, 128, 2, 128, 4, 128, 6, // 0, 2, 128, 4, 128, 6, 128, 8, // 128, 128, 0, 2, 128, 4, 128, 6, // 0, 128, 2, 4, 128, 6, 128, 8, // 128, 0, 2, 4, 128, 6, 128, 8, // 0, 2, 4, 6, 128, 8, 128, 10, // 128, 128, 128, 128, 0, 2, 128, 4, // 0, 128, 128, 128, 2, 4, 128, 6, // 128, 0, 128, 128, 2, 4, 128, 6, // 0, 2, 128, 128, 4, 6, 128, 8, // 128, 128, 0, 128, 2, 4, 128, 6, // 0, 128, 2, 128, 4, 6, 128, 8, // 128, 0, 2, 128, 4, 6, 128, 8, // 0, 2, 4, 128, 6, 8, 128, 10, // 128, 128, 128, 0, 2, 4, 128, 6, // 0, 128, 128, 2, 4, 6, 128, 8, // 128, 0, 128, 2, 4, 6, 128, 8, // 0, 2, 128, 4, 6, 8, 128, 10, // 128, 128, 0, 2, 4, 6, 128, 8, // 0, 128, 2, 4, 6, 8, 128, 10, // 128, 0, 2, 4, 6, 8, 128, 10, // 0, 2, 4, 6, 8, 10, 128, 12, // 128, 128, 128, 128, 128, 128, 0, 2, // 0, 128, 128, 128, 128, 128, 2, 4, // 128, 0, 128, 128, 128, 128, 2, 4, // 0, 2, 128, 128, 128, 128, 4, 6, // 128, 128, 0, 
128, 128, 128, 2, 4, // 0, 128, 2, 128, 128, 128, 4, 6, // 128, 0, 2, 128, 128, 128, 4, 6, // 0, 2, 4, 128, 128, 128, 6, 8, // 128, 128, 128, 0, 128, 128, 2, 4, // 0, 128, 128, 2, 128, 128, 4, 6, // 128, 0, 128, 2, 128, 128, 4, 6, // 0, 2, 128, 4, 128, 128, 6, 8, // 128, 128, 0, 2, 128, 128, 4, 6, // 0, 128, 2, 4, 128, 128, 6, 8, // 128, 0, 2, 4, 128, 128, 6, 8, // 0, 2, 4, 6, 128, 128, 8, 10, // 128, 128, 128, 128, 0, 128, 2, 4, // 0, 128, 128, 128, 2, 128, 4, 6, // 128, 0, 128, 128, 2, 128, 4, 6, // 0, 2, 128, 128, 4, 128, 6, 8, // 128, 128, 0, 128, 2, 128, 4, 6, // 0, 128, 2, 128, 4, 128, 6, 8, // 128, 0, 2, 128, 4, 128, 6, 8, // 0, 2, 4, 128, 6, 128, 8, 10, // 128, 128, 128, 0, 2, 128, 4, 6, // 0, 128, 128, 2, 4, 128, 6, 8, // 128, 0, 128, 2, 4, 128, 6, 8, // 0, 2, 128, 4, 6, 128, 8, 10, // 128, 128, 0, 2, 4, 128, 6, 8, // 0, 128, 2, 4, 6, 128, 8, 10, // 128, 0, 2, 4, 6, 128, 8, 10, // 0, 2, 4, 6, 8, 128, 10, 12, // 128, 128, 128, 128, 128, 0, 2, 4, // 0, 128, 128, 128, 128, 2, 4, 6, // 128, 0, 128, 128, 128, 2, 4, 6, // 0, 2, 128, 128, 128, 4, 6, 8, // 128, 128, 0, 128, 128, 2, 4, 6, // 0, 128, 2, 128, 128, 4, 6, 8, // 128, 0, 2, 128, 128, 4, 6, 8, // 0, 2, 4, 128, 128, 6, 8, 10, // 128, 128, 128, 0, 128, 2, 4, 6, // 0, 128, 128, 2, 128, 4, 6, 8, // 128, 0, 128, 2, 128, 4, 6, 8, // 0, 2, 128, 4, 128, 6, 8, 10, // 128, 128, 0, 2, 128, 4, 6, 8, // 0, 128, 2, 4, 128, 6, 8, 10, // 128, 0, 2, 4, 128, 6, 8, 10, // 0, 2, 4, 6, 128, 8, 10, 12, // 128, 128, 128, 128, 0, 2, 4, 6, // 0, 128, 128, 128, 2, 4, 6, 8, // 128, 0, 128, 128, 2, 4, 6, 8, // 0, 2, 128, 128, 4, 6, 8, 10, // 128, 128, 0, 128, 2, 4, 6, 8, // 0, 128, 2, 128, 4, 6, 8, 10, // 128, 0, 2, 128, 4, 6, 8, 10, // 0, 2, 4, 128, 6, 8, 10, 12, // 128, 128, 128, 0, 2, 4, 6, 8, // 0, 128, 128, 2, 4, 6, 8, 10, // 128, 0, 128, 2, 4, 6, 8, 10, // 0, 2, 128, 4, 6, 8, 10, 12, // 128, 128, 0, 2, 4, 6, 8, 10, // 0, 128, 2, 4, 6, 8, 10, 12, // 128, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14}; // Extend to double length because InterleaveLower will only use the (valid) // lower half, and we want N u16. const Twice du8x2; const Vec128 indices8 = ZeroExtendVector(du8x2, Load(du8, table + mask_bits * 8)); const Vec128 indices16 = BitCast(du, InterleaveLower(du8x2, indices8, indices8)); // TableLookupBytesOr0 operates on bytes. To convert u16 lane indices to byte // indices, add 0 to even and 1 to odd byte lanes. const Vec128 byte_indices = Add( indices16, Set(du, static_cast(HWY_IS_LITTLE_ENDIAN ? 0x0100 : 0x0001))); return BitCast(d, TableLookupBytesOr0(v, byte_indices)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { const DFromV d; const RebindToUnsigned du; const uint64_t mask_bits = detail::BitsFromMask(mask); alignas(16) static constexpr uint32_t packed_array[16] = { // PrintExpand64x4Nibble - same for 32x4. 0x0000ffff, 0x0000fff0, 0x0000ff0f, 0x0000ff10, 0x0000f0ff, 0x0000f1f0, 0x0000f10f, 0x0000f210, 0x00000fff, 0x00001ff0, 0x00001f0f, 0x00002f10, 0x000010ff, 0x000021f0, 0x0000210f, 0x00003210}; // For lane i, shift the i-th 4-bit index down to bits [0, 2). const Vec128 packed = Set(du, packed_array[mask_bits]); alignas(16) static constexpr uint32_t shifts[4] = {0, 4, 8, 12}; Vec128 indices = packed >> Load(du, shifts); // AVX2 _mm256_permutexvar_epi32 will ignore upper bits, but IndicesFromVec // checks bounds, so clear the upper bits. 
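// Example (illustrative): for mask_bits = 0b0101, packed_array[5] is
// 0x0000f1f0, which decodes to per-lane nibbles {0, f, 1, f}. The And below
// brings the don't-care 0xf entries into range, and IfThenElseZero afterwards
// zeroes those lanes, leaving {v[0], 0, v[1], 0}.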
indices = And(indices, Set(du, N - 1)); const Vec128 expand = TableLookupLanes(BitCast(du, v), IndicesFromVec(du, indices)); // TableLookupLanes cannot also zero masked-off lanes, so do that now. return IfThenElseZero(mask, BitCast(d, expand)); } template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { // Same as Compress, just zero out the mask=false lanes. return IfThenElseZero(mask, Compress(v, mask)); } // For single-element vectors, this is at least as fast as native. template HWY_API Vec128 Expand(Vec128 v, Mask128 mask) { return IfThenElseZero(mask, v); } // ------------------------------ LoadExpand template HWY_API VFromD LoadExpand(MFromD mask, D d, const TFromD* HWY_RESTRICT unaligned) { return Expand(LoadU(d, unaligned), mask); } #endif // HWY_NATIVE_EXPAND // ------------------------------ TwoTablesLookupLanes template using IndicesFromD = decltype(IndicesFromVec(D(), Zero(RebindToUnsigned()))); // RVV/SVE have their own implementations of // TwoTablesLookupLanes(D d, VFromD a, VFromD b, IndicesFromD idx) #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE && \ HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE_256 && \ HWY_TARGET != HWY_SVE2_128 template HWY_API VFromD TwoTablesLookupLanes(D /*d*/, VFromD a, VFromD b, IndicesFromD idx) { return TwoTablesLookupLanes(a, b, idx); } #endif // ------------------------------ Reverse2, Reverse4, Reverse8 (8-bit) #if (defined(HWY_NATIVE_REVERSE2_8) == defined(HWY_TARGET_TOGGLE)) || HWY_IDE #ifdef HWY_NATIVE_REVERSE2_8 #undef HWY_NATIVE_REVERSE2_8 #else #define HWY_NATIVE_REVERSE2_8 #endif #undef HWY_PREFER_ROTATE // Platforms on which RotateRight is likely faster than TableLookupBytes. // RVV and SVE anyway have their own implementation of this. #if HWY_TARGET == HWY_SSE2 || HWY_TARGET <= HWY_AVX3 || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_PPC8 #define HWY_PREFER_ROTATE 1 #else #define HWY_PREFER_ROTATE 0 #endif template HWY_API VFromD Reverse2(D d, VFromD v) { // Exclude AVX3 because its 16-bit RotateRight is actually 3 instructions. 
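// Example (illustrative): for u8 lanes {0, 1, 2, 3, ...}, Reverse2 returns
// {1, 0, 3, 2, ...}. Rotating each u16 lane right by 8 bits swaps its two
// bytes, which is exactly this permutation regardless of byte order.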
#if HWY_PREFER_ROTATE && HWY_TARGET > HWY_AVX3 const Repartition du16; return BitCast(d, RotateRight<8>(BitCast(du16, v))); #else const VFromD shuffle = Dup128VecFromValues(d, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); return TableLookupBytes(v, shuffle); #endif } template HWY_API VFromD Reverse4(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du16; return BitCast(d, Reverse2(du16, BitCast(du16, Reverse2(d, v)))); #else const Repartition du8; const VFromD shuffle = Dup128VecFromValues( du8, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); return TableLookupBytes(v, BitCast(d, shuffle)); #endif } template HWY_API VFromD Reverse8(D d, VFromD v) { #if HWY_PREFER_ROTATE const Repartition du32; return BitCast(d, Reverse2(du32, BitCast(du32, Reverse4(d, v)))); #else const Repartition du8; const VFromD shuffle = Dup128VecFromValues( du8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); return TableLookupBytes(v, BitCast(d, shuffle)); #endif } #endif // HWY_NATIVE_REVERSE2_8 // ------------------------------ ReverseLaneBytes #if (defined(HWY_NATIVE_REVERSE_LANE_BYTES) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_LANE_BYTES #undef HWY_NATIVE_REVERSE_LANE_BYTES #else #define HWY_NATIVE_REVERSE_LANE_BYTES #endif template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse2(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse4(du8, BitCast(du8, v))); } template HWY_API V ReverseLaneBytes(V v) { const DFromV d; const Repartition du8; return BitCast(d, Reverse8(du8, BitCast(du8, v))); } #endif // HWY_NATIVE_REVERSE_LANE_BYTES // ------------------------------ ReverseBits // On these targets, we emulate 8-bit shifts using 16-bit shifts and therefore // require at least two lanes to BitCast to 16-bit. We avoid Highway's 8-bit // shifts because those would add extra masking already taken care of by // UI8ReverseBitsStep. Note that AVX3_DL/AVX3_ZEN4 support GFNI and use it to // implement ReverseBits, so this code is not used there. 
#undef HWY_REVERSE_BITS_MIN_BYTES #if ((HWY_TARGET >= HWY_AVX3 && HWY_TARGET <= HWY_SSE2) || \ HWY_TARGET == HWY_WASM || HWY_TARGET == HWY_WASM_EMU256) #define HWY_REVERSE_BITS_MIN_BYTES 2 #else #define HWY_REVERSE_BITS_MIN_BYTES 1 #endif #if (defined(HWY_NATIVE_REVERSE_BITS_UI8) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI8 #undef HWY_NATIVE_REVERSE_BITS_UI8 #else #define HWY_NATIVE_REVERSE_BITS_UI8 #endif namespace detail { template , HWY_REVERSE_BITS_MIN_BYTES - 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { const DFromV d; const RebindToUnsigned du; #if HWY_REVERSE_BITS_MIN_BYTES == 2 const Repartition d_shift; #else const RebindToUnsigned d_shift; #endif const auto v_to_shift = BitCast(d_shift, v); const auto shl_result = BitCast(d, ShiftLeft(v_to_shift)); const auto shr_result = BitCast(d, ShiftRight(v_to_shift)); const auto shr_result_mask = BitCast(d, Set(du, static_cast(kShrResultMask))); return Or(And(shr_result, shr_result_mask), AndNot(shr_result_mask, shl_result)); } #if HWY_REVERSE_BITS_MIN_BYTES == 2 template , 1)> HWY_INLINE V UI8ReverseBitsStep(V v) { return V{UI8ReverseBitsStep(Vec128{v.raw}) .raw}; } #endif } // namespace detail template HWY_API V ReverseBits(V v) { auto result = detail::UI8ReverseBitsStep<1, 0x55>(v); result = detail::UI8ReverseBitsStep<2, 0x33>(result); result = detail::UI8ReverseBitsStep<4, 0x0F>(result); return result; } #endif // HWY_NATIVE_REVERSE_BITS_UI8 #if (defined(HWY_NATIVE_REVERSE_BITS_UI16_32_64) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #undef HWY_NATIVE_REVERSE_BITS_UI16_32_64 #else #define HWY_NATIVE_REVERSE_BITS_UI16_32_64 #endif template HWY_API V ReverseBits(V v) { const DFromV d; const Repartition du8; return ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))); } #endif // HWY_NATIVE_REVERSE_BITS_UI16_32_64 // ------------------------------ Per4LaneBlockShuffle #if (defined(HWY_NATIVE_PER4LANEBLKSHUF_DUP32) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE Vec Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { #if HWY_TARGET == HWY_RVV constexpr int kPow2 = d.Pow2(); constexpr int kLoadPow2 = HWY_MAX(kPow2, -1); const ScalableTag d_load; #else constexpr size_t kMaxBytes = d.MaxBytes(); #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES constexpr size_t kMinLanesToLoad = 2; #else constexpr size_t kMinLanesToLoad = 4; #endif constexpr size_t kNumToLoad = HWY_MAX(kMaxBytes / sizeof(uint32_t), kMinLanesToLoad); const CappedTag d_load; #endif return ResizeBitCast(d, Dup128VecFromValues(d_load, x0, x1, x2, x3)); } } // namespace detail #endif #endif // HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #if HWY_TARGET != HWY_SCALAR namespace detail { template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<0> /*idx_10_tag*/, V v) { return DupEven(v); } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<1> /*idx_10_tag*/, V v) { const DFromV d; return Reverse2(d, v); } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<2> /*idx_10_tag*/, V v) { return v; } template HWY_INLINE V Per2LaneBlockShuffle(hwy::SizeTag<3> /*idx_10_tag*/, V v) { return DupOdd(v); } HWY_INLINE uint32_t U8x4Per4LaneBlkIndices(const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { #if HWY_IS_LITTLE_ENDIAN return static_cast((idx3 << 24) | (idx2 
<< 16) | (idx1 << 8) | idx0); #else return static_cast(idx3 | (idx2 << 8) | (idx1 << 16) | (idx0 << 24)); #endif } template HWY_INLINE Vec TblLookupPer4LaneBlkU8IdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { #if HWY_TARGET == HWY_RVV const AdjustSimdTagToMinVecPow2> du32; #else const Repartition du32; #endif return ResizeBitCast( d, Set(du32, U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0))); } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \ HWY_TARGET == HWY_SVE2_128 || HWY_TARGET == HWY_EMU128 #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) void* = nullptr #else #define HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE(D) HWY_IF_T_SIZE_D(D, 8) template HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, V idx) { const DFromV d; const Repartition du8; return BitCast(d, TableLookupBytes(BitCast(du8, v), BitCast(du8, idx))); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; const uint32_t idx3210 = U8x4Per4LaneBlkIndices(idx3, idx2, idx1, idx0); const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx3210 + 0x0C0C0C0C), static_cast(idx3210 + 0x08080808), static_cast(idx3210 + 0x04040404), static_cast(idx3210)); return ResizeBitCast(d, v_byte_idx); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; #if HWY_IS_LITTLE_ENDIAN const uint32_t idx10 = static_cast((idx1 << 16) | idx0); const uint32_t idx32 = static_cast((idx3 << 16) | idx2); constexpr uint32_t kLaneByteOffsets{0x01000100}; #else const uint32_t idx10 = static_cast(idx1 | (idx0 << 16)); const uint32_t idx32 = static_cast(idx3 | (idx2 << 16)); constexpr uint32_t kLaneByteOffsets{0x00010001}; #endif constexpr uint32_t kHiLaneByteOffsets{kLaneByteOffsets + 0x08080808u}; const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx32 * 0x0202u + kHiLaneByteOffsets), static_cast(idx10 * 0x0202u + kHiLaneByteOffsets), static_cast(idx32 * 0x0202u + kLaneByteOffsets), static_cast(idx10 * 0x0202u + kLaneByteOffsets)); return ResizeBitCast(d, v_byte_idx); } template HWY_INLINE Vec TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Repartition du32; #if HWY_IS_LITTLE_ENDIAN constexpr uint32_t kLaneByteOffsets{0x03020100}; #else constexpr uint32_t kLaneByteOffsets{0x00010203}; #endif const auto v_byte_idx = Per4LaneBlkShufDupSet4xU32( du32, static_cast(idx3 * 0x04040404u + kLaneByteOffsets), static_cast(idx2 * 0x04040404u + kLaneByteOffsets), static_cast(idx1 * 0x04040404u + kLaneByteOffsets), static_cast(idx0 * 0x04040404u + kLaneByteOffsets)); return ResizeBitCast(d, v_byte_idx); } #endif template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { return TblLookupPer4LaneBlkU8IdxInBlk(d, idx3, idx2, idx1, idx0); } #if HWY_TARGET == HWY_RVV template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const Rebind du8; return PromoteTo(d, TblLookupPer4LaneBlkU8IdxInBlk(du8, idx3, idx2, idx1, idx0)); } #else template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const uint16_t u16_idx0 = static_cast(idx0); const 
uint16_t u16_idx1 = static_cast(idx1); const uint16_t u16_idx2 = static_cast(idx2); const uint16_t u16_idx3 = static_cast(idx3); #if HWY_TARGET == HWY_NEON || HWY_TARGET == HWY_NEON_WITHOUT_AES constexpr size_t kMinLanesToLoad = 4; #else constexpr size_t kMinLanesToLoad = 8; #endif constexpr size_t kNumToLoad = HWY_MAX(HWY_MAX_LANES_D(D), kMinLanesToLoad); const CappedTag d_load; return ResizeBitCast( d, Dup128VecFromValues(d_load, u16_idx0, u16_idx1, u16_idx2, u16_idx3, u16_idx0, u16_idx1, u16_idx2, u16_idx3)); } template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { return Per4LaneBlkShufDupSet4xU32(d, idx3, idx2, idx1, idx0); } template HWY_INLINE VFromD TblLookupPer4LaneBlkIdxInBlk(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const RebindToUnsigned du; const Rebind du32; return BitCast(d, PromoteTo(du, Per4LaneBlkShufDupSet4xU32(du32, idx3, idx2, idx1, idx0))); } #endif template HWY_INLINE IndicesFromD TblLookupPer4LaneBlkShufIdx(D d, const uint32_t idx3, const uint32_t idx2, const uint32_t idx1, const uint32_t idx0) { const RebindToUnsigned du; using TU = TFromD; auto idx_in_blk = TblLookupPer4LaneBlkIdxInBlk(du, idx3, idx2, idx1, idx0); constexpr size_t kN = HWY_MAX_LANES_D(D); if (kN < 4) { idx_in_blk = And(idx_in_blk, Set(du, static_cast(kN - 1))); } #if HWY_TARGET == HWY_RVV const auto blk_offsets = AndS(Iota0(du), static_cast(~TU{3})); #else const auto blk_offsets = And(Iota(du, TU{0}), Set(du, static_cast(~TU{3}))); #endif return IndicesFromVec(d, Add(idx_in_blk, blk_offsets)); } template )> HWY_INLINE V Per4LaneBlkShufDoTblLookup(V v, IndicesFromD> idx) { return TableLookupLanes(v, idx); } #undef HWY_PER_4_BLK_TBL_LOOKUP_LANES_ENABLE template HWY_INLINE V TblLookupPer4LaneBlkShuf(V v, size_t idx3210) { const DFromV d; const uint32_t idx3 = static_cast((idx3210 >> 6) & 3); const uint32_t idx2 = static_cast((idx3210 >> 4) & 3); const uint32_t idx1 = static_cast((idx3210 >> 2) & 3); const uint32_t idx0 = static_cast(idx3210 & 3); const auto idx = TblLookupPer4LaneBlkShufIdx(d, idx3, idx2, idx1, idx0); return Per4LaneBlkShufDoTblLookup(v, idx); } // The detail::Per4LaneBlockShuffle overloads that have the extra lane_size_tag // and vect_size_tag parameters are only called for vectors that have at // least 4 lanes (or scalable vectors that might possibly have 4 or more lanes) template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { return TblLookupPer4LaneBlkShuf(v, kIdx3210); } #if HWY_HAVE_FLOAT64 template HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( hwy::FloatTag /* type_tag */, hwy::SizeTag<4> /* lane_size_tag */, V v) { const DFromV d; const RepartitionToWide dw; return BitCast(dw, v); } #endif template HWY_INLINE VFromD>>> Per4LaneBlockShufCastToWide(hwy::FloatTag /* type_tag */, hwy::SizeTag /* lane_size_tag */, V v) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; return BitCast(dw, v); } template HWY_INLINE VFromD>> Per4LaneBlockShufCastToWide( hwy::NonFloatTag /* type_tag */, hwy::SizeTag /* lane_size_tag */, V v) { const DFromV d; const RepartitionToWide dw; return BitCast(dw, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x1B> /*idx_3210_tag*/, V v) { const DFromV d; return Reverse4(d, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x44> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw 
= Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); return BitCast(d, DupEven(vw)); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw = Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); const DFromV dw; return BitCast(d, Reverse2(dw, vw)); } #if HWY_MAX_BYTES >= 32 template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x4E> /*idx_3210_tag*/, V v) { return SwapAdjacentBlocks(v); } #endif template , 4), HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveLower(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x50> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveLower(d, v, v); } template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, V v) { const DFromV d; return ConcatEven(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xA0> /*idx_3210_tag*/, V v) { return DupEven(v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xB1> /*idx_3210_tag*/, V v) { const DFromV d; return Reverse2(d, v); } template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, V v) { const DFromV d; return ConcatOdd(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xE4> /*idx_3210_tag*/, V v) { return v; } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xEE> /*idx_3210_tag*/, V v) { const DFromV d; const auto vw = Per4LaneBlockShufCastToWide( hwy::IsFloatTag>(), hwy::SizeTag)>(), v); return BitCast(d, DupOdd(vw)); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xF5> /*idx_3210_tag*/, V v) { return DupOdd(v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, V v) { const DFromV d; return InterleaveUpper(d, v, v); } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag idx_3210_tag, V v) { const DFromV d; return Per4LaneBlockShuffle(idx_3210_tag, hwy::SizeTag)>(), hwy::SizeTag(), v); } } // namespace detail #endif // HWY_TARGET != HWY_SCALAR template , 1)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); return v; } #if HWY_TARGET != HWY_SCALAR template , 2)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); constexpr bool isReverse2 = (kIdx0 == 1 || kIdx1 == 0) && (kIdx0 != kIdx1); constexpr size_t kPer2BlkIdx0 = (kIdx0 <= 1) ? kIdx0 : (isReverse2 ? 1 : 0); constexpr size_t kPer2BlkIdx1 = (kIdx1 <= 1) ? kIdx1 : (isReverse2 ? 
0 : 1); constexpr size_t kIdx10 = (kPer2BlkIdx1 << 1) | kPer2BlkIdx0; static_assert(kIdx10 <= 3, "kIdx10 <= 3 must be true"); return detail::Per2LaneBlockShuffle(hwy::SizeTag(), v); } template , 2)> HWY_API V Per4LaneBlockShuffle(V v) { static_assert(kIdx0 <= 3, "kIdx0 <= 3 must be true"); static_assert(kIdx1 <= 3, "kIdx1 <= 3 must be true"); static_assert(kIdx2 <= 3, "kIdx2 <= 3 must be true"); static_assert(kIdx3 <= 3, "kIdx3 <= 3 must be true"); constexpr size_t kIdx3210 = (kIdx3 << 6) | (kIdx2 << 4) | (kIdx1 << 2) | kIdx0; return detail::Per4LaneBlockShuffle(hwy::SizeTag(), v); } #endif // ------------------------------ Blocks template HWY_API size_t Blocks(D d) { return (d.MaxBytes() <= 16) ? 1 : ((Lanes(d) * sizeof(TFromD) + 15) / 16); } // ------------------------------ Block insert/extract/broadcast ops #if (defined(HWY_NATIVE_BLK_INSERT_EXTRACT) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BLK_INSERT_EXTRACT #undef HWY_NATIVE_BLK_INSERT_EXTRACT #else #define HWY_NATIVE_BLK_INSERT_EXTRACT #endif template HWY_API V InsertBlock(V /*v*/, V blk_to_insert) { static_assert(kBlockIdx == 0, "Invalid block index"); return blk_to_insert; } template HWY_API V ExtractBlock(V v) { static_assert(kBlockIdx == 0, "Invalid block index"); return v; } template HWY_API V BroadcastBlock(V v) { static_assert(kBlockIdx == 0, "Invalid block index"); return v; } #endif // HWY_NATIVE_BLK_INSERT_EXTRACT // ------------------------------ BroadcastLane #if (defined(HWY_NATIVE_BROADCASTLANE) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_BROADCASTLANE #undef HWY_NATIVE_BROADCASTLANE #else #define HWY_NATIVE_BROADCASTLANE #endif template HWY_API V BroadcastLane(V v) { return Broadcast(v); } #endif // HWY_NATIVE_BROADCASTLANE // ------------------------------ Slide1Up and Slide1Down #if (defined(HWY_NATIVE_SLIDE1_UP_DOWN) == defined(HWY_TARGET_TOGGLE)) #ifdef HWY_NATIVE_SLIDE1_UP_DOWN #undef HWY_NATIVE_SLIDE1_UP_DOWN #else #define HWY_NATIVE_SLIDE1_UP_DOWN #endif template HWY_API VFromD Slide1Up(D d, VFromD /*v*/) { return Zero(d); } template HWY_API VFromD Slide1Down(D d, VFromD /*v*/) { return Zero(d); } #if HWY_TARGET != HWY_SCALAR template HWY_API VFromD Slide1Up(D d, VFromD v) { return ShiftLeftLanes<1>(d, v); } template HWY_API VFromD Slide1Down(D d, VFromD v) { return ShiftRightLanes<1>(d, v); } #endif // HWY_TARGET != HWY_SCALAR #endif // HWY_NATIVE_SLIDE1_UP_DOWN // ------------------------------ SlideUpBlocks template HWY_API VFromD SlideUpBlocks(D /*d*/, VFromD v) { static_assert(kBlocks == 0, "kBlocks == 0 must be true"); return v; } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 template HWY_API VFromD SlideUpBlocks(D d, VFromD v) { static_assert(0 <= kBlocks && static_cast(kBlocks) < d.MaxBlocks(), "kBlocks must be between 0 and d.MaxBlocks() - 1"); constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); return SlideUpLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); } #endif // ------------------------------ SlideDownBlocks template HWY_API VFromD SlideDownBlocks(D /*d*/, VFromD v) { static_assert(kBlocks == 0, "kBlocks == 0 must be true"); return v; } #if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 template HWY_API VFromD SlideDownBlocks(D d, VFromD v) { static_assert(0 <= kBlocks && static_cast(kBlocks) < d.MaxBlocks(), "kBlocks must be between 0 and d.MaxBlocks() - 1"); constexpr size_t kLanesPerBlock = 16 / sizeof(TFromD); return SlideDownLanes(d, v, static_cast(kBlocks) * kLanesPerBlock); } #endif // ------------------------------ SumsOfAdjQuadAbsDiff #if 
(defined(HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kAOffset, int kBOffset, class V8, HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfAdjQuadAbsDiff(V8 a, V8 b) {
  static_assert(0 <= kAOffset && kAOffset <= 1,
                "kAOffset must be between 0 and 1");
  static_assert(0 <= kBOffset && kBOffset <= 3,
                "kBOffset must be between 0 and 3");
  using D8 = DFromV<V8>;
  const D8 d8;
  const RebindToUnsigned<decltype(d8)> du8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(du8)> du16;

  // Ensure that a is resized to a vector that has at least
  // HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the interleave and
  // CombineShiftRightBytes operations below.
#if HWY_TARGET == HWY_RVV
  // On RVV targets, need to ensure that d8_interleave.Pow2() >= 0 is true
  // to ensure that Lanes(d8_interleave) >= 16 is true.
  // Lanes(d8_interleave) >= Lanes(d8) is guaranteed to be true on RVV
  // targets as d8_interleave.Pow2() >= d8.Pow2() is true.
  constexpr int kInterleavePow2 = HWY_MAX(d8.Pow2(), 0);
  const ScalableTag<TFromD<D8>, kInterleavePow2> d8_interleave;
#elif HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128
  // On SVE targets, Lanes(d8_interleave) >= 16 and
  // Lanes(d8_interleave) >= Lanes(d8) are both already true as d8 is a SIMD
  // tag for a full u8/i8 vector on SVE.
  const D8 d8_interleave;
#else
  // On targets that use non-scalable vector types, Lanes(d8_interleave) is
  // equal to HWY_MAX(Lanes(d8), size_t{8} << kAOffset).
  constexpr size_t kInterleaveLanes =
      HWY_MAX(HWY_MAX_LANES_D(D8), size_t{8} << kAOffset);
  const FixedTag<TFromD<D8>, kInterleaveLanes> d8_interleave;
#endif

  // The ResizeBitCast operation below will resize a to a vector that has
  // at least HWY_MAX(Lanes(d8), size_t{8} << kAOffset) lanes for the
  // InterleaveLower, InterleaveUpper, and CombineShiftRightBytes operations
  // below.
  const auto a_to_interleave = ResizeBitCast(d8_interleave, a);

  const auto a_interleaved_lo =
      InterleaveLower(d8_interleave, a_to_interleave, a_to_interleave);
  const auto a_interleaved_hi =
      InterleaveUpper(d8_interleave, a_to_interleave, a_to_interleave);

  /* a01: { a[kAOffset*4+0], a[kAOffset*4+1], a[kAOffset*4+1], a[kAOffset*4+2],
            a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8] }
   */
  /* a23: { a[kAOffset*4+2], a[kAOffset*4+3], a[kAOffset*4+3], a[kAOffset*4+4],
            a[kAOffset*4+4], a[kAOffset*4+5], a[kAOffset*4+5], a[kAOffset*4+6],
            a[kAOffset*4+6], a[kAOffset*4+7], a[kAOffset*4+7], a[kAOffset*4+8],
            a[kAOffset*4+8], a[kAOffset*4+9], a[kAOffset*4+9], a[kAOffset*4+10] }
   */

  // a01 and a23 are resized back to V8 as only the first Lanes(d8) lanes of
  // the CombineShiftRightBytes results are needed for the subsequent AbsDiff
  // operations, and as a01 and a23 need to be the same vector type as b01 and
  // b23 for the AbsDiff operations below.
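  // Added note: once the a01/a23 and b01/b23 vectors below are formed,
  // 16-bit result lane i of each block equals
  //   sum_{j=0..3} |a[kAOffset*4 + i + j] - b[kBOffset*4 + j]|,
  // matching x86 MPSADBW semantics: a01/b01 contribute the j=0,1 terms
  // (via AbsDiff + SumsOf2) and a23/b23 the j=2,3 terms.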
  const V8 a01 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 1>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));
  const V8 a23 =
      ResizeBitCast(d8, CombineShiftRightBytes<kAOffset * 8 + 5>(
                            d8_interleave, a_interleaved_hi, a_interleaved_lo));

  /* b01: { b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1],
            b[kBOffset*4+0], b[kBOffset*4+1], b[kBOffset*4+0], b[kBOffset*4+1] }
   */
  /* b23: { b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3],
            b[kBOffset*4+2], b[kBOffset*4+3], b[kBOffset*4+2], b[kBOffset*4+3] }
   */
  const V8 b01 = BitCast(d8, Broadcast<kBOffset * 2>(BitCast(d16, b)));
  const V8 b23 = BitCast(d8, Broadcast<kBOffset * 2 + 1>(BitCast(d16, b)));

  const VFromD<decltype(du16)> absdiff_sum_01 =
      SumsOf2(BitCast(du8, AbsDiff(a01, b01)));
  const VFromD<decltype(du16)> absdiff_sum_23 =
      SumsOf2(BitCast(du8, AbsDiff(a23, b23)));
  return BitCast(d16, Add(absdiff_sum_01, absdiff_sum_23));
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_SUMS_OF_ADJ_QUAD_ABS_DIFF

// ------------------------------ SumsOfShuffledQuadAbsDiff

#if (defined(HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF) == \
     defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#undef HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#else
#define HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF
#endif

#if HWY_TARGET != HWY_SCALAR
template <int kIdx3, int kIdx2, int kIdx1, int kIdx0, class V8,
          HWY_IF_UI8_D(DFromV<V8>)>
HWY_API Vec<RepartitionToWide<DFromV<V8>>> SumsOfShuffledQuadAbsDiff(V8 a,
                                                                     V8 b) {
  static_assert(0 <= kIdx0 && kIdx0 <= 3, "kIdx0 must be between 0 and 3");
  static_assert(0 <= kIdx1 && kIdx1 <= 3, "kIdx1 must be between 0 and 3");
  static_assert(0 <= kIdx2 && kIdx2 <= 3, "kIdx2 must be between 0 and 3");
  static_assert(0 <= kIdx3 && kIdx3 <= 3, "kIdx3 must be between 0 and 3");

#if HWY_TARGET == HWY_RVV
  // On RVV, ensure that both vA and vB have a LMUL of at least 1/2 so that
  // both vA and vB can be bitcasted to a u32 vector.
  const detail::AdjustSimdTagToMinVecPow2<RepartitionToWideX2<DFromV<V8>>> d32;
  const RepartitionToNarrow<decltype(d32)> d16;
  const RepartitionToNarrow<decltype(d16)> d8;
  const auto vA = ResizeBitCast(d8, a);
  const auto vB = ResizeBitCast(d8, b);
#else
  const DFromV<V8> d8;
  const RepartitionToWide<decltype(d8)> d16;
  const RepartitionToWide<decltype(d16)> d32;
  const auto vA = a;
  const auto vB = b;
#endif

  const RebindToUnsigned<decltype(d8)> du8;

  const auto a_shuf =
      Per4LaneBlockShuffle<kIdx3, kIdx2, kIdx1, kIdx0>(BitCast(d32, vA));
  /* a0123_2345: { a_shuf[0], a_shuf[1], a_shuf[2], a_shuf[3],
                   a_shuf[2], a_shuf[3], a_shuf[4], a_shuf[5],
                   a_shuf[8], a_shuf[9], a_shuf[10], a_shuf[11],
                   a_shuf[10], a_shuf[11], a_shuf[12], a_shuf[13] } */
  /* a1234_3456: { a_shuf[1], a_shuf[2], a_shuf[3], a_shuf[4],
                   a_shuf[3], a_shuf[4], a_shuf[5], a_shuf[6],
                   a_shuf[9], a_shuf[10], a_shuf[11], a_shuf[12],
                   a_shuf[11], a_shuf[12], a_shuf[13], a_shuf[14] } */
#if HWY_HAVE_SCALABLE || HWY_TARGET == HWY_SVE_256 || \
    HWY_TARGET == HWY_SVE2_128
  // On RVV/SVE targets, use Slide1Up/Slide1Down instead of
  // ShiftLeftBytes/ShiftRightBytes to avoid unnecessary zeroing out of any
  // lanes that are shifted into an adjacent 16-byte block, as any lanes that
  // are shifted into an adjacent 16-byte block by Slide1Up/Slide1Down will be
  // replaced by the OddEven operation.
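  // Added note: viewing a_shuf as u32 lanes built from bytes {s0, s1, s2, ...},
  // OddEven keeps the even u32 lanes of a_shuf and takes the odd u32 lanes
  // from a copy shifted up by one u16 (two bytes), so a_0123_2345 holds the
  // byte quads starting at offsets 0 and 2 of each 8-byte half; a_1234_3456
  // likewise combines copies shifted by one byte to get offsets 1 and 3.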
  const auto a_0123_2345 = BitCast(
      d8, OddEven(BitCast(d32, Slide1Up(d16, BitCast(d16, a_shuf))), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(BitCast(d32, Slide1Up(d8, BitCast(d8, a_shuf))),
                          BitCast(d32, Slide1Down(d8, BitCast(d8, a_shuf)))));
#else
  const auto a_0123_2345 =
      BitCast(d8, OddEven(ShiftLeftBytes<2>(d32, a_shuf), a_shuf));
  const auto a_1234_3456 =
      BitCast(d8, OddEven(ShiftLeftBytes<1>(d32, a_shuf),
                          ShiftRightBytes<1>(d32, a_shuf)));
#endif

  auto even_sums = SumsOf4(BitCast(du8, AbsDiff(a_0123_2345, vB)));
  auto odd_sums = SumsOf4(BitCast(du8, AbsDiff(a_1234_3456, vB)));

#if HWY_IS_LITTLE_ENDIAN
  odd_sums = ShiftLeft<16>(odd_sums);
#else
  even_sums = ShiftLeft<16>(even_sums);
#endif

  const auto sums = OddEven(BitCast(d16, odd_sums), BitCast(d16, even_sums));

#if HWY_TARGET == HWY_RVV
  return ResizeBitCast(RepartitionToWide<DFromV<V8>>(), sums);
#else
  return sums;
#endif
}
#endif  // HWY_TARGET != HWY_SCALAR
#endif  // HWY_NATIVE_SUMS_OF_SHUFFLED_QUAD_ABS_DIFF

// ================================================== Operator wrapper

// SVE* and RVV currently cannot define operators and have already defined
// (only) the corresponding functions such as Add.
#if (defined(HWY_NATIVE_OPERATOR_REPLACEMENTS) == defined(HWY_TARGET_TOGGLE))
#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
#else
#define HWY_NATIVE_OPERATOR_REPLACEMENTS
#endif

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}
template <class V>
HWY_API V Mod(V a, V b) {
  return a % b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

#endif  // HWY_NATIVE_OPERATOR_REPLACEMENTS

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
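// Example (added, illustrative only): because Add/Mul/etc. are defined above
// for targets whose vector types support operators, and natively elsewhere,
// generic code stays portable to SVE/RVV by calling the named functions
// instead of the operators:
//
//   template <class D>
//   VFromD<D> MulAddConst(D d, VFromD<D> x) {
//     const auto k2 = Set(d, static_cast<TFromD<D>>(2));
//     const auto k1 = Set(d, static_cast<TFromD<D>>(1));
//     return Add(Mul(x, k2), k1);  // rather than x * 2 + 1
//   }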