// Copyright 2021 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Per-target #if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \ defined(HWY_TARGET_TOGGLE) #ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE #undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE #else #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE #endif #include #include #include "hwy/contrib/sort/order.h" // SortDescending #include "hwy/contrib/sort/shared-inl.h" // SortConstants #include "hwy/highway.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { namespace detail { // Base class of both KeyLane (with or without VQSORT_ENABLED) template struct KeyLaneBase { static constexpr bool Is128() { return false; } constexpr size_t LanesPerKey() const { return 1; } // What type bench_sort should allocate for generating inputs. using LaneType = LaneTypeArg; // What type to pass to VQSort. using KeyType = KeyTypeArg; const char* KeyString() const { return IsSame() ? "f16" : IsSame() ? "f32" : IsSame() ? "f64" : IsSame() ? "i16" : IsSame() ? "i32" : IsSame() ? "i64" : IsSame() ? "u32" : IsSame() ? "u32" : IsSame() ? "u64" : IsSame() ? "k+v=64" : "?"; } }; // Wrapper functions so we can specialize for floats - infinity trumps // HighestValue (the normal value with the largest magnitude). Must be outside // Order* classes to enable SFINAE. LargestSortValue is used even if // !VQSORT_ENABLED. template Vec LargestSortValue(D d) { return Inf(d); } template Vec LargestSortValue(D d) { return Set(d, hwy::HighestValue>()); } template Vec SmallestSortValue(D d) { return Neg(Inf(d)); } template Vec SmallestSortValue(D d) { return Set(d, hwy::LowestValue>()); } // Returns the next distinct larger value unless already +inf. template Vec LargerSortValue(D d, Vec v) { HWY_DASSERT(AllFalse(d, IsNaN(v))); // we replaced all NaN with LastValue. using T = TFromD; const RebindToUnsigned du; using VU = Vec; using TU = TFromD; const VU vu = BitCast(du, Abs(v)); // The direction depends on the original sign. Integer comparison is cheaper // than float comparison and treats -0 as 0 (so we return +epsilon). const Mask was_pos = Le(BitCast(du, v), SignBit(du)); // If positive, add 1, else -1. const VU add = IfThenElse(was_pos, Set(du, 1u), Set(du, LimitsMax())); // Prev/next integer is the prev/next value, even if mantissa under/overflows. v = BitCast(d, Add(vu, add)); // But we may have overflowed into inf or NaN; replace with inf if positive, // but the largest (later negated!) value if the input was -inf. const Mask was_pos_f = RebindMask(d, was_pos); v = IfThenElse(IsFinite(v), v, IfThenElse(was_pos_f, Inf(d), Set(d, HighestValue()))); // Restore the original sign - not via CopySignToAbs because we used a mask. return IfThenElse(was_pos_f, v, Neg(v)); } // Returns the next distinct smaller value unless already -inf. template Vec SmallerSortValue(D d, Vec v) { HWY_DASSERT(AllFalse(d, IsNaN(v))); // we replaced all NaN with LastValue. using T = TFromD; const RebindToUnsigned du; using VU = Vec; using TU = TFromD; const VU vu = BitCast(du, Abs(v)); // The direction depends on the original sign. Float comparison because we // want to treat 0 as -0 so we return -epsilon. const Mask was_pos = Gt(v, Zero(d)); // If positive, add -1, else 1. const VU add = IfThenElse(RebindMask(du, was_pos), Set(du, LimitsMax()), Set(du, 1)); // Prev/next integer is the prev/next value, even if mantissa under/overflows. v = BitCast(d, Add(vu, add)); // But we may have overflowed into inf or NaN; replace with +inf (which will // later be negated) if negative, but the largest value if the input was +inf. v = IfThenElse(IsFinite(v), v, IfThenElse(was_pos, Set(d, HighestValue()), Inf(d))); // Restore the original sign - not via CopySignToAbs because we used a mask. return IfThenElse(was_pos, v, Neg(v)); } template Vec LargerSortValue(D d, Vec v) { return Add(v, Set(d, TFromD{1})); } template Vec SmallerSortValue(D d, Vec v) { return Sub(v, Set(d, TFromD{1})); } #if VQSORT_ENABLED || HWY_IDE // Highway does not provide a lane type for 128-bit keys, so we use uint64_t // along with an abstraction layer for single-lane vs. lane-pair, which is // independent of the order. template struct KeyLane : public KeyLaneBase { // For HeapSort HWY_INLINE void Swap(LaneType* a, LaneType* b) const { const LaneType temp = *a; *a = *b; *b = temp; } template HWY_INLINE V CompressKeys(V keys, M mask) const { return CompressNot(keys, mask); } // Broadcasts one key into a vector template HWY_INLINE Vec SetKey(D d, const LaneType* key) const { return Set(d, *key); } template HWY_INLINE Mask EqualKeys(D /*tag*/, Vec a, Vec b) const { return Eq(a, b); } template HWY_INLINE Mask NotEqualKeys(D /*tag*/, Vec a, Vec b) const { return Ne(a, b); } // For keys=lanes, any difference counts. template HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec diff) const { // Must avoid floating-point comparisons (for -0) const RebindToUnsigned du; return AllTrue(du, Eq(BitCast(du, diff), Zero(du))); } HWY_INLINE bool Equal1(const LaneType* a, const LaneType* b) const { return *a == *b; } template HWY_INLINE Vec ReverseKeys(D d, Vec v) const { return Reverse(d, v); } template HWY_INLINE Vec ReverseKeys2(D d, Vec v) const { return Reverse2(d, v); } template HWY_INLINE Vec ReverseKeys4(D d, Vec v) const { return Reverse4(d, v); } template HWY_INLINE Vec ReverseKeys8(D d, Vec v) const { return Reverse8(d, v); } template HWY_INLINE Vec ReverseKeys16(D d, Vec v) const { static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit"); return ReverseKeys(d, v); } template HWY_INLINE V OddEvenKeys(const V odd, const V even) const { return OddEven(odd, even); } template HWY_INLINE Vec SwapAdjacentPairs(D d, const Vec v) const { const Repartition du32; return BitCast(d, Shuffle2301(BitCast(du32, v))); } template HWY_INLINE Vec SwapAdjacentPairs(D /* tag */, const Vec v) const { return Shuffle1032(v); } template HWY_INLINE Vec SwapAdjacentPairs(D /* tag */, const Vec v) const { return SwapAdjacentBlocks(v); } template HWY_INLINE Vec SwapAdjacentQuads(D d, const Vec v) const { #if HWY_HAVE_FLOAT64 // in case D is float32 const RepartitionToWide dw; #else const RepartitionToWide> dw; #endif return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v))); } template HWY_INLINE Vec SwapAdjacentQuads(D d, const Vec v) const { // Assumes max vector size = 512 return ConcatLowerUpper(d, v, v); } template HWY_INLINE Vec OddEvenPairs(D d, const Vec odd, const Vec even) const { #if HWY_HAVE_FLOAT64 // in case D is float32 const RepartitionToWide dw; #else const RepartitionToWide> dw; #endif return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even))); } template HWY_INLINE Vec OddEvenPairs(D /* tag */, Vec odd, Vec even) const { return OddEvenBlocks(odd, even); } template HWY_INLINE Vec OddEvenQuads(D d, Vec odd, Vec even) const { #if HWY_HAVE_FLOAT64 // in case D is float32 const RepartitionToWide dw; #else const RepartitionToWide> dw; #endif return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even))); } template HWY_INLINE Vec OddEvenQuads(D d, Vec odd, Vec even) const { return ConcatUpperLower(d, odd, even); } }; // Anything order-related depends on the key traits *and* the order (see // FirstOfLanes). We cannot implement just one Compare function because Lt128 // only compiles if the lane type is u64. Thus we need either overloaded // functions with a tag type, class specializations, or separate classes. // We avoid overloaded functions because we want all functions to be callable // from a SortTraits without per-function wrappers. Specializing would work, but // we are anyway going to specialize at a higher level. template struct OrderAscending : public KeyLane { // False indicates the entire key (i.e. lane) should be compared. KV stands // for key-value. static constexpr bool IsKV() { return false; } using Order = SortAscending; using OrderForSortingNetwork = OrderAscending; HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { return Lt(a, b); } // Two halves of Sort2, used in ScanMinMax. template HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { return Min(a, b); } template HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { return Max(a, b); } template HWY_INLINE Vec FirstOfLanes(D d, Vec v, T* HWY_RESTRICT /* buf */) const { return MinOfLanes(d, v); } template HWY_INLINE Vec LastOfLanes(D d, Vec v, T* HWY_RESTRICT /* buf */) const { return MaxOfLanes(d, v); } template HWY_INLINE Vec FirstValue(D d) const { return SmallestSortValue(d); } template HWY_INLINE Vec LastValue(D d) const { return LargestSortValue(d); } template HWY_INLINE Vec PrevValue(D d, Vec v) const { return SmallerSortValue(d, v); } }; template struct OrderDescending : public KeyLane { // False indicates the entire key (i.e. lane) should be compared. KV stands // for key-value. static constexpr bool IsKV() { return false; } using Order = SortDescending; using OrderForSortingNetwork = OrderDescending; HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { return Lt(b, a); } template HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { return Max(a, b); } template HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { return Min(a, b); } template HWY_INLINE Vec FirstOfLanes(D d, Vec v, T* HWY_RESTRICT /* buf */) const { return MaxOfLanes(d, v); } template HWY_INLINE Vec LastOfLanes(D d, Vec v, T* HWY_RESTRICT /* buf */) const { return MinOfLanes(d, v); } template HWY_INLINE Vec FirstValue(D d) const { return LargestSortValue(d); } template HWY_INLINE Vec LastValue(D d) const { return SmallestSortValue(d); } template HWY_INLINE Vec PrevValue(D d, Vec v) const { return LargerSortValue(d, v); } }; struct KeyValue64 : public KeyLane { // True indicates only part of the key (i.e. lane) should be compared. KV // stands for key-value. static constexpr bool IsKV() { return true; } template HWY_INLINE Mask EqualKeys(D /*tag*/, Vec a, Vec b) const { return Eq(ShiftRight<32>(a), ShiftRight<32>(b)); } template HWY_INLINE Mask NotEqualKeys(D /*tag*/, Vec a, Vec b) const { return Ne(ShiftRight<32>(a), ShiftRight<32>(b)); } HWY_INLINE bool Equal1(const uint64_t* a, const uint64_t* b) const { return (*a >> 32) == (*b >> 32); } // Only count differences in the actual key, not the value. template HWY_INLINE bool NoKeyDifference(D /*tag*/, Vec diff) const { // Must avoid floating-point comparisons (for -0) const RebindToUnsigned du; const Vec zero = Zero(du); const Vec keys = ShiftRight<32>(diff); // clear values return AllTrue(du, Eq(BitCast(du, keys), zero)); } }; struct OrderAscendingKV64 : public KeyValue64 { using Order = SortAscending; using OrderForSortingNetwork = OrderAscending; HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { return (*a >> 32) < (*b >> 32); } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { return Lt(ShiftRight<32>(a), ShiftRight<32>(b)); } // Not required to be stable (preserving the order of equivalent keys), so // we can include the value in the comparison. template HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { return Min(a, b); } template HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { return Max(a, b); } template HWY_INLINE Vec FirstOfLanes(D d, Vec v, uint64_t* HWY_RESTRICT /* buf */) const { return MinOfLanes(d, v); } template HWY_INLINE Vec LastOfLanes(D d, Vec v, uint64_t* HWY_RESTRICT /* buf */) const { return MaxOfLanes(d, v); } // Same as for regular lanes. template HWY_INLINE Vec FirstValue(D d) const { return Set(d, hwy::LowestValue>()); } template HWY_INLINE Vec LastValue(D d) const { return Set(d, hwy::HighestValue>()); } template HWY_INLINE Vec PrevValue(D d, Vec v) const { return Sub(v, Set(d, uint64_t{1} << 32)); } }; struct OrderDescendingKV64 : public KeyValue64 { using Order = SortDescending; using OrderForSortingNetwork = OrderDescending; HWY_INLINE bool Compare1(const LaneType* a, const LaneType* b) const { return (*b >> 32) < (*a >> 32); } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) const { return Lt(ShiftRight<32>(b), ShiftRight<32>(a)); } // Not required to be stable (preserving the order of equivalent keys), so // we can include the value in the comparison. template HWY_INLINE Vec First(D /* tag */, const Vec a, const Vec b) const { return Max(a, b); } template HWY_INLINE Vec Last(D /* tag */, const Vec a, const Vec b) const { return Min(a, b); } template HWY_INLINE Vec FirstOfLanes(D d, Vec v, uint64_t* HWY_RESTRICT /* buf */) const { return MaxOfLanes(d, v); } template HWY_INLINE Vec LastOfLanes(D d, Vec v, uint64_t* HWY_RESTRICT /* buf */) const { return MinOfLanes(d, v); } template HWY_INLINE Vec FirstValue(D d) const { return Set(d, hwy::HighestValue>()); } template HWY_INLINE Vec LastValue(D d) const { return Set(d, hwy::LowestValue>()); } template HWY_INLINE Vec PrevValue(D d, Vec v) const { return Add(v, Set(d, uint64_t{1} << 32)); } }; // Shared code that depends on Order. template struct TraitsLane : public Base { using TraitsForSortingNetwork = TraitsLane; // For each lane i: replaces a[i] with the first and b[i] with the second // according to Base. // Corresponds to a conditional swap, which is one "node" of a sorting // network. Min/Max are cheaper than compare + blend at least for integers. template HWY_INLINE void Sort2(D d, Vec& a, Vec& b) const { const Base* base = static_cast(this); const Vec a_copy = a; // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4 // instructions. We can reduce it to a compare + 2 IfThenElse. #if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 if (sizeof(TFromD) == 8) { const Mask cmp = base->Compare(d, a, b); a = IfThenElse(cmp, a, b); b = IfThenElse(cmp, b, a_copy); return; } #endif a = base->First(d, a, b); b = base->Last(d, a_copy, b); } // Conditionally swaps even-numbered lanes with their odd-numbered neighbor. template HWY_INLINE Vec SortPairsDistance1(D d, Vec v) const { const Base* base = static_cast(this); Vec swapped = base->ReverseKeys2(d, v); // Further to the above optimization, Sort2+OddEvenKeys compile to four // instructions; we can save one by combining two blends. #if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 const Vec cmp = VecFromMask(d, base->Compare(d, v, swapped)); return IfVecThenElse(DupOdd(cmp), swapped, v); #else Sort2(d, v, swapped); return base->OddEvenKeys(swapped, v); #endif } // (See above - we use Sort2 for non-64-bit types.) template HWY_INLINE Vec SortPairsDistance1(D d, Vec v) const { const Base* base = static_cast(this); Vec swapped = base->ReverseKeys2(d, v); Sort2(d, v, swapped); return base->OddEvenKeys(swapped, v); } // Swaps with the vector formed by reversing contiguous groups of 4 keys. template HWY_INLINE Vec SortPairsReverse4(D d, Vec v) const { const Base* base = static_cast(this); Vec swapped = base->ReverseKeys4(d, v); Sort2(d, v, swapped); return base->OddEvenPairs(d, swapped, v); } // Conditionally swaps lane 0 with 4, 1 with 5 etc. template HWY_INLINE Vec SortPairsDistance4(D d, Vec v) const { const Base* base = static_cast(this); Vec swapped = base->SwapAdjacentQuads(d, v); // Only used in Merge16, so this will not be used on AVX2 (which only has 4 // u64 lanes), so skip the above optimization for 64-bit AVX2. Sort2(d, v, swapped); return base->OddEvenQuads(d, swapped, v); } }; #else template struct OrderAscending : public KeyLaneBase { using Order = SortAscending; HWY_INLINE bool Compare1(const T* a, const T* b) const { return *a < *b; } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) { return Lt(a, b); } template HWY_INLINE Vec LastValue(D d) const { return LargestSortValue(d); } }; template struct OrderDescending : public KeyLaneBase { using Order = SortDescending; HWY_INLINE bool Compare1(const T* a, const T* b) const { return *b < *a; } template HWY_INLINE Mask Compare(D /* tag */, Vec a, Vec b) { return Lt(b, a); } template HWY_INLINE Vec LastValue(D d) const { return SmallestSortValue(d); } }; template struct TraitsLane : public Order { // For HeapSort template // MSVC doesn't find typename Order::LaneType. HWY_INLINE void Swap(T* a, T* b) const { const T temp = *a; *a = *b; *b = temp; } template HWY_INLINE Vec SetKey(D d, const TFromD* key) const { return Set(d, *key); } }; #endif // VQSORT_ENABLED } // namespace detail // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #endif // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE