// Copyright 2021 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Normal include guard for target-independent parts #ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ #include #include // std::sort, std::min, std::max #include // std::less, std::greater #include #include "hwy/base.h" #include "hwy/contrib/sort/vqsort.h" #include "hwy/print.h" // Third-party algorithms #define HAVE_AVX2SORT 0 #define HAVE_IPS4O 0 // When enabling, consider changing max_threads (required for Table 1a) #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1) #define HAVE_PDQSORT 0 #define HAVE_SORT512 0 #define HAVE_VXSORT 0 #if HWY_ARCH_X86 #define HAVE_INTEL 0 #else #define HAVE_INTEL 0 #endif #if HAVE_PARALLEL_IPS4O #include // NOLINT #endif #if HAVE_AVX2SORT HWY_PUSH_ATTRIBUTES("avx2,avx") #include "avx2sort.h" //NOLINT HWY_POP_ATTRIBUTES #endif #if HAVE_IPS4O || HAVE_PARALLEL_IPS4O #include "third_party/ips4o/include/ips4o.hpp" #include "third_party/ips4o/include/ips4o/thread_pool.hpp" #endif #if HAVE_PDQSORT #include "third_party/boost/allowed/sort/sort.hpp" #endif #if HAVE_SORT512 #include "sort512.h" //NOLINT #endif // vxsort is difficult to compile for multiple targets because it also uses // .cpp files, and we'd also have to #undef its include guards. Instead, compile // only for AVX2 or AVX3 depending on this macro. #define VXSORT_AVX3 1 #if HAVE_VXSORT // inlined from vxsort_targets_enable_avx512 (must close before end of header) #ifdef __GNUC__ #ifdef __clang__ #if VXSORT_AVX3 #pragma clang attribute push(__attribute__((target("avx512f,avx512dq"))), \ apply_to = any(function)) #else #pragma clang attribute push(__attribute__((target("avx2"))), \ apply_to = any(function)) #endif // VXSORT_AVX3 #else #pragma GCC push_options #if VXSORT_AVX3 #pragma GCC target("avx512f,avx512dq") #else #pragma GCC target("avx2") #endif // VXSORT_AVX3 #endif #endif #if VXSORT_AVX3 #include "vxsort/machine_traits.avx512.h" #else #include "vxsort/machine_traits.avx2.h" #endif // VXSORT_AVX3 #include "vxsort/vxsort.h" #ifdef __GNUC__ #ifdef __clang__ #pragma clang attribute pop #else #pragma GCC pop_options #endif #endif #endif // HAVE_VXSORT namespace hwy { enum class Dist { kUniform8, kUniform16, kUniform32 }; static inline std::vector AllDist() { return {/*Dist::kUniform8, Dist::kUniform16,*/ Dist::kUniform32}; } static inline const char* DistName(Dist dist) { switch (dist) { case Dist::kUniform8: return "uniform8"; case Dist::kUniform16: return "uniform16"; case Dist::kUniform32: return "uniform32"; } return "unreachable"; } template class InputStats { public: void Notify(T value) { min_ = std::min(min_, value); max_ = std::max(max_, value); // Converting to integer would truncate floats, multiplying to save digits // risks overflow especially when casting, so instead take the sum of the // bit representations as the checksum. uint64_t bits = 0; static_assert(sizeof(T) <= 8, "Expected a built-in type"); CopyBytes(&value, &bits); // not same size sum_ += bits; count_ += 1; } bool operator==(const InputStats& other) const { char type_name[100]; detail::TypeName(hwy::detail::MakeTypeInfo(), 1, type_name); if (count_ != other.count_) { HWY_ABORT("Sort %s: count %d vs %d\n", type_name, static_cast(count_), static_cast(other.count_)); } if (min_ != other.min_ || max_ != other.max_) { HWY_ABORT("Sort %s: minmax %f/%f vs %f/%f\n", type_name, static_cast(min_), static_cast(max_), static_cast(other.min_), static_cast(other.max_)); } // Sum helps detect duplicated/lost values if (sum_ != other.sum_) { HWY_ABORT("Sort %s: Sum mismatch %g %g; min %g max %g\n", type_name, static_cast(sum_), static_cast(other.sum_), static_cast(min_), static_cast(max_)); } return true; } private: T min_ = hwy::HighestValue(); T max_ = hwy::LowestValue(); uint64_t sum_ = 0; size_t count_ = 0; }; enum class Algo { #if HAVE_INTEL kIntel, #endif #if HAVE_AVX2SORT kSEA, #endif #if HAVE_IPS4O kIPS4O, #endif #if HAVE_PARALLEL_IPS4O kParallelIPS4O, #endif #if HAVE_PDQSORT kPDQ, #endif #if HAVE_SORT512 kSort512, #endif #if HAVE_VXSORT kVXSort, #endif kStd, kVQSort, kHeap, }; static inline const char* AlgoName(Algo algo) { switch (algo) { #if HAVE_INTEL case Algo::kIntel: return "intel"; #endif #if HAVE_AVX2SORT case Algo::kSEA: return "sea"; #endif #if HAVE_IPS4O case Algo::kIPS4O: return "ips4o"; #endif #if HAVE_PARALLEL_IPS4O case Algo::kParallelIPS4O: return "par_ips4o"; #endif #if HAVE_PDQSORT case Algo::kPDQ: return "pdq"; #endif #if HAVE_SORT512 case Algo::kSort512: return "sort512"; #endif #if HAVE_VXSORT case Algo::kVXSort: return "vxsort"; #endif case Algo::kStd: return "std"; case Algo::kVQSort: return "vq"; case Algo::kHeap: return "heap"; } return "unreachable"; } } // namespace hwy #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ // Per-target #if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == defined(HWY_TARGET_TOGGLE) #ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE #undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE #else #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE #endif #include "hwy/contrib/sort/traits-inl.h" #include "hwy/contrib/sort/traits128-inl.h" #include "hwy/contrib/sort/vqsort-inl.h" // HeapSort #include "hwy/aligned_allocator.h" HWY_BEFORE_NAMESPACE(); // Requires target pragma set by HWY_BEFORE_NAMESPACE #if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 // #include "avx512-16bit-qsort.hpp" // requires vbmi2 #include "avx512-32bit-qsort.hpp" #include "avx512-64bit-qsort.hpp" #endif namespace hwy { namespace HWY_NAMESPACE { #if HAVE_INTEL || HAVE_VXSORT // only supports ascending order template using OtherOrder = detail::OrderAscending; #else template using OtherOrder = detail::OrderDescending; #endif class Xorshift128Plus { static HWY_INLINE uint64_t SplitMix64(uint64_t z) { z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; return z ^ (z >> 31); } public: // Generates two vectors of 64-bit seeds via SplitMix64 and stores into // `seeds`. Generating these afresh in each ChoosePivot is too expensive. template static void GenerateSeeds(DU64 du64, TFromD* HWY_RESTRICT seeds) { seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull); for (size_t i = 1; i < 2 * Lanes(du64); ++i) { seeds[i] = SplitMix64(seeds[i - 1]); } } // Need to pass in the state because vector cannot be class members. template static VU64 RandomBits(VU64& state0, VU64& state1) { VU64 s1 = state0; VU64 s0 = state1; const VU64 bits = Add(s1, s0); state0 = s0; s1 = Xor(s1, ShiftLeft<23>(s1)); state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); return bits; } }; template Vec RandomValues(D d, VU64& s0, VU64& s1, const VU64 mask) { const VU64 bits = Xorshift128Plus::RandomBits(s0, s1); return BitCast(d, And(bits, mask)); } // It is important to avoid denormals, which are flushed to zero by SIMD but not // scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD. template Vec RandomValues(DF df, VU64& s0, VU64& s1, const VU64 mask) { using TF = TFromD; const RebindToUnsigned du; using VU = Vec; const VU64 bits64 = And(Xorshift128Plus::RandomBits(s0, s1), mask); #if HWY_TARGET == HWY_SCALAR // Cannot repartition u64 to smaller types using TU = MakeUnsigned; const VU bits = Set(du, static_cast(GetLane(bits64) & LimitsMax())); #else const VU bits = BitCast(du, bits64); #endif // Avoid NaN/denormal by only generating values in [1, 2), i.e. random // mantissas with the exponent taken from the representation of 1.0. const VU k1 = BitCast(du, Set(df, TF{1.0})); const VU mantissa_mask = Set(du, MantissaMask()); const VU representation = OrAnd(k1, bits, mantissa_mask); return BitCast(df, representation); } template Vec MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) { switch (sizeof_t) { case 2: return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull : 0xFFFFFFFFFFFFFFFFull); case 4: return Set(du64, (dist == Dist::kUniform8) ? 0x000000FF000000FFull : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull : 0xFFFFFFFFFFFFFFFFull); case 8: return Set(du64, (dist == Dist::kUniform8) ? 0x00000000000000FFull : (dist == Dist::kUniform16) ? 0x000000000000FFFFull : 0x00000000FFFFFFFFull); default: HWY_ABORT("Logic error"); return Zero(du64); } } template InputStats GenerateInput(const Dist dist, T* v, size_t num) { SortTag du64; using VU64 = Vec; const size_t N64 = Lanes(du64); auto seeds = hwy::AllocateAligned(2 * N64); Xorshift128Plus::GenerateSeeds(du64, seeds.get()); VU64 s0 = Load(du64, seeds.get()); VU64 s1 = Load(du64, seeds.get() + N64); #if HWY_TARGET == HWY_SCALAR const Sisd d; #else const Repartition d; #endif using V = Vec; const size_t N = Lanes(d); const VU64 mask = MaskForDist(du64, dist, sizeof(T)); auto buf = hwy::AllocateAligned(N); size_t i = 0; for (; i + N <= num; i += N) { const V values = RandomValues(d, s0, s1, mask); StoreU(values, d, v + i); } if (i < num) { const V values = RandomValues(d, s0, s1, mask); StoreU(values, d, buf.get()); CopyBytes(buf.get(), v + i, (num - i) * sizeof(T)); } InputStats input_stats; for (size_t i = 0; i < num; ++i) { input_stats.Notify(v[i]); } return input_stats; } struct SharedState { #if HAVE_PARALLEL_IPS4O const unsigned max_threads = hwy::LimitsMax(); // 16 for Table 1a ips4o::StdThreadPool pool{static_cast( HWY_MIN(max_threads, std::thread::hardware_concurrency() / 2))}; #endif }; // Bridge from keys (passed to Run) to lanes as expected by HeapSort. For // non-128-bit keys they are the same: template void CallHeapSort(KeyType* HWY_RESTRICT keys, const size_t num_keys) { using detail::SharedTraits; using detail::TraitsLane; if (Order().IsAscending()) { const SharedTraits>> st; return detail::HeapSort(st, keys, num_keys); } else { const SharedTraits>> st; return detail::HeapSort(st, keys, num_keys); } } #if VQSORT_ENABLED template void CallHeapSort(hwy::uint128_t* HWY_RESTRICT keys, const size_t num_keys) { using detail::SharedTraits; using detail::Traits128; uint64_t* lanes = reinterpret_cast(keys); const size_t num_lanes = num_keys * 2; if (Order().IsAscending()) { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } else { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } } template void CallHeapSort(K64V64* HWY_RESTRICT keys, const size_t num_keys) { using detail::SharedTraits; using detail::Traits128; uint64_t* lanes = reinterpret_cast(keys); const size_t num_lanes = num_keys * 2; if (Order().IsAscending()) { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } else { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } } template void CallHeapSort(K32V32* HWY_RESTRICT keys, const size_t num_keys) { using detail::SharedTraits; using detail::TraitsLane; uint64_t* lanes = reinterpret_cast(keys); const size_t num_lanes = num_keys; if (Order().IsAscending()) { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } else { const SharedTraits> st; return detail::HeapSort(st, lanes, num_lanes); } } #endif // VQSORT_ENABLED template void Run(Algo algo, KeyType* HWY_RESTRICT inout, size_t num, SharedState& shared, size_t /*thread*/) { const std::less less; const std::greater greater; #if !HAVE_PARALLEL_IPS4O (void)shared; #endif switch (algo) { #if HAVE_INTEL && HWY_TARGET <= HWY_AVX3 case Algo::kIntel: return avx512_qsort(inout, static_cast(num)); #endif #if HAVE_AVX2SORT case Algo::kSEA: return avx2::quicksort(inout, static_cast(num)); #endif #if HAVE_IPS4O case Algo::kIPS4O: if (Order().IsAscending()) { return ips4o::sort(inout, inout + num, less); } else { return ips4o::sort(inout, inout + num, greater); } #endif #if HAVE_PARALLEL_IPS4O case Algo::kParallelIPS4O: if (Order().IsAscending()) { return ips4o::parallel::sort(inout, inout + num, less, shared.pool); } else { return ips4o::parallel::sort(inout, inout + num, greater, shared.pool); } #endif #if HAVE_SORT512 case Algo::kSort512: HWY_ABORT("not supported"); // return Sort512::Sort(inout, num); #endif #if HAVE_PDQSORT case Algo::kPDQ: if (Order().IsAscending()) { return boost::sort::pdqsort_branchless(inout, inout + num, less); } else { return boost::sort::pdqsort_branchless(inout, inout + num, greater); } #endif #if HAVE_VXSORT case Algo::kVXSort: { #if (VXSORT_AVX3 && HWY_TARGET != HWY_AVX3) || \ (!VXSORT_AVX3 && HWY_TARGET != HWY_AVX2) fprintf(stderr, "Do not call for target %s\n", hwy::TargetName(HWY_TARGET)); return; #else #if VXSORT_AVX3 vxsort::vxsort vx; #else vxsort::vxsort vx; #endif if (Order().IsAscending()) { return vx.sort(inout, inout + num - 1); } else { fprintf(stderr, "Skipping VX - does not support descending order\n"); return; } #endif // enabled for this target } #endif // HAVE_VXSORT case Algo::kStd: if (Order().IsAscending()) { return std::sort(inout, inout + num, less); } else { return std::sort(inout, inout + num, greater); } case Algo::kVQSort: return VQSort(inout, num, Order()); case Algo::kHeap: return CallHeapSort(inout, num); default: HWY_ABORT("Not implemented"); } } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #endif // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE