// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "hwy/targets.h" #include #include #include // abort / exit #include "hwy/highway.h" #include "hwy/per_target.h" // VectorBytes #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN #include "sanitizer/common_interface_defs.h" // __sanitizer_print_stack_trace #endif #if HWY_ARCH_X86 #include #if HWY_COMPILER_MSVC #include #else // !HWY_COMPILER_MSVC #include #endif // HWY_COMPILER_MSVC #elif (HWY_ARCH_ARM || HWY_ARCH_PPC || HWY_ARCH_S390X) && HWY_OS_LINUX // sys/auxv.h does not always include asm/hwcap.h, or define HWCAP*, hence we // still include this directly. See #1199. #ifndef TOOLCHAIN_MISS_ASM_HWCAP_H #include #endif #ifndef TOOLCHAIN_MISS_SYS_AUXV_H #include #endif #endif // HWY_ARCH_* namespace hwy { namespace { // When running tests, this value can be set to the mocked supported targets // mask. Only written to from a single thread before the test starts. int64_t supported_targets_for_test_ = 0; // Mask of targets disabled at runtime with DisableTargets. int64_t supported_mask_ = LimitsMax(); #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH namespace x86 { // Calls CPUID instruction with eax=level and ecx=count and returns the result // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). HWY_INLINE void Cpuid(const uint32_t level, const uint32_t count, uint32_t* HWY_RESTRICT abcd) { #if HWY_COMPILER_MSVC int regs[4]; __cpuidex(regs, level, count); for (int i = 0; i < 4; ++i) { abcd[i] = regs[i]; } #else // HWY_COMPILER_MSVC uint32_t a; uint32_t b; uint32_t c; uint32_t d; __cpuid_count(level, count, a, b, c, d); abcd[0] = a; abcd[1] = b; abcd[2] = c; abcd[3] = d; #endif // HWY_COMPILER_MSVC } HWY_INLINE bool IsBitSet(const uint32_t reg, const int index) { return (reg & (1U << index)) != 0; } // Returns the lower 32 bits of extended control register 0. // Requires CPU support for "OSXSAVE" (see below). uint32_t ReadXCR0() { #if HWY_COMPILER_MSVC return static_cast(_xgetbv(0)); #else // HWY_COMPILER_MSVC uint32_t xcr0, xcr0_high; const uint32_t index = 0; asm volatile(".byte 0x0F, 0x01, 0xD0" : "=a"(xcr0), "=d"(xcr0_high) : "c"(index)); return xcr0; #endif // HWY_COMPILER_MSVC } bool IsAMD() { uint32_t abcd[4]; Cpuid(0, 0, abcd); const uint32_t max_level = abcd[0]; return max_level >= 1 && abcd[1] == 0x68747541 && abcd[2] == 0x444d4163 && abcd[3] == 0x69746e65; } // Arbitrary bit indices indicating which instruction set extensions are // supported. Use enum to ensure values are distinct. enum class FeatureIndex : uint32_t { kSSE = 0, kSSE2, kSSE3, kSSSE3, kSSE41, kSSE42, kCLMUL, kAES, kAVX, kAVX2, kF16C, kFMA, kLZCNT, kBMI, kBMI2, kAVX512F, kAVX512VL, kAVX512CD, kAVX512DQ, kAVX512BW, kAVX512FP16, kVNNI, kVPCLMULQDQ, kVBMI, kVBMI2, kVAES, kPOPCNTDQ, kBITALG, kGFNI, kSentinel }; static_assert(static_cast(FeatureIndex::kSentinel) < 64, "Too many bits for u64"); HWY_INLINE constexpr uint64_t Bit(FeatureIndex index) { return 1ull << static_cast(index); } // Returns bit array of FeatureIndex from CPUID feature flags. uint64_t FlagsFromCPUID() { uint64_t flags = 0; // return value uint32_t abcd[4]; Cpuid(0, 0, abcd); const uint32_t max_level = abcd[0]; // Standard feature flags Cpuid(1, 0, abcd); flags |= IsBitSet(abcd[3], 25) ? Bit(FeatureIndex::kSSE) : 0; flags |= IsBitSet(abcd[3], 26) ? Bit(FeatureIndex::kSSE2) : 0; flags |= IsBitSet(abcd[2], 0) ? Bit(FeatureIndex::kSSE3) : 0; flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kCLMUL) : 0; flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kSSSE3) : 0; flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kFMA) : 0; flags |= IsBitSet(abcd[2], 19) ? Bit(FeatureIndex::kSSE41) : 0; flags |= IsBitSet(abcd[2], 20) ? Bit(FeatureIndex::kSSE42) : 0; flags |= IsBitSet(abcd[2], 25) ? Bit(FeatureIndex::kAES) : 0; flags |= IsBitSet(abcd[2], 28) ? Bit(FeatureIndex::kAVX) : 0; flags |= IsBitSet(abcd[2], 29) ? Bit(FeatureIndex::kF16C) : 0; // Extended feature flags Cpuid(0x80000001U, 0, abcd); flags |= IsBitSet(abcd[2], 5) ? Bit(FeatureIndex::kLZCNT) : 0; // Extended features if (max_level >= 7) { Cpuid(7, 0, abcd); flags |= IsBitSet(abcd[1], 3) ? Bit(FeatureIndex::kBMI) : 0; flags |= IsBitSet(abcd[1], 5) ? Bit(FeatureIndex::kAVX2) : 0; flags |= IsBitSet(abcd[1], 8) ? Bit(FeatureIndex::kBMI2) : 0; flags |= IsBitSet(abcd[1], 16) ? Bit(FeatureIndex::kAVX512F) : 0; flags |= IsBitSet(abcd[1], 17) ? Bit(FeatureIndex::kAVX512DQ) : 0; flags |= IsBitSet(abcd[1], 28) ? Bit(FeatureIndex::kAVX512CD) : 0; flags |= IsBitSet(abcd[1], 30) ? Bit(FeatureIndex::kAVX512BW) : 0; flags |= IsBitSet(abcd[1], 31) ? Bit(FeatureIndex::kAVX512VL) : 0; flags |= IsBitSet(abcd[2], 1) ? Bit(FeatureIndex::kVBMI) : 0; flags |= IsBitSet(abcd[2], 6) ? Bit(FeatureIndex::kVBMI2) : 0; flags |= IsBitSet(abcd[2], 8) ? Bit(FeatureIndex::kGFNI) : 0; flags |= IsBitSet(abcd[2], 9) ? Bit(FeatureIndex::kVAES) : 0; flags |= IsBitSet(abcd[2], 10) ? Bit(FeatureIndex::kVPCLMULQDQ) : 0; flags |= IsBitSet(abcd[2], 11) ? Bit(FeatureIndex::kVNNI) : 0; flags |= IsBitSet(abcd[2], 12) ? Bit(FeatureIndex::kBITALG) : 0; flags |= IsBitSet(abcd[2], 14) ? Bit(FeatureIndex::kPOPCNTDQ) : 0; flags |= IsBitSet(abcd[3], 23) ? Bit(FeatureIndex::kAVX512FP16) : 0; } return flags; } // Each Highway target requires a 'group' of multiple features/flags. constexpr uint64_t kGroupSSE2 = Bit(FeatureIndex::kSSE) | Bit(FeatureIndex::kSSE2); constexpr uint64_t kGroupSSSE3 = Bit(FeatureIndex::kSSE3) | Bit(FeatureIndex::kSSSE3) | kGroupSSE2; constexpr uint64_t kGroupSSE4 = Bit(FeatureIndex::kSSE41) | Bit(FeatureIndex::kSSE42) | Bit(FeatureIndex::kCLMUL) | Bit(FeatureIndex::kAES) | kGroupSSSE3; // We normally assume BMI/BMI2/FMA are available if AVX2 is. This allows us to // use BZHI and (compiler-generated) MULX. However, VirtualBox lacks them // [https://www.virtualbox.org/ticket/15471]. Thus we provide the option of // avoiding using and requiring these so AVX2 can still be used. #ifdef HWY_DISABLE_BMI2_FMA constexpr uint64_t kGroupBMI2_FMA = 0; #else constexpr uint64_t kGroupBMI2_FMA = Bit(FeatureIndex::kBMI) | Bit(FeatureIndex::kBMI2) | Bit(FeatureIndex::kFMA); #endif #ifdef HWY_DISABLE_F16C constexpr uint64_t kGroupF16C = 0; #else constexpr uint64_t kGroupF16C = Bit(FeatureIndex::kF16C); #endif constexpr uint64_t kGroupAVX2 = Bit(FeatureIndex::kAVX) | Bit(FeatureIndex::kAVX2) | Bit(FeatureIndex::kLZCNT) | kGroupBMI2_FMA | kGroupF16C | kGroupSSE4; constexpr uint64_t kGroupAVX3 = Bit(FeatureIndex::kAVX512F) | Bit(FeatureIndex::kAVX512VL) | Bit(FeatureIndex::kAVX512DQ) | Bit(FeatureIndex::kAVX512BW) | Bit(FeatureIndex::kAVX512CD) | kGroupAVX2; constexpr uint64_t kGroupAVX3_DL = Bit(FeatureIndex::kVNNI) | Bit(FeatureIndex::kVPCLMULQDQ) | Bit(FeatureIndex::kVBMI) | Bit(FeatureIndex::kVBMI2) | Bit(FeatureIndex::kVAES) | Bit(FeatureIndex::kPOPCNTDQ) | Bit(FeatureIndex::kBITALG) | Bit(FeatureIndex::kGFNI) | kGroupAVX3; constexpr uint64_t kGroupAVX3_SPR = Bit(FeatureIndex::kAVX512FP16) | kGroupAVX3_DL; int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. #if HWY_ARCH_X86_64 bits |= HWY_SSE2; // always present in x64 #endif const uint64_t flags = FlagsFromCPUID(); // Set target bit(s) if all their group's flags are all set. if ((flags & kGroupAVX3_SPR) == kGroupAVX3_SPR) { bits |= HWY_AVX3_SPR; } if ((flags & kGroupAVX3_DL) == kGroupAVX3_DL) { bits |= HWY_AVX3_DL; } if ((flags & kGroupAVX3) == kGroupAVX3) { bits |= HWY_AVX3; } if ((flags & kGroupAVX2) == kGroupAVX2) { bits |= HWY_AVX2; } if ((flags & kGroupSSE4) == kGroupSSE4) { bits |= HWY_SSE4; } if ((flags & kGroupSSSE3) == kGroupSSSE3) { bits |= HWY_SSSE3; } #if HWY_ARCH_X86_32 if ((flags & kGroupSSE2) == kGroupSSE2) { bits |= HWY_SSE2; } #endif // Clear bits if the OS does not support XSAVE - otherwise, registers // are not preserved across context switches. uint32_t abcd[4]; Cpuid(1, 0, abcd); const bool has_osxsave = IsBitSet(abcd[2], 27); if (has_osxsave) { const uint32_t xcr0 = ReadXCR0(); const int64_t min_avx3 = HWY_AVX3 | HWY_AVX3_DL | HWY_AVX3_SPR; const int64_t min_avx2 = HWY_AVX2 | min_avx3; // XMM if (!IsBitSet(xcr0, 1)) { #if HWY_ARCH_X86_64 // The HWY_SSE2, HWY_SSSE3, and HWY_SSE4 bits do not need to be // cleared on x86_64, even if bit 1 of XCR0 is not set, as // the lower 128 bits of XMM0-XMM15 are guaranteed to be // preserved across context switches on x86_64 // Only clear the AVX2/AVX3 bits on x86_64 if bit 1 of XCR0 is not set bits &= ~min_avx2; #else bits &= ~(HWY_SSE2 | HWY_SSSE3 | HWY_SSE4 | min_avx2); #endif } // YMM if (!IsBitSet(xcr0, 2)) { bits &= ~min_avx2; } // opmask, ZMM lo/hi if (!IsBitSet(xcr0, 5) || !IsBitSet(xcr0, 6) || !IsBitSet(xcr0, 7)) { bits &= ~min_avx3; } } // has_osxsave // This is mainly to work around the slow Zen4 CompressStore. It's unclear // whether subsequent AMD models will be affected; assume yes. if ((bits & HWY_AVX3_DL) && IsAMD()) { bits |= HWY_AVX3_ZEN4; } return bits; } } // namespace x86 #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH namespace arm { int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. using CapBits = unsigned long; // NOLINT const CapBits hw = getauxval(AT_HWCAP); (void)hw; #if HWY_ARCH_ARM_A64 bits |= HWY_NEON_WITHOUT_AES; // aarch64 always has NEON and VFPv4.. // .. but not necessarily AES, which is required for HWY_NEON. #if defined(HWCAP_AES) if (hw & HWCAP_AES) { bits |= HWY_NEON; } #endif // HWCAP_AES #if defined(HWCAP_SVE) if (hw & HWCAP_SVE) { bits |= HWY_SVE; } #endif #ifndef HWCAP2_SVE2 #define HWCAP2_SVE2 (1 << 1) #endif #ifndef HWCAP2_SVEAES #define HWCAP2_SVEAES (1 << 2) #endif const CapBits hw2 = getauxval(AT_HWCAP2); if ((hw2 & HWCAP2_SVE2) && (hw2 & HWCAP2_SVEAES)) { bits |= HWY_SVE2; } #else // !HWY_ARCH_ARM_A64 // Some old auxv.h / hwcap.h do not define these. If not, treat as unsupported. #if defined(HWCAP_NEON) && defined(HWCAP_VFPv4) if ((hw & HWCAP_NEON) && (hw & HWCAP_VFPv4)) { bits |= HWY_NEON_WITHOUT_AES; } #endif // aarch32 would check getauxval(AT_HWCAP2) & HWCAP2_AES, but we do not yet // support that platform, and Armv7 lacks AES entirely. Because HWY_NEON // requires native AES instructions, we do not enable that target here. #endif // HWY_ARCH_ARM_A64 return bits; } } // namespace arm #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH namespace ppc { #ifndef PPC_FEATURE_HAS_ALTIVEC #define PPC_FEATURE_HAS_ALTIVEC 0x10000000 #endif #ifndef PPC_FEATURE_HAS_VSX #define PPC_FEATURE_HAS_VSX 0x00000080 #endif #ifndef PPC_FEATURE2_ARCH_2_07 #define PPC_FEATURE2_ARCH_2_07 0x80000000 #endif #ifndef PPC_FEATURE2_VEC_CRYPTO #define PPC_FEATURE2_VEC_CRYPTO 0x02000000 #endif #ifndef PPC_FEATURE2_ARCH_3_00 #define PPC_FEATURE2_ARCH_3_00 0x00800000 #endif #ifndef PPC_FEATURE2_ARCH_3_1 #define PPC_FEATURE2_ARCH_3_1 0x00040000 #endif using CapBits = unsigned long; // NOLINT // For AT_HWCAP, the others are for AT_HWCAP2 constexpr CapBits kGroupVSX = PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX; #if defined(HWY_DISABLE_PPC8_CRYPTO) constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07; #else constexpr CapBits kGroupPPC8 = PPC_FEATURE2_ARCH_2_07 | PPC_FEATURE2_VEC_CRYPTO; #endif constexpr CapBits kGroupPPC9 = kGroupPPC8 | PPC_FEATURE2_ARCH_3_00; constexpr CapBits kGroupPPC10 = kGroupPPC9 | PPC_FEATURE2_ARCH_3_1; int64_t DetectTargets() { int64_t bits = 0; // return value of supported targets. #if defined(AT_HWCAP) && defined(AT_HWCAP2) const CapBits hw = getauxval(AT_HWCAP); if ((hw & kGroupVSX) == kGroupVSX) { const CapBits hw2 = getauxval(AT_HWCAP2); if ((hw2 & kGroupPPC8) == kGroupPPC8) { bits |= HWY_PPC8; } if ((hw2 & kGroupPPC9) == kGroupPPC9) { bits |= HWY_PPC9; } if ((hw2 & kGroupPPC10) == kGroupPPC10) { bits |= HWY_PPC10; } } // VSX #endif // defined(AT_HWCAP) && defined(AT_HWCAP2) return bits; } } // namespace ppc #elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH namespace s390x { #ifndef HWCAP_S390_VX #define HWCAP_S390_VX 2048 #endif #ifndef HWCAP_S390_VXE #define HWCAP_S390_VXE 8192 #endif #ifndef HWCAP_S390_VXRS_EXT2 #define HWCAP_S390_VXRS_EXT2 32768 #endif using CapBits = unsigned long; // NOLINT constexpr CapBits kGroupZ14 = HWCAP_S390_VX | HWCAP_S390_VXE; constexpr CapBits kGroupZ15 = HWCAP_S390_VX | HWCAP_S390_VXE | HWCAP_S390_VXRS_EXT2; int64_t DetectTargets() { int64_t bits = 0; #if defined(AT_HWCAP) const CapBits hw = getauxval(AT_HWCAP); if ((hw & kGroupZ14) == kGroupZ14) { bits |= HWY_Z14; } if ((hw & kGroupZ15) == kGroupZ15) { bits |= HWY_Z15; } #endif return bits; } } // namespace s390x #endif // HWY_ARCH_X86 // Returns targets supported by the CPU, independently of DisableTargets. // Factored out of SupportedTargets to make its structure more obvious. Note // that x86 CPUID may take several hundred cycles. int64_t DetectTargets() { // Apps will use only one of these (the default is EMU128), but compile flags // for this TU may differ from that of the app, so allow both. int64_t bits = HWY_SCALAR | HWY_EMU128; #if HWY_ARCH_X86 && HWY_HAVE_RUNTIME_DISPATCH bits |= x86::DetectTargets(); #elif HWY_ARCH_ARM && HWY_HAVE_RUNTIME_DISPATCH bits |= arm::DetectTargets(); #elif HWY_ARCH_PPC && HWY_HAVE_RUNTIME_DISPATCH bits |= ppc::DetectTargets(); #elif HWY_ARCH_S390X && HWY_HAVE_RUNTIME_DISPATCH bits |= s390x::DetectTargets(); #else // TODO(janwas): detect support for WASM/RVV. // This file is typically compiled without HWY_IS_TEST, but targets_test has // it set, and will expect all of its HWY_TARGETS (= all attainable) to be // supported. bits |= HWY_ENABLED_BASELINE; #endif // HWY_ARCH_* if ((bits & HWY_ENABLED_BASELINE) != HWY_ENABLED_BASELINE) { const uint64_t bits_u = static_cast(bits); const uint64_t enabled = static_cast(HWY_ENABLED_BASELINE); fprintf(stderr, "WARNING: CPU supports 0x%08x%08x, software requires 0x%08x%08x\n", static_cast(bits_u >> 32), static_cast(bits_u & 0xFFFFFFFF), static_cast(enabled >> 32), static_cast(enabled & 0xFFFFFFFF)); } return bits; } } // namespace HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) Abort(const char* file, int line, const char* format, ...) { char buf[800]; va_list args; va_start(args, format); vsnprintf(buf, sizeof(buf), format, args); va_end(args); fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf); // If compiled with any sanitizer, they can also print a stack trace. #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN __sanitizer_print_stack_trace(); #endif // HWY_IS_* fflush(stderr); // Now terminate the program: #if HWY_ARCH_RVV exit(1); // trap/abort just freeze Spike. #elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC // Facilitates breaking into a debugger, but don't use this in non-debug // builds because it looks like "illegal instruction", which is misleading. __builtin_trap(); #else abort(); // Compile error without this due to HWY_NORETURN. #endif } HWY_DLLEXPORT void DisableTargets(int64_t disabled_targets) { supported_mask_ = static_cast(~disabled_targets); // This will take effect on the next call to SupportedTargets, which is // called right before GetChosenTarget::Update. However, calling Update here // would make it appear that HWY_DYNAMIC_DISPATCH was called, which we want // to check in tests. We instead de-initialize such that the next // HWY_DYNAMIC_DISPATCH calls GetChosenTarget::Update via FunctionCache. GetChosenTarget().DeInit(); } HWY_DLLEXPORT void SetSupportedTargetsForTest(int64_t targets) { supported_targets_for_test_ = targets; GetChosenTarget().DeInit(); // see comment above } HWY_DLLEXPORT int64_t SupportedTargets() { int64_t targets = supported_targets_for_test_; if (HWY_LIKELY(targets == 0)) { // Mock not active. Re-detect instead of caching just in case we're on a // heterogeneous ISA (also requires some app support to pin threads). This // is only reached on the first HWY_DYNAMIC_DISPATCH or after each call to // DisableTargets or SetSupportedTargetsForTest. targets = DetectTargets(); // VectorBytes invokes HWY_DYNAMIC_DISPATCH. To prevent infinite recursion, // first set up ChosenTarget. No need to Update() again afterwards with the // final targets - that will be done by a caller of this function. GetChosenTarget().Update(targets); // Now that we can call VectorBytes, check for targets with specific sizes. if (HWY_ARCH_ARM_A64) { const size_t vec_bytes = VectorBytes(); // uncached, see declaration if ((targets & HWY_SVE) && vec_bytes == 32) { targets = static_cast(targets | HWY_SVE_256); } else { targets = static_cast(targets & ~HWY_SVE_256); } if ((targets & HWY_SVE2) && vec_bytes == 16) { targets = static_cast(targets | HWY_SVE2_128); } else { targets = static_cast(targets & ~HWY_SVE2_128); } } // HWY_ARCH_ARM_A64 } targets &= supported_mask_; return targets == 0 ? HWY_STATIC_TARGET : targets; } HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { static ChosenTarget chosen_target; return chosen_target; } } // namespace hwy