// Copyright 2020 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Per-target definitions shared by ops/*.h and user code. #include #include "hwy/base.h" // Separate header because foreach_target.h re-enables its include guard. #include "hwy/ops/set_macros-inl.h" // Relies on the external include guard in highway.h. HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // Highway operations are implemented as overloaded functions selected using an // internal-only tag type D := Simd. T is the lane type. kPow2 is a // shift count applied to scalable vectors. Instead of referring to Simd<> // directly, users create D via aliases ScalableTag() (defaults to a // full vector, or fractions/groups if the argument is negative/positive), // CappedTag or FixedTag. The actual number of lanes is // Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a // cap. For constexpr-size vectors, N is the actual number of lanes. This // ensures Half> is the same type as Full256, as required by x86. template struct Simd { constexpr Simd() = default; using T = Lane; static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two"); // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC // warns when using enums and non-enums in the same expression. Cannot be // static constexpr function (another MSVC limitation). static constexpr size_t kPrivateN = N; static constexpr int kPrivatePow2 = kPow2; template static constexpr size_t NewN() { // Round up to correctly handle scalars with N=1. return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT); } #if HWY_HAVE_SCALABLE template static constexpr int Pow2Ratio() { return (sizeof(NewT) > sizeof(T)) ? static_cast(CeilLog2(sizeof(NewT) / sizeof(T))) : -static_cast(CeilLog2(sizeof(T) / sizeof(NewT))); } #endif // Widening/narrowing ops change the number of lanes and/or their type. // To initialize such vectors, we need the corresponding tag types: // PromoteTo/DemoteTo() with another lane type, but same number of lanes. #if HWY_HAVE_SCALABLE template using Rebind = Simd()>; #else template using Rebind = Simd; #endif // Change lane type while keeping the same vector size, e.g. for MulEven. template using Repartition = Simd(), kPow2>; // Half the lanes while keeping the same lane type, e.g. for LowerHalf. // Round up to correctly handle scalars with N=1. #if HWY_HAVE_SCALABLE // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN, // then we expect Half> to have N/2 lanes (rounded up). using Half = Simd; #else using Half = Simd; #endif // Twice the lanes while keeping the same lane type, e.g. for Combine. #if HWY_HAVE_SCALABLE using Twice = Simd; #else using Twice = Simd; #endif }; namespace detail { #if HWY_HAVE_SCALABLE template constexpr bool IsFull(Simd /* d */) { return N == HWY_LANES(T) && kPow2 == 0; } #endif // Returns the number of lanes (possibly zero) after applying a shift: // - 0: no change; // - [1,3]: a group of 2,4,8 [fractional] vectors; // - [-3,-1]: a fraction of a vector from 1/8 to 1/2. constexpr size_t ScaleByPower(size_t N, int pow2) { return pow2 >= 0 ? (N << pow2) : (N >> (-pow2)); } // Struct wrappers enable validation of arguments via static_assert. template struct ScalableTagChecker { static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8"); #if HWY_TARGET == HWY_RVV // Only RVV supports register groups. using type = Simd; #elif HWY_HAVE_SCALABLE // For SVE[2], only allow full or fractions. using type = Simd; #elif HWY_TARGET == HWY_SCALAR using type = Simd; #else // Only allow full or fractions. using type = Simd; #endif }; template struct CappedTagChecker { static_assert(kLimit != 0, "Does not make sense to have zero lanes"); using type = Simd; }; template struct FixedTagChecker { static_assert(kNumLanes != 0, "Does not make sense to have zero lanes"); static_assert(kNumLanes * sizeof(T) <= HWY_MAX_BYTES, "Too many lanes"); #if HWY_TARGET == HWY_SCALAR // HWY_MAX_BYTES would still allow uint8x8, which is not supported. static_assert(kNumLanes == 1, "Scalar only supports one lane"); #endif using type = Simd; }; } // namespace detail // Alias for a tag describing a full vector (kPow2 == 0: the most common usage, // e.g. 1D loops where the application does not care about the vector size) or a // fraction/multiple of one. Multiples are the same as full vectors for all // targets except RVV. Fractions (kPow2 < 0) are useful as the argument/return // value of type promotion and demotion. template using ScalableTag = typename detail::ScalableTagChecker::type; // Alias for a tag describing a vector with *up to* kLimit active lanes, even on // targets with scalable vectors and HWY_SCALAR. The runtime lane count // `Lanes(tag)` may be less than kLimit, and is 1 on HWY_SCALAR. This alias is // typically used for 1D loops with a relatively low application-defined upper // bound, e.g. for 8x8 DCTs. However, it is better if data structures are // designed to be vector-length-agnostic (e.g. a hybrid SoA where there are // chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63; // this would enable vector-length-agnostic loops using ScalableTag). template using CappedTag = typename detail::CappedTagChecker::type; // Alias for a tag describing a vector with *exactly* kNumLanes active lanes, // even on targets with scalable vectors. HWY_SCALAR only supports one lane. // All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T). // // NOTE: if the application does not need to support HWY_SCALAR (+), use this // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes. // This is useful for data structures that rely on exactly 128-bit SIMD, but // these are discouraged because they cannot benefit from wider vectors. // Instead, applications would ideally define a larger problem size and loop // over it with the (unknown size) vectors from ScalableTag. // // + e.g. if the baseline is known to support SIMD, or the application requires // ops such as TableLookupBytes not supported by HWY_SCALAR. template using FixedTag = typename detail::FixedTagChecker::type; template using TFromD = typename D::T; // Tag for the same number of lanes as D, but with the LaneType T. template using Rebind = typename D::template Rebind; template using RebindToSigned = Rebind>, D>; template using RebindToUnsigned = Rebind>, D>; template using RebindToFloat = Rebind>, D>; // Tag for the same total size as D, but with the LaneType T. template using Repartition = typename D::template Repartition; template using RepartitionToWide = Repartition>, D>; template using RepartitionToNarrow = Repartition>, D>; // Tag for the same lane type as D, but half the lanes. template using Half = typename D::Half; // Tag for the same lane type as D, but twice the lanes. template using Twice = typename D::Twice; // Same as base.h macros but with a Simd argument instead of T. #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD) #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD) #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD) #define HWY_IF_NOT_FLOAT_D(D) HWY_IF_NOT_FLOAT(TFromD) #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD, bytes) #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD, bytes) // MSVC workaround: use PrivateN directly instead of MaxLanes. #define HWY_IF_LT128_D(D) \ hwy::EnableIf) < 16>* = nullptr #define HWY_IF_GE128_D(D) \ hwy::EnableIf) >= 16>* = nullptr // Same, but with a vector argument. #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV) #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV) #define HWY_IF_FLOAT_V(V) HWY_IF_FLOAT(TFromV) #define HWY_IF_LANE_SIZE_V(V, bytes) HWY_IF_LANE_SIZE(TFromV, bytes) // For implementing functions for a specific type. // IsSame<...>() in template arguments is broken on MSVC2015. #define HWY_IF_LANES_ARE(T, V) EnableIf>::value>* = nullptr template HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) { return D::kPrivatePow2; } // MSVC requires the explicit . #define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf(D()) >= (MIN)>* = nullptr #if HWY_HAVE_SCALABLE // Upper bound on the number of lanes. Intended for template arguments and // reducing code size (e.g. for SSE4, we know at compile-time that vectors will // not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the // actual size for allocating storage. WARNING: MSVC might not be able to deduce // arguments if this is used in EnableIf. See HWY_IF_LT128_D above. template HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD)), D::kPrivatePow2); } #else // Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N // is not an option, nor does a member function work. template HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { return D::kPrivateN; } // (Potentially) non-constant actual size of the vector at runtime, subject to // the limit imposed by the Simd. Useful for advancing loop counters. // Targets with scalable vectors define this themselves. template HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd) { return N; } #endif // !HWY_HAVE_SCALABLE // NOTE: GCC generates incorrect code for vector arguments to non-inlined // functions in two situations: // - on Windows and GCC 10.3, passing by value crashes due to unaligned loads: // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412. // - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not // all) tests to fail. // // We therefore pass by const& only on GCC and (Windows or ARM64). This alias // must be used for all vector/mask parameters of functions marked HWY_NOINLINE, // and possibly also other functions that are not inlined. #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \ ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64) template using VecArg = const V&; #else template using VecArg = V; #endif // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();