// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.

#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_i64x2_trunc_sat_f64x2 wasm_i64x2_trunc_saturate_f64x2
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

#if HWY_TARGET == HWY_WASM_EMU256
template <typename T>
using Full256 = Simd<T, 32 / sizeof(T), 0>;
#endif

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};
template <>
struct Raw128<double> {
  using type = __f64x2;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template struct Mask128 { using PrivateT = T; // only for DFromM static constexpr size_t kPrivateN = N; // only for DFromM typename detail::Raw128::type raw; }; template using DFromV = Simd; template using DFromM = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ Zero // Use HWY_MAX_LANES_D here because VFromD is defined in terms of Zero. template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{wasm_i32x4_splat(0)}; } template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{wasm_f32x4_splat(0.0f)}; } template HWY_API Vec128, HWY_MAX_LANES_D(D)> Zero(D /* tag */) { return Vec128, HWY_MAX_LANES_D(D)>{wasm_f64x2_splat(0.0)}; } template using VFromD = decltype(Zero(D())); // ------------------------------ Tuple (VFromD) #include "hwy/ops/tuple-inl.h" // ------------------------------ BitCast namespace detail { HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { return static_cast<__v128_u>(v); } HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { return static_cast<__v128_u>(v); } template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return Vec128{BitCastToInteger(v.raw)}; } // Cannot rely on function overloading because return types differ. template struct BitCastFromInteger128 { HWY_INLINE __v128_u operator()(__v128_u v) { return v; } }; template <> struct BitCastFromInteger128 { HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } }; template <> struct BitCastFromInteger128 { HWY_INLINE __f64x2 operator()(__v128_u v) { return static_cast<__f64x2>(v); } }; template HWY_INLINE VFromD BitCastFromByte(D d, Vec128 v) { return VFromD{BitCastFromInteger128>()(v.raw)}; } } // namespace detail template HWY_API VFromD BitCast(D d, Vec128().MaxLanes()> v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ ResizeBitCast template HWY_API VFromD ResizeBitCast(D d, FromV v) { const Repartition du8_to; return BitCast(d, VFromD{detail::BitCastToInteger(v.raw)}); } // ------------------------------ Set template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_i8x16_splat(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_i16x8_splat(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_i32x4_splat(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_i64x2_splat(static_cast(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_i16x8_splat(BitCastScalar(t))}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_f32x4_splat(t)}; } template HWY_API VFromD Set(D /* tag */, TFromD t) { return VFromD{wasm_f64x2_splat(t)}; } HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") // For all vector sizes. template HWY_API VFromD Undefined(D d) { return Zero(d); } HWY_DIAGNOSTICS(pop) // For all vector sizes. 
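// Iota fills each lane with first, first + 1, ... (wrapping around on
// overflow). Usage sketch (illustrative only; ScalableTag comes from
// shared-inl.h):
//   const ScalableTag<int32_t> d;
//   const auto v = Iota(d, 5);               // lanes are {5, 6, 7, 8}
//   const int32_t last = ExtractLane(v, 3);  // 8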
template , typename T2> HWY_API VFromD Iota(D d, const T2 first) { HWY_ALIGN T lanes[MaxLanes(d)]; for (size_t i = 0; i < MaxLanes(d); ++i) { lanes[i] = AddWithWraparound(static_cast(first), i); } return Load(d, lanes); } // ------------------------------ Dup128VecFromValues template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { return VFromD{wasm_i8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { return VFromD{wasm_u8x16_make(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{wasm_i16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { return VFromD{wasm_u16x8_make(t0, t1, t2, t3, t4, t5, t6, t7)}; } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{wasm_i32x4_make(t0, t1, t2, t3)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{wasm_u32x4_make(t0, t1, t2, t3)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { return VFromD{wasm_f32x4_make(t0, t1, t2, t3)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{wasm_i64x2_make(t0, t1)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{wasm_u64x2_make(t0, t1)}; } template HWY_API VFromD Dup128VecFromValues(D /*d*/, TFromD t0, TFromD t1) { return VFromD{wasm_f64x2_make(t0, t1)}; } // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return 
Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_f64x2_add(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_f64x2_sub(a.raw, b.raw)}; } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; } // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; } // ------------------------------ Average // Returns (a + b + 1) / 2 // Unsigned template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_avgr(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_avgr(a.raw, b.raw)}; } // ------------------------------ Absolute value // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. 
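// For example, with int8_t lanes, Abs of -128 keeps the 0x80 bit pattern
// (i.e. the result is still -128, the wrapped LimitsMax() + 1 noted above);
// all other inputs produce the usual absolute value.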
template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i8x16_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i16x8_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i32x4_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i64x2_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_f32x4_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_f64x2_abs(v.raw)}; } // ------------------------------ Shift lanes by constant #bits // Unsigned template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u64x2_shr(v.raw, kBits)}; } // Signed template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i64x2_shr(v.raw, kBits)}; } // 8-bit template HWY_API Vec128 ShiftLeft(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; return kBits == 1 ? (v + v) : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftRight(Vec128{v.raw}).raw}; return shifted & Set(d8, 0xFF >> kBits); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec128 RotateRight(const Vec128 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ Shift lanes by same variable #bits // After https://reviews.llvm.org/D108415 shift argument became unsigned. 
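// Unlike ShiftLeft/ShiftRight (compile-time count), these apply one runtime
// count to every lane. Usage sketch (illustrative only; ScalableTag comes
// from shared-inl.h):
//   const ScalableTag<uint32_t> d;
//   const int n = 3;                               // runtime shift count
//   const auto p2 = ShiftLeftSame(Set(d, 1u), n);  // every lane = 1u << n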
HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") // Unsigned template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u64x2_shr(v.raw, bits)}; } // Signed template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shr(v.raw, bits)}; } // 8-bit template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftRightSame(Vec128{v.raw}, bits).raw}; return shifted & Set(d8, 0xFF >> bits); } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); return (shifted ^ shifted_sign) - shifted_sign; } // ignore Wsign-conversion HWY_DIAGNOSTICS(pop) // ------------------------------ Minimum // Unsigned template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u8x16_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u16x8_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u32x4_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. 
const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); alignas(16) uint64_t min[2] = {HWY_MIN(a0, b0), HWY_MIN(a1, b1)}; return Vec128{wasm_v128_load(min)}; } // Signed template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { alignas(16) int64_t min[4]; min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0)); min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(min)}; } // Float template HWY_API Vec128 Min(Vec128 a, Vec128 b) { // Equivalent to a < b ? a : b (taking into account our swapped arg order, // so that Min(NaN, x) is x to match x86). return Vec128{wasm_f32x4_pmin(b.raw, a.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { // Equivalent to a < b ? a : b (taking into account our swapped arg order, // so that Min(NaN, x) is x to match x86). return Vec128{wasm_f64x2_pmin(b.raw, a.raw)}; } // ------------------------------ Maximum // Unsigned template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u8x16_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u16x8_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u32x4_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { // Avoid wasm_u64x2_extract_lane - not all implementations have it yet. const uint64_t a0 = static_cast(wasm_i64x2_extract_lane(a.raw, 0)); const uint64_t b0 = static_cast(wasm_i64x2_extract_lane(b.raw, 0)); const uint64_t a1 = static_cast(wasm_i64x2_extract_lane(a.raw, 1)); const uint64_t b1 = static_cast(wasm_i64x2_extract_lane(b.raw, 1)); alignas(16) uint64_t max[2] = {HWY_MAX(a0, b0), HWY_MAX(a1, b1)}; return Vec128{wasm_v128_load(max)}; } // Signed template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { alignas(16) int64_t max[2]; max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0)); max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(max)}; } // Float template HWY_API Vec128 Max(Vec128 a, Vec128 b) { // Equivalent to b < a ? a : b (taking into account our swapped arg order, // so that Max(NaN, x) is x to match x86). return Vec128{wasm_f32x4_pmax(b.raw, a.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { // Equivalent to b < a ? a : b (taking into account our swapped arg order, // so that Max(NaN, x) is x to match x86). 
return Vec128{wasm_f64x2_pmax(b.raw, a.raw)}; } // ------------------------------ Integer multiplication // Unsigned template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { const auto l = wasm_u32x4_extmul_low_u16x8(a.raw, b.raw); const auto h = wasm_u32x4_extmul_high_u16x8(a.raw, b.raw); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { const auto l = wasm_i32x4_extmul_low_i16x8(a.raw, b.raw); const auto h = wasm_i32x4_extmul_high_i16x8(a.raw, b.raw); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_q15mulr_sat(a.raw, b.raw)}; } // Multiplies even lanes (0, 2 ..) and returns the double-width result. template HWY_API Vec128, (N + 1) / 2> MulEven(const Vec128 a, const Vec128 b) { const DFromV d; const RepartitionToWide dw; constexpr int kSrcBits = sizeof(T) * 8; const auto ae = ShiftRight(ShiftLeft(ResizeBitCast(dw, a))); const auto be = ShiftRight(ShiftLeft(ResizeBitCast(dw, b))); return ae * be; } template HWY_API Vec128, (N + 1) / 2> MulEven(const Vec128 a, const Vec128 b) { const DFromV d; const RepartitionToWide dw; const auto kEvenMask = Set(dw, LimitsMax()); const auto ae = And(ResizeBitCast(dw, a), kEvenMask); const auto be = And(ResizeBitCast(dw, b), kEvenMask); return ae * be; } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { const DFromV d; const RepartitionToWide dw; const auto ae = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, a))).raw; const auto be = ShiftRight<32>(ShiftLeft<32>(ResizeBitCast(dw, b))).raw; return Vec128{wasm_i64x2_mul(ae, be)}; } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec128{wasm_i64x2_mul(ae, be)}; } // Multiplies odd lanes (1, 3 ..) and returns the double-width result. 
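// For example, with uint32_t lanes a = {a0, a1, a2, a3} and b = {b0, b1, b2,
// b3}, MulEven (above) returns the two 64-bit products {a0*b0, a2*b2} and
// MulOdd (below) returns {a1*b1, a3*b3}; together they cover all lanes
// without overflow.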
template HWY_API Vec128, (N + 1) / 2> MulOdd(const Vec128 a, const Vec128 b) { const DFromV d; const RepartitionToWide dw; constexpr int kSrcBits = sizeof(T) * 8; const auto ao = ShiftRight(BitCast(dw, a)); const auto bo = ShiftRight(BitCast(dw, b)); return ao * bo; } template HWY_API Vec128, (N + 1) / 2> MulOdd(const Vec128 a, const Vec128 b) { const DFromV d; const RepartitionToWide dw; const auto ao = ShiftRight<32>(BitCast(dw, a)); const auto bo = ShiftRight<32>(BitCast(dw, b)); return Vec128, (N + 1) / 2>{wasm_i64x2_mul(ao.raw, bo.raw)}; } // ------------------------------ Negate template HWY_API Vec128 Neg(const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i8x16_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i16x8_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i32x4_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i64x2_neg(v.raw)}; } // ------------------------------ Floating-point mul / div template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{wasm_f32x4_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{wasm_f64x2_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_div(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{wasm_f64x2_div(a.raw, b.raw)}; } template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { return Set(DFromV(), T{1.0}) / v; } // Integer overload defined in generic_ops-inl.h. template HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return mul * x + add; } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return add - mul * x; } template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return mul * x - sub; } template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root // Full precision square root template HWY_API Vec128 Sqrt(const Vec128 v) { return Vec128{wasm_f32x4_sqrt(v.raw)}; } template HWY_API Vec128 Sqrt(const Vec128 v) { return Vec128{wasm_f64x2_sqrt(v.raw)}; } // Approximate reciprocal square root template HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { // TODO(eustas): find cheaper a way to calculate this. 
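// WASM SIMD has no reciprocal-sqrt approximation instruction, so this falls
// back to a full-precision divide by Sqrt (exact, but slower than the
// estimate instructions on x86/NEON).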
return Set(DFromV(), T{1.0}) / Sqrt(v); } // ------------------------------ Floating-point rounding // Toward nearest integer, ties to even template HWY_API Vec128 Round(const Vec128 v) { return Vec128{wasm_f32x4_nearest(v.raw)}; } template HWY_API Vec128 Round(const Vec128 v) { return Vec128{wasm_f64x2_nearest(v.raw)}; } // Toward zero, aka truncate template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{wasm_f32x4_trunc(v.raw)}; } template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{wasm_f64x2_trunc(v.raw)}; } // Toward +infinity, aka ceiling template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{wasm_f32x4_ceil(v.raw)}; } template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{wasm_f64x2_ceil(v.raw)}; } // Toward -infinity, aka floor template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{wasm_f32x4_floor(v.raw)}; } template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{wasm_f64x2_floor(v.raw)}; } // ------------------------------ Floating-point classification template HWY_API Mask128 IsNaN(const Vec128 v) { return v != v; } template HWY_API Mask128 IsInf(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0. return RebindMask(d, Eq(Add(vu, vu), Set(du, hwy::MaxExponentTimes2()))); } // Returns whether normal/subnormal/zero. template HWY_API Mask128 IsFinite(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const RebindToSigned di; // cheaper than unsigned comparison const VFromD vu = BitCast(du, v); // 'Shift left' to clear the sign bit, then right so we can compare with the // max exponent (cannot compare with MaxExponentTimes2 directly because it is // negative and non-negative floats would be greater). const VFromD exp = BitCast(di, ShiftRight() + 1>(Add(vu, vu))); return RebindMask(d, Lt(exp, Set(di, hwy::MaxExponentField()))); } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. // Mask and Vec are the same (true = FF..FF). 
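// Usage sketch (illustrative only; ScalableTag comes from shared-inl.h):
// comparisons yield a Mask128 that can drive blends such as IfThenElse:
//   const ScalableTag<float> d;
//   const auto v = Iota(d, -2.0f);               // {-2, -1, 0, 1}
//   const auto m = v < Zero(d);                  // per-lane v < 0
//   const auto r = IfThenElse(m, Zero(d), v);    // {0, 0, 0, 1}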
template HWY_API Mask128 MaskFromVec(const Vec128 v) { return Mask128{v.raw}; } template using MFromD = decltype(MaskFromVec(VFromD())); template HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); return MFromD{m.raw}; } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_f64x2_eq(a.raw, b.raw)}; } // ------------------------------ Inequality // Unsigned template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f64x2_ne(a.raw, b.raw)}; } // ------------------------------ Strict inequality template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u8x16_gt(a.raw, b.raw)}; } 
template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d32; const auto a32 = BitCast(d32, a); const auto b32 = BitCast(d32, b); // If the upper halves are not equal, this is the answer. const auto m_gt = a32 > b32; // Otherwise, the lower half decides. const auto m_eq = a32 == b32; const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); const auto lo_gt = And(m_eq, MaskFromVec(VFromD{lo_in_hi})); const auto gt = Or(lo_gt, m_gt); // Copy result in upper 32 bits to lower 32 bits. return Mask128{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_f64x2_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { return operator>(b, a); } // ------------------------------ Weak inequality // Float >= template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f64x2_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_u8x16_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_u16x8_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_u32x4_ge(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Not(b > a); } template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { return operator>=(b, a); } // ------------------------------ FirstN (Iota, Lt) template HWY_API MFromD FirstN(D d, size_t num) { const RebindToSigned di; // Signed comparisons may be cheaper. using TI = TFromD; return RebindMask(d, Iota(di, 0) < Set(di, static_cast(num))); } // ================================================== LOGICAL // ------------------------------ Not template HWY_API Vec128 Not(Vec128 v) { return Vec128{wasm_v128_not(v.raw)}; } // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { return Vec128{wasm_v128_and(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. 
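// For example, with uint32_t lanes, AndNot(Set(d, 0xF0F0F0F0u),
// Set(d, 0xFFFFFFFFu)) yields 0x0F0F0F0Fu in every lane: the bits set in
// not_mask are cleared from mask.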
template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { return Vec128{wasm_v128_andnot(mask.raw, not_mask.raw)}; } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { return Vec128{wasm_v128_or(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { return Vec128{wasm_v128_xor(a.raw, b.raw)}; } // ------------------------------ Xor3 template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { return Xor(x1, Xor(x2, x3)); } // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { return Or(o, And(a1, a2)); } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ CopySign template HWY_API Vec128 CopySign(const Vec128 magn, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return BitwiseIfThenElse(SignBit(d), sign, magn); } // ------------------------------ CopySignToAbs template HWY_API Vec128 CopySignToAbs(const Vec128 abs, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return OrAnd(abs, SignBit(d), sign); } // ------------------------------ BroadcastSignBit (compare) template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight(v); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { const DFromV d; return VecFromMask(d, v < Zero(d)); } // ------------------------------ Mask template HWY_API VFromD VecFromMask(D /* tag */, MFromD v) { return VFromD{v.raw}; } // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; } // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; const RebindToSigned di; v = BitCast(d, BroadcastSignBit(BitCast(di, v))); return IfThenElse(MaskFromVec(v), yes, no); } template HWY_API Vec128 ZeroIfNegative(Vec128 v) { const DFromV d; const auto zero = Zero(d); return IfThenElse(Mask128{(v > zero).raw}, v, zero); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { const DFromM d; return MaskFromVec(Not(VecFromMask(d, m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ------------------------------ Shl (BroadcastSignBit, IfThenElse) // The x86 multiply-by-Pow2() trick will not work because WASM saturates // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a // scalar count operand, per-lane shift instructions would require extract_lane // for each lane, and hoping that shuffle is correctly mapped to a native // instruction. Using non-vector shifts would incur a store-load forwarding // stall when loading the result vector. We instead test bits of the shift // count to "predicate" a shift of the entire vector by a constant. template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<5>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. 
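  // 16-bit lanes hold shift counts 0..15, so four count bits are tested,
  // from bit 3 (shift by 8) down to bit 0 (shift by 1). E.g. a count of 13
  // (0b1101) applies shifts of 8, 4 and 1.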
test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; const RebindToUnsigned du; using TU = MakeUnsigned; alignas(16) TU lanes[2] = {}; alignas(16) TU bits_lanes[2] = {}; Store(BitCast(du, v), du, lanes); Store(BitCast(du, bits), du, bits_lanes); lanes[0] <<= (bits_lanes[0] & 63); lanes[1] <<= (bits_lanes[1] & 63); return BitCast(d, Load(du, lanes)); } // ------------------------------ Shr (BroadcastSignBit, IfThenElse) template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<5>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. 
test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; alignas(16) T lanes[2] = {}; alignas(16) T bits_lanes[2] = {}; Store(v, d, lanes); Store(bits, d, bits_lanes); lanes[0] >>= (bits_lanes[0] & 63); lanes[1] >>= (bits_lanes[1] & 63); return Load(d, lanes); } // ================================================== MEMORY // ------------------------------ Load template > HWY_API Vec128 Load(D /* tag */, const T* HWY_RESTRICT aligned) { return Vec128{wasm_v128_load(aligned)}; } // Partial template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { VFromD v; CopyBytes(p, &v); return v; } // LoadU == Load. template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
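// On wider targets LoadDup128 broadcasts one 16-byte block into every
// 128-bit part of the vector; with 128-bit vectors that block is the whole
// vector, so a plain (possibly unaligned) Load suffices.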
template HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { return Load(d, p); } template > HWY_API VFromD MaskedLoad(MFromD m, D d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } template > HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const T* HWY_RESTRICT aligned) { return IfThenElse(m, Load(d, aligned), v); } // ------------------------------ Store namespace detail { template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i8x16_extract_lane(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { const int16_t lane = wasm_i16x8_extract_lane(v.raw, kLane); return static_cast(lane); } template HWY_INLINE T ExtractLane(const Vec128 v) { const DFromV d; const RebindToUnsigned du; const uint16_t bits = ExtractLane(BitCast(du, v)); return BitCastScalar(bits); } template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i32x4_extract_lane(v.raw, kLane)); } template HWY_INLINE T ExtractLane(const Vec128 v) { return static_cast(wasm_i64x2_extract_lane(v.raw, kLane)); } template HWY_INLINE float ExtractLane(const Vec128 v) { return wasm_f32x4_extract_lane(v.raw, kLane); } template HWY_INLINE double ExtractLane(const Vec128 v) { return wasm_f64x2_extract_lane(v.raw, kLane); } } // namespace detail template HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // Partial template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT p) { CopyBytes(&v, p); } template HWY_API void Store(VFromD v, D /* tag */, TFromD* HWY_RESTRICT p) { *p = detail::ExtractLane<0>(v); } // StoreU == Store. template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { Store(v, d, p); } template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { StoreU(IfThenElse(m, v, LoadU(d, p)), d, p); } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(VFromD v, D /* tag */, TFromD* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // ------------------------------ Scatter in generic_ops-inl.h // ------------------------------ Gather in generic_ops-inl.h // ================================================== SWIZZLE // ------------------------------ ExtractLane // One overload per vector length just in case *_extract_lane raise compile // errors if their argument is out of bounds (even if that would never be // reached at runtime). 
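// Usage sketch (illustrative only): ExtractLane(v, i) returns lane i for a
// runtime index; when i is a compile-time constant, the switches below
// reduce to a single wasm extract_lane. GetLane(v) is shorthand for lane 0.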
template HWY_API T ExtractLane(const Vec128 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return detail::ExtractLane<0>(v); } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); } } #endif alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); } } #endif alignas(16) T lanes[4]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); } } #endif alignas(16) T lanes[8]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::ExtractLane<0>(v); case 1: return detail::ExtractLane<1>(v); case 2: return detail::ExtractLane<2>(v); case 3: return detail::ExtractLane<3>(v); case 4: return detail::ExtractLane<4>(v); case 5: return detail::ExtractLane<5>(v); case 6: return detail::ExtractLane<6>(v); case 7: return detail::ExtractLane<7>(v); case 8: return detail::ExtractLane<8>(v); case 9: return detail::ExtractLane<9>(v); case 10: return detail::ExtractLane<10>(v); case 11: return detail::ExtractLane<11>(v); case 12: return detail::ExtractLane<12>(v); case 13: return detail::ExtractLane<13>(v); case 14: return detail::ExtractLane<14>(v); case 15: return detail::ExtractLane<15>(v); } } #endif alignas(16) T lanes[16]; Store(v, DFromV(), lanes); return lanes[i]; } // ------------------------------ GetLane template HWY_API T GetLane(const Vec128 v) { return detail::ExtractLane<0>(v); } // ------------------------------ InsertLane namespace detail { template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i8x16_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i16x8_replace_lane(v.raw, kLane, BitCastScalar(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i32x4_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, T t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{ wasm_i64x2_replace_lane(v.raw, kLane, static_cast(t))}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, float t) { static_assert(kLane < N, "Lane index out of bounds"); return Vec128{wasm_f32x4_replace_lane(v.raw, kLane, t)}; } template HWY_INLINE Vec128 InsertLane(const Vec128 v, 
double t) { static_assert(kLane < 2, "Lane index out of bounds"); return Vec128{wasm_f64x2_replace_lane(v.raw, kLane, t)}; } } // namespace detail // Requires one overload per vector length because InsertLane<3> may be a // compile error if it calls wasm_f64x2_replace_lane. template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { HWY_DASSERT(i == 0); (void)i; return Set(DFromV(), t); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); } } #endif const DFromV d; alignas(16) T lanes[2]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); } } #endif const DFromV d; alignas(16) T lanes[4]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); } } #endif const DFromV d; alignas(16) T lanes[8]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::InsertLane<0>(v, t); case 1: return detail::InsertLane<1>(v, t); case 2: return detail::InsertLane<2>(v, t); case 3: return detail::InsertLane<3>(v, t); case 4: return detail::InsertLane<4>(v, t); case 5: return detail::InsertLane<5>(v, t); case 6: return detail::InsertLane<6>(v, t); case 7: return detail::InsertLane<7>(v, t); case 8: return detail::InsertLane<8>(v, t); case 9: return detail::InsertLane<9>(v, t); case 10: return detail::InsertLane<10>(v, t); case 11: return detail::InsertLane<11>(v, t); case 12: return detail::InsertLane<12>(v, t); case 13: return detail::InsertLane<13>(v, t); case 14: return detail::InsertLane<14>(v, t); case 15: return detail::InsertLane<15>(v, t); } } #endif const DFromV d; alignas(16) T lanes[16]; Store(v, d, lanes); lanes[i] = t; return Load(d, lanes); } // ------------------------------ LowerHalf template HWY_API VFromD LowerHalf(D /* tag */, VFromD> v) { return VFromD{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return Vec128{v.raw}; } // ------------------------------ ShiftLeftBytes // 0x01..0F, kBytes = 1 => 0x02..0F00 template HWY_API VFromD ShiftLeftBytes(D /* tag */, VFromD v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v; case 1: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)}; case 2: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13)}; case 3: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; case 4: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; case 5: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; case 6: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; case 7: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; case 8: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; case 9: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; case 10: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; case 11: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; case 12: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; case 13: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; case 14: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)}; case 15: return VFromD{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)}; } return VFromD{zero}; } template HWY_API Vec128 ShiftLeftBytes(Vec128 v) { return ShiftLeftBytes(DFromV(), v); } // ------------------------------ ShiftLeftLanes template HWY_API VFromD ShiftLeftLanes(D d, const VFromD v) { const Repartition d8; constexpr size_t kBytes = kLanes * sizeof(TFromD); return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { return ShiftLeftLanes(DFromV(), v); } // ------------------------------ ShiftRightBytes namespace detail { // Helper function allows zeroing invalid lanes in caller. 
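// ShrBytes relies on shuffle indices >= 16 selecting bytes from the second
// operand (here a zero vector), so shifted-out positions are filled with 0.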
template HWY_API __i8x16 ShrBytes(const Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v.raw; case 1: return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); case 2: return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16); case 3: return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16); case 4: return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16); case 5: return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16); case 6: return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16); case 7: return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16); case 8: return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16); case 9: return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 10: return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 11: return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 12: return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 13: return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 14: return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 15: return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 16: return zero; } } } // namespace detail // 0x01..0F, kBytes = 1 => 0x0001..0E template HWY_API VFromD ShiftRightBytes(D d, VFromD v) { // For partial vectors, clear upper lanes so we shift in zeros. 
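// For example, for a 64-bit vector of uint8_t, raw bytes 8..15 are undefined;
// FirstN keeps only the MaxLanes(d) valid lanes and zeroes the rest, so that
// ShrBytes below shifts in zeros rather than garbage.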
if (d.MaxBytes() != 16) { const Full128> dfull; const VFromD vfull{v.raw}; v = VFromD{IfThenElseZero(FirstN(dfull, MaxLanes(d)), vfull).raw}; } return VFromD{detail::ShrBytes(v)}; } // ------------------------------ ShiftRightLanes template HWY_API VFromD ShiftRightLanes(D d, const VFromD v) { const Repartition d8; constexpr size_t kBytes = kLanes * sizeof(TFromD); return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) template > HWY_API Vec64 UpperHalf(D /* tag */, const Vec128 v) { return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } // Partial template HWY_API VFromD UpperHalf(D d, VFromD> v) { return LowerHalf(d, ShiftRightBytes(Twice(), v)); } // ------------------------------ CombineShiftRightBytes template > HWY_API Vec128 CombineShiftRightBytes(D /* tag */, Vec128 hi, Vec128 lo) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); switch (kBytes) { case 0: return lo; case 1: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)}; case 3: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)}; case 4: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)}; case 5: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)}; case 6: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)}; case 7: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)}; case 8: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)}; case 9: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)}; case 10: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)}; case 11: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)}; case 12: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}; case 13: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)}; case 14: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)}; case 15: return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)}; } return hi; } template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) { constexpr size_t kSize = d.MaxBytes(); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Repartition d8; using V8 = Vec128; const DFromV dfull8; const Repartition, decltype(dfull8)> dfull; const V8 hi8{BitCast(d8, hi).raw}; // Move into most-significant bytes const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(dfull8, hi8, lo8); return VFromD{BitCast(dfull, r).raw}; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{wasm_i8x16_shuffle( v.raw, v.raw, kLane, 
kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; } // ------------------------------ TableLookupBytes // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. // lane indices in [0, 16). template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; } template HWY_API Vec128 TableLookupBytesOr0(const Vec128 bytes, const Vec128 from) { const DFromV d; // Mask size must match vector type, so cast everything to this type. Repartition di8; Repartition> d_bytes8; const auto msb = BitCast(di8, from) < Zero(di8); const auto lookup = TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); return BitCast(d, IfThenZeroElse(msb, lookup)); } // ------------------------------ Hard-coded shuffles // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } // These are used by generic_ops-inl to implement LoadInterleaved3. 
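// For 32-bit lanes, ShuffleTwo2301(a, b) yields {a[1], a[0], b[3], b[2]}; the
// 8-bit and 16-bit variants apply the same four-lane swap pattern within
// their narrower lanes (the remaining 0x7F shuffle indices are don't-care).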
namespace detail { template HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 1, 0, 3 + 16, 2 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 1, 0, 3 + 8, 2 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 ShuffleTwo2301(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 1, 0, 3 + 4, 2 + 4)}; } template HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 0, 3, 2 + 16, 1 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 0, 3, 2 + 8, 1 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 ShuffleTwo1230(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 3, 2 + 4, 1 + 4)}; } template HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 2, 1, 0 + 16, 3 + 16, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F)}; } template HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i16x8_shuffle(a.raw, b.raw, 2, 1, 0 + 8, 3 + 8, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF)}; } template HWY_API Vec128 ShuffleTwo3012(const Vec128 a, const Vec128 b) { static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 1, 0 + 4, 3 + 4)}; } } // namespace detail // Swap 64-bit halves template HWY_API Vec128 Shuffle01(const Vec128 v) { static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } template HWY_API Vec128 Shuffle1032(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } // Reverse template HWY_API Vec128 Shuffle0123(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
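// Illustrative usage sketch (variable names are ours, not from the library):
//
//   const Full128<uint32_t> d;
//   const int32_t idx[4] = {3, 0, 1, 2};
//   const auto indices = SetTableIndices(d, idx);
//   const auto v = Iota(d, 0);                    // {0, 1, 2, 3}
//   const auto r = TableLookupLanes(v, indices);  // {3, 0, 1, 2}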
template struct Indices128 { __v128_u raw; }; namespace detail { template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; return Iota(d8, 0); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes( D d) { const Repartition d8; alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; return Load(d8, kBroadcastLaneBytes); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; return Zero(d8); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}; return Load(d8, kByteOffsets); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; return Load(d8, kByteOffsets); } template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) { const Repartition d8; alignas(16) static constexpr uint8_t kByteOffsets[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; return Load(d8, kByteOffsets); } } // namespace detail template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif (void)d; return Indices128, MaxLanes(D())>{vec.raw}; } template HWY_API Indices128, MaxLanes(D())> IndicesFromVec( D d, Vec128 vec) { using T = TFromD; static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const RebindToUnsigned du; using TU = TFromD; HWY_DASSERT(AllTrue( du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2))))); #endif const Repartition d8; using V8 = VFromD; // Broadcast each lane index to all bytes of T and shift to bytes const V8 lane_indices = TableLookupBytes( BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d)); constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T))); const V8 byte_indices = ShiftLeft(lane_indices); const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d)); return Indices128, MaxLanes(D())>{BitCast(d, sum).raw}; } template HWY_API Indices128, HWY_MAX_LANES_D(D)> SetTableIndices( D d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { using TI = MakeSigned; const DFromV d; const Rebind di; return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Twice dt; // TableLookupLanes currently requires table and index vectors to be the same // size, though a half-length index vector would be sufficient here. 
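// Under MemorySanitizer the undefined upper half of idx.raw would be flagged
// as uninitialized, so build a full-width index vector by duplicating the
// valid half; otherwise the upper half may be left as-is because only
// LowerHalf of the lookup result is kept.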
#if HWY_IS_MSAN const Vec128 idx_vec{idx.raw}; const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw}; #else // We only keep LowerHalf of the result, which is valid in idx. const Indices128 idx2{idx.raw}; #endif return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); } template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b, Indices128 idx) { const DFromV d; const Repartition du8; const VFromD byte_idx{idx.raw}; const auto byte_idx_mod = byte_idx & Set(du8, uint8_t{0x0F}); // If ANDing did not change the index, it is for the lower half. const auto is_lo = (byte_idx == byte_idx_mod); return BitCast(d, IfThenElse(is_lo, TableLookupBytes(a, byte_idx_mod), TableLookupBytes(b, byte_idx_mod))); } // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) // Single lane: no change template , HWY_IF_LANES_D(D, 1)> HWY_API Vec128 Reverse(D /* tag */, Vec128 v) { return v; } // 32-bit x2: shuffle template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec64 Reverse(D /* tag */, const Vec64 v) { return Vec64{Shuffle2301(Vec128{v.raw}).raw}; } // 64-bit x2: shuffle template , HWY_IF_T_SIZE(T, 8)> HWY_API Vec128 Reverse(D /* tag */, const Vec128 v) { return Shuffle01(v); } // 32-bit x2: shuffle template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 Reverse(D /* tag */, const Vec128 v) { return Shuffle0123(v); } // 16-bit template HWY_API VFromD Reverse(D d, const VFromD v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); } template HWY_API VFromD Reverse(D d, const VFromD v) { static constexpr int kN = 16 + Lanes(d); return VFromD{wasm_i8x16_shuffle( v.raw, v.raw, // kN is adjusted to ensure we have valid indices for all lengths. kN - 1, kN - 2, kN - 3, kN - 4, kN - 5, kN - 6, kN - 7, kN - 8, kN - 9, kN - 10, kN - 11, kN - 12, kN - 13, kN - 14, kN - 15, kN - 16)}; } // ------------------------------ Reverse2 template HWY_API VFromD Reverse2(D d, const VFromD v) { const RepartitionToWide> dw; return BitCast(d, RotateRight<16>(BitCast(dw, v))); } template HWY_API VFromD Reverse2(D /* tag */, const VFromD v) { return Shuffle2301(v); } template HWY_API VFromD Reverse2(D /* tag */, const VFromD v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { return VFromD{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}; } template HWY_API VFromD Reverse4(D /* tag */, const VFromD v) { return Shuffle0123(v); } template HWY_API VFromD Reverse4(D /* tag */, const VFromD) { HWY_ASSERT(0); // don't have 8 u64 lanes } // ------------------------------ Reverse8 template HWY_API VFromD Reverse8(D d, const VFromD v) { return Reverse(d, v); } template HWY_API VFromD Reverse8(D /* tag */, const VFromD) { HWY_ASSERT(0); // don't have 8 lanes for > 16-bit lanes } // ------------------------------ InterleaveLower template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 
17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b))); } // Additional overload for the optional tag (all vector lengths). template HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // All functions inside detail lack the required D parameter. namespace detail { template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } } // namespace detail // Full template > HWY_API Vec128 InterleaveUpper(D /* tag */, Vec128 a, Vec128 b) { return detail::InterleaveUpper(a, b); } // Partial template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) { const Half d2; return InterleaveLower(d, VFromD{UpperHalf(d2, a).raw}, VFromD{UpperHalf(d2, b).raw}); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. 
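// Illustrative sketch (a, b, lo, hi are placeholder names): for u8 vectors
// a = {a0, a1, ...} and b = {b0, b1, ...}, each result lane is a zipped pair
// with b[i] in the upper byte and a[i] in the lower byte (little-endian):
//
//   const Full128<uint8_t> d8;
//   const RepartitionToWide<decltype(d8)> d16;
//   const auto lo = ZipLower(d16, a, b);  // u16 lanes {b0a0, b1a1, ..., b7a7}
//   const auto hi = ZipUpper(d16, a, b);  // u16 lanes {b8a8, ..., b15a15}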
template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ------------------------------ Per4LaneBlockShuffle namespace detail { template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<1> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); constexpr int kIdx0 = static_cast(kIdx3210 & 3); return V{wasm_i8x16_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4, kIdx0 + 8, kIdx1 + 8, kIdx2 + 8, kIdx3 + 8, kIdx0 + 12, kIdx1 + 12, kIdx2 + 12, kIdx3 + 12)}; } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); constexpr int kIdx0 = static_cast(kIdx3210 & 3); return V{wasm_i16x8_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3, kIdx0 + 4, kIdx1 + 4, kIdx2 + 4, kIdx3 + 4)}; } template HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag /*idx_3210_tag*/, hwy::SizeTag<4> /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { constexpr int kIdx3 = static_cast((kIdx3210 >> 6) & 3); constexpr int kIdx2 = static_cast((kIdx3210 >> 4) & 3); constexpr int kIdx1 = static_cast((kIdx3210 >> 2) & 3); constexpr int kIdx0 = static_cast(kIdx3210 & 3); return V{wasm_i32x4_shuffle(v.raw, v.raw, kIdx0, kIdx1, kIdx2, kIdx3)}; } } // namespace detail // ------------------------------ SlideUpLanes namespace detail { template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Full64 du64; const auto vu64 = ResizeBitCast(du64, v); return ResizeBitCast( d, ShiftLeftSame(vu64, static_cast(amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Repartition du8; const auto idx = Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); } } // namespace detail template HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return 
ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); case 8: return ShiftLeftLanes<8>(d, v); case 9: return ShiftLeftLanes<9>(d, v); case 10: return ShiftLeftLanes<10>(d, v); case 11: return ShiftLeftLanes<11>(d, v); case 12: return ShiftLeftLanes<12>(d, v); case 13: return ShiftLeftLanes<13>(d, v); case 14: return ShiftLeftLanes<14>(d, v); case 15: return ShiftLeftLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } // ------------------------------ SlideDownLanes namespace detail { template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition, decltype(d)> dv; return BitCast(d, ShiftRightSame(BitCast(dv, v), static_cast(amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition di8; auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); } } // namespace detail template HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); case 2: return ShiftRightLanes<2>(d, v); case 3: return ShiftRightLanes<3>(d, v); case 4: return ShiftRightLanes<4>(d, v); case 5: return ShiftRightLanes<5>(d, v); case 6: return ShiftRightLanes<6>(d, v); case 7: return ShiftRightLanes<7>(d, v); case 8: return ShiftRightLanes<8>(d, v); case 9: return ShiftRightLanes<9>(d, v); case 10: return ShiftRightLanes<10>(d, v); case 11: return ShiftRightLanes<11>(d, v); case 12: return ShiftRightLanes<12>(d, v); case 13: return ShiftRightLanes<13>(d, v); case 14: return ShiftRightLanes<14>(d, v); case 15: return ShiftRightLanes<15>(d, v); } } #else (void)d; #endif 
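// amt is not a compile-time constant here: fall back to the table-lookup
// implementation, which shifts in zeros for lanes moved in from the top.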
return detail::SlideDownLanes(v, amt); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template >> HWY_API VFromD Combine(D d, VH hi_half, VH lo_half) { const Half dh; const RebindToUnsigned duh; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; const VU lo{BitCast(duh, lo_half).raw}; const VU hi{BitCast(duh, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (IfThenElseZero) template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) { const Half dh; return IfThenElseZero(FirstN(d, MaxLanes(dh)), VFromD{lo.raw}); } // ------------------------------ ConcatLowerLower template > HWY_API Vec128 ConcatLowerLower(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; } // ------------------------------ ConcatUpperUpper template > HWY_API Vec128 ConcatUpperUpper(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; } // ------------------------------ ConcatLowerUpper template > HWY_API Vec128 ConcatLowerUpper(D d, Vec128 hi, Vec128 lo) { return CombineShiftRightBytes<8>(d, hi, lo); } // ------------------------------ ConcatUpperLower template > HWY_API Vec128 ConcatUpperLower(D d, Vec128 hi, Vec128 lo) { return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); } // ------------------------------ Concat partial (Combine, LowerHalf) template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatLowerUpper(D d, const VFromD hi, const VFromD lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ ConcatOdd // 8-bit full template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31)}; } // 8-bit x8 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec64 ConcatOdd(D /* tag */, Vec64 hi, Vec64 lo) { // Don't care about upper half. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 17, 19, 21, 23, 1, 3, 5, 7, 17, 19, 21, 23)}; } // 8-bit x4 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatOdd(D /* tag */, Vec32 hi, Vec32 lo) { // Don't care about upper 3/4. return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 3, 17, 19, 1, 3, 17, 19, 1, 3, 17, 19, 1, 3, 17, 19)}; } // 16-bit full template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 5, 7, 9, 11, 13, 15)}; } // 16-bit x4 template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec64 ConcatOdd(D /* tag */, Vec64 hi, Vec64 lo) { // Don't care about upper half. 
return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 1, 3, 9, 11, 1, 3, 9, 11)}; } // 32-bit full template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 ConcatOdd(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; } // Any T x2 template , HWY_IF_LANES_D(D, 2)> HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) { return InterleaveUpper(d, lo, hi); } // ------------------------------ ConcatEven (InterleaveLower) // 8-bit full template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30)}; } // 8-bit x8 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec64 ConcatEven(D /* tag */, Vec64 hi, Vec64 lo) { // Don't care about upper half. return Vec64{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 16, 18, 20, 22, 0, 2, 4, 6, 16, 18, 20, 22)}; } // 8-bit x4 template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatEven(D /* tag */, Vec32 hi, Vec32 lo) { // Don't care about upper 3/4. return Vec32{wasm_i8x16_shuffle(lo.raw, hi.raw, 0, 2, 16, 18, 0, 2, 16, 18, 0, 2, 16, 18, 0, 2, 16, 18)}; } // 16-bit full template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{ wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 4, 6, 8, 10, 12, 14)}; } // 16-bit x4 template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec64 ConcatEven(D /* tag */, Vec64 hi, Vec64 lo) { // Don't care about upper half. return Vec64{wasm_i16x8_shuffle(lo.raw, hi.raw, 0, 2, 8, 10, 0, 2, 8, 10)}; } // 32-bit full template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 ConcatEven(D /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; } // Any T x2 template , HWY_IF_LANES_D(D, 2)> HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) { return InterleaveLower(d, lo, hi); } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{wasm_i8x16_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14)}; } template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 0, 0, 2, 2, 4, 4, 6, 6)}; } template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; } template HWY_API Vec128 DupEven(const Vec128 v) { return InterleaveLower(DFromV(), v, v); } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{wasm_i8x16_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15)}; } template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 1, 1, 3, 3, 5, 5, 7, 7)}; } template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; } template HWY_API Vec128 DupOdd(const Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ OddEven namespace detail { template HWY_INLINE Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d8; alignas(16) static constexpr uint8_t mask[16] = { 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<4> /* tag */, 
const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; } } // namespace detail template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return detail::OddEven(hwy::SizeTag(), a, b); } template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) { return v; } // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned: zero-extend. template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_extend_low_u16x8(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u64x2_extend_low_u32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_extend_low_u16x8(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u64x2_extend_low_u32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } // U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to // TFromD template HWY_API VFromD PromoteTo(D d, V v) { const Rebind du32; return PromoteTo(d, PromoteTo(du32, v)); } // Signed: replicate sign bit. 
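// For example (illustrative): promoting int8_t lanes {-1, 2} to int16_t
// yields {-1, 2} == {0xFFFF, 0x0002}; the sign bit fills the new upper byte,
// in contrast to the zero-extending unsigned overloads above.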
template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i16x8_extend_low_i8x16(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i32x4_extend_low_i16x8(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i64x2_extend_low_i32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; } // I8/I16 to I64: First, promote to I32, and then promote to I64 template HWY_API VFromD PromoteTo(D d, V v) { const Rebind di32; return PromoteTo(d, PromoteTo(di32, v)); } template HWY_API VFromD PromoteTo(D df32, VFromD> v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_f64x2_convert_low_i32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_f64x2_convert_low_u32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_f64x2_promote_low_f32x4(v.raw)}; } template HWY_API VFromD PromoteTo(D di64, VFromD> v) { const Rebind di32; const RebindToFloat df32; const RebindToUnsigned du32; const Repartition du32_as_du8; const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), BitCast(du32_as_du8, Set(du32, uint32_t{157}))), BitCast(du32_as_du8, Set(du32, uint32_t{32})))); const auto adj_v = BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); const auto f32_to_i32_result = ConvertTo(di32, adj_v); const auto lo64_or_mask = PromoteTo( di64, BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, Set(di32, LimitsMax()))))); return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) << PromoteTo(di64, exponent_adj), lo64_or_mask); } template HWY_API VFromD PromoteTo(D du64, VFromD> v) { const Rebind du32; const RebindToFloat df32; const Repartition du32_as_du8; const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), BitCast(du32_as_du8, Set(du32, uint32_t{158}))), BitCast(du32_as_du8, Set(du32, uint32_t{32})))); const auto adj_v = BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); const auto f32_to_u32_result = ConvertTo(du32, adj_v); const auto lo32_or_mask = PromoteTo( du64, VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax()))); return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj), lo32_or_mask); } // ------------------------------ PromoteUpperTo // Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. #ifdef HWY_NATIVE_PROMOTE_UPPER_TO #undef HWY_NATIVE_PROMOTE_UPPER_TO #else #define HWY_NATIVE_PROMOTE_UPPER_TO #endif // Unsigned: zero-extend. 
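// For example (illustrative): for a full u8x16 vector, PromoteUpperTo with a
// u16x8 tag widens lanes 8..15, whereas PromoteTo above widens lanes 0..7;
// a u8 lane holding 0xFF becomes 0x00FF.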
template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u16x8_extend_high_u8x16(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_extend_high_u16x8(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u64x2_extend_high_u32x4(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u16x8_extend_high_u8x16(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_extend_high_u16x8(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_u64x2_extend_high_u32x4(v.raw)}; } // Signed: replicate sign bit. template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_i16x8_extend_high_i8x16(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_i32x4_extend_high_i16x8(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D /* tag */, VFromD> v) { return VFromD{wasm_i64x2_extend_high_i32x4(v.raw)}; } template HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { const Rebind dh; return PromoteTo(df32, UpperHalf(dh, v)); } template HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { const Repartition du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); } template HWY_API VFromD PromoteUpperTo(D dd, VFromD> v) { // There is no wasm_f64x2_convert_high_i32x4. return PromoteTo(dd, UpperHalf(Rebind(), v)); } template HWY_API VFromD PromoteUpperTo(D dd, VFromD> v) { // There is no wasm_f64x2_convert_high_u32x4. return PromoteTo(dd, UpperHalf(Rebind(), v)); } template HWY_API VFromD PromoteUpperTo(D dd, VFromD> v) { // There is no wasm_f64x2_promote_high_f32x4. return PromoteTo(dd, UpperHalf(Rebind(), v)); } template HWY_API VFromD PromoteUpperTo(D d64, VFromD> v) { return PromoteTo(d64, UpperHalf(Rebind(), v)); } // Generic version for <=64 bit input/output (_high is only for full vectors). 
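// For such vectors the wasm *_high intrinsics (which read the upper 64 bits
// of a full register) do not apply, so take UpperHalf of the input and reuse
// the low-half PromoteTo path.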
template HWY_API VFromD PromoteUpperTo(D d, V v) { const Rebind, decltype(d)> dh; return PromoteTo(d, UpperHalf(dh, v)); } // ------------------------------ Demotions (full -> part w/ narrow lanes) template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return VFromD{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return VFromD{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API VFromD DemoteTo(D dn, VFromD> v) { const DFromV du32; const RebindToSigned di32; return DemoteTo(dn, BitCast(di32, Min(v, Set(du32, 0x7FFFFFFF)))); } template HWY_API VFromD DemoteTo(D du8, VFromD> v) { const DFromV du16; const RebindToSigned di16; return DemoteTo(du8, BitCast(di16, Min(v, Set(du16, 0x7FFF)))); } template HWY_API VFromD DemoteTo(D dbf16, VFromD> v) { const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_trunc_sat_f64x2_zero(v.raw)}; } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD{wasm_f32x4_demote_f64x2_zero(v.raw)}; } template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64_63 = Set(df64, 27670116110564327424.0); const auto f64_hi52 = Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto f64_bits_decrement = And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))), f64_sum_is_inexact); const auto adj_f64_val = BitCast( df64, Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } template HWY_API VFromD DemoteTo(D df32, VFromD> v) { const Rebind df64; const RebindToUnsigned du64; const RebindToSigned di32; const RebindToUnsigned du32; const auto k2p64 = Set(df64, 18446744073709551616.0); const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64; const auto f64_lo12 = PromoteTo(df64, BitCast(di32, And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x00000FFF})))); const auto f64_sum = f64_hi52 + f64_lo12; const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12; const auto f64_sum_is_inexact = ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64)))); const auto adj_f64_val = BitCast( df64, 
Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)), f64_sum_is_inexact)); return DemoteTo(df32, adj_f64_val); } template >> HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) { const RebindToUnsigned du16; const Repartition du32; const VFromD b_in_even = ShiftRight<16>(BitCast(du32, b)); return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); } // Specializations for partial vectors because i16x8_narrow_i32x4 sets lanes // above 2*N. template HWY_API Vec32 ReorderDemote2To(D dn, Vec32 a, Vec32 b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, Vec64 b) { const Twice dn_full; const Repartition du32_full; const Vec128 v_full{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_narrow_i32x4(a.raw, b.raw)}; } template HWY_API Vec32 ReorderDemote2To(D dn, Vec32 a, Vec32 b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, Vec64 b) { const Twice dn_full; const Repartition du32_full; const Vec128 v_full{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return Vec128{wasm_u16x8_narrow_i32x4(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV du32; const RebindToSigned di32; const auto max_i32 = Set(du32, 0x7FFFFFFFu); const auto clamped_a = BitCast(di32, Min(a, max_i32)); const auto clamped_b = BitCast(di32, Min(b, max_i32)); return ReorderDemote2To(dn, clamped_a, clamped_b); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } // Specializations for partial vectors because i8x16_narrow_i16x8 sets lanes // above 2*N. 
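// For example (illustrative): with 4-lane i16 inputs, the 16-byte result of
// the narrow holds a's valid bytes at positions 0..3 and b's at 8..11 (the
// rest come from undefined upper input lanes), so the even 32-bit lanes are
// compacted with ConcatEven and only the lower half is kept.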
template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, Vec64 b) { const Twice dn_full; const Repartition du32_full; const Vec128 v_full{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_narrow_i16x8(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API Vec64 ReorderDemote2To(D dn, Vec64 a, Vec64 b) { const Twice dn_full; const Repartition du32_full; const Vec128 v_full{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; const auto vu32_full = BitCast(du32_full, v_full); return LowerHalf( BitCast(dn_full, ConcatEven(du32_full, vu32_full, vu32_full))); } template HWY_API Vec128 ReorderDemote2To(D /* tag */, Vec128 a, Vec128 b) { return Vec128{wasm_u8x16_narrow_i16x8(a.raw, b.raw)}; } template HWY_API VFromD ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV du16; const RebindToSigned di16; const auto max_i16 = Set(du16, 0x7FFFu); const auto clamped_a = BitCast(di16, Min(a, max_i16)); const auto clamped_b = BitCast(di16, Min(b, max_i16)); return ReorderDemote2To(dn, clamped_a, clamped_b); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } // For already range-limited input [0, 255]. template HWY_API Vec128 U8FromU32(const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{ wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } // ------------------------------ Truncations template HWY_API VFromD TruncateTo(DTo /* tag */, Vec128 v) { // BitCast requires the same size; DTo might be u8x1 and v u16x1. 
const Repartition, DFromV> dto; return VFromD{BitCast(dto, v).raw}; } template HWY_API Vec16 TruncateTo(D /* tag */, Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); const auto v2 = ConcatEven(d, v1, v1); const auto v4 = ConcatEven(d, v2, v2); return LowerHalf(LowerHalf(LowerHalf(ConcatEven(d, v4, v4)))); } template HWY_API Vec32 TruncateTo(D /* tag */, Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); const auto v2 = ConcatEven(d, v1, v1); return LowerHalf(LowerHalf(ConcatEven(d, v2, v2))); } template HWY_API Vec64 TruncateTo(D /* tag */, Vec128 v) { const Full128 d; const auto v1 = BitCast(d, v); return LowerHalf(ConcatEven(d, v1, v1)); } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const Repartition> d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); const auto v3 = ConcatEven(d, v2, v2); return VFromD{v3.raw}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const Repartition> d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); return VFromD{v2.raw}; } template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) { const Repartition> d; const auto v1 = Vec128{v.raw}; const auto v2 = ConcatEven(d, v1, v1); return VFromD{v2.raw}; } // ------------------------------ Demotions to/from i64 namespace detail { template HWY_INLINE VFromD> DemoteFromU64MaskOutResult( D /*dn*/, VFromD> v) { return v; } template HWY_INLINE VFromD> DemoteFromU64MaskOutResult( D /*dn*/, VFromD> v) { const DFromV du64; return And(v, Set(du64, static_cast(hwy::HighestValue>()))); } template HWY_INLINE VFromD> DemoteFromU64Saturate( D dn, VFromD> v) { const Rebind du64; const RebindToSigned di64; constexpr int kShiftAmt = static_cast(sizeof(TFromD) * 8) - static_cast(hwy::IsSigned>()); const auto too_big = BitCast( du64, VecFromMask( di64, Gt(BitCast(di64, ShiftRight(v)), Zero(di64)))); return DemoteFromU64MaskOutResult(dn, Or(v, too_big)); } template HWY_INLINE VFromD ReorderDemote2From64To32Combine(D dn, V a, V b) { return ConcatEven(dn, BitCast(dn, b), BitCast(dn, a)); } } // namespace detail template HWY_API VFromD DemoteTo(D dn, VFromD> v) { const DFromV di64; const RebindToUnsigned du64; const RebindToUnsigned dn_u; // Negative values are saturated by first saturating their bitwise inverse // and then inverting the saturation result const auto invert_mask = BitCast(du64, BroadcastSignBit(v)); const auto saturated_vals = Xor( invert_mask, detail::DemoteFromU64Saturate(dn, Xor(invert_mask, BitCast(du64, v)))); return BitCast(dn, TruncateTo(dn_u, saturated_vals)); } template HWY_API VFromD DemoteTo(D dn, VFromD> v) { const DFromV di64; const RebindToUnsigned du64; const auto non_neg_vals = BitCast(du64, AndNot(BroadcastSignBit(v), v)); return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, non_neg_vals)); } template HWY_API VFromD DemoteTo(D dn, VFromD> v) { return TruncateTo(dn, detail::DemoteFromU64Saturate(dn, v)); } template )> HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API VFromD ReorderDemote2To(D dn, VFromD> a, VFromD> b) { const DFromV d; const Twice dt; return DemoteTo(dn, Combine(dt, b, a)); } template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; // Negative values are saturated by first saturating their bitwise inverse // and then inverting the saturation result const auto invert_mask_a = BitCast(du64, BroadcastSignBit(a)); const 
auto invert_mask_b = BitCast(du64, BroadcastSignBit(b)); const auto saturated_a = Xor( invert_mask_a, detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_a, BitCast(du64, a)))); const auto saturated_b = Xor( invert_mask_b, detail::DemoteFromU64Saturate(dnh, Xor(invert_mask_b, BitCast(du64, b)))); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const DFromV di64; const RebindToUnsigned du64; const Half dnh; const auto saturated_a = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(a), a))); const auto saturated_b = detail::DemoteFromU64Saturate( dnh, BitCast(du64, AndNot(BroadcastSignBit(b), b))); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } template HWY_API Vec128 ReorderDemote2To(D dn, Vec128 a, Vec128 b) { const Half dnh; const auto saturated_a = detail::DemoteFromU64Saturate(dnh, a); const auto saturated_b = detail::DemoteFromU64Saturate(dnh, b); return ConcatEven(dn, BitCast(dn, saturated_b), BitCast(dn, saturated_a)); } template ), class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD) * 2), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedDemote2To(D d, V a, V b) { return ReorderDemote2To(d, a, b); } template >> HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) { const RebindToUnsigned du16; return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a))); } // ------------------------------ ConvertTo template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{wasm_f32x4_convert_i32x4(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{wasm_f32x4_convert_u32x4(v.raw)}; } template HWY_API VFromD ConvertTo(D dd, VFromD> v) { // Based on wim's approach (https://stackoverflow.com/questions/41144668/) const Repartition d32; const Repartition d64; // Toggle MSB of lower 32-bits and insert exponent for 2^84 + 2^63 const auto k84_63 = Set(d64, 0x4530000080000000ULL); const auto v_upper = BitCast(dd, ShiftRight<32>(BitCast(d64, v)) ^ k84_63); // Exponent is 2^52, lower 32 bits from v (=> 32-bit OddEven) const auto k52 = Set(d32, 0x43300000); const auto v_lower = BitCast(dd, OddEven(k52, BitCast(d32, v))); const auto k84_63_52 = BitCast(dd, Set(d64, 0x4530000080100000ULL)); return (v_upper - k84_63_52) + v_lower; // order matters! } namespace detail { template HWY_INLINE VFromD>> U64ToF64VecFast(VW w) { const DFromV d64; const RebindToFloat dd; const auto cnst2_52_dbl = Set(dd, 0x0010000000000000); // 2^52 return BitCast(dd, Or(w, BitCast(d64, cnst2_52_dbl))) - cnst2_52_dbl; } } // namespace detail template HWY_API VFromD ConvertTo(D dd, VFromD> v) { // Based on wim's approach (https://stackoverflow.com/questions/41144668/) const RebindToUnsigned d64; using VU = VFromD; const VU msk_lo = Set(d64, 0xFFFFFFFF); const auto cnst2_32_dbl = Set(dd, 4294967296.0); // 2^32 // Extract the 32 lowest/highest significant bits of v const VU v_lo = And(v, msk_lo); const VU v_hi = ShiftRight<32>(v); const auto v_lo_dbl = detail::U64ToF64VecFast(v_lo); return MulAdd(cnst2_32_dbl, detail::U64ToF64VecFast(v_hi), v_lo_dbl); } // Truncates (rounds toward zero). 
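// For example (illustrative): with df = Full128<float>() and
// di = RebindToSigned<decltype(df)>(), ConvertTo(di, Set(df, -3.9f)) yields
// -3 in every lane; out-of-range inputs saturate to the int32_t limits and
// NaN becomes 0 (wasm_i32x4_trunc_sat_f32x4 semantics).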
template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{wasm_i32x4_trunc_sat_f32x4(v.raw)}; } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD{wasm_u32x4_trunc_sat_f32x4(v.raw)}; } template HWY_API VFromD ConvertTo(DI di, VFromD> v) { using VI = VFromD; using MI = MFromD; const RebindToUnsigned du; using VU = VFromD; const Repartition du16; const VI k1075 = Set(di, 1075); // biased exponent of 2^52 // Exponent indicates whether the number can be represented as int64_t. const VU biased_exp = ShiftRight<52>(BitCast(du, v)) & Set(du, 0x7FF); const MI in_range = BitCast(di, biased_exp) < Set(di, 1086); // If we were to cap the exponent at 51 and add 2^52, the number would be in // [2^52, 2^53) and mantissa bits could be read out directly. We need to // round-to-0 (truncate). // Use 16-bit saturated unsigned subtraction to compute shift_mnt and // shift_int since biased_exp[i] is a non-negative integer that is less than // or equal to 2047. // The upper 48 bits of both shift_mnt and shift_int are guaranteed to be // zero as the upper 48 bits of both k1075 and biased_exp are zero. const VU shift_mnt = BitCast( du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); const VU shift_int = BitCast( du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); const VU mantissa = BitCast(du, v) & Set(du, (1ULL << 52) - 1); // Include implicit 1-bit VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; // WASM clamps shift count; zero if greater. const MI tiny = BitCast(di, shift_mnt) > Set(di, 63); int53 = IfThenZeroElse(RebindMask(du, tiny), int53); // For inputs larger than 2^53 - 1, insert zeros at the bottom. // For inputs less than 2^63, the implicit 1-bit is guaranteed not to be // shifted out of the left shift result below as shift_int[i] <= 10 is true // for any inputs that are less than 2^63. const VU shifted = int53 << shift_int; // Saturate to LimitsMin (unchanged when negating below) or LimitsMax. const VI sign_mask = BroadcastSignBit(BitCast(di, v)); const VI limit = Set(di, LimitsMax()) - sign_mask; const VI magnitude = IfThenElse(in_range, BitCast(di, shifted), limit); // If the input was negative, negate the integer (two's complement). return (magnitude ^ sign_mask) - sign_mask; } template HWY_API VFromD ConvertTo(DU du, VFromD> v) { const RebindToSigned di; using MI = MFromD; using VU = VFromD; const Repartition du16; const VU k1075 = Set(du, 1075); /* biased exponent of 2^52 */ const auto non_neg_v = ZeroIfNegative(v); // Exponent indicates whether the number can be represented as int64_t. const VU biased_exp = ShiftRight<52>(BitCast(du, non_neg_v)); const VU out_of_range = BitCast(du, VecFromMask(di, BitCast(di, biased_exp) > Set(di, 1086))); // If we were to cap the exponent at 51 and add 2^52, the number would be in // [2^52, 2^53) and mantissa bits could be read out directly. We need to // round-to-0 (truncate), but changing rounding mode in MXCSR hits a // compiler reordering bug: https://gcc.godbolt.org/z/4hKj6c6qc . We instead // manually shift the mantissa into place (we already have many of the // inputs anyway). // Use 16-bit saturated unsigned subtraction to compute shift_mnt and // shift_int since biased_exp[i] is a non-negative integer that is less than // or equal to 2047. // 16-bit saturated unsigned subtraction is also more efficient than a // 64-bit subtraction followed by a 64-bit signed Max operation on // WASM. 
// The upper 48 bits of both shift_mnt and shift_int are guaranteed to be // zero as the upper 48 bits of both k1075 and biased_exp are zero. const VU shift_mnt = BitCast( du, SaturatedSub(BitCast(du16, k1075), BitCast(du16, biased_exp))); const VU shift_int = BitCast( du, SaturatedSub(BitCast(du16, biased_exp), BitCast(du16, k1075))); const VU mantissa = BitCast(du, non_neg_v) & Set(du, (1ULL << 52) - 1); // Include implicit 1-bit. VU int53 = (mantissa | Set(du, 1ULL << 52)) >> shift_mnt; // WASM clamps shift count; zero if greater. const MI tiny = BitCast(di, shift_mnt) > Set(di, 63); int53 = IfThenZeroElse(RebindMask(du, tiny), int53); // For inputs larger than 2^53 - 1, insert zeros at the bottom. // For inputs less than 2^64, the implicit 1-bit is guaranteed not to be // shifted out of the left shift result below as shift_int[i] <= 11 is true // for any inputs that are less than 2^64. const VU shifted = int53 << shift_int; return (shifted | out_of_range); } // ------------------------------ NearestInt (Round) template HWY_API Vec128 NearestInt(const Vec128 v) { return ConvertTo(RebindToSigned>(), Round(v)); } // ================================================== MISC // ------------------------------ SumsOf8 (ShiftRight, Add) template HWY_API Vec128 SumsOf8(const Vec128 v) { const DFromV du8; const RepartitionToWide du16; const RepartitionToWide du32; const RepartitionToWide du64; using VU16 = VFromD; const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); const VU16 szz_FE_zz_BA_zz_76_zz_32 = BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); const VU16 sxx_FC_xx_B8_xx_74_xx_30 = Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); const VU16 szz_zz_xx_FC_zz_zz_xx_74 = BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); } template HWY_API Vec128 SumsOf8(const Vec128 v) { const DFromV di8; const RepartitionToWide di16; const RepartitionToWide di32; const RepartitionToWide di64; const RebindToUnsigned du32; const RebindToUnsigned du64; using VI16 = VFromD; const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v)); const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v))); const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); const VI16 sDC_zz_98_zz_54_zz_10_zz = BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); const VI16 sFC_xx_B8_xx_74_xx_30_xx = Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz); const VI16 sB8_xx_zz_zz_30_xx_zz_zz = BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx))); const VI16 sF8_xx_xx_xx_70_xx_xx_xx = Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz); return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx)); } // ------------------------------ LoadMaskBits (TestBit) namespace detail { template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, N=1. const VFromD vbits{wasm_i32x4_splat(static_cast(bits))}; // Replicate bytes 8x such that each byte contains the bit that governs it. 
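  // For example, bits = 0x01A5: lanes 0..7 receive byte 0xA5 and lanes 8..15
  // receive byte 0x01, so the TestBit below sets lanes 0, 2, 5, 7 (from 0xA5)
  // and lane 8 (from 0x01).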
alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE MFromD LoadMaskBits(D d, uint64_t bits) { const RebindToUnsigned du; alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } } // namespace detail // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; CopyBytes<(MaxLanes(d) + 7) / 8>(bits, &mask_bits); return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Dup128MaskFromMaskBits template HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) { constexpr size_t kN = MaxLanes(d); if (kN < 8) mask_bits &= (1u << kN) - 1; return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Mask namespace detail { // Full template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; return (hi + lo); } // 64-bit template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * kMagic) >> 56; } // 32-bit or less: need masking template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. bytes &= (1ULL << (N * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (bytes * kMagic) >> 56; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. 
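  // Each 16-bit mask lane is either 0 or 0xFFFF; the signed saturation of
  // i8x16_narrow turns these into 0 or 0xFF bytes (the upper eight bytes come
  // from `zero`), so the result can be fed to the 1-byte BitsFromMask above.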
const __i16x8 zero = wasm_i16x8_splat(0); const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; return BitsFromMask(hwy::SizeTag<1>(), mask8); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, const Mask128 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1] | lanes[2] | lanes[3]; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1]; } // Returns the lowest N bits for the BitsFromMask result. template constexpr uint64_t OnlyActive(uint64_t bits) { return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); } // Returns 0xFF for bytes with index >= N, otherwise 0. template constexpr __i8x16 BytesAbove() { return /**/ (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1) : (N == 11) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) : (N == 13) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, shifted_bits); return PopCount(lanes[0] | lanes[1]); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { alignas(16) int64_t lanes[2]; wasm_v128_store(lanes, m.raw); return static_cast(-(lanes[0] + lanes[1])); } } // namespace detail // `p` points to at least 8 writable bytes. 
template HWY_API size_t StoreMaskBits(D d, const MFromD mask, uint8_t* bits) { const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (d.MaxLanes() + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; } template HWY_API size_t CountTrue(D /* tag */, const MFromD m) { return detail::CountTrue(hwy::SizeTag)>(), m); } // Partial template , HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API size_t CountTrue(D d, MFromD m) { // Ensure all undefined bytes are 0. const MFromD mask{detail::BytesAbove()}; const Full128 dfull; return CountTrue(dfull, Mask128{AndNot(mask, m).raw}); } // Full vector template HWY_API bool AllFalse(D d, const MFromD m) { const auto v8 = BitCast(Full128(), VecFromMask(d, m)); return !wasm_v128_any_true(v8.raw); } // Full vector namespace detail { template HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { return wasm_i8x16_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { return wasm_i16x8_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { return wasm_i32x4_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { return wasm_i64x2_all_true(m.raw); } } // namespace detail template > HWY_API bool AllTrue(D /* tag */, const Mask128 m) { return detail::AllTrue(hwy::SizeTag(), m); } // Partial vectors template , HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API bool AllFalse(D d, const MFromD m) { // Ensure all undefined bytes are 0. const MFromD mask{detail::BytesAbove()}; return AllFalse(Full128(), Mask128{AndNot(mask, m).raw}); } template , HWY_IF_V_SIZE_LE_D(D, 8)> HWY_API bool AllTrue(D d, const MFromD m) { // Ensure all undefined bytes are FF. const MFromD mask{detail::BytesAbove()}; return AllTrue(Full128(), Mask128{Or(mask, m).raw}); } template HWY_API size_t FindKnownFirstTrue(D /* tag */, const MFromD mask) { const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return Num0BitsBelowLS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindFirstTrue(D /* tag */, const MFromD mask) { const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero32(bits)) : -1; } template HWY_API size_t FindKnownLastTrue(D /* tag */, const MFromD mask) { const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return 31 - Num0BitsAboveMS1Bit_Nonzero32(bits); } template HWY_API intptr_t FindLastTrue(D /* tag */, const MFromD mask) { const uint32_t bits = static_cast(detail::BitsFromMask(mask)); return bits ? (31 - static_cast(Num0BitsAboveMS1Bit_Nonzero32(bits))) : -1; } // ------------------------------ Compress namespace detail { template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Rebind d8; const Simd du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. 
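  // The table below therefore stores 2*lane for each of the 8 lanes; the
  // ZipLower after the table duplicates each index into a u16 pair, and
  // adding 0x0100 turns {k, k} into the byte indices {k, k+1}, i.e. both
  // bytes of the selected 16-bit lane.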
alignas(16) static constexpr uint8_t table[256 * 8] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 
8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 
6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Rebind d8; const Simd du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. alignas(16) static constexpr uint8_t table[256 * 8] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, 
/**/ 2, 12, 14, 0, 4, 6, 8, 10, // 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 
6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. 
alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { // PrintCompress32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { // PrintCompressNot32x4Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { // PrintCompress64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } template HWY_INLINE Vec128 IdxFromNotBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. 
alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { // PrintCompressNot64x2Tables 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); } // Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. template HWY_INLINE Vec128 Compress(Vec128 v, const uint64_t mask_bits) { const auto idx = detail::IdxFromBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } template HWY_INLINE Vec128 CompressNot(Vec128 v, const uint64_t mask_bits) { const auto idx = detail::IdxFromNotBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } } // namespace detail template struct CompressIsPartition { #if HWY_TARGET == HWY_WASM_EMU256 enum { value = 0 }; #else enum { value = (sizeof(T) != 1) }; #endif }; // Single lane: no-op template HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskL, maskH); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 byte lanes template HWY_API Vec128 Compress(Vec128 v, Mask128 mask) { return detail::Compress(v, detail::BitsFromMask(mask)); } // Single lane: no-op template HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) { return v; } // Two lanes: conditional swap template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. const Full128 d; const Vec128 m = VecFromMask(d, mask); const Vec128 maskL = DupEven(m); const Vec128 maskH = DupOdd(m); const Vec128 swap = AndNot(maskH, maskL); return IfVecThenElse(swap, Shuffle01(v), v); } // General case, 2 or 4 byte lanes template HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) { // For partial vectors, we cannot pull the Not() into the table because // BitsFromMask clears the upper bits. 
if (N < 16 / sizeof(T)) { return detail::Compress(v, detail::BitsFromMask(Not(mask))); } return detail::CompressNot(v, detail::BitsFromMask(mask)); } // ------------------------------ CompressBlocksNot HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) { return v; } // ------------------------------ CompressBits template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::Compress(v, mask_bits); } // ------------------------------ CompressStore template HWY_API size_t CompressStore(VFromD v, MFromD mask, D d, TFromD* HWY_RESTRICT unaligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ CompressBlendedStore template HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const VFromD compressed = detail::Compress(BitCast(du, v), mask_bits); const MFromD store_mask = RebindMask(d, FirstN(du, count)); BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); return count; } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits, D d, TFromD* HWY_RESTRICT unaligned) { uint64_t mask_bits = 0; constexpr size_t kN = MaxLanes(d); CopyBytes<(kN + 7) / 8>(bits, &mask_bits); if (kN < 8) { mask_bits &= (1ull << kN) - 1; } const auto c = detail::Compress(v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ StoreInterleaved2/3/4 // HWY_NATIVE_LOAD_STORE_INTERLEAVED not set, hence defined in // generic_ops-inl.h. 
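// Illustrative usage sketch of CompressStore above; the function and its
// parameter names are hypothetical and not part of this header. It copies
// the lanes of `in` that are below `limit` to `out`, processing only whole
// vectors (a real implementation would also handle the remainder). Note that
// CompressStore may write a full vector's worth of bytes, so `out` requires
// one vector of slack beyond the compressed results.
HWY_MAYBE_UNUSED static size_t CopyIfBelowSketch(const float* HWY_RESTRICT in,
                                                 size_t num, float limit,
                                                 float* HWY_RESTRICT out) {
  const ScalableTag<float> d;
  const size_t N = Lanes(d);
  const auto vlimit = Set(d, limit);
  size_t written = 0;
  for (size_t i = 0; i + N <= num; i += N) {
    const auto v = LoadU(d, in + i);
    // Writes the selected lanes contiguously and returns how many there were.
    written += CompressStore(v, v < vlimit, d, out + written);
  }
  return written;
}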
// ------------------------------ Additional mask logical operations template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const FixedTag d; const auto vmask = VecFromMask(d, mask); return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Simd d; const auto vmask = VecFromMask(d, mask); const auto neg_vmask = ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); return MaskFromVec(Or(vmask, neg_vmask)); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Full128 d; const Repartition di64; auto vmask = BitCast(di64, VecFromMask(d, mask)); vmask = Or(vmask, Neg(vmask)); // Copy the sign bit of the first int64_t lane to the second int64_t lane const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); return MaskFromVec(BitCast(d, Or(vmask, vmask2))); } template HWY_API Mask128 SetBeforeFirst(Mask128 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const FixedTag d; const RebindToSigned di; const auto vmask = BitCast(di, VecFromMask(d, mask)); const auto zero = Zero(di); const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); return MaskFromVec(BitCast(d, And(vmask, vmask2))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Simd d; const RebindToSigned di; const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); const auto only_first_vmask = BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); return MaskFromVec(only_first_vmask); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Full128 d; const RebindToSigned di; const Repartition di64; const auto zero = Zero(di64); const auto vmask = BitCast(di64, VecFromMask(d, mask)); const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { const FixedTag d; const RebindToSigned di; using TI = MakeSigned; return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { const Simd d; return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); } // ------------------------------ MulEven/Odd (Load) HWY_INLINE Vec128 MulEven(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); return Load(Full128(), mul); } HWY_INLINE Vec128 MulOdd(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); return Load(Full128(), mul); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) // Generic for all vector lengths. template >> HWY_API VFromD WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) { const Rebind du32; using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 // Using shift/and instead of Zip leads to the odd/even order that // RearrangeToOddPlusEven prefers. 
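  // Because a bf16 value is exactly the upper 16 bits of the corresponding
  // f32, shifting the even-indexed bf16 lanes left by 16 (ae/be), or masking
  // the odd-indexed lanes in place (ao/bo), yields their f32 values directly
  // with the low mantissa bits zero.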
const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); return Mul(BitCast(df32, ae), BitCast(df32, be)) + Mul(BitCast(df32, ao), BitCast(df32, bo)); } template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD sum0, VFromD& sum1) { const Rebind du32; using VU32 = VFromD; const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32 // Using shift/and instead of Zip leads to the odd/even order that // RearrangeToOddPlusEven prefers. const VU32 ae = ShiftLeft<16>(BitCast(du32, a)); const VU32 ao = And(BitCast(du32, a), odd); const VU32 be = ShiftLeft<16>(BitCast(du32, b)); const VU32 bo = And(BitCast(du32, b), odd); sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1); return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0); } // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is // safe. template >> HWY_API VFromD WidenMulPairwiseAdd(D32 /* tag */, V16 a, V16 b) { return VFromD{wasm_i32x4_dot_i16x8(a.raw, b.raw)}; } template >> HWY_API VFromD WidenMulPairwiseAdd(DU32 du32, VU16 a, VU16 b) { const auto lo16_mask = Set(du32, 0x0000FFFFu); const auto a0 = And(BitCast(du32, a), lo16_mask); const auto b0 = And(BitCast(du32, b), lo16_mask); const auto a1 = ShiftRight<16>(BitCast(du32, a)); const auto b1 = ShiftRight<16>(BitCast(du32, b)); return MulAdd(a1, b1, a0 * b0); } // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is // safe. template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 d, V16 a, V16 b, const VFromD sum0, VFromD& /*sum1*/) { return sum0 + WidenMulPairwiseAdd(d, a, b); } // Even if N=1, the input is always at least 2 lanes, hence i32x4_dot_i16x8 is // safe. template >> HWY_API VFromD ReorderWidenMulAccumulate(DU32 d, VU16 a, VU16 b, const VFromD sum0, VFromD& /*sum1*/) { return sum0 + WidenMulPairwiseAdd(d, a, b); } // ------------------------------ RearrangeToOddPlusEven template HWY_API Vec128 RearrangeToOddPlusEven( const Vec128 sum0, const Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven( const Vec128 sum0, const Vec128 /*sum1*/) { return sum0; // invariant already holds } template HWY_API Vec128 RearrangeToOddPlusEven(const Vec128 sum0, const Vec128 sum1) { return Add(sum0, sum1); } // ------------------------------ Reductions // Nothing native, generic_ops-inl defines SumOfLanes and ReduceSum. // ------------------------------ Lt128 template HWY_INLINE MFromD Lt128(D d, VFromD a, VFromD b) { // Truth table of Eq and Lt for Hi and Lo u64. // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const MFromD eqHL = Eq(a, b); const VFromD ltHL = VecFromMask(d, Lt(a, b)); // We need to bring cL to the upper lane/bit corresponding to cH. Comparing // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the // comparison result leftwards requires only 4. IfThenElse compiles to the // same code as OrAnd(). 
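  // In the upper (H) lane, the IfThenElse below selects the low-lane result
  // (ltLx, duplicated from the even lane) when the high halves are equal and
  // the high-lane result otherwise, which is exactly cH | (=H & cL); DupOdd
  // then broadcasts that verdict to both lanes of the 128-bit block.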
const VFromD ltLx = DupEven(ltHL); const VFromD outHx = IfThenElse(eqHL, ltLx, ltHL); return MaskFromVec(DupOdd(outHx)); } template HWY_INLINE MFromD Lt128Upper(D d, VFromD a, VFromD b) { const VFromD ltHL = VecFromMask(d, Lt(a, b)); return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); } // ------------------------------ Eq128 template HWY_INLINE MFromD Eq128(D d, VFromD a, VFromD b) { const VFromD eqHL = VecFromMask(d, Eq(a, b)); return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); } template HWY_INLINE MFromD Eq128Upper(D d, VFromD a, VFromD b) { const VFromD eqHL = VecFromMask(d, Eq(a, b)); return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); } // ------------------------------ Ne128 template HWY_INLINE MFromD Ne128(D d, VFromD a, VFromD b) { const VFromD neHL = VecFromMask(d, Ne(a, b)); return MaskFromVec(Or(Reverse2(d, neHL), neHL)); } template HWY_INLINE MFromD Ne128Upper(D d, VFromD a, VFromD b) { const VFromD neHL = VecFromMask(d, Ne(a, b)); return MaskFromVec(InterleaveUpper(d, neHL, neHL)); } // ------------------------------ Min128, Max128 (Lt128) // Without a native OddEven, it seems infeasible to go faster than Lt128. template HWY_INLINE VFromD Min128(D d, const VFromD a, const VFromD b) { return IfThenElse(Lt128(d, a, b), a, b); } template HWY_INLINE VFromD Max128(D d, const VFromD a, const VFromD b) { return IfThenElse(Lt128(d, b, a), a, b); } template HWY_INLINE VFromD Min128Upper(D d, const VFromD a, const VFromD b) { return IfThenElse(Lt128Upper(d, a, b), a, b); } template HWY_INLINE VFromD Max128Upper(D d, const VFromD a, const VFromD b) { return IfThenElse(Lt128Upper(d, b, a), a, b); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE();