// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit WASM vectors and operations.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

#ifdef HWY_WASM_OLD_NAMES
#define wasm_i8x16_shuffle wasm_v8x16_shuffle
#define wasm_i16x8_shuffle wasm_v16x8_shuffle
#define wasm_i32x4_shuffle wasm_v32x4_shuffle
#define wasm_i64x2_shuffle wasm_v64x2_shuffle
#define wasm_u16x8_extend_low_u8x16 wasm_i16x8_widen_low_u8x16
#define wasm_u32x4_extend_low_u16x8 wasm_i32x4_widen_low_u16x8
#define wasm_i32x4_extend_low_i16x8 wasm_i32x4_widen_low_i16x8
#define wasm_i16x8_extend_low_i8x16 wasm_i16x8_widen_low_i8x16
#define wasm_u32x4_extend_high_u16x8 wasm_i32x4_widen_high_u16x8
#define wasm_i32x4_extend_high_i16x8 wasm_i32x4_widen_high_i16x8
#define wasm_i32x4_trunc_sat_f32x4 wasm_i32x4_trunc_saturate_f32x4
#define wasm_u8x16_add_sat wasm_u8x16_add_saturate
#define wasm_u8x16_sub_sat wasm_u8x16_sub_saturate
#define wasm_u16x8_add_sat wasm_u16x8_add_saturate
#define wasm_u16x8_sub_sat wasm_u16x8_sub_saturate
#define wasm_i8x16_add_sat wasm_i8x16_add_saturate
#define wasm_i8x16_sub_sat wasm_i8x16_sub_saturate
#define wasm_i16x8_add_sat wasm_i16x8_add_saturate
#define wasm_i16x8_sub_sat wasm_i16x8_sub_saturate
#endif

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;

template <typename T>
using Full64 = Simd<T, 8 / sizeof(T)>;

namespace detail {

template <typename T>
struct Raw128 {
  using type = __v128_u;
};
template <>
struct Raw128<float> {
  using type = __f32x4;
};

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
  using Raw = typename detail::Raw128<T>::type;

 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

// FF..FF or 0.
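// (A mask lane is an all-ones or all-zero bit pattern of the same width as
// the corresponding vector lane; see the "Mask" section further below.)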
template struct Mask128 { typename detail::Raw128::type raw; }; namespace detail { // Deduce Simd from Vec128 struct DeduceD { template Simd operator()(Vec128) const { return Simd(); } }; } // namespace detail template using DFromV = decltype(detail::DeduceD()(V())); template using TFromV = TFromD>; // ------------------------------ BitCast namespace detail { HWY_INLINE __v128_u BitCastToInteger(__v128_u v) { return v; } HWY_INLINE __v128_u BitCastToInteger(__f32x4 v) { return static_cast<__v128_u>(v); } HWY_INLINE __v128_u BitCastToInteger(__f64x2 v) { return static_cast<__v128_u>(v); } template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return Vec128{BitCastToInteger(v.raw)}; } // Cannot rely on function overloading because return types differ. template struct BitCastFromInteger128 { HWY_INLINE __v128_u operator()(__v128_u v) { return v; } }; template <> struct BitCastFromInteger128 { HWY_INLINE __f32x4 operator()(__v128_u v) { return static_cast<__f32x4>(v); } }; template HWY_INLINE Vec128 BitCastFromByte(Simd /* tag */, Vec128 v) { return Vec128{BitCastFromInteger128()(v.raw)}; } } // namespace detail template HWY_API Vec128 BitCast(Simd d, Vec128 v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ Zero // Returns an all-zero vector/part. template HWY_API Vec128 Zero(Simd /* tag */) { return Vec128{wasm_i32x4_splat(0)}; } template HWY_API Vec128 Zero(Simd /* tag */) { return Vec128{wasm_f32x4_splat(0.0f)}; } template using VFromD = decltype(Zero(D())); // ------------------------------ Set // Returns a vector/part with all lanes set to "t". template HWY_API Vec128 Set(Simd /* tag */, const uint8_t t) { return Vec128{wasm_i8x16_splat(static_cast(t))}; } template HWY_API Vec128 Set(Simd /* tag */, const uint16_t t) { return Vec128{wasm_i16x8_splat(static_cast(t))}; } template HWY_API Vec128 Set(Simd /* tag */, const uint32_t t) { return Vec128{wasm_i32x4_splat(static_cast(t))}; } template HWY_API Vec128 Set(Simd /* tag */, const uint64_t t) { return Vec128{wasm_i64x2_splat(static_cast(t))}; } template HWY_API Vec128 Set(Simd /* tag */, const int8_t t) { return Vec128{wasm_i8x16_splat(t)}; } template HWY_API Vec128 Set(Simd /* tag */, const int16_t t) { return Vec128{wasm_i16x8_splat(t)}; } template HWY_API Vec128 Set(Simd /* tag */, const int32_t t) { return Vec128{wasm_i32x4_splat(t)}; } template HWY_API Vec128 Set(Simd /* tag */, const int64_t t) { return Vec128{wasm_i64x2_splat(t)}; } template HWY_API Vec128 Set(Simd /* tag */, const float t) { return Vec128{wasm_f32x4_splat(t)}; } HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") // Returns a vector with uninitialized elements. template HWY_API Vec128 Undefined(Simd d) { return Zero(d); } HWY_DIAGNOSTICS(pop) // Returns a vector with lane i=[0, N) set to "first" + i. 
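// For example (a sketch, assuming a full vector of int32_t lanes):
// Iota(Full128<int32_t>(), 10) yields lanes {10, 11, 12, 13}, lane 0 being 10.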
template Vec128 Iota(const Simd d, const T2 first) { HWY_ALIGN T lanes[16 / sizeof(T)]; for (size_t i = 0; i < 16 / sizeof(T); ++i) { lanes[i] = static_cast(first + static_cast(i)); } return Load(d, lanes); } // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_add(a.raw, b.raw)}; } template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_add(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator+(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_add(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_sub(a.raw, b.raw)}; } template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_sub(a.raw, b.raw)}; } // Float template HWY_API Vec128 operator-(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_sub(a.raw, b.raw)}; } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_add_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_add_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedAdd(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_add_sat(a.raw, b.raw)}; } // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. 
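// For example (a sketch, assuming full vectors of uint8_t):
//   const Full128<uint8_t> d;
//   SaturatedSub(Set(d, 5), Set(d, 10));    // all lanes 0 (clamped, no wrap)
//   SaturatedAdd(Set(d, 250), Set(d, 10));  // likewise clamps, to 255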
// Unsigned template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_sub_sat(a.raw, b.raw)}; } // Signed template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i8x16_sub_sat(a.raw, b.raw)}; } template HWY_API Vec128 SaturatedSub(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_sub_sat(a.raw, b.raw)}; } // ------------------------------ Average // Returns (a + b + 1) / 2 // Unsigned template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u8x16_avgr(a.raw, b.raw)}; } template HWY_API Vec128 AverageRound(const Vec128 a, const Vec128 b) { return Vec128{wasm_u16x8_avgr(a.raw, b.raw)}; } // ------------------------------ Absolute value // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i8x16_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i16x8_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i32x4_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_i64x2_abs(v.raw)}; } template HWY_API Vec128 Abs(const Vec128 v) { return Vec128{wasm_f32x4_abs(v.raw)}; } // ------------------------------ Shift lanes by constant #bits // Unsigned template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_u64x2_shr(v.raw, kBits)}; } // Signed template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i16x8_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftLeft(const Vec128 v) { return Vec128{wasm_i64x2_shl(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i32x4_shr(v.raw, kBits)}; } template HWY_API Vec128 ShiftRight(const Vec128 v) { return Vec128{wasm_i64x2_shr(v.raw, kBits)}; } // 8-bit template HWY_API Vec128 ShiftLeft(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ShiftLeft(Vec128>{v.raw}).raw}; return kBits == 1 ? (v + v) : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV d8; // Use raw instead of BitCast to support N=1. 
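  // Emulation: shift the vector as 16-bit lanes, then mask off the upper kBits
  // bits of each byte, which received bits shifted in from the neighboring
  // byte.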
const Vec128 shifted{ ShiftRight(Vec128{v.raw}).raw}; return shifted & Set(d8, 0xFF >> kBits); } template HWY_API Vec128 ShiftRight(const Vec128 v) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec128 RotateRight(const Vec128 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ Shift lanes by same variable #bits // After https://reviews.llvm.org/D108415 shift argument became unsigned. HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4245 4365, ignored "-Wsign-conversion") // Unsigned template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_u64x2_shr(v.raw, bits)}; } // Signed template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i16x8_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i32x4_shr(v.raw, bits)}; } template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shl(v.raw, bits)}; } template HWY_API Vec128 ShiftRightSame(const Vec128 v, const int bits) { return Vec128{wasm_i64x2_shr(v.raw, bits)}; } // 8-bit template HWY_API Vec128 ShiftLeftSame(const Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. const Vec128 shifted{ ShiftLeftSame(Vec128>{v.raw}, bits).raw}; return shifted & Set(d8, static_cast((0xFF << bits) & 0xFF)); } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV d8; // Use raw instead of BitCast to support N=1. 
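  // As in ShiftRight above: shift as 16-bit lanes, then clear the upper `bits`
  // bits of each byte that spilled over from the neighboring byte.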
const Vec128 shifted{ ShiftRightSame(Vec128{v.raw}, bits).raw}; return shifted & Set(d8, 0xFF >> bits); } template HWY_API Vec128 ShiftRightSame(Vec128 v, const int bits) { const DFromV di; const RebindToUnsigned du; const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); return (shifted ^ shifted_sign) - shifted_sign; } // ignore Wsign-conversion HWY_DIAGNOSTICS(pop) // ------------------------------ Minimum // Unsigned template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u8x16_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u16x8_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_u32x4_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { alignas(16) uint64_t min[2]; min[0] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 0), wasm_u64x2_extract_lane(b.raw, 0)); min[1] = HWY_MIN(wasm_u64x2_extract_lane(a.raw, 1), wasm_u64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(min)}; } // Signed template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_min(a.raw, b.raw)}; } template HWY_API Vec128 Min(Vec128 a, Vec128 b) { alignas(16) int64_t min[4]; min[0] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0)); min[1] = HWY_MIN(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(min)}; } // Float template HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128{wasm_f32x4_min(a.raw, b.raw)}; } // ------------------------------ Maximum // Unsigned template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u8x16_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u16x8_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_u32x4_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { alignas(16) uint64_t max[2]; max[0] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 0), wasm_u64x2_extract_lane(b.raw, 0)); max[1] = HWY_MAX(wasm_u64x2_extract_lane(a.raw, 1), wasm_u64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(max)}; } // Signed template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i16x8_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_max(a.raw, b.raw)}; } template HWY_API Vec128 Max(Vec128 a, Vec128 b) { alignas(16) int64_t max[2]; max[0] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 0), wasm_i64x2_extract_lane(b.raw, 0)); max[1] = HWY_MAX(wasm_i64x2_extract_lane(a.raw, 1), wasm_i64x2_extract_lane(b.raw, 1)); return Vec128{wasm_v128_load(max)}; } // Float template HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128{wasm_f32x4_max(a.raw, b.raw)}; } // ------------------------------ Integer multiplication // Unsigned template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Signed template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i16x8_mul(a.raw, b.raw)}; } 
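// (Both the 16-bit and 32-bit overloads return only the lower half of the
// full product; see MulHigh below for the upper 16 bits of 16-bit products.)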
template HWY_API Vec128 operator*(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_mul(a.raw, b.raw)}; } // Returns the upper 16 bits of a * b in each lane. template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. const auto al = wasm_u32x4_extend_low_u16x8(a.raw); const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } template HWY_API Vec128 MulHigh(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. const auto al = wasm_i32x4_extend_low_i16x8(a.raw); const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec128{ wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } // Multiplies even lanes (0, 2 ..) and returns the double-width result. template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec128{wasm_i64x2_mul(ae, be)}; } template HWY_API Vec128 MulEven(const Vec128 a, const Vec128 b) { // TODO(eustas): replace, when implemented in WASM. const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec128{wasm_i64x2_mul(ae, be)}; } // ------------------------------ Negate template HWY_API Vec128 Neg(const Vec128 v) { return Xor(v, SignBit(DFromV())); } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i8x16_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i16x8_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i32x4_neg(v.raw)}; } template HWY_API Vec128 Neg(const Vec128 v) { return Vec128{wasm_i64x2_neg(v.raw)}; } // ------------------------------ Floating-point mul / div template HWY_API Vec128 operator*(Vec128 a, Vec128 b) { return Vec128{wasm_f32x4_mul(a.raw, b.raw)}; } template HWY_API Vec128 operator/(const Vec128 a, const Vec128 b) { return Vec128{wasm_f32x4_div(a.raw, b.raw)}; } // Approximate reciprocal template HWY_API Vec128 ApproximateReciprocal(const Vec128 v) { const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; return one / v; } // Absolute value of difference. template HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants // Returns mul * x + add template HWY_API Vec128 MulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfma? return mul * x + add; } // Returns add - mul * x template HWY_API Vec128 NegMulAdd(const Vec128 mul, const Vec128 x, const Vec128 add) { // TODO(eustas): replace, when implemented in WASM. 
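  // WASM SIMD128 does not provide a fused multiply-add, hence the separate
  // multiply and subtract.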
return add - mul * x; } // Returns mul * x - sub template HWY_API Vec128 MulSub(const Vec128 mul, const Vec128 x, const Vec128 sub) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfms? return mul * x - sub; } // Returns -mul * x - sub template HWY_API Vec128 NegMulSub(const Vec128 mul, const Vec128 x, const Vec128 sub) { // TODO(eustas): replace, when implemented in WASM. return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root // Full precision square root template HWY_API Vec128 Sqrt(const Vec128 v) { return Vec128{wasm_f32x4_sqrt(v.raw)}; } // Approximate reciprocal square root template HWY_API Vec128 ApproximateReciprocalSqrt(const Vec128 v) { // TODO(eustas): find cheaper a way to calculate this. const Vec128 one = Vec128{wasm_f32x4_splat(1.0f)}; return one / Sqrt(v); } // ------------------------------ Floating-point rounding // Toward nearest integer, ties to even template HWY_API Vec128 Round(const Vec128 v) { return Vec128{wasm_f32x4_nearest(v.raw)}; } // Toward zero, aka truncate template HWY_API Vec128 Trunc(const Vec128 v) { return Vec128{wasm_f32x4_trunc(v.raw)}; } // Toward +infinity, aka ceiling template HWY_API Vec128 Ceil(const Vec128 v) { return Vec128{wasm_f32x4_ceil(v.raw)}; } // Toward -infinity, aka floor template HWY_API Vec128 Floor(const Vec128 v) { return Vec128{wasm_f32x4_floor(v.raw)}; } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. template HWY_API Mask128 RebindMask(Simd /*tag*/, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask128{m.raw}; } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(Vec128 a, Vec128 b) { return Mask128{wasm_i16x8_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_eq(a.raw, b.raw)}; } template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_eq(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_eq(a.raw, b.raw)}; } // ------------------------------ Inequality // Unsigned template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Signed template HWY_API Mask128 operator!=(const Vec128 a, 
const Vec128 b) { return Mask128{wasm_i8x16_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_ne(a.raw, b.raw)}; } template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_ne(a.raw, b.raw)}; } // Float template HWY_API Mask128 operator!=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ne(a.raw, b.raw)}; } // ------------------------------ Strict inequality template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i8x16_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_i64x2_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u8x16_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u16x8_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_u32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d32; const auto a32 = BitCast(d32, a); const auto b32 = BitCast(d32, b); // If the upper halves are not equal, this is the answer. const auto m_gt = a32 > b32; // Otherwise, the lower half decides. const auto m_eq = a32 == b32; const auto lo_in_hi = wasm_i32x4_shuffle(m_gt.raw, m_gt.raw, 0, 0, 2, 2); const auto lo_gt = And(m_eq, MaskFromVec(VFromD{lo_in_hi})); const auto gt = Or(lo_gt, m_gt); // Copy result in upper 32 bits to lower 32 bits. return Mask128{wasm_i32x4_shuffle(gt.raw, gt.raw, 1, 1, 3, 3)}; } template HWY_API Mask128 operator>(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_gt(a.raw, b.raw)}; } template HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { return operator>(b, a); } // ------------------------------ Weak inequality // Float <= >= template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_le(a.raw, b.raw)}; } template HWY_API Mask128 operator>=(const Vec128 a, const Vec128 b) { return Mask128{wasm_f32x4_ge(a.raw, b.raw)}; } // ------------------------------ FirstN (Iota, Lt) template HWY_API Mask128 FirstN(const Simd d, size_t num) { const RebindToSigned di; // Signed comparisons may be cheaper. return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); } // ================================================== LOGICAL // ------------------------------ Not template HWY_API Vec128 Not(Vec128 v) { return Vec128{wasm_v128_not(v.raw)}; } // ------------------------------ And template HWY_API Vec128 And(Vec128 a, Vec128 b) { return Vec128{wasm_v128_and(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. 
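// For example (a sketch, given a tag d and vector v): AndNot(SignBit(d), v)
// clears each lane's sign bit, because the first argument is the operand that
// is complemented.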
template HWY_API Vec128 AndNot(Vec128 not_mask, Vec128 mask) { return Vec128{wasm_v128_andnot(mask.raw, not_mask.raw)}; } // ------------------------------ Or template HWY_API Vec128 Or(Vec128 a, Vec128 b) { return Vec128{wasm_v128_or(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec128 Xor(Vec128 a, Vec128 b) { return Vec128{wasm_v128_xor(a.raw, b.raw)}; } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { return Or(o, And(a1, a2)); } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ CopySign template HWY_API Vec128 CopySign(const Vec128 magn, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const auto msb = SignBit(DFromV()); return Or(AndNot(msb, magn), And(msb, sign)); } template HWY_API Vec128 CopySignToAbs(const Vec128 abs, const Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); return Or(abs, And(SignBit(DFromV()), sign)); } // ------------------------------ BroadcastSignBit (compare) template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight(v); } template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { const DFromV d; return VecFromMask(d, v < Zero(d)); } // ------------------------------ Mask // Mask and Vec are the same (true = FF..FF). template HWY_API Mask128 MaskFromVec(const Vec128 v) { return Mask128{v.raw}; } template HWY_API Vec128 VecFromMask(Simd /* tag */, Mask128 v) { return Vec128{v.raw}; } // mask ? yes : no template HWY_API Vec128 IfThenElse(Mask128 mask, Vec128 yes, Vec128 no) { return Vec128{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; } // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; const RebindToSigned di; v = BitCast(d, BroadcastSignBit(BitCast(di, v))); return IfThenElse(MaskFromVec(v), yes, no); } template HWY_API Vec128 ZeroIfNegative(Vec128 v) { const DFromV d; const auto zero = Zero(d); return IfThenElse(Mask128{(v > zero).raw}, v, zero); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { return MaskFromVec(Not(VecFromMask(Simd(), m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const Simd d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } // ------------------------------ Shl (BroadcastSignBit, IfThenElse) // The x86 multiply-by-Pow2() trick will not work because WASM saturates // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a // scalar count operand, per-lane shift instructions would require extract_lane // for each lane, and hoping that shuffle is correctly mapped to a native // instruction. Using non-vector shifts would incur a store-load forwarding // stall when loading the result vector. We instead test bits of the shift // count to "predicate" a shift of the entire vector by a constant. template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. 
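  // 32-bit lanes: shift counts are at most 31 (5 bits), so shifting left by 27
  // moves bit 4 of the count into the sign bit.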
test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec128 operator<<(Vec128 v, const Vec128 bits) { const DFromV d; alignas(16) T lanes[2]; alignas(16) T bits_lanes[2]; Store(v, d, lanes); Store(bits, d, bits_lanes); lanes[0] <<= bits_lanes[0]; lanes[1] <<= bits_lanes[1]; return Load(d, lanes); } // ------------------------------ Shr (BroadcastSignBit, IfThenElse) template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec128 operator>>(Vec128 v, const Vec128 bits) { const DFromV d; Mask128 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } // ================================================== MEMORY // ------------------------------ Load template HWY_API Vec128 Load(Full128 /* tag */, const T* HWY_RESTRICT aligned) { return Vec128{wasm_v128_load(aligned)}; } template HWY_API Vec128 MaskedLoad(Mask128 m, Simd d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } // Partial load. template HWY_API Vec128 Load(Simd /* tag */, const T* HWY_RESTRICT p) { Vec128 v; CopyBytes(p, &v); return v; } // LoadU == Load. 
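// WASM's v128.load/store do not require alignment, so the unaligned variants
// simply forward to Load/Store. A usage sketch (assuming a full f32 vector):
//   const Full128<float> d;
//   HWY_ALIGN float in[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   HWY_ALIGN float out[4];
//   Store(Load(d, in) * Set(d, 2.0f), d, out);  // out = {2, 4, 6, 8}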
template HWY_API Vec128 LoadU(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. template HWY_API Vec128 LoadDup128(Simd d, const T* HWY_RESTRICT p) { return Load(d, p); } // ------------------------------ Store template HWY_API void Store(Vec128 v, Full128 /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // Partial store. template HWY_API void Store(Vec128 v, Simd /* tag */, T* HWY_RESTRICT p) { CopyBytes(&v, p); } HWY_API void Store(const Vec128 v, Simd /* tag */, float* HWY_RESTRICT p) { *p = wasm_f32x4_extract_lane(v.raw, 0); } // StoreU == Store. template HWY_API void StoreU(Vec128 v, Simd d, T* HWY_RESTRICT p) { Store(v, d, p); } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(Vec128 v, Simd /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // ------------------------------ Scatter (Store) template HWY_API void ScatterOffset(Vec128 v, Simd d, T* HWY_RESTRICT base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(16) T lanes[N]; Store(v, d, lanes); alignas(16) Offset offset_lanes[N]; Store(offset, Rebind(), offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template HWY_API void ScatterIndex(Vec128 v, Simd d, T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(16) T lanes[N]; Store(v, d, lanes); alignas(16) Index index_lanes[N]; Store(index, Rebind(), index_lanes); for (size_t i = 0; i < N; ++i) { base[index_lanes[i]] = lanes[i]; } } // ------------------------------ Gather (Load/Store) template HWY_API Vec128 GatherOffset(const Simd d, const T* HWY_RESTRICT base, const Vec128 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(16) Offset offset_lanes[N]; Store(offset, Rebind(), offset_lanes); alignas(16) T lanes[N]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template HWY_API Vec128 GatherIndex(const Simd d, const T* HWY_RESTRICT base, const Vec128 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(16) Index index_lanes[N]; Store(index, Rebind(), index_lanes); alignas(16) T lanes[N]; for (size_t i = 0; i < N; ++i) { lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } // ================================================== SWIZZLE // ------------------------------ Extract lane // Gets the single value stored in a vector/part. 
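// For example (a sketch): GetLane(Iota(Full128<int32_t>(), 5)) == 5, i.e. the
// value of lane 0.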
template HWY_API uint8_t GetLane(const Vec128 v) { return static_cast(wasm_i8x16_extract_lane(v.raw, 0)); } template HWY_API int8_t GetLane(const Vec128 v) { return static_cast(wasm_i8x16_extract_lane(v.raw, 0)); } template HWY_API uint16_t GetLane(const Vec128 v) { return static_cast(wasm_i16x8_extract_lane(v.raw, 0)); } template HWY_API int16_t GetLane(const Vec128 v) { return static_cast(wasm_i16x8_extract_lane(v.raw, 0)); } template HWY_API uint32_t GetLane(const Vec128 v) { return static_cast(wasm_i32x4_extract_lane(v.raw, 0)); } template HWY_API int32_t GetLane(const Vec128 v) { return static_cast(wasm_i32x4_extract_lane(v.raw, 0)); } template HWY_API uint64_t GetLane(const Vec128 v) { return static_cast(wasm_i64x2_extract_lane(v.raw, 0)); } template HWY_API int64_t GetLane(const Vec128 v) { return static_cast(wasm_i64x2_extract_lane(v.raw, 0)); } template HWY_API float GetLane(const Vec128 v) { return wasm_f32x4_extract_lane(v.raw, 0); } // ------------------------------ LowerHalf template HWY_API Vec128 LowerHalf(Simd /* tag */, Vec128 v) { return Vec128{v.raw}; } template HWY_API Vec128 LowerHalf(Vec128 v) { return LowerHalf(Simd(), v); } // ------------------------------ ShiftLeftBytes // 0x01..0F, kBytes = 1 => 0x02..0F00 template HWY_API Vec128 ShiftLeftBytes(Simd /* tag */, Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v; case 1: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)}; case 2: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)}; case 3: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; case 4: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; case 5: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; case 6: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; case 7: return Vec128{wasm_i8x16_shuffle( v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; case 8: return Vec128{wasm_i8x16_shuffle( v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; case 9: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; case 10: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; case 11: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; case 12: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; case 13: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; case 14: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)}; case 15: return Vec128{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)}; } return Vec128{zero}; } template HWY_API Vec128 ShiftLeftBytes(Vec128 v) { return ShiftLeftBytes(Simd(), v); } // ------------------------------ ShiftLeftLanes template HWY_API Vec128 ShiftLeftLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec128 ShiftLeftLanes(const Vec128 v) { return ShiftLeftLanes(DFromV(), 
v); } // ------------------------------ ShiftRightBytes namespace detail { // Helper function allows zeroing invalid lanes in caller. template HWY_API __i8x16 ShrBytes(const Vec128 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v.raw; case 1: return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); case 2: return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16); case 3: return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16); case 4: return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16); case 5: return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16); case 6: return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16); case 7: return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16); case 8: return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16); case 9: return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 10: return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 11: return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 12: return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 13: return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 14: return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 15: return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 16: return zero; } } } // namespace detail // 0x01..0F, kBytes = 1 => 0x0001..0E template HWY_API Vec128 ShiftRightBytes(Simd /* tag */, Vec128 v) { // For partial vectors, clear upper lanes so we shift in zeros. if (N != 16 / sizeof(T)) { const Vec128 vfull{v.raw}; v = Vec128{IfThenElseZero(FirstN(Full128(), N), vfull).raw}; } return Vec128{detail::ShrBytes(v)}; } // ------------------------------ ShiftRightLanes template HWY_API Vec128 ShiftRightLanes(Simd d, const Vec128 v) { const Repartition d8; return BitCast(d, ShiftRightBytes(d8, BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) // Full input: copy hi into lo (smaller instruction encoding than shifts). 
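// For example (a sketch): for a full vector v with int32_t lanes 3,2,1,0
// (0 is least-significant), LowerHalf(v) holds lanes 1,0 and
// UpperHalf(Full64<int32_t>(), v) holds lanes 3,2.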
template HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } HWY_API Vec64 UpperHalf(Full64 /* tag */, const Vec128 v) { return Vec64{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } // Partial template HWY_API Vec128 UpperHalf(Half> /* tag */, Vec128 v) { const DFromV d; const RebindToUnsigned du; const auto vu = BitCast(du, v); const auto upper = BitCast(d, ShiftRightBytes(du, vu)); return Vec128{upper.raw}; } // ------------------------------ CombineShiftRightBytes template > HWY_API V CombineShiftRightBytes(Full128 /* tag */, V hi, V lo) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); switch (kBytes) { case 0: return lo; case 1: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)}; case 3: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)}; case 4: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)}; case 5: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)}; case 6: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)}; case 7: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)}; case 8: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)}; case 9: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)}; case 10: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)}; case 11: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)}; case 12: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}; case 13: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)}; case 14: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)}; case 15: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)}; } return hi; } template > HWY_API V CombineShiftRightBytes(Simd d, V hi, V lo) { constexpr size_t kSize = N * sizeof(T); static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid"); const Repartition d8; const Full128 d_full8; using V8 = VFromD; const V8 hi8{BitCast(d8, hi).raw}; // Move into most-significant bytes const V8 lo8 = ShiftLeftBytes<16 - kSize>(V8{BitCast(d8, lo).raw}); const V8 r = CombineShiftRightBytes<16 - kSize + kBytes>(d_full8, hi8, lo8); return V{BitCast(Full128(), r).raw}; } // ------------------------------ Broadcast/splat any lane template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec128{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } template HWY_API Vec128 Broadcast(const Vec128 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return 
Vec128{wasm_i64x2_shuffle(v.raw, v.raw, kLane, kLane)}; } // ------------------------------ TableLookupBytes // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. // lane indices in [0, 16). template HWY_API Vec128 TableLookupBytes(const Vec128 bytes, const Vec128 from) { // Not yet available in all engines, see // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md // V8 implementation of this had a bug, fixed on 2021-04-03: // https://chromium-review.googlesource.com/c/v8/v8/+/2822951 #if 0 return Vec128{wasm_i8x16_swizzle(bytes.raw, from.raw)}; #else alignas(16) uint8_t control[16]; alignas(16) uint8_t input[16]; alignas(16) uint8_t output[16]; wasm_v128_store(control, from.raw); wasm_v128_store(input, bytes.raw); for (size_t i = 0; i < 16; ++i) { output[i] = control[i] < 16 ? input[control[i]] : 0; } return Vec128{wasm_v128_load(output)}; #endif } template HWY_API Vec128 TableLookupBytesOr0(const Vec128 bytes, const Vec128 from) { const Simd d; // Mask size must match vector type, so cast everything to this type. Repartition di8; Repartition> d_bytes8; const auto msb = BitCast(di8, from) < Zero(di8); const auto lookup = TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); return BitCast(d, IfThenZeroElse(msb, lookup)); } // ------------------------------ Hard-coded shuffles // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. template HWY_API Vec128 Shuffle2301(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); static_assert(N == 2 || N == 4, "Does not make sense for N=1"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } // Swap 64-bit halves template HWY_API Vec128 Shuffle01(const Vec128 v) { static_assert(sizeof(T) == 8, "Only for 64-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } template HWY_API Vec128 Shuffle1032(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } // Reverse template HWY_API Vec128 Shuffle0123(const Vec128 v) { static_assert(sizeof(T) == 4, "Only for 32-bit lanes"); return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
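// Usage sketch (assuming a full vector of int32_t):
//   const Full128<int32_t> d;
//   alignas(16) constexpr int32_t idx[4] = {3, 2, 1, 0};
//   const auto v = Iota(d, 0);  // lanes 3,2,1,0
//   const auto r = TableLookupLanes(v, SetTableIndices(d, idx));  // reversed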
template struct Indices128 { __v128_u raw; }; template HWY_API Indices128 IndicesFromVec(Simd d, Vec128 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); #if HWY_IS_DEBUG_BUILD const Rebind di; HWY_DASSERT(AllFalse(di, Lt(vec, Zero(di))) && AllTrue(di, Lt(vec, Set(di, static_cast(N))))); #endif const Repartition d8; using V8 = VFromD; const Repartition d16; // Broadcast each lane index to all bytes of T and shift to bytes static_assert(sizeof(T) == 4 || sizeof(T) == 8, ""); if (sizeof(T) == 4) { alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; const V8 lane_indices = TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); const V8 byte_indices = BitCast(d8, ShiftLeft<2>(BitCast(d16, lane_indices))); alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3}; return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; } else { alignas(16) constexpr uint8_t kBroadcastLaneBytes[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; const V8 lane_indices = TableLookupBytes(BitCast(d8, vec), Load(d8, kBroadcastLaneBytes)); const V8 byte_indices = BitCast(d8, ShiftLeft<3>(BitCast(d16, lane_indices))); alignas(16) constexpr uint8_t kByteOffsets[16] = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7}; return Indices128{Add(byte_indices, Load(d8, kByteOffsets)).raw}; } } template HWY_API Indices128 SetTableIndices(Simd d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) { using TI = MakeSigned; const DFromV d; const Rebind di; return BitCast(d, TableLookupBytes(BitCast(di, v), Vec128{idx.raw})); } // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) // Single lane: no change template HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { return v; } // Two lanes: shuffle template HWY_API Vec128 Reverse(Simd /* tag */, const Vec128 v) { return Vec128{Shuffle2301(Vec128{v.raw}).raw}; } template HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { return Shuffle01(v); } // Four lanes: shuffle template HWY_API Vec128 Reverse(Full128 /* tag */, const Vec128 v) { return Shuffle0123(v); } // 16-bit template HWY_API Vec128 Reverse(Simd d, const Vec128 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); } // ------------------------------ Reverse2 template HWY_API Vec128 Reverse2(Simd d, const Vec128 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(BitCast(du32, v))); } template HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { return Shuffle2301(v); } template HWY_API Vec128 Reverse2(Simd /* tag */, const Vec128 v) { return Shuffle01(v); } // ------------------------------ Reverse4 template HWY_API Vec128 Reverse4(Simd d, const Vec128 v) { return BitCast(d, Vec128{wasm_i16x8_shuffle(v.raw, v.raw, 3, 2, 1, 0, 7, 6, 5, 4)}); } template HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128 v) { return Shuffle0123(v); } template HWY_API Vec128 Reverse4(Simd /* tag */, const Vec128) { HWY_ASSERT(0); // don't have 8 u64 lanes } // ------------------------------ Reverse8 template HWY_API Vec128 Reverse8(Simd d, const Vec128 v) { return Reverse(d, v); } template HWY_API Vec128 Reverse8(Simd, const Vec128) { HWY_ASSERT(0); // don't have 8 lanes unless 16-bit } // ------------------------------ InterleaveLower template HWY_API Vec128 InterleaveLower(Vec128 a, 
Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle( a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } // Additional overload for the optional tag. template HWY_API V InterleaveLower(DFromV /* tag */, V a, V b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // All functions inside detail lack the required D parameter. namespace detail { template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } } // namespace detail // Full template > HWY_API V InterleaveUpper(Full128 /* tag */, V a, V b) { return detail::InterleaveUpper(a, b); } // Partial template > HWY_API V InterleaveUpper(Simd d, V a, V b) { const Half d2; return InterleaveLower(d, V{UpperHalf(d2, a).raw}, V{UpperHalf(d2, b).raw}); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. 
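// For example (a sketch): for uint8_t inputs a and b, ZipLower yields uint16_t
// lanes whose low byte comes from a and high byte from b, i.e. the same bytes
// InterleaveLower produces, just reinterpreted as wider lanes.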
template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template HWY_API Vec128 Combine(Simd d, Vec128 hi_half, Vec128 lo_half) { const Half d2; const RebindToUnsigned du2; // Treat half-width input as one lane, and expand to two lanes. using VU = Vec128, 2>; const VU lo{BitCast(du2, lo_half).raw}; const VU hi{BitCast(du2, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) template HWY_API Vec128 ZeroExtendVector(Simd d, Vec128 lo) { return IfThenElseZero(FirstN(d, N / 2), Vec128{lo.raw}); } // ------------------------------ ConcatLowerLower // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API Vec128 ConcatLowerLower(Full128 /* tag */, const Vec128 hi, const Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; } template HWY_API Vec128 ConcatLowerLower(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), LowerHalf(d2, lo)); } // ------------------------------ ConcatUpperUpper template HWY_API Vec128 ConcatUpperUpper(Full128 /* tag */, const Vec128 hi, const Vec128 lo) { return Vec128{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; } template HWY_API Vec128 ConcatUpperUpper(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, UpperHalf(d2, hi), UpperHalf(d2, lo)); } // ------------------------------ ConcatLowerUpper template HWY_API Vec128 ConcatLowerUpper(Full128 d, const Vec128 hi, const Vec128 lo) { return CombineShiftRightBytes<8>(d, hi, lo); } template HWY_API Vec128 ConcatLowerUpper(Simd d, const Vec128 hi, const Vec128 lo) { const Half d2; return Combine(d, LowerHalf(d2, hi), UpperHalf(d2, lo)); } // ------------------------------ ConcatUpperLower template HWY_API Vec128 ConcatUpperLower(Simd d, const Vec128 hi, const Vec128 lo) { return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); } // ------------------------------ ConcatOdd // 32-bit full template HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; } // 32-bit partial template HWY_API Vec128 ConcatOdd(Simd /* tag */, Vec128 hi, Vec128 lo) { return InterleaveUpper(Simd(), lo, hi); } // 64-bit full - no partial because we need at least two inputs to have // even/odd. template HWY_API Vec128 ConcatOdd(Full128 /* tag */, Vec128 hi, Vec128 lo) { return InterleaveUpper(Full128(), lo, hi); } // ------------------------------ ConcatEven (InterleaveLower) // 32-bit full template HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { return Vec128{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; } // 32-bit partial template HWY_API Vec128 ConcatEven(Simd /* tag */, Vec128 hi, Vec128 lo) { return InterleaveLower(Simd(), lo, hi); } // 64-bit full - no partial because we need at least two inputs to have // even/odd. 
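// Illustrative example of the semantics (u64 lanes, lane 0 listed last): with
// hi = {h1, h0} and lo = {l1, l0},
//   ConcatOdd(d, hi, lo)  == {h1, l1}  (upper lanes)
//   ConcatEven(d, hi, lo) == {h0, l0}  (lower lanes)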
template HWY_API Vec128 ConcatEven(Full128 /* tag */, Vec128 hi, Vec128 lo) { return InterleaveLower(Full128(), lo, hi); } // ------------------------------ DupEven (InterleaveLower) template HWY_API Vec128 DupEven(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 0, 0, 2, 2)}; } template HWY_API Vec128 DupEven(const Vec128 v) { return InterleaveLower(DFromV(), v, v); } // ------------------------------ DupOdd (InterleaveUpper) template HWY_API Vec128 DupOdd(Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 1, 3, 3)}; } template HWY_API Vec128 DupOdd(const Vec128 v) { return InterleaveUpper(DFromV(), v, v); } // ------------------------------ OddEven namespace detail { template HWY_INLINE Vec128 OddEven(hwy::SizeTag<1> /* tag */, const Vec128 a, const Vec128 b) { const DFromV d; const Repartition d8; alignas(16) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<2> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{ wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<4> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } template HWY_INLINE Vec128 OddEven(hwy::SizeTag<8> /* tag */, const Vec128 a, const Vec128 b) { return Vec128{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; } } // namespace detail template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return detail::OddEven(hwy::SizeTag(), a, b); } template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) { return Vec128{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // ------------------------------ OddEvenBlocks template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) { return v; } // ------------------------------ ReverseBlocks // Single block: no change template HWY_API Vec128 ReverseBlocks(Full128 /* tag */, const Vec128 v) { return v; } // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned: zero-extend. template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_extend_low_u8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u64x2_extend_low_u32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u32x4_extend_low_u16x8(v.raw)}; } // Signed: replicate sign bit. 
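// For example (illustrative): promoting the int8_t value -128 (0x80) to
// int16_t sign-extends to -128 (0xFF80), whereas the unsigned overloads above
// zero-extend 0x80 to 0x0080 == 128.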
template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i16x8_extend_low_i8x16(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_extend_low_i16x8(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i64x2_extend_low_i32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_f64x2_convert_low_i32x4(v.raw)}; } template HWY_API Vec128 PromoteTo(Simd df32, const Vec128 v) { const RebindToSigned di32; const RebindToUnsigned du32; // Expand to u32 so we can shift. const auto bits16 = PromoteTo(du32, Vec128{v.raw}); const auto sign = ShiftRight<15>(bits16); const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); const auto mantissa = bits16 & Set(du32, 0x3FF); const auto subnormal = BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * Set(df32, 1.0f / 16384 / 1024)); const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); return BitCast(df32, ShiftLeft<31>(sign) | bits32); } template HWY_API Vec128 PromoteTo(Simd df32, const Vec128 v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } // ------------------------------ Demotions (full -> part w/ narrow lanes) template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{ wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; } template HWY_API Vec128 DemoteTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; } template HWY_API Vec128 DemoteTo(Simd /* di */, const Vec128 v) { return Vec128{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; } template HWY_API Vec128 DemoteTo(Simd df16, const Vec128 v) { const RebindToUnsigned du16; const Rebind du; const RebindToSigned di; const auto bits32 = BitCast(du, v); const auto sign = ShiftRight<31>(bits32); const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); const auto k15 = Set(di, 15); const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); const auto is_tiny = exp < Set(di, -24); const auto is_subnormal = exp < Set(di, -14); const auto biased_exp16 = BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + (mantissa32 >> (Set(du, 13) + sub_exp)); const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, 
ShiftRight<13>(mantissa32)); // <1024 const auto sign16 = ShiftLeft<15>(sign); const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); return Vec128{DemoteTo(du16, bits16).raw}; } template HWY_API Vec128 DemoteTo(Simd dbf16, const Vec128 v) { const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } template HWY_API Vec128 ReorderDemote2To( Simd dbf16, Vec128 a, Vec128 b) { const RebindToUnsigned du16; const Repartition du32; const Vec128 b_in_even = ShiftRight<16>(BitCast(du32, b)); return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); } // For already range-limited input [0, 255]. template HWY_API Vec128 U8FromU32(const Vec128 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{ wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } // ------------------------------ Convert i32 <=> f32 (Round) template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_f32x4_convert_i32x4(v.raw)}; } // Truncates (rounds toward zero). template HWY_API Vec128 ConvertTo(Simd /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_trunc_sat_f32x4(v.raw)}; } template HWY_API Vec128 NearestInt(const Vec128 v) { return ConvertTo(Simd(), Round(v)); } // ================================================== MISC // ------------------------------ SumsOf8 (ShiftRight, Add) template HWY_API Vec128 SumsOf8(const Vec128 v) { const DFromV du8; const RepartitionToWide du16; const RepartitionToWide du32; const RepartitionToWide du64; using VU16 = VFromD; const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v)); const VU16 vECA86420 = And(BitCast(du16, v), Set(du16, 0xFF)); const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420); const VU16 szz_FE_zz_BA_zz_76_zz_32 = BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10))); const VU16 sxx_FC_xx_B8_xx_74_xx_30 = Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32); const VU16 szz_zz_xx_FC_zz_zz_xx_74 = BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30))); const VU16 sxx_xx_xx_F8_xx_xx_xx_70 = Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74); return And(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), Set(du64, 0xFFFF)); } // ------------------------------ LoadMaskBits (TestBit) namespace detail { template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, N=1. const Vec128 vbits{wasm_i32x4_splat(static_cast(bits))}; // Replicate bytes 8x such that each byte contains the bit that governs it. 
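// Worked example (illustrative): bits = 0x103 selects lanes 0, 1 and 8.
// kRep8 (below) copies byte 0 of `bits` into lanes 0..7 and byte 1 into
// lanes 8..15; TestBit against kBit = {1, 2, 4, ..., 128, 1, 2, ...} then
// yields FF in lanes 0, 1 and 8 and 00 elsewhere.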
alignas(16) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); alignas(16) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; return RebindMask( d, TestBit(Set(du, static_cast(bits)), Load(du, kBit))); } template HWY_INLINE Mask128 LoadMaskBits(Simd d, uint64_t bits) { const RebindToUnsigned du; alignas(16) constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } } // namespace detail // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API Mask128 LoadMaskBits(Simd d, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; CopyBytes<(N + 7) / 8>(bits, &mask_bits); return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Mask namespace detail { // Full template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; return (hi + lo); } // 64-bit template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (static_cast(wasm_i64x2_extract_lane(mask.raw, 0)) * kMagic) >> 56; } // 32-bit or less: need masking template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { uint64_t bytes = static_cast(wasm_i64x2_extract_lane(mask.raw, 0)); // Clear potentially undefined bytes. bytes &= (1ULL << (N * 8)) - 1; constexpr uint64_t kMagic = 0x103070F1F3F80ULL; return (bytes * kMagic) >> 56; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask128 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const __i16x8 zero = wasm_i16x8_splat(0); const Mask128 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; return BitsFromMask(hwy::SizeTag<1>(), mask8); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, const Mask128 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1] | lanes[2] | lanes[3]; } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, const Mask128 mask) { const __i64x2 mask_i = static_cast<__i64x2>(mask.raw); const __i64x2 slice = wasm_i64x2_make(1, 2); const __i64x2 sliced_mask = wasm_v128_and(mask_i, slice); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1]; } // Returns the lowest N bits for the BitsFromMask result. template constexpr uint64_t OnlyActive(uint64_t bits) { return ((N * sizeof(T)) == 16) ? bits : bits & ((1ull << N) - 1); } // Returns 0xFF for bytes with index >= N, otherwise 0. 
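// For example (illustrative): BytesAbove<4>() yields
// wasm_i32x4_make(0, -1, -1, -1), i.e. bytes 0..3 are 0x00 and bytes 4..15
// are 0xFF, matching a partial vector holding four valid bytes.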
template constexpr __i8x16 BytesAbove() { return /**/ (N == 0) ? wasm_i32x4_make(-1, -1, -1, -1) : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1) : (N == 11) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) : (N == 13) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } template HWY_INLINE uint64_t BitsFromMask(const Mask128 mask) { return OnlyActive(BitsFromMask(hwy::SizeTag(), mask)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(16) uint64_t lanes[2]; wasm_v128_store(lanes, shifted_bits); return PopCount(lanes[0] | lanes[1]); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { alignas(16) int64_t lanes[2]; wasm_v128_store(lanes, m.raw); return static_cast(-(lanes[0] + lanes[1])); } } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(const Simd /* tag */, const Mask128 mask, uint8_t* bits) { const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (N + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; } template HWY_API size_t CountTrue(const Simd /* tag */, const Mask128 m) { return detail::CountTrue(hwy::SizeTag(), m); } // Partial vector template HWY_API size_t CountTrue(const Simd d, const Mask128 m) { // Ensure all undefined bytes are 0. 
const Mask128 mask{detail::BytesAbove()}; return CountTrue(d, Mask128{AndNot(mask, m).raw}); } // Full vector template HWY_API bool AllFalse(const Full128 d, const Mask128 m) { #if 0 // Casting followed by wasm_i8x16_any_true results in wasm error: // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 const auto v8 = BitCast(Full128(), VecFromMask(d, m)); return !wasm_i8x16_any_true(v8.raw); #else (void)d; return (wasm_i64x2_extract_lane(m.raw, 0) | wasm_i64x2_extract_lane(m.raw, 1)) == 0; #endif } // Full vector namespace detail { template HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { return wasm_i8x16_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { return wasm_i16x8_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { return wasm_i32x4_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<8> /*tag*/, const Mask128 m) { return wasm_i64x2_all_true(m.raw); } } // namespace detail template HWY_API bool AllTrue(const Simd /* tag */, const Mask128 m) { return detail::AllTrue(hwy::SizeTag(), m); } // Partial vectors template HWY_API bool AllFalse(Simd /* tag */, const Mask128 m) { // Ensure all undefined bytes are 0. const Mask128 mask{detail::BytesAbove()}; return AllFalse(Full128(), Mask128{AndNot(mask, m).raw}); } template HWY_API bool AllTrue(const Simd /* d */, const Mask128 m) { // Ensure all undefined bytes are FF. const Mask128 mask{detail::BytesAbove()}; return AllTrue(Full128(), Mask128{Or(mask, m).raw}); } template HWY_API intptr_t FindFirstTrue(const Simd /* tag */, const Mask128 mask) { const uint64_t bits = detail::BitsFromMask(mask); return bits ? static_cast(Num0BitsBelowLS1Bit_Nonzero64(bits)) : -1; } // ------------------------------ Compress namespace detail { template HWY_INLINE Vec128 Idx16x8FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Rebind d8; const Simd du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. 
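// Worked example (illustrative): mask_bits = 0b0101 keeps u16 lanes 0 and 2,
// so the table row below is {0, 4, 0, 0, 0, 0, 0, 0} (2 * lane index, with
// the doubling already baked in). ZipLower then duplicates each byte offset,
// and adding 0x0100 turns each pair into consecutive byte indices
// {0,1, 4,5, ...} suitable for TableLookupBytes.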
alignas(16) constexpr uint8_t table[256 * 8] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 4, 6, 
14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, 4, 6, 8, 10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 Idx32x4FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. 
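// Worked example (illustrative): mask_bits = 0b0110 keeps u32 lanes 1 and 2;
// row 6 below is {4,5,6,7, 8,9,10,11, 0,1,2,3, 0,1,2,3}, i.e. the bytes of
// lanes 1 and 2 moved to the front, followed by don't-care copies of lane 0.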
alignas(16) constexpr uint8_t packed_array[16 * 16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, // 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, // 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); } template HWY_INLINE Vec128 Idx64x2FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(16) constexpr uint8_t packed_array[4 * 16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Simd d; const Repartition d8; return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); } // Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. template HWY_INLINE Vec128 Compress(hwy::SizeTag<2> /*tag*/, Vec128 v, const uint64_t mask_bits) { const auto idx = detail::Idx16x8FromBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } template HWY_INLINE Vec128 Compress(hwy::SizeTag<4> /*tag*/, Vec128 v, const uint64_t mask_bits) { const auto idx = detail::Idx32x4FromBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } template HWY_INLINE Vec128 Compress(hwy::SizeTag<8> /*tag*/, Vec128 v, const uint64_t mask_bits) { const auto idx = detail::Idx64x2FromBits(mask_bits); const DFromV d; const RebindToSigned di; return BitCast(d, TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } } // namespace detail template HWY_API Vec128 Compress(Vec128 v, const Mask128 mask) { const uint64_t mask_bits = detail::BitsFromMask(mask); return detail::Compress(hwy::SizeTag(), v, mask_bits); } // ------------------------------ CompressBits template HWY_API Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::Compress(hwy::SizeTag(), v, mask_bits); } // ------------------------------ CompressStore template HWY_API size_t CompressStore(Vec128 v, const Mask128 mask, Simd d, T* HWY_RESTRICT unaligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ CompressBlendedStore template HWY_API size_t CompressBlendedStore(Vec128 v, Mask128 m, Simd d, T* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 using TU = 
TFromD; const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const Mask128 store_mask = FirstN(du, count); const Vec128 compressed = detail::Compress(hwy::SizeTag(), BitCast(du, v), mask_bits); const Vec128 prev = BitCast(du, LoadU(d, unaligned)); StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned); return count; } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(Vec128 v, const uint8_t* HWY_RESTRICT bits, Simd d, T* HWY_RESTRICT unaligned) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, // TableLookupBytes) // 128 bits HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, const Vec128 c, Full128 d, uint8_t* HWY_RESTRICT unaligned) { const auto k5 = Set(d, 5); const auto k6 = Set(d, 6); // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. // 0x80 so lanes to be filled from other vectors are 0 for blending. alignas(16) static constexpr uint8_t tbl_r0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_g0[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; const auto shuf_r0 = Load(d, tbl_r0); const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0); const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. const auto int0 = r0 | g0 | b0; StoreU(int0, d, unaligned + 0 * 16); // Second vector: g10,r10, bgr[9:6], b5,g5 const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. const auto r1 = TableLookupBytes(a, shuf_r1); const auto g1 = TableLookupBytes(b, shuf_g1); const auto b1 = TableLookupBytes(c, shuf_b1); const auto int1 = r1 | g1 | b1; StoreU(int1, d, unaligned + 1 * 16); // Third vector: bgr[15:11], b10 const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A const auto r2 = TableLookupBytes(a, shuf_r2); const auto g2 = TableLookupBytes(b, shuf_g2); const auto b2 = TableLookupBytes(c, shuf_b2); const auto int2 = r2 | g2 | b2; StoreU(int2, d, unaligned + 2 * 16); } // 64 bits HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, const Vec128 c, Full64 d, uint8_t* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and first result. const Full128 d_full; const auto k5 = Set(d_full, 5); const auto k6 = Set(d_full, 6); const Vec128 full_a{a.raw}; const Vec128 full_b{b.raw}; const Vec128 full_c{c.raw}; // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. // 0x80 so lanes to be filled from other vectors are 0 for blending. 
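// (Index bytes of 0x80 are out of range for TableLookupBytes, which produces
// zero for such lanes on this target, so the three shuffled vectors can be
// combined below with a plain OR.)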
alignas(16) static constexpr uint8_t tbl_r0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(16) static constexpr uint8_t tbl_g0[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; const auto shuf_r0 = Load(d_full, tbl_r0); const auto shuf_g0 = Load(d_full, tbl_g0); // cannot reuse r0 due to 5 in MSB const auto shuf_b0 = CombineShiftRightBytes<15>(d_full, shuf_g0, shuf_g0); const auto r0 = TableLookupBytes(full_a, shuf_r0); // 5..4..3..2..1..0 const auto g0 = TableLookupBytes(full_b, shuf_g0); // ..4..3..2..1..0. const auto b0 = TableLookupBytes(full_c, shuf_b0); // .4..3..2..1..0.. const auto int0 = r0 | g0 | b0; StoreU(int0, d_full, unaligned + 0 * 16); // Second (HALF) vector: bgr[7:6], b5,g5 const auto shuf_r1 = shuf_b0 + k6; // ..7..6.. const auto shuf_g1 = shuf_r0 + k5; // .7..6..5 const auto shuf_b1 = shuf_g0 + k5; // 7..6..5. const auto r1 = TableLookupBytes(full_a, shuf_r1); const auto g1 = TableLookupBytes(full_b, shuf_g1); const auto b1 = TableLookupBytes(full_c, shuf_b1); const decltype(Zero(d)) int1{(r1 | g1 | b1).raw}; StoreU(int1, d, unaligned + 1 * 16); } // <= 32 bits template HWY_API void StoreInterleaved3(const Vec128 a, const Vec128 b, const Vec128 c, Simd /*tag*/, uint8_t* HWY_RESTRICT unaligned) { // Use full vectors for the shuffles and result. const Full128 d_full; const Vec128 full_a{a.raw}; const Vec128 full_b{b.raw}; const Vec128 full_c{c.raw}; // Shuffle (a,b,c) vector bytes to bgr[3:0]. // 0x80 so lanes to be filled from other vectors are 0 for blending. alignas(16) static constexpr uint8_t tbl_r0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, // 0x80, 0x80, 0x80, 0x80}; const auto shuf_r0 = Load(d_full, tbl_r0); const auto shuf_g0 = CombineShiftRightBytes<15>(d_full, shuf_r0, shuf_r0); const auto shuf_b0 = CombineShiftRightBytes<14>(d_full, shuf_r0, shuf_r0); const auto r0 = TableLookupBytes(full_a, shuf_r0); // ......3..2..1..0 const auto g0 = TableLookupBytes(full_b, shuf_g0); // .....3..2..1..0. const auto b0 = TableLookupBytes(full_c, shuf_b0); // ....3..2..1..0.. const auto int0 = r0 | g0 | b0; alignas(16) uint8_t buf[16]; StoreU(int0, d_full, buf); CopyBytes(buf, unaligned); } // ------------------------------ StoreInterleaved4 // 128 bits HWY_API void StoreInterleaved4(const Vec128 v0, const Vec128 v1, const Vec128 v2, const Vec128 v3, Full128 d8, uint8_t* HWY_RESTRICT unaligned) { const RepartitionToWide d16; const RepartitionToWide d32; // let a,b,c,d denote v0..3. const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0 const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0 const auto ba8 = ZipUpper(d16, v0, v1); const auto dc8 = ZipUpper(d16, v2, v3); const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0 const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4 const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8 const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16); StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16); StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16); StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16); } // 64 bits HWY_API void StoreInterleaved4(const Vec128 in0, const Vec128 in1, const Vec128 in2, const Vec128 in3, Full64 /* tag */, uint8_t* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. 
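// Zipping bytes twice yields the byte order v0[i], v1[i], v2[i], v3[i] for
// i = 0..7, i.e. 32 interleaved bytes written as two full 16-byte stores.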
const Full128 d_full8; const RepartitionToWide d16; const RepartitionToWide d32; const Vec128 v0{in0.raw}; const Vec128 v1{in1.raw}; const Vec128 v2{in2.raw}; const Vec128 v3{in3.raw}; // let a,b,c,d denote v0..3. const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0 const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. d0 c0 const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0 const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4 StoreU(BitCast(d_full8, dcba_0), d_full8, unaligned + 0 * 16); StoreU(BitCast(d_full8, dcba_4), d_full8, unaligned + 1 * 16); } // <= 32 bits template HWY_API void StoreInterleaved4(const Vec128 in0, const Vec128 in1, const Vec128 in2, const Vec128 in3, Simd /*tag*/, uint8_t* HWY_RESTRICT unaligned) { // Use full vectors to reduce the number of stores. const Full128 d_full8; const RepartitionToWide d16; const RepartitionToWide d32; const Vec128 v0{in0.raw}; const Vec128 v1{in1.raw}; const Vec128 v2{in2.raw}; const Vec128 v3{in3.raw}; // let a,b,c,d denote v0..3. const auto ba0 = ZipLower(d16, v0, v1); // b3 a3 .. b0 a0 const auto dc0 = ZipLower(d16, v2, v3); // d3 c3 .. d0 c0 const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0 alignas(16) uint8_t buf[16]; StoreU(BitCast(d_full8, dcba_0), d_full8, buf); CopyBytes<4 * N>(buf, unaligned); } // ------------------------------ MulEven/Odd (Load) HWY_INLINE Vec128 MulEven(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); return Load(Full128(), mul); } HWY_INLINE Vec128 MulOdd(const Vec128 a, const Vec128 b) { alignas(16) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); return Load(Full128(), mul); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) template HWY_API Vec128 ReorderWidenMulAccumulate(Simd df32, Vec128 a, Vec128 b, const Vec128 sum0, Vec128& sum1) { const Repartition du16; const RebindToUnsigned du32; const Vec128 zero = Zero(du16); const Vec128 a0 = ZipLower(du32, zero, BitCast(du16, a)); const Vec128 a1 = ZipUpper(du32, zero, BitCast(du16, a)); const Vec128 b0 = ZipLower(du32, zero, BitCast(du16, b)); const Vec128 b1 = ZipUpper(du32, zero, BitCast(du16, b)); sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); } // ------------------------------ Reductions namespace detail { // N=1 for any T: no-op template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag /* tag */, const Vec128 v) { return v; } // u32/i32/f32: // N=2 template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return v10 + Vec128{Shuffle2301(Vec128{v10.raw}).raw}; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return Min(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v10) { return Max(v10, Vec128{Shuffle2301(Vec128{v10.raw}).raw}); } // N=4 (full) template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = v3210 + v1032; const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return 
v20_31_20_31 + v31_20_31_20; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Min(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec128 v3210) { const Vec128 v1032 = Shuffle1032(v3210); const Vec128 v31_20_31_20 = Max(v3210, v1032); const Vec128 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Max(v20_31_20_31, v31_20_31_20); } // u64/i64/f64: // N=2 (full) template HWY_INLINE Vec128 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return v10 + v01; } template HWY_INLINE Vec128 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Min(v10, v01); } template HWY_INLINE Vec128 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec128 v10) { const Vec128 v01 = Shuffle01(v10); return Max(v10, v01); } // u16/i16 template HWY_API Vec128 MinOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const DFromV d; const Repartition d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MinOfLanes(d32, Min(even, odd)); // Also broadcast into odd lanes. return BitCast(d, Or(min, ShiftLeft<16>(min))); } template HWY_API Vec128 MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec128 v) { const DFromV d; const Repartition d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MaxOfLanes(d32, Max(even, odd)); // Also broadcast into odd lanes. return BitCast(d, Or(min, ShiftLeft<16>(min))); } } // namespace detail // Supported for u/i/f 32/64. Returns the same value in each lane. template HWY_API Vec128 SumOfLanes(Simd /* tag */, const Vec128 v) { return detail::SumOfLanes(hwy::SizeTag(), v); } template HWY_API Vec128 MinOfLanes(Simd /* tag */, const Vec128 v) { return detail::MinOfLanes(hwy::SizeTag(), v); } template HWY_API Vec128 MaxOfLanes(Simd /* tag */, const Vec128 v) { return detail::MaxOfLanes(hwy::SizeTag(), v); } // ------------------------------ Lt128 namespace detail { template Mask128 ShiftMaskLeft(Mask128 m) { return MaskFromVec(ShiftLeftLanes(VecFromMask(Simd(), m))); } } // namespace detail template HWY_INLINE Mask128 Lt128(Simd d, Vec128 a, Vec128 b) { static_assert(!IsSigned() && sizeof(T) == 8, "Use u64"); // Truth table of Eq and Lt for Hi and Lo u64. // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const Mask128 eqHL = Eq(a, b); const Mask128 ltHL = Lt(a, b); // We need to bring cL to the upper lane/bit corresponding to cH. Comparing // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the // comparison result leftwards requires only 4. const Mask128 ltLx = detail::ShiftMaskLeft<1>(ltHL); const Mask128 outHx = Or(ltHL, And(eqHL, ltLx)); const Vec128 vecHx = VecFromMask(d, outHx); return MaskFromVec(InterleaveUpper(d, vecHx, vecHx)); } // ------------------------------ Min128, Max128 (Lt128) // Without a native OddEven, it seems infeasible to go faster than Lt128. 
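// For example (an illustrative sketch): a 128-bit number is stored as two u64
// lanes, with the most significant half in the upper lane. Given
//   a = {hi=1, lo=5} and b = {hi=1, lo=7},
// the high halves are equal and 5 < 7, so Lt128(d, a, b) is all-true and
// Min128(d, a, b) returns a.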
template <class D>
HWY_INLINE VFromD<D> Min128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

template <class D>
HWY_INLINE VFromD<D> Max128(D d, const VFromD<D> a, const VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), b, a);
}

// ================================================== Operator wrapper

template <class V>
HWY_API V Add(V a, V b) {
  return a + b;
}
template <class V>
HWY_API V Sub(V a, V b) {
  return a - b;
}

template <class V>
HWY_API V Mul(V a, V b) {
  return a * b;
}
template <class V>
HWY_API V Div(V a, V b) {
  return a / b;
}

template <class V>
V Shl(V a, V b) {
  return a << b;
}
template <class V>
V Shr(V a, V b) {
  return a >> b;
}

template <class V>
HWY_API auto Eq(V a, V b) -> decltype(a == b) {
  return a == b;
}
template <class V>
HWY_API auto Ne(V a, V b) -> decltype(a == b) {
  return a != b;
}
template <class V>
HWY_API auto Lt(V a, V b) -> decltype(a == b) {
  return a < b;
}

template <class V>
HWY_API auto Gt(V a, V b) -> decltype(a == b) {
  return a > b;
}
template <class V>
HWY_API auto Ge(V a, V b) -> decltype(a == b) {
  return a >= b;
}

template <class V>
HWY_API auto Le(V a, V b) -> decltype(a == b) {
  return a <= b;
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();