// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 256-bit WASM vectors and operations. Experimental.
// External include guard in highway.h - see comment there.

#include <stddef.h>
#include <stdint.h>
#include <wasm_simd128.h>

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"
#include "hwy/ops/wasm_128-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename T>
using Full256 = Simd<T, 32 / sizeof(T)>;

template <typename T>
using Full128 = Simd<T, 16 / sizeof(T)>;

// TODO(richardwinterton): add this to DeduceD in wasm_128 similar to x86_128.
template <typename T>
class Vec256 {
 public:
  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec256& operator*=(const Vec256 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec256& operator/=(const Vec256 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec256& operator+=(const Vec256 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec256& operator-=(const Vec256 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec256& operator&=(const Vec256 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec256& operator|=(const Vec256 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec256& operator^=(const Vec256 other) {
    return *this = (*this ^ other);
  }

  Vec128<T> v0;
  Vec128<T> v1;
};

template <typename T>
struct Mask256 {
  Mask128<T> m0;
  Mask128<T> m1;
};

// ------------------------------ BitCast

template <typename T, typename FromT>
HWY_API Vec256<T> BitCast(Full256<T> d, Vec256<FromT> v) {
  const Half<decltype(d)> dh;
  Vec256<T> ret;
  ret.v0 = BitCast(dh, v.v0);
  ret.v1 = BitCast(dh, v.v1);
  return ret;
  // TODO(richardwinterton): implement other ops like this
}

// ------------------------------ Zero

// Returns an all-zero vector/part.
template <typename T>
HWY_API Vec256<T> Zero(Full256<T> /* tag */) {
  return Vec256<T>{wasm_i32x4_splat(0)};
}
HWY_API Vec256<float> Zero(Full256<float> /* tag */) {
  return Vec256<float>{wasm_f32x4_splat(0.0f)};
}

template <class D>
using VFromD = decltype(Zero(D()));

// ------------------------------ Set

// Returns a vector/part with all lanes set to "t".
HWY_API Vec256<uint8_t> Set(Full256<uint8_t> /* tag */, const uint8_t t) {
  return Vec256<uint8_t>{wasm_i8x16_splat(static_cast<int8_t>(t))};
}
HWY_API Vec256<uint16_t> Set(Full256<uint16_t> /* tag */, const uint16_t t) {
  return Vec256<uint16_t>{wasm_i16x8_splat(static_cast<int16_t>(t))};
}
HWY_API Vec256<uint32_t> Set(Full256<uint32_t> /* tag */, const uint32_t t) {
  return Vec256<uint32_t>{wasm_i32x4_splat(static_cast<int32_t>(t))};
}
HWY_API Vec256<uint64_t> Set(Full256<uint64_t> /* tag */, const uint64_t t) {
  return Vec256<uint64_t>{wasm_i64x2_splat(static_cast<int64_t>(t))};
}

HWY_API Vec256<int8_t> Set(Full256<int8_t> /* tag */, const int8_t t) {
  return Vec256<int8_t>{wasm_i8x16_splat(t)};
}
HWY_API Vec256<int16_t> Set(Full256<int16_t> /* tag */, const int16_t t) {
  return Vec256<int16_t>{wasm_i16x8_splat(t)};
}
HWY_API Vec256<int32_t> Set(Full256<int32_t> /* tag */, const int32_t t) {
  return Vec256<int32_t>{wasm_i32x4_splat(t)};
}
HWY_API Vec256<int64_t> Set(Full256<int64_t> /* tag */, const int64_t t) {
  return Vec256<int64_t>{wasm_i64x2_splat(t)};
}

HWY_API Vec256<float> Set(Full256<float> /* tag */, const float t) {
  return Vec256<float>{wasm_f32x4_splat(t)};
}

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")

// Returns a vector with uninitialized elements.
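// (Implemented via Zero below, so the lanes are actually zero; callers must
// still not rely on their values.)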
template HWY_API Vec256 Undefined(Full256 d) { return Zero(d); } HWY_DIAGNOSTICS(pop) // Returns a vector with lane i=[0, N) set to "first" + i. template Vec256 Iota(const Full256 d, const T2 first) { HWY_ALIGN T lanes[16 / sizeof(T)]; for (size_t i = 0; i < 16 / sizeof(T); ++i) { lanes[i] = static_cast(first + static_cast(i)); } return Load(d, lanes); } // ================================================== ARITHMETIC // ------------------------------ Addition // Unsigned HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_add(a.raw, b.raw)}; } HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_add(a.raw, b.raw)}; } HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_add(a.raw, b.raw)}; } // Signed HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_add(a.raw, b.raw)}; } HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_add(a.raw, b.raw)}; } HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_add(a.raw, b.raw)}; } // Float HWY_API Vec256 operator+(const Vec256 a, const Vec256 b) { return Vec256{wasm_f32x4_add(a.raw, b.raw)}; } // ------------------------------ Subtraction // Unsigned HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_sub(a.raw, b.raw)}; } HWY_API Vec256 operator-(Vec256 a, Vec256 b) { return Vec256{wasm_i16x8_sub(a.raw, b.raw)}; } HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_sub(a.raw, b.raw)}; } // Signed HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_sub(a.raw, b.raw)}; } HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_sub(a.raw, b.raw)}; } HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_sub(a.raw, b.raw)}; } // Float HWY_API Vec256 operator-(const Vec256 a, const Vec256 b) { return Vec256{wasm_f32x4_sub(a.raw, b.raw)}; } // ------------------------------ SumsOf8 HWY_API Vec256 SumsOf8(const Vec256 v) { HWY_ABORT("not implemented"); } // ------------------------------ SaturatedAdd // Returns a + b clamped to the destination range. // Unsigned HWY_API Vec256 SaturatedAdd(const Vec256 a, const Vec256 b) { return Vec256{wasm_u8x16_add_sat(a.raw, b.raw)}; } HWY_API Vec256 SaturatedAdd(const Vec256 a, const Vec256 b) { return Vec256{wasm_u16x8_add_sat(a.raw, b.raw)}; } // Signed HWY_API Vec256 SaturatedAdd(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_add_sat(a.raw, b.raw)}; } HWY_API Vec256 SaturatedAdd(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_add_sat(a.raw, b.raw)}; } // ------------------------------ SaturatedSub // Returns a - b clamped to the destination range. 
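// Illustrative example (hypothetical values): for uint8_t lanes,
// SaturatedSub(Set(d, 5), Set(d, 10)) yields 0 rather than wrapping to 251.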
// Unsigned HWY_API Vec256 SaturatedSub(const Vec256 a, const Vec256 b) { return Vec256{wasm_u8x16_sub_sat(a.raw, b.raw)}; } HWY_API Vec256 SaturatedSub(const Vec256 a, const Vec256 b) { return Vec256{wasm_u16x8_sub_sat(a.raw, b.raw)}; } // Signed HWY_API Vec256 SaturatedSub(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_sub_sat(a.raw, b.raw)}; } HWY_API Vec256 SaturatedSub(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_sub_sat(a.raw, b.raw)}; } // ------------------------------ Average // Returns (a + b + 1) / 2 // Unsigned HWY_API Vec256 AverageRound(const Vec256 a, const Vec256 b) { return Vec256{wasm_u8x16_avgr(a.raw, b.raw)}; } HWY_API Vec256 AverageRound(const Vec256 a, const Vec256 b) { return Vec256{wasm_u16x8_avgr(a.raw, b.raw)}; } // ------------------------------ Absolute value // Returns absolute value, except that LimitsMin() maps to LimitsMax() + 1. HWY_API Vec256 Abs(const Vec256 v) { return Vec256{wasm_i8x16_abs(v.raw)}; } HWY_API Vec256 Abs(const Vec256 v) { return Vec256{wasm_i16x8_abs(v.raw)}; } HWY_API Vec256 Abs(const Vec256 v) { return Vec256{wasm_i32x4_abs(v.raw)}; } HWY_API Vec256 Abs(const Vec256 v) { return Vec256{wasm_i62x2_abs(v.raw)}; } HWY_API Vec256 Abs(const Vec256 v) { return Vec256{wasm_f32x4_abs(v.raw)}; } // ------------------------------ Shift lanes by constant #bits // Unsigned template HWY_API Vec256 ShiftLeft(const Vec256 v) { return Vec256{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec256 ShiftRight(const Vec256 v) { return Vec256{wasm_u16x8_shr(v.raw, kBits)}; } template HWY_API Vec256 ShiftLeft(const Vec256 v) { return Vec256{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec256 ShiftRight(const Vec256 v) { return Vec256{wasm_u32x4_shr(v.raw, kBits)}; } // Signed template HWY_API Vec256 ShiftLeft(const Vec256 v) { return Vec256{wasm_i16x8_shl(v.raw, kBits)}; } template HWY_API Vec256 ShiftRight(const Vec256 v) { return Vec256{wasm_i16x8_shr(v.raw, kBits)}; } template HWY_API Vec256 ShiftLeft(const Vec256 v) { return Vec256{wasm_i32x4_shl(v.raw, kBits)}; } template HWY_API Vec256 ShiftRight(const Vec256 v) { return Vec256{wasm_i32x4_shr(v.raw, kBits)}; } // 8-bit template HWY_API Vec256 ShiftLeft(const Vec256 v) { const Full256 d8; // Use raw instead of BitCast to support N=1. const Vec256 shifted{ShiftLeft(Vec128>{v.raw}).raw}; return kBits == 1 ? (v + v) : (shifted & Set(d8, static_cast((0xFF << kBits) & 0xFF))); } template HWY_API Vec256 ShiftRight(const Vec256 v) { const Full256 d8; // Use raw instead of BitCast to support N=1. 
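// Shift the whole vector as 16-bit lanes, then mask off the bits that leaked
// in from the neighboring byte (0xFF >> kBits keeps only the valid bits).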
const Vec256 shifted{ShiftRight(Vec128{v.raw}).raw}; return shifted & Set(d8, 0xFF >> kBits); } template HWY_API Vec256 ShiftRight(const Vec256 v) { const Full256 di; const Full256 du; const auto shifted = BitCast(di, ShiftRight(BitCast(du, v))); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> kBits)); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ RotateRight (ShiftRight, Or) template HWY_API Vec256 RotateRight(const Vec256 v) { constexpr size_t kSizeInBits = sizeof(T) * 8; static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); if (kBits == 0) return v; return Or(ShiftRight(v), ShiftLeft(v)); } // ------------------------------ Shift lanes by same variable #bits // Unsigned HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { return Vec256{wasm_i16x8_shl(v.raw, bits)}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { return Vec256{wasm_u16x8_shr(v.raw, bits)}; } HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { return Vec256{wasm_i32x4_shl(v.raw, bits)}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { return Vec256{wasm_u32x4_shr(v.raw, bits)}; } // Signed HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { return Vec256{wasm_i16x8_shl(v.raw, bits)}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { return Vec256{wasm_i16x8_shr(v.raw, bits)}; } HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { return Vec256{wasm_i32x4_shl(v.raw, bits)}; } HWY_API Vec256 ShiftRightSame(const Vec256 v, const int bits) { return Vec256{wasm_i32x4_shr(v.raw, bits)}; } // 8-bit template HWY_API Vec256 ShiftLeftSame(const Vec256 v, const int bits) { const Full256 d8; // Use raw instead of BitCast to support N=1. const Vec256 shifted{ShiftLeftSame(Vec128>{v.raw}, bits).raw}; return shifted & Set(d8, (0xFF << bits) & 0xFF); } HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { const Full256 d8; // Use raw instead of BitCast to support N=1. 
const Vec256 shifted{ ShiftRightSame(Vec128{v.raw}, bits).raw}; return shifted & Set(d8, 0xFF >> bits); } HWY_API Vec256 ShiftRightSame(Vec256 v, const int bits) { const Full256 di; const Full256 du; const auto shifted = BitCast(di, ShiftRightSame(BitCast(du, v), bits)); const auto shifted_sign = BitCast(di, Set(du, 0x80 >> bits)); return (shifted ^ shifted_sign) - shifted_sign; } // ------------------------------ Minimum // Unsigned HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_u8x16_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_u16x8_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_u32x4_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { alignas(32) float min[4]; min[0] = HWY_MIN(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0)); min[1] = HWY_MIN(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1)); return Vec256{wasm_v128_load(min)}; } // Signed HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_min(a.raw, b.raw)}; } HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { alignas(32) float min[4]; min[0] = HWY_MIN(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0)); min[1] = HWY_MIN(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1)); return Vec256{wasm_v128_load(min)}; } // Float HWY_API Vec256 Min(const Vec256 a, const Vec256 b) { return Vec256{wasm_f32x4_min(a.raw, b.raw)}; } // ------------------------------ Maximum // Unsigned HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_u8x16_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_u16x8_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_u32x4_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { alignas(32) float max[4]; max[0] = HWY_MAX(wasm_u64x2_extract_lane(a, 0), wasm_u64x2_extract_lane(b, 0)); max[1] = HWY_MAX(wasm_u64x2_extract_lane(a, 1), wasm_u64x2_extract_lane(b, 1)); return Vec256{wasm_v128_load(max)}; } // Signed HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_i8x16_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_max(a.raw, b.raw)}; } HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { alignas(32) float max[4]; max[0] = HWY_MAX(wasm_i64x2_extract_lane(a, 0), wasm_i64x2_extract_lane(b, 0)); max[1] = HWY_MAX(wasm_i64x2_extract_lane(a, 1), wasm_i64x2_extract_lane(b, 1)); return Vec256{wasm_v128_load(max)}; } // Float HWY_API Vec256 Max(const Vec256 a, const Vec256 b) { return Vec256{wasm_f32x4_max(a.raw, b.raw)}; } // ------------------------------ Integer multiplication // Unsigned HWY_API Vec256 operator*(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_mul(a.raw, b.raw)}; } HWY_API Vec256 operator*(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_mul(a.raw, b.raw)}; } // Signed HWY_API Vec256 operator*(const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_mul(a.raw, b.raw)}; } HWY_API Vec256 operator*(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_mul(a.raw, b.raw)}; } // Returns 
the upper 16 bits of a * b in each lane. HWY_API Vec256 MulHigh(const Vec256 a, const Vec256 b) { // TODO(eustas): replace, when implemented in WASM. const auto al = wasm_u32x4_extend_low_u16x8(a.raw); const auto ah = wasm_u32x4_extend_high_u16x8(a.raw); const auto bl = wasm_u32x4_extend_low_u16x8(b.raw); const auto bh = wasm_u32x4_extend_high_u16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec256{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } HWY_API Vec256 MulHigh(const Vec256 a, const Vec256 b) { // TODO(eustas): replace, when implemented in WASM. const auto al = wasm_i32x4_extend_low_i16x8(a.raw); const auto ah = wasm_i32x4_extend_high_i16x8(a.raw); const auto bl = wasm_i32x4_extend_low_i16x8(b.raw); const auto bh = wasm_i32x4_extend_high_i16x8(b.raw); const auto l = wasm_i32x4_mul(al, bl); const auto h = wasm_i32x4_mul(ah, bh); // TODO(eustas): shift-right + narrow? return Vec256{wasm_i16x8_shuffle(l, h, 1, 3, 5, 7, 9, 11, 13, 15)}; } // Multiplies even lanes (0, 2 ..) and returns the double-width result. HWY_API Vec256 MulEven(const Vec256 a, const Vec256 b) { // TODO(eustas): replace, when implemented in WASM. const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec256{wasm_i64x2_mul(ae, be)}; } HWY_API Vec256 MulEven(const Vec256 a, const Vec256 b) { // TODO(eustas): replace, when implemented in WASM. const auto kEvenMask = wasm_i32x4_make(-1, 0, -1, 0); const auto ae = wasm_v128_and(a.raw, kEvenMask); const auto be = wasm_v128_and(b.raw, kEvenMask); return Vec256{wasm_i64x2_mul(ae, be)}; } // ------------------------------ Negate template HWY_API Vec256 Neg(const Vec256 v) { return Xor(v, SignBit(Full256())); } HWY_API Vec256 Neg(const Vec256 v) { return Vec256{wasm_i8x16_neg(v.raw)}; } HWY_API Vec256 Neg(const Vec256 v) { return Vec256{wasm_i16x8_neg(v.raw)}; } HWY_API Vec256 Neg(const Vec256 v) { return Vec256{wasm_i32x4_neg(v.raw)}; } HWY_API Vec256 Neg(const Vec256 v) { return Vec256{wasm_i64x2_neg(v.raw)}; } // ------------------------------ Floating-point mul / div HWY_API Vec256 operator*(Vec256 a, Vec256 b) { return Vec256{wasm_f32x4_mul(a.raw, b.raw)}; } HWY_API Vec256 operator/(const Vec256 a, const Vec256 b) { return Vec256{wasm_f32x4_div(a.raw, b.raw)}; } // Approximate reciprocal HWY_API Vec256 ApproximateReciprocal(const Vec256 v) { const Vec256 one = Vec256{wasm_f32x4_splat(1.0f)}; return one / v; } // Absolute value of difference. HWY_API Vec256 AbsDiff(const Vec256 a, const Vec256 b) { return Abs(a - b); } // ------------------------------ Floating-point multiply-add variants // Returns mul * x + add HWY_API Vec256 MulAdd(const Vec256 mul, const Vec256 x, const Vec256 add) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfma? return mul * x + add; } // Returns add - mul * x HWY_API Vec256 NegMulAdd(const Vec256 mul, const Vec256 x, const Vec256 add) { // TODO(eustas): replace, when implemented in WASM. return add - mul * x; } // Returns mul * x - sub HWY_API Vec256 MulSub(const Vec256 mul, const Vec256 x, const Vec256 sub) { // TODO(eustas): replace, when implemented in WASM. // TODO(eustas): is it wasm_f32x4_qfms? return mul * x - sub; } // Returns -mul * x - sub HWY_API Vec256 NegMulSub(const Vec256 mul, const Vec256 x, const Vec256 sub) { // TODO(eustas): replace, when implemented in WASM. 
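// Computes -mul * x - sub via Neg; there is no fused negated multiply-subtract
// to call here yet (see TODO above).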
return Neg(mul) * x - sub; } // ------------------------------ Floating-point square root // Full precision square root HWY_API Vec256 Sqrt(const Vec256 v) { return Vec256{wasm_f32x4_sqrt(v.raw)}; } // Approximate reciprocal square root HWY_API Vec256 ApproximateReciprocalSqrt(const Vec256 v) { // TODO(eustas): find cheaper a way to calculate this. const Vec256 one = Vec256{wasm_f32x4_splat(1.0f)}; return one / Sqrt(v); } // ------------------------------ Floating-point rounding // Toward nearest integer, ties to even HWY_API Vec256 Round(const Vec256 v) { return Vec256{wasm_f32x4_nearest(v.raw)}; } // Toward zero, aka truncate HWY_API Vec256 Trunc(const Vec256 v) { return Vec256{wasm_f32x4_trunc(v.raw)}; } // Toward +infinity, aka ceiling HWY_API Vec256 Ceil(const Vec256 v) { return Vec256{wasm_f32x4_ceil(v.raw)}; } // Toward -infinity, aka floor HWY_API Vec256 Floor(const Vec256 v) { return Vec256{wasm_f32x4_floor(v.raw)}; } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. template HWY_API Mask256 RebindMask(Full256 /*tag*/, Mask256 m) { static_assert(sizeof(TFrom) == sizeof(TTo), "Must have same size"); return Mask256{m.raw}; } template HWY_API Mask256 TestBit(Vec256 v, Vec256 bit) { static_assert(!hwy::IsFloat(), "Only integer vectors supported"); return (v & bit) == bit; } // ------------------------------ Equality // Unsigned HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_i8x16_eq(a.raw, b.raw)}; } HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_i16x8_eq(a.raw, b.raw)}; } HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_i32x4_eq(a.raw, b.raw)}; } // Signed HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_i8x16_eq(a.raw, b.raw)}; } HWY_API Mask256 operator==(Vec256 a, Vec256 b) { return Mask256{wasm_i16x8_eq(a.raw, b.raw)}; } HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_i32x4_eq(a.raw, b.raw)}; } // Float HWY_API Mask256 operator==(const Vec256 a, const Vec256 b) { return Mask256{wasm_f32x4_eq(a.raw, b.raw)}; } // ------------------------------ Inequality // Unsigned HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_i8x16_ne(a.raw, b.raw)}; } HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_i16x8_ne(a.raw, b.raw)}; } HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_i32x4_ne(a.raw, b.raw)}; } // Signed HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_i8x16_ne(a.raw, b.raw)}; } HWY_API Mask256 operator!=(Vec256 a, Vec256 b) { return Mask256{wasm_i16x8_ne(a.raw, b.raw)}; } HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_i32x4_ne(a.raw, b.raw)}; } // Float HWY_API Mask256 operator!=(const Vec256 a, const Vec256 b) { return Mask256{wasm_f32x4_ne(a.raw, b.raw)}; } // ------------------------------ Strict inequality HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { return Mask256{wasm_i8x16_gt(a.raw, b.raw)}; } HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { return Mask256{wasm_i16x8_gt(a.raw, b.raw)}; } HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { return Mask256{wasm_i32x4_gt(a.raw, b.raw)}; } HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { const Rebind < int32_t, DFromV d32; const auto a32 = BitCast(d32, a); const auto b32 = 
BitCast(d32, b); // If the upper half is less than or greater, this is the answer. const auto m_gt = a32 < b32; // Otherwise, the lower half decides. const auto m_eq = a32 == b32; const auto lo_in_hi = wasm_i32x4_shuffle(m_gt, m_gt, 2, 2, 0, 0); const auto lo_gt = And(m_eq, lo_in_hi); const auto gt = Or(lo_gt, m_gt); // Copy result in upper 32 bits to lower 32 bits. return Mask256{wasm_i32x4_shuffle(gt, gt, 3, 3, 1, 1)}; } template HWY_API Mask256 operator>(Vec256 a, Vec256 b) { const Full256 du; const RebindToSigned di; const Vec256 msb = Set(du, (LimitsMax() >> 1) + 1); return RebindMask(du, BitCast(di, Xor(a, msb)) > BitCast(di, Xor(b, msb))); } HWY_API Mask256 operator>(const Vec256 a, const Vec256 b) { return Mask256{wasm_f32x4_gt(a.raw, b.raw)}; } template HWY_API Mask256 operator<(const Vec256 a, const Vec256 b) { return operator>(b, a); } // ------------------------------ Weak inequality // Float <= >= HWY_API Mask256 operator<=(const Vec256 a, const Vec256 b) { return Mask256{wasm_f32x4_le(a.raw, b.raw)}; } HWY_API Mask256 operator>=(const Vec256 a, const Vec256 b) { return Mask256{wasm_f32x4_ge(a.raw, b.raw)}; } // ------------------------------ FirstN (Iota, Lt) template HWY_API Mask256 FirstN(const Full256 d, size_t num) { const RebindToSigned di; // Signed comparisons may be cheaper. return RebindMask(d, Iota(di, 0) < Set(di, static_cast>(num))); } // ================================================== LOGICAL // ------------------------------ Not template HWY_API Vec256 Not(Vec256 v) { return Vec256{wasm_v128_not(v.raw)}; } // ------------------------------ And template HWY_API Vec256 And(Vec256 a, Vec256 b) { return Vec256{wasm_v128_and(a.raw, b.raw)}; } // ------------------------------ AndNot // Returns ~not_mask & mask. template HWY_API Vec256 AndNot(Vec256 not_mask, Vec256 mask) { return Vec256{wasm_v128_andnot(mask.raw, not_mask.raw)}; } // ------------------------------ Or template HWY_API Vec256 Or(Vec256 a, Vec256 b) { return Vec256{wasm_v128_or(a.raw, b.raw)}; } // ------------------------------ Xor template HWY_API Vec256 Xor(Vec256 a, Vec256 b) { return Vec256{wasm_v128_xor(a.raw, b.raw)}; } // ------------------------------ OrAnd template HWY_API Vec256 OrAnd(Vec256 o, Vec256 a1, Vec256 a2) { return Or(o, And(a1, a2)); } // ------------------------------ IfVecThenElse template HWY_API Vec256 IfVecThenElse(Vec256 mask, Vec256 yes, Vec256 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec256 operator&(const Vec256 a, const Vec256 b) { return And(a, b); } template HWY_API Vec256 operator|(const Vec256 a, const Vec256 b) { return Or(a, b); } template HWY_API Vec256 operator^(const Vec256 a, const Vec256 b) { return Xor(a, b); } // ------------------------------ CopySign template HWY_API Vec256 CopySign(const Vec256 magn, const Vec256 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const auto msb = SignBit(Full256()); return Or(AndNot(msb, magn), And(msb, sign)); } template HWY_API Vec256 CopySignToAbs(const Vec256 abs, const Vec256 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); return Or(abs, And(SignBit(Full256()), sign)); } // ------------------------------ BroadcastSignBit (compare) template HWY_API Vec256 BroadcastSignBit(const Vec256 v) { return ShiftRight(v); } HWY_API Vec256 BroadcastSignBit(const Vec256 v) { return VecFromMask(Full256(), v < Zero(Full256())); } // ------------------------------ Mask // Mask 
and Vec are the same (true = FF..FF). template HWY_API Mask256 MaskFromVec(const Vec256 v) { return Mask256{v.raw}; } template HWY_API Vec256 VecFromMask(Full256 /* tag */, Mask256 v) { return Vec256{v.raw}; } // mask ? yes : no template HWY_API Vec256 IfThenElse(Mask256 mask, Vec256 yes, Vec256 no) { return Vec256{wasm_v128_bitselect(yes.raw, no.raw, mask.raw)}; } // mask ? yes : 0 template HWY_API Vec256 IfThenElseZero(Mask256 mask, Vec256 yes) { return yes & VecFromMask(Full256(), mask); } // mask ? 0 : no template HWY_API Vec256 IfThenZeroElse(Mask256 mask, Vec256 no) { return AndNot(VecFromMask(Full256(), mask), no); } template HWY_API Vec256 < T IfNegativeThenElse(Vec256 v, Vec256 yes, Vec256 no) { HWY_ASSERT(0); } template HWY_API Vec256 ZeroIfNegative(Vec256 v) { const Full256 d; const auto zero = Zero(d); return IfThenElse(Mask256{(v > zero).raw}, v, zero); } // ------------------------------ Mask logical template HWY_API Mask256 Not(const Mask256 m) { return MaskFromVec(Not(VecFromMask(Full256(), m))); } template HWY_API Mask256 And(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 AndNot(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Or(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask256 Xor(const Mask256 a, Mask256 b) { const Full256 d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } // ------------------------------ Shl (BroadcastSignBit, IfThenElse) // The x86 multiply-by-Pow2() trick will not work because WASM saturates // float->int correctly to 2^31-1 (not 2^31). Because WASM's shifts take a // scalar count operand, per-lane shift instructions would require extract_lane // for each lane, and hoping that shuffle is correctly mapped to a native // instruction. Using non-vector shifts would incur a store-load forwarding // stall when loading the result vector. We instead test bits of the shift // count to "predicate" a shift of the entire vector by a constant. template HWY_API Vec256 operator<<(Vec256 v, const Vec256 bits) { const Full256 d; Mask256 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } template HWY_API Vec256 operator<<(Vec256 v, const Vec256 bits) { const Full256 d; Mask256 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. 
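// 32-bit lanes: valid shift counts use 5 bits, so shifting left by 27 moves
// bit 4 of the count into the sign bit.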
test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftLeft<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftLeft<1>(v), v); } // ------------------------------ Shr (BroadcastSignBit, IfThenElse) template HWY_API Vec256 operator>>(Vec256 v, const Vec256 bits) { const Full256 d; Mask256 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<12>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } template HWY_API Vec256 operator>>(Vec256 v, const Vec256 bits) { const Full256 d; Mask256 mask; // Need a signed type for BroadcastSignBit. auto test = BitCast(RebindToSigned(), bits); // Move the highest valid bit of the shift count into the sign bit. test = ShiftLeft<27>(test); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<16>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<8>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<4>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); test = ShiftLeft<1>(test); // next bit (descending order) v = IfThenElse(mask, ShiftRight<2>(v), v); mask = RebindMask(d, MaskFromVec(BroadcastSignBit(test))); return IfThenElse(mask, ShiftRight<1>(v), v); } // ================================================== MEMORY // ------------------------------ Load template HWY_API Vec256 Load(Full256 /* tag */, const T* HWY_RESTRICT aligned) { return Vec256{wasm_v128_load(aligned)}; } template HWY_API Vec256 MaskedLoad(Mask256 m, Full256 d, const T* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } // LoadU == Load. template HWY_API Vec256 LoadU(Full256 d, const T* HWY_RESTRICT p) { return Load(d, p); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. template HWY_API Vec256 LoadDup128(Full256 d, const T* HWY_RESTRICT p) { return Load(d, p); } // ------------------------------ Store template HWY_API void Store(Vec256 v, Full256 /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // StoreU == Store. 
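// (WASM v128 loads/stores have no alignment requirement, so the unaligned
// variant simply forwards to Store.)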
template HWY_API void StoreU(Vec256 v, Full256 d, T* HWY_RESTRICT p) { Store(v, d, p); } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(Vec256 v, Full256 /* tag */, T* HWY_RESTRICT aligned) { wasm_v128_store(aligned, v.raw); } // ------------------------------ Scatter (Store) template HWY_API void ScatterOffset(Vec256 v, Full256 d, T* HWY_RESTRICT base, const Vec256 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(32) T lanes[32 / sizeof(T)]; Store(v, d, lanes); alignas(32) Offset offset_lanes[32 / sizeof(T)]; Store(offset, Full256(), offset_lanes); uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(&lanes[i], base_bytes + offset_lanes[i]); } } template HWY_API void ScatterIndex(Vec256 v, Full256 d, T* HWY_RESTRICT base, const Vec256 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(32) T lanes[32 / sizeof(T)]; Store(v, d, lanes); alignas(32) Index index_lanes[32 / sizeof(T)]; Store(index, Full256(), index_lanes); for (size_t i = 0; i < N; ++i) { base[index_lanes[i]] = lanes[i]; } } // ------------------------------ Gather (Load/Store) template HWY_API Vec256 GatherOffset(const Full256 d, const T* HWY_RESTRICT base, const Vec256 offset) { static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); alignas(32) Offset offset_lanes[32 / sizeof(T)]; Store(offset, Full256(), offset_lanes); alignas(32) T lanes[32 / sizeof(T)]; const uint8_t* base_bytes = reinterpret_cast(base); for (size_t i = 0; i < N; ++i) { CopyBytes(base_bytes + offset_lanes[i], &lanes[i]); } return Load(d, lanes); } template HWY_API Vec256 GatherIndex(const Full256 d, const T* HWY_RESTRICT base, const Vec256 index) { static_assert(sizeof(T) == sizeof(Index), "Must match for portability"); alignas(32) Index index_lanes[32 / sizeof(T)]; Store(index, Full256(), index_lanes); alignas(32) T lanes[32 / sizeof(T)]; for (size_t i = 0; i < N; ++i) { lanes[i] = base[index_lanes[i]]; } return Load(d, lanes); } // ================================================== SWIZZLE // ------------------------------ Extract lane // Gets the single value stored in a vector/part. 
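// Illustrative example (hypothetical values): GetLane(Set(d, 7)) returns 7,
// i.e. the value of lane 0.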
HWY_API uint8_t GetLane(const Vec256 v) { return wasm_i8x16_extract_lane(v.raw, 0); } HWY_API int8_t GetLane(const Vec256 v) { return wasm_i8x16_extract_lane(v.raw, 0); } HWY_API uint16_t GetLane(const Vec256 v) { return wasm_i16x8_extract_lane(v.raw, 0); } HWY_API int16_t GetLane(const Vec256 v) { return wasm_i16x8_extract_lane(v.raw, 0); } HWY_API uint32_t GetLane(const Vec256 v) { return wasm_i32x4_extract_lane(v.raw, 0); } HWY_API int32_t GetLane(const Vec256 v) { return wasm_i32x4_extract_lane(v.raw, 0); } HWY_API uint64_t GetLane(const Vec256 v) { return wasm_i64x2_extract_lane(v.raw, 0); } HWY_API int64_t GetLane(const Vec256 v) { return wasm_i64x2_extract_lane(v.raw, 0); } HWY_API float GetLane(const Vec256 v) { return wasm_f32x4_extract_lane(v.raw, 0); } // ------------------------------ LowerHalf template HWY_API Vec128 LowerHalf(Full128 /* tag */, Vec256 v) { return Vec128{v.raw}; } template HWY_API Vec128 LowerHalf(Vec256 v) { return LowerHalf(Full128(), v); } // ------------------------------ ShiftLeftBytes // 0x01..0F, kBytes = 1 => 0x02..0F00 template HWY_API Vec256 ShiftLeftBytes(Full256 /* tag */, Vec256 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v; case 1: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14)}; case 2: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)}; case 3: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)}; case 4: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)}; case 5: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}; case 6: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9)}; case 7: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8)}; case 8: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6, 7)}; case 9: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5, 6)}; case 10: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4, 5)}; case 11: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3, 4)}; case 12: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2, 3)}; case 13: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1, 2)}; case 14: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0, 1)}; case 15: return Vec256{wasm_i8x16_shuffle(v.raw, zero, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 0)}; } return Vec256{zero}; } template HWY_API Vec256 ShiftLeftBytes(Vec256 v) { return ShiftLeftBytes(Full256(), v); } // ------------------------------ ShiftLeftLanes template HWY_API Vec256 ShiftLeftLanes(Full256 d, const Vec256 v) { const Repartition d8; return BitCast(d, ShiftLeftBytes(BitCast(d8, v))); } template HWY_API Vec256 ShiftLeftLanes(const Vec256 v) { return ShiftLeftLanes(Full256(), v); } // ------------------------------ ShiftRightBytes namespace detail { // Helper function allows zeroing invalid lanes in caller. 
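// Shuffle indices 16..31 select from the second operand (here a zero vector),
// so index 16 below injects zero bytes at the positions vacated by the shift.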
template HWY_API __i8x16 ShrBytes(const Vec256 v) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); const __i8x16 zero = wasm_i8x16_splat(0); switch (kBytes) { case 0: return v.raw; case 1: return wasm_i8x16_shuffle(v.raw, zero, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); case 2: return wasm_i8x16_shuffle(v.raw, zero, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16); case 3: return wasm_i8x16_shuffle(v.raw, zero, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16); case 4: return wasm_i8x16_shuffle(v.raw, zero, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16); case 5: return wasm_i8x16_shuffle(v.raw, zero, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16); case 6: return wasm_i8x16_shuffle(v.raw, zero, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16); case 7: return wasm_i8x16_shuffle(v.raw, zero, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16); case 8: return wasm_i8x16_shuffle(v.raw, zero, 8, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16); case 9: return wasm_i8x16_shuffle(v.raw, zero, 9, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 10: return wasm_i8x16_shuffle(v.raw, zero, 10, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 11: return wasm_i8x16_shuffle(v.raw, zero, 11, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 12: return wasm_i8x16_shuffle(v.raw, zero, 12, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 13: return wasm_i8x16_shuffle(v.raw, zero, 13, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 14: return wasm_i8x16_shuffle(v.raw, zero, 14, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 15: return wasm_i8x16_shuffle(v.raw, zero, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16); case 16: return zero; } } } // namespace detail // 0x01..0F, kBytes = 1 => 0x0001..0E template HWY_API Vec256 ShiftRightBytes(Full256 /* tag */, Vec256 v) { return Vec256{detail::ShrBytes(v)}; } // ------------------------------ ShiftRightLanes template HWY_API Vec256 ShiftRightLanes(Full256 d, const Vec256 v) { const Repartition d8; return BitCast(d, ShiftRightBytes(BitCast(d8, v))); } // ------------------------------ UpperHalf (ShiftRightBytes) // Full input: copy hi into lo (smaller instruction encoding than shifts). 
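// The shuffle (2, 3, 2, 3) copies the upper two 32-bit lanes into the lower
// positions (and also duplicates them into the upper positions).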
template HWY_API Vec128 UpperHalf(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } HWY_API Vec128 UpperHalf(Full128 /* tag */, const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 2, 3, 2, 3)}; } // ------------------------------ CombineShiftRightBytes template > HWY_API V CombineShiftRightBytes(Full256 /* tag */, V hi, V lo) { static_assert(0 <= kBytes && kBytes <= 16, "Invalid kBytes"); switch (kBytes) { case 0: return lo; case 1: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16)}; case 2: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)}; case 3: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)}; case 4: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)}; case 5: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)}; case 6: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)}; case 7: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22)}; case 8: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)}; case 9: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24)}; case 10: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)}; case 11: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)}; case 12: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}; case 13: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28)}; case 14: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)}; case 15: return V{wasm_i8x16_shuffle(lo.raw, hi.raw, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30)}; } return hi; } // ------------------------------ Broadcast/splat any lane // Unsigned template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec256{wasm_i16x8_shuffle( v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec256{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // Signed template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec256{wasm_i16x8_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane, kLane, kLane, kLane, kLane)}; } template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec256{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // Float template HWY_API Vec256 Broadcast(const Vec256 v) { static_assert(0 <= kLane && kLane < N, "Invalid lane"); return Vec256{ wasm_i32x4_shuffle(v.raw, v.raw, kLane, kLane, kLane, kLane)}; } // ------------------------------ TableLookupBytes // Returns vector of bytes[from[i]]. "from" is also interpreted as bytes, i.e. // lane indices in [0, 16). 
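// Illustrative example (hypothetical values): if "from" starts with
// {1, 0, 2, 3, ...}, the first two output bytes are bytes[1] and bytes[0].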
template HWY_API Vec256 TableLookupBytes(const Vec256 bytes, const Vec256 from) { // Not yet available in all engines, see // https://github.com/WebAssembly/simd/blob/bdcc304b2d379f4601c2c44ea9b44ed9484fde7e/proposals/simd/ImplementationStatus.md // V8 implementation of this had a bug, fixed on 2021-04-03: // https://chromium-review.googlesource.com/c/v8/v8/+/2822951 #if 0 return Vec256{wasm_i8x16_swizzle(bytes.raw, from.raw)}; #else alignas(32) uint8_t control[16]; alignas(32) uint8_t input[16]; alignas(32) uint8_t output[16]; wasm_v128_store(control, from.raw); wasm_v128_store(input, bytes.raw); for (size_t i = 0; i < 16; ++i) { output[i] = control[i] < 16 ? input[control[i]] : 0; } return Vec256{wasm_v128_load(output)}; #endif } template HWY_API Vec256 TableLookupBytesOr0(const Vec256 bytes, const Vec256 from) { const Full256 d; // Mask size must match vector type, so cast everything to this type. Repartition di8; Repartition> d_bytes8; const auto msb = BitCast(di8, from) < Zero(di8); const auto lookup = TableLookupBytes(BitCast(d_bytes8, bytes), BitCast(di8, from)); return BitCast(d, IfThenZeroElse(msb, lookup)); } // ------------------------------ Hard-coded shuffles // Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant). // Shuffle0321 rotates one lane to the right (the previous least-significant // lane is now most-significant). These could also be implemented via // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. // Swap 32-bit halves in 64-bit halves. HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 0, 3, 2)}; } // Swap 64-bit halves HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } HWY_API Vec128 Shuffle1032(const Vec128 v) { return Vec128{wasm_i64x2_shuffle(v.raw, v.raw, 1, 0)}; } // Rotate right 32 bits HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } HWY_API Vec128 Shuffle0321(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 1, 2, 3, 0)}; } // Rotate left 32 bits HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } HWY_API Vec128 Shuffle2103(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 0, 1, 2)}; } // Reverse HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } HWY_API Vec128 Shuffle0123(const Vec128 v) { return Vec128{wasm_i32x4_shuffle(v.raw, v.raw, 3, 2, 1, 0)}; } // ------------------------------ TableLookupLanes // Returned by SetTableIndices for use by TableLookupLanes. 
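// Usage sketch (assuming "indices" is an array of lane indices valid for d):
//   const auto idx = SetTableIndices(d, indices);
//   const auto permuted = TableLookupLanes(v, idx);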
template struct Indices256 { __v128_u raw; }; template HWY_API Indices256 IndicesFromVec(Full256 d, Vec256 vec) { static_assert(sizeof(T) == sizeof(TI), "Index size must match lane"); return Indices256{}; } template HWY_API Indices256 SetTableIndices(Full256 d, const TI* idx) { const Rebind di; return IndicesFromVec(d, LoadU(di, idx)); } template HWY_API Vec256 TableLookupLanes(Vec256 v, Indices256 idx) { using TI = MakeSigned; const Full256 d; const Full256 di; return BitCast(d, TableLookupBytes(BitCast(di, v), Vec256{idx.raw})); } // ------------------------------ Reverse (Shuffle0123, Shuffle2301, Shuffle01) template HWY_API Vec256 Reverse(Full256 /* tag */, const Vec256 v) { return Shuffle01(v); } // Four lanes: shuffle template HWY_API Vec256 Reverse(Full256 /* tag */, const Vec256 v) { return Shuffle0123(v); } // 16-bit template HWY_API Vec256 Reverse(Full256 d, const Vec256 v) { const RepartitionToWide> du32; return BitCast(d, RotateRight<16>(Reverse(du32, BitCast(du32, v)))); } // ------------------------------ Reverse2 template HWY_API Vec256 Reverse2(Full256 d, const Vec256 v) { HWY_ASSERT(0); } // ------------------------------ Reverse4 template HWY_API Vec256 Reverse4(Full256 d, const Vec256 v) { HWY_ASSERT(0); } // ------------------------------ Reverse8 template HWY_API Vec256 Reverse8(Full256 d, const Vec256 v) { HWY_ASSERT(0); } // ------------------------------ InterleaveLower HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{ wasm_i16x8_shuffle(a.raw, b.raw, 0, 8, 1, 9, 2, 10, 3, 11)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 0, 2)}; } HWY_API Vec256 InterleaveLower(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 0, 4, 1, 5)}; } // Additional overload for the optional tag. template > HWY_API V InterleaveLower(Full256 /* tag */, V a, V b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // All functions inside detail lack the required D parameter. 
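// The public InterleaveUpper overload defined after this namespace supplies
// the D parameter and forwards here.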
namespace detail { HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i8x16_shuffle(a.raw, b.raw, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{ wasm_i16x8_shuffle(a.raw, b.raw, 4, 12, 5, 13, 6, 14, 7, 15)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 1, 3)}; } HWY_API Vec256 InterleaveUpper(Vec256 a, Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 2, 6, 3, 7)}; } } // namespace detail template > HWY_API V InterleaveUpper(Full256 /* tag */, V a, V b) { return detail::InterleaveUpper(a, b); } // ------------------------------ ZipLower/ZipUpper (InterleaveLower) // Same as Interleave*, except that the return lanes are double-width integers; // this is necessary because the single-lane scalar cannot return two values. template >> HWY_API VFromD ZipLower(Vec256 a, Vec256 b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, Vec256 a, Vec256 b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, Vec256 a, Vec256 b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ================================================== COMBINE // ------------------------------ Combine (InterleaveLower) // N = N/2 + N/2 (upper half undefined) template HWY_API Vec256 Combine(Full256 d, Vec128 hi_half, Vec128 lo_half) { const Half d2; const RebindToUnsigned du2; // Treat half-width input as one lane, and expand to two lanes. 
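// InterleaveLower of these single-lane views places lo_half below hi_half,
// which is exactly the desired concatenation.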
using VU = Vec128, 2>; const VU lo{BitCast(du2, lo_half).raw}; const VU hi{BitCast(du2, hi_half).raw}; return BitCast(d, InterleaveLower(lo, hi)); } // ------------------------------ ZeroExtendVector (Combine, IfThenElseZero) template HWY_API Vec256 ZeroExtendVector(Full256 d, Vec128 lo) { return IfThenElseZero(FirstN(d, 16 / sizeof(T)), Vec256{lo.raw}); } // ------------------------------ ConcatLowerLower // hiH,hiL loH,loL |-> hiL,loL (= lower halves) template HWY_API Vec256 ConcatLowerLower(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { return Vec256{wasm_i64x2_shuffle(lo.raw, hi.raw, 0, 2)}; } // ------------------------------ ConcatUpperUpper template HWY_API Vec256 ConcatUpperUpper(Full256 /* tag */, const Vec256 hi, const Vec256 lo) { return Vec256{wasm_i64x2_shuffle(lo.raw, hi.raw, 1, 3)}; } // ------------------------------ ConcatLowerUpper template HWY_API Vec256 ConcatLowerUpper(Full256 d, const Vec256 hi, const Vec256 lo) { return CombineShiftRightBytes<8>(d, hi, lo); } // ------------------------------ ConcatUpperLower template HWY_API Vec256 ConcatUpperLower(Full256 d, const Vec256 hi, const Vec256 lo) { return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); } // ------------------------------ ConcatOdd // 32-bit template HWY_API Vec256 ConcatOdd(Full256 /* tag */, Vec256 hi, Vec256 lo) { return Vec256{wasm_i32x4_shuffle(lo.raw, hi.raw, 1, 3, 5, 7)}; } // 64-bit full - no partial because we need at least two inputs to have // even/odd. template HWY_API Vec256 ConcatOdd(Full256 /* tag */, Vec256 hi, Vec256 lo) { return InterleaveUpper(Full256(), lo, hi); } // ------------------------------ ConcatEven (InterleaveLower) // 32-bit full template HWY_API Vec256 ConcatEven(Full256 /* tag */, Vec256 hi, Vec256 lo) { return Vec256{wasm_i32x4_shuffle(lo.raw, hi.raw, 0, 2, 4, 6)}; } // 64-bit full - no partial because we need at least two inputs to have // even/odd. 
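// For 64-bit lanes, the even lane of each input is simply its lower lane, so
// InterleaveLower(lo, hi) already produces ConcatEven.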
template HWY_API Vec256 ConcatEven(Full256 /* tag */, Vec256 hi, Vec256 lo) { return InterleaveLower(Full256(), lo, hi); } // ------------------------------ DupEven template HWY_API Vec256 DupEven(Vec256 v) { HWY_ASSERT(0); } // ------------------------------ DupOdd template HWY_API Vec256 DupOdd(Vec256 v) { HWY_ASSERT(0); } // ------------------------------ OddEven namespace detail { template HWY_INLINE Vec256 OddEven(hwy::SizeTag<1> /* tag */, const Vec256 a, const Vec256 b) { const Full256 d; const Repartition d8; alignas(32) constexpr uint8_t mask[16] = {0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0, 0xFF, 0}; return IfThenElse(MaskFromVec(BitCast(d, Load(d8, mask))), b, a); } template HWY_INLINE Vec256 OddEven(hwy::SizeTag<2> /* tag */, const Vec256 a, const Vec256 b) { return Vec256{wasm_i16x8_shuffle(a.raw, b.raw, 8, 1, 10, 3, 12, 5, 14, 7)}; } template HWY_INLINE Vec256 OddEven(hwy::SizeTag<4> /* tag */, const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } template HWY_INLINE Vec256 OddEven(hwy::SizeTag<8> /* tag */, const Vec256 a, const Vec256 b) { return Vec256{wasm_i64x2_shuffle(a.raw, b.raw, 2, 1)}; } } // namespace detail template HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { return detail::OddEven(hwy::SizeTag(), a, b); } HWY_API Vec256 OddEven(const Vec256 a, const Vec256 b) { return Vec256{wasm_i32x4_shuffle(a.raw, b.raw, 4, 1, 6, 3)}; } // ------------------------------ OddEvenBlocks template HWY_API Vec256 OddEvenBlocks(Vec256 /* odd */, Vec256 even) { return even; } // ------------------------------ SwapAdjacentBlocks template HWY_API Vec256 SwapAdjacentBlocks(Vec256 v) { return v; } // ------------------------------ ReverseBlocks template HWY_API Vec256 ReverseBlocks(Full256 /* tag */, const Vec256 v) { return v; } // ================================================== CONVERT // ------------------------------ Promotions (part w/ narrow lanes -> full) // Unsigned: zero-extend. HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_u16x8_extend_low_u8x16(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_u16x8_extend_low_u8x16(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{ wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(v.raw))}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_u32x4_extend_low_u16x8(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_u32x4_extend_low_u16x8(v.raw)}; } // Signed: replicate sign bit. HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_i16x8_extend_low_i8x16(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{ wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(v.raw))}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_i32x4_extend_low_i16x8(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { return Vec256{wasm_f64x2_convert_low_i32x4(v.raw)}; } HWY_API Vec256 PromoteTo(Full256 /* tag */, const Vec128 v) { const Full256 di32; const Full256 du32; const Full256 df32; // Expand to u32 so we can shift. 
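// float16 layout: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.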
const auto bits16 = PromoteTo(du32, Vec256{v.raw}); const auto sign = ShiftRight<15>(bits16); const auto biased_exp = ShiftRight<10>(bits16) & Set(du32, 0x1F); const auto mantissa = bits16 & Set(du32, 0x3FF); const auto subnormal = BitCast(du32, ConvertTo(df32, BitCast(di32, mantissa)) * Set(df32, 1.0f / 16384 / 1024)); const auto biased_exp32 = biased_exp + Set(du32, 127 - 15); const auto mantissa32 = ShiftLeft<23 - 10>(mantissa); const auto normal = ShiftLeft<23>(biased_exp32) | mantissa32; const auto bits32 = IfThenElse(biased_exp == Zero(du32), subnormal, normal); return BitCast(df32, ShiftLeft<31>(sign) | bits32); } HWY_API Vec256 PromoteTo(Full256 df32, const Vec128 v) { const Rebind du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); } // ------------------------------ Demotions (full -> part w/ narrow lanes) HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_u16x8_narrow_i32x4(v.raw, v.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_i16x8_narrow_i32x4(v.raw, v.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_u8x16_narrow_i16x8(v.raw, v.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec128{wasm_i8x16_narrow_i16x8(intermediate, intermediate)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { return Vec128{wasm_i8x16_narrow_i16x8(v.raw, v.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* di */, const Vec256 v) { return Vec128{wasm_i32x4_trunc_sat_f64x2_zero(v.raw)}; } HWY_API Vec128 DemoteTo(Full128 /* tag */, const Vec256 v) { const Full256 di; const Full256 du; const Full256 du16; const auto bits32 = BitCast(du, v); const auto sign = ShiftRight<31>(bits32); const auto biased_exp32 = ShiftRight<23>(bits32) & Set(du, 0xFF); const auto mantissa32 = bits32 & Set(du, 0x7FFFFF); const auto k15 = Set(di, 15); const auto exp = Min(BitCast(di, biased_exp32) - Set(di, 127), k15); const auto is_tiny = exp < Set(di, -24); const auto is_subnormal = exp < Set(di, -14); const auto biased_exp16 = BitCast(du, IfThenZeroElse(is_subnormal, exp + k15)); const auto sub_exp = BitCast(du, Set(di, -14) - exp); // [1, 11) const auto sub_m = (Set(du, 1) << (Set(du, 10) - sub_exp)) + (mantissa32 >> (Set(du, 13) + sub_exp)); const auto mantissa16 = IfThenElse(RebindMask(du, is_subnormal), sub_m, ShiftRight<13>(mantissa32)); // <1024 const auto sign16 = ShiftLeft<15>(sign); const auto normal16 = sign16 | ShiftLeft<10>(biased_exp16) | mantissa16; const auto bits16 = IfThenZeroElse(is_tiny, BitCast(di, normal16)); return Vec128{DemoteTo(du16, bits16).raw}; } HWY_API Vec128 DemoteTo(Full128 dbf16, const Vec256 v) { const Rebind di32; const Rebind du32; // for logical shift right const Rebind du16; const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v))); return BitCast(dbf16, DemoteTo(du16, bits_in_32)); } HWY_API Vec128 ReorderDemote2To(Full128 dbf16, Vec256 a, Vec256 b) { const RebindToUnsigned du16; const Repartition du32; const Vec256 b_in_even = ShiftRight<16>(BitCast(du32, b)); return BitCast(dbf16, OddEven(BitCast(du16, a), BitCast(du16, b_in_even))); } // For already range-limited input [0, 255]. 
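// Both narrowing steps below (i32x4 -> i16x8 -> u8x16) are saturating, so
// out-of-range inputs would be clamped rather than truncated modulo 256.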
HWY_API Vec256 U8FromU32(const Vec256 v) { const auto intermediate = wasm_i16x8_narrow_i32x4(v.raw, v.raw); return Vec256{wasm_u8x16_narrow_i16x8(intermediate, intermediate)}; } // ------------------------------ Convert i32 <=> f32 (Round) HWY_API Vec256 ConvertTo(Full256 /* tag */, const Vec256 v) { return Vec256{wasm_f32x4_convert_i32x4(v.raw)}; } // Truncates (rounds toward zero). HWY_API Vec256 ConvertTo(Full256 /* tag */, const Vec256 v) { return Vec256{wasm_i32x4_trunc_sat_f32x4(v.raw)}; } HWY_API Vec256 NearestInt(const Vec256 v) { return ConvertTo(Full256(), Round(v)); } // ================================================== MISC // ------------------------------ LoadMaskBits (TestBit) namespace detail { template HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { const RebindToUnsigned du; // Easier than Set(), which would require an >8-bit type, which would not // compile for T=uint8_t, N=1. const Vec256 vbits{wasm_i32x4_splat(static_cast(bits))}; // Replicate bytes 8x such that each byte contains the bit that governs it. alignas(32) constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}; const auto rep8 = TableLookupBytes(vbits, Load(du, kRep8)); alignas(32) constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); } template HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { const RebindToUnsigned du; alignas(32) constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } template HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { const RebindToUnsigned du; alignas(32) constexpr uint32_t kBit[8] = {1, 2, 4, 8}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } template HWY_INLINE Mask256 LoadMaskBits(Full256 d, uint64_t bits) { const RebindToUnsigned du; alignas(32) constexpr uint64_t kBit[8] = {1, 2}; return RebindMask(d, TestBit(Set(du, bits), Load(du, kBit))); } } // namespace detail // `p` points to at least 8 readable bytes, not all of which need be valid. template HWY_API Mask256 LoadMaskBits(Full256 d, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; CopyBytes<(N + 7) / 8>(bits, &mask_bits); return detail::LoadMaskBits(d, mask_bits); } // ------------------------------ Mask namespace detail { // Full template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, const Mask128 mask) { alignas(32) uint64_t lanes[2]; wasm_v128_store(lanes, mask.raw); constexpr uint64_t kMagic = 0x103070F1F3F80ULL; const uint64_t lo = ((lanes[0] * kMagic) >> 56); const uint64_t hi = ((lanes[1] * kMagic) >> 48) & 0xFF00; return (hi + lo); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, const Mask256 mask) { // Remove useless lower half of each u16 while preserving the sign bit. const __i16x8 zero = wasm_i16x8_splat(0); const Mask256 mask8{wasm_i8x16_narrow_i16x8(mask.raw, zero)}; return BitsFromMask(hwy::SizeTag<1>(), mask8); } template HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, const Mask256 mask) { const __i32x4 mask_i = static_cast<__i32x4>(mask.raw); const __i32x4 slice = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 sliced_mask = wasm_v128_and(mask_i, slice); alignas(32) uint32_t lanes[4]; wasm_v128_store(lanes, sliced_mask); return lanes[0] | lanes[1] | lanes[2] | lanes[3]; } // Returns 0xFF for bytes with index >= N, otherwise 0. constexpr __i8x16 BytesAbove() { return /**/ (N == 0) ? 
wasm_i32x4_make(-1, -1, -1, -1) : (N == 4) ? wasm_i32x4_make(0, -1, -1, -1) : (N == 8) ? wasm_i32x4_make(0, 0, -1, -1) : (N == 12) ? wasm_i32x4_make(0, 0, 0, -1) : (N == 16) ? wasm_i32x4_make(0, 0, 0, 0) : (N == 2) ? wasm_i16x8_make(0, -1, -1, -1, -1, -1, -1, -1) : (N == 6) ? wasm_i16x8_make(0, 0, 0, -1, -1, -1, -1, -1) : (N == 10) ? wasm_i16x8_make(0, 0, 0, 0, 0, -1, -1, -1) : (N == 14) ? wasm_i16x8_make(0, 0, 0, 0, 0, 0, 0, -1) : (N == 1) ? wasm_i8x16_make(0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 3) ? wasm_i8x16_make(0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 5) ? wasm_i8x16_make(0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 7) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1) : (N == 9) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1) : (N == 11) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1) : (N == 13) ? wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1) : wasm_i8x16_make(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1); } template HWY_INLINE uint64_t BitsFromMask(const Mask256 mask) { return BitsFromMask(hwy::SizeTag(), mask); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<1> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<2> tag, const Mask128 m) { return PopCount(BitsFromMask(tag, m)); } template HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { const __i32x4 var_shift = wasm_i32x4_make(1, 2, 4, 8); const __i32x4 shifted_bits = wasm_v128_and(m.raw, var_shift); alignas(32) uint64_t lanes[2]; wasm_v128_store(lanes, shifted_bits); return PopCount(lanes[0] | lanes[1]); } } // namespace detail // `p` points to at least 8 writable bytes. template HWY_API size_t StoreMaskBits(const Full256 /* tag */, const Mask256 mask, uint8_t* bits) { const uint64_t mask_bits = detail::BitsFromMask(mask); const size_t kNumBytes = (N + 7) / 8; CopyBytes(&mask_bits, bits); return kNumBytes; } template HWY_API size_t CountTrue(const Full256 /* tag */, const Mask128 m) { return detail::CountTrue(hwy::SizeTag(), m); } template HWY_API bool AllFalse(const Full256 d, const Mask128 m) { #if 0 // Casting followed by wasm_i8x16_any_true results in wasm error: // i32.eqz[0] expected type i32, found i8x16.popcnt of type s128 const auto v8 = BitCast(Full256(), VecFromMask(d, m)); return !wasm_i8x16_any_true(v8.raw); #else (void)d; return (wasm_i64x2_extract_lane(m.raw, 0) | wasm_i64x2_extract_lane(m.raw, 1)) == 0; #endif } // Full vector namespace detail { template HWY_INLINE bool AllTrue(hwy::SizeTag<1> /*tag*/, const Mask128 m) { return wasm_i8x16_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<2> /*tag*/, const Mask128 m) { return wasm_i16x8_all_true(m.raw); } template HWY_INLINE bool AllTrue(hwy::SizeTag<4> /*tag*/, const Mask128 m) { return wasm_i32x4_all_true(m.raw); } } // namespace detail template HWY_API bool AllTrue(const Full256 /* tag */, const Mask128 m) { return detail::AllTrue(hwy::SizeTag(), m); } template HWY_API intptr_t FindFirstTrue(const Full256 /* tag */, const Mask256 mask) { const uint64_t bits = detail::BitsFromMask(mask); return bits ? 
Num0BitsBelowLS1Bit_Nonzero64(bits) : -1; } // ------------------------------ Compress namespace detail { template HWY_INLINE Vec256 Idx16x8FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Full256 d; const Rebind d8; const Full256 du; // We need byte indices for TableLookupBytes (one vector's worth for each of // 256 combinations of 8 mask bits). Loading them directly requires 4 KiB. We // can instead store lane indices and convert to byte indices (2*lane + 0..1), // with the doubling baked into the table. Unpacking nibbles is likely more // costly than the higher cache footprint from storing bytes. alignas(32) constexpr uint8_t table[256 * 8] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 0, 0, 4, 6, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 2, 4, 6, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 0, 0, 2, 8, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 0, 0, 4, 8, 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 0, 0, 2, 4, 8, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 0, 0, 2, 6, 8, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 0, 0, 4, 6, 8, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 2, 4, 6, 8, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, 0, 0, 2, 10, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 0, 0, 4, 10, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 0, 0, 2, 4, 10, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 0, 0, 6, 10, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 0, 0, 2, 6, 10, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 0, 0, 4, 6, 10, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 2, 4, 6, 10, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 0, 0, 8, 10, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 0, 0, 2, 8, 10, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, 0, 0, 4, 8, 10, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 0, 0, 2, 4, 8, 10, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 0, 0, 6, 8, 10, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 0, 0, 2, 6, 8, 10, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 0, 0, 2, 4, 6, 8, 10, 0, 0, 12, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 0, 0, 2, 12, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 0, 0, 4, 12, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, 0, 0, 2, 4, 12, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 0, 0, 6, 12, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 0, 0, 2, 6, 12, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 0, 0, 4, 6, 12, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 0, 0, 2, 4, 6, 12, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 0, 0, 8, 12, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 0, 0, 2, 8, 12, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 4, 8, 12, 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 0, 0, 2, 4, 8, 12, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, 0, 0, 6, 8, 12, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 0, 0, 2, 6, 8, 12, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 0, 0, 4, 6, 8, 12, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 0, 0, 2, 4, 6, 8, 12, 0, 0, 10, 12, 0, 0, 0, 0, 0, 0, 0, 10, 12, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 0, 0, 2, 10, 12, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 0, 0, 4, 10, 12, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 0, 0, 2, 4, 10, 12, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 0, 0, 6, 10, 12, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, 0, 0, 2, 6, 10, 12, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 0, 0, 4, 6, 10, 12, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 0, 0, 2, 4, 6, 10, 12, 0, 0, 8, 10, 12, 0, 0, 0, 0, 0, 0, 8, 10, 12, 0, 0, 0, 0, 2, 8, 10, 12, 0, 0, 0, 0, 0, 2, 8, 10, 
12, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 0, 0, 4, 8, 10, 12, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 0, 0, 2, 4, 8, 10, 12, 0, 0, 6, 8, 10, 12, 0, 0, 0, 0, 0, 6, 8, 10, 12, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 0, 0, 2, 6, 8, 10, 12, 0, 0, 4, 6, 8, 10, 12, 0, 0, 0, 0, 4, 6, 8, 10, 12, 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 0, 2, 4, 6, 8, 10, 12, 0, 14, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 0, 0, 2, 14, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 0, 0, 4, 14, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 0, 0, 2, 4, 14, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 0, 0, 6, 14, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 0, 0, 2, 6, 14, 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 0, 0, 4, 6, 14, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, 0, 0, 2, 4, 6, 14, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 0, 0, 8, 14, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 0, 0, 2, 8, 14, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 0, 0, 4, 8, 14, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 0, 0, 2, 4, 8, 14, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 0, 0, 6, 8, 14, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 0, 0, 2, 6, 8, 14, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 0, 0, 4, 6, 8, 14, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 0, 0, 2, 4, 6, 8, 14, 0, 0, 10, 14, 0, 0, 0, 0, 0, 0, 0, 10, 14, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 0, 0, 2, 10, 14, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 0, 0, 4, 10, 14, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 0, 0, 2, 4, 10, 14, 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 0, 0, 6, 10, 14, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 0, 0, 2, 6, 10, 14, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 0, 0, 4, 6, 10, 14, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 0, 0, 2, 4, 6, 10, 14, 0, 0, 8, 10, 14, 0, 0, 0, 0, 0, 0, 8, 10, 14, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, 0, 0, 2, 8, 10, 14, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 0, 0, 4, 8, 10, 14, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 0, 0, 2, 4, 8, 10, 14, 0, 0, 6, 8, 10, 14, 0, 0, 0, 0, 0, 6, 8, 10, 14, 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 0, 0, 2, 6, 8, 10, 14, 0, 0, 4, 6, 8, 10, 14, 0, 0, 0, 0, 4, 6, 8, 10, 14, 0, 0, 2, 4, 6, 8, 10, 14, 0, 0, 0, 2, 4, 6, 8, 10, 14, 0, 12, 14, 0, 0, 0, 0, 0, 0, 0, 12, 14, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 0, 0, 2, 12, 14, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, 0, 0, 4, 12, 14, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 0, 0, 2, 4, 12, 14, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 0, 0, 6, 12, 14, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 0, 0, 2, 6, 12, 14, 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 0, 0, 4, 6, 12, 14, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 0, 0, 2, 4, 6, 12, 14, 0, 0, 8, 12, 14, 0, 0, 0, 0, 0, 0, 8, 12, 14, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 0, 0, 2, 8, 12, 14, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 0, 0, 4, 8, 12, 14, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, 0, 0, 2, 4, 8, 12, 14, 0, 0, 6, 8, 12, 14, 0, 0, 0, 0, 0, 6, 8, 12, 14, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 0, 0, 2, 6, 8, 12, 14, 0, 0, 4, 6, 8, 12, 14, 0, 0, 0, 0, 4, 6, 8, 12, 14, 0, 0, 2, 4, 6, 8, 12, 14, 0, 0, 0, 2, 4, 6, 8, 12, 14, 0, 10, 12, 14, 0, 0, 0, 0, 0, 0, 10, 12, 14, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 0, 0, 2, 10, 12, 14, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 0, 0, 4, 10, 12, 14, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 0, 0, 2, 4, 10, 12, 14, 0, 0, 6, 10, 12, 14, 0, 0, 0, 0, 0, 6, 10, 12, 14, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 0, 0, 2, 6, 10, 12, 14, 0, 0, 4, 6, 10, 12, 14, 0, 0, 0, 0, 4, 6, 10, 12, 14, 0, 0, 2, 4, 6, 10, 12, 14, 0, 0, 0, 2, 4, 6, 10, 12, 14, 0, 8, 10, 12, 14, 0, 0, 0, 0, 0, 8, 10, 12, 14, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 0, 0, 2, 8, 10, 12, 14, 0, 0, 4, 8, 10, 12, 14, 0, 0, 0, 0, 4, 8, 10, 12, 14, 0, 0, 2, 4, 8, 10, 12, 14, 0, 0, 0, 2, 4, 8, 10, 12, 14, 0, 6, 8, 10, 12, 14, 0, 0, 0, 0, 6, 8, 10, 12, 14, 0, 0, 2, 6, 8, 10, 12, 14, 0, 0, 0, 2, 6, 8, 10, 12, 14, 0, 4, 6, 8, 
10, 12, 14, 0, 0, 0, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 2, 4, 6, 8, 10, 12, 14}; const Vec256 byte_idx{Load(d8, table + mask_bits * 8).raw}; const Vec256 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec256 Idx32x4FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 16); // There are only 4 lanes, so we can afford to load the index vector directly. alignas(32) constexpr uint8_t packed_array[16 * 16] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 0, 1, 2, 3, // 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, // 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, // 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Full256 d; const Repartition d8; return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); } #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 template HWY_INLINE Vec256 Idx64x2FromBits(const uint64_t mask_bits) { HWY_DASSERT(mask_bits < 4); // There are only 2 lanes, so we can afford to load the index vector directly. alignas(32) constexpr uint8_t packed_array[4 * 16] = { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, // 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; const Full256 d; const Repartition d8; return BitCast(d, Load(d8, packed_array + 16 * mask_bits)); } #endif // Helper functions called by both Compress and CompressStore - avoids a // redundant BitsFromMask in the latter. 
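// Sketch of the lookup (illustrative): for 16-bit lanes with
// mask_bits = 0b101 (lanes 0 and 2 selected), Idx16x8FromBits loads the table
// row {0, 4, 0, ...}; ZipLower repeats each lane index into a byte pair and
// adding 0x0100 yields byte indices {0, 1, 4, 5, ...}, so TableLookupBytes
// packs lanes 0 and 2 to the front while the remaining lanes repeat lane 0.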
template HWY_INLINE Vec256 Compress(hwy::SizeTag<2> /*tag*/, Vec256 v, const uint64_t mask_bits) { const auto idx = detail::Idx16x8FromBits(mask_bits); using D = Full256; const RebindToSigned di; return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } template HWY_INLINE Vec256 Compress(hwy::SizeTag<4> /*tag*/, Vec256 v, const uint64_t mask_bits) { const auto idx = detail::Idx32x4FromBits(mask_bits); using D = Full256; const RebindToSigned di; return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 template HWY_INLINE Vec256 Compress(hwy::SizeTag<8> /*tag*/, Vec256 v, const uint64_t mask_bits) { const auto idx = detail::Idx64x2FromBits(mask_bits); using D = Full256; const RebindToSigned di; return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); } #endif } // namespace detail template HWY_API Vec256 Compress(Vec256 v, const Mask256 mask) { const uint64_t mask_bits = detail::BitsFromMask(mask); return detail::Compress(hwy::SizeTag(), v, mask_bits); } // ------------------------------ CompressBits template HWY_API Vec256 CompressBits(Vec256 v, const uint8_t* HWY_RESTRICT bits) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } return detail::Compress(hwy::SizeTag(), v, mask_bits); } // ------------------------------ CompressStore template HWY_API size_t CompressStore(Vec256 v, const Mask256 mask, Full256 d, T* HWY_RESTRICT unaligned) { const uint64_t mask_bits = detail::BitsFromMask(mask); const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ CompressBlendedStore template HWY_API size_t CompressBlendedStore(Vec256 v, Mask256 m, Full256 d, T* HWY_RESTRICT unaligned) { const RebindToUnsigned du; // so we can support fp16/bf16 using TU = TFromD; const uint64_t mask_bits = detail::BitsFromMask(m); const size_t count = PopCount(mask_bits); const Mask256 store_mask = FirstN(du, count); const Vec256 compressed = detail::Compress(hwy::SizeTag(), BitCast(du, v), mask_bits); const Vec256 prev = BitCast(du, LoadU(d, unaligned)); StoreU(BitCast(d, IfThenElse(store_mask, compressed, prev)), d, unaligned); return count; } // ------------------------------ CompressBitsStore template HWY_API size_t CompressBitsStore(Vec256 v, const uint8_t* HWY_RESTRICT bits, Full256 d, T* HWY_RESTRICT unaligned) { uint64_t mask_bits = 0; constexpr size_t kNumBytes = (N + 7) / 8; CopyBytes(bits, &mask_bits); if (N < 8) { mask_bits &= (1ull << N) - 1; } const auto c = detail::Compress(hwy::SizeTag(), v, mask_bits); StoreU(c, d, unaligned); return PopCount(mask_bits); } // ------------------------------ StoreInterleaved3 (CombineShiftRightBytes, // TableLookupBytes) HWY_API void StoreInterleaved3(const Vec256 a, const Vec256 b, const Vec256 c, Full256 d, uint8_t* HWY_RESTRICT unaligned) { const auto k5 = Set(d, 5); const auto k6 = Set(d, 6); // Shuffle (a,b,c) vector bytes to (MSB on left): r5, bgr[4:0]. // 0x80 so lanes to be filled from other vectors are 0 for blending. 
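// The first output block is r0 g0 b0 r1 g1 b1 ... r5, so each channel's bytes
// land in every third position: tbl_r0 places a[0..5] at offsets
// 0,3,6,9,12,15 and tbl_g0 places b[0..4] at offsets 1,4,7,10,13. Index 0x80
// is out of range for the byte swizzle and therefore yields 0, so the three
// per-channel lookups can be combined with OR.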
alignas(32) static constexpr uint8_t tbl_r0[16] = { 0, 0x80, 0x80, 1, 0x80, 0x80, 2, 0x80, 0x80, // 3, 0x80, 0x80, 4, 0x80, 0x80, 5}; alignas(32) static constexpr uint8_t tbl_g0[16] = { 0x80, 0, 0x80, 0x80, 1, 0x80, // 0x80, 2, 0x80, 0x80, 3, 0x80, 0x80, 4, 0x80, 0x80}; const auto shuf_r0 = Load(d, tbl_r0); const auto shuf_g0 = Load(d, tbl_g0); // cannot reuse r0 due to 5 in MSB const auto shuf_b0 = CombineShiftRightBytes<15>(d, shuf_g0, shuf_g0); const auto r0 = TableLookupBytes(a, shuf_r0); // 5..4..3..2..1..0 const auto g0 = TableLookupBytes(b, shuf_g0); // ..4..3..2..1..0. const auto b0 = TableLookupBytes(c, shuf_b0); // .4..3..2..1..0.. const auto int0 = r0 | g0 | b0; StoreU(int0, d, unaligned + 0 * 16); // Second vector: g10,r10, bgr[9:6], b5,g5 const auto shuf_r1 = shuf_b0 + k6; // .A..9..8..7..6.. const auto shuf_g1 = shuf_r0 + k5; // A..9..8..7..6..5 const auto shuf_b1 = shuf_g0 + k5; // ..9..8..7..6..5. const auto r1 = TableLookupBytes(a, shuf_r1); const auto g1 = TableLookupBytes(b, shuf_g1); const auto b1 = TableLookupBytes(c, shuf_b1); const auto int1 = r1 | g1 | b1; StoreU(int1, d, unaligned + 1 * 16); // Third vector: bgr[15:11], b10 const auto shuf_r2 = shuf_b1 + k6; // ..F..E..D..C..B. const auto shuf_g2 = shuf_r1 + k5; // .F..E..D..C..B.. const auto shuf_b2 = shuf_g1 + k5; // F..E..D..C..B..A const auto r2 = TableLookupBytes(a, shuf_r2); const auto g2 = TableLookupBytes(b, shuf_g2); const auto b2 = TableLookupBytes(c, shuf_b2); const auto int2 = r2 | g2 | b2; StoreU(int2, d, unaligned + 2 * 16); } // ------------------------------ StoreInterleaved4 HWY_API void StoreInterleaved4(const Vec256 v0, const Vec256 v1, const Vec256 v2, const Vec256 v3, Full256 d8, uint8_t* HWY_RESTRICT unaligned) { const RepartitionToWide d16; const RepartitionToWide d32; // let a,b,c,d denote v0..3. const auto ba0 = ZipLower(d16, v0, v1); // b7 a7 .. b0 a0 const auto dc0 = ZipLower(d16, v2, v3); // d7 c7 .. 
d0 c0 const auto ba8 = ZipUpper(d16, v0, v1); const auto dc8 = ZipUpper(d16, v2, v3); const auto dcba_0 = ZipLower(d32, ba0, dc0); // d..a3 d..a0 const auto dcba_4 = ZipUpper(d32, ba0, dc0); // d..a7 d..a4 const auto dcba_8 = ZipLower(d32, ba8, dc8); // d..aB d..a8 const auto dcba_C = ZipUpper(d32, ba8, dc8); // d..aF d..aC StoreU(BitCast(d8, dcba_0), d8, unaligned + 0 * 16); StoreU(BitCast(d8, dcba_4), d8, unaligned + 1 * 16); StoreU(BitCast(d8, dcba_8), d8, unaligned + 2 * 16); StoreU(BitCast(d8, dcba_C), d8, unaligned + 3 * 16); } // ------------------------------ MulEven/Odd (Load) HWY_INLINE Vec256 MulEven(const Vec256 a, const Vec256 b) { alignas(32) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 0)), static_cast(wasm_i64x2_extract_lane(b.raw, 0)), &mul[1]); return Load(Full256(), mul); } HWY_INLINE Vec256 MulOdd(const Vec256 a, const Vec256 b) { alignas(32) uint64_t mul[2]; mul[0] = Mul128(static_cast(wasm_i64x2_extract_lane(a.raw, 1)), static_cast(wasm_i64x2_extract_lane(b.raw, 1)), &mul[1]); return Load(Full256(), mul); } // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower) HWY_API Vec256 ReorderWidenMulAccumulate(Full256 df32, Vec256 a, Vec256 b, const Vec256 sum0, Vec256& sum1) { const Repartition du16; const RebindToUnsigned du32; const Vec256 zero = Zero(du16); const Vec256 a0 = ZipLower(du32, zero, BitCast(du16, a)); const Vec256 a1 = ZipUpper(du32, zero, BitCast(du16, a)); const Vec256 b0 = ZipLower(du32, zero, BitCast(du16, b)); const Vec256 b1 = ZipUpper(du32, zero, BitCast(du16, b)); sum1 = MulAdd(BitCast(df32, a1), BitCast(df32, b1), sum1); return MulAdd(BitCast(df32, a0), BitCast(df32, b0), sum0); } // ------------------------------ Reductions namespace detail { // u32/i32/f32: template HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<4> /* tag */, const Vec256 v3210) { const Vec256 v1032 = Shuffle1032(v3210); const Vec256 v31_20_31_20 = v3210 + v1032; const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); return v20_31_20_31 + v31_20_31_20; } template HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<4> /* tag */, const Vec256 v3210) { const Vec256 v1032 = Shuffle1032(v3210); const Vec256 v31_20_31_20 = Min(v3210, v1032); const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Min(v20_31_20_31, v31_20_31_20); } template HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<4> /* tag */, const Vec256 v3210) { const Vec256 v1032 = Shuffle1032(v3210); const Vec256 v31_20_31_20 = Max(v3210, v1032); const Vec256 v20_31_20_31 = Shuffle0321(v31_20_31_20); return Max(v20_31_20_31, v31_20_31_20); } // u64/i64/f64: template HWY_INLINE Vec256 SumOfLanes(hwy::SizeTag<8> /* tag */, const Vec256 v10) { const Vec256 v01 = Shuffle01(v10); return v10 + v01; } template HWY_INLINE Vec256 MinOfLanes(hwy::SizeTag<8> /* tag */, const Vec256 v10) { const Vec256 v01 = Shuffle01(v10); return Min(v10, v01); } template HWY_INLINE Vec256 MaxOfLanes(hwy::SizeTag<8> /* tag */, const Vec256 v10) { const Vec256 v01 = Shuffle01(v10); return Max(v10, v01); } // u16/i16 template HWY_API Vec256 MinOfLanes(hwy::SizeTag<2> /* tag */, Vec256 v) { const Repartition> d32; const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF)); const auto odd = ShiftRight<16>(BitCast(d32, v)); const auto min = MinOfLanes(d32, Min(even, odd)); // Also broadcast into odd lanes. 
  return BitCast(Full256<T>(), Or(min, ShiftLeft<16>(min)));
}

template <typename T>
HWY_API Vec256<T> MaxOfLanes(hwy::SizeTag<2> /* tag */, Vec256<T> v) {
  const Repartition<int32_t, Full256<T>> d32;
  const auto even = And(BitCast(d32, v), Set(d32, 0xFFFF));
  const auto odd = ShiftRight<16>(BitCast(d32, v));
  const auto max = MaxOfLanes(d32, Max(even, odd));
  // Also broadcast into odd lanes.
  return BitCast(Full256<T>(), Or(max, ShiftLeft<16>(max)));
}

}  // namespace detail

// Supported for u/i/f 32/64. Returns the same value in each lane.
template <typename T>
HWY_API Vec256<T> SumOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
  return detail::SumOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T>
HWY_API Vec256<T> MinOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
  return detail::MinOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}
template <typename T>
HWY_API Vec256<T> MaxOfLanes(Full256<T> /* tag */, const Vec256<T> v) {
  return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), v);
}

// ------------------------------ Lt128

// Note: not yet implemented for 256-bit vectors; the bodies below are
// placeholders and must not be called.
template <typename T>
HWY_INLINE Mask256<T> Lt128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Min128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

template <typename T>
HWY_INLINE Vec256<T> Max128(Full256<T> d, Vec256<T> a, Vec256<T> b) {}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();
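// ----------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this header): how user code
// typically reaches ops such as CompressStore via Highway's dispatch. The
// function name, its namespaces and the omitted remainder handling are
// hypothetical.
//
//   #include "hwy/highway.h"
//
//   namespace project {
//   namespace HWY_NAMESPACE {
//
//   // Copies negative elements of `in` to `out`; returns how many were kept.
//   size_t CopyNegative(const float* HWY_RESTRICT in, size_t count,
//                       float* HWY_RESTRICT out) {
//     const hwy::HWY_NAMESPACE::ScalableTag<float> d;
//     size_t written = 0;
//     // Assumes count is a multiple of Lanes(d); real code also handles the
//     // remainder.
//     for (size_t i = 0; i < count; i += Lanes(d)) {
//       const auto v = LoadU(d, in + i);
//       written += CompressStore(v, v < Zero(d), d, out + written);
//     }
//     return written;
//   }
//
//   }  // namespace HWY_NAMESPACE
//   }  // namespace project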