// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/blockwise_test.cc"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

template <typename D, int kLane>
struct TestBroadcastR {
  HWY_NOINLINE void operator()() const {
    using T = typename D::T;
    const D d;
    const size_t N = Lanes(d);
    if (kLane >= N) return;
    auto in_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(in_lanes && expected);
    ZeroBytes(in_lanes.get(), N * sizeof(T));
    const size_t blockN = HWY_MIN(N * sizeof(T), 16) / sizeof(T);
    // Need to set within each 128-bit block
    for (size_t block = 0; block < N; block += blockN) {
      in_lanes[block + kLane] = ConvertScalarTo<T>(block + 1);
    }
    PreventElision(in_lanes[0]);  // workaround for f16x1 failure
    const auto in = Load(d, in_lanes.get());
    for (size_t block = 0; block < N; block += blockN) {
      for (size_t i = 0; i < blockN; ++i) {
        expected[block + i] = ConvertScalarTo<T>(block + 1);
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), Broadcast<kLane>(in));

    // Recurse down to lane 0; the kLane == -1 specialization terminates.
    TestBroadcastR<D, kLane - 1>()();
  }
};

template <class D>
struct TestBroadcastR<D, -1> {
  void operator()() const {}
};

struct TestBroadcast {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Broadcast operates per 128-bit block, so the highest testable lane
    // index is HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1.
    TestBroadcastR<D, HWY_MIN(MaxLanes(d), 16 / sizeof(T)) - 1>()();
  }
};

HWY_NOINLINE void TestAllBroadcast() {
  ForAllTypes(ForPartialVectors<TestBroadcast>());
}

// Selects the table vector type for TableLookupBytes: either the same type as
// the index vector (kFull == false) or a full native vector (kFull == true).
template <bool kFull>
struct ChooseTableSize {
  template <typename T, typename DIdx>
  using type = DIdx;
};
template <>
struct ChooseTableSize<true> {
  template <typename T, typename DIdx>
  using type = ScalableTag<T>;
};

template <bool kFull>
struct TestTableLookupBytes {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET != HWY_SCALAR
    RandomState rng;

    const typename ChooseTableSize<kFull>::template type<T, D> d_tbl;
    const Repartition<uint8_t, decltype(d_tbl)> d_tbl8;
    const Repartition<uint8_t, D> d8;
    const size_t N = Lanes(d);
    const size_t NT8 = Lanes(d_tbl8);
    const size_t N8 = Lanes(d8);
    auto in_bytes = AllocateAligned<uint8_t>(NT8);
    auto indices = AllocateAligned<T>(N8);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(in_bytes && indices && expected);

    // Random input bytes
    for (size_t i = 0; i < NT8; ++i) {
      in_bytes[i] = Random32(&rng) & 0xFF;
    }
    const auto in = BitCast(d_tbl, Load(d_tbl8, in_bytes.get()));

    // Enough test data; for larger vectors, upper lanes will be zero.
    const uint8_t index_bytes_source[64] = {
        // Same index as source, multiple outputs from same input,
        // unused input (9), ascending/descending and nonconsecutive neighbors.
        0,  2,  1, 2, 15, 12, 13, 14, 6,  7,  8,  5,  4,  3,  10, 11,
        11, 10, 3, 4, 5,  8,  7,  6,  14, 13, 12, 15, 2,  1,  2,  0,
        4,  3,  2, 2, 5,  6,  7,  7,  15, 15, 15, 15, 15, 15, 0,  1};
    const size_t max_index = HWY_MIN(NT8, 16) - 1;
    uint8_t* index_bytes = reinterpret_cast<uint8_t*>(indices.get());
    for (size_t i = 0; i < N8; ++i) {
      index_bytes[i] = (i < 64) ? index_bytes_source[i] : 0;
      // Avoid asan error for partial vectors.
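      // (Clamping to max_index keeps the scalar reference lookup below within
      // the NT8-byte in_bytes buffer, even when partial vectors make NT8 < 16.)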
      index_bytes[i] = static_cast<uint8_t>(HWY_MIN(index_bytes[i], max_index));
    }
    const Vec<D> indices_v = Load(d, indices.get());
    uint8_t* expected_bytes = reinterpret_cast<uint8_t*>(expected.get());
    for (size_t block = 0; block < N8; block += 16) {
      for (size_t i = 0; i < 16 && (block + i) < N8; ++i) {
        const uint8_t index = index_bytes[block + i];
        HWY_ASSERT(index <= max_index);
        // Note that block + index may exceed NT8 on RVV, which is fine because
        // the operation uses the larger of the table and index vector size.
        HWY_ASSERT(block + index < HWY_MAX(N8, NT8));
        // For large vectors, the lane index may wrap around due to block,
        // also wrap around after 8-bit overflow.
        expected_bytes[block + i] =
            in_bytes[(block + index) % HWY_MIN(NT8, 256)];
      }
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytes(in, indices_v));

    // Individually test zeroing each byte position.
    for (size_t i = 0; i < N8; ++i) {
      const uint8_t prev_expected = expected_bytes[i];
      const uint8_t prev_index = index_bytes[i];
      expected_bytes[i] = 0;

      // Any index with the MSB set selects zero.
      const int idx = 0x80 + (static_cast<int>(Random32(&rng) & 7) << 4);
      HWY_ASSERT(0x80 <= idx && idx < 256);
      index_bytes[i] = static_cast<uint8_t>(idx);

      const Vec<D> indices_v = Load(d, indices.get());
      HWY_ASSERT_VEC_EQ(d, expected.get(), TableLookupBytesOr0(in, indices_v));

      // Restore the original index and expected byte for the next iteration.
      expected_bytes[i] = prev_expected;
      index_bytes[i] = prev_index;
    }
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllTableLookupBytesSame() {
  // Partial index, same-sized table.
  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<false>>());
}

HWY_NOINLINE void TestAllTableLookupBytesMixed() {
  // Partial index, full-size table.
  ForIntegerTypes(ForPartialVectors<TestTableLookupBytes<true>>());
}

struct TestInterleaveLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using TU = MakeUnsigned<T>;
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(even_lanes && odd_lanes && expected);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = ConvertScalarTo<T>(2 * i + 0);
      odd_lanes[i] = ConvertScalarTo<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
    for (size_t i = 0; i < Lanes(d); ++i) {
      const size_t block = i / blockN;
      const size_t index = (i % blockN) + block * 2 * blockN;
      expected[i] = ConvertScalarTo<T>(index & LimitsMax<TU>());
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(even, odd));
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveLower(d, even, odd));
  }
};

struct TestInterleaveUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    if (N == 1) return;
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(even_lanes && odd_lanes && expected);
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = ConvertScalarTo<T>(2 * i + 0);
      odd_lanes[i] = ConvertScalarTo<T>(2 * i + 1);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const size_t blockN = HWY_MIN(16 / sizeof(T), N);
    for (size_t i = 0; i < Lanes(d); ++i) {
      const size_t block = i / blockN;
      expected[i] =
          ConvertScalarTo<T>((i % blockN) + block * 2 * blockN + blockN);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), InterleaveUpper(d, even, odd));
  }
};

HWY_NOINLINE void TestAllInterleave() {
  // Not DemoteVectors because this cannot be supported by HWY_SCALAR.
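  // Example of the semantics being verified: with four u32 lanes per 128-bit
  // block, even = {0,2,4,6} and odd = {1,3,5,7} give
  // InterleaveLower == {0,1,2,3} and InterleaveUpper == {4,5,6,7}.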
  ForAllTypes(ForShrinkableVectors<TestInterleaveLower>());
  ForAllTypes(ForShrinkableVectors<TestInterleaveUpper>());
}

struct TestZipLower {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    // At least 2 lanes for HWY_SCALAR
    auto zip_lanes = AllocateAligned<T>(HWY_MAX(N, 2));
    HWY_ASSERT(even_lanes && odd_lanes && zip_lanes);
    const T kMaxT = LimitsMax<T>();
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = ConvertScalarTo<T>((2 * i + 0) & kMaxT);
      odd_lanes[i] = ConvertScalarTo<T>((2 * i + 1) & kMaxT);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const Repartition<WideT, D> dw;
#if HWY_TARGET == HWY_SCALAR
    // Safely handle big-endian
    const auto expected = Set(dw, static_cast<WideT>(1ULL << (sizeof(T) * 8)));
#else
    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
    for (size_t i = 0; i < N; i += 2) {
      const size_t base = (i / blockN) * blockN;
      const size_t mod = i % blockN;
      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
      // Without this, `expected` is incorrect with Clang and 512-bit SVE: the
      // first byte of the second block is 0x10 instead of 0x20 as it should
      // be.
      PreventElision(zip_lanes[i + 0]);
    }
    const Vec<decltype(dw)> expected = BitCast(dw, Load(d, zip_lanes.get()));
#endif  // HWY_TARGET == HWY_SCALAR
    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(even, odd));
    HWY_ASSERT_VEC_EQ(dw, expected, ZipLower(dw, even, odd));
  }
};

// HWY_SCALAR has a single lane, which ForShrinkableVectors would skip, but
// ZipLower still works there (see the scalar branch above).
#if HWY_TARGET == HWY_SCALAR
template <class Test>
using ForZipToWideVectors = ForPartialVectors<Test>;
#else
template <class Test>
using ForZipToWideVectors = ForShrinkableVectors<Test>;
#endif

HWY_NOINLINE void TestAllZipLower() {
  const ForZipToWideVectors<TestZipLower> lower_unsigned;
  lower_unsigned(uint8_t());
  lower_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
  lower_unsigned(uint32_t());  // generates u64
#endif

  const ForZipToWideVectors<TestZipLower> lower_signed;
  lower_signed(int8_t());
  lower_signed(int16_t());
#if HWY_HAVE_INTEGER64
  lower_signed(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}

// Remove this test (so it does not show as having run) if the only target is
// HWY_SCALAR, which does not support this op.
#if HWY_TARGETS != HWY_SCALAR

struct TestZipUpper {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
#if HWY_TARGET == HWY_SCALAR
    (void)d;
#else
    using WideT = MakeWide<T>;
    static_assert(sizeof(T) * 2 == sizeof(WideT), "Must be double-width");
    static_assert(IsSigned<T>() == IsSigned<WideT>(), "Must have same sign");
    const size_t N = Lanes(d);
    if (N < 16 / sizeof(T)) return;
    auto even_lanes = AllocateAligned<T>(N);
    auto odd_lanes = AllocateAligned<T>(N);
    auto zip_lanes = AllocateAligned<T>(N);
    HWY_ASSERT(even_lanes && odd_lanes && zip_lanes);
    const T kMaxT = LimitsMax<T>();
    for (size_t i = 0; i < N; ++i) {
      even_lanes[i] = ConvertScalarTo<T>((2 * i + 0) & kMaxT);
      odd_lanes[i] = ConvertScalarTo<T>((2 * i + 1) & kMaxT);
    }
    const auto even = Load(d, even_lanes.get());
    const auto odd = Load(d, odd_lanes.get());

    const size_t blockN = HWY_MIN(size_t(16) / sizeof(T), N);
    for (size_t i = 0; i < N; i += 2) {
      const size_t base = (i / blockN) * blockN + blockN / 2;
      const size_t mod = i % blockN;
      zip_lanes[i + 0] = even_lanes[mod / 2 + base];
      zip_lanes[i + 1] = odd_lanes[mod / 2 + base];
      // See comment at previous call to PreventElision.
      PreventElision(zip_lanes[i + 0]);
    }
    const Repartition<WideT, D> dw;
    const Vec<decltype(dw)> expected = BitCast(dw, Load(d, zip_lanes.get()));
    HWY_ASSERT_VEC_EQ(dw, expected, ZipUpper(dw, even, odd));
#endif  // HWY_TARGET == HWY_SCALAR
  }
};

HWY_NOINLINE void TestAllZipUpper() {
  const ForShrinkableVectors<TestZipUpper> upper_unsigned;
  upper_unsigned(uint8_t());
  upper_unsigned(uint16_t());
#if HWY_HAVE_INTEGER64
  upper_unsigned(uint32_t());  // generates u64
#endif

  const ForShrinkableVectors<TestZipUpper> upper_signed;
  upper_signed(int8_t());
  upper_signed(int16_t());
#if HWY_HAVE_INTEGER64
  upper_signed(int32_t());  // generates i64
#endif

  // No float - concatenating f32 does not result in a f64
}

#endif  // HWY_TARGETS != HWY_SCALAR

class TestSpecialShuffle32 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes32(d, Shuffle2301(v), 2, 3, 0, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle1032(v), 1, 0, 3, 2, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0321(v), 0, 3, 2, 1, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle2103(v), 2, 1, 0, 3, __FILE__, __LINE__);
    VerifyLanes32(d, Shuffle0123(v), 0, 1, 2, 3, __FILE__, __LINE__);
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
  template <class D>
  HWY_INLINE void VerifyLanes32(D d, VecArg<Vec<D>> actual, const size_t i3,
                                const size_t i2, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 4) return;
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(expected);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 3] = ConvertScalarTo<T>(block + i3);
      expected[block + 2] = ConvertScalarTo<T>(block + i2);
      expected[block + 1] = ConvertScalarTo<T>(block + i1);
      expected[block + 0] = ConvertScalarTo<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

class TestSpecialShuffle64 {
 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const auto v = Iota(d, 0);
    VerifyLanes64(d, Shuffle01(v), 0, 1, __FILE__, __LINE__);
  }

 private:
  // HWY_INLINE works around a Clang SVE compiler bug where all but the first
  // 128 bits (the NEON register) of actual are zero.
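  // VerifyLanes64 mirrors VerifyLanes32: i1/i0 are the expected source lane
  // offsets within each block; e.g. Shuffle01 swaps the two 64-bit lanes,
  // hence the (0, 1) arguments above.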
  template <class D>
  HWY_INLINE void VerifyLanes64(D d, VecArg<Vec<D>> actual, const size_t i1,
                                const size_t i0, const char* filename,
                                const int line) {
    using T = TFromD<D>;
    constexpr size_t kBlockN = 16 / sizeof(T);
    const size_t N = Lanes(d);
    if (N < 2) return;
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(expected);
    for (size_t block = 0; block < N; block += kBlockN) {
      expected[block + 1] = ConvertScalarTo<T>(block + i1);
      expected[block + 0] = ConvertScalarTo<T>(block + i0);
    }
    AssertVecEqual(d, expected.get(), actual, filename, line);
  }
};

HWY_NOINLINE void TestAllSpecialShuffles() {
  const ForGEVectors<128, TestSpecialShuffle32> test32;
  test32(uint32_t());
  test32(int32_t());
  test32(float());

#if HWY_HAVE_INTEGER64
  const ForGEVectors<128, TestSpecialShuffle64> test64;
  test64(uint64_t());
  test64(int64_t());
#endif

#if HWY_HAVE_FLOAT64
  const ForGEVectors<128, TestSpecialShuffle64> test_d;
  test_d(double());
#endif
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyBlockwiseTest);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllBroadcast);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesSame);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllTableLookupBytesMixed);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllInterleave);
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipLower);
#if HWY_TARGETS != HWY_SCALAR
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllZipUpper);
#endif
HWY_EXPORT_AND_TEST_P(HwyBlockwiseTest, TestAllSpecialShuffles);
}  // namespace hwy

#endif