// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "tests/reduction_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/tests/test_util-inl.h" HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { struct TestSumOfLanes { template >() || ((HWY_MAX_LANES_D(D) & 1) != 0)>* = nullptr> HWY_NOINLINE void SignedEvenLengthVectorTests(D /*d*/) { // do nothing } template >() && ((HWY_MAX_LANES_D(D) & 1) == 0)>* = nullptr> HWY_NOINLINE void SignedEvenLengthVectorTests(D d) { using T = TFromD; const size_t lanes = Lanes(d); #if HWY_HAVE_SCALABLE // On platforms that use scalable vectors, it is possible for Lanes(d) to be // odd but for MaxLanes(d) to be even if Lanes(d) < 2 is true. if (lanes < 2) return; #endif const T pairs = ConvertScalarTo(lanes / 2); // Lanes are the repeated sequence -2, 1, [...]; each pair sums to -1, // so the eventual total is just -(N/2). Vec v = InterleaveLower(Set(d, ConvertScalarTo(-2)), Set(d, ConvertScalarTo(1))); HWY_ASSERT_VEC_EQ(d, Set(d, ConvertScalarTo(-pairs)), SumOfLanes(d, v)); HWY_ASSERT_EQ(ConvertScalarTo(-pairs), ReduceSum(d, v)); // Similar test with a positive result. v = InterleaveLower(Set(d, ConvertScalarTo(-2)), Set(d, ConvertScalarTo(4))); HWY_ASSERT_VEC_EQ(d, Set(d, ConvertScalarTo(pairs * ConvertScalarTo(2))), SumOfLanes(d, v)); HWY_ASSERT_EQ(ConvertScalarTo(pairs * ConvertScalarTo(2)), ReduceSum(d, v)); } template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); HWY_ASSERT(in_lanes); // Lane i = bit i, higher lanes 0 T sum = ConvertScalarTo(0); // Avoid setting sign bit and cap so that f16 precision is not exceeded. constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 9); for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(i < kBits ? 1ull << i : 0ull); sum = AddWithWraparound(sum, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, sum), SumOfLanes(d, Load(d, in_lanes.get()))); HWY_ASSERT_EQ(T(sum), ReduceSum(d, Load(d, in_lanes.get()))); // Lane i = i (iota) to include upper lanes sum = ConvertScalarTo(0); for (size_t i = 0; i < N; ++i) { sum = AddWithWraparound(sum, ConvertScalarTo(i)); } HWY_ASSERT_VEC_EQ(d, Set(d, sum), SumOfLanes(d, Iota(d, 0))); HWY_ASSERT_EQ(T(sum), ReduceSum(d, Iota(d, 0))); // Run more tests only for signed types with even vector lengths. Some of // this code may not otherwise compile, so put it in a templated function. SignedEvenLengthVectorTests(d); } }; HWY_NOINLINE void TestAllSumOfLanes() { ForAllTypes(ForPartialVectors()); } struct TestMinOfLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); // Lane i = bit i, higher lanes = 2 (not the minimum) T min = HighestValue(); // Avoid setting sign bit and cap at double precision constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(i < kBits ? 1ull << i : 2ull); min = HWY_MIN(min, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); // Lane i = N - i to include upper lanes min = HighestValue(); for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(N - i); // no 8-bit T so no wraparound min = HWY_MIN(min, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); // Bug #910: also check negative values min = HighestValue(); const T input_copy[] = {ConvertScalarTo(-1), ConvertScalarTo(-2), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; size_t i = 0; for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { in_lanes[i] = input_copy[i]; min = HWY_MIN(min, input_copy[i]); } // Pad with neutral element to full vector (so we can load) for (; i < N; ++i) { in_lanes[i] = min; } HWY_ASSERT_VEC_EQ(d, Set(d, min), MinOfLanes(d, Load(d, in_lanes.get()))); HWY_ASSERT_EQ(min, ReduceMin(d, Load(d, in_lanes.get()))); } }; struct TestMaxOfLanes { template HWY_NOINLINE void operator()(T /*unused*/, D d) { const size_t N = Lanes(d); auto in_lanes = AllocateAligned(N); T max = LowestValue(); // Avoid setting sign bit and cap at double precision constexpr size_t kBits = HWY_MIN(sizeof(T) * 8 - 1, 51); for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(i < kBits ? 1ull << i : 0ull); max = HWY_MAX(max, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); // Lane i = i to include upper lanes max = LowestValue(); for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(i); // no 8-bit T so no wraparound max = HWY_MAX(max, in_lanes[i]); } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); // Bug #910: also check negative values max = LowestValue(); const T input_copy[] = {ConvertScalarTo(-1), ConvertScalarTo(-2), 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; size_t i = 0; for (; i < HWY_MIN(N, sizeof(input_copy) / sizeof(T)); ++i) { in_lanes[i] = input_copy[i]; max = HWY_MAX(max, in_lanes[i]); } // Pad with neutral element to full vector (so we can load) for (; i < N; ++i) { in_lanes[i] = max; } HWY_ASSERT_VEC_EQ(d, Set(d, max), MaxOfLanes(d, Load(d, in_lanes.get()))); HWY_ASSERT_EQ(max, ReduceMax(d, Load(d, in_lanes.get()))); } }; HWY_NOINLINE void TestAllMinMaxOfLanes() { ForAllTypes(ForPartialVectors()); ForAllTypes(ForPartialVectors()); } struct TestSumsOf2 { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; using TW = MakeWide; const size_t N = Lanes(d); if (N < 2) return; const RepartitionToWide dw; auto in_lanes = AllocateAligned(N); auto sum_lanes = AllocateAligned(N / 2); for (size_t rep = 0; rep < 100; ++rep) { for (size_t i = 0; i < N; ++i) { in_lanes[i] = RandomFiniteValue(&rng); } for (size_t idx_sum = 0; idx_sum < N / 2; ++idx_sum) { TW sum = static_cast(static_cast(in_lanes[idx_sum * 2]) + static_cast(in_lanes[idx_sum * 2 + 1])); sum_lanes[idx_sum] = sum; } const Vec in = Load(d, in_lanes.get()); HWY_ASSERT_VEC_EQ(dw, sum_lanes.get(), SumsOf2(in)); } } }; HWY_NOINLINE void TestAllSumsOf2() { ForGEVectors<16, TestSumsOf2>()(int8_t()); ForGEVectors<16, TestSumsOf2>()(uint8_t()); ForGEVectors<32, TestSumsOf2>()(int16_t()); ForGEVectors<32, TestSumsOf2>()(uint16_t()); #if HWY_HAVE_FLOAT16 ForGEVectors<32, TestSumsOf2>()(float16_t()); #endif #if HWY_HAVE_INTEGER64 ForGEVectors<64, TestSumsOf2>()(int32_t()); ForGEVectors<64, TestSumsOf2>()(uint32_t()); #endif #if HWY_HAVE_FLOAT64 ForGEVectors<64, TestSumsOf2>()(float()); #endif } struct TestSumsOf4 { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; using TW = MakeWide; using TW2 = MakeWide; const size_t N = Lanes(d); if (N < 4) return; const Repartition dw2; auto in_lanes = AllocateAligned(N); auto sum_lanes = AllocateAligned(N / 4); for (size_t rep = 0; rep < 100; ++rep) { for (size_t i = 0; i < N; ++i) { in_lanes[i] = RandomFiniteValue(&rng); } for (size_t idx_sum = 0; idx_sum < N / 4; ++idx_sum) { TW2 sum = static_cast(static_cast(in_lanes[idx_sum * 4]) + static_cast(in_lanes[idx_sum * 4 + 1]) + static_cast(in_lanes[idx_sum * 4 + 2]) + static_cast(in_lanes[idx_sum * 4 + 3])); sum_lanes[idx_sum] = sum; } const Vec in = Load(d, in_lanes.get()); HWY_ASSERT_VEC_EQ(dw2, sum_lanes.get(), SumsOf4(in)); } } }; HWY_NOINLINE void TestAllSumsOf4() { ForGEVectors<32, TestSumsOf4>()(int8_t()); ForGEVectors<32, TestSumsOf4>()(uint8_t()); #if HWY_HAVE_INTEGER64 ForGEVectors<64, TestSumsOf4>()(int16_t()); ForGEVectors<64, TestSumsOf4>()(uint16_t()); #endif } struct TestSumsOf8 { template HWY_NOINLINE void operator()(T /*unused*/, D d) { RandomState rng; using TW = MakeWide>>; const size_t N = Lanes(d); if (N < 8) return; const Repartition d64; auto in_lanes = AllocateAligned(N); auto sum_lanes = AllocateAligned(N / 8); for (size_t rep = 0; rep < 100; ++rep) { for (size_t i = 0; i < N; ++i) { in_lanes[i] = ConvertScalarTo(Random64(&rng) & 0xFF); } for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) { TW sum = 0; for (size_t i = 0; i < 8; ++i) { sum += in_lanes[idx_sum * 8 + i]; } sum_lanes[idx_sum] = sum; } const Vec in = Load(d, in_lanes.get()); HWY_ASSERT_VEC_EQ(d64, sum_lanes.get(), SumsOf8(in)); } } }; HWY_NOINLINE void TestAllSumsOf8() { ForGEVectors<64, TestSumsOf8>()(int8_t()); ForGEVectors<64, TestSumsOf8>()(uint8_t()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace hwy { HWY_BEFORE_TEST(HwyReductionTest); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumOfLanes); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllMinMaxOfLanes); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf2); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf4); HWY_EXPORT_AND_TEST_P(HwyReductionTest, TestAllSumsOf8); } // namespace hwy #endif