// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stddef.h>
#include <stdint.h>

// Ensure incompatibilities with Windows macros (e.g. #define StoreFence) are
// detected. Must come before Highway headers.
#include "hwy/base.h"
#include "hwy/tests/test_util.h"
#if defined(_WIN32) || defined(_WIN64)
#include <windows.h>
#endif

#undef HWY_TARGET_INCLUDE
#define HWY_TARGET_INCLUDE "tests/memory_test.cc"
#include "hwy/cache_control.h"
#include "hwy/foreach_target.h"  // IWYU pragma: keep
#include "hwy/highway.h"
#include "hwy/tests/test_util-inl.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

struct TestLoadStore {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const VFromD<D> hi = IotaForSpecial(d, 1 + N);
    const VFromD<D> lo = IotaForSpecial(d, 1);
    auto lanes = AllocateAligned<T>(2 * N);
    auto lanes2 = AllocateAligned<T>(2 * N);
    auto lanes3 = AllocateAligned<T>(N);
    HWY_ASSERT(lanes && lanes2 && lanes3);
    Store(hi, d, &lanes[N]);
    Store(lo, d, &lanes[0]);

    // Aligned load
    const VFromD<D> lo2 = Load(d, &lanes[0]);
    HWY_ASSERT_VEC_EQ(d, lo2, lo);

    // Aligned store
    Store(lo2, d, &lanes2[0]);
    Store(hi, d, &lanes2[N]);
    for (size_t i = 0; i < 2 * N; ++i) {
      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
    }

    // Unaligned load
    const VFromD<D> vu = LoadU(d, &lanes[1]);
    Store(vu, d, lanes3.get());
    for (size_t i = 0; i < N; ++i) {
      HWY_ASSERT_EQ(i + 2, lanes3[i]);
    }

    // Unaligned store
    StoreU(lo2, d, &lanes2[N / 2]);
    size_t i = 0;
    for (; i < N / 2; ++i) {
      HWY_ASSERT_EQ(lanes[i], lanes2[i]);
    }
    for (; i < 3 * N / 2; ++i) {
      HWY_ASSERT_EQ(i - N / 2 + 1, lanes2[i]);
    }
    // Subsequent values remain unchanged.
    for (; i < 2 * N; ++i) {
      HWY_ASSERT_EQ(i + 1, lanes2[i]);
    }
  }
};

HWY_NOINLINE void TestAllLoadStore() {
  ForAllTypesAndSpecial(ForPartialVectors<TestLoadStore>());
}
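// Caller-side summary of the contract exercised above (a sketch, not part of
// the test; `df` and `buf` are hypothetical): Load/Store require the pointer
// to be aligned to the vector size, which AllocateAligned guarantees, whereas
// LoadU/StoreU accept arbitrary addresses at a possible performance cost.
//
//   const ScalableTag<float> df;
//   auto buf = AllocateAligned<float>(Lanes(df) + 1);
//   const auto v = Load(df, buf.get());  // aligned pointer required
//   StoreU(v, df, buf.get() + 1);        // any address is fine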
struct TestSafeCopyN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    const auto v = Iota(d, 1);
    auto from = AllocateAligned<T>(N + 2);
    auto to = AllocateAligned<T>(N + 2);
    HWY_ASSERT(from && to);
    Store(v, d, from.get());

    // 0: nothing changes
    to[0] = ConvertScalarTo<T>(0);
    SafeCopyN(0, d, from.get(), to.get());
    HWY_ASSERT_EQ(T(), to[0]);

    // 1: only first changes
    to[1] = ConvertScalarTo<T>(0);
    SafeCopyN(1, d, from.get(), to.get());
    HWY_ASSERT_EQ(ConvertScalarTo<T>(1), to[0]);
    HWY_ASSERT_EQ(T(), to[1]);

    // N-1: last does not change
    to[N - 1] = ConvertScalarTo<T>(0);
    SafeCopyN(N - 1, d, from.get(), to.get());
    HWY_ASSERT_EQ(T(), to[N - 1]);
    // Also check preceding lanes
    to[N - 1] = ConvertScalarTo<T>(N);
    HWY_ASSERT_VEC_EQ(d, to.get(), v);

    // N: all change
    to[N] = ConvertScalarTo<T>(0);
    SafeCopyN(N, d, from.get(), to.get());
    HWY_ASSERT_VEC_EQ(d, to.get(), v);
    HWY_ASSERT_EQ(T(), to[N]);

    // N+1: subsequent lane does not change if using masked store
    to[N + 1] = ConvertScalarTo<T>(0);
    SafeCopyN(N + 1, d, from.get(), to.get());
    HWY_ASSERT_VEC_EQ(d, to.get(), v);
#if !HWY_MEM_OPS_MIGHT_FAULT
    HWY_ASSERT_EQ(T(), to[N + 1]);
#endif
  }
};

HWY_NOINLINE void TestAllSafeCopyN() {
  ForAllTypes(ForPartialVectors<TestSafeCopyN>());
}

struct TestLoadDup128 {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    // Scalar does not define LoadDup128.
#if HWY_TARGET != HWY_SCALAR || HWY_IDE
    constexpr size_t N128 = 16 / sizeof(T);
    alignas(16) T lanes[N128];
    for (size_t i = 0; i < N128; ++i) {
      lanes[i] = ConvertScalarTo<T>(1 + i);
    }

    const size_t N = Lanes(d);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(expected);
    for (size_t i = 0; i < N; ++i) {
      expected[i] = ConvertScalarTo<T>(i % N128 + 1);
    }
    HWY_ASSERT_VEC_EQ(d, expected.get(), LoadDup128(d, lanes));
#else
    (void)d;
#endif
  }
};

HWY_NOINLINE void TestAllLoadDup128() {
  ForAllTypes(ForGEVectors<128, TestLoadDup128>());
}

struct TestStream {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const Vec<D> v = Iota(d, 1);
    const size_t affected_bytes =
        (Lanes(d) * sizeof(T) + HWY_STREAM_MULTIPLE - 1) &
        ~size_t(HWY_STREAM_MULTIPLE - 1);
    const size_t affected_lanes = affected_bytes / sizeof(T);
    auto out = AllocateAligned<T>(2 * affected_lanes);
    HWY_ASSERT(out);
    ZeroBytes(out.get(), 2 * affected_lanes * sizeof(T));

    Stream(v, d, out.get());
    FlushStream();
    const Vec<D> actual = Load(d, out.get());
    HWY_ASSERT_VEC_EQ(d, v, actual);
    // Ensure Stream didn't modify more memory than expected
    for (size_t i = affected_lanes; i < 2 * affected_lanes; ++i) {
      HWY_ASSERT_EQ(ConvertScalarTo<T>(0), out[i]);
    }
  }
};

HWY_NOINLINE void TestAllStream() {
  const ForPartialVectors<TestStream> test;
  // No u8,u16.
  test(uint32_t());
  test(uint64_t());
  // No i8,i16.
  test(int32_t());
  test(int64_t());
  ForFloatTypes(test);
}
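// Note on the Stream contract tested above (a sketch; `aligned_out` is
// hypothetical): Stream is a non-temporal store that may write in units of
// HWY_STREAM_MULTIPLE bytes, hence the rounded-up buffer size above, and its
// results are only guaranteed visible to subsequent loads after
// FlushStream(), which is why TestStream fences before calling Load.
//
//   Stream(v, d, aligned_out);  // bypasses caches; may round up the size
//   FlushStream();              // fence before anyone reads aligned_out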
// Assumes little-endian byte order!
struct TestScatter {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using Offset = MakeSigned<T>;
    const Rebind<Offset, D> d_offsets;

    const size_t N = Lanes(d);
    const size_t range = 4 * N;                  // number of items to scatter
    const size_t max_bytes = range * sizeof(T);  // upper bound on offset

    RandomState rng;

    auto values = AllocateAligned<T>(range);
    auto offsets = AllocateAligned<Offset>(N);  // or indices
    // Scatter into these regions, ensure vector results match scalar
    auto expected = AllocateAligned<T>(range);
    auto actual = AllocateAligned<T>(range);
    HWY_ASSERT(values && offsets && expected && actual);

    // Data to be scattered
    uint8_t* bytes = reinterpret_cast<uint8_t*>(values.get());
    for (size_t i = 0; i < max_bytes; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }
    const Vec<D> data = Load(d, values.get());

    for (size_t rep = 0; rep < 100; ++rep) {
      // Byte offsets
      ZeroBytes(expected.get(), range * sizeof(T));
      ZeroBytes(actual.get(), range * sizeof(T));
      for (size_t i = 0; i < N; ++i) {
        // Must be aligned
        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
        CopyBytes<sizeof(T)>(
            values.get() + i,
            reinterpret_cast<uint8_t*>(expected.get()) + offsets[i]);
      }
      const auto voffsets = Load(d_offsets, offsets.get());
      ScatterOffset(data, d, actual.get(), voffsets);
      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
        Print(d, "Data", data);
        Print(d_offsets, "Offsets", voffsets);
        HWY_ASSERT(false);
      }

      // Indices
      ZeroBytes(expected.get(), range * sizeof(T));
      ZeroBytes(actual.get(), range * sizeof(T));
      for (size_t i = 0; i < N; ++i) {
        offsets[i] = static_cast<Offset>(Random32(&rng) % range);
        CopyBytes<sizeof(T)>(values.get() + i, &expected[size_t(offsets[i])]);
      }
      const auto vindices = Load(d_offsets, offsets.get());
      ScatterIndex(data, d, actual.get(), vindices);
      if (!BytesEqual(expected.get(), actual.get(), max_bytes)) {
        Print(d, "Data", data);
        Print(d_offsets, "Indices", vindices);
        HWY_ASSERT(false);
      }
    }
  }
};

HWY_NOINLINE void TestAllScatter() {
  ForUIF3264(ForPartialVectors<TestScatter>());
}

struct TestGather {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    using Offset = MakeSigned<T>;

    const size_t N = Lanes(d);
    const size_t range = 4 * N;                  // number of items to gather
    const size_t max_bytes = range * sizeof(T);  // upper bound on offset

    RandomState rng;
    auto values = AllocateAligned<T>(range);
    auto expected = AllocateAligned<T>(N);
    auto offsets = AllocateAligned<Offset>(N);
    auto indices = AllocateAligned<Offset>(N);
    HWY_ASSERT(values && expected && offsets && indices);

    // Data to be gathered from
    uint8_t* bytes = reinterpret_cast<uint8_t*>(values.get());
    for (size_t i = 0; i < max_bytes; ++i) {
      bytes[i] = static_cast<uint8_t>(Random32(&rng) & 0xFF);
    }

    for (size_t rep = 0; rep < 100; ++rep) {
      // Offsets
      for (size_t i = 0; i < N; ++i) {
        // Must be aligned
        offsets[i] = static_cast<Offset>((Random32(&rng) % range) * sizeof(T));
        CopyBytes<sizeof(T)>(bytes + offsets[i], &expected[i]);
      }

      const Rebind<Offset, D> d_offset;
      const T* base = values.get();
      auto actual = GatherOffset(d, base, Load(d_offset, offsets.get()));
      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);

      // Indices
      for (size_t i = 0; i < N; ++i) {
        indices[i] =
            static_cast<Offset>(Random32(&rng) % (max_bytes / sizeof(T)));
        CopyBytes<sizeof(T)>(base + indices[i], &expected[i]);
      }
      actual = GatherIndex(d, base, Load(d_offset, indices.get()));
      HWY_ASSERT_VEC_EQ(d, expected.get(), actual);
    }
  }
};

HWY_NOINLINE void TestAllGather() {
  ForUIF3264(ForPartialVectors<TestGather>());
}

HWY_NOINLINE void TestAllCache() {
  LoadFence();
  FlushStream();
  int test = 0;
  Prefetch(&test);
  FlushCacheline(&test);
  Pause();
}
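// The ops in TestAllCache are hints, so the test can only verify that they do
// not crash. For context, a typical (hypothetical) spin-wait in which Pause
// is used; the atomic flag is illustrative and not part of this file:
//
//   std::atomic<bool> ready{false};  // requires <atomic>
//   while (!ready.load(std::memory_order_acquire)) {
//     Pause();  // reduces power and SMT contention while spinning
//   }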
namespace detail {

template <int kNo, class T, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_INLINE T GenerateOtherValue(size_t val) {
  const T conv_val = static_cast<T>(val);
  return (conv_val == static_cast<T>(kNo)) ? static_cast<T>(-17) : conv_val;
}

template <int kNo, class T, HWY_IF_FLOAT3264(T)>
HWY_INLINE T GenerateOtherValue(size_t val) {
  const T flt_val = static_cast<T>(val);
  return (flt_val == static_cast<T>(kNo) ? static_cast<T>(0.5426808228865735)
                                         : flt_val);
}

template <int kNo, class T, HWY_IF_BF16(T)>
HWY_INLINE T GenerateOtherValue(size_t val) {
  return BF16FromF32(GenerateOtherValue<kNo, float>(val));
}

template <int kNo, class T, HWY_IF_F16(T)>
HWY_INLINE T GenerateOtherValue(size_t val) {
  return F16FromF32(GenerateOtherValue<kNo, float>(val));
}

}  // namespace detail

struct TestLoadN {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    constexpr size_t kMaxLanesPerBlock = 16 / sizeof(T);
    const size_t lpb = HWY_MIN(N, kMaxLanesPerBlock);
    HWY_ASSERT(lpb >= 1);
    HWY_ASSERT(N <= (static_cast<size_t>(~size_t(0)) / 4));

    const size_t load_buf_len = (3 * N) + 4;
    auto load_buf = AllocateAligned<T>(load_buf_len);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(load_buf && expected);

    for (size_t i = 0; i < load_buf_len; i++) {
      load_buf[i] = detail::GenerateOtherValue<0, T>(i + 1);
    }

    ZeroBytes(expected.get(), N * sizeof(T));
    // Without Load(), the vector type for special floats might not match.
    HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), LoadN(d, load_buf.get(), 0));

    for (size_t i = 0; i <= lpb; i++) {
      CopyBytes(load_buf.get(), expected.get(), i * sizeof(T));
      const VFromD<D> actual_1 = LoadN(d, load_buf.get(), i);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_1);

      CopyBytes(load_buf.get() + 3, expected.get(), i * sizeof(T));
      const VFromD<D> actual_2 = LoadN(d, load_buf.get() + 3, i);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_2);
    }

    const size_t lplb = HWY_MAX(N / 4, lpb);
    for (size_t i = HWY_MAX(lpb * 2, lplb); i <= N * 2; i += lplb) {
      const size_t max_num_of_lanes_to_load = i + (11 & (lpb - 1));
      const size_t expected_num_of_lanes_loaded =
          HWY_MIN(max_num_of_lanes_to_load, N);

      CopyBytes(load_buf.get(), expected.get(),
                expected_num_of_lanes_loaded * sizeof(T));
      const VFromD<D> actual_1 =
          LoadN(d, load_buf.get(), max_num_of_lanes_to_load);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_1);

      CopyBytes(load_buf.get() + 3, expected.get(),
                expected_num_of_lanes_loaded * sizeof(T));
      const VFromD<D> actual_2 =
          LoadN(d, load_buf.get() + 3, max_num_of_lanes_to_load);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_2);
    }

    load_buf[0] = detail::GenerateOtherValue<0, T>(0);
    CopyBytes(load_buf.get(), expected.get(), N * sizeof(T));
    HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), LoadN(d, load_buf.get(), N));
  }
};

HWY_NOINLINE void TestAllLoadN() {
  ForAllTypesAndSpecial(ForPartialVectors<TestLoadN>());
}
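// Summary of the LoadN contract verified above (a sketch; `in`, `out` and
// `count` are hypothetical): LoadN reads HWY_MIN(num, Lanes(d)) lanes,
// zero-fills the remaining lanes, and never touches memory past the num-th
// element, which makes it the standard way to handle loop remainders:
//
//   const ScalableTag<float> df;
//   for (size_t i = 0; i < count; i += Lanes(df)) {
//     const auto v = LoadN(df, in + i, count - i);  // tail lanes become 0
//     StoreN(v, df, out + i, count - i);  // writes at most count - i lanes
//   }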
struct TestLoadNOr {
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    constexpr int kNo = 2;

    const size_t N = Lanes(d);
    constexpr size_t kMaxLanesPerBlock = 16 / sizeof(T);
    const size_t lpb = HWY_MIN(N, kMaxLanesPerBlock);
    HWY_ASSERT(lpb >= 1);
    HWY_ASSERT(N <= (static_cast<size_t>(~size_t(0)) / 4));

    const size_t load_buf_len = (3 * N) + 4;
    auto load_buf = AllocateAligned<T>(load_buf_len);
    auto expected = AllocateAligned<T>(N);
    HWY_ASSERT(load_buf && expected);

    for (size_t i = 0; i < load_buf_len; i++) {
      load_buf[i] = detail::GenerateOtherValue<kNo, T>(i + 1);
    }

    const Vec<D> no = Set(d, ConvertScalarTo<T>(kNo));
    for (size_t i = 0; i < N; ++i) {
      expected[i] = ConvertScalarTo<T>(kNo);
    }

    // Without Load(), the vector type for special floats might not match.
    HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()),
                      LoadNOr(no, d, load_buf.get(), 0));

    for (size_t i = 0; i <= lpb; i++) {
      CopyBytes(load_buf.get(), expected.get(), i * sizeof(T));
      const VFromD<D> actual_1 = LoadNOr(no, d, load_buf.get(), i);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_1);

      CopyBytes(load_buf.get() + 3, expected.get(), i * sizeof(T));
      const VFromD<D> actual_2 = LoadNOr(no, d, load_buf.get() + 3, i);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_2);
    }

    const size_t lplb = HWY_MAX(N / 4, lpb);
    for (size_t i = HWY_MAX(lpb * 2, lplb); i <= N * 2; i += lplb) {
      const size_t max_num_of_lanes_to_load = i + (11 & (lpb - 1));
      const size_t expected_num_of_lanes_loaded =
          HWY_MIN(max_num_of_lanes_to_load, N);

      CopyBytes(load_buf.get(), expected.get(),
                expected_num_of_lanes_loaded * sizeof(T));
      const VFromD<D> actual_1 =
          LoadNOr(no, d, load_buf.get(), max_num_of_lanes_to_load);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_1);

      CopyBytes(load_buf.get() + 3, expected.get(),
                expected_num_of_lanes_loaded * sizeof(T));
      const VFromD<D> actual_2 =
          LoadNOr(no, d, load_buf.get() + 3, max_num_of_lanes_to_load);
      HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()), actual_2);
    }

    load_buf[0] = detail::GenerateOtherValue<kNo, T>(kNo);
    CopyBytes(load_buf.get(), expected.get(), N * sizeof(T));
    HWY_ASSERT_VEC_EQ(d, Load(d, expected.get()),
                      LoadNOr(no, d, load_buf.get(), N));
  }
};

HWY_NOINLINE void TestAllLoadNOr() {
  ForAllTypesAndSpecial(ForPartialVectors<TestLoadNOr>());
}
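// LoadNOr, verified above, differs from LoadN only in filling the remaining
// lanes with `no` instead of zero. A sketch (hypothetical `df`, `in`, `i`,
// `count`, `v_min`) of why that matters: the filler can be the identity of a
// subsequent reduction, e.g. for a running minimum:
//
//   const auto no = Set(df, HighestValue<float>());     // identity for Min
//   const auto v = LoadNOr(no, df, in + i, count - i);  // tail lanes = no
//   v_min = Min(v_min, v);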
class TestStoreN {
 private:
  template <class T, HWY_IF_SIGNED(T)>
  static HWY_INLINE T NegativeFillValue() {
    return LowestValue<T>();
  }
  template <class T, HWY_IF_UNSIGNED(T)>
  static HWY_INLINE T NegativeFillValue() {
    return static_cast<T>(-1);
  }

 public:
  template <class T, class D>
  HWY_NOINLINE void operator()(T /*unused*/, D d) {
    const size_t N = Lanes(d);
    constexpr size_t kMaxLanesPerBlock = 16 / sizeof(T);
    const size_t lpb = HWY_MIN(N, kMaxLanesPerBlock);
    HWY_ASSERT(lpb >= 1);

    const size_t full_dvec_N = Lanes(DFromV<Vec<D>>());
    HWY_ASSERT(N <= full_dvec_N);
    HWY_ASSERT(full_dvec_N <= (static_cast<size_t>(~size_t(0)) / 8));

    const size_t buf_offset = HWY_MAX(kMaxLanesPerBlock, full_dvec_N);
    const size_t buf_size = buf_offset + 3 * full_dvec_N + 4;
    auto expected = AllocateAligned<T>(buf_size);
    auto actual = AllocateAligned<T>(buf_size);
    HWY_ASSERT(expected && actual);

    const T neg_fill_val = NegativeFillValue<T>();
    for (size_t i = 0; i < buf_size; i++) {
      expected[i] = neg_fill_val;
      actual[i] = neg_fill_val;
    }

    const Vec<D> v_neg_fill_val = Set(d, neg_fill_val);

    for (size_t i = 0; i <= lpb; i++) {
      const Vec<D> v = IotaForSpecial(d, i + 1);
      const Vec<D> v_expected = IfThenElse(FirstN(d, i), v, v_neg_fill_val);

      Store(v_expected, d, expected.get() + buf_offset);
      Store(v_neg_fill_val, d, actual.get() + buf_offset);
      StoreN(v, d, actual.get() + buf_offset, i);
      HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), buf_size);

      StoreU(v_expected, d, expected.get() + buf_offset + 3);
      StoreU(v_neg_fill_val, d, actual.get() + buf_offset + 3);
      StoreN(v, d, actual.get() + buf_offset + 3, i);
      HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), buf_size);
    }

    const size_t lplb = HWY_MAX(N / 4, lpb);
    for (size_t i = HWY_MAX(lpb * 2, lplb); i <= N * 2; i += lplb) {
      const size_t max_num_of_lanes_to_store = i + (11 & (lpb - 1));
      const size_t expected_num_of_lanes_written =
          HWY_MIN(max_num_of_lanes_to_store, N);

      const Vec<D> v = IotaForSpecial(d, max_num_of_lanes_to_store + 1);
      const Vec<D> v_expected = IfThenElse(
          FirstN(d, expected_num_of_lanes_written), v, v_neg_fill_val);

      Store(v_expected, d, expected.get() + buf_offset);
      Store(v_neg_fill_val, d, actual.get() + buf_offset);
      StoreN(v, d, actual.get() + buf_offset, max_num_of_lanes_to_store);
      HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), buf_size);

      StoreU(v_expected, d, expected.get() + buf_offset + 3);
      StoreU(v_neg_fill_val, d, actual.get() + buf_offset + 3);
      StoreN(v, d, actual.get() + buf_offset + 3, max_num_of_lanes_to_store);
      HWY_ASSERT_ARRAY_EQ(expected.get(), actual.get(), buf_size);
    }
  }
};

HWY_NOINLINE void TestAllStoreN() {
  ForAllTypesAndSpecial(ForPartialVectors<TestStoreN>());
}

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#if HWY_ONCE

namespace hwy {
HWY_BEFORE_TEST(HwyMemoryTest);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadStore);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllSafeCopyN);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadDup128);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStream);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllScatter);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllGather);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllCache);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadN);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllLoadNOr);
HWY_EXPORT_AND_TEST_P(HwyMemoryTest, TestAllStoreN);
}  // namespace hwy

#endif