// Copyright 2020 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include // FLT_MAX #include // std::abs // clang-format off #undef HWY_TARGET_INCLUDE #define HWY_TARGET_INCLUDE "hwy/contrib/math/math_test.cc" #include "hwy/foreach_target.h" // IWYU pragma: keep #include "hwy/highway.h" #include "hwy/contrib/math/math-inl.h" #include "hwy/tests/test_util-inl.h" // clang-format on HWY_BEFORE_NAMESPACE(); namespace hwy { namespace HWY_NAMESPACE { // We have had test failures caused by excess precision due to keeping // intermediate results in 80-bit x87 registers. One such failure mode is that // Log1p computes a 1.0 which is not exactly equal to 1.0f, causing is_pole to // incorrectly evaluate to false. #undef HWY_MATH_TEST_EXCESS_PRECISION #if HWY_ARCH_X86_32 && HWY_COMPILER_GCC_ACTUAL && \ (HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128) // GCC 13+: because CMAKE_CXX_EXTENSIONS is OFF, we build with -std= and hence // also -fexcess-precision=standard, so there is no problem. See #1708 and // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323. #if HWY_COMPILER_GCC_ACTUAL >= 1300 #define HWY_MATH_TEST_EXCESS_PRECISION 0 #else // HWY_COMPILER_GCC_ACTUAL < 1300 // The build system must enable SSE2, e.g. via HWY_CMAKE_SSE2 - see // https://stackoverflow.com/questions/20869904/c-handling-of-excess-precision . #if defined(__SSE2__) // correct flag given, no problem #define HWY_MATH_TEST_EXCESS_PRECISION 0 #else #define HWY_MATH_TEST_EXCESS_PRECISION 1 #pragma message( \ "Skipping scalar math_test on 32-bit x86 GCC <13 without HWY_CMAKE_SSE2") #endif // defined(__SSE2__) #endif // HWY_COMPILER_GCC_ACTUAL #else // not (x86-32, GCC, scalar target): running math_test normally #define HWY_MATH_TEST_EXCESS_PRECISION 0 #endif // HWY_ARCH_X86_32 etc template HWY_NOINLINE void TestMath(const char* name, T (*fx1)(T), Vec (*fxN)(D, VecArg>), D d, T min, T max, uint64_t max_error_ulp) { if (HWY_MATH_TEST_EXCESS_PRECISION) { static bool once = true; if (once) { once = false; fprintf(stderr, "Skipping math_test due to GCC issue with excess precision.\n"); } return; } using UintT = MakeUnsigned; const UintT min_bits = BitCastScalar(min); const UintT max_bits = BitCastScalar(max); // If min is negative and max is positive, the range needs to be broken into // two pieces, [+0, max] and [-0, min], otherwise [min, max]. int range_count = 1; UintT ranges[2][2] = {{min_bits, max_bits}, {0, 0}}; if ((min < 0.0) && (max > 0.0)) { ranges[0][0] = BitCastScalar(ConvertScalarTo(+0.0)); ranges[0][1] = max_bits; ranges[1][0] = BitCastScalar(ConvertScalarTo(-0.0)); ranges[1][1] = min_bits; range_count = 2; } uint64_t max_ulp = 0; // Emulation is slower, so cannot afford as many. constexpr UintT kSamplesPerRange = static_cast(AdjustedReps(4000)); for (int range_index = 0; range_index < range_count; ++range_index) { const UintT start = ranges[range_index][0]; const UintT stop = ranges[range_index][1]; const UintT step = HWY_MAX(1, ((stop - start) / kSamplesPerRange)); for (UintT value_bits = start; value_bits <= stop; value_bits += step) { // For reasons unknown, the HWY_MAX is necessary on RVV, otherwise // value_bits can be less than start, and thus possibly NaN. const T value = BitCastScalar(HWY_MIN(HWY_MAX(start, value_bits), stop)); const T actual = GetLane(fxN(d, Set(d, value))); const T expected = fx1(value); // Skip small inputs and outputs on armv7, it flushes subnormals to zero. #if HWY_TARGET <= HWY_NEON_WITHOUT_AES && HWY_ARCH_ARM_V7 if ((std::abs(value) < 1e-37f) || (std::abs(expected) < 1e-37f)) { continue; } #endif const auto ulp = hwy::detail::ComputeUlpDelta(actual, expected); max_ulp = HWY_MAX(max_ulp, ulp); if (ulp > max_error_ulp) { fprintf(stderr, "%s: %s(%f) expected %f actual %f ulp %g max ulp %u\n", hwy::TypeName(T(), Lanes(d)).c_str(), name, value, expected, actual, static_cast(ulp), static_cast(max_error_ulp)); } } } fprintf(stderr, "%s: %s max_ulp %g\n", hwy::TypeName(T(), Lanes(d)).c_str(), name, static_cast(max_ulp)); HWY_ASSERT(max_ulp <= max_error_ulp); } #define DEFINE_MATH_TEST_FUNC(NAME) \ HWY_NOINLINE void TestAll##NAME() { \ ForFloat3264Types(ForPartialVectors()); \ } #undef DEFINE_MATH_TEST #define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \ F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR) \ struct Test##NAME { \ template \ HWY_NOINLINE void operator()(T, D d) { \ if (sizeof(T) == 4) { \ TestMath(HWY_STR(NAME), F32x1, F32xN, d, F32_MIN, F32_MAX, \ F32_ERROR); \ } else { \ TestMath(HWY_STR(NAME), F64x1, F64xN, d, \ static_cast(F64_MIN), static_cast(F64_MAX), \ F64_ERROR); \ } \ } \ }; \ DEFINE_MATH_TEST_FUNC(NAME) // Floating point values closest to but less than 1.0. Avoid variables with // static initializers inside HWY_BEFORE_NAMESPACE/HWY_AFTER_NAMESPACE to // ensure target-specific code does not leak into startup code. float kNearOneF() { return BitCastScalar(0x3F7FFFFF); } double kNearOneD() { return BitCastScalar(0x3FEFFFFFFFFFFFFFULL); } // The discrepancy is unacceptably large for MSYS2 (less accurate libm?), so // only increase the error tolerance there. constexpr uint64_t Cos64ULP() { #if defined(__MINGW32__) return 23; #else return 3; #endif } constexpr uint64_t ACosh32ULP() { #if defined(__MINGW32__) return 8; #else return 3; #endif } template static Vec SinCosSin(const D d, VecArg> x) { Vec s, c; SinCos(d, x, s, c); return s; } template static Vec SinCosCos(const D d, VecArg> x) { Vec s, c; SinCos(d, x, s, c); return c; } // on targets without FMA the result is less inaccurate constexpr uint64_t SinCosSin32ULP() { #if !(HWY_NATIVE_FMA) return 256; #else return 3; #endif } constexpr uint64_t SinCosCos32ULP() { #if !(HWY_NATIVE_FMA) return 64; #else return 3; #endif } // clang-format off DEFINE_MATH_TEST(Acos, std::acos, CallAcos, -1.0f, +1.0f, 3, // NEON is 3 instead of 2 std::acos, CallAcos, -1.0, +1.0, 2) DEFINE_MATH_TEST(Acosh, std::acosh, CallAcosh, +1.0f, +FLT_MAX, ACosh32ULP(), std::acosh, CallAcosh, +1.0, +DBL_MAX, 3) DEFINE_MATH_TEST(Asin, std::asin, CallAsin, -1.0f, +1.0f, 4, // 4 ulp on Armv7, not 2 std::asin, CallAsin, -1.0, +1.0, 2) DEFINE_MATH_TEST(Asinh, std::asinh, CallAsinh, -FLT_MAX, +FLT_MAX, 3, std::asinh, CallAsinh, -DBL_MAX, +DBL_MAX, 3) DEFINE_MATH_TEST(Atan, std::atan, CallAtan, -FLT_MAX, +FLT_MAX, 3, std::atan, CallAtan, -DBL_MAX, +DBL_MAX, 3) // NEON has ULP 4 instead of 3 DEFINE_MATH_TEST(Atanh, std::atanh, CallAtanh, -kNearOneF(), +kNearOneF(), 4, std::atanh, CallAtanh, -kNearOneD(), +kNearOneD(), 3) DEFINE_MATH_TEST(Cos, std::cos, CallCos, -39000.0f, +39000.0f, 3, std::cos, CallCos, -39000.0, +39000.0, Cos64ULP()) DEFINE_MATH_TEST(Exp, std::exp, CallExp, -FLT_MAX, +104.0f, 1, std::exp, CallExp, -DBL_MAX, +104.0, 1) DEFINE_MATH_TEST(Expm1, std::expm1, CallExpm1, -FLT_MAX, +104.0f, 4, std::expm1, CallExpm1, -DBL_MAX, +104.0, 4) DEFINE_MATH_TEST(Log, std::log, CallLog, +FLT_MIN, +FLT_MAX, 1, std::log, CallLog, +DBL_MIN, +DBL_MAX, 1) DEFINE_MATH_TEST(Log10, std::log10, CallLog10, +FLT_MIN, +FLT_MAX, 2, std::log10, CallLog10, +DBL_MIN, +DBL_MAX, 2) DEFINE_MATH_TEST(Log1p, std::log1p, CallLog1p, +0.0f, +1e37f, 3, // NEON is 3 instead of 2 std::log1p, CallLog1p, +0.0, +DBL_MAX, 2) DEFINE_MATH_TEST(Log2, std::log2, CallLog2, +FLT_MIN, +FLT_MAX, 2, std::log2, CallLog2, +DBL_MIN, +DBL_MAX, 2) DEFINE_MATH_TEST(Sin, std::sin, CallSin, -39000.0f, +39000.0f, 3, std::sin, CallSin, -39000.0, +39000.0, 4) // MSYS is 4 instead of 3 DEFINE_MATH_TEST(Sinh, std::sinh, CallSinh, -80.0f, +80.0f, 4, std::sinh, CallSinh, -709.0, +709.0, 4) DEFINE_MATH_TEST(Tanh, std::tanh, CallTanh, -FLT_MAX, +FLT_MAX, 4, std::tanh, CallTanh, -DBL_MAX, +DBL_MAX, 4) DEFINE_MATH_TEST(SinCosSin, std::sin, SinCosSin, -39000.0f, +39000.0f, SinCosSin32ULP(), std::sin, SinCosSin, -39000.0, +39000.0, 1) DEFINE_MATH_TEST(SinCosCos, std::cos, SinCosCos, -39000.0f, +39000.0f, SinCosCos32ULP(), std::cos, SinCosCos, -39000.0, +39000.0, 1) // clang-format on template void Atan2TestCases(T /*unused*/, D d, size_t& padded, AlignedFreeUniquePtr& out_y, AlignedFreeUniquePtr& out_x, AlignedFreeUniquePtr& out_expected) { struct YX { T y; T x; T expected; }; const T pos = ConvertScalarTo(1E5); const T neg = ConvertScalarTo(-1E7); const T p0 = ConvertScalarTo(0); // -0 is not enough to get an actual negative zero. const T n0 = ConvertScalarTo(-0.0); const T p1 = ConvertScalarTo(1); const T n1 = ConvertScalarTo(-1); const T p2 = ConvertScalarTo(2); const T n2 = ConvertScalarTo(-2); const T inf = GetLane(Inf(d)); const T nan = GetLane(NaN(d)); const T pi = ConvertScalarTo(3.141592653589793238); const YX test_cases[] = { // 45 degree steps: {p0, p1, p0}, // E {n1, p1, -pi / 4}, // SE {n1, p0, -pi / 2}, // S {n1, n1, -3 * pi / 4}, // SW {p0, n1, pi}, // W {p1, n1, 3 * pi / 4}, // NW {p1, p0, pi / 2}, // N {p1, p1, pi / 4}, // NE // y = ±0, x < 0 or -0 {p0, n1, pi}, {n0, n2, -pi}, // y = ±0, x > 0 or +0 {p0, p2, p0}, {n0, p2, n0}, // y = ±∞, x finite {inf, p2, pi / 2}, {-inf, p2, -pi / 2}, // y = ±∞, x = -∞ {inf, -inf, 3 * pi / 4}, {-inf, -inf, -3 * pi / 4}, // y = ±∞, x = +∞ {inf, inf, pi / 4}, {-inf, inf, -pi / 4}, // y < 0, x = ±0 {n2, p0, -pi / 2}, {n1, n0, -pi / 2}, // y > 0, x = ±0 {pos, p0, pi / 2}, {p2, n0, pi / 2}, // finite y > 0, x = -∞ {pos, -inf, pi}, // finite y < 0, x = -∞ {neg, -inf, -pi}, // finite y > 0, x = +∞ {pos, inf, p0}, // finite y < 0, x = +∞ {neg, inf, n0}, // y NaN xor x NaN {nan, p0, nan}, {pos, nan, nan}}; const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); const size_t N = Lanes(d); padded = RoundUpTo(kNumTestCases, N); // allow loading whole vectors out_y = AllocateAligned(padded); out_x = AllocateAligned(padded); out_expected = AllocateAligned(padded); HWY_ASSERT(out_y && out_x); size_t i = 0; for (; i < kNumTestCases; ++i) { out_y[i] = test_cases[i].y; out_x[i] = test_cases[i].x; out_expected[i] = test_cases[i].expected; } for (; i < padded; ++i) { out_y[i] = p0; out_x[i] = p0; out_expected[i] = p0; } } struct TestAtan2 { template HWY_NOINLINE void operator()(T t, D d) { const size_t N = Lanes(d); size_t padded; AlignedFreeUniquePtr in_y, in_x, expected; Atan2TestCases(t, d, padded, in_y, in_x, expected); const Vec tolerance = Set(d, ConvertScalarTo(1E-5)); for (size_t i = 0; i < padded; ++i) { const T actual = ConvertScalarTo(atan2(in_y[i], in_x[i])); // fprintf(stderr, "%zu: table %f atan2 %f\n", i, expected[i], actual); HWY_ASSERT_EQ(expected[i], actual); } for (size_t i = 0; i < padded; i += N) { const Vec y = Load(d, &in_y[i]); const Vec x = Load(d, &in_x[i]); #if HWY_ARCH_ARM_A64 // TODO(b/287462770): inline to work around incorrect SVE codegen const Vec actual = Atan2(d, y, x); #else const Vec actual = CallAtan2(d, y, x); #endif const Vec vexpected = Load(d, &expected[i]); const Mask exp_nan = IsNaN(vexpected); const Mask act_nan = IsNaN(actual); HWY_ASSERT_MASK_EQ(d, exp_nan, act_nan); // If not NaN, then compare with tolerance const Mask ge = Ge(actual, Sub(vexpected, tolerance)); const Mask le = Le(actual, Add(vexpected, tolerance)); const Mask ok = Or(act_nan, And(le, ge)); if (!AllTrue(d, ok)) { const size_t mismatch = static_cast(FindKnownFirstTrue(d, Not(ok))); fprintf(stderr, "Mismatch for i=%d expected %f actual %f\n", static_cast(i + mismatch), expected[i + mismatch], ExtractLane(actual, mismatch)); HWY_ASSERT(0); } } } }; HWY_NOINLINE void TestAllAtan2() { if (HWY_MATH_TEST_EXCESS_PRECISION) return; ForFloat3264Types(ForPartialVectors()); } // NOLINTNEXTLINE(google-readability-namespace-comments) } // namespace HWY_NAMESPACE } // namespace hwy HWY_AFTER_NAMESPACE(); #if HWY_ONCE namespace hwy { HWY_BEFORE_TEST(HwyMathTest); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcos); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAcosh); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsin); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAsinh); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtanh); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllCos); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExp); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllExpm1); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog10); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog1p); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllLog2); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSin); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinh); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllTanh); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllAtan2); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinCosSin); HWY_EXPORT_AND_TEST_P(HwyMathTest, TestAllSinCosCos); } // namespace hwy #endif