// Copyright 2019 Google LLC // SPDX-License-Identifier: Apache-2.0 // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // 128-bit Arm NEON vectors and operations. // External include guard in highway.h - see comment there. // Arm NEON intrinsics are documented at: // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon] #include "hwy/ops/shared-inl.h" HWY_BEFORE_NAMESPACE(); // Must come after HWY_BEFORE_NAMESPACE so that the intrinsics are compiled with // the same target attribute as our code, see #834. HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized") #include // NOLINT(build/include_order) HWY_DIAGNOSTICS(pop) // Must come after arm_neon.h. namespace hwy { namespace HWY_NAMESPACE { namespace detail { // for code folding and Raw128 // Macros used to define single and double function calls for multiple types // for full and half vectors. These macros are undefined at the end of the file. // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function. #define HWY_NEON_BUILD_TPL_1 #define HWY_NEON_BUILD_TPL_2 #define HWY_NEON_BUILD_TPL_3 // HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can // extend it to int32x4x2_t packs. #define HWY_NEON_BUILD_RET_1(type, size) Vec128 #define HWY_NEON_BUILD_RET_2(type, size) Vec128 #define HWY_NEON_BUILD_RET_3(type, size) Vec128 // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives. 
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function. The names match the parameters declared by the corresponding
// HWY_NEON_BUILD_PARAM_* macro.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
// itself like with some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
// "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
// Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
// expects two arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see HWY_NEON_BUILD_* macros defined before).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of neon functions starting with the given
// "prefix" for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define the operator-
// using args=2.
// Each per-type macro below instantiates HWY_NEON_DEF_FUNCTION once per
// supported lane count. Only the full 128-bit vector uses the "q" variant of
// the intrinsic (prefix##q); all partial vectors share the 64-bit variant.

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)  \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args)  \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)     \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)     \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)     \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)     \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args)  \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)     \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)     \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args)  \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)  \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

// Native bfloat16 vector types are used only when the target feature macro is
// set AND the compiler is GCC 13+ or Clang 11+.
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) && \
    (HWY_COMPILER_GCC_ACTUAL >= 1300 || HWY_COMPILER_CLANG >= 1100)
#define HWY_NEON_HAVE_BFLOAT16 1
#else
#define HWY_NEON_HAVE_BFLOAT16 0
#endif

// bfloat16_t; expands to nothing if native bfloat16 vectors are unavailable.
#if HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)         \
  HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args)  \
  HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args)     \
  HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args)     \
  HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
#else
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#endif

// Used for conversion instructions if HWY_NEON_HAVE_F16C.
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                     args)                \
  HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
  HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)

// float16_t; expands to nothing unless HWY_HAVE_FLOAT16.
#if HWY_HAVE_FLOAT16
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
#endif

// Enable generic functions for whichever of (f16, bf16) are not supported.
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
// Both types are native, so the generic overloads must never be selected.
// The condition is always false but is written in terms of D so that it is
// template-dependent: a non-dependent EnableIf<false> would be ill-formed at
// the point of definition instead of merely removing the overload via SFINAE.
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
#else
#error "Logic error, handled all four cases"
#endif

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args)  \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)     \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double; expands to nothing unless HWY_HAVE_FLOAT64.
#if HWY_HAVE_FLOAT64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)  \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

// float16_t (if enabled) and float
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)          \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

// float16_t (if enabled), float and double (if enabled)
#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

// Signed and unsigned integers of 8, 16 and 32 bits.
#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)

// As above, plus float16_t (if enabled) and float.
#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)

// uint64_t, int64_t and double (if enabled).
#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// For vzip1/2
#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args)  \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)

// For eor3q, which is only defined for full vectors.
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)   \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args)  \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args)  \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)   \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)   \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)

// Emulation of some intrinsics on armv7.
#if HWY_ARCH_ARM_V7 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0] #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0] #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0] #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0] #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0] #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0] #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0] #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0] #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0] #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0] #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0] #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0] #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0] #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0] #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1] #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1] #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1] #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1] #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1] #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1] #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1] #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1] #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1] #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1] #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1] #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1] #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1] #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1] #define vzip1_s8(x, y) vzip_s8(x, y).val[0] #define vzip1_u8(x, y) vzip_u8(x, y).val[0] #define vzip1_s16(x, y) vzip_s16(x, y).val[0] #define vzip1_u16(x, y) vzip_u16(x, y).val[0] #define vzip1_f32(x, y) vzip_f32(x, y).val[0] #define vzip1_u32(x, y) vzip_u32(x, y).val[0] #define vzip1_s32(x, y) vzip_s32(x, y).val[0] #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0] #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0] #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0] #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0] #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0] #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0] #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0] #define vzip2_s8(x, y) 
vzip_s8(x, y).val[1] #define vzip2_u8(x, y) vzip_u8(x, y).val[1] #define vzip2_s16(x, y) vzip_s16(x, y).val[1] #define vzip2_u16(x, y) vzip_u16(x, y).val[1] #define vzip2_s32(x, y) vzip_s32(x, y).val[1] #define vzip2_u32(x, y) vzip_u32(x, y).val[1] #define vzip2_f32(x, y) vzip_f32(x, y).val[1] #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1] #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1] #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1] #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1] #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1] #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1] #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1] #endif // Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2 // overloads for all vector types, even those (bfloat16_t) where the // underlying vector is the same as others (uint16_t). template struct Tuple2; template struct Tuple3; template struct Tuple4; template <> struct Tuple2 { uint8x16x2_t raw; }; template struct Tuple2 { uint8x8x2_t raw; }; template <> struct Tuple2 { int8x16x2_t raw; }; template struct Tuple2 { int8x8x2_t raw; }; template <> struct Tuple2 { uint16x8x2_t raw; }; template struct Tuple2 { uint16x4x2_t raw; }; template <> struct Tuple2 { int16x8x2_t raw; }; template struct Tuple2 { int16x4x2_t raw; }; template <> struct Tuple2 { uint32x4x2_t raw; }; template struct Tuple2 { uint32x2x2_t raw; }; template <> struct Tuple2 { int32x4x2_t raw; }; template struct Tuple2 { int32x2x2_t raw; }; template <> struct Tuple2 { uint64x2x2_t raw; }; template struct Tuple2 { uint64x1x2_t raw; }; template <> struct Tuple2 { int64x2x2_t raw; }; template struct Tuple2 { int64x1x2_t raw; }; template <> struct Tuple2 { float32x4x2_t raw; }; template struct Tuple2 { float32x2x2_t raw; }; #if HWY_HAVE_FLOAT64 template <> struct Tuple2 { float64x2x2_t raw; }; template struct Tuple2 { float64x1x2_t raw; }; #endif // HWY_HAVE_FLOAT64 template <> struct Tuple3 { uint8x16x3_t raw; }; template struct Tuple3 { uint8x8x3_t raw; }; template <> 
struct Tuple3 { int8x16x3_t raw; }; template struct Tuple3 { int8x8x3_t raw; }; template <> struct Tuple3 { uint16x8x3_t raw; }; template struct Tuple3 { uint16x4x3_t raw; }; template <> struct Tuple3 { int16x8x3_t raw; }; template struct Tuple3 { int16x4x3_t raw; }; template <> struct Tuple3 { uint32x4x3_t raw; }; template struct Tuple3 { uint32x2x3_t raw; }; template <> struct Tuple3 { int32x4x3_t raw; }; template struct Tuple3 { int32x2x3_t raw; }; template <> struct Tuple3 { uint64x2x3_t raw; }; template struct Tuple3 { uint64x1x3_t raw; }; template <> struct Tuple3 { int64x2x3_t raw; }; template struct Tuple3 { int64x1x3_t raw; }; template <> struct Tuple3 { float32x4x3_t raw; }; template struct Tuple3 { float32x2x3_t raw; }; #if HWY_HAVE_FLOAT64 template <> struct Tuple3 { float64x2x3_t raw; }; template struct Tuple3 { float64x1x3_t raw; }; #endif // HWY_HAVE_FLOAT64 template <> struct Tuple4 { uint8x16x4_t raw; }; template struct Tuple4 { uint8x8x4_t raw; }; template <> struct Tuple4 { int8x16x4_t raw; }; template struct Tuple4 { int8x8x4_t raw; }; template <> struct Tuple4 { uint16x8x4_t raw; }; template struct Tuple4 { uint16x4x4_t raw; }; template <> struct Tuple4 { int16x8x4_t raw; }; template struct Tuple4 { int16x4x4_t raw; }; template <> struct Tuple4 { uint32x4x4_t raw; }; template struct Tuple4 { uint32x2x4_t raw; }; template <> struct Tuple4 { int32x4x4_t raw; }; template struct Tuple4 { int32x2x4_t raw; }; template <> struct Tuple4 { uint64x2x4_t raw; }; template struct Tuple4 { uint64x1x4_t raw; }; template <> struct Tuple4 { int64x2x4_t raw; }; template struct Tuple4 { int64x1x4_t raw; }; template <> struct Tuple4 { float32x4x4_t raw; }; template struct Tuple4 { float32x2x4_t raw; }; #if HWY_HAVE_FLOAT64 template <> struct Tuple4 { float64x2x4_t raw; }; template struct Tuple4 { float64x1x4_t raw; }; #endif // HWY_HAVE_FLOAT64 template struct Raw128; template <> struct Raw128 { using type = uint8x16_t; }; template struct Raw128 { using type = 
uint8x8_t; }; template <> struct Raw128 { using type = uint16x8_t; }; template struct Raw128 { using type = uint16x4_t; }; template <> struct Raw128 { using type = uint32x4_t; }; template struct Raw128 { using type = uint32x2_t; }; template <> struct Raw128 { using type = uint64x2_t; }; template <> struct Raw128 { using type = uint64x1_t; }; template <> struct Raw128 { using type = int8x16_t; }; template struct Raw128 { using type = int8x8_t; }; template <> struct Raw128 { using type = int16x8_t; }; template struct Raw128 { using type = int16x4_t; }; template <> struct Raw128 { using type = int32x4_t; }; template struct Raw128 { using type = int32x2_t; }; template <> struct Raw128 { using type = int64x2_t; }; template <> struct Raw128 { using type = int64x1_t; }; template <> struct Raw128 { using type = float32x4_t; }; template struct Raw128 { using type = float32x2_t; }; #if HWY_HAVE_FLOAT64 template <> struct Raw128 { using type = float64x2_t; }; template <> struct Raw128 { using type = float64x1_t; }; #endif // HWY_HAVE_FLOAT64 #if HWY_NEON_HAVE_F16C template <> struct Tuple2 { float16x8x2_t raw; }; template struct Tuple2 { float16x4x2_t raw; }; template <> struct Tuple3 { float16x8x3_t raw; }; template struct Tuple3 { float16x4x3_t raw; }; template <> struct Tuple4 { float16x8x4_t raw; }; template struct Tuple4 { float16x4x4_t raw; }; template <> struct Raw128 { using type = float16x8_t; }; template struct Raw128 { using type = float16x4_t; }; #else // !HWY_NEON_HAVE_F16C template struct Tuple2 : public Tuple2 {}; template struct Tuple3 : public Tuple3 {}; template struct Tuple4 : public Tuple4 {}; template struct Raw128 : public Raw128 {}; #endif // HWY_NEON_HAVE_F16C #if HWY_NEON_HAVE_BFLOAT16 template <> struct Tuple2 { bfloat16x8x2_t raw; }; template struct Tuple2 { bfloat16x4x2_t raw; }; template <> struct Tuple3 { bfloat16x8x3_t raw; }; template struct Tuple3 { bfloat16x4x3_t raw; }; template <> struct Tuple4 { bfloat16x8x4_t raw; }; template struct 
Tuple4 { bfloat16x4x4_t raw; }; template <> struct Raw128 { using type = bfloat16x8_t; }; template struct Raw128 { using type = bfloat16x4_t; }; #else // !HWY_NEON_HAVE_BFLOAT16 template struct Tuple2 : public Tuple2 {}; template struct Tuple3 : public Tuple3 {}; template struct Tuple4 : public Tuple4 {}; template struct Raw128 : public Raw128 {}; #endif // HWY_NEON_HAVE_BFLOAT16 } // namespace detail template class Vec128 { public: using Raw = typename detail::Raw128::type; using PrivateT = T; // only for DFromV static constexpr size_t kPrivateN = N; // only for DFromV HWY_INLINE Vec128() {} Vec128(const Vec128&) = default; Vec128& operator=(const Vec128&) = default; HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {} // Compound assignment. Only usable if there is a corresponding non-member // binary operator overload. For example, only f32 and f64 support division. HWY_INLINE Vec128& operator*=(const Vec128 other) { return *this = (*this * other); } HWY_INLINE Vec128& operator/=(const Vec128 other) { return *this = (*this / other); } HWY_INLINE Vec128& operator+=(const Vec128 other) { return *this = (*this + other); } HWY_INLINE Vec128& operator-=(const Vec128 other) { return *this = (*this - other); } HWY_INLINE Vec128& operator%=(const Vec128 other) { return *this = (*this % other); } HWY_INLINE Vec128& operator&=(const Vec128 other) { return *this = (*this & other); } HWY_INLINE Vec128& operator|=(const Vec128 other) { return *this = (*this | other); } HWY_INLINE Vec128& operator^=(const Vec128 other) { return *this = (*this ^ other); } Raw raw; }; template using Vec64 = Vec128; template using Vec32 = Vec128; template using Vec16 = Vec128; // FF..FF or 0. template class Mask128 { // Arm C Language Extensions return and expect unsigned type. 
using Raw = typename detail::Raw128, N>::type; public: using PrivateT = T; // only for DFromM static constexpr size_t kPrivateN = N; // only for DFromM HWY_INLINE Mask128() {} Mask128(const Mask128&) = default; Mask128& operator=(const Mask128&) = default; HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {} Raw raw; }; template using Mask64 = Mask128; template using DFromV = Simd; template using DFromM = Simd; template using TFromV = typename V::PrivateT; // ------------------------------ Set namespace detail { // We want to route any combination of N/kPow2 to the intrinsics depending on // whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is // unconditional and currently does not accept inputs (such as whether the // vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for // SFINAE. We instead define a private NativeSet which receives a Simd<> whose // kPow2 has already been folded into its N. #define HWY_NEON_BUILD_TPL_HWY_SET #define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128 #define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \ Simd /* tag */, type##_t t #define HWY_NEON_BUILD_ARG_HWY_SET t HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET) #if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET) #endif HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET) template HWY_API Vec128, MaxLanes(D())> NativeSet(D d, TFromD t) { const uint16_t tu = BitCastScalar(t); return Vec128, d.MaxLanes()>(Set(RebindToUnsigned(), tu).raw); } #undef HWY_NEON_BUILD_TPL_HWY_SET #undef HWY_NEON_BUILD_RET_HWY_SET #undef HWY_NEON_BUILD_PARAM_HWY_SET #undef HWY_NEON_BUILD_ARG_HWY_SET } // namespace detail // Full vector. Cannot yet use VFromD because that is defined in terms of Set. // Do not use a typename T = TFromD argument because T will be deduced from // the actual argument type, which can differ from TFromD. 
template HWY_INLINE Vec128> Set(D /* tag */, T t) { return detail::NativeSet(Full128>(), static_cast>(t)); } // Partial vector: create 64-bit and return wrapper. template HWY_API Vec128, MaxLanes(D())> Set(D /* tag */, T t) { const Full64> dfull; return Vec128, MaxLanes(D())>( detail::NativeSet(dfull, static_cast>(t)).raw); } template using VFromD = decltype(Set(D(), TFromD())); template HWY_API VFromD Zero(D d) { // Default ctor also works for bfloat16_t and float16_t. return Set(d, TFromD{}); } HWY_DIAGNOSTICS(push) HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized") #if HWY_COMPILER_GCC_ACTUAL HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") #endif template HWY_API VFromD Undefined(D /*tag*/) { VFromD v; return v; } HWY_DIAGNOSTICS(pop) #if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL namespace detail { #pragma pack(push, 1) template struct alignas(8) Vec64ValsWrapper { static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true"); static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true"); T vals[8 / sizeof(T)]; }; #pragma pack(pop) } // namespace detail #endif // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD /*t8*/, TFromD /*t9*/, TFromD /*t10*/, TFromD /*t11*/, TFromD /*t12*/, TFromD /*t13*/, TFromD /*t14*/, TFromD /*t15*/) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8))); (void)d; const GccI8RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7)}; return VFromD(reinterpret_cast::Raw>(raw)); #else return ResizeBitCast( d, Set(Full64(), BitCastScalar(detail::Vec64ValsWrapper>{ {t0, t1, t2, t3, t4, t5, t6, t7}}))); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD /*t4*/, 
TFromD /*t5*/, TFromD /*t6*/, TFromD /*t7*/) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8))); (void)d; const GccI16RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3)}; return VFromD(reinterpret_cast::Raw>(raw)); #else return ResizeBitCast( d, Set(Full64(), BitCastScalar( detail::Vec64ValsWrapper>{{t0, t1, t2, t3}}))); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD /*t2*/, TFromD /*t3*/) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8))); (void)d; const GccI32RawVectType raw = {static_cast(t0), static_cast(t1)}; return VFromD(reinterpret_cast::Raw>(raw)); #else return ResizeBitCast(d, Set(Full64(), BitCastScalar( detail::Vec64ValsWrapper>{{t0, t1}}))); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD /*t2*/, TFromD /*t3*/) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef float GccF32RawVectType __attribute__((__vector_size__(8))); (void)d; const GccF32RawVectType raw = {t0, t1}; return VFromD(reinterpret_cast::Raw>(raw)); #else return ResizeBitCast(d, Set(Full64(), BitCastScalar( detail::Vec64ValsWrapper>{{t0, t1}}))); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD /*t1*/) { return Set(d, t0); } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7, TFromD t8, TFromD t9, TFromD t10, TFromD t11, TFromD t12, TFromD t13, TFromD t14, TFromD t15) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16))); (void)d; const GccI8RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7), static_cast(t8), static_cast(t9), static_cast(t10), static_cast(t11), 
static_cast(t12), static_cast(t13), static_cast(t14), static_cast(t15)}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15, t8, t9, t10, t11, t12, t13, t14, t15), Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1, t2, t3, t4, t5, t6, t7)); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16))); (void)d; const GccI16RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3), static_cast(t4), static_cast(t5), static_cast(t6), static_cast(t7)}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7), Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3)); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16))); (void)d; const GccI32RawVectType raw = { static_cast(t0), static_cast(t1), static_cast(t2), static_cast(t3)}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3), Dup128VecFromValues(dh, t0, t1, t0, t1)); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef float GccF32RawVectType __attribute__((__vector_size__(16))); (void)d; const GccF32RawVectType raw = {t0, t1, t2, t3}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3), Dup128VecFromValues(dh, t0, t1, t0, t1)); #endif } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, 
TFromD t1) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16))); (void)d; const GccI64RawVectType raw = {static_cast(t0), static_cast(t1)}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Set(dh, t1), Set(dh, t0)); #endif } #if HWY_HAVE_FLOAT64 template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1) { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL typedef double GccF64RawVectType __attribute__((__vector_size__(16))); (void)d; const GccF64RawVectType raw = {t0, t1}; return VFromD(reinterpret_cast::Raw>(raw)); #else const Half dh; return Combine(d, Set(dh, t1), Set(dh, t0)); #endif } #endif // Generic for all vector lengths template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } #if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD /*t4*/, TFromD /*t5*/, TFromD /*t6*/, TFromD /*t7*/) { typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8))); (void)d; const GccF16RawVectType raw = { static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2), static_cast<__fp16>(t3)}; return VFromD(reinterpret_cast::Raw>(raw)); } template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16))); (void)d; const GccF16RawVectType raw = { static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2), static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5), 
static_cast<__fp16>(t6), static_cast<__fp16>(t7)}; return VFromD(reinterpret_cast::Raw>(raw)); } #else // Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C template HWY_API VFromD Dup128VecFromValues(D d, TFromD t0, TFromD t1, TFromD t2, TFromD t3, TFromD t4, TFromD t5, TFromD t6, TFromD t7) { const RebindToSigned di; return BitCast(d, Dup128VecFromValues( di, BitCastScalar(t0), BitCastScalar(t1), BitCastScalar(t2), BitCastScalar(t3), BitCastScalar(t4), BitCastScalar(t5), BitCastScalar(t6), BitCastScalar(t7))); } #endif // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C namespace detail { template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues( d, TFromD{0}, TFromD{1}, TFromD{2}, TFromD{3}, TFromD{4}, TFromD{5}, TFromD{6}, TFromD{7}, TFromD{8}, TFromD{9}, TFromD{10}, TFromD{11}, TFromD{12}, TFromD{13}, TFromD{14}, TFromD{15}); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues(d, TFromD{0}, TFromD{1}, TFromD{2}, TFromD{3}, TFromD{4}, TFromD{5}, TFromD{6}, TFromD{7}); } template HWY_INLINE VFromD Iota0(D d) { const RebindToUnsigned du; return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00}, uint16_t{0x4000}, uint16_t{0x4200}, uint16_t{0x4400}, uint16_t{0x4500}, uint16_t{0x4600}, uint16_t{0x4700})); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues(d, TFromD{0}, TFromD{1}, TFromD{2}, TFromD{3}); } template HWY_INLINE VFromD Iota0(D d) { return Dup128VecFromValues(d, TFromD{0}, TFromD{1}); } #if HWY_COMPILER_MSVC template static HWY_INLINE V MaskOutIota(V v) { constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV); constexpr uint64_t kU64MaskOutMask = hwy::LimitsMax>(); const DFromV d; const Repartition du8; using VU8 = VFromD; const auto mask_out_mask = BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask)))); return v & mask_out_mask; } template static HWY_INLINE V MaskOutIota(V v) { return v; } #endif } // namespace detail template HWY_API VFromD 
Iota(D d, const T2 first) { const auto result_iota = detail::Iota0(d) + Set(d, static_cast>(first)); #if HWY_COMPILER_MSVC return detail::MaskOutIota(result_iota); #else return result_iota; #endif } // ------------------------------ Tuple (VFromD) #include "hwy/ops/tuple-inl.h" // ------------------------------ Combine // Full result template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_u8(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_u16(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_u32(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_u64(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_s8(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_s16(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_s32(lo.raw, hi.raw)); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_s64(lo.raw, hi.raw)); } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 Combine(D, Vec64 hi, Vec64 lo) { return Vec128(vcombine_f16(lo.raw, hi.raw)); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API VFromD Combine(D, Vec64 hi, Vec64 lo) { return VFromD(vcombine_bf16(lo.raw, hi.raw)); } #endif // HWY_NEON_HAVE_BFLOAT16 template , HWY_NEON_IF_EMULATED_D(D)> HWY_API VFromD Combine(D d, VFromD hi, VFromD lo) { const RebindToUnsigned du; const Half duh; return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo))); } template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return Vec128(vcombine_f32(lo.raw, hi.raw)); } #if HWY_HAVE_FLOAT64 template HWY_API Vec128 Combine(D /* tag */, Vec64 hi, Vec64 lo) { return 
Vec128(vcombine_f64(lo.raw, hi.raw)); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ BitCast namespace detail { // Converts from Vec128 to Vec128 using the // vreinterpret*_u8_*() set of functions. #define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 #define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \ Vec128 #define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128 v #define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw // Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined. template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return v; } HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) #if !HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_F16C HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8) #else template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return BitCastToByte(Vec128(v.raw)); } #endif // HWY_NEON_HAVE_F16C #endif // !HWY_HAVE_FLOAT16 #if !HWY_NEON_HAVE_BFLOAT16 template HWY_INLINE Vec128 BitCastToByte(Vec128 v) { return BitCastToByte(Vec128(v.raw)); } #endif // !HWY_NEON_HAVE_BFLOAT16 #undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8 #undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8 #undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8 #undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD v) { return v; } // 64-bit or less: template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return VFromD(vreinterpret_s8_u8(v.raw)); } template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return 
VFromD(vreinterpret_u16_u8(v.raw)); } template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return VFromD(vreinterpret_s16_u8(v.raw)); } template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return VFromD(vreinterpret_u32_u8(v.raw)); } template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return VFromD(vreinterpret_s32_u8(v.raw)); } template HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { return Vec64(vreinterpret_u64_u8(v.raw)); } template HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { return Vec64(vreinterpret_s64_u8(v.raw)); } // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C. template HWY_INLINE VFromD BitCastFromByte(D, VFromD> v) { #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C return VFromD(vreinterpret_f16_u8(v.raw)); #else const RebindToUnsigned du; return VFromD(BitCastFromByte(du, v).raw); #endif } template HWY_INLINE VFromD BitCastFromByte(D, VFromD> v) { #if HWY_NEON_HAVE_BFLOAT16 return VFromD(vreinterpret_bf16_u8(v.raw)); #else const RebindToUnsigned du; return VFromD(BitCastFromByte(du, v).raw); #endif } template HWY_INLINE VFromD BitCastFromByte(D /* tag */, VFromD> v) { return VFromD(vreinterpret_f32_u8(v.raw)); } #if HWY_HAVE_FLOAT64 template HWY_INLINE Vec64 BitCastFromByte(D /* tag */, Vec64 v) { return Vec64(vreinterpret_f64_u8(v.raw)); } #endif // HWY_HAVE_FLOAT64 // 128-bit full: template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_s8_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_u16_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_s16_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_u32_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_s32_u8(v.raw)); } template HWY_INLINE 
Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_u64_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_s64_u8(v.raw)); } template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_f32_u8(v.raw)); } #if HWY_HAVE_FLOAT64 template HWY_INLINE Vec128 BitCastFromByte(D /* tag */, Vec128 v) { return Vec128(vreinterpretq_f64_u8(v.raw)); } #endif // HWY_HAVE_FLOAT64 // Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C. template HWY_INLINE VFromD BitCastFromByte(D, Vec128 v) { #if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C return VFromD(vreinterpretq_f16_u8(v.raw)); #else return VFromD(BitCastFromByte(RebindToUnsigned(), v).raw); #endif } template HWY_INLINE VFromD BitCastFromByte(D, Vec128 v) { #if HWY_NEON_HAVE_BFLOAT16 return VFromD(vreinterpretq_bf16_u8(v.raw)); #else return VFromD(BitCastFromByte(RebindToUnsigned(), v).raw); #endif } } // namespace detail template HWY_API VFromD BitCast(D d, Vec128().MaxLanes()> v) { return detail::BitCastFromByte(d, detail::BitCastToByte(v)); } // ------------------------------ ResizeBitCast // <= 8 byte vector to <= 8 byte vector template HWY_API VFromD ResizeBitCast(D d, FromV v) { const Repartition du8; return BitCast(d, VFromD{detail::BitCastToByte(v).raw}); } // 16-byte vector to 16-byte vector: same as BitCast template HWY_API VFromD ResizeBitCast(D d, FromV v) { return BitCast(d, v); } // 16-byte vector to <= 8-byte vector template HWY_API VFromD ResizeBitCast(D d, FromV v) { const DFromV d_from; const Half dh_from; return ResizeBitCast(d, LowerHalf(dh_from, v)); } // <= 8-bit vector to 16-byte vector template HWY_API VFromD ResizeBitCast(D d, FromV v) { const Full64> d_full64_from; const Full128> d_full128_from; return BitCast(d, Combine(d_full128_from, Zero(d_full64_from), ResizeBitCast(d_full64_from, v))); } // ------------------------------ GetLane namespace detail { #define 
HWY_NEON_BUILD_TPL_HWY_GET template #define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t #define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128 v #define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET) #undef HWY_NEON_BUILD_TPL_HWY_GET #undef HWY_NEON_BUILD_RET_HWY_GET #undef HWY_NEON_BUILD_PARAM_HWY_GET #undef HWY_NEON_BUILD_ARG_HWY_GET } // namespace detail template HWY_API TFromV GetLane(const V v) { return detail::GetLane<0>(v); } // ------------------------------ ExtractLane // Requires one overload per vector length because GetLane<3> is a compile error // if v is a uint32x2_t. template HWY_API T ExtractLane(const Vec128 v, size_t i) { HWY_DASSERT(i == 0); (void)i; return detail::GetLane<0>(v); } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::GetLane<0>(v); case 1: return detail::GetLane<1>(v); } } #endif alignas(16) T lanes[2]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::GetLane<0>(v); case 1: return detail::GetLane<1>(v); case 2: return detail::GetLane<2>(v); case 3: return detail::GetLane<3>(v); } } #endif alignas(16) T lanes[4]; Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::GetLane<0>(v); case 1: return detail::GetLane<1>(v); case 2: return detail::GetLane<2>(v); case 3: return detail::GetLane<3>(v); case 4: return detail::GetLane<4>(v); case 5: return detail::GetLane<5>(v); case 6: return detail::GetLane<6>(v); case 7: return detail::GetLane<7>(v); } } #endif alignas(16) T lanes[8]; 
Store(v, DFromV(), lanes); return lanes[i]; } template HWY_API T ExtractLane(const Vec128 v, size_t i) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(i)) { switch (i) { case 0: return detail::GetLane<0>(v); case 1: return detail::GetLane<1>(v); case 2: return detail::GetLane<2>(v); case 3: return detail::GetLane<3>(v); case 4: return detail::GetLane<4>(v); case 5: return detail::GetLane<5>(v); case 6: return detail::GetLane<6>(v); case 7: return detail::GetLane<7>(v); case 8: return detail::GetLane<8>(v); case 9: return detail::GetLane<9>(v); case 10: return detail::GetLane<10>(v); case 11: return detail::GetLane<11>(v); case 12: return detail::GetLane<12>(v); case 13: return detail::GetLane<13>(v); case 14: return detail::GetLane<14>(v); case 15: return detail::GetLane<15>(v); } } #endif alignas(16) T lanes[16]; Store(v, DFromV(), lanes); return lanes[i]; } // ------------------------------ InsertLane namespace detail { #define HWY_NEON_BUILD_TPL_HWY_INSERT template #define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128 #define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \ Vec128 v, type##_t t #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT) #undef HWY_NEON_BUILD_TPL_HWY_INSERT #undef HWY_NEON_BUILD_RET_HWY_INSERT #undef HWY_NEON_BUILD_PARAM_HWY_INSERT #undef HWY_NEON_BUILD_ARG_HWY_INSERT template , HWY_NEON_IF_EMULATED_D(D)> HWY_API V InsertLane(const V v, TFromD t) { const D d; const RebindToUnsigned du; const uint16_t tu = BitCastScalar(t); return BitCast(d, InsertLane(BitCast(du, v), tu)); } } // namespace detail // Requires one overload per vector length because InsertLane<3> may be a // compile error. 
// Single-lane overload: index 0 is the only valid position (checked by
// HWY_DASSERT), so inserting is the same as broadcasting `t` with Set.
template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;  // `i` is not otherwise referenced in this overload
  return Set(DFromV(), t);
}

// Two-lane overload. In non-debug GCC/Clang builds, if `i` is known at compile
// time and is 0 or 1, dispatches to detail::InsertLane<i>. In every other case
// the vector is stored to an aligned 2-element stack buffer, element `i` is
// overwritten, and the buffer is reloaded. `i` is not range-checked here.
template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  const DFromV d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// Four-lane overload: same strategy as above with cases 0..3 and a 4-element
// buffer.
template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// Eight-lane overload: cases 0..7 and an 8-element buffer.
template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// Sixteen-lane overload: cases 0..15 and a 16-element buffer.
template HWY_API Vec128 InsertLane(const Vec128 v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// ================================================== ARITHMETIC

// ------------------------------ Addition
// Two-argument wrappers around the vadd* intrinsics, generated by macro.
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator+, vadd, _, 2)

// ------------------------------ Subtraction
// Two-argument wrappers around the vsub* intrinsics, generated by macro.
HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator-, vsub, _, 2)

// ------------------------------ SumsOf8
// Each overload nests three vpaddl* calls (Arm's pairwise add-long
// intrinsics), taking 8-bit, then 16-bit, then 32-bit inputs. The overloads
// differ by signedness (_u*/_s*) and by 128-bit (vpaddlq) versus 64-bit
// (vpaddl) intrinsics.
HWY_API Vec128 SumsOf8(const Vec128 v) {
  return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
}
HWY_API Vec64 SumsOf8(const Vec64 v) {
  return Vec64(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
}

HWY_API Vec128 SumsOf8(const Vec128 v) {
  return Vec128(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
}
HWY_API Vec64 SumsOf8(const Vec64 v) {
  return Vec64(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
}

// ------------------------------ SumsOf2
namespace detail {

// Tag-dispatched helpers: one overload per combination of signedness tag,
// lane-size tag (1, 2 or 4 bytes) and intrinsic width (vpaddl vs vpaddlq).
// Each forwards to a single pairwise add-long intrinsic.

// 1-byte lanes, signed
template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_s8(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_s8(v.raw));
}

// 1-byte lanes, unsigned
template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_u8(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_u8(v.raw));
}

// 2-byte lanes, signed
template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_s16(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_s16(v.raw));
}

// 2-byte lanes, unsigned
template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_u16(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_u16(v.raw));
}

// 4-byte lanes, signed
template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_s32(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_s32(v.raw));
}

// 4-byte lanes, unsigned
template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddl_u32(v.raw));
}

template HWY_INLINE VFromD>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD>>(vpaddlq_u32(v.raw));
}

}  // namespace detail

// ------------------------------ SaturatedAdd

// Each group below toggles its flag: defined if it was undefined, undefined if
// it was defined.
// NOTE(review): presumably signals to generic code that this target provides
// 32/64-bit saturated add/sub natively - confirm against the consumer of these
// flags.
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

// Returns a + b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2)

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)

// ------------------------------ Average

// Returns (a + b + 1) / 2
HWY_NEON_DEF_FUNCTION_UINT_8(AverageRound, vrhadd, _, 2)
HWY_NEON_DEF_FUNCTION_UINT_16(AverageRound, vrhadd, _, 2)

// ------------------------------ Neg

// One-argument wrappers around the vneg* intrinsics, generated by macro.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below

#if !HWY_HAVE_FLOAT16
// Used when HWY_HAVE_FLOAT16 is 0: negates by XOR-ing the unsigned view of `v`
// with SignMask, i.e. by flipping the sign bit.
template HWY_API Vec128 Neg(const Vec128 v) {
  const DFromV d;
  const RebindToUnsigned du;
  using TU = TFromD;
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask())));
}
#endif  // !HWY_HAVE_FLOAT16

// There is no vneg for bf16, but we can cast to f16 (emulated or native).
template HWY_API Vec128 Neg(const Vec128 v) {
  const DFromV d;
  const Rebind df16;
  return BitCast(d, Neg(BitCast(df16, v)));
}

// 64-bit integer negation: uses vneg_s64 on A64, otherwise computes 0 - v.
HWY_API Vec64 Neg(const Vec64 v) {
#if HWY_ARCH_ARM_A64
  return Vec64(vneg_s64(v.raw));
#else
  return Zero(DFromV()) - v;
#endif
}

// 128-bit counterpart of the overload above (vnegq_s64 on A64).
HWY_API Vec128 Neg(const Vec128 v) {
#if HWY_ARCH_ARM_A64
  return Vec128(vnegq_s64(v.raw));
#else
  return Zero(DFromV()) - v;
#endif
}

// ------------------------------ SaturatedNeg
// Toggles the flag: defined if it was undefined, undefined if it was defined.
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)

// The 64-bit variants (and their flag) are only provided on A64.
#if HWY_ARCH_ARM_A64
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

HWY_API Vec64 SaturatedNeg(const Vec64 v) {
  return Vec64(vqneg_s64(v.raw));
}

HWY_API Vec128 SaturatedNeg(const Vec128 v) {
  return Vec128(vqnegq_s64(v.raw));
}
#endif

// ------------------------------ ShiftLeft

// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
// push_macro/pop_macro save and restore the previous definition around this
// section. The replacement returns `v` unchanged when kBits == 0; otherwise it
// calls the intrinsic with HWY_MAX(1, kBits).
// NOTE(review): HWY_MAX(1, kBits) presumably keeps the immediate valid in the
// untaken branch when kBits == 0 - confirm.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  template \
  HWY_API Vec128 name(const Vec128 v) { \
    return kBits == 0 ? v \
                      : Vec128(HWY_NEON_EVAL( \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

// The last macro argument is unused by the customized definition above.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)

HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")

// ------------------------------ RotateRight (ShiftRight, Or)
// Rejects shift counts outside [0, lane size in bits) at compile time, returns
// `v` unchanged for kBits == 0, and otherwise ORs a right shift of `v` with a
// left shift of `v`.
template HWY_API Vec128 RotateRight(const Vec128 v) {
  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;
  return Or(ShiftRight(v), ShiftLeft(v));
}

// NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
// mechanism for checking for extensions to Armv8.

// ------------------------------ Shl

// Per-lane variable left shift. The unsigned overloads reinterpret `bits` as
// a signed vector before passing it to vshl*; the signed overloads pass `bits`
// directly. Overloads alternate between 128-bit (vshlq_*) and 64-bit (vshl_*)
// intrinsics.
HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
}
HWY_API Vec64 operator<<(Vec64 v, Vec64 bits) {
  return Vec64(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s8(v.raw, bits.raw));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s8(v.raw, bits.raw));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s16(v.raw, bits.raw));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s16(v.raw, bits.raw));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s32(v.raw, bits.raw));
}
template HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s32(v.raw, bits.raw));
}

HWY_API Vec128 operator<<(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s64(v.raw, bits.raw));
}
HWY_API Vec64 operator<<(Vec64 v, Vec64 bits) {
  return Vec64(vshl_s64(v.raw, bits.raw));
}

// ------------------------------ Shr (Neg)

// Per-lane variable right shift, implemented as vshl* with negated shift
// counts. The unsigned overloads first BitCast `bits` to the signed type so it
// can be negated; the signed overloads negate `bits` directly.
HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshlq_u8(v.raw, neg_bits));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshl_u8(v.raw, neg_bits));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshlq_u16(v.raw, neg_bits));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshl_u16(v.raw, neg_bits));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshlq_u32(v.raw, neg_bits));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshl_u32(v.raw, neg_bits));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  const RebindToSigned> di;
  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128(vshlq_u64(v.raw, neg_bits));
}
HWY_API Vec64 operator>>(Vec64 v, Vec64 bits) {
  const RebindToSigned> di;
  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec64(vshl_u64(v.raw, neg_bits));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s8(v.raw, Neg(bits).raw));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s8(v.raw, Neg(bits).raw));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s16(v.raw, Neg(bits).raw));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s16(v.raw, Neg(bits).raw));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s32(v.raw, Neg(bits).raw));
}
template HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshl_s32(v.raw, Neg(bits).raw));
}

HWY_API Vec128 operator>>(Vec128 v, Vec128 bits) {
  return Vec128(vshlq_s64(v.raw, Neg(bits).raw));
}
HWY_API Vec64 operator>>(Vec64 v, Vec64 bits) {
  return Vec64(vshl_s64(v.raw, Neg(bits).raw));
}

// ------------------------------ ShiftLeftSame (Shl)

// Shifts every lane left by the same run-time count: `bits` is cast,
// broadcast with Set, and applied via operator<<.
template HWY_API Vec128 ShiftLeftSame(const Vec128 v, int bits) {
  return v << Set(DFromV(), static_cast(bits));
}
// Right-shift counterpart of ShiftLeftSame, using operator>>.
template HWY_API Vec128 ShiftRightSame(const Vec128 v, int bits) {
  return v >> Set(DFromV(), static_cast(bits));
}

// ------------------------------ Int/float multiplication

// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

// All except ui64
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator*, vmul, _, 2)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)

// ------------------------------ Integer multiplication

// Returns the upper 16 bits of a * b in each lane.
HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); #if HWY_ARCH_ARM_A64 int32x4_t rhi = vmull_high_s16(a.raw, b.raw); #else int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); #endif return Vec128( vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); } HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); #if HWY_ARCH_ARM_A64 uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); #else uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); #endif return Vec128( vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); } template HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); return Vec128(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); } template HWY_API Vec128 MulHigh(Vec128 a, Vec128 b) { uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); return Vec128(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); } HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { return Vec128(vqrdmulhq_s16(a.raw, b.raw)); } template HWY_API Vec128 MulFixedPoint15(Vec128 a, Vec128 b) { return Vec128(vqrdmulh_s16(a.raw, b.raw)); } // ------------------------------ Floating-point division // Emulate missing intrinsic #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) { const CappedTag d; const Twice dt; using VT = VFromD; return LowerHalf(d, VT(vrecpeq_f64(Combine(dt, v, v).raw))).raw; } #endif // Approximate reciprocal HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocal, vrecpe, _, 1) #if HWY_HAVE_FLOAT64 #ifdef HWY_NATIVE_F64_APPROX_RECIP #undef HWY_NATIVE_F64_APPROX_RECIP #else #define HWY_NATIVE_F64_APPROX_RECIP #endif HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2) #else // !HWY_HAVE_FLOAT64 namespace detail { HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalNewtonRaphsonStep, 
vrecps, _, 2) } // namespace detail template HWY_API Vec128 operator/(Vec128 a, Vec128 b) { auto x = ApproximateReciprocal(b); x *= detail::ReciprocalNewtonRaphsonStep(x, b); x *= detail::ReciprocalNewtonRaphsonStep(x, b); x *= detail::ReciprocalNewtonRaphsonStep(x, b); return a * x; } #endif // HWY_HAVE_FLOAT64 // ------------------------------ Absolute value of difference. HWY_NEON_DEF_FUNCTION_ALL_FLOATS(AbsDiff, vabd, _, 2) HWY_NEON_DEF_FUNCTION_UI_8_16_32(AbsDiff, vabd, _, 2) // no UI64 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF #undef HWY_NATIVE_INTEGER_ABS_DIFF #else #define HWY_NATIVE_INTEGER_ABS_DIFF #endif // ------------------------------ Integer multiply-add // Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. #ifdef HWY_NATIVE_INT_FMA #undef HWY_NATIVE_INT_FMA #else #define HWY_NATIVE_INT_FMA #endif // Wrappers for changing argument order to what intrinsics expect. namespace detail { // All except ui64 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3) HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3) HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3) HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3) } // namespace detail template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return detail::MulAdd(add, mul, x); } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return detail::NegMulAdd(add, mul, x); } // 64-bit integer template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Add(Mul(mul, x), add); } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return Sub(add, Mul(mul, x)); } // ------------------------------ Floating-point multiply-add variants namespace detail { #if defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 // Wrappers for changing argument order to what intrinsics expect. HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3) HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3) #else // Emulate. Matches intrinsics arg order. 
template HWY_API Vec128 MulAdd(Vec128 add, Vec128 mul, Vec128 x) { return mul * x + add; } template HWY_API Vec128 NegMulAdd(Vec128 add, Vec128 mul, Vec128 x) { return add - mul * x; } #endif // defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64 } // namespace detail template HWY_API Vec128 MulAdd(Vec128 mul, Vec128 x, Vec128 add) { return detail::MulAdd(add, mul, x); } template HWY_API Vec128 NegMulAdd(Vec128 mul, Vec128 x, Vec128 add) { return detail::NegMulAdd(add, mul, x); } template HWY_API Vec128 MulSub(Vec128 mul, Vec128 x, Vec128 sub) { return MulAdd(mul, x, Neg(sub)); } template HWY_API Vec128 NegMulSub(Vec128 mul, Vec128 x, Vec128 sub) { return Neg(MulAdd(mul, x, sub)); } // ------------------------------ Floating-point square root (IfThenZeroElse) // Emulate missing intrinsic #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490 HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) { const CappedTag d; const Twice dt; using VT = VFromD; const VFromD v(raw); return LowerHalf(d, VT(vrsqrteq_f64(Combine(dt, v, v).raw))).raw; } #endif // Approximate reciprocal square root HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocalSqrt, vrsqrte, _, 1) #if HWY_HAVE_FLOAT64 #ifdef HWY_NATIVE_F64_APPROX_RSQRT #undef HWY_NATIVE_F64_APPROX_RSQRT #else #define HWY_NATIVE_F64_APPROX_RSQRT #endif // Full precision square root HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) #else // !HWY_HAVE_FLOAT64 namespace detail { HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalSqrtStep, vrsqrts, _, 2) } // namespace detail template HWY_API Vec128 Sqrt(const Vec128 v) { auto recip = ApproximateReciprocalSqrt(v); recip *= detail::ReciprocalSqrtStep(v * recip, recip); recip *= detail::ReciprocalSqrtStep(v * recip, recip); recip *= detail::ReciprocalSqrtStep(v * recip, recip); const auto root = v * recip; return IfThenZeroElse(v == Zero(Simd()), root); } #endif // HWY_HAVE_FLOAT64 // ================================================== LOGICAL // ------------------------------ Not // There is 
no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. template HWY_API Vec128 Not(const Vec128 v) { const DFromV d; const Repartition d8; return BitCast(d, Vec128(vmvnq_u8(BitCast(d8, v).raw))); } template HWY_API Vec128 Not(const Vec128 v) { const DFromV d; const Repartition d8; using V8 = decltype(Zero(d8)); return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); } // ------------------------------ And HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) // Uses the u32/64 defined above. template HWY_API Vec128 And(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, BitCast(du, a) & BitCast(du, b)); } // ------------------------------ AndNot namespace detail { // reversed_andnot returns a & ~b. HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) } // namespace detail // Returns ~not_mask & mask. template HWY_API Vec128 AndNot(const Vec128 not_mask, const Vec128 mask) { return detail::reversed_andnot(mask, not_mask); } // Uses the u32/64 defined above. template HWY_API Vec128 AndNot(const Vec128 not_mask, const Vec128 mask) { const DFromV d; const RebindToUnsigned du; VFromD ret = detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); return BitCast(d, ret); } // ------------------------------ Or HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) // Uses the u32/64 defined above. template HWY_API Vec128 Or(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, BitCast(du, a) | BitCast(du, b)); } // ------------------------------ Xor HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) // Uses the u32/64 defined above. template HWY_API Vec128 Xor(const Vec128 a, const Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, BitCast(du, a) ^ BitCast(du, b)); } // ------------------------------ Xor3 #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3) HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3) // Half vectors are not natively supported. 
Two Xor are likely more efficient // than Combine to 128-bit. template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { return Xor(x1, Xor(x2, x3)); } template HWY_API Vec128 Xor3(const Vec128 x1, const Vec128 x2, const Vec128 x3) { const DFromV d; const RebindToUnsigned du; return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); } #else template HWY_API Vec128 Xor3(Vec128 x1, Vec128 x2, Vec128 x3) { return Xor(x1, Xor(x2, x3)); } #endif // ------------------------------ Or3 template HWY_API Vec128 Or3(Vec128 o1, Vec128 o2, Vec128 o3) { return Or(o1, Or(o2, o3)); } // ------------------------------ OrAnd template HWY_API Vec128 OrAnd(Vec128 o, Vec128 a1, Vec128 a2) { return Or(o, And(a1, a2)); } // ------------------------------ IfVecThenElse template HWY_API Vec128 IfVecThenElse(Vec128 mask, Vec128 yes, Vec128 no) { return IfThenElse(MaskFromVec(mask), yes, no); } // ------------------------------ BitwiseIfThenElse #ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE #undef HWY_NATIVE_BITWISE_IF_THEN_ELSE #else #define HWY_NATIVE_BITWISE_IF_THEN_ELSE #endif template HWY_API V BitwiseIfThenElse(V mask, V yes, V no) { return IfVecThenElse(mask, yes, no); } // ------------------------------ Operator overloads (internal-only if float) template HWY_API Vec128 operator&(const Vec128 a, const Vec128 b) { return And(a, b); } template HWY_API Vec128 operator|(const Vec128 a, const Vec128 b) { return Or(a, b); } template HWY_API Vec128 operator^(const Vec128 a, const Vec128 b) { return Xor(a, b); } // ------------------------------ I64/U64 AbsDiff template HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Max(a, b) - Min(a, b); } template HWY_API Vec128 AbsDiff(const Vec128 a, const Vec128 b) { return Or(SaturatedSub(a, b), SaturatedSub(b, a)); } // ------------------------------ PopulationCount #ifdef HWY_NATIVE_POPCNT #undef HWY_NATIVE_POPCNT #else #define HWY_NATIVE_POPCNT #endif namespace detail { template HWY_INLINE Vec128 
PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { const Full128 d8; return Vec128(vcntq_u8(BitCast(d8, v).raw)); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<1> /* tag */, Vec128 v) { const Simd d8; return Vec128(vcnt_u8(BitCast(d8, v).raw)); } // NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes. template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { const Full128 d8; const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); return Vec128(vpaddlq_u8(bytes)); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<2> /* tag */, Vec128 v) { const Repartition> d8; const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); return Vec128(vpaddl_u8(bytes)); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { const Full128 d8; const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); return Vec128(vpaddlq_u16(vpaddlq_u8(bytes))); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<4> /* tag */, Vec128 v) { const Repartition> d8; const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); return Vec128(vpaddl_u16(vpaddl_u8(bytes))); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { const Full128 d8; const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); return Vec128(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)))); } template HWY_INLINE Vec128 PopulationCount(hwy::SizeTag<8> /* tag */, Vec128 v) { const Repartition> d8; const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); return Vec128(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes)))); } } // namespace detail template HWY_API Vec128 PopulationCount(Vec128 v) { return detail::PopulationCount(hwy::SizeTag(), v); } // ================================================== SIGN // ------------------------------ Abs // i64 is implemented after BroadcastSignBit. 
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1) HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1) // ------------------------------ SaturatedAbs #ifdef HWY_NATIVE_SATURATED_ABS #undef HWY_NATIVE_SATURATED_ABS #else #define HWY_NATIVE_SATURATED_ABS #endif HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1) // ------------------------------ CopySign template HWY_API Vec128 CopySign(Vec128 magn, Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return BitwiseIfThenElse(SignBit(d), sign, magn); } // ------------------------------ CopySignToAbs template HWY_API Vec128 CopySignToAbs(Vec128 abs, Vec128 sign) { static_assert(IsFloat(), "Only makes sense for floating-point"); const DFromV d; return OrAnd(abs, SignBit(d), sign); } // ------------------------------ BroadcastSignBit template HWY_API Vec128 BroadcastSignBit(const Vec128 v) { return ShiftRight(v); } // ================================================== MASK // ------------------------------ To/from vector // Mask and Vec have the same representation (true = FF..FF). template HWY_API Mask128 MaskFromVec(const Vec128 v) { const Simd, N, 0> du; return Mask128(BitCast(du, v).raw); } template using MFromD = decltype(MaskFromVec(VFromD())); template HWY_API VFromD VecFromMask(D d, const MFromD m) { // Raw type of masks is unsigned. 
const RebindToUnsigned du; return BitCast(d, VFromD(m.raw)); } // ------------------------------ RebindMask (MaskFromVec) template HWY_API MFromD RebindMask(DTo /* tag */, Mask128 m) { static_assert(sizeof(TFrom) == sizeof(TFromD), "Must have same size"); return MFromD(m.raw); } // ------------------------------ IfThenElse #define HWY_NEON_BUILD_TPL_HWY_IF #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ const Mask128 mask, const Vec128 yes, \ const Vec128 no #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) template , HWY_NEON_IF_EMULATED_D(D)> HWY_API V IfThenElse(MFromD mask, V yes, V no) { const DFromV d; const RebindToUnsigned du; return BitCast( d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no))); } #undef HWY_NEON_BUILD_TPL_HWY_IF #undef HWY_NEON_BUILD_RET_HWY_IF #undef HWY_NEON_BUILD_PARAM_HWY_IF #undef HWY_NEON_BUILD_ARG_HWY_IF // mask ? yes : 0 template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { return yes & VecFromMask(DFromV(), mask); } template HWY_API Vec128 IfThenElseZero(Mask128 mask, Vec128 yes) { const DFromV d; const RebindToUnsigned du; return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes))); } // mask ? 
0 : no template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { return AndNot(VecFromMask(DFromV(), mask), no); } template HWY_API Vec128 IfThenZeroElse(Mask128 mask, Vec128 no) { const DFromV d; const RebindToUnsigned du; return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no))); } template HWY_API Vec128 IfNegativeThenElse(Vec128 v, Vec128 yes, Vec128 no) { static_assert(IsSigned(), "Only works for signed/float"); const DFromV d; const RebindToSigned di; Mask128 m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v)))); return IfThenElse(m, yes, no); } template HWY_API Vec128 ZeroIfNegative(Vec128 v) { const auto zero = Zero(DFromV()); return Max(zero, v); } // ------------------------------ Mask logical template HWY_API Mask128 Not(const Mask128 m) { return MaskFromVec(Not(VecFromMask(DFromM(), m))); } template HWY_API Mask128 And(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 AndNot(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Or(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 Xor(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b))); } template HWY_API Mask128 ExclusiveNeither(const Mask128 a, Mask128 b) { const DFromM d; return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b)))); } // ================================================== COMPARE // Comparisons fill a lane with 1-bits if the condition is true, else 0. 
// ------------------------------ Shuffle2301 (for i64 compares) // Swap 32-bit halves in 64-bits HWY_API Vec64 Shuffle2301(const Vec64 v) { return Vec64(vrev64_u32(v.raw)); } HWY_API Vec64 Shuffle2301(const Vec64 v) { return Vec64(vrev64_s32(v.raw)); } HWY_API Vec64 Shuffle2301(const Vec64 v) { return Vec64(vrev64_f32(v.raw)); } HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128(vrev64q_u32(v.raw)); } HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128(vrev64q_s32(v.raw)); } HWY_API Vec128 Shuffle2301(const Vec128 v) { return Vec128(vrev64q_f32(v.raw)); } #define HWY_NEON_BUILD_TPL_HWY_COMPARE #define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128 #define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \ const Vec128 a, const Vec128 b #define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw // ------------------------------ Equality HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE) #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE) #else // No 64-bit comparisons on armv7: emulate them below, after Shuffle2301. 
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE) HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE) #endif // ------------------------------ Strict inequality (signed, unsigned, float) #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE) #else /* Armv7: only the 8/16/32-bit integer lanes are generated here. */ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE) HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE) #endif HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE) // ------------------------------ Weak inequality (signed, unsigned, float) #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE) #else /* Armv7: only the 8/16/32-bit integer lanes are generated here. */ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE) HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE) #endif HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE) #undef HWY_NEON_BUILD_TPL_HWY_COMPARE #undef HWY_NEON_BUILD_RET_HWY_COMPARE #undef HWY_NEON_BUILD_PARAM_HWY_COMPARE #undef HWY_NEON_BUILD_ARG_HWY_COMPARE // ------------------------------ Armv7 i64 compare (Shuffle2301, Eq) #if HWY_ARCH_ARM_V7 /* Equality is evaluated on 32-bit lanes; ANDing each result with its Shuffle2301-swapped neighbor leaves a lane pair all-ones only if both 32-bit halves matched. */ template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { const Simd d32; const Simd d64; const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); const auto cmp64 = cmp32 & Shuffle2301(cmp32); return MaskFromVec(BitCast(d64, cmp64)); } /* Same technique as the overload above. */ template HWY_API Mask128 operator==(const Vec128 a, const Vec128 b) { const Simd d32; const Simd d64; const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b))); const auto cmp64 = cmp32 & Shuffle2301(cmp32); return MaskFromVec(BitCast(d64, cmp64)); } /* The saturating difference a - b is negative exactly when a < b (saturation prevents overflow from flipping the sign), so its broadcast sign bit is the mask. */ HWY_API Mask128 operator<(const Vec128 a, const Vec128 b) { const int64x2_t sub = vqsubq_s64(a.raw, b.raw); return MaskFromVec(BroadcastSignBit(Vec128(sub))); } /* 64-bit raw vector variant of the above. */ HWY_API Mask128 operator<(const Vec64 a, const Vec64 b) { const int64x1_t sub = vqsub_s64(a.raw, b.raw); return MaskFromVec(BroadcastSignBit(Vec64(sub))); } template HWY_API Mask128
operator<(const Vec128 a, const Vec128 b) { const DFromV du; const RebindToSigned di; const Vec128 msb = AndNot(a, b) | AndNot(a ^ b, a - b); return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb)))); } template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { return Not(b < a); } template HWY_API Mask128 operator<=(const Vec128 a, const Vec128 b) { return Not(b < a); } #endif // ------------------------------ operator!= (operator==) // Customize HWY_NEON_DEF_FUNCTION to call 2 functions. #pragma push_macro("HWY_NEON_DEF_FUNCTION") #undef HWY_NEON_DEF_FUNCTION // This cannot have _any_ template argument (in x86_128 we can at least have N // as an argument), otherwise it is not more specialized than rewritten // operator== in C++20, leading to compile errors. #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ HWY_API Mask128 name(Vec128 a, \ Vec128 b) { \ return Not(a == b); \ } HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored) #pragma pop_macro("HWY_NEON_DEF_FUNCTION") // ------------------------------ Reversed comparisons template HWY_API Mask128 operator>(Vec128 a, Vec128 b) { return operator<(b, a); } template HWY_API Mask128 operator>=(Vec128 a, Vec128 b) { return operator<=(b, a); } // ------------------------------ FirstN (Iota, Lt) template HWY_API MFromD FirstN(D d, size_t num) { const RebindToSigned di; // Signed comparisons are cheaper. 
using TI = TFromD; return RebindMask(d, detail::Iota0(di) < Set(di, static_cast(num))); } // ------------------------------ TestBit (Eq) #define HWY_NEON_BUILD_TPL_HWY_TESTBIT #define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128 #define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \ Vec128 v, Vec128 bit #define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT) #else // No 64-bit versions on armv7 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT) template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { return (v & bit) == bit; } template HWY_API Mask128 TestBit(Vec128 v, Vec128 bit) { return (v & bit) == bit; } #endif #undef HWY_NEON_BUILD_TPL_HWY_TESTBIT #undef HWY_NEON_BUILD_RET_HWY_TESTBIT #undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT #undef HWY_NEON_BUILD_ARG_HWY_TESTBIT // ------------------------------ Abs i64 (IfThenElse, BroadcastSignBit) HWY_API Vec128 Abs(const Vec128 v) { #if HWY_ARCH_ARM_A64 return Vec128(vabsq_s64(v.raw)); #else const auto zero = Zero(DFromV()); return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); #endif } HWY_API Vec64 Abs(const Vec64 v) { #if HWY_ARCH_ARM_A64 return Vec64(vabs_s64(v.raw)); #else const auto zero = Zero(DFromV()); return IfThenElse(MaskFromVec(BroadcastSignBit(v)), zero - v, v); #endif } HWY_API Vec128 SaturatedAbs(const Vec128 v) { #if HWY_ARCH_ARM_A64 return Vec128(vqabsq_s64(v.raw)); #else const auto zero = Zero(DFromV()); return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v); #endif } HWY_API Vec64 SaturatedAbs(const Vec64 v) { #if HWY_ARCH_ARM_A64 return Vec64(vqabs_s64(v.raw)); #else const auto zero = Zero(DFromV()); return IfThenElse(MaskFromVec(BroadcastSignBit(v)), SaturatedSub(zero, v), v); #endif } // ------------------------------ Min (IfThenElse, BroadcastSignBit) // Unsigned 
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2) template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_ARCH_ARM_A64 return IfThenElse(b < a, b, a); #else const DFromV du; const RebindToSigned di; return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b))); #endif } // Signed HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2) template HWY_API Vec128 Min(Vec128 a, Vec128 b) { #if HWY_ARCH_ARM_A64 return IfThenElse(b < a, b, a); #else const Vec128 sign = SaturatedSub(a, b); return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b); #endif } // Float: IEEE minimumNumber on v8 #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Min, vminnm, _, 2) // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define // in terms of the 128-bit intrinsic. #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 namespace detail { template HWY_INLINE V F64Vec64Min(V a, V b) { const DFromV d; const Twice dt; return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); } } // namespace detail #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 HWY_API Vec64 Min(Vec64 a, Vec64 b) { #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 return detail::F64Vec64Min(a, b); #else return Vec64(vminnm_f64(a.raw, b.raw)); #endif } HWY_API Vec128 Min(Vec128 a, Vec128 b) { return Vec128(vminnmq_f64(a.raw, b.raw)); } #else // Armv7: NaN if any is NaN. 
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2) #endif // HWY_ARCH_ARM_A64 // ------------------------------ Max (IfThenElse, BroadcastSignBit) // Unsigned (no u64) HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2) /* NOTE(review): presumably the u64 overload that the 8/16/32 macro above omits - confirm against the template constraints. */ template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_ARCH_ARM_A64 return IfThenElse(b < a, a, b); #else /* For unsigned lanes SaturatedSub(a, b) is a - b clamped at zero, so adding it to b yields the larger of a and b. */ const DFromV du; const RebindToSigned di; return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b))); #endif } // Signed (no i64) HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2) /* NOTE(review): presumably the i64 overload that the 8/16/32 macro above omits - confirm against the template constraints. */ template HWY_API Vec128 Max(Vec128 a, Vec128 b) { #if HWY_ARCH_ARM_A64 return IfThenElse(b < a, a, b); #else /* The saturated difference a - b is negative exactly when a < b, in which case b is the maximum. */ const Vec128 sign = SaturatedSub(a, b); return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a); #endif } // Float: IEEE maximumNumber on v8 #if HWY_ARCH_ARM_A64 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Max, vmaxnm, _, 2) // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define // in terms of the 128-bit intrinsic. #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 namespace detail { /* Zero-extends both inputs to twice the width, takes Max there and returns the lower half. */ template HWY_INLINE V F64Vec64Max(V a, V b) { const DFromV d; const Twice dt; return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b))); } } // namespace detail #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 HWY_API Vec64 Max(Vec64 a, Vec64 b) { #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 return detail::F64Vec64Max(a, b); #else return Vec64(vmaxnm_f64(a.raw, b.raw)); #endif } HWY_API Vec128 Max(Vec128 a, Vec128 b) { return Vec128(vmaxnmq_f64(a.raw, b.raw)); } #else // Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2) #endif // HWY_ARCH_ARM_A64 // ================================================== MEMORY // ------------------------------ Load 128 template HWY_API Vec128 LoadU(D /* tag */, const uint8_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_u8(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const uint16_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_u16(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const uint32_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_u32(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const uint64_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_u64(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const int8_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_s8(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const int16_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_s16(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const int32_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_s32(unaligned)); } template HWY_API Vec128 LoadU(D /* tag */, const int64_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_s64(unaligned)); } #if HWY_HAVE_FLOAT16 template HWY_API Vec128 LoadU(D /* tag */, const float16_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_f16(detail::NativeLanePointer(unaligned))); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API Vec128 LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT unaligned) { return Vec128(vld1q_bf16(detail::NativeLanePointer(unaligned))); } #endif // HWY_NEON_HAVE_BFLOAT16 template HWY_API Vec128 LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) { return Vec128(vld1q_f32(unaligned)); } #if HWY_HAVE_FLOAT64 template HWY_API Vec128 LoadU(D /* tag */, const double* HWY_RESTRICT unaligned) { return Vec128(vld1q_f64(unaligned)); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ Load 64 template HWY_API Vec64 LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) { return 
Vec64(vld1_u8(p)); } template HWY_API Vec64 LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) { return Vec64(vld1_u16(p)); } template HWY_API Vec64 LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) { return Vec64(vld1_u32(p)); } template HWY_API Vec64 LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) { return Vec64(vld1_u64(p)); } template HWY_API Vec64 LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) { return Vec64(vld1_s8(p)); } template HWY_API Vec64 LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) { return Vec64(vld1_s16(p)); } template HWY_API Vec64 LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) { return Vec64(vld1_s32(p)); } template HWY_API Vec64 LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) { return Vec64(vld1_s64(p)); } #if HWY_HAVE_FLOAT16 template HWY_API Vec64 LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) { return Vec64(vld1_f16(detail::NativeLanePointer(p))); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API Vec64 LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) { return Vec64(vld1_bf16(detail::NativeLanePointer(p))); } #endif // HWY_NEON_HAVE_BFLOAT16 template HWY_API Vec64 LoadU(D /* tag */, const float* HWY_RESTRICT p) { return Vec64(vld1_f32(p)); } #if HWY_HAVE_FLOAT64 template HWY_API Vec64 LoadU(D /* tag */, const double* HWY_RESTRICT p) { return Vec64(vld1_f64(p)); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ Load 32 // Actual 32-bit broadcast load - used to implement the other lane types // because reinterpret_cast of the pointer leads to incorrect codegen on GCC. 
template HWY_API Vec32 LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) { return Vec32(vld1_dup_u32(p)); } template HWY_API Vec32 LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) { return Vec32(vld1_dup_s32(p)); } template HWY_API Vec32 LoadU(D /*tag*/, const float* HWY_RESTRICT p) { return Vec32(vld1_dup_f32(p)); } // {u,i}{8,16} template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const Repartition d32; uint32_t buf; CopyBytes<4>(p, &buf); return BitCast(d, LoadU(d32, &buf)); } #if HWY_HAVE_FLOAT16 template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const Repartition d32; uint32_t buf; CopyBytes<4>(p, &buf); return BitCast(d, LoadU(d32, &buf)); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const Repartition d32; uint32_t buf; CopyBytes<4>(p, &buf); return BitCast(d, LoadU(d32, &buf)); } #endif // HWY_NEON_HAVE_BFLOAT16 // ------------------------------ Load 16 // Actual 16-bit broadcast load - used to implement the other lane types // because reinterpret_cast of the pointer leads to incorrect codegen on GCC. 
template HWY_API VFromD LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) { return VFromD(vld1_dup_u16(p)); } template HWY_API VFromD LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) { return VFromD(vld1_dup_s16(p)); } #if HWY_HAVE_FLOAT16 template HWY_API VFromD LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) { return VFromD(vld1_dup_f16(detail::NativeLanePointer(p))); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API VFromD LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) { return VFromD(vld1_dup_bf16(detail::NativeLanePointer(p))); } #endif // HWY_NEON_HAVE_BFLOAT16 // 8-bit x2 template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const Repartition d16; uint16_t buf; CopyBytes<2>(p, &buf); return BitCast(d, LoadU(d16, &buf)); } // ------------------------------ Load 8 template HWY_API VFromD LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) { return VFromD(vld1_dup_u8(p)); } template HWY_API VFromD LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) { return VFromD(vld1_dup_s8(p)); } // ------------------------------ Load misc template HWY_API VFromD LoadU(D d, const TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; return BitCast(d, LoadU(du, detail::U16LanePointer(p))); } // On Arm, Load is the same as LoadU. template HWY_API VFromD Load(D d, const TFromD* HWY_RESTRICT p) { return LoadU(d, p); } template HWY_API VFromD MaskedLoad(MFromD m, D d, const TFromD* HWY_RESTRICT aligned) { return IfThenElseZero(m, Load(d, aligned)); } template HWY_API VFromD MaskedLoadOr(VFromD v, MFromD m, D d, const TFromD* HWY_RESTRICT aligned) { return IfThenElse(m, Load(d, aligned), v); } // 128-bit SIMD => nothing to duplicate, same as an unaligned load. 
template HWY_API VFromD LoadDup128(D d, const TFromD* HWY_RESTRICT p) { return LoadU(d, p); } // ------------------------------ Store 128 template HWY_API void StoreU(Vec128 v, D /* tag */, uint8_t* HWY_RESTRICT unaligned) { vst1q_u8(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, uint16_t* HWY_RESTRICT unaligned) { vst1q_u16(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, uint32_t* HWY_RESTRICT unaligned) { vst1q_u32(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, uint64_t* HWY_RESTRICT unaligned) { vst1q_u64(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, int8_t* HWY_RESTRICT unaligned) { vst1q_s8(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, int16_t* HWY_RESTRICT unaligned) { vst1q_s16(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, int32_t* HWY_RESTRICT unaligned) { vst1q_s32(unaligned, v.raw); } template HWY_API void StoreU(Vec128 v, D /* tag */, int64_t* HWY_RESTRICT unaligned) { vst1q_s64(unaligned, v.raw); } #if HWY_HAVE_FLOAT16 template HWY_API void StoreU(Vec128 v, D /* tag */, float16_t* HWY_RESTRICT unaligned) { vst1q_f16(detail::NativeLanePointer(unaligned), v.raw); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(Vec128 v, D /* tag */, bfloat16_t* HWY_RESTRICT unaligned) { vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw); } #endif // HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(Vec128 v, D /* tag */, float* HWY_RESTRICT unaligned) { vst1q_f32(unaligned, v.raw); } #if HWY_HAVE_FLOAT64 template HWY_API void StoreU(Vec128 v, D /* tag */, double* HWY_RESTRICT unaligned) { vst1q_f64(unaligned, v.raw); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ Store 64 template HWY_API void StoreU(Vec64 v, D /* tag */, uint8_t* HWY_RESTRICT p) { vst1_u8(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, uint16_t* HWY_RESTRICT p) { vst1_u16(p, 
v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, uint32_t* HWY_RESTRICT p) { vst1_u32(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, uint64_t* HWY_RESTRICT p) { vst1_u64(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, int8_t* HWY_RESTRICT p) { vst1_s8(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, int16_t* HWY_RESTRICT p) { vst1_s16(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, int32_t* HWY_RESTRICT p) { vst1_s32(p, v.raw); } template HWY_API void StoreU(Vec64 v, D /* tag */, int64_t* HWY_RESTRICT p) { vst1_s64(p, v.raw); } #if HWY_HAVE_FLOAT16 template HWY_API void StoreU(Vec64 v, D /* tag */, float16_t* HWY_RESTRICT p) { vst1_f16(detail::NativeLanePointer(p), v.raw); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(Vec64 v, D /* tag */, bfloat16_t* HWY_RESTRICT p) { vst1_bf16(detail::NativeLanePointer(p), v.raw); } #endif // HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(Vec64 v, D /* tag */, float* HWY_RESTRICT p) { vst1_f32(p, v.raw); } #if HWY_HAVE_FLOAT64 template HWY_API void StoreU(Vec64 v, D /* tag */, double* HWY_RESTRICT p) { vst1_f64(p, v.raw); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ Store 32 template HWY_API void StoreU(Vec32 v, D, uint32_t* HWY_RESTRICT p) { vst1_lane_u32(p, v.raw, 0); } template HWY_API void StoreU(Vec32 v, D, int32_t* HWY_RESTRICT p) { vst1_lane_s32(p, v.raw, 0); } template HWY_API void StoreU(Vec32 v, D, float* HWY_RESTRICT p) { vst1_lane_f32(p, v.raw, 0); } // {u,i}{8,16} template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { Repartition d32; uint32_t buf = GetLane(BitCast(d32, v)); CopyBytes<4>(&buf, p); } #if HWY_HAVE_FLOAT16 template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { Repartition d32; uint32_t buf = GetLane(BitCast(d32, v)); CopyBytes<4>(&buf, p); } #endif #if HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT 
p) { Repartition d32; uint32_t buf = GetLane(BitCast(d32, v)); CopyBytes<4>(&buf, p); } #endif // HWY_NEON_HAVE_BFLOAT16 // ------------------------------ Store 16 template HWY_API void StoreU(Vec16 v, D, uint16_t* HWY_RESTRICT p) { vst1_lane_u16(p, v.raw, 0); } template HWY_API void StoreU(Vec16 v, D, int16_t* HWY_RESTRICT p) { vst1_lane_s16(p, v.raw, 0); } #if HWY_HAVE_FLOAT16 template HWY_API void StoreU(Vec16 v, D, float16_t* HWY_RESTRICT p) { vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0); } #endif // HWY_HAVE_FLOAT16 #if HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(Vec16 v, D, bfloat16_t* HWY_RESTRICT p) { vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0); } #endif // HWY_NEON_HAVE_BFLOAT16 template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { const Repartition d16; const uint16_t buf = GetLane(BitCast(d16, v)); CopyBytes<2>(&buf, p); } // ------------------------------ Store 8 template HWY_API void StoreU(Vec128 v, D, uint8_t* HWY_RESTRICT p) { vst1_lane_u8(p, v.raw, 0); } template HWY_API void StoreU(Vec128 v, D, int8_t* HWY_RESTRICT p) { vst1_lane_s8(p, v.raw, 0); } // ------------------------------ Store misc template HWY_API void StoreU(VFromD v, D d, TFromD* HWY_RESTRICT p) { const RebindToUnsigned du; return StoreU(BitCast(du, v), du, detail::U16LanePointer(p)); } HWY_DIAGNOSTICS(push) #if HWY_COMPILER_GCC_ACTUAL HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized") #endif // On Arm, Store is the same as StoreU. template HWY_API void Store(VFromD v, D d, TFromD* HWY_RESTRICT aligned) { StoreU(v, d, aligned); } HWY_DIAGNOSTICS(pop) template HWY_API void BlendedStore(VFromD v, MFromD m, D d, TFromD* HWY_RESTRICT p) { // Treat as unsigned so that we correctly support float16. 
const RebindToUnsigned du; const auto blended = IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p))); StoreU(BitCast(d, blended), d, p); } // ------------------------------ Non-temporal stores // Same as aligned stores on non-x86. template HWY_API void Stream(const VFromD v, D d, TFromD* HWY_RESTRICT aligned) { #if HWY_ARCH_ARM_A64 #if HWY_COMPILER_GCC __builtin_prefetch(aligned, 1, 0); #elif HWY_COMPILER_MSVC __prefetch2(aligned, 0x11); #endif #endif Store(v, d, aligned); } // ================================================== CONVERT // ------------------------------ ConvertTo #if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 // TODO(janwas): use macro generator instead of handwritten template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f16_s16(v.raw)); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD(vcvt_f16_s16(v.raw)); } template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f16_u16(v.raw)); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD(vcvt_f16_u16(v.raw)); } #endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f32_s32(v.raw)); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD(vcvt_f32_s32(v.raw)); } template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f32_u32(v.raw)); } template HWY_API VFromD ConvertTo(D /* tag */, VFromD> v) { return VFromD(vcvt_f32_u32(v.raw)); } #if HWY_HAVE_FLOAT64 template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f64_s64(v.raw)); } template HWY_API Vec64 ConvertTo(D /* tag */, Vec64 v) { // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. 
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 return Set(Full64(), static_cast(GetLane(v))); #else return Vec64(vcvt_f64_s64(v.raw)); #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 } template HWY_API Vec128 ConvertTo(D /* tag */, Vec128 v) { return Vec128(vcvtq_f64_u64(ZeroIfNegative(v).raw)); } template HWY_API Vec64 ConvertTo(D /* tag */, Vec64 v) { // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic. const auto non_neg_v = ZeroIfNegative(v); #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 return Set(Full64(), static_cast(GetLane(non_neg_v))); #else return Vec64(vcvt_f64_u64(non_neg_v.raw)); #endif // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700 } #endif // HWY_HAVE_FLOAT64 namespace detail { // Truncates (rounds toward zero). template HWY_INLINE Vec128 ConvertFToI(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && \ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7) // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is // outside of the range of an int32_t. int32x4_t raw_result; __asm__( #if HWY_ARCH_ARM_A64 "fcvtzs %0.4s, %1.4s" #else "vcvt.s32.f32 %0, %1" #endif : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_s32_f32(v.raw)); #endif } template HWY_INLINE VFromD ConvertFToI(D /* tag */, VFromD> v) { #if HWY_COMPILER_CLANG && \ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7) // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is // outside of the range of an int32_t. 
int32x2_t raw_result; __asm__( #if HWY_ARCH_ARM_A64 "fcvtzs %0.2s, %1.2s" #else "vcvt.s32.f32 %0, %1" #endif : "=w"(raw_result) : "w"(v.raw)); return VFromD(raw_result); #else return VFromD(vcvt_s32_f32(v.raw)); #endif } template HWY_INLINE Vec128 ConvertFToU(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && \ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7) // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is // outside of the range of an uint32_t. uint32x4_t raw_result; __asm__( #if HWY_ARCH_ARM_A64 "fcvtzu %0.4s, %1.4s" #else "vcvt.u32.f32 %0, %1" #endif : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_u32_f32(v.raw)); #endif } template HWY_INLINE VFromD ConvertFToU(D /* tag */, VFromD> v) { #if HWY_COMPILER_CLANG && \ ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7) // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is // outside of the range of an uint32_t. uint32x2_t raw_result; __asm__( #if HWY_ARCH_ARM_A64 "fcvtzu %0.2s, %1.2s" #else "vcvt.u32.f32 %0, %1" #endif : "=w"(raw_result) : "w"(v.raw)); return VFromD(raw_result); #else return VFromD(vcvt_u32_f32(v.raw)); #endif } #if HWY_HAVE_FLOAT64 // Truncates (rounds toward zero). template HWY_INLINE Vec128 ConvertFToI(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an int64_t. 
int64x2_t raw_result; __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_s64_f64(v.raw)); #endif } template HWY_INLINE Vec64 ConvertFToI(D /* tag */, Vec64 v) { #if HWY_ARCH_ARM_A64 && \ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200)) // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an int64_t. // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to // work around the missing vcvt_s64_f64 intrinsic. int64x1_t raw_result; __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw)); return Vec64(raw_result); #else return Vec64(vcvt_s64_f64(v.raw)); #endif } template HWY_INLINE Vec128 ConvertFToU(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an uint64_t. uint64x2_t raw_result; __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_u64_f64(v.raw)); #endif } template HWY_INLINE Vec64 ConvertFToU(D /* tag */, Vec64 v) { #if HWY_ARCH_ARM_A64 && \ ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \ (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200)) // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an uint64_t. // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic. 
uint64x1_t raw_result; __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw)); return Vec64(raw_result); #else return Vec64(vcvt_u64_f64(v.raw)); #endif } #endif // HWY_HAVE_FLOAT64 #if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 // Truncates (rounds toward zero). template HWY_INLINE Vec128 ConvertFToI(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an int16_t. int16x8_t raw_result; __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_s16_f16(v.raw)); #endif } template HWY_INLINE VFromD ConvertFToI(D /* tag */, VFromD> v) { #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an int16_t. int16x4_t raw_result; __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw)); return VFromD(raw_result); #else return VFromD(vcvt_s16_f16(v.raw)); #endif } template HWY_INLINE Vec128 ConvertFToU(D /* tag */, Vec128 v) { #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an uint16_t. uint16x8_t raw_result; __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw)); return Vec128(raw_result); #else return Vec128(vcvtq_u16_f16(v.raw)); #endif } template HWY_INLINE VFromD ConvertFToU(D /* tag */, VFromD> v) { #if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200 // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly // to avoid undefined behavior if v[i] is outside of the range of an uint16_t. 
uint16x4_t raw_result; __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw)); return VFromD(raw_result); #else return VFromD(vcvt_u16_f16(v.raw)); #endif } #endif // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16 } // namespace detail template HWY_API VFromD ConvertTo(D di, VFromD> v) { return detail::ConvertFToI(di, v); } template HWY_API VFromD ConvertTo(D du, VFromD> v) { return detail::ConvertFToU(du, v); } // ------------------------------ PromoteTo (ConvertTo) // Unsigned: zero-extend to full vector. template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_u8(v.raw)); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec32 v) { uint16x8_t a = vmovl_u8(v.raw); return Vec128(vmovl_u16(vget_low_u16(a))); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_u16(v.raw)); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_u32(v.raw)); } template HWY_API Vec128 PromoteTo(D d, Vec64 v) { return BitCast(d, Vec128(vmovl_u8(v.raw))); } template HWY_API Vec128 PromoteTo(D d, Vec32 v) { uint16x8_t a = vmovl_u8(v.raw); return BitCast(d, Vec128(vmovl_u16(vget_low_u16(a)))); } template HWY_API Vec128 PromoteTo(D d, Vec64 v) { return BitCast(d, Vec128(vmovl_u16(v.raw))); } template HWY_API Vec128 PromoteTo(D d, Vec64 v) { return BitCast(d, Vec128(vmovl_u32(v.raw))); } // Unsigned: zero-extend to half vector. 
template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_u16(vmovl_u8(v.raw))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw))))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_u32(vmovl_u16(v.raw))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_u64(vmovl_u32(v.raw))); } template HWY_API VFromD PromoteTo(D d, VFromD> v) { using VU16 = VFromD>; return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw)))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw))); return VFromD(vget_low_s32(vreinterpretq_s32_u32(u32))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw)))); } template HWY_API VFromD PromoteTo(D d, VFromD> v) { using DU = RebindToUnsigned; return BitCast(d, VFromD(vget_low_u64(vmovl_u32(v.raw)))); } // U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to // TFromD template HWY_API VFromD PromoteTo(D d, V v) { const Rebind du32; return PromoteTo(d, PromoteTo(du32, v)); } // Signed: replicate sign bit to full vector. template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_s8(v.raw)); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec32 v) { int16x8_t a = vmovl_s8(v.raw); return Vec128(vmovl_s16(vget_low_s16(a))); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_s16(v.raw)); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vmovl_s32(v.raw)); } // Signed: replicate sign bit to half vector. 
template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_s16(vmovl_s8(v.raw))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw))))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_s32(vmovl_s16(v.raw))); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_s64(vmovl_s32(v.raw))); } // I8/I16 to I64: First, promote to I32, and then promote to I64 template HWY_API VFromD PromoteTo(D d, V v) { const Rebind di32; return PromoteTo(d, PromoteTo(di32, v)); } #if HWY_NEON_HAVE_F16C // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions. #ifdef HWY_NATIVE_F16C #undef HWY_NATIVE_F16C #else #define HWY_NATIVE_F16C #endif template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vcvt_f32_f16(v.raw)); } template HWY_API VFromD PromoteTo(D /* tag */, VFromD> v) { return VFromD(vget_low_f32(vcvt_f32_f16(v.raw))); } #endif // HWY_NEON_HAVE_F16C #if HWY_HAVE_FLOAT64 template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { return Vec128(vcvt_f64_f32(v.raw)); } template HWY_API Vec64 PromoteTo(D /* tag */, Vec32 v) { return Vec64(vget_low_f64(vcvt_f64_f32(v.raw))); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { const int64x2_t i64 = vmovl_s32(v.raw); return Vec128(vcvtq_f64_s64(i64)); } template HWY_API Vec64 PromoteTo(D d, Vec32 v) { return ConvertTo(d, Vec64(vget_low_s64(vmovl_s32(v.raw)))); } template HWY_API Vec128 PromoteTo(D /* tag */, Vec64 v) { const uint64x2_t u64 = vmovl_u32(v.raw); return Vec128(vcvtq_f64_u64(u64)); } template HWY_API Vec64 PromoteTo(D d, Vec32 v) { return ConvertTo(d, Vec64(vget_low_u64(vmovl_u32(v.raw)))); } template HWY_API VFromD PromoteTo(D d64, VFromD> v) { const RebindToFloat df64; return ConvertTo(d64, PromoteTo(df64, v)); } #else // !HWY_HAVE_FLOAT64 template HWY_API VFromD PromoteTo(D di64, VFromD> v) { const Rebind di32; 
const RebindToFloat df32; const RebindToUnsigned du32; const Repartition du32_as_du8; const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), BitCast(du32_as_du8, Set(du32, uint32_t{157}))), BitCast(du32_as_du8, Set(du32, uint32_t{32})))); const auto adj_v = BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); const auto f32_to_i32_result = ConvertTo(di32, adj_v); const auto lo64_or_mask = PromoteTo( di64, BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result, Set(di32, LimitsMax()))))); return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result)) << PromoteTo(di64, exponent_adj), lo64_or_mask); } template HWY_API VFromD PromoteTo(D du64, VFromD> v) { const Rebind du32; const RebindToFloat df32; const Repartition du32_as_du8; const auto exponent_adj = BitCast( du32, Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))), BitCast(du32_as_du8, Set(du32, uint32_t{158}))), BitCast(du32_as_du8, Set(du32, uint32_t{32})))); const auto adj_v = BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj)); const auto f32_to_u32_result = ConvertTo(du32, adj_v); const auto lo32_or_mask = PromoteTo( du64, VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax()))); return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj), lo32_or_mask); } #endif // HWY_HAVE_FLOAT64 // ------------------------------ PromoteUpperTo #if HWY_ARCH_ARM_A64 // Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo. #ifdef HWY_NATIVE_PROMOTE_UPPER_TO #undef HWY_NATIVE_PROMOTE_UPPER_TO #else #define HWY_NATIVE_PROMOTE_UPPER_TO #endif // Unsigned: zero-extend to full vector. 
template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_u8(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_u16(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_u32(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { return BitCast(d, Vec128(vmovl_high_u8(v.raw))); } template HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { return BitCast(d, Vec128(vmovl_high_u16(v.raw))); } template HWY_API Vec128 PromoteUpperTo(D d, Vec128 v) { return BitCast(d, Vec128(vmovl_high_u32(v.raw))); } // Signed: replicate sign bit to full vector. template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_s8(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_s16(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vmovl_high_s32(v.raw)); } #if HWY_NEON_HAVE_F16C template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vcvt_high_f32_f16(v.raw)); } #endif // HWY_NEON_HAVE_F16C template HWY_API VFromD PromoteUpperTo(D df32, VFromD> v) { const Repartition du16; const RebindToSigned di32; return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v)))); } #if HWY_HAVE_FLOAT64 template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { return Vec128(vcvt_high_f64_f32(v.raw)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { const int64x2_t i64 = vmovl_high_s32(v.raw); return Vec128(vcvtq_f64_s64(i64)); } template HWY_API Vec128 PromoteUpperTo(D /* tag */, Vec128 v) { const uint64x2_t u64 = vmovl_high_u32(v.raw); return Vec128(vcvtq_f64_u64(u64)); } #endif // HWY_HAVE_FLOAT64 template HWY_API VFromD PromoteUpperTo(D d64, Vec128 v) { #if HWY_HAVE_FLOAT64 const RebindToFloat df64; return ConvertTo(d64, PromoteUpperTo(df64, v)); #else const Rebind dh; return PromoteTo(d, UpperHalf(dh, 
v)); #endif } // Generic version for <=64 bit input/output (_high is only for full vectors). template HWY_API VFromD PromoteUpperTo(D d, V v) { const Rebind, decltype(d)> dh; return PromoteTo(d, UpperHalf(dh, v)); } #endif // HWY_ARCH_ARM_A64 // ------------------------------ DemoteTo (ConvertTo) // From full vector to half or quarter template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovun_s32(v.raw)); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_s32(v.raw)); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { const uint16x4_t a = vqmovun_s32(v.raw); return Vec32(vqmovn_u16(vcombine_u16(a, a))); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovun_s16(v.raw)); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { const int16x4_t a = vqmovn_s32(v.raw); return Vec32(vqmovn_s16(vcombine_s16(a, a))); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_s16(v.raw)); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_u32(v.raw)); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec128 v) { const uint16x4_t a = vqmovn_u32(v.raw); return Vec32(vqmovn_u16(vcombine_u16(a, a))); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_u16(v.raw)); } // From half vector to partial half template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovun_s32(vcombine_s32(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovn_s32(vcombine_s32(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw)); return VFromD(vqmovn_u16(vcombine_u16(a, a))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovun_s16(vcombine_s16(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, 
v.raw)); return VFromD(vqmovn_s16(vcombine_s16(a, a))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovn_s16(vcombine_s16(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovn_u32(vcombine_u32(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw)); return VFromD(vqmovn_u16(vcombine_u16(a, a))); } template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) { return VFromD(vqmovn_u16(vcombine_u16(v.raw, v.raw))); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_s64(v.raw)); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovun_s64(v.raw)); } template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) { return Vec64(vqmovn_u64(v.raw)); } template HWY_API VFromD DemoteTo(D d, Vec128 v) { const Rebind di32; return DemoteTo(d, DemoteTo(di32, v)); } template HWY_API VFromD DemoteTo(D d, Vec128 v) { const Rebind du32; return DemoteTo(d, DemoteTo(du32, v)); } template HWY_API VFromD DemoteTo(D d, Vec128 v) { const Rebind du32; return DemoteTo(d, DemoteTo(du32, v)); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { return Vec32(vqmovn_s64(vcombine_s64(v.raw, v.raw))); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { return Vec32(vqmovun_s64(vcombine_s64(v.raw, v.raw))); } template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) { return Vec32(vqmovn_u64(vcombine_u64(v.raw, v.raw))); } template HWY_API VFromD DemoteTo(D d, Vec64 v) { const Rebind di32; return DemoteTo(d, DemoteTo(di32, v)); } template HWY_API VFromD DemoteTo(D d, Vec64 v) { const Rebind du32; return DemoteTo(d, DemoteTo(du32, v)); } template HWY_API VFromD DemoteTo(D d, Vec64 v) { const Rebind du32; return DemoteTo(d, DemoteTo(du32, v)); } #if HWY_NEON_HAVE_F16C // We already toggled HWY_NATIVE_F16C above. 
// f32 -> f16, full vector input.
template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) {
  return Vec64{vcvt_f16_f32(v.raw)};
}
// f32 -> f16, 64-bit input: duplicated into both halves of a full vector
// because vcvt_f16_f32 takes a 128-bit argument.
template HWY_API VFromD DemoteTo(D /* tag */, VFromD> v) {
  return VFromD(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
}

#endif  // HWY_NEON_HAVE_F16C

// Moves the upper 16 bits of every 32-bit lane into the lower 16 bits (no
// rounding is applied here), demotes to 16-bit lanes and reinterprets the
// result as the vector type of dbf16.
template HWY_API VFromD DemoteTo(D dbf16, VFromD> v) {
  const Rebind di32;
  const Rebind du32;  // for logical shift right
  const Rebind du16;
  const auto bits_in_32 = BitCast(di32, ShiftRight<16>(BitCast(du32, v)));
  return BitCast(dbf16, DemoteTo(du16, bits_in_32));
}

#if HWY_HAVE_FLOAT64

// f64 -> f32, full vector input.
template HWY_API Vec64 DemoteTo(D /* tag */, Vec128 v) {
  return Vec64(vcvt_f32_f64(v.raw));
}
// f64 -> f32, 64-bit input duplicated into a full vector first.
template HWY_API Vec32 DemoteTo(D /* tag */, Vec64 v) {
  return Vec32(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}

// Converts v to the lanes of tag d64 first, then demotes that to d32.
template HWY_API VFromD DemoteTo(D d32, VFromD> v) {
  const Rebind>, D> d64;
  return DemoteTo(d32, ConvertTo(d64, v));
}

#endif  // HWY_HAVE_FLOAT64

// 64-bit integer lanes (viewed through di64/du64) -> f32.
// A64 path: the input is split into its upper 52 and lower 12 bits, both are
// turned into f64, and the sum is adjusted using the error term (f64_carry)
// before the final f64 -> f32 DemoteTo.
// NOTE(review): the adjustment looks like a round-to-odd step that avoids
// double rounding in the final demotion - confirm.
// Other path: the input is split into 23 + 23 + 18 bit pieces which are
// converted to f32 separately and summed with a similar adjustment.
template HWY_API VFromD DemoteTo(D df32, VFromD> v) {
  const Rebind di64;
  const RebindToUnsigned du64;

#if HWY_ARCH_ARM_A64
  const RebindToFloat df64;

  // 2^64 + 2^63. Xor-ing the shifted bits into this constant's bit pattern
  // and subtracting the constant again yields the upper bits as an f64.
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
  const auto f64_lo12 =
      ConvertTo(df64, And(BitCast(du64, v), Set(du64, uint64_t{0x00000FFF})));

  // f64_carry is the part of f64_lo12 that did not make it into f64_sum.
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;

  // 1 in lanes where the sum was inexact (non-zero carry), else 0.
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  // 1 in inexact lanes where the sum and the carry have different signs.
  const auto f64_bits_decrement =
      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
          f64_sum_is_inexact);

  // Decrement the bit pattern where required, then set the lowest bit in
  // inexact lanes.
  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned du32;
  // Bits [63:41], [40:18] and [17:0] of the input as separate u32 vectors.
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(BitCast(du64, v)));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(BitCast(du64, v))),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 =
      And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x0003FFFFu}));

  const auto k2p41_f32 = Set(df32, 2199023255552.0f);           // 2^41
  const auto k2p64_63_f32 = Set(df32, 27670116110564327424.0f);  // 2^64 + 2^63

  // Insert each piece into the mantissa of a constant, then subtract the
  // constant again to obtain the piece as an f32.
  const auto hi23_f32 =
      BitCast(df32, Xor(hi23, BitCast(du32, k2p64_63_f32))) - k2p64_63_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  // Sum of the two upper pieces plus the error term of that addition.
  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  // Sum of the remaining low-order terms plus its own error term.
  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  // Where s_lo is inexact: decrement its bit pattern if s_lo and c_lo differ
  // in sign, then set its lowest bit.
  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}

// Unsigned counterpart of the overload above: same structure, but the upper
// bits are inserted with Or into the constant 2^64 instead of Xor into
// 2^64 + 2^63.
template HWY_API VFromD DemoteTo(D df32, VFromD> v) {
#if HWY_ARCH_ARM_A64
  const Rebind du64;
  const RebindToFloat df64;

  const auto k2p64 = Set(df64, 18446744073709551616.0);  // 2^64
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      ConvertTo(df64, And(v, Set(du64, uint64_t{0x00000FFF})));

  // f64_carry is the part of f64_lo12 that did not make it into f64_sum.
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  // 1 in lanes where the sum was inexact (non-zero carry), else 0.
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));

  // Decrement the bit pattern where the carry has its sign bit set, then set
  // the lowest bit in inexact lanes.
  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
         f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned du32;

  // Bits [63:41], [40:18] and [17:0] of the input as separate u32 vectors.
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(v));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(v)),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 = And(TruncateTo(du32, v), Set(du32, uint32_t{0x0003FFFFu}));

  const auto k2p41_f32 = Set(df32, 2199023255552.0f);        // 2^41
  const auto k2p64_f32 = Set(df32, 18446744073709551616.0f);  // 2^64

  // Insert each piece into the mantissa of a constant, then subtract the
  // constant again to obtain the piece as an f32.
  const auto hi23_f32 =
      BitCast(df32, Or(hi23, BitCast(du32, k2p64_f32))) - k2p64_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  // Sum of the two upper pieces plus the error term of that addition.
  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  // Sum of the remaining low-order terms plus its own error term.
  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  // Where s_lo is inexact: decrement its bit pattern if s_lo and c_lo differ
  // in sign, then set its lowest bit.
  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}

// Keeps one byte out of every four: two rounds of vuzp1q_u8, each retaining
// the even-indexed bytes, followed by taking the lower half.
HWY_API Vec32 U8FromU32(Vec128 v) {
  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
  return Vec32(vget_low_u8(vuzp1q_u8(w, w)));
}
// Same as above for inputs of at most 64 bits, using the 64-bit vuzp1_u8.
template HWY_API Vec128 U8FromU32(Vec128 v) {
  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
  const uint8x8_t w = vuzp1_u8(org_v, org_v);
  return Vec128(vuzp1_u8(w, w));
}

// ------------------------------ Round (IfThenElse, mask, logical)

#if HWY_ARCH_ARM_A64
// Toward nearest integer
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)

// Toward zero, aka truncate
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)

// Toward +infinity, aka ceiling
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)

// Toward -infinity, aka floor
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
#else

// ------------------------------ Trunc

// Armv7 only supports truncation to integer. We can either convert back to
// float (3 floating-point and 2 logic operations) or manipulate the binary32
// representation, clearing the lowest 23-exp mantissa bits. This requires 9
// integer operations and 3 constants, which is likely more expensive.
namespace detail {

// The original value is already the desired result if NaN or the magnitude is
// large (i.e. the value is already an integer).
// Returns a mask that is true only where |v| is below MantissaEnd(), i.e. for
// the lanes in which callers should use their integer-converted result.
template HWY_INLINE Mask128 UseInt(const Vec128 v) {
  return Abs(v) < Set(Simd(), MantissaEnd());
}

}  // namespace detail

// Rounds toward zero via a float -> int -> float round trip. Lanes rejected by
// detail::UseInt are returned unchanged.
template HWY_API Vec128 Trunc(const Vec128 v) {
  const DFromV df;
  const RebindToSigned di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), int_f, v);
}

// Rounds to an integer by adding and then subtracting a large constant that
// has the same sign as v, letting the FP addition perform the rounding.
template HWY_API Vec128 Round(const Vec128 v) {
  const DFromV df;

  // Armv7 also lacks a native NearestInt, but we can instead rely on rounding
  // (we assume the current mode is nearest-even) after addition with a large
  // value such that no mantissa bits remain. We may need a compiler flag for
  // precise floating-point to prevent this from being "optimized" out.
  const auto max = Set(df, MantissaEnd());
  const auto large = CopySignToAbs(max, v);
  const auto added = large + v;
  const auto rounded = added - large;

  // Keep original if NaN or the magnitude is large (already an int).
  return IfThenElse(Abs(v) < max, rounded, v);
}

// Rounds toward +infinity: truncate, then add 1 where truncation went down.
template HWY_API Vec128 Ceil(const Vec128 v) {
  const DFromV df;
  const RebindToSigned di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a positive non-integer ends up smaller; if so, add 1.
  // The mask is converted to a float vector named neg1, so subtracting it
  // below performs the "+1" only in the lanes where int_f < v.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));

  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
}

// Rounds toward -infinity: truncate, then subtract 1 where truncation went up.
template HWY_API Vec128 Floor(const Vec128 v) {
  const DFromV df;
  const RebindToSigned di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  // Same mask trick as Ceil, but added so that the affected lanes decrease.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));

  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
}

#endif  // HWY_ARCH_ARM_A64 (native rounding) vs. Armv7 emulation

// ------------------------------ NearestInt (Round)

#if HWY_ARCH_ARM_A64

// A64: direct float -> int32 conversion using the vcvtn* intrinsics.
HWY_API Vec128 NearestInt(const Vec128 v) {
  return Vec128(vcvtnq_s32_f32(v.raw));
}
// Overload using the non-q (64-bit register) form of the intrinsic.
template HWY_API Vec128 NearestInt(const Vec128 v) {
  return Vec128(vcvtn_s32_f32(v.raw));
}

#else

// Without the native instruction: Round() first, then convert to integer.
template HWY_API Vec128 NearestInt(const Vec128 v) {
  const RebindToSigned> di;
  return ConvertTo(di, Round(v));
}

#endif

// ------------------------------ Floating-point classification

// A lane is reported as NaN when it compares unequal to itself.
template HWY_API Mask128 IsNaN(const Vec128 v) {
  return v != v;
}

// ================================================== SWIZZLE

// ------------------------------ LowerHalf

// <= 64 bit: just return different type
template HWY_API Vec128 LowerHalf(Vec128 v) {
  return Vec128(v.raw);
}

// Full vectors: one overload per lane type, each forwarding to the matching
// vget_low_* intrinsic.
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_u8(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_u16(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_u32(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_u64(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_s8(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_s16(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_s32(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_s64(v.raw));
}
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_f32(v.raw));
}
#if HWY_HAVE_FLOAT16
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_HAVE_FLOAT64
HWY_API Vec64 LowerHalf(Vec128 v) {
  return Vec64(vget_low_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Generic fallback: bit-casts to the Full128 tag du, takes the lower half
// there and bit-casts the result back to the half-width tag dh.
template HWY_API VFromD>> LowerHalf(V v) {
  const Full128 du;
  const Half> dh;
  return BitCast(dh, LowerHalf(BitCast(du, v)));
}

// Overload taking a (unused) half-vector tag; forwards to the tag-less form.
template HWY_API VFromD LowerHalf(DH /* tag */, VFromD> v) {
  return LowerHalf(v);
}

// 
------------------------------ CombineShiftRightBytes

// 128-bit
// Treats hi:lo as one 32-byte value (lo supplies the least-significant bytes),
// skips its lowest kBytes bytes and returns the following 16 via vextq_u8.
template > HWY_API Vec128 CombineShiftRightBytes(D d, Vec128 hi, Vec128 lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128(v8));
}

// 64-bit
// Same operation on 8-byte vectors, using vext_u8.
template > HWY_API Vec64 CombineShiftRightBytes(D d, Vec64 hi, Vec64 lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, VFromD(v8));
}

// <= 32-bit defined after ShiftLeftBytes.

// ------------------------------ Shift vector by constant #bytes

namespace detail {

// Partially specialize because kBytes = 0 and >= size are compile errors;
// callers replace the latter with 0xFF for easier specialization.
template struct ShiftLeftBytesT {
  // Full
  // Shifting the pair v:0 right by (16 - kBytes) bytes leaves v moved up by
  // kBytes bytes with zeros shifted in below.
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    const Full128 d;
    return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
  }

  // Partial
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    // Expand to 64-bit so we only use the native EXT instruction.
    const Full64 d64;
    const auto zero64 = Zero(d64);
    const decltype(zero64) v64(v.raw);
    return Vec128(
        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
  }
};
// kBytes == 0: nothing to shift, return the input.
template <> struct ShiftLeftBytesT<0> {
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    return v;
  }
};
// Sentinel 0xFF (shift amount >= vector size): Xor(v, v) yields all zeros.
template <> struct ShiftLeftBytesT<0xFF> {
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    return Xor(v, v);
  }
};

// Right-shift counterpart: combines a zero vector (as the upper part) with v.
template struct ShiftRightBytesT {
  template HWY_INLINE Vec128 operator()(Vec128 v) {
    const DFromV d;
    // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
    if (d.MaxBytes() < 8) {
      constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
      const Simd dreg;
      v = Vec128(
          IfThenElseZero(FirstN(dreg, N), VFromD(v.raw)).raw);
    }
    return CombineShiftRightBytes(d, Zero(d), v);
  }
};
// kBytes == 0: nothing to shift, return the input.
template <> struct ShiftRightBytesT<0> {
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    return v;
  }
};
// Sentinel 0xFF (shift amount >= vector size): Xor(v, v) yields all zeros.
template <> struct ShiftRightBytesT<0xFF> {
  template HWY_INLINE Vec128 operator()(const Vec128 v) {
    return Xor(v, v);
  }
};

}  // namespace detail

// Shifts v left by kBytes bytes. Amounts of at least d.MaxBytes() are mapped
// to the 0xFF specialization above, which returns zero.
template HWY_API VFromD ShiftLeftBytes(D d, VFromD v) {
  return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v);
}

// Tag-less overload: derives the tag from the vector type.
template HWY_API Vec128 ShiftLeftBytes(Vec128 v) {
  return ShiftLeftBytes(DFromV(), v);
}

// Lane-granular left shift, implemented as a byte shift on the d8 view of v.
template HWY_API VFromD ShiftLeftLanes(D d, VFromD v) {
  const Repartition d8;
  return BitCast(d, ShiftLeftBytes)>(BitCast(d8, v)));
}

// Tag-less overload: derives the tag from the vector type.
template HWY_API Vec128 ShiftLeftLanes(Vec128 v) {
  return ShiftLeftLanes(DFromV(), v);
}

// 0x01..0F, kBytes = 1 => 0x0001..0E
// Shifts v right by kBytes bytes. Amounts of at least d.MaxBytes() are mapped
// to the 0xFF specialization above, which returns zero.
template HWY_API VFromD ShiftRightBytes(D d, VFromD v) {
  return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(
      v);
}

// Lane-granular right shift, implemented as a byte shift on the d8 view of v.
template HWY_API VFromD ShiftRightLanes(D d, VFromD v) {
  const Repartition d8;
  return BitCast(
      d, ShiftRightBytes)>(d8, BitCast(d8, v)));
}

// Calls ShiftLeftBytes
// CombineShiftRightBytes for vectors smaller than 64 bits (the "<= 32-bit"
// case announced above): both inputs are widened to 64-bit vectors so the
// 64-bit overload can do the extraction.
template HWY_API VFromD CombineShiftRightBytes(D d, VFromD hi, VFromD lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition d8;
  const Full64 d_full8;
  const Repartition, decltype(d_full8)> d_full;
  using V64 = VFromD;
  const V64 hi64(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
  // lo64 was moved up by (8 - kSize) bytes, so the extraction offset grows by
  // the same amount.
  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
  // After casting to full 64-bit vector of correct type, shrink to 32-bit
  return VFromD(BitCast(d_full, r).raw);
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input
// One overload per lane type, each forwarding to the matching vget_high_*
// intrinsic; the tag argument is unused.
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_u8(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_u16(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_u32(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_u64(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_s8(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_s16(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_s32(v.raw));
}
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_s64(v.raw));
}
#if HWY_HAVE_FLOAT16
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_f16(v.raw));
}
#endif
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_f32(v.raw));
}
#if HWY_HAVE_FLOAT64
template HWY_API Vec64 UpperHalf(D /* tag */, Vec128 v) {
  return Vec64(vget_high_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Fallback: bit-casts to the unsigned tag du, takes the upper half there and
// bit-casts the result back to dh.
template HWY_API VFromD UpperHalf(D dh, VFromD> v) {
  const RebindToUnsigned> du;
  const Half duh;
  return BitCast(dh, UpperHalf(duh, BitCast(du, v)));
}

// Partial
// Moves the upper half down with ShiftRightBytes on the unsigned view, then
// rewraps the raw register as the half-width vector type.
template HWY_API VFromD UpperHalf(DH dh, VFromD> v) {
  const Twice d;
  const RebindToUnsigned du;
  const VFromD upper = ShiftRightBytes(du, BitCast(du, v));
  return VFromD(BitCast(d, upper).raw);
}

// ------------------------------ Broadcast/splat any lane

// Overload that returns its input unchanged (NOTE(review): presumably the
// single-lane case; the template header is not visible here - confirm).
template HWY_API Vec128 Broadcast(Vec128 v) {
  return v;
}

// Every overload below static_asserts that kLane is in range and then
// replicates lane kLane of v into all lanes of the result.

#if HWY_ARCH_ARM_A64
// Unsigned
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128(vdupq_laneq_u8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_laneq_u16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_laneq_u32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128(vdupq_laneq_u64(v.raw, kLane));
}

// Signed
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128(vdupq_laneq_s8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_laneq_s16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_laneq_s32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128(vdupq_laneq_s64(v.raw, kLane));
}

// Float
#if HWY_HAVE_FLOAT16
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_laneq_f16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_f16(v.raw, kLane));
}
#endif  // HWY_HAVE_FLOAT16

template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_laneq_f32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_f32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128(vdupq_laneq_f64(v.raw, kLane));
}

#else  // !HWY_ARCH_ARM_A64
// No vdupq_laneq_* on armv7: use vgetq_lane_* + vdupq_n_*.

// Unsigned
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_u32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
}

// Signed
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s8(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s16(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_s32(v.raw, kLane));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
}

// Float
#if HWY_HAVE_FLOAT16
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
}
#endif  // HWY_HAVE_FLOAT16
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
}
template HWY_API Vec128 Broadcast(Vec128 v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128(vdup_lane_f32(v.raw, kLane));
}

#endif  // HWY_ARCH_ARM_A64

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
// Thin wrapper around a raw NEON register holding table-lookup indices.
// IndicesFromVec below stores byte-granular indices in it.
template struct Indices128 {
  typename detail::Raw128::type raw;
};

namespace detail {

// IndicesFromVecBroadcastLaneBytes returns a byte-shuffle pattern which, for
// every byte position, names the first byte of the lane containing it. Used by
// IndicesFromVec to copy the low byte of each lane index to all of that
// lane's bytes.

// Identity pattern 0, 1, 2, ... (NOTE(review): presumably the 1-byte-lane
// overload; the template header is not visible here).
template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition d8;
  return Iota(d8, 0);
}

// Pattern for lanes spanning 2 bytes.
template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
}

// Pattern for lanes spanning 4 bytes.
template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
}

// Pattern for lanes spanning 8 bytes.
template HWY_INLINE VFromD> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

// IndicesFromVecByteOffsets returns, for every byte position, that byte's
// offset within its lane. IndicesFromVec adds it to the scaled lane index.

// All-zero offsets (NOTE(review): presumably the 1-byte-lane overload).
template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) {
  const Repartition d8;
  return Zero(d8);
}

// Offsets within 2-byte lanes.
template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

// Offsets within 4-byte lanes.
template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

// Offsets within 8-byte lanes.
template HWY_INLINE VFromD> IndicesFromVecByteOffsets(D d) {
  const Repartition d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail

// Wraps an index vector without transforming it (NOTE(review): presumably the
// 1-byte-lane overload, where lane indices already are byte indices).
// Debug builds assert that every index is below 2 * MaxLanes(d).
template HWY_API Indices128, MaxLanes(D())> IndicesFromVec(
    D d, Vec128 vec) {
  using T = TFromD;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned du;
  using TU = TFromD;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128, MaxLanes(D())>{BitCast(d, vec).raw};
}

// Converts a vector of lane indices into the byte indices expected by
// TableLookupBytes. Debug builds assert that every index is below
// 2 * MaxLanes(d).
template HWY_API Indices128, MaxLanes(D())> IndicesFromVec(
    D d, Vec128 vec) {
  using T = TFromD;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned du;
  using TU = TFromD;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast(MaxLanes(d) * 2)))));
#endif

  const Repartition d8;
  using V8 = VFromD;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  // Shift count is log2(sizeof(T)), turning a lane index into the index of
  // that lane's first byte.
  constexpr int kIndexShiftAmt = static_cast(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft(lane_indices);
  // Add each byte's offset within its lane to address the individual bytes.
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128, MaxLanes(D())>{BitCast(d, sum).raw};
}

// Loads indices from (possibly unaligned) memory via LoadU and converts them
// with IndicesFromVec.
template HWY_API Indices128, MaxLanes(D())> SetTableIndices(D d, const TI* idx) {
  const Rebind di;
  return IndicesFromVec(d, LoadU(di, idx));
}

// Permutes the lanes of v according to idx. The stored indices are handed to
// TableLookupBytes as-is, after viewing both operands through the signed tag.
template HWY_API Vec128 TableLookupLanes(Vec128 v, Indices128 idx) {
  const DFromV d;
  const RebindToSigned di;
  return BitCast(
      d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128{idx.raw})));
}

// Two-table lookup for partial vectors: a and b are combined into one
// double-width table (a in the lower half) and the lower half of the
// single-table lookup result is returned.
template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b,
                                              Indices128 idx) {
  const DFromV d;
  const Twice dt;
// TableLookupLanes currently requires table and index vectors to be the same
// size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  // Under MSAN, build a double-width index vector from two copies of idx
  // instead of reusing idx.raw directly.
  const Vec128 idx_vec{idx.raw};
  const Indices128 idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128 idx2{idx.raw};
#endif
  return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
}

// 64-bit vectors: a single byte-table instruction covers both tables.
template HWY_API Vec64 TwoTablesLookupLanes(Vec64 a, Vec64 b,
                                             Indices128 idx) {
  const DFromV d;
  const Repartition du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec64{idx.raw});

#if HWY_ARCH_ARM_A64
  // A64: combine into one 16-byte table (a in the lower half) for vqtbl1_u8.
  const Twice dt_u8;
  return BitCast(
      d, Vec64{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)});
#else
  // Armv7: pass a and b as a two-register table to vtbl2_u8.
  detail::Tuple2 tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec64{vtbl2_u8(tup.raw, idx_u8.raw)});
#endif
}

// 128-bit vectors.
template HWY_API Vec128 TwoTablesLookupLanes(Vec128 a, Vec128 b,
                                              Indices128 idx) {
  const DFromV d;
  const Repartition du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec128{idx.raw});

#if HWY_ARCH_ARM_A64
  // A64: a and b form a two-register table for vqtbl2q_u8.
  detail::Tuple2 tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec128{vqtbl2q_u8(tup.raw, idx_u8.raw)});
#else
  // Armv7: split a and b into four 64-bit table registers and run vtbl4_u8
  // once per half of the index vector, then recombine the two results.
  const Half dh;
  const Repartition dh_u8;
  const auto a_lo_u8 = LowerHalf(dh_u8, a_u8);
  const auto a_hi_u8 = UpperHalf(dh_u8, a_u8);
  const auto b_lo_u8 = LowerHalf(dh_u8, b_u8);
  const auto b_hi_u8 = UpperHalf(dh_u8, b_u8);
  const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8);
  const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8);

  detail::Tuple4 tup = {
      {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}};
  const auto lo_result =
      BitCast(dh, Vec64{vtbl4_u8(tup.raw, idx_lo_u8.raw)});
  const auto hi_result =
      BitCast(dh, Vec64{vtbl4_u8(tup.raw, idx_hi_u8.raw)});
  return Combine(d, hi_result, lo_result);
#endif
}

// ------------------------------ Reverse2 (CombineShiftRightBytes)

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

// Reverse2 swaps the two lanes of every adjacent pair. Each overload reverses
// the lane-sized elements inside a container twice the lane size
// (vrev16 on bytes, vrev32 on u16, vrev64 on u32), on the unsigned view of v.
// The overloads using VFromD call the non-q intrinsic; those using Vec128
// call the q (128-bit) form.
template HWY_API VFromD Reverse2(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev16_u8(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 Reverse2(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev16q_u8(BitCast(du, v).raw)));
}

template HWY_API VFromD Reverse2(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev32_u16(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 Reverse2(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev32q_u16(BitCast(du, v).raw)));
}

template HWY_API VFromD Reverse2(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev64_u32(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 4)> HWY_API Vec128 Reverse2(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev64q_u32(BitCast(du, v).raw)));
}

// Remaining overload: rotating v:v by 8 bytes swaps the two 64-bit halves.
template HWY_API VFromD Reverse2(D d, VFromD v) {
  return CombineShiftRightBytes<8>(d, v, v);
}

// ------------------------------ Reverse4 (Reverse2)

// Reverse4 reverses the lanes within every group of four, again via the
// vrev* intrinsic whose container spans four lanes.
template HWY_API VFromD Reverse4(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev32_u8(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 Reverse4(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev32q_u8(BitCast(du, v).raw)));
}

template HWY_API VFromD Reverse4(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev64_u16(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 2)> HWY_API Vec128 Reverse4(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev64q_u16(BitCast(du, v).raw)));
}

// No single instruction here: swap neighbours first, then swap the resulting
// pairs by applying Reverse2 on the double-width view.
template HWY_API VFromD Reverse4(D d, VFromD v) {
  const RepartitionToWide> duw;
  return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v))));
}

// Stub that only asserts: it must never actually be called.
template HWY_API VFromD Reverse4(D /* tag */, VFromD) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes
}

// ------------------------------ Reverse8 (Reverse2, Reverse4)

// Reverse8 reverses the lanes within every group of eight. For bytes this is
// vrev64 on the unsigned view.
template HWY_API VFromD Reverse8(D d, VFromD v) {
  const RebindToUnsigned du;
  return BitCast(d, VFromD(vrev64_u8(BitCast(du, v).raw)));
}
template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec128 Reverse8(D d, Vec128 v) {
  const RebindToUnsigned du;
  return BitCast(d, Vec128(vrev64q_u8(BitCast(du, v).raw)));
}

// Reverse each group of four lanes, then swap the two halves through the
// du64 view.
template HWY_API VFromD Reverse8(D d, VFromD v) {
  const Repartition du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v))));
}

// Stub that only asserts: it must never actually be called.
template HWY_API VFromD Reverse8(D, VFromD) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}

// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8)

// Reverse of the whole vector, dispatched on the lane count given by
// HWY_IF_LANES_D: 1 lane is a no-op, 2/4/8 lanes map directly to
// Reverse2/4/8.
template , HWY_IF_LANES_D(D, 1)> HWY_API Vec128 Reverse(D /* tag */, Vec128 v) {
  return v;
}

template , HWY_IF_LANES_D(D, 2)> HWY_API Vec128 Reverse(D d, Vec128 v) {
  return Reverse2(d, v);
}

template , HWY_IF_LANES_D(D, 4)> HWY_API Vec128 Reverse(D d, Vec128 v) {
  return Reverse4(d, v);
}

template , HWY_IF_LANES_D(D, 8)> HWY_API Vec128 Reverse(D d, Vec128 v) {
  return Reverse8(d, v);
}

// 16 lanes: reverse each group of eight, then swap the two halves through the
// du64 view.
template , HWY_IF_LANES_D(D, 16)> HWY_API Vec128 Reverse(D d, Vec128 v) {
  const Repartition du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v))));
}

// ------------------------------ ReverseBits

#if HWY_ARCH_ARM_A64

// Toggle the per-target flag, following the same pattern as
// HWY_NATIVE_REVERSE2_8 above.
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

// ReverseBits for signed and unsigned 8-bit lanes, generated from the vrbit
// intrinsic family with one vector argument.
HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1)

#endif  // HWY_ARCH_ARM_A64

// ------------------------------ Other shuffles (TableLookupBytes)

// Notation: let Vec128 have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.
// Swap 64-bit halves template HWY_API Vec128 Shuffle1032(Vec128 v) { return CombineShiftRightBytes<8>(DFromV(), v, v); } template HWY_API Vec128 Shuffle01(Vec128 v) { return CombineShiftRightBytes<8>(DFromV(), v, v); } // Rotate right 32 bits template HWY_API Vec128 Shuffle0321(Vec128 v) { return CombineShiftRightBytes<4>(DFromV(), v, v); } // Rotate left 32 bits template HWY_API Vec128 Shuffle2103(Vec128 v) { return CombineShiftRightBytes<12>(DFromV(), v, v); } // Reverse template HWY_API Vec128 Shuffle0123(Vec128 v) { return Reverse4(DFromV(), v); } // ------------------------------ InterleaveLower // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides // the least-significant lane) and "b". To concatenate two half-width integers // into one, use ZipLower/Upper instead (also works with scalar). HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveLower, vzip1, _, 2) #if HWY_ARCH_ARM_A64 // N=1 makes no sense (in that case, there would be no upper/lower). HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveLower, vzip1, _, 2) #else // Emulated version for Armv7. template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { const DFromV d; return CombineShiftRightBytes<8>(d, b, Shuffle01(a)); } #endif #if !HWY_HAVE_FLOAT16 template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { const DFromV d; const RebindToUnsigned du; return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b))); } #endif // !HWY_HAVE_FLOAT16 // < 64 bit parts template HWY_API Vec128 InterleaveLower(Vec128 a, Vec128 b) { return Vec128(InterleaveLower(Vec64(a.raw), Vec64(b.raw)).raw); } // Additional overload for the optional Simd<> tag. template HWY_API VFromD InterleaveLower(D /* tag */, VFromD a, VFromD b) { return InterleaveLower(a, b); } // ------------------------------ InterleaveUpper (UpperHalf) // All functions inside detail lack the required D parameter. 
namespace detail {

// Tag-less InterleaveUpper, generated from the vzip2 intrinsic family with two
// vector arguments.
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveUpper, vzip2, _, 2)

#if HWY_ARCH_ARM_A64
// N=1 makes no sense (in that case, there would be no upper/lower).
HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveUpper, vzip2, _, 2)
#else
// Emulated version for Armv7.
// Shuffle01(b) moves b's upper 64 bits into the lower half; extracting the
// middle 16 bytes of Shuffle01(b):a then yields a's upper half followed by
// b's upper half.
template HWY_API Vec128 InterleaveUpper(Vec128 a, Vec128 b) {
  const DFromV d;
  return CombineShiftRightBytes<8>(d, Shuffle01(b), a);
}
#endif
}  // namespace detail

// Full register
// Public entry point: the tag is unused, the work is done by the detail
// overloads above.
template HWY_API VFromD InterleaveUpper(D /* tag */, VFromD a, VFromD b) {
  return detail::InterleaveUpper(a, b);
}

// Partial
// Moves the upper halves of a and b into the lower positions (rewrapped as
// vectors of the original type), then interleaves those with InterleaveLower.
template HWY_API VFromD InterleaveUpper(D d, VFromD a, VFromD b) {
  const Half d2;
  const VFromD a2(UpperHalf(d2, a).raw);
  const VFromD b2(UpperHalf(d2, b).raw);
  return InterleaveLower(d, a2, b2);
}

// ------------------------------ ZipLower/ZipUpper (InterleaveLower)

// Same as Interleave*, except that the return lanes are double-width integers;
// this is necessary because the single-lane scalar cannot return two values.
template >> HWY_API VFromD ZipLower(V a, V b) { return BitCast(DW(), InterleaveLower(a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipLower(DW dw, V a, V b) { return BitCast(dw, InterleaveLower(D(), a, b)); } template , class DW = RepartitionToWide> HWY_API VFromD ZipUpper(DW dw, V a, V b) { return BitCast(dw, InterleaveUpper(D(), a, b)); } // ------------------------------ Per4LaneBlockShuffle namespace detail { #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #else #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 #endif template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t /*x3*/, const uint32_t /*x2*/, const uint32_t x1, const uint32_t x0) { typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8))); const GccU32RawVectType raw = {x0, x1}; return ResizeBitCast(d, Vec64(reinterpret_cast(raw))); } template HWY_INLINE VFromD Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, const uint32_t x2, const uint32_t x1, const uint32_t x0) { typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); const GccU32RawVectType raw = {x0, x1, x2, x3}; return ResizeBitCast(d, Vec128(reinterpret_cast(raw))); } #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, hwy::SizeTag /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; const auto evens = BitCast(dw, ConcatEven(d, v, v)); return BitCast(d, InterleaveLower(dw, evens, evens)); } template , 4)> HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, hwy::SizeTag /*lane_size_tag*/, hwy::SizeTag /*vect_size_tag*/, V v) { const DFromV d; const RebindToUnsigned du; const RepartitionToWide dw; const auto odds = BitCast(dw, ConcatOdd(d, v, v)); return BitCast(d, InterleaveLower(dw, odds, odds)); } template HWY_INLINE V 
Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, hwy::SizeTag<2> /*lane_size_tag*/, hwy::SizeTag<8> /*vect_size_tag*/, V v) { const DFromV d; return InterleaveUpper(d, v, v); } } // namespace detail // ------------------------------ SlideUpLanes namespace detail { template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; using TU = UnsignedFromSize; const Repartition du; return BitCast(d, BitCast(du, v) << Set( du, static_cast(amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideUpLanes(V v, size_t amt) { const DFromV d; const Repartition du8; const auto idx = Iota(du8, static_cast(size_t{0} - amt * sizeof(TFromV))); return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); } } // namespace detail template HWY_API VFromD SlideUpLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); } } #else (void)d; 
#endif return detail::SlideUpLanes(v, amt); } template HWY_API VFromD SlideUpLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftLeftLanes<1>(d, v); case 2: return ShiftLeftLanes<2>(d, v); case 3: return ShiftLeftLanes<3>(d, v); case 4: return ShiftLeftLanes<4>(d, v); case 5: return ShiftLeftLanes<5>(d, v); case 6: return ShiftLeftLanes<6>(d, v); case 7: return ShiftLeftLanes<7>(d, v); case 8: return ShiftLeftLanes<8>(d, v); case 9: return ShiftLeftLanes<9>(d, v); case 10: return ShiftLeftLanes<10>(d, v); case 11: return ShiftLeftLanes<11>(d, v); case 12: return ShiftLeftLanes<12>(d, v); case 13: return ShiftLeftLanes<13>(d, v); case 14: return ShiftLeftLanes<14>(d, v); case 15: return ShiftLeftLanes<15>(d, v); } } #else (void)d; #endif return detail::SlideUpLanes(v, amt); } // ------------------------------ SlideDownLanes namespace detail { template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; using TU = UnsignedFromSize; const Repartition du; return BitCast(d, BitCast(du, v) << Set( du, static_cast(TU{0} - amt * sizeof(TFromV) * 8))); } template HWY_INLINE V SlideDownLanes(V v, size_t amt) { const DFromV d; const Repartition di8; auto idx = Iota(di8, static_cast(amt * sizeof(TFromV))); idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); } } // namespace detail template HWY_API VFromD SlideDownLanes(D /*d*/, VFromD v, size_t /*amt*/) { return v; } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang if (__builtin_constant_p(amt)) { switch (amt) { case 0: return v; case 1: return ShiftRightLanes<1>(d, v); } } #else (void)d; #endif return detail::SlideDownLanes(v, amt); } template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) { #if !HWY_IS_DEBUG_BUILD && 
HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0: return v;
      case 1: return ShiftRightLanes<1>(d, v);
      case 2: return ShiftRightLanes<2>(d, v);
      case 3: return ShiftRightLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

// In non-debug GCC/clang builds, an `amt` that is a compile-time constant in
// [0, 7] is routed to the matching ShiftRightLanes<N>. Every other case
// (including all other compilers) falls through to detail::SlideDownLanes.
template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0: return v;
      case 1: return ShiftRightLanes<1>(d, v);
      case 2: return ShiftRightLanes<2>(d, v);
      case 3: return ShiftRightLanes<3>(d, v);
      case 4: return ShiftRightLanes<4>(d, v);
      case 5: return ShiftRightLanes<5>(d, v);
      case 6: return ShiftRightLanes<6>(d, v);
      case 7: return ShiftRightLanes<7>(d, v);
    }
  }
#else
  (void)d;  // only used by the constant-amt fast path
#endif

  return detail::SlideDownLanes(v, amt);
}

// Same dispatch as above, with constant `amt` handled for [0, 15].
template HWY_API VFromD SlideDownLanes(D d, VFromD v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0: return v;
      case 1: return ShiftRightLanes<1>(d, v);
      case 2: return ShiftRightLanes<2>(d, v);
      case 3: return ShiftRightLanes<3>(d, v);
      case 4: return ShiftRightLanes<4>(d, v);
      case 5: return ShiftRightLanes<5>(d, v);
      case 6: return ShiftRightLanes<6>(d, v);
      case 7: return ShiftRightLanes<7>(d, v);
      case 8: return ShiftRightLanes<8>(d, v);
      case 9: return ShiftRightLanes<9>(d, v);
      case 10: return ShiftRightLanes<10>(d, v);
      case 11: return ShiftRightLanes<11>(d, v);
      case 12: return ShiftRightLanes<12>(d, v);
      case 13: return ShiftRightLanes<13>(d, v);
      case 14: return ShiftRightLanes<14>(d, v);
      case 15: return ShiftRightLanes<15>(d, v);
    }
  }
#else
  (void)d;  // only used by the constant-amt fast path
#endif

  return detail::SlideDownLanes(v, amt);
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

#if HWY_NEON_HAVE_BFLOAT16

// Native bf16 dot product: the result is accumulated into sum0 only; the
// sum1 parameter is unnamed and left untouched.
template HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, Vec128 a, Vec128 b,
                                                 const Vec128 sum0,
                                                 Vec128& /*sum1*/) {
  return Vec128(vbfdotq_f32(sum0.raw, a.raw, b.raw));
}

// Same as above using the non-q intrinsic (vbfdot_f32).
template HWY_API VFromD ReorderWidenMulAccumulate(
    D /*d32*/, VFromD> a, VFromD> b, const VFromD sum0,
    VFromD& /*sum1*/) {
  return VFromD(vbfdot_f32(sum0.raw, a.raw, b.raw));
}

#else

// Emulation without the bf16 dot product. Each 32-bit lane of a/b is split
// into its lower 16 bits (shifted left by 16 -> `ae`/`be`) and its upper
// 16 bits (masked with 0xFFFF0000 -> `ao`/`bo`); both are reinterpreted as
// f32. Products of the upper parts accumulate into sum1, products of the
// lower parts into the returned sum0.
template >> HWY_API VFromD ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b,
                                                    const VFromD sum0,
                                                    VFromD& sum1) {
  const RebindToUnsigned du32;
  using VU32 = VFromD;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
}

#endif  // HWY_NEON_HAVE_BFLOAT16

// Signed 16-bit inputs: products of the upper halves of a/b accumulate into
// sum1 (vmlal_high_s16 on A64, otherwise vmlal_s16 on UpperHalf), products of
// the lower halves into the returned sum0.
template HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, Vec128 a, Vec128 b,
                                                 const Vec128 sum0,
                                                 Vec128& sum1) {
#if HWY_ARCH_ARM_A64
  sum1 = Vec128(vmlal_high_s16(sum1.raw, a.raw, b.raw));
#else
  const Full64 dh;
  sum1 = Vec128(
      vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  return Vec128(
      vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}

template HWY_API Vec64 ReorderWidenMulAccumulate(D d32, Vec64 a, Vec64 b,
                                                const Vec64 sum0,
                                                Vec64& sum1) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128 mul_3210(vmull_s16(a.raw, b.raw));
  const Vec64 mul_32 = UpperHalf(d32, mul_3210);
  sum1 += mul_32;
  return sum0 + LowerHalf(mul_3210);
}

// Only the lower 64 bits of the widened product are used: its lower half is
// added to sum0 (returned), its upper half to sum1.
template HWY_API Vec32 ReorderWidenMulAccumulate(D d32, Vec32 a, Vec32 b,
                                                const Vec32 sum0,
                                                Vec32& sum1) {
  const Vec128 mul_xx10(vmull_s16(a.raw, b.raw));
  const Vec64 mul_10(LowerHalf(mul_xx10));
  const Vec32 mul0 = LowerHalf(d32, mul_10);
  const Vec32 mul1 = UpperHalf(d32, mul_10);
  sum1 += mul1;
  return sum0 + mul0;
}

// Unsigned counterparts of the three overloads above (vmlal_u16 / vmull_u16).
template HWY_API Vec128 ReorderWidenMulAccumulate(D /*d32*/, Vec128 a, Vec128 b,
                                                 const Vec128 sum0,
                                                 Vec128& sum1) {
#if HWY_ARCH_ARM_A64
  sum1 = Vec128(vmlal_high_u16(sum1.raw, a.raw, b.raw));
#else
  const Full64 dh;
  sum1 = Vec128(
      vmlal_u16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  return Vec128(
      vmlal_u16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}

template HWY_API Vec64 ReorderWidenMulAccumulate(D d32, Vec64 a, Vec64 b,
                                                const Vec64 sum0,
                                                Vec64& sum1) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128 mul_3210(vmull_u16(a.raw, b.raw));
  const Vec64 mul_32 = UpperHalf(d32, mul_3210);
  sum1 += mul_32;
  return sum0 + LowerHalf(mul_3210);
}

template HWY_API Vec32 ReorderWidenMulAccumulate(D du32, Vec32 a, Vec32 b,
                                                const Vec32 sum0,
                                                Vec32& sum1) {
  const Vec128 mul_xx10(vmull_u16(a.raw, b.raw));
  const Vec64 mul_10(LowerHalf(mul_xx10));
  const Vec32 mul0 = LowerHalf(du32, mul_10);
  const Vec32 mul1 = UpperHalf(du32, mul_10);
  sum1 += mul1;
  return sum0 + mul0;
}

// ------------------------------ Combine partial (InterleaveLower)
// < 64bit input, <= 64 bit result
template HWY_API VFromD Combine(D d, VFromD> hi, VFromD> lo) {
  // First double N (only lower halves will be used).
  const VFromD hi2(hi.raw);
  const VFromD lo2(lo.raw);
  // Repartition to two unsigned lanes (each the size of the valid input).
  const Simd, 2, 0> du;
  // Interleaving the two single lanes yields {lo, hi}.
  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
}

// ------------------------------ RearrangeToOddPlusEven (Combine)

// With native bf16 support sum1 is ignored and sum0 is returned as-is;
// otherwise the two partial sums are added.
template HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, Vec128 sum1) {
#if HWY_NEON_HAVE_BFLOAT16
  (void)sum1;  // unused by bf16 ReorderWidenMulAccumulate
  return sum0;
#else
  return Add(sum0, sum1);
#endif
}

HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, Vec128 sum1) {
// vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
#if HWY_ARCH_ARM_A64  // pairwise sum is available and what we want
  return Vec128(vpaddq_s32(sum0.raw, sum1.raw));
#else
  // No 128-bit pairwise add here: pairwise-add the halves of each sum
  // separately, then place sum0's result low and sum1's result high.
  const Full128 d;
  const Half d64;
  const Vec64 hi(
      vpadd_s32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
  const Vec64 lo(
      vpadd_s32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
  return Combine(Full128(), hi, lo);
#endif
}

HWY_API Vec64 RearrangeToOddPlusEven(Vec64 sum0, Vec64 sum1) {
  // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
  return Vec64(vpadd_s32(sum0.raw, sum1.raw));
}

HWY_API Vec32 RearrangeToOddPlusEven(Vec32 sum0, Vec32 sum1) {
  // Only one widened sum per register, so add them for sum of odd and even.
  return sum0 + sum1;
}

HWY_API Vec128 RearrangeToOddPlusEven(Vec128 sum0, Vec128 sum1) {
// vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
#if HWY_ARCH_ARM_A64  // pairwise sum is available and what we want
  return Vec128(vpaddq_u32(sum0.raw, sum1.raw));
#else
  // Same half-wise pairwise-add fallback as the signed overload.
  const Full128 d;
  const Half d64;
  const Vec64 hi(
      vpadd_u32(LowerHalf(d64, sum1).raw, UpperHalf(d64, sum1).raw));
  const Vec64 lo(
      vpadd_u32(LowerHalf(d64, sum0).raw, UpperHalf(d64, sum0).raw));
  return Combine(Full128(), hi, lo);
#endif
}

HWY_API Vec64 RearrangeToOddPlusEven(Vec64 sum0, Vec64 sum1) {
  // vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
  return Vec64(vpadd_u32(sum0.raw, sum1.raw));
}

HWY_API Vec32 RearrangeToOddPlusEven(Vec32 sum0, Vec32 sum1) {
  // Only one widened sum per register, so add them for sum of odd and even.
  return sum0 + sum1;
}

// ------------------------------ WidenMulPairwiseAdd

#if HWY_NEON_HAVE_BFLOAT16

// Native bf16 dot product with a zero accumulator.
template HWY_API Vec128 WidenMulPairwiseAdd(D d32, Vec128 a, Vec128 b) {
  return Vec128(vbfdotq_f32(Zero(d32).raw, a.raw, b.raw));
}

template HWY_API VFromD WidenMulPairwiseAdd(D d32, VFromD> a, VFromD> b) {
  return VFromD(vbfdot_f32(Zero(d32).raw, a.raw, b.raw));
}

#else

// Emulation: same lower/upper 16-bit split as the emulated
// ReorderWidenMulAccumulate, but the two products are summed directly
// (ae * be + ao * bo) instead of going into separate accumulators.
template HWY_API VFromD WidenMulPairwiseAdd(
    D32 df32, VFromD> a, VFromD> b) {
  const RebindToUnsigned du32;
  using VU32 = VFromD;
  const VU32 odd = Set(du32, 0xFFFF0000u);
  const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
  const VU32 ao = And(BitCast(du32, a), odd);
  const VU32 be = ShiftLeft<16>(BitCast(du32, b));
  const VU32 bo = And(BitCast(du32, b), odd);
  return MulAdd(BitCast(df32, ae), BitCast(df32, be),
                Mul(BitCast(df32, ao), BitCast(df32, bo)));
}

#endif  // HWY_NEON_HAVE_BFLOAT16

// Signed 16-bit: widening multiply of the upper halves (sum1) and lower
// halves (sum0), then RearrangeToOddPlusEven combines them.
template HWY_API Vec128 WidenMulPairwiseAdd(D /*d32*/, Vec128 a, Vec128 b) {
  Vec128 sum1;
#if HWY_ARCH_ARM_A64
  sum1 = Vec128(vmull_high_s16(a.raw, b.raw));
#else
  const Full64 dh;
  sum1 = Vec128(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  Vec128 sum0 = Vec128(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw));
  return RearrangeToOddPlusEven(sum0, sum1);
}

template HWY_API Vec64 WidenMulPairwiseAdd(D d32, Vec64 a, Vec64 b) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128 mul_3210(vmull_s16(a.raw, b.raw));
  const Vec64 mul0 = LowerHalf(mul_3210);
  const Vec64 mul1 = UpperHalf(d32, mul_3210);
  return RearrangeToOddPlusEven(mul0, mul1);
}

template HWY_API Vec32 WidenMulPairwiseAdd(D d32, Vec32 a, Vec32 b) {
  const Vec128 mul_xx10(vmull_s16(a.raw, b.raw));
  const Vec64 mul_10(LowerHalf(mul_xx10));
  const Vec32 mul0 = LowerHalf(d32, mul_10);
  const Vec32 mul1 = UpperHalf(d32, mul_10);
  return RearrangeToOddPlusEven(mul0, mul1);
}

// Unsigned counterparts (vmull_u16 / vmull_high_u16).
template HWY_API Vec128 WidenMulPairwiseAdd(D /*d32*/, Vec128 a, Vec128 b) {
  Vec128 sum1;
#if HWY_ARCH_ARM_A64
  sum1 = Vec128(vmull_high_u16(a.raw, b.raw));
#else
  const Full64 dh;
  sum1 = Vec128(vmull_u16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  Vec128 sum0 = Vec128(vmull_u16(LowerHalf(a).raw, LowerHalf(b).raw));
  return RearrangeToOddPlusEven(sum0, sum1);
}

template HWY_API Vec64 WidenMulPairwiseAdd(D d32, Vec64 a, Vec64 b) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128 mul_3210(vmull_u16(a.raw, b.raw));
  const Vec64 mul0 = LowerHalf(mul_3210);
  const Vec64 mul1 = UpperHalf(d32, mul_3210);
  return RearrangeToOddPlusEven(mul0, mul1);
}

template HWY_API Vec32 WidenMulPairwiseAdd(D d32, Vec32 a, Vec32 b) {
  const Vec128 mul_xx10(vmull_u16(a.raw, b.raw));
  const Vec64 mul_10(LowerHalf(mul_xx10));
  const Vec32 mul0 = LowerHalf(d32, mul_10);
  const Vec32 mul1 = UpperHalf(d32, mul_10);
  return RearrangeToOddPlusEven(mul0, mul1);
}

// ------------------------------ ZeroExtendVector (Combine)

// Combines `lo` with an all-zero upper half.
template HWY_API VFromD ZeroExtendVector(D d, VFromD> lo) {
  return Combine(d, Zero(Half()), lo);
}

// ------------------------------ ConcatLowerLower

// 64 or 128-bit input: just interleave
template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition, decltype(d)> du;
  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
}

namespace detail {
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveEven, vtrn1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveOdd, vtrn2, _, 2)
#else

// vtrn returns a struct with even and odd result.
#define HWY_NEON_BUILD_TPL_HWY_TRN
#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
// Pass raw args so we can accept uint16x2 args, for which there is no
// corresponding uint16x2x2 return type.
#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
  Raw128::type a, Raw128::type b
#define HWY_NEON_BUILD_ARG_HWY_TRN a, b

// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
// for full and half vectors.
HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)

#undef HWY_NEON_BUILD_TPL_HWY_TRN
#undef HWY_NEON_BUILD_RET_HWY_TRN
#undef HWY_NEON_BUILD_PARAM_HWY_TRN
#undef HWY_NEON_BUILD_ARG_HWY_TRN

#endif  // HWY_ARCH_ARM_A64
}  // namespace detail

// <= 32-bit input/output
template HWY_API VFromD ConcatLowerLower(D d, VFromD hi, VFromD lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
#else
  // val[0] of the vtrn result is the "even" output.
  using VU = VFromD;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[0]));
#endif
}

// ------------------------------ ConcatUpperUpper

// 64 or 128-bit input: just interleave
template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition, decltype(d)> du;
  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
}

// <= 32-bit input/output
template HWY_API VFromD ConcatUpperUpper(D d, VFromD hi, VFromD lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
#else
  // val[1] of the vtrn result is the "odd" output.
  using VU = VFromD;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[1]));
#endif
}

// ------------------------------ ConcatLowerUpper (ShiftLeftBytes)

// 64 or 128-bit input: extract from concatenated
template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) {
  return CombineShiftRightBytes(d, hi, lo);
}

// <= 32-bit input/output
template HWY_API VFromD ConcatLowerUpper(D d, VFromD hi, VFromD lo) {
  constexpr size_t kSize = d.MaxBytes();
  const Repartition d8;
  const Full64 d8x8;
  const Full64> d64;
  using V8x8 = VFromD;
  // Both inputs are widened to 8-byte vectors of bytes.
  const V8x8 hi8x8(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
  // Back to original lane type, then shrink N.
  return VFromD(BitCast(d64, r).raw);
}

// ------------------------------ ConcatUpperLower

// Works for all N.
// Selects `lo` for the first Lanes(d) / 2 lanes and `hi` for the rest.
template HWY_API VFromD ConcatUpperLower(D d, VFromD hi, VFromD lo) {
  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
}

// ------------------------------ ConcatOdd (InterleaveUpper)

namespace detail {
// There is no vuzpq_u64.
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)

#if !HWY_HAVE_FLOAT16
// Forward to the overloads for the unsigned type of the same lane size and
// cast the result back.
template HWY_INLINE Vec128 ConcatEven(Vec128 hi, Vec128 lo) {
  const DFromV d;
  const RebindToUnsigned du;
  return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
}
template HWY_INLINE Vec128 ConcatOdd(Vec128 hi, Vec128 lo) {
  const DFromV d;
  const RebindToUnsigned du;
  return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
}
#endif  // !HWY_HAVE_FLOAT16
}  // namespace detail

// Full/half vector
// Note the argument order: `lo` is passed first to the detail function.
template HWY_API VFromD ConcatOdd(D /* tag */, VFromD hi, VFromD lo) {
  return detail::ConcatOdd(lo, hi);
}

// 8-bit x4
template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatOdd(D d, Vec32 hi, Vec32 lo) {
  const Twice d2;
  const Repartition dw2;
  const VFromD hi2(hi.raw);
  const VFromD lo2(lo.raw);
  const VFromD Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
  // vcopy_lane_u16, but that's A64-only.
  return Vec32(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
}

// Any type x2
template > HWY_API Vec128 ConcatOdd(D d, Vec128 hi, Vec128 lo) {
  return InterleaveUpper(d, lo, hi);
}

// ------------------------------ ConcatEven (InterleaveLower)

// Full/half vector
// Note the argument order: `lo` is passed first to the detail function.
template HWY_API VFromD ConcatEven(D /* tag */, VFromD hi, VFromD lo) {
  return detail::ConcatEven(lo, hi);
}

// 8-bit x4
template , HWY_IF_T_SIZE(T, 1)> HWY_API Vec32 ConcatEven(D d, Vec32 hi, Vec32 lo) {
  const Twice d2;
  const Repartition dw2;
  const VFromD hi2(hi.raw);
  const VFromD lo2(lo.raw);
  const VFromD Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
  // vcopy_lane_u16, but that's A64-only.
  return Vec32(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
}

// Any type x2
template > HWY_API Vec128 ConcatEven(D d, Vec128 hi, Vec128 lo) {
  return InterleaveLower(d, lo, hi);
}

// ------------------------------ DupEven (InterleaveLower)

// Interleaves v with itself, keeping the "even" result (vtrn1 on A64,
// val[0] of vtrn otherwise).
template HWY_API Vec128 DupEven(Vec128 v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveEven(v, v);
#else
  return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
#endif
}

template HWY_API Vec128 DupEven(Vec128 v) {
  return InterleaveLower(DFromV(), v, v);
}

// ------------------------------ DupOdd (InterleaveUpper)

// Interleaves v with itself, keeping the "odd" result (vtrn2 on A64,
// val[1] of vtrn otherwise).
template HWY_API Vec128 DupOdd(Vec128 v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveOdd(v, v);
#else
  return Vec128(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
#endif
}

template HWY_API Vec128 DupOdd(Vec128 v) {
  return InterleaveUpper(DFromV(), v, v);
}

// ------------------------------ OddEven (IfThenElse)

// kBytes[i] is 0xFF when byte i lies in an even lane (i / sizeof(T) is even)
// and 0 otherwise. The resulting mask selects `b` in even lanes and `a` in
// odd lanes.
template HWY_API Vec128 OddEven(const Vec128 a, const Vec128 b) {
  const DFromV d;
  const Repartition d8;
  alignas(16) static constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
  };
  const auto vec = BitCast(d, Load(d8, kBytes));
  return IfThenElse(MaskFromVec(vec), b, a);
}

// ------------------------------ OddEvenBlocks
// Returns `even` unchanged; the `odd` argument is ignored.
template HWY_API Vec128 OddEvenBlocks(Vec128 /* odd */, Vec128 even) {
  return even;
}

// ------------------------------ SwapAdjacentBlocks
// Returns v unchanged.
template HWY_API Vec128 SwapAdjacentBlocks(Vec128 v) {
  return v;
}

// ------------------------------ ReverseBlocks
// Single block: no change
template HWY_API VFromD ReverseBlocks(D /* tag */, VFromD v) {
  return v;
}

// ------------------------------ ReorderDemote2To (OddEven)

// Keeps the odd 16-bit lanes of a (lower result half) and b (upper half).
template >> HWY_API VFromD ReorderDemote2To(D dbf16, V32 a, V32 b) {
  const RebindToUnsigned du16;
  return BitCast(dbf16, ConcatOdd(du16, BitCast(du16, b), BitCast(du16, a)));
}

// Pattern shared by the full-vector overloads below: `a` is narrowed into
// the lower half of the result and `b` into the upper half. A64 uses the
// *_high_* narrowing intrinsic for b; otherwise b is narrowed separately
// and joined with Combine(d, b_narrow, a_narrow).
template HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, Vec128 b) {
  const Vec64 a32(vqmovn_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128(vqmovn_high_s64(a32.raw, b.raw));
#else
  const Vec64 b32(vqmovn_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

// Partial vectors: join b (upper) and a (lower) first, then demote once.
template HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, Vec128 b) {
  const Vec64 a32(vqmovun_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128(vqmovun_high_s64(a32.raw, b.raw));
#else
  const Vec64 b32(vqmovun_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

template HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template HWY_API Vec128 ReorderDemote2To(D d32, Vec128 a, Vec128 b) {
  const Vec64 a32(vqmovn_u64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  return Vec128(vqmovn_high_u64(a32.raw, b.raw));
#else
  const Vec64 b32(vqmovn_u64(b.raw));
  return Combine(d32, b32, a32);
#endif
}

template HWY_API VFromD ReorderDemote2To(D d32, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d32, Combine(dt, b, a));
}

template HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, Vec128 b) {
  const Vec64 a16(vqmovn_s32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128(vqmovn_high_s32(a16.raw, b.raw));
#else
  const Vec64 b16(vqmovn_s32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, Vec64 b) {
  const Full128 d32;
  const Vec128 ab = Combine(d32, b, a);
  return Vec64(vqmovn_s32(ab.raw));
}

template HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, Vec32 b) {
  const Full128 d32;
  // Zip a and b into one 64-bit vector; both halves of the 128-bit input to
  // the narrowing intrinsic are then filled with `ab`.
  const Vec64 ab(vzip1_s32(a.raw, b.raw));
  return Vec32(vqmovn_s32(Combine(d32, ab, ab).raw));
}

template HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, Vec128 b) {
  const Vec64 a16(vqmovun_s32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128(vqmovun_high_s32(a16.raw, b.raw));
#else
  const Vec64 b16(vqmovun_s32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, Vec64 b) {
  const Full128 d32;
  const Vec128 ab = Combine(d32, b, a);
  return Vec64(vqmovun_s32(ab.raw));
}

template HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, Vec32 b) {
  const Full128 d32;
  const Vec64 ab(vzip1_s32(a.raw, b.raw));
  return Vec32(vqmovun_s32(Combine(d32, ab, ab).raw));
}

template HWY_API Vec128 ReorderDemote2To(D d16, Vec128 a, Vec128 b) {
  const Vec64 a16(vqmovn_u32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  return Vec128(vqmovn_high_u32(a16.raw, b.raw));
#else
  const Vec64 b16(vqmovn_u32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

template HWY_API Vec64 ReorderDemote2To(D /*d16*/, Vec64 a, Vec64 b) {
  const Full128 d32;
  const Vec128 ab = Combine(d32, b, a);
  return Vec64(vqmovn_u32(ab.raw));
}

template HWY_API Vec32 ReorderDemote2To(D /*d16*/, Vec32 a, Vec32 b) {
  const Full128 d32;
  const Vec64 ab(vzip1_u32(a.raw, b.raw));
  return Vec32(vqmovn_u32(Combine(d32, ab, ab).raw));
}

template HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, Vec128 b) {
  const Vec64 a8(vqmovn_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128(vqmovn_high_s16(a8.raw, b.raw));
#else
  const Vec64 b8(vqmovn_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

template HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, Vec128 b) {
  const Vec64 a8(vqmovun_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128(vqmovun_high_s16(a8.raw, b.raw));
#else
  const Vec64 b8(vqmovun_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

template HWY_API Vec128 ReorderDemote2To(D d8, Vec128 a, Vec128 b) {
  const Vec64 a8(vqmovn_u16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128(vqmovn_high_u16(a8.raw, b.raw));
#else
  const Vec64 b8(vqmovn_u16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

template HWY_API VFromD ReorderDemote2To(D d8, VFromD> a, VFromD> b) {
  const Rebind dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

// OrderedDemote2To simply forwards to ReorderDemote2To in both overloads.
template ), HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV) * 2)> HWY_API VFromD OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}

template >> HWY_API VFromD OrderedDemote2To(D dbf16, V32 a, V32 b) {
  return ReorderDemote2To(dbf16, a, b);
}

// ================================================== CRYPTO

// (aarch64 or Arm7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
#if HWY_TARGET == HWY_NEON

// Toggle HWY_NATIVE_AES: undefine it if already defined, define it otherwise.
#ifdef HWY_NATIVE_AES
#undef HWY_NATIVE_AES
#else
#define HWY_NATIVE_AES
#endif

HWY_API Vec128 AESRound(Vec128 state, Vec128 round_key) {
  // NOTE: it is important that AESE and AESMC be consecutive instructions so
  // they can be fused. AESE includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

// Like AESRound but without the vaesmcq_u8 step.
HWY_API Vec128 AESLastRound(Vec128 state, Vec128 round_key) {
  return Vec128(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

// Thin wrapper over vaesimcq_u8.
HWY_API Vec128 AESInvMixColumns(Vec128 state) {
  return Vec128{vaesimcq_u8(state.raw)};
}

HWY_API Vec128 AESRoundInv(Vec128 state, Vec128 round_key) {
  // NOTE: it is important that AESD and AESIMC be consecutive instructions so
  // they can be fused. AESD includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

// Like AESRoundInv but without the vaesimcq_u8 step.
HWY_API Vec128 AESLastRoundInv(Vec128 state, Vec128 round_key) {
  return Vec128(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

// Polynomial (vmull_p64) multiply of lane 0 of a and b.
HWY_API Vec128 CLMulLower(Vec128 a, Vec128 b) {
  return Vec128((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
}

// Polynomial (vmull_high_p64) multiply of the upper lanes of a and b.
HWY_API Vec128 CLMulUpper(Vec128 a, Vec128 b) {
  return Vec128(
      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
}

#endif  // HWY_TARGET == HWY_NEON

// ================================================== MISC

// Reinterprets v as u16, promotes to 32-bit lanes, shifts left by 16 and
// reinterprets the result as df32.
template HWY_API VFromD PromoteTo(D df32, VFromD> v) {
  const Rebind du16;
  const RebindToSigned di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}

// ------------------------------ Truncations

// Single-lane case: reinterpret as the narrower lane type and keep the raw
// register.
template , typename TFrom, HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo), hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr> HWY_API Vec128 TruncateTo(DTo /* tag */, Vec128 v) {
  const Repartition> d;
  return Vec128{BitCast(d, v).raw};
}

// The overloads below share one scheme: reinterpret v as the narrow lane
// type, then apply detail::ConcatEven(x, x) once per halving of the lane
// width, and finally take LowerHalf once per ConcatEven step.
template HWY_API Vec16 TruncateTo(D /* tag */, Vec128 v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  const auto v4 = detail::ConcatEven(v3, v3);
  return LowerHalf(LowerHalf(LowerHalf(v4)));
}

template HWY_API Vec32 TruncateTo(D /* tag */, Vec128 v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template HWY_API Vec64 TruncateTo(D /* tag */, Vec128 v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template HWY_API VFromD TruncateTo(D /* tag */, VFromD> v) {
  const Repartition> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

// ------------------------------ MulEven (ConcatEven)

// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
//
// Full-vector overloads: ConcatEven(d, x, x) packs the even lanes into the
// lower half, whose widening product (vmull_*) fills the whole result.
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int8x16_t a_packed = ConcatEven(d, a, a).raw;
  int8x16_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
}
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint8x16_t a_packed = ConcatEven(d, a, a).raw;
  uint8x16_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
}
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int16x8_t a_packed = ConcatEven(d, a, a).raw;
  int16x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
}
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint16x8_t a_packed = ConcatEven(d, a, a).raw;
  uint16x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
}
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int32x4_t a_packed = ConcatEven(d, a, a).raw;
  int32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

// Partial-vector overloads: the packed inputs are already 64-bit registers,
// so the 128-bit product is truncated to its lower half with vget_low_*.
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int8x8_t a_packed = ConcatEven(d, a, a).raw;
  int8x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_s16(vmull_s8(a_packed, b_packed)));
}
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint8x8_t a_packed = ConcatEven(d, a, a).raw;
  uint8x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_u16(vmull_u8(a_packed, b_packed)));
}
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int16x4_t a_packed = ConcatEven(d, a, a).raw;
  int16x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_s32(vmull_s16(a_packed, b_packed)));
}
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint16x4_t a_packed = ConcatEven(d, a, a).raw;
  uint16x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_u32(vmull_u16(a_packed, b_packed)));
}
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  int32x2_t a_packed = ConcatEven(d, a, a).raw;
  int32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template HWY_API Vec128 MulEven(Vec128 a, Vec128 b) {
  const DFromV d;
  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

// 64-bit lanes: scalar Mul128 of lane 0 of a and b. The value returned by
// Mul128 (`lo`) goes to lane 0, the value written through `&hi` to lane 1.
HWY_INLINE Vec128 MulEven(Vec128 a, Vec128 b) {
  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 0), vgetq_lane_u64(b.raw, 0), &hi);
  return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
}

// Multiplies odd lanes (1, 3 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Full-vector overloads: ConcatOdd(d, x, x) packs the odd lanes into the
// lower half, whose widening product (vmull_*) fills the whole result.
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int8x16_t a_packed = ConcatOdd(d, a, a).raw;
  int8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
}
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint8x16_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
}
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int16x8_t a_packed = ConcatOdd(d, a, a).raw;
  int16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
}
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint16x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
}
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int32x4_t a_packed = ConcatOdd(d, a, a).raw;
  int32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint32x4_t a_packed = ConcatOdd(d, a, a).raw;
  uint32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

// Partial-vector overloads: the packed inputs are already 64-bit registers,
// so the 128-bit product is truncated to its lower half with vget_low_*.
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int8x8_t a_packed = ConcatOdd(d, a, a).raw;
  int8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_s16(vmull_s8(a_packed, b_packed)));
}
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint8x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_u16(vmull_u8(a_packed, b_packed)));
}
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int16x4_t a_packed = ConcatOdd(d, a, a).raw;
  int16x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_s32(vmull_s16(a_packed, b_packed)));
}
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint16x4_t a_packed = ConcatOdd(d, a, a).raw;
  uint16x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_u32(vmull_u16(a_packed, b_packed)));
}
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  int32x2_t a_packed = ConcatOdd(d, a, a).raw;
  int32x2_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template HWY_API Vec128 MulOdd(Vec128 a, Vec128 b) {
  const DFromV d;
  uint32x2_t a_packed = ConcatOdd(d, a, a).raw;
  uint32x2_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

// 64-bit lanes: scalar Mul128 of lane 1 of a and b. The value returned by
// Mul128 (`lo`) goes to lane 0, the value written through `&hi` to lane 1.
HWY_INLINE Vec128 MulOdd(Vec128 a, Vec128 b) {
  uint64_t hi;
  uint64_t lo = Mul128(vgetq_lane_u64(a.raw, 1), vgetq_lane_u64(b.raw, 1), &hi);
  return Vec128(vsetq_lane_u64(hi, vdupq_n_u64(lo), 1));
}

// ------------------------------ TableLookupBytes (Combine, LowerHalf)

// Both full
template HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) {
  const DFromV d;
  const Repartition d8;
#if HWY_ARCH_ARM_A64
  return BitCast(d, Vec128(vqtbl1q_u8(BitCast(d8, bytes).raw,
                                       BitCast(d8, from).raw)));
#else
  // Split the 16-byte table into a two-register table for vtbl2_u8, look up
  // the lower and upper 8 indices separately, and recombine.
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  uint8x8x2_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
  return BitCast(d, Vec128(vcombine_u8(low, hi)));
#endif
}

// Partial index vector
// The indices are duplicated into both halves of a full vector, looked up
// with the full-vector overload, and the lower half of the result is kept.
template HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) {
  const Full128 d_full;
  const Vec64 from64(from.raw);
  const auto idx_full = Combine(d_full, from64, from64);
  const auto out_full = TableLookupBytes(bytes, idx_full);
  return Vec128(LowerHalf(Half(), out_full).raw);
}

// Partial table vector
// The table is duplicated into both halves of a full vector.
template HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) {
  const Full128 d_full;
  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
}

// Partial both
// Single-register lookup (vtbl1_u8) on byte views of both inputs.
template HWY_API Vec128 TableLookupBytes(Vec128 bytes, Vec128 from) {
  const DFromV d;
  const Simd d_idx;
  const Repartition d_idx8;
  // uint8x8
  const auto bytes8 = BitCast(Repartition(), bytes);
  const auto from8 = BitCast(d_idx8, from);
  const VFromD v8(vtbl1_u8(bytes8.raw, from8.raw));
  return BitCast(d_idx, v8);
}

// For all vector widths; Arm anyway zeroes if >= 0x10.
template HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
  return TableLookupBytes(bytes, from);
}

// ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)

#if HWY_TARGET == HWY_NEON
// Duplicates the odd 32-bit lanes of v, applies AESLastRound with
// kRconXorMask as the round key (kRcon sits at byte offsets 1 and 9), then
// permutes the bytes with kRotWordShuffle.
template HWY_API Vec128 AESKeyGenAssist(Vec128 v) {
  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
  alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
  const DFromV d;
  const Repartition du32;
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
  const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
  return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
}
#endif  // HWY_TARGET == HWY_NEON

// ------------------------------ Scatter in generic_ops-inl.h
// ------------------------------ Gather in generic_ops-inl.h

// ------------------------------ Reductions

// On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
#if HWY_ARCH_ARM_A64

// Toggle HWY_NATIVE_REDUCE_SCALAR: undefine it if already defined, define it
// otherwise.
#ifdef HWY_NATIVE_REDUCE_SCALAR
#undef HWY_NATIVE_REDUCE_SCALAR
#else
#define HWY_NATIVE_REDUCE_SCALAR
#endif

// TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
// Defines `name(D, Vec128)` returning a scalar `type##_t` by forwarding v.raw
// to the intrinsic named prefix##infix##suffix.
#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
  template \
  HWY_API type##_t name(D /* tag */, Vec128 v) { \
    return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \
  }

// Excludes u64/s64 (missing minv/maxv) and f16 (missing addv).
#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \ HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \ HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \ HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \ HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \ HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \ HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \ HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \ HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \ HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \ HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \ HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \ HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \ HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \ HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \ HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64) // Different interface than HWY_NEON_DEF_FUNCTION_FULL_UI_64. #define HWY_NEON_DEF_REDUCTION_UI64(name, prefix) \ HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \ HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64) #if HWY_HAVE_FLOAT16 #define HWY_NEON_DEF_REDUCTION_F16(name, prefix) \ HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \ HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16) #else #define HWY_NEON_DEF_REDUCTION_F16(name, prefix) #endif HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv) HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv) HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv) HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv) HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv) HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv) // Emulate missing UI64 and partial N=2. 
// Two-lane fallbacks: combine the value returned by GetLane with lane 1.
template
HWY_API TFromD ReduceSum(D /* tag */, VFromD v10) {
  return GetLane(v10) + ExtractLane(v10, 1);
}
template
HWY_API TFromD ReduceMin(D /* tag */, VFromD v10) {
  return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
}
template
HWY_API TFromD ReduceMax(D /* tag */, VFromD v10) {
  return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
}

#if HWY_HAVE_FLOAT16
// float16 Min/Max: take Min/Max of the vector and its Reverse2, then GetLane.
template
HWY_API float16_t ReduceMin(D d, VFromD v10) {
  return GetLane(Min(v10, Reverse2(d, v10)));
}
template
HWY_API float16_t ReduceMax(D d, VFromD v10) {
  return GetLane(Max(v10, Reverse2(d, v10)));
}

// float16 sum of a float16x4_t: two pairwise-add steps, then GetLane.
template
HWY_API float16_t ReduceSum(D /* tag */, VFromD v) {
  const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
  return GetLane(VFromD(vpadd_f16(x2, x2)));
}
// float16 sum using the q-form pairwise add: one vpaddq_f16 step, then the
// lower half is handed to the ReduceSum overload for the half-width tag.
template
HWY_API float16_t ReduceSum(D d, VFromD v) {
  const Half dh;
  return ReduceSum(dh, LowerHalf(dh, VFromD(vpaddq_f16(v.raw, v.raw))));
}
#endif  // HWY_HAVE_FLOAT16

#undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
#undef HWY_NEON_DEF_REDUCTION_F16
#undef HWY_NEON_DEF_REDUCTION_UI64
#undef HWY_NEON_DEF_REDUCTION

// ------------------------------ SumOfLanes

// Each *OfLanes broadcasts the scalar result of the matching Reduce* via Set.
template
HWY_API VFromD SumOfLanes(D d, VFromD v) {
  return Set(d, ReduceSum(d, v));
}
template
HWY_API VFromD MinOfLanes(D d, VFromD v) {
  return Set(d, ReduceMin(d, v));
}
template
HWY_API VFromD MaxOfLanes(D d, VFromD v) {
  return Set(d, ReduceMax(d, v));
}

// On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
#else  // !HWY_ARCH_ARM_A64

// Armv7 lacks N=2 and 8-bit x4, so enable generic versions of those.
// Both macros are redefined to the same condition: exactly two lanes, or four
// lanes of one-byte elements.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D) \
  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
                (sizeof(TFromD) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
      nullptr
#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D) \
  hwy::EnableIf<(HWY_MAX_LANES_D(D) == 2) || \
                (sizeof(TFromD) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
      nullptr

// For arm7, we implement reductions using a series of pairwise operations. This
// produces the full vector result, so we express Reduce* in terms of *OfLanes.

// Spells the raw NEON vector type, e.g. (uint32, 2) -> uint32x2_t.
#define HWY_NEON_BUILD_TYPE_T(type, size) type##x##size##_t

// Defines name##OfLanes for a vector of `size` lanes whose raw type is
// type##x##size##_t. The first pairwise step is unconditional; the two `if`s
// test the compile-time `size`, giving one step for size=2, two for size=4 and
// three for size=8.
#define HWY_NEON_DEF_PAIRWISE_REDUCTION(type, size, name, prefix, suffix) \
  template \
  HWY_API Vec128 name##OfLanes(D /* d */, \
                               Vec128 v) { \
    HWY_NEON_BUILD_TYPE_T(type, size) tmp = prefix##_##suffix(v.raw, v.raw); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    return Vec128(tmp); \
  }

// For the wide versions, the pairwise operations produce a half-length vector.
// We produce that `tmp` and then Combine.
// The first step pairs up the high and low halves of `v`; up to three further
// steps (selected by `size`) finish the reduction, and vcombine duplicates the
// `half`-lane result into both halves of the returned vector.
#define HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(type, size, half, name, prefix, \
                                             suffix) \
  template \
  HWY_API Vec128 name##OfLanes(D /* d */, \
                               Vec128 v) { \
    HWY_NEON_BUILD_TYPE_T(type, half) tmp; \
    tmp = prefix##_##suffix(vget_high_##suffix(v.raw), \
                            vget_low_##suffix(v.raw)); \
    if ((size / 2) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 4) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    if ((size / 8) > 1) tmp = prefix##_##suffix(tmp, tmp); \
    return Vec128(vcombine_##suffix(tmp, tmp)); \
  }

// Instantiates both macros above for 8/16/32-bit integers and float32:
// first the raw types listed with 2/4/8 lanes, then the wide ones with
// 4/8/16 lanes.
#define HWY_NEON_DEF_PAIRWISE_REDUCTIONS(name, prefix) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint32, 2, name, prefix, u32) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint16, 4, name, prefix, u16) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(uint8, 8, name, prefix, u8) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int32, 2, name, prefix, s32) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int16, 4, name, prefix, s16) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(int8, 8, name, prefix, s8) \
  HWY_NEON_DEF_PAIRWISE_REDUCTION(float32, 2, name, prefix, f32) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint32, 4, 2, name, prefix, u32) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint16, 8, 4, name, prefix, u16) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(uint8, 16, 8, name, prefix, u8) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int32, 4, 2, name, prefix, s32) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int16, 8, 4, name, prefix, s16) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(int8, 16, 8, name, prefix, s8) \
  HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION(float32, 4, 2, name, prefix, f32)

// Defines SumOfLanes, MinOfLanes and MaxOfLanes from the pairwise intrinsics.
HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Sum, vpadd)
HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Min, vpmin)
HWY_NEON_DEF_PAIRWISE_REDUCTIONS(Max, vpmax)

#undef HWY_NEON_DEF_PAIRWISE_REDUCTIONS
#undef HWY_NEON_DEF_WIDE_PAIRWISE_REDUCTION
#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
#undef HWY_NEON_BUILD_TYPE_T

// GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
// N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
// The #ifdef/#else below toggles HWY_NATIVE_REDUCE_SUM_4_UI8: undefined if it
// was defined, defined otherwise.
#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
#undef HWY_NATIVE_REDUCE_SUM_4_UI8
#else
#define HWY_NATIVE_REDUCE_SUM_4_UI8
#endif

template
HWY_API TFromD ReduceSum(D /*d*/, VFromD v) {
  return static_cast>(GetLane(SumsOf4(v)));
}

#endif  // HWY_ARCH_ARM_A64

// ------------------------------ LoadMaskBits (TestBit)

namespace detail {

// Helper function to set 64 bits and potentially return a smaller vector. The
// overload is required to call the q vs non-q intrinsics. Note that 8-bit
// LoadMaskBits only requires 16 bits, but 64 avoids casting.
template
HWY_INLINE VFromD Set64(D /* tag */, uint64_t mask_bits) {
  const auto v64 = Vec64(vdup_n_u64(mask_bits));
  return VFromD(BitCast(Full64>(), v64).raw);
}
// Full128 overload: uses the q-form duplicate (vdupq_n_u64).
template
HWY_INLINE Vec128 Set64(Full128 d, uint64_t mask_bits) {
  return BitCast(d, Vec128(vdupq_n_u64(mask_bits)));
}

// Overload using uint8_t tables: lane i is governed by bit (i % 8) of byte
// (i / 8) of `mask_bits`.
template
HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const auto vmask_bits = Set64(du, mask_bits);

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));

  // Per-lane bit to test; the pattern repeats for the second group of 8.
  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}

// Overload using a uint16_t table: broadcast the mask bits to every lane and
// test bit i in lane i.
template
HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned du;
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

// Overload using a uint32_t table; same scheme as the uint16_t overload.
template
HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned du;
  // NOTE(review): declared with 8 elements but only 4 are listed (the rest
  // are zero-initialized) - presumably carried over from the uint16_t
  // overload; confirm before shrinking.
  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
  const auto vmask_bits = Set(du, static_cast(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}

// Overload using a uint64_t table; same scheme as the uint16_t overload.
template
HWY_INLINE MFromD LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned du;
  // NOTE(review): declared with 8 elements but only 2 are listed (the rest
  // are zero-initialized) - presumably carried over from the uint16_t
  // overload; confirm before shrinking.
  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
}

}  // namespace detail

// `bits` points to at least 8 readable bytes, not all of which need be valid.
// Copies (d.MaxLanes() + 7) / 8 bytes from `bits` into a zero-initialized
// uint64_t and forwards to detail::LoadMaskBits.
template
HWY_API MFromD LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Dup128MaskFromMaskBits

// Builds a mask from an integer. When the tag has fewer than 8 lanes, bits at
// positions >= kN are cleared before forwarding to detail::LoadMaskBits.
template
HWY_API MFromD Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
  constexpr size_t kN = MaxLanes(d);
  if (kN < 8) mask_bits &= (1u << kN) - 1;
  return detail::LoadMaskBits(d, mask_bits);
}

// ------------------------------ Mask

namespace detail {

// Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
// BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
// Overload that operates on a 128-bit vector: views the mask as uint16x8 and
// applies a shift-right-by-4-and-narrow (vshrn_n_u16), which leaves one
// nibble per input byte packed into a 64-bit result.
template
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) {
  const Full128 du16;
  const Vec128 vu16 = BitCast(du16, VecFromMask(d, mask));
  const Vec64 nib(vshrn_n_u16(vu16.raw, 4));
  return GetLane(BitCast(Full64(), nib));
}

// Overload that widens first and then reuses the overload above.
template
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) {
  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
  const Twice d2;
  const VFromD v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
  // No need to mask, upper half is zero thanks to ZeroExtendVector.
  return NibblesFromMask(d2, MaskFromVec(v128));
}

// Overload for smaller vectors: computes the nibbles via a Full64 tag, then
// keeps only the low d.MaxBytes() * 4 bits.
template
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD mask) {
  const Mask64> mask64(mask.raw);
  const uint64_t nib = NibblesFromMask(Full64>(), mask64);
  // Clear nibbles from upper half of 64-bits
  return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
}

// BitsFromMask(SizeTag, mask) overloads: AND the mask vector with a per-lane
// bit weight (kSliceLanes) and add the lanes horizontally so that lane i
// contributes bit i of the result.

// SizeTag<1>, 16-entry table. The weights repeat after 8 lanes, so the two
// groups of 8 lanes are summed separately and form result bits 0-7 and 8-15.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) {
  alignas(16) static constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  };
  const Full128 du;
  const Vec128 values =
      BitCast(du, VecFromMask(Full128(), mask)) & Load(du, kSliceLanes);

#if HWY_ARCH_ARM_A64
  // Can't vaddv - we need two separate bytes (16 bits).
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF;
#else
  // Don't have vpaddq, so keep doubling lane size.
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  // Lane 1 holds the sum for the upper 8 input lanes, i.e. bits 8-15.
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
#endif
}

// SizeTag<1>, 8-entry table loaded via a Full64 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<1> /*tag*/, Mask128 mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                        0x10, 0x20, 0x40, 0x80};
  const DFromM d;
  const RebindToUnsigned du;
  const Vec128 slice(Load(Full64(), kSliceLanes).raw);
  const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice;

#if HWY_ARCH_ARM_A64
  return vaddv_u8(values.raw);
#else
  // No across-vector add here: widen pairwise until one 64-bit lane remains.
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return vget_lane_u64(x8, 0);
#endif
}

// SizeTag<2>, 8-entry uint16_t table loaded via a Full128 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) {
  alignas(16) static constexpr uint16_t kSliceLanes[8] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
  const Full128 d;
  const Full128 du;
  const Vec128 values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u16(values.raw);
#else
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
  const uint64x2_t x4 = vpaddlq_u32(x2);
  return vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1);
#endif
}

// SizeTag<2>, 4-entry uint16_t table loaded via a Full64 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<2> /*tag*/, Mask128 mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
  const DFromM d;
  const RebindToUnsigned du;
  const Vec128 slice(Load(Full64(), kSliceLanes).raw);
  const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return vaddv_u16(values.raw);
#else
  const uint32x2_t x2 = vpaddl_u16(values.raw);
  const uint64x1_t x4 = vpaddl_u32(x2);
  return vget_lane_u64(x4, 0);
#endif
}

// SizeTag<4>, 4-entry uint32_t table loaded via a Full128 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) {
  alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
  const Full128 d;
  const Full128 du;
  const Vec128 values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u32(values.raw);
#else
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
  return vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1);
#endif
}

// SizeTag<4>, 2-entry uint32_t table loaded via a Full64 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<4> /*tag*/, Mask128 mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
  const DFromM d;
  const RebindToUnsigned du;
  const Vec128 slice(Load(Full64(), kSliceLanes).raw);
  const Vec128 values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return vaddv_u32(values.raw);
#else
  const uint64x1_t x2 = vpaddl_u32(values.raw);
  return vget_lane_u64(x2, 0);
#endif
}

// SizeTag<8>, 2-entry uint64_t table loaded via a Full128 tag.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) {
  alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
  const Full128 d;
  const Full128 du;
  const Vec128 values =
      BitCast(du, VecFromMask(d, m)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return vaddvq_u64(values.raw);
#else
  return vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1);
#endif
}

// SizeTag<8>, Full64 tags: the mask ANDed with 1 is the result; no table or
// horizontal add is used.
template
HWY_INLINE uint64_t BitsFromMask(hwy::SizeTag<8> /*tag*/, Mask128 m) {
  const Full64 d;
  const Full64 du;
  const Vec64 values = BitCast(du, VecFromMask(d, m)) & Set(du, 1);
  return vget_lane_u64(values.raw, 0);
}

// Returns the lowest N bits of the BitsFromMask result. `bits` is returned
// unchanged when N * sizeof(T) >= 8.
template
constexpr uint64_t OnlyActive(uint64_t bits) {
  return ((N * sizeof(T)) >= 8) ? bits : (bits & ((1ull << N) - 1));
}

// Entry point: dispatches to a SizeTag overload above and passes the result
// through OnlyActive.
template
HWY_INLINE uint64_t BitsFromMask(Mask128 mask) {
  return OnlyActive(BitsFromMask(hwy::SizeTag(), mask));
}

// Returns number of lanes whose mask is set.
//
// Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
// ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
// changes each lane to 1 (if mask set) or 0.
// NOTE: PopCount also operates on vectors, so we still have to do horizontal
// sums separately. We specialize CountTrue for full vectors (negating instead
// of PopCount because it avoids an extra shift), and use PopCount of
// NibblesFromMask for partial vectors.
// SizeTag<1>: negate the int8 view of the mask, then add all 16 lanes.
template
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128 mask) {
  const Full128 di;
  const int8x16_t ones =
      vnegq_s8(BitCast(di, VecFromMask(Full128(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast(vaddvq_s8(ones));
#else
  // Widen pairwise until two 64-bit lanes remain, then add them.
  const int16x8_t x2 = vpaddlq_s8(ones);
  const int32x4_t x4 = vpaddlq_s16(x2);
  const int64x2_t x8 = vpaddlq_s32(x4);
  return static_cast(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
#endif
}

// SizeTag<2>: same approach on the int16 view.
template
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128 mask) {
  const Full128 di;
  const int16x8_t ones =
      vnegq_s16(BitCast(di, VecFromMask(Full128(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast(vaddvq_s16(ones));
#else
  const int32x4_t x2 = vpaddlq_s16(ones);
  const int64x2_t x4 = vpaddlq_s32(x2);
  return static_cast(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
#endif
}

// SizeTag<4>: same approach on the int32 view.
template
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128 mask) {
  const Full128 di;
  const int32x4_t ones =
      vnegq_s32(BitCast(di, VecFromMask(Full128(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast(vaddvq_s32(ones));
#else
  const int64x2_t x2 = vpaddlq_s32(ones);
  return static_cast(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
#endif
}

// SizeTag<8>: the A64 path negates and adds as above; the other path instead
// shifts each unsigned 64-bit lane right by 63 and adds the two lanes.
template
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128 mask) {
#if HWY_ARCH_ARM_A64
  const Full128 di;
  const int64x2_t ones =
      vnegq_s64(BitCast(di, VecFromMask(Full128(), mask)).raw);
  return static_cast(vaddvq_s64(ones));
#else
  const Full128 du;
  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
  return static_cast(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
#endif
}

}  // namespace detail

// Full
template >
HWY_API size_t CountTrue(D /* tag */, Mask128 mask) {
  return detail::CountTrue(hwy::SizeTag(), mask);
}

// Partial
// Population count of the nibble mask divided by kDiv, the number of nibble
// bits (4 per byte) that one lane occupies.
template
HWY_API size_t CountTrue(D d, MFromD mask) {
  // NOTE(review): kDiv is `int` here but `size_t` in the Find* functions
  // below - confirm whether the difference is intentional.
  constexpr int kDiv = 4 * sizeof(TFromD);
  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
}

// Index of the first true lane. The helper name indicates it requires a
// nonzero argument, so the mask must have at least one lane set.
template
HWY_API size_t FindKnownFirstTrue(D d, MFromD mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  constexpr size_t kDiv = 4 * sizeof(TFromD);
  return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
}

// Index of the first true lane, or -1 if no lane is set.
template
HWY_API intptr_t FindFirstTrue(D d, MFromD mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  if (nib == 0) return -1;
  constexpr size_t kDiv = 4 * sizeof(TFromD);
  return static_cast(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
}

// Index of the last true lane. As with FindKnownFirstTrue, the helper name
// indicates the mask must have at least one lane set.
template
HWY_API size_t FindKnownLastTrue(D d, MFromD mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  constexpr size_t kDiv = 4 * sizeof(TFromD);
  // 63 - (zero bits above the highest set bit) is that bit's position.
  return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv;
}

// Index of the last true lane, or -1 if no lane is set.
template
HWY_API intptr_t FindLastTrue(D d, MFromD mask) {
  const uint64_t nib = detail::NibblesFromMask(d, mask);
  if (nib == 0) return -1;
  constexpr size_t kDiv = 4 * sizeof(TFromD);
  return static_cast((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv);
}

// `bits` points to at least 8 writable bytes.
// Returns kNumBytes, i.e. (d.MaxLanes() + 7) / 8.
template
HWY_API size_t StoreMaskBits(D d, MFromD mask, uint8_t* bits) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes(&mask_bits, bits);
  return kNumBytes;
}

// True if the nibble mask is zero, i.e. no lane is set.
template
HWY_API bool AllFalse(D d, MFromD m) {
  return detail::NibblesFromMask(d, m) == 0;
}

// Full
// All 64 bits of the nibble mask must be set.
template >
HWY_API bool AllTrue(D d, Mask128 m) {
  return detail::NibblesFromMask(d, m) == ~0ull;
}

// Partial
// Only the low d.MaxBytes() * 4 bits of the nibble mask are compared.
template
HWY_API bool AllTrue(D d, MFromD m) {
  return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1;
}

// ------------------------------ Compress

// `value` is nonzero for every lane type except one-byte lanes.
template
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};

namespace detail {

// Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
template
HWY_INLINE Vec128 Load8Bytes(D /*tag*/, const uint8_t* bytes) {
  return Vec128(vreinterpretq_u8_u64(
      vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
}

// Load 8 bytes and return half-reg with N <= 8 bytes.
template HWY_INLINE VFromD Load8Bytes(D d, const uint8_t* bytes) { return Load(d, bytes); } template HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<2> /*tag*/, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Repartition d8; const Simd du; // NEON does not provide an equivalent of AVX2 permutevar, so we need byte // indices for VTBL (one vector's worth for each of 256 combinations of // 8 mask bits). Loading them directly would require 4 KiB. We can instead // store lane indices and convert to byte indices (2*lane + 0..1), with the // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles // is likely more costly than the higher cache footprint from storing bytes. alignas(16) static constexpr uint8_t table[256 * 8] = { // PrintCompress16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 
2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 2, 10, 12, 0, 
4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 4, 10, 12, 6, 8, 14, // 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 2, 10, 14, 0, 4, 6, 8, 12, 
/**/ 0, 2, 10, 14, 4, 6, 8, 12, // 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 14, 2, 4, 6, 10, // 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 
12, 14, 4, 6, 8, // 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8); const Vec128 pairs = ZipLower(byte_idx, byte_idx); return BitCast(d, pairs + Set(du, 0x0100)); } template HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<2> /*tag*/, uint64_t mask_bits) { HWY_DASSERT(mask_bits < 256); const Simd d; const Repartition d8; const Simd du; // NEON does not provide an equivalent of AVX2 permutevar, so we need byte // indices for VTBL (one vector's worth for each of 256 combinations of // 8 mask bits). Loading them directly would require 4 KiB. We can instead // store lane indices and convert to byte indices (2*lane + 0..1), with the // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles // is likely more costly than the higher cache footprint from storing bytes. 
alignas(16) static constexpr uint8_t table[256 * 8] = { // PrintCompressNot16x8Tables 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 
8, 10, // 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 8, 14, 0, 2, 4, 10, 12, // 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 0, 
14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 0, 12, 2, 4, 6, 8, 
10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 6, 10, 0, 8, 12, 14, // 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 
2, 4, 6, 8, 10, 12, 14};
  const Vec128 byte_idx = Load8Bytes(d8, table + mask_bits * 8);
  const Vec128 pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}

// NOTE(review): in this copy of the file the contents of most `<...>` template
// parameter/argument lists are missing (e.g. `template HWY_INLINE`,
// `Vec128 v`, `hwy::SizeTag()`), so the code below does not compile as
// written. Tokens are left exactly as found - confirm against the upstream
// source before building.

// Returns the byte-index vector for compressing 4-byte lanes: the 16-byte row
// of `u8_indices` selected by `mask_bits` (must be < 16). Each row lists the
// byte offsets of the lanes whose bit is set in the row number (ascending lane
// order), followed by the byte offsets of the remaining lanes.
template
HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<4> /*tag*/, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompress32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,  //
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,  //
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,  //
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,  //
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,  //
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,  //
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,  //
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,  //
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,  //
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,  //
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd d;
  const Repartition d8;
  // Row stride is 16 bytes, one full index vector per mask_bits value.
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// Counterpart of IdxFromBits for 4-byte lanes with the selection inverted:
// each 16-byte row lists the byte offsets of the lanes whose bit is CLEAR in
// the row number first, followed by those whose bit is set. `mask_bits` must
// be < 16.
template
HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<4> /*tag*/, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);

  // There are only 4 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompressNot32x4Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3,
      0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11,
      4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11,
      0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11,
      12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15,
      0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15,
      8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  const Simd d;
  const Repartition d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

#if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64

// Byte-index vector for compressing 8-byte lanes: four 16-byte rows, one per
// `mask_bits` value (must be < 4). Only row 2 (bit 1 set, bit 0 clear) swaps
// the two 8-byte halves; the other rows are the identity.
template
HWY_INLINE Vec128 IdxFromBits(hwy::SizeTag<8> /*tag*/, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[64] = {
      // PrintCompress64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Simd d;
  const Repartition d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

// Inverted-selection counterpart for 8-byte lanes (`mask_bits` must be < 4):
// only row 1 (bit 0 set, bit 1 clear) swaps the two 8-byte halves.
template
HWY_INLINE Vec128 IdxFromNotBits(hwy::SizeTag<8> /*tag*/, uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 4);

  // There are only 2 lanes, so we can afford to load the index vector directly.
  alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
      // PrintCompressNot64x2Tables
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};

  const Simd d;
  const Repartition d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}

#endif

// Helper function called by both Compress and CompressStore - avoids a
// redundant BitsFromMask in the latter.
// Looks up the index vector for `mask_bits` via IdxFromBits and applies it to
// `v` with TableLookupBytes; both operands are bit-cast to the signed tag `di`
// for the lookup and the result is cast back to the vector's own tag.
template
HWY_INLINE Vec128 Compress(Vec128 v, uint64_t mask_bits) {
  const auto idx = detail::IdxFromBits(hwy::SizeTag(), mask_bits);
  using D = DFromV;
  const RebindToSigned di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

// Same as detail::Compress above, but uses the inverted-selection indices
// from IdxFromNotBits.
template
HWY_INLINE Vec128 CompressNot(Vec128 v, uint64_t mask_bits) {
  const auto idx = detail::IdxFromNotBits(hwy::SizeTag(), mask_bits);
  using D = DFromV;
  const RebindToSigned di;
  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
}

}  // namespace detail

// Single lane: no-op
template
HWY_API Vec128 Compress(Vec128 v, Mask128 /*m*/) {
  return v;
}

// Two lanes: conditional swap
template
HWY_API Vec128 Compress(Vec128 v, Mask128 mask) {
  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
  const DFromV d;
  const Vec128 m = VecFromMask(d, mask);
  const Vec128 maskL = DupEven(m);
  const Vec128 maskH = DupOdd(m);
  // swap = maskH & ~maskL, computed on the broadcast lane masks.
  const Vec128 swap = AndNot(maskL, maskH);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
// Converts the mask to bits once and defers to the table-based helper.
template
HWY_API Vec128 Compress(Vec128 v, Mask128 mask) {
  return detail::Compress(v, detail::BitsFromMask(mask));
}

// Single lane: no-op
template
HWY_API Vec128 CompressNot(Vec128 v, Mask128 /*m*/) {
  return v;
}

// Two lanes: conditional swap
template
HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) {
  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
  const DFromV d;
  const Vec128 m = VecFromMask(d, mask);
  const Vec128 maskL = DupEven(m);
  const Vec128 maskH = DupOdd(m);
  // swap = maskL & ~maskH (operands reversed relative to Compress).
  const Vec128 swap = AndNot(maskH, maskL);
  return IfVecThenElse(swap, Shuffle01(v), v);
}

// General case, 2 or 4 byte lanes
template
HWY_API Vec128 CompressNot(Vec128 v, Mask128 mask) {
  // For partial vectors, we cannot pull the Not() into the table because
  // BitsFromMask clears the upper bits.
  if (N < 16 / sizeof(T)) {
    return detail::Compress(v, detail::BitsFromMask(Not(mask)));
  }
  return detail::CompressNot(v, detail::BitsFromMask(mask));
}

// ------------------------------ CompressBlocksNot
// Returns `v` unchanged; the mask argument is ignored.
HWY_API Vec128 CompressBlocksNot(Vec128 v, Mask128 /* m */) {
  return v;
}

// ------------------------------ CompressBits

// Compresses `v` using mask bits read from the byte array `bits`: the bytes
// are copied into a zero-initialized uint64_t, and for N < 8 all bits at
// positions >= N are cleared before the table lookup.
template
HWY_INLINE Vec128 CompressBits(Vec128 v, const uint8_t* HWY_RESTRICT bits) {
  uint64_t mask_bits = 0;
  // NOTE(review): kNumBytes is not referenced in this copy; presumably it was
  // the (lost) template argument of CopyBytes - confirm.
  constexpr size_t kNumBytes = (N + 7) / 8;
  CopyBytes(bits, &mask_bits);
  if (N < 8) {
    mask_bits &= (1ull << N) - 1;
  }

  return detail::Compress(v, mask_bits);
}

// ------------------------------ CompressStore
// Stores the compressed vector to `unaligned` with StoreU and returns the
// number of set mask bits (PopCount of the mask bits).
template
HWY_API size_t CompressStore(VFromD v, MFromD mask, D d,
                             TFromD* HWY_RESTRICT unaligned) {
  const uint64_t mask_bits = detail::BitsFromMask(mask);
  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ CompressBlendedStore
// Like CompressStore, but writes through BlendedStore with a FirstN(count)
// mask, where count is the number of set mask bits; returns count.
template
HWY_API size_t CompressBlendedStore(VFromD v, MFromD m, D d,
                                    TFromD* HWY_RESTRICT unaligned) {
  const RebindToUnsigned du;  // so we can support fp16/bf16
  const uint64_t mask_bits = detail::BitsFromMask(m);
  const size_t count = PopCount(mask_bits);
  const MFromD store_mask = RebindMask(d, FirstN(du, count));
  const VFromD compressed = detail::Compress(BitCast(du, v), mask_bits);
  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
  return count;
}

// ------------------------------ CompressBitsStore

// Combination of CompressBits and CompressStore: reads the mask bits from
// `bits`, clears bits at positions >= d.MaxLanes() when there are fewer than
// 8 lanes, stores the compressed vector with StoreU and returns the number of
// set mask bits.
template
HWY_API size_t CompressBitsStore(VFromD v, const uint8_t* HWY_RESTRICT bits,
                                 D d, TFromD* HWY_RESTRICT unaligned) {
  uint64_t mask_bits = 0;
  // NOTE(review): as in CompressBits, kNumBytes is unreferenced in this copy;
  // presumably the lost template argument of CopyBytes - confirm.
  constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes(bits, &mask_bits);
  if (d.MaxLanes() < 8) {
    mask_bits &= (1ull << d.MaxLanes()) - 1;
  }

  StoreU(detail::Compress(v, mask_bits), d, unaligned);
  return PopCount(mask_bits);
}

// ------------------------------ LoadInterleaved2

// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
#else
#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
#endif

namespace detail {

// Pieces consumed by HWY_NEON_DEF_FUNCTION when `args` is HWY_LOAD_INT: no
// template prefix, and `from` is the only argument forwarded to the intrinsic.
#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from

// HWY_IF_LOAD_INT(D) constrains the public overloads that call the generated
// functions; HWY_NEON_DEF_FUNCTION_LOAD_INT selects the lane types for which
// they are generated.
#if HWY_ARCH_ARM_A64
#define HWY_IF_LOAD_INT(D) HWY_IF_V_SIZE_GT_D(D, 4)
#define HWY_NEON_DEF_FUNCTION_LOAD_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES
#else
// Exclude 64x2 and f64x1, which are only supported on aarch64
#define HWY_IF_LOAD_INT(D)                                                 \
  HWY_IF_V_SIZE_GT_D(D, 4),                                                \
      hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD) < 8)>* = \
          nullptr
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
#endif  // HWY_ARCH_ARM_A64

// Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
// void.
#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ decltype(Tuple2().raw) // Tuple tag arg allows overloading (cannot just overload on return type) #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ const NativeLaneType*from, Tuple2 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT) #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ decltype(Tuple3().raw) #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ const NativeLaneType*from, Tuple3 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT) #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \ decltype(Tuple4().raw) #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \ const NativeLaneType*from, Tuple4 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT) #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT #undef HWY_NEON_DEF_FUNCTION_LOAD_INT #undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT #undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT } // namespace detail template > HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned), detail::Tuple2()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); } // <= 32 bits: avoid loading more than N bytes by copying to buffer template > HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1) { // The smallest vector registers are 64-bits and we want space for two. 
alignas(16) T buf[2 * 8 / sizeof(T)] = {}; CopyBytes(unaligned, buf); auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf), detail::Tuple2()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1) { const Half dh; VFromD v00, v10, v01, v11; LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10); LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11); v0 = Combine(d, v01, v00); v1 = Combine(d, v11, v10); } #endif // HWY_ARCH_ARM_V7 // ------------------------------ LoadInterleaved3 template > HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned), detail::Tuple3()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); v2 = VFromD(raw.val[2]); } // <= 32 bits: avoid writing more than N bytes by copying to buffer template > HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2) { // The smallest vector registers are 64-bits and we want space for three. 
alignas(16) T buf[3 * 8 / sizeof(T)] = {}; CopyBytes(unaligned, buf); auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf), detail::Tuple3()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); v2 = VFromD(raw.val[2]); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void LoadInterleaved3(D d, const TFromD* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1, Vec128& v2) { const Half dh; VFromD v00, v10, v20, v01, v11, v21; LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20); LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21); v0 = Combine(d, v01, v00); v1 = Combine(d, v11, v10); v2 = Combine(d, v21, v20); } #endif // HWY_ARCH_ARM_V7 // ------------------------------ LoadInterleaved4 template > HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned), detail::Tuple4()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); v2 = VFromD(raw.val[2]); v3 = VFromD(raw.val[3]); } // <= 32 bits: avoid writing more than N bytes by copying to buffer template > HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, VFromD& v0, VFromD& v1, VFromD& v2, VFromD& v3) { alignas(16) T buf[4 * 8 / sizeof(T)] = {}; CopyBytes(unaligned, buf); auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf), detail::Tuple4()); v0 = VFromD(raw.val[0]); v1 = VFromD(raw.val[1]); v2 = VFromD(raw.val[2]); v3 = VFromD(raw.val[3]); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned, Vec128& v0, Vec128& v1, Vec128& v2, Vec128& v3) { const Half dh; VFromD v00, v10, v20, v30, v01, v11, v21, v31; LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20, v30); LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21, v31); v0 
= Combine(d, v01, v00); v1 = Combine(d, v11, v10); v2 = Combine(d, v21, v20); v3 = Combine(d, v31, v30); } #endif // HWY_ARCH_ARM_V7 #undef HWY_IF_LOAD_INT // ------------------------------ StoreInterleaved2 namespace detail { #define HWY_NEON_BUILD_TPL_HWY_STORE_INT #define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw #if HWY_ARCH_ARM_A64 #define HWY_IF_STORE_INT(D) HWY_IF_V_SIZE_GT_D(D, 4) #define HWY_NEON_DEF_FUNCTION_STORE_INT HWY_NEON_DEF_FUNCTION_ALL_TYPES #else // Exclude 64x2 and f64x1, which are only supported on aarch64 #define HWY_IF_STORE_INT(D) \ HWY_IF_V_SIZE_GT_D(D, 4), \ hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD) < 8)>* = \ nullptr #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \ HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) #endif // HWY_ARCH_ARM_A64 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ Tuple2 tup, NativeLaneType*to HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT) #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ Tuple3 tup, NativeLaneType*to HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT) #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ Tuple4 tup, NativeLaneType*to HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT) #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT #undef HWY_NEON_DEF_FUNCTION_STORE_INT #undef HWY_NEON_BUILD_TPL_HWY_STORE_INT #undef HWY_NEON_BUILD_RET_HWY_STORE_INT #undef HWY_NEON_BUILD_ARG_HWY_STORE_INT } // namespace detail template > HWY_API void StoreInterleaved2(VFromD v0, VFromD 
v1, D d, T* HWY_RESTRICT unaligned) { detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned)); } // <= 32 bits: avoid writing more than N bytes by copying to buffer template > HWY_API void StoreInterleaved2(VFromD v0, VFromD v1, D d, T* HWY_RESTRICT unaligned) { alignas(16) T buf[2 * 8 / sizeof(T)]; detail::Tuple2 tup = {{{v0.raw, v1.raw}}}; detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf)); CopyBytes(buf, unaligned); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void StoreInterleaved2(Vec128 v0, Vec128 v1, D d, T* HWY_RESTRICT unaligned) { const Half dh; StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, detail::NativeLanePointer(unaligned)); StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, detail::NativeLanePointer(unaligned + 2)); } #endif // HWY_ARCH_ARM_V7 // ------------------------------ StoreInterleaved3 template > HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, T* HWY_RESTRICT unaligned) { detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned)); } // <= 32 bits: avoid writing more than N bytes by copying to buffer template > HWY_API void StoreInterleaved3(VFromD v0, VFromD v1, VFromD v2, D d, T* HWY_RESTRICT unaligned) { alignas(16) T buf[3 * 8 / sizeof(T)]; detail::Tuple3 tup = {{{v0.raw, v1.raw, v2.raw}}}; detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf)); CopyBytes(buf, unaligned); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void StoreInterleaved3(Vec128 v0, Vec128 v1, Vec128 v2, D d, T* HWY_RESTRICT unaligned) { const Half dh; StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh, detail::NativeLanePointer(unaligned)); StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh, detail::NativeLanePointer(unaligned + 3)); } #endif // 
HWY_ARCH_ARM_V7 // ------------------------------ StoreInterleaved4 template > HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, T* HWY_RESTRICT unaligned) { detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned)); } // <= 32 bits: avoid writing more than N bytes by copying to buffer template > HWY_API void StoreInterleaved4(VFromD v0, VFromD v1, VFromD v2, VFromD v3, D d, T* HWY_RESTRICT unaligned) { alignas(16) T buf[4 * 8 / sizeof(T)]; detail::Tuple4 tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}}; detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf)); CopyBytes(buf, unaligned); } #if HWY_ARCH_ARM_V7 // 64x2: split into two 64x1 template , HWY_IF_T_SIZE(T, 8)> HWY_API void StoreInterleaved4(Vec128 v0, Vec128 v1, Vec128 v2, Vec128 v3, D d, T* HWY_RESTRICT unaligned) { const Half dh; StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), LowerHalf(dh, v3), dh, detail::NativeLanePointer(unaligned)); StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), UpperHalf(dh, v3), dh, detail::NativeLanePointer(unaligned + 4)); } #endif // HWY_ARCH_ARM_V7 #undef HWY_IF_STORE_INT // ------------------------------ Additional mask logical operations template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const FixedTag d; const auto vmask = VecFromMask(d, mask); return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Simd d; const auto vmask = VecFromMask(d, mask); const auto neg_vmask = ResizeBitCast(d, Neg(ResizeBitCast(Full64(), vmask))); return MaskFromVec(Or(vmask, neg_vmask)); } template HWY_API Mask128 SetAtOrAfterFirst(Mask128 mask) { const Full128 d; const Repartition di64; auto vmask = BitCast(di64, VecFromMask(d, mask)); vmask = Or(vmask, Neg(vmask)); // Copy the sign bit of 
the first int64_t lane to the second int64_t lane const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); return MaskFromVec(BitCast(d, Or(vmask, vmask2))); } template HWY_API Mask128 SetBeforeFirst(Mask128 mask) { return Not(SetAtOrAfterFirst(mask)); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { return mask; } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const FixedTag d; const RebindToSigned di; const auto vmask = BitCast(di, VecFromMask(d, mask)); const auto zero = Zero(di); const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); return MaskFromVec(BitCast(d, And(vmask, vmask2))); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Simd d; const RebindToSigned di; const auto vmask = ResizeBitCast(Full64(), VecFromMask(d, mask)); const auto only_first_vmask = BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); return MaskFromVec(only_first_vmask); } template HWY_API Mask128 SetOnlyFirst(Mask128 mask) { const Full128 d; const RebindToSigned di; const Repartition di64; const auto zero = Zero(di64); const auto vmask = BitCast(di64, VecFromMask(d, mask)); const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 /*mask*/) { const FixedTag d; const RebindToSigned di; using TI = MakeSigned; return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); } template HWY_API Mask128 SetAtOrBeforeFirst(Mask128 mask) { const Simd d; return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); } // ------------------------------ Lt128 template HWY_INLINE MFromD Lt128(D d, VFromD a, VFromD b) { static_assert(IsSame, uint64_t>(), "T must be u64"); // Truth table of Eq and Lt for Hi and Lo u64. 
// (removed lines with (=H && cH) or (=L && cL) - cannot both be true) // =H =L cH cL | out = cH | (=H & cL) // 0 0 0 0 | 0 // 0 0 0 1 | 0 // 0 0 1 0 | 1 // 0 0 1 1 | 1 // 0 1 0 0 | 0 // 0 1 0 1 | 0 // 0 1 1 0 | 1 // 1 0 0 0 | 0 // 1 0 0 1 | 1 // 1 1 0 0 | 0 const MFromD eqHL = Eq(a, b); const VFromD ltHL = VecFromMask(d, Lt(a, b)); // We need to bring cL to the upper lane/bit corresponding to cH. Comparing // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the // comparison result leftwards requires only 4. IfThenElse compiles to the // same code as OrAnd(). const VFromD ltLx = DupEven(ltHL); const VFromD outHx = IfThenElse(eqHL, ltLx, ltHL); return MaskFromVec(DupOdd(outHx)); } template HWY_INLINE MFromD Lt128Upper(D d, VFromD a, VFromD b) { const VFromD ltHL = VecFromMask(d, Lt(a, b)); return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); } // ------------------------------ Eq128 template HWY_INLINE MFromD Eq128(D d, VFromD a, VFromD b) { static_assert(IsSame, uint64_t>(), "T must be u64"); const VFromD eqHL = VecFromMask(d, Eq(a, b)); return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); } template HWY_INLINE MFromD Eq128Upper(D d, VFromD a, VFromD b) { const VFromD eqHL = VecFromMask(d, Eq(a, b)); return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); } // ------------------------------ Ne128 template HWY_INLINE MFromD Ne128(D d, VFromD a, VFromD b) { static_assert(IsSame, uint64_t>(), "T must be u64"); const VFromD neHL = VecFromMask(d, Ne(a, b)); return MaskFromVec(Or(Reverse2(d, neHL), neHL)); } template HWY_INLINE MFromD Ne128Upper(D d, VFromD a, VFromD b) { const VFromD neHL = VecFromMask(d, Ne(a, b)); return MaskFromVec(InterleaveUpper(d, neHL, neHL)); } // ------------------------------ Min128, Max128 (Lt128) // Without a native OddEven, it seems infeasible to go faster than Lt128. 
// NOTE(review): in this copy of the file the contents of most `<...>` template
// parameter/argument lists are missing (e.g. `template HWY_INLINE`,
// `template )>`, `DFromV d`), so the code below does not compile as written.
// Tokens are left exactly as found - confirm against the upstream source.

// Returns `a` where Lt128(d, a, b) is true, otherwise `b`.
template
HWY_INLINE VFromD Min128(D d, VFromD a, VFromD b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

// Returns `a` where Lt128(d, b, a) is true (b < a), otherwise `b`.
template
HWY_INLINE VFromD Max128(D d, VFromD a, VFromD b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

// Same as Min128, but the selection uses Lt128Upper.
template
HWY_INLINE VFromD Min128Upper(D d, VFromD a, VFromD b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

// Same as Max128, but the selection uses Lt128Upper.
template
HWY_INLINE VFromD Max128Upper(D d, VFromD a, VFromD b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}

// -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex

// Toggles the per-target flag HWY_NATIVE_LEADING_ZERO_COUNT.
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// Generates LeadingZeroCount for 8/16/32-bit signed and unsigned lanes from
// the vclz intrinsic family.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1)

// LeadingZeroCount built on top of the 32-bit version. Assuming `du` has
// 64-bit lanes and `du32` 32-bit lanes (their type arguments are missing in
// this copy - confirm): the per-half counts are split into a "lo" value (low
// 32 bits of each `du` lane, which include the added 32) and a "hi" value
// (each `du` lane shifted right by 32). Where the hi count equals 32, the lo
// value is returned; otherwise the hi count.
template )>
HWY_API V LeadingZeroCount(V v) {
  const DFromV d;
  const RebindToUnsigned du;
  const Repartition du32;

  const auto v_k32 = BitCast(du32, Set(du, 32));
  const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32;
  const auto v_u32_lo_lzcnt =
      And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu)));
  const auto v_u32_hi_lzcnt =
      BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt)));

  return BitCast(
      d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt));
}

// Returns (sizeof(T) * 8 - 1) - LeadingZeroCount(v) per lane.
template
HWY_API V HighestSetBitIndex(V v) {
  const DFromV d;
  using T = TFromD;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}

// Trailing zeros counted as the leading zeros of the bit-reversed input.
template
HWY_API V TrailingZeroCount(V v) {
  return LeadingZeroCount(ReverseBits(v));
}

// Overload that reverses the bits via a `du8` view (ReverseBits) followed by
// ReverseLaneBytes, then counts leading zeros.
template
HWY_API V TrailingZeroCount(V v) {
  const DFromV d;
  const Repartition du8;
  return LeadingZeroCount(
      ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))));
}

namespace detail {  // for code folding
// On ARMv7, undefine the vuzp1/vuzp2/vzip1/vzip2 names (presumably defined as
// macros earlier in this file - not visible here).
#if HWY_ARCH_ARM_V7
#undef vuzp1_s8
#undef vuzp1_u8
#undef vuzp1_s16
#undef vuzp1_u16
#undef vuzp1_s32
#undef vuzp1_u32
#undef vuzp1_f32
#undef vuzp1q_s8
#undef vuzp1q_u8
#undef vuzp1q_s16
#undef vuzp1q_u16
#undef vuzp1q_s32
#undef vuzp1q_u32
#undef vuzp1q_f32
#undef vuzp2_s8
#undef vuzp2_u8
#undef vuzp2_s16
#undef vuzp2_u16
#undef vuzp2_s32
#undef vuzp2_u32
#undef vuzp2_f32
#undef vuzp2q_s8
#undef vuzp2q_u8
#undef vuzp2q_s16
#undef vuzp2q_u16
#undef vuzp2q_s32
#undef vuzp2q_u32
#undef vuzp2q_f32
#undef vzip1_s8
#undef vzip1_u8
#undef vzip1_s16
#undef vzip1_u16
#undef vzip1_s32
#undef vzip1_u32
#undef vzip1_f32
#undef vzip1q_s8
#undef vzip1q_u8
#undef vzip1q_s16
#undef vzip1q_u16
#undef vzip1q_s32
#undef vzip1q_u32
#undef vzip1q_f32
#undef vzip2_s8
#undef vzip2_u8
#undef vzip2_s16
#undef vzip2_u16
#undef vzip2_s32
#undef vzip2_u32
#undef vzip2_f32
#undef vzip2q_s8
#undef vzip2q_u8
#undef vzip2q_s16
#undef vzip2q_u16
#undef vzip2q_s32
#undef vzip2q_u32
#undef vzip2q_f32
#endif

// Undefine the function-generator helper macros so they do not leak past the
// end of this file.
#undef HWY_NEON_BUILD_ARG_1
#undef HWY_NEON_BUILD_ARG_2
#undef HWY_NEON_BUILD_ARG_3
#undef HWY_NEON_BUILD_PARAM_1
#undef HWY_NEON_BUILD_PARAM_2
#undef HWY_NEON_BUILD_PARAM_3
#undef HWY_NEON_BUILD_RET_1
#undef HWY_NEON_BUILD_RET_2
#undef HWY_NEON_BUILD_RET_3
#undef HWY_NEON_BUILD_TPL_1
#undef HWY_NEON_BUILD_TPL_2
#undef HWY_NEON_BUILD_TPL_3
#undef HWY_NEON_DEF_FUNCTION
#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
#undef HWY_NEON_DEF_FUNCTION_BFLOAT_16
#undef HWY_NEON_DEF_FUNCTION_FLOAT_16
#undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32
#undef HWY_NEON_DEF_FUNCTION_FLOAT_32
#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
#undef HWY_NEON_DEF_FUNCTION_FULL_UI
#undef HWY_NEON_DEF_FUNCTION_FULL_UI_64
#undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64
#undef HWY_NEON_DEF_FUNCTION_INT_16
#undef HWY_NEON_DEF_FUNCTION_INT_32
#undef HWY_NEON_DEF_FUNCTION_INT_64
#undef HWY_NEON_DEF_FUNCTION_INT_8
#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_INTS
#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
#undef HWY_NEON_DEF_FUNCTION_UI_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UIF_64
#undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINT_16
#undef HWY_NEON_DEF_FUNCTION_UINT_32
#undef HWY_NEON_DEF_FUNCTION_UINT_64
#undef HWY_NEON_DEF_FUNCTION_UINT_8
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINTS
#undef HWY_NEON_EVAL
#undef HWY_NEON_IF_EMULATED_D
}  // namespace detail

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();