/* Vectorized routines for x86 AVX-512 instructions. * * This header file, unusually, provides many complete function * implementations so they can be inlined by the compiler. * * Contents: * 1. Function declarations for esl_avx512.c * 2. Inlined functions: horizontal max, sum * 3. Inlined functions: left, right shift */ #ifndef eslAVX512_INCLUDED #define eslAVX512_INCLUDED #include "esl_config.h" #ifdef eslENABLE_AVX512 #include "easel.h" #include #include /***************************************************************** * 1. Function declarations for esl_avx512.c *****************************************************************/ extern void esl_avx512_dump_512i_hex8(__m512i v); /***************************************************************** * 2. Inlined functions: horizontal max, sum *****************************************************************/ /* Function: esl_avx512_hmax_epu8() * Synopsis: Return max of 64 unsigned uint8_t elements in epu8 vector. */ static inline uint8_t esl_avx512_hmax_epu8(__m512i a) { // Use AVX instructions for this because AVX-512 can't extract 8-bit quantities // Intel has stated that there will be no performance penalty for switching between AVX-512 and AVX __m256i b = _mm256_max_epu8(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1)); b = _mm256_max_epu8(b, _mm256_permute2x128_si256(b, b, 0x01)); b = _mm256_max_epu8(b, _mm256_shuffle_epi32 (b, 0x4e)); b = _mm256_max_epu8(b, _mm256_shuffle_epi32 (b, 0xb1)); b = _mm256_max_epu8(b, _mm256_shufflelo_epi16 (b, 0xb1)); b = _mm256_max_epu8(b, _mm256_srli_si256 (b, 1)); return _mm256_extract_epi8(b, 0); // epi8 is fine here. gets cast properly to uint8_t on return. } /* Function: esl_avx512_hmax_epu8() * Synopsis: Return max of 64 unsigned uint8_t elements in epu8 vector. * Incept: SRE, Thu May 25 13:20:45 2017 [Old 97's, Oppenheimer] */ static inline int8_t esl_avx512_hmax_epi8(__m512i a) { __m256i b = _mm256_max_epi8(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1)); b = _mm256_max_epi8(b, _mm256_permute2x128_si256(b, b, 0x01)); b = _mm256_max_epi8(b, _mm256_shuffle_epi32 (b, 0x4e)); b = _mm256_max_epi8(b, _mm256_shuffle_epi32 (b, 0xb1)); b = _mm256_max_epi8(b, _mm256_shufflelo_epi16 (b, 0xb1)); b = _mm256_max_epi8(b, _mm256_srli_si256 (b, 1)); return _mm256_extract_epi8(b, 0); } /* Function: esl_avx512_hmax_epi16() * Synopsis: Return max of 32 signed int8_t elements in epi16 vector. */ static inline int16_t esl_avx512_hmax_epi16(__m512i a) { __m256i b = _mm256_max_epi16(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1)); b = _mm256_max_epi16(b, _mm256_permute2x128_si256(b, b, 0x01)); b = _mm256_max_epi16(b, _mm256_shuffle_epi32 (b, 0x4e)); b = _mm256_max_epi16(b, _mm256_shuffle_epi32 (b, 0xb1)); b = _mm256_max_epi16(b, _mm256_shufflelo_epi16 (b, 0xb1)); return _mm256_extract_epi16(b, 0); } /* Function: esl_avx512_hsum_ps() * Synopsis: sums the floating-point values in an __m512 vector * returns the result in ret_sum * Purpose: To compute the sum of the 32-bit float elements of a 512-bit vector */ static inline void esl_avx512_hsum_ps(__m512 a, float *ret_sum) { __m512 temp1_AVX_512 = _mm512_shuffle_f32x4(a, a, 0x4e); //swap high and low halves of a __m512 temp2_AVX_512 = _mm512_add_ps(a, temp1_AVX_512); // sum corresponding floats in the high, low halves temp1_AVX_512 = _mm512_shuffle_f32x4(temp2_AVX_512, temp2_AVX_512, 0xb1); //swap high and low quarters of each half of temp2 temp2_AVX_512 = _mm512_add_ps(temp2_AVX_512, temp1_AVX_512); // sum corresponding floats in the high, low quarters temp1_AVX_512 = _mm512_shuffle_ps(temp2_AVX_512, temp2_AVX_512, 0x4e); //swap high and low eigths of each quarter of a temp2_AVX_512 = _mm512_add_ps(temp2_AVX_512, temp1_AVX_512); // sum corresponding floats in the high, low eighths temp1_AVX_512 = _mm512_shuffle_ps(temp2_AVX_512, temp2_AVX_512, 0xb1); //swap high and low sixteenths of each eighth temp2_AVX_512 = _mm512_add_ps(temp2_AVX_512, temp1_AVX_512); // each element of temp2_AVX_512 now contains the sum of all the floats in a __m256 temp3_AVX = _mm512_extractf32x8_ps(temp2_AVX_512, 0); //Grab the low half of temp2_AVX_512 // because AVX-512 doesn't provide an operation to extract one float from a 512-bit vector // printf("output sum vector is: "); // print_512(temp2_AVX_512); int *retint_ptr = (int *) ret_sum; // This is a horrible hack because there isn't an intrinsic to extract a float from // an __m256. Do this to avoid casting an int back to a float and screwing it up *retint_ptr = _mm256_extract_epi32((__m256i) temp3_AVX, 0); } /***************************************************************** * 3. Inlined functions: left and right shifts *****************************************************************/ /* Function: esl_avx512_rightshift_int8() * Synopsis: Shift int8 vector elements to the right, shifting -inf on. * Incept: SRE, Sun Jun 4 17:43:14 2017 * See: esl_sse.h::esl_sse_rightshift_int8() */ static inline __m512i esl_avx512_rightshift_int8(__m512i v, __m512i neginfmask) { // Similar to AVX logic, but complicated by lack of permute2x128 instruction. v = _mm512_alignr_epi8(v, _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90), 15); return _mm512_or_si512(v, neginfmask); } /* Function: esl_avx512_rightshift_int16() * Synopsis: Shift int16 vector elements to the right, shifting -inf on. * Incept: SRE, Sun Jun 4 17:49:02 2017 * See: esl_sse.h::esl_sse_rightshift_int16() */ static inline __m512i esl_avx512_rightshift_int16(__m512i v, __m512i neginfmask) { v = _mm512_alignr_epi8(v, _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90), 14); return _mm512_or_si512(v, neginfmask); } /* Function: esl_avx512_rightshiftz_float() * Synopsis: Shift float vector elements to the right, shifting zero on. * Incept: SRE, Sun Jun 4 17:59:54 2017 * See: esl_sse.h::esl_sse_rightshiftz_float() */ static inline __m512 esl_avx512_rightshiftz_float(__m512 v) { return ((__m512) _mm512_alignr_epi8((__m512i) v, _mm512_maskz_shuffle_i32x4(0xfff0, (__m512i) v, (__m512i) v, 0x90), 12)); } /* Function: esl_avx512_leftshiftz_float() * Synopsis: Shift float vector elements to the left, shifting zero on. * Incept: SRE, Sun Jun 4 18:04:34 2017 * See: esl_sse.h::esl_sse_leftshiftz_float() */ static inline __m512 esl_avx512_leftshiftz_float(__m512 v) { return ((__m512) _mm512_alignr_epi8( _mm512_maskz_shuffle_i32x4(0x0fff, (__m512i) v, (__m512i) v, 0x39), (__m512i) v, 4)); } #endif //eslAVX512_INCLUDED #endif //eslENABLE_AVX512