/* Vectorized routines for x86 Streaming SIMD Extensions (SSE). * * This header file, unusually, provides many complete function * implementations so they can be inlined by the compiler. * * esl_sse supports both our SSE implementations and our SSE4 * implementations. In a plain "SSE" (i.e. SSE/SSE2, e.g. HMMER3), * SSE4.1-dependent code is ifdef'ed out. * * Contents: * 1. Function declarations for esl_sse.c * 2. Inlined functions: horizontal max, min, sum * 3. Inlined functions: left, right shift * 4. Inlined functions: any_gt * 5. Inlined functions: select */ #ifndef eslSSE_INCLUDED #define eslSSE_INCLUDED #include "esl_config.h" #if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) #include "easel.h" #include #include /***************************************************************** * 1. Function declarations (from esl_sse.c) *****************************************************************/ extern __m128 esl_sse_logf(__m128 x); extern __m128 esl_sse_expf(__m128 x); extern void esl_sse_dump_ps(FILE *fp, __m128 v); /***************************************************************** * 2. Inlined functions: horizontal max, min *****************************************************************/ /* Function: esl_sse_hmax_epu8() * Synopsis: Return max of 16 uint8_t elements in epu8 vector. */ static inline uint8_t esl_sse_hmax_epu8(__m128i a) { a = _mm_max_epu8(a, _mm_srli_si128(a, 8)); a = _mm_max_epu8(a, _mm_srli_si128(a, 4)); a = _mm_max_epu8(a, _mm_srli_si128(a, 2)); a = _mm_max_epu8(a, _mm_srli_si128(a, 1)); return (uint8_t) _mm_extract_epi16(a, 0); /* only low-order 8 bits set; so _epi16 or _epi8 equiv; _epi8 is SSE4.1 */ } #ifdef eslENABLE_SSE4 /* Function: esl_sse_hmax_epi8() * Synopsis: Return max of 16 int8_t elements in epi8 vector. * (SSE4.1) */ static inline int8_t esl_sse_hmax_epi8(__m128i a) { a = _mm_max_epi8(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(2,3,0,1))); // _MM_SHUFFLE() args are reversed._MM_SHUFFLE(3,2,1,0) is a no-op, for example. a = _mm_max_epi8(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(0,1,2,3))); a = _mm_max_epi8(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1))); a = _mm_max_epi8(a, _mm_srli_epi16 (a, 8)); return (int8_t) _mm_cvtsi128_si32(a); } #endif /* Function: esl_sse_hmax_epi16() * Synopsis: Return max of 16 int16_t elements in epi16 vector. */ static inline int16_t esl_sse_hmax_epi16(__m128i a) { a = _mm_max_epi16(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(1,0,3,2))); a = _mm_max_epi16(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1,0,3,2))); a = _mm_max_epi16(a, _mm_srli_epi32(a, 16)); return (int16_t) _mm_cvtsi128_si32(a); } /* Function: esl_sse_hmax_ps() * Synopsis: Find the maximum of elements in a vector. * * Purpose: Find the maximum valued element in the four float elements * in , and return that maximum value in <*ret_max>. * * Xref: J3/90 for benchmarking of some alternative implementations. */ static inline void esl_sse_hmax_ps(__m128 a, float *ret_max) { a = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1))); a = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(ret_max, a); } /* Function: esl_sse_hmin_ps() * Synopsis: Find the minimum of elements in a vector. * * Purpose: Find the minimum valued element in the four float elements * in and return that minimum value in <*ret_min>. */ static inline void esl_sse_hmin_ps(__m128 a, float *ret_min) { a = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1))); a = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(ret_min, a); } /* Function: esl_sse_hsum_ps() * Synopsis: Takes the horizontal sum of elements in a vector. * * Purpose: Add the four float elements in vector ; return * that sum in <*ret_sum>. */ static inline void esl_sse_hsum_ps(__m128 a, float *ret_sum) { a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1))); a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(ret_sum, a); } /***************************************************************** * 3. Inlined functions: left, right shift *****************************************************************/ /* Function: esl_sse_rightshift_int8() * Synopsis: Shift int8 vector elements to the right, shifting -inf on * Incept: SRE, Sun Jun 4 09:52:45 2017 * * Purpose: Given an int8 vector <{ a0 .. a16}>, and a mask <{-inf, * 0*15 }> with the desired value of -inf in slot 0 * (and zeros elsewhere), return <{ -inf, a0..a14 }>; * i.e. shift the values in to the right, while * shifting $-\infty$ on. * * By our convention, "right" and "left" refer to memory * order (low addresses on the left). On a little-endian * (x86) architecture, this is a left shift in the hardware * register. * * This can be used both for signed (epi8) and unsigned * (epu8) int8 vectors. * * Xref: HMMER's simdvec.md: on our left/right convention. */ static inline __m128i esl_sse_rightshift_int8(__m128i a, __m128i neginfmask) { return _mm_or_si128(_mm_slli_si128(a, 1), neginfmask); } /* Function: esl_sse_rightshift_int16() * Synopsis: Shift int16 vector elements to the right, shifting -inf on * Incept: SRE, Sun Jun 4 10:12:24 2017 [Gary Jules, Mad World] * * Purpose: Same as but for int16. */ static inline __m128i esl_sse_rightshift_int16(__m128i a, __m128i neginfmask) { return _mm_or_si128( _mm_slli_si128(a, 2), neginfmask); } /* Function: esl_sse_rightshiftz_float() * Synopsis: Shift float vector elements to the right, shifting zero on. * * Purpose: Same as but for floats, * and the value that is shifted on is a zero. */ static inline __m128 esl_sse_rightshiftz_float(__m128 a) { // Tricky. IEEE754 representation of zero is all 0 bits, so shift alone suffices. return (__m128) _mm_slli_si128( (__m128i) a, 4); } /* Function: esl_sse_leftshiftz_float() * Synopsis: Shift float vector elements to the left, shifting zero on. * * Purpose: Same as but leftwise: <[ a0 a1 a2 * a3 ]> becomes <[ a1 a2 a3 0 ]>. Used in Backwards. */ static inline __m128 esl_sse_leftshiftz_float(__m128 a) { // Same trick. IEEE754 representation of zero is all 0 bits. return (__m128) _mm_srli_si128( (__m128i) a, 4); } /* Function: esl_sse_rightshift_ps() * Synopsis: Shift vector elements to the right, shifting -inf on. * * Purpose: Returns a vector containing * <{ b[0] a[0] a[1] a[2] }>: * i.e. shift the values in to the * right, and load the first value of * into the first slot. * * Note: used in Infernal. */ static inline __m128 esl_sse_rightshift_ps(__m128 a, __m128 b) { return _mm_move_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 0)), b); } /* Function: esl_sse_leftshift_ps() * Synopsis: Shift vector elements to the left, shifting -inf on. * * Purpose: Returns a vector containing * <{ a[1] a[2] a[3] b[0]}>: * i.e. shift the values in to the * left and load the first value of * into the first slot. * * Note: used in Infernal. */ static inline __m128 esl_sse_leftshift_ps(__m128 a, __m128 b) { register __m128 v = _mm_move_ss(a, b); /* now b[0] a[1] a[2] a[3] */ return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); /* now a[1] a[2] a[3] b[0] */ } /***************************************************************** * 3. Inlined functions: any_gt *****************************************************************/ /* Function: esl_sse_any_gt_epu8() * Synopsis: Returns TRUE if any a[z] > b[z]. * * Purpose: Return TRUE if any b[z]> for * in two vectors of unsigned chars. * * We need this incantation because SSE provides * no instruction. * * For equality tests, note that works fine * for unsigned ints though there is no * instruction either). * * See vec_any_gt */ static inline int esl_sse_any_gt_epu8(__m128i a, __m128i b) { __m128i mask = _mm_cmpeq_epi8(_mm_max_epu8(a,b), b); /* anywhere a>b, mask[z] = 0x0; elsewhere 0xff */ int maskbits = _mm_movemask_epi8(_mm_xor_si128(mask, _mm_cmpeq_epi8(mask, mask))); /* the xor incantation is a bitwise inversion */ return maskbits != 0; } /* Function: esl_sse_any_gt_epi16() * Synopsis: Return >0 if any a[z] > b[z] */ static inline int esl_sse_any_gt_epi16(__m128i a, __m128i b) { return (_mm_movemask_epi8(_mm_cmpgt_epi16(a,b)) != 0); } /* Function: esl_sse_any_gt_ps() * Synopsis: Returns TRUE if any a[z] > b[z] * * Xref: From Apple Altivec/SSE migration guide. */ static inline int esl_sse_any_gt_ps(__m128 a, __m128 b) { __m128 mask = _mm_cmpgt_ps(a,b); int maskbits = _mm_movemask_ps( mask ); return maskbits != 0; } /***************************************************************** * 5. Inlined functions: select *****************************************************************/ /* Function: esl_sse_select_ps() * Synopsis: SSE equivalent of * * Purpose: Vector select. Returns a vector where * is all 0's; where is all 1's. * * Useful for avoiding conditional branches. For example, * to implement \ccode{if (a > 0) a += a;}: * * \begin{cchunk} * mask = _mm_cmpgt_ps(a, _mm_setzero_ps()); * twoa = _mm_add_ps(a, a); * a = esl_sse_select_ps(a, twoa, mask); * \end{cchunk} * * Notes: As recommended by the Altivec/SSE Migration Guide, * Apple Computer, Inc. */ static inline __m128 esl_sse_select_ps(__m128 a, __m128 b, __m128 mask) { b = _mm_and_ps(b, mask); a = _mm_andnot_ps(mask, a); return _mm_or_ps(a,b); } #endif /*eslENABLE_SSE || eslENABLE_SSE4 */ #endif /*eslSSE_INCLUDED*/