/* Copyright (c) 2018 Mozilla
                 2012-2017 Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
  AVX implementation of vector operations, compile with -mavx
  AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
*/

#ifndef VEC_AVX_H
#define VEC_AVX_H

#include <immintrin.h>
#include <math.h>
#include "celt/x86/x86cpu.h"

#define MAX_INPUTS (2048)

#define USE_SU_BIAS

#ifndef __SSE4_1__
/* No SSE4.1 floor: approximate floor(x) as round-to-nearest of (x - 0.5). */
static inline __m128 mm_floor_ps(__m128 x) {
  __m128 half = _mm_set1_ps(0.5);
  return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_sub_ps(x, half)));
}
#undef _mm_floor_ps
#define _mm_floor_ps(x) mm_floor_ps(x)
#endif
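/* Editor's annotation (not part of the original header): a scalar sketch of
   the floor-via-rounding identity used by mm_floor_ps above, for reference
   only. The function name is illustrative. */
#if 0
static inline float floor_sketch(float x) {
    /* rintf(x - 0.5f) equals floorf(x) except when x - 0.5f lands exactly on
       a half (e.g. x an odd integer), where round-to-even may come out one
       step low; the exp approximation below tolerates that off-by-one since
       its polynomial satisfies p(1) ~= 2*p(0). */
    return rintf(x - 0.5f);
}
#endif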
/* If we don't have AVX available, emulate what we need with SSE up to 4.1. */
#ifndef __AVX__

typedef struct {
  __m128 lo;
  __m128 hi;
} mm256_emu;
#define __m256 mm256_emu

static inline mm256_emu mm256_loadu_ps(const float *src) {
  mm256_emu ret;
  ret.lo = _mm_loadu_ps(&src[0]);
  ret.hi = _mm_loadu_ps(&src[4]);
  return ret;
}
#define _mm256_loadu_ps(src) mm256_loadu_ps(src)

static inline void mm256_storeu_ps(float *dst, mm256_emu src) {
  _mm_storeu_ps(dst, src.lo);
  _mm_storeu_ps(&dst[4], src.hi);
}
#define _mm256_storeu_ps(dst, src) mm256_storeu_ps(dst, src)

static inline mm256_emu mm256_setzero_ps(void) {
  mm256_emu ret;
  ret.lo = _mm_setzero_ps();
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_setzero_ps mm256_setzero_ps

static inline mm256_emu mm256_broadcast_ss(const float *x) {
  mm256_emu ret;
  ret.lo = _mm_set1_ps(*x);
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_broadcast_ss(x) mm256_broadcast_ss(x)

static inline mm256_emu mm256_set1_ps(float x) {
  mm256_emu ret;
  ret.lo = _mm_set1_ps(x);
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_set1_ps(x) mm256_set1_ps(x)

static inline mm256_emu mm256_mul_ps(mm256_emu a, mm256_emu b) {
  mm256_emu ret;
  ret.lo = _mm_mul_ps(a.lo, b.lo);
  ret.hi = _mm_mul_ps(a.hi, b.hi);
  return ret;
}
#define _mm256_mul_ps(a,b) mm256_mul_ps(a,b)

static inline mm256_emu mm256_add_ps(mm256_emu a, mm256_emu b) {
  mm256_emu ret;
  ret.lo = _mm_add_ps(a.lo, b.lo);
  ret.hi = _mm_add_ps(a.hi, b.hi);
  return ret;
}
#define _mm256_add_ps(a,b) mm256_add_ps(a,b)

static inline mm256_emu mm256_max_ps(mm256_emu a, mm256_emu b) {
  mm256_emu ret;
  ret.lo = _mm_max_ps(a.lo, b.lo);
  ret.hi = _mm_max_ps(a.hi, b.hi);
  return ret;
}
#define _mm256_max_ps(a,b) mm256_max_ps(a,b)

static inline mm256_emu mm256_min_ps(mm256_emu a, mm256_emu b) {
  mm256_emu ret;
  ret.lo = _mm_min_ps(a.lo, b.lo);
  ret.hi = _mm_min_ps(a.hi, b.hi);
  return ret;
}
#define _mm256_min_ps(a,b) mm256_min_ps(a,b)

static inline mm256_emu mm256_rcp_ps(mm256_emu a) {
  mm256_emu ret;
  ret.lo = _mm_rcp_ps(a.lo);
  ret.hi = _mm_rcp_ps(a.hi);
  return ret;
}
#define _mm256_rcp_ps(a) mm256_rcp_ps(a)

static inline __m128 mm256_extractf128_ps(mm256_emu x, int i) {
  return (i==0) ? x.lo : x.hi;
}
#undef _mm256_extractf128_ps
#define _mm256_extractf128_ps(x,i) mm256_extractf128_ps(x,i)

static inline mm256_emu mm256_insertf128_ps(mm256_emu dst, __m128 src, int i) {
  if (i==0) dst.lo = src;
  else dst.hi = src;
  return dst;
}
#undef _mm256_insertf128_ps
#define _mm256_insertf128_ps(dst,src,i) mm256_insertf128_ps(dst,src,i)

#endif /* __AVX__ */
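/* Editor's annotation (not part of the original header): with the shims
   above, code written against the 8-wide _mm256_* API compiles unchanged on
   SSE-only targets; each emulated op simply applies the SSE instruction to
   the lo and hi halves. A usage sketch (illustrative names): */
#if 0
static inline void scale8(float *y, const float *x, float g) {
    __m256 v = _mm256_loadu_ps(x);           /* two _mm_loadu_ps when emulated */
    v = _mm256_mul_ps(v, _mm256_set1_ps(g)); /* two _mm_mul_ps when emulated   */
    _mm256_storeu_ps(y, v);
}
#endif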
/* If we don't have AVX2 available, emulate what we need with SSE up to 4.1. */
#ifndef __AVX2__

typedef struct {
  __m128i lo;
  __m128i hi;
} mm256i_emu;
typedef __m256i real_m256i;
#define __m256i mm256i_emu

static inline mm256i_emu mm256_setzero_si256(void) {
  mm256i_emu ret;
  ret.lo = _mm_setzero_si128();
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_setzero_si256 mm256_setzero_si256

static inline mm256i_emu mm256_loadu_si256(const mm256i_emu *src) {
  mm256i_emu ret;
  ret.lo = _mm_loadu_si128((const __m128i*)src);
  ret.hi = _mm_loadu_si128(&((const __m128i*)src)[1]);
  return ret;
}
#define _mm256_loadu_si256(src) mm256_loadu_si256(src)

static inline void mm256_storeu_si256(mm256i_emu *dst, mm256i_emu src) {
  _mm_storeu_si128((__m128i*)dst, src.lo);
  _mm_storeu_si128(&((__m128i*)dst)[1], src.hi);
}
#define _mm256_storeu_si256(dst, src) mm256_storeu_si256(dst, src)

static inline mm256i_emu mm256_broadcastd_epi32(__m128i x) {
  mm256i_emu ret;
  ret.hi = ret.lo = _mm_shuffle_epi32(x, 0);
  return ret;
}
#define _mm256_broadcastd_epi32(x) mm256_broadcastd_epi32(x)

static inline mm256i_emu mm256_set1_epi32(int x) {
  mm256i_emu ret;
  ret.lo = _mm_set1_epi32(x);
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_set1_epi32(x) mm256_set1_epi32(x)

static inline mm256i_emu mm256_set1_epi16(int x) {
  mm256i_emu ret;
  ret.lo = _mm_set1_epi16(x);
  ret.hi = ret.lo;
  return ret;
}
#define _mm256_set1_epi16(x) mm256_set1_epi16(x)

static inline mm256i_emu mm256_add_epi32(mm256i_emu a, mm256i_emu b) {
  mm256i_emu ret;
  ret.lo = _mm_add_epi32(a.lo, b.lo);
  ret.hi = _mm_add_epi32(a.hi, b.hi);
  return ret;
}
#define _mm256_add_epi32(a,b) mm256_add_epi32(a,b)

static inline mm256i_emu mm256_madd_epi16(mm256i_emu a, mm256i_emu b) {
  mm256i_emu ret;
  ret.lo = _mm_madd_epi16(a.lo, b.lo);
  ret.hi = _mm_madd_epi16(a.hi, b.hi);
  return ret;
}
#define _mm256_madd_epi16(a,b) mm256_madd_epi16(a,b)

static inline mm256i_emu mm256_maddubs_epi16(mm256i_emu a, mm256i_emu b) {
  mm256i_emu ret;
  ret.lo = _mm_maddubs_epi16(a.lo, b.lo);
  ret.hi = _mm_maddubs_epi16(a.hi, b.hi);
  return ret;
}
#define _mm256_maddubs_epi16(a,b) mm256_maddubs_epi16(a,b)

/* Emulating the conversion functions is tricky because they use __m256i but
   are defined in AVX. So we need to make a special case for when only AVX
   (and not AVX2) is available. */
#ifdef __AVX__

typedef union {
  mm256i_emu fake;
  real_m256i real;
} mm256_union;

static inline __m256 mm256_cvtepi32_ps(mm256i_emu a) {
  mm256_union src;
  src.fake = a;
  return _mm256_cvtepi32_ps(src.real);
}
#define _mm256_cvtepi32_ps(a) mm256_cvtepi32_ps(a)

static inline mm256i_emu mm256_cvtps_epi32(__m256 a) {
  mm256_union ret;
  ret.real = _mm256_cvtps_epi32(a);
  return ret.fake;
}
#define _mm256_cvtps_epi32(a) mm256_cvtps_epi32(a)

#else

static inline mm256_emu mm256_cvtepi32_ps(mm256i_emu a) {
  mm256_emu ret;
  ret.lo = _mm_cvtepi32_ps(a.lo);
  ret.hi = _mm_cvtepi32_ps(a.hi);
  return ret;
}
#define _mm256_cvtepi32_ps(a) mm256_cvtepi32_ps(a)

static inline mm256i_emu mm256_cvtps_epi32(mm256_emu a) {
  mm256i_emu ret;
  ret.lo = _mm_cvtps_epi32(a.lo);
  ret.hi = _mm_cvtps_epi32(a.hi);
  return ret;
}
#define _mm256_cvtps_epi32(a) mm256_cvtps_epi32(a)

#endif /* __AVX__ */

#endif /* __AVX2__ */
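/* Editor's annotation (not part of the original header): the AVX-only
   conversion path above relies on mm256i_emu and the hardware __m256i having
   the same 256-bit layout (two 128-bit halves), so mm256_union is pure type
   punning with no data movement. A usage sketch of the resulting API: */
#if 0
mm256i_emu a = _mm256_set1_epi32(42);  /* built as two __m128i halves        */
__m256 f = _mm256_cvtepi32_ps(a);      /* wrapper reinterprets a as one real
                                          __m256i through mm256_union        */
#endif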
/* In case we don't have FMA, make it a mul and an add. */
#if !(defined(__FMA__) && defined(__AVX__))
#define _mm256_fmadd_ps(a,b,c) _mm256_add_ps(_mm256_mul_ps(a, b), c)
#define _mm_fmadd_ps(a,b,c) _mm_add_ps(_mm_mul_ps(a, b), c)
#endif

#ifdef __AVX2__
static inline __m256 exp8_approx(__m256 X) {
  const __m256 K0 = _mm256_set1_ps(0.99992522f);
  const __m256 K1 = _mm256_set1_ps(0.69583354f);
  const __m256 K2 = _mm256_set1_ps(0.22606716f);
  const __m256 K3 = _mm256_set1_ps(0.078024523f);
  const __m256 log2_E = _mm256_set1_ps(1.44269504f);
  const __m256 max_in = _mm256_set1_ps(50.f);
  const __m256 min_in = _mm256_set1_ps(-50.f);
  __m256 XF, Y;
  __m256i I;
  /* Convert e^X into 2^X and clamp the input to avoid overflow. */
  X = _mm256_mul_ps(X, log2_E);
  X = _mm256_max_ps(min_in, _mm256_min_ps(max_in, X));
  /* Split into integer and fractional parts. */
  XF = _mm256_floor_ps(X);
  I = _mm256_cvtps_epi32(XF);
  X = _mm256_sub_ps(X, XF);
  /* Cubic polynomial approximation of 2^X for X in [0, 1). */
  Y = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_fmadd_ps(K3, X, K2), X, K1), X, K0);
  /* Multiply by 2^I by adding I directly into the IEEE 754 exponent field. */
  I = _mm256_slli_epi32(I, 23);
  Y = _mm256_castsi256_ps(_mm256_add_epi32(I, _mm256_castps_si256(Y)));
  return Y;
}

static inline void vector_ps_to_epi8(unsigned char *x, const float *_x, int len) {
  int i;
  __m256 const127 = _mm256_set1_ps(127.f);
  /* The loop body below reconstructs a truncated original and assumes len is
     a multiple of 8: map floats in [-1, 1] to [0, 254] as x*127 + 127, then
     narrow the eight 32-bit results down to eight bytes. */
  for (i=0;i<len;i+=8) {
    __m256 xf;
    __m256i xi;
    xf = _mm256_loadu_ps(&_x[i]);
    xf = _mm256_fmadd_ps(xf, const127, const127);
    xi = _mm256_cvtps_epi32(xf);
    /* Pack int32 -> uint16 -> uint8 with unsigned saturation, gathering the
       eight valid bytes into the low qword before the narrow store. */
    xi = _mm256_packus_epi32(xi, _mm256_setzero_si256());
    xi = _mm256_permute4x64_epi64(xi, 0xD8);
    xi = _mm256_packus_epi16(xi, _mm256_setzero_si256());
    _mm_storel_epi64((__m128i*)&x[i], _mm256_castsi256_si128(xi));
  }
}
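/* Editor's annotation (not part of the original header): a scalar sketch of
   the exp trick used by exp8_approx above, for reference only. The constants
   are the ones from exp8_approx; the function name is illustrative. */
#if 0
static inline float exp_approx_scalar_sketch(float x) {
    union { float f; int i; } u;
    float X, XF, F;
    X = x*1.44269504f;                             /* e^x == 2^(x*log2(e))  */
    X = X > 50.f ? 50.f : (X < -50.f ? -50.f : X); /* clamp to avoid overflow */
    XF = floorf(X);
    F = X - XF;                                    /* fractional part in [0,1) */
    /* Cubic approximation of 2^F, then scale by 2^XF via the exponent bits. */
    u.f = 0.99992522f + F*(0.69583354f + F*(0.22606716f + F*0.078024523f));
    u.i += ((int)XF) << 23;
    return u.f;
}
#endif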