/****************************************************************************** Copyright (c) 2007-2018, Intel Corp. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ******************************************************************************/ #define MAKE_COMMON #if !defined(BUILD_FILE_NAME) # define BUILD_FILE_NAME F_POW_BUILD_FILE_NAME #endif /* ** Pick up the latest default DPML definitions. however, don't let ** dpml_private.h default TABLE_NAME. This has already been done or over- ** ridden when the constant file was generated. */ #define DONT_DEFAULT_TABLE_NAME 1 #define NEW_DPML_MACROS 1 #include "dpml_private.h" #if defined(UNDEF_TABLE_NAME) # undef TABLE_NAME #endif /* ** Pick up common build time constants and definitions from the generated ** power constant table file */ #define DEFINE_SYMBOLIC_CONSTANTS 1 #include STR( BUILD_FILE_NAME ) /* ** Pick up common compile time constants and definitions shared by ** pow and exp. */ #include "dpml_pow.h" /* Must come after BUILD_FILE_NAME */ /* ** Select data type and algorithm specific constants from the included file */ #if EXP2 # define EXP_SELECT(a,b,c) a # define EXP_EXP10_SELECT(a,b) # define EXP2_SELECT(a,b) a # define NOT_EXP2(a) #elif EXP10 # define EXP_SELECT(a,b,c) b # define EXP_EXP10_SELECT(a,b) b # define EXP2_SELECT(a,b) b # define NOT_EXP2(a) a #else # define EXP_SELECT(a,b,c) c # define EXP_EXP10_SELECT(a,b) a # define EXP2_SELECT(a,b) b # define NOT_EXP2(a) a #endif #if (!ONE_TYPE && USE_BACKUP) # define EXP_HI_CHECK EXP_SELECT(EXP2_HI_CHECK_R, \ EXP_HI_CHECK_R, \ EXP10_HI_CHECK_R) # define EXP_LO_CHECK EXP_LO_CHECK_R # define POW2_LO_CHECK POW2_LO_CHECK_R # define POW2_HI_CHECK POW2_HI_CHECK_R #else # define EXP_HI_CHECK EXP_SELECT(EXP2_HI_CHECK_F, \ EXP_HI_CHECK_F, \ EXP10_HI_CHECK_F) # define EXP_LO_CHECK EXP_LO_CHECK_F # define POW2_LO_CHECK POW2_LO_CHECK_F # define POW2_HI_CHECK POW2_HI_CHECK_F #endif #define BIG ACC_BIG /* ** Design Overview: ** --------------- ** ** Given x and a positive integer POW2_K, for b = 2, e or 10 define: ** ** t = x * (lnb/ln2) ** m = nint(t * (2^POW2_K))/2^POW2_K (1) ** z = x - m ( ln2/lnb) ** w = t - m ** ** Then ** ** e^x = 2^(x*(lnb/ln2)) ** = 2^(m + w) ** = 2^m * 2^w (2) ** = 2^m * e^z (3) ** ** From the definition of m, it follows that m can be re-written as ** ** m = I + j/(2^POW2_K) ** ** where I and j are integers and 0 <= j <= 2^POW2_K - 1. It follows that (3) ** can be written as: ** ** e^x = 2^(I + j/(2^POW2_K)) * e^z ** = 2^I * 2^(j/(2^POW2_K)) * e^z (4) ** ** Now e^z can be approximated by a polynomial of the form: ** ** e^z = 1 + z*p(z) ** ** So that (4) can be rewritten as: ** ** e^x = 2^I * 2^(j/(2^POW2_K))*(1 + z*p(z)) (5) ** ** Design Details: ** -------------- ** ** o The multiplication by 2^I can be accomplished by adding I to the ** exponent field of a binary floating point number or creating an ** appropriate floating point value and multiplying. ** ** o There are exactly 2^(POW2_K) possible values represented by ** 2^(j/(2^POW2_K)). These can be stored in a table and looked up ** at run time using the bit sequence in 'j' as an index. The form of ** the table values (hi and lo pieces) is determined by efficiency ** considerations for fast pow and exp, although the performance ** impact on exp is minimal: one floating point add, if sequential ** polynomial evaluation is used; none if parallel evaluation is used. ** */ /* ** IMPLEMENTATION ISSUES: ** ---------------------- ** ** In general, extra precision is required at two key points in the algorithm: ** the argument reduction (computing z or w), and the scaling by 2^(j/2^POW2_K). ** On systems where no backup precision is available, these calculations are ** performed in hi and lo pieces to preserve the overall accuracy of the final ** result. ** ** Let T(j) = 2^(j/2^POW2_K) correctly rounded to F_PRECISION and define ** R(j) = [2^(j/2^POW2_K) - T(j)]/T(j). We assume that these values are ** precomputed and stored in a table. ** ** ** Data Types With No Backup ** ------------------------- ** ** Compute the reduced argument, z = x - m*(ln2/lnb), carefully, using ** ln2 in hi and lo pieces: ** ** z <-- [ x - m*hi_bits(ln2/lnb) ] - m*lo_bits(ln2/lnb) ** ** Perform the evaluation as: ** ** e^x = 2^I * 2^(j/(2^POW2_K)) * [ 1 + z*p(z) ] ** = 2^I * [ T(j) + T(j)*R(j) ] * [ 1 + z*p(z) ] ** = 2^I * T(j) * [ 1 + R(j) ] * [ 1 + z*p(z) ] ** = [ 2^I * T(j) ] * [ 1 + R(j) + z*p(z) + R(j)*z*p(z) ] ** ** Denote 2^I*T(j) as V(I,j) and note that the last term is insignificant ** relative to the final result, then we have: ** ** e^x = [ 2^I * T(j) ] * [ 1 + R(j) + z*p(z) + R(j)*z*p(z) ] ** = V(I,j) * { 1 + [ R(j) + z*p(z) ] } ** = V(I,j) + V(I,j)*[ R(j) + z*p(z) ] ** ** Data Types With Backup ** ---------------------- ** ** Compute all intermediate results in backup precision: ** ** w <-- (x/ln2) - m ** ** e^x = 2^I * 2^(j/(2^POW2_K)) * 2^w ** = 2^I * T(j) * [1 + w*q(w)] ** = V(I,j) * Q(w) ** ** where V(I,j) is as above. */ #ifndef BASE_NAME # define BASE_NAME EXP_SELECT( EXP2_BASE_NAME, \ EXP10_BASE_NAME, \ EXP_BASE_NAME) #endif #define ACC_POLY_F EXP_SELECT(ACC_POW2_POLY_F, ACC_EXP10_POLY_F, ACC_EXP_POLY_F) #define ERROR_OVERFLOW EXP2_SELECT(EXP2_OVERFLOW, EXP_OVERFLOW) #define ERROR_UNDERFLOW EXP2_SELECT(EXP2_UNDERFLOW, EXP_UNDERFLOW) #define NO_ERR_POS_INF EXP2_SELECT(EXP2_OF_INF, EXP_OF_INF) #define NO_ERR_NEG_INF EXP2_SELECT(EXP2_OF_NEG_INF, EXP_OF_NEG_INF) #if !defined(SPECIAL_EXP) # if !defined F_ENTRY_NAME # define F_ENTRY_NAME EXP_SELECT(F_EXP2_NAME, \ F_EXP10_NAME, \ F_EXP_NAME) # endif F_TYPE F_ENTRY_NAME( F_TYPE x ) { EXCEPTION_RECORD_DECLARATION B_TYPE fm, z, w, t; B_UNION stack_tmp_u; F_UNION stack_tmp_v; U_WORD status_word; WORD m, i, j; # if !COMPATIBILITY_MODE WORD func_error_word ; # endif /* ** Weed out: near overflow and underflow cases; NaN's, Inf's and ** denorms; arguments that would underflow during polynomial ** evaluation. ** ** The product x*(1/ln2) is on the critical path of this routine. ** Because the code is structured with a branch prior to the ** multiplication, some compilers fail to schedule the load of the ** constant 1/ln2 early enough to avoid delaying the reduced argument ** computation. To avoid this delay, we preload (1/ln2). */ stack_tmp_v.f = x; i = stack_tmp_v.F_SIGNED_HI_WORD; m = i & MAKE_MASK(F_SIGN_BIT_POS, 0); NOT_EXP2( t = EXP_EXP10_SELECT(RECIP_LN2, LN10_OV_LN2); ) if (((U_WORD) m - EXP_LO_CHECK) >= EXP_HI_CHECK) goto possible_problems; /* ** compute the reduced argument, w. Note that we obtain ** nint((x/ln2) * (2^POW2_K)) by adding x/ln2 to a suitably large ** constant (BIG). This requires that the rounding mode in effect ** at the time the add takes place must be round-to-nearest. */ z = EXP2_SELECT( (B_TYPE) x , ((B_TYPE) x) * t ) ; INIT_FPU_STATE_AND_ROUND_TO_NEAREST(status_word); t = BIG + z; /* Save for getting m later on */ fm = D_GROUP(t - BIG); #define CONS_HI EXP_EXP10_SELECT( LN2_HI, LN2_OV_LN10_HI ) #define CONS_LO EXP_EXP10_SELECT( LN2_LO, LN2_OV_LN10_LO ) w = EXP2_SELECT( z - fm , BACKUP_SELECT( z - fm, D_GROUP(x - fm*CONS_HI) - fm*CONS_LO ) ); /* ** Now get the bits of m as a integer and break it up into I and j */ stack_tmp_u.f = t; GET_LOW_32_BITS(m, stack_tmp_u); j = (m & POW2_INDEX_MASK) << POW2_INDEX_POS; i = m & (~POW2_INDEX_MASK); z = BACKUP_SELECT( ACC_POW2_POLY_R(w), ACC_POLY_F( POW2_LO_OV_POW2_HI(j), w ) ); /* ** Scale 2^(j/2^POW2_K) by 2^I, so that only one multiply is done. ** Note that the macros IPOW2(j) and IPOW2_LO(u,j) are used to access ** the table value T(j) as an integer rather than as a floating point ** value. */ IF_SMALL_WORD(IPOW2_LO(stack_tmp_u,j);) m = IPOW2(j); i = ALIGN_SCALE_WITH_EXP(i); m = W_ADD_TO_EXP_FIELD(m, i); stack_tmp_u.B_HI_WORD = m; t = stack_tmp_u.f; /* Check for possible problems and do the final multiply */ if (((U_WORD) IEEE_SELECT(m, (m & LO_MASK)) - POW2_LO_CHECK) >= POW2_HI_CHECK) goto boundary_check; z = BACKUP_SELECT(z*t, t + t*z); RESTORE_FPU_STATE(status_word); return (F_TYPE) z; boundary_check: /* ** Need to check for overflow, underflow and denormals. Don't need ** round to nearest state any more so restore original state */ RESTORE_FPU_STATE(status_word); /* ** Do the "final" multiple. However, when no backup is available ** the final multiply might involve a denormal or NaN, so we need to ** do this scaling carefully. Note that for the no-backup case, ** we are essentially re-doing the calculation as ** 2^I*(T(j) + T(j)*p(z)) */ BACKUP_SELECT(z = t*z;, t = POW2_HI(j); z = t + t*z; ) stack_tmp_u.f = z; m = stack_tmp_u.B_HI_WORD; IF_NO_BACKUP( /* "Multiply" by 2^i */ m = W_ADD_TO_EXP_FIELD(m, i); stack_tmp_u.B_HI_WORD = m; z = stack_tmp_u.f; ) /* ** Isolate exponent field and check for overflow and underflow. Note ** that subtracting 1 from the biased exponent field causes 0 exponents ** (which indicate underflows or denormal results) to be mapped to the ** high integer range. This allows testing for OK results, overflows ** and underflow by checking the size of (i-1) */ i = ((U_WORD) m) >> B_EXP_POS; IF_VAX( i &= MAKE_MASK(B_EXP_WIDTH + 1, 0); ) i -= BACKUP_SELECT( (F_MIN_BIN_EXP + B_EXP_BIAS), 1); if ((U_WORD) i <= F_MAX_BIN_EXP - BACKUP_SELECT( F_MIN_BIN_EXP, (1 - B_EXP_BIAS))) /* No exceptions, return final result */ return (F_TYPE) z; # if COMPATIBILITY_MODE j = ERROR_OVERFLOW; # else func_error_word = ERROR_WORD( STATUS_OVERFLOW, POS_HUGE_INDEX, POS_INFINITY_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) ; # endif if (((U_WORD) i) < (2*F_MAX_BIN_EXP + B_EXP_BIAS)) goto do_exception; /* ** OK, If we get here, the result is (most likely) an underflow or ** a denormal value. */ # if COMPATIBILITY_MODE j = ERROR_UNDERFLOW; # else func_error_word = ERROR_WORD( STATUS_UNDERFLOW, POS_ZERO_INDEX, POS_ZERO_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) ; # endif # if IEEE_FLOATING i += (F_PRECISION + 1); if ((WORD) i < 0) goto do_exception; /* ** The result is probably denormal so we have to (carefully) ** generate the result. Begin by setting the exponent field to ** the exponent field of 1, minus the number of bits of ** "denormalization" and convert to F_TYPE */ { F_TYPE u, v; stack_tmp_u.B_HI_WORD = m - ALIGN_WITH_B_TYPE_EXP(F_MIN_BIN_EXP); v = (F_TYPE) stack_tmp_u.f; /* ** Now add 1 to scaled result to force an alignment shift. The ** result of the addition will have correct fraction field for ** the denormalized result, but not the correct exponent */ u = (F_TYPE) ONE; v += u; stack_tmp_v.f = v; /* ** If the result of the sum is equal to one, then the final result ** will underflow. Otherwise remove the scale factor (i.e. get ** the correct exponent field) by subtracting the exponent of 1 ** from v. */ if (v == u) goto do_exception; m = stack_tmp_v.F_HI_WORD - ALIGN_W_EXP_FIELD(F_EXP_BIAS - F_NORM); stack_tmp_v.F_HI_WORD = m; v = stack_tmp_v.f; /* ** check to see if final result really was a denorm */ # if COMPATIBILITY_MODE if (((m & F_EXP_MASK) == 0) && !PROCESS_DENORMS) goto do_exception; # else if ( ( m & F_EXP_MASK ) == 0 ) { P_EXCPTN_VALUE_F( tmp_rec.value_under_test, v ) ; func_error_word = ERROR_WORD( STATUS_DENORM_PROCESSING | STATUS_UNDERFLOW, POS_ZERO_INDEX, POS_ZERO_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) ; goto do_exception ; } # endif return v ; } NaN_or_Inf: /* ** If we get here, i and m are the high words of x and |x| ** respectively. If m doesn't contain all of the bits of x, ** (i.e. BITS_PER_WORD < BITS_PER_F_TYPE) check for non-zero bits ** in the low word */ m <<= (BITS_PER_WORD - F_EXP_POS); IF_SMALL_WORD( m = m OR_LOW_BITS_SET(stack_tmp_v); ) if (m != 0) /* x was NaN, just return it */ return x; /* x was +/- infinity */ # if COMPATIBILITY_MODE j = ((WORD) i < 0) ? NO_ERR_NEG_INF : NO_ERR_POS_INF; # else func_error_word = ( ( WORD )i < 0 ) ? ERROR_WORD( EXP2_SELECT(STATUS_NO_ERROR,STATUS_UNDERFLOW), POS_ZERO_INDEX, POS_ZERO_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) : ERROR_WORD( EXP2_SELECT(STATUS_NO_ERROR,STATUS_OVERFLOW), POS_HUGE_INDEX, POS_INFINITY_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) ; # endif # endif goto do_exception; possible_problems: /* ** If we get here, x was either very small (e^x = 1), very big (e^x is ** overflow or underflow) or and IEEE special case (NaN or Inf) */ IF_IEEE( /* Screen out NaN's and Inf's */ if (m >= F_EXP_MASK) goto NaN_or_Inf; ) /* If argument is tiny (including denorms and zero) just return 1. */ if (m <= EXP_LO_CHECK) return (F_TYPE) ONE; /* ** If we get here, the final result is guaranteed to overflow or ** underflow, depending on the sign of x. Since i contains the high ** bits of x, just branch on the sign bit of x. */ # if COMPATIBILITY_MODE j = (i & F_SIGN_BIT_MASK) ? ERROR_UNDERFLOW : ERROR_OVERFLOW; /* Argument was positive */ # else func_error_word = ( i & F_SIGN_BIT_MASK ) ? ERROR_WORD( STATUS_UNDERFLOW, POS_ZERO_INDEX, POS_ZERO_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) : ERROR_WORD( STATUS_OVERFLOW, POS_HUGE_INDEX, POS_INFINITY_INDEX, F_TYPE_ENUM, DPML_ERANGE, 0 ) ; # endif goto do_exception; do_exception: # if COMPATIBILITY_MODE GET_EXCEPTION_RESULT_1(j, x, x); return x; # else RETURN_EXCEPTION_RESULT_1( func_error_word, x, F_F, _FpCodeExp ) ; # endif } #else /* defined(SPECIAL_EXP) */ # if USE_BACKUP # define LO_PART_DECL # else # define LO_PART pow2_low # define LO_PART_DECL , F_TYPE *LO_PART # endif B_TYPE F_EXP_SPECIAL_ENTRY_NAME (F_TYPE x, EXP_WORD_TYPE *pow_of_two LO_PART_DECL) { B_TYPE fm, z, w, t; B_UNION stack_tmp_u; WORD m, j; /* ** compute the reduced argument, w. */ z = (B_TYPE) x * RECIP_LN2; t = BIG + z; /* Save for getting m later on */ fm = t - BIG; w = BACKUP_SELECT( (z - fm), (x - fm*LN2_HI) - fm*LN2_LO ); /* ** Now get the bits of m as a integer and break it up into I and j. ** Return the value of I in *pow_of_two. */ stack_tmp_u.f = t; GET_LOW_32_BITS(m, stack_tmp_u); j = (m & POW2_INDEX_MASK) << POW2_INDEX_POS; # if USE_BACKUP z = POW2_HI(j) * ACC_POW2_POLY_R(w); # else z = POW2_HI(j); *LO_PART = z*ACC_EXP_POLY_F( POW2_LO_OV_POW2_HI(j), w ); # endif *pow_of_two = m & (~POW2_INDEX_MASK); return z; } #endif