/*
 * Correctly rounded trigonometric functions
 *
 * Author: Catherine Daramy, Florent de Dinechin, David Defour
 *
 * This file is part of the crlibm library developed by the Arenaire
 * project at Ecole Normale Superieure de Lyon
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */
#include <stdio.h>
#include <stdlib.h>
#include "crlibm.h"
#include "crlibm_private.h"
#include "trigo_fast.h"

extern double scs_sin_rn(double);
extern double scs_sin_ru(double);
extern double scs_sin_rd(double);
extern double scs_sin_rz(double);
extern double scs_cos_rn(double);
extern double scs_cos_ru(double);
extern double scs_cos_rd(double);
extern double scs_cos_rz(double);
extern double scs_tan_rn(double);
extern double scs_tan_rd(double);
extern double scs_tan_ru(double);
extern double scs_tan_rz(double);
extern int rem_pio2_scs(scs_ptr, scs_ptr);

/* How these functions work:

   The trig range reduction in crlibm computes an integer k and a
   reduced argument y such that

       x = k.Pi/256 + y

   with the reduced argument y directly in [-Pi/512, Pi/512]
   (Pi/512 < 4/512 = 2^-7).  y is computed as a double-double yh+yl.

   Then we read from a table

       sah+sal ~ sin(kPi/256)
       cah+cal ~ cos(kPi/256)

   and we use the reconstruction

       sin(kPi/256 + y) = sin(kPi/256)cos(y) + cos(kPi/256)sin(y)
       cos(kPi/256 + y) = cos(kPi/256)cos(y) - sin(kPi/256)sin(y)

   where cos(y) and sin(y) are computed as the unevaluated 1+tc and
   (yh+yl)(1+ts) respectively, tc and ts being doubles resulting from
   a small polynomial approximation.  This gives 14 extra bits of
   accuracy, so this first step is very accurate.

   Why not use accurate tables as defined by Gal?  In short, because
   Gal's fast approach only gives as many additional bits as you have
   bits to address the table (so we are limited to 7 if we limit the
   table size to 4KB), and we need more to get good average
   performance.  From a performance point of view we probably lose a
   few cycles: there are 4 values to read in our scheme compared to 3
   in Gal's method, and the reconstruction costs a few floating-point
   operations more (not that many, if you look at the details and want
   to ensure more than 7 extra bits).  Now for the advantages:
     1/ The whole thing is simpler.
     2/ We have much more accuracy in the table, which simplifies the
        proof.
     3/ We will be able to reuse the same table values to speed up the
        second step (just tabulating a third double such that the
        three-double approximation of sin/cos(kPi/256) will be summed
        exactly into an SCS number).

   Now a word on range reduction.  We have 4 possible range reductions:
     - Cody and Waite with 2 constants (the fastest),
     - Cody and Waite with 3 constants (almost as fast),
     - Cody and Waite with 3 constants in double-double, with k a
       long-long int,
     - Payne and Hanek, implemented in SCS (the slowest).
   Each of these range reductions except Payne and Hanek is valid for
   x smaller than some bound.

   This range reduction may cancel up to 62 bits according to a
   program by Kahan/Douglas available in Muller's book and implemented
   as the function WorstCaseForAdditiveRangeReduction in
   common-procedures.mpl.  However this is not a concern unless x is
   close to a multiple of Pi/2 (that is, (k&127)==0): in the general
   case the reconstruction will add a tabulated non-zero value, so the
   error to consider in the range reduction is the absolute error.
   Only in the cases when (k&127)==0 do we need 62 extra bits to
   compute with.  This is ensured by using a slower, more accurate
   range reduction.  The test for (k&127)==0 actually speeds up even
   these cases, because then there is no table to read and no
   reconstruction to do: a simple polynomial approximation to the
   function suffices.

   Why not use Payne and Hanek only, as in Markstein's book?  Because
   our scheme, in the absence of an FMA, is much faster for the small
   values which are the most used.  Markstein takes as reduced
   argument the fractional part of x*256/Pi (or maybe it is 512 in his
   case), so he has the same tables as we have, but different
   polynomials (which compute sin(2Pi*y) and cos(2Pi*y)).
*/

#define DEBUG 0
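/* Illustration only, not part of the original crlibm code: a minimal
   sketch of the table-plus-reconstruction scheme described above, in
   plain double precision.  The real code stores sin(kPi/256) and
   cos(kPi/256) as double-doubles (sah+sal, cah+cal) and evaluates
   cos(y) and sin(y) as 1+tc and (yh+yl)(1+ts); here libm stands in
   for the table and the polynomials, so this hypothetical function is
   only meant to make the identity concrete.  Enabled with DEBUG=1. */
#if DEBUG
#include <math.h>
static double sin_reconstruction_sketch(double x)
{
  static const double PIO256 = 3.14159265358979323846 / 256.0;
  int    k = (int)floor(x / PIO256 + 0.5);  /* x = k*Pi/256 + y       */
  double y = x - k * PIO256;                /* |y| <= Pi/512          */
  double sa = sin(k * PIO256);              /* tabulated as sah+sal   */
  double ca = cos(k * PIO256);              /* tabulated as cah+cal   */
  /* sin(a + y) = sin(a)cos(y) + cos(a)sin(y) */
  return sa * cos(y) + ca * sin(y);
}
#endif /* DEBUG */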
/* TODO:

   - In the Cody and Waite reductions, the kd=(double)k conversion
     should be replaced with a single subtraction of the magic
     constant used by DOUBLE2INT.

   - In some Cody and Waite reductions there are Mul12 involving k, CH
     and CM.  They can be improved by pre-splitting CH and CM
     (tabulated values) and k (as an int).  Then the precision can be
     improved by taking kmax into account.

   - The first coefficient of the cosine polynomial is exactly equal
     to 1/2, and this should be exploited in order to increase the
     accuracy of the approximation.

   - The second step should get the reduced argument from the first
     step (and use the same argument reduction).  This should lead to
     a 5x improvement of the worst case.

   - In the tangent there are three steps.  This could be studied for
     the other functions.
*/
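/* Illustration only, not part of the original crlibm code: the Cody
   and Waite idea used in ComputeTrigWithArgred below.  The constant
   Pi/256 is split into CH + CL, where CH has its last mantissa bits
   zeroed so that k*CH is exact for the k range considered; the
   leading cancellation in x - k*CH is then error-free and only k*CL
   contributes a rounding error.  The on-the-fly split and the names
   below are hypothetical; the library uses precomputed constants
   (RR_CW2_CH, RR_CW2_MCL, ...) with a carefully chosen number of
   zeroed bits.  Enabled with DEBUG=1. */
#if DEBUG
static double cody_waite_sketch(double x, int *k_out)
{
  const double PIO256_APPROX = 0.012271846303085129838; /* ~ Pi/256   */
  const double INV_PIO256_APPROX = 81.487330863050405;  /* ~ 256/Pi   */
  db_number split;
  double CH, CL, kd, y;

  split.d = PIO256_APPROX;
  split.i[LO] &= 0xff000000;    /* zero the last 24 mantissa bits     */
  CH = split.d;                 /* k*CH is exact while |k| < 2^24     */
  CL = PIO256_APPROX - CH;      /* the rest of the constant, exact    */

  kd = (double)(int)(x * INV_PIO256_APPROX + 0.5);  /* x assumed >= 0 */
  y  = (x - kd * CH) - kd * CL; /* the first subtraction is exact     */
  *k_out = (int)kd;
  return y;
}
#endif /* DEBUG */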
static int rem_pio256_scs(scs_ptr result, const scs_ptr x){
  uint64_t r[SCS_NB_WORDS+3], tmp;
  unsigned int N;
  /* The result words r[0],...,r[10] can hold up to 300 bits of
     precision, which is more than enough to compute the reduced
     argument. */
  int sign, i, j, ind;
  int *digits_256_over_pi_pt;

  if ((X_EXP != 1) || (X_IND < -2)){
    scs_set(result, x);
    return 0;
  }

  /* Compute the product |x| * 256/Pi */
  if (X_IND == -2){
    r[0] = 0;
    r[1] = 0;
    r[2] =  (uint64_t)(digits_256_over_pi[0]) * X_HW[0];
    r[3] = ((uint64_t)(digits_256_over_pi[0]) * X_HW[1]
          + (uint64_t)(digits_256_over_pi[1]) * X_HW[0]);
    if (X_HW[2] == 0){
      for(i=4; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-3]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i-2]) * X_HW[0]);
      }
    } else {
      for(i=4; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-4]) * X_HW[2]
              + (uint64_t)(digits_256_over_pi[i-3]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i-2]) * X_HW[0]);
      }
    }
  }
  else if (X_IND == -1){
    r[0] = 0;
    r[1] =  (uint64_t)(digits_256_over_pi[0]) * X_HW[0];
    r[2] = ((uint64_t)(digits_256_over_pi[0]) * X_HW[1]
          + (uint64_t)(digits_256_over_pi[1]) * X_HW[0]);
    if (X_HW[2] == 0){
      for(i=3; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-2]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i-1]) * X_HW[0]);
      }
    } else {
      for(i=3; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-3]) * X_HW[2]
              + (uint64_t)(digits_256_over_pi[i-2]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i-1]) * X_HW[0]);
      }
    }
  }
  else if (X_IND == 0){
    r[0] =  (uint64_t)(digits_256_over_pi[0]) * X_HW[0];
    r[1] = ((uint64_t)(digits_256_over_pi[0]) * X_HW[1]
          + (uint64_t)(digits_256_over_pi[1]) * X_HW[0]);
    if (X_HW[2] == 0){
      for(i=2; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-1]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[ i ]) * X_HW[0]);
      }
    } else {
      for(i=2; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-2]) * X_HW[2]
              + (uint64_t)(digits_256_over_pi[i-1]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[ i ]) * X_HW[0]);
      }
    }
  }
  else if (X_IND == 1){
    r[0] = ((uint64_t)(digits_256_over_pi[0]) * X_HW[1]
          + (uint64_t)(digits_256_over_pi[1]) * X_HW[0]);
    if (X_HW[2] == 0){
      for(i=1; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[ i ]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i+1]) * X_HW[0]);
      }
    } else {
      for(i=1; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi[i-1]) * X_HW[2]
              + (uint64_t)(digits_256_over_pi[ i ]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi[i+1]) * X_HW[0]);
      }
    }
  }
  else {
    ind = (X_IND - 2);
    digits_256_over_pi_pt = (int*)&(digits_256_over_pi[ind]);
    if (X_HW[2] == 0){
      for(i=0; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi_pt[i+1]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi_pt[i+2]) * X_HW[0]);
      }
    } else {
      for(i=0; i<(SCS_NB_WORDS+3); i++){
        r[i] = ((uint64_t)(digits_256_over_pi_pt[ i ]) * X_HW[2]
              + (uint64_t)(digits_256_over_pi_pt[i+1]) * X_HW[1]
              + (uint64_t)(digits_256_over_pi_pt[i+2]) * X_HW[0]);
      }
    }
  }

  /* Carry propagation */
  r[SCS_NB_WORDS+1] += r[SCS_NB_WORDS+2] >> 30;
  for(i=(SCS_NB_WORDS+1); i>0; i--){
    tmp     = r[i] >> 30;
    r[i-1] += tmp;
    r[i]   -= (tmp << 30);
  }

  /* The integer part is in r[0] */
  N = r[0];
  if (r[1] > (SCS_RADIX)/2){  /* test if the reduced part is bigger than Pi/4 */
    N += 1;
    sign = -1;
    for(i=1; i<(SCS_NB_WORDS+3); i++){
      r[i] = ((~(unsigned int)(r[i])) & 0x3fffffff);
    }
  }
  else
    sign = 1;

  /* Now we get the reduced argument and check for possible
     cancellation.  By the Kahan algorithm we will have at most 2
     digits of cancellation, r[1] and r[2] in the worst case. */
  if (r[1] == 0)
    if (r[2] == 0) i = 3;
    else           i = 2;
  else             i = 1;

  for(j=0; j<SCS_NB_WORDS; j++){ R_HW[j] = r[i+j]; }

  R_EXP = 1;
  R_IND = -i;
  R_SGN = sign * X_SGN;

  return X_SGN * N;
}
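/* Illustration only, not part of the original crlibm code: the carry
   propagation pass of rem_pio256_scs above, isolated.  The partial
   products accumulate in 64-bit words that each hold one 30-bit SCS
   digit plus overflow; a single right-to-left pass moves each word's
   excess bits into its left neighbour.  The digit width of 30 matches
   SCS_NB_BITS.  Enabled with DEBUG=1. */
#if DEBUG
static void carry_propagate_sketch(uint64_t *r, int n)
{
  uint64_t tmp;
  int i;
  for (i = n-1; i > 0; i--){
    tmp     = r[i] >> 30;   /* bits that overflow the 30-bit digit */
    r[i-1] += tmp;          /* carry them into the next word up    */
    r[i]   -= (tmp << 30);  /* keep only the low 30 bits here      */
  }
}
#endif /* DEBUG */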
#define RangeReductionSCS()                                            \
do {                                                                   \
  db_number nb;                                                        \
  scs_t X, Y;                                                          \
  double x0, x1, x2, x3;                                               \
  scs_set_d(X, rri->x);                                                \
  k = rem_pio256_scs(Y, X);                                            \
  index    = (k&127)<<2;                                               \
  quadrant = (k>>7)&3;                                                 \
  x0 = (double)(Y->h_word[0]);                                         \
  x1 = ((double)(Y->h_word[1])) * SHIFT1;                              \
  x2 = ((double)(Y->h_word[2])) * SHIFT2;                              \
  x3 = ((double)(Y->h_word[3])) * SHIFT3;                              \
  nb.i[HI] = ((Y->index)*SCS_NB_BITS + 1023) << 20;                    \
  nb.i[LO] = 0;                                                        \
  nb.d *= Y->sign;                                                     \
  yh = (x2+x1)+x0;                                                     \
  yl = (((x0-yh)+x1)+x2) + x3;                                         \
  yh *= nb.d;  /* exact multiplication */                              \
  yl *= nb.d;  /* exact multiplication */                              \
} while(0)

/* A structure that holds all the information to be exchanged between
   ComputeTrigWithArgred and the 12 functions sin_rn etc.  It is there
   purely for performance (it saves almost 100 cycles out of 300 on a
   P4 when compared to passing a list of arguments).  In addition to
   saving a few memory accesses, it also allows other small
   optimizations, like deferring the possible change of sign of the
   result to the last moment using rri->changesign.  All this is not
   very elegant, but it is safe. */
struct rrinfo_s {double rh; double rl; double x; int absxhi; int function;};
typedef struct rrinfo_s rrinfo;
#define changesign function  /* saves one int in the rrinfo structure */

static void ComputeTrigWithArgred(rrinfo *rri){
  double sah, sal, cah, cal, yh, yl, yh2, ts, tc, kd;
  double kch_h, kch_l, kcm_h, kcm_l, th, tl, sh, sl, ch, cl;
  int k, quadrant, index;
  int64_t kl;

  if (rri->absxhi < XMAX_CODY_WAITE_3) {
    /* Compute k, deduce the table index and the quadrant */
#if 0
    DOUBLE2INT(k, rri->x * INV_PIO256);
    kd = (double) k;
#else
    {
      db_number _t;
      double _d = rri->x * INV_PIO256;
      _t.d = (_d + 6755399441055744.0);
      k  = _t.i[LO];
      kd = _t.d - 6755399441055744.0;
    }
#endif
    quadrant = (k>>7)&3;
    index    = (k&127)<<2;
    if (index == 0) {
      /* Here a large cancellation on yh+yl would be a problem, so use
         the double-double range reduction.  All this is exact. */
      Mul12(&kch_h, &kch_l, kd, RR_DD_MCH);
      Mul12(&kcm_h, &kcm_l, kd, RR_DD_MCM);
      Add12(th, tl, kch_l, kcm_h);
      /* Only rounding error in the last multiplication and addition */
      Add22(&yh, &yl, (rri->x + kch_h), (kcm_l - kd*RR_DD_CL), th, tl);
      goto computeZero;
    }
    else {
      /* index != 0, don't worry about cancellations on yh+yl */
      if (rri->absxhi < XMAX_CODY_WAITE_2) {
        /* CW 2: all this is exact but the rightmost multiplication */
        Add12(yh, yl, (rri->x - kd*RR_CW2_CH), (kd*RR_CW2_MCL));
      }
      else {
        /* CW 3: all this is exact but the rightmost multiplication */
        Add12Cond(yh, yl, (rri->x - kd*RR_CW3_CH) - kd*RR_CW3_CM,
                  kd*RR_CW3_MCL);
      }
    }
    goto computeNotZero;
  }

  else if (rri->absxhi < XMAX_DDRR) {
    /* x sufficiently small for a Cody and Waite in double-double */
    DOUBLE2LONGINT(kl, rri->x * INV_PIO256);
    kd = (double)kl;
    quadrant = (kl>>7)&3;
    index    = (kl&127)<<2;
    if (index == 0) {
      /* Here again a large cancellation on yh+yl would be a problem,
         so we do the accurate range reduction. */
      RangeReductionSCS();  /* recomputes k, index, quadrant, yh and yl */
      /* Now it may happen that the new k differs by 1 from kl, so
         check that. */
      if (index == 0)  /* no surprise */
        goto computeZero;
      else
        goto computeNotZero;
    }
    else {
      /* index != 0: double-double argument reduction.
         All this is exact. */
      Mul12(&kch_h, &kch_l, kd, RR_DD_MCH);
      Mul12(&kcm_h, &kcm_l, kd, RR_DD_MCM);
      Add12(th, tl, kch_l, kcm_h);
      /* Only rounding error in the last multiplication and addition */
      Add22(&yh, &yl, (rri->x + kch_h), (kcm_l - kd*RR_DD_CL), th, tl);
      /* printf("%f\n", yh); */
      goto computeNotZero;
    }
  } /* closes if (absxhi < XMAX_DDRR) */

  else {
    /* Worst case: x very large, sin(x) probably meaningless, but we
       return correct rounding and don't mind taking time for it. */
    RangeReductionSCS();
    quadrant = (k>>7)&3;
    if (index == 0)
      goto computeZero;
    else
      goto computeNotZero;
  }

 computeZero:
  switch(rri->function) {
  case SIN:
    if (quadrant&1)
      DoCosZero(&rri->rh, &rri->rl);
    else
      DoSinZero(&rri->rh, &rri->rl);
    rri->changesign = (quadrant==2) || (quadrant==3);
    return;
  case COS:
    if (quadrant&1)
      DoSinZero(&rri->rh, &rri->rl);
    else
      DoCosZero(&rri->rh, &rri->rl);
    rri->changesign = (quadrant==1) || (quadrant==2);
    return;
  case TAN:
    rri->changesign = quadrant&1;
    if (quadrant&1) {
      DoSinZero(&ch, &cl);
      DoCosZero(&sh, &sl);
    } else {
      DoSinZero(&sh, &sl);
      DoCosZero(&ch, &cl);
    }
    Div22(&rri->rh, &rri->rl, sh, sl, ch, cl);
    return;
  }

 computeNotZero:
  if (index <= (64<<2)) {
    sah = sincosTable[index+0].d;  /* sin(a), high part */
    sal = sincosTable[index+1].d;  /* sin(a), low part  */
    cah = sincosTable[index+2].d;  /* cos(a), high part */
    cal = sincosTable[index+3].d;  /* cos(a), low part  */
  } else {  /* cah <= sah */
    index = (128<<2) - index;
    cah = sincosTable[index+0].d;  /* cos(a), high part */
    cal = sincosTable[index+1].d;  /* cos(a), low part  */
    sah = sincosTable[index+2].d;  /* sin(a), high part */
    sal = sincosTable[index+3].d;  /* sin(a), low part  */
  }
  yh2 = yh*yh;
  ts = yh2 * (s3.d + yh2*(s5.d + yh2*s7.d));
  tc = yh2 * (c2.d + yh2*(c4.d + yh2*c6.d));
  switch(rri->function) {
  case SIN:
    if (quadrant&1)
      DoCosNotZero(&rri->rh, &rri->rl);
    else
      DoSinNotZero(&rri->rh, &rri->rl);
    rri->changesign = (quadrant==2) || (quadrant==3);
    return;
  case COS:
    if (quadrant&1)
      DoSinNotZero(&rri->rh, &rri->rl);
    else
      DoCosNotZero(&rri->rh, &rri->rl);
    rri->changesign = (quadrant==1) || (quadrant==2);
    return;
  case TAN:
    rri->changesign = quadrant&1;
    if (quadrant&1) {
      DoSinNotZero(&ch, &cl);
      DoCosNotZero(&sh, &sl);
    } else {
      DoSinNotZero(&sh, &sl);
      DoCosNotZero(&ch, &cl);
    }
    Div22(&rri->rh, &rri->rl, sh, sl, ch, cl);
    return;
  }
}
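/* Illustration only, not part of the original crlibm code: the
   magic-number double-to-int conversion used above (and behind the
   DOUBLE2INT macro).  Adding 2^52+2^51 = 6755399441055744.0 to a
   double of modest magnitude shifts its integer part into the low
   bits of the significand, so the low 32-bit word can be read off as
   the rounded integer, and subtracting the constant back gives the
   same integer as a double.  This assumes round-to-nearest mode and
   |d| well below 2^31.  Enabled with DEBUG=1. */
#if DEBUG
static int double2int_sketch(double d, double *kd_out)
{
  db_number t;
  t.d = d + 6755399441055744.0;        /* 1.5 * 2^52, the magic constant */
  *kd_out = t.d - 6755399441055744.0;  /* nearest integer, as a double   */
  return t.i[LO];                      /* the same integer, as an int    */
}
#endif /* DEBUG */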
/*************************************************************
 *************************************************************
 *               SIN ROUNDED TO NEAREST                      *
 *************************************************************
 *************************************************************/
double sin_rn(double x){
  double ts, x2, rncst;
  rrinfo rri;
  db_number x_split;
  double r;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) sin(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }
  else if (rri.absxhi < XMAX_SIN_CASE2){
    /* CASE 1 : x small enough, sin(x)=x */
    if (rri.absxhi < XMAX_RETURN_X_FOR_SIN)
      return x;
    else {
      /* CASE 2 : x < Pi/512, fast polynomial evaluation */
      x2 = x*x;
      ts = x2 * (s3.d + x2*(s5.d + x2*s7.d));
      Add12(rri.rh, rri.rl, x, x*ts);
      /* Test if round to nearest achieved */
      rncst = RN_CST_SIN_CASE2;
      if (rri.rh == (rri.rh + (rri.rl * rncst)))
        return rri.rh;
      else
        return scs_sin_rn(x);
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = SIN;
    ComputeTrigWithArgred(&rri);
    r = rri.changesign ? -rri.rh : rri.rh;
    /* Test if round to nearest achieved */
    if (rri.rh == (rri.rh + (rri.rl * RN_CST_SINCOS_CASE3)))
      return r;
    else
      return scs_sin_rn(x);
  }
}

/*************************************************************
 *************************************************************
 *               SIN ROUNDED TOWARD +INFINITY                *
 *************************************************************
 *************************************************************/
double sin_ru(double x){
  double xx, ts, epsilon;
  db_number x_split;
  rrinfo rri;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) sin(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_SIN_CASE2){
    /* CASE 1 : x small enough, return x suitably rounded:
       |sin(x)| < |x|, so round up to x itself for x>=0 and to the
       predecessor of x for x<0 */
    if (rri.absxhi < XMAX_RETURN_X_FOR_SIN) {
      if (x >= 0.)
        return x;
      else {
        x_split.l--;
        return x_split.d;
      }
    }
    else {
      /* CASE 2 : x < Pi/512, fast polynomial evaluation */
      xx = x*x;
      ts = x * xx * (s3.d + xx*(s5.d + xx*s7.d));
      Add12(rri.rh, rri.rl, x, ts);
      epsilon = EPS_SIN_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = SIN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RU(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_sin_ru(x);
}
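/* Illustration only, not part of the original crlibm code: the
   round-to-nearest test used by sin_rn above and by the other *_rn
   functions (Ziv's rounding test).  rh+rl approximates f(x) with a
   known relative error bound; rncst is a constant slightly larger
   than 1 derived from that bound.  If adding rl*rncst to rh does not
   change rh, then rh+rl and the exact f(x) round to the same double,
   so rh can be returned; otherwise x is a hard-to-round case and the
   slow accurate phase must decide.  Enabled with DEBUG=1. */
#if DEBUG
static double ziv_test_sketch(double rh, double rl, double rncst,
                              double (*accurate_phase)(double), double x)
{
  if (rh == (rh + (rl * rncst)))
    return rh;                 /* the fast result is correctly rounded */
  else
    return accurate_phase(x);  /* e.g. scs_sin_rn for sin_rn           */
}
#endif /* DEBUG */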
/*************************************************************
 *************************************************************
 *               SIN ROUNDED TOWARD -INFINITY                *
 *************************************************************
 *************************************************************/
double sin_rd(double x){
  double xx, ts, epsilon;
  db_number x_split;
  rrinfo rri;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) sin(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_SIN_CASE2){
    /* CASE 1 : x small enough, return x suitably rounded:
       |sin(x)| < |x|, so round down to x itself for x<=0 and to the
       predecessor of x for x>0 */
    if (rri.absxhi < XMAX_RETURN_X_FOR_SIN) {
      if (x <= 0.)
        return x;
      else {
        x_split.l--;
        return x_split.d;
      }
    }
    else {
      /* CASE 2 : x < Pi/512, fast polynomial evaluation */
      xx = x*x;
      ts = x * xx * (s3.d + xx*(s5.d + xx*s7.d));
      Add12(rri.rh, rri.rl, x, ts);
      epsilon = EPS_SIN_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = SIN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RD(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_sin_rd(x);
}

/*************************************************************
 *************************************************************
 *               SIN ROUNDED TOWARD ZERO                     *
 *************************************************************
 *************************************************************/
double sin_rz(double x){
  double xx, ts, epsilon;
  db_number x_split;
  rrinfo rri;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) sin(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_SIN_CASE2){
    /* CASE 1 : x small enough, return x suitably rounded:
       |sin(x)| < |x|, so rounding toward zero steps one ulp toward
       zero for any nonzero x */
    if (rri.absxhi < XMAX_RETURN_X_FOR_SIN) {
      if (x == 0.)
        return x;
      else {
        x_split.l--;
        return x_split.d;
      }
    }
    else {
      /* CASE 2 : x < Pi/512, fast polynomial evaluation */
      xx = x*x;
      ts = x * xx * (s3.d + xx*(s5.d + xx*s7.d));
      Add12(rri.rh, rri.rl, x, ts);
      epsilon = EPS_SIN_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = SIN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RZ(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_sin_rz(x);
}

/*************************************************************
 *************************************************************
 *               COS ROUNDED TO NEAREST                      *
 *************************************************************
 *************************************************************/
double cos_rn(double x){
  double x2, tc;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) cos(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    /* was: return x-x; but it's optimized out by the Intel compiler
       (bug reported).  Who cares to be slow in this case anyway... */
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_COS_CASE2){
    /* CASE 1 : x small enough, cos(x)=1. */
    if (rri.absxhi < XMAX_RETURN_1_FOR_COS_RN)
      return 1.;
    else {
      /* CASE 2 : Fast polynomial evaluation */
      x2 = x*x;
      tc = x2 * (c2.d + x2*(c4.d + x2*c6.d));
      Add12(rri.rh, rri.rl, 1., tc);
      /* Test if round to nearest achieved */
      if (rri.rh == (rri.rh + (rri.rl * RN_CST_COS_CASE2)))
        return rri.rh;
      else
        return scs_cos_rn(x);
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = COS;
    ComputeTrigWithArgred(&rri);
    /* Test if round to nearest achieved */
    if (rri.rh == (rri.rh + (rri.rl * RN_CST_SINCOS_CASE3))) {
      if (rri.changesign)
        return -rri.rh;
      else
        return rri.rh;
    }
    else
      return scs_cos_rn(x);
  }
}

/*************************************************************
 *************************************************************
 *               COS ROUNDED TOWARD +INFINITY                *
 *************************************************************
 *************************************************************/
double cos_ru(double x){
  double x2, tc, epsilon;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) cos(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_COS_CASE2){
    /* CASE 1 : x small enough, cos(x) rounds up to 1. */
    if (rri.absxhi < XMAX_RETURN_1_FOR_COS_RDIR)
      return 1.;
    else {
      /* CASE 2 : Fast polynomial evaluation */
      x2 = x*x;
      tc = x2 * (c2.d + x2*(c4.d + x2*c6.d));
      Add12(rri.rh, rri.rl, 1., tc);
      epsilon = EPS_COS_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = COS;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RU(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_cos_ru(x);
}

/*************************************************************
 *************************************************************
 *               COS ROUNDED TOWARD -INFINITY                *
 *************************************************************
 *************************************************************/
double cos_rd(double x){
  double x2, tc, epsilon;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) cos(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_COS_CASE2){
    if (x == 0) return 1.;
    /* CASE 1 : x small enough, cos(x) rounds down to the largest
       double below 1 */
    if (rri.absxhi < XMAX_RETURN_1_FOR_COS_RDIR){
      x_split.d = 1.;
      x_split.l--;
      return x_split.d;
    }
    else {
      /* CASE 2 : Fast polynomial evaluation */
      x2 = x*x;
      tc = x2 * (c2.d + x2*(c4.d + x2*c6.d));
      Add12(rri.rh, rri.rl, 1., tc);
      epsilon = EPS_COS_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = COS;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RD(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_cos_rd(x);
}

/*************************************************************
 *************************************************************
 *               COS ROUNDED TOWARD ZERO                     *
 *************************************************************
 *************************************************************/
double cos_rz(double x){
  double x2, tc, epsilon;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) cos(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_COS_CASE2){
    if (x == 0) return 1.;
    /* CASE 1 : x small enough; 0 < cos(x) < 1, so rounding toward
       zero gives the largest double below 1 */
    if (rri.absxhi < XMAX_RETURN_1_FOR_COS_RDIR){
      x_split.d = 1.;
      x_split.l--;
      return x_split.d;
    }
    else {
      /* CASE 2 : Fast polynomial evaluation */
      x2 = x*x;
      tc = x2 * (c2.d + x2*(c4.d + x2*c6.d));
      Add12(rri.rh, rri.rl, 1., tc);
      epsilon = EPS_COS_CASE2;
    }
  }
  else {
    /* CASE 3 : Need argument reduction */
    rri.x = x;
    rri.function = COS;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_SINCOS_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RZ(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_cos_rz(x);
}
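/* Illustration only, not part of the original crlibm code: how the
   directed-rounding functions above step to an adjacent double.
   Viewed through its 64-bit integer image, incrementing (resp.
   decrementing) the integer moves a finite nonzero double one ulp
   away from (resp. toward) zero.  This is how sin_ru returns the
   predecessor of a negative x, and how cos_rd returns the largest
   double below 1.  Enabled with DEBUG=1. */
#if DEBUG
static double toward_zero_sketch(double d)  /* assumes d finite, nonzero */
{
  db_number t;
  t.d = d;
  t.l--;       /* previous double in the direction of zero, either sign */
  return t.d;
}
#endif /* DEBUG */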
/*************************************************************
 *************************************************************
 *               TAN ROUNDED TO NEAREST                      *
 *************************************************************
 *************************************************************/
double tan_rn(double x){
  double x2, p5, tt;
  db_number x_split, rndcst;
  rrinfo rri;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) tan(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_TAN_CASE2){
    if (rri.absxhi < XMAX_RETURN_X_FOR_TAN)
      return x;
    /* Dynamic computation of the rounding constant */
    rndcst.i[HI] = 0x3ff00000 + (((rri.absxhi & 0x000fffff)+0x00100000)
                                 >> (0x3ff+2 - (rri.absxhi>>20)));
    rndcst.i[LO] = 0xffffffff;
    /* Fast Taylor series */
    x2 = x*x;
    p5 = t5.d + x2*(t7.d + x2*(t9.d + x2*t11.d));
    tt = x2*(t3h.d + (t3l.d + x2*p5));
    Add12(rri.rh, rri.rl, x, x*tt);
    /* Test if round to nearest achieved */
    if (rri.rh == (rri.rh + (rri.rl * rndcst.d)))
      return rri.rh;
    else
      return scs_tan_rn(x);
  }
  else {
    /* Otherwise: range reduction, then standard evaluation */
    rri.x = x;
    rri.function = TAN;
    ComputeTrigWithArgred(&rri);
    /* Test if round to nearest achieved */
    if (rri.rh == (rri.rh + (rri.rl * RN_CST_TAN_CASE3))) {
      if (rri.changesign)
        return -rri.rh;
      else
        return rri.rh;
    }
    else
      return scs_tan_rn(x);
  }
}

/*************************************************************
 *************************************************************
 *               TAN ROUNDED TOWARD +INFINITY                *
 *************************************************************
 *************************************************************/
double tan_ru(double x){
  double epsilon, p5, tt, x2;
  db_number x_split;
  rrinfo rri;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) tan(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_TAN_CASE2){
    if (rri.absxhi < XMAX_RETURN_X_FOR_TAN) {
      /* |tan(x)| > |x|: round up to x itself for x<=0, to the
         successor of x for x>0 */
      if (x <= 0.)
        return x;
      else {
        x_split.l++;
        return x_split.d;
      }
    }
    else {
      /* Fast Taylor series */
      x2 = x*x;
      p5 = t5.d + x2*(t7.d + x2*(t9.d + x2*t11.d));
      tt = x2*(t3h.d + (t3l.d + x2*p5));
      Add12(rri.rh, rri.rl, x, x*tt);
      /* TODO: dynamic computation of the error constant */
      TEST_AND_RETURN_RU(rri.rh, rri.rl, EPS_TAN_CASE2);
      /* If the previous block didn't return a value, launch the accurate phase */
      return scs_tan_ru(x);
    }
  }
  else {
    /* Normal case: range reduction, then standard evaluation */
    rri.x = x;
    rri.function = TAN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_TAN_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RU(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_tan_ru(x);
}

/*************************************************************
 *************************************************************
 *               TAN ROUNDED TOWARD -INFINITY                *
 *************************************************************
 *************************************************************/
double tan_rd(double x){
  double epsilon, p5, tt, x2;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) tan(x)=Nan */
  if (rri.absxhi >= 0x7ff00000){
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_TAN_CASE2){
    if (rri.absxhi < XMAX_RETURN_X_FOR_TAN) {
      /* |tan(x)| > |x|: round down to x itself for x>=0, to the
         predecessor of x for x<0 */
      if (x >= 0.)
        return x;
      else {
        x_split.l++;
        return x_split.d;
      }
    }
    /* Fast Taylor series */
    x2 = x*x;
    p5 = t5.d + x2*(t7.d + x2*(t9.d + x2*t11.d));
    tt = x2*(t3h.d + (t3l.d + x2*p5));
    Add12(rri.rh, rri.rl, x, x*tt);
    TEST_AND_RETURN_RD(rri.rh, rri.rl, EPS_TAN_CASE2);
    /* If the previous block didn't return a value, launch the accurate phase */
    return scs_tan_rd(x);
  }
  else {
    /* Normal case: range reduction, then standard evaluation */
    rri.x = x;
    rri.function = TAN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_TAN_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RD(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_tan_rd(x);
}
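/* Illustration only, not part of the original crlibm code: the
   Fast2Sum building block behind the Add12 macro used by the Taylor
   evaluations above (Add12 comes from crlibm_private.h).  Given
   |a| >= |b|, it returns s = a+b rounded and the exact error
   t = (a+b) - s, so the pair (s,t) represents the sum exactly; this
   is how rh+rl double-double results are formed.  Enabled with
   DEBUG=1. */
#if DEBUG
static void fast2sum_sketch(double *s, double *t, double a, double b)
{
  double z;
  *s = a + b;   /* the rounded sum                        */
  z  = *s - a;  /* the part of b that made it into s      */
  *t = b - z;   /* the part of b that was rounded away    */
}
#endif /* DEBUG */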
/*************************************************************
 *************************************************************
 *               TAN ROUNDED TOWARD ZERO                     *
 *************************************************************
 *************************************************************/
double tan_rz(double x){
  double epsilon, p5, tt, x2;
  rrinfo rri;
  db_number x_split;

  x_split.d = x;
  rri.absxhi = x_split.i[HI] & 0x7fffffff;

  /* SPECIAL CASES: x=(Nan, Inf) tan(x)=Nan */
  if (rri.absxhi >= 0x7ff00000) {
    x_split.l = 0xfff8000000000000LL;
    return x_split.d - x_split.d;
  }

  if (rri.absxhi < XMAX_TAN_CASE2){
    if (rri.absxhi < XMAX_RETURN_X_FOR_TAN) {
      /* |tan(x)| > |x|: rounding toward zero gives x itself */
      return x;
    }
    else {
      /* Fast Taylor series */
      x2 = x*x;
      p5 = t5.d + x2*(t7.d + x2*(t9.d + x2*t11.d));
      tt = x2*(t3h.d + (t3l.d + x2*p5));
      Add12(rri.rh, rri.rl, x, x*tt);
      TEST_AND_RETURN_RZ(rri.rh, rri.rl, EPS_TAN_CASE2);
      /* If the TEST_AND_RETURN block didn't return a value, launch the
         accurate phase */
      return scs_tan_rz(x);
    }
  }
  else {
    /* Normal case: range reduction, then standard evaluation */
    rri.x = x;
    rri.function = TAN;
    ComputeTrigWithArgred(&rri);
    epsilon = EPS_TAN_CASE3;
    if (rri.changesign) { rri.rh = -rri.rh; rri.rl = -rri.rl; }
  }

  TEST_AND_RETURN_RZ(rri.rh, rri.rl, epsilon);

  /* If the previous block didn't return a value, launch the accurate phase */
  return scs_tan_rz(x);
}
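/* Illustration only, not part of the original crlibm code: a tiny
   smoke test for the four rounding modes, enabled by setting DEBUG
   to 1 above.  It only checks the ordering
   tan_rd(x) <= tan_rz(x) <= tan_rn(x) <= tan_ru(x), which holds for
   positive x by definition of the rounding modes. */
#if DEBUG
int main(void)
{
  double x  = 0.7853981633974483;  /* about Pi/4 */
  double rd = tan_rd(x), rz = tan_rz(x), rn = tan_rn(x), ru = tan_ru(x);
  printf("tan_rd=%.17g tan_rz=%.17g tan_rn=%.17g tan_ru=%.17g\n",
         rd, rz, rn, ru);
  return !(rd <= rz && rz <= rn && rn <= ru);
}
#endif /* DEBUG */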