/* *this function computes log, correctly rounded, using double-extended arithmetic THIS IS EXPERIMENTAL SOFTWARE In particular it changes rounding modes all the time without warning nor restoring. * * Author : Florent de Dinechin * Florent.de.Dinechin at ens-lyon.fr * This function compiles both on IA32 and IA64 architectures. On IA64, it needs icc 8.1 or higher, with the following flags (which should be set up by the autoconf). icc -DHAVE_CONFIG_H -Qoption,cpp,--extended_float_types \ -IPF_fp_speculationsafe -c log-de.c;\ mv log-de.o log-td.o; make */ #include #include #include "crlibm.h" #include "crlibm_private.h" #include "double-extended.h" #include "log-de.h" static void log_accurate(double_ext* prh, double_ext* prl, double_ext z, int E, int index) { double_ext eh,el, t13, t12, t11, t10, t9, t8, p7h,p7l, t7h,t7l, t6h,t6l, t5h,t5l, t4h,t4l, t3h,t3l, t2h,t2l, t1h,t1l, t0h,t0l; /* Many temporary because single assignment form is nicer for Gappa */ #if !(defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64)) double_ext c1h,c2h,c3h,c4h,c5h,c6h,c7h,c8h,c9h,c10h,c11h,c12h,c13h,c14h,c15h; double_ext c1l,c2l,c3l,c4l,c5l,c6l,c7l,c8l; #endif #if EVAL_PERF crlibm_second_step_taken++; #endif /* TODO check the conditions for the double-double ops */ PREFETCH_POLY_ACCURATE; t13 = c13h + z*c14h; t12 = c12h + z*t13; t11 = c11h + z*t12; t10 = c10h + z*t11; t9 = c9h + z*t10; t8 = c8h + z*t9; #if 1 /* This is faster on PIII. Your mileage may vary */ Mul12_ext(&p7h, &p7l, z, t8); Add22_ext(&t7h, &t7l, p7h,p7l, c7h,c7l); #else FMA22_ext(&t7h, &t7l, z,0, t8,0, c7h,c7l); #endif FMA22_ext(&t6h, &t6l, z,0, t7h,t7l, c6h,c6l); FMA22_ext(&t5h, &t5l, z,0, t6h,t6l, c5h,c5l); FMA22_ext(&t4h, &t4l, z,0, t5h,t5l, c4h,c4l); FMA22_ext(&t3h, &t3l, z,0, t4h,t4l, c3h,c3l); FMA22_ext(&t2h, &t2l, z,0, t3h,t3l, c2h,c2l); FMA22_ext(&t1h, &t1l, z,0, t2h,t2l, c1h,c1l); FMA22_ext(&t0h, &t0l, z,0, t1h,t1l, argredtable[index].logirh, argredtable[index].logirl); Mul22_ext(&eh, &el, log2H,log2L, E, 0); Add22_ext(prh, prl, eh,el, t0h,t0l); } double log_rn(double x) { double_ext logirh, r, y, z, th, tl, logde; #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) db_number xdb; int E, index, index0, roundtestmask; #else /* assuming Itanium here */ int64_t E, i; uint64_t index, roundtestmask; double c2,c3,c4,c5,c6,c7; #endif #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) xdb.d=x; index0 = (xdb.i[HI] & 0x000fffff); index = (index0 + (1<<(20-L-1))) >> (20-L); E = (xdb.i[HI]>>20)-1023; /* extract the exponent */ /* Filter cases */ if (xdb.i[HI] < 0x00100000){ /* x < 2^(-1022) */ if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (xdb.i[HI] < 0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ xdb.d *= two64; /* make x a normal number */ E = -64 + (xdb.i[HI]>>20)-1023; /* extract the exponent */ index0 = (xdb.i[HI] & 0x000fffff); index = (index0 + (1<<(20-L-1))) >> (20-L); } if (xdb.i[HI] >= 0x7ff00000) return x+x; /* Inf or Nan */ DOUBLE_EXTENDED_MODE; /* This one should be overlapped with following integer computation */ /* Extract exponent and mantissa */ xdb.i[HI] = index0 | 0x3ff00000; /* do exponent = 0 */ /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ xdb.i[HI] -= 0x00100000; index = index & INDEXMASK; E++; } y = xdb.d; #else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* Here come the code specific to Itanium processor */ E=0; PREFETCH_POLY_QUICK; /* defined in log-de.h */ y=x; i = _Asm_getf(2/*_FR_D*/, y); /* Cast y to a 64-bit integer */ /* Filter special cases */ if (i<(int64_t)ULL(0010000000000000)){ /* equivalent to : x < 2^(-1022) */ if ((i & ULL(7fffffffffffffff))==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (i<0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ y *= two64; /* make x a normal number */ E = -64; i = _Asm_getf(2/*_FR_D*/, y); /* and update i */ } if (i >= ULL(7ff0000000000000)) return x+x; /* Inf or Nan */ /* Extract exponent and mantissa */ E += (i>>52)-1023; i = i & ULL(000fffffffffffff); /* keep only mantissa */ index = (i + (ULL(1)<<(52-L-1))) >> (52-L); /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */ index = index & INDEXMASK; E++; } else y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/ #endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* All the previous argument reduction was exact */ /* now y holds 1+f, and E is the exponent */ r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */ logirh = argredtable[index].logirh; z = y*r - 1. ; /* even without an FMA, all exact */ #if 0 if(E==0) roundtestmask=ACCURATE_TO_61_BITS; else roundtestmask=ACCURATE_TO_61_BITS; #else roundtestmask=ACCURATE_TO_62_BITS; #endif #ifdef ESTRIN /* Estrin polynomial evaluation */ double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07; z2 = z*z; p67 = c6 + z*c7; p45 = c4 + z*c5; p23 = c2 + z*c3; p01 = logirh + z; z4 = z2*z2; p47 = p45 + z2*p67; p03 = p01 + z2*p23; p07 = p03 + z4*p47; logde = p07 + E*log2H; #endif #ifdef PATERSON double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8; z2 = z * z; t1 = z + ps_alpha; t2 = z + ps_beta; t3 = c3 * z + c2; t4 = z + logirh; z4 = z2 * z2; t5 = z2 + ps_c; t6 = t3 * z2 + t4; t7 = t5 * t1 + t2; t0 = z4 * c7; t8 = t7 * t0 + t6; logde = t8 + E*log2H; #endif #if 0 /* to time the first step only */ BACK_TO_DOUBLE_MODE; return (double)t; #endif /* To test the second step only, comment out the following line */ DE_TEST_AND_RETURN_RN(logde, roundtestmask); log_accurate(&th, &tl, z, E, index); BACK_TO_DOUBLE_MODE; return (double) (th+tl); /* The exact sum of these double-extended is rounded to the nearest */ } double log_rd(double x) { double_ext logirh, r, y, z, th, tl, logde; #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) db_number xdb; int E, index, roundtestmask; #else int64_t E, i; uint64_t index, roundtestmask; double_ext c1,c2,c3,c4,c5,c6,c7; #endif E=0; #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) xdb.d=x; /* Filter cases */ if (xdb.i[HI] < 0x00100000){ /* x < 2^(-1022) */ if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (xdb.i[HI] < 0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ E = -64; xdb.d *= two64; /* make x a normal number */ } if (xdb.i[HI] >= 0x7ff00000) return x+x; /* Inf or Nan */ DOUBLE_EXTENDED_MODE; /* This one should be overlapped with following integer computation */ /* Extract exponent and mantissa */ E += (xdb.i[HI]>>20)-1023; /* extract the exponent */ index = (xdb.i[HI] & 0x000fffff); xdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */ index = (index + (1<<(20-L-1))) >> (20-L); /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ xdb.i[HI] -= 0x00100000; E++; } y = xdb.d; #else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* Here come the code specific to Itanium processor */ PREFETCH_POLY_QUICK; /* defined in log-de.h */ y=x; i = _Asm_getf(2/*_FR_D*/, y); /* Cast y to a 64-bit integer */ /* Filter special cases */ if (i<(int64_t)ULL(0010000000000000)){ /* equivalent to : x < 2^(-1022) */ if ((i & ULL(7fffffffffffffff))==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (i<0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ y *= two64; /* make x a normal number */ E = -64; i = _Asm_getf(2/*_FR_D*/, y); /* and update i */ } if (i >= ULL(7ff0000000000000)) return x+x; /* Inf or Nan */ /* Extract exponent and mantissa */ E += (i>>52)-1023; i = i & ULL(000fffffffffffff); /* keep only mantissa */ index = (i + (ULL(1)<<(52-L-1))) >> (52-L); /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */ E++; } else y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/ #endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* All the previous argument reduction was exact */ /* now y holds 1+f, and E is the exponent */ index = index & INDEXMASK; logirh = argredtable[index].logirh; r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */ z = y*r - 1. ; /* even without an FMA, all exact */ if(E==0) roundtestmask=ACCURATE_TO_61_BITS; else roundtestmask=ACCURATE_TO_62_BITS; #ifdef ESTRIN /* Estrin polynomial evaluation */ double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07; z2 = z*z; p67 = c6 + z*c7; p45 = c4 + z*c5; p23 = c2 + z*c3; p01 = logirh + z; z4 = z2*z2; p47 = p45 + z2*p67; p03 = p01 + z2*p23; p07 = p03 + z4*p47; logde = p07 + E*log2H; #endif #ifdef PATERSON double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8; z2 = z * z; t1 = z + ps_alpha; t2 = z + ps_beta; t3 = c3 * z + c2; t4 = z + logirh; z4 = z2 * z2; t5 = z2 + ps_c; t6 = t3 * z2 + t4; t7 = t5 * t1 + t2; t0 = z4 * c7; t8 = t7 * t0 + t6; logde = t8 + E*log2H; #endif #if 0 /* to time the first step only */ BACK_TO_DOUBLE_MODE; return (double)t; #endif /* To test the second step only, comment out the following line */ DE_TEST_AND_RETURN_RD(logde, roundtestmask); log_accurate(&th, &tl, z, E, index); RETURN_SUM_ROUNDED_DOWN(th, tl); } double log_ru(double x) { double_ext logirh, r, y, z, th, tl, logde; #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) db_number xdb; int E, index, roundtestmask; #else int64_t E, i; uint64_t index, roundtestmask; double_ext c1,c2,c3,c4,c5,c6,c7; #endif E=0; #if defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) xdb.d=x; /* Filter cases */ if (xdb.i[HI] < 0x00100000){ /* x < 2^(-1022) */ if (((xdb.i[HI] & 0x7fffffff)|xdb.i[LO])==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (xdb.i[HI] < 0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ E = -64; xdb.d *= two64; /* make x a normal number */ } if (xdb.i[HI] >= 0x7ff00000) return x+x; /* Inf or Nan */ DOUBLE_EXTENDED_MODE; /* This one should be overlapped with following integer computation */ /* Extract exponent and mantissa */ E += (xdb.i[HI]>>20)-1023; /* extract the exponent */ index = (xdb.i[HI] & 0x000fffff); xdb.i[HI] = index | 0x3ff00000; /* do exponent = 0 */ index = (index + (1<<(20-L-1))) >> (20-L); /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ index = index & INDEXMASK; xdb.i[HI] -= 0x00100000; E++; } y = xdb.d; #else /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* Here come the code specific to Itanium processor */ PREFETCH_POLY_QUICK; /* defined in log-de.h */ y=x; i = _Asm_getf(2/*_FR_D*/, y); /* Cast y to a 64-bit integer */ /* Filter special cases */ if (i<(int64_t)ULL(0010000000000000)){ /* equivalent to : x < 2^(-1022) */ if ((i & ULL(7fffffffffffffff))==0) return -1.0/0.0; /* log(+/-0) = -Inf */ if (i<0) return (x-x)/0.0; /* log(-x) = Nan */ /* Else subnormal number */ y *= two64; /* make x a normal number */ E = -64; i = _Asm_getf(2/*_FR_D*/, y); /* and update i */ } if (i >= ULL(7ff0000000000000)) return x+x; /* Inf or Nan */ /* Extract exponent and mantissa */ E += (i>>52)-1023; i = i & ULL(000fffffffffffff); /* keep only mantissa */ index = (i + (ULL(1)<<(52-L-1))) >> (52-L); /* reduce such that sqrt(2)/2 < xdb.d < sqrt(2) */ if (index >= MAXINDEX){ /* corresponds to y>sqrt(2)*/ y = _Asm_setf(2/*_FR_D*/, (i | ULL(3ff0000000000000)) - ULL(0010000000000000) ); /* exponent = -1 */ index = index & INDEXMASK; E++; } else y = _Asm_setf(2/*_FR_D*/, i | ULL(3ff0000000000000) ); /* exponent = 0*/ #endif /* defined(CRLIBM_TYPECPU_X86) || defined(CRLIBM_TYPECPU_AMD64) */ /* All the previous argument reduction was exact */ /* now y holds 1+f, and E is the exponent */ logirh = argredtable[index].logirh; r = (double_ext) (argredtable[index].r); /* approx to 1/y.d */ z = y*r - 1. ; /* even without an FMA, all exact */ if(E==0) roundtestmask=ACCURATE_TO_61_BITS; else roundtestmask=ACCURATE_TO_62_BITS; #ifdef ESTRIN /* Estrin polynomial evaluation */ double_ext z2,z4, p01, p23, p45, p67, p03, p47,p07; z2 = z*z; p67 = c6 + z*c7; p45 = c4 + z*c5; p23 = c2 + z*c3; p01 = logirh + z; z4 = z2*z2; p47 = p45 + z2*p67; p03 = p01 + z2*p23; p07 = p03 + z4*p47; logde = p07 + E*log2H; #endif #ifdef PATERSON double_ext z4,z2,t0,t1,t2,t3,t4,t5,t6,t7,t8; z2 = z * z; t1 = z + ps_alpha; t2 = z + ps_beta; t3 = c3 * z + c2; t4 = z + logirh; z4 = z2 * z2; t5 = z2 + ps_c; t6 = t3 * z2 + t4; t7 = t5 * t1 + t2; t0 = z4 * c7; t8 = t7 * t0 + t6; logde = t8 + E*log2H; #endif #if 0 /* to time the first step only */ BACK_TO_DOUBLE_MODE; return (double)t; #endif /* To test the second step only, comment out the following line */ DE_TEST_AND_RETURN_RU(logde, roundtestmask); log_accurate(&th, &tl, z, E, index); RETURN_SUM_ROUNDED_UP(th, tl); } double log_rz(double x) { if (x>1.0) return log_rd(x); else return log_ru(x); }