/*
    Copyright (C) 2018 Fredrik Johansson

    This file is part of Arb.

    Arb is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
*/

#include "acb.h"

/* We need uint64_t instead of mp_limb_t on 32-bit systems for
   safe summation of 30-bit error bounds. */
#include <stdint.h>

/* The following macros are found in FLINT's longlong.h, but the release
   version is out of date. */

/* x86 : 64 bit */
#if (GMP_LIMB_BITS == 64 && defined (__amd64__))

#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)       \
  __asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0"           \
       : "=r" (sh), "=&r" (sm), "=&r" (sl)                       \
       : "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)),         \
         "1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)),         \
         "2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl)))

#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)       \
  __asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0"           \
       : "=r" (dh), "=&r" (dm), "=&r" (dl)                       \
       : "0" ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)),         \
         "1" ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)),         \
         "2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl)))

#endif /* x86_64 */

/* x86 : 32 bit */
#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
   || defined (__i486__) || defined(__amd64__)))

#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)       \
  __asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0"           \
       : "=r" (sh), "=r" (sm), "=&r" (sl)                        \
       : "0" ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)),           \
         "1" ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)),           \
         "2" ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl)))

#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)       \
  __asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0"           \
       : "=r" (dh), "=r" (dm), "=&r" (dl)                        \
       : "0" ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)),           \
         "1" ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)),           \
         "2" ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl)))

#endif /* x86 */

#if !defined(add_sssaaaaaa2)

#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl)           \
  do {                                                               \
    mp_limb_t __t, __u;                                              \
    add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl);       \
    add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm);       \
    add_ssaaaa(sh, sm, ah + bh, sm, __u, __t);                       \
  } while (0)

#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl)           \
  do {                                                               \
    mp_limb_t __t, __u;                                              \
    sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl);       \
    sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm);       \
    sub_ddmmss(dh, dm, mh - sh, dm, -__u, -__t);                     \
  } while (0)

#endif

/* Add ((a * b) / 2^MAG_BITS) * 2^exp into srad*2^srad_exp.
   Assumes that srad_exp >= exp and that overflow cannot occur. */
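/* Illustrative example (not used by the code): a and b are fixed-point
   error mantissas of roughly MAG_BITS bits each, so a * b fits comfortably
   in a uint64_t. With shift = srad_exp - exp = 2, the macro below adds
   (a * b) >> (MAG_BITS + 2) to srad, plus one extra unit so that the bound
   errs on the high side; the accumulated srad is later converted to a
   mag_t at exponent srad_exp - MAG_BITS in add_errors() further down. */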
#define RAD_ADDMUL(srad, srad_exp, a, b, exp)                              \
    do {                                                                   \
        uint64_t __a, __b;                                                 \
        slong __shift;                                                     \
        __a = (a);                                                         \
        __b = (b);                                                         \
        __shift = (srad_exp) - (exp);                                      \
        if (__shift < MAG_BITS)                                            \
            (srad) += (((__a) * (__b)) >> (MAG_BITS + __shift)) + 1;       \
        else                                                               \
            (srad) += 1;                                                   \
    } while (0)

void mag_set_ui_2exp_small(mag_t z, ulong x, slong e);

static void
add_errors(mag_t rad, uint64_t Aerr, slong Aexp, uint64_t Berr, slong Bexp,
    uint64_t Cerr, slong Cexp)
{
    slong shift;

    if (Aerr && Berr)
    {
        if (Aexp >= Bexp)
        {
            shift = Aexp - Bexp;
            if (shift < 64)
                Aerr = Aerr + (Berr >> shift) + 1;
            else
                Aerr = Aerr + 1;
        }
        else
        {
            shift = Bexp - Aexp;
            if (shift < 64)
                Aerr = Berr + (Aerr >> shift) + 1;
            else
                Aerr = Berr + 1;
            Aexp = Bexp;
        }
    }
    else if (Berr)
    {
        Aerr = Berr;
        Aexp = Bexp;
    }

    if (Aerr && Cerr)
    {
        if (Aexp >= Cexp)
        {
            shift = Aexp - Cexp;
            if (shift < 64)
                Aerr = Aerr + (Cerr >> shift) + 1;
            else
                Aerr = Aerr + 1;
        }
        else
        {
            shift = Cexp - Aexp;
            if (shift < 64)
                Aerr = Cerr + (Aerr >> shift) + 1;
            else
                Aerr = Cerr + 1;
            Aexp = Cexp;
        }
    }
    else if (Cerr)
    {
        Aerr = Cerr;
        Aexp = Cexp;
    }

#if FLINT_BITS == 64
    mag_set_ui_2exp_small(rad, Aerr, Aexp - MAG_BITS);
#else
    mag_set_d(rad, Aerr * (1.0 + 1e-14));
    mag_mul_2exp_si(rad, rad, Aexp - MAG_BITS);
#endif
}

void _arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
    int negative, flint_bitcnt_t shift);

void _arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
    mp_srcptr xptr, mp_size_t xn,
    int negative, flint_bitcnt_t shift);

static void
_arb_dot_output(arb_t res, mp_ptr sum, mp_size_t sn, int negative,
    uint64_t serr, slong sum_exp, uint64_t srad, slong srad_exp, slong prec)
{
    slong exp_fix;
    int inexact;

    if (sum[sn - 1] >= LIMB_TOP)
    {
        mpn_neg(sum, sum, sn);
        negative ^= 1;
    }

    exp_fix = 0;

    if (sum[sn - 1] == 0)
    {
        slong sum_exp2;
        mp_size_t sn2;

        sn2 = sn;
        sum_exp2 = sum_exp;

        while (sn2 > 0 && sum[sn2 - 1] == 0)
        {
            sum_exp2 -= FLINT_BITS;
            sn2--;
        }

        if (sn2 == 0)
        {
            arf_zero(arb_midref(res));
            inexact = 0;
        }
        else
        {
            inexact = _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn2,
                negative, prec, ARF_RND_DOWN);
            _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp2);
        }
    }
    else
    {
        if (sn == 2)  /* unnecessary? */
            inexact = _arf_set_round_uiui(arb_midref(res), &exp_fix,
                sum[1], sum[0], negative, prec, ARF_RND_DOWN);
        else
            inexact = _arf_set_round_mpn(arb_midref(res), &exp_fix, sum, sn,
                negative, prec, ARF_RND_DOWN);

        _fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp_fix + sum_exp);
    }

    add_errors(arb_radref(res),
        inexact << MAG_BITS, exp_fix + sum_exp - prec,
        ((uint64_t) serr) << MAG_BITS, sum_exp - sn * FLINT_BITS,
        srad, srad_exp);
}

/* xxx: don't use surrounding variables */
#define ARB_DOT_ADD(s_sum, s_serr, s_sn, s_sum_exp, s_subtract, xm)            \
    if (!arf_is_special(xm))                                                   \
    {                                                                          \
        mp_srcptr xptr;                                                        \
        xexp = ARF_EXP(xm);                                                    \
        xn = ARF_SIZE(xm);                                                     \
        xnegative = ARF_SGNBIT(xm);                                            \
        shift = s_sum_exp - xexp;                                              \
        if (shift >= s_sn * FLINT_BITS)                                        \
        {                                                                      \
            s_serr++;                                                          \
        }                                                                      \
        else                                                                   \
        {                                                                      \
            xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);  \
            _arb_dot_add_generic(s_sum, &s_serr, tmp, s_sn, xptr, xn, xnegative ^ s_subtract, shift); \
        }                                                                      \
    }

/* xxx: don't use surrounding variables */
#define ARB_DOT_ADD_RAD(s_srad, s_srad_exp, xr)   \
    if (!mag_is_special(xr))                      \
    {                                             \
        xrad = MAG_MAN(xr);                       \
        xrexp = MAG_EXP(xr);                      \
        shift = s_srad_exp - xrexp;               \
        if (shift < 64)                           \
            s_srad += (xrad >> shift) + 1;        \
        else                                      \
            s_srad++;                             \
    }

static void
_arf_complex_mul_gauss(arf_t e, arf_t f, const arf_t a, const arf_t b,
                                         const arf_t c, const arf_t d)
{
    mp_srcptr ap, bp, cp, dp;
    int asgn, bsgn, csgn, dsgn;
    mp_size_t an, bn, cn, dn;
    slong aexp, bexp, cexp, dexp;
    fmpz texp, uexp;
    fmpz_t za, zb, zc, zd, t, u, v;
    slong abot, bbot, cbot, dbot;

    ARF_GET_MPN_READONLY(ap, an, a);
    asgn = ARF_SGNBIT(a);
    aexp = ARF_EXP(a);

    ARF_GET_MPN_READONLY(bp, bn, b);
    bsgn = ARF_SGNBIT(b);
    bexp = ARF_EXP(b);

    ARF_GET_MPN_READONLY(cp, cn, c);
    csgn = ARF_SGNBIT(c);
    cexp = ARF_EXP(c);

    ARF_GET_MPN_READONLY(dp, dn, d);
    dsgn = ARF_SGNBIT(d);
    dexp = ARF_EXP(d);

    /* Gauss multiplication
        e = ac - bd
        f = (a+b)(c+d) - ac - bd */

    abot = aexp - an * FLINT_BITS;
    bbot = bexp - bn * FLINT_BITS;
    cbot = cexp - cn * FLINT_BITS;
    dbot = dexp - dn * FLINT_BITS;

    texp = FLINT_MIN(abot, bbot);
    uexp = FLINT_MIN(cbot, dbot);

    fmpz_init(za);
    fmpz_init(zb);
    fmpz_init(zc);
    fmpz_init(zd);
    fmpz_init(t);
    fmpz_init(u);
    fmpz_init(v);

    fmpz_lshift_mpn(za, ap, an, asgn, abot - texp);
    fmpz_lshift_mpn(zb, bp, bn, bsgn, bbot - texp);
    fmpz_lshift_mpn(zc, cp, cn, csgn, cbot - uexp);
    fmpz_lshift_mpn(zd, dp, dn, dsgn, dbot - uexp);

    fmpz_add(t, za, zb);
    fmpz_add(v, zc, zd);
    fmpz_mul(u, t, v);
    fmpz_mul(t, za, zc);
    fmpz_mul(v, zb, zd);
    fmpz_sub(u, u, t);
    fmpz_sub(u, u, v);
    fmpz_sub(t, t, v);

    texp += uexp;
    arf_set_fmpz_2exp(e, t, &texp);
    arf_set_fmpz_2exp(f, u, &texp);

    fmpz_clear(za);
    fmpz_clear(zb);
    fmpz_clear(zc);
    fmpz_clear(zd);
    fmpz_clear(t);
    fmpz_clear(u);
    fmpz_clear(v);
}

/* TODO: this could be much lower, but it's currently competing against
   mulhigh in the Karatsuba range. */
ARB_DLL slong acb_dot_gauss_dot_cutoff = 128;
#define GAUSS_CUTOFF acb_dot_gauss_dot_cutoff

void
acb_dot(acb_t res, const acb_t initial, int subtract, acb_srcptr x, slong xstep,
    acb_srcptr y, slong ystep, slong len, slong prec)
{
    slong i, j, padding, extend;
    slong xexp, yexp, exp;
    slong re_nonzero, im_nonzero;
    slong re_max_exp, re_min_exp, re_sum_exp;
    slong im_max_exp, im_min_exp, im_sum_exp;
    slong re_srad_exp, re_max_rad_exp;
    slong im_srad_exp, im_max_rad_exp;
    slong re_prec, im_prec;
    slong xrexp, yrexp;
    int xnegative, ynegative;
    mp_size_t xn, yn, re_sn, im_sn, alloc;
    flint_bitcnt_t shift;
    arb_srcptr xi, yi;
    arf_srcptr xm, ym;
    mag_srcptr xr, yr;
    mp_limb_t xtop, ytop;
    mp_limb_t xrad, yrad;
    mp_limb_t re_serr, im_serr;   /* Sum over arithmetic errors */
    uint64_t re_srad, im_srad;    /* Sum over propagated errors */
    mp_ptr tmp, re_sum, im_sum;   /* Workspace */
    slong xoff, yoff;
    char * use_gauss;
    ARF_ADD_TMP_DECL;

    /* todo: fast fma and fmma (len=2) code */
    if (len <= 1)
    {
        if (initial == NULL)
        {
            if (len <= 0)
                acb_zero(res);
            else
            {
                acb_mul(res, x, y, prec);
                if (subtract)
                    acb_neg(res, res);
            }
            return;
        }
        else if (len <= 0)
        {
            acb_set_round(res, initial, prec);
            return;
        }
    }

    /* Number of nonzero midpoint terms in sum. */
    re_nonzero = 0;
    im_nonzero = 0;

    /* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
    re_max_exp = WORD_MIN;
    im_max_exp = WORD_MIN;

    /* Propagated error terms are bounded by 2^max_rad_exp */
    re_max_rad_exp = WORD_MIN;
    im_max_rad_exp = WORD_MIN;

    /* Used to reduce the precision. */
    re_min_exp = WORD_MAX;
    im_min_exp = WORD_MAX;

    /* Account for the initial term. */
    if (initial != NULL)
    {
        if (!ARB_IS_LAGOM(acb_realref(initial)) || !ARB_IS_LAGOM(acb_imagref(initial)))
        {
            acb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
            return;
        }

        xm = arb_midref(acb_realref(initial));
        xr = arb_radref(acb_realref(initial));

        if (!arf_is_special(xm))
        {
            re_max_exp = ARF_EXP(xm);
            re_nonzero++;
            if (prec > 2 * FLINT_BITS)
                re_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
        }

        if (!mag_is_special(xr))
            re_max_rad_exp = MAG_EXP(xr);

        xm = arb_midref(acb_imagref(initial));
        xr = arb_radref(acb_imagref(initial));

        if (!arf_is_special(xm))
        {
            im_max_exp = ARF_EXP(xm);
            im_nonzero++;
            if (prec > 2 * FLINT_BITS)
                im_min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
        }

        if (!mag_is_special(xr))
            im_max_rad_exp = MAG_EXP(xr);
    }

    for (xoff = 0; xoff < 2; xoff++)
    {
        for (yoff = 0; yoff < 2; yoff++)
        {
            slong nonzero, max_exp, min_exp, max_rad_exp;

            if (xoff == yoff)
            {
                nonzero = re_nonzero;
                max_exp = re_max_exp;
                min_exp = re_min_exp;
                max_rad_exp = re_max_rad_exp;
            }
            else
            {
                nonzero = im_nonzero;
                max_exp = im_max_exp;
                min_exp = im_min_exp;
                max_rad_exp = im_max_rad_exp;
            }

            /* Determine maximum exponents for the main sum and the radius sum. */
            for (i = 0; i < len; i++)
            {
                xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
                yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;

                /* Fallback for huge exponents or non-finite values. */
                if (!ARB_IS_LAGOM(xi) || !ARB_IS_LAGOM(yi))
                {
                    acb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
                    return;
                }

                xm = arb_midref(xi);
                ym = arb_midref(yi);
                xr = arb_radref(xi);
                yr = arb_radref(yi);

                /* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
                if (!arf_is_special(xm))
                {
                    xexp = ARF_EXP(xm);

                    if (!arf_is_special(ym))
                    {
                        yexp = ARF_EXP(ym);

                        max_exp = FLINT_MAX(max_exp, xexp + yexp);
                        nonzero++;

                        if (prec > 2 * FLINT_BITS)
                        {
                            slong bot;
                            bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
                            min_exp = FLINT_MIN(min_exp, bot);
                        }

                        if (!mag_is_special(xr))
                        {
                            xrexp = MAG_EXP(xr);
                            max_rad_exp = FLINT_MAX(max_rad_exp, yexp + xrexp);

                            if (!mag_is_special(yr))
                            {
                                yrexp = MAG_EXP(yr);
                                max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
                                max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
                            }
                        }
                        else
                        {
                            if (!mag_is_special(yr))
                            {
                                yrexp = MAG_EXP(yr);
                                max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
                            }
                        }
                    }
                    else  /* if y = 0, something can happen only if yr != 0 */
                    {
                        if (!mag_is_special(yr))
                        {
                            yrexp = MAG_EXP(yr);
                            max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);

                            if (!mag_is_special(xr))
                            {
                                xrexp = MAG_EXP(xr);
                                max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
                            }
                        }
                    }
                }
                else  /* if x = 0, something can happen only if xr != 0 */
                {
                    if (!mag_is_special(xr))
                    {
                        xrexp = MAG_EXP(xr);

                        if (!arf_is_special(ym))
                        {
                            yexp = ARF_EXP(ym);
                            max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yexp);
                        }

                        if (!mag_is_special(yr))
                        {
                            yrexp = MAG_EXP(yr);
                            max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
                        }
                    }
                }
            }

            if (xoff == yoff)
            {
                re_nonzero = nonzero;
                re_max_exp = max_exp;
                re_min_exp = min_exp;
                re_max_rad_exp = max_rad_exp;
            }
            else
            {
                im_nonzero = nonzero;
                im_max_exp = max_exp;
                im_min_exp = min_exp;
                im_max_rad_exp = max_rad_exp;
            }
        }
    }

    re_prec = prec;
    im_prec = prec;

    if (re_max_exp == WORD_MIN && re_max_rad_exp == WORD_MIN &&
        im_max_exp == WORD_MIN && im_max_rad_exp == WORD_MIN)
    {
        acb_zero(res);
        return;
    }
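    /* Example of the clamping below (illustrative numbers): if re_max_exp = 0
       and re_max_rad_exp = -100, midpoint bits below about 2^(-130) are
       dominated by the propagated error of magnitude roughly 2^(-100), so
       re_prec is capped at re_max_exp - re_max_rad_exp + MAG_BITS = 130 bits
       even if the requested prec is larger. */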
    /* The midpoint sum is zero. */
    if (re_max_exp == WORD_MIN)
    {
        re_prec = 2;
    }
    else
    {
        if (re_max_rad_exp != WORD_MIN)
            re_prec = FLINT_MIN(re_prec, re_max_exp - re_max_rad_exp + MAG_BITS);
        if (re_min_exp != WORD_MAX)
            re_prec = FLINT_MIN(re_prec, re_max_exp - re_min_exp + MAG_BITS);
        re_prec = FLINT_MAX(re_prec, 2);
    }

    if (im_max_exp == WORD_MIN)
    {
        im_prec = 2;
    }
    else
    {
        if (im_max_rad_exp != WORD_MIN)
            im_prec = FLINT_MIN(im_prec, im_max_exp - im_max_rad_exp + MAG_BITS);
        if (im_min_exp != WORD_MAX)
            im_prec = FLINT_MIN(im_prec, im_max_exp - im_min_exp + MAG_BITS);
        im_prec = FLINT_MAX(im_prec, 2);
    }

    extend = FLINT_BIT_COUNT(re_nonzero) + 1;
    padding = 4 + FLINT_BIT_COUNT(len);
    re_sn = (re_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
    re_sn = FLINT_MAX(re_sn, 2);
    re_sum_exp = re_max_exp + extend;

    extend = FLINT_BIT_COUNT(im_nonzero) + 1;
    padding = 4 + FLINT_BIT_COUNT(len);
    im_sn = (im_prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
    im_sn = FLINT_MAX(im_sn, 2);
    im_sum_exp = im_max_exp + extend;

    /* We need sn + 1 limbs for the sum (sn limbs + 1 dummy limb for carry
       or borrow that avoids an extra branch). We need 2 * (sn + 2) limbs
       to store the product of two numbers with up to (sn + 2) limbs, plus
       1 extra limb for shifting the product. */
    alloc = (re_sn + 1) + (im_sn + 1) + 2 * (FLINT_MAX(re_sn, im_sn) + 2) + 1;
    ARF_ADD_TMP_ALLOC(re_sum, alloc)
    im_sum = re_sum + (re_sn + 1);
    tmp = im_sum + (im_sn + 1);

    /* Sum of propagated errors. */
    re_srad_exp = re_max_rad_exp;
    re_srad = 0;
    im_srad_exp = im_max_rad_exp;
    im_srad = 0;

    /* Set sum to 0 */
    re_serr = 0;
    for (j = 0; j < re_sn + 1; j++)
        re_sum[j] = 0;
    im_serr = 0;
    for (j = 0; j < im_sn + 1; j++)
        im_sum[j] = 0;

    if (initial != NULL)
    {
        xm = arb_midref(acb_realref(initial));
        xr = arb_radref(acb_realref(initial));

        ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, subtract, xm);
        ARB_DOT_ADD_RAD(re_srad, re_srad_exp, xr);

        xm = arb_midref(acb_imagref(initial));
        xr = arb_radref(acb_imagref(initial));

        ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, subtract, xm);
        ARB_DOT_ADD_RAD(im_srad, im_srad_exp, xr);
    }

    /* Look for terms to process using the Gauss multiplication formula.
       If any such terms are found, we mask the ith entry in use_gauss
       so that they will be skipped in the main loop.

       Important: the cutoffs must be such that the fast case
       (xn <= 2, yn <= 2, sn <= 3) is not hit below and the mask check
       is done.

       The cutoffs below are not optimal in the generic case; also, it
       would be nicer to have both mulhigh and Gauss here. A more elegant
       solution would be to write a fallback version of acb_dot_simple
       where acb_addmul does the right thing. */
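    /* The identity used by _arf_complex_mul_gauss above:
       (a + b*i)(c + d*i) = (ac - bd) + ((a + b)(c + d) - ac - bd)*i,
       i.e. three real multiplications instead of four, at the cost of a
       few extra additions on slightly larger operands. */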
    use_gauss = NULL;

    if (re_prec >= GAUSS_CUTOFF * FLINT_BITS &&
        im_prec >= GAUSS_CUTOFF * FLINT_BITS)
    {
        arf_t e, f;

        for (i = 0; i < len; i++)
        {
            arb_srcptr ai, bi, ci, di;
            mp_size_t an, bn, cn, dn;
            slong aexp, bexp, cexp, dexp;

            ai = ((arb_srcptr) x) + 2 * i * xstep;
            bi = ((arb_srcptr) x) + 2 * i * xstep + 1;
            ci = ((arb_srcptr) y) + 2 * i * ystep;
            di = ((arb_srcptr) y) + 2 * i * ystep + 1;

            an = ARF_SIZE(arb_midref(ai));
            bn = ARF_SIZE(arb_midref(bi));
            cn = ARF_SIZE(arb_midref(ci));
            dn = ARF_SIZE(arb_midref(di));

            aexp = ARF_EXP(arb_midref(ai));
            bexp = ARF_EXP(arb_midref(bi));
            cexp = ARF_EXP(arb_midref(ci));
            dexp = ARF_EXP(arb_midref(di));

            if (an >= GAUSS_CUTOFF && bn >= GAUSS_CUTOFF &&
                cn >= GAUSS_CUTOFF && dn >= GAUSS_CUTOFF &&
                FLINT_ABS(an - bn) <= 2 &&
                FLINT_ABS(cn - dn) <= 2 &&
                FLINT_ABS(aexp - bexp) <= 64 &&
                FLINT_ABS(cexp - dexp) <= 64 &&
                re_sum_exp - (aexp + cexp) < 0.1 * re_prec &&
                im_sum_exp - (aexp + dexp) < 0.1 * im_prec &&
                an + cn < 2.2 * re_sn && an + dn < 2.2 * im_sn)
            {
                if (use_gauss == NULL)
                {
                    use_gauss = flint_calloc(len, sizeof(char));
                    arf_init(e);
                    arf_init(f);
                }

                use_gauss[i] = 1;
                _arf_complex_mul_gauss(e, f, arb_midref(ai), arb_midref(bi), arb_midref(ci), arb_midref(di));
                ARB_DOT_ADD(re_sum, re_serr, re_sn, re_sum_exp, 0, e);
                ARB_DOT_ADD(im_sum, im_serr, im_sn, im_sum_exp, 0, f);
            }
        }

        if (use_gauss != NULL)
        {
            arf_clear(e);
            arf_clear(f);
        }
    }

    for (xoff = 0; xoff < 2; xoff++)
    {
        for (yoff = 0; yoff < 2; yoff++)
        {
            slong sum_exp, srad_exp;
            mp_ptr sum;
            mp_size_t sn;
            mp_limb_t serr;
            uint64_t srad;
            int flipsign;

            if (xoff == yoff)
            {
                sum_exp = re_sum_exp;
                srad_exp = re_srad_exp;
                sum = re_sum;
                sn = re_sn;
                if (re_max_exp == WORD_MIN && re_max_rad_exp == WORD_MIN)
                    continue;
            }
            else
            {
                sum_exp = im_sum_exp;
                srad_exp = im_srad_exp;
                sum = im_sum;
                sn = im_sn;
                if (im_max_exp == WORD_MIN && im_max_rad_exp == WORD_MIN)
                    continue;
            }

            serr = 0;
            srad = 0;
            flipsign = (xoff + yoff == 2);

            for (i = 0; i < len; i++)
            {
                xi = ((arb_srcptr) x) + 2 * i * xstep + xoff;
                yi = ((arb_srcptr) y) + 2 * i * ystep + yoff;

                xm = arb_midref(xi);
                ym = arb_midref(yi);
                xr = arb_radref(xi);
                yr = arb_radref(yi);

                /* The midpoints of x[i] and y[i] are both nonzero. */
                if (!arf_is_special(xm) && !arf_is_special(ym))
                {
                    xexp = ARF_EXP(xm);
                    xn = ARF_SIZE(xm);
                    xnegative = ARF_SGNBIT(xm);

                    yexp = ARF_EXP(ym);
                    yn = ARF_SIZE(ym);
                    ynegative = ARF_SGNBIT(ym);

                    exp = xexp + yexp;
                    shift = sum_exp - exp;

                    if (shift >= sn * FLINT_BITS)
                    {
                        /* We may yet need the top limbs for bounds. */
                        ARF_GET_TOP_LIMB(xtop, xm);
                        ARF_GET_TOP_LIMB(ytop, ym);
                        serr++;
                    }
                    else if (xn <= 2 && yn <= 2 && sn <= 3)
                    {
                        mp_limb_t x1, x0, y1, y0;
                        mp_limb_t u3, u2, u1, u0;

                        if (xn == 1 && yn == 1)
                        {
                            xtop = ARF_NOPTR_D(xm)[0];
                            ytop = ARF_NOPTR_D(ym)[0];
                            umul_ppmm(u3, u2, xtop, ytop);
                            u1 = u0 = 0;
                        }
                        else if (xn == 2 && yn == 2)
                        {
                            x0 = ARF_NOPTR_D(xm)[0];
                            x1 = ARF_NOPTR_D(xm)[1];
                            y0 = ARF_NOPTR_D(ym)[0];
                            y1 = ARF_NOPTR_D(ym)[1];
                            xtop = x1;
                            ytop = y1;
                            nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
                        }
                        else if (xn == 1)
                        {
                            x0 = ARF_NOPTR_D(xm)[0];
                            y0 = ARF_NOPTR_D(ym)[0];
                            y1 = ARF_NOPTR_D(ym)[1];
                            xtop = x0;
                            ytop = y1;
                            nn_mul_2x1(u3, u2, u1, y1, y0, x0);
                            u0 = 0;
                        }
                        else
                        {
                            x0 = ARF_NOPTR_D(xm)[0];
                            x1 = ARF_NOPTR_D(xm)[1];
                            y0 = ARF_NOPTR_D(ym)[0];
                            xtop = x1;
                            ytop = y0;
                            nn_mul_2x1(u3, u2, u1, x1, x0, y0);
                            u0 = 0;
                        }

                        if (sn == 2)
                        {
                            if (shift < FLINT_BITS)
                            {
                                serr += ((u2 << (FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
                                u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
                                u3 = (u3 >> shift);
                            }
                            else if (shift == FLINT_BITS)
                            {
                                serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
                                u2 = u3;
                                u3 = 0;
                            }
                            else /* FLINT_BITS < shift < 2 * FLINT_BITS */
                            {
                                serr += ((u3 << (2 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
                                u2 = (u3 >> (shift - FLINT_BITS));
                                u3 = 0;
                            }

                            if (xnegative ^ ynegative ^ flipsign)
                                sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
                            else
                                add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
                        }
                        else if (sn == 3)
                        {
                            if (shift < FLINT_BITS)
                            {
                                serr += ((u1 << (FLINT_BITS - shift)) != 0) || (u0 != 0);
                                u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
                                u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
                                u3 = (u3 >> shift);
                            }
                            else if (shift == FLINT_BITS)
                            {
                                serr += (u1 != 0) || (u0 != 0);
                                u1 = u2;
                                u2 = u3;
                                u3 = 0;
                            }
                            else if (shift < 2 * FLINT_BITS)
                            {
                                serr += ((u2 << (2 * FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
                                u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
                                u2 = (u3 >> (shift - FLINT_BITS));
                                u3 = 0;
                            }
                            else if (shift == 2 * FLINT_BITS)
                            {
                                serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
                                u1 = u3;
                                u2 = 0;
                                u3 = 0;
                            }
                            else /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
                            {
                                serr += ((u3 << (3 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
                                u1 = (u3 >> (shift - 2 * FLINT_BITS));
                                u2 = 0;
                                u3 = 0;
                            }

                            if (xnegative ^ ynegative ^ flipsign)
                                sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
                            else
                                add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
                        }
                    }
                    else
                    {
                        mp_srcptr xptr, yptr;

                        xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
                        yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
                        xtop = xptr[xn - 1];
                        ytop = yptr[yn - 1];

                        if (use_gauss == NULL || use_gauss[i] == 0)
                            _arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative ^ flipsign, shift);
                    }

                    xrad = MAG_MAN(xr);
                    yrad = MAG_MAN(yr);

                    if (xrad != 0 && yrad != 0)
                    {
                        xrexp = MAG_EXP(xr);
                        yrexp = MAG_EXP(yr);

                        RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
                        RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
                        RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
                    }
                    else if (xrad != 0)
                    {
                        xrexp = MAG_EXP(xr);
                        RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
                    }
                    else if (yrad != 0)
                    {
                        yrexp = MAG_EXP(yr);
                        RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
                    }
                }
                else
                {
                    xrad = MAG_MAN(xr);
                    yrad = MAG_MAN(yr);

                    xexp = ARF_EXP(xm);
                    yexp = ARF_EXP(ym);

                    xrexp = MAG_EXP(xr);
                    yrexp = MAG_EXP(yr);

                    /* (xm+xr)(ym+yr) = xm ym + [xm yr + ym xr + xr yr] */
                    if (yrad && !arf_is_special(xm))
                    {
                        ARF_GET_TOP_LIMB(xtop, xm);
                        RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
                    }

                    if (xrad && !arf_is_special(ym))
                    {
                        ARF_GET_TOP_LIMB(ytop, ym);
                        RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
                    }

                    if (xrad && yrad)
                    {
                        RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
                    }
                }
            }

            if (xoff == yoff)
            {
                re_serr += serr;
                re_srad += srad;
            }
            else
            {
                im_serr += serr;
                im_srad += srad;
            }
        }
    }

    _arb_dot_output(acb_realref(res), re_sum, re_sn, subtract, re_serr, re_sum_exp, re_srad, re_srad_exp, re_prec);
    _arb_dot_output(acb_imagref(res), im_sum, im_sn, subtract, im_serr, im_sum_exp, im_srad, im_srad_exp, im_prec);

    ARF_ADD_TMP_FREE(re_sum, alloc);

    if (use_gauss != NULL)
        flint_free(use_gauss);
}
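/* Example usage (illustrative sketch only, not part of the library build;
   the function name dot_example is arbitrary, and the vector and printing
   helpers are the standard Arb ones):

       #include "acb.h"

       void dot_example(void)
       {
           slong i, n = 10, prec = 128;
           acb_ptr x = _acb_vec_init(n);
           acb_ptr y = _acb_vec_init(n);
           acb_t s;
           acb_init(s);
           for (i = 0; i < n; i++)
           {
               acb_set_si(x + i, i + 1);
               acb_set_si(y + i, 2 * i + 1);
           }
           The call below computes s = sum of x[i] * y[i], rounded to prec bits:
           acb_dot(s, NULL, 0, x, 1, y, 1, n, prec);
           acb_printn(s, 20, 0);
           acb_clear(s);
           _acb_vec_clear(x, n);
           _acb_vec_clear(y, n);
       }
*/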