/*
Copyright (C) 2018 Fredrik Johansson
This file is part of Arb.
Arb is free software: you can redistribute it and/or modify it under
the terms of the GNU Lesser General Public License (LGPL) as published
by the Free Software Foundation; either version 2.1 of the License, or
(at your option) any later version. See <http://www.gnu.org/licenses/>.
*/
#include "arb.h"
/* We need uint64_t instead of mp_limb_t on 32-bit systems for
safe summation of 30-bit error bounds. */
#include <stdint.h>
/* The following macros are found in FLINT's longlong.h, but
the release version is out of date. */
/* x86 : 64 bit */
#if (GMP_LIMB_BITS == 64 && defined (__amd64__))
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \
__asm__ ("addq %8,%q2\n\tadcq %6,%q1\n\tadcq %4,%q0" \
: "=r" (sh), "=&r" (sm), "=&r" (sl) \
: "0" ((mp_limb_t)(ah)), "rme" ((mp_limb_t)(bh)), \
"1" ((mp_limb_t)(am)), "rme" ((mp_limb_t)(bm)), \
"2" ((mp_limb_t)(al)), "rme" ((mp_limb_t)(bl))) \
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \
__asm__ ("subq %8,%q2\n\tsbbq %6,%q1\n\tsbbq %4,%q0" \
: "=r" (dh), "=&r" (dm), "=&r" (dl) \
: "0" ((mp_limb_t)(mh)), "rme" ((mp_limb_t)(sh)), \
"1" ((mp_limb_t)(mm)), "rme" ((mp_limb_t)(sm)), \
"2" ((mp_limb_t)(ml)), "rme" ((mp_limb_t)(sl))) \
#endif /* x86_64 */
/* x86 : 32 bit */
#if (GMP_LIMB_BITS == 32 && (defined (__i386__) \
|| defined (__i486__) || defined(__amd64__)))
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \
__asm__ ("addl %8,%k2\n\tadcl %6,%k1\n\tadcl %4,%k0" \
: "=r" (sh), "=r" (sm), "=&r" (sl) \
: "0" ((mp_limb_t)(ah)), "g" ((mp_limb_t)(bh)), \
"1" ((mp_limb_t)(am)), "g" ((mp_limb_t)(bm)), \
"2" ((mp_limb_t)(al)), "g" ((mp_limb_t)(bl))) \
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \
__asm__ ("subl %8,%k2\n\tsbbl %6,%k1\n\tsbbl %4,%k0" \
: "=r" (dh), "=r" (dm), "=&r" (dl) \
: "0" ((mp_limb_t)(mh)), "g" ((mp_limb_t)(sh)), \
"1" ((mp_limb_t)(mm)), "g" ((mp_limb_t)(sm)), \
"2" ((mp_limb_t)(ml)), "g" ((mp_limb_t)(sl))) \
#endif /* x86 */
#if !defined(add_sssaaaaaa2)
#define add_sssaaaaaa2(sh, sm, sl, ah, am, al, bh, bm, bl) \
do { \
mp_limb_t __t, __u; \
add_ssaaaa(__t, sl, (mp_limb_t) 0, al, (mp_limb_t) 0, bl); \
add_ssaaaa(__u, sm, (mp_limb_t) 0, am, (mp_limb_t) 0, bm); \
add_ssaaaa(sh, sm, ah + bh, sm, __u, __t); \
} while (0)
#define sub_dddmmmsss2(dh, dm, dl, mh, mm, ml, sh, sm, sl) \
do { \
mp_limb_t __t, __u; \
sub_ddmmss(__t, dl, (mp_limb_t) 0, ml, (mp_limb_t) 0, sl); \
sub_ddmmss(__u, dm, (mp_limb_t) 0, mm, (mp_limb_t) 0, sm); \
sub_ddmmss(dh, dm, mh - sh, dm, -__u, -__t); \
} while (0)
#endif
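/* Illustrative example (added commentary), assuming FLINT_BITS = 64:
   add_sssaaaaaa2(sh, sm, sl, 1, 0, 5, 0, 1, UWORD_MAX) adds the
   three-limb numbers (1, 0, 5) and (0, 1, UWORD_MAX). The low limbs
   give 5 + (2^64 - 1) = 2^64 + 4, so sl = 4 with a carry into the
   middle limb, yielding (sh, sm, sl) = (1, 2, 4). */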
void mpfr_mulhigh_n(mp_ptr rp, mp_srcptr np, mp_srcptr mp, mp_size_t n);
void mpfr_sqrhigh_n(mp_ptr rp, mp_srcptr np, mp_size_t n);
/* Add ((a * b) / 2^MAG_BITS) * 2^exp into srad*2^srad_exp.
Assumes that srad_exp >= exp and that overflow cannot occur. */
#define RAD_ADDMUL(srad, srad_exp, a, b, exp) \
do { \
uint64_t __a, __b; \
slong __shift; \
__a = (a); \
__b = (b); \
__shift = (srad_exp) - (exp); \
if (__shift < MAG_BITS) \
(srad) += (((__a) * (__b)) >> (MAG_BITS + __shift)) + 1; \
else \
(srad) += 1; \
} while (0)
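/* Added commentary on the scaling: a and b are at most MAG_BITS-bit
   integers, so __a * __b < 2^(2*MAG_BITS) fits comfortably in a
   uint64_t. The product stands for (a * b / 2^MAG_BITS) * 2^exp;
   shifting right by MAG_BITS + __shift, with __shift = srad_exp - exp,
   expresses it in units of 2^(srad_exp - MAG_BITS), the scale of srad.
   The final +1 makes the truncating shift an upper bound. */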
/* mag_set_ui_2exp_si but assumes no promotion can occur.
Do we need this? */
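/* Illustrative usage (added commentary): mag_set_ui_2exp_small(z, 3, -2)
   represents 3 * 2^-2 = 0.75 exactly; 3 has 2 bits, so the mantissa is
   shifted up to exactly MAG_BITS bits and MAG_EXP(z) = 2 + (-2) = 0.
   When x has more than MAG_BITS bits, it is instead truncated with an
   upward adjustment of one ulp. */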
void
mag_set_ui_2exp_small(mag_t z, ulong x, slong e)
{
_fmpz_demote(MAG_EXPREF(z));
if (x == 0)
{
MAG_EXP(z) = 0;
MAG_MAN(z) = 0;
}
else
{
slong bits;
mp_limb_t overflow;
count_leading_zeros(bits, x);
bits = FLINT_BITS - bits;
if (bits <= MAG_BITS)
{
x = x << (MAG_BITS - bits);
}
else
{
x = (x >> (bits - MAG_BITS)) + 1;
overflow = x >> MAG_BITS;
bits += overflow;
x >>= overflow;
}
MAG_EXP(z) = bits + e;
MAG_MAN(z) = x;
}
}
/* Sets rad to (Aerr 2^Aexp + Berr 2^Bexp + Cerr 2^Cexp) 2^(-MAG_BITS).
Assumes that overflow cannot occur. */
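/* Added note: the terms are merged pairwise, aligning each pair to the
   larger exponent and adding one ulp to cover the truncated low bits,
   so the result is always a valid upper bound. On 32-bit systems the
   64-bit Aerr does not fit in a single limb, hence the conversion
   through a double with a (1 + 1e-14) upward fudge factor. */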
static void
add_errors(mag_t rad, uint64_t Aerr, slong Aexp, uint64_t Berr, slong Bexp, uint64_t Cerr, slong Cexp)
{
slong shift;
if (Aerr && Berr)
{
if (Aexp >= Bexp)
{
shift = Aexp - Bexp;
if (shift < 64)
Aerr = Aerr + (Berr >> shift) + 1;
else
Aerr = Aerr + 1;
}
else
{
shift = Bexp - Aexp;
if (shift < 64)
Aerr = Berr + (Aerr >> shift) + 1;
else
Aerr = Berr + 1;
Aexp = Bexp;
}
}
else if (Berr)
{
Aerr = Berr;
Aexp = Bexp;
}
if (Aerr && Cerr)
{
if (Aexp >= Cexp)
{
shift = Aexp - Cexp;
if (shift < 64)
Aerr = Aerr + (Cerr >> shift) + 1;
else
Aerr = Aerr + 1;
}
else
{
shift = Cexp - Aexp;
if (shift < 64)
Aerr = Cerr + (Aerr >> shift) + 1;
else
Aerr = Cerr + 1;
Aexp = Cexp;
}
}
else if (Cerr)
{
Aerr = Cerr;
Aexp = Cexp;
}
#if FLINT_BITS == 64
mag_set_ui_2exp_small(rad, Aerr, Aexp - MAG_BITS);
#else
mag_set_d(rad, Aerr * (1.0 + 1e-14));
mag_mul_2exp_si(rad, rad, Aexp - MAG_BITS);
#endif
}
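/* Compute an approximation of the product x * y using mpfr's mulhigh
   (or sqrhigh when squaring). The inputs are truncated or zero-padded
   to exactly nn limbs since the mpfr kernels take equal-length
   operands; res must provide room for 2 * nn limbs, of which the top
   nn are used by the caller. */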
static void
mulhigh(mp_ptr res, mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn, mp_size_t nn)
{
mp_ptr tmp, xxx, yyy;
slong k;
ARF_MUL_TMP_DECL;
ARF_MUL_TMP_ALLOC(tmp, 2 * nn);
xxx = tmp;
yyy = tmp + nn;
mpn_copyi(xxx + nn - FLINT_MIN(xn, nn), xptr + xn - FLINT_MIN(xn, nn), FLINT_MIN(xn, nn));
mpn_copyi(yyy + nn - FLINT_MIN(yn, nn), yptr + yn - FLINT_MIN(yn, nn), FLINT_MIN(yn, nn));
for (k = 0; k < nn - xn; k++)
xxx[k] = 0;
for (k = 0; k < nn - yn; k++)
yyy[k] = 0;
if (xptr == yptr && xn == yn)
mpfr_sqrhigh_n(res, xxx, nn);
else
mpfr_mulhigh_n(res, xxx, yyy, nn);
ARF_MUL_TMP_FREE(tmp, 2 * nn);
}
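/* Add (-1)^negative * (x * y) / 2^shift to the fixed-point sum of sn
   limbs, where shift is measured from the top of the sum. Truncation
   and mulhigh errors are counted, in ulps of the sum, in serr[0]. */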
void
_arb_dot_addmul_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
mp_srcptr xptr, mp_size_t xn, mp_srcptr yptr, mp_size_t yn,
int negative, flint_bitcnt_t shift)
{
slong shift_bits, shift_limbs, term_prec;
mp_limb_t cy;
mp_ptr sstart, tstart;
mp_size_t tn, nn;
shift_bits = shift % FLINT_BITS;
shift_limbs = shift / FLINT_BITS;
/*
          shift              term_prec        (discarded bits)
    |---------------|-----------------------|----------------|
                    |  t[tn-1]  | ... |    t[0]    |     Term
    [  s[sn-1]  |  s[sn-2]  |   ...   |  s[0]  ]          Sum
*/
/* Upper bound for the term bit length. The actual term bit length
could be smaller if the product does not extend all
the way down to s. */
term_prec = sn * FLINT_BITS - shift;
/* The mulhigh error relative to the top of the sum will
be bounded by nn * 2^(-shift) * 2^(-FLINT_BITS*nn).
One extra limb ensures that this is <1 sum-ulp. */
term_prec += FLINT_BITS;
nn = (term_prec + FLINT_BITS - 1) / FLINT_BITS;
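/* Illustrative example (added commentary), with FLINT_BITS = 64:
   sn = 4 and shift = 70 give shift_limbs = 1, shift_bits = 6 and
   term_prec = 256 - 70 + 64 = 250 bits, hence nn = 4 <= sn + 2. */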
/* Sanity check; must conform to the pre-allocated memory! */
if (nn > sn + 2)
{
flint_printf("nn > sn + 2\n");
flint_abort();
}
/* Use mulhigh? */
if (term_prec >= MUL_MPFR_MIN_LIMBS * FLINT_BITS &&
term_prec <= MUL_MPFR_MAX_LIMBS * FLINT_BITS &&
xn * FLINT_BITS > 0.9 * term_prec &&
yn * FLINT_BITS > 0.9 * term_prec)
{
mulhigh(tmp, xptr, xn, yptr, yn, nn);
tstart = tmp + nn;
tn = nn;
serr[0]++;
}
else
{
if (xn > nn || yn > nn)
{
if (xn > nn)
{
xptr += (xn - nn);
xn = nn;
}
if (yn > nn)
{
yptr += (yn - nn);
yn = nn;
}
serr[0]++;
}
tn = xn + yn;
ARF_MPN_MUL(tmp + 1, xptr, xn, yptr, yn);
tstart = tmp + 1;
}
if (shift_bits != 0)
{
tstart[-1] = mpn_rshift(tstart, tstart, tn, shift_bits);
tstart = tstart - 1;
tn++;
}
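/* Skip zero low limbs of the term; they cannot affect the sum. */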
while (tstart[0] == 0)
{
tstart++;
tn--;
}
if (shift_limbs + tn <= sn)
{
/* No truncation of the term. */
sstart = sum + sn - shift_limbs - tn;
nn = tn;
}
else
{
/* The term is truncated. */
sstart = sum;
tstart = tstart - (sn - shift_limbs - tn);
nn = sn - shift_limbs;
serr[0]++;
}
/* Likely case. Note: assumes 1 extra (dummy) limb in sum. */
if (shift_limbs <= 1)
{
if (negative)
sstart[nn] -= mpn_sub_n(sstart, sstart, tstart, nn);
else
sstart[nn] += mpn_add_n(sstart, sstart, tstart, nn);
}
else
{
if (negative)
{
cy = mpn_sub_n(sstart, sstart, tstart, nn);
mpn_sub_1(sstart + nn, sstart + nn, shift_limbs, cy);
}
else
{
cy = mpn_add_n(sstart, sstart, tstart, nn);
mpn_add_1(sstart + nn, sstart + nn, shift_limbs, cy);
}
}
}
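/* Add (-1)^negative * x / 2^shift to the fixed-point sum of sn limbs;
   the single-factor analogue of _arb_dot_addmul_generic, used here for
   the initial term. Truncation errors are counted in serr[0]. */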
void
_arb_dot_add_generic(mp_ptr sum, mp_ptr serr, mp_ptr tmp, mp_size_t sn,
mp_srcptr xptr, mp_size_t xn,
int negative, flint_bitcnt_t shift)
{
slong shift_bits, shift_limbs, term_prec;
mp_limb_t cy, err;
mp_ptr sstart, tstart;
mp_size_t tn, nn;
shift_bits = shift % FLINT_BITS;
shift_limbs = shift / FLINT_BITS;
term_prec = sn * FLINT_BITS - shift;
term_prec += FLINT_BITS;
nn = (term_prec + FLINT_BITS - 1) / FLINT_BITS;
err = 0;
if (xn > nn)
{
xptr += (xn - nn);
xn = nn;
err = 1;
}
tn = xn;
if (shift_bits == 0)
{
mpn_copyi(tmp, xptr, tn);
tstart = tmp;
}
else
{
tmp[0] = mpn_rshift(tmp + 1, xptr, tn, shift_bits);
tstart = tmp;
tn++;
}
while (tstart[0] == 0)
{
tstart++;
tn--;
}
if (shift_limbs + tn <= sn)
{
/* No truncation of the term. */
sstart = sum + sn - shift_limbs - tn;
nn = tn;
}
else
{
/* The term is truncated. */
sstart = sum;
tstart = tstart - (sn - shift_limbs - tn);
nn = sn - shift_limbs;
err = 1;
}
serr[0] += err;
/* Likely case. Note: assumes 1 extra (dummy) limb in sum. */
if (shift_limbs <= 1)
{
if (negative)
sstart[nn] -= mpn_sub_n(sstart, sstart, tstart, nn);
else
sstart[nn] += mpn_add_n(sstart, sstart, tstart, nn);
}
else
{
if (negative)
{
cy = mpn_sub_n(sstart, sstart, tstart, nn);
mpn_sub_1(sstart + nn, sstart + nn, shift_limbs, cy);
}
else
{
cy = mpn_add_n(sstart, sstart, tstart, nn);
mpn_add_1(sstart + nn, sstart + nn, shift_limbs, cy);
}
}
}
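/* Set res to a ball containing
       initial + (-1)^subtract * sum of x[i * xstep] * y[i * ystep]
   for 0 <= i < len, computed to prec bits by accumulating the
   midpoints in a shared fixed-point accumulator and the propagated
   radii in a 64-bit accumulator. */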
void
arb_dot(arb_t res, const arb_t initial, int subtract, arb_srcptr x, slong xstep, arb_srcptr y, slong ystep, slong len, slong prec)
{
slong i, j, nonzero, padding, extend;
slong xexp, yexp, exp, max_exp, min_exp, sum_exp;
slong xrexp, yrexp, srad_exp, max_rad_exp;
int xnegative, ynegative, inexact;
mp_size_t xn, yn, sn, alloc;
flint_bitcnt_t shift;
arb_srcptr xi, yi;
arf_srcptr xm, ym;
mag_srcptr xr, yr;
mp_limb_t xtop, ytop;
mp_limb_t xrad, yrad;
mp_limb_t serr; /* Sum over arithmetic errors */
uint64_t srad; /* Sum over propagated errors */
mp_ptr tmp, sum; /* Workspace */
ARF_ADD_TMP_DECL;
/* todo: fast fma and fmma (len=2) code */
if (len <= 1)
{
if (initial == NULL)
{
if (len <= 0)
arb_zero(res);
else
{
arb_mul(res, x, y, prec);
if (subtract)
arb_neg(res, res);
}
return;
}
else if (len <= 0)
{
arb_set_round(res, initial, prec);
return;
}
}
/* Number of nonzero midpoint terms in sum. */
nonzero = 0;
/* Terms are bounded by 2^max_exp (with WORD_MIN = -infty) */
max_exp = WORD_MIN;
/* Propagated error terms are bounded by 2^max_rad_exp */
max_rad_exp = WORD_MIN;
/* Used to reduce the precision. */
min_exp = WORD_MAX;
/* Account for the initial term. */
if (initial != NULL)
{
if (!ARB_IS_LAGOM(initial))
{
arb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
return;
}
xm = arb_midref(initial);
xr = arb_radref(initial);
if (!arf_is_special(xm))
{
max_exp = ARF_EXP(xm);
nonzero++;
if (prec > 2 * FLINT_BITS)
min_exp = ARF_EXP(xm) - ARF_SIZE(xm) * FLINT_BITS;
}
if (!mag_is_special(xr))
max_rad_exp = MAG_EXP(xr);
}
/* Determine maximum exponents for the main sum and the radius sum. */
for (i = 0; i < len; i++)
{
xi = x + i * xstep;
yi = y + i * ystep;
/* Fallback for huge exponents or non-finite values. */
if (!ARB_IS_LAGOM(xi) || !ARB_IS_LAGOM(yi))
{
arb_dot_simple(res, initial, subtract, x, xstep, y, ystep, len, prec);
return;
}
xm = arb_midref(xi);
ym = arb_midref(yi);
xr = arb_radref(xi);
yr = arb_radref(yi);
/* (xm+xr)(ym+yr) = xm ym + [xr ym + xm yr + xr yr] */
if (!arf_is_special(xm))
{
xexp = ARF_EXP(xm);
if (!arf_is_special(ym))
{
yexp = ARF_EXP(ym);
max_exp = FLINT_MAX(max_exp, xexp + yexp);
nonzero++;
if (prec > 2 * FLINT_BITS)
{
slong bot;
bot = (xexp + yexp) - (ARF_SIZE(xm) + ARF_SIZE(ym)) * FLINT_BITS;
min_exp = FLINT_MIN(min_exp, bot);
}
if (!mag_is_special(xr))
{
xrexp = MAG_EXP(xr);
max_rad_exp = FLINT_MAX(max_rad_exp, yexp + xrexp);
if (!mag_is_special(yr))
{
yrexp = MAG_EXP(yr);
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
}
}
else
{
if (!mag_is_special(yr))
{
yrexp = MAG_EXP(yr);
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
}
}
}
else /* if y = 0, something can happen only if yr != 0 */
{
if (!mag_is_special(yr))
{
yrexp = MAG_EXP(yr);
max_rad_exp = FLINT_MAX(max_rad_exp, xexp + yrexp);
if (!mag_is_special(xr))
{
xrexp = MAG_EXP(xr);
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
}
}
}
}
else /* if x = 0, something can happen only if xr != 0 */
{
if (!mag_is_special(xr))
{
xrexp = MAG_EXP(xr);
if (!arf_is_special(ym))
{
yexp = ARF_EXP(ym);
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yexp);
}
if (!mag_is_special(yr))
{
yrexp = MAG_EXP(yr);
max_rad_exp = FLINT_MAX(max_rad_exp, xrexp + yrexp);
}
}
}
}
/* The midpoint sum is zero. */
if (max_exp == WORD_MIN)
{
/* The sum is exactly zero. */
if (max_rad_exp == WORD_MIN)
{
arb_zero(res);
return;
}
prec = 2;
}
else
{
/* Reduce precision based on errors. */
if (max_rad_exp != WORD_MIN)
prec = FLINT_MIN(prec, max_exp - max_rad_exp + MAG_BITS);
/* Reduce precision based on actual sizes. */
if (min_exp != WORD_MAX)
prec = FLINT_MIN(prec, max_exp - min_exp + MAG_BITS);
prec = FLINT_MAX(prec, 2);
}
/* Extend sum so that we can use two's complement addition. */
extend = FLINT_BIT_COUNT(nonzero) + 1;
/* Extra bits to improve accuracy (optional). */
padding = 4 + FLINT_BIT_COUNT(len);
/* Number of limbs. */
sn = (prec + extend + padding + FLINT_BITS - 1) / FLINT_BITS;
/* Avoid having to make a special case for sn = 1. */
sn = FLINT_MAX(sn, 2);
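/* Illustrative sizing (added commentary), on 64-bit: prec = 53,
   len = 10 and nonzero = 11 give extend = FLINT_BIT_COUNT(11) + 1 = 5
   and padding = 4 + FLINT_BIT_COUNT(10) = 8, so
   sn = ceil((53 + 5 + 8) / 64) = 2. */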
/* Exponent for the main sum. */
sum_exp = max_exp + extend;
/* We need sn + 1 limbs for the sum (sn limbs + 1 dummy limb
for carry or borrow that avoids an extra branch). We need
2 * (sn + 2) limbs to store the product of two numbers
with up to (sn + 2) limbs, plus 1 extra limb for shifting
the product. */
alloc = (sn + 1) + 2 * (sn + 2) + 1;
ARF_ADD_TMP_ALLOC(sum, alloc)
tmp = sum + (sn + 1);
/* Sum of propagated errors. */
srad_exp = max_rad_exp;
srad = 0;
/* Set sum to 0 */
serr = 0;
for (j = 0; j < sn + 1; j++)
sum[j] = 0;
if (initial != NULL)
{
xm = arb_midref(initial);
xr = arb_radref(initial);
if (!arf_is_special(xm))
{
mp_srcptr xptr;
xexp = ARF_EXP(xm);
xn = ARF_SIZE(xm);
xnegative = ARF_SGNBIT(xm);
shift = sum_exp - xexp;
if (shift >= sn * FLINT_BITS)
{
serr++;
}
else
{
xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
_arb_dot_add_generic(sum, &serr, tmp, sn, xptr, xn, xnegative ^ subtract, shift);
}
}
if (!mag_is_special(xr))
{
xrad = MAG_MAN(xr);
xrexp = MAG_EXP(xr);
shift = srad_exp - xrexp;
if (shift < 64)
srad += (((uint64_t) xrad) >> shift) + 1;
else
srad++;
}
}
for (i = 0; i < len; i++)
{
xi = x + i * xstep;
yi = y + i * ystep;
xm = arb_midref(xi);
ym = arb_midref(yi);
xr = arb_radref(xi);
yr = arb_radref(yi);
/* The midpoints of x[i] and y[i] are both nonzero. */
if (!arf_is_special(xm) && !arf_is_special(ym))
{
xexp = ARF_EXP(xm);
xn = ARF_SIZE(xm);
xnegative = ARF_SGNBIT(xm);
yexp = ARF_EXP(ym);
yn = ARF_SIZE(ym);
ynegative = ARF_SGNBIT(ym);
exp = xexp + yexp;
shift = sum_exp - exp;
if (shift >= sn * FLINT_BITS)
{
/* We may yet need the top limbs for bounds. */
ARF_GET_TOP_LIMB(xtop, xm);
ARF_GET_TOP_LIMB(ytop, ym);
serr++;
}
#if 0
else if (xn == 1 && yn == 1 && sn == 2 && shift < FLINT_BITS) /* Fastest path. */
{
mp_limb_t hi, lo, out;
xtop = ARF_NOPTR_D(xm)[0];
ytop = ARF_NOPTR_D(ym)[0];
umul_ppmm(hi, lo, xtop, ytop);
out = lo << (FLINT_BITS - shift);
lo = (lo >> shift) | (hi << (FLINT_BITS - shift));
hi = (hi >> shift);
serr += (out != 0);
if (xnegative ^ ynegative)
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], hi, lo);
else
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], hi, lo);
}
else if (xn == 2 && yn == 2 && shift < FLINT_BITS && sn <= 3)
{
mp_limb_t x1, x0, y1, y0;
mp_limb_t u3, u2, u1, u0;
x0 = ARF_NOPTR_D(xm)[0];
x1 = ARF_NOPTR_D(xm)[1];
y0 = ARF_NOPTR_D(ym)[0];
y1 = ARF_NOPTR_D(ym)[1];
xtop = x1;
ytop = y1;
nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
u0 = (u0 != 0) || ((u1 << (FLINT_BITS - shift)) != 0);
u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
u3 = (u3 >> shift);
if (sn == 2)
{
serr += (u0 || (u1 != 0));
if (xnegative ^ ynegative)
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
else
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
}
else
{
serr += u0;
if (xnegative ^ ynegative)
sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
else
add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
}
}
#endif
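/* Fast path: x and y have at most two limbs each and the sum at most
   three; all four limb-count combinations are multiplied inline. */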
else if (xn <= 2 && yn <= 2 && sn <= 3)
{
mp_limb_t x1, x0, y1, y0;
mp_limb_t u3, u2, u1, u0;
if (xn == 1 && yn == 1)
{
xtop = ARF_NOPTR_D(xm)[0];
ytop = ARF_NOPTR_D(ym)[0];
umul_ppmm(u3, u2, xtop, ytop);
u1 = u0 = 0;
}
else if (xn == 2 && yn == 2)
{
x0 = ARF_NOPTR_D(xm)[0];
x1 = ARF_NOPTR_D(xm)[1];
y0 = ARF_NOPTR_D(ym)[0];
y1 = ARF_NOPTR_D(ym)[1];
xtop = x1;
ytop = y1;
nn_mul_2x2(u3, u2, u1, u0, x1, x0, y1, y0);
}
else if (xn == 1)
{
x0 = ARF_NOPTR_D(xm)[0];
y0 = ARF_NOPTR_D(ym)[0];
y1 = ARF_NOPTR_D(ym)[1];
xtop = x0;
ytop = y1;
nn_mul_2x1(u3, u2, u1, y1, y0, x0);
u0 = 0;
}
else
{
x0 = ARF_NOPTR_D(xm)[0];
x1 = ARF_NOPTR_D(xm)[1];
y0 = ARF_NOPTR_D(ym)[0];
xtop = x1;
ytop = y0;
nn_mul_2x1(u3, u2, u1, x1, x0, y0);
u0 = 0;
}
if (sn == 2)
{
if (shift < FLINT_BITS)
{
serr += ((u2 << (FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
u3 = (u3 >> shift);
}
else if (shift == FLINT_BITS)
{
serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
u2 = u3;
u3 = 0;
}
else /* FLINT_BITS < shift < 2 * FLINT_BITS */
{
serr += ((u3 << (2 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
u2 = (u3 >> (shift - FLINT_BITS));
u3 = 0;
}
if (xnegative ^ ynegative)
sub_ddmmss(sum[1], sum[0], sum[1], sum[0], u3, u2);
else
add_ssaaaa(sum[1], sum[0], sum[1], sum[0], u3, u2);
}
else if (sn == 3)
{
if (shift < FLINT_BITS)
{
serr += ((u1 << (FLINT_BITS - shift)) != 0) || (u0 != 0);
u1 = (u1 >> shift) | (u2 << (FLINT_BITS - shift));
u2 = (u2 >> shift) | (u3 << (FLINT_BITS - shift));
u3 = (u3 >> shift);
}
else if (shift == FLINT_BITS)
{
serr += (u1 != 0) || (u0 != 0);
u1 = u2;
u2 = u3;
u3 = 0;
}
else if (shift < 2 * FLINT_BITS)
{
serr += ((u2 << (2 * FLINT_BITS - shift)) != 0) || (u1 != 0) || (u0 != 0);
u1 = (u3 << (2 * FLINT_BITS - shift)) | (u2 >> (shift - FLINT_BITS));
u2 = (u3 >> (shift - FLINT_BITS));
u3 = 0;
}
else if (shift == 2 * FLINT_BITS)
{
serr += (u2 != 0) || (u1 != 0) || (u0 != 0);
u1 = u3;
u2 = 0;
u3 = 0;
}
else /* 2 * FLINT_BITS < shift < 3 * FLINT_BITS */
{
serr += ((u3 << (3 * FLINT_BITS - shift)) != 0) || (u2 != 0) || (u1 != 0) || (u0 != 0);
u1 = (u3 >> (shift - 2 * FLINT_BITS));
u2 = 0;
u3 = 0;
}
if (xnegative ^ ynegative)
sub_dddmmmsss2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
else
add_sssaaaaaa2(sum[2], sum[1], sum[0], sum[2], sum[1], sum[0], u3, u2, u1);
}
}
else
{
mp_srcptr xptr, yptr;
xptr = (xn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(xm) : ARF_PTR_D(xm);
yptr = (yn <= ARF_NOPTR_LIMBS) ? ARF_NOPTR_D(ym) : ARF_PTR_D(ym);
xtop = xptr[xn - 1];
ytop = yptr[yn - 1];
_arb_dot_addmul_generic(sum, &serr, tmp, sn, xptr, xn, yptr, yn, xnegative ^ ynegative, shift);
}
xrad = MAG_MAN(xr);
yrad = MAG_MAN(yr);
if (xrad != 0 && yrad != 0)
{
xrexp = MAG_EXP(xr);
yrexp = MAG_EXP(yr);
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
}
else if (xrad != 0)
{
xrexp = MAG_EXP(xr);
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
}
else if (yrad != 0)
{
yrexp = MAG_EXP(yr);
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
}
}
else
{
xrad = MAG_MAN(xr);
yrad = MAG_MAN(yr);
xexp = ARF_EXP(xm);
yexp = ARF_EXP(ym);
xrexp = MAG_EXP(xr);
yrexp = MAG_EXP(yr);
/* (xm+xr)(ym+yr) = xm ym + [xm yr + ym xr + xr yr] */
if (yrad && !arf_is_special(xm))
{
ARF_GET_TOP_LIMB(xtop, xm);
RAD_ADDMUL(srad, srad_exp, (xtop >> (FLINT_BITS - MAG_BITS)) + 1, yrad, xexp + yrexp);
}
if (xrad && !arf_is_special(ym))
{
ARF_GET_TOP_LIMB(ytop, ym);
RAD_ADDMUL(srad, srad_exp, (ytop >> (FLINT_BITS - MAG_BITS)) + 1, xrad, yexp + xrexp);
}
if (xrad && yrad)
{
RAD_ADDMUL(srad, srad_exp, xrad, yrad, xrexp + yrexp);
}
}
}
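/* The sum was accumulated in two's complement; convert to
   sign-magnitude form before rounding the midpoint. */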
xnegative = 0;
if (sum[sn - 1] >= LIMB_TOP)
{
mpn_neg(sum, sum, sn);
xnegative = 1;
}
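/* A zero top limb indicates cancellation; strip leading zero limbs
   and adjust the exponent, or detect that the sum is exactly zero. */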
if (sum[sn - 1] == 0)
{
slong sum_exp2;
mp_size_t sn2;
sn2 = sn;
sum_exp2 = sum_exp;
while (sn2 > 0 && sum[sn2 - 1] == 0)
{
sum_exp2 -= FLINT_BITS;
sn2--;
}
if (sn2 == 0)
{
arf_zero(arb_midref(res));
inexact = 0;
}
else
{
inexact = _arf_set_round_mpn(arb_midref(res), &exp, sum, sn2, xnegative ^ subtract, prec, ARF_RND_DOWN);
_fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp2);
}
}
else
{
if (sn == 2)
inexact = _arf_set_round_uiui(arb_midref(res), &exp, sum[1], sum[0], xnegative ^ subtract, prec, ARF_RND_DOWN);
else
inexact = _arf_set_round_mpn(arb_midref(res), &exp, sum, sn, xnegative ^ subtract, prec, ARF_RND_DOWN);
_fmpz_set_si_small(ARF_EXPREF(arb_midref(res)), exp + sum_exp);
}
/*
Add the three sources of error.
Final rounding error = inexact * 2^(exp + sum_exp - prec)
0 <= inexact <= 1
Arithmetic error = serr * 2^(sum_exp - sn * FLINT_BITS)
0 <= serr <= 3 * n
Propagated error = srad * 2^(srad_exp - MAG_BITS)
0 <= srad <= 3 * n * 2^MAG_BITS
We shift the first two by MAG_BITS so that the magnitudes
become similar and a reasonably accurate addition can be done
by comparing exponents without normalizing first.
*/
add_errors(arb_radref(res),
inexact << MAG_BITS,
exp + sum_exp - prec,
((uint64_t) serr) << MAG_BITS,
sum_exp - sn * FLINT_BITS,
srad,
srad_exp);
ARF_ADD_TMP_FREE(sum, alloc);
}