/* Arithmetic modulo Fermat numbers. Copyright 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012 Alexander Kruppa, Paul Zimmermann This file is part of the ECM Library. The ECM Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. The ECM Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the ECM Library; see the file COPYING.LIB. If not, see http://www.gnu.org/licenses/ or write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. */ #include #include /* for abs if assertions enabled */ #include "ecm-impl.h" #include "ecm-gmp.h" #ifdef HAVE_LIMITS_H # include #else # ifndef UINT_MAX # define UINT_MAX (~(unsigned int) 0) # endif #endif /* #define DEBUG 1 #define CHECKSUM 1 */ #ifndef THREAD_SAFE_GT /* Set the following define to choose which version of code to use: * 0 == old, i.e. original code where variable gt is not thread-safe, * 1 == alpha, makes the variable gt thread-safe with minimal code disruption, * 2 == beta, thread-safe with more disruption to code, but possibly faster. * NOTE: Limited testing shows alpha slightly faster than beta, possibly due * to an additional argument being passed on many calls with beta version. */ #define THREAD_SAFE_GT 1 #endif #if THREAD_SAFE_GT > 0 #include static pthread_key_t gtkey; static pthread_once_t gtkey_once = PTHREAD_ONCE_INIT; #define gt (*gtptr) /* NOTE: Has pervasive effect throughout this file. */ #else static mpz_t gt; static int gt_inited = 0; #endif unsigned int Fermat; #if THREAD_SAFE_GT > 0 static void set_gtkey (void) { (void) pthread_key_create (>key, NULL); } static mpz_t * get_gtptr (void) { (void) pthread_once (>key_once, set_gtkey); mpz_t *res = pthread_getspecific (gtkey); if (!res) /* True:: this thread's first time through here */ { res = calloc (1, sizeof (mpz_t)); (void) pthread_setspecific (gtkey, res); } return res; } #endif #define CACHESIZE 512U /* a' <- a+b, b' <- a-b. */ #if THREAD_SAFE_GT == 2 #define ADDSUB_MOD(a, b) \ mpz_sub (gt, a, b); \ mpz_add (a, a, b); \ F_mod_gt (b, n, gtptr);\ F_mod_1 (a, n); #else #define ADDSUB_MOD(a, b) \ mpz_sub (gt, a, b); \ mpz_add (a, a, b); \ F_mod_gt (b, n); \ F_mod_1 (a, n); #endif __GMP_DECLSPEC mp_limb_t __gmpn_mod_34lsub1 (mp_limb_t*, mp_size_t); /* compute remainder modulo 2^(GMP_LIMB_BITS*3/4)-1 */ #ifndef HAVE___GMPN_MOD_34LSUB1 mp_limb_t __gmpn_mod_34lsub1 (mp_limb_t *src, mp_size_t size) { mp_ptr tp; mp_limb_t r, d; ASSERT(GMP_LIMB_BITS % 4 == 0); tp = malloc (size * sizeof (mp_limb_t)); if (tp == NULL) { fprintf (stderr, "Cannot allocate memory in __gmpn_mod_34lsub1\n"); exit (1); } MPN_COPY (tp, src, size); d = ((mp_limb_t) 1 << (3 * (GMP_LIMB_BITS / 4))) - (mp_limb_t) 1; mpn_divmod_1 (&r, tp, size, d); free (tp); return r; } #endif /* RS -> RS (mod 2^n+1). If input |RS| < 2^(2*n), result |RS| < 2^(n+1) */ static inline void F_mod_1 (mpz_t RS, unsigned int n) { mp_size_t size; mp_limb_t v; int sgn; size = mpz_size (RS); ASSERT_ALWAYS(size <= (mp_size_t) n / GMP_NUMB_BITS + 1); sgn = mpz_sgn (RS); /* Remember original sign */ v = mpz_getlimbn (RS, n / GMP_NUMB_BITS); mpz_tdiv_r_2exp (RS, RS, n); /* Just a truncate. RS < 2^n. Can make RS zero and so change sgn(RS)! */ if (sgn == -1) mpz_add_ui (RS, RS, v); else mpz_sub_ui (RS, RS, v); } /* R = gt (mod 2^n+1) */ static inline void #if THREAD_SAFE_GT == 2 F_mod_gt (mpz_t R, unsigned int n, mpz_t *gtptr) #else F_mod_gt (mpz_t R, unsigned int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif mp_size_t size; mp_limb_t v; size = mpz_size (gt); ASSERT(R != gt); if ((unsigned int) size == n / GMP_NUMB_BITS + 1) { int sgn; sgn = mpz_sgn (gt); v = mpz_getlimbn (gt, n / GMP_NUMB_BITS); mpz_tdiv_r_2exp (gt, gt, n); /* Just a truncate */ if (sgn == -1) mpz_add_ui (R, gt, v); else mpz_sub_ui (R, gt, v); } else if ((unsigned int) size > n / GMP_NUMB_BITS + 1) { mpz_tdiv_q_2exp (R, gt, n); mpz_tdiv_r_2exp (gt, gt, n); /* Just a truncate */ mpz_sub (R, gt, R); } else mpz_set (R, gt); } /* R = S1 * S2 (mod 2^n+1) where n is a power of 2 S1 == S2, S1 == R, S2 == R ok, but none may == gt. Assume n >= GMP_NUMB_BITS, and GMP_NUMB_BITS is a power of two. */ static void #if THREAD_SAFE_GT == 2 F_mulmod (mpz_t R, mpz_t S1, mpz_t S2, unsigned int n, mpz_t *gtptr) #else F_mulmod (mpz_t R, mpz_t S1, mpz_t S2, unsigned int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif int n2 = n / GMP_NUMB_BITS; /* type of _mp_size is int */ F_mod_1 (S1, n); F_mod_1 (S2, n); ASSERT(mpz_size (S1) <= (unsigned) n2); ASSERT(mpz_size (S2) <= (unsigned) n2); if (n >= 32768) { unsigned long k; _mpz_realloc (gt, n2 + 1); /* in case the reallocation fails, _mpz_realloc sets the value to 0 */ ASSERT_ALWAYS (mpz_cmp_ui (gt, 0) != 0); k = mpn_fft_best_k (n2, S1 == S2); /* the following cannot be changed to use mpn_mulmod_bnm1 since we are precisely multiplying modulo a Fermat number */ mpn_mul_fft (PTR(gt), n2, PTR(S1), ABSIZ(S1), PTR(S2), ABSIZ(S2), k); MPN_NORMALIZE(PTR(gt), n2); SIZ(gt) = ((SIZ(S1) ^ SIZ(S2)) >= 0) ? n2 : -n2; #if THREAD_SAFE_GT == 2 F_mod_gt (R, n, gtptr); #else F_mod_gt (R, n); #endif return; } mpz_mul (gt, S1, S2); #if THREAD_SAFE_GT == 2 F_mod_gt (R, n, gtptr); #else F_mod_gt (R, n); #endif return; } /* R = S + sgn(S)*(2^e) */ static void mpz_absadd_2exp (mpz_t RS, unsigned int e) { mp_size_t siz, limb_idx, bit_idx; mp_limb_t cy; int sgn; limb_idx = e / GMP_NUMB_BITS; bit_idx = e % GMP_NUMB_BITS; siz = mpz_size (RS); sgn = (mpz_sgn (RS) >= 0) ? 1 : -1; if (limb_idx >= RS->_mp_alloc) /* WARNING: mpz_realloc2 does not keep the value!!! */ mpz_realloc2 (RS, (limb_idx + 1) * GMP_NUMB_BITS); /* Now RS->_mp_alloc > limb_idx) */ while (siz <= limb_idx) { RS->_mp_d[siz++] = 0; RS->_mp_size += sgn; } /* Now RS->_mp_alloc >= siz > limb_idx */ cy = mpn_add_1 (RS->_mp_d + limb_idx, RS->_mp_d + limb_idx, siz - limb_idx, ((mp_limb_t)1) << bit_idx); if (cy) { if (RS->_mp_alloc <= siz) /* WARNING: mpz_realloc2 does not keep the value!!! */ mpz_realloc2 (RS, (siz + 1) * GMP_NUMB_BITS); RS->_mp_d[siz] = 1; RS->_mp_size += sgn; } } /* R = S / 2 (mod 2^n + 1). S == gt is ok */ static void F_divby2 (mpz_t R, mpz_t S, unsigned int n) { int odd, sgn; odd = mpz_odd_p (S); sgn = mpz_sgn (S); mpz_tdiv_q_2exp (R, S, 1); if (odd) { /* We shifted out a set bit at the bottom. With negative wrap-around, that becomes -2^(n-1), so we add -2^(n-1) + 2^n+1 = 2^(n-1)+1. If |S| < 2^(n+1), |R| < 2^n + 2^(n-1) + 1 < 2^(n+1) for n > 1. */ mpz_absadd_2exp (R, n - 1); if (sgn < 0) mpz_sub_ui (R, R, 1); else mpz_add_ui (R, R, 1); } } /* RS = RS / 3 (mod 2^n + 1). RS == gt is ok */ static void F_divby3_1 (mpz_t RS, unsigned int n) { /* 2^2^m == 1 (mod 3) for m>0, thus F_m == 2 (mod 3) */ int mod, sgn; sgn = mpz_sgn (RS); mod = __gmpn_mod_34lsub1 (RS->_mp_d, mpz_size (RS)) % 3; if (mod == 1) { /* Add F_m. If |RS| < 2^(n+1), |RS|+F_m < 3*2^n+1 */ mpz_absadd_2exp (RS, n); if (sgn >= 0) mpz_add_ui (RS, RS, 1); else mpz_sub_ui (RS, RS, 1); } else if (mod == 2) { /* Add 2 * F_m. If |RS| < 2^(n+1), |RS|+2*F_m < 4*2^n+2 */ mpz_absadd_2exp (RS, n + 1); if (sgn >= 0) mpz_add_ui (RS, RS, 2); else mpz_sub_ui (RS, RS, 2); } mpz_divby3_1op (RS); /* |RS| < (4*2^n+2)/3 < 2^(n+1) */ } static void F_divby5_1 (mpz_t RS, unsigned int n) { /* 2^2^m == 1 (mod 5) for m>1, thus F_m == 2 (mod 5) */ int mod, sgn; sgn = mpz_sgn (RS); mod = __gmpn_mod_34lsub1 (RS->_mp_d, mpz_size (RS)) % 5; if (mod == 1) { /* Add 2 * F_m == 4 (mod 5) */ mpz_absadd_2exp (RS, n + 1); if (sgn == 1) mpz_add_ui (RS, RS, 2); else mpz_sub_ui (RS, RS, 2); } else if (mod == 2) { /* Add 4 * F_m == 3 (mod 5) */ mpz_absadd_2exp (RS, n + 2); if (sgn == 1) mpz_add_ui (RS, RS, 4); else mpz_sub_ui (RS, RS, 4); } else if (mod == 3) { /* Add F_m == 3 (mod 5) */ mpz_absadd_2exp (RS, n); if (sgn == 1) mpz_add_ui (RS, RS, 1); else mpz_sub_ui (RS, RS, 1); } else if (mod == 4) { /* Add 3 * F_m == 1 (mod 5) */ mpz_absadd_2exp (RS, n); mpz_absadd_2exp (RS, n + 1); if (sgn == 1) mpz_add_ui (RS, RS, 3); else mpz_sub_ui (RS, RS, 3); } ASSERT(mpz_divisible_ui_p (RS, 5)); mpz_divexact_ui (RS, RS, 5); } /* A 2^(m+2) length convolution is possible: (2^(3n/4) - 2^(n/4))^2 == 2 (mod 2^n+1) so we have an element of order 2^(m+2) of simple enough form to use it as a root of unity the transform */ /* Multiply by sqrt(2)^e (mod F_m). n = 2^m */ /* R = (S * sqrt(2)^e) % (2^n+1) */ /* R == S is ok, but neither must be == gt */ /* Assumes 0 < e < 4*n, and e <> 2*n */ static void #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (mpz_t R, mpz_t S, int e, unsigned int n, mpz_t *gtptr) #else F_mul_sqrt2exp (mpz_t R, mpz_t S, int e, unsigned int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif int chgsgn = 0, odd; ASSERT(S != gt); ASSERT(R != gt); ASSERT(0 < e && (unsigned int) e < 4 * n && (unsigned int) e != 2 * n); /* 0 < e < 4*n */ if ((unsigned) e > 2 * n) /* sqrt(2)^(2*n) == -1 (mod F_m), so */ { e -= 2 * n; /* sqrt(2)^e == -sqrt(2)^(e-2*n) (mod F_m) */ chgsgn = 1; } /* Now e < 2*n */ odd = e & 1; e >>= 1; if (odd) { /* Multiply by sqrt(2) == 2^(3n/4) - 2^(n/4) */ /* S * (2^(3n/4) - 2^(n/4)) == 2^(n/4) * (S*2^(n/2) - S) */ mpz_mul_2exp (gt, S, n / 2); mpz_sub (gt, gt, S); mpz_tdiv_q_2exp (R, gt, n / 4 * 3); mpz_tdiv_r_2exp (gt, gt, n / 4 * 3); mpz_mul_2exp (gt, gt, n / 4); mpz_sub (R, gt, R); if (e != 0) { mpz_tdiv_q_2exp (gt, R, n-e); mpz_tdiv_r_2exp (R, R, n-e); mpz_mul_2exp (R, R, e); mpz_sub (R, R, gt); } } else /* necessarily e <> 0 */ { ASSERT (e != 0); /* S = a*2^(n-e) + b, b < 2^(n-e) */ /* S*2^e = a*2^n + b*2^e = b*2^e - a */ /* b*2^e < 2^(n-e)*2^e = 2^n */ mpz_tdiv_q_2exp (gt, S, n - e); /* upper e bits (=a) into gt */ mpz_tdiv_r_2exp (R, S, n - e); /* lower n-e bits (=b) into R */ /* This is simply a truncate if S == R */ mpz_mul_2exp (R, R, e); /* R < 2^n */ mpz_sub (R, R, gt); } if (chgsgn) mpz_neg (R, R); } /* Same, but input may be gt. Input and output must not be identical. Currently this routine is always called with e=n, with n a power of 2, thus we assume e is even. Moreover we assume 0 < e < 2n. */ static void #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp_2 (mpz_t R, mpz_t S, int e, unsigned int n, mpz_t *gtptr) #else F_mul_sqrt2exp_2 (mpz_t R, mpz_t S, int e, unsigned int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif ASSERT (S != R); ASSERT (R != gt); ASSERT (0 < e && (unsigned) e < 2 * n); ASSERT ((e & 1) == 0); e >>= 1; mpz_tdiv_q_2exp (R, S, n - e); /* upper e bits into R */ mpz_tdiv_r_2exp (gt, S, n - e); /* lower n-e bits into gt */ mpz_mul_2exp (gt, gt, e); mpz_sub (R, gt, R); } #define A0s A[0] #define A1s A[l << stride2] #define A2s A[2 * l << stride2] #define A3s A[3 * l << stride2] #define A0is A[i << stride2] #define A1is A[(i + l) << stride2] #define A2is A[(i + 2 * l) << stride2] #define A3is A[(i + 3 * l) << stride2] /* Decimation-in-frequency FFT. Unscrambled input, scrambled output. Elements are (mod 2^n+1), l and n must be powers of 2, l must be <= 4*n. Performs forward transform. Assumes l > 1. */ static void #if THREAD_SAFE_GT == 2 F_fft_dif (mpz_t *A, int l, int stride2, int n, mpz_t *gtptr) #else F_fft_dif (mpz_t *A, int l, int stride2, int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif int i, omega = (4 * n) / l, iomega; ASSERT (l > 1); ASSERT((4 * n) % l == 0); if (l == 2) { ADDSUB_MOD(A[0], A[1< 1) { #if THREAD_SAFE_GT == 2 F_fft_dif (A, l, stride2, n, gtptr); F_fft_dif (A + (l << stride2), l, stride2, n, gtptr); F_fft_dif (A + (2 * l << stride2), l, stride2, n, gtptr); F_fft_dif (A + (3 * l << stride2), l, stride2, n, gtptr); #else F_fft_dif (A, l, stride2, n); F_fft_dif (A + (l << stride2), l, stride2, n); F_fft_dif (A + (2 * l << stride2), l, stride2, n); F_fft_dif (A + (3 * l << stride2), l, stride2, n); #endif } } /* Decimation-in-time inverse FFT. Scrambled input, unscrambled output. Does not perform divide-by-length. l, and n as in F_fft_dif(). Assume l > 1. */ static void #if THREAD_SAFE_GT == 2 F_fft_dit (mpz_t *A, int l, int stride2, int n, mpz_t *gtptr) #else F_fft_dit (mpz_t *A, int l, int stride2, int n) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif int i, omega = (4 * n) / l, iomega; ASSERT (l > 1); ASSERT((4 * n) % l == 0); if (l == 2) { ADDSUB_MOD(A[0], A[1< 1) { #if THREAD_SAFE_GT == 2 F_fft_dit (A, l, stride2, n, gtptr); F_fft_dit (A + (l << stride2), l, stride2, n, gtptr); F_fft_dit (A + (2 * l << stride2), l, stride2, n, gtptr); F_fft_dit (A + (3 * l << stride2), l, stride2, n, gtptr); #else F_fft_dit (A, l, stride2, n); F_fft_dit (A + (l << stride2), l, stride2, n); F_fft_dit (A + (2 * l << stride2), l, stride2, n); F_fft_dit (A + (3 * l << stride2), l, stride2, n); #endif } mpz_sub (gt, A3s, A1s); /* gt = -(a1 - a3) */ mpz_add (A1s, A1s, A3s); /* A1 = a1 + a3 */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp_2 (A3s, gt, n, n, gtptr); /* A3 = i * -(a1 - a3) */ #else F_mul_sqrt2exp_2 (A3s, gt, n, n); /* A3 = i * -(a1 - a3) */ #endif mpz_sub (gt, A[0], A2s); /* gt = a0 - a2 */ mpz_add (A[0], A[0], A2s); /* A0 = a0 + a2 */ mpz_sub (A2s, A[0], A1s); /* A2 = a0 - a1 + a2 - a3 */ mpz_add (A[0], A[0], A1s); /* A0 = a0 + a1 + a2 + a3 */ mpz_add (A1s, gt, A3s); /* A1 = a0 - a2 + i * -(a1 - a3) */ mpz_sub (A3s, gt, A3s); /* A3 = a0 - a2 - i * -(a1 - a3) */ for (i = 1, iomega = omega; i < l; i++, iomega += omega) { /* Divide by omega^i. Since sqrt(2)^(4*n) == 1 (mod 2^n+1), this is like multiplying by omega^(4*n-i) */ /* iomega goes from 4n/l to n-4n/l (with original l) thus 3n < 4*n-iomega < 4n */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (A1is, A1is, 4 * n - iomega, n, gtptr); #else F_mul_sqrt2exp (A1is, A1is, 4 * n - iomega, n); #endif /* 2n < 4*n-2*iomega < 4n */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (A2is, A2is, 4 * n - 2 * iomega, n, gtptr); #else F_mul_sqrt2exp (A2is, A2is, 4 * n - 2 * iomega, n); #endif /* n < 4*n-3*iomega < 4n, and 4*n-3*iomega cannot equal 2n since n is a power of 2 and 3*iomega is divisible by 3 */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (A3is, A3is, 4 * n - 3 * iomega, n, gtptr); #else F_mul_sqrt2exp (A3is, A3is, 4 * n - 3 * iomega, n); #endif mpz_sub (gt, A3is, A1is); mpz_add (A1is, A1is, A3is); #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp_2 (A3is, gt, n, n, gtptr); #else F_mul_sqrt2exp_2 (A3is, gt, n, n); #endif mpz_sub (gt, A0is, A2is); mpz_add (A0is, A0is, A2is); mpz_sub (A2is, A0is, A1is); mpz_add (A0is, A0is, A1is); mpz_add (A1is, gt, A3is); mpz_sub (A3is, gt, A3is); F_mod_1 (A0is, n); F_mod_1 (A1is, n); F_mod_1 (A2is, n); F_mod_1 (A3is, n); } } #define A0 A[i] #define A1 A[l+i] #define A2 A[2*l+i] #define A3 A[3*l+i] #define B0 B[i] #define B1 B[l+i] #define B2 B[2*l+i] #define B3 B[3*l+i] #define C0 C[i] #define C1 C[l+i] #define C2 C[2*l+i] #define C3 C[3*l+i] #define C4 C[4*l+i] #define C5 C[5*l+i] #define C6 C[6*l+i] #define C7 C[7*l+i] #define t0 t[i] #define t1 t[l+i] #define t2 t[2*l+i] #define t3 t[3*l+i] #define t4 t[4*l+i] #define t5 t[5*l+i] /* Assume A <> B. There was some code for squaring (A=B) in revision <= 2788. */ static unsigned int F_toomcook4 (mpz_t *C, mpz_t *A, mpz_t *B, unsigned int len, unsigned int n, #if THREAD_SAFE_GT == 2 mpz_t *t, mpz_t *gtptr) #else mpz_t *t) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif unsigned int l, i, r; ASSERT(A != B); ASSERT(len % 4 == 0); l = len / 4; for (i = 0; i < l; i++) { /*** Evaluate A(2), A(-2), 8*A(1/2) ***/ mpz_mul_2exp (t0, A0, 1); mpz_add (t0, t0, A1); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A2); mpz_mul_2exp (t0, t0, 1); mpz_add (t0, t0, A3); /* t[0 .. l-1] = 8*A(1/2) < 15*N */ F_mod_1 (t0, n); mpz_mul_2exp (t2, A3, 2); mpz_add (t2, t2, A1); mpz_mul_2exp (t2, t2, 1); /* t[2l .. 3l-1] = 8*A_3 + 2*A_1 */ mpz_mul_2exp (gt, A2, 2); mpz_add (gt, gt, A0); /* gt = 4*A_2 + A0 */ mpz_sub (t4, gt, t2); /* t[4l .. 5l-1] = A(-2) */ mpz_add (t2, t2, gt); /* t[2l .. 3l-1] = A(2) */ F_mod_1 (t4, n); F_mod_1 (t2, n); /*** Evaluate B(2), B(-2), 8*B(1/2) ***/ mpz_mul_2exp (t1, B0, 1); mpz_add (t1, t1, B1); mpz_mul_2exp (t1, t1, 1); mpz_add (t1, t1, B2); mpz_mul_2exp (t1, t1, 1); mpz_add (t1, t1, B3); /* t[l .. 2l-1] = 8*B(1/2) */ F_mod_1 (t1, n); mpz_mul_2exp (t3, B3, 2); mpz_add (t3, t3, B1); mpz_mul_2exp (t3, t3, 1); /* t[3l .. 4l-1] = 8*B_3 + 2*B_1 */ mpz_mul_2exp (gt, B2, 2); mpz_add (gt, gt, B0); /* gt = 4*B_2 + B0 */ mpz_sub (t5, gt, t3); /* t[5l .. 6l-1] = B(-2) */ mpz_add (t3, t3, gt); /* t[3l .. 4l-1] = B(2) */ F_mod_1 (t5, n); F_mod_1 (t3, n); /* Evaluate A(1), A(-1) */ mpz_add (C2, A0, A2); /* May overwrite A2 */ #undef A2 mpz_add (gt, A1, A3); mpz_set (C1, B0); /* C1 = B(0) May overwrite A1 */ #undef A1 mpz_sub (C4, C2, gt); /* C4 = A(-1). May overwrite B0 */ #undef B0 mpz_add (C2, C2, gt); /* C2 = A(1) < 4*N */ F_mod_1 (C2, n); F_mod_1 (C4, n); /* Evaluate B(1), B(-1) */ mpz_add (gt, C1, B2); /* B0 is in C1 */ mpz_set (C6, A3); /* C6 = A(inf) May overwrite B2 */ #undef B2 mpz_add (C3, B1, B3); /* May overwrite A3 */ #undef A3 mpz_sub (C5, gt, C3); /* C5 = B(-1). May overwrite B1 */ #undef B1 mpz_add (C3, gt, C3); /* C3 = B(1) */ F_mod_1 (C3, n); F_mod_1 (C5, n); } /* A0 A1 A2 A3 B0 B1 B2 B3 */ /* A0 B0 A(1) B(1) A(-1) B(-1) A3 B3 */ /* C0 C1 C2 C3 C4 C5 C6 C7 */ r = F_mul (t, t, t + l, l, DEFAULT, n, t + 6 * l); /* t0 = 8*A(1/2) * 8*B(1/2) = 64*C(1/2) */ r += F_mul (t + 2 * l, t + 2 * l, t + 3 * l, l, DEFAULT, n, t + 6 * l); /* t2 = A(2) * B(2) = C(2) */ r += F_mul (t + 4 * l, t + 4 * l, t + 5 * l, l, DEFAULT, n, t + 6 * l); /* t4 = A(-2) * B(-2) = C(-2) */ r += F_mul (C, A, C + l, l, DEFAULT, n, t + 6 * l); /* C0 = A(0)*B(0) = C(0) */ r += F_mul (C + 2 * l, C + 2 * l, C + 3 * l, l, DEFAULT, n, t + 6 * l); /* C2 = A(1)*B(1) = C(1) */ r += F_mul (C + 4 * l, C + 4 * l, C + 5 * l, l, DEFAULT, n, t + 6 * l); /* C4 = A(-1)*B(-1) = C(-1) */ r += F_mul (C + 6 * l, C + 6 * l, B + 3 * l, l, DEFAULT, n, t + 6 * l); /* C6 = A(inf)*B(inf) = C(inf) */ /* C(0) C(1) C(-1) C(inf) 64*C(1/2) C(2) C(-2) */ /* C0,C1 C2,C3 C4,C5 C6,C7 t0,t1 t2,t3 t4,t5 */ for (i = 0; i < 2 * l - 1; i++) { mpz_add (t0, t0, t2); /* t0 = 65 34 20 16 20 34 65 */ mpz_sub (gt, C2, C4); /* gt = 2*C_odd(1) = 0 2 0 2 0 2 0 */ mpz_add (C2, C2, C4); /* C2 = 2*C_even(1) = 2 0 2 0 2 0 2 */ F_divby2 (C2, C2, n); /* C2 = C_even(1) */ mpz_add (C4, t2, t4); /* C4 = 2*C_even(2) */ F_divby2 (C4, C4, n); /* C4 = C_even(2) */ mpz_sub (t4, t2, t4); /* t4 = 2*C_odd(2) */ F_divby2 (t4, t4, n); F_divby2 (t4, t4, n); /* t4 = C_odd(2)/2 = C_1 + 4*C_3 + 16*C_5 */ F_divby2 (t2, gt, n); /* t2 = C_odd(1) */ mpz_sub (t0, t0, gt); /* t0 = 65 32 20 14 20 32 65 */ mpz_mul_2exp (gt, gt, 4); mpz_sub (t0, t0, gt); /* t0 = 65 0 20 -18 20 0 65 */ mpz_add (gt, C0, C6); /* gt = C_0 + C_6 */ mpz_sub (C2, C2, gt); /* C2 = C_2 + C_4 */ mpz_sub (t0, t0, gt); /* t0 = 64 0 20 -18 20 0 64 */ mpz_mul_2exp (gt, gt, 5); /* gt = 32*C_0 + 32*C_6 */ F_divby2 (t0, t0, n); /* t0 = 32 0 10 -9 10 0 32 */ mpz_sub (t0, t0, gt); /* t0 = 0 0 10 -9 10 0 0 */ mpz_sub (t0, t0, C2); /* t0 = 0 0 9 -9 9 0 0 */ F_divby3_1 (t0, n); F_divby3_1 (t0, n); /* t0 = 0 0 1 -1 1 0 0 */ mpz_sub (t0, C2, t0); /* t0 = C_3 */ mpz_sub (t2, t2, t0); /* t2 = C_1 + C_5 */ mpz_mul_2exp (gt, t0, 2); /* gt = 4*C_3 */ mpz_sub (t4, t4, gt); /* t4 = C_1 + 16*C_5 */ mpz_sub (t4, t4, t2); /* t4 = 15*C_5 */ F_divby3_1 (t4, n); F_divby5_1 (t4, n); /* t4 = C_5 */ mpz_sub (t2, t2, t4); /* t2 = C_1 */ mpz_sub (C4, C4, C0); /* C4 = 4*C_2 + 16*C_4 + 64*C_6 */ F_divby2 (C4, C4, n); F_divby2 (C4, C4, n); /* C4 = C_2 + 4*C_4 + 16*C_6 */ mpz_mul_2exp (gt, C6, 4); mpz_sub (C4, C4, gt); /* C4 = C_2 + 4*C_4 */ mpz_sub (C4, C4, C2); /* C4 = 3*C_4 */ F_divby3_1 (C4, n); /* C4 = C_4 */ mpz_sub (C2, C2, C4); /* C2 = C_2 */ } for (i = 0; i < l - 1; i++) { mpz_add (C1, C1, t2); F_mod_1 (C1, n); } mpz_set (C1, t2); F_mod_1 (C1, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C1, C1, t2); F_mod_1 (C1, n); } for (i = 0; i < l - 1; i++) { mpz_add (C3, C3, t0); F_mod_1 (C3, n); } mpz_set (C3, t0); F_mod_1 (C3, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C3, C3, t0); F_mod_1 (C3, n); } for (i = 0; i < l - 1; i++) { mpz_add (C5, C5, t4); F_mod_1 (C5, n); } mpz_set (C5, t4); F_mod_1 (C5, n); for (i = l; i < 2 * l - 1; i++) { mpz_add (C5, C5, t4); F_mod_1 (C5, n); } return r; } /* Karatsuba split. Calls F_mul() to multiply the three pieces. Assume A <> B (there was code for squaring in revision <= 2788. */ static unsigned int F_karatsuba (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int len, unsigned int n, #if THREAD_SAFE_GT == 2 mpz_t *t, mpz_t *gtptr) #else mpz_t *t) #endif { #if THREAD_SAFE_GT == 1 mpz_t *gtptr = pthread_getspecific (gtkey); #endif unsigned int i, r; ASSERT(len % 2 == 0); len /= 2; for (i = 0; i < len; i++) { mpz_add (t[i], A[i], A[i + len]); /* t0 = A0 + A1 */ mpz_add (t[i + len], B[i], B[i + len]); /* t1 = B0 + B1 */ } r = F_mul (t, t, t + len, len, DEFAULT, n, t + 2 * len); /* t[0...2*len-1] = (A0+A1) * (B0+B1) = A0*B0 + A0*B1 + A1*B0 + A1*B1 */ if (R != A) { r += F_mul (R, A, B, len, DEFAULT, n, t + 2 * len); /* R[0...2*len-1] = A0 * B0 */ r += F_mul (R + 2 * len, A + len, B + len, len, DEFAULT, n, t + 2 * len); /* R[2*len...4*len-1] = A1 * B1, may overwrite B */ } else if (R + 2 * len != B) { r += F_mul (R + 2 * len, A + len, B + len, len, DEFAULT, n, t + 2 * len); /* R[2*len...4*len-1] = A1 * B1 */ r += F_mul (R, A, B, len, DEFAULT, n, t + 2 * len); /* R[0...2*len-1] = A0 * B0, overwrites A */ } else /* R == A && R + 2*len == B */ { for (i = 0; i < len; i++) { /* mpz_swap instead? Perhaps undo later? Or interface for F_mul to specify separate result arrays for high/low half? */ mpz_set (gt, A[len + i]); /* Swap A1 and B0 */ mpz_set (A[len + i], B[i]); mpz_set (B[i], gt); } r += F_mul (R, R, R + len, len, DEFAULT, n, t + 2 * len); /* R[0...2*len-1] = A0 * B0, overwrites A */ r += F_mul (R + 2 * len, R + 2 * len, R + 3 * len, len, DEFAULT, n, t + 2 * len); /* R[2*len...4*len-1] = A1 * B1, overwrites B */ } /* R[0...2*len-2] == A0*B0, R[2*len-1] == 0 */ /* R[2*len...3*len-2] == A1*B1, R[4*len-1] == 0 */ /* t[0...2*len-2] == (A0+A1)*(B0+B1), t[2*len-1] == 0 */ /* We're doing indices i and i+len in one loop on the assumption that 6 residues will probably fit into cache. After all, Karatsuba is only called for smallish F_m. This way, the final add R[i+len] += t[i] can be done inside the same loop and we need only one pass over main memory. */ for (i = 0; i < len - 1; i++) { mpz_sub (t[i], t[i], R[i]); /* t = A0*B1 + A1*B0 + A1*B1 */ mpz_sub (t[i], t[i], R[i + 2 * len]); /* t = A0*B1 + A1*B0 */ mpz_sub (t[i + len], t[i + len], R[i + len]); mpz_sub (t[i + len], t[i + len ], R[i + 3 * len]); mpz_add (R[i + len], R[i + len], t[i]); mpz_add (R[i + 2 * len], R[i + 2 * len], t[i + len]); } mpz_sub (t[len - 1], t[len - 1], R[len - 1]); mpz_sub (R[2 * len - 1], t[len - 1], R[3 * len - 1]); return r; } /* Multiply two polynomials with coefficients modulo 2^(2^m)+1. len is length (=degree+1) of polynomials and must be a power of 2. n=2^m Return value: number of multiplies performed, or UINT_MAX in case of error. */ unsigned int F_mul (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int len, int parameter, unsigned int n, mpz_t *t) { unsigned int i, r=0; unsigned int transformlen = (parameter == NOPAD) ? len : 2 * len; #ifdef CHECKSUM mpz_t chksum1, chksum_1, chksum0, chksuminf; #endif /* Handle trivial cases */ if (len == 0) return 0; #if THREAD_SAFE_GT > 0 mpz_t *gtptr = get_gtptr (); if (ALLOC (gt) == 0) /* True:: this thread F_clear'd gt or just calloc'd gt */ mpz_init2 (gt, 2 * n); #else if (!gt_inited) { mpz_init2 (gt, 2 * n); gt_inited = 1; } #endif if (len == 1) { if (parameter == MONIC) { /* (x + a0)(x + b0) = x^2 + (a0 + b0)x + a0*b0 */ mpz_add (gt, A[0], B[0]); #if THREAD_SAFE_GT == 2 F_mod_gt (t[0], n, gtptr); F_mulmod (R[0], A[0], B[0], n, gtptr); /* May overwrite A[0] */ #else F_mod_gt (t[0], n); F_mulmod (R[0], A[0], B[0], n); /* May overwrite A[0] */ #endif mpz_set (R[1], t[0]); /* May overwrite B[0] */ /* We don't store the leading 1 monomial in the result poly */ } else { #if THREAD_SAFE_GT == 2 F_mulmod (R[0], A[0], B[0], n, gtptr); /* May overwrite A[0] */ #else F_mulmod (R[0], A[0], B[0], n); /* May overwrite A[0] */ #endif mpz_set_ui (R[1], 0); /* May overwrite B[0] */ } return 1; } #ifdef CHECKSUM mpz_init2 (chksum1, n+64); mpz_init2 (chksum_1, n+64); mpz_init2 (chksum0, n+64); mpz_init2 (chksuminf, n+64); mpz_set_ui (gt, 0); for (i = 0; i < len; i++) { /* Compute A(1) and B(1) */ mpz_add (chksum1, chksum1, A[i]); mpz_add (gt, gt, B[i]); /* Compute A(-1) and B(-1) */ if (i % 2 == 0) { mpz_add (chksum_1, chksum_1, A[i]); mpz_add (chksum0, chksum0, B[i]); /* chksum0 used temporarily here */ } else { mpz_sub (chksum_1, chksum_1, A[i]); mpz_sub (chksum0, chksum0, B[i]); } } if (parameter == MONIC) { mpz_add_ui (chksum1, chksum1, 1); mpz_add_ui (gt, gt, 1); mpz_add_ui (chksum_1, chksum_1, 1); mpz_add_ui (chksum0, chksum0, 1); } mpz_mul (gt, gt, chksum1); #if THREAD_SAFE_GT == 2 F_mod_gt (chksum1, n, gtptr); #else F_mod_gt (chksum1, n); #endif mpz_mul (gt, chksum0, chksum_1); #if THREAD_SAFE_GT == 2 F_mod_gt (chksum_1, n, gtptr); #else F_mod_gt (chksum_1, n); #endif /* Compute A(0) * B(0) */ mpz_mul (gt, A[0], B[0]); #if THREAD_SAFE_GT == 2 F_mod_gt (chksum0, n, gtptr); #else F_mod_gt (chksum0, n); #endif /* Compute A(inf) * B(inf) */ mpz_mul (gt, A[len - 1], B[len - 1]); #if THREAD_SAFE_GT == 2 F_mod_gt (chksuminf, n, gtptr); #else F_mod_gt (chksuminf, n); #endif if (parameter == MONIC) { mpz_add (chksuminf, chksuminf, A[len - 2]); mpz_add (chksuminf, chksuminf, B[len - 2]); } r += 4; #endif /* CHECKSUM */ /* Don't do FFT if len <= 4 (Karatsuba or Toom-Cook are faster) unless we do a transform without zero padding, or if transformlen > 4*n (no suitable primitive roots of 1) */ if ((len > 4 || parameter == NOPAD) && transformlen <= 4 * n) { unsigned int len2; /* len2 = log_2(transformlen). Assumes transformlen > 0 */ for (i = transformlen, len2 = 0; (i&1) == 0; i >>= 1, len2++); if (i != 1) { outputf (OUTPUT_ERROR, "F_mul: polynomial length must be power of 2, " "but is %d\n", len); return UINT_MAX; } /* Are we performing a squaring or multiplication? */ if (A != B) { /* So it's a multiplication */ /* Put transform of B into t */ for (i = 0; i < len; i++) mpz_set (t[i], B[i]); if (parameter == MONIC) mpz_set_ui (t[i++], 1); for (; i < transformlen; i++) mpz_set_ui (t[i], 0); #if THREAD_SAFE_GT == 2 F_fft_dif (t, transformlen, 0, n, gtptr); #else F_fft_dif (t, transformlen, 0, n); #endif } else t = R; /* Do squaring */ /* Put A into R */ for (i = 0; i < len; i++) mpz_set (R[i], A[i]); if (parameter == MONIC) mpz_set_ui (R[i++], 1); /* May overwrite B[0] */ for (; i < transformlen; i++) mpz_set_ui (R[i], 0); /* May overwrite B[i - len] */ #if THREAD_SAFE_GT == 2 F_fft_dif (R, transformlen, 0, n, gtptr); #else F_fft_dif (R, transformlen, 0, n); #endif for (i = 0; i < transformlen; i++) { #if THREAD_SAFE_GT == 2 F_mulmod (R[i], R[i], t[i], n, gtptr); #else F_mulmod (R[i], R[i], t[i], n); #endif /* Do the div-by-length. Transform length was transformlen, len2 = log_2 (transformlen), so divide by 2^(len2) = sqrt(2)^(2*len2) */ /* since transformlen = 2^len2 <= 4*n then for n >= 8 we have 2*len2 <= 2*log2(4*n) < 2n */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (R[i], R[i], 4 * n - 2 * len2, n, gtptr); #else F_mul_sqrt2exp (R[i], R[i], 4 * n - 2 * len2, n); #endif } r += transformlen; #if THREAD_SAFE_GT == 2 F_fft_dit (R, transformlen, 0, n, gtptr); #else F_fft_dit (R, transformlen, 0, n); #endif if (parameter == MONIC) mpz_sub_ui (R[0], R[0], 1); } else { /* Karatsuba or Toom-Cook split */ if (parameter == NOPAD) { outputf (OUTPUT_ERROR, "F_mul: cyclic/short products not supported " "by Karatsuba/Toom-Cook\n"); return UINT_MAX; } if (len / n == 4 || len == 2) #if THREAD_SAFE_GT == 2 r += F_karatsuba (R, A, B, len, n, t, gtptr); #else r += F_karatsuba (R, A, B, len, n, t); #endif else #if THREAD_SAFE_GT == 2 r += F_toomcook4 (R, A, B, len, n, t, gtptr); #else r += F_toomcook4 (R, A, B, len, n, t); #endif if (parameter == MONIC) /* Handle the leading monomial the hard way */ { /* This only works if A, B and R do not overlap */ if (A == R || B == R + len) { outputf (OUTPUT_ERROR, "F_mul: monic polynomials with Karatsuba/" "Toom-Cook and overlapping input/output not supported\n"); return UINT_MAX; } for (i = 0; i < len; i++) { mpz_add (R[i + len], R[i + len], A[i]); mpz_add (R[i + len], R[i + len], B[i]); F_mod_1 (R[i + len], n); } } } #ifdef DEBUG if (parameter != MONIC && parameter != NOPAD) { F_mod_1 (R[transformlen - 1], n); if (mpz_sgn (R[transformlen - 1]) != 0) outputf (OUTPUT_ALWAYS, "F_mul, len %d: R[%d] == %Zd != 0\n", len, transformlen - 1, R[transformlen - 1]); } #endif #ifdef CHECKSUM /* Compute R(1) = (A*B)(1) and subtract from chksum1 */ for (i = 0; i < transformlen; i++) mpz_sub (chksum1, chksum1, R[i]); if (parameter == MONIC) mpz_sub_ui (chksum1, chksum1, 1); while (mpz_sizeinbase (chksum1, 2) > n) F_mod_1 (chksum1, n); if (mpz_sgn (chksum1) != 0) outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(1)*B(1) != R(1), difference %Zd\n", len, chksum1); /* Compute R(-1) = (A*B)(-1) and subtract from chksum_1 */ for (i = 0; i < transformlen; i++) if (i % 2 == 0) mpz_sub (chksum_1, chksum_1, R[i]); else mpz_add (chksum_1, chksum_1, R[i]); if (parameter == MONIC) mpz_sub_ui (chksum_1, chksum_1, 1); while (mpz_sizeinbase (chksum_1, 2) > n) F_mod_1 (chksum_1, n); if (mpz_sgn (chksum_1) != 0) outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(-1)*B(-1) != R(-1), difference %Zd\n", len, chksum_1); if (parameter != NOPAD) { mpz_sub (chksum0, chksum0, R[0]); while (mpz_sizeinbase (chksum0, 2) > n) F_mod_1 (chksum0, n); if (mpz_sgn (chksum0) != 0) outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(0)*B(0) != R(0), difference %Zd\n", len, chksum0); mpz_sub (chksuminf, chksuminf, R[transformlen - 2]); while (mpz_sizeinbase (chksuminf, 2) > n) F_mod_1 (chksuminf, n); if (mpz_sgn (chksuminf) != 0) outputf (OUTPUT_ALWAYS, "F_mul, len %d: A(inf)*B(inf) != R(inf), difference %Zd\n", len, chksuminf); } mpz_clear (chksum1); mpz_clear (chksum_1); mpz_clear (chksum0); mpz_clear (chksuminf); #endif /* CHECKSUM */ return r; } /* Transposed multiply of two polynomials with coefficients modulo 2^(2^m)+1. lenB is the length of polynomial B and must be a power of 2, lenA is the length of polynomial A and must be lenB / 2 or lenB / 2 + 1. n=2^m t must have space for 2*lenB coefficients Only the product coefficients [lenA - 1 ... lenA + lenB/2 - 2] will go into R[0 ... lenB / 2 - 1] Return value: number of multiplies performed, UINT_MAX in error case. */ unsigned int F_mul_trans (mpz_t *R, mpz_t *A, mpz_t *B, unsigned int lenA, unsigned int lenB, unsigned int n, mpz_t *t) { unsigned int i, r = 0, len2; /* Handle trivial cases */ if (lenB < 2) return 0; ASSERT(lenA == lenB / 2 || lenA == lenB / 2 + 1); #if THREAD_SAFE_GT > 0 mpz_t *gtptr = get_gtptr (); if (ALLOC (gt) == 0) /* True:: this thread F_clear'd gt or just calloc'd gt */ mpz_init2 (gt, 2 * n); #else if (!gt_inited) { mpz_init2 (gt, 2 * n); gt_inited = 1; } #endif if (lenB == 2) { #if THREAD_SAFE_GT == 2 F_mulmod (R[0], A[0], B[0], n, gtptr); #else F_mulmod (R[0], A[0], B[0], n); #endif return 1; } if (lenB <= 4 * n) { /* len2 = log_2(lenB) */ for (i = lenB, len2 = 0; i > 1 && (i&1) == 0; i >>= 1, len2++); if (i != 1) { outputf (OUTPUT_ERROR, "F_mul_trans: polynomial length must be power of 2, " "but is %d\n", lenB); return UINT_MAX; } /* Put transform of B into t */ for (i = 0; i < lenB; i++) mpz_set (t[i], B[i]); #if THREAD_SAFE_GT == 2 F_fft_dif (t, lenB, 0, n, gtptr); #else F_fft_dif (t, lenB, 0, n); #endif /* Put transform of reversed A into t + lenB */ for (i = 0; i < lenA; i++) mpz_set (t[i + lenB], A[lenA - 1 - i]); for (i = lenA; i < lenB; i++) mpz_set_ui (t[i + lenB], 0); #if THREAD_SAFE_GT == 2 F_fft_dif (t + lenB, lenB, 0, n, gtptr); #else F_fft_dif (t + lenB, lenB, 0, n); #endif for (i = 0; i < lenB; i++) { #if THREAD_SAFE_GT == 2 F_mulmod (t[i], t[i], t[i + lenB], n, gtptr); #else F_mulmod (t[i], t[i], t[i + lenB], n); #endif /* Do the div-by-length. Transform length was len, so divide by 2^len2 = sqrt(2)^(2*len2) */ /* since len2 = log2(lenB) and lenB <= 4*n, for n >= 8 we have 2*len2 < 2*n */ #if THREAD_SAFE_GT == 2 F_mul_sqrt2exp (t[i], t[i], 4 * n - 2 * len2, n, gtptr); #else F_mul_sqrt2exp (t[i], t[i], 4 * n - 2 * len2, n); #endif } r += lenB; #if THREAD_SAFE_GT == 2 F_fft_dit (t, lenB, 0, n, gtptr); #else F_fft_dit (t, lenB, 0, n); #endif for (i = 0; i < lenB / 2; i++) mpz_set (R[i], t[i + lenA - 1]); } else { /* Only Karatsuba, no Toom-Cook here */ unsigned int h = lenB / 4; const unsigned int lenA0 = h, lenA1 = lenA - h; outputf (OUTPUT_DEVVERBOSE, "schoen_strass.c: Transposed Karatsuba, " "lenA = %lu, lenB = %lu\n", lenA, lenB); /* A = a1 * x^h + a0 B = b3 * x^3h + b2 * x^2h + b1 * x^h + b0 mul^T(A, B) = mul^T(a0,b3) * x^4h + (mul^T(a1,b3) + mul^T(a0,b2)) * x^3h + (mul^T(a1,b2) + mul^T(a0,b1)) * x^2h + (mul^T(a1,b1) + mul^T(a0,b0)) * x + mul^T(a1,b0) We only want the x^h, x^2h and x^3h coefficients, mul^T(a1,b1) + mul^T(a0,b0) mul^T(a1,b2) + mul^T(a0,b1) mul^T(a1,b3) + mul^T(a0,b2) Specifically, we want R[i] = \sum_{j=0}^{lenA} A[j] * B[j+i], 0 <= i < 2h */ /* T */ for (i = 0; i < h; i++) mpz_add (t[i], A[i], A[i + h]); if (lenA1 == h + 1) mpz_set (t[h], A[2*h]); r = F_mul_trans (t, t, B + h, lenA1, 2 * h, n, t + lenA1); /* Uses t[h ... 5h-1] as temp */ /* U */ for (i = 0; i < 2 * h; i++) mpz_sub (t[i + h], B[i], B[h + i]); r += F_mul_trans (t + h, A, t + h, lenA0, 2 * h, n, t + 3 * h); /* Uses t[3h ... 7h-1] as temp */ for (i = 0; i < h; i++) mpz_add (R[i], t[i], t[i + h]); /* R[0 ... h-1] = t + r */ /* V */ for (i = 0; i < 2 * h; i++) mpz_sub (t[i + h], B[i + 2 * h], B[i + h]); r += F_mul_trans (t + h, A + h, t + h, lenA1, 2 * h, n, t + 3 * h); /* Uses t[3h ... 7h - 1] as temp */ for (i = 0; i < h; i++) mpz_add (R[i + h], t[i], t[i + h]); } return r; } void F_clear (void) { #if THREAD_SAFE_GT > 0 mpz_t *gtptr = pthread_getspecific (gtkey); if (gtptr) { mpz_clear (gt); /* Free this thread's gt limb space allocation */ memset (gtptr, 0, sizeof (mpz_t)); /* Zero out GMP context for this gt */ /* Don't free(gtptr); needed for this thread's next time through */ } #else if (gt_inited) mpz_clear (gt); gt_inited = 0; #endif }