/* Copyright (C) 2008, 2009, William Hart Copyright (C) 2010 Fredrik Johansson Copyright (C) 2021 Daniel Schultz This file is part of FLINT. FLINT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License (LGPL) as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. See . */ #include #include "flint.h" #include "ulong_extras.h" #include "fmpz.h" #include "nmod_vec.h" #include "fmpz_vec.h" #include "fmpz_poly.h" /* The better mpn_addmul_1 is, the larger FMPZ_CRT_UI_CUTOFF can be. */ #define FMPZ_CRT_UI_CUTOFF 50 /* The better fmpz_fdiv_ui (mpn_mod_1) is, the larger FMPZ_MOD_UI_CUTOFF can be. Very poor implementations of mpn_mod_1 can have 10, and anything up to 100 is reasonable. */ #define FMPZ_MOD_UI_CUTOFF 75 /* minimal number of basecase chunks in which to partition */ #define FMPZ_CRT_UI_MULTIPLE_CUTOFF 8 #define FMPZ_MOD_UI_MULTIPLE_CUTOFF 4 void fmpz_comb_temp_init(fmpz_comb_temp_t CT, const fmpz_comb_t C) { CT->Alen = FLINT_MAX(C->crt_klen, C->mod_klen); CT->Tlen = FLINT_MAX(C->crt_P->localsize, C->mod_P->localsize); CT->A = _fmpz_vec_init(CT->Alen); CT->T = _fmpz_vec_init(CT->Tlen); } void fmpz_comb_init(fmpz_comb_t C, mp_srcptr m, slong len) { int success; slong l, i, j, k, s; ulong tt, mm, mt; fmpz_poly_t M, Mm; /* only used for resizable fmpz array convenience */ if (len < 1) flint_throw(FLINT_ERROR, "fmpz_comb_init: len should be positive"); fmpz_poly_init(Mm); fmpz_poly_init(M); fmpz_multi_CRT_init(C->crt_P); C->packed_multipliers = NULL; C->step = NULL; C->crt_lu = NULL; C->crt_lu_alloc = 0; C->crt_offsets = NULL; C->crt_offsets_alloc = 0; fmpz_multi_mod_init(C->mod_P); C->mod_lu = NULL; C->mod_lu_alloc = 0; C->mod_offsets = NULL; C->mod_offsets_alloc = 0; C->num_primes = len; /* set up crt */ for (l = 0, i = 0, k = 0; l < len; k++) { if (k + 1 >= C->crt_offsets_alloc) { C->crt_offsets_alloc = FLINT_MAX(k + 1, C->crt_offsets_alloc*3/2); C->crt_offsets = FLINT_ARRAY_REALLOC(C->crt_offsets, C->crt_offsets_alloc, slong); } fmpz_poly_fit_length(M, k + 1); fmpz_one(M->coeffs + k); for (j = i; l < len && fmpz_size(M->coeffs + k) <= FMPZ_CRT_UI_CUTOFF; j++) { if (j + 1 >= C->crt_lu_alloc) { C->crt_lu_alloc = FLINT_MAX(j + 1, C->crt_lu_alloc*3/2); C->crt_lu = FLINT_ARRAY_REALLOC(C->crt_lu, C->crt_lu_alloc, crt_lut_entry); } C->crt_lu[j].i0 = 0; C->crt_lu[j].i1 = 0; C->crt_lu[j].i2 = 0; mm = m[l]; if (l + 1 < len && !n_mul_checked(&mt, mm, m[l+1])) { mm = mt; if (l + 2 < len && !n_mul_checked(&mt, mm, m[l+2])) { mm = mt; success = (1 == n_gcdinv(&C->crt_lu[j].i0, m[l+1]*m[l+2] % m[l+0], m[l+0])) && (1 == n_gcdinv(&C->crt_lu[j].i1, m[l+0]*m[l+2] % m[l+1], m[l+1])) && (1 == n_gcdinv(&C->crt_lu[j].i2, m[l+0]*m[l+1] % m[l+2], m[l+2])); if (!success) goto bad_moduli; C->crt_lu[j].i0 *= m[l+1]*m[l+2]; C->crt_lu[j].i1 *= m[l+0]*m[l+2]; C->crt_lu[j].i2 *= m[l+0]*m[l+1]; l += 3; } else { success = (1 == n_gcdinv(&C->crt_lu[j].i0, m[l+1] % m[l+0], m[l+0])) && (1 == n_gcdinv(&C->crt_lu[j].i1, m[l+0] % m[l+1], m[l+1])); if (!success) goto bad_moduli; C->crt_lu[j].i0 *= m[l+1]; C->crt_lu[j].i1 *= m[l+0]; l += 2; } } else { C->crt_lu[j].i0 = 1; l += 1; } nmod_init(&C->crt_lu[j].mod, mm); fmpz_mul_ui(M->coeffs + k, M->coeffs + k, C->crt_lu[j].mod.n); } C->crt_offsets[k] = j; fmpz_poly_fit_length(Mm, j); for ( ; i < j; i++) { fmpz_divexact_ui(Mm->coeffs + i, M->coeffs + k, C->crt_lu[i].mod.n); tt = fmpz_fdiv_ui(Mm->coeffs + i, C->crt_lu[i].mod.n); success = (1 == n_gcdinv(&tt, tt, C->crt_lu[i].mod.n)); if (!success) goto bad_moduli; C->crt_lu[i].i0 = nmod_mul(tt, C->crt_lu[i].i0, C->crt_lu[i].mod); C->crt_lu[i].i1 = nmod_mul(tt, C->crt_lu[i].i1, C->crt_lu[i].mod); C->crt_lu[i].i2 = nmod_mul(tt, C->crt_lu[i].i2, C->crt_lu[i].mod); } } /* avoid small last chunk and have at least FMPZ_CRT_UI_MULTIPLE_CUTOFF chunks or one big chunk */ while (k > 1 && (k < FMPZ_CRT_UI_MULTIPLE_CUTOFF || fmpz_size(M->coeffs + k - 1) <= FMPZ_CRT_UI_CUTOFF*3/4)) { k--; for (i = (k >= 2) ? C->crt_offsets[k - 2] : 0; i < C->crt_offsets[k]; i++) { int last = (i >= C->crt_offsets[k - 1]); fmpz_mul(Mm->coeffs + i, Mm->coeffs + i, M->coeffs + k - last); tt = fmpz_fdiv_ui(M->coeffs + k - last, C->crt_lu[i].mod.n); success = (1 == n_gcdinv(&tt, tt, C->crt_lu[i].mod.n)); if (!success) goto bad_moduli; C->crt_lu[i].i0 = nmod_mul(tt, C->crt_lu[i].i0, C->crt_lu[i].mod); C->crt_lu[i].i1 = nmod_mul(tt, C->crt_lu[i].i1, C->crt_lu[i].mod); C->crt_lu[i].i2 = nmod_mul(tt, C->crt_lu[i].i2, C->crt_lu[i].mod); } C->crt_offsets[k - 1] = C->crt_offsets[k]; fmpz_mul(M->coeffs + k - 1, M->coeffs + k - 1, M->coeffs + k); } C->crt_klen = k; success = fmpz_multi_CRT_precompute(C->crt_P, M->coeffs, C->crt_klen); if (!success) goto bad_moduli; C->step = FLINT_ARRAY_ALLOC(C->crt_klen, slong); l = 0; for (k = 0, i = 0; k < C->crt_klen; k++) { int all_large = 1; s = 1; for (j = i; j < C->crt_offsets[k]; j++) { if (C->crt_lu[j].i1 != 0 || C->crt_lu[j].i2 != 0) all_large = 0; FLINT_ASSERT(fmpz_cmp(Mm->coeffs + j, M->coeffs + k) <= 0); s = FLINT_MAX(s, fmpz_size(Mm->coeffs + j)); } if (all_large) { s = 1; for (j = i; j < C->crt_offsets[k]; j++) { FLINT_ASSERT(C->crt_lu[i].i1 == 0 && C->crt_lu[i].i2 == 0); fmpz_mul_ui(Mm->coeffs + j, Mm->coeffs + j, C->crt_lu[j].i0); s = FLINT_MAX(s, fmpz_size(Mm->coeffs + j)); } C->step[k] = -s - 1; } else { C->step[k] = s; } for ( ; i < C->crt_offsets[k]; i++) { l += s; } } C->packed_multipliers = FLINT_ARRAY_ALLOC(l, mp_limb_t); l = 0; for (k = 0, i = 0; k < C->crt_klen; k++) { s = C->step[k]; if (s < 0) s = -s - 1; for ( ; i < C->crt_offsets[k]; i++) { fmpz_get_ui_array(C->packed_multipliers + l, s, Mm->coeffs + i); l += s; } } /* set up mod */ for (l = 0, i = 0, k = 0; l < len; k++) { if (k + 1 >= C->mod_offsets_alloc) { C->mod_offsets_alloc = FLINT_MAX(k + 1, C->mod_offsets_alloc*3/2); C->mod_offsets = FLINT_ARRAY_REALLOC(C->mod_offsets, C->mod_offsets_alloc, slong); } fmpz_poly_fit_length(M, k + 1); fmpz_one(M->coeffs + k); for (j = i; l < len && fmpz_size(M->coeffs + k) <= FMPZ_MOD_UI_CUTOFF; j++) { if (j + 1 >= C->mod_lu_alloc) { C->mod_lu_alloc = FLINT_MAX(j + 1, C->mod_lu_alloc*3/2); C->mod_lu = FLINT_ARRAY_REALLOC(C->mod_lu, C->mod_lu_alloc, mod_lut_entry); } C->mod_lu[j].mod0.n = 0; C->mod_lu[j].mod1.n = 0; C->mod_lu[j].mod2.n = 0; mm = m[l]; if (l + 1 < len && !n_mul_checked(&mt, mm, m[l+1])) { mm = mt; if (l + 2 < len && !n_mul_checked(&mt, mm, m[l+2])) { mm = mt; success = (m[l+0] != 0) && (m[l+1] != 0) && (m[l+2] != 0); if (!success) goto zero_moduli; nmod_init(&C->mod_lu[j].mod0, m[l+0]); nmod_init(&C->mod_lu[j].mod1, m[l+1]); nmod_init(&C->mod_lu[j].mod2, m[l+2]); l += 3; } else { success = (m[l+0] != 0) && (m[l+1] != 0); if (!success) goto zero_moduli; nmod_init(&C->mod_lu[j].mod0, m[l+0]); nmod_init(&C->mod_lu[j].mod1, m[l+1]); l += 2; } } else { success = (m[l+0] != 0); if (!success) goto zero_moduli; nmod_init(&C->mod_lu[j].mod0, m[l+0]); l += 1; } nmod_init(&C->mod_lu[j].mod, mm); fmpz_mul_ui(M->coeffs + k, M->coeffs + k, C->mod_lu[j].mod.n); } C->mod_offsets[k] = j; i = j; } /* avoid small last chunk and have at least FMPZ_MOD_UI_MULTIPLE_CUTOFF chunks or one big chunk */ while (k > 1 && (k < FMPZ_MOD_UI_MULTIPLE_CUTOFF || fmpz_size(M->coeffs + k - 1) < FMPZ_MOD_UI_CUTOFF*3/4)) { k--; C->mod_offsets[k - 1] = C->mod_offsets[k]; fmpz_mul(M->coeffs + k - 1, M->coeffs + k - 1, M->coeffs + k); } C->mod_klen = k; success = fmpz_multi_mod_precompute(C->mod_P, M->coeffs, C->mod_klen); if (!success) goto zero_moduli; fmpz_poly_clear(M); fmpz_poly_clear(Mm); return; bad_moduli: flint_throw(FLINT_ERROR, "fmpz_comb_init: moduli are not pairwise prime"); zero_moduli: flint_throw(FLINT_ERROR, "fmpz_comb_init: moduli are not nonzero"); }