/* Copyright (C) 2018 Daniel Schultz This file is part of FLINT. FLINT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License (LGPL) as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. See . */ #include "string.h" #include "thread_pool.h" #include "nmod_mpoly.h" /* improve locality */ #define BLOCK 128 #define MAX_ARRAY_SIZE (WORD(300000)) #define MAX_LEX_SIZE (WORD(300)) typedef struct { slong idx; slong work; slong len; nmod_mpoly_t poly; } _chunk_struct; typedef struct { #if FLINT_USES_PTHREAD pthread_mutex_t mutex; #endif volatile int idx; slong nthreads; slong Al, Bl, Pl; mp_limb_t * Acoeffs, * Bcoeffs; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; slong * perm; slong nvars; const ulong * mults; slong array_size; slong degb; const nmod_mpoly_ctx_struct * ctx; _chunk_struct * Pchunks; int rev; } _base_struct; typedef _base_struct _base_t[1]; typedef struct { slong idx; slong time; _base_struct * base; ulong * exp; } _worker_arg_struct; /****************** LEX ******************/ static void _nmod_mpoly_mul_array_threaded_worker_LEX(void * varg) { slong i, j, Pi; _worker_arg_struct * arg = (_worker_arg_struct *) varg; _base_struct * base = arg->base; slong Al = base->Al; slong Bl = base->Bl; slong Pl = base->Pl; slong * Amain = base->Amain; slong * Bmain = base->Bmain; ulong * coeff_array; TMP_INIT; TMP_START; coeff_array = (ulong *) TMP_ALLOC(3*base->array_size*sizeof(ulong)); for (j = 0; j < 3*base->array_size; j++) coeff_array[j] = 0; #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif while (Pi < Pl) { slong len; mp_limb_t t2, t1, t0, u1, u0; Pi = base->perm[Pi]; /* work out bit counts for this chunk */ len = 0; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j < Bl) { len += FLINT_MIN(Amain[i + 1] - Amain[i], Bmain[j + 1] - Bmain[j]); } } umul_ppmm(t1, t0, base->ctx->mod.n - 1, base->ctx->mod.n - 1); umul_ppmm(t2, t1, t1, len); umul_ppmm(u1, u0, t0, len); add_sssaaaaaa(t2, t1, t0, t2, t1, UWORD(0), UWORD(0), u1, u0); (base->Pchunks + Pi)->len = 0; if (t2 != 0) { /* need three words */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong3(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = nmod_mpoly_append_array_sm3_LEX( (base->Pchunks + Pi)->poly, 0, coeff_array, base->mults, base->nvars - 1, base->array_size, Pl - Pi - 1, base->ctx); } else if (t1 != 0) { /* fits into two words */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong2(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = nmod_mpoly_append_array_sm2_LEX( (base->Pchunks + Pi)->poly, 0, coeff_array, base->mults, base->nvars - 1, base->array_size, Pl - Pi - 1, base->ctx); } else if (t0 != 0) { /* fits into one word */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong1(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = nmod_mpoly_append_array_sm1_LEX( (base->Pchunks + Pi)->poly, 0, coeff_array, base->mults, base->nvars - 1, base->array_size, Pl - Pi - 1, base->ctx); } #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif } TMP_END; } void _nmod_mpoly_mul_array_chunked_threaded_LEX( nmod_mpoly_t P, const nmod_mpoly_t A, const nmod_mpoly_t B, const ulong * mults, const nmod_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong nvars = ctx->minfo->nvars; slong Pi, i, j, Plen, Pl, Al, Bl, array_size; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; _base_t base; _worker_arg_struct * args; _chunk_struct * Pchunks; slong * perm; TMP_INIT; array_size = 1; for (i = 0; i < nvars - 1; i++) { array_size *= mults[i]; } /* compute lengths of poly2 and poly3 in chunks */ Al = 1 + (slong) (A->exps[0] >> (A->bits*(nvars - 1))); Bl = 1 + (slong) (B->exps[0] >> (B->bits*(nvars - 1))); TMP_START; /* compute indices and lengths of coefficients of polys in main variable */ Amain = (slong *) TMP_ALLOC((Al + 1)*sizeof(slong)); Bmain = (slong *) TMP_ALLOC((Bl + 1)*sizeof(slong)); Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong)); Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong)); mpoly_main_variable_split_LEX(Amain, Apexp, A->exps, Al, A->length, mults, nvars - 1, A->bits); mpoly_main_variable_split_LEX(Bmain, Bpexp, B->exps, Bl, B->length, mults, nvars - 1, B->bits); Pl = Al + Bl - 1; /* work out data for each chunk of the output */ Pchunks = (_chunk_struct *) TMP_ALLOC(Pl*sizeof(_chunk_struct)); perm = (slong *) TMP_ALLOC(Pl*sizeof(slong)); for (Pi = 0; Pi < Pl; Pi++) { nmod_mpoly_init3((Pchunks + Pi)->poly, 8, P->bits, ctx); (Pchunks + Pi)->work = 0; perm[Pi] = Pi; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; (Pchunks + Pi)->work += (Amain[i + 1] - Amain[i]) *(Bmain[j + 1] - Bmain[j]); } } for (i = 0; i < Pl; i++) { for (j = i; j > 0 && (Pchunks + perm[j - 1])->work < (Pchunks + perm[j])->work; j--) { slong t = perm[j - 1]; perm[j - 1] = perm[j]; perm[j] = t; } } base->nthreads = num_handles + 1; base->Al = Al; base->Bl = Bl; base->Pl = Pl; base->Acoeffs = A->coeffs; base->Amain = Amain; base->Apexp = Apexp; base->Bcoeffs = B->coeffs; base->Bmain = Bmain; base->Bpexp = Bpexp; base->idx = 0; base->perm = perm; base->nvars = nvars; base->ctx = ctx; base->Pchunks = Pchunks; base->array_size = array_size; base->mults = mults; args = (_worker_arg_struct *) TMP_ALLOC(base->nthreads *sizeof(_worker_arg_struct)); #if FLINT_USES_PTHREAD pthread_mutex_init(&base->mutex, NULL); #endif for (i = 0; i < num_handles; i++) { args[i].idx = i; args[i].base = base; thread_pool_wake(global_thread_pool, handles[i], 0, _nmod_mpoly_mul_array_threaded_worker_LEX, &args[i]); } i = num_handles; args[i].idx = i; args[i].base = base; _nmod_mpoly_mul_array_threaded_worker_LEX(&args[i]); for (i = 0; i < num_handles; i++) { thread_pool_wait(global_thread_pool, handles[i]); } #if FLINT_USES_PTHREAD pthread_mutex_destroy(&base->mutex); #endif /* join answers */ Plen = 0; for (Pi = 0; Pi < Pl; Pi++) { _nmod_mpoly_fit_length(&P->coeffs, &P->coeffs_alloc, &P->exps, &P->exps_alloc, 1, Plen + (Pchunks + Pi)->len); FLINT_ASSERT((Pchunks + Pi)->poly->coeffs != NULL); FLINT_ASSERT((Pchunks + Pi)->poly->exps != NULL); memcpy(P->exps + Plen, (Pchunks + Pi)->poly->exps, (Pchunks + Pi)->len*sizeof(ulong)); memcpy(P->coeffs + Plen, (Pchunks + Pi)->poly->coeffs, (Pchunks + Pi)->len*sizeof(mp_limb_t)); Plen += (Pchunks + Pi)->len; flint_free((Pchunks + Pi)->poly->coeffs); flint_free((Pchunks + Pi)->poly->exps); } P->length = Plen; flint_free(Apexp); flint_free(Bpexp); TMP_END; } int _nmod_mpoly_mul_array_threaded_pool_LEX( nmod_mpoly_t A, const nmod_mpoly_t B, fmpz * maxBfields, const nmod_mpoly_t C, fmpz * maxCfields, const nmod_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong i, exp_bits, array_size; ulong max, * mults; int success; TMP_INIT; FLINT_ASSERT(B->length != 0); FLINT_ASSERT(C->length != 0); FLINT_ASSERT(ctx->minfo->ord == ORD_LEX); FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo)); FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo)); TMP_START; /* compute maximum exponents for each variable */ mults = (ulong *) TMP_ALLOC(ctx->minfo->nfields*sizeof(ulong)); /* the field of index n-1 is the one that wil be pulled out */ i = ctx->minfo->nfields - 1; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); max = mults[i]; if (((slong) mults[i]) <= 0 || mults[i] > MAX_LEX_SIZE) { success = 0; goto cleanup; } /* the fields of index n-2...0, contribute to the array size */ array_size = WORD(1); for (i--; i >= 0; i--) { ulong hi; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); max |= mults[i]; umul_ppmm(hi, array_size, array_size, mults[i]); if (hi != 0 || (slong) mults[i] <= 0 || array_size <= 0 || array_size > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } } exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(max) + 1); exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo); /* array multiplication assumes result fits into 1 word */ if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo)) { success = 0; goto cleanup; } /* handle aliasing and do array multiplication */ if (A == B || A == C) { nmod_mpoly_t T; nmod_mpoly_init3(T, B->length + C->length - 1, exp_bits, ctx); _nmod_mpoly_mul_array_chunked_threaded_LEX(T, C, B, mults, ctx, handles, num_handles); nmod_mpoly_swap(T, A, ctx); nmod_mpoly_clear(T, ctx); } else { nmod_mpoly_fit_length_reset_bits(A, B->length + C->length - 1, exp_bits, ctx); _nmod_mpoly_mul_array_chunked_threaded_LEX(A, C, B, mults, ctx, handles, num_handles); } success = 1; cleanup: TMP_END; return success; } /***************************** DEGLEX and DEGREVLEX *****************************/ static void _nmod_mpoly_mul_array_threaded_worker_DEG(void * varg) { slong i, j, Pi; _worker_arg_struct * arg = (_worker_arg_struct *) varg; _base_struct * base = arg->base; slong Al = base->Al; slong Bl = base->Bl; slong Pl = base->Pl; slong * Amain = base->Amain; slong * Bmain = base->Bmain; ulong * coeff_array; slong (* upack_sm1)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t); slong (* upack_sm2)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t); slong (* upack_sm3)(nmod_mpoly_t, slong, ulong *, slong, slong, slong, const nmod_mpoly_ctx_t); TMP_INIT; upack_sm1 = &nmod_mpoly_append_array_sm1_DEGLEX; upack_sm2 = &nmod_mpoly_append_array_sm2_DEGLEX; upack_sm3 = &nmod_mpoly_append_array_sm3_DEGLEX; if (base->rev) { upack_sm1 = &nmod_mpoly_append_array_sm1_DEGREVLEX; upack_sm2 = &nmod_mpoly_append_array_sm2_DEGREVLEX; upack_sm3 = &nmod_mpoly_append_array_sm3_DEGREVLEX; } TMP_START; coeff_array = (ulong *) TMP_ALLOC(3*base->array_size*sizeof(ulong)); for (j = 0; j < 3*base->array_size; j++) coeff_array[j] = 0; #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif while (Pi < Pl) { slong len; mp_limb_t t2, t1, t0, u1, u0; Pi = base->perm[Pi]; /* work out bit counts for this chunk */ len = 0; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j < Bl) { len += FLINT_MIN(Amain[i + 1] - Amain[i], Bmain[j + 1] - Bmain[j]); } } umul_ppmm(t1, t0, base->ctx->mod.n - 1, base->ctx->mod.n - 1); umul_ppmm(t2, t1, t1, len); umul_ppmm(u1, u0, t0, len); add_sssaaaaaa(t2, t1, t0, t2, t1, UWORD(0), UWORD(0), u1, u0); (base->Pchunks + Pi)->len = 0; if (t2 != 0) { /* need three words */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong3(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = upack_sm3((base->Pchunks + Pi)->poly, 0, coeff_array, Pl - Pi - 1, base->nvars, base->degb, base->ctx); } else if (t1 != 0) { /* fits into two words */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong2(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = upack_sm2((base->Pchunks + Pi)->poly, 0, coeff_array, Pl - Pi - 1, base->nvars, base->degb, base->ctx); } else if (t0 != 0) { /* fits into one word */ for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j >= Bl) continue; _nmod_mpoly_addmul_array1_ulong1(coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } (base->Pchunks + Pi)->len = upack_sm1((base->Pchunks + Pi)->poly, 0, coeff_array, Pl - Pi - 1, base->nvars, base->degb, base->ctx); } #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif } TMP_END; } void _nmod_mpoly_mul_array_chunked_threaded_DEG( nmod_mpoly_t P, const nmod_mpoly_t A, const nmod_mpoly_t B, ulong degb, const nmod_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong nvars = ctx->minfo->nvars; slong Pi, i, j, Plen, Pl, Al, Bl, array_size; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; _base_t base; _worker_arg_struct * args; _chunk_struct * Pchunks; slong * perm; TMP_INIT; /* compute lengths of poly2 and poly3 in chunks */ Al = 1 + (slong) (A->exps[0] >> (A->bits*nvars)); Bl = 1 + (slong) (B->exps[0] >> (B->bits*nvars)); array_size = 1; for (i = 0; i < nvars-1; i++) { array_size *= degb; } TMP_START; /* compute indices and lengths of coefficients of polys in main variable */ Amain = (slong *) TMP_ALLOC((Al + 1)*sizeof(slong)); Bmain = (slong *) TMP_ALLOC((Bl + 1)*sizeof(slong)); Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong)); Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong)); mpoly_main_variable_split_DEG(Amain, Apexp, A->exps, Al, A->length, degb, nvars, A->bits); mpoly_main_variable_split_DEG(Bmain, Bpexp, B->exps, Bl, B->length, degb, nvars, B->bits); Pl = Al + Bl - 1; FLINT_ASSERT(Pl == degb); /* work out data for each chunk of the output */ Pchunks = (_chunk_struct *) TMP_ALLOC(Pl*sizeof(_chunk_struct)); perm = (slong *) TMP_ALLOC(Pl*sizeof(slong)); for (Pi = 0; Pi < Pl; Pi++) { nmod_mpoly_init3((Pchunks + Pi)->poly, 8, P->bits, ctx); (Pchunks + Pi)->work = 0; perm[Pi] = Pi; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j < Bl) { (Pchunks + Pi)->work += (Amain[i + 1] - Amain[i]) *(Bmain[j + 1] - Bmain[j]); } } } for (i = 0; i < Pl; i++) { for (j = i; j > 0 && (Pchunks + perm[j-1])->work < (Pchunks + perm[j])->work; j--) { slong t = perm[j - 1]; perm[j - 1] = perm[j]; perm[j] = t; } } base->nthreads = num_handles + 1; base->Al = Al; base->Bl = Bl; base->Pl = Pl; base->Acoeffs = A->coeffs; base->Amain = Amain; base->Apexp = Apexp; base->Bcoeffs = B->coeffs; base->Bmain = Bmain; base->Bpexp = Bpexp; base->idx = 0; base->perm = perm; base->nvars = nvars; base->Pchunks = Pchunks; base->ctx = ctx; base->array_size = array_size; base->degb = degb; base->rev = (ctx->minfo->ord == ORD_DEGREVLEX); args = (_worker_arg_struct *) TMP_ALLOC(base->nthreads *sizeof(_worker_arg_struct)); #if FLINT_USES_PTHREAD pthread_mutex_init(&base->mutex, NULL); #endif for (i = 0; i < num_handles; i++) { args[i].idx = i; args[i].base = base; thread_pool_wake(global_thread_pool, handles[i], 0, _nmod_mpoly_mul_array_threaded_worker_DEG, &args[i]); } i = num_handles; args[i].idx = i; args[i].base = base; _nmod_mpoly_mul_array_threaded_worker_DEG(&args[i]); for (i = 0; i < num_handles; i++) { thread_pool_wait(global_thread_pool, handles[i]); } #if FLINT_USES_PTHREAD pthread_mutex_destroy(&base->mutex); #endif /* join answers */ Plen = 0; for (Pi = 0; Pi < Pl; Pi++) { _nmod_mpoly_fit_length(&P->coeffs, &P->coeffs_alloc, &P->exps, &P->exps_alloc, 1, Plen + (Pchunks + Pi)->len); FLINT_ASSERT((Pchunks + Pi)->poly->coeffs != NULL); FLINT_ASSERT((Pchunks + Pi)->poly->exps != NULL); memcpy(P->exps + Plen, (Pchunks + Pi)->poly->exps, (Pchunks + Pi)->len*sizeof(ulong)); memcpy(P->coeffs + Plen, (Pchunks + Pi)->poly->coeffs, (Pchunks + Pi)->len*sizeof(mp_limb_t)); Plen += (Pchunks + Pi)->len; flint_free((Pchunks + Pi)->poly->coeffs); flint_free((Pchunks + Pi)->poly->exps); } P->length = Plen; flint_free(Apexp); flint_free(Bpexp); TMP_END; } int _nmod_mpoly_mul_array_threaded_pool_DEG( nmod_mpoly_t A, const nmod_mpoly_t B, fmpz * maxBfields, const nmod_mpoly_t C, fmpz * maxCfields, const nmod_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong i, exp_bits, array_size; ulong deg; int success; FLINT_ASSERT(B->length != 0); FLINT_ASSERT(C->length != 0); FLINT_ASSERT( ctx->minfo->ord == ORD_DEGREVLEX || ctx->minfo->ord == ORD_DEGLEX); FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo)); FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo)); /* the field of index n-1 is the one that wil be pulled out */ i = ctx->minfo->nfields - 1; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); deg = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); if (((slong) deg) <= 0 || deg > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } /* the fields of index n-2...1, contribute to the array size */ array_size = WORD(1); for (i--; i >= 1; i--) { ulong hi; umul_ppmm(hi, array_size, array_size, deg); if (hi != WORD(0) || array_size <= 0 || array_size > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } } exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(deg) + 1); exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo); /* array multiplication assumes result fit into 1 word */ if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo)) { success = 0; goto cleanup; } /* handle aliasing and do array multiplication */ if (A == B || A == C) { nmod_mpoly_t T; nmod_mpoly_init3(T, B->length + C->length - 1, exp_bits, ctx); _nmod_mpoly_mul_array_chunked_threaded_DEG(T, C, B, deg, ctx, handles, num_handles); nmod_mpoly_swap(T, A, ctx); nmod_mpoly_clear(T, ctx); } else { nmod_mpoly_fit_length_reset_bits(A, B->length + C->length - 1, exp_bits, ctx); _nmod_mpoly_mul_array_chunked_threaded_DEG(A, C, B, deg, ctx, handles, num_handles); } success = 1; cleanup: return success; } int nmod_mpoly_mul_array_threaded( nmod_mpoly_t A, const nmod_mpoly_t B, const nmod_mpoly_t C, const nmod_mpoly_ctx_t ctx) { slong i; int success; fmpz * maxBfields, * maxCfields; thread_pool_handle * handles; slong num_handles; slong thread_limit = FLINT_MIN(B->length, C->length)/16; TMP_INIT; if (B->length == 0 || C->length == 0) { nmod_mpoly_zero(A, ctx); return 1; } if (1 != mpoly_words_per_exp(B->bits, ctx->minfo) || 1 != mpoly_words_per_exp(C->bits, ctx->minfo)) { return 0; } TMP_START; maxBfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz)); maxCfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz)); for (i = 0; i < ctx->minfo->nfields; i++) { fmpz_init(maxBfields + i); fmpz_init(maxCfields + i); } mpoly_max_fields_fmpz(maxBfields, B->exps, B->length, B->bits, ctx->minfo); mpoly_max_fields_fmpz(maxCfields, C->exps, C->length, C->bits, ctx->minfo); num_handles = flint_request_threads(&handles, thread_limit); switch (ctx->minfo->ord) { case ORD_LEX: { success = _nmod_mpoly_mul_array_threaded_pool_LEX(A, B, maxBfields, C, maxCfields, ctx, handles, num_handles); break; } case ORD_DEGREVLEX: case ORD_DEGLEX: { success = _nmod_mpoly_mul_array_threaded_pool_DEG(A, B, maxBfields, C, maxCfields, ctx, handles, num_handles); break; } default: { success = 0; break; } } flint_give_back_threads(handles, num_handles); for (i = 0; i < ctx->minfo->nfields; i++) { fmpz_clear(maxBfields + i); fmpz_clear(maxCfields + i); } TMP_END; return success; }