/* Copyright (C) 2018 Daniel Schultz This file is part of FLINT. FLINT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License (LGPL) as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. See . */ #include "thread_pool.h" #include "fmpz_mpoly.h" /* NOTE: this file is dirty - it assumes that a zero fmpz is zero */ /* improve locality */ #define BLOCK 128 #define MAX_ARRAY_SIZE (WORD(300000)) #define MAX_LEX_SIZE (WORD(300)) typedef struct { slong idx; slong work; slong len; fmpz_mpoly_t poly; } _chunk_struct; typedef struct { #if FLINT_USES_PTHREAD pthread_mutex_t mutex; #endif volatile int idx; slong nthreads; slong Al, Bl, Pl; fmpz * Acoeffs, * Bcoeffs; slong * Amax, * Bmax, * Asum, * Bsum; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; slong * perm; slong nvars; const ulong * mults; slong array_size; slong degb; _chunk_struct * Pchunks; int rev; } _base_struct; typedef _base_struct _base_t[1]; typedef struct { slong idx; slong time; _base_struct * base; ulong * exp; } _worker_arg_struct; /****************** LEX ******************/ void _fmpz_mpoly_mul_array_threaded_worker_LEX(void * varg) { slong i, j, Pi; _worker_arg_struct * arg = (_worker_arg_struct *) varg; _base_struct * base = arg->base; ulong * coeff_array; TMP_INIT; TMP_START; coeff_array = (ulong *) TMP_ALLOC(3*base->array_size*sizeof(ulong)); for (j = 0; j < 3*base->array_size; j++) coeff_array[j] = 0; #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif while (Pi < base->Pl) { /* work out bit counts for this chunk */ slong Abits = 0; slong Bbits = 0; slong Pbits = 0; slong number = 0; for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { number++; Abits = FLINT_MAX(Abits, base->Amax[i]); Bbits = FLINT_MAX(Bbits, base->Bmax[j]); Pbits = FLINT_MAX(Pbits, FLINT_MIN(base->Asum[i] + base->Bmax[j], base->Amax[i] + base->Bsum[j])); } } Pbits += FLINT_BIT_COUNT(number) + 1; /* includes one bit for sign */ if (Abits <= FLINT_BITS - 2 && Bbits <= FLINT_BITS - 2) { if (Pbits <= FLINT_BITS) { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong1((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = fmpz_mpoly_append_array_sm1_LEX( (base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->mults, base->nvars - 1, base->array_size, base->Pl - base->perm[Pi] - 1); } else if (Pbits <= 2*FLINT_BITS) { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong2((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = fmpz_mpoly_append_array_sm2_LEX( (base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->mults, base->nvars - 1, base->array_size, base->Pl - base->perm[Pi] - 1); } else { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = fmpz_mpoly_append_array_sm3_LEX( (base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->mults, base->nvars - 1, base->array_size, base->Pl - base->perm[Pi] - 1); } } else { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_fmpz((fmpz *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = fmpz_mpoly_append_array_fmpz_LEX( (base->Pchunks + base->perm[Pi])->poly, 0, (fmpz *)coeff_array, base->mults, base->nvars - 1, base->array_size, base->Pl - base->perm[Pi] - 1); } #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif } TMP_END; } void _fmpz_mpoly_mul_array_chunked_threaded_LEX( fmpz_mpoly_t P, const fmpz_mpoly_t A, const fmpz_mpoly_t B, const ulong * mults, const fmpz_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong nvars = ctx->minfo->nvars; slong Pi, i, j, Plen, Pl, Al, Bl, array_size; slong * Asum, * Amax, * Bsum, * Bmax; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; _base_t base; _worker_arg_struct * args; _chunk_struct * Pchunks; slong * perm; TMP_INIT; array_size = 1; for (i = 0; i < nvars - 1; i++) { array_size *= mults[i]; } /* compute lengths of poly2 and poly3 in chunks */ Al = 1 + (slong) (A->exps[0] >> (A->bits*(nvars - 1))); Bl = 1 + (slong) (B->exps[0] >> (B->bits*(nvars - 1))); TMP_START; /* compute indices and lengths of coefficients of polys in main variable */ Amain = (slong *) TMP_ALLOC(((Al + 1) + Al + Al + (Bl + 1) + Bl + Bl)*sizeof(slong)); Asum = Amain + Al + 1; Amax = Asum + Al; Bmain = Amax + Al; Bsum = Bmain + Bl + 1; Bmax = Bsum + Bl; Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong)); Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong)); mpoly_main_variable_split_LEX(Amain, Apexp, A->exps, Al, A->length, mults, nvars - 1, A->bits); mpoly_main_variable_split_LEX(Bmain, Bpexp, B->exps, Bl, B->length, mults, nvars - 1, B->bits); /* work out bit counts for each chunk */ for (i = 0; i < Al; i++) { _fmpz_vec_sum_max_bits(&Asum[i], &Amax[i], A->coeffs + Amain[i], Amain[i + 1] - Amain[i]); } for (j = 0; j < Bl; j++) { _fmpz_vec_sum_max_bits(&Bsum[j], &Bmax[j], B->coeffs + Bmain[j], Bmain[j + 1] - Bmain[j]); } Pl = Al + Bl - 1; /* work out data for each chunk of the output */ Pchunks = (_chunk_struct *) TMP_ALLOC(Pl*sizeof(_chunk_struct)); perm = (slong *) TMP_ALLOC(Pl*sizeof(slong)); for (Pi = 0; Pi < Pl; Pi++) { fmpz_mpoly_init3((Pchunks + Pi)->poly, 8, P->bits, ctx); (Pchunks + Pi)->work = 0; perm[Pi] = Pi; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j < Bl) { (Pchunks + Pi)->work += (Amain[i + 1] - Amain[i]) *(Bmain[j + 1] - Bmain[j]); } } } for (i = 0; i < Pl; i++) { for (j = i; j > 0 && (Pchunks + perm[j - 1])->work < (Pchunks + perm[j])->work; j--) { slong t = perm[j - 1]; perm[j - 1] = perm[j]; perm[j] = t; } } base->nthreads = num_handles + 1; base->Al = Al; base->Bl = Bl; base->Pl = Pl; base->Amax = Amax; base->Bmax = Bmax; base->Asum = Asum; base->Bsum = Bsum; base->Acoeffs = A->coeffs; base->Amain = Amain; base->Apexp = Apexp; base->Bcoeffs = B->coeffs; base->Bmain = Bmain; base->Bpexp = Bpexp; base->idx = 0; base->perm = perm; base->nvars = nvars; base->Pchunks = Pchunks; base->array_size = array_size; base->mults = mults; args = (_worker_arg_struct *) TMP_ALLOC(base->nthreads *sizeof(_worker_arg_struct)); #if FLINT_USES_PTHREAD pthread_mutex_init(&base->mutex, NULL); #endif for (i = 0; i < num_handles; i++) { args[i].idx = i; args[i].base = base; thread_pool_wake(global_thread_pool, handles[i], 0, _fmpz_mpoly_mul_array_threaded_worker_LEX, &args[i]); } i = num_handles; args[i].idx = i; args[i].base = base; _fmpz_mpoly_mul_array_threaded_worker_LEX(&args[i]); for (i = 0; i < num_handles; i++) { thread_pool_wait(global_thread_pool, handles[i]); } #if FLINT_USES_PTHREAD pthread_mutex_destroy(&base->mutex); #endif /* join answers */ Plen = 0; for (Pi = 0; Pi < Pl; Pi++) { _fmpz_mpoly_fit_length(&P->coeffs, &P->exps, &P->alloc, Plen + (Pchunks + Pi)->len, 1); for (i = 0; i < (Pchunks + Pi)->len; i++) { P->exps[Plen] = (Pchunks + Pi)->poly->exps[i]; fmpz_swap(P->coeffs + Plen, (Pchunks + Pi)->poly->coeffs + i); fmpz_clear((Pchunks + Pi)->poly->coeffs + i); Plen++; } flint_free((Pchunks + Pi)->poly->coeffs); flint_free((Pchunks + Pi)->poly->exps); } _fmpz_mpoly_set_length(P, Plen, ctx); flint_free(Apexp); flint_free(Bpexp); TMP_END; } int _fmpz_mpoly_mul_array_threaded_pool_LEX( fmpz_mpoly_t A, const fmpz_mpoly_t B, fmpz * maxBfields, const fmpz_mpoly_t C, fmpz * maxCfields, const fmpz_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong i, exp_bits, array_size; ulong max, * mults; int success; TMP_INIT; FLINT_ASSERT(B->length != 0); FLINT_ASSERT(C->length != 0); FLINT_ASSERT(ctx->minfo->ord == ORD_LEX); FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo)); FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo)); TMP_START; /* compute maximum exponents for each variable */ mults = (ulong *) TMP_ALLOC(ctx->minfo->nfields*sizeof(ulong)); /* the field of index n-1 is the one that wil be pulled out */ i = ctx->minfo->nfields - 1; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); max = mults[i]; if (((slong) mults[i]) <= 0 || mults[i] > MAX_LEX_SIZE) { success = 0; goto cleanup; } /* the fields of index n-2...0, contribute to the array size */ array_size = WORD(1); for (i--; i >= 0; i--) { ulong hi; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); mults[i] = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); max |= mults[i]; umul_ppmm(hi, array_size, array_size, mults[i]); if (hi != WORD(0) || (array_size | (slong) mults[i]) <= 0 || array_size > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } } exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(max) + 1); exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo); /* array multiplication assumes result fits into 1 word */ if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo)) { success = 0; goto cleanup; } /* handle aliasing and do array multiplication */ if (A == B || A == C) { fmpz_mpoly_t T; fmpz_mpoly_init3(T, B->length + C->length, exp_bits, ctx); _fmpz_mpoly_mul_array_chunked_threaded_LEX(T, C, B, mults, ctx, handles, num_handles); fmpz_mpoly_swap(T, A, ctx); fmpz_mpoly_clear(T, ctx); } else { fmpz_mpoly_fit_length_reset_bits(A, B->length + C->length, exp_bits, ctx); _fmpz_mpoly_mul_array_chunked_threaded_LEX(A, C, B, mults, ctx, handles, num_handles); } success = 1; cleanup: TMP_END; return success; } /***************************** DEGLEX and DEGREVLEX *****************************/ void _fmpz_mpoly_mul_array_threaded_worker_DEG(void * varg) { slong i, j, Pi; _worker_arg_struct * arg = (_worker_arg_struct *) varg; _base_struct * base = arg->base; ulong * coeff_array; slong (* upack_sm1)(fmpz_mpoly_t, slong, ulong *, slong, slong, slong); slong (* upack_sm2)(fmpz_mpoly_t, slong, ulong *, slong, slong, slong); slong (* upack_sm3)(fmpz_mpoly_t, slong, ulong *, slong, slong, slong); slong (* upack_fmpz)(fmpz_mpoly_t, slong, fmpz *, slong, slong, slong); TMP_INIT; upack_sm1 = &fmpz_mpoly_append_array_sm1_DEGLEX; upack_sm2 = &fmpz_mpoly_append_array_sm2_DEGLEX; upack_sm3 = &fmpz_mpoly_append_array_sm3_DEGLEX; upack_fmpz = &fmpz_mpoly_append_array_fmpz_DEGLEX; if (base->rev) { upack_sm1 = &fmpz_mpoly_append_array_sm1_DEGREVLEX; upack_sm2 = &fmpz_mpoly_append_array_sm2_DEGREVLEX; upack_sm3 = &fmpz_mpoly_append_array_sm3_DEGREVLEX; upack_fmpz = &fmpz_mpoly_append_array_fmpz_DEGREVLEX; } TMP_START; coeff_array = (ulong *) TMP_ALLOC(3*base->array_size*sizeof(ulong)); for (j = 0; j < 3*base->array_size; j++) coeff_array[j] = 0; #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif while (Pi < base->Pl) { /* work out bit counts for this chunk */ slong Abits = 0; slong Bbits = 0; slong Pbits = 0; slong number = 0; for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { number++; Abits = FLINT_MAX(Abits, base->Amax[i]); Bbits = FLINT_MAX(Bbits, base->Bmax[j]); Pbits = FLINT_MAX(Pbits, FLINT_MIN(base->Asum[i] + base->Bmax[j], base->Amax[i] + base->Bsum[j])); } } Pbits += FLINT_BIT_COUNT(number) + 1; /* includes one bit for sign */ if (Abits <= FLINT_BITS - 2 && Bbits <= FLINT_BITS - 2) { if (Pbits <= FLINT_BITS) { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong1((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = upack_sm1((base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->Pl - base->perm[Pi] - 1, base->nvars, base->degb); } else if (Pbits <= 2*FLINT_BITS) { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong2((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = upack_sm2((base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->Pl - base->perm[Pi] - 1, base->nvars, base->degb); } else { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_slong((ulong *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = upack_sm3((base->Pchunks + base->perm[Pi])->poly, 0, (ulong *)coeff_array, base->Pl - base->perm[Pi] - 1, base->nvars, base->degb); } } else { for (i = 0, j = base->perm[Pi]; i < base->Al && j >= 0; i++, j--) { if (j < base->Bl) { _fmpz_mpoly_addmul_array1_fmpz((fmpz *)coeff_array, base->Acoeffs + base->Amain[i], base->Apexp + base->Amain[i], base->Amain[i + 1] - base->Amain[i], base->Bcoeffs + base->Bmain[j], base->Bpexp + base->Bmain[j], base->Bmain[j + 1] - base->Bmain[j]); } } (base->Pchunks + base->perm[Pi])->len = upack_fmpz((base->Pchunks + base->perm[Pi])->poly, 0, (fmpz *)coeff_array, base->Pl - base->perm[Pi] - 1, base->nvars, base->degb); } #if FLINT_USES_PTHREAD pthread_mutex_lock(&base->mutex); #endif Pi = base->idx; base->idx = Pi + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(&base->mutex); #endif } TMP_END; } void _fmpz_mpoly_mul_array_chunked_threaded_DEG( fmpz_mpoly_t P, const fmpz_mpoly_t A, const fmpz_mpoly_t B, ulong degb, const fmpz_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong nvars = ctx->minfo->nvars; slong Pi, i, j, Plen, Pl, Al, Bl, array_size; slong * Asum, * Amax, * Bsum, * Bmax; slong * Amain, * Bmain; ulong * Apexp, * Bpexp; _base_t base; _worker_arg_struct * args; _chunk_struct * Pchunks; slong * perm; TMP_INIT; /* compute lengths of poly2 and poly3 in chunks */ Al = 1 + (slong) (A->exps[0] >> (A->bits*nvars)); Bl = 1 + (slong) (B->exps[0] >> (B->bits*nvars)); array_size = 1; for (i = 0; i < nvars-1; i++) { array_size *= degb; } TMP_START; /* compute indices and lengths of coefficients of polys in main variable */ Amain = (slong *) TMP_ALLOC(((Al + 1) + Al + Al + (Bl + 1) + Bl + Bl)*sizeof(slong)); Asum = Amain + Al + 1; Amax = Asum + Al; Bmain = Amax + Al; Bsum = Bmain + Bl + 1; Bmax = Bsum + Bl; Apexp = (ulong *) flint_malloc(A->length*sizeof(ulong)); Bpexp = (ulong *) flint_malloc(B->length*sizeof(ulong)); mpoly_main_variable_split_DEG(Amain, Apexp, A->exps, Al, A->length, degb, nvars, A->bits); mpoly_main_variable_split_DEG(Bmain, Bpexp, B->exps, Bl, B->length, degb, nvars, B->bits); /* work out bit counts for each chunk */ for (i = 0; i < Al; i++) { _fmpz_vec_sum_max_bits(&Asum[i], &Amax[i], A->coeffs + Amain[i], Amain[i + 1] - Amain[i]); } for (j = 0; j < Bl; j++) { _fmpz_vec_sum_max_bits(&Bsum[j], &Bmax[j], B->coeffs + Bmain[j], Bmain[j + 1] - Bmain[j]); } Pl = Al + Bl - 1; FLINT_ASSERT(Pl == degb); /* work out data for each chunk of the output */ Pchunks = (_chunk_struct *) TMP_ALLOC(Pl*sizeof(_chunk_struct)); perm = (slong *) TMP_ALLOC(Pl*sizeof(slong)); for (Pi = 0; Pi < Pl; Pi++) { fmpz_mpoly_init3((Pchunks + Pi)->poly, 8, P->bits, ctx); (Pchunks + Pi)->work = 0; perm[Pi] = Pi; for (i = 0, j = Pi; i < Al && j >= 0; i++, j--) { if (j < Bl) { (Pchunks + Pi)->work += (Amain[i + 1] - Amain[i]) *(Bmain[j + 1] - Bmain[j]); } } } for (i = 0; i < Pl; i++) { for (j = i; j > 0 && (Pchunks + perm[j-1])->work < (Pchunks + perm[j])->work; j--) { slong t = perm[j - 1]; perm[j - 1] = perm[j]; perm[j] = t; } } base->nthreads = num_handles + 1; base->Al = Al; base->Bl = Bl; base->Pl = Pl; base->Amax = Amax; base->Bmax = Bmax; base->Asum = Asum; base->Bsum = Bsum; base->Acoeffs = A->coeffs; base->Amain = Amain; base->Apexp = Apexp; base->Bcoeffs = B->coeffs; base->Bmain = Bmain; base->Bpexp = Bpexp; base->idx = 0; base->perm = perm; base->nvars = nvars; base->Pchunks = Pchunks; base->array_size = array_size; base->degb = degb; base->rev = (ctx->minfo->ord == ORD_DEGREVLEX); args = (_worker_arg_struct *) TMP_ALLOC(base->nthreads *sizeof(_worker_arg_struct)); #if FLINT_USES_PTHREAD pthread_mutex_init(&base->mutex, NULL); #endif for (i = 0; i < num_handles; i++) { args[i].idx = i; args[i].base = base; thread_pool_wake(global_thread_pool, handles[i], 0, _fmpz_mpoly_mul_array_threaded_worker_DEG, &args[i]); } i = num_handles; args[i].idx = i; args[i].base = base; _fmpz_mpoly_mul_array_threaded_worker_DEG(&args[i]); for (i = 0; i < num_handles; i++) { thread_pool_wait(global_thread_pool, handles[i]); } #if FLINT_USES_PTHREAD pthread_mutex_destroy(&base->mutex); #endif /* join answers */ Plen = 0; for (Pi = 0; Pi < Pl; Pi++) { _fmpz_mpoly_fit_length(&P->coeffs, &P->exps, &P->alloc, Plen + (Pchunks + Pi)->len, 1); for (i = 0; i < (Pchunks + Pi)->len; i++) { P->exps[Plen] = (Pchunks + Pi)->poly->exps[i]; fmpz_swap(P->coeffs + Plen, (Pchunks + Pi)->poly->coeffs + i); fmpz_clear((Pchunks + Pi)->poly->coeffs + i); Plen++; } flint_free((Pchunks + Pi)->poly->coeffs); flint_free((Pchunks + Pi)->poly->exps); } _fmpz_mpoly_set_length(P, Plen, ctx); flint_free(Apexp); flint_free(Bpexp); TMP_END; } int _fmpz_mpoly_mul_array_threaded_pool_DEG( fmpz_mpoly_t A, const fmpz_mpoly_t B, fmpz * maxBfields, const fmpz_mpoly_t C, fmpz * maxCfields, const fmpz_mpoly_ctx_t ctx, const thread_pool_handle * handles, slong num_handles) { slong i, exp_bits, array_size; ulong deg; int success; FLINT_ASSERT(B->length != 0); FLINT_ASSERT(C->length != 0); FLINT_ASSERT( ctx->minfo->ord == ORD_DEGREVLEX || ctx->minfo->ord == ORD_DEGLEX); FLINT_ASSERT(1 == mpoly_words_per_exp(B->bits, ctx->minfo)); FLINT_ASSERT(1 == mpoly_words_per_exp(C->bits, ctx->minfo)); /* the field of index n-1 is the one that wil be pulled out */ i = ctx->minfo->nfields - 1; FLINT_ASSERT(fmpz_fits_si(maxBfields + i)); FLINT_ASSERT(fmpz_fits_si(maxCfields + i)); deg = 1 + fmpz_get_ui(maxBfields + i) + fmpz_get_ui(maxCfields + i); if (((slong) deg) <= 0 || deg > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } /* the fields of index n-2...1, contribute to the array size */ array_size = WORD(1); for (i--; i >= 1; i--) { ulong hi; umul_ppmm(hi, array_size, array_size, deg); if (hi != WORD(0) || array_size <= 0 || array_size > MAX_ARRAY_SIZE) { success = 0; goto cleanup; } } exp_bits = FLINT_MAX(MPOLY_MIN_BITS, FLINT_BIT_COUNT(deg) + 1); exp_bits = mpoly_fix_bits(exp_bits, ctx->minfo); /* array multiplication assumes result fit into 1 word */ if (1 != mpoly_words_per_exp(exp_bits, ctx->minfo)) { success = 0; goto cleanup; } /* handle aliasing and do array multiplication */ if (A == B || A == C) { fmpz_mpoly_t T; fmpz_mpoly_init3(T, B->length + C->length, exp_bits, ctx); _fmpz_mpoly_mul_array_chunked_threaded_DEG(T, C, B, deg, ctx, handles, num_handles); fmpz_mpoly_swap(T, A, ctx); fmpz_mpoly_clear(T, ctx); } else { fmpz_mpoly_fit_length_reset_bits(A, B->length + C->length, exp_bits, ctx); _fmpz_mpoly_mul_array_chunked_threaded_DEG(A, C, B, deg, ctx, handles, num_handles); } success = 1; cleanup: return success; } int fmpz_mpoly_mul_array_threaded( fmpz_mpoly_t A, const fmpz_mpoly_t B, const fmpz_mpoly_t C, const fmpz_mpoly_ctx_t ctx) { slong i; int success; fmpz * maxBfields, * maxCfields; thread_pool_handle * handles; slong num_handles; slong thread_limit = FLINT_MIN(A->length, B->length)/16; TMP_INIT; if (B->length == 0 || C->length == 0) { fmpz_mpoly_zero(A, ctx); return 1; } if (1 != mpoly_words_per_exp(B->bits, ctx->minfo) || 1 != mpoly_words_per_exp(C->bits, ctx->minfo)) { return 0; } TMP_START; maxBfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz)); maxCfields = (fmpz *) TMP_ALLOC(ctx->minfo->nfields*sizeof(fmpz)); for (i = 0; i < ctx->minfo->nfields; i++) { fmpz_init(maxBfields + i); fmpz_init(maxCfields + i); } mpoly_max_fields_fmpz(maxBfields, B->exps, B->length, B->bits, ctx->minfo); mpoly_max_fields_fmpz(maxCfields, C->exps, C->length, C->bits, ctx->minfo); num_handles = flint_request_threads(&handles, thread_limit); switch (ctx->minfo->ord) { case ORD_LEX: { success = _fmpz_mpoly_mul_array_threaded_pool_LEX(A, B, maxBfields, C, maxCfields, ctx, handles, num_handles); break; } case ORD_DEGREVLEX: case ORD_DEGLEX: { success = _fmpz_mpoly_mul_array_threaded_pool_DEG(A, B, maxBfields, C, maxCfields, ctx, handles, num_handles); break; } default: { success = 0; break; } } flint_give_back_threads(handles, num_handles); for (i = 0; i < ctx->minfo->nfields; i++) { fmpz_clear(maxBfields + i); fmpz_clear(maxCfields + i); } TMP_END; return success; }