/* Copyright (C) 2011 Fredrik Johansson Copyright (C) 2012 Lina Kulakova Copyright (C) 2014 Martin Lee Copyright (C) 2020 William Hart This file is part of FLINT. FLINT is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License (LGPL) as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. See . */ #include #include "flint.h" #include "fmpz_vec.h" #include "fmpz_mod_poly.h" #include "fmpz_mod_mat.h" #include "ulong_extras.h" #include "thread_support.h" typedef struct { fmpz_mod_poly_struct * res; fmpz_mod_mat_struct * C; const fmpz * h; const fmpz * poly; const fmpz * polyinv; const fmpz * p; fmpz * t; volatile slong * j; slong k; slong m; slong len; slong leninv; slong len2; #if FLINT_USES_PTHREAD pthread_mutex_t * mutex; #endif } compose_vec_arg_t; void _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_worker(void * arg_ptr) { compose_vec_arg_t arg = *((compose_vec_arg_t *) arg_ptr); slong i, j, k = arg.k, n = arg.len - 1; slong len = arg.len, leninv = arg.leninv; fmpz * t = arg.t; const fmpz * h = arg.h; const fmpz * poly = arg.poly; const fmpz * polyinv = arg.polyinv; fmpz_mod_poly_struct * res = arg.res; fmpz_mat_struct * C = arg.C->mat; const fmpz * p = arg.p; while (1) { #if FLINT_USES_PTHREAD pthread_mutex_lock(arg.mutex); #endif j = *arg.j; *arg.j = j + 1; #if FLINT_USES_PTHREAD pthread_mutex_unlock(arg.mutex); #endif if (j >= arg.len2) return; _fmpz_vec_set(res[j].coeffs, C->rows[(j + 1)*k - 1], n); if (n == 1) /* special case, constant polynomials */ { for (i = 2; i <= k; i++) { fmpz_mul(t + 0, res[j].coeffs + 0, h + 0); fmpz_add(res[j].coeffs + 0, t + 0, C->rows[(j + 1)*k - i] + 0); fmpz_mod(res[j].coeffs + 0, res[j].coeffs + 0, p); } } else { for (i = 2; i <= k; i++) { _fmpz_mod_poly_mulmod_preinv(t, res[j].coeffs, n, h, n, poly, len, polyinv, leninv, p); _fmpz_mod_poly_add(res[j].coeffs, t, n, C->rows[(j + 1)*k - i], n, p); } } } } void _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded_pool(fmpz_mod_poly_struct * res, const fmpz_mod_poly_struct * polys, slong lenpolys, slong l, const fmpz * g, slong glen, const fmpz * poly, slong len, const fmpz * polyinv, slong leninv, const fmpz_t p, thread_pool_handle * threads, slong num_threads) { fmpz_mod_mat_t A, B, C; slong i, j, n, m, k, len2 = l, len1, shared_j = 0; fmpz * h; compose_vec_arg_t * args; #if FLINT_USES_PTHREAD pthread_mutex_t mutex; #endif n = len - 1; m = n_sqrt(n*len2) + 1; h = _fmpz_vec_init(n); k = len/m + 1; fmpz_mod_mat_init(A, m, n, p); fmpz_mod_mat_init(B, k*len2, m, p); fmpz_mod_mat_init(C, k*len2, n, p); /* Set rows of B to the segments of polys */ for (j = 0; j < len2; j++) { len1 = polys[j].length; for (i = 0; i < len1 / m; i++) _fmpz_vec_set(B->mat->rows[i + j*k], polys[j].coeffs + i*m, m); _fmpz_vec_set(B->mat->rows[i + j*k], polys[j].coeffs + i*m, len1 % m); } /* Set rows of A to powers of last element of polys */ _fmpz_mod_poly_powers_mod_preinv_threaded_pool(A->mat->rows, g, glen, m, poly, len, polyinv, leninv, p, threads, num_threads); _fmpz_mod_mat_mul_classical_threaded_pool_op(C, NULL, B, A, 0, threads, num_threads); /* Evaluate block composition using the Horner scheme */ if (n == 1) { fmpz_mul(h + 0, A->mat->rows[m - 1] + 0, A->mat->rows[1] + 0); fmpz_mod(h + 0, h + 0, p); } else { _fmpz_mod_poly_mulmod_preinv(h, A->mat->rows[m - 1], n, A->mat->rows[1], n, poly, len, polyinv, leninv, p); } args = (compose_vec_arg_t *) flint_malloc(sizeof(compose_vec_arg_t)*(num_threads + 1)); for (i = 0; i < num_threads + 1; i++) { args[i].res = res; args[i].C = C; args[i].h = h; args[i].k = k; args[i].m = m; args[i].j = &shared_j; args[i].poly = poly; args[i].t = _fmpz_vec_init(len); args[i].len = len; args[i].polyinv = polyinv; args[i].leninv = leninv; args[i].p = p; args[i].len2 = len2; #if FLINT_USES_PTHREAD args[i].mutex = &mutex; #endif } #if FLINT_USES_PTHREAD pthread_mutex_init(&mutex, NULL); #endif for (i = 0; i < num_threads; i++) { thread_pool_wake(global_thread_pool, threads[i], 0, _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_worker, &args[i]); } _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_worker(&args[num_threads]); for (i = 0; i < num_threads; i++) { thread_pool_wait(global_thread_pool, threads[i]); } #if FLINT_USES_PTHREAD pthread_mutex_destroy(&mutex); #endif for (i = 0; i < num_threads + 1; i++) _fmpz_vec_clear(args[i].t, len); flint_free(args); _fmpz_vec_clear(h, n); fmpz_mod_mat_clear(A); fmpz_mod_mat_clear(B); fmpz_mod_mat_clear(C); } void fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded_pool(fmpz_mod_poly_struct * res, const fmpz_mod_poly_struct * polys, slong len1, slong n, const fmpz_mod_poly_t g, const fmpz_mod_poly_t poly, const fmpz_mod_poly_t polyinv, const fmpz_mod_ctx_t ctx, thread_pool_handle * threads, slong num_threads) { slong len2 = poly->length; slong i; if (n == 0) return; if (len2 == 1) { for (i = 0; i < n; i++) fmpz_mod_poly_zero(res + i, ctx); } if (len2 == 2) { for (i = 0; i < n; i++) fmpz_mod_poly_set(res + i, polys + i, ctx); return; } for (i = 0; i < n; i++) { fmpz_mod_poly_fit_length(res + i, len2 - 1, ctx); _fmpz_mod_poly_set_length(res + i, len2 - 1); } _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded_pool(res, polys, len1, n, g->coeffs, g->length, poly->coeffs, len2, polyinv->coeffs, polyinv->length, fmpz_mod_ctx_modulus(ctx), threads, num_threads); for (i = 0; i < n; i++) _fmpz_mod_poly_normalise(res + i); } void fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded(fmpz_mod_poly_struct * res, const fmpz_mod_poly_struct * polys, slong len1, slong n, const fmpz_mod_poly_t g, const fmpz_mod_poly_t poly, const fmpz_mod_poly_t polyinv, const fmpz_mod_ctx_t ctx) { slong i, len2 = poly->length; thread_pool_handle * threads; slong num_threads; for (i = 0; i < len1; i++) { if (polys[i].length >= len2) { flint_printf ("Exception (fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded)." "The degree of the first polynomial must be smaller than that of the " " modulus\n"); flint_abort(); } } if (n > len1) { flint_printf ("Exception (fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded)." "n is larger than the length of polys\n"); flint_abort(); } if (n == 0) return; if (len2 == 1) { for (i = 0; i < n; i++) fmpz_mod_poly_zero(res + i, ctx); return; } if (len2 == 2) { for (i = 0; i < n; i++) fmpz_mod_poly_set(res + i, polys + i, ctx); return; } for (i = 0; i < n; i++) { fmpz_mod_poly_fit_length(res + i, len2 - 1, ctx); _fmpz_mod_poly_set_length(res + i, len2 - 1); } num_threads = flint_request_threads(&threads, flint_get_num_threads()); _fmpz_mod_poly_compose_mod_brent_kung_vec_preinv_threaded_pool(res, polys, len1, n, g->coeffs, g->length, poly->coeffs, len2, polyinv->coeffs, polyinv->length, fmpz_mod_ctx_modulus(ctx), threads, num_threads); flint_give_back_threads(threads, num_threads); for (i = 0; i < n; i++) _fmpz_mod_poly_normalise(res + i); }