/*
    Copyright (C) 2020 Daniel Schultz

    This file is part of FLINT.

    FLINT is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <https://www.gnu.org/licenses/>.
*/

#include "nmod_mpoly_factor.h"
#include "fq_nmod_mpoly_factor.h"


static void n_bpoly_reverse_gens(n_bpoly_t a, const n_bpoly_t b)
{
    slong i, j;
    n_bpoly_zero(a);
    for (i = 0; i < b->length; i++)
    {
        const n_fq_poly_struct * bi = b->coeffs + i;
        for (j = 0; j < bi->length; j++)
        {
            n_bpoly_set_coeff(a, j, i, bi->coeffs[j]);
        }
    }
}

static void n_bpoly_mod_make_monic_series(
    n_bpoly_t A,
    const n_bpoly_t B,
    slong order,
    nmod_t ctx)
{
    slong i;
    n_poly_t lcinv;

    FLINT_ASSERT(A != B);
    FLINT_ASSERT(B->length > 0);
    FLINT_ASSERT(n_bpoly_mod_is_canonical(B, ctx));

    n_poly_init(lcinv);
    n_poly_mod_inv_series(lcinv, B->coeffs + B->length - 1, order, ctx);

    n_bpoly_fit_length(A, B->length);
    for (i = 0; i < B->length; i++)
        n_poly_mod_mullow(A->coeffs + i, B->coeffs + i, lcinv, order, ctx);

    A->length = B->length;
    n_bpoly_normalise(A);

    n_poly_clear(lcinv);
}


static void _n_bpoly_set_poly_gen0(
    n_bpoly_t A,
    const mp_limb_t * Bcoeffs, slong Blength)
{
    slong i;
    n_bpoly_fit_length(A, Blength);
    A->length = Blength;
    for (i = 0; i < Blength; i++)
        n_poly_set_ui(A->coeffs + i, Bcoeffs[i]);
}


static void n_bpoly_mod_eval(
    nmod_poly_t E,
    const n_bpoly_t A,
    mp_limb_t alpha,
    nmod_t ctx)
{
    slong i;
    n_poly_t alphapow;

    nmod_poly_zero(E);

    if (alpha == 0)
    {
        for (i = A->length - 1; i >= 0; i--)
            nmod_poly_set_coeff_ui(E, i, n_poly_get_coeff(A->coeffs + i, 0));
        return;
    }

    n_poly_init2(alphapow, 2);
    alphapow->length = 2;
    alphapow->coeffs[0] = 1;
    alphapow->coeffs[1] = alpha;

    for (i = A->length - 1; i >= 0; i--)
        nmod_poly_set_coeff_ui(E, i, n_poly_mod_eval_pow(A->coeffs + i, alphapow, ctx));

    n_poly_clear(alphapow);
}


/****************** lifting **************************************************/

static void _hensel_lift_fac(
    n_bpoly_t G,
    n_bpoly_t H,
    const n_bpoly_t f,
    n_bpoly_t g,
    n_bpoly_t h,
    const n_bpoly_t a,
    const n_bpoly_t b,
    slong p0,
    slong p1,
    nmod_t ctx)
{
    slong i, j;
    n_bpoly_t c, t1, t2, q, r;

    n_bpoly_init(c);
    n_bpoly_init(t1);
    n_bpoly_init(t2);
    n_bpoly_init(q);
    n_bpoly_init(r);

    n_bpoly_mod_mul(t1, g, h, ctx);
    n_bpoly_mod_sub(c, f, t1, ctx);
    for (i = 0; i < c->length; i++)
    {
        for (j = 0; j < p0; j++)
            FLINT_ASSERT(n_poly_get_coeff(c->coeffs + i, j) == 0);
        n_poly_shift_right(c->coeffs + i, c->coeffs + i, p0);
        n_poly_truncate(c->coeffs + i, p1);
    }

    n_bpoly_mod_mul_series(t1, c, b, p1, ctx);
    n_bpoly_mod_divrem_series(q, r, t1, g, p1, ctx);
    for (i = 0; i < r->length; i++)
        n_poly_shift_left(r->coeffs + i, r->coeffs + i, p0);
    for (i = 0; i < g->length; i++)
        n_poly_truncate(g->coeffs + i, p0);
    n_bpoly_mod_add(t1, r, g, ctx);

    n_bpoly_mod_mul_series(t2, c, a, p1, ctx);
    n_bpoly_mod_divrem_series(q, r, t2, h, p1, ctx);
    for (i = 0; i < r->length; i++)
        n_poly_shift_left(r->coeffs + i, r->coeffs + i, p0);
    for (i = 0; i < h->length; i++)
        n_poly_truncate(h->coeffs + i, p0);
    n_bpoly_mod_add(t2, r, h, ctx);

    n_bpoly_swap(G, t1);
    n_bpoly_swap(H, t2);

#if FLINT_WANT_ASSERT
    n_bpoly_mod_mul(t1, G, H, ctx);
    n_bpoly_mod_sub(c, f, t1, ctx);
    for (i = 0; i < c->length; i++)
        for (j = 0; j < p0 + p1; j++)
            FLINT_ASSERT(n_poly_get_coeff(c->coeffs + i, j) == 0);
#endif

    n_bpoly_clear(c);
    n_bpoly_clear(t1);
    n_bpoly_clear(t2);
    n_bpoly_clear(q);
    n_bpoly_clear(r);
}

static void _hensel_lift_inv(
    n_bpoly_t A,
    n_bpoly_t B,
    const n_bpoly_t G,
    const n_bpoly_t H,
    n_bpoly_t a,
    n_bpoly_t b,
    slong p0,
    slong p1,
    nmod_t ctx)
{
    slong i, j;
    n_bpoly_t c, t1, t2, q, r;

    n_bpoly_init(c);
    n_bpoly_init(t1);
    n_bpoly_init(t2);
    n_bpoly_init(q);
    n_bpoly_init(r);

    for (i = 0; i < a->length; i++)
        n_poly_truncate(a->coeffs + i, p0);
    for (i = 0; i < b->length; i++)
        n_poly_truncate(b->coeffs + i, p0);

    n_bpoly_mod_mul(t1, G, a, ctx);
    n_bpoly_mod_mul(t2, H, b, ctx);
    n_bpoly_mod_add(c, t1, t2, ctx);

    FLINT_ASSERT(c->length > 0);
    for (i = 0; i < c->length; i++)
        n_poly_mod_neg(c->coeffs + i, c->coeffs + i, ctx);
    n_poly_mod_add_ui(c->coeffs + 0, c->coeffs + 0, 1, ctx);
    n_bpoly_normalise(c);

    for (i = 0; i < c->length; i++)
    {
        for (j = 0; j < p0; j++)
            FLINT_ASSERT(n_poly_get_coeff(c->coeffs + i, j) == 0);
        n_poly_shift_right(c->coeffs + i, c->coeffs + i, p0);
        n_poly_truncate(c->coeffs + i, p1);
    }

    n_bpoly_mod_mul_series(t1, c, b, p1, ctx);
    n_bpoly_mod_divrem_series(q, r, t1, G, p1, ctx);
    for (i = 0; i < r->length; i++)
        n_poly_shift_left(r->coeffs + i, r->coeffs + i, p0);
    n_bpoly_mod_add(t1, r, b, ctx);

    n_bpoly_mod_mul_series(t2, c, a, p1, ctx);
    n_bpoly_mod_divrem_series(q, r, t2, H, p1, ctx);
    for (i = 0; i < r->length; i++)
        n_poly_shift_left(r->coeffs + i, r->coeffs + i, p0);
    n_bpoly_mod_add(t2, r, a, ctx);

    n_bpoly_swap(t1, B);
    n_bpoly_swap(t2, A);

#if FLINT_WANT_ASSERT
    n_bpoly_mod_mul(t1, G, A, ctx);
    n_bpoly_mod_mul(t2, H, B, ctx);
    n_bpoly_mod_add(c, t1, t2, ctx);

    FLINT_ASSERT(c->length > 0);
    for (i = 0; i < c->length; i++)
        n_poly_mod_neg(c->coeffs + i, c->coeffs + i, ctx);
    n_poly_mod_add_ui(c->coeffs + 0, c->coeffs + 0, 1, ctx);
    n_bpoly_normalise(c);

    for (i = 0; i < c->length; i++)
        for (j = 0; j < p0 + p1; j++)
            FLINT_ASSERT(n_poly_get_coeff(c->coeffs + i, j) == 0);
#endif

    n_bpoly_clear(c);
    n_bpoly_clear(t1);
    n_bpoly_clear(t2);
    n_bpoly_clear(q);
    n_bpoly_clear(r); 
}

static void _hensel_lift_tree(
    int opt,
    slong * link,
    n_bpoly_struct * v,
    n_bpoly_struct * w,
    const n_bpoly_t f,
    slong j,
    slong p0,
    slong p1,
    nmod_t ctx)
{
    FLINT_ASSERT(p1 <= p0);

    if (j < 0)
        return;

    if (opt >= 0)
        _hensel_lift_fac(v + j, v + j + 1,
                           f, v + j, v + j + 1, w + j, w + j + 1, p0, p1, ctx);

    if (opt <= 0)
        _hensel_lift_inv(w + j, w + j + 1,
                              v + j, v + j + 1, w + j, w + j + 1, p0, p1, ctx);

    _hensel_lift_tree(opt, link, v, w, v + j, link[j], p0, p1, ctx);
    _hensel_lift_tree(opt, link, v, w, v + j + 1, link[j + 1], p0, p1, ctx);
}

typedef struct {
    slong * link;
    n_bpoly_struct ** lifted_fac;
    n_tpoly_t tmp;
    n_bpoly_t bmp;
    slong r;
    slong fac_lift_order;
    slong inv_lift_order;
    nmod_eval_interp_t E;
    int Eok;
    int use_linear;
} n_bpoly_mod_lift_struct;

typedef n_bpoly_mod_lift_struct n_bpoly_mod_lift_t[1];

static void n_bpoly_mod_lift_init(n_bpoly_mod_lift_t L)
{
    L->link = NULL;
    L->lifted_fac = NULL;
    n_tpoly_init(L->tmp);
    n_bpoly_init(L->bmp);
    L->r = 0;
    L->fac_lift_order = 0;
    L->inv_lift_order = 0;
    nmod_eval_interp_init(L->E);
}

static void n_bpoly_mod_lift_clear(n_bpoly_mod_lift_t L)
{
    flint_free(L->link);
    flint_free(L->lifted_fac);
    n_tpoly_clear(L->tmp);
    n_bpoly_clear(L->bmp);
    nmod_eval_interp_clear(L->E);
}


static void _n_bpoly_mod_lift_build_tree(
    n_bpoly_mod_lift_t L,
    n_bpoly_struct * local_facs,
    slong r,
    const n_bpoly_t monicA,
    nmod_t ctx)
{
    slong i, j;
    n_bpoly_struct * v, * w;
    slong e[FLINT_BITS+1];
    slong * link;
    nmod_poly_t d, g, h, a, b;

    nmod_poly_init_mod(d, ctx);
    nmod_poly_init_mod(g, ctx);
    nmod_poly_init_mod(h, ctx);
    nmod_poly_init_mod(a, ctx);
    nmod_poly_init_mod(b, ctx);

    FLINT_ASSERT(r > 1);

    L->link = (slong *) flint_realloc(L->link, (2*r - 2)*sizeof(slong));
    link = L->link;

    n_tpoly_clear(L->tmp);
    n_tpoly_init(L->tmp);
    n_tpoly_fit_length(L->tmp, 2*(2*r - 2));
    v = L->tmp->coeffs + 0;
    w = L->tmp->coeffs + (2*r - 2);

    for (i = 0; i < r; i++)
    {
        n_bpoly_swap(v + i, local_facs + i);
        link[i] = -i - 1;
    }

    for (i = r, j = 0; j < 2*r - 4; i++, j += 2)
    {
        slong s, minp, mind;

        minp = j;
        mind = n_bpoly_degree0(v + j);
        for (s = j + 1; s < i; s++)
        {
            if (n_bpoly_degree0(v + s) < mind)
            {
                minp = s;
                mind = n_bpoly_degree0(v + s);
            }
        }
        n_bpoly_swap(v + j, v + minp);
        SLONG_SWAP(link[j], link[minp]);

        minp = j + 1;
        mind = n_bpoly_degree0(v + j + 1);
        for (s = j + 2; s < i; s++)
        {
            if (n_bpoly_degree0(v + s) < mind)
            {
                minp = s;
                mind = n_bpoly_degree0(v + s);
            }
        }
        n_bpoly_swap(v + j + 1, v + minp);
        SLONG_SWAP(link[j + 1], link[minp]);

        n_bpoly_mod_mul_series(v + i, v + j, v + j + 1, L->fac_lift_order, ctx);
        link[i] = j;
    }

    for (j = 0; j < 2*r - 2; j += 2)
    {
        n_bpoly_mod_eval(g, v + j, 0, ctx);
        n_bpoly_mod_eval(h, v + j + 1, 0, ctx);
        nmod_poly_xgcd(d, a, b, g, h);
        if (!nmod_poly_is_one(d))
            flint_throw(FLINT_IMPINV, "n_bpoly_mod_lift: bad inverse");
        _n_bpoly_set_poly_gen0(w + j, a->coeffs, a->length);
        _n_bpoly_set_poly_gen0(w + j + 1, b->coeffs, b->length);
    }

    nmod_poly_clear(d);
    nmod_poly_clear(g);
    nmod_poly_clear(h);
    nmod_poly_clear(a);
    nmod_poly_clear(b);

    for (i = 0; i < 2*r - 2; i++)
        if (-L->link[i] - 1 >= 0)
            L->lifted_fac[-L->link[i] - 1] = v + i;

    for (i = 0, e[i] = L->inv_lift_order; e[i] > 1; i++)
        e[i+1] = (e[i] + 1)/2;
    e[i] = 1;

    for (i--; i >= 0; i--)
        _hensel_lift_tree(-1, L->link, v, w, monicA, 2*r-4, e[i+1], e[i]-e[i+1], ctx);
}

static void _n_bpoly_mod_lift_build_steps(n_bpoly_mod_lift_t L, nmod_t ctx)
{
    slong i, j, k;
    slong r = L->r;
    slong order = L->fac_lift_order;
    n_bpoly_struct * A, * Bfinal, * U, * Ue, * Be, * B;
    n_poly_struct * s, * Binv;
    n_poly_struct * c, * t;

    FLINT_ASSERT(L->tmp->alloc >= 4*r + 1);
    A = L->tmp->coeffs;
    Bfinal = A + 1;
    U = Ue = Bfinal + r;
    B = U + r;
    Be = B + r;

    FLINT_ASSERT(L->bmp->alloc >= 2*r + 5);
    s = L->bmp->coeffs;
    Binv = s + r;
    c = Binv + r;
    t = c + 1;

    for (k = 0; k < r; k++)
    {
        /* s[k] = (prod_{i!=k} B[i].coeffs[0])^-1 (mod B[k].coeffs[0]) */
        n_poly_mod_div(t, A->coeffs + 0, B[k].coeffs + 0, ctx);
        if (!n_poly_mod_invmod(s + k, t, B[k].coeffs + 0, ctx))
            flint_throw(FLINT_IMPINV, "n_bpoly_mod_lift: bad inverse");

        /* set up mul (mod B[k].coeffs[0]) */
        n_poly_reverse(t, B[k].coeffs + 0, B[k].coeffs[0].length);
        n_poly_mod_inv_series(Binv + k, t, B[k].coeffs[0].length, ctx);

        if (L->Eok)
        {
            n_bpoly_fit_length(Be + k, order);
            for (i = 0; i < order; i++)
                nmod_eval_interp_from_coeffs_poly(Be[k].coeffs + i,
                                               B[k].coeffs + i, L->E, ctx);
        }
    }

    /* U[0], U[r-1] are not used, zero out both U and Ue */
    for (k = 1; k < r - 1; k++)
    {
        n_bpoly_fit_length(U + k, order);
        for (i = U[k].length; i < order; i++)
            U[k].coeffs[i].length = 0;
        U[k].length = order;
    }

    if (r > 2 && L->Eok)
    {
        slong len = nmod_eval_interp_eval_length(L->E);

        for (j = 0; j < order; j++)
        {
            k = r - 2;
            nmod_evals_zero(Ue[k].coeffs + j);
            for (i = 0; i <= j; i++)
                nmod_evals_addmul(Ue[k].coeffs + j, Be[k].coeffs + i,
                                       Be[k + 1].coeffs + j - i, len, ctx);
            for (k--; k > 0; k--)
            {
                nmod_evals_zero(Ue[k].coeffs + j);
                for (i = 0; i <= j; i++)
                    nmod_evals_addmul(Ue[k].coeffs + j, Be[k].coeffs + i,
                                       Ue[k + 1].coeffs + j - i, len, ctx);
            }
        }
    }
    else if (r > 2)
    {
        for (j = 0; j < order; j++)
        {
            k = r - 2;
            n_poly_zero(U[k].coeffs + j);
            for (i = 0; i <= j; i++)
            {
                if (i < B[k].length && j - i < B[k + 1].length)
                {
                    n_poly_mod_mul(t, B[k].coeffs + i, B[k + 1].coeffs + j - i, ctx);
                    n_poly_mod_add(U[k].coeffs + j, U[k].coeffs + j, t, ctx);
                }
            }
            for (k--; k > 0; k--)
            {
                n_poly_zero(U[k].coeffs + j);
                for (i = 0; i <= j; i++)
                {
                    if (i < B[k].length)
                    {
                        n_poly_mod_mul(t, B[k].coeffs + i, U[k + 1].coeffs + j - i, ctx);
                        n_poly_mod_add(U[k].coeffs + j, U[k].coeffs + j, t, ctx);
                    }
                }
            }
        }
    }
}


/* linear lifting has large memory requirements wrt r */
static int _use_linear_cutoff(slong r, slong degx)
{
    return r < 30 + 5*FLINT_BIT_COUNT(degx);
}

/* evaluation has even large memory requirements wrt r */
static int _try_eval_cutoff(slong r, slong degx)
{
    return r < 20 + 2*FLINT_BIT_COUNT(degx);
}

static void n_bpoly_mod_lift_start(
    n_bpoly_mod_lift_t L,
    nmod_poly_struct * local_facs,
    slong r,
    const n_bpoly_t monicA,
    nmod_t ctx)
{
    slong i, k;
    slong degx = n_bpoly_degree0(monicA);
    n_bpoly_struct * A, * Bfinal, * U, * Ue, * B;

    FLINT_ASSERT(r > 1);

    L->r = r;
    L->lifted_fac = (n_bpoly_struct **) flint_realloc(L->lifted_fac,
                                                   r*sizeof(n_bpoly_struct *));
    L->fac_lift_order = 1;
    L->inv_lift_order = 1;
    L->use_linear = _use_linear_cutoff(r, degx);

    if (!L->use_linear)
    {
        n_bpoly_struct * new_facs = FLINT_ARRAY_ALLOC(r, n_bpoly_struct);
        for (i = 0; i < r; i++)
        {
            n_bpoly_init(new_facs + i);
            _n_bpoly_set_poly_gen0(new_facs + i, local_facs[i].coeffs,
                                                 local_facs[i].length);
        }

        _n_bpoly_mod_lift_build_tree(L, new_facs, r, monicA, ctx);

        for (i = 0; i < r; i++)
            n_bpoly_clear(new_facs + i);
        flint_free(new_facs);
    }
    else
    {
        n_tpoly_fit_length(L->tmp, 4*r + 1);
        A = L->tmp->coeffs;
        Bfinal = A + 1;
        U = Ue = Bfinal + r;
        B = U + r;

        n_bpoly_fit_length(L->bmp, 2*r + 5);

        n_bpoly_fit_length(A, 1);
        A->length = 1;
        n_poly_one(A->coeffs + 0);
        for (k = 0; k < r; k++)
        {
            n_bpoly_fit_length(B + k, 1);
            B[k].length = 1;
            n_poly_set_nmod_poly(B[k].coeffs + 0, local_facs + k);
            n_poly_mod_mul(A->coeffs + 0, A->coeffs + 0, B[k].coeffs + 0, ctx);

            L->lifted_fac[k] = Bfinal + k;
            n_bpoly_reverse_gens(L->lifted_fac[k], B + k);

            U[k].length = 0;
        }

        L->Eok = _try_eval_cutoff(r, degx) &&
                 nmod_eval_interp_set_degree_modulus(L->E, degx, ctx);

        _n_bpoly_mod_lift_build_steps(L, ctx);
    }
}

/*
    assuming N is reduced
        combine the factors in L according to the rows of N
        and then replace N by an identity matrix
*/
void n_bpoly_mod_lift_combine(
    n_bpoly_mod_lift_t L,
    nmod_mat_t N,
    const n_bpoly_t monicA,
    nmod_t ctx)
{
    slong i, j, k, r, degx;
    n_bpoly_struct * A, * Bfinal, * U, * Ue, * B;
    slong oldr = L->r;
    slong newr = nmod_mat_nrows(N);
    slong order = L->fac_lift_order;
    n_bpoly_struct * new_facs;
    n_bpoly_t T;

    FLINT_ASSERT(newr > 1);
    FLINT_ASSERT(newr < oldr);
    FLINT_ASSERT(oldr == nmod_mat_ncols(N));
    FLINT_ASSERT(nmod_mat_is_reduced(N));

    /* on input we should have a factorization of monicA mod y^order */
#if FLINT_WANT_ASSERT
    {
        n_bpoly_t t1, t2;
        n_bpoly_init(t1);
        n_bpoly_init(t2);
        n_bpoly_set(t1, L->lifted_fac[0]);
        for (k = 1; k < L->r; k++)
        {
            n_bpoly_mod_mul_series(t2, t1, L->lifted_fac[k], order, ctx);
            n_bpoly_swap(t1, t2);
        }
        n_bpoly_mod_sub(t2, monicA, t1, ctx);
        for (i = 0; i < t2->length; i++)
            for (j = 0; j < order; j++)
                FLINT_ASSERT(0 == n_poly_get_coeff(t2->coeffs + i, j));
        n_bpoly_clear(t1);
        n_bpoly_clear(t2);
    }
#endif

    n_bpoly_init(T);
    new_facs = FLINT_ARRAY_ALLOC(newr, n_bpoly_struct);
    for (i = 0; i < newr; i++)
    {
        n_bpoly_init(new_facs + i);
        n_bpoly_one(new_facs + i);
        for (j = 0; j < oldr; j++)
        {
            if (nmod_mat_entry(N, i, j) == 0)
                continue;
            FLINT_ASSERT(nmod_mat_entry(N, i, j) == 1);
            n_bpoly_mod_mul_series(T, new_facs + i, L->lifted_fac[j], order, ctx);
            n_bpoly_swap(new_facs + i, T);
        }
    }

    L->r = r = newr;

    degx = n_bpoly_degree0(monicA);

    /* do not use quadratic lifting if we were not already */
    L->use_linear = L->use_linear || _use_linear_cutoff(r, degx);

    if (!L->use_linear)
    {
        _n_bpoly_mod_lift_build_tree(L, new_facs, newr, monicA, ctx);

        for (i = 0; i < newr; i++)
            n_bpoly_clear(new_facs + i);
        flint_free(new_facs);
        n_bpoly_clear(T);
    }
    else
    {
        if (!L->Eok && r < 20 + 2*FLINT_BIT_COUNT(degx))
            L->Eok = nmod_eval_interp_set_degree_modulus(L->E, degx, ctx);

        A = L->tmp->coeffs;
        Bfinal = A + 1;

        n_bpoly_swap(T, A);
        n_tpoly_clear(L->tmp);
        n_tpoly_init(L->tmp);
        n_tpoly_fit_length(L->tmp, 4*r + 1);
        A = L->tmp->coeffs;
        n_bpoly_swap(A, T);
        n_bpoly_clear(T);
        Bfinal = A + 1;
        U = Ue = Bfinal + r;
        B = U + r;

        n_bpoly_clear(L->bmp);
        n_bpoly_init(L->bmp);
        n_bpoly_fit_length(L->bmp, 2*r + 5);

        for (i = 0; i < newr; i++)
        {
            L->lifted_fac[i] = Bfinal + i;
            n_bpoly_swap(Bfinal + i, new_facs + i);
            n_bpoly_clear(new_facs + i);
        }
        flint_free(new_facs);

        for (k = 0; k < r; k++)
        {
            n_bpoly_reverse_gens(B + k, L->lifted_fac[k]);
            FLINT_ASSERT(B[k].length <= order);
            n_bpoly_fit_length(B + k, order);
            for (i = B[k].length; i < order; i++)
                n_poly_zero(B[k].coeffs + i);
        }

        _n_bpoly_mod_lift_build_steps(L, ctx);
    }

    nmod_mat_clear(N);
    nmod_mat_init(N, L->r, L->r, ctx.n);
    for (i = 0; i < L->r;  i++)
        nmod_mat_entry(N, i, i) = 1;

    /* on output we should have a factorization of monicA mod y^order */
#if FLINT_WANT_ASSERT
    {
        n_bpoly_t t1, t2;
        n_bpoly_init(t1);
        n_bpoly_init(t2);
        n_bpoly_set(t1, L->lifted_fac[0]);
        for (k = 1; k < L->r; k++)
        {
            n_bpoly_mod_mul_series(t2, t1, L->lifted_fac[k], order, ctx);
            n_bpoly_swap(t1, t2);
        }
        n_bpoly_mod_sub(t2, monicA, t1, ctx);
        for (i = 0; i < t2->length; i++)
            for (j = 0; j < FLINT_MIN(order, t2->coeffs[i].length); j++)
                FLINT_ASSERT(0 == t2->coeffs[i].coeffs[j]);
        n_bpoly_clear(t1);
        n_bpoly_clear(t2);
    }
#endif
}

static void n_bpoly_mod_lift_continue(
    n_bpoly_mod_lift_t L,
    const n_bpoly_t monicA,
    slong order,
    nmod_t ctx)
{
    slong i, j, k;
    slong r = L->r;
    n_bpoly_struct * A, * Bfinal, * U, * Ue, * Be, * B;
    n_poly_struct * s, * Binv;
    n_poly_struct * c, * t, * ce, * vk, * vek;

    if (order <= L->fac_lift_order)
        return;

    if (!L->use_linear)
    {
        n_bpoly_struct * v = L->tmp->coeffs + 0;
        n_bpoly_struct * w = L->tmp->coeffs + (2*r - 2);
        slong e[FLINT_BITS+1];

        FLINT_ASSERT(1 <= L->inv_lift_order);
        FLINT_ASSERT(L->fac_lift_order <= 2*L->inv_lift_order);

        for (i = 0, e[i] = order; e[i] > L->fac_lift_order; i++)
            e[i+1] = (e[i] + 1)/2;
        e[i]   = L->fac_lift_order;
        e[i+1] = L->inv_lift_order;

        if (e[i+1] < e[i])
            _hensel_lift_tree(-1, L->link, v, w, monicA, 2*r-4, e[i+1], e[i]-e[i+1], ctx);

        for (i--; i > 0; i--)
            _hensel_lift_tree(0, L->link, v, w, monicA, 2*r-4, e[i+1], e[i]-e[i+1], ctx);

        _hensel_lift_tree(1, L->link, v, w, monicA, 2*r-4, e[1], e[0]-e[1], ctx);

        L->fac_lift_order = e[0];
        L->inv_lift_order = e[1];
        return;
    }

    A = L->tmp->coeffs;
    Bfinal = A + 1;
    U = Ue = Bfinal + r;
    B = U + r;
    Be = B + r;

    s = L->bmp->coeffs;
    Binv = s + r;
    c = Binv + r;
    t = c + 1;
    ce = t + 1;
    vk = ce + 1;
    vek = vk + 1;

    /* tack on reversal of monicA */
    for (i = 0; i < monicA->length; i++)
    {
        n_poly_struct * Bi = monicA->coeffs + i;
        j = FLINT_MIN(Bi->length, order);
        for (j--; j >= L->fac_lift_order; j--)
            n_bpoly_set_coeff(A, j, i, Bi->coeffs[j]);
    }

    /* tack on zeros to the B[k] */
    for (k = 0; k < r; k++)
    {
        n_bpoly_fit_length(B + k, order);
        if (L->Eok)
            n_bpoly_fit_length(Be + k, order);

        for (i = B[k].length; i < order; i++)
        {
            n_poly_zero(B[k].coeffs + i);
            if (L->Eok)
                nmod_evals_zero(Be[k].coeffs + i);
        }

        /* U[0] is not used, zero out both U and Ue */
        if (k > 0)
        {
            n_bpoly_fit_length(U + k, order);
            for (i = U[k].length; i < order; i++)
                U[k].coeffs[i].length = 0;
            U[k].length = order;
        }
    }

    if (L->Eok && r > 2)
    {
        slong len = nmod_eval_interp_eval_length(L->E);

        for (j = L->fac_lift_order; j < order; j++)
        {
            k = r - 2;
            nmod_evals_zero(Ue[k].coeffs + j);
            for (i = 0; i <= j; i++)
                nmod_evals_addmul(Ue[k].coeffs + j, Be[k].coeffs + i,
                                              Be[k + 1].coeffs + j - i, len, ctx);
            for (k--; k > 0; k--)
            {
                nmod_evals_zero(Ue[k].coeffs + j);
                for (i = 0; i <= j; i++)
                    nmod_evals_addmul(Ue[k].coeffs + j, Be[k].coeffs + i,
                                              Ue[k + 1].coeffs + j - i, len, ctx);
            }

            nmod_evals_zero(ce);
            for (i = 0; i <= j; i++)
                nmod_evals_addmul(ce, Be[0].coeffs + i,
                                      Ue[1].coeffs + j - i, len, ctx);

            nmod_eval_interp_to_coeffs_poly(c, ce, L->E, ctx);

            if (j < A->length)
                n_poly_mod_sub(c, A->coeffs + j, c, ctx);
            else
                n_poly_mod_neg(c, c, ctx);

            if (n_poly_is_zero(c))
                continue;

            for (k = r - 1; k >= 0; k--)
            {
                n_poly_mod_rem(t, c, B[k].coeffs + 0, ctx);
                n_poly_mod_mulmod_preinv(vk, s + k, t, B[k].coeffs + 0, Binv + k, ctx);
                nmod_eval_interp_from_coeffs_poly(vek, vk, L->E, ctx);
                if (!n_poly_is_zero(vk))
                {
                    nmod_evals_add_inplace(Be[k].coeffs + j, vek, len, ctx);
                    n_poly_mod_add(B[k].coeffs + j, B[k].coeffs + j, vk, ctx);
                    if (!n_poly_is_zero(B[k].coeffs + j))
                        B[k].length = FLINT_MAX(B[k].length, j + 1);
                }

                /* correct the U's */
                if (k > r - 2)
                {
                    n_poly_swap(ce, vek);
                }
                else if (k > 0)
                {
                    n_poly_struct * p;
                    p = (k == r - 2) ? Be[k + 1].coeffs : Ue[k + 1].coeffs;
                    nmod_evals_fmma(ce, Be[k].coeffs + 0, ce, p, vek, len, ctx);
                    nmod_evals_add_inplace(Ue[k].coeffs + j, ce, len, ctx);
                }
            }
        }
    }
    else if (L->Eok)
    {
        slong len = nmod_eval_interp_eval_length(L->E);

        FLINT_ASSERT(r == 2);

        for (j = L->fac_lift_order; j < order; j++)
        {
            nmod_evals_zero(ce);
            for (i = 0; i <= j; i++)
                nmod_evals_addmul(ce, Be[0].coeffs + i,
                                      Be[1].coeffs + j - i, len, ctx);

            nmod_eval_interp_to_coeffs_poly(c, ce, L->E, ctx);

            if (j < A->length)
                n_poly_mod_sub(c, A->coeffs + j, c, ctx);
            else
                n_poly_mod_neg(c, c, ctx);

            if (n_poly_is_zero(c))
                continue;

            for (k = 0; k < r; k++)
            {
                n_poly_mod_rem(t, c, B[k].coeffs + 0, ctx);
                n_poly_mod_mulmod_preinv(vk, s + k, t, B[k].coeffs + 0, Binv + k, ctx);
                nmod_eval_interp_from_coeffs_poly(vek, vk, L->E, ctx);
                if (!n_poly_is_zero(vk))
                {
                    nmod_evals_add_inplace(Be[k].coeffs + j, vek, len, ctx);
                    n_poly_mod_add(B[k].coeffs + j, B[k].coeffs + j, vk, ctx);
                    if (!n_poly_is_zero(B[k].coeffs + j))
                        B[k].length = FLINT_MAX(B[k].length, j + 1);
                }
            }
        }
    }
    else if (r > 2)
    {
        for (j = L->fac_lift_order; j < order; j++)
        {
            k = r - 2;
            n_poly_zero(U[k].coeffs + j);
            for (i = 0; i <= j; i++)
            {
                if (i < B[k].length && j - i < B[k + 1].length)
                {
                    n_poly_mod_mul(t, B[k].coeffs + i, B[k + 1].coeffs + j - i, ctx);
                    n_poly_mod_add(U[k].coeffs + j, U[k].coeffs + j, t, ctx);
                }
            }
            for (k--; k > 0; k--)
            {
                n_poly_zero(U[k].coeffs + j);
                for (i = 0; i <= j; i++)
                {
                    if (i < B[k].length)
                    {
                        n_poly_mod_mul(t, B[k].coeffs + i, U[k + 1].coeffs + j - i, ctx);
                        n_poly_mod_add(U[k].coeffs + j, U[k].coeffs + j, t, ctx);
                    }
                }
            }

            if (j < A->length)
                n_poly_set(c, A->coeffs + j);
            else
                n_poly_zero(c);

            for (i = 0; i <= j; i++)
            {
                if (i < B[0].length)
                {
                    n_poly_mod_mul(t, B[0].coeffs + i, U[1].coeffs + j - i, ctx);
                    n_poly_mod_sub(c, c, t, ctx);
                }
            }

            if (n_poly_is_zero(c))
                continue;

            for (k = r - 1; k >= 0; k--)
            {
                n_poly_mod_rem(t, c, B[k].coeffs + 0, ctx);
                n_poly_mod_mulmod_preinv(vk, s + k, t, B[k].coeffs + 0, Binv + k, ctx);
                if (!n_poly_is_zero(vk))
                {
                    n_poly_mod_add(B[k].coeffs + j, B[k].coeffs + j, vk, ctx);
                    if (!n_poly_is_zero(B[k].coeffs + j))
                        B[k].length = FLINT_MAX(B[k].length, j + 1);
                }

                /* correct the U's */
                if (k > r - 2)
                {
                    n_poly_swap(ce, vk);
                }
                else if (k > 0)
                {
                    n_poly_struct * p;
                    n_poly_mod_mul(t, B[k].coeffs + 0, ce, ctx);
                    p = (k == r - 2) ? B[k + 1].coeffs : U[k + 1].coeffs;
                    n_poly_mod_mul(ce, p, vk, ctx);
                    n_poly_mod_add(ce, ce, t, ctx);
                    n_poly_mod_add(U[k].coeffs + j, U[k].coeffs + j, ce, ctx);
                }
            }
        }
    }
    else
    {
        FLINT_ASSERT(r == 2);

        for (j = L->fac_lift_order; j < order; j++)
        {
            if (j < A->length)
                n_poly_set(c, A->coeffs + j);
            else
                n_poly_zero(c);

            for (i = FLINT_MIN(j, B[0].length - 1); i >= 0; i--)
            {
                n_poly_mod_mul(t, B[0].coeffs + i, B[1].coeffs + j - i, ctx);
                n_poly_mod_sub(c, c, t, ctx);
            }

            if (n_poly_is_zero(c))
                continue;

            for (k = 0; k < r; k++)
            {
                n_poly_mod_rem(t, c, B[k].coeffs + 0, ctx);
                n_poly_mod_mulmod_preinv(vk, s + k, t, B[k].coeffs + 0, Binv + k, ctx);
                if (!n_poly_is_zero(vk))
                {
                    n_poly_mod_add(B[k].coeffs + j, B[k].coeffs + j, vk, ctx);
                    if (!n_poly_is_zero(B[k].coeffs + j))
                        B[k].length = FLINT_MAX(B[k].length, j + 1);
                }
            }
        }
    }

    L->fac_lift_order = order;

    for (k = 0; k < r; k++)
        n_bpoly_reverse_gens(Bfinal + k, B + k);

#if FLINT_WANT_ASSERT
    {
        n_bpoly_t t1, t2;
        n_bpoly_init(t1);
        n_bpoly_init(t2);
        n_bpoly_set(t1, Bfinal + 0);
        for (k = 1; k < r; k++)
        {
            n_bpoly_mod_mul_series(t2, t1, Bfinal + k, order, ctx);
            n_bpoly_swap(t1, t2);
        }
        n_bpoly_mod_sub(t2, monicA, t1, ctx);

        for (i = 0; i < t2->length; i++)
        {
            for (j = 0; j < FLINT_MIN(order, t2->coeffs[i].length); j++)
            {
                FLINT_ASSERT(0 == t2->coeffs[i].coeffs[j]);
            }
        }
        n_bpoly_clear(t1);
        n_bpoly_clear(t2);
    }
#endif
}


/************* lattice reduction ********************************************/

/*
    The rows of N, if they are 0-1, correspond to combinations of the g that
    give factors of A. Compute {A/g[i]*g[i]'}_i and use the CLD bounds to
    try to make N smaller.
*/
static void _lattice(
    nmod_mat_t N,
    n_bpoly_struct * const * g,
    slong r,
    slong lift_order,
    slong * CLD,
    slong * lattice_order,
    const n_bpoly_t A,
    nmod_t ctx)
{
    slong i, j, k;
    n_bpoly_t Q, R, dg;
    n_bpoly_struct * ld;
    nmod_mat_t M, T1, T2;
    int nlimbs;
    mp_limb_t * trow;

    nlimbs = _nmod_vec_dot_bound_limbs(r, ctx);
    trow = FLINT_ARRAY_ALLOC(r, mp_limb_t);
    n_bpoly_init(Q);
    n_bpoly_init(R);
    n_bpoly_init(dg);
    ld = FLINT_ARRAY_ALLOC(r, n_bpoly_struct);
    for (i = 0; i < r; i++)
    {
        n_bpoly_init(ld + i);
        n_bpoly_mod_divrem_series(Q, R, A, g[i], lift_order, ctx);
        FLINT_ASSERT(R->length == 0);
        n_bpoly_mod_derivative_gen0(R, g[i], ctx);
        n_bpoly_mod_mul_series(ld + i, Q, R, lift_order, ctx);
    }

    for (k = 0; k + 1 < A->length; k++)
    {
        slong nrows = nmod_mat_nrows(N);
        slong lower = FLINT_MAX(CLD[k], *lattice_order);

        FLINT_ASSERT(nrows > 0);

        /*
            consider powers y^j for which
            j >= lattice_order (j < lattice_order has already been added)
            j >= CLD[k]
            j < lift_order
        */

        if (lift_order <= lower)
            continue;

        nmod_mat_init(M, lift_order - lower, nrows, ctx.n);

        for (j = lower; j < lift_order; j++)
        {
            for (i = 0; i < r; i++)
                trow[i] = n_bpoly_get_coeff(ld + i, k, j);

            for (i = 0; i < nrows; i++)
                nmod_mat_entry(M, j - lower, i) =
                             _nmod_vec_dot(trow, N->rows[i], r, ctx, nlimbs);
        }

        nmod_mat_init_nullspace_tr(T1, M);

        nmod_mat_init(T2, nmod_mat_nrows(T1), nmod_mat_ncols(N), ctx.n);
        nmod_mat_mul(T2, T1, N);
        nmod_mat_swap(T2, N);
        nmod_mat_rref(N);

        nmod_mat_clear(M);
        nmod_mat_clear(T1);
        nmod_mat_clear(T2);
    }

    flint_free(trow);
    n_bpoly_clear(Q);
    n_bpoly_clear(R);
    n_bpoly_clear(dg);
    for (i = 0; i < r; i++)
        n_bpoly_clear(ld + i);
    flint_free(ld);

    *lattice_order = lift_order;
}


/**************** recombination **********************************************/

/*
    First multiply the g[i] into the gprod[i] according the the rows of N.
    Then, run zassenhaus on the gprod[i] as factors of A.
*/
static int _zassenhaus(
    const zassenhaus_prune_t zas,
    slong limit,
    n_tpoly_t F,
    mp_limb_t malpha,
    const nmod_mat_t N,
    n_bpoly_struct * const * g,
    slong r,
    slong order,
    const n_bpoly_t A,
    nmod_t ctx)
{
    slong total_deg;
    int success;
    n_bpoly_t Q, R, t1, t2;
    n_poly_t cont;
    slong i, j, k, len, nrows = nmod_mat_nrows(N);
    slong * subset;
    n_bpoly_struct * gprod;
    n_bpoly_struct * f;
    n_bpoly_t A_copy;
    int is_simple_check = (limit == 1 && r == nmod_mat_nrows(N));

    FLINT_ASSERT(nmod_mat_ncols(N) == r);

    n_poly_init(cont);
    n_bpoly_init(Q);
    n_bpoly_init(R);
    n_bpoly_init(t1);
    n_bpoly_init(t2);
    n_bpoly_init(A_copy);
    gprod = FLINT_ARRAY_ALLOC(nrows, n_bpoly_struct);
    subset = FLINT_ARRAY_ALLOC(nrows, slong);
    for (i = 0; i < nrows; i++)
    {
        subset[i] = i;
        n_bpoly_init(gprod + i);
        n_bpoly_one(gprod + i);
        for (j = 0; j < r; j++)
        {
            if (nmod_mat_entry(N, i, j) == 0)
                continue;
            FLINT_ASSERT(nmod_mat_entry(N, i, j) == 1);
            n_bpoly_mod_mul_series(t1, gprod + i, g[j], order, ctx);
            n_bpoly_swap(gprod + i, t1);
        }
    }

    f = (n_bpoly_struct *) A;

    len = nrows;
    for (k = 1; k <= len/2; k++)
    {
        if (k > limit)
        {
            success = 0;
            goto cleanup;
        }

        zassenhaus_subset_first(subset, len, k);
        while (1)
        {
            total_deg = 0;
            for (i = 0; i < len; i++)
            {
                if (subset[i] >= 0)
                    total_deg += gprod[subset[i]].length - 1;
            }

            if (!zassenhaus_prune_degree_is_possible(zas, total_deg))
            {
                if (!zassenhaus_subset_next(subset, len))
                    break;
                continue;
            }

            FLINT_ASSERT(f->length > 0);
            n_bpoly_set_poly_gen1(t1, f->coeffs + f->length - 1);
            for (i = 0; i < len; i++)
            {
                if (subset[i] >= 0)
                {
                    n_bpoly_mod_mul_series(t2, t1, gprod + subset[i], order, ctx);
                    n_bpoly_swap(t1, t2);
                }
            }

            n_bpoly_mod_make_primitive(cont, t1, ctx);
            if (n_bpoly_mod_divides(Q, f, t1, ctx))
            {
                n_bpoly_mod_taylor_shift_gen1(t1, t1, malpha, ctx);
                n_tpoly_fit_length(F, F->length + 1);
                n_bpoly_swap(F->coeffs + F->length, t1);
                F->length++;
                f = A_copy;
                n_bpoly_swap(f, Q);

                len -= k;
                if (!zassenhaus_subset_next_disjoint(subset, len + k))
                    break;
            }
            else if (is_simple_check)
            {
                success = 0;
                goto cleanup;
            }
            else
            {
                if (!zassenhaus_subset_next(subset, len))
                    break;
            }
        }
    }

    if (f->length > 1)
    {
        n_tpoly_fit_length(F, F->length + 1);
        n_bpoly_mod_taylor_shift_gen1(F->coeffs + F->length, f, malpha, ctx);
        F->length++;
    }
    else
    {
        FLINT_ASSERT(f->length == 1);
        FLINT_ASSERT(n_poly_is_one(f->coeffs + 0));
    }

    success = 1;

cleanup:

    for (i = 0; i < nrows; i++)
        n_bpoly_clear(gprod + i);
    flint_free(gprod);

    flint_free(subset);

    n_poly_clear(cont);
    n_bpoly_clear(Q);
    n_bpoly_clear(R);
    n_bpoly_clear(t1);
    n_bpoly_clear(t2);
    n_bpoly_clear(A_copy);

    return success;
}


/*****************************************************************************/

/*
    x = gen(0), y = gen(1).
    A is supposed to be separable wrt x.
    Put the content of A wrt x in c, and the factors in F.
    Return 1 for success, i.e. a good small prime (y - alpha) was found.
    If allow_shift is false, only (y - 0) is tried.

    TODO: copy this precision strategy to the other bpoly factorers

    The helpers n_bpoly_mod_lift_{start|continue|combine} are used:
        start:      start the lift mod y^0
        continue:   lift up to mod y^n
        combine:    when the lattice work has proven several factors to be
                    grouped together, combine these and start over with fewer
                    local factors. ex: if N = [1 1 0 1 0],
                                              [0 0 1 0 1]
                                       then combine the five factors f1,...,f5
                                       into two f1*f2*f4, f3*f5
*/
int n_bpoly_mod_factor_smprime(
    n_poly_t c,     /* poly in y */
    n_tpoly_t F,
    n_bpoly_t A,    /* clobbered */
    int allow_shift,
    nmod_t ctx)
{
    int success;
    slong i, r;
    slong Alenx, Aleny;
    slong final_order, lift_order, lattice_order;
    slong * CLD;
    nmod_poly_t Aeval;
    mp_limb_t alpha_best, alpha_tmp;
    nmod_poly_factor_t local_fac_best, local_fac_tmp;
    int local_fac_tries = 0;
    n_bpoly_t monicA;
    nmod_mat_t N;
    slong zas_limit;    
    zassenhaus_prune_t zas;
    n_bpoly_mod_lift_t L;

    n_bpoly_mod_make_primitive(c, A, ctx);

    Alenx = A->length;

    FLINT_ASSERT(Alenx > 1);

    nmod_poly_init_mod(Aeval, ctx);
    nmod_poly_factor_init(local_fac_best);
    nmod_poly_factor_init(local_fac_tmp);
    n_bpoly_init(monicA);
    nmod_mat_init(N, 0, 0, ctx.n);
    CLD = FLINT_ARRAY_ALLOC(Alenx, slong);
    zassenhaus_prune_init(zas);
    n_bpoly_mod_lift_init(L);

    Aleny = 0;
    for (i = 0; i < Alenx; i++)
    {
        Aleny = FLINT_MAX(Aleny, A->coeffs[i].length);
        CLD[i] = A->coeffs[i].length;
    }
    mpoly_bivar_cld_bounds(CLD, Alenx);

    zassenhaus_prune_set_degree(zas, Alenx - 1);

    alpha_tmp = 0;
    alpha_best = 0;
    goto got_alpha;

next_alpha:

    if (!allow_shift || alpha_tmp + 1 >= ctx.n)
    {
        if (local_fac_best->num > 0)
            goto doit;
        success = 0;
        goto cleanup;
    }

    alpha_tmp++;

got_alpha:

    n_bpoly_mod_eval(Aeval, A, alpha_tmp, ctx);

    /* if killed leading coeff, get new alpha */
    if (Aeval->length != Alenx)
        goto next_alpha;

    /* note the constant term of Aeval can be zero */

    local_fac_tmp->num = 0;
    nmod_poly_factor(local_fac_tmp, Aeval);
    r = local_fac_tmp->num;

    zassenhaus_prune_start_add_factors(zas);
    for (i = 0; i < r; i++)
        zassenhaus_prune_add_factor(zas, local_fac_tmp->p[i].length - 1,
                                         local_fac_tmp->exp[i]);
    zassenhaus_prune_end_add_factors(zas);

    if (r < 2 && local_fac_tmp->exp[0] == 1)
        goto irreducible;
    if (zassenhaus_prune_must_be_irreducible(zas))
        goto irreducible;

    /* if multiple factors, get new alpha */
    for (i = 0; i < r; i++)
    {
        FLINT_ASSERT(local_fac_tmp->p[i].length > 1);
        FLINT_ASSERT(local_fac_tmp->p[i].coeffs[local_fac_tmp->p[i].length - 1] == 1);
        if (local_fac_tmp->exp[i] != 1)
            goto next_alpha;
    }

    /* done if A is constant in y */
    if (Aleny < 2)
    {
        n_poly_t mock;
        n_tpoly_fit_length(F, r);
        F->length = r;
        for (i = 0; i < r; i++)
        {
            n_poly_mock(mock, local_fac_tmp->p + i);
            n_bpoly_set_poly_gen0(F->coeffs + i, mock);
        }
        success = 1;
        goto cleanup;
    }

    /* alpha_tmp & local_fac_tmp are good; update best */
    if (local_fac_best->num < 1 || local_fac_best->num > local_fac_tmp->num)
    {
        alpha_best = alpha_tmp;
        nmod_poly_factor_swap(local_fac_best, local_fac_tmp);
    }

    if (++local_fac_tries < 2)
        goto next_alpha;

doit:

    n_bpoly_mod_taylor_shift_gen1(A, A, alpha_best, ctx);

    /* local_fac_best is a factorization mod (y - alpha_best) */
    r = local_fac_best->num;

    /* precision for constructing true factors */
    final_order = Aleny;

    /* precision for lifted local factors */
    lift_order = Aleny;
    for (i = 0; i < Alenx - 1; i++)
        if (CLD[i] > 0 && lift_order > CLD[i])
            lift_order = CLD[i];
    lift_order = lift_order + 4;

    /* lift up to y^lift_order */
    n_bpoly_mod_make_monic_series(monicA, A, lift_order, ctx);
    n_bpoly_mod_lift_start(L, local_fac_best->p, r, monicA, ctx);
    n_bpoly_mod_lift_continue(L, monicA, lift_order, ctx);

    /* the rows of N give the combinations of local factors */    
    nmod_mat_clear(N);
    nmod_mat_init(N, r, r, ctx.n);
    for (i = 0; i < r; i++)
        nmod_mat_entry(N, i, i) = 1;

    /* size limit on subsets in zassenhaus combination */
    zas_limit = 1;

    lattice_order = 0;
    _lattice(N, L->lifted_fac, L->r, lift_order, CLD, &lattice_order, A, ctx);
    if (nmod_mat_nrows(N) < 2)
        goto irreducible_shift;
    if (!nmod_mat_is_reduced(N))
        goto increase;
    if (nmod_mat_nrows(N) < nmod_mat_ncols(N)/4*3)
        n_bpoly_mod_lift_combine(L, N, monicA, ctx);

try_zas:

    /* zassenhaus only make sense if N is a nice 0-1 mat */
    FLINT_ASSERT(nmod_mat_is_reduced(N));

    while (nmod_mat_nrows(N) > 2 && 2*L->fac_lift_order < final_order)
    {
        lift_order = 2*L->fac_lift_order;
        n_bpoly_mod_make_monic_series(monicA, A, lift_order, ctx);
        n_bpoly_mod_lift_continue(L, monicA, lift_order, ctx);
        _lattice(N, L->lifted_fac, L->r, lift_order, CLD, &lattice_order, A, ctx);
        if (nmod_mat_nrows(N) < 2)
            goto irreducible_shift;
        if (!nmod_mat_is_reduced(N))
            goto increase;
        if (nmod_mat_nrows(N) < nmod_mat_ncols(N)/4*3)
            n_bpoly_mod_lift_combine(L, N, monicA, ctx);
    }

    if (L->fac_lift_order < final_order)
    {
        lift_order = final_order;
        n_bpoly_mod_make_monic_series(monicA, A, lift_order, ctx);
        n_bpoly_mod_lift_continue(L, monicA, lift_order, ctx);
    }

    /* combine local factors according the rows of N, then by subsets */
    F->length = 0;
    success = _zassenhaus(zas, zas_limit, F, nmod_neg(alpha_best, ctx), N,
                                     L->lifted_fac, L->r, final_order, A, ctx);
    if (success)
        goto cleanup;

    /* first attempt failed, try subsets of size 1 or 2 from now on */
    zas_limit = 2;

more:

    /* increase precision until N is a nice 0-1 mat */
    _lattice(N, L->lifted_fac, L->r, lift_order, CLD, &lattice_order, A, ctx);
    if (nmod_mat_nrows(N) < 2)
        goto irreducible_shift;
    if (!nmod_mat_is_reduced(N))
        goto increase;
    if (nmod_mat_nrows(N) < nmod_mat_ncols(N)/4*3)
        n_bpoly_mod_lift_combine(L, N, monicA, ctx);
    goto try_zas;

increase:

    if (lift_order < final_order)
        lift_order += 4 + lift_order/2;
    else
        lift_order += 1 + lift_order/8;

    n_bpoly_mod_make_monic_series(monicA, A, lift_order, ctx);
    n_bpoly_mod_lift_continue(L, monicA, lift_order, ctx);
    goto more;

cleanup:

    n_bpoly_mod_lift_clear(L);

    flint_free(CLD);

    nmod_mat_clear(N);
    nmod_poly_clear(Aeval);
    nmod_poly_factor_clear(local_fac_best);
    nmod_poly_factor_clear(local_fac_tmp);
    n_bpoly_clear(monicA);

    zassenhaus_prune_clear(zas);

    return success;

irreducible_shift:

    n_bpoly_mod_taylor_shift_gen1(A, A, nmod_neg(alpha_best, ctx), ctx);

irreducible:

    n_tpoly_fit_length(F, 1);
    F->length = 1;
    n_bpoly_swap(F->coeffs + 0, A);
    success = 1;
    goto cleanup;
}