/*
    Copyright (C) 2018 Fredrik Johansson

    This file is part of Arb.

    Arb is free software: you can redistribute it and/or modify it under
    the terms of the GNU Lesser General Public License (LGPL) as published
    by the Free Software Foundation; either version 2.1 of the License, or
    (at your option) any later version.  See <http://www.gnu.org/licenses/>.
*/

#include "arb_mat.h"

/* Block size for better cache locality. */
#define BLOCK_SIZE 32

/* Don't convert to doubles when smaller than this block size. */
#define MIN_D_BLOCK_SIZE 5

/* With doubles, we can have an exponent range of about 1024, minus some
   slack for accumulated sums. */
#define DOUBLE_MAX_OFFSET 900

static __inline__ double
dot8(const double * A, const double * B)
{
    return ((A[0] * B[0] + A[1] * B[1]) + (A[2] * B[2] + A[3] * B[3])) +
           ((A[4] * B[4] + A[5] * B[5]) + (A[6] * B[6] + A[7] * B[7]));
}

/* Upper bound of matrix product, assuming nonnegative entries and no
   overflow/underflow. B is pre-transposed. Straightforward blocked
   implementation; could use BLAS, but this matrix product is rarely
   going to be the bottleneck. */
static void
_d_mat_addmul(double * C, const double * A, const double * B,
    slong ar, slong ac, slong bc)
{
    slong ii, jj, kk, i, j, k;
    double t, eps;

    eps = ldexp(1.0, -52);

    for (ii = 0; ii < ar; ii += BLOCK_SIZE)
    {
        for (jj = 0; jj < bc; jj += BLOCK_SIZE)
        {
            for (kk = 0; kk < ac; kk += BLOCK_SIZE)
            {
                for (i = ii; i < FLINT_MIN(ii + BLOCK_SIZE, ar); i++)
                {
                    for (j = jj; j < FLINT_MIN(jj + BLOCK_SIZE, bc); j++)
                    {
                        if (BLOCK_SIZE == 32 && kk + BLOCK_SIZE <= ac)
                        {
                            double t0, t1, t2, t3;

                            t0 = dot8(A + i * ac + kk + 0, B + j * ac + kk + 0);
                            t1 = dot8(A + i * ac + kk + 8, B + j * ac + kk + 8);
                            t2 = dot8(A + i * ac + kk + 16, B + j * ac + kk + 16);
                            t3 = dot8(A + i * ac + kk + 24, B + j * ac + kk + 24);

                            t = (t0 + t1) + (t2 + t3);
                        }
                        else
                        {
                            t = 0.0;

                            for (k = kk; k < FLINT_MIN(kk + BLOCK_SIZE, ac); k++)
                                t += A[i * ac + k] * B[j * ac + k];
                        }

                        C[i * bc + j] += t;
                    }
                }
            }
        }
    }

    /* Compensate for possible rounding errors */
    for (i = 0; i < ar; i++)
        for (j = 0; j < bc; j++)
            C[i * bc + j] *= (1.0 + 2.01 * (ac + 1) * eps);
}
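
/* A rough justification for the compensation factor above (a sketch, not a
   formal proof): with nonnegative entries, each of the at most ac
   multiplications and ac additions contributing to an entry of C rounds
   with relative error at most eps = 2^-52, so the computed value is at
   least (1 - eps)^(2*ac) times the exact one. Since
   (1 - eps)^(2*ac) * (1 + 2.01*(ac+1)*eps) > 1 for any dimension ac that
   fits in memory, scaling up by 1 + 2.01*(ac+1)*eps, with slack left over
   for the rounding of the scaling itself, yields an upper bound. */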

/* We use WORD_MIN to represent the exponent of zero here. */
static __inline__ slong
_mag_get_exp(const mag_t x)
{
    if (mag_is_special(x))
        return WORD_MIN;
    else
        return MAG_EXP(x);
}

/* Fixed-point conversion: returns x / 2^e as a double. */
static double
mag_get_d_fixed_si(const mag_t x, slong e)
{
    return ldexp(MAG_MAN(x), MAG_EXP(x) - e - MAG_BITS);
}

void
_arb_mat_addmul_rad_mag_fast(arb_mat_t C, mag_srcptr A, mag_srcptr B,
    slong ar, slong ac, slong bc)
{
    slong i, j, k, M, N, P, top, n, block_start, block_end;
    slong *A_min, *A_max, *B_min, *B_max, max_offset;
    double *CC, *AA, *BB;

    M = ar;
    N = ac;
    P = bc;

    /* todo: could use TMP_ALLOC */
    A_min = flint_malloc(sizeof(slong) * M);
    A_max = flint_malloc(sizeof(slong) * M);
    B_min = flint_malloc(sizeof(slong) * P);
    B_max = flint_malloc(sizeof(slong) * P);

    AA = flint_malloc(ar * ac * sizeof(double));
    BB = flint_malloc(ac * bc * sizeof(double));
    CC = flint_malloc(ar * bc * sizeof(double));

    max_offset = DOUBLE_MAX_OFFSET;

    block_start = 0;

    while (block_start < N)
    {
        /* the block is [block_start, block_end); block_end is exclusive */
        block_end = block_start + 1;

        /* begin with this column of A and row of B */
        for (i = 0; i < M; i++)
            A_max[i] = A_min[i] = _mag_get_exp(A + i * N + block_start);

        for (i = 0; i < P; i++)
            B_max[i] = B_min[i] = _mag_get_exp(B + i * N + block_start);

        while (block_end < N)
        {
            /* check if we can extend with column [block_end] of A */
            for (i = 0; i < M; i++)
            {
                top = _mag_get_exp(A + i * N + block_end);

                /* zeros are irrelevant */
                if (top == WORD_MIN || A_max[i] == WORD_MIN)
                    continue;

                /* jump will be too big */
                if (top > A_min[i] + max_offset || top < A_max[i] - max_offset)
                    goto mblocks_built;
            }

            /* check if we can extend with row [block_end] of B */
            for (i = 0; i < P; i++)
            {
                top = _mag_get_exp(B + i * N + block_end);

                if (top == WORD_MIN || B_max[i] == WORD_MIN)
                    continue;

                if (top > B_min[i] + max_offset || top < B_max[i] - max_offset)
                    goto mblocks_built;
            }

            /* second pass to update the extreme values */
            for (i = 0; i < M; i++)
            {
                top = _mag_get_exp(A + i * N + block_end);

                if (A_max[i] == WORD_MIN)
                {
                    A_max[i] = top;
                    A_min[i] = top;
                }
                else if (top != WORD_MIN)
                {
                    if (top < A_min[i]) A_min[i] = top;
                    if (top > A_max[i]) A_max[i] = top;
                }
            }

            for (i = 0; i < P; i++)
            {
                top = _mag_get_exp(B + i * N + block_end);

                if (B_max[i] == WORD_MIN)
                {
                    B_max[i] = top;
                    B_min[i] = top;
                }
                else if (top != WORD_MIN)
                {
                    if (top < B_min[i]) B_min[i] = top;
                    if (top > B_max[i]) B_max[i] = top;
                }
            }

            block_end++;
        }

    mblocks_built:

        n = block_end - block_start;

        if (n <= MIN_D_BLOCK_SIZE)
        {
            /* increment so we don't just do steps of 1 in degenerate cases */
            block_end = FLINT_MIN(block_start + MIN_D_BLOCK_SIZE, N);
            n = block_end - block_start;

            for (i = 0; i < ar; i++)
            {
                for (j = 0; j < bc; j++)
                {
                    for (k = 0; k < n; k++)
                    {
                        mag_fast_addmul(arb_radref(arb_mat_entry(C, i, j)),
                            A + i * ac + block_start + k,
                            B + j * ac + block_start + k);
                    }
                }
            }
        }
        else
        {
            for (i = 0; i < ar; i++)
            {
                if (A_min[i] == WORD_MIN)   /* only zeros in this row */
                    continue;

                A_min[i] = (A_min[i] + A_max[i]) / 2;

                for (j = 0; j < n; j++)
                    AA[i * n + j] = mag_get_d_fixed_si(
                        A + i * ac + block_start + j, A_min[i]);
            }

            /* Note: B and BB are both transposed in memory */
            for (i = 0; i < bc; i++)
            {
                if (B_min[i] == WORD_MIN)   /* only zeros in this column */
                    continue;

                B_min[i] = (B_min[i] + B_max[i]) / 2;

                for (j = 0; j < n; j++)
                    BB[i * n + j] = mag_get_d_fixed_si(
                        B + i * ac + block_start + j, B_min[i]);
            }

            for (i = 0; i < ar * bc; i++)
                CC[i] = 0.0;

            _d_mat_addmul(CC, AA, BB, ar, n, bc);

            for (i = 0; i < ar; i++)
            {
                if (A_min[i] == WORD_MIN)
                    continue;

                for (j = 0; j < bc; j++)
                {
                    if (B_min[j] == WORD_MIN)
                        continue;

                    if (CC[i * bc + j] != 0.0)
                    {
                        mag_t t;
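                        /* CC was scaled up at the end of _d_mat_addmul, so
                           it already upper-bounds the exact block product;
                           adding back the combined base exponent
                           A_min[i] + B_min[j] converts it to a valid upper
                           bound as a mag_t. */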
                        MAG_SET_D_2EXP(MAG_MAN(t), MAG_EXP(t),
                            CC[i * bc + j], A_min[i] + B_min[j]);

                        mag_add(arb_radref(arb_mat_entry(C, i, j)),
                            arb_radref(arb_mat_entry(C, i, j)), t);
                    }
                }
            }
        }

        block_start = block_end;
    }

    flint_free(A_max);
    flint_free(A_min);
    flint_free(B_max);
    flint_free(B_min);

    flint_free(AA);
    flint_free(BB);
    flint_free(CC);
}
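
/* Usage sketch (illustrative; this is an internal helper, and the exact
   caller shown here is an assumption). A is given as ar x ac magnitudes in
   row-major order, and B as bc x ac magnitudes, i.e. pre-transposed so that
   row j of B holds column j of the logical right-hand factor. With
   hypothetical arrays rad_A and rad_B holding the entrywise radii, a call

       _arb_mat_addmul_rad_mag_fast(C, rad_A, rad_B, ar, ac, bc);

   adds entrywise upper bounds for the magnitude matrix product to the radii
   of C, leaving the midpoints of C untouched, which is what a block-based
   arb_mat multiplication needs from its radius pass. */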