#include "ecm-gpu.h"

#ifdef WITH_GPU

#include "cudacommon.h"

#include "cgbn_stage1.h"


#define TWO32 4294967296 /* 2^32 */

/* Try to reduce all composite factors to primes.
 * This can be hard if factors overlap e.g. (a*b, a*c*d, b*c)
 */
void reducefactors (mpz_t *factors, int *array_found, unsigned int nb_curves)
{
  unsigned int i, j;
  unsigned int found;
  unsigned int updates;
  mpz_t gcd;
  mpz_init (gcd);

  found = 0;
  mpz_t *reduced = (mpz_t *) malloc (nb_curves * sizeof (mpz_t));
  ASSERT_ALWAYS (reduced != NULL);

  /* Add all unique factors to reduced */
  for (i = 0; i < nb_curves; i++)
    {
      if (array_found[i] == ECM_NO_FACTOR_FOUND)
        continue;

      /* Scan for match */
      updates = 0;
      for (j = 0; j < found; j++) {
          if (mpz_cmp (factors[i], reduced[j]) == 0) {
              updates = 1;
              break;
          }
      }
      if (!updates)
          mpz_init_set (reduced[found++], factors[i]);
    }

  do {
    outputf (OUTPUT_DEVVERBOSE, "GPU: Reducing %d factors\n", found);
    updates = 0;

    /* remove any trivial factor */
    for (i = 0; i < found; i++)
      {
        while (mpz_cmp_ui (reduced[i], 1) == 0) {
          found--;
          mpz_swap (reduced[i], reduced[found]);
          mpz_clear (reduced[found]);
          if (i == found)
              break;
        }
      }

    for (i = 0; i < found; i++)
      {
        /* Try to reduce an existing factor */
        for (j = i+1; j < found; j++)
          {
            /* if i == j remove reduced[j] */
            if (mpz_cmp (reduced[i], reduced[j]) == 0)
              {
                  updates += 1;
                  found--;
                  mpz_swap (reduced[j], reduced[found]);
                  mpz_clear (reduced[found]);
                  if (j == found)
                      break;
              }

            mpz_gcd (gcd, reduced[i], reduced[j]);
            if (mpz_cmp_ui (gcd, 1) > 0)
              {
                /* gcd(2*3, 2*3*5) remove 2*3 from F2 leaving 2*3 and 5 */
                if (mpz_cmp (gcd, reduced[i]) == 0)
                  {
                    updates += 1;
                    assert( mpz_divisible_p (reduced[j], gcd) );
                    mpz_divexact (reduced[j], reduced[j], gcd);
                  }
                /* gcd(2*3*5, 2*3) == 2*3 from F1 leaving 5 and 2*3 */
                else if (mpz_cmp (gcd, reduced[j]) == 0)
                  {
                    updates += 1;
                    assert( mpz_divisible_p (reduced[i], gcd) );
                    mpz_divexact (reduced[i], reduced[i], gcd);
                  }

                /* hard case gcd(2*3, 3*5) = 3, remove 3 from both, add 3 as new factor */
                else if (found < nb_curves)
                  {
                    updates += 1;
                    mpz_divexact (reduced[j], reduced[j], gcd);
                    mpz_divexact (reduced[i], reduced[i], gcd);

                    mpz_init (reduced[found]);
                    mpz_set (reduced[found], gcd);
                    found++;
                  }
              }
            if (mpz_cmp_ui (reduced[i], 1) == 0)
                break;
          }
      }
  } while (updates > 0);

  /* bubble_sort, fast enough because found < num_curves */
  do {
    updates = 0;
    for (j = 1; j < found; j++)
      {
        if (mpz_cmp(reduced[j-1], reduced[j]) > 0)
          {
            updates += 1;
            mpz_swap(reduced[j-1], reduced[j]);
          }
      }
  } while (updates > 0);

  outputf (OUTPUT_DEVVERBOSE, "GPU: Reduced to %d factors\n", found);
  /* write out reduced[i], update array_found */
  for (i = 0; i < found; i++)
    {
      mpz_swap(factors[i], reduced[i]);
      mpz_clear(reduced[i]);
      array_found[i] = ECM_FACTOR_FOUND_STEP1;
      outputf (OUTPUT_DEVVERBOSE, "GPU: Reduced factor %d: %Zd\n", i+1, factors[i]);
    }

  for (i = found; i < nb_curves; i++)
    array_found[i] = ECM_NO_FACTOR_FOUND;

  mpz_clear (gcd);
  free(reduced);
}


static void
A_from_sigma (mpz_t A, unsigned int sigma, mpz_t n)
{
  mpz_t tmp;
  int i;
  mpz_init_set_ui (tmp, sigma);
  /* Compute d = sigma/2^32 */
  for (i = 0; i < 32; i++)
    {
      if (mpz_tstbit (tmp, 0) == 1)
      mpz_add (tmp, tmp, n);
      mpz_div_2exp (tmp, tmp, 1);
    }
  mpz_mul_2exp (tmp, tmp, 2);           /* 4d */
  mpz_sub_ui (tmp, tmp, 2);             /* 4d-2 */

  mpz_set (A, tmp);

  mpz_clear (tmp);
}


int
gpu_ecm (mpz_t f, const ecm_params params, ecm_params mutable_params, mpz_t n, double B1)
{
  unsigned int i;
  int youpi = ECM_NO_FACTOR_FOUND;
  int factor_found = ECM_NO_FACTOR_FOUND;
  long st, st2;
  long tottime; /* at the end, total time in ms */
  unsigned int firstsigma_ui;
  float gputime = 0.0;
  mpz_t tmp_A;
  mpz_t *factors = NULL; /* Contains either a factor of n either end-of-stage-1
                         residue (depending of the value of array_found */
  int *array_found = NULL;
  /* Only for stage 2 */
  int base2 = 0;  /* If n is of form 2^n[+-]1, set base to [+-]n */
  int Fermat = 0; /* If base2 > 0 is a power of 2, set Fermat to base2 */
  int po2 = 0;    /* Whether we should use power-of-2 poly degree */
  /* Use only in stage 2 */
  mpmod_t modulus;
  curve P;
  mpz_t B2min, B2; /* Local B2, B2min to avoid changing caller's values */
  unsigned long dF;
  root_params_t root_params;
  unsigned int nb_curves = 0; /* Local copy of number of curves */

  /* This helps keep track of when params is changed in this function. */
  assert((void*) params == (void*) mutable_params); /* params != mutable params */

  ASSERT((-1 <= params->sigma_is_A) && (params->sigma_is_A <= 1));
  ASSERT((GMP_NUMB_BITS == 32) || (GMP_NUMB_BITS == 64));

  /* Set global VERBOSE to avoid the need to explicitly passing verbose */
  set_verbose (params->verbose);
  ECM_STDOUT = (params->os == NULL) ? stdout : params->os;
  ECM_STDERR = (params->es == NULL) ? stdout : params->es;

  /* Check that N is not too big */
  size_t max_bits = ECM_GPU_CGBN_MAX_BITS - 6;
  if (mpz_sizeinbase (n, 2) > max_bits)
    {
      outputf (OUTPUT_ERROR, "GPU: Error, input number should be stricly lower"
                             " than 2^%d\n", max_bits);
      return ECM_ERROR;
    }

  /* Only param = ECM_PARAM_BATCH_32BITS_D is accepted on GPU */
  if (params->param == ECM_PARAM_DEFAULT)
      mutable_params->param = ECM_PARAM_BATCH_32BITS_D;

  if (params->param != ECM_PARAM_BATCH_32BITS_D)
    {
      outputf (OUTPUT_ERROR, "GPU: Error, only param = ECM_PARAM_BATCH_32BITS_D "
                             "is accepted on GPU.\n");
      return ECM_ERROR;
    }

  if (params->go != NULL && mpz_cmp_ui (params->go, 1) > 0)
    {
      outputf (OUTPUT_ERROR, "GPU: Error, option -go is not allowed\n");
      return ECM_ERROR;
    }

  /* Cannot do resume on GPU */
  if (!ECM_IS_DEFAULT_B1_DONE(params->B1done) && params->B1done < B1)
    {
      outputf (OUTPUT_ERROR, "GPU: Error, cannot resume on GPU.\n");
      return ECM_ERROR;
    }

  /* Current code works only for sigma_is_A = 0 */
  if (params->sigma_is_A != 0)
    {
      outputf (OUTPUT_ERROR, "GPU: ERROR, sigma_is_A not allowed.\n");
      return ECM_ERROR;
    }

  /* check that repr == ECM_MOD_DEFAULT or ECM_MOD_BASE2 (only for stage 2) */
  if (params->repr != ECM_MOD_DEFAULT && params->repr != ECM_MOD_BASE2)
      outputf (OUTPUT_ERROR, "GPU: Warning, the value of repr will be ignored "
      "for step 1 on GPU.\n");

  /* It is only for stage 2, it is not taken into account for GPU code */
  if (mpmod_init (modulus, n, params->repr) != 0)
    return ECM_ERROR;

  /* See what kind of number we have as that may influence optimal parameter
     selection. Test for base 2 number. Note: this was already done by
     mpmod_init. */

  if (modulus->repr == ECM_MOD_BASE2)
    base2 = modulus->bits;

  /* For a Fermat number (base2 a positive power of 2) */
  for (Fermat = base2; Fermat > 0 && (Fermat & 1) == 0; Fermat >>= 1);
  if (Fermat == 1)
    {
      Fermat = base2;
      po2 = 1;
    }
  else
      Fermat = 0;

  /* Set parameters for stage 2 */
  mpres_init (P.x, modulus);
  mpres_init (P.y, modulus);
  mpres_init (P.A, modulus);
  mpz_init (tmp_A);
  mpz_init (B2);
  mpz_init (B2min);

  youpi = set_stage_2_params (
          B2, params->B2,
          B2min, params->B2min,
          &root_params,
          B1, &mutable_params->k, params->S, params->use_ntt, &po2, &dF,
                              params->TreeFilename, params->maxmem, Fermat, modulus);
  if (youpi == ECM_ERROR)
      goto end_gpu_ecm;

  /* Initialize the GPU if necessary and determine nb_curves */
  if (!params->gpu_device_init)
    {
      st = cputime ();
      youpi = select_and_init_GPU (
          params->gpu_device,
          &mutable_params->gpu_number_of_curves,
          test_verbose (OUTPUT_VERBOSE));

      if (youpi != 0)
        {
          youpi = ECM_ERROR;
          goto end_gpu_ecm;
        }

      outputf (OUTPUT_VERBOSE, "GPU: Selection and initialization of the device "
                               "took %ldms\n", elltime (st, cputime ()));
      /* TRICKS: If initialization of the device is too long (few seconds), */
      /* try running 'nvidia-smi -q -l' on the background .                 */
      mutable_params->gpu_device_init = 1;
    }

  /* Number of curves is only set after select_and_init_GPU */
  nb_curves = params->gpu_number_of_curves;

  ASSERT (params->sigma_is_A == 0);
  if (mpz_sgn (params->sigma) == 0)
    {
      /* generate random value in [2, 2^32 - nb_curves - 1] */
      mpz_set_ui (mutable_params->sigma,
                  (get_random_ul () % (TWO32 - 2 - nb_curves)) + 2);
    }
  else /* sigma should be in [2, 2^32-nb_curves] */
    {
      if (mpz_cmp_ui (params->sigma, 2) < 0 ||
          mpz_cmp_ui (params->sigma, TWO32 - nb_curves) >= 0)
        {
          outputf (OUTPUT_ERROR, "GPU: Error, sigma should be in [2,%lu]\n",
                                 TWO32 - nb_curves - 1);
          youpi = ECM_ERROR;
          goto end_gpu_ecm;
        }
    }
  firstsigma_ui = mpz_get_ui (params->sigma);

  print_B1_B2_poly (OUTPUT_NORMAL, ECM_ECM, B1, params->B1done,  params->B2min, B2min,
                    B2, params->S, params->sigma, params->sigma_is_A, ECM_EC_TYPE_MONTGOMERY,
                    params->go, params->param, nb_curves);
  outputf (OUTPUT_VERBOSE, "dF=%lu, k=%lu, d=%lu, d2=%lu, i0=%Zd\n",
           dF, params->k, root_params.d1, root_params.d2, root_params.i0);

  if ((mpz_cmp (B2, B2min) < 0) && modulus->repr == ECM_MOD_BASE2 && params->nobase2step2)
    {
      outputf (OUTPUT_ERROR, "GPU: ERROR, -nobase2s2 not supported on GPU.\n");
      youpi = ECM_ERROR;
      goto end_gpu_ecm;
    }

  if (test_verbose (OUTPUT_VERBOSE))
    {
      if (mpz_cmp_d (B2min, B1) != 0)
        {
          outputf (OUTPUT_VERBOSE,
            "Can't compute success probabilities for B1 <> B2min\n");
        }
      else
        {
          rhoinit (256, 10);
          print_expcurves (B1, B2, dF, params->k, root_params.S, params->param);
        }
    }

  /* Init arrays */
  factors = (mpz_t *) malloc (nb_curves * sizeof (mpz_t));
  ASSERT_ALWAYS (factors != NULL);

  array_found = (int *) malloc (nb_curves * sizeof (int));
  ASSERT_ALWAYS (array_found != NULL);

  for (i = 0; i < nb_curves; i++)
    {
      mpz_init (factors[i]);
      array_found[i] = ECM_NO_FACTOR_FOUND;
    }


  /* Compute s */
  if (B1 != params->batch_last_B1_used || mpz_cmp_ui (params->batch_s, 1) <= 0)
    {
      mutable_params->batch_last_B1_used = B1;

      st = cputime ();
      /* construct the batch exponent */
      compute_s (mutable_params->batch_s, B1, NULL);
      outputf (OUTPUT_VERBOSE, "Computing batch product (of %" PRIu64
                               " bits) of primes up to B1=%1.0f took %ldms\n",
                               mpz_sizeinbase (params->batch_s, 2), B1, cputime () - st);
    }


  st = cputime ();

  youpi = cgbn_ecm_stage1 (factors, array_found, n, params->batch_s, nb_curves,
                           firstsigma_ui, &gputime, params->verbose);

  outputf (OUTPUT_NORMAL, "Computing %u Step 1 took %ldms of CPU time / "
                          "%.0fms of GPU time\n", nb_curves,
                          elltime (st, cputime ()), gputime);
  outputf (OUTPUT_VERBOSE, "Throughput: %.3f curves per second ",
                           1000 * nb_curves/gputime);
  outputf (OUTPUT_VERBOSE, "(on average %.2fms per Step 1)\n",
                           gputime/nb_curves);
  tottime = (long) gputime;

  mutable_params->B1done = B1;

  /* GMP documentation says mpz_sizeinbase(op, 2) is always the exact value. */
  size_t n_bits = mpz_sizeinbase(n, 2);

  /* Save stage 1 residues as x = x0 + x1 * 2^bits + ... + xk * 2^(bits*k) */
  mpz_set_ui (mutable_params->x, 0);
  /* Equivalent to using mpz_mul_2exp and mpz_add while avoiding O(n*k) limp copies */
  mpz_realloc2(mutable_params->x, nb_curves * n_bits);
  for (i = 0; i < nb_curves; i++)
    for (size_t j = 0; j < n_bits; j++)
      if (mpz_tstbit (factors[i], j))
        mpz_setbit(mutable_params->x, j + n_bits * i);

  /* was a factor found in stage 1 ? */
  if (youpi != ECM_NO_FACTOR_FOUND)
      goto end_gpu_ecm_factors;

  if (mpz_cmp (B2, B2min) < 0)
      goto end_gpu_ecm_factors;

  st2 = cputime ();

  P.disc = 0; /* For stage2 this needs to be 0, in order not to use CM stuff */

  for (i = 0; i < nb_curves; i++)
    {
      /* hack to reduce verbose Step 2 */
      if (params->verbose > 0)
        set_verbose (params->verbose-1);

      if (test_verbose (OUTPUT_RESVERBOSE))
        outputf (OUTPUT_RESVERBOSE, "x=%Zd\n", factors[i]);

      if (params->stop_asap != NULL && params->stop_asap())
          goto end_gpu_ecm_rhotable;

      mpres_set_z (P.x, factors[i], modulus);
      mpres_set_ui (P.y, 1, modulus);
      A_from_sigma (tmp_A, i+firstsigma_ui, modulus->orig_modulus);
      mpres_set_z (P.A, tmp_A, modulus);

      /* compute stage 2 */
      youpi = montgomery_to_weierstrass (factors[i], P.x, P.y, P.A, modulus);
      if (youpi != ECM_NO_FACTOR_FOUND)
        goto next_curve;

      if (test_verbose (OUTPUT_RESVERBOSE) && youpi == ECM_NO_FACTOR_FOUND
          && mpz_cmp (B2, B2min) >= 0)
        {
          mpz_t t;

          mpz_init (t);
          mpres_get_z (t, P.x, modulus);
          outputf (OUTPUT_RESVERBOSE, "After switch to Weierstrass form, "
                                      "P=(%Zd", t);
          mpres_get_z (t, P.y, modulus);
          outputf (OUTPUT_RESVERBOSE, ", %Zd)\n", t);
          mpres_get_z (t, P.A, modulus);
          outputf (OUTPUT_RESVERBOSE, "on curve Y^2 = X^3 + %Zd * X + b\n",
                       t);
          mpz_clear (t);
        }

      youpi = stage2 (factors[i], &P, modulus, dF, params->k, &root_params,
                      params->use_ntt, params->TreeFilename, i+1, params->stop_asap);

    next_curve:
      set_verbose (params->verbose);

      if (youpi != ECM_NO_FACTOR_FOUND)
        {
          array_found[i] = youpi;
          outputf (OUTPUT_NORMAL, "GPU: factor %Zd found in Step 2 with"
                " curve %u (-sigma 3:%u)\n", factors[i], i, i+firstsigma_ui);
          /* factor_found corresponds to the first factor found */
          if (factor_found == ECM_NO_FACTOR_FOUND)
            factor_found = youpi;
        }
    }

  /* If a factor was found in Step 2, make sure we set
   * our return value "youpi" appropriately
   */
  youpi = factor_found;

  st2 = elltime (st2, cputime ());
  outputf (OUTPUT_NORMAL, "Computing %u Step 2 on CPU took %ldms\n",
                          nb_curves, st2);
  outputf (OUTPUT_VERBOSE, "Throughput: %.3f Step 2 per second ",
                           1000.0 * nb_curves /st2);
  outputf (OUTPUT_VERBOSE, "(on average %0.2fms per Step 2)\n",
                           ((double) st2)/((double) nb_curves));
  tottime += st2;

end_gpu_ecm_factors:

  reducefactors(factors, array_found, nb_curves);

  /* If f0, ,fk are the factors found (in stage 1 or 2)
   * f = f0 + f1*n + .. + fk*n^k
   * The purpose of this construction is to be able to return more than one
   * factor if needed without breaking the lib interface (as gcd(f,n)=gcd(f0,n).
   */
  mpz_set_ui (f, 0);
  for (i = 0; i < nb_curves; i++)
  {
    /* invert order of factors so they are processed in same order found */
    if (array_found[nb_curves-1-i] != ECM_NO_FACTOR_FOUND)
      {
        mpz_mul (f, f, n);
        mpz_add (f, f, factors[nb_curves-1-i]);
      }
  }

  for (i = 0; i < nb_curves; i++)
      mpz_clear (factors[i]);

  free (array_found);
  free (factors);

end_gpu_ecm_rhotable:
  if (test_verbose (OUTPUT_VERBOSE))
    {
      if (mpz_cmp_d (B2min, B1) == 0)
        {
          if (youpi == ECM_NO_FACTOR_FOUND &&
              (params->stop_asap == NULL || !params->stop_asap()))
              print_exptime (B1, B2, dF, params->k, root_params.S,
                             (long) (tottime / nb_curves), params->param);
          rhoinit (1, 0); /* Free memory of rhotable */
        }
    }


end_gpu_ecm:
  mpz_clear (root_params.i0);
  mpres_clear (P.A, modulus);
  mpres_clear (P.y, modulus);
  mpres_clear (P.x, modulus);
  mpmod_clear (modulus);
  mpz_clear (tmp_A);
  mpz_clear (B2);
  mpz_clear (B2min);

  return youpi;
}

#endif /* HAVE_GPU */