// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced
// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
// LICENSE and NOTICE for details. LLNL-CODE-806117.
//
// This file is part of the MFEM library. For more information and source code
// availability visit https://mfem.org.
//
// MFEM is free software; you can redistribute it and/or modify it under the
// terms of the BSD-3 license. We welcome feedback and contributions, see file
// CONTRIBUTING.md for details.

#include "bench.hpp"

#ifdef MFEM_USE_BENCHMARK

/*
  This benchmark is inspired by the CEED's bake-off problems to benchmark the
  performance of the action of the partial assembly (PA) of DG Convection on
  nonconforming meshes.

   * --benchmark_filter=BK_DG/[1024-max_dofs]/[1-max_order]/[prob]
   * --benchmark_context=device=[cpu/cuda/hip]
*/

// The maximum polynomial order used for benchmarking
const int max_order = 6;
// The maximum number of dofs for benchmarking
const int max_dofs = 1e7;

void velocity_function(const Vector &x, Vector &v)
{
   int dim = x.Size();
   switch (dim)
   {
      case 1: v(0) = 1.0; break;
      case 2: v(0) = x(1); v(1) = -x(0); break;
      case 3: v(0) = x(1); v(1) = -x(0); v(2) = x(0); break;
   }
}

/// A kernel testing DG Convection
struct KernelMesh
{
   const int N;
   Mesh mesh;

   KernelMesh(int N, double prob)
   : N(N), mesh(Mesh::MakeCartesian3D(N,N,N,Element::HEXAHEDRON))
   {
      if (prob >= 0.0)
      {
         mesh.EnsureNCMesh();
         if (prob > 0.0)
         {
            mesh.RandomRefinement(prob);
         }            
      }
   }
};

struct Kernel: public KernelMesh
{
   const int p, q, dim = 3;
   DG_FECollection fec;
   FiniteElementSpace fes;
   const int dofs;
   GridFunction x,y;
   BilinearForm a;
   double mdofs;
   VectorFunctionCoefficient velocity;

   Kernel(int order, int N, double prob = -1, bool GLL = false)
   : 
      KernelMesh(N, prob),
      p(order),
      q(2*p + (GLL?-1:3)),
      fec(p, dim, BasisType::GaussLobatto),
      fes(&mesh, &fec),
      dofs(fes.GetTrueVSize()),
      x(&fes),
      y(&fes),
      a(&fes),
      mdofs(0.0),
      velocity(dim, velocity_function)
   {
      if (is_runnable())
      {
         x.Randomize(1);
         a.SetAssemblyLevel(AssemblyLevel::PARTIAL);
         a.AddDomainIntegrator(new ConvectionIntegrator(velocity, -1.0));
         a.AddInteriorFaceIntegrator(
            new TransposeIntegrator(new DGTraceIntegrator(velocity, 1.0, -0.5)));
         a.AddBdrFaceIntegrator(
            new TransposeIntegrator(new DGTraceIntegrator(velocity, 1.0, -0.5)));
         a.Assemble();
         a.Mult(x, y);
         MFEM_DEVICE_SYNC;
      }
   }

   /// @brief Heuristic to evaluate if the case will run out of memory
   bool is_runnable() const
   {
      const long long int gB = 1073741824/8;
      const int mem_size = Device::IsEnabled()?16:256;
      const long long int max_mem = mem_size * gB;
      const int num_elems = fes.GetNE();
      long long int mem = num_elems * pow(p+1, dim+1) * 8;
      // std::cout << "mem = " << mem << " , max_mem = " << max_mem << std::endl;
      return mem < max_mem;
   }

   void setup()
   {
      a.Assemble();
   }

   void benchmark_setup()
   {
      setup();
      MFEM_DEVICE_SYNC;
      mdofs += MDofs();
   }

   void benchmark_action()
   {
      a.Mult(x, y);
      MFEM_DEVICE_SYNC;
      mdofs += MDofs();
   }

   double SumMdofs() const { return mdofs; }

   double MDofs() const { return 1e-6 * dofs; }
};

/// BK_DG inspired benchmark for the action
static void BK_DG(bm::State &state)
{
   const int dim = 3;
   const int p = state.range(1);
   const int target_dofs = state.range(0);
   const int elem_dofs = pow(p+1, dim);
   const int N = pow(target_dofs / elem_dofs, 1.0/dim) + 1;
   const double prob = ((double)state.range(2))/100;
   Kernel ker(p, N, prob);
   if ( !ker.is_runnable() ) { state.SkipWithError("MAX_MEM"); }
   while (state.KeepRunning()) { ker.benchmark_action(); }
   state.counters["MDof/s"] = bm::Counter(ker.SumMdofs(), bm::Counter::kIsRate);
   state.counters["Dofs"] = bm::Counter(ker.dofs, bm::Counter::kDefaults);
   state.counters["Order"] = bm::Counter(ker.p);
   state.counters["Prob"] = bm::Counter(state.range(2));
}

BENCHMARK(BK_DG)->ArgsProduct({
      benchmark::CreateRange(1024, max_dofs, /*step=*/2),
      benchmark::CreateDenseRange(1, max_order, /*step=*/1),
      {-1, 0, 1, 10, 30}
    })->Unit(bm::kMillisecond);

int main(int argc, char *argv[])
{
   bm::ConsoleReporter CR;
   bm::Initialize(&argc, argv);

   // Device setup, cpu by default
   std::string device_config = "cpu";
   if (bmi::global_context != nullptr)
   {
      const auto device = bmi::global_context->find("device");
      if (device != bmi::global_context->end())
      {
         mfem::out << device->first << " : " << device->second << std::endl;
         device_config = device->second;
      }
   }
   Device device(device_config.c_str());
   device.Print();

   if (bm::ReportUnrecognizedArguments(argc, argv)) { return 1; }
   bm::RunSpecifiedBenchmarks(&CR);
   return 0;
}

#endif // MFEM_USE_BENCHMARK