// Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
//
// SPDX-License-Identifier: BSD-2-Clause
//
// This file is part of CEED:  http://github.com/ceed

#include "./kernel-defines.hpp"

// Kernels are based on the CUDA backend from the LLNL and VT groups
//
// Expects the following types to be defined:
// - CeedInt
// - CeedScalar
//
// Expects the following constants to be defined:
// - COMPONENT_COUNT            : CeedInt
// - ELEMENT_SIZE               : CeedInt
// - NODE_COUNT                 : CeedInt
// - TILE_SIZE                  : int
// - USES_INDICES               : bool
// - STRIDE_TYPE                : ceed::occa::StrideType
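// - NODE_STRIDE                : Optional[CeedInt] (only referenced when USES_INDICES is false)
// - COMPONENT_STRIDE           : Optional[CeedInt] (only referenced when USES_INDICES is false)
// - ELEMENT_STRIDE             : Optional[CeedInt] (only referenced when USES_INDICES is false)
// - UNSTRIDED_COMPONENT_STRIDE : Optional[CeedInt] (only referenced when USES_INDICES is true)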

const char *occa_elem_restriction_source = STRINGIFY_SOURCE(

    // Debug switch for the kernels in this source string: change the 0 to 1 to compile in the
    // printf calls that are guarded by "#if PRINT_KERNEL_HASHES".
    @directive("#define PRINT_KERNEL_HASHES 0")

            // QuadVector is a CeedScalar pointer carrying an OCCA @dim attribute, which lets the
            // kernels index it as q(node, component, element) with extents
            // (ELEMENT_SIZE, COMPONENT_COUNT, elementCount).
            // NOTE(review): `elementCount` is not a compile-time constant; it appears to bind to
            // the kernel argument of the same name where the type is used - confirm against the
            // OCCA @dim documentation.
            typedef CeedScalar *
        QuadVector @dim(ELEMENT_SIZE, COMPONENT_COUNT, elementCount);

    // Copies values from the flat vector `u` into the element-wise array `v`.
    //
    // Arguments:
    // - elementCount : number of elements to process (upper bound of the tiled loop)
    // - indices      : only read when USES_INDICES is true; entry [node + element * ELEMENT_SIZE]
    //                  is the base index into `u` for that (node, element) pair
    // - u            : source values (only read by this kernel)
    // - v            : destination; written at (node, component, element) for every node and
    //                  component of every element
    //
    // Compile-time branches:
    // - USES_INDICES true  : v(node, c, element) = u[indices[...] + c * UNSTRIDED_COMPONENT_STRIDE]
    // - USES_INDICES false : v(node, c, element) = u[node * NODE_STRIDE + c * COMPONENT_STRIDE
    //                                                + element * ELEMENT_STRIDE]
    @kernel void applyRestriction(const CeedInt elementCount, const CeedInt *indices, CeedScalar *u, QuadVector v) {
      // The element loop is split by OCCA into @outer/@inner loops with tiles of TILE_SIZE.
      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
        @directive("#if PRINT_KERNEL_HASHES")
            // Debug aid: the iteration handling element 0 prints the kernel hash once so it is
            // possible to see which kernel is being run
            if (element == 0) {
          printf("\n\napplyRestriction Kernel: " OKL_KERNEL_HASH "\n\n");
        }
        @directive("#endif")

            // Indexed path: look up the base location of each node in `u` through `indices`.
            @directive("#if USES_INDICES") for (int node = 0; node < ELEMENT_SIZE; ++node) {
          const CeedInt index = indices[node + (element * ELEMENT_SIZE)];

          // Components of the same node are UNSTRIDED_COMPONENT_STRIDE entries apart in `u`.
          for (int c = 0; c < COMPONENT_COUNT; ++c) {
            v(node, c, element) = u[index + (c * UNSTRIDED_COMPONENT_STRIDE)];
          }
        }
        // Strided path: the location in `u` is computed directly from the three strides.
        @directive("#else") for (int node = 0; node < ELEMENT_SIZE; ++node) {
          for (int c = 0; c < COMPONENT_COUNT; ++c) {
            v(node, c, element) = u[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)];
          }
        }
        @directive("#endif")
      }
    }

    // applyRestrictionTranspose accumulates (+=) values from the element-wise array `u` into the
    // flat vector `v`. Exactly one of the two definitions below is compiled, selected by
    // USES_INDICES; both share the same argument list.
    @directive("#if USES_INDICES")

        // Indexed variant: one tiled iteration per n in [0, NODE_COUNT).
        //
        // Arguments:
        // - elementCount : not read in the body; it is named by the QuadVector @dim extents
        // - quadIndices  : quadIndices[n] is the base index into `v` updated by iteration n
        // - dofOffsets   : dofOffsets[n] .. dofOffsets[n + 1] is the half-open range of entries in
        //                  `dofIndices` that belong to iteration n; because entry n + 1 is read,
        //                  the array must hold at least NODE_COUNT + 1 entries
        // - dofIndices   : each entry encodes an element-local position as
        //                  node + element * ELEMENT_SIZE (decoded below with % and /)
        // - u            : element-wise source values, read at (node, component, element)
        // - v            : destination; v[quadIndices[n] + c * UNSTRIDED_COMPONENT_STRIDE] is
        //                  incremented for every component c
        //
        // NOTE(review): the final += is a plain read-modify-write; this presumably relies on the
        // quadIndices entries being distinct so that no two iterations touch the same entry of
        // `v` - confirm against the code that builds these arrays.
        @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
                                               const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
          @tile(TILE_SIZE, @outer, @inner) for (int n = 0; n < NODE_COUNT; ++n) {
            @directive("#if PRINT_KERNEL_HASHES")
                // Debug aid: the iteration with n == 0 prints the kernel hash once so it is
                // possible to see which kernel is being run
                if (n == 0) {
              printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
            }
            @directive("#endif")

                // Per-iteration accumulator, one slot per component.
                CeedScalar vComp[COMPONENT_COUNT];

            // Prefetch index information: destination base index and the range of source entries
            const CeedInt vIndex      = quadIndices[n];
            const CeedInt offsetStart = dofOffsets[n];
            const CeedInt offsetEnd   = dofOffsets[n + 1];

            // Start every component sum from zero
            for (int c = 0; c < COMPONENT_COUNT; ++c) {
              vComp[c] = 0;
            }

            // Aggregate by component: sum the contributions of every source entry in the range
            for (CeedInt i = offsetStart; i < offsetEnd; ++i) {
              const CeedInt index = dofIndices[i];

              // Decode the flattened position into its (node, element) pair
              const int node    = (index % ELEMENT_SIZE);
              const int element = (index / ELEMENT_SIZE);

              for (int c = 0; c < COMPONENT_COUNT; ++c) {
                vComp[c] += u(node, c, element);
              }
            }

            // Update dofs by component: add the sums onto the existing contents of `v`
            for (int c = 0; c < COMPONENT_COUNT; ++c) {
              v[vIndex + (c * UNSTRIDED_COMPONENT_STRIDE)] += vComp[c];
            }
          }
        }

    @directive("#else")  // USES_INDICES = false

    // Strided variant: one tiled iteration per element. For every (node, component) of the
    // element, u(node, c, element) is added onto
    // v[node * NODE_STRIDE + c * COMPONENT_STRIDE + element * ELEMENT_STRIDE].
    //
    // quadIndices, dofOffsets and dofIndices are not read by this variant; they are part of the
    // signature shared with the indexed variant.
    //
    // NOTE(review): the += is a plain read-modify-write; this presumably relies on the strides
    // mapping distinct (node, c, element) triples to distinct entries of `v` - confirm for the
    // stride values in use.
    @kernel void applyRestrictionTranspose(const CeedInt elementCount, const CeedInt *quadIndices, const CeedInt *dofOffsets,
                                           const CeedInt *dofIndices, const QuadVector u, CeedScalar *v) {
      @tile(TILE_SIZE, @outer, @inner) for (int element = 0; element < elementCount; ++element) {
        @directive("#if PRINT_KERNEL_HASHES")
            // Debug aid: the iteration handling element 0 prints the kernel hash once so it is
            // possible to see which kernel is being run
            if (element == 0) {
          printf("\n\napplyRestrictionTranspose Kernel: " OKL_KERNEL_HASH "\n\n");
        }
        @directive("#endif")

            for (int node = 0; node < ELEMENT_SIZE; ++node) {
          for (int c = 0; c < COMPONENT_COUNT; ++c) {
            v[(node * NODE_STRIDE) + (c * COMPONENT_STRIDE) + (element * ELEMENT_STRIDE)] += u(node, c, element);
          }
        }
      }
    }

    @directive("#endif")  // USES_INDICES

);