/* $Source: bitbucket.org:berkeleylab/gasnet.git/tests/testcudauva.c $
 * Copyright (c) 2021, The Regents of the University of California
 *
 * Description: test of GEX_MK_CLASS_CUDA_UVA
 *
 * This test verifies correctness of gex_MK_Create() for the device class
 * GEX_MK_CLASS_CUDA_UVA.  This includes checking the expected behavior of
 * builds both with and without configure-time enable of support for this
 * device class.
 */

// WARNING: This test exercises one or more EXPERIMENTAL features.
// One should not clone the logic in this test, since details of such features
// may change without notice.

#include <gasnetex.h>
#include <gasnet_mk.h>
#include <gasnet_coll.h>

#if GASNET_HAVE_MK_CLASS_CUDA_UVA
#include <cuda.h>
#endif

static size_t len = 0;
#ifndef TEST_SEGSZ
#define TEST_SEGSZ_EXPR (2*len)
#endif

#include <test.h>

#define check_cudacall(op) do {                 \
  int _retval = (op);                           \
  if_pf(_retval) {                              \
    const char *_errorname;                     \
    cuGetErrorName(_retval, &_errorname);       \
    FATALERR(#op": %s(%i)",_errorname,_retval); \
  }                                             \
} while (0)

static gex_Client_t  myclient;
static gex_EP_t      myep;
static gex_TM_t      myteam;
static gex_Segment_t mysegment;
static gex_Rank_t    myrank;
static gex_Rank_t    nranks;


int main(int argc, char **argv)
{
  int client_segment = 1;
  int seed = 0;
  int rc;

#if GASNET_CONDUIT_IBV
  { // These settings are a hack to avoid triggering known bugs/limitations,
    // by disabling multiple paths.  See bug 4148.
    // Both of these variables can have potentially serious negative impacts
    // on performance, and should not be used in general.
    // As noted above "One should not clone the logic in this test".
    setenv("GASNET_SUPERNODE_MAXSIZE", "1", 0 /* NO overwrite if already set */);
    setenv("GASNET_NUM_QPS", "1", 0 /* NO overwrite if already set */);
  }
#endif

  GASNET_Safe(gex_Client_Init(&myclient, &myep, &myteam, "testcudauva", &argc, &argv, 0));

  test_init("testcudauva", 0, "[options] (size) (seed)\n"
               " Segment allocation options:\n"
               "     -client-seg:  Test client-allocated GPU segment (default)\n"
               "     -gasnet-seg:  Test GASNet-allocated GPU segment\n"
               " size      length of segment\n"
               " seed      seed for PRNG\n");

  int help = 0;
  int argi = 1;
  while (argc > argi) {
    if (!strcmp(argv[argi], "-client-seg")) {
      client_segment = 1;
      ++argi;
    } else if (!strcmp(argv[argi], "-gasnet-seg")) {
      client_segment = 0;
      ++argi;
    } else if (argv[argi][0] == '-') {
      help = 1;
      ++argi;
    } else break;
  }

  if (argi < argc) { len = atol(argv[argi]); ++argi; }
  if (len == 0) { len = 16*1024*1024; }

  if (argi < argc) { seed = atoi(argv[argi]); ++argi; }
  if (seed == 0) { seed = (((unsigned int)TIME()) & 0xFFFF); }

  if (argi < argc || help) test_usage();

  uint8_t *tmp = test_malloc(len);
  uint8_t *array1 = test_malloc(len);
  uint8_t *array2 = test_malloc(len);

  myrank = gex_TM_QueryRank(myteam);
  nranks = gex_TM_QuerySize(myteam);
  gex_Rank_t peer = (myrank + 1) % nranks;

  if (nranks == 1) {
    // TODO: remove once loopback kinds works correctly
    MSG0("WARNING: This test requires a minimum of two nodes. Test skipped.\n");
    gasnet_exit(0); // prevents false negatives, such as from test harnesses for smp-conduit
  }

  GASNET_Safe(gex_Segment_Attach(&mysegment, myteam, TEST_SEGSZ_REQUEST));

  MSG0("Running CUDA UVA non-local xfer tests with size %lu, PRNG seed %d, and %s-allocated GPU segment",
       (unsigned long)len, seed, client_segment ? "client" : "GASNet");

  TEST_BCAST(&seed, 0, &seed, sizeof(seed));
  TEST_SRAND(seed);
  for (size_t i = 0; i < len; ++i) {
     unsigned int r = TEST_RAND(0,65535);
     array1[i] = r & 0xff;
     array2[i] = (r >> 8) & 0xff;
  }

  BARRIER();

  gex_EP_t gpu_ep;
  gex_MK_t kind;
  gex_MK_Create_args_t args;

  args.gex_flags = 0;
  args.gex_class = GEX_MK_CLASS_CUDA_UVA;
  args.gex_args.gex_class_cuda_uva.gex_CUdevice = 0;

#if GASNET_HAVE_MK_CLASS_CUDA_UVA
  {
    if (GASNET_HAVE_MK_CLASS_MULTIPLE != 1) {
       ERR("Invalid GASNET_HAVE_MK_CLASS_MULTIPLE");
    }

    int count;
    cuInit(0);
    if (cuDeviceGetCount(&count) || !count) {
      MSG("GEX_MK_CLASS_CUDA_UVA: skipped - could not find a CUDA device");
      // If this lack of a device is NOT a collective property, then we want
      // to at least balance the collective operations (to avoid hanging).
      // However, at least one peer will fail a gex_EP_QueryBoundSegmentNB().
      // For the case all ranks lack a GPU, this test *will* exit gracefully.
      GASNET_Safe( gex_EP_PublishBoundSegment(myteam, NULL, 0, 0) );
      for (int i = 0; i < 4; ++i) BARRIER(); // currently exactly one per case
    } else {
      CUcontext ctx;
      check_cudacall( cuDevicePrimaryCtxRetain(&ctx, 0) );
      check_cudacall( cuCtxPushCurrent(ctx) );

      CUdeviceptr dptr;
      uint8_t *client_gpu = NULL;
      if (client_segment) {
        check_cudacall( cuMemAlloc(&dptr, TEST_SEGSZ_REQUEST) );
        client_gpu = (uint8_t *) dptr;
      }

      GASNET_Safe( gex_MK_Create(&kind, myclient, &args, 0) );
      gex_Segment_t d_segment = GEX_SEGMENT_INVALID;
      GASNET_Safe( gex_Segment_Create(&d_segment, myclient, client_gpu, TEST_SEGSZ_REQUEST, kind, 0));
      uint8_t *loc_gpu = gex_Segment_QueryAddr(d_segment);
      if (client_segment) assert_always(loc_gpu == client_gpu);

      GASNET_Safe( gex_EP_Create(&gpu_ep, myclient, GEX_EP_CAPABILITY_RMA, 0));
      gex_EP_BindSegment(gpu_ep, d_segment, 0);
      GASNET_Safe( gex_EP_PublishBoundSegment(myteam, &gpu_ep, 1, 0) );

      // TM (3 of 4 being pairs) for the four possible pairings
      gex_EP_Index_t host_epidx = gex_EP_QueryIndex(myep);
      gex_EP_Index_t gpu_epidx  = gex_EP_QueryIndex(gpu_ep);
      assert_always(host_epidx == 0);
      assert_always(gpu_epidx  == 1);
      gex_TM_t LH_RH = myteam;
      gex_TM_t LH_RG = gex_TM_Pair(myep,   gpu_epidx);
      gex_TM_t LG_RH = gex_TM_Pair(gpu_ep, host_epidx);
      gex_TM_t LG_RG = gex_TM_Pair(gpu_ep, gpu_epidx);

      uint8_t *rem_gpu;
      size_t queried_len;
      gex_Event_Wait( gex_EP_QueryBoundSegmentNB(LH_RG, peer, (void**)&rem_gpu, NULL, &queried_len, 0) );
      assert_always(queried_len == TEST_SEGSZ_REQUEST);
      
// Case 1. Put - local host to remote gpu
      gex_RMA_PutBlocking(LH_RG, peer, rem_gpu, array1, len, 0);
      BARRIER();
      cuMemcpyDtoH(tmp, (CUdeviceptr)loc_gpu, len);
      if (memcmp(tmp, array1, len)) {
         ERR("Case 1 verification failed");
         cuMemcpyHtoD((CUdeviceptr)loc_gpu, array1, len);
      } else {
         MSG("Case 1 verification passed");
      }

// Case 2. Get - remote gpu to local host
      memset(tmp, 0, len);
      gex_RMA_GetBlocking(LH_RG, tmp, peer, rem_gpu, len, 0);
      if (memcmp(tmp, array1, len)) {
         ERR("Case 2 verification failed");
      } else {
         MSG("Case 2 verification passed");
      }
      BARRIER();

// Case 3. Put - local gpu to remote gpu
      gex_RMA_PutBlocking(LG_RG, peer, rem_gpu+len, loc_gpu, len, 0);
      BARRIER();
      cuMemcpyDtoH(tmp, (CUdeviceptr)loc_gpu+len, len);
      if (memcmp(tmp, array1, len)) {
         ERR("Case 3 verification failed");
         cuMemcpyHtoD((CUdeviceptr)loc_gpu+len, array1, len);
      } else {
         MSG("Case 3 verification passed");
      }

// Case 4. Get - remote gpu to local gpu
      cuMemcpyHtoD((CUdeviceptr)loc_gpu, array2, len);
      BARRIER();
      gex_RMA_GetBlocking(LG_RG, loc_gpu+len, peer, rem_gpu, len, 0);
      cuMemcpyDtoH(tmp, (CUdeviceptr)loc_gpu+len, len);
      if (memcmp(tmp, array2, len)) {
         ERR("Case 4 verification failed");
         cuMemcpyHtoD((CUdeviceptr)loc_gpu+len, array2, len);
      } else {
         MSG("Case 4 verification passed");
      }

      if (!test_errs) MSG("GEX_MK_CLASS_CUDA_UVA: success");

      check_cudacall( cuCtxSetCurrent(NULL) );
      check_cudacall( cuDevicePrimaryCtxRelease(0) );
    }
  }
#else
  {
    gex_System_SetVerboseErrors(0);
    int rc = gex_MK_Create(&kind, myclient, &args, 0);
    assert_always(rc == GASNET_ERR_BAD_ARG);
    MSG("GEX_MK_CLASS_CUDA_UVA: correct failure due to missing support");
  }
#endif

  // Just to ensure these exist:
  args.gex_class = GEX_MK_CLASS_HOST;
  kind = GEX_MK_HOST;

  MSG("done.");

  BARRIER();
  gasnet_exit(0);

  return 0;
}