# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
# Copyright (c) 2022 Nvidia Inc. All rights reserved. See COPYING file
import resource

from pyverbs.providers.mlx5.mlx5dv import Mlx5DevxObj, WqeDataSeg, Mlx5UMEM
from tests.mlx5_base import Mlx5DevxRcResources, Mlx5DevxTrafficBase
import pyverbs.providers.mlx5.mlx5_enums as dve
import tests.cuda_utils as cu
import pyverbs.enums as e

try:
    from cuda import cuda, cudart, nvrtc
    cu.CUDA_FOUND = True
except ImportError:
    cu.CUDA_FOUND = False

GPU_PAGE_SIZE = 1 << 16


@cu.set_mem_io_cuda_methods
class CudaDevxRes(Mlx5DevxRcResources):
    def __init__(self, dev_name, ib_port, gid_index, mr_access=e.IBV_ACCESS_LOCAL_WRITE):
        """
        Initialize DevX resources with CUDA memory allocations.
        :param dev_name: Device name to be used
        :param ib_port: IB port of the device to use
        :param gid_index: Which GID index to use
        :param mr_access: The MR access flags
        """
        self.mr_access = mr_access
        self.cuda_addr = None
        self.dmabuf_fd = None
        self.umem = None
        self.mkey = None
        self.lkey = None
        super().__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index)

    def init_resources(self):
        self.alloc_cuda_mem()
        super().init_resources()
        self.create_dmabuf_umem()
        self.create_mkey()

    def get_wqe_data_segment(self):
        return WqeDataSeg(self.msg_size, self.lkey, int(self.cuda_addr))

    def alloc_cuda_mem(self):
        """
        Allocates CUDA memory and a DMABUF FD on that memory.
        """
        self.cuda_addr = cu.check_cuda_errors(cuda.cuMemAlloc(GPU_PAGE_SIZE))
        # Enforce synchronous memory operations on this allocation
        attr_value = 1
        cu.check_cuda_errors(cuda.cuPointerSetAttribute(
            attr_value, cuda.CUpointer_attribute.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
            int(self.cuda_addr)))
        # Memory address and size must be aligned to the page size to get a handle
        assert (GPU_PAGE_SIZE % resource.getpagesize() == 0 and
                int(self.cuda_addr) % resource.getpagesize() == 0)
        self.dmabuf_fd = cu.check_cuda_errors(
            cuda.cuMemGetHandleForAddressRange(
                self.cuda_addr, GPU_PAGE_SIZE,
                cuda.CUmemRangeHandleType.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0))

    def create_mr(self):
        # No regular MR is needed; the CUDA buffer is registered via UMEM and
        # accessed through the mkey created over it.
        pass

    def create_dmabuf_umem(self):
        umem_alignment = resource.getpagesize()
        self.umem = Mlx5UMEM(self.ctx, GPU_PAGE_SIZE, 0, umem_alignment,
                             self.mr_access, umem_alignment,
                             dve.MLX5DV_UMEM_MASK_DMABUF, self.dmabuf_fd)

    def create_mkey(self):
        from tests.mlx5_prm_structs import SwMkc, CreateMkeyIn, CreateMkeyOut
        # Translate the requested MR access flags into the mkey lw/rr/rw bits
        accesses = [e.IBV_ACCESS_LOCAL_WRITE, e.IBV_ACCESS_REMOTE_READ,
                    e.IBV_ACCESS_REMOTE_WRITE]
        lw, rr, rw = [int(self.mr_access & access != 0) for access in accesses]
        mkey_ctx = SwMkc(lr=1, lw=lw, rr=rr, rw=rw, access_mode_1_0=0x1,
                         start_addr=int(self.cuda_addr), len=GPU_PAGE_SIZE,
                         pd=self.dv_pd.pdn, qpn=0xffffff)
        self.mkey = Mlx5DevxObj(self.ctx,
                                CreateMkeyIn(sw_mkc=mkey_ctx,
                                             mkey_umem_id=self.umem.umem_id,
                                             mkey_umem_valid=1),
                                len(CreateMkeyOut()))
        # The lkey is composed of the returned mkey index and a variant part (0 here)
        self.lkey = CreateMkeyOut(self.mkey.out_view).mkey_index << 8


@cu.set_init_cuda_methods
class Mlx5GpuDevxRcTrafficTest(Mlx5DevxTrafficBase):
    """
    Test DevX traffic over CUDA memory using DMA BUF and UMEM.
    """
    @cu.requires_cuda
    def test_mlx_devx_cuda_send_imm_traffic(self):
        """
        Creates two DevX RC QPs and runs SEND_IMM traffic over CUDA allocated
        memory using UMEM and DMA BUF.
        """
        self.create_players(CudaDevxRes)
        # Send traffic
        self.send_imm_traffic()
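

# Illustrative sketch (not part of the test above): one way to stage data in
# the CUDA buffer from the host and read it back with the cuda-python driver
# API, which can help when inspecting what the SEND_IMM traffic wrote to GPU
# memory. The helper name, its use of cu.check_cuda_errors() on the copy
# calls, and the assumption that msg_size <= GPU_PAGE_SIZE are assumptions
# made for this sketch; the test itself validates traffic through
# Mlx5DevxTrafficBase.send_imm_traffic().
def _cuda_fill_and_readback(res, pattern=b'\xab'):
    """Fill the CUDA buffer of `res` with `pattern` bytes and read it back."""
    size = res.msg_size  # assumed to fit inside the GPU_PAGE_SIZE allocation
    host_src = bytearray(pattern * size)
    # Copy host -> device into the DMA-BUF-backed CUDA allocation
    cu.check_cuda_errors(cuda.cuMemcpyHtoD(res.cuda_addr, host_src, size))
    host_dst = bytearray(size)
    # Copy device -> host and verify the round trip
    cu.check_cuda_errors(cuda.cuMemcpyDtoH(host_dst, res.cuda_addr, size))
    assert host_dst == host_src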