/* * (C) Copyright 2020 Hewlett Packard Enterprise Development LP * Copyright (c) 2021 Amazon.com, Inc. or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #if HAVE_CONFIG_H #include #endif #include "ofi_hmem.h" #include "ofi.h" #if HAVE_NEURON #include #include "nrt/nrt.h" #include "nrt/nrt_experimental.h" struct neuron_ops { NRT_STATUS (*nrt_tensor_allocate)(nrt_tensor_placement_t tensor_placement, int logical_nc_id, size_t size, const char *name, nrt_tensor_t **tensor); void (*nrt_tensor_free)(nrt_tensor_t **tensor); void *(*nrt_tensor_get_va)(const nrt_tensor_t *tensor); NRT_STATUS (*nrt_memcpy_to_device)(void *dest, const void *src, size_t size); NRT_STATUS (*nrt_get_dmabuf_fd)(uint64_t va, uint64_t size, int* fd); NRT_STATUS (*nrt_get_total_nc_count)(uint32_t *nc_count); }; static void *neuron_handle; static struct neuron_ops neuron_ops; static int neuron_dl_init(void) { neuron_handle = dlopen("libnrt.so.1", RTLD_NOW); if (!neuron_handle) { FI_INFO(&core_prov, FI_LOG_CORE, "Failed to dlopen libnrt.so.1\n"); return -FI_ENOSYS; } neuron_ops.nrt_tensor_allocate = dlsym(neuron_handle, "nrt_tensor_allocate"); if (!neuron_ops.nrt_tensor_allocate) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find nrt_tensor_allocate\n"); goto err; } neuron_ops.nrt_tensor_free = dlsym(neuron_handle, "nrt_tensor_free"); if (!neuron_ops.nrt_tensor_free) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find nrt_tensor_free\n"); goto err; } neuron_ops.nrt_tensor_get_va = dlsym(neuron_handle, "nrt_tensor_get_va"); if (!neuron_ops.nrt_tensor_get_va) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find nrt_tensor_get_va\n"); goto err; } neuron_ops.nrt_memcpy_to_device = dlsym(neuron_handle, "nrt_memcpy_to_device"); if (!neuron_ops.nrt_memcpy_to_device) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find nrt_memcpy_to_device\n"); goto err; } neuron_ops.nrt_get_dmabuf_fd = dlsym(neuron_handle, "nrt_get_dmabuf_fd"); if (!neuron_ops.nrt_get_dmabuf_fd) { FI_INFO(&core_prov, FI_LOG_CORE, "Failed to find nrt_get_dmabuf_fd, " "dmabuf feature will not be used for Neuron devices\n"); } neuron_ops.nrt_get_total_nc_count = dlsym(neuron_handle, "nrt_get_total_nc_count"); if (!neuron_ops.nrt_get_total_nc_count) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to find nrt_get_total_nc_count"); goto err; } return FI_SUCCESS; err: dlclose(neuron_handle); return -FI_ENODATA; } static int neuron_dl_close(void) { dlclose(neuron_handle); return FI_SUCCESS; } int neuron_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size) { NRT_STATUS ret; ret = neuron_ops.nrt_memcpy_to_device(dev, host, size); if (ret == NRT_SUCCESS) return FI_SUCCESS; FI_WARN(&core_prov, FI_LOG_CORE, "Failed to copy from host memory " "to AWS neuron: %d\n", ret); return -FI_EIO; } int neuron_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size) { FI_WARN_ONCE(&core_prov, FI_LOG_CORE, "Copies from AWS Neuron to host memory are not supported."); return -FI_ENOSYS; } int neuron_host_register(void *ptr, size_t size) { return FI_SUCCESS; } int neuron_host_unregister(void *ptr) { return FI_SUCCESS; } int neuron_hmem_init(void) { int ret; uint32_t total_nc_count; ret = neuron_dl_init(); if (ret) return ret; /* Note that nrt_get_total_nc_count() is one of the few neuron functions * that can be called before nrt_init() */ if (neuron_ops.nrt_get_total_nc_count(&total_nc_count) != NRT_SUCCESS) { return -FI_ENOSYS; } if (total_nc_count == 0) { return -FI_ENOSYS; } FI_INFO(&core_prov, FI_LOG_CORE, "number of neuron cores: %d\n", total_nc_count); return FI_SUCCESS; } int neuron_hmem_cleanup(void) { neuron_dl_close(); return FI_SUCCESS; } void *neuron_alloc(void **handle, size_t size) { NRT_STATUS ret; nrt_tensor_t **tensor; void *ptr; tensor = (nrt_tensor_t **)handle; ret = neuron_ops.nrt_tensor_allocate(NRT_TENSOR_PLACEMENT_DEVICE, 0, size, "libfabric", tensor); if (ret != NRT_SUCCESS) { FI_WARN(&core_prov, FI_LOG_CORE, "Failed to allocate nrt tensor: %d\n", ret); return NULL; } ptr = neuron_ops.nrt_tensor_get_va(*tensor); if (!ptr) neuron_ops.nrt_tensor_free(tensor); return ptr; } void neuron_free(void **handle) { neuron_ops.nrt_tensor_free((nrt_tensor_t **)handle); } /** * @brief Get the dma-buf fd of Neuron memory region if it was registered for EFA peer direct * * @param addr[in] the device buffer address * @param size[in] the device buffer size (in bytes) * @param fd[out] the dma-buf fd * @param offset[out] the offset within the dma-buf object * @return int On success, return 0. On failure, return a negative error code */ int neuron_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, uint64_t *offset) { NRT_STATUS ret; /* nrt_get_dmabuf_fd symbol doesn't exist in Neuron Runtime */ if (!neuron_ops.nrt_get_dmabuf_fd) { return -FI_ENOPROTOOPT; } ret = neuron_ops.nrt_get_dmabuf_fd((uintptr_t)addr, size, fd); if (ret == NRT_SUCCESS) { /* * The assumption is that nrt_get_dmabuf_fd() would fail for * any addr that is not the starting address of the dma-buf * object. Otherwise we need a low level op to get the base * address of the dma-buf object. */ *offset = 0; return FI_SUCCESS; } else if (ret == NRT_RESOURCE) { /* real error from Neuron */ FI_INFO(&core_prov, FI_LOG_CORE, "Failed to retrieve dmabuf_fd: %d\n", ret); return -FI_EINVAL; } else { /* fallback to mem registration using ibv_reg_mr */ FI_INFO(&core_prov, FI_LOG_CORE, "Failed to retrieve dmabuf_fd: %d\n", ret); return -FI_ENOPROTOOPT; } } #else int neuron_copy_to_dev(uint64_t device, void *dev, const void *host, size_t size) { return -FI_ENOSYS; } int neuron_copy_from_dev(uint64_t device, void *host, const void *dev, size_t size) { return -FI_ENOSYS; } int neuron_host_register(void *ptr, size_t size) { return -FI_ENOSYS; } int neuron_host_unregister(void *ptr) { return -FI_ENOSYS; } int neuron_hmem_init(void) { return -FI_ENOSYS; } int neuron_hmem_cleanup(void) { return -FI_ENOSYS; } void *neuron_alloc(void **handle, size_t size) { return NULL; } void neuron_free(void **handle) { return; } int neuron_get_dmabuf_fd(const void *addr, uint64_t size, int *fd, uint64_t *offset) { return -FI_ENOSYS; } #endif /* HAVE_NEURON */