/*
 * Copyright (c) 2009 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2009 System Fabric Works, Inc. All rights reserved.
 * Copyright (C) 2006-2007 QLogic Corporation, All rights reserved.
 * Copyright (c) 2005. PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <endian.h>
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <util/compiler.h>
#include <infiniband/driver.h>
#include <infiniband/verbs.h>

#include "rxe_queue.h"
#include "rxe-abi.h"
#include "rxe.h"
#include "rxe_trace.h"

static void rxe_free_context(struct ibv_context *ibctx);

static const struct verbs_match_ent hca_table[] = {
	VERBS_DRIVER_ID(RDMA_DRIVER_RXE),
	VERBS_NAME_MATCH("rxe", NULL),
	{},
};

static int rxe_query_device(struct ibv_context *context,
			    const struct ibv_query_device_ex_input *input,
			    struct ibv_device_attr_ex *attr,
			    size_t attr_size)
{
	struct ib_uverbs_ex_query_device_resp resp;
	size_t resp_size = sizeof(resp);
	uint64_t raw_fw_ver;
	unsigned int major, minor, sub_minor;
	int ret;

	ret = ibv_cmd_query_device_any(context, input, attr, attr_size,
				       &resp, &resp_size);
	if (ret)
		return ret;

	raw_fw_ver = resp.base.fw_ver;
	major = (raw_fw_ver >> 32) & 0xffff;
	minor = (raw_fw_ver >> 16) & 0xffff;
	sub_minor = raw_fw_ver & 0xffff;

	snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver),
		 "%d.%d.%d", major, minor, sub_minor);

	return 0;
}

static int rxe_query_port(struct ibv_context *context, uint8_t port,
			  struct ibv_port_attr *attr)
{
	struct ibv_query_port cmd;

	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
}

static struct ibv_pd *rxe_alloc_pd(struct ibv_context *context)
{
	struct ibv_alloc_pd cmd;
	struct ib_uverbs_alloc_pd_resp resp;
	struct ibv_pd *pd;

	pd = calloc(1, sizeof(*pd));
	if (!pd)
		return NULL;

	if (ibv_cmd_alloc_pd(context, pd, &cmd, sizeof(cmd),
			     &resp, sizeof(resp))) {
		free(pd);
		return NULL;
	}

	return pd;
}

static int rxe_dealloc_pd(struct ibv_pd *pd)
{
	int ret;

	ret = ibv_cmd_dealloc_pd(pd);
	if (!ret)
		free(pd);

	return ret;
}

static struct ibv_mw *rxe_alloc_mw(struct ibv_pd *ibpd, enum ibv_mw_type type)
{
	int ret;
	struct ibv_mw *ibmw;
	struct ibv_alloc_mw cmd = {};
	struct ib_uverbs_alloc_mw_resp resp = {};

	ibmw = calloc(1, sizeof(*ibmw));
	if (!ibmw)
		return NULL;

	ret = ibv_cmd_alloc_mw(ibpd, type, ibmw, &cmd, sizeof(cmd),
			       &resp, sizeof(resp));
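	/*
	 * The memory window object itself lives in the kernel; the ibv_mw
	 * returned here only carries the handle and current rkey filled in
	 * by the command above.  rxe_bind_mw() below advances that rkey
	 * each time a bind work request is posted.
	 */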
if (ret) { free(ibmw); return NULL; } return ibmw; } static int rxe_dealloc_mw(struct ibv_mw *ibmw) { int ret; ret = ibv_cmd_dealloc_mw(ibmw); if (ret) return ret; free(ibmw); return 0; } static int next_rkey(int rkey) { return (rkey & 0xffffff00) | ((rkey + 1) & 0x000000ff); } static int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list, struct ibv_send_wr **bad_wr); static int rxe_bind_mw(struct ibv_qp *ibqp, struct ibv_mw *ibmw, struct ibv_mw_bind *mw_bind) { int ret; struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; struct ibv_send_wr ibwr; struct ibv_send_wr *bad_wr; if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) { ret = EINVAL; goto err; } memset(&ibwr, 0, sizeof(ibwr)); ibwr.opcode = IBV_WR_BIND_MW; ibwr.next = NULL; ibwr.wr_id = mw_bind->wr_id; ibwr.send_flags = mw_bind->send_flags; ibwr.bind_mw.bind_info = mw_bind->bind_info; ibwr.bind_mw.mw = ibmw; ibwr.bind_mw.rkey = next_rkey(ibmw->rkey); ret = rxe_post_send(ibqp, &ibwr, &bad_wr); if (ret) goto err; /* user has to undo this if he gets an error wc */ ibmw->rkey = ibwr.bind_mw.rkey; return 0; err: errno = ret; return errno; } static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access) { struct verbs_mr *vmr; struct ibv_reg_mr cmd; struct ib_uverbs_reg_mr_resp resp; int ret; vmr = calloc(1, sizeof(*vmr)); if (!vmr) return NULL; ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, vmr, &cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) { free(vmr); return NULL; } return &vmr->ibv_mr; } static int rxe_dereg_mr(struct verbs_mr *vmr) { int ret; ret = ibv_cmd_dereg_mr(vmr); if (ret) return ret; free(vmr); return 0; } static int cq_start_poll(struct ibv_cq_ex *current, struct ibv_poll_cq_attr *attr) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); pthread_spin_lock(&cq->lock); cq->cur_index = load_consumer_index(cq->queue); if (check_cq_queue_empty(cq)) { pthread_spin_unlock(&cq->lock); errno = ENOENT; return errno; } cq->wc = addr_from_index(cq->queue, cq->cur_index); cq->vcq.cq_ex.status = cq->wc->status; cq->vcq.cq_ex.wr_id = cq->wc->wr_id; return 0; } static int cq_next_poll(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); advance_cq_cur_index(cq); if (check_cq_queue_empty(cq)) { store_consumer_index(cq->queue, cq->cur_index); pthread_spin_unlock(&cq->lock); errno = ENOENT; return errno; } cq->wc = addr_from_index(cq->queue, cq->cur_index); cq->vcq.cq_ex.status = cq->wc->status; cq->vcq.cq_ex.wr_id = cq->wc->wr_id; return 0; } static void cq_end_poll(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); advance_cq_cur_index(cq); store_consumer_index(cq->queue, cq->cur_index); pthread_spin_unlock(&cq->lock); } static enum ibv_wc_opcode cq_read_opcode(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->opcode; } static uint32_t cq_read_vendor_err(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->vendor_err; } static uint32_t cq_read_byte_len(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->byte_len; } static __be32 cq_read_imm_data(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->ex.imm_data; } static uint32_t cq_read_qp_num(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, 
struct rxe_cq, vcq.cq_ex); return cq->wc->qp_num; } static uint32_t cq_read_src_qp(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->src_qp; } static unsigned int cq_read_wc_flags(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->wc_flags; } static uint32_t cq_read_slid(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->slid; } static uint8_t cq_read_sl(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->sl; } static uint8_t cq_read_dlid_path_bits(struct ibv_cq_ex *current) { struct rxe_cq *cq = container_of(current, struct rxe_cq, vcq.cq_ex); return cq->wc->dlid_path_bits; } static int rxe_destroy_cq(struct ibv_cq *ibcq); static struct ibv_cq *rxe_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct rxe_cq *cq; struct urxe_create_cq_resp resp = {}; int ret; cq = calloc(1, sizeof(*cq)); if (!cq) return NULL; ret = ibv_cmd_create_cq(context, cqe, channel, comp_vector, &cq->vcq.cq, NULL, 0, &resp.ibv_resp, sizeof(resp)); if (ret) { free(cq); return NULL; } cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, context->cmd_fd, resp.mi.offset); if ((void *)cq->queue == MAP_FAILED) { ibv_cmd_destroy_cq(&cq->vcq.cq); free(cq); return NULL; } cq->wc_size = 1ULL << cq->queue->log2_elem_size; if (cq->wc_size < sizeof(struct ib_uverbs_wc)) { rxe_destroy_cq(&cq->vcq.cq); return NULL; } cq->mmap_info = resp.mi; pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); return &cq->vcq.cq; } enum rxe_sup_wc_flags { RXE_SUP_WC_FLAGS = IBV_WC_EX_WITH_BYTE_LEN | IBV_WC_EX_WITH_IMM | IBV_WC_EX_WITH_QP_NUM | IBV_WC_EX_WITH_SRC_QP | IBV_WC_EX_WITH_SLID | IBV_WC_EX_WITH_SL | IBV_WC_EX_WITH_DLID_PATH_BITS, RXE_SUP_WC_EX_FLAGS = RXE_SUP_WC_FLAGS, // add extended flags here }; static struct ibv_cq_ex *rxe_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *attr) { int ret; struct rxe_cq *cq; struct urxe_create_cq_ex_resp resp = {}; /* user is asking for flags we don't support */ if (attr->wc_flags & ~RXE_SUP_WC_EX_FLAGS) { errno = EOPNOTSUPP; goto err; } cq = calloc(1, sizeof(*cq)); if (!cq) goto err; ret = ibv_cmd_create_cq_ex(context, attr, &cq->vcq, NULL, 0, &resp.ibv_resp, sizeof(resp), 0); if (ret) goto err_free; cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, context->cmd_fd, resp.mi.offset); if ((void *)cq->queue == MAP_FAILED) goto err_destroy; cq->wc_size = 1ULL << cq->queue->log2_elem_size; if (cq->wc_size < sizeof(struct ib_uverbs_wc)) goto err_unmap; cq->mmap_info = resp.mi; pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE); cq->vcq.cq_ex.start_poll = cq_start_poll; cq->vcq.cq_ex.next_poll = cq_next_poll; cq->vcq.cq_ex.end_poll = cq_end_poll; cq->vcq.cq_ex.read_opcode = cq_read_opcode; cq->vcq.cq_ex.read_vendor_err = cq_read_vendor_err; cq->vcq.cq_ex.read_wc_flags = cq_read_wc_flags; if (attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) cq->vcq.cq_ex.read_byte_len = cq_read_byte_len; if (attr->wc_flags & IBV_WC_EX_WITH_IMM) cq->vcq.cq_ex.read_imm_data = cq_read_imm_data; if (attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) cq->vcq.cq_ex.read_qp_num = cq_read_qp_num; if (attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) cq->vcq.cq_ex.read_src_qp = cq_read_src_qp; if (attr->wc_flags & IBV_WC_EX_WITH_SLID) cq->vcq.cq_ex.read_slid = cq_read_slid; if (attr->wc_flags & 
IBV_WC_EX_WITH_SL) cq->vcq.cq_ex.read_sl = cq_read_sl; if (attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) cq->vcq.cq_ex.read_dlid_path_bits = cq_read_dlid_path_bits; return &cq->vcq.cq_ex; err_unmap: if (cq->mmap_info.size) munmap(cq->queue, cq->mmap_info.size); err_destroy: ibv_cmd_destroy_cq(&cq->vcq.cq); err_free: free(cq); err: return NULL; } static int rxe_resize_cq(struct ibv_cq *ibcq, int cqe) { struct rxe_cq *cq = to_rcq(ibcq); struct ibv_resize_cq cmd; struct urxe_resize_cq_resp resp; int ret; pthread_spin_lock(&cq->lock); ret = ibv_cmd_resize_cq(ibcq, cqe, &cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { pthread_spin_unlock(&cq->lock); return ret; } munmap(cq->queue, cq->mmap_info.size); cq->queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, ibcq->context->cmd_fd, resp.mi.offset); ret = errno; pthread_spin_unlock(&cq->lock); if ((void *)cq->queue == MAP_FAILED) { cq->queue = NULL; cq->mmap_info.size = 0; return ret; } cq->mmap_info = resp.mi; return 0; } static int rxe_destroy_cq(struct ibv_cq *ibcq) { struct rxe_cq *cq = to_rcq(ibcq); int ret; ret = ibv_cmd_destroy_cq(ibcq); if (ret) return ret; if (cq->mmap_info.size) munmap(cq->queue, cq->mmap_info.size); free(cq); return 0; } static int rxe_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_queue_buf *q; int npolled; uint8_t *src; pthread_spin_lock(&cq->lock); q = cq->queue; for (npolled = 0; npolled < ne; ++npolled, ++wc) { if (queue_empty(q)) break; src = consumer_addr(q); memcpy(wc, src, sizeof(*wc)); advance_consumer(q); } pthread_spin_unlock(&cq->lock); return npolled; } static struct ibv_srq *rxe_create_srq(struct ibv_pd *ibpd, struct ibv_srq_init_attr *attr) { struct rxe_srq *srq; struct ibv_srq *ibsrq; struct ibv_create_srq cmd; struct urxe_create_srq_resp resp; int ret; srq = calloc(1, sizeof(*srq)); if (srq == NULL) return NULL; ibsrq = &srq->vsrq.srq; ret = ibv_cmd_create_srq(ibpd, ibsrq, attr, &cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { free(srq); return NULL; } srq->rq.queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, ibpd->context->cmd_fd, resp.mi.offset); if ((void *)srq->rq.queue == MAP_FAILED) { ibv_cmd_destroy_srq(ibsrq); free(srq); return NULL; } srq->mmap_info = resp.mi; srq->rq.max_sge = attr->attr.max_sge; pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); return ibsrq; } static struct ibv_srq *rxe_create_srq_ex( struct ibv_context *ibcontext, struct ibv_srq_init_attr_ex *attr_ex) { struct rxe_srq *srq; struct ibv_srq *ibsrq; struct ibv_create_xsrq cmd; struct urxe_create_srq_ex_resp resp; int ret; srq = calloc(1, sizeof(*srq)); if (srq == NULL) return NULL; ibsrq = &srq->vsrq.srq; ret = ibv_cmd_create_srq_ex(ibcontext, &srq->vsrq, attr_ex, &cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { free(srq); return NULL; } srq->rq.queue = mmap(NULL, resp.mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, ibcontext->cmd_fd, resp.mi.offset); if ((void *)srq->rq.queue == MAP_FAILED) { ibv_cmd_destroy_srq(ibsrq); free(srq); return NULL; } srq->mmap_info = resp.mi; srq->rq.max_sge = attr_ex->attr.max_sge; pthread_spin_init(&srq->rq.lock, PTHREAD_PROCESS_PRIVATE); return ibsrq; } static int rxe_modify_srq(struct ibv_srq *ibsrq, struct ibv_srq_attr *attr, int attr_mask) { struct rxe_srq *srq = to_rsrq(ibsrq); struct urxe_modify_srq cmd; int rc = 0; struct mminfo mi; mi.offset = 0; mi.size = 0; if (attr_mask & IBV_SRQ_MAX_WR) pthread_spin_lock(&srq->rq.lock); cmd.mmap_info_addr = 
(__u64)(uintptr_t) &mi; rc = ibv_cmd_modify_srq(ibsrq, attr, attr_mask, &cmd.ibv_cmd, sizeof(cmd)); if (rc) goto out; if (attr_mask & IBV_SRQ_MAX_WR) { munmap(srq->rq.queue, srq->mmap_info.size); srq->rq.queue = mmap(NULL, mi.size, PROT_READ | PROT_WRITE, MAP_SHARED, ibsrq->context->cmd_fd, mi.offset); if ((void *)srq->rq.queue == MAP_FAILED) { rc = errno; srq->rq.queue = NULL; srq->mmap_info.size = 0; goto out; } srq->mmap_info = mi; } out: if (attr_mask & IBV_SRQ_MAX_WR) pthread_spin_unlock(&srq->rq.lock); return rc; } static int rxe_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr) { struct ibv_query_srq cmd; return ibv_cmd_query_srq(srq, attr, &cmd, sizeof(cmd)); } static int rxe_destroy_srq(struct ibv_srq *ibsrq) { int ret; struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_queue_buf *q = srq->rq.queue; ret = ibv_cmd_destroy_srq(ibsrq); if (!ret) { if (srq->mmap_info.size) munmap(q, srq->mmap_info.size); free(srq); } return ret; } static int rxe_post_one_recv(struct rxe_wq *rq, struct ibv_recv_wr *recv_wr) { int i; struct rxe_recv_wqe *wqe; struct rxe_queue_buf *q = rq->queue; int num_sge = recv_wr->num_sge; int length = 0; int rc = 0; if (queue_full(q)) { rc = ENOMEM; goto out; } if (num_sge > rq->max_sge) { rc = EINVAL; goto out; } wqe = (struct rxe_recv_wqe *)producer_addr(q); wqe->wr_id = recv_wr->wr_id; memcpy(wqe->dma.sge, recv_wr->sg_list, num_sge*sizeof(*wqe->dma.sge)); for (i = 0; i < num_sge; i++) length += wqe->dma.sge[i].length; wqe->dma.length = length; wqe->dma.resid = length; wqe->dma.cur_sge = 0; wqe->dma.num_sge = num_sge; wqe->dma.sge_offset = 0; advance_producer(q); out: return rc; } static int rxe_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_recv_wr) { struct rxe_srq *srq = to_rsrq(ibsrq); int rc = 0; pthread_spin_lock(&srq->rq.lock); while (recv_wr) { rc = rxe_post_one_recv(&srq->rq, recv_wr); if (rc) { *bad_recv_wr = recv_wr; break; } recv_wr = recv_wr->next; } pthread_spin_unlock(&srq->rq.lock); return rc; } /* * builders always consume one send queue slot * setters (below) reach back and adjust previous build */ static void wr_atomic_cmp_swp(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr, uint64_t compare, uint64_t swap) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = ibqp->wr_id; wqe->wr.send_flags = ibqp->wr_flags; wqe->wr.opcode = IBV_WR_ATOMIC_CMP_AND_SWP; wqe->wr.wr.atomic.remote_addr = remote_addr; wqe->wr.wr.atomic.compare_add = compare; wqe->wr.wr.atomic.swap = swap; wqe->wr.wr.atomic.rkey = rkey; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_atomic_fetch_add(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr, uint64_t add) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_ATOMIC_FETCH_AND_ADD; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.atomic.remote_addr = remote_addr; wqe->wr.wr.atomic.compare_add = add; wqe->wr.wr.atomic.rkey = rkey; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_bind_mw(struct ibv_qp_ex *ibqp, struct ibv_mw *ibmw, uint32_t rkey, const struct ibv_mw_bind_info *info) { struct rxe_qp *qp = container_of(ibqp, struct 
rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = ibqp->wr_id; wqe->wr.opcode = IBV_WR_BIND_MW; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.mw.addr = info->addr; wqe->wr.wr.mw.length = info->length; wqe->wr.wr.mw.mr_lkey = info->mr->lkey; wqe->wr.wr.mw.mw_rkey = ibmw->rkey; wqe->wr.wr.mw.rkey = rkey; wqe->wr.wr.mw.access = info->mw_access_flags; advance_qp_cur_index(qp); } static void wr_local_inv(struct ibv_qp_ex *ibqp, uint32_t invalidate_rkey) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_LOCAL_INV; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.ex.invalidate_rkey = invalidate_rkey; advance_qp_cur_index(qp); } static void wr_rdma_read(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_RDMA_READ; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.rdma.remote_addr = remote_addr; wqe->wr.wr.rdma.rkey = rkey; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_rdma_write(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_RDMA_WRITE; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.rdma.remote_addr = remote_addr; wqe->wr.wr.rdma.rkey = rkey; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_flush(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr, size_t length, uint8_t type, uint8_t level) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_FLUSH; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.flush.remote_addr = remote_addr; wqe->wr.wr.flush.rkey = rkey; wqe->wr.wr.flush.type = type; wqe->wr.wr.flush.level = level; wqe->dma.length = length; wqe->dma.resid = length; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_atomic_write(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr, const void *atomic_wr) { struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex); struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index); if (check_qp_queue_full(qp)) return; memset(wqe, 0, sizeof(*wqe)); wqe->wr.wr_id = qp->vqp.qp_ex.wr_id; wqe->wr.opcode = IBV_WR_ATOMIC_WRITE; wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags; wqe->wr.wr.rdma.remote_addr = remote_addr; wqe->wr.wr.rdma.rkey = rkey; memcpy(wqe->dma.atomic_wr, atomic_wr, 8); wqe->dma.length = 8; wqe->dma.resid = 8; wqe->iova = remote_addr; advance_qp_cur_index(qp); } static void wr_rdma_write_imm(struct ibv_qp_ex *ibqp, uint32_t rkey, uint64_t remote_addr, __be32 imm_data) { struct rxe_qp *qp = container_of(ibqp, struct 
						rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index);

	if (check_qp_queue_full(qp))
		return;

	memset(wqe, 0, sizeof(*wqe));

	wqe->wr.wr_id = qp->vqp.qp_ex.wr_id;
	wqe->wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
	wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags;
	wqe->wr.wr.rdma.remote_addr = remote_addr;
	wqe->wr.wr.rdma.rkey = rkey;
	wqe->wr.ex.imm_data = imm_data;
	wqe->iova = remote_addr;

	advance_qp_cur_index(qp);
}

static void wr_send(struct ibv_qp_ex *ibqp)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index);

	if (check_qp_queue_full(qp))
		return;

	memset(wqe, 0, sizeof(*wqe));

	wqe->wr.wr_id = qp->vqp.qp_ex.wr_id;
	wqe->wr.opcode = IBV_WR_SEND;
	wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags;

	advance_qp_cur_index(qp);
}

static void wr_send_imm(struct ibv_qp_ex *ibqp, __be32 imm_data)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index);

	if (check_qp_queue_full(qp))
		return;

	memset(wqe, 0, sizeof(*wqe));

	wqe->wr.wr_id = qp->vqp.qp_ex.wr_id;
	wqe->wr.opcode = IBV_WR_SEND_WITH_IMM;
	wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags;
	wqe->wr.ex.imm_data = imm_data;

	advance_qp_cur_index(qp);
}

static void wr_send_inv(struct ibv_qp_ex *ibqp, uint32_t invalidate_rkey)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index);

	if (check_qp_queue_full(qp))
		return;

	memset(wqe, 0, sizeof(*wqe));

	wqe->wr.wr_id = qp->vqp.qp_ex.wr_id;
	wqe->wr.opcode = IBV_WR_SEND_WITH_INV;
	wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags;
	wqe->wr.ex.invalidate_rkey = invalidate_rkey;

	advance_qp_cur_index(qp);
}

static void wr_set_ud_addr(struct ibv_qp_ex *ibqp, struct ibv_ah *ibah,
			   uint32_t remote_qpn, uint32_t remote_qkey)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_ah *ah = to_rah(ibah);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index - 1);

	if (qp->err)
		return;

	wqe->wr.wr.ud.remote_qpn = remote_qpn;
	wqe->wr.wr.ud.remote_qkey = remote_qkey;
	wqe->wr.wr.ud.ah_num = ah->ah_num;

	if (!ah->ah_num)
		/* old kernels only */
		memcpy(&wqe->wr.wr.ud.av, &ah->av, sizeof(ah->av));
}

static void wr_set_inline_data(struct ibv_qp_ex *ibqp, void *addr,
			       size_t length)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index - 1);

	if (qp->err)
		return;

	if (length > qp->sq.max_inline) {
		qp->err = ENOSPC;
		return;
	}

	memcpy(wqe->dma.inline_data, addr, length);
	wqe->dma.length = length;
	wqe->dma.resid = length;
}

static void wr_set_inline_data_list(struct ibv_qp_ex *ibqp, size_t num_buf,
				    const struct ibv_data_buf *buf_list)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index - 1);
	uint8_t *data = wqe->dma.inline_data;
	size_t length;
	size_t tot_length = 0;

	if (qp->err)
		return;

	while (num_buf--) {
		length = buf_list->length;

		if (tot_length + length > qp->sq.max_inline) {
			qp->err = ENOSPC;
			return;
		}

		memcpy(data, buf_list->addr, length);

		/* account for this buffer so dma.length covers all of them */
		tot_length += length;
		buf_list++;
		data += length;
	}

	wqe->dma.length = tot_length;
	wqe->dma.resid = tot_length;
}

static void wr_set_sge(struct ibv_qp_ex *ibqp, uint32_t lkey, uint64_t addr,
		       uint32_t length)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index - 1);

	if (qp->err)
		return;

	if (length) {
		wqe->dma.length = length;
		wqe->dma.resid = length;
		wqe->dma.num_sge = 1;

		wqe->dma.sge[0].addr = addr;
		wqe->dma.sge[0].length = length;
		wqe->dma.sge[0].lkey = lkey;
	}
}

static void wr_set_sge_list(struct ibv_qp_ex *ibqp, size_t num_sge,
			    const struct ibv_sge *sg_list)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue,
						   qp->cur_index - 1);
	size_t tot_length = 0;

	if (qp->err)
		return;

	if (num_sge > qp->sq.max_sge) {
		qp->err = ENOSPC;
		return;
	}

	wqe->dma.num_sge = num_sge;
	memcpy(wqe->dma.sge, sg_list, num_sge*sizeof(*sg_list));

	/* walk each sge, not just the first one */
	while (num_sge--)
		tot_length += sg_list++->length;

	wqe->dma.length = tot_length;
	wqe->dma.resid = tot_length;
}

static void wr_start(struct ibv_qp_ex *ibqp)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);

	pthread_spin_lock(&qp->sq.lock);

	qp->err = 0;
	qp->cur_index = load_producer_index(qp->sq.queue);
}

static int post_send_db(struct ibv_qp *ibqp);

static int wr_complete(struct ibv_qp_ex *ibqp)
{
	int ret;
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);

	if (qp->err) {
		pthread_spin_unlock(&qp->sq.lock);
		return qp->err;
	}

	store_producer_index(qp->sq.queue, qp->cur_index);
	ret = post_send_db(&qp->vqp.qp);

	pthread_spin_unlock(&qp->sq.lock);
	return ret;
}

static void wr_abort(struct ibv_qp_ex *ibqp)
{
	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);

	pthread_spin_unlock(&qp->sq.lock);
}
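/*
 * Illustrative use of the builder/setter pairs above (an application-side
 * sketch, not part of this provider): a QP created with
 * IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is driven through the generic ibv_wr_*
 * wrappers, which land in the functions above:
 *
 *	struct ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp);
 *
 *	ibv_wr_start(qpx);			// wr_start(): lock, load producer index
 *	qpx->wr_id = 1;
 *	qpx->wr_flags = IBV_SEND_SIGNALED;
 *	ibv_wr_rdma_write(qpx, rkey, remote_addr);	// builder uses one slot
 *	ibv_wr_set_sge(qpx, lkey, buf_addr, len);	// setter fills that slot
 *	ret = ibv_wr_complete(qpx);		// store producer index + doorbell
 *
 * ibv_wr_abort() instead of ibv_wr_complete() releases the lock and drops
 * everything built since ibv_wr_start().  qp, rkey, remote_addr, lkey,
 * buf_addr and len are the application's own values.
 */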
static int map_queue_pair(int cmd_fd, struct rxe_qp *qp,
			  struct ibv_qp_init_attr *attr,
			  struct rxe_create_qp_resp *resp)
{
	if (attr->srq) {
		qp->rq.max_sge = 0;
		qp->rq.queue = NULL;
		qp->rq_mmap_info.size = 0;
	} else {
		qp->rq.max_sge = attr->cap.max_recv_sge;
		qp->rq.queue = mmap(NULL, resp->rq_mi.size,
				    PROT_READ | PROT_WRITE, MAP_SHARED,
				    cmd_fd, resp->rq_mi.offset);
		if ((void *)qp->rq.queue == MAP_FAILED)
			return errno;

		qp->rq_mmap_info = resp->rq_mi;
		pthread_spin_init(&qp->rq.lock, PTHREAD_PROCESS_PRIVATE);
	}

	qp->sq.max_sge = attr->cap.max_send_sge;
	qp->sq.max_inline = attr->cap.max_inline_data;
	qp->sq.queue = mmap(NULL, resp->sq_mi.size,
			    PROT_READ | PROT_WRITE, MAP_SHARED,
			    cmd_fd, resp->sq_mi.offset);
	if ((void *)qp->sq.queue == MAP_FAILED) {
		if (qp->rq_mmap_info.size)
			munmap(qp->rq.queue, qp->rq_mmap_info.size);
		return errno;
	}

	qp->sq_mmap_info = resp->sq_mi;
	pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE);

	return 0;
}

static struct ibv_qp *rxe_create_qp(struct ibv_pd *ibpd,
				    struct ibv_qp_init_attr *attr)
{
	struct ibv_create_qp cmd = {};
	struct urxe_create_qp_resp resp = {};
	struct rxe_qp *qp;
	int ret;

	qp = calloc(1, sizeof(*qp));
	if (!qp)
		goto err;

	ret = ibv_cmd_create_qp(ibpd, &qp->vqp.qp, attr, &cmd, sizeof(cmd),
				&resp.ibv_resp, sizeof(resp));
	if (ret)
		goto err_free;

	ret = map_queue_pair(ibpd->context->cmd_fd, qp, attr,
			     &resp.drv_payload);
	if (ret)
		goto err_destroy;

	qp->sq_mmap_info = resp.sq_mi;
	pthread_spin_init(&qp->sq.lock, PTHREAD_PROCESS_PRIVATE);

	return &qp->vqp.qp;

err_destroy:
	ibv_cmd_destroy_qp(&qp->vqp.qp);
err_free:
	free(qp);
err:
	return NULL;
}

enum {
	RXE_QP_CREATE_FLAGS_SUP = 0,

	RXE_QP_COMP_MASK_SUP = IBV_QP_INIT_ATTR_PD |
		IBV_QP_INIT_ATTR_CREATE_FLAGS |
		IBV_QP_INIT_ATTR_SEND_OPS_FLAGS,

	RXE_SUP_RC_QP_SEND_OPS_FLAGS =
		IBV_QP_EX_WITH_RDMA_WRITE |
		IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM |
		IBV_QP_EX_WITH_SEND |
		IBV_QP_EX_WITH_SEND_WITH_IMM |
		IBV_QP_EX_WITH_RDMA_READ |
		IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP |
		IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD |
		IBV_QP_EX_WITH_LOCAL_INV |
		IBV_QP_EX_WITH_BIND_MW |
		IBV_QP_EX_WITH_SEND_WITH_INV |
IBV_QP_EX_WITH_FLUSH | IBV_QP_EX_WITH_ATOMIC_WRITE, RXE_SUP_UC_QP_SEND_OPS_FLAGS = IBV_QP_EX_WITH_RDMA_WRITE | IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM | IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM | IBV_QP_EX_WITH_BIND_MW | IBV_QP_EX_WITH_SEND_WITH_INV, RXE_SUP_UD_QP_SEND_OPS_FLAGS = IBV_QP_EX_WITH_SEND | IBV_QP_EX_WITH_SEND_WITH_IMM, }; static int check_qp_init_attr(struct ibv_qp_init_attr_ex *attr) { if (attr->comp_mask & ~RXE_QP_COMP_MASK_SUP) goto err; if ((attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) && (attr->create_flags & ~RXE_QP_CREATE_FLAGS_SUP)) goto err; if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) { switch (attr->qp_type) { case IBV_QPT_RC: if (attr->send_ops_flags & ~RXE_SUP_RC_QP_SEND_OPS_FLAGS) goto err; break; case IBV_QPT_UC: if (attr->send_ops_flags & ~RXE_SUP_UC_QP_SEND_OPS_FLAGS) goto err; break; case IBV_QPT_UD: if (attr->send_ops_flags & ~RXE_SUP_UD_QP_SEND_OPS_FLAGS) goto err; break; default: goto err; } } return 0; err: errno = EOPNOTSUPP; return errno; } static void set_qp_send_ops(struct rxe_qp *qp, uint64_t flags) { if (flags & IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) qp->vqp.qp_ex.wr_atomic_cmp_swp = wr_atomic_cmp_swp; if (flags & IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) qp->vqp.qp_ex.wr_atomic_fetch_add = wr_atomic_fetch_add; if (flags & IBV_QP_EX_WITH_BIND_MW) qp->vqp.qp_ex.wr_bind_mw = wr_bind_mw; if (flags & IBV_QP_EX_WITH_LOCAL_INV) qp->vqp.qp_ex.wr_local_inv = wr_local_inv; if (flags & IBV_QP_EX_WITH_ATOMIC_WRITE) qp->vqp.qp_ex.wr_atomic_write = wr_atomic_write; if (flags & IBV_QP_EX_WITH_RDMA_READ) qp->vqp.qp_ex.wr_rdma_read = wr_rdma_read; if (flags & IBV_QP_EX_WITH_RDMA_WRITE) qp->vqp.qp_ex.wr_rdma_write = wr_rdma_write; if (flags & IBV_QP_EX_WITH_FLUSH) qp->vqp.qp_ex.wr_flush = wr_flush; if (flags & IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) qp->vqp.qp_ex.wr_rdma_write_imm = wr_rdma_write_imm; if (flags & IBV_QP_EX_WITH_SEND) qp->vqp.qp_ex.wr_send = wr_send; if (flags & IBV_QP_EX_WITH_SEND_WITH_IMM) qp->vqp.qp_ex.wr_send_imm = wr_send_imm; if (flags & IBV_QP_EX_WITH_SEND_WITH_INV) qp->vqp.qp_ex.wr_send_inv = wr_send_inv; qp->vqp.qp_ex.wr_set_ud_addr = wr_set_ud_addr; qp->vqp.qp_ex.wr_set_inline_data = wr_set_inline_data; qp->vqp.qp_ex.wr_set_inline_data_list = wr_set_inline_data_list; qp->vqp.qp_ex.wr_set_sge = wr_set_sge; qp->vqp.qp_ex.wr_set_sge_list = wr_set_sge_list; qp->vqp.qp_ex.wr_start = wr_start; qp->vqp.qp_ex.wr_complete = wr_complete; qp->vqp.qp_ex.wr_abort = wr_abort; } static struct ibv_qp *rxe_create_qp_ex(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr) { int ret; struct rxe_qp *qp; struct ibv_create_qp_ex cmd = {}; struct urxe_create_qp_ex_resp resp = {}; size_t cmd_size = sizeof(cmd); size_t resp_size = sizeof(resp); ret = check_qp_init_attr(attr); if (ret) goto err; qp = calloc(1, sizeof(*qp)); if (!qp) goto err; if (attr->comp_mask & IBV_QP_INIT_ATTR_SEND_OPS_FLAGS) set_qp_send_ops(qp, attr->send_ops_flags); ret = ibv_cmd_create_qp_ex2(context, &qp->vqp, attr, &cmd, cmd_size, &resp.ibv_resp, resp_size); if (ret) goto err_free; qp->vqp.comp_mask |= VERBS_QP_EX; ret = map_queue_pair(context->cmd_fd, qp, (struct ibv_qp_init_attr *)attr, &resp.drv_payload); if (ret) goto err_destroy; return &qp->vqp.qp; err_destroy: ibv_cmd_destroy_qp(&qp->vqp.qp); err_free: free(qp); err: return NULL; } static int rxe_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd = {}; return ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof(cmd)); } static 
int rxe_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; return ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); } static int rxe_destroy_qp(struct ibv_qp *ibqp) { int ret; struct rxe_qp *qp = to_rqp(ibqp); ret = ibv_cmd_destroy_qp(ibqp); if (!ret) { if (qp->rq_mmap_info.size) munmap(qp->rq.queue, qp->rq_mmap_info.size); if (qp->sq_mmap_info.size) munmap(qp->sq.queue, qp->sq_mmap_info.size); free(qp); } return ret; } /* basic sanity checks for send work request */ static int validate_send_wr(struct rxe_qp *qp, struct ibv_send_wr *ibwr, unsigned int length) { struct rxe_wq *sq = &qp->sq; enum ibv_wr_opcode opcode = ibwr->opcode; if (ibwr->num_sge > sq->max_sge) return EINVAL; if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) if (length < 8 || ibwr->wr.atomic.remote_addr & 0x7) return EINVAL; if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline)) return EINVAL; if (ibwr->opcode == IBV_WR_BIND_MW) { if (length) return EINVAL; if (ibwr->num_sge) return EINVAL; if (ibwr->imm_data) return EINVAL; if ((qp_type(qp) != IBV_QPT_RC) && (qp_type(qp) != IBV_QPT_UC)) return EINVAL; } return 0; } static void convert_send_wr(struct rxe_qp *qp, struct rxe_send_wr *kwr, struct ibv_send_wr *uwr) { struct ibv_mw *ibmw; struct ibv_mr *ibmr; memset(kwr, 0, sizeof(*kwr)); kwr->wr_id = uwr->wr_id; kwr->opcode = uwr->opcode; kwr->send_flags = uwr->send_flags; kwr->ex.imm_data = uwr->imm_data; switch (uwr->opcode) { case IBV_WR_RDMA_WRITE: case IBV_WR_RDMA_WRITE_WITH_IMM: case IBV_WR_RDMA_READ: kwr->wr.rdma.remote_addr = uwr->wr.rdma.remote_addr; kwr->wr.rdma.rkey = uwr->wr.rdma.rkey; break; case IBV_WR_SEND: case IBV_WR_SEND_WITH_IMM: if (qp_type(qp) == IBV_QPT_UD) { struct rxe_ah *ah = to_rah(uwr->wr.ud.ah); kwr->wr.ud.remote_qpn = uwr->wr.ud.remote_qpn; kwr->wr.ud.remote_qkey = uwr->wr.ud.remote_qkey; kwr->wr.ud.ah_num = ah->ah_num; } break; case IBV_WR_ATOMIC_CMP_AND_SWP: case IBV_WR_ATOMIC_FETCH_AND_ADD: kwr->wr.atomic.remote_addr = uwr->wr.atomic.remote_addr; kwr->wr.atomic.compare_add = uwr->wr.atomic.compare_add; kwr->wr.atomic.swap = uwr->wr.atomic.swap; kwr->wr.atomic.rkey = uwr->wr.atomic.rkey; break; case IBV_WR_BIND_MW: ibmr = uwr->bind_mw.bind_info.mr; ibmw = uwr->bind_mw.mw; kwr->wr.mw.addr = uwr->bind_mw.bind_info.addr; kwr->wr.mw.length = uwr->bind_mw.bind_info.length; kwr->wr.mw.mr_lkey = ibmr->lkey; kwr->wr.mw.mw_rkey = ibmw->rkey; kwr->wr.mw.rkey = uwr->bind_mw.rkey; kwr->wr.mw.access = uwr->bind_mw.bind_info.mw_access_flags; break; default: break; } } static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq, struct ibv_send_wr *ibwr, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; int i; unsigned int opcode = ibwr->opcode; convert_send_wr(qp, &wqe->wr, ibwr); if (qp_type(qp) == IBV_QPT_UD) { struct rxe_ah *ah = to_rah(ibwr->wr.ud.ah); if (!ah->ah_num) /* old kernels only */ memcpy(&wqe->wr.wr.ud.av, &ah->av, sizeof(struct rxe_av)); } if (ibwr->send_flags & IBV_SEND_INLINE) { uint8_t *inline_data = wqe->dma.inline_data; for (i = 0; i < num_sge; i++) { memcpy(inline_data, (uint8_t *)(long)ibwr->sg_list[i].addr, ibwr->sg_list[i].length); inline_data += ibwr->sg_list[i].length; } } else memcpy(wqe->dma.sge, ibwr->sg_list, num_sge*sizeof(struct ibv_sge)); if ((opcode == IBV_WR_ATOMIC_CMP_AND_SWP) || (opcode == IBV_WR_ATOMIC_FETCH_AND_ADD)) wqe->iova = ibwr->wr.atomic.remote_addr; else wqe->iova = ibwr->wr.rdma.remote_addr; wqe->dma.length = 
length; wqe->dma.resid = length; wqe->dma.num_sge = num_sge; wqe->dma.cur_sge = 0; wqe->dma.sge_offset = 0; wqe->state = 0; return 0; } static int post_one_send(struct rxe_qp *qp, struct rxe_wq *sq, struct ibv_send_wr *ibwr) { int err; struct rxe_send_wqe *wqe; unsigned int length = 0; int i; for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; err = validate_send_wr(qp, ibwr, length); if (err) { verbs_err(verbs_get_ctx(qp->vqp.qp.context), "validate send failed\n"); return err; } wqe = (struct rxe_send_wqe *)producer_addr(sq->queue); err = init_send_wqe(qp, sq, ibwr, length, wqe); if (err) return err; if (queue_full(sq->queue)) return ENOMEM; advance_producer(sq->queue); rdma_tracepoint(rdma_core_rxe, post_send, qp->vqp.qp.context->device->name, qp->vqp.qp.qp_num, (char *)ibv_wr_opcode_str(ibwr->opcode), length); return 0; } /* send a null post send as a doorbell */ static int post_send_db(struct ibv_qp *ibqp) { struct ibv_post_send cmd; struct ib_uverbs_post_send_resp resp; cmd.hdr.command = IB_USER_VERBS_CMD_POST_SEND; cmd.hdr.in_words = sizeof(cmd) / 4; cmd.hdr.out_words = sizeof(resp) / 4; cmd.response = (uintptr_t)&resp; cmd.qp_handle = ibqp->handle; cmd.wr_count = 0; cmd.sge_count = 0; cmd.wqe_size = sizeof(struct ibv_send_wr); if (write(ibqp->context->cmd_fd, &cmd, sizeof(cmd)) != sizeof(cmd)) return errno; return 0; } /* this API does not make a distinction between * restartable and non-restartable errors */ static int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list, struct ibv_send_wr **bad_wr) { int rc = 0; int err; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_wq *sq = &qp->sq; if (!bad_wr) return EINVAL; *bad_wr = NULL; if (!sq || !wr_list || !sq->queue) return EINVAL; pthread_spin_lock(&sq->lock); while (wr_list) { rc = post_one_send(qp, sq, wr_list); if (rc) { *bad_wr = wr_list; break; } wr_list = wr_list->next; } pthread_spin_unlock(&sq->lock); err = post_send_db(ibqp); return err ? err : rc; } static int rxe_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *recv_wr, struct ibv_recv_wr **bad_wr) { int rc = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_wq *rq = &qp->rq; if (!bad_wr) return EINVAL; *bad_wr = NULL; if (!rq || !recv_wr || !rq->queue) return EINVAL; /* see C10-97.2.1 */ if (ibqp->state == IBV_QPS_RESET) return EINVAL; pthread_spin_lock(&rq->lock); while (recv_wr) { rc = rxe_post_one_recv(rq, recv_wr); if (rc) { *bad_wr = recv_wr; break; } recv_wr = recv_wr->next; } pthread_spin_unlock(&rq->lock); return rc; } static inline int ipv6_addr_v4mapped(const struct in6_addr *a) { return IN6_IS_ADDR_V4MAPPED(a); } typedef typeof(((struct rxe_av *)0)->sgid_addr) sockaddr_union_t; static inline int rdma_gid2ip(sockaddr_union_t *out, union ibv_gid *gid) { if (ipv6_addr_v4mapped((struct in6_addr *)gid)) { memset(&out->_sockaddr_in, 0, sizeof(out->_sockaddr_in)); memcpy(&out->_sockaddr_in.sin_addr.s_addr, gid->raw + 12, 4); } else { memset(&out->_sockaddr_in6, 0, sizeof(out->_sockaddr_in6)); out->_sockaddr_in6.sin6_family = AF_INET6; memcpy(&out->_sockaddr_in6.sin6_addr.s6_addr, gid->raw, 16); } return 0; } static int rxe_create_av(struct rxe_ah *ah, struct ibv_pd *pd, struct ibv_ah_attr *attr) { struct rxe_av *av = &ah->av; union ibv_gid sgid; int ret; ret = ibv_query_gid(pd->context, attr->port_num, attr->grh.sgid_index, &sgid); if (ret) return ret; av->port_num = attr->port_num; memcpy(&av->grh, &attr->grh, sizeof(attr->grh)); ret = ipv6_addr_v4mapped((struct in6_addr *)attr->grh.dgid.raw); av->network_type = ret ? 
				   RXE_NETWORK_TYPE_IPV4 : RXE_NETWORK_TYPE_IPV6;

	rdma_gid2ip(&av->sgid_addr, &sgid);
	rdma_gid2ip(&av->dgid_addr, &attr->grh.dgid);

	ret = ibv_resolve_eth_l2_from_gid(pd->context, attr, av->dmac, NULL);

	return ret;
}

/*
 * Newer kernels will return a non-zero AH index in resp.ah_num
 * which can be returned in UD send WQEs.
 * Older kernels will leave ah_num == 0. For these create an AV and use
 * in UD send WQEs.
 */
static struct ibv_ah *rxe_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
	struct rxe_ah *ah;
	struct urxe_create_ah_resp resp = {};
	int ret;

	ah = calloc(1, sizeof(*ah));
	if (!ah)
		return NULL;

	ret = ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp.ibv_resp,
				sizeof(resp));
	if (ret)
		goto err_free;

	ah->ah_num = resp.ah_num;

	if (!ah->ah_num) {
		/* old kernels only */
		ret = rxe_create_av(ah, pd, attr);
		if (ret)
			goto err_free;
	}

	return &ah->ibv_ah;

err_free:
	free(ah);
	return NULL;
}

static int rxe_destroy_ah(struct ibv_ah *ibah)
{
	struct rxe_ah *ah = to_rah(ibah);
	int ret;

	ret = ibv_cmd_destroy_ah(&ah->ibv_ah);
	if (!ret)
		free(ah);

	return ret;
}
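/*
 * Application-side sketch (illustrative only) of how the AH created above is
 * consumed on a UD QP; ah_attr, dest_qpn, dest_qkey, qp, wr and bad_wr are
 * the application's own values:
 *
 *	struct ibv_ah *ah = ibv_create_ah(pd, &ah_attr);	// rxe_create_ah()
 *
 *	wr.opcode = IBV_WR_SEND;
 *	wr.wr.ud.ah = ah;
 *	wr.wr.ud.remote_qpn = dest_qpn;
 *	wr.wr.ud.remote_qkey = dest_qkey;
 *	ret = ibv_post_send(qp, &wr, &bad_wr);
 *
 * convert_send_wr() and init_send_wqe() above then carry either ah->ah_num
 * (new kernels) or the cached AV (old kernels) into the send WQE.
 */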
static const struct verbs_context_ops rxe_ctx_ops = {
	.query_device_ex = rxe_query_device,
	.query_port = rxe_query_port,
	.alloc_pd = rxe_alloc_pd,
	.dealloc_pd = rxe_dealloc_pd,
	.reg_mr = rxe_reg_mr,
	.dereg_mr = rxe_dereg_mr,
	.alloc_mw = rxe_alloc_mw,
	.dealloc_mw = rxe_dealloc_mw,
	.bind_mw = rxe_bind_mw,
	.create_cq = rxe_create_cq,
	.create_cq_ex = rxe_create_cq_ex,
	.poll_cq = rxe_poll_cq,
	.req_notify_cq = ibv_cmd_req_notify_cq,
	.resize_cq = rxe_resize_cq,
	.destroy_cq = rxe_destroy_cq,
	.create_srq = rxe_create_srq,
	.create_srq_ex = rxe_create_srq_ex,
	.modify_srq = rxe_modify_srq,
	.query_srq = rxe_query_srq,
	.destroy_srq = rxe_destroy_srq,
	.post_srq_recv = rxe_post_srq_recv,
	.create_qp = rxe_create_qp,
	.create_qp_ex = rxe_create_qp_ex,
	.query_qp = rxe_query_qp,
	.modify_qp = rxe_modify_qp,
	.destroy_qp = rxe_destroy_qp,
	.post_send = rxe_post_send,
	.post_recv = rxe_post_recv,
	.create_ah = rxe_create_ah,
	.destroy_ah = rxe_destroy_ah,
	.attach_mcast = ibv_cmd_attach_mcast,
	.detach_mcast = ibv_cmd_detach_mcast,
	.free_context = rxe_free_context,
};

static struct verbs_context *rxe_alloc_context(struct ibv_device *ibdev,
					       int cmd_fd,
					       void *private_data)
{
	struct rxe_context *context;
	struct ibv_get_context cmd;
	struct ib_uverbs_get_context_resp resp;

	context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx,
					       RDMA_DRIVER_RXE);
	if (!context)
		return NULL;

	if (ibv_cmd_get_context(&context->ibv_ctx, &cmd, sizeof(cmd),
				&resp, sizeof(resp)))
		goto out;

	verbs_set_ops(&context->ibv_ctx, &rxe_ctx_ops);

	return &context->ibv_ctx;

out:
	verbs_uninit_context(&context->ibv_ctx);
	free(context);
	return NULL;
}

static void rxe_free_context(struct ibv_context *ibctx)
{
	struct rxe_context *context = to_rctx(ibctx);

	verbs_uninit_context(&context->ibv_ctx);
	free(context);
}

static void rxe_uninit_device(struct verbs_device *verbs_device)
{
	struct rxe_device *dev = to_rdev(&verbs_device->device);

	free(dev);
}

static struct verbs_device *rxe_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
{
	struct rxe_device *dev;

	dev = calloc(1, sizeof(*dev));
	if (!dev)
		return NULL;

	dev->abi_version = sysfs_dev->abi_ver;

	return &dev->ibv_dev;
}

static const struct verbs_device_ops rxe_dev_ops = {
	.name = "rxe",
	/*
	 * For 64 bit machines ABI version 1 and 2 are the same. Otherwise 32
	 * bit machines require ABI version 2 which guarantees the user and
	 * kernel use the same ABI.
	 */
	.match_min_abi_version = sizeof(void *) == 8 ? 1 : 2,
	.match_max_abi_version = 2,
	.match_table = hca_table,
	.alloc_device = rxe_device_alloc,
	.uninit_device = rxe_uninit_device,
	.alloc_context = rxe_alloc_context,
};

PROVIDER_DRIVER(rxe, rxe_dev_ops);
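/*
 * PROVIDER_DRIVER() registers rxe_dev_ops with libibverbs, so a soft-RoCE
 * device is reached through the normal discovery path.  A minimal
 * application-side sketch (illustrative, assuming the first device in the
 * list is the rxe one):
 *
 *	int num;
 *	struct ibv_device **list = ibv_get_device_list(&num);
 *	struct ibv_context *ctx = ibv_open_device(list[0]);	// rxe_alloc_context()
 *	...
 *	ibv_close_device(ctx);
 *	ibv_free_device_list(list);
 */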