/* * Copyright (c) 2015-2016 QLogic Corporation * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and /or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "qelr.h" #include "qelr_chain.h" #include "qelr_verbs.h" #include #include #include #include #include #define QELR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) #define QELR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) #define QELR_CQE_SIZE (sizeof(union rdma_cqe)) static void qelr_inc_sw_cons_u16(struct qelr_qp_hwq_info *info) { info->cons = (info->cons + 1) % info->max_wr; info->wqe_cons++; } static void qelr_inc_sw_prod_u16(struct qelr_qp_hwq_info *info) { info->prod = (info->prod + 1) % info->max_wr; } static inline int qelr_wq_is_full(struct qelr_qp_hwq_info *info) { return (((info->prod + 1) % info->max_wr) == info->cons); } int qelr_query_device(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size) { struct ib_uverbs_ex_query_device_resp resp; size_t resp_size = sizeof(resp); uint64_t fw_ver; unsigned int major, minor, revision, eng; int ret; ret = ibv_cmd_query_device_any(context, input, attr, attr_size, &resp, &resp_size); if (ret) return ret; fw_ver = resp.base.fw_ver; major = (fw_ver >> 24) & 0xff; minor = (fw_ver >> 16) & 0xff; revision = (fw_ver >> 8) & 0xff; eng = fw_ver & 0xff; snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver), "%d.%d.%d.%d", major, minor, revision, eng); return 0; } int qelr_query_port(struct ibv_context *context, uint8_t port, struct ibv_port_attr *attr) { struct ibv_query_port cmd; int status; status = ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); return status; } struct ibv_pd *qelr_alloc_pd(struct ibv_context *context) { struct qelr_alloc_pd cmd; struct qelr_alloc_pd_resp resp; struct qelr_pd *pd; struct qelr_devctx *cxt = get_qelr_ctx(context); pd = malloc(sizeof(*pd)); if (!pd) return NULL; bzero(pd, sizeof(*pd)); memset(&cmd, 0, sizeof(cmd)); if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { free(pd); return NULL; } pd->pd_id = resp.pd_id; verbs_debug(&cxt->ibv_ctx, "Allocated pd: %d\n", pd->pd_id); return &pd->ibv_pd; } int qelr_dealloc_pd(struct ibv_pd *ibpd) { int rc = 0; 
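	/*
	 * Descriptive note (not in the original source): the kernel-side PD is
	 * destroyed first via ibv_cmd_dealloc_pd(); the user-space qelr_pd
	 * wrapper is freed only if that command succeeds, so a failed dealloc
	 * leaves the PD intact for the caller.
	 */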
struct qelr_pd *pd = get_qelr_pd(ibpd); struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context); verbs_debug(&cxt->ibv_ctx, "Deallocated pd: %d\n", pd->pd_id); rc = ibv_cmd_dealloc_pd(ibpd); if (rc) return rc; free(pd); return rc; } struct ibv_mr *qelr_reg_mr(struct ibv_pd *ibpd, void *addr, size_t len, uint64_t hca_va, int access) { struct qelr_mr *mr; struct ibv_reg_mr cmd; struct qelr_reg_mr_resp resp; struct qelr_pd *pd = get_qelr_pd(ibpd); struct qelr_devctx *cxt = get_qelr_ctx(ibpd->context); mr = malloc(sizeof(*mr)); if (!mr) return NULL; bzero(mr, sizeof(*mr)); if (ibv_cmd_reg_mr(ibpd, addr, len, hca_va, access, &mr->vmr, &cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp))) { free(mr); return NULL; } verbs_debug(&cxt->ibv_ctx, "MR Register %p completed successfully pd_id=%d addr=%p len=%zu access=%d lkey=%x rkey=%x\n", mr, pd->pd_id, addr, len, access, mr->vmr.ibv_mr.lkey, mr->vmr.ibv_mr.rkey); return &mr->vmr.ibv_mr; } int qelr_dereg_mr(struct verbs_mr *vmr) { struct qelr_devctx *cxt = get_qelr_ctx(vmr->ibv_mr.context); int rc; rc = ibv_cmd_dereg_mr(vmr); if (rc) return rc; verbs_debug(&cxt->ibv_ctx, "MR DERegister %p completed successfully\n", vmr); free(vmr); return 0; } static void consume_cqe(struct qelr_cq *cq) { if (cq->latest_cqe == cq->toggle_cqe) cq->chain_toggle ^= RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; cq->latest_cqe = qelr_chain_consume(&cq->chain); } static inline int qelr_cq_entries(int entries) { /* FW requires an extra entry */ return entries + 1; } struct ibv_cq *qelr_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector) { struct qelr_devctx *cxt = get_qelr_ctx(context); struct qelr_create_cq_resp resp = {}; struct qelr_create_cq cmd; struct qelr_cq *cq; int chain_size; int rc; verbs_debug(&cxt->ibv_ctx, "create cq: context=%p, cqe=%d, channel=%p, comp_vector=%d\n", context, cqe, channel, comp_vector); if (!cqe || cqe > cxt->max_cqes) { verbs_err(&cxt->ibv_ctx, "create cq: failed. attempted to allocate %d cqes but valid range is 1...%d\n", cqe, cxt->max_cqes); errno = EINVAL; return NULL; } /* allocate CQ structure */ cq = calloc(1, sizeof(*cq)); if (!cq) return NULL; /* allocate CQ buffer */ chain_size = qelr_cq_entries(cqe) * QELR_CQE_SIZE; rc = qelr_chain_alloc(&cq->chain, chain_size, cxt->kernel_page_size, QELR_CQE_SIZE); if (rc) goto err_0; cmd.addr = (uintptr_t) cq->chain.first_addr; cmd.len = cq->chain.size; rc = ibv_cmd_create_cq(context, cqe, channel, comp_vector, &cq->ibv_cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (rc) { verbs_err(&cxt->ibv_ctx, "create cq: failed with rc = %d\n", rc); goto err_1; } /* map the doorbell and prepare its data */ cq->db.data.icid = htole16(resp.icid); cq->db.data.params = DB_AGG_CMD_SET << RDMA_PWM_VAL32_DATA_AGG_CMD_SHIFT; cq->db_addr = cxt->db_addr + resp.db_offset; if (resp.db_rec_addr) { cq->db_rec_map = mmap(NULL, cxt->kernel_page_size, PROT_WRITE, MAP_SHARED, context->cmd_fd, resp.db_rec_addr); if (cq->db_rec_map == MAP_FAILED) { int errsv = errno; verbs_err(&cxt->ibv_ctx, "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", resp.db_rec_addr, cxt->kernel_page_size, context->cmd_fd, errsv); goto err_1; } cq->db_rec_addr = cq->db_rec_map; } else { /* Kernel doesn't support doorbell recovery. 
Point to dummy * location instead */ cq->db_rec_addr = &cxt->db_rec_addr_dummy; } /* point to the very last element, passing this we will toggle */ cq->toggle_cqe = qelr_chain_get_last_elem(&cq->chain); cq->chain_toggle = RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK; cq->latest_cqe = NULL; /* must be different from chain_toggle */ consume_cqe(cq); verbs_debug(&cxt->ibv_ctx, "create cq: successfully created %p\n", cq); return &cq->ibv_cq; err_1: qelr_chain_free(&cq->chain); err_0: free(cq); return NULL; } int qelr_destroy_cq(struct ibv_cq *ibv_cq) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_cq->context); struct qelr_cq *cq = get_qelr_cq(ibv_cq); int rc; verbs_debug(&cxt->ibv_ctx, "destroy cq: %p\n", cq); rc = ibv_cmd_destroy_cq(ibv_cq); if (rc) { verbs_debug(&cxt->ibv_ctx, "destroy cq: failed to destroy %p, got %d.\n", cq, rc); return rc; } qelr_chain_free(&cq->chain); if (cq->db_rec_map) munmap(cq->db_rec_map, cxt->kernel_page_size); verbs_debug(&cxt->ibv_ctx, "destroy cq: successfully destroyed %p\n", cq); free(cq); return 0; } static struct qelr_srq *qelr_get_srq(struct qelr_devctx *cxt, uint32_t srq_id) { if (unlikely(srq_id >= QELR_MAX_SRQ_ID)) { verbs_err(&cxt->ibv_ctx, "invalid srq_id %u\n", srq_id); return NULL; } return cxt->srq_table[srq_id]; } int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr) { struct ibv_query_srq cmd; return ibv_cmd_query_srq(ibv_srq, attr, &cmd, sizeof(cmd)); } int qelr_modify_srq(struct ibv_srq *srq, struct ibv_srq_attr *attr, int attr_mask) { struct ibv_modify_srq cmd; return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof(cmd)); } static void qelr_destroy_srq_buffers(struct ibv_srq *ibv_srq) { struct qelr_srq *srq = get_qelr_srq(ibv_srq); uint32_t *virt_prod_pair_addr; uint32_t prod_size; qelr_chain_free(&srq->hw_srq.chain); virt_prod_pair_addr = srq->hw_srq.virt_prod_pair_addr; prod_size = sizeof(struct rdma_srq_producers); ibv_dofork_range(virt_prod_pair_addr, prod_size); munmap(virt_prod_pair_addr, prod_size); } int qelr_destroy_srq(struct ibv_srq *ibv_srq) { struct qelr_devctx *cxt = get_qelr_ctx(ibv_srq->context); struct qelr_srq *srq = get_qelr_srq(ibv_srq); int ret; ret = ibv_cmd_destroy_srq(ibv_srq); if (ret) return ret; if (srq->is_xrc) cxt->srq_table[srq->srq_id] = NULL; qelr_destroy_srq_buffers(ibv_srq); free(srq); return 0; } static void qelr_create_srq_configure_req(struct qelr_srq *srq, struct qelr_create_srq *req) { req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr; req->srq_len = srq->hw_srq.chain.size; req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; } static inline void qelr_create_srq_configure_req_ex(struct qelr_srq *srq, struct qelr_create_srq_ex *req) { req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr; req->srq_len = srq->hw_srq.chain.size; req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; } static int qelr_create_srq_buffers(struct qelr_devctx *cxt, struct qelr_srq *srq, uint32_t max_wr) { uint32_t max_sges; int chain_size, prod_size; void *addr; int rc; if (!max_wr) return -EINVAL; max_wr = min_t(uint32_t, max_wr, cxt->max_srq_wr); max_sges = max_wr * (cxt->sges_per_srq_wr + 1); /* +1 for header */ chain_size = max_sges * QELR_RQE_ELEMENT_SIZE; rc = qelr_chain_alloc(&srq->hw_srq.chain, chain_size, cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE); if (rc) { verbs_err(&cxt->ibv_ctx, "create srq: failed to map srq, got %d", rc); return rc; } prod_size = sizeof(struct rdma_srq_producers); addr = mmap(NULL, prod_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 
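	/*
	 * Descriptive note (not in the original source): the producer pair is a
	 * small anonymous mapping holding struct rdma_srq_producers.
	 * qelr_post_srq_recv() bumps wqe_prod/sge_prod here, and the address is
	 * handed to the kernel in the create_srq request (prod_pair_addr);
	 * ibv_dontfork_range() below marks the range MADV_DONTFORK so a later
	 * fork() cannot turn these pages into copy-on-write copies behind the
	 * device's back.
	 */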
if (addr == MAP_FAILED) { verbs_err(&cxt->ibv_ctx, "create srq: failed to map producer, got %d", errno); qelr_chain_free(&srq->hw_srq.chain); return errno; } rc = ibv_dontfork_range(addr, prod_size); if (rc) { munmap(addr, prod_size); qelr_chain_free(&srq->hw_srq.chain); return rc; } srq->hw_srq.virt_prod_pair_addr = addr; srq->hw_srq.max_sges = cxt->sges_per_srq_wr; srq->hw_srq.max_wr = max_wr; return 0; } struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, struct ibv_srq_init_attr *init_attr) { struct qelr_devctx *cxt = get_qelr_ctx(pd->context); struct qelr_create_srq req; struct qelr_create_srq_resp resp; struct ibv_srq *ibv_srq; struct qelr_srq *srq; int ret; srq = calloc(1, sizeof(*srq)); if (!srq) return NULL; ibv_srq = &srq->verbs_srq.srq; ret = qelr_create_srq_buffers(cxt, srq, init_attr->attr.max_wr); if (ret) { free(srq); return NULL; } pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); qelr_create_srq_configure_req(srq, &req); ret = ibv_cmd_create_srq(pd, ibv_srq, init_attr, &req.ibv_cmd, sizeof(req), &resp.ibv_resp, sizeof(resp)); if (ret) { qelr_destroy_srq_buffers(ibv_srq); free(srq); return NULL; } return ibv_srq; } static void qelr_free_rq(struct qelr_qp *qp) { free(qp->rqe_wr_id); } static void qelr_free_sq(struct qelr_qp *qp) { free(qp->wqe_wr_id); } static void qelr_chain_free_sq(struct qelr_qp *qp) { qelr_chain_free(&qp->sq.chain); } static void qelr_chain_free_rq(struct qelr_qp *qp) { qelr_chain_free(&qp->rq.chain); } static inline bool qelr_qp_has_rq(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_RQ); } static inline bool qelr_qp_has_sq(struct qelr_qp *qp) { return !!(qp->flags & QELR_QP_FLAG_SQ); } static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx) { uint32_t max_send_wr, max_send_sges, max_send_buf; int chain_size; int rc; /* SQ */ max_send_wr = attrx->cap.max_send_wr; max_send_wr = max_t(uint32_t, max_send_wr, 1); max_send_wr = min_t(uint32_t, max_send_wr, cxt->max_send_wr); max_send_sges = max_send_wr * cxt->sges_per_send_wr; max_send_buf = max_send_sges * QELR_SQE_ELEMENT_SIZE; chain_size = max_send_buf; rc = qelr_chain_alloc(&qp->sq.chain, chain_size, cxt->kernel_page_size, QELR_SQE_ELEMENT_SIZE); if (rc) verbs_err(&cxt->ibv_ctx, "create qp: failed to map SQ chain, got %d", rc); qp->sq.max_wr = max_send_wr; qp->sq.max_sges = cxt->sges_per_send_wr; return rc; } static inline int qelr_create_qp_buffers_rq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx) { uint32_t max_recv_wr, max_recv_sges, max_recv_buf; int chain_size; int rc; /* RQ */ max_recv_wr = attrx->cap.max_recv_wr; max_recv_wr = max_t(uint32_t, max_recv_wr, 1); max_recv_wr = min_t(uint32_t, max_recv_wr, cxt->max_recv_wr); max_recv_sges = max_recv_wr * cxt->sges_per_recv_wr; max_recv_buf = max_recv_sges * QELR_RQE_ELEMENT_SIZE; chain_size = max_recv_buf; rc = qelr_chain_alloc(&qp->rq.chain, chain_size, cxt->kernel_page_size, QELR_RQE_ELEMENT_SIZE); if (rc) verbs_err(&cxt->ibv_ctx, "create qp: failed to map RQ chain, got %d", rc); qp->rq.max_wr = max_recv_wr; qp->rq.max_sges = cxt->sges_per_recv_wr; return rc; } static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx) { int rc; if (qelr_qp_has_sq(qp)) { rc = qelr_create_qp_buffers_sq(cxt, qp, attrx); if (rc) return rc; } if (qelr_qp_has_rq(qp)) { rc = qelr_create_qp_buffers_rq(cxt, qp, attrx); if (rc && qelr_qp_has_sq(qp)) { qelr_chain_free_sq(qp); if (qp->sq.db_rec_map) 
munmap(qp->sq.db_rec_map, cxt->kernel_page_size); return rc; } } return 0; } static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx, struct qelr_create_qp_resp *resp) { qp->sq.icid = resp->sq_icid; qp->sq.db_data.data.icid = htole16(resp->sq_icid); qp->sq.prod = 0; qp->sq.db = cxt->db_addr + resp->sq_db_offset; qp->sq.edpm_db = cxt->db_addr; if (resp->sq_db_rec_addr) { qp->sq.db_rec_map = mmap(NULL, cxt->kernel_page_size, PROT_WRITE, MAP_SHARED, cxt->ibv_ctx.context.cmd_fd, resp->sq_db_rec_addr); if (qp->sq.db_rec_map == MAP_FAILED) { int errsv = errno; verbs_err(&cxt->ibv_ctx, "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", resp->sq_db_rec_addr, cxt->kernel_page_size, cxt->ibv_ctx.context.cmd_fd, errsv); return -ENOMEM; } qp->sq.db_rec_addr = qp->sq.db_rec_map; } else { /* Kernel doesn't support doorbell recovery. Point to dummy * location instead */ qp->sq.db_rec_addr = &cxt->db_rec_addr_dummy; } /* shadow SQ */ qp->sq.max_wr++; /* prod/cons method requires N+1 elements */ qp->wqe_wr_id = calloc(qp->sq.max_wr, sizeof(*qp->wqe_wr_id)); if (!qp->wqe_wr_id) { verbs_err(&cxt->ibv_ctx, "create qp: failed shadow SQ memory allocation\n"); return -ENOMEM; } return 0; } static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt, struct qelr_qp *qp, struct qelr_create_qp_resp *resp) { /* RQ */ qp->rq.icid = resp->rq_icid; qp->rq.db_data.data.icid = htole16(resp->rq_icid); qp->rq.db = cxt->db_addr + resp->rq_db_offset; qp->rq.iwarp_db2 = cxt->db_addr + resp->rq_db2_offset; qp->rq.iwarp_db2_data.data.icid = htole16(qp->rq.icid); qp->rq.iwarp_db2_data.data.value = htole16(DQ_TCM_IWARP_POST_RQ_CF_CMD); qp->rq.prod = 0; if (resp->rq_db_rec_addr) { qp->rq.db_rec_map = mmap(NULL, cxt->kernel_page_size, PROT_WRITE, MAP_SHARED, cxt->ibv_ctx.context.cmd_fd, resp->rq_db_rec_addr); if (qp->rq.db_rec_map == MAP_FAILED) { int errsv = errno; verbs_err(&cxt->ibv_ctx, "alloc context: doorbell rec mapping failed resp.db_rec_addr = %llx size=%d context->cmd_fd=%d errno=%d\n", resp->rq_db_rec_addr, cxt->kernel_page_size, cxt->ibv_ctx.context.cmd_fd, errsv); return -ENOMEM; } qp->rq.db_rec_addr = qp->rq.db_rec_map; } else { /* Kernel doesn't support doorbell recovery. 
Point to dummy * location instead */ qp->rq.db_rec_addr = &cxt->db_rec_addr_dummy; } /* shadow RQ */ qp->rq.max_wr++; /* prod/cons method requires N+1 elements */ qp->rqe_wr_id = calloc(qp->rq.max_wr, sizeof(*qp->rqe_wr_id)); if (!qp->rqe_wr_id) { verbs_err(&cxt->ibv_ctx, "create qp: failed shadow RQ memory allocation\n"); return -ENOMEM; } return 0; } static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx, struct qelr_create_qp_resp *resp) { int rc; /* general */ pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE); qp->qp_id = resp->qp_id; qp->state = QELR_QPS_RST; qp->sq_sig_all = attrx->sq_sig_all; qp->atomic_supported = resp->atomic_supported; if (cxt->dpm_flags & QELR_DPM_FLAGS_EDPM_MODE) qp->edpm_mode = 1; if (qelr_qp_has_sq(qp)) { rc = qelr_configure_qp_sq(cxt, qp, attrx, resp); if (rc) return rc; } if (qelr_qp_has_rq(qp)) { rc = qelr_configure_qp_rq(cxt, qp, resp); if (rc && qelr_qp_has_sq(qp)) qelr_free_sq(qp); } return rc; } static inline void qelr_print_qp_init_attr(struct qelr_devctx *cxt, struct ibv_qp_init_attr_ex *attrx) { verbs_debug(&cxt->ibv_ctx, "create qp: send_cq=%p, recv_cq=%p, srq=%p, max_inline_data=%d, max_recv_sge=%d, max_recv_wr=%d, max_send_sge=%d, max_send_wr=%d, qp_type=%d, sq_sig_all=%d\n", attrx->send_cq, attrx->recv_cq, attrx->srq, attrx->cap.max_inline_data, attrx->cap.max_recv_sge, attrx->cap.max_recv_wr, attrx->cap.max_send_sge, attrx->cap.max_send_wr, attrx->qp_type, attrx->sq_sig_all); } static inline void qelr_create_qp_configure_sq_req(struct qelr_qp *qp, struct qelr_create_qp *req) { req->sq_addr = (uintptr_t)qp->sq.chain.first_addr; req->sq_len = qp->sq.chain.size; } static inline void qelr_create_qp_configure_rq_req(struct qelr_qp *qp, struct qelr_create_qp *req) { req->rq_addr = (uintptr_t)qp->rq.chain.first_addr; req->rq_len = qp->rq.chain.size; } static inline void qelr_create_qp_configure_req(struct qelr_qp *qp, struct qelr_create_qp *req) { memset(req, 0, sizeof(*req)); req->qp_handle_hi = U64_HI(qp); req->qp_handle_lo = U64_LO(qp); if (qelr_qp_has_sq(qp)) qelr_create_qp_configure_sq_req(qp, req); if (qelr_qp_has_rq(qp)) qelr_create_qp_configure_rq_req(qp, req); } static inline void qelr_basic_qp_config(struct qelr_qp *qp, struct ibv_qp_init_attr_ex *attrx) { if (attrx->srq) qp->srq = get_qelr_srq(attrx->srq); if (attrx->qp_type == IBV_QPT_RC || attrx->qp_type == IBV_QPT_XRC_SEND) qp->flags |= QELR_QP_FLAG_SQ; if (attrx->qp_type == IBV_QPT_RC && !qp->srq) qp->flags |= QELR_QP_FLAG_RQ; } static void qelr_print_ah_attr(struct qelr_devctx *cxt, struct ibv_ah_attr *attr) { verbs_debug(&cxt->ibv_ctx, "grh.dgid=[%#" PRIx64 ":%#" PRIx64 "], grh.flow_label=%d, grh.sgid_index=%d, grh.hop_limit=%d, grh.traffic_class=%d, dlid=%d, sl=%d, src_path_bits=%d, static_rate = %d, port_num=%d\n", be64toh(attr->grh.dgid.global.interface_id), be64toh(attr->grh.dgid.global.subnet_prefix), attr->grh.flow_label, attr->grh.hop_limit, attr->grh.sgid_index, attr->grh.traffic_class, attr->dlid, attr->sl, attr->src_path_bits, attr->static_rate, attr->port_num); } static void qelr_print_qp_attr(struct qelr_devctx *cxt, struct ibv_qp_attr *attr) { verbs_debug(&cxt->ibv_ctx, 
"\tqp_state=%d\tcur_qp_state=%d\tpath_mtu=%d\tpath_mig_state=%d\tqkey=%d\trq_psn=%d\tsq_psn=%d\tdest_qp_num=%d\tqp_access_flags=%d\tmax_inline_data=%d\tmax_recv_sge=%d\tmax_recv_wr=%d\tmax_send_sge=%d\tmax_send_wr=%d\tpkey_index=%d\talt_pkey_index=%d\ten_sqd_async_notify=%d\tsq_draining=%d\tmax_rd_atomic=%d\tmax_dest_rd_atomic=%d\tmin_rnr_timer=%d\tport_num=%d\ttimeout=%d\tretry_cnt=%d\trnr_retry=%d\talt_port_num=%d\talt_timeout=%d\n", attr->qp_state, attr->cur_qp_state, attr->path_mtu, attr->path_mig_state, attr->qkey, attr->rq_psn, attr->sq_psn, attr->dest_qp_num, attr->qp_access_flags, attr->cap.max_inline_data, attr->cap.max_recv_sge, attr->cap.max_recv_wr, attr->cap.max_send_sge, attr->cap.max_send_wr, attr->pkey_index, attr->alt_pkey_index, attr->en_sqd_async_notify, attr->sq_draining, attr->max_rd_atomic, attr->max_dest_rd_atomic, attr->min_rnr_timer, attr->port_num, attr->timeout, attr->retry_cnt, attr->rnr_retry, attr->alt_port_num, attr->alt_timeout); qelr_print_ah_attr(cxt, &attr->ah_attr); qelr_print_ah_attr(cxt, &attr->alt_ah_attr); } int qelr_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, struct ibv_qp_init_attr *init_attr) { struct ibv_query_qp cmd; struct qelr_devctx *cxt = get_qelr_ctx(qp->context); int rc; verbs_debug(&cxt->ibv_ctx, "QP Query %p, attr_mask=0x%x\n", get_qelr_qp(qp), attr_mask); rc = ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, sizeof(cmd)); qelr_print_qp_attr(cxt, attr); return rc; } static enum qelr_qp_state get_qelr_qp_state(enum ibv_qp_state qps) { switch (qps) { case IBV_QPS_RESET: return QELR_QPS_RST; case IBV_QPS_INIT: return QELR_QPS_INIT; case IBV_QPS_RTR: return QELR_QPS_RTR; case IBV_QPS_RTS: return QELR_QPS_RTS; case IBV_QPS_SQD: return QELR_QPS_SQD; case IBV_QPS_SQE: return QELR_QPS_SQE; case IBV_QPS_ERR: default: return QELR_QPS_ERR; }; } static void qelr_reset_qp_hwq_info(struct qelr_qp_hwq_info *q) { qelr_chain_reset(&q->chain); q->prod = 0; q->cons = 0; q->wqe_cons = 0; q->db_data.data.value = 0; } static int qelr_update_qp_state(struct qelr_qp *qp, enum ibv_qp_state new_ib_state) { int status = 0; enum qelr_qp_state new_state; /* iWARP states are updated implicitely by driver and don't have a * real purpose in user-lib. */ if (IS_IWARP(qp->ibv_qp->context->device)) return 0; new_state = get_qelr_qp_state(new_ib_state); pthread_spin_lock(&qp->q_lock); if (new_state == qp->state) { pthread_spin_unlock(&qp->q_lock); return 0; } switch (qp->state) { case QELR_QPS_RST: switch (new_state) { case QELR_QPS_INIT: qp->prev_wqe_size = 0; qelr_reset_qp_hwq_info(&qp->sq); qelr_reset_qp_hwq_info(&qp->rq); break; default: status = -EINVAL; break; }; break; case QELR_QPS_INIT: /* INIT->XXX */ switch (new_state) { case QELR_QPS_RTR: /* Update doorbell (in case post_recv was done before * move to RTR) */ if (IS_ROCE(qp->ibv_qp->context->device) && (qelr_qp_has_rq(qp))) { mmio_wc_start(); writel(qp->rq.db_data.raw, qp->rq.db); mmio_flush_writes(); } break; case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_RTR: /* RTR->XXX */ switch (new_state) { case QELR_QPS_RTS: break; case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_RTS: /* RTS->XXX */ switch (new_state) { case QELR_QPS_SQD: case QELR_QPS_SQE: break; case QELR_QPS_ERR: break; default: /* invalid state change. 
*/ status = -EINVAL; break; }; break; case QELR_QPS_SQD: /* SQD->XXX */ switch (new_state) { case QELR_QPS_RTS: case QELR_QPS_SQE: case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_SQE: switch (new_state) { case QELR_QPS_RTS: case QELR_QPS_ERR: break; default: /* invalid state change. */ status = -EINVAL; break; }; break; case QELR_QPS_ERR: /* ERR->XXX */ switch (new_state) { case QELR_QPS_RST: break; default: status = -EINVAL; break; }; break; default: status = -EINVAL; break; }; if (!status) qp->state = new_state; pthread_spin_unlock(&qp->q_lock); return status; } int qelr_modify_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; struct qelr_qp *qp = get_qelr_qp(ibqp); struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); union ibv_gid sgid, *p_dgid; int rc; verbs_debug(&cxt->ibv_ctx, "QP Modify %p, attr_mask=0x%x\n", qp, attr_mask); qelr_print_qp_attr(cxt, attr); rc = ibv_cmd_modify_qp(ibqp, attr, attr_mask, &cmd, sizeof(cmd)); if (rc) { verbs_err(&cxt->ibv_ctx, "QP Modify: Failed command. rc=%d\n", rc); return rc; } if (attr_mask & IBV_QP_STATE) { rc = qelr_update_qp_state(qp, attr->qp_state); verbs_debug(&cxt->ibv_ctx, "QP Modify state %d->%d, rc=%d\n", qp->state, attr->qp_state, rc); if (rc) { verbs_err(&cxt->ibv_ctx, "QP Modify: Failed to update state. rc=%d\n", rc); return rc; } } /* EDPM must be disabled if GIDs match */ if (attr_mask & IBV_QP_AV) { rc = ibv_query_gid(ibqp->context, attr->ah_attr.port_num, attr->ah_attr.grh.sgid_index, &sgid); if (!rc) { p_dgid = &attr->ah_attr.grh.dgid; qp->edpm_disabled = !memcmp(&sgid, p_dgid, sizeof(sgid)); verbs_debug(&cxt->ibv_ctx, "QP Modify: %p, edpm_disabled=%d\n", qp, qp->edpm_disabled); } else { verbs_err(&cxt->ibv_ctx, "QP Modify: Failed querying GID. 
rc=%d\n", rc); } } return 0; } int qelr_destroy_qp(struct ibv_qp *ibqp) { struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); struct qelr_qp *qp = get_qelr_qp(ibqp); int rc = 0; verbs_debug(&cxt->ibv_ctx, "destroy qp: %p\n", qp); rc = ibv_cmd_destroy_qp(ibqp); if (rc) { verbs_err(&cxt->ibv_ctx, "destroy qp: failed to destroy %p, got %d.\n", qp, rc); return rc; } qelr_free_sq(qp); qelr_free_rq(qp); qelr_chain_free_sq(qp); qelr_chain_free_rq(qp); if (qp->sq.db_rec_map) munmap(qp->sq.db_rec_map, cxt->kernel_page_size); if (qp->rq.db_rec_map) munmap(qp->rq.db_rec_map, cxt->kernel_page_size); verbs_debug(&cxt->ibv_ctx, "destroy cq: successfully destroyed %p\n", qp); free(qp); return 0; } static int sge_data_len(struct ibv_sge *sg_list, int num_sge) { int i, len = 0; for (i = 0; i < num_sge; i++) len += sg_list[i].length; return len; } static void swap_wqe_data64(uint64_t *p) { __be64 *bep=(__be64 *)p; int i; for (i = 0; i < ROCE_WQE_ELEM_SIZE / sizeof(uint64_t); i++, p++, bep++) *bep = htobe64(*p); } static inline void qelr_init_dpm_info(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, struct qelr_dpm *dpm, int data_size) { dpm->is_edpm = 0; dpm->is_ldpm = 0; /* DPM only succeeds when transmit queues are empty */ if (!qelr_chain_is_full(&qp->sq.chain)) return; /* Check if edpm can be used */ if (wr->send_flags & IBV_SEND_INLINE && !qp->edpm_disabled && cxt->dpm_flags & QELR_DPM_FLAGS_ENHANCED && data_size <= cxt->edpm_limit_size) { memset(dpm, 0, sizeof(*dpm)); dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload; dpm->is_edpm = 1; return; } /* Check if ldpm can be used - not inline and limited to ldpm_limit */ if (cxt->dpm_flags & QELR_DPM_FLAGS_LEGACY && !(wr->send_flags & IBV_SEND_INLINE) && data_size <= cxt->ldpm_limit_size) { memset(dpm, 0, sizeof(*dpm)); dpm->is_ldpm = 1; } } #define QELR_IB_OPCODE_SEND_ONLY 0x04 #define QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE 0x05 #define QELR_IB_OPCODE_RDMA_WRITE_ONLY 0x0a #define QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE 0x0b #define QELR_IB_OPCODE_SEND_WITH_INV 0x17 #define QELR_IS_IMM_OR_INV(opcode) \ (((opcode) == QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE) || \ ((opcode) == QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE) || \ ((opcode) == QELR_IB_OPCODE_SEND_WITH_INV)) static inline void qelr_edpm_set_msg_data(struct qelr_qp *qp, struct qelr_dpm *dpm, uint8_t opcode, uint16_t length, uint8_t se, uint8_t comp) { uint32_t wqe_size, dpm_size, params; /* edpm mode - 0 : ack field is treated by old FW as "completion" * edpm mode - 1 : ack field is treated by new FW as ack which is * always required. */ uint8_t ack = (qp->edpm_mode) ? 1 : comp; params = 0; wqe_size = length + (QELR_IS_IMM_OR_INV(opcode) ? sizeof(uint32_t) : 0); dpm_size = wqe_size + sizeof(struct db_roce_dpm_data); SET_FIELD(params, DB_ROCE_DPM_PARAMS_ACK_REQUEST, ack); SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_ROCE); SET_FIELD(params, DB_ROCE_DPM_PARAMS_OPCODE, opcode); SET_FIELD(params, DB_ROCE_DPM_PARAMS_WQE_SIZE, wqe_size); SET_FIELD(params, DB_ROCE_DPM_PARAMS_COMPLETION_FLG, comp ? 1 : 0); SET_FIELD(params, DB_ROCE_DPM_PARAMS_S_FLG, se ? 
1 : 0); SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE, (dpm_size + sizeof(uint64_t) - 1) / sizeof(uint64_t)); dpm->msg.data.params.params = htole32(params); } static inline void qelr_edpm_set_inv_imm(struct qelr_qp *qp, struct qelr_dpm *dpm, __be32 data) { memcpy(&dpm->payload[dpm->payload_offset], &data, sizeof(data)); dpm->payload_offset += sizeof(data); dpm->payload_size += sizeof(data); } static inline void qelr_edpm_set_rdma_ext(struct qelr_qp *qp, struct qelr_dpm *dpm, uint64_t remote_addr, uint32_t rkey) { dpm->rdma_ext->remote_va = htobe64(remote_addr); dpm->rdma_ext->remote_key = htobe32(rkey); dpm->payload_offset += sizeof(*dpm->rdma_ext); dpm->payload_size += sizeof(*dpm->rdma_ext); } static inline void qelr_edpm_set_payload(struct qelr_qp *qp, struct qelr_dpm *dpm, char *buf, uint32_t length) { memcpy(&dpm->payload[dpm->payload_offset], buf, length); dpm->payload_offset += length; } static void qelr_prepare_sq_inline_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int data_size, uint8_t *wqe_size, struct ibv_send_wr *wr, uint8_t *bits, uint8_t bit) { int i; uint32_t seg_siz; char *seg_prt, *wqe; if (!data_size) return; /* set the bit */ *bits |= bit; seg_prt = NULL; wqe = NULL; seg_siz = 0; /* copy data inline */ for (i = 0; i < wr->num_sge; i++) { uint32_t len = wr->sg_list[i].length; void *src = (void *)(uintptr_t)wr->sg_list[i].addr; if (dpm->is_edpm) qelr_edpm_set_payload(qp, dpm, src, len); while (len > 0) { uint32_t cur; /* new segment required */ if (!seg_siz) { wqe = (char *)qelr_chain_produce(&qp->sq.chain); seg_prt = wqe; seg_siz = sizeof(struct rdma_sq_common_wqe); (*wqe_size)++; } /* calculate currently allowed length */ cur = min(len, seg_siz); memcpy(seg_prt, src, cur); /* update segment variables */ seg_prt += cur; seg_siz -= cur; /* update sge variables */ src += cur; len -= cur; /* swap fully-completed segments */ if (!seg_siz) swap_wqe_data64((uint64_t *)wqe); } } /* swap last not completed segment */ if (seg_siz) swap_wqe_data64((uint64_t *)wqe); if (dpm->is_edpm) { dpm->payload_size += data_size; if (wr->opcode == IBV_WR_RDMA_WRITE || wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM) dpm->rdma_ext->dma_length = htobe32(data_size); } } static void qelr_prepare_sq_sges(struct qelr_qp *qp, struct qelr_dpm *dpm, uint8_t *wqe_size, struct ibv_send_wr *wr) { int i; for (i = 0; i < wr->num_sge; i++) { struct rdma_sq_sge *sge = qelr_chain_produce(&qp->sq.chain); TYPEPTR_ADDR_SET(sge, addr, wr->sg_list[i].addr); sge->l_key = htole32(wr->sg_list[i].lkey); sge->length = htole32(wr->sg_list[i].length); if (dpm->is_ldpm) { memcpy(&dpm->payload[dpm->payload_size], sge, sizeof(*sge)); dpm->payload_size += sizeof(*sge); } } if (wqe_size) *wqe_size += wr->num_sge; } static uint32_t qelr_prepare_sq_rdma_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int data_size, uint8_t *p_wqe_size, struct rdma_sq_rdma_wqe_1st *rwqe, struct rdma_sq_rdma_wqe_2nd *rwqe2, struct ibv_send_wr *wr, bool is_imm) { memset(rwqe2, 0, sizeof(*rwqe2)); rwqe2->r_key = htole32(wr->wr.rdma.rkey); TYPEPTR_ADDR_SET(rwqe2, remote_va, wr->wr.rdma.remote_addr); rwqe->length = htole32(data_size); if (is_imm) rwqe->imm_data = htole32(be32toh(wr->imm_data)); if (wr->send_flags & IBV_SEND_INLINE && (wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM || wr->opcode == IBV_WR_RDMA_WRITE)) { uint8_t flags = 0; SET_FIELD2(flags, RDMA_SQ_RDMA_WQE_1ST_INLINE_FLG, 1); qelr_prepare_sq_inline_data(qp, dpm, data_size, p_wqe_size, wr, &rwqe->flags, flags); rwqe->wqe_size = *p_wqe_size; } else { if (dpm->is_ldpm) dpm->payload_size = sizeof(*rwqe) + 
sizeof(*rwqe2); qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr); rwqe->wqe_size = *p_wqe_size; if (dpm->is_ldpm) { memcpy(dpm->payload, rwqe, sizeof(*rwqe)); memcpy(&dpm->payload[sizeof(*rwqe)], rwqe2, sizeof(*rwqe2)); } } return data_size; } static uint32_t qelr_prepare_sq_send_data(struct qelr_qp *qp, struct qelr_dpm *dpm, int data_size, uint8_t *p_wqe_size, struct rdma_sq_send_wqe_1st *swqe, struct rdma_sq_send_wqe_2st *swqe2, struct ibv_send_wr *wr, bool is_imm) { memset(swqe2, 0, sizeof(*swqe2)); swqe->length = htole32(data_size); if (is_imm) swqe->inv_key_or_imm_data = htole32(be32toh(wr->imm_data)); if (wr->send_flags & IBV_SEND_INLINE) { uint8_t flags = 0; SET_FIELD2(flags, RDMA_SQ_SEND_WQE_INLINE_FLG, 1); qelr_prepare_sq_inline_data(qp, dpm, data_size, p_wqe_size, wr, &swqe->flags, flags); swqe->wqe_size = *p_wqe_size; } else { if (dpm->is_ldpm) dpm->payload_size = sizeof(*swqe) + sizeof(*swqe2); qelr_prepare_sq_sges(qp, dpm, p_wqe_size, wr); swqe->wqe_size = *p_wqe_size; if (dpm->is_ldpm) { memcpy(dpm->payload, swqe, sizeof(*swqe)); memcpy(&dpm->payload[sizeof(*swqe)], swqe2, sizeof(*swqe2)); } } return data_size; } static void qelr_prepare_sq_atom_data(struct qelr_qp *qp, struct qelr_dpm *dpm, struct rdma_sq_atomic_wqe_1st *awqe1, struct rdma_sq_atomic_wqe_2nd *awqe2, struct rdma_sq_atomic_wqe_3rd *awqe3, struct ibv_send_wr *wr) { if (dpm->is_ldpm) { memcpy(&dpm->payload[dpm->payload_size], awqe1, sizeof(*awqe1)); dpm->payload_size += sizeof(*awqe1); memcpy(&dpm->payload[dpm->payload_size], awqe2, sizeof(*awqe2)); dpm->payload_size += sizeof(*awqe2); memcpy(&dpm->payload[dpm->payload_size], awqe3, sizeof(*awqe3)); dpm->payload_size += sizeof(*awqe3); } qelr_prepare_sq_sges(qp, dpm, NULL, wr); } static inline void qelr_ldpm_prepare_data(struct qelr_qp *qp, struct qelr_dpm *dpm) { uint32_t val, params; /* DPM size is given in 8 bytes so we round up */ val = dpm->payload_size + sizeof(struct db_roce_dpm_data); val = DIV_ROUND_UP(val, sizeof(uint64_t)); params = 0; SET_FIELD(params, DB_ROCE_DPM_PARAMS_SIZE, val); SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_LEGACY); dpm->msg.data.params.params = htole32(params); } static enum ibv_wc_opcode qelr_ibv_to_wc_opcode(enum ibv_wr_opcode opcode) { switch (opcode) { case IBV_WR_RDMA_WRITE: case IBV_WR_RDMA_WRITE_WITH_IMM: return IBV_WC_RDMA_WRITE; case IBV_WR_SEND_WITH_IMM: case IBV_WR_SEND: case IBV_WR_SEND_WITH_INV: return IBV_WC_SEND; case IBV_WR_RDMA_READ: return IBV_WC_RDMA_READ; case IBV_WR_ATOMIC_CMP_AND_SWP: return IBV_WC_COMP_SWAP; case IBV_WR_ATOMIC_FETCH_AND_ADD: return IBV_WC_FETCH_ADD; default: return IBV_WC_SEND; } } static inline void doorbell_qp(struct qelr_qp *qp) { mmio_wc_start(); writel(qp->sq.db_data.raw, qp->sq.db); /* copy value to doorbell recovery mechanism */ qp->sq.db_rec_addr->db_data = qp->sq.db_data.raw; mmio_flush_writes(); } static inline void doorbell_dpm_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, struct qelr_dpm *dpm) { uint32_t offset = 0; uint64_t *payload = (uint64_t *)dpm->payload; uint32_t num_dwords; int bytes = 0; void *db_addr; mmio_wc_start(); /* Write message header */ dpm->msg.data.icid = qp->sq.db_data.data.icid; dpm->msg.data.prod_val = qp->sq.db_data.data.value; db_addr = qp->sq.edpm_db; writeq(dpm->msg.raw, db_addr); /* Write mesage body */ bytes += sizeof(uint64_t); num_dwords = DIV_ROUND_UP(dpm->payload_size, sizeof(uint64_t)); db_addr += sizeof(dpm->msg.data); if (bytes == cxt->edpm_trans_size) { mmio_flush_writes(); bytes = 0; } while (offset < num_dwords) { /* endianity is 
different between FW and DORQ HW block */ if (dpm->is_ldpm) mmio_write64_be(db_addr, htobe64(payload[offset])); else /* EDPM */ mmio_write64(db_addr, payload[offset]); bytes += sizeof(uint64_t); db_addr += sizeof(uint64_t); /* Writing to a wc bar. We need to flush the writes every * edpm transaction size otherwise the CPU could optimize away * the duplicate stores. */ if (bytes == cxt->edpm_trans_size) { mmio_flush_writes(); bytes = 0; } offset++; } mmio_flush_writes(); } static inline int qelr_can_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, int data_size) { /* Invalid WR */ if (wr->num_sge > qp->sq.max_sges) { verbs_err(&cxt->ibv_ctx, "error: WR is bad. Post send on QP %p failed\n", qp); return -EINVAL; } /* WR overflow */ if (qelr_wq_is_full(&qp->sq)) { verbs_err(&cxt->ibv_ctx, "error: WQ is full. Post send on QP %p failed (this error appears only once)\n", qp); return -ENOMEM; } /* WQE overflow */ if (qelr_chain_get_elem_left_u32(&qp->sq.chain) < QELR_MAX_SQ_WQE_SIZE) { verbs_err(&cxt->ibv_ctx, "error: WQ PBL is full. Post send on QP %p failed (this error appears only once)\n", qp); return -ENOMEM; } if ((wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP || wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) && !qp->atomic_supported) { verbs_err(&cxt->ibv_ctx, "Atomic not supported on this machine\n"); return -EINVAL; } if ((wr->send_flags & IBV_SEND_INLINE) && (data_size > ROCE_REQ_MAX_INLINE_DATA_SIZE)) { verbs_err(&cxt->ibv_ctx, "Too much inline data in WR: %d\n", data_size); return -EINVAL; } return 0; } static void qelr_configure_xrc_srq(struct ibv_send_wr *wr, struct rdma_sq_common_wqe *wqe, struct qelr_dpm *dpm) { struct rdma_sq_send_wqe_1st *xrc_wqe; /* xrc_srq location is the same for all relevant wqes */ xrc_wqe = (struct rdma_sq_send_wqe_1st *)wqe; xrc_wqe->xrc_srq = htole32(wr->qp_type.xrc.remote_srqn); if (dpm->is_edpm) { struct qelr_xrceth *xrceth; xrceth = (struct qelr_xrceth *) &dpm->payload[dpm->payload_offset]; xrceth->xrc_srq = htobe32(wr->qp_type.xrc.remote_srqn); dpm->payload_offset += sizeof(*xrceth); dpm->payload_size += sizeof(*xrceth); dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload_offset; } } static int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, int data_size, int *normal_db_required) { uint8_t se, comp, fence; struct rdma_sq_common_wqe *wqe; struct rdma_sq_send_wqe_1st *swqe; struct rdma_sq_send_wqe_2st *swqe2; struct rdma_sq_rdma_wqe_1st *rwqe; struct rdma_sq_rdma_wqe_2nd *rwqe2; struct rdma_sq_atomic_wqe_1st *awqe1; struct rdma_sq_atomic_wqe_2nd *awqe2; struct rdma_sq_atomic_wqe_3rd *awqe3; struct qelr_dpm dpm; uint32_t wqe_length; uint8_t wqe_size; uint16_t db_val; int rc = 0; qelr_init_dpm_info(cxt, qp, wr, &dpm, data_size); wqe = qelr_chain_produce(&qp->sq.chain); comp = (!!(wr->send_flags & IBV_SEND_SIGNALED)) || (!!qp->sq_sig_all); qp->wqe_wr_id[qp->sq.prod].signaled = comp; /* common fields */ wqe->flags = 0; se = !!(wr->send_flags & IBV_SEND_SOLICITED); fence = !!(wr->send_flags & IBV_SEND_FENCE); SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_SE_FLG, se); SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_COMP_FLG, comp); SET_FIELD2(wqe->flags, RDMA_SQ_COMMON_WQE_RD_FENCE_FLG, fence); wqe->prev_wqe_size = qp->prev_wqe_size; qp->wqe_wr_id[qp->sq.prod].opcode = qelr_ibv_to_wc_opcode(wr->opcode); if (get_ibv_qp(qp)->qp_type == IBV_QPT_XRC_SEND) qelr_configure_xrc_srq(wr, wqe, &dpm); switch (wr->opcode) { case IBV_WR_SEND_WITH_IMM: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_IMM; swqe = (struct 
rdma_sq_send_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain); if (dpm.is_edpm) qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data); wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, &wqe_size, swqe, swqe2, wr, 1 /* Imm */); if (dpm.is_edpm) qelr_edpm_set_msg_data(qp, &dpm, QELR_IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE, wqe_length, se, comp); else if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_SEND: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND; swqe = (struct rdma_sq_send_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; swqe2 = (struct rdma_sq_send_wqe_2st *)qelr_chain_produce(&qp->sq.chain); wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, &wqe_size, swqe, swqe2, wr, 0); if (dpm.is_edpm) qelr_edpm_set_msg_data(qp, &dpm, QELR_IB_OPCODE_SEND_ONLY, wqe_length, se, comp); else if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_SEND_WITH_INV: wqe->req_type = RDMA_SQ_REQ_TYPE_SEND_WITH_INVALIDATE; swqe = (struct rdma_sq_send_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_send_wqe) / RDMA_WQE_BYTES; swqe2 = qelr_chain_produce(&qp->sq.chain); if (dpm.is_edpm) qelr_edpm_set_inv_imm(qp, &dpm, htobe32(wr->invalidate_rkey)); swqe->inv_key_or_imm_data = htole32(wr->invalidate_rkey); wqe_length = qelr_prepare_sq_send_data(qp, &dpm, data_size, &wqe_size, swqe, swqe2, wr, 0); if (dpm.is_edpm) qelr_edpm_set_msg_data(qp, &dpm, QELR_IB_OPCODE_SEND_WITH_INV, wqe_length, se, comp); else if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_RDMA_WRITE_WITH_IMM: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR_WITH_IMM; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); if (dpm.is_edpm) { qelr_edpm_set_rdma_ext(qp, &dpm, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); qelr_edpm_set_inv_imm(qp, &dpm, wr->imm_data); } wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, rwqe, rwqe2, wr, 1 /* Imm */); if (dpm.is_edpm) qelr_edpm_set_msg_data(qp, &dpm, QELR_IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE, wqe_length + sizeof(*dpm.rdma_ext), se, comp); else if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_RDMA_WRITE: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_WR; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); if (dpm.is_edpm) qelr_edpm_set_rdma_ext(qp, &dpm, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey); wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, rwqe, rwqe2, wr, 0); if (dpm.is_edpm) qelr_edpm_set_msg_data(qp, &dpm, QELR_IB_OPCODE_RDMA_WRITE_ONLY, wqe_length + sizeof(*dpm.rdma_ext), se, comp); else if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; 
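	/*
	 * Descriptive note (not in the original source): every opcode records
	 * its wqe_size and byte count in the shadow SQ entry; qelr_poll_cq()
	 * later consumes wqe_size chain elements per completed WR and reports
	 * bytes_len through wc->byte_len.
	 */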
qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_RDMA_READ: wqe->req_type = RDMA_SQ_REQ_TYPE_RDMA_RD; rwqe = (struct rdma_sq_rdma_wqe_1st *)wqe; wqe_size = sizeof(struct rdma_sq_rdma_wqe) / RDMA_WQE_BYTES; rwqe2 = (struct rdma_sq_rdma_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); wqe_length = qelr_prepare_sq_rdma_data(qp, &dpm, data_size, &wqe_size, rwqe, rwqe2, wr, 0); if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = wqe_size; qp->prev_wqe_size = wqe_size; qp->wqe_wr_id[qp->sq.prod].bytes_len = wqe_length; break; case IBV_WR_ATOMIC_CMP_AND_SWP: case IBV_WR_ATOMIC_FETCH_AND_ADD: awqe1 = (struct rdma_sq_atomic_wqe_1st *)wqe; awqe1->wqe_size = 4; awqe2 = (struct rdma_sq_atomic_wqe_2nd *)qelr_chain_produce(&qp->sq.chain); TYPEPTR_ADDR_SET(awqe2, remote_va, wr->wr.atomic.remote_addr); awqe2->r_key = htole32(wr->wr.atomic.rkey); awqe3 = (struct rdma_sq_atomic_wqe_3rd *)qelr_chain_produce(&qp->sq.chain); if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD) { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_ADD; TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.compare_add); } else { wqe->req_type = RDMA_SQ_REQ_TYPE_ATOMIC_CMP_AND_SWAP; TYPEPTR_ADDR_SET(awqe3, swap_data, wr->wr.atomic.swap); TYPEPTR_ADDR_SET(awqe3, cmp_data, wr->wr.atomic.compare_add); } qelr_prepare_sq_atom_data(qp, &dpm, awqe1, awqe2, awqe3, wr); if (dpm.is_ldpm) qelr_ldpm_prepare_data(qp, &dpm); qp->wqe_wr_id[qp->sq.prod].wqe_size = awqe1->wqe_size; qp->prev_wqe_size = awqe1->wqe_size; break; default: /* restore prod to its position before this WR was processed */ qelr_chain_set_prod(&qp->sq.chain, le16toh(qp->sq.db_data.data.value), wqe); /* restore prev_wqe_size */ qp->prev_wqe_size = wqe->prev_wqe_size; rc = -EINVAL; verbs_err(&cxt->ibv_ctx, "Invalid opcode %d in work request on QP %p\n", wr->opcode, qp); break; } if (rc) return rc; qp->wqe_wr_id[qp->sq.prod].wr_id = wr->wr_id; qelr_inc_sw_prod_u16(&qp->sq); db_val = le16toh(qp->sq.db_data.data.value) + 1; qp->sq.db_data.data.value = htole16(db_val); if (dpm.is_edpm || dpm.is_ldpm) { doorbell_dpm_qp(cxt, qp, &dpm); *normal_db_required = 0; } else { *normal_db_required = 1; } return 0; } int qelr_post_send(struct ibv_qp *ib_qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { struct qelr_devctx *cxt = get_qelr_ctx(ib_qp->context); struct qelr_qp *qp = get_qelr_qp(ib_qp); int doorbell_required = 0; *bad_wr = NULL; int rc = 0; pthread_spin_lock(&qp->q_lock); if (IS_ROCE(ib_qp->context->device) && (qp->state != QELR_QPS_RTS && qp->state != QELR_QPS_ERR && qp->state != QELR_QPS_SQD)) { pthread_spin_unlock(&qp->q_lock); *bad_wr = wr; return -EINVAL; } while (wr) { int data_size = sge_data_len(wr->sg_list, wr->num_sge); rc = qelr_can_post_send(cxt, qp, wr, data_size); if (rc) { *bad_wr = wr; break; } rc = __qelr_post_send(cxt, qp, wr, data_size, &doorbell_required); if (rc) { *bad_wr = wr; break; } wr = wr->next; } if (doorbell_required) doorbell_qp(qp); pthread_spin_unlock(&qp->q_lock); return rc; } static uint32_t qelr_srq_elem_left(struct qelr_srq_hwq_info *hw_srq) { uint32_t used; /* Calculate number of elements used based on producer * count and consumer count and subtract it from max * work request supported so that we get elements left. 
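	 * The arithmetic is modulo 2^32: adding 2^32 before subtracting keeps
	 * "used" correct after wr_prod_cnt wraps, e.g. prod = 2 (post-wrap) and
	 * cons = 0xfffffffe still gives used = 4.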
*/ used = (uint32_t)(((uint64_t)((uint64_t)~0U) + 1 + (uint64_t)(hw_srq->wr_prod_cnt)) - (uint64_t)hw_srq->wr_cons_cnt); return hw_srq->max_wr - used; } int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { struct qelr_devctx *cxt = get_qelr_ctx(ibsrq->context); struct qelr_srq *srq = get_qelr_srq(ibsrq); struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq; struct qelr_chain *chain; int status = 0; pthread_spin_lock(&srq->lock); chain = &srq->hw_srq.chain; while (wr) { struct rdma_srq_wqe_header *hdr; int i; if (!qelr_srq_elem_left(hw_srq) || wr->num_sge > srq->hw_srq.max_sges) { verbs_err(&cxt->ibv_ctx, "Can't post WR (%d,%d) || (%d > %d)\n", hw_srq->wr_prod_cnt, hw_srq->wr_cons_cnt, wr->num_sge, srq->hw_srq.max_sges); status = -ENOMEM; *bad_wr = wr; break; } hdr = qelr_chain_produce(chain); SRQ_HDR_SET(hdr, wr->wr_id, wr->num_sge); hw_srq->wr_prod_cnt++; hw_srq->wqe_prod++; hw_srq->sge_prod++; verbs_debug(&cxt->ibv_ctx, "SRQ WR: SGEs: %d with wr_id[%d] = %" PRIx64 "\n", wr->num_sge, hw_srq->wqe_prod, wr->wr_id); for (i = 0; i < wr->num_sge; i++) { struct rdma_srq_sge *srq_sge; srq_sge = qelr_chain_produce(chain); SRQ_SGE_SET(srq_sge, wr->sg_list[i].addr, wr->sg_list[i].length, wr->sg_list[i].lkey); verbs_debug(&cxt->ibv_ctx, "[%d]: len %d key %x addr %x:%x\n", i, srq_sge->length, srq_sge->l_key, srq_sge->addr.hi, srq_sge->addr.lo); hw_srq->sge_prod++; } /* Make sure that descriptors are written before we update * producers. */ udma_ordering_write_barrier(); struct rdma_srq_producers *virt_prod; virt_prod = srq->hw_srq.virt_prod_pair_addr; virt_prod->sge_prod = htole32(hw_srq->sge_prod); virt_prod->wqe_prod = htole32(hw_srq->wqe_prod); wr = wr->next; } verbs_debug(&cxt->ibv_ctx, "POST: Elements in SRQ: %d\n", qelr_chain_get_elem_left_u32(chain)); pthread_spin_unlock(&srq->lock); return status; } int qelr_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) { int status = 0; struct qelr_qp *qp = get_qelr_qp(ibqp); struct qelr_devctx *cxt = get_qelr_ctx(ibqp->context); uint16_t db_val; uint8_t iwarp = IS_IWARP(ibqp->context->device); if (unlikely(qp->srq)) { verbs_err(&cxt->ibv_ctx, "QP is associated with SRQ, cannot post RQ buffers\n"); *bad_wr = wr; return -EINVAL; } pthread_spin_lock(&qp->q_lock); if (!iwarp && qp->state == QELR_QPS_RST) { pthread_spin_unlock(&qp->q_lock); *bad_wr = wr; return -EINVAL; } while (wr) { int i; if (qelr_chain_get_elem_left_u32(&qp->rq.chain) < QELR_MAX_RQ_WQE_SIZE || wr->num_sge > qp->rq.max_sges) { verbs_err(&cxt->ibv_ctx, "Can't post WR (%d < %d) || (%d > %d)\n", qelr_chain_get_elem_left_u32(&qp->rq.chain), QELR_MAX_RQ_WQE_SIZE, wr->num_sge, qp->rq.max_sges); status = -ENOMEM; *bad_wr = wr; break; } for (i = 0; i < wr->num_sge; i++) { uint32_t flags = 0; struct rdma_rq_sge *rqe; /* first one must include the number of SGE in the * list */ if (!i) SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, wr->num_sge); SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, wr->sg_list[i].lkey); rqe = qelr_chain_produce(&qp->rq.chain); RQ_SGE_SET(rqe, wr->sg_list[i].addr, wr->sg_list[i].length, flags); } /* Special case of no sges. FW requires between 1-4 sges... * in this case we need to post 1 sge with length zero. this is * because rdma write with immediate consumes an RQ. 
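	 * The zero-length SGE posted below satisfies that requirement without
	 * exposing any buffer to the peer.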
*/ if (!wr->num_sge) { uint32_t flags = 0; struct rdma_rq_sge *rqe; /* first one must include the number of SGE in the * list */ SET_FIELD(flags, RDMA_RQ_SGE_L_KEY, 0); SET_FIELD(flags, RDMA_RQ_SGE_NUM_SGES, 1); rqe = qelr_chain_produce(&qp->rq.chain); RQ_SGE_SET(rqe, 0, 0, flags); i = 1; } qp->rqe_wr_id[qp->rq.prod].wr_id = wr->wr_id; qp->rqe_wr_id[qp->rq.prod].wqe_size = i; qelr_inc_sw_prod_u16(&qp->rq); mmio_wc_start(); db_val = le16toh(qp->rq.db_data.data.value) + 1; qp->rq.db_data.data.value = htole16(db_val); writel(qp->rq.db_data.raw, qp->rq.db); /* copy value to doorbell recovery mechanism */ qp->rq.db_rec_addr->db_data = qp->rq.db_data.raw; mmio_flush_writes(); if (iwarp) { writel(qp->rq.iwarp_db2_data.raw, qp->rq.iwarp_db2); mmio_flush_writes(); } wr = wr->next; } pthread_spin_unlock(&qp->q_lock); return status; } static int is_valid_cqe(struct qelr_cq *cq, union rdma_cqe *cqe) { struct rdma_cqe_requester *resp_cqe = &cqe->req; return (resp_cqe->flags & RDMA_CQE_REQUESTER_TOGGLE_BIT_MASK) == cq->chain_toggle; } static enum rdma_cqe_type cqe_get_type(union rdma_cqe *cqe) { struct rdma_cqe_requester *resp_cqe = &cqe->req; return GET_FIELD(resp_cqe->flags, RDMA_CQE_REQUESTER_TYPE); } static struct qelr_qp *cqe_get_qp(union rdma_cqe *cqe) { struct regpair *qph = &cqe->req.qp_handle; return (struct qelr_qp *)HILO_U64(le32toh(qph->hi), le32toh(qph->lo)); } static int process_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, uint16_t hw_cons, enum ibv_wc_status status, int force) { struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp->context); uint16_t cnt = 0; while (num_entries && qp->sq.wqe_cons != hw_cons) { if (!qp->wqe_wr_id[qp->sq.cons].signaled && !force) { /* skip WC */ goto next_cqe; } /* fill WC */ wc->status = status; wc->wc_flags = 0; wc->qp_num = qp->qp_id; /* common section */ wc->wr_id = qp->wqe_wr_id[qp->sq.cons].wr_id; wc->opcode = qp->wqe_wr_id[qp->sq.cons].opcode; switch (wc->opcode) { case IBV_WC_RDMA_WRITE: wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; verbs_debug(&cxt->ibv_ctx, "POLL REQ CQ: IBV_WC_RDMA_WRITE byte_len=%d\n", qp->wqe_wr_id[qp->sq.cons].bytes_len); break; case IBV_WC_COMP_SWAP: case IBV_WC_FETCH_ADD: wc->byte_len = 8; break; case IBV_WC_RDMA_READ: case IBV_WC_SEND: case IBV_WC_BIND_MW: wc->byte_len = qp->wqe_wr_id[qp->sq.cons].bytes_len; verbs_debug(&cxt->ibv_ctx, "POLL REQ CQ: IBV_WC_RDMA_READ / IBV_WC_SEND\n"); break; default: break; } num_entries--; wc++; cnt++; next_cqe: while (qp->wqe_wr_id[qp->sq.cons].wqe_size--) qelr_chain_consume(&qp->sq.chain); qelr_inc_sw_cons_u16(&qp->sq); } return cnt; } static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, struct rdma_cqe_requester *req) { struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp->context); uint16_t sq_cons = le16toh(req->sq_cons); int cnt = 0; switch (req->status) { case RDMA_CQE_REQ_STS_OK: cnt = process_req(qp, cq, num_entries, wc, sq_cons, IBV_WC_SUCCESS, 0); break; case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with ROCE_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. 
QP icid=0x%x\n", qp->sq.icid); cnt = process_req(qp, cq, num_entries, wc, sq_cons, IBV_WC_WR_FLUSH_ERR, 1); break; default: /* other errors case */ /* process all WQE before the consumer */ qp->state = QELR_QPS_ERR; cnt = process_req(qp, cq, num_entries, wc, sq_cons - 1, IBV_WC_SUCCESS, 0); wc += cnt; /* if we have extra WC fill it with actual error info */ if (cnt < num_entries) { enum ibv_wc_status wc_status; switch (req->status) { case RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_BAD_RESPONSE_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_BAD_RESP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_LENGTH_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_LEN_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_QP_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_QP_OP_ERR; break; case RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_LOCAL_PROTECTION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_LOC_PROT_ERR; break; case RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_MEMORY_MGT_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_MW_BIND_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_INVALID_REQUEST_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_INV_REQ_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_ACCESS_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_ACCESS_ERR; break; case RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_REMOTE_OPERATION_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_REM_OP_ERR; break; case RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR: verbs_err(&cxt->ibv_ctx, "Error: POLL CQ with RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_RNR_RETRY_EXC_ERR; break; case RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR: verbs_err(&cxt->ibv_ctx, "RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR. QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_RETRY_EXC_ERR; break; default: verbs_err(&cxt->ibv_ctx, "IBV_WC_GENERAL_ERR. 
QP icid=0x%x\n", qp->sq.icid); wc_status = IBV_WC_GENERAL_ERR; } cnt += process_req(qp, cq, 1, wc, sq_cons, wc_status, 1 /* force use of WC */); } } return cnt; } static void __process_resp_one(struct qelr_devctx *cxt, struct qelr_cq *cq, struct ibv_wc *wc, struct rdma_cqe_responder *resp, uint64_t wr_id, uint32_t qp_id) { enum ibv_wc_status wc_status = IBV_WC_SUCCESS; uint8_t flags; wc->opcode = IBV_WC_RECV; wc->wr_id = wr_id; wc->wc_flags = 0; switch (resp->status) { case RDMA_CQE_RESP_STS_LOCAL_ACCESS_ERR: wc_status = IBV_WC_LOC_ACCESS_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_LENGTH_ERR: wc_status = IBV_WC_LOC_LEN_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_QP_OPERATION_ERR: wc_status = IBV_WC_LOC_QP_OP_ERR; break; case RDMA_CQE_RESP_STS_LOCAL_PROTECTION_ERR: wc_status = IBV_WC_LOC_PROT_ERR; break; case RDMA_CQE_RESP_STS_MEMORY_MGT_OPERATION_ERR: wc_status = IBV_WC_MW_BIND_ERR; break; case RDMA_CQE_RESP_STS_REMOTE_INVALID_REQUEST_ERR: wc_status = IBV_WC_REM_INV_RD_REQ_ERR; break; case RDMA_CQE_RESP_STS_OK: wc_status = IBV_WC_SUCCESS; wc->byte_len = le32toh(resp->length); if (GET_FIELD(resp->flags, RDMA_CQE_REQUESTER_TYPE) == RDMA_CQE_TYPE_RESPONDER_XRC_SRQ) wc->src_qp = le16toh(resp->rq_cons_or_srq_id); flags = resp->flags & QELR_RESP_RDMA_IMM; switch (flags) { case QELR_RESP_RDMA_IMM: /* update opcode */ wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM; SWITCH_FALLTHROUGH; case QELR_RESP_IMM: wc->imm_data = htobe32(le32toh(resp->imm_data_or_inv_r_Key)); wc->wc_flags |= IBV_WC_WITH_IMM; break; case QELR_RESP_INV: wc->invalidated_rkey = le32toh(resp->imm_data_or_inv_r_Key); wc->wc_flags |= IBV_WC_WITH_INV; break; case QELR_RESP_RDMA: verbs_err(&cxt->ibv_ctx, "Invalid flags detected\n"); break; default: /* valid configuration, but nothing to do here */ break; } break; default: wc->status = IBV_WC_GENERAL_ERR; verbs_err(&cxt->ibv_ctx, "Invalid CQE status detected\n"); } /* fill WC */ wc->status = wc_status; wc->qp_num = qp_id; } static int process_resp_one_srq(struct qelr_srq *srq, struct qelr_cq *cq, struct ibv_wc *wc, struct rdma_cqe_responder *resp, uint32_t qp_id) { struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq; uint64_t wr_id; wr_id = (((uint64_t)(le32toh(resp->srq_wr_id.hi))) << 32) + le32toh(resp->srq_wr_id.lo); if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { wc->byte_len = 0; wc->status = IBV_WC_WR_FLUSH_ERR; wc->qp_num = qp_id; wc->wr_id = wr_id; } else { __process_resp_one(get_qelr_ctx(srq->verbs_srq.srq.context), cq, wc, resp, wr_id, qp_id); } hw_srq->wr_cons_cnt++; return 1; } static int process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, struct ibv_wc *wc, struct rdma_cqe_responder *resp) { uint64_t wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; __process_resp_one(get_qelr_ctx(qp->ibv_qp->context), cq, wc, resp, wr_id, qp->qp_id); while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) qelr_chain_consume(&qp->rq.chain); qelr_inc_sw_cons_u16(&qp->rq); return 1; } static int process_resp_flush(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, uint16_t hw_cons) { uint16_t cnt = 0; while (num_entries && qp->rq.wqe_cons != hw_cons) { /* fill WC */ wc->status = IBV_WC_WR_FLUSH_ERR; wc->qp_num = qp->qp_id; wc->byte_len = 0; wc->wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; num_entries--; wc++; cnt++; while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) qelr_chain_consume(&qp->rq.chain); qelr_inc_sw_cons_u16(&qp->rq); } return cnt; } /* return latest CQE (needs processing) */ static union rdma_cqe *get_cqe(struct qelr_cq *cq) { return cq->latest_cqe; } static void 
static void try_consume_req_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				struct rdma_cqe_requester *req, int *update)
{
	uint16_t sq_cons = le16toh(req->sq_cons);

	if (sq_cons == qp->sq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

/* used with flush only, when resp->rq_cons is valid */
static void try_consume_resp_cqe(struct qelr_cq *cq, struct qelr_qp *qp,
				 uint16_t rq_cons, int *update)
{
	if (rq_cons == qp->rq.wqe_cons) {
		consume_cqe(cq);
		*update |= 1;
	}
}

static int qelr_poll_cq_resp_srq(struct qelr_srq *srq, struct qelr_cq *cq,
				 int num_entries, struct ibv_wc *wc,
				 struct rdma_cqe_responder *resp, int *update,
				 uint32_t qp_id)
{
	int cnt;

	cnt = process_resp_one_srq(srq, cq, wc, resp, qp_id);
	consume_cqe(cq);
	*update |= 1;

	return cnt;
}

static int qelr_poll_cq_resp(struct qelr_qp *qp, struct qelr_cq *cq,
			     int num_entries, struct ibv_wc *wc,
			     struct rdma_cqe_responder *resp, int *update)
{
	uint16_t rq_cons = le16toh(resp->rq_cons_or_srq_id);
	int cnt;

	if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) {
		cnt = process_resp_flush(qp, cq, num_entries, wc, rq_cons);
		try_consume_resp_cqe(cq, qp, rq_cons, update);
	} else {
		cnt = process_resp_one(qp, cq, wc, resp);
		consume_cqe(cq);
		*update |= 1;
	}

	return cnt;
}

static void doorbell_cq(struct qelr_cq *cq, uint32_t cons, uint8_t flags)
{
	mmio_wc_start();
	cq->db.data.agg_flags = flags;
	cq->db.data.value = htole32(cons);

	writeq(cq->db.raw, cq->db_addr);

	/* copy value to doorbell recovery mechanism */
	cq->db_rec_addr->db_data = cq->db.raw;

	mmio_flush_writes();
}

static struct qelr_srq *qelr_get_xrc_srq_from_cqe(struct qelr_cq *cq,
						  union rdma_cqe *cqe,
						  struct qelr_qp *qp)
{
	struct qelr_devctx *cxt;
	struct qelr_srq *srq;
	uint16_t srq_id;

	srq_id = le16toh(cqe->resp.rq_cons_or_srq_id);
	cxt = get_qelr_ctx(cq->ibv_cq.context);
	srq = qelr_get_srq(cxt, srq_id);
	if (unlikely(!srq)) {
		verbs_err(&cxt->ibv_ctx, "srq handle is null\n");
		return NULL;
	}

	return srq;
}

int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
{
	struct qelr_devctx *cxt = get_qelr_ctx(ibcq->context);
	struct qelr_cq *cq = get_qelr_cq(ibcq);
	int done = 0;
	union rdma_cqe *cqe = get_cqe(cq);
	struct qelr_srq *srq;
	struct regpair *qph;
	int update = 0;
	uint32_t db_cons;
	uint32_t qp_id;

	while (num_entries && is_valid_cqe(cq, cqe)) {
		int cnt = 0;
		struct qelr_qp *qp;

		/* prevent speculative reads of any field of CQE */
		udma_from_device_barrier();

		qp = cqe_get_qp(cqe);
		if (!qp && cqe_get_type(cqe) != RDMA_CQE_TYPE_RESPONDER_XRC_SRQ) {
			/* qp is NULL here, so report through the CQ's context
			 * rather than dereferencing it
			 */
			verbs_err(&cxt->ibv_ctx,
				  "Error: CQE QP pointer is NULL. CQE=%p\n",
				  cqe);
			break;
		}

		switch (cqe_get_type(cqe)) {
		case RDMA_CQE_TYPE_REQUESTER:
			cnt = qelr_poll_cq_req(qp, cq, num_entries, wc,
					       &cqe->req);
			try_consume_req_cqe(cq, qp, &cqe->req, &update);
			break;
		case RDMA_CQE_TYPE_RESPONDER_RQ:
			cnt = qelr_poll_cq_resp(qp, cq, num_entries, wc,
						&cqe->resp, &update);
			break;
		case RDMA_CQE_TYPE_RESPONDER_XRC_SRQ:
			qph = &cqe->req.qp_handle;
			srq = qelr_get_xrc_srq_from_cqe(cq, cqe, qp);
			if (unlikely(!srq)) {
				consume_cqe(cq);
				cqe = get_cqe(cq);
				update |= 1;
				continue;
			}

			qp_id = le32toh(qph->lo);
			cnt = qelr_poll_cq_resp_srq(srq, cq, num_entries, wc,
						    &cqe->resp, &update, qp_id);
			break;
		case RDMA_CQE_TYPE_RESPONDER_SRQ:
			cnt = qelr_poll_cq_resp_srq(qp->srq, cq, num_entries,
						    wc, &cqe->resp, &update,
						    qp->qp_id);
			break;
		case RDMA_CQE_TYPE_INVALID:
		default:
			verbs_err(&cxt->ibv_ctx,
				  "Error: invalid CQE type = %d\n",
				  cqe_get_type(cqe));
		}
		num_entries -= cnt;
		wc += cnt;
		done += cnt;

		cqe = get_cqe(cq);
	}

	db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1;
	if (update) {
		/* doorbell notifies about latest VALID entry,
		 * but chain already point to the next INVALID one
		 */
		doorbell_cq(cq, db_cons, cq->arm_flags);
	}

	return done;
}

void qelr_cq_event(struct ibv_cq *ibcq)
{
	/* Trigger received, can reset arm flags */
	struct qelr_cq *cq = get_qelr_cq(ibcq);

	cq->arm_flags = 0;
}

int qelr_arm_cq(struct ibv_cq *ibcq, int solicited)
{
	struct qelr_cq *cq = get_qelr_cq(ibcq);
	uint32_t db_cons;

	db_cons = qelr_chain_get_cons_idx_u32(&cq->chain) - 1;
	cq->arm_flags = solicited ? DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD :
				    DQ_UCM_ROCE_CQ_ARM_CF_CMD;

	doorbell_cq(cq, db_cons, cq->arm_flags);

	return 0;
}

void qelr_async_event(struct ibv_context *context,
		      struct ibv_async_event *event)
{
	struct qelr_cq *cq = NULL;
	struct qelr_qp *qp = NULL;

	switch (event->event_type) {
	case IBV_EVENT_CQ_ERR:
		cq = get_qelr_cq(event->element.cq);
		break;
	case IBV_EVENT_QP_FATAL:
	case IBV_EVENT_QP_REQ_ERR:
	case IBV_EVENT_QP_ACCESS_ERR:
	case IBV_EVENT_PATH_MIG_ERR:
		qp = get_qelr_qp(event->element.qp);
		break;
	case IBV_EVENT_SQ_DRAINED:
	case IBV_EVENT_PATH_MIG:
	case IBV_EVENT_COMM_EST:
	case IBV_EVENT_QP_LAST_WQE_REACHED:
		break;
	case IBV_EVENT_SRQ_LIMIT_REACHED:
	case IBV_EVENT_SRQ_ERR:
		return;
	case IBV_EVENT_PORT_ACTIVE:
	case IBV_EVENT_PORT_ERR:
		break;
	default:
		break;
	}

	fprintf(stderr, "qelr_async_event not implemented yet cq=%p qp=%p\n",
		cq, qp);
}

struct ibv_xrcd *qelr_open_xrcd(struct ibv_context *context,
				struct ibv_xrcd_init_attr *init_attr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(context);
	struct ib_uverbs_open_xrcd_resp resp;
	struct ibv_open_xrcd cmd;
	struct verbs_xrcd *xrcd;
	int rc;

	xrcd = calloc(1, sizeof(*xrcd));
	if (!xrcd)
		return NULL;

	rc = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), init_attr,
			       &cmd, sizeof(cmd), &resp, sizeof(resp));
	if (rc) {
		verbs_err(&cxt->ibv_ctx, "open xrcd: failed with rc=%d.\n", rc);
		free(xrcd);
		return NULL;
	}

	return &xrcd->xrcd;
}

int qelr_close_xrcd(struct ibv_xrcd *ibxrcd)
{
	struct verbs_xrcd *xrcd = container_of(ibxrcd, struct verbs_xrcd, xrcd);
	struct qelr_devctx *cxt = get_qelr_ctx(ibxrcd->context);
	int rc;

	rc = ibv_cmd_close_xrcd(xrcd);
	if (rc) {
		verbs_err(&cxt->ibv_ctx, "close xrcd: failed with rc=%d.\n", rc);
		free(xrcd);
	}

	return rc;
}

static struct ibv_srq *
qelr_create_xrc_srq(struct ibv_context *context,
		    struct ibv_srq_init_attr_ex *init_attr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(context);
	struct qelr_create_srq_ex req;
	struct qelr_create_srq_resp resp;
	struct ibv_srq *ibv_srq;
	struct qelr_srq *srq;
	int rc = 0;

	srq = calloc(1, sizeof(*srq));
	if (!srq)
		goto err0;

	ibv_srq = &srq->verbs_srq.srq;

	rc = qelr_create_srq_buffers(cxt, srq, init_attr->attr.max_wr);
	if (rc)
		goto err1;

	pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE);
	qelr_create_srq_configure_req_ex(srq, &req);
	rc = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, init_attr,
				   &req.ibv_cmd, sizeof(req),
				   &resp.ibv_resp, sizeof(resp));
	if (rc)
		goto err1;

	if (unlikely(resp.srq_id >= QELR_MAX_SRQ_ID)) {
		rc = -EINVAL;
		goto err1;
	}

	srq->srq_id = resp.srq_id;
	srq->is_xrc = 1;
	cxt->srq_table[resp.srq_id] = srq;
	verbs_debug(&cxt->ibv_ctx, "create srq_ex: successfully created %p.\n",
		    srq);

	return ibv_srq;

err1:
	qelr_destroy_srq_buffers(ibv_srq);
	free(srq);
err0:
	verbs_err(&cxt->ibv_ctx, "create srq: failed to create. rc=%d\n", rc);
	return NULL;
}

int qelr_get_srq_num(struct ibv_srq *ibv_srq, uint32_t *srq_num)
{
	struct qelr_srq *srq = get_qelr_srq(ibv_srq);

	*srq_num = srq->srq_id;

	return 0;
}

struct ibv_srq *qelr_create_srq_ex(struct ibv_context *context,
				   struct ibv_srq_init_attr_ex *init_attr)
{
	struct qelr_devctx *cxt = get_qelr_ctx(context);

	if (init_attr->srq_type == IBV_SRQT_BASIC)
		return qelr_create_srq(init_attr->pd,
				       (struct ibv_srq_init_attr *)init_attr);

	if (init_attr->srq_type == IBV_SRQT_XRC)
		return qelr_create_xrc_srq(context, init_attr);

	verbs_err(&cxt->ibv_ctx, "failed to create srq type %d\n",
		  init_attr->srq_type);

	return NULL;
}

static struct ibv_qp *create_qp(struct ibv_context *context,
				struct ibv_qp_init_attr_ex *attrx)
{
	struct qelr_devctx *cxt = get_qelr_ctx(context);
	struct qelr_create_qp_resp resp = {};
	struct qelr_create_qp req;
	struct ibv_qp *ibqp;
	struct qelr_qp *qp;
	int rc;

	qelr_print_qp_init_attr(cxt, attrx);

#define QELR_CREATE_QP_SUPP_ATTR_MASK \
	(IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_XRCD)
	if (!check_comp_mask(attrx->comp_mask, QELR_CREATE_QP_SUPP_ATTR_MASK)) {
		errno = EOPNOTSUPP;
		return NULL;
	}

	qp = calloc(1, sizeof(*qp));
	if (!qp)
		return NULL;

	qelr_basic_qp_config(qp, attrx);

	rc = qelr_create_qp_buffers(cxt, qp, attrx);
	if (rc)
		goto err0;

	qelr_create_qp_configure_req(qp, &req);

	rc = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, attrx,
				  &req.ibv_cmd, sizeof(req),
				  &resp.ibv_resp, sizeof(resp));
	if (rc) {
		verbs_err(&cxt->ibv_ctx,
			  "create qp: failed on ibv_cmd_create_qp with %d\n",
			  rc);
		goto err1;
	}

	rc = qelr_configure_qp(cxt, qp, attrx, &resp);
	if (rc)
		goto err2;

	verbs_debug(&cxt->ibv_ctx,
		    "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n",
		    qp, req.qp_handle_hi, req.qp_handle_lo);

	ibqp = (struct ibv_qp *)&qp->verbs_qp;
	qp->ibv_qp = ibqp;

	return get_ibv_qp(qp);

err2:
	rc = ibv_cmd_destroy_qp(get_ibv_qp(qp));
	if (rc)
		verbs_err(&cxt->ibv_ctx, "create qp: fatal fault. rc=%d\n", rc);
err1:
	if (qelr_qp_has_sq(qp))
		qelr_chain_free(&qp->sq.chain);

	if (qelr_qp_has_rq(qp))
		qelr_chain_free(&qp->rq.chain);
err0:
	free(qp);

	return NULL;
}

struct ibv_qp *qelr_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr)
{
	return create_qp(context, attr);
}

struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
{
	struct ibv_qp *qp;
	struct ibv_qp_init_attr_ex attrx = {};

	memcpy(&attrx, attr, sizeof(*attr));
	attrx.comp_mask = IBV_QP_INIT_ATTR_PD;
	attrx.pd = pd;

	qp = create_qp(pd->context, &attrx);
	if (qp)
		memcpy(attr, &attrx, sizeof(*attr));

	return qp;
}
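
/*
 * Illustrative usage sketch (not part of the provider): applications never
 * call create_qp() directly; they use the generic verbs API, which libibverbs
 * dispatches to qelr_create_qp()/qelr_create_qp_ex() above. The pd and cq
 * variables below are placeholders for objects the caller already created:
 *
 *	struct ibv_qp_init_attr attr = {
 *		.send_cq = cq,
 *		.recv_cq = cq,
 *		.cap = { .max_send_wr = 16, .max_recv_wr = 16,
 *			 .max_send_sge = 1, .max_recv_sge = 1 },
 *		.qp_type = IBV_QPT_RC,
 *	};
 *	struct ibv_qp *qp = ibv_create_qp(pd, &attr);
 */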