diff --git a/MAINTAINERS b/MAINTAINERS index a6d3bd9d2a8d..e034f1461eb4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -733,6 +733,14 @@ S: Maintained F: Documentation/i2c/busses/i2c-ali1563.rst F: drivers/i2c/busses/i2c-ali1563.c +ALIBABA ELASTIC RDMA DRIVER +M: Cheng Xu +M: Kai Shen +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/erdma +F: include/uapi/rdma/erdma-abi.h + ALIENWARE WMI DRIVER L: Dell.Client.Kernel@dell.com S: Maintained diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index 33d3ce9c888e..aa36ac618e72 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -78,20 +78,21 @@ config INFINIBAND_VIRT_DMA def_bool !HIGHMEM if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS -source "drivers/infiniband/hw/mthca/Kconfig" -source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/bnxt_re/Kconfig" source "drivers/infiniband/hw/cxgb4/Kconfig" source "drivers/infiniband/hw/efa/Kconfig" +source "drivers/infiniband/hw/erdma/Kconfig" +source "drivers/infiniband/hw/hfi1/Kconfig" +source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" +source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/ocrdma/Kconfig" -source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" -source "drivers/infiniband/hw/usnic/Kconfig" -source "drivers/infiniband/hw/hns/Kconfig" -source "drivers/infiniband/hw/bnxt_re/Kconfig" -source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/qedr/Kconfig" +source "drivers/infiniband/hw/qib/Kconfig" +source "drivers/infiniband/hw/usnic/Kconfig" +source "drivers/infiniband/hw/vmw_pvrdma/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" source "drivers/infiniband/sw/siw/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index fba0b3be903e..6b3a88046125 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig new file mode 100644 index 000000000000..169038e3ceb1 --- /dev/null +++ b/drivers/infiniband/hw/erdma/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_ERDMA + tristate "Alibaba Elastic RDMA Adapter (ERDMA) support" + depends on PCI_MSI && 64BIT + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_USER_ACCESS + help + This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + which supports RDMA features in Alibaba cloud environment. + + To compile this driver as module, choose M here. The module will be + called erdma. diff --git a/drivers/infiniband/hw/erdma/Makefile b/drivers/infiniband/hw/erdma/Makefile new file mode 100644 index 000000000000..51d2ef91905a --- /dev/null +++ b/drivers/infiniband/hw/erdma/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o + +erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h new file mode 100644 index 000000000000..2aae635c1c8d --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_H__ +#define __ERDMA_H__ + +#include +#include +#include +#include + +#include "erdma_hw.h" + +#define DRV_MODULE_NAME "erdma" +#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" + +struct erdma_eq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + + u16 ci; + u16 rsvd; + + atomic64_t event_num; + atomic64_t notify_num; + + u64 __iomem *db_addr; + u64 *db_record; +}; + +struct erdma_cmdq_sq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u16 ci; + u16 pi; + + u16 wqebb_cnt; + + u64 *db_record; +}; + +struct erdma_cmdq_cq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u32 ci; + u32 cmdsn; + + u64 *db_record; + + atomic64_t armed_num; +}; + +enum { + ERDMA_CMD_STATUS_INIT, + ERDMA_CMD_STATUS_ISSUED, + ERDMA_CMD_STATUS_FINISHED, + ERDMA_CMD_STATUS_TIMEOUT +}; + +struct erdma_comp_wait { + struct completion wait_event; + u32 cmd_status; + u32 ctx_id; + u16 sq_pi; + u8 comp_status; + u8 rsvd; + u32 comp_data[4]; +}; + +enum { + ERDMA_CMDQ_STATE_OK_BIT = 0, + ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1, + ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2, +}; + +#define ERDMA_CMDQ_TIMEOUT_MS 15000 +#define ERDMA_REG_ACCESS_WAIT_MS 20 +#define ERDMA_WAIT_DEV_DONE_CNT 500 + +struct erdma_cmdq { + unsigned long *comp_wait_bitmap; + struct erdma_comp_wait *wait_pool; + spinlock_t lock; + + bool use_event; + + struct erdma_cmdq_sq sq; + struct erdma_cmdq_cq cq; + struct erdma_eq eq; + + unsigned long state; + + struct semaphore credits; + u16 max_outstandings; +}; + +#define COMPROMISE_CC ERDMA_CC_CUBIC +enum erdma_cc_alg { + ERDMA_CC_NEWRENO = 0, + ERDMA_CC_CUBIC, + ERDMA_CC_HPCC_RTT, + ERDMA_CC_HPCC_ECN, + ERDMA_CC_HPCC_INT, + ERDMA_CC_METHODS_NUM +}; + +struct erdma_devattr { + u32 fw_version; + + unsigned char peer_addr[ETH_ALEN]; + + int numa_node; + enum erdma_cc_alg cc; + u32 grp_num; + u32 irq_num; + + bool disable_dwqe; + u16 dwqe_pages; + u16 dwqe_entries; + + u32 max_qp; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_ord; + u32 max_ird; + + u32 max_send_sge; + u32 max_recv_sge; + u32 max_sge_rd; + u32 max_cq; + u32 max_cqe; + u64 max_mr_size; + u32 max_mr; + u32 max_pd; + u32 max_mw; + u32 local_dma_key; +}; + +#define ERDMA_IRQNAME_SIZE 50 + +struct erdma_irq { + char name[ERDMA_IRQNAME_SIZE]; + u32 msix_vector; + cpumask_t affinity_hint_mask; +}; + +struct erdma_eq_cb { + bool ready; + void *dev; /* All EQs use this fields to get erdma_dev struct */ + struct erdma_irq irq; + struct erdma_eq eq; + struct tasklet_struct tasklet; +}; + +struct erdma_resource_cb { + unsigned long *bitmap; + spinlock_t lock; + u32 next_alloc_idx; + u32 max_cap; +}; + +enum { + ERDMA_RES_TYPE_PD = 0, + ERDMA_RES_TYPE_STAG_IDX = 1, + ERDMA_RES_CNT = 2, +}; + +#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE +#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) + +struct erdma_dev { + struct ib_device ibdev; + struct net_device *netdev; + struct pci_dev *pdev; + struct notifier_block netdev_nb; + + resource_size_t func_bar_addr; + resource_size_t func_bar_len; + u8 __iomem *func_bar; + + struct erdma_devattr attrs; + /* physical port state (only one port per device) */ + enum ib_port_state state; + + /* cmdq and aeq use the same msix vector */ + struct erdma_irq comm_irq; + struct erdma_cmdq cmdq; + struct erdma_eq aeq; + struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1]; + + spinlock_t lock; + struct erdma_resource_cb res_cb[ERDMA_RES_CNT]; + struct xarray qp_xa; + struct xarray cq_xa; + + u32 next_alloc_qpn; + u32 next_alloc_cqn; + + spinlock_t db_bitmap_lock; + /* We provide max 64 uContexts that each has one SQ doorbell Page. */ + DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); + /* + * We provide max 496 uContexts that each has one SQ normal Db, + * and one directWQE db。 + */ + DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); + + atomic_t num_ctx; + struct list_head cep_list; +}; + +static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) +{ + idx &= (depth - 1); + + return qbuf + (idx << shift); +} + +static inline struct erdma_dev *to_edev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct erdma_dev, ibdev); +} + +static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg) +{ + return readl(dev->func_bar + reg); +} + +static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg) +{ + return readq(dev->func_bar + reg); +} + +static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value) +{ + writel(value, dev->func_bar + reg); +} + +static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value) +{ + writeq(value, dev->func_bar + reg); +} + +static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg, + u32 filed_mask) +{ + u32 val = erdma_reg_read32(dev, reg); + + return FIELD_GET(filed_mask, val); +} + +int erdma_cmdq_init(struct erdma_dev *dev); +void erdma_finish_cmdq_init(struct erdma_dev *dev); +void erdma_cmdq_destroy(struct erdma_dev *dev); + +void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); +int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, + u64 *resp0, u64 *resp1); +void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq); + +int erdma_ceqs_init(struct erdma_dev *dev); +void erdma_ceqs_uninit(struct erdma_dev *dev); +void notify_eq(struct erdma_eq *eq); +void *get_next_valid_eqe(struct erdma_eq *eq); + +int erdma_aeq_init(struct erdma_dev *dev); +void erdma_aeq_destroy(struct erdma_dev *dev); + +void erdma_aeq_event_handler(struct erdma_dev *dev); +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c new file mode 100644 index 000000000000..f13f16479eca --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -0,0 +1,1430 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Fredy Neeser */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +static struct workqueue_struct *erdma_cm_wq; + +static void erdma_cm_llp_state_change(struct sock *sk); +static void erdma_cm_llp_data_ready(struct sock *sk); +static void erdma_cm_llp_error_report(struct sock *sk); + +static void erdma_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = erdma_cm_llp_state_change; + sk->sk_data_ready = erdma_cm_llp_data_ready; + sk->sk_error_report = erdma_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_save_upcalls(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void erdma_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct erdma_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + erdma_sk_restore_upcalls(sk, cep); + erdma_cep_put(cep); + } else { + WARN_ON_ONCE(1); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + WARN_ON_ONCE(1); + } +} + +static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s) +{ + cep->sock = s; + erdma_cep_get(cep); + s->sk->sk_user_data = cep; + + erdma_sk_save_upcalls(s->sk); + erdma_sk_assign_cm_upcalls(s->sk); +} + +static void erdma_disassoc_listen_cep(struct erdma_cep *cep) +{ + if (cep->listen_cep) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } +} + +static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev) +{ + struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = ERDMA_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->dev = dev; + + spin_lock_irqsave(&dev->lock, flags); + list_add_tail(&cep->devq, &dev->cep_list); + spin_unlock_irqrestore(&dev->lock, flags); + + return cep; +} + +static void erdma_cm_free_work(struct erdma_cep *cep) +{ + struct list_head *w, *tmp; + struct erdma_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct erdma_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void erdma_cancel_mpatimer(struct erdma_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + erdma_cep_put(cep); + kfree(cep->mpa_timer); + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void erdma_put_work(struct erdma_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void erdma_cep_set_inuse(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + while (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + + spin_lock_irqsave(&cep->lock, flags); + } + + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); +} + +static void erdma_cep_set_free(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static void __erdma_cep_dealloc(struct kref *ref) +{ + struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref); + struct erdma_dev *dev = cep->dev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + kfree(cep->private_data); + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + erdma_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&dev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&dev->lock, flags); + kfree(cep); +} + +static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep) +{ + struct erdma_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct erdma_cm_work, + list); + list_del_init(&work->list); + } + + spin_unlock_bh(&cep->lock); + return work; +} + +static int erdma_cm_alloc_work(struct erdma_cep *cep, int num) +{ + struct erdma_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + erdma_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + + return 0; +} + +static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *cm_id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + cm_id = cep->listen_cep->cm_id; + + event.ird = cep->dev->attrs.max_ird; + event.ord = cep->dev->attrs.max_ord; + } else { + cm_id = cep->cm_id; + } + + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len && cep->mpa.pdata) { + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + } + + getname_local(cep->sock, &event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + + return cm_id->event_handler(cm_id, &event); +} + +void erdma_qp_cm_drop(struct erdma_qp *qp) +{ + struct erdma_cep *cep = qp->cep; + + if (!qp->cep) + return; + + erdma_cep_set_inuse(cep); + + /* already closed. */ + if (cep->state == ERDMA_EPSTATE_CLOSED) + goto out; + + if (cep->cm_id) { + switch (cep->state) { + case ERDMA_EPSTATE_AWAIT_MPAREP: + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + case ERDMA_EPSTATE_RDMA_MODE: + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + case ERDMA_EPSTATE_IDLE: + case ERDMA_EPSTATE_LISTENING: + case ERDMA_EPSTATE_CONNECTING: + case ERDMA_EPSTATE_AWAIT_MPAREQ: + case ERDMA_EPSTATE_RECVD_MPAREQ: + case ERDMA_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + erdma_cep_put(cep); + } + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->qp) { + cep->qp = NULL; + erdma_qp_put(qp); + } +out: + erdma_cep_set_free(cep); +} + +void erdma_cep_put(struct erdma_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __erdma_cep_dealloc); +} + +void erdma_cep_get(struct erdma_cep *cep) +{ + kref_get(&cep->ref); +} + +static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata, + u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int iovec_num = 0; + int ret; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + rr->params.pd_len = cpu_to_be16(pd_len); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + iovec_num++; + mpa_len = sizeof(*rr); + + iov[iovec_num].iov_base = &cep->mpa.ext_data; + iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data); + iovec_num++; + mpa_len += sizeof(cep->mpa.ext_data); + + if (pd_len) { + iov[iovec_num].iov_base = (char *)pdata; + iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + iovec_num++; + } + + ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len); + + return ret < 0 ? ret : 0; +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr, + int hdr_size, int *rcvd_out) +{ + struct socket *s = cep->sock; + int rcvd; + + *rcvd_out = 0; + if (hdr_rcvd < hdr_size) { + rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd, + MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return -EAGAIN; + + if (rcvd <= 0) + return -ECONNABORTED; + + hdr_rcvd += rcvd; + *rcvd_out = rcvd; + + if (hdr_rcvd < hdr_size) + return -EAGAIN; + } + + return 0; +} + +static void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return (u8)be16_to_cpu(rev); +} + +static void __mpa_ext_set_cc(__be32 *bits, u32 cc) +{ + *bits = (*bits & ~MPA_EXT_FLAG_CC) | + (cpu_to_be32(cc) & MPA_EXT_FLAG_CC); +} + +static u8 __mpa_ext_cc(__be32 mpa_ext_bits) +{ + __be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC; + + return (u8)be32_to_cpu(cc); +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply haeder including + * eventual private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise. + * + * Context: May be called in process context only + */ +static int erdma_recv_mpa_rr(struct erdma_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv, ret, pd_rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd, + (char *)&cep->mpa.hdr, + sizeof(struct mpa_rr), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA || + __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1) + return -EPROTO; + + if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) < + sizeof(struct erdma_mpa_ext)) { + ret = __recv_mpa_hdr( + cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + (char *)&cep->mpa.ext_data, + sizeof(struct erdma_mpa_ext), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + pd_len = be16_to_cpu(hdr->params.pd_len); + pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) - + sizeof(struct erdma_mpa_ext); + to_rcv = pd_len - pd_rcvd; + + if (!to_rcv) { + /* + * We have received the whole MPA Request/Reply message. + * Check against peer protocol violation. + */ + u32 word; + + ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word), + &rcvd); + if (ret == -EAGAIN && rcvd == 0) + return 0; + + if (ret) + return ret; + + return -EPROTO; + } + + /* + * At this point, MPA header has been fully received, and pd_len != 0. + * So, begin to receive private data. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + + rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4, + MSG_DONTWAIT); + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) + return 0; + + return -EAGAIN; +} + +/* + * erdma_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. Caller must hold lock on corresponding listening CEP. + */ +static int erdma_proc_mpareq(struct erdma_cep *cep) +{ + struct mpa_rr *req; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + return ret; + + req = &cep->mpa.hdr; + + if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE)) + return -EPROTO; + + memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE); + + /* Currently does not support marker and crc. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS || + req->params.bits & MPA_RR_FLAG_CRC) + goto reject_conn; + + cep->state = ERDMA_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + erdma_cep_get(cep); + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (ret) + erdma_cep_put(cep); + + return ret; + +reject_conn: + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + req->params.bits &= ~MPA_RR_FLAG_CRC; + + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + erdma_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int erdma_proc_mpareply(struct erdma_cep *cep) +{ + struct erdma_qp_attrs qp_attrs; + struct erdma_qp *qp = cep->qp; + struct mpa_rr *rep; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + goto out_err; + + erdma_cancel_mpatimer(cep); + + rep = &cep->mpa.hdr; + + if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) { + ret = -EPROTO; + goto out_err; + } + + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + return -ECONNRESET; + } + + /* Currently does not support marker and crc. */ + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (rep->params.bits & MPA_RR_FLAG_CRC)) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + return -EINVAL; + } + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.state = ERDMA_QP_STATE_RTS; + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + + qp->attrs.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) + qp->attrs.cc = COMPROMISE_CC; + + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_MPA); + + up_write(&qp->state_lock); + + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!ret) + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + if (ret != -EAGAIN) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return ret; +} + +static void erdma_accept_newconn(struct erdma_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct erdma_cep *new_cep = NULL; + int ret = 0; + + if (cep->state != ERDMA_EPSTATE_LISTENING) + goto error; + + new_cep = erdma_cep_alloc(cep->dev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. + */ + if (erdma_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_error_report = cep->sk_error_report; + + ret = kernel_accept(s, &new_s, O_NONBLOCK); + if (ret != 0) + goto error; + + new_cep->sock = new_s; + erdma_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + tcp_sock_set_nodelay(new_s->sk); + new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ; + + ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT); + if (ret) + goto error; + + new_cep->listen_cep = cep; + erdma_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* MPA REQ already queued */ + erdma_cep_set_inuse(new_cep); + ret = erdma_proc_mpareq(new_cep); + if (ret != -EAGAIN) { + erdma_cep_put(cep); + new_cep->listen_cep = NULL; + if (ret) { + erdma_cep_set_free(new_cep); + goto error; + } + } + erdma_cep_set_free(new_cep); + } + return; + +error: + if (new_cep) { + new_cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cancel_mpatimer(new_cep); + + erdma_cep_put(new_cep); + new_cep->sock = NULL; + } + + if (new_s) { + erdma_socket_disassoc(new_s); + sock_release(new_s); + } +} + +static int erdma_newconn_connected(struct erdma_cep *cep) +{ + int ret = 0; + + cep->mpa.hdr.params.bits = 0; + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); + + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); + + ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); + cep->state = ERDMA_EPSTATE_AWAIT_MPAREP; + cep->mpa.hdr.params.pd_len = 0; + + if (ret >= 0) + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT); + + return ret; +} + +static void erdma_cm_work_handler(struct work_struct *w) +{ + struct erdma_cm_work *work; + struct erdma_cep *cep; + int release_cep = 0, ret = 0; + + work = container_of(w, struct erdma_cm_work, work.work); + cep = work->cep; + + erdma_cep_set_inuse(cep); + + switch (work->type) { + case ERDMA_CM_WORK_CONNECTED: + erdma_cancel_mpatimer(cep); + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + ret = erdma_newconn_connected(cep); + if (ret) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EIO); + release_cep = 1; + } + } + break; + case ERDMA_CM_WORK_CONNECTTIMEOUT: + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + cep->mpa_timer = NULL; + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } + break; + case ERDMA_CM_WORK_ACCEPT: + erdma_accept_newconn(cep); + break; + case ERDMA_CM_WORK_READ_MPAHDR: + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + erdma_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + ERDMA_EPSTATE_LISTENING) + ret = erdma_proc_mpareq(cep); + else + ret = -EFAULT; + + erdma_cep_set_free(cep->listen_cep); + + if (ret != -EAGAIN) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (ret) + erdma_cep_put(cep); + } + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + ret = erdma_proc_mpareply(cep); + } + + if (ret && ret != -EAGAIN) + release_cep = 1; + break; + case ERDMA_CM_WORK_CLOSE_LLP: + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + release_cep = 1; + break; + case ERDMA_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == ERDMA_EPSTATE_CONNECTING || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. + */ + erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* Socket close before MPA request received. */ + erdma_disassoc_listen_cep(cep); + erdma_cep_put(cep); + } + release_cep = 1; + break; + case ERDMA_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* No MPA req received after peer TCP stream setup. */ + erdma_disassoc_listen_cep(cep); + + erdma_cep_put(cep); + release_cep = 1; + } + break; + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + + if (release_cep) { + erdma_cancel_mpatimer(cep); + cep->state = ERDMA_EPSTATE_CLOSED; + if (cep->qp) { + struct erdma_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling erdma_qp_cm_drop() + */ + erdma_qp_get(qp); + erdma_cep_set_free(cep); + + erdma_qp_llp_close(qp); + erdma_qp_put(qp); + + erdma_cep_set_inuse(cep); + cep->qp = NULL; + erdma_qp_put(qp); + } + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cep_put(cep); + } + } + erdma_cep_set_free(cep); + erdma_put_work(work); + erdma_cep_put(cep); +} + +int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type) +{ + struct erdma_cm_work *work = erdma_get_work(cep); + unsigned long delay = 0; + + if (!work) + return -ENOMEM; + + work->type = type; + work->cep = cep; + + erdma_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler); + + if (type == ERDMA_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + delay = MPAREP_TIMEOUT; + else + delay = MPAREQ_TIMEOUT; + } else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) { + cep->mpa_timer = work; + + delay = CONNECT_TIMEOUT; + } + + queue_delayed_work(erdma_cm_wq, &work->work, delay); + + return 0; +} + +static void erdma_cm_llp_data_ready(struct sock *sk) +{ + struct erdma_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) + goto out; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR); + +out: + read_unlock(&sk->sk_callback_lock); +} + +static void erdma_cm_llp_error_report(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + if (cep) + cep->sk_error_report(sk); +} + +static void erdma_cm_llp_state_change(struct sock *sk) +{ + struct erdma_cep *cep; + void (*orig_state_change)(struct sock *sk); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + if (cep->state == ERDMA_EPSTATE_CONNECTING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + else + erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT); + break; + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE); + break; + default: + break; + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + int laddrlen, struct sockaddr *raddr, + int raddrlen, int flags) +{ + int ret; + + sock_set_reuseaddr(s->sk); + ret = s->ops->bind(s, laddr, laddrlen); + if (ret) + return ret; + ret = s->ops->connect(s, raddr, raddrlen, flags); + return ret < 0 ? ret : 0; +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_qp *qp; + struct erdma_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr; + struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr; + u16 pd_len = params->private_data_len; + int ret; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > dev->attrs.max_ird || + params->ord > dev->attrs.max_ord) + return -EINVAL; + + if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + goto error_put_qp; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error_release_sock; + } + + erdma_cep_set_inuse(cep); + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + /* Associate cm_id with CEP */ + id->add_ref(id); + cep->cm_id = id; + + /* + * 6: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout, connected event + * and connect timeout. + */ + ret = erdma_cm_alloc_work(cep, 6); + if (ret != 0) { + ret = -ENOMEM; + goto error_release_cep; + } + + cep->ird = params->ird; + cep->ord = params->ord; + cep->state = ERDMA_EPSTATE_CONNECTING; + + erdma_cep_socket_assoc(cep, s); + + if (pd_len) { + cep->pd_len = pd_len; + cep->private_data = kmalloc(pd_len, GFP_KERNEL); + if (!cep->private_data) { + ret = -ENOMEM; + goto error_disassoc; + } + + memcpy(cep->private_data, params->private_data, + params->private_data_len); + } + + ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr, + sizeof(*raddr), O_NONBLOCK); + if (ret != -EINPROGRESS && ret != 0) { + goto error_disassoc; + } else if (ret == 0) { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + if (ret) + goto error_disassoc; + } else { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT); + if (ret) + goto error_disassoc; + } + + erdma_cep_set_free(cep); + return 0; + +error_disassoc: + kfree(cep->private_data); + cep->private_data = NULL; + cep->pd_len = 0; + + erdma_socket_disassoc(s); + +error_release_cep: + /* disassoc with cm_id */ + cep->cm_id = NULL; + id->rem_ref(id); + + /* disassoc with qp */ + qp->cep = NULL; + erdma_cep_put(cep); + cep->qp = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + + /* release the cep. */ + erdma_cep_put(cep); + +error_release_sock: + if (s) + sock_release(s); +error_put_qp: + erdma_qp_put(qp); + + return ret; +} + +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_qp *qp; + struct erdma_qp_attrs qp_attrs; + int ret; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->ord > dev->attrs.max_ord || + params->ird > dev->attrs.max_ord) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->private_data_len > MPA_MAX_PRIVDATA) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = params->ord; + qp_attrs.irq_size = params->ird; + + qp_attrs.state = ERDMA_QP_STATE_RTS; + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + qp->attrs.qp_type = ERDMA_QP_PASSIVE; + qp->attrs.pd_len = params->private_data_len; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) + qp->attrs.cc = COMPROMISE_CC; + + /* move to rts */ + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_ORD | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_IRD | + ERDMA_QP_ATTR_MPA); + up_write(&qp->state_lock); + + if (ret) + goto error; + + cep->mpa.ext_data.bits = 0; + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + + ret = erdma_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (ret) + goto error; + + erdma_cep_set_free(cep); + + return 0; + } + +error: + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + + if (qp->cep) { + erdma_cep_put(cep); + qp->cep = NULL; + } + + cep->qp = NULL; + erdma_qp_put(qp); + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return ret; +} + +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen) +{ + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + erdma_send_mpareqrep(cep, pdata, plen); + } + + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return 0; +} + +int erdma_create_listen(struct iw_cm_id *id, int backlog) +{ + struct socket *s; + struct erdma_cep *cep = NULL; + int ret = 0; + struct erdma_dev *dev = to_edev(id->device); + int addr_family = id->local_addr.ss_family; + struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); + + if (addr_family != AF_INET) + return -EAFNOSUPPORT; + + ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + return ret; + + sock_set_reuseaddr(s->sk); + + /* For wildcard addr, limit binding to current device only */ + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) + s->sk->sk_bound_dev_if = dev->netdev->ifindex; + + ret = s->ops->bind(s, (struct sockaddr *)laddr, + sizeof(struct sockaddr_in)); + if (ret) + goto error; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error; + } + erdma_cep_socket_assoc(cep, s); + + ret = erdma_cm_alloc_work(cep, backlog); + if (ret) + goto error; + + ret = s->ops->listen(s, backlog); + if (ret) + goto error; + + cep->cm_id = id; + id->add_ref(id); + + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + ret = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = ERDMA_EPSTATE_LISTENING; + + return 0; + +error: + if (cep) { + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + erdma_socket_disassoc(s); + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } + sock_release(s); + + return ret; +} + +static void erdma_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct erdma_cep *cep = + list_entry(p, struct erdma_cep, listenq); + + list_del(p); + + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } +} + +int erdma_destroy_listen(struct iw_cm_id *id) +{ + if (!id->provider_data) + return 0; + + erdma_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int erdma_cm_init(void) +{ + erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq"); + if (!erdma_cm_wq) + return -ENOMEM; + + return 0; +} + +void erdma_cm_exit(void) +{ + if (erdma_cm_wq) + destroy_workqueue(erdma_cm_wq); +} diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h new file mode 100644 index 000000000000..8a3f998fec9b --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Greg Joyce */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#ifndef __ERDMA_CM_H__ +#define __ERDMA_CM_H__ + +#include +#include +#include + +/* iWarp MPA protocol defs */ +#define MPA_REVISION_EXT_1 129 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_KEY_SIZE 16 +#define MPA_DEFAULT_HDR_LEN 28 + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response Hdr bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000), + MPA_RR_RESERVED = __cpu_to_be16(0x1f00), + MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + u8 key[16]; + struct mpa_rr_params params; +}; + +struct erdma_mpa_ext { + __be32 cookie; + __be32 bits; +}; + +enum { + MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f), +}; + +struct erdma_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct erdma_mpa_ext ext_data; + char *pdata; + int bytes_rcvd; +}; + +struct erdma_sk_upcalls { + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk, int bytes); + void (*sk_error_report)(struct sock *sk); +}; + +struct erdma_dev; + +enum erdma_cep_state { + ERDMA_EPSTATE_IDLE = 1, + ERDMA_EPSTATE_LISTENING, + ERDMA_EPSTATE_CONNECTING, + ERDMA_EPSTATE_AWAIT_MPAREQ, + ERDMA_EPSTATE_RECVD_MPAREQ, + ERDMA_EPSTATE_AWAIT_MPAREP, + ERDMA_EPSTATE_RDMA_MODE, + ERDMA_EPSTATE_CLOSED +}; + +struct erdma_cep { + struct iw_cm_id *cm_id; + struct erdma_dev *dev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum erdma_cep_state state; + + struct list_head listenq; + struct erdma_cep *listen_cep; + + struct erdma_qp *qp; + struct socket *sock; + + struct erdma_cm_work *mpa_timer; + struct list_head work_freelist; + + struct erdma_mpa_info mpa; + int ord; + int ird; + + int pd_len; + /* hold user's private data. */ + void *private_data; + + /* Saved upcalls of socket llp.sock */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +#define MPAREQ_TIMEOUT (HZ * 20) +#define MPAREP_TIMEOUT (HZ * 10) +#define CONNECT_TIMEOUT (HZ * 10) + +enum erdma_work_type { + ERDMA_CM_WORK_ACCEPT = 1, + ERDMA_CM_WORK_READ_MPAHDR, + ERDMA_CM_WORK_CLOSE_LLP, /* close socket */ + ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + ERDMA_CM_WORK_MPATIMEOUT, + ERDMA_CM_WORK_CONNECTED, + ERDMA_CM_WORK_CONNECTTIMEOUT +}; + +struct erdma_cm_work { + struct delayed_work work; + struct list_head list; + enum erdma_work_type type; + struct erdma_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen); +int erdma_create_listen(struct iw_cm_id *id, int backlog); +int erdma_destroy_listen(struct iw_cm_id *id); + +void erdma_cep_get(struct erdma_cep *ceq); +void erdma_cep_put(struct erdma_cep *ceq); +int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type); + +int erdma_cm_init(void); +void erdma_cm_exit(void); + +#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data)) + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c new file mode 100644 index 000000000000..57da0c670472 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include +#include +#include + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void arm_cmdq_cq(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); + + *cmdq->cq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); + + atomic64_inc(&cmdq->cq.armed_num); +} + +static void kick_cmdq_db(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); + + *cmdq->sq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); +} + +static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq) +{ + int comp_idx; + + spin_lock(&cmdq->lock); + comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap, + cmdq->max_outstandings); + if (comp_idx == cmdq->max_outstandings) { + spin_unlock(&cmdq->lock); + return ERR_PTR(-ENOMEM); + } + + __set_bit(comp_idx, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + return &cmdq->wait_pool[comp_idx]; +} + +static void put_comp_wait(struct erdma_cmdq *cmdq, + struct erdma_comp_wait *comp_wait) +{ + int used; + + cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT; + spin_lock(&cmdq->lock); + used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + WARN_ON(!used); +} + +static int erdma_cmdq_wait_res_init(struct erdma_dev *dev, + struct erdma_cmdq *cmdq) +{ + int i; + + cmdq->wait_pool = + devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings, + sizeof(struct erdma_comp_wait), GFP_KERNEL); + if (!cmdq->wait_pool) + return -ENOMEM; + + spin_lock_init(&cmdq->lock); + cmdq->comp_wait_bitmap = devm_bitmap_zalloc( + &dev->pdev->dev, cmdq->max_outstandings, GFP_KERNEL); + if (!cmdq->comp_wait_bitmap) + return -ENOMEM; + + for (i = 0; i < cmdq->max_outstandings; i++) { + init_completion(&cmdq->wait_pool[i].wait_event); + cmdq->wait_pool[i].ctx_id = i; + } + + return 0; +} + +static int erdma_cmdq_sq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_sq *sq = &cmdq->sq; + u32 buf_size; + + sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); + sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; + + buf_size = sq->depth << SQEBB_SHIFT; + + sq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &sq->qbuf_dma_addr, GFP_KERNEL); + if (!sq->qbuf) + return -ENOMEM; + + sq->db_record = (u64 *)(sq->qbuf + buf_size); + + spin_lock_init(&sq->lock); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG, + upper_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, + lower_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, + sq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_cq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_cq *cq = &cmdq->cq; + u32 buf_size; + + cq->depth = cmdq->sq.depth; + buf_size = cq->depth << CQE_SHIFT; + + cq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!cq->qbuf) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->db_record = (u64 *)(cq->qbuf + buf_size); + + atomic64_set(&cq->armed_num, 0); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG, + upper_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, + lower_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, + cq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_eq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_eq *eq = &cmdq->eq; + u32 buf_size; + + eq->depth = cmdq->max_outstandings; + buf_size = eq->depth << EQE_SHIFT; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + + eq->db_addr = + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, + eq->qbuf_dma_addr + buf_size); + + return 0; +} + +int erdma_cmdq_init(struct erdma_dev *dev) +{ + int err, i; + struct erdma_cmdq *cmdq = &dev->cmdq; + u32 sts, ctrl; + + cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; + cmdq->use_event = false; + + sema_init(&cmdq->credits, cmdq->max_outstandings); + + err = erdma_cmdq_wait_res_init(dev, cmdq); + if (err) + return err; + + err = erdma_cmdq_sq_init(dev); + if (err) + return err; + + err = erdma_cmdq_cq_init(dev); + if (err) + goto err_destroy_sq; + + err = erdma_cmdq_eq_init(dev); + if (err) + goto err_destroy_cq; + + ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1); + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); + + for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) { + sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, + ERDMA_REG_DEV_ST_INIT_DONE_MASK); + if (sts) + break; + + msleep(ERDMA_REG_ACCESS_WAIT_MS); + } + + if (i == ERDMA_WAIT_DEV_DONE_CNT) { + dev_err(&dev->pdev->dev, "wait init done failed.\n"); + err = -ETIMEDOUT; + goto err_destroy_eq; + } + + set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + + return 0; + +err_destroy_eq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->eq.depth << EQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); + +err_destroy_cq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->cq.depth << CQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + +err_destroy_sq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->sq.depth << SQEBB_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + + return err; +} + +void erdma_finish_cmdq_init(struct erdma_dev *dev) +{ + /* after device init successfully, change cmdq to event mode. */ + dev->cmdq.use_event = true; + arm_cmdq_cq(&dev->cmdq); +} + +void erdma_cmdq_destroy(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + + dma_free_coherent(&dev->pdev->dev, + (cmdq->eq.depth << EQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + (cmdq->sq.depth << SQEBB_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + (cmdq->cq.depth << CQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); +} + +static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) +{ + __be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci, + cmdq->cq.depth, CQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + __be32_to_cpu(READ_ONCE(*cqe))); + + return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ? cqe : NULL; +} + +static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len, + struct erdma_comp_wait *comp_wait) +{ + __le64 *wqe; + u64 hdr = *req; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED; + reinit_completion(&comp_wait->wait_event); + comp_wait->sq_pi = cmdq->sq.pi; + + wqe = get_queue_entry(cmdq->sq.qbuf, cmdq->sq.pi, cmdq->sq.depth, + SQEBB_SHIFT); + memcpy(wqe, req, req_len); + + cmdq->sq.pi += cmdq->sq.wqebb_cnt; + hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi) | + FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, + comp_wait->ctx_id) | + FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, cmdq->sq.wqebb_cnt - 1); + *wqe = cpu_to_le64(hdr); + + kick_cmdq_db(cmdq); +} + +static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) +{ + struct erdma_comp_wait *comp_wait; + u32 hdr0, sqe_idx; + __be32 *cqe; + u16 ctx_id; + u64 *sqe; + int i; + + cqe = get_next_valid_cmdq_cqe(cmdq); + if (!cqe) + return -EAGAIN; + + cmdq->cq.ci++; + + dma_rmb(); + hdr0 = __be32_to_cpu(*cqe); + sqe_idx = __be32_to_cpu(*(cqe + 1)); + + sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth, + SQEBB_SHIFT); + ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe); + comp_wait = &cmdq->wait_pool[ctx_id]; + if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED) + return -EIO; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED; + comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0); + cmdq->sq.ci += cmdq->sq.wqebb_cnt; + + for (i = 0; i < 4; i++) + comp_wait->comp_data[i] = __be32_to_cpu(*(cqe + 2 + i)); + + if (cmdq->use_event) + complete(&comp_wait->wait_event); + + return 0; +} + +static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) +{ + unsigned long flags; + u16 comp_num; + + spin_lock_irqsave(&cmdq->cq.lock, flags); + + /* We must have less than # of max_outstandings + * completions at one time. + */ + for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++) + if (erdma_poll_single_cmd_completion(cmdq)) + break; + + if (comp_num && cmdq->use_event) + arm_cmdq_cq(cmdq); + + spin_unlock_irqrestore(&cmdq->cq.lock, flags); +} + +void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) +{ + int got_event = 0; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || + !cmdq->use_event) + return; + + while (get_next_valid_eqe(&cmdq->eq)) { + cmdq->eq.ci++; + got_event++; + } + + if (got_event) { + cmdq->cq.cmdsn++; + erdma_polling_cmd_completions(cmdq); + } + + notify_eq(&cmdq->eq); +} + +static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long comp_timeout = jiffies + msecs_to_jiffies(timeout); + + while (1) { + erdma_polling_cmd_completions(cmdq); + if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED) + break; + + if (time_is_before_jiffies(comp_timeout)) + return -ETIME; + + msleep(20); + } + + return 0; +} + +static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long flags = 0; + + wait_for_completion_timeout(&comp_ctx->wait_event, + msecs_to_jiffies(timeout)); + + if (unlikely(comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED)) { + spin_lock_irqsave(&cmdq->cq.lock, flags); + comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT; + spin_unlock_irqrestore(&cmdq->cq.lock, flags); + return -ETIME; + } + + return 0; +} + +void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) +{ + *hdr = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod) | + FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op); +} + +int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, + u64 *resp0, u64 *resp1) +{ + struct erdma_comp_wait *comp_wait; + int ret; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) + return -ENODEV; + + down(&cmdq->credits); + + comp_wait = get_comp_wait(cmdq); + if (IS_ERR(comp_wait)) { + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state); + up(&cmdq->credits); + return PTR_ERR(comp_wait); + } + + spin_lock(&cmdq->sq.lock); + push_cmdq_sqe(cmdq, req, req_size, comp_wait); + spin_unlock(&cmdq->sq.lock); + + if (cmdq->use_event) + ret = erdma_wait_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + else + ret = erdma_poll_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + + if (ret) { + set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state); + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + goto out; + } + + if (comp_wait->comp_status) + ret = -EIO; + + if (resp0 && resp1) { + *resp0 = *((u64 *)&comp_wait->comp_data[0]); + *resp1 = *((u64 *)&comp_wait->comp_data[2]); + } + put_comp_wait(cmdq, comp_wait); + +out: + up(&cmdq->credits); + + return ret; +} diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c new file mode 100644 index 000000000000..751c7f9f0de7 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include + +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void *get_next_valid_cqe(struct erdma_cq *cq) +{ + __be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci, + cq->depth, CQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + __be32_to_cpu(READ_ONCE(*cqe))); + + return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL; +} + +static void notify_cq(struct erdma_cq *cq, u8 solcitied) +{ + u64 db_data = + FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) | + FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); + + *cq->kern_cq.db_record = db_data; + writeq(db_data, cq->kern_cq.db); +} + +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->kern_cq.lock, irq_flags); + + notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq)) + ret = 1; + + cq->kern_cq.notify_cnt++; + + spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags); + + return ret; +} + +static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { + [ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE, + [ERDMA_OP_READ] = IB_WC_RDMA_READ, + [ERDMA_OP_SEND] = IB_WC_SEND, + [ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND, + [ERDMA_OP_RECEIVE] = IB_WC_RECV, + [ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM, + [ERDMA_OP_RECV_INV] = IB_WC_RECV, + [ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV, + [ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV, + [ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND, + [ERDMA_OP_REG_MR] = IB_WC_REG_MR, + [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, + [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, +}; + +static const struct { + enum erdma_wc_status erdma; + enum ib_wc_status base; + enum erdma_vendor_err vendor; +} map_cqe_status[ERDMA_NUM_WC_STATUS] = { + { ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_INVALID_RQE }, + { ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_STAG }, + { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, + { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, + { ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_PD }, + { ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_WRAP_ERR }, + { ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR, + ERDMA_WC_VENDOR_INVALID_SQE }, + { ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_ZERO_ORD }, + { ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_STAG }, + { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, + { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, + { ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_PD }, + { ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_WARP_ERR }, + { ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, +}; + +#define ERDMA_POLLCQ_NO_QP 1 + +static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + u8 opcode, syndrome, qtype; + struct erdma_kqp *kern_qp; + struct erdma_cqe *cqe; + struct erdma_qp *qp; + u16 wqe_idx, depth; + u32 qpn, cqe_hdr; + u64 *id_table; + u64 *wqe_hdr; + + cqe = get_next_valid_cqe(cq); + if (!cqe) + return -EAGAIN; + + cq->kern_cq.ci++; + + /* cqbuf should be ready when we poll */ + dma_rmb(); + + qpn = be32_to_cpu(cqe->qpn); + wqe_idx = be32_to_cpu(cqe->qe_idx); + cqe_hdr = be32_to_cpu(cqe->hdr); + + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + return ERDMA_POLLCQ_NO_QP; + + kern_qp = &qp->kern_qp; + + qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr); + syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); + opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); + + if (qtype == ERDMA_CQE_QTYPE_SQ) { + id_table = kern_qp->swr_tbl; + depth = qp->attrs.sq_size; + wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + kern_qp->sq_ci = + FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) + + wqe_idx + 1; + } else { + id_table = kern_qp->rwr_tbl; + depth = qp->attrs.rq_size; + } + wc->wr_id = id_table[wqe_idx & (depth - 1)]; + wc->byte_len = be32_to_cpu(cqe->size); + + wc->wc_flags = 0; + + wc->opcode = wc_mapping_table[opcode]; + if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data)); + wc->wc_flags |= IB_WC_WITH_IMM; + } else if (opcode == ERDMA_OP_RECV_INV) { + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + + if (syndrome >= ERDMA_NUM_WC_STATUS) + syndrome = ERDMA_WC_GENERAL_ERR; + + wc->status = map_cqe_status[syndrome].base; + wc->vendor_err = map_cqe_status[syndrome].vendor; + wc->qp = &qp->ibqp; + + return 0; +} + +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long flags; + int npolled, ret; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + for (npolled = 0; npolled < num_entries;) { + ret = erdma_poll_one_cqe(cq, wc + npolled); + + if (ret == -EAGAIN) /* no received new CQEs. */ + break; + else if (ret) /* ignore invalid CQEs. */ + continue; + + npolled++; + } + + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); + + return npolled; +} diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c new file mode 100644 index 000000000000..8f2d094e0227 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include +#include +#include + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +#define MAX_POLL_CHUNK_SIZE 16 + +void notify_eq(struct erdma_eq *eq) +{ + u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | + FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); + + *eq->db_record = db_data; + writeq(db_data, eq->db_addr); + + atomic64_inc(&eq->notify_num); +} + +void *get_next_valid_eqe(struct erdma_eq *eq) +{ + u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe)); + + return owner ^ !!(eq->ci & eq->depth) ? eqe : NULL; +} + +void erdma_aeq_event_handler(struct erdma_dev *dev) +{ + struct erdma_aeqe *aeqe; + u32 cqn, qpn; + struct erdma_qp *qp; + struct erdma_cq *cq; + struct ib_event event; + u32 poll_cnt = 0; + + memset(&event, 0, sizeof(event)); + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + aeqe = get_next_valid_eqe(&dev->aeq); + if (!aeqe) + break; + + dma_rmb(); + + dev->aeq.ci++; + atomic64_inc(&dev->aeq.event_num); + poll_cnt++; + + if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, + le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) { + cqn = le32_to_cpu(aeqe->event_data0); + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + event.device = cq->ibcq.device; + event.element.cq = &cq->ibcq; + event.event = IB_EVENT_CQ_ERR; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, + cq->ibcq.cq_context); + } else { + qpn = le32_to_cpu(aeqe->event_data0); + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + continue; + + event.device = qp->ibqp.device; + event.element.qp = &qp->ibqp; + event.event = IB_EVENT_QP_FATAL; + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&event, + qp->ibqp.qp_context); + } + } + + notify_eq(&dev->aeq); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + u32 buf_size; + + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + buf_size = eq->depth << EQE_SHIFT; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + + eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, + eq->qbuf_dma_addr + buf_size); + + return 0; +} + +void erdma_aeq_destroy(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, + eq->qbuf_dma_addr); +} + +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) +{ + struct erdma_dev *dev = ceq_cb->dev; + struct erdma_cq *cq; + u32 poll_cnt = 0; + u64 *ceqe; + int cqn; + + if (!ceq_cb->ready) + return; + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + ceqe = get_next_valid_eqe(&ceq_cb->eq); + if (!ceqe) + break; + + dma_rmb(); + ceq_cb->eq.ci++; + poll_cnt++; + cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe)); + + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + if (rdma_is_kernel_res(&cq->ibcq.res)) + cq->kern_cq.cmdsn++; + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + } + + notify_eq(&ceq_cb->eq); +} + +static irqreturn_t erdma_intr_ceq_handler(int irq, void *data) +{ + struct erdma_eq_cb *ceq_cb = data; + + tasklet_schedule(&ceq_cb->tasklet); + + return IRQ_HANDLED; +} + +static void erdma_intr_ceq_task(unsigned long data) +{ + erdma_ceq_completion_handler((struct erdma_eq_cb *)data); +} + +static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; + int err; + + snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn, + pci_name(dev->pdev)); + eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1); + + tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task, + (unsigned long)&dev->ceqs[ceqn]); + + cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node), + &eqc->irq.affinity_hint_mask); + + err = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0, + eqc->irq.name, eqc); + if (err) { + dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err); + return err; + } + + irq_set_affinity_hint(eqc->irq.msix_vector, + &eqc->irq.affinity_hint_mask); + + return 0; +} + +static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; + + irq_set_affinity_hint(eqc->irq.msix_vector, NULL); + free_irq(eqc->irq.msix_vector, eqc); +} + +static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) +{ + struct erdma_cmdq_create_eq_req req; + dma_addr_t db_info_dma_addr; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_CREATE_EQ); + req.eqn = eqn; + req.depth = ilog2(eq->depth); + req.qbuf_addr = eq->qbuf_dma_addr; + req.qtype = ERDMA_EQ_TYPE_CEQ; + /* Vector index is the same as EQN. */ + req.vector_idx = eqn; + db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); + req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); + req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, + sizeof(struct erdma_cmdq_create_eq_req), + NULL, NULL); +} + +static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; + int ret; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + eq->db_addr = + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + + (ceqn + 1) * ERDMA_DB_SIZE); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + eq->ci = 0; + dev->ceqs[ceqn].dev = dev; + + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ + ret = create_eq_cmd(dev, ceqn + 1, eq); + dev->ceqs[ceqn].ready = ret ? false : true; + + return ret; +} + +static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; + struct erdma_cmdq_destroy_eq_req req; + int err; + + dev->ceqs[ceqn].ready = 0; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_DESTROY_EQ); + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ + req.eqn = ceqn + 1; + req.qtype = ERDMA_EQ_TYPE_CEQ; + req.vector_idx = ceqn + 1; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return; + + dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_ceqs_init(struct erdma_dev *dev) +{ + u32 i, j; + int err; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + err = erdma_ceq_init_one(dev, i); + if (err) + goto out_err; + + err = erdma_set_ceq_irq(dev, i); + if (err) { + erdma_ceq_uninit_one(dev, i); + goto out_err; + } + } + + return 0; + +out_err: + for (j = 0; j < i; j++) { + erdma_free_ceq_irq(dev, j); + erdma_ceq_uninit_one(dev, j); + } + + return err; +} + +void erdma_ceqs_uninit(struct erdma_dev *dev) +{ + u32 i; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + erdma_free_ceq_irq(dev, i); + erdma_ceq_uninit_one(dev, i); + } +} diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h new file mode 100644 index 000000000000..b210c49c669f --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_HW_H__ +#define __ERDMA_HW_H__ + +#include +#include + +/* PCIe device related definition. */ +#define PCI_VENDOR_ID_ALIBABA 0x1ded + +#define ERDMA_PCI_WIDTH 64 +#define ERDMA_FUNC_BAR 0 +#define ERDMA_MISX_BAR 2 + +#define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR)) + +/* MSI-X related. */ +#define ERDMA_NUM_MSIX_VEC 32U +#define ERDMA_MSIX_VECTOR_CMDQ 0 + +/* PCIe Bar0 Registers. */ +#define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_CTRL_REG 0x10 +#define ERDMA_REGS_DEV_ST_REG 0x14 +#define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 +#define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C +#define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20 +#define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24 +#define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28 +#define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C +#define ERDMA_REGS_CMDQ_DEPTH_REG 0x30 +#define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34 +#define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38 +#define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C +#define ERDMA_REGS_AEQ_ADDR_L_REG 0x40 +#define ERDMA_REGS_AEQ_ADDR_H_REG 0x44 +#define ERDMA_REGS_AEQ_DEPTH_REG 0x48 +#define ERDMA_REGS_GRP_NUM_REG 0x4c +#define ERDMA_REGS_AEQ_DB_REG 0x50 +#define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60 +#define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68 +#define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70 +#define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78 +#define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80 +#define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88 +#define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90 +#define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98 +#define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0 +#define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8 +#define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0 +#define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8 +#define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0 +#define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8 +#define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0 +#define ERDMA_REGS_CEQ_DB_BASE_REG 0x100 +#define ERDMA_CMDQ_SQDB_REG 0x200 +#define ERDMA_CMDQ_CQDB_REG 0x300 + +/* DEV_CTRL_REG details. */ +#define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001 +#define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002 + +/* DEV_ST_REG details. */ +#define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U +#define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U + +/* eRDMA PCIe DBs definition. */ +#define ERDMA_BAR_DB_SPACE_BASE 4096 + +#define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE +#define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024) + +#define ERDMA_BAR_RQDB_SPACE_OFFSET \ + (ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE) +#define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024) + +#define ERDMA_BAR_CQDB_SPACE_OFFSET \ + (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) + +/* Doorbell page resources related. */ +/* + * Max # of parallelly issued directSQE is 3072 per device, + * hardware organizes this into 24 group, per group has 128 credits. + */ +#define ERDMA_DWQE_MAX_GRP_CNT 24 +#define ERDMA_DWQE_NUM_PER_GRP 128 + +#define ERDMA_DWQE_TYPE0_CNT 64 +#define ERDMA_DWQE_TYPE1_CNT 496 +/* type1 DB contains 2 DBs, takes 256Byte. */ +#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 + +#define ERDMA_SDB_SHARED_PAGE_INDEX 95 + +/* Doorbell related. */ +#define ERDMA_DB_SIZE 8 + +#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56) +#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32) +#define ERDMA_CQDB_ARM_MASK BIT_ULL(31) +#define ERDMA_CQDB_SOL_MASK BIT_ULL(30) +#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28) +#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_EQDB_ARM_MASK BIT(31) +#define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000 + +/* WQE related. */ +#define EQE_SIZE 16 +#define EQE_SHIFT 4 +#define RQE_SIZE 32 +#define RQE_SHIFT 5 +#define CQE_SIZE 32 +#define CQE_SHIFT 5 +#define SQEBB_SIZE 32 +#define SQEBB_SHIFT 5 +#define SQEBB_MASK (~(SQEBB_SIZE - 1)) +#define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK) +#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT) + +#define ERDMA_MAX_SQE_SIZE 128 +#define ERDMA_MAX_WQEBB_PER_SQE 4 + +/* CMDQ related. */ +#define ERDMA_CMDQ_MAX_OUTSTANDING 128 +#define ERDMA_CMDQ_SQE_SIZE 64 + +/* cmdq sub module definition. */ +enum CMDQ_WQE_SUB_MOD { + CMDQ_SUBMOD_RDMA = 0, + CMDQ_SUBMOD_COMMON = 1 +}; + +enum CMDQ_RDMA_OPCODE { + CMDQ_OPCODE_QUERY_DEVICE = 0, + CMDQ_OPCODE_CREATE_QP = 1, + CMDQ_OPCODE_DESTROY_QP = 2, + CMDQ_OPCODE_MODIFY_QP = 3, + CMDQ_OPCODE_CREATE_CQ = 4, + CMDQ_OPCODE_DESTROY_CQ = 5, + CMDQ_OPCODE_REG_MR = 8, + CMDQ_OPCODE_DEREG_MR = 9 +}; + +enum CMDQ_COMMON_OPCODE { + CMDQ_OPCODE_CREATE_EQ = 0, + CMDQ_OPCODE_DESTROY_EQ = 1, + CMDQ_OPCODE_QUERY_FW_INFO = 2, +}; + +/* cmdq-SQE HDR */ +#define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32) +#define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24) +#define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +struct erdma_cmdq_destroy_cq_req { + u64 hdr; + u32 cqn; +}; + +#define ERDMA_EQ_TYPE_AEQ 0 +#define ERDMA_EQ_TYPE_CEQ 1 + +struct erdma_cmdq_create_eq_req { + u64 hdr; + u64 qbuf_addr; + u8 vector_idx; + u8 eqn; + u8 depth; + u8 qtype; + u32 db_dma_addr_l; + u32 db_dma_addr_h; +}; + +struct erdma_cmdq_destroy_eq_req { + u64 hdr; + u64 rsvd0; + u8 vector_idx; + u8 eqn; + u8 rsvd1; + u8 qtype; +}; + +/* create_cq cfg0 */ +#define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) +#define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) +#define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0) + +/* create_cq cfg1 */ +#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) +#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) +#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) + +struct erdma_cmdq_create_cq_req { + u64 hdr; + u32 cfg0; + u32 qbuf_addr_l; + u32 qbuf_addr_h; + u32 cfg1; + u64 cq_db_info_addr; + u32 first_page_offset; +}; + +/* regmr/deregmr cfg0 */ +#define ERDMA_CMD_MR_VALID_MASK BIT(31) +#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20) +#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0) + +/* regmr cfg1 */ +#define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) +#define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6) +#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) +#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) + +/* regmr cfg2 */ +#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) +#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20) +#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0) + +struct erdma_cmdq_reg_mr_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u64 start_va; + u32 size; + u32 cfg2; + u64 phy_addr[4]; +}; + +struct erdma_cmdq_dereg_mr_req { + u64 hdr; + u32 cfg; +}; + +/* modify qp cfg */ +#define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) +#define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) +#define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_modify_qp_req { + u64 hdr; + u32 cfg; + u32 cookie; + __be32 dip; + __be32 sip; + __be16 sport; + __be16 dport; + u32 send_nxt; + u32 recv_nxt; +}; + +/* create qp cfg0 */ +#define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) + +/* create qp cfg1 */ +#define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) + +/* create qp cqn_mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) +#define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) + +/* create qp mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12) +#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) +#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) + +#define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) + +struct erdma_cmdq_create_qp_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 sq_cqn_mtt_cfg; + u32 rq_cqn_mtt_cfg; + u64 sq_buf_addr; + u64 rq_buf_addr; + u32 sq_mtt_cfg; + u32 rq_mtt_cfg; + u64 sq_db_info_dma_addr; + u64 rq_db_info_dma_addr; +}; + +struct erdma_cmdq_destroy_qp_req { + u64 hdr; + u32 qpn; +}; + +/* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) +#define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) + +/* cap qword 1 definition */ +#define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32) +#define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28) +#define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16) +#define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0) + +#define ERDMA_NQP_PER_QBLOCK 1024 + +#define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) + +/* CQE hdr */ +#define ERDMA_CQE_HDR_OWNER_MASK BIT(31) +#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16) +#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8) +#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0) + +#define ERDMA_CQE_QTYPE_SQ 0 +#define ERDMA_CQE_QTYPE_RQ 1 +#define ERDMA_CQE_QTYPE_CMDQ 2 + +struct erdma_cqe { + __be32 hdr; + __be32 qe_idx; + __be32 qpn; + union { + __le32 imm_data; + __be32 inv_rkey; + }; + __be32 size; + __be32 rsvd[3]; +}; + +struct erdma_sge { + __aligned_le64 laddr; + __le32 length; + __le32 lkey; +}; + +/* Receive Queue Element */ +struct erdma_rqe { + __le16 qe_idx; + __le16 rsvd0; + __le32 qpn; + __le32 rsvd1; + __le32 rsvd2; + __le64 to; + __le32 length; + __le32 stag; +}; + +/* SQE */ +#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56) +#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32) +#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27) +#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26) +#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25) +#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24) +#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23) +#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22) +#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +/* REG MR attrs */ +#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) +#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) +#define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) +#define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) + +struct erdma_write_sqe { + __le64 hdr; + __be32 imm_data; + __le32 length; + + __le32 sink_stag; + __le32 sink_to_l; + __le32 sink_to_h; + + __le32 rsvd; + + struct erdma_sge sgl[0]; +}; + +struct erdma_send_sqe { + __le64 hdr; + union { + __be32 imm_data; + __le32 invalid_stag; + }; + + __le32 length; + struct erdma_sge sgl[0]; +}; + +struct erdma_readreq_sqe { + __le64 hdr; + __le32 invalid_stag; + __le32 length; + __le32 sink_stag; + __le32 sink_to_l; + __le32 sink_to_h; + __le32 rsvd; +}; + +struct erdma_reg_mr_sqe { + __le64 hdr; + __le64 addr; + __le32 length; + __le32 stag; + __le32 attrs; + __le32 rsvd; +}; + +/* EQ related. */ +#define ERDMA_DEFAULT_EQ_DEPTH 256 + +/* ceqe */ +#define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) +#define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32) +#define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31) +#define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0) + +/* aeqe */ +#define ERDMA_AEQE_HDR_O_MASK BIT(31) +#define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16) +#define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0) + +#define ERDMA_AE_TYPE_QP_FATAL_EVENT 0 +#define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1 +#define ERDMA_AE_TYPE_ACC_ERR_EVENT 2 +#define ERDMA_AE_TYPE_CQ_ERR 3 +#define ERDMA_AE_TYPE_OTHER_ERROR 4 + +struct erdma_aeqe { + __le32 hdr; + __le32 event_data0; + __le32 event_data1; + __le32 rsvd; +}; + +enum erdma_opcode { + ERDMA_OP_WRITE = 0, + ERDMA_OP_READ = 1, + ERDMA_OP_SEND = 2, + ERDMA_OP_SEND_WITH_IMM = 3, + + ERDMA_OP_RECEIVE = 4, + ERDMA_OP_RECV_IMM = 5, + ERDMA_OP_RECV_INV = 6, + + ERDMA_OP_REQ_ERR = 7, + ERDMA_OP_READ_RESPONSE = 8, + ERDMA_OP_WRITE_WITH_IMM = 9, + + ERDMA_OP_RECV_ERR = 10, + + ERDMA_OP_INVALIDATE = 11, + ERDMA_OP_RSP_SEND_IMM = 12, + ERDMA_OP_SEND_WITH_INV = 13, + + ERDMA_OP_REG_MR = 14, + ERDMA_OP_LOCAL_INV = 15, + ERDMA_OP_READ_WITH_INV = 16, + ERDMA_NUM_OPCODES = 17, + ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 +}; + +enum erdma_wc_status { + ERDMA_WC_SUCCESS = 0, + ERDMA_WC_GENERAL_ERR = 1, + ERDMA_WC_RECV_WQE_FORMAT_ERR = 2, + ERDMA_WC_RECV_STAG_INVALID_ERR = 3, + ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4, + ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5, + ERDMA_WC_RECV_PDID_ERR = 6, + ERDMA_WC_RECV_WARRPING_ERR = 7, + ERDMA_WC_SEND_WQE_FORMAT_ERR = 8, + ERDMA_WC_SEND_WQE_ORD_EXCEED = 9, + ERDMA_WC_SEND_STAG_INVALID_ERR = 10, + ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11, + ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12, + ERDMA_WC_SEND_PDID_ERR = 13, + ERDMA_WC_SEND_WARRPING_ERR = 14, + ERDMA_WC_FLUSH_ERR = 15, + ERDMA_WC_RETRY_EXC_ERR = 16, + ERDMA_NUM_WC_STATUS +}; + +enum erdma_vendor_err { + ERDMA_WC_VENDOR_NO_ERR = 0, + ERDMA_WC_VENDOR_INVALID_RQE = 1, + ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4, + ERDMA_WC_VENDOR_RQE_INVALID_PD = 5, + ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6, + ERDMA_WC_VENDOR_INVALID_SQE = 0x20, + ERDMA_WC_VENDOR_ZERO_ORD = 0x21, + ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32, + ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33, + ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34 +}; + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c new file mode 100644 index 000000000000..07e743d24847 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +MODULE_AUTHOR("Cheng Xu "); +MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb); + + if (dev->netdev == NULL || dev->netdev != netdev) + goto done; + + switch (event) { + case NETDEV_UP: + dev->state = IB_PORT_ACTIVE; + erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); + break; + case NETDEV_DOWN: + dev->state = IB_PORT_DOWN; + erdma_port_event(dev, IB_EVENT_PORT_ERR); + break; + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGEMTU: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGE: + default: + break; + } + +done: + return NOTIFY_OK; +} + +static int erdma_enum_and_get_netdev(struct erdma_dev *dev) +{ + struct net_device *netdev; + int ret = -ENODEV; + + /* Already binded to a net_device, so we skip. */ + if (dev->netdev) + return 0; + + rtnl_lock(); + for_each_netdev(&init_net, netdev) { + /* + * In erdma, the paired netdev and ibdev should have the same + * MAC address. erdma can get the value from its PCIe bar + * registers. Since erdma can not get the paired netdev + * reference directly, we do a traverse here to get the paired + * netdev. + */ + if (ether_addr_equal_unaligned(netdev->perm_addr, + dev->attrs.peer_addr)) { + ret = ib_device_set_netdev(&dev->ibdev, netdev, 1); + if (ret) { + rtnl_unlock(); + ibdev_warn(&dev->ibdev, + "failed (%d) to link netdev", ret); + return ret; + } + + dev->netdev = netdev; + break; + } + } + + rtnl_unlock(); + + return ret; +} + +static int erdma_device_register(struct erdma_dev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + int ret; + + ret = erdma_enum_and_get_netdev(dev); + if (ret) + return ret; + + addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr); + + ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev); + if (ret) { + dev_err(&dev->pdev->dev, + "ib_register_device failed: ret = %d\n", ret); + return ret; + } + + dev->netdev_nb.notifier_call = erdma_netdev_event; + ret = register_netdevice_notifier(&dev->netdev_nb); + if (ret) { + ibdev_err(&dev->ibdev, "failed to register notifier.\n"); + ib_unregister_device(ibdev); + } + + return ret; +} + +static irqreturn_t erdma_comm_irq_handler(int irq, void *data) +{ + struct erdma_dev *dev = data; + + erdma_cmdq_completion_handler(&dev->cmdq); + erdma_aeq_event_handler(dev); + + return IRQ_HANDLED; +} + +static void erdma_dwqe_resource_init(struct erdma_dev *dev) +{ + int total_pages, type0, type1; + + dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); + + if (dev->attrs.grp_num < 4) + dev->attrs.disable_dwqe = true; + else + dev->attrs.disable_dwqe = false; + + /* One page contains 4 goups. */ + total_pages = dev->attrs.grp_num * 4; + + if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { + dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; + type0 = ERDMA_DWQE_TYPE0_CNT; + type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + } else { + type1 = total_pages / 3; + type0 = total_pages - type1 - 1; + } + + dev->attrs.dwqe_pages = type0; + dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; +} + +static int erdma_request_vectors(struct erdma_dev *dev) +{ + int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); + int ret; + + ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX); + if (ret < 0) { + dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n", + ret); + return ret; + } + dev->attrs.irq_num = ret; + + return 0; +} + +static int erdma_comm_irq_init(struct erdma_dev *dev) +{ + snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s", + pci_name(dev->pdev)); + dev->comm_irq.msix_vector = + pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ); + + cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)), + &dev->comm_irq.affinity_hint_mask); + irq_set_affinity_hint(dev->comm_irq.msix_vector, + &dev->comm_irq.affinity_hint_mask); + + return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0, + dev->comm_irq.name, dev); +} + +static void erdma_comm_irq_uninit(struct erdma_dev *dev) +{ + irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL); + free_irq(dev->comm_irq.msix_vector, dev); +} + +static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) +{ + int ret; + + erdma_dwqe_resource_init(dev); + + ret = dma_set_mask_and_coherent(&pdev->dev, + DMA_BIT_MASK(ERDMA_PCI_WIDTH)); + if (ret) + return ret; + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + + return 0; +} + +static void erdma_device_uninit(struct erdma_dev *dev) +{ + u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); + + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); +} + +static const struct pci_device_id erdma_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, + {} +}; + +static int erdma_probe_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev; + int bars, err; + u32 version; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err); + return err; + } + + pci_set_master(pdev); + + dev = ib_alloc_device(erdma_dev, ibdev); + if (!dev) { + dev_err(&pdev->dev, "ib_alloc_device failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + dev->pdev = pdev; + dev->attrs.numa_node = dev_to_node(&pdev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (bars != ERDMA_BAR_MASK || err) { + err = err ? err : -EINVAL; + goto err_ib_device_release; + } + + dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR); + dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR); + + dev->func_bar = + devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len); + if (!dev->func_bar) { + dev_err(&pdev->dev, "devm_ioremap failed.\n"); + err = -EFAULT; + goto err_release_bars; + } + + version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); + if (version == 0) { + /* we knows that it is a non-functional function. */ + err = -ENODEV; + goto err_iounmap_func_bar; + } + + err = erdma_device_init(dev, pdev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_request_vectors(dev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_comm_irq_init(dev); + if (err) + goto err_free_vectors; + + err = erdma_aeq_init(dev); + if (err) + goto err_uninit_comm_irq; + + err = erdma_cmdq_init(dev); + if (err) + goto err_uninit_aeq; + + err = erdma_ceqs_init(dev); + if (err) + goto err_uninit_cmdq; + + erdma_finish_cmdq_init(dev); + + return 0; + +err_uninit_cmdq: + erdma_device_uninit(dev); + erdma_cmdq_destroy(dev); + +err_uninit_aeq: + erdma_aeq_destroy(dev); + +err_uninit_comm_irq: + erdma_comm_irq_uninit(dev); + +err_free_vectors: + pci_free_irq_vectors(dev->pdev); + +err_iounmap_func_bar: + devm_iounmap(&pdev->dev, dev->func_bar); + +err_release_bars: + pci_release_selected_regions(pdev, bars); + +err_ib_device_release: + ib_dealloc_device(&dev->ibdev); + +err_disable_device: + pci_disable_device(pdev); + + return err; +} + +static void erdma_remove_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + erdma_ceqs_uninit(dev); + + erdma_device_uninit(dev); + + erdma_cmdq_destroy(dev); + erdma_aeq_destroy(dev); + erdma_comm_irq_uninit(dev); + pci_free_irq_vectors(dev->pdev); + + devm_iounmap(&pdev->dev, dev->func_bar); + pci_release_selected_regions(pdev, ERDMA_BAR_MASK); + + ib_dealloc_device(&dev->ibdev); + + pci_disable_device(pdev); +} + +#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap) + +static int erdma_dev_attrs_init(struct erdma_dev *dev) +{ + int err; + u64 req_hdr, cap0, cap1; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_DEVICE); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (err) + return err; + + dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0); + dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); + dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); + dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); + dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); + dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); + dev->attrs.max_mr = dev->attrs.max_qp << 1; + dev->attrs.max_cq = dev->attrs.max_qp << 1; + + dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; + dev->attrs.max_ord = ERDMA_MAX_ORD; + dev->attrs.max_ird = ERDMA_MAX_IRD; + dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE; + dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE; + dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD; + dev->attrs.max_pd = ERDMA_MAX_PD; + + dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; + dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_QUERY_FW_INFO); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (!err) + dev->attrs.fw_version = + FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); + + return err; +} + +static int erdma_res_cb_init(struct erdma_dev *dev) +{ + int i, j; + + for (i = 0; i < ERDMA_RES_CNT; i++) { + dev->res_cb[i].next_alloc_idx = 1; + spin_lock_init(&dev->res_cb[i].lock); + dev->res_cb[i].bitmap = + bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL); + if (!dev->res_cb[i].bitmap) + goto err; + } + + return 0; + +err: + for (j = 0; j < i; j++) + bitmap_free(dev->res_cb[j].bitmap); + + return -ENOMEM; +} + +static void erdma_res_cb_free(struct erdma_dev *dev) +{ + int i; + + for (i = 0; i < ERDMA_RES_CNT; i++) + bitmap_free(dev->res_cb[i].bitmap); +} + +static const struct ib_device_ops erdma_device_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_ERDMA, + .uverbs_abi_ver = ERDMA_ABI_VERSION, + + .alloc_mr = erdma_ib_alloc_mr, + .alloc_pd = erdma_alloc_pd, + .alloc_ucontext = erdma_alloc_ucontext, + .create_cq = erdma_create_cq, + .create_qp = erdma_create_qp, + .dealloc_pd = erdma_dealloc_pd, + .dealloc_ucontext = erdma_dealloc_ucontext, + .dereg_mr = erdma_dereg_mr, + .destroy_cq = erdma_destroy_cq, + .destroy_qp = erdma_destroy_qp, + .get_dma_mr = erdma_get_dma_mr, + .get_port_immutable = erdma_get_port_immutable, + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, + .map_mr_sg = erdma_map_mr_sg, + .mmap = erdma_mmap, + .mmap_free = erdma_mmap_free, + .modify_qp = erdma_modify_qp, + .post_recv = erdma_post_recv, + .post_send = erdma_post_send, + .poll_cq = erdma_poll_cq, + .query_device = erdma_query_device, + .query_gid = erdma_query_gid, + .query_port = erdma_query_port, + .query_qp = erdma_query_qp, + .req_notify_cq = erdma_req_notify_cq, + .reg_user_mr = erdma_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp), +}; + +static int erdma_ib_device_add(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + struct ib_device *ibdev = &dev->ibdev; + u64 mac; + int ret; + + ret = erdma_dev_attrs_init(dev); + if (ret) + return ret; + + ibdev->node_type = RDMA_NODE_RNIC; + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); + + /* + * Current model (one-to-one device association): + * One ERDMA device per net_device or, equivalently, + * per physical port. + */ + ibdev->phys_port_cnt = 1; + ibdev->num_comp_vectors = dev->attrs.irq_num - 1; + + ib_set_device_ops(ibdev, &erdma_device_ops); + + INIT_LIST_HEAD(&dev->cep_list); + + spin_lock_init(&dev->lock); + xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1); + dev->next_alloc_cqn = 1; + dev->next_alloc_qpn = 1; + + ret = erdma_res_cb_init(dev); + if (ret) + return ret; + + spin_lock_init(&dev->db_bitmap_lock); + bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); + bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); + + atomic_set(&dev->num_ctx, 0); + + mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); + mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32; + + u64_to_ether_addr(mac, dev->attrs.peer_addr); + + ret = erdma_device_register(dev); + if (ret) + goto err_out; + + return 0; + +err_out: + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); + + erdma_res_cb_free(dev); + + return ret; +} + +static void erdma_ib_device_remove(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + unregister_netdevice_notifier(&dev->netdev_nb); + ib_unregister_device(&dev->ibdev); + + erdma_res_cb_free(dev); + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); +} + +static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = erdma_probe_dev(pdev); + if (ret) + return ret; + + ret = erdma_ib_device_add(pdev); + if (ret) { + erdma_remove_dev(pdev); + return ret; + } + + return 0; +} + +static void erdma_remove(struct pci_dev *pdev) +{ + erdma_ib_device_remove(pdev); + erdma_remove_dev(pdev); +} + +static struct pci_driver erdma_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = erdma_pci_tbl, + .probe = erdma_probe, + .remove = erdma_remove +}; + +MODULE_DEVICE_TABLE(pci, erdma_pci_tbl); + +static __init int erdma_init_module(void) +{ + int ret; + + ret = erdma_cm_init(); + if (ret) + return ret; + + ret = pci_register_driver(&erdma_pci_driver); + if (ret) + erdma_cm_exit(); + + return ret; +} + +static void __exit erdma_exit_module(void) +{ + pci_unregister_driver(&erdma_pci_driver); + + erdma_cm_exit(); +} + +module_init(erdma_init_module); +module_exit(erdma_exit_module); diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c new file mode 100644 index 000000000000..72f08171a28a --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2021, Alibaba Group */ +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include +#include +#include +#include + +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +void erdma_qp_llp_close(struct erdma_qp *qp) +{ + struct erdma_qp_attrs qp_attrs; + + down_write(&qp->state_lock); + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_RTS: + case ERDMA_QP_STATE_RTR: + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_TERMINATE: + qp_attrs.state = ERDMA_QP_STATE_CLOSING; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + break; + case ERDMA_QP_STATE_CLOSING: + qp->attrs.state = ERDMA_QP_STATE_IDLE; + break; + default: + break; + } + + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); +} + +struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) +{ + struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id); + + if (qp) + return &qp->ibqp; + + return NULL; +} + +static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int ret; + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + struct tcp_sock *tp; + struct erdma_cep *cep = qp->cep; + struct sockaddr_storage local_addr, remote_addr; + + if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + return -EINVAL; + + if (!(mask & ERDMA_QP_ATTR_MPA)) + return -EINVAL; + + ret = getname_local(cep->sock, &local_addr); + if (ret < 0) + return ret; + + ret = getname_peer(cep->sock, &remote_addr); + if (ret < 0) + return ret; + + qp->attrs.state = ERDMA_QP_STATE_RTS; + + tp = tcp_sk(qp->cep->sock->sk); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; + req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; + req.dport = to_sockaddr_in(remote_addr).sin_port; + req.sport = to_sockaddr_in(local_addr).sin_port; + + req.send_nxt = tp->snd_nxt; + /* rsvd tcp seq for mpa-rsp in server. */ + if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + req.recv_nxt = tp->rcv_nxt; + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + + qp->attrs.state = attrs->state; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int drop_conn, ret = 0; + + if (!mask) + return 0; + + if (!(mask & ERDMA_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_RTR: + if (attrs->state == ERDMA_QP_STATE_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + qp->attrs.state = ERDMA_QP_STATE_ERROR; + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + } + break; + case ERDMA_QP_STATE_RTS: + drop_conn = 0; + + if (attrs->state == ERDMA_QP_STATE_CLOSING) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { + qp->attrs.state = ERDMA_QP_STATE_TERMINATE; + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + drop_conn = 1; + } + + if (drop_conn) + erdma_qp_cm_drop(qp); + + break; + case ERDMA_QP_STATE_TERMINATE: + if (attrs->state == ERDMA_QP_STATE_ERROR) + qp->attrs.state = ERDMA_QP_STATE_ERROR; + break; + case ERDMA_QP_STATE_CLOSING: + if (attrs->state == ERDMA_QP_STATE_IDLE) { + qp->attrs.state = ERDMA_QP_STATE_IDLE; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + return -ECONNABORTED; + } + break; + default: + break; + } + + return ret; +} + +static void erdma_qp_safe_free(struct kref *ref) +{ + struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); + + complete(&qp->safe_free); +} + +void erdma_qp_put(struct erdma_qp *qp) +{ + WARN_ON(kref_read(&qp->ref) < 1); + kref_put(&qp->ref, erdma_qp_safe_free); +} + +void erdma_qp_get(struct erdma_qp *qp) +{ + kref_get(&qp->ref); +} + +static int fill_inline_data(struct erdma_qp *qp, + const struct ib_send_wr *send_wr, u16 wqe_idx, + u32 sgl_offset, __le32 *length_field) +{ + u32 remain_size, copy_size, data_off, bytes = 0; + char *data; + int i = 0; + + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size, + SQEBB_SHIFT); + + while (i < send_wr->num_sge) { + bytes += send_wr->sg_list[i].length; + if (bytes > (int)ERDMA_MAX_INLINE) + return -EINVAL; + + remain_size = send_wr->sg_list[i].length; + data_off = 0; + + while (1) { + copy_size = min(remain_size, SQEBB_SIZE - sgl_offset); + + memcpy(data + sgl_offset, + (void *)(uintptr_t)send_wr->sg_list[i].addr + + data_off, + copy_size); + remain_size -= copy_size; + data_off += copy_size; + sgl_offset += copy_size; + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + if (!remain_size) + break; + } + + i++; + } + *length_field = cpu_to_le32(bytes); + + return bytes; +} + +static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, + u16 wqe_idx, u32 sgl_offset, __le32 *length_field) +{ + int i = 0; + u32 bytes = 0; + char *sgl; + + if (send_wr->num_sge > qp->dev->attrs.max_send_sge) + return -EINVAL; + + if (sgl_offset & 0xF) + return -EINVAL; + + while (i < send_wr->num_sge) { + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + + bytes += send_wr->sg_list[i].length; + memcpy(sgl + sgl_offset, &send_wr->sg_list[i], + sizeof(struct ib_sge)); + + sgl_offset += sizeof(struct ib_sge); + i++; + } + + *length_field = cpu_to_le32(bytes); + return 0; +} + +static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, + const struct ib_send_wr *send_wr) +{ + u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; + u32 idx = *pi & (qp->attrs.sq_size - 1); + enum ib_wr_opcode op = send_wr->opcode; + struct erdma_readreq_sqe *read_sqe; + struct erdma_reg_mr_sqe *regmr_sge; + struct erdma_write_sqe *write_sqe; + struct erdma_send_sqe *send_sqe; + struct ib_rdma_wr *rdma_wr; + struct erdma_mr *mr; + __le32 *length_field; + u64 wqe_hdr, *entry; + struct ib_sge *sge; + u32 attrs; + int ret; + + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, + SQEBB_SHIFT); + + /* Clear the SQE header section. */ + *entry = 0; + + qp->kern_qp.swr_tbl[idx] = send_wr->wr_id; + flags = send_wr->send_flags; + wqe_hdr = FIELD_PREP( + ERDMA_SQE_HDR_CE_MASK, + ((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, + flags & IB_SEND_SOLICITED ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, + flags & IB_SEND_FENCE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, + flags & IB_SEND_INLINE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)); + + switch (op) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + hw_op = ERDMA_OP_WRITE; + if (op == IB_WR_RDMA_WRITE_WITH_IMM) + hw_op = ERDMA_OP_WRITE_WITH_IMM; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + write_sqe = (struct erdma_write_sqe *)entry; + + write_sqe->imm_data = send_wr->ex.imm_data; + write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey); + write_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(rdma_wr->remote_addr)); + write_sqe->sink_to_l = + cpu_to_le32(lower_32_bits(rdma_wr->remote_addr)); + + length_field = &write_sqe->length; + wqe_size = sizeof(struct erdma_write_sqe); + sgl_offset = wqe_size; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + read_sqe = (struct erdma_readreq_sqe *)entry; + if (unlikely(send_wr->num_sge != 1)) + return -EINVAL; + hw_op = ERDMA_OP_READ; + if (op == IB_WR_RDMA_READ_WITH_INV) { + hw_op = ERDMA_OP_READ_WITH_INV; + read_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length); + read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey); + read_sqe->sink_to_l = + cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr)); + read_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr)); + + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT); + sge->addr = rdma_wr->remote_addr; + sge->lkey = rdma_wr->rkey; + sge->length = send_wr->sg_list[0].length; + wqe_size = sizeof(struct erdma_readreq_sqe) + + send_wr->num_sge * sizeof(struct ib_sge); + + goto out; + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + send_sqe = (struct erdma_send_sqe *)entry; + hw_op = ERDMA_OP_SEND; + if (op == IB_WR_SEND_WITH_IMM) { + hw_op = ERDMA_OP_SEND_WITH_IMM; + send_sqe->imm_data = send_wr->ex.imm_data; + } else if (op == IB_WR_SEND_WITH_INV) { + hw_op = ERDMA_OP_SEND_WITH_INV; + send_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + length_field = &send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe); + sgl_offset = wqe_size; + + break; + case IB_WR_REG_MR: + wqe_hdr |= + FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + mr = to_emr(reg_wr(send_wr)->mr); + + mr->access = ERDMA_MR_ACC_LR | + to_erdma_access_flags(reg_wr(send_wr)->access); + regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); + regmr_sge->length = cpu_to_le32(mr->ibmr.length); + regmr_sge->stag = cpu_to_le32(mr->ibmr.lkey); + attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | + FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | + FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, + mr->mem.mtt_nents); + + if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); + /* Copy SGLs to SQE content to accelerate */ + memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT), + mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents)); + wqe_size = sizeof(struct erdma_reg_mr_sqe) + + MTT_SIZE(mr->mem.mtt_nents); + } else { + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + } + + regmr_sge->attrs = cpu_to_le32(attrs); + goto out; + case IB_WR_LOCAL_INV: + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_LOCAL_INV); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + goto out; + default: + return -EOPNOTSUPP; + } + + if (flags & IB_SEND_INLINE) { + ret = fill_inline_data(qp, send_wr, idx, sgl_offset, + length_field); + if (ret < 0) + return -EINVAL; + wqe_size += ret; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret); + } else { + ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field); + if (ret) + return -EINVAL; + wqe_size += send_wr->num_sge * sizeof(struct ib_sge); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, + send_wr->num_sge); + } + +out: + wqebb_cnt = SQEBB_COUNT(wqe_size); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); + *pi += wqebb_cnt; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi); + + *entry = wqe_hdr; + + return 0; +} + +static void kick_sq_db(struct erdma_qp *qp, u16 pi) +{ + u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | + FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); + + *(u64 *)qp->kern_qp.sq_db_info = db_data; + writeq(db_data, qp->kern_qp.hw_sq_db); +} + +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr) +{ + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + const struct ib_send_wr *wr = send_wr; + unsigned long flags; + u16 sq_pi; + + if (!send_wr) + return -EINVAL; + + spin_lock_irqsave(&qp->lock, flags); + sq_pi = qp->kern_qp.sq_pi; + + while (wr) { + if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) { + ret = -ENOMEM; + *bad_send_wr = send_wr; + break; + } + + ret = erdma_push_one_sqe(qp, &sq_pi, wr); + if (ret) { + *bad_send_wr = wr; + break; + } + qp->kern_qp.sq_pi = sq_pi; + kick_sq_db(qp, sq_pi); + + wr = wr->next; + } + spin_unlock_irqrestore(&qp->lock, flags); + + return ret; +} + +static int erdma_post_recv_one(struct erdma_qp *qp, + const struct ib_recv_wr *recv_wr) +{ + struct erdma_rqe *rqe = + get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi, + qp->attrs.rq_size, RQE_SHIFT); + + rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1); + rqe->qpn = cpu_to_le32(QP_ID(qp)); + + if (recv_wr->num_sge == 0) { + rqe->length = 0; + } else if (recv_wr->num_sge == 1) { + rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey); + rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr); + rqe->length = cpu_to_le32(recv_wr->sg_list[0].length); + } else { + return -EINVAL; + } + + *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); + + qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = + recv_wr->wr_id; + qp->kern_qp.rq_pi++; + + return 0; +} + +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr) +{ + const struct ib_recv_wr *wr = recv_wr; + struct erdma_qp *qp = to_eqp(ibqp); + unsigned long flags; + int ret; + + spin_lock_irqsave(&qp->lock, flags); + + while (wr) { + ret = erdma_post_recv_one(qp, wr); + if (ret) { + *bad_recv_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->lock, flags); + return ret; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c new file mode 100644 index 000000000000..a7a3d42e2016 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -0,0 +1,1460 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +/* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) +{ + struct erdma_cmdq_create_qp_req req; + struct erdma_pd *pd = to_epd(qp->ibqp.pd); + struct erdma_uqp *user_qp; + u64 resp0, resp1; + int err; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK, + ilog2(qp->attrs.sq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK, + ilog2(qp->attrs.rq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT; + + req.sq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + req.rq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + req.rq_mtt_cfg = req.sq_mtt_cfg; + + req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; + req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; + req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + + (qp->attrs.sq_size << SQEBB_SHIFT); + req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + + (qp->attrs.rq_size << RQE_SHIFT); + } else { + user_qp = &qp->user_qp; + req.sq_cqn_mtt_cfg = FIELD_PREP( + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->sq_mtt.page_size) - PAGE_SHIFT); + req.sq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + + req.rq_cqn_mtt_cfg = FIELD_PREP( + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->rq_mtt.page_size) - PAGE_SHIFT); + req.rq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = user_qp->sq_mtt.page_offset; + req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->sq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->sq_mtt.mtt_type); + + req.rq_mtt_cfg = user_qp->rq_mtt.page_offset; + req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->rq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->rq_mtt.mtt_type); + + req.sq_buf_addr = user_qp->sq_mtt.mtt_entry[0]; + req.rq_buf_addr = user_qp->rq_mtt.mtt_entry[0]; + + req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; + req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + } + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), &resp0, + &resp1); + if (!err) + qp->attrs.cookie = + FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); + + return err; +} + +static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) +{ + struct erdma_cmdq_reg_mr_req req; + struct erdma_pd *pd = to_epd(mr->ibmr.pd); + u64 *phy_addr; + int i; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) | + FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); + req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | + FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | + FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); + req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, + ilog2(mr->mem.page_size)) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt); + + if (mr->type == ERDMA_MR_TYPE_DMA) + goto post_cmd; + + if (mr->type == ERDMA_MR_TYPE_NORMAL) { + req.start_va = mr->mem.va; + req.size = mr->mem.len; + } + + if (mr->type == ERDMA_MR_TYPE_FRMR || + mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) { + phy_addr = req.phy_addr; + *phy_addr = mr->mem.mtt_entry[0]; + } else { + phy_addr = req.phy_addr; + for (i = 0; i < mr->mem.mtt_nents; i++) + *phy_addr++ = mr->mem.mtt_entry[i]; + } + +post_cmd: + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) +{ + struct erdma_cmdq_create_cq_req req; + u32 page_size; + struct erdma_mem *mtt; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_CQ); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_DEPTH_MASK, ilog2(cq->depth)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_EQN_MASK, cq->assoc_eqn); + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + page_size = SZ_32M; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(page_size) - PAGE_SHIFT); + req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr); + req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr); + + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + + req.first_page_offset = 0; + req.cq_db_info_addr = + cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + } else { + mtt = &cq->user_cq.qbuf_mtt; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(mtt->page_size) - PAGE_SHIFT); + if (mtt->mtt_nents == 1) { + req.qbuf_addr_l = lower_32_bits(*(u64 *)mtt->mtt_buf); + req.qbuf_addr_h = upper_32_bits(*(u64 *)mtt->mtt_buf); + } else { + req.qbuf_addr_l = lower_32_bits(mtt->mtt_entry[0]); + req.qbuf_addr_h = upper_32_bits(mtt->mtt_entry[0]); + } + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, + mtt->mtt_nents); + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + mtt->mtt_type); + + req.first_page_offset = mtt->page_offset; + req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + } + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&res_cb->lock, flags); + idx = find_next_zero_bit(res_cb->bitmap, res_cb->max_cap, + res_cb->next_alloc_idx); + if (idx == res_cb->max_cap) { + idx = find_first_zero_bit(res_cb->bitmap, res_cb->max_cap); + if (idx == res_cb->max_cap) { + res_cb->next_alloc_idx = 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + return -ENOSPC; + } + } + + set_bit(idx, res_cb->bitmap); + res_cb->next_alloc_idx = idx + 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + + return idx; +} + +static inline void erdma_free_idx(struct erdma_resource_cb *res_cb, u32 idx) +{ + unsigned long flags; + u32 used; + + spin_lock_irqsave(&res_cb->lock, flags); + used = __test_and_clear_bit(idx, res_cb->bitmap); + spin_unlock_irqrestore(&res_cb->lock, flags); + WARN_ON(!used); +} + +static struct rdma_user_mmap_entry * +erdma_user_mmap_entry_insert(struct erdma_ucontext *uctx, void *address, + u32 size, u8 mmap_flag, u64 *mmap_offset) +{ + struct erdma_user_mmap_entry *entry = + kzalloc(sizeof(*entry), GFP_KERNEL); + int ret; + + if (!entry) + return NULL; + + entry->address = (u64)address; + entry->mmap_flag = mmap_flag; + + size = PAGE_ALIGN(size); + + ret = rdma_user_mmap_entry_insert(&uctx->ibucontext, &entry->rdma_entry, + size); + if (ret) { + kfree(entry); + return NULL; + } + + *mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, + struct ib_udata *unused) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(attr, 0, sizeof(*attr)); + + attr->max_mr_size = dev->attrs.max_mr_size; + attr->vendor_id = PCI_VENDOR_ID_ALIBABA; + attr->vendor_part_id = dev->pdev->device; + attr->hw_ver = dev->pdev->revision; + attr->max_qp = dev->attrs.max_qp; + attr->max_qp_wr = min(dev->attrs.max_send_wr, dev->attrs.max_recv_wr); + attr->max_qp_rd_atom = dev->attrs.max_ord; + attr->max_qp_init_rd_atom = dev->attrs.max_ird; + attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird; + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + ibdev->local_dma_lkey = dev->attrs.local_dma_key; + attr->max_send_sge = dev->attrs.max_send_sge; + attr->max_recv_sge = dev->attrs.max_recv_sge; + attr->max_sge_rd = dev->attrs.max_sge_rd; + attr->max_cq = dev->attrs.max_cq; + attr->max_cqe = dev->attrs.max_cqe; + attr->max_mr = dev->attrs.max_mr; + attr->max_pd = dev->attrs.max_pd; + attr->max_mw = dev->attrs.max_mw; + attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; + attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + attr->fw_ver = dev->attrs.fw_version; + + if (dev->netdev) + addrconf_addr_eui48((u8 *)&attr->sys_image_guid, + dev->netdev->dev_addr); + + return 0; +} + +int erdma_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(gid, 0, sizeof(*gid)); + ether_addr_copy(gid->raw, dev->attrs.peer_addr); + + return 0; +} + +int erdma_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *attr) +{ + struct erdma_dev *dev = to_edev(ibdev); + struct net_device *ndev = dev->netdev; + + memset(attr, 0, sizeof(*attr)); + + attr->gid_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->max_msg_sz = -1; + + if (!ndev) + goto out; + + ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); + attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); + attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + if (netif_running(ndev) && netif_carrier_ok(ndev)) + dev->state = IB_PORT_ACTIVE; + else + dev->state = IB_PORT_DOWN; + attr->state = dev->state; + +out: + if (dev->state == IB_PORT_ACTIVE) + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + return 0; +} + +int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, + struct ib_port_immutable *port_immutable) +{ + port_immutable->gid_tbl_len = 1; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + int pdn; + + pdn = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_PD]); + if (pdn < 0) + return pdn; + + pd->pdn = pdn; + + return 0; +} + +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_PD], pd->pdn); + + return 0; +} + +static int erdma_qp_validate_cap(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->cap.max_send_wr > dev->attrs.max_send_wr) || + (attrs->cap.max_recv_wr > dev->attrs.max_recv_wr) || + (attrs->cap.max_send_sge > dev->attrs.max_send_sge) || + (attrs->cap.max_recv_sge > dev->attrs.max_recv_sge) || + (attrs->cap.max_inline_data > ERDMA_MAX_INLINE) || + !attrs->cap.max_send_wr || !attrs->cap.max_recv_wr) { + return -EINVAL; + } + + return 0; +} + +static int erdma_qp_validate_attr(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (attrs->srq) + return -EOPNOTSUPP; + + if (!attrs->send_cq || !attrs->recv_cq) + return -EOPNOTSUPP; + + return 0; +} + +static void free_kernel_qp(struct erdma_qp *qp) +{ + struct erdma_dev *dev = qp->dev; + + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + + if (qp->kern_qp.sq_buf) + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.rq_buf) + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); +} + +static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, + struct ib_qp_init_attr *attrs) +{ + struct erdma_kqp *kqp = &qp->kern_qp; + int size; + + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + kqp->sig_all = 1; + + kqp->sq_pi = 0; + kqp->sq_ci = 0; + kqp->rq_pi = 0; + kqp->rq_ci = 0; + kqp->hw_sq_db = + dev->func_bar + (ERDMA_SDB_SHARED_PAGE_INDEX << PAGE_SHIFT); + kqp->hw_rq_db = dev->func_bar + ERDMA_BAR_RQDB_SPACE_OFFSET; + + kqp->swr_tbl = vmalloc(qp->attrs.sq_size * sizeof(u64)); + kqp->rwr_tbl = vmalloc(qp->attrs.rq_size * sizeof(u64)); + if (!kqp->swr_tbl || !kqp->rwr_tbl) + goto err_out; + + size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->sq_buf_dma_addr, GFP_KERNEL); + if (!kqp->sq_buf) + goto err_out; + + size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->rq_buf_dma_addr, GFP_KERNEL); + if (!kqp->rq_buf) + goto err_out; + + kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); + kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + + return 0; + +err_out: + free_kernel_qp(qp); + return -ENOMEM; +} + +static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem, + u64 start, u64 len, int access, u64 virt, + unsigned long req_page_size, u8 force_indirect_mtt) +{ + struct ib_block_iter biter; + uint64_t *phy_addr = NULL; + int ret = 0; + + mem->umem = ib_umem_get(&dev->ibdev, start, len, access); + if (IS_ERR(mem->umem)) { + ret = PTR_ERR(mem->umem); + mem->umem = NULL; + return ret; + } + + mem->va = virt; + mem->len = len; + mem->page_size = ib_umem_find_best_pgsz(mem->umem, req_page_size, virt); + mem->page_offset = start & (mem->page_size - 1); + mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size); + mem->page_cnt = mem->mtt_nents; + + if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES || + force_indirect_mtt) { + mem->mtt_type = ERDMA_MR_INDIRECT_MTT; + mem->mtt_buf = + alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL); + if (!mem->mtt_buf) { + ret = -ENOMEM; + goto error_ret; + } + phy_addr = mem->mtt_buf; + } else { + mem->mtt_type = ERDMA_MR_INLINE_MTT; + phy_addr = mem->mtt_entry; + } + + rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) { + *phy_addr = rdma_block_iter_dma_address(&biter); + phy_addr++; + } + + if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) { + mem->mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mem->mtt_buf, + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) { + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + mem->mtt_buf = NULL; + ret = -ENOMEM; + goto error_ret; + } + } + + return 0; + +error_ret: + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } + + return ret; +} + +static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem) +{ + if (mem->mtt_buf) { + dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0], + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + } + + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } +} + +static int erdma_map_user_dbrecords(struct erdma_ucontext *ctx, + u64 dbrecords_va, + struct erdma_user_dbrecords_page **dbr_page, + dma_addr_t *dma_addr) +{ + struct erdma_user_dbrecords_page *page = NULL; + int rv = 0; + + mutex_lock(&ctx->dbrecords_page_mutex); + + list_for_each_entry(page, &ctx->dbrecords_page_list, list) + if (page->va == (dbrecords_va & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + rv = -ENOMEM; + goto out; + } + + page->va = (dbrecords_va & PAGE_MASK); + page->refcnt = 0; + + page->umem = ib_umem_get(ctx->ibucontext.device, + dbrecords_va & PAGE_MASK, PAGE_SIZE, 0); + if (IS_ERR(page->umem)) { + rv = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &ctx->dbrecords_page_list); + +found: + *dma_addr = sg_dma_address(page->umem->sgt_append.sgt.sgl) + + (dbrecords_va & ~PAGE_MASK); + *dbr_page = page; + page->refcnt++; + +out: + mutex_unlock(&ctx->dbrecords_page_mutex); + return rv; +} + +static void +erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, + struct erdma_user_dbrecords_page **dbr_page) +{ + if (!ctx || !(*dbr_page)) + return; + + mutex_lock(&ctx->dbrecords_page_mutex); + if (--(*dbr_page)->refcnt == 0) { + list_del(&(*dbr_page)->list); + ib_umem_release((*dbr_page)->umem); + kfree(*dbr_page); + } + + *dbr_page = NULL; + mutex_unlock(&ctx->dbrecords_page_mutex); +} + +static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, + u64 va, u32 len, u64 db_info_va) +{ + dma_addr_t db_info_dma_addr; + u32 rq_offset; + int ret; + + if (len < (PAGE_ALIGN(qp->attrs.sq_size * SQEBB_SIZE) + + qp->attrs.rq_size * RQE_SIZE)) + return -EINVAL; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mtt, va, + qp->attrs.sq_size << SQEBB_SHIFT, 0, va, + (SZ_1M - SZ_4K), 1); + if (ret) + return ret; + + rq_offset = PAGE_ALIGN(qp->attrs.sq_size << SQEBB_SHIFT); + qp->user_qp.rq_offset = rq_offset; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mtt, va + rq_offset, + qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset, + (SZ_1M - SZ_4K), 1); + if (ret) + goto put_sq_mtt; + + ret = erdma_map_user_dbrecords(uctx, db_info_va, + &qp->user_qp.user_dbr_page, + &db_info_dma_addr); + if (ret) + goto put_rq_mtt; + + qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; + qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + + return 0; + +put_rq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + +put_sq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + + return ret; +} + +static void free_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx) +{ + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(uctx, &qp->user_qp.user_dbr_page); +} + +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_ureq_create_qp ureq; + struct erdma_uresp_create_qp uresp; + int ret; + + ret = erdma_qp_validate_cap(dev, attrs); + if (ret) + goto err_out; + + ret = erdma_qp_validate_attr(dev, attrs); + if (ret) + goto err_out; + + qp->scq = to_ecq(attrs->send_cq); + qp->rcq = to_ecq(attrs->recv_cq); + qp->dev = dev; + qp->attrs.cc = dev->attrs.cc; + + init_rwsem(&qp->state_lock); + kref_init(&qp->ref); + init_completion(&qp->safe_free); + + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + if (ret < 0) { + ret = -ENOMEM; + goto err_out; + } + + qp->attrs.sq_size = roundup_pow_of_two(attrs->cap.max_send_wr * + ERDMA_MAX_WQEBB_PER_SQE); + qp->attrs.rq_size = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (uctx) { + ret = ib_copy_from_udata(&ureq, udata, + min(sizeof(ureq), udata->inlen)); + if (ret) + goto err_out_xa; + + ret = init_user_qp(qp, uctx, ureq.qbuf_va, ureq.qbuf_len, + ureq.db_record_va); + if (ret) + goto err_out_xa; + + memset(&uresp, 0, sizeof(uresp)); + + uresp.num_sqe = qp->attrs.sq_size; + uresp.num_rqe = qp->attrs.rq_size; + uresp.qp_id = QP_ID(qp); + uresp.rq_offset = qp->user_qp.rq_offset; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_out_cmd; + } else { + init_kernel_qp(dev, qp, attrs); + } + + qp->attrs.max_send_sge = attrs->cap.max_send_sge; + qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; + qp->attrs.state = ERDMA_QP_STATE_IDLE; + + ret = create_qp_cmd(dev, qp); + if (ret) + goto err_out_cmd; + + spin_lock_init(&qp->lock); + + return 0; + +err_out_cmd: + if (uctx) + free_user_qp(qp, uctx); + else + free_kernel_qp(qp); +err_out_xa: + xa_erase(&dev->qp_xa, QP_ID(qp)); +err_out: + return ret; +} + +static int erdma_create_stag(struct erdma_dev *dev, u32 *stag) +{ + int stag_idx; + + stag_idx = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX]); + if (stag_idx < 0) + return stag_idx; + + /* For now, we always let key field be zero. */ + *stag = (stag_idx << 8); + + return 0; +} + +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct erdma_dev *dev = to_edev(ibpd->device); + struct erdma_mr *mr; + u32 stag; + int ret; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_DMA; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(acc); + ret = regmr_cmd(dev, mr); + if (ret) + goto out_remove_stag; + + return &mr->ibmr; + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibpd->device); + int ret; + u32 stag; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EOPNOTSUPP); + + if (max_num_sg > ERDMA_MR_MAX_MTT_CNT) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_FRMR; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + /* update it in FRMR. */ + mr->access = ERDMA_MR_ACC_LR | ERDMA_MR_ACC_LW | ERDMA_MR_ACC_RR | + ERDMA_MR_ACC_RW; + + mr->mem.page_size = PAGE_SIZE; /* update it later. */ + mr->mem.page_cnt = max_num_sg; + mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT; + mr->mem.mtt_buf = + alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL); + if (!mr->mem.mtt_buf) { + ret = -ENOMEM; + goto out_remove_stag; + } + + mr->mem.mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf, + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) { + ret = -ENOMEM; + goto out_free_mtt; + } + + ret = regmr_cmd(dev, mr); + if (ret) + goto out_dma_unmap; + + return &mr->ibmr; + +out_dma_unmap: + dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0], + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); +out_free_mtt: + free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt)); + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +static int erdma_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct erdma_mr *mr = to_emr(ibmr); + + if (mr->mem.mtt_nents >= mr->mem.page_cnt) + return -1; + + *((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr; + mr->mem.mtt_nents++; + + return 0; +} + +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct erdma_mr *mr = to_emr(ibmr); + int num; + + mr->mem.mtt_nents = 0; + + num = ib_sg_to_pages(&mr->ibmr, sg, sg_nents, sg_offset, + erdma_set_page); + + return num; +} + +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata) +{ + struct erdma_mr *mr = NULL; + struct erdma_dev *dev = to_edev(ibpd->device); + u32 stag; + int ret; + + if (!len || len > dev->attrs.max_mr_size) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt, + SZ_2G - SZ_4K, 0); + if (ret) + goto err_out_free; + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto err_out_put_mtt; + + mr->ibmr.lkey = mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->mem.va = virt; + mr->mem.len = len; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(access); + mr->valid = 1; + mr->type = ERDMA_MR_TYPE_NORMAL; + + ret = regmr_cmd(dev, mr); + if (ret) + goto err_out_mr; + + return &mr->ibmr; + +err_out_mr: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +err_out_put_mtt: + put_mtt_entries(dev, &mr->mem); + +err_out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibmr->device); + struct erdma_cmdq_dereg_mr_req req; + int ret; + + mr = to_emr(ibmr); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DEREG_MR); + + req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); + + ret = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], ibmr->lkey >> 8); + + put_mtt_entries(dev, &mr->mem); + + kfree(mr); + return 0; +} + +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + int err; + struct erdma_cmdq_destroy_cq_req req; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_CQ); + req.cqn = cq->cqn; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } else { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } + + xa_erase(&dev->cq_xa, cq->cqn); + + return 0; +} + +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_qp_attrs qp_attrs; + int err; + struct erdma_cmdq_destroy_qp_req req; + + down_write(&qp->state_lock); + qp_attrs.state = ERDMA_QP_STATE_ERROR; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + up_write(&qp->state_lock); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_QP); + req.qpn = QP_ID(qp); + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + erdma_qp_put(qp); + wait_for_completion(&qp->safe_free); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + } else { + put_mtt_entries(dev, &qp->user_qp.sq_mtt); + put_mtt_entries(dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(ctx, &qp->user_qp.user_dbr_page); + } + + if (qp->cep) + erdma_cep_put(qp->cep); + xa_erase(&dev->qp_xa, QP_ID(qp)); + + return 0; +} + +void erdma_qp_get_ref(struct ib_qp *ibqp) +{ + erdma_qp_get(to_eqp(ibqp)); +} + +void erdma_qp_put_ref(struct ib_qp *ibqp) +{ + erdma_qp_put(to_eqp(ibqp)); +} + +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct erdma_user_mmap_entry *entry; + pgprot_t prot; + int err; + + rdma_entry = rdma_user_mmap_entry_get(ctx, vma); + if (!rdma_entry) + return -EINVAL; + + entry = to_emmap(rdma_entry); + + switch (entry->mmap_flag) { + case ERDMA_MMAP_IO_NC: + /* map doorbell. */ + prot = pgprot_device(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE, + prot, rdma_entry); + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct erdma_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} + +#define ERDMA_SDB_PAGE 0 +#define ERDMA_SDB_ENTRY 1 +#define ERDMA_SDB_SHARED 2 + +static void alloc_db_resources(struct erdma_dev *dev, + struct erdma_ucontext *ctx) +{ + u32 bitmap_idx; + struct erdma_devattr *attrs = &dev->attrs; + + if (attrs->disable_dwqe) + goto alloc_normal_db; + + /* Try to alloc independent SDB page. */ + spin_lock(&dev->db_bitmap_lock); + bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); + if (bitmap_idx != attrs->dwqe_pages) { + set_bit(bitmap_idx, dev->sdb_page); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_PAGE; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = bitmap_idx; + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (bitmap_idx << PAGE_SHIFT); + ctx->sdb_page_off = 0; + + return; + } + + bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); + if (bitmap_idx != attrs->dwqe_entries) { + set_bit(bitmap_idx, dev->sdb_entry); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_ENTRY; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = attrs->dwqe_pages + + bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (ctx->sdb_page_idx << PAGE_SHIFT); + + return; + } + + spin_unlock(&dev->db_bitmap_lock); + +alloc_normal_db: + ctx->sdb_type = ERDMA_SDB_SHARED; + ctx->sdb_idx = 0; + ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; + ctx->sdb_page_off = 0; + + ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); +} + +static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) +{ + rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->cq_db_mmap_entry); +} + +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + struct erdma_dev *dev = to_edev(ibctx->device); + int ret; + struct erdma_uresp_alloc_ctx uresp = {}; + + if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { + ret = -ENOMEM; + goto err_out; + } + + INIT_LIST_HEAD(&ctx->dbrecords_page_list); + mutex_init(&ctx->dbrecords_page_mutex); + + alloc_db_resources(dev, ctx); + + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + + if (udata->outlen < sizeof(uresp)) { + ret = -EINVAL; + goto err_out; + } + + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); + if (!ctx->sq_db_mmap_entry) { + ret = -ENOMEM; + goto err_out; + } + + ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb); + if (!ctx->rq_db_mmap_entry) { + ret = -EINVAL; + goto err_out; + } + + ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); + if (!ctx->cq_db_mmap_entry) { + ret = -EINVAL; + goto err_out; + } + + uresp.dev_id = dev->pdev->device; + uresp.sdb_type = ctx->sdb_type; + uresp.sdb_offset = ctx->sdb_page_off; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_out; + + return 0; + +err_out: + erdma_uctx_user_mmap_entries_remove(ctx); + atomic_dec(&dev->num_ctx); + return ret; +} + +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + struct erdma_dev *dev = to_edev(ibctx->device); + + spin_lock(&dev->db_bitmap_lock); + if (ctx->sdb_type == ERDMA_SDB_PAGE) + clear_bit(ctx->sdb_idx, dev->sdb_page); + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) + clear_bit(ctx->sdb_idx, dev->sdb_entry); + + erdma_uctx_user_mmap_entries_remove(ctx); + + spin_unlock(&dev->db_bitmap_lock); + + atomic_dec(&dev->num_ctx); +} + +static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, + [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, + [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, + [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, + [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, + [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, + [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +}; + +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct erdma_qp_attrs new_attrs; + enum erdma_qp_attr_mask erdma_attr_mask = 0; + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_STATE) { + new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; + + erdma_attr_mask |= ERDMA_QP_ATTR_STATE; + } + + down_write(&qp->state_lock); + + ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); + + up_write(&qp->state_lock); + + return ret; +} + +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct erdma_qp *qp; + struct erdma_dev *dev; + + if (ibqp && qp_attr && qp_init_attr) { + qp = to_eqp(ibqp); + dev = to_edev(ibqp->device); + } else { + return -EINVAL; + } + + qp_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + qp_init_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_send_sge = qp->attrs.max_send_sge; + qp_attr->cap.max_recv_sge = qp->attrs.max_recv_sge; + + qp_attr->path_mtu = ib_mtu_int_to_enum(dev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, + struct erdma_ureq_create_cq *ureq) +{ + int ret; + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mtt, ureq->qbuf_va, + ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K, + 1); + if (ret) + return ret; + + ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, + &cq->user_cq.user_dbr_page, + &cq->user_cq.db_info_dma_addr); + if (ret) + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + + return ret; +} + +static int erdma_init_kernel_cq(struct erdma_cq *cq) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + cq->kern_cq.qbuf = + dma_alloc_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); + if (!cq->kern_cq.qbuf) + return -ENOMEM; + + cq->kern_cq.db_record = + (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + spin_lock_init(&cq->kern_cq.lock); + /* use default cqdb addr */ + cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; + + return 0; +} + +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + unsigned int depth = attr->cqe; + int ret; + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + + if (depth > dev->attrs.max_cqe) + return -EINVAL; + + depth = roundup_pow_of_two(depth); + cq->ibcq.cqe = depth; + cq->depth = depth; + cq->assoc_eqn = attr->comp_vector + 1; + + ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq, + XA_LIMIT(1, dev->attrs.max_cq - 1), + &dev->next_alloc_cqn, GFP_KERNEL); + if (ret < 0) + return ret; + + if (!rdma_is_kernel_res(&ibcq->res)) { + struct erdma_ureq_create_cq ureq; + struct erdma_uresp_create_cq uresp; + + ret = ib_copy_from_udata(&ureq, udata, + min(udata->inlen, sizeof(ureq))); + if (ret) + goto err_out_xa; + + ret = erdma_init_user_cq(ctx, cq, &ureq); + if (ret) + goto err_out_xa; + + uresp.cq_id = cq->cqn; + uresp.num_cqe = depth; + + ret = ib_copy_to_udata(udata, &uresp, + min(sizeof(uresp), udata->outlen)); + if (ret) + goto err_free_res; + } else { + ret = erdma_init_kernel_cq(cq); + if (ret) + goto err_out_xa; + } + + ret = create_cq_cmd(dev, cq); + if (ret) + goto err_free_res; + + return 0; + +err_free_res: + if (!rdma_is_kernel_res(&ibcq->res)) { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } else { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } + +err_out_xa: + xa_erase(&dev->cq_xa, cq->cqn); + + return ret; +} + +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) +{ + struct ib_event event; + + event.device = &dev->ibdev; + event.element.port_num = 1; + event.event = reason; + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h new file mode 100644 index 000000000000..c7baddb1f292 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu */ +/* Kai Shen */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_VERBS_H__ +#define __ERDMA_VERBS_H__ + +#include + +#include +#include +#include + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" + +/* RDMA Capability. */ +#define ERDMA_MAX_PD (128 * 1024) +#define ERDMA_MAX_SEND_WR 4096 +#define ERDMA_MAX_ORD 128 +#define ERDMA_MAX_IRD 128 +#define ERDMA_MAX_SGE_RD 1 +#define ERDMA_MAX_CONTEXT (128 * 1024) +#define ERDMA_MAX_SEND_SGE 6 +#define ERDMA_MAX_RECV_SGE 1 +#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE)) +#define ERDMA_MAX_FRMR_PA 512 + +enum { + ERDMA_MMAP_IO_NC = 0, /* no cache */ +}; + +struct erdma_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u64 address; + u8 mmap_flag; +}; + +struct erdma_ucontext { + struct ib_ucontext ibucontext; + + u32 sdb_type; + u32 sdb_idx; + u32 sdb_page_idx; + u32 sdb_page_off; + u64 sdb; + u64 rdb; + u64 cdb; + + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *cq_db_mmap_entry; + + /* doorbell records */ + struct list_head dbrecords_page_list; + struct mutex dbrecords_page_mutex; +}; + +struct erdma_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +/* + * MemoryRegion definition. + */ +#define ERDMA_MAX_INLINE_MTT_ENTRIES 4 +#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt takes 8 Bytes. */ +#define ERDMA_MR_MAX_MTT_CNT 524288 +#define ERDMA_MTT_ENTRY_SIZE 8 + +#define ERDMA_MR_TYPE_NORMAL 0 +#define ERDMA_MR_TYPE_FRMR 1 +#define ERDMA_MR_TYPE_DMA 2 + +#define ERDMA_MR_INLINE_MTT 0 +#define ERDMA_MR_INDIRECT_MTT 1 + +#define ERDMA_MR_ACC_LR BIT(0) +#define ERDMA_MR_ACC_LW BIT(1) +#define ERDMA_MR_ACC_RR BIT(2) +#define ERDMA_MR_ACC_RW BIT(3) + +static inline u8 to_erdma_access_flags(int access) +{ + return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) | + (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) | + (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0); +} + +struct erdma_mem { + struct ib_umem *umem; + void *mtt_buf; + u32 mtt_type; + u32 page_size; + u32 page_offset; + u32 page_cnt; + u32 mtt_nents; + + u64 va; + u64 len; + + u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES]; +}; + +struct erdma_mr { + struct ib_mr ibmr; + struct erdma_mem mem; + u8 type; + u8 access; + u8 valid; +}; + +struct erdma_user_dbrecords_page { + struct list_head list; + struct ib_umem *umem; + u64 va; + int refcnt; +}; + +struct erdma_uqp { + struct erdma_mem sq_mtt; + struct erdma_mem rq_mtt; + + dma_addr_t sq_db_info_dma_addr; + dma_addr_t rq_db_info_dma_addr; + + struct erdma_user_dbrecords_page *user_dbr_page; + + u32 rq_offset; +}; + +struct erdma_kqp { + u16 sq_pi; + u16 sq_ci; + + u16 rq_pi; + u16 rq_ci; + + u64 *swr_tbl; + u64 *rwr_tbl; + + void __iomem *hw_sq_db; + void __iomem *hw_rq_db; + + void *sq_buf; + dma_addr_t sq_buf_dma_addr; + + void *rq_buf; + dma_addr_t rq_buf_dma_addr; + + void *sq_db_info; + void *rq_db_info; + + u8 sig_all; +}; + +enum erdma_qp_state { + ERDMA_QP_STATE_IDLE = 0, + ERDMA_QP_STATE_RTR = 1, + ERDMA_QP_STATE_RTS = 2, + ERDMA_QP_STATE_CLOSING = 3, + ERDMA_QP_STATE_TERMINATE = 4, + ERDMA_QP_STATE_ERROR = 5, + ERDMA_QP_STATE_UNDEF = 7, + ERDMA_QP_STATE_COUNT = 8 +}; + +enum erdma_qp_attr_mask { + ERDMA_QP_ATTR_STATE = (1 << 0), + ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), + ERDMA_QP_ATTR_ORD = (1 << 3), + ERDMA_QP_ATTR_IRD = (1 << 4), + ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), + ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), + ERDMA_QP_ATTR_MPA = (1 << 7) +}; + +struct erdma_qp_attrs { + enum erdma_qp_state state; + enum erdma_cc_alg cc; /* Congestion control algorithm */ + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 max_send_sge; + u32 max_recv_sge; + u32 cookie; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + u8 qp_type; + u8 pd_len; +}; + +struct erdma_qp { + struct ib_qp ibqp; + struct kref ref; + struct completion safe_free; + struct erdma_dev *dev; + struct erdma_cep *cep; + struct rw_semaphore state_lock; + + union { + struct erdma_kqp kern_qp; + struct erdma_uqp user_qp; + }; + + struct erdma_cq *scq; + struct erdma_cq *rcq; + + struct erdma_qp_attrs attrs; + spinlock_t lock; +}; + +struct erdma_kcq_info { + void *qbuf; + dma_addr_t qbuf_dma_addr; + u32 ci; + u32 cmdsn; + u32 notify_cnt; + + spinlock_t lock; + u8 __iomem *db; + u64 *db_record; +}; + +struct erdma_ucq_info { + struct erdma_mem qbuf_mtt; + struct erdma_user_dbrecords_page *user_dbr_page; + dma_addr_t db_info_dma_addr; +}; + +struct erdma_cq { + struct ib_cq ibcq; + u32 cqn; + + u32 depth; + u32 assoc_eqn; + + union { + struct erdma_kcq_info kern_cq; + struct erdma_ucq_info user_cq; + }; +}; + +#define QP_ID(qp) ((qp)->ibqp.qp_num) + +static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id) +{ + return (struct erdma_qp *)xa_load(&dev->qp_xa, id); +} + +static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) +{ + return (struct erdma_cq *)xa_load(&dev->cq_xa, id); +} + +void erdma_qp_get(struct erdma_qp *qp); +void erdma_qp_put(struct erdma_qp *qp); +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask); +void erdma_qp_llp_close(struct erdma_qp *qp); +void erdma_qp_cm_drop(struct erdma_qp *qp); + +static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct erdma_ucontext, ibucontext); +} + +static inline struct erdma_pd *to_epd(struct ib_pd *pd) +{ + return container_of(pd, struct erdma_pd, ibpd); +} + +static inline struct erdma_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct erdma_mr, ibmr); +} + +static inline struct erdma_qp *to_eqp(struct ib_qp *qp) +{ + return container_of(qp, struct erdma_qp, ibqp); +} + +static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct erdma_cq, ibcq); +} + +static inline struct erdma_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *ibmmap) +{ + return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry); +} + +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data); +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx); +int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, + struct ib_udata *data); +int erdma_get_port_immutable(struct ib_device *dev, u32 port, + struct ib_port_immutable *ib_port_immutable); +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *data); +int erdma_query_port(struct ib_device *dev, u32 port, + struct ib_port_attr *attr); +int erdma_query_gid(struct ib_device *dev, u32 port, int idx, + union ib_gid *gid); +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data); +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *data); +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *data); +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata); +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights); +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data); +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +void erdma_qp_get_ref(struct ib_qp *ibqp); +void erdma_qp_put_ref(struct ib_qp *ibqp); +struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id); +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr); +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg); +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason); + +#endif diff --git a/include/uapi/rdma/erdma-abi.h b/include/uapi/rdma/erdma-abi.h new file mode 100644 index 000000000000..b7a0222f978f --- /dev/null +++ b/include/uapi/rdma/erdma-abi.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* + * Copyright (c) 2020-2022, Alibaba Group. + */ + +#ifndef __ERDMA_USER_H__ +#define __ERDMA_USER_H__ + +#include + +#define ERDMA_ABI_VERSION 1 + +struct erdma_ureq_create_cq { + __aligned_u64 db_record_va; + __aligned_u64 qbuf_va; + __u32 qbuf_len; + __u32 rsvd0; +}; + +struct erdma_uresp_create_cq { + __u32 cq_id; + __u32 num_cqe; +}; + +struct erdma_ureq_create_qp { + __aligned_u64 db_record_va; + __aligned_u64 qbuf_va; + __u32 qbuf_len; + __u32 rsvd0; +}; + +struct erdma_uresp_create_qp { + __u32 qp_id; + __u32 num_sqe; + __u32 num_rqe; + __u32 rq_offset; +}; + +struct erdma_uresp_alloc_ctx { + __u32 dev_id; + __u32 pad; + __u32 sdb_type; + __u32 sdb_offset; + __aligned_u64 sdb; + __aligned_u64 rdb; + __aligned_u64 cdb; +}; + +#endif diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 3072e5d6b692..7dd56210226f 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -250,6 +250,7 @@ enum rdma_driver_id { RDMA_DRIVER_QIB, RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, + RDMA_DRIVER_ERDMA, }; enum ib_uverbs_gid_type {