Merge tag 'rds-odp-for-5.5' of https://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma

Leon Romanovsky says:

====================
Use ODP MRs for kernel ULPs

The following series extends the MR creation routines to allow creation
of user MRs through kernel ULPs acting as a proxy. The immediate use
case is to allow RDS to work over FS-DAX, which requires ODP
(on-demand-paging) MRs; such MRs could not be created prior to this
series.

The first part of this patchset extends RDMA with a special verb,
ib_reg_user_mr(). The common use case for this function is a userspace
application that allocates memory for HCA access, while the
responsibility for registering that memory with the HCA lies with a
kernel ULP. The ULP acts as an agent for the userspace application
(see the registration sketch after the commit metadata below).

The second part provides advise-MR functionality for ULPs. This is an
integral part of ODP flows, used to trigger page faults in advance and
prepare memory before the working set is accessed (a prefetch sketch
follows the ib_advise_mr() hunk further below).

The third part is the actual user of those in-kernel APIs: the RDS ULP.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Commit: ad063075d4 (David S. Miller, 2020-01-21 10:22:51 +01:00)
45 files changed, 560 insertions(+), 255 deletions(-)
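
To make the first part concrete, here is a minimal, hypothetical sketch of a
kernel ULP registering a userspace buffer on behalf of the calling process.
Only ib_reg_user_mr() and the access flags come from the series and existing
verbs; the helper name and its policy are made up for illustration:

	#include <rdma/ib_verbs.h>

	/*
	 * Hypothetical ULP helper: called from a syscall/ioctl path, so
	 * "current" is the application that owns uaddr. The ULP acts as the
	 * application's agent and registers the buffer itself, optionally as
	 * an ODP MR so that FS-DAX memory (which cannot be pinned) is usable.
	 */
	static struct ib_mr *ulp_register_user_buf(struct ib_pd *pd, u64 uaddr,
						   u64 len, bool use_odp)
	{
		int access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
			     IB_ACCESS_REMOTE_WRITE;

		if (use_odp)
			access |= IB_ACCESS_ON_DEMAND;

		/* Returns ERR_PTR(-EINVAL) if ODP is requested but the HCA
		 * does not advertise IB_DEVICE_ON_DEMAND_PAGING.
		 */
		return ib_reg_user_mr(pd, uaddr, len, uaddr, access);
	}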

View file

@@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
- * @udata: userspace context to pin memory for
+ * @device: IB device to connect UMEM
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
  */
-struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 			    size_t size, int access)
 {
-	struct ib_ucontext *context;
 	struct ib_umem *umem;
 	struct page **page_list;
 	unsigned long lock_limit;
@@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	struct scatterlist *sg;
 	unsigned int gup_flags = FOLL_WRITE;
 
-	if (!udata)
-		return ERR_PTR(-EIO);
-	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
-			  ->context;
-	if (!context)
-		return ERR_PTR(-EIO);
-
 	/*
 	 * If the combination of the addr and size requested for this memory
 	 * region causes an integer overflow, return error.
@@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
 	if (!umem)
 		return ERR_PTR(-ENOMEM);
-	umem->ibdev = context->device;
+	umem->ibdev = device;
 	umem->length = size;
 	umem->address = addr;
 	umem->writable = ib_access_writable(access);
@@ -281,7 +272,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 		npages -= ret;
 		sg = ib_umem_add_sg_table(sg, page_list, ret,
-			dma_get_max_seg_size(context->device->dma_device),
+			dma_get_max_seg_size(device->dma_device),
 			&umem->sg_nents);
 	up_read(&mm->mmap_sem);
@@ -289,10 +280,10 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	sg_mark_end(sg);
 
-	umem->nmap = ib_dma_map_sg(context->device,
+	umem->nmap = ib_dma_map_sg(device,
 				   umem->sg_head.sgl,
 				   umem->sg_nents,
 				   DMA_BIDIRECTIONAL);
 	if (!umem->nmap) {
 		ret = -ENOMEM;
@@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 	goto out;
 
 umem_release:
-	__ib_umem_release(context->device, umem, 0);
+	__ib_umem_release(device, umem, 0);
 vma:
 	atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:

View file

@@ -110,15 +110,12 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
  * They exist only to hold the per_mm reference to help the driver create
  * children umems.
  *
- * @udata: udata from the syscall being used to create the umem
+ * @device: IB device to create UMEM
  * @access: ib_reg_mr access flags
  */
-struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
 					       int access)
 {
-	struct ib_ucontext *context =
-		container_of(udata, struct uverbs_attr_bundle, driver_udata)
-			->context;
 	struct ib_umem *umem;
 	struct ib_umem_odp *umem_odp;
 	int ret;
@@ -126,14 +123,11 @@ struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
 	if (access & IB_ACCESS_HUGETLB)
 		return ERR_PTR(-EINVAL);
 
-	if (!context)
-		return ERR_PTR(-EIO);
-
 	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
 	if (!umem_odp)
 		return ERR_PTR(-ENOMEM);
 	umem = &umem_odp->umem;
-	umem->ibdev = context->device;
+	umem->ibdev = device;
 	umem->writable = ib_access_writable(access);
 	umem->owning_mm = current->mm;
 	umem_odp->is_implicit_odp = 1;
@@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
 /**
  * ib_umem_odp_get - Create a umem_odp for a userspace va
  *
- * @udata: userspace context to pin memory for
+ * @device: IB device struct to get UMEM
  * @addr: userspace virtual address to start at
  * @size: length of region to pin
  * @access: IB_ACCESS_xxx flags for memory being pinned
@@ -210,23 +204,14 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
  * pinning, instead, stores the mm for future page fault handling in
  * conjunction with MMU notifiers.
  */
-struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
-				    size_t size, int access,
+struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
+				    unsigned long addr, size_t size, int access,
 				    const struct mmu_interval_notifier_ops *ops)
 {
 	struct ib_umem_odp *umem_odp;
-	struct ib_ucontext *context;
 	struct mm_struct *mm;
 	int ret;
 
-	if (!udata)
-		return ERR_PTR(-EIO);
-	context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
-			  ->context;
-	if (!context)
-		return ERR_PTR(-EIO);
-
 	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
 		return ERR_PTR(-EINVAL);
@@ -234,7 +219,7 @@ struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
 	if (!umem_odp)
 		return ERR_PTR(-ENOMEM);
-	umem_odp->umem.ibdev = context->device;
+	umem_odp->umem.ibdev = device;
 	umem_odp->umem.length = size;
 	umem_odp->umem.address = addr;
 	umem_odp->umem.writable = ib_access_writable(access);

View file

@@ -1990,6 +1990,47 @@ EXPORT_SYMBOL(ib_resize_cq);
 
 /* Memory regions */
 
+struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			     u64 virt_addr, int access_flags)
+{
+	struct ib_mr *mr;
+
+	if (access_flags & IB_ACCESS_ON_DEMAND) {
+		if (!(pd->device->attrs.device_cap_flags &
+		      IB_DEVICE_ON_DEMAND_PAGING)) {
+			pr_debug("ODP support not available\n");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
+					 access_flags, NULL);
+
+	if (IS_ERR(mr))
+		return mr;
+
+	mr->device = pd->device;
+	mr->pd = pd;
+	mr->dm = NULL;
+	atomic_inc(&pd->usecnt);
+	mr->res.type = RDMA_RESTRACK_MR;
+	rdma_restrack_kadd(&mr->res);
+
+	return mr;
+}
+EXPORT_SYMBOL(ib_reg_user_mr);
+
+int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+		 u32 flags, struct ib_sge *sg_list, u32 num_sge)
+{
+	if (!pd->device->ops.advise_mr)
+		return -EOPNOTSUPP;
+
+	return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
+					 NULL);
+}
+EXPORT_SYMBOL(ib_advise_mr);
+
 int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
 	struct ib_pd *pd = mr->pd;
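
As a usage illustration of the advise-MR verb added above (the wrapper below
is hypothetical, not part of the patch), a ULP could prefault an ODP MR before
pushing its working set through it:

	/*
	 * Hypothetical sketch: ask the device to prefetch (prefault) a range
	 * of an ODP MR so the first RDMA access does not stall on page faults.
	 */
	static int ulp_prefetch_mr(struct ib_pd *pd, struct ib_mr *mr,
				   u64 addr, u32 length)
	{
		struct ib_sge sge = {
			.addr	= addr,
			.length	= length,
			.lkey	= mr->lkey,
		};

		return ib_advise_mr(pd, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
				    IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
	}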

View file

@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
bytes += (qplib_qp->sq.max_wqe * psn_sz); bytes += (qplib_qp->sq.max_wqe * psn_sz);
} }
bytes = PAGE_ALIGN(bytes); bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE); umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) if (IS_ERR(umem))
return PTR_ERR(umem); return PTR_ERR(umem);
@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
if (!qp->qplib_qp.srq) { if (!qp->qplib_qp.srq) {
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes); bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.qprva, bytes, umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes,
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) if (IS_ERR(umem))
goto rqfail; goto rqfail;
@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
bytes = PAGE_ALIGN(bytes); bytes = PAGE_ALIGN(bytes);
umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE); umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes,
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) if (IS_ERR(umem))
return PTR_ERR(umem); return PTR_ERR(umem);
@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
goto fail; goto fail;
} }
cq->umem = ib_umem_get(udata, req.cq_va, cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va,
entries * sizeof(struct cq_base), entries * sizeof(struct cq_base),
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->umem)) { if (IS_ERR(cq->umem)) {
@ -3514,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
/* The fixed portion of the rkey is the same as the lkey */ /* The fixed portion of the rkey is the same as the lkey */
mr->ib_mr.rkey = mr->qplib_mr.rkey; mr->ib_mr.rkey = mr->qplib_mr.rkey;
umem = ib_umem_get(udata, start, length, mr_access_flags); umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
if (IS_ERR(umem)) { if (IS_ERR(umem)) {
dev_err(rdev_to_dev(rdev), "Failed to get umem"); dev_err(rdev_to_dev(rdev), "Failed to get umem");
rc = -EFAULT; rc = -EFAULT;

View file

@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
mhp->rhp = rhp; mhp->rhp = rhp;
mhp->umem = ib_umem_get(udata, start, length, acc); mhp->umem = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(mhp->umem)) if (IS_ERR(mhp->umem))
goto err_free_skb; goto err_free_skb;

View file

@@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 	int inline_size;
 	int err;
 
-	if (udata->inlen &&
+	if (udata && udata->inlen &&
 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
 		ibdev_dbg(&dev->ibdev,
 			  "Incompatible ABI params, udata not cleared\n");
@@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
 		goto err_out;
 	}
 
-	mr->umem = ib_umem_get(udata, start, length, access_flags);
+	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
 	if (IS_ERR(mr->umem)) {
 		err = PTR_ERR(mr->umem);
 		ibdev_dbg(&dev->ibdev,

View file

@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
u32 npages; u32 npages;
int ret; int ret;
*umem = ib_umem_get(udata, ucmd.buf_addr, buf->size, *umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size,
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem)) if (IS_ERR(*umem))
return PTR_ERR(*umem); return PTR_ERR(*umem);

View file

@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context,
refcount_set(&page->refcount, 1); refcount_set(&page->refcount, 1);
page->user_virt = page_addr; page->user_virt = page_addr;
page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0); page->umem = ib_umem_get(context->ibucontext.device, page_addr,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) { if (IS_ERR(page->umem)) {
ret = PTR_ERR(page->umem); ret = PTR_ERR(page->umem);
kfree(page); kfree(page);

View file

@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr) if (!mr)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
mr->umem = ib_umem_get(udata, start, length, access_flags); mr->umem = ib_umem_get(pd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem); ret = PTR_ERR(mr->umem);
goto err_free; goto err_free;
@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
} }
ib_umem_release(mr->umem); ib_umem_release(mr->umem);
mr->umem = ib_umem_get(udata, start, length, mr_access_flags); mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
ret = PTR_ERR(mr->umem); ret = PTR_ERR(mr->umem);
mr->umem = NULL; mr->umem = NULL;

View file

@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
goto err_alloc_rq_inline_buf; goto err_alloc_rq_inline_buf;
} }
hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr, hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr,
hr_qp->buff_size, 0); hr_qp->buff_size, 0);
if (IS_ERR(hr_qp->umem)) { if (IS_ERR(hr_qp->umem)) {
dev_err(dev, "ib_umem_get error for create qp\n"); dev_err(dev, "ib_umem_get error for create qp\n");

View file

@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
return -EFAULT; return -EFAULT;
srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0); srq->umem =
ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0);
if (IS_ERR(srq->umem)) if (IS_ERR(srq->umem))
return PTR_ERR(srq->umem); return PTR_ERR(srq->umem);
@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
goto err_user_srq_mtt; goto err_user_srq_mtt;
/* config index queue BA */ /* config index queue BA */
srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr, srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr,
srq->idx_que.buf_size, 0); srq->idx_que.buf_size, 0);
if (IS_ERR(srq->idx_que.umem)) { if (IS_ERR(srq->idx_que.umem)) {
dev_err(hr_dev->dev, "ib_umem_get error for index queue\n"); dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");

View file

@@ -1756,12 +1756,15 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
 	int ret;
 	int pg_shift;
 
+	if (!udata)
+		return ERR_PTR(-EOPNOTSUPP);
+
 	if (iwdev->closing)
 		return ERR_PTR(-ENODEV);
 
 	if (length > I40IW_MAX_MR_SIZE)
 		return ERR_PTR(-EINVAL);
-	region = ib_umem_get(udata, start, length, acc);
+	region = ib_umem_get(pd->device, start, length, acc);
 	if (IS_ERR(region))
 		return (struct ib_mr *)region;

View file

@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
int shift; int shift;
int n; int n;
*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size, *umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(*umem)) if (IS_ERR(*umem))
return PTR_ERR(*umem); return PTR_ERR(*umem);

View file

@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
page->user_virt = (virt & PAGE_MASK); page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0; page->refcnt = 0;
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) { if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem); err = PTR_ERR(page->umem);
kfree(page); kfree(page);

View file

@ -367,7 +367,7 @@ int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
return block_shift; return block_shift;
} }
static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start, static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start,
u64 length, int access_flags) u64 length, int access_flags)
{ {
/* /*
@ -398,7 +398,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_udata *udata, u64 start,
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
} }
return ib_umem_get(udata, start, length, access_flags); return ib_umem_get(device, start, length, access_flags);
} }
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@ -415,7 +415,7 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr) if (!mr)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
mr->umem = mlx4_get_umem_mr(udata, start, length, access_flags); mr->umem = mlx4_get_umem_mr(pd->device, start, length, access_flags);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem); err = PTR_ERR(mr->umem);
goto err_free; goto err_free;
@ -504,7 +504,7 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
ib_umem_release(mmr->umem); ib_umem_release(mmr->umem);
mmr->umem = mlx4_get_umem_mr(udata, start, length, mmr->umem = mlx4_get_umem_mr(mr->device, start, length,
mr_access_flags); mr_access_flags);
if (IS_ERR(mmr->umem)) { if (IS_ERR(mmr->umem)) {
err = PTR_ERR(mmr->umem); err = PTR_ERR(mmr->umem);

View file

@ -916,7 +916,7 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
(qp->sq.wqe_cnt << qp->sq.wqe_shift); (qp->sq.wqe_cnt << qp->sq.wqe_shift);
qp->umem = ib_umem_get(udata, wq.buf_addr, qp->buf_size, 0); qp->umem = ib_umem_get(pd->device, wq.buf_addr, qp->buf_size, 0);
if (IS_ERR(qp->umem)) { if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem); err = PTR_ERR(qp->umem);
goto err; goto err;
@ -1110,7 +1110,8 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
if (err) if (err)
goto err; goto err;
qp->umem = ib_umem_get(udata, ucmd.buf_addr, qp->buf_size, 0); qp->umem =
ib_umem_get(pd->device, ucmd.buf_addr, qp->buf_size, 0);
if (IS_ERR(qp->umem)) { if (IS_ERR(qp->umem)) {
err = PTR_ERR(qp->umem); err = PTR_ERR(qp->umem);
goto err; goto err;

View file

@ -110,7 +110,8 @@ int mlx4_ib_create_srq(struct ib_srq *ib_srq,
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
return -EFAULT; return -EFAULT;
srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); srq->umem =
ib_umem_get(ib_srq->device, ucmd.buf_addr, buf_size, 0);
if (IS_ERR(srq->umem)) if (IS_ERR(srq->umem))
return PTR_ERR(srq->umem); return PTR_ERR(srq->umem);

View file

@ -708,8 +708,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
*cqe_size = ucmd.cqe_size; *cqe_size = ucmd.cqe_size;
cq->buf.umem = cq->buf.umem =
ib_umem_get(udata, ucmd.buf_addr, entries * ucmd.cqe_size, ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
IB_ACCESS_LOCAL_WRITE); entries * ucmd.cqe_size, IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->buf.umem)) { if (IS_ERR(cq->buf.umem)) {
err = PTR_ERR(cq->buf.umem); err = PTR_ERR(cq->buf.umem);
return err; return err;
@ -1108,7 +1108,7 @@ static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1) if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
return -EINVAL; return -EINVAL;
umem = ib_umem_get(udata, ucmd.buf_addr, umem = ib_umem_get(&dev->ib_dev, ucmd.buf_addr,
(size_t)ucmd.cqe_size * entries, (size_t)ucmd.cqe_size * entries,
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(umem)) { if (IS_ERR(umem)) {

View file

@ -2134,7 +2134,7 @@ static int devx_umem_get(struct mlx5_ib_dev *dev, struct ib_ucontext *ucontext,
if (err) if (err)
return err; return err;
obj->umem = ib_umem_get(&attrs->driver_udata, addr, size, access); obj->umem = ib_umem_get(&dev->ib_dev, addr, size, access);
if (IS_ERR(obj->umem)) if (IS_ERR(obj->umem))
return PTR_ERR(obj->umem); return PTR_ERR(obj->umem);

View file

@ -64,7 +64,8 @@ int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context,
page->user_virt = (virt & PAGE_MASK); page->user_virt = (virt & PAGE_MASK);
page->refcnt = 0; page->refcnt = 0;
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0); page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
PAGE_SIZE, 0);
if (IS_ERR(page->umem)) { if (IS_ERR(page->umem)) {
err = PTR_ERR(page->umem); err = PTR_ERR(page->umem);
kfree(page); kfree(page);

View file

@ -815,6 +815,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
struct ib_device_attr *props, struct ib_device_attr *props,
struct ib_udata *uhw) struct ib_udata *uhw)
{ {
size_t uhw_outlen = (uhw) ? uhw->outlen : 0;
struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_dev *dev = to_mdev(ibdev);
struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_core_dev *mdev = dev->mdev;
int err = -ENOMEM; int err = -ENOMEM;
@ -828,12 +829,12 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
u64 max_tso; u64 max_tso;
resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length); resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
if (uhw->outlen && uhw->outlen < resp_len) if (uhw_outlen && uhw_outlen < resp_len)
return -EINVAL; return -EINVAL;
resp.response_length = resp_len; resp.response_length = resp_len;
if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen)) if (uhw && uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
return -EINVAL; return -EINVAL;
memset(props, 0, sizeof(*props)); memset(props, 0, sizeof(*props));
@ -897,7 +898,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
props->raw_packet_caps |= props->raw_packet_caps |=
IB_RAW_PACKET_CAP_CVLAN_STRIPPING; IB_RAW_PACKET_CAP_CVLAN_STRIPPING;
if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { if (field_avail(typeof(resp), tso_caps, uhw_outlen)) {
max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
if (max_tso) { if (max_tso) {
resp.tso_caps.max_tso = 1 << max_tso; resp.tso_caps.max_tso = 1 << max_tso;
@ -907,7 +908,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
} }
} }
if (field_avail(typeof(resp), rss_caps, uhw->outlen)) { if (field_avail(typeof(resp), rss_caps, uhw_outlen)) {
resp.rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function =
MLX5_RX_HASH_FUNC_TOEPLITZ; MLX5_RX_HASH_FUNC_TOEPLITZ;
resp.rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask =
@ -927,9 +928,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.response_length += sizeof(resp.rss_caps); resp.response_length += sizeof(resp.rss_caps);
} }
} else { } else {
if (field_avail(typeof(resp), tso_caps, uhw->outlen)) if (field_avail(typeof(resp), tso_caps, uhw_outlen))
resp.response_length += sizeof(resp.tso_caps); resp.response_length += sizeof(resp.tso_caps);
if (field_avail(typeof(resp), rss_caps, uhw->outlen)) if (field_avail(typeof(resp), rss_caps, uhw_outlen))
resp.response_length += sizeof(resp.rss_caps); resp.response_length += sizeof(resp.rss_caps);
} }
@ -1014,6 +1015,23 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
if (dev->odp_caps.general_caps & IB_ODP_SUPPORT) if (dev->odp_caps.general_caps & IB_ODP_SUPPORT)
props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING; props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
props->odp_caps = dev->odp_caps; props->odp_caps = dev->odp_caps;
if (!uhw) {
/* ODP for kernel QPs is not implemented for receive
* WQEs and SRQ WQEs
*/
props->odp_caps.per_transport_caps.rc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.uc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.ud_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
props->odp_caps.per_transport_caps.xrc_odp_caps &=
~(IB_ODP_SUPPORT_READ |
IB_ODP_SUPPORT_SRQ_RECV);
}
} }
if (MLX5_CAP_GEN(mdev, cd)) if (MLX5_CAP_GEN(mdev, cd))
@ -1054,7 +1072,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_MAX_CQ_PERIOD; MLX5_MAX_CQ_PERIOD;
} }
if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) {
resp.response_length += sizeof(resp.cqe_comp_caps); resp.response_length += sizeof(resp.cqe_comp_caps);
if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) {
@ -1072,7 +1090,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
} }
} }
if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) &&
raw_support) { raw_support) {
if (MLX5_CAP_QOS(mdev, packet_pacing) && if (MLX5_CAP_QOS(mdev, packet_pacing) &&
MLX5_CAP_GEN(mdev, qos)) { MLX5_CAP_GEN(mdev, qos)) {
@ -1091,7 +1109,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
} }
if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes,
uhw->outlen)) { uhw_outlen)) {
if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe))
resp.mlx5_ib_support_multi_pkt_send_wqes = resp.mlx5_ib_support_multi_pkt_send_wqes =
MLX5_IB_ALLOW_MPW; MLX5_IB_ALLOW_MPW;
@ -1104,7 +1122,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes);
} }
if (field_avail(typeof(resp), flags, uhw->outlen)) { if (field_avail(typeof(resp), flags, uhw_outlen)) {
resp.response_length += sizeof(resp.flags); resp.response_length += sizeof(resp.flags);
if (MLX5_CAP_GEN(mdev, cqe_compression_128)) if (MLX5_CAP_GEN(mdev, cqe_compression_128))
@ -1120,8 +1138,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
} }
if (field_avail(typeof(resp), sw_parsing_caps, if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) {
uhw->outlen)) {
resp.response_length += sizeof(resp.sw_parsing_caps); resp.response_length += sizeof(resp.sw_parsing_caps);
if (MLX5_CAP_ETH(mdev, swp)) { if (MLX5_CAP_ETH(mdev, swp)) {
resp.sw_parsing_caps.sw_parsing_offloads |= resp.sw_parsing_caps.sw_parsing_offloads |=
@ -1141,7 +1158,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
} }
} }
if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) &&
raw_support) { raw_support) {
resp.response_length += sizeof(resp.striding_rq_caps); resp.response_length += sizeof(resp.striding_rq_caps);
if (MLX5_CAP_GEN(mdev, striding_rq)) { if (MLX5_CAP_GEN(mdev, striding_rq)) {
@ -1164,8 +1181,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
} }
} }
if (field_avail(typeof(resp), tunnel_offloads_caps, if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) {
uhw->outlen)) {
resp.response_length += sizeof(resp.tunnel_offloads_caps); resp.response_length += sizeof(resp.tunnel_offloads_caps);
if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan))
resp.tunnel_offloads_caps |= resp.tunnel_offloads_caps |=
@ -1186,7 +1202,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP;
} }
if (uhw->outlen) { if (uhw_outlen) {
err = ib_copy_to_udata(uhw, &resp, resp.response_length); err = ib_copy_to_udata(uhw, &resp, resp.response_length);
if (err) if (err)
@ -4773,7 +4789,6 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
struct ib_device_attr *dprops = NULL; struct ib_device_attr *dprops = NULL;
struct ib_port_attr *pprops = NULL; struct ib_port_attr *pprops = NULL;
int err = -ENOMEM; int err = -ENOMEM;
struct ib_udata uhw = {.inlen = 0, .outlen = 0};
pprops = kzalloc(sizeof(*pprops), GFP_KERNEL); pprops = kzalloc(sizeof(*pprops), GFP_KERNEL);
if (!pprops) if (!pprops)
@ -4783,7 +4798,7 @@ static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
if (!dprops) if (!dprops)
goto out; goto out;
err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); err = mlx5_ib_query_device(&dev->ib_dev, dprops, NULL);
if (err) { if (err) {
mlx5_ib_warn(dev, "query_device failed %d\n", err); mlx5_ib_warn(dev, "query_device failed %d\n", err);
goto out; goto out;

View file

@ -1153,12 +1153,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
const struct ib_send_wr **bad_wr); const struct ib_send_wr **bad_wr);
int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
const struct ib_recv_wr **bad_wr); const struct ib_recv_wr **bad_wr);
int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int buflen, size_t *bc); size_t buflen, size_t *bc);
int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer, int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int buflen, size_t *bc); size_t buflen, size_t *bc);
int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
void *buffer, int buflen, size_t *bc); size_t buflen, size_t *bc);
int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
struct ib_udata *udata); struct ib_udata *udata);
void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);

View file

@ -737,10 +737,9 @@ static int mr_cache_max_order(struct mlx5_ib_dev *dev)
return MLX5_MAX_UMR_SHIFT; return MLX5_MAX_UMR_SHIFT;
} }
static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata, static int mr_umem_get(struct mlx5_ib_dev *dev, u64 start, u64 length,
u64 start, u64 length, int access_flags, int access_flags, struct ib_umem **umem, int *npages,
struct ib_umem **umem, int *npages, int *page_shift, int *page_shift, int *ncont, int *order)
int *ncont, int *order)
{ {
struct ib_umem *u; struct ib_umem *u;
@ -749,7 +748,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (access_flags & IB_ACCESS_ON_DEMAND) { if (access_flags & IB_ACCESS_ON_DEMAND) {
struct ib_umem_odp *odp; struct ib_umem_odp *odp;
odp = ib_umem_odp_get(udata, start, length, access_flags, odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
&mlx5_mn_ops); &mlx5_mn_ops);
if (IS_ERR(odp)) { if (IS_ERR(odp)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n", mlx5_ib_dbg(dev, "umem get failed (%ld)\n",
@ -765,7 +764,7 @@ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
if (order) if (order)
*order = ilog2(roundup_pow_of_two(*ncont)); *order = ilog2(roundup_pow_of_two(*ncont));
} else { } else {
u = ib_umem_get(udata, start, length, access_flags); u = ib_umem_get(&dev->ib_dev, start, length, access_flags);
if (IS_ERR(u)) { if (IS_ERR(u)) {
mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u)); mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(u));
return PTR_ERR(u); return PTR_ERR(u);
@ -1247,6 +1246,8 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start && if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
length == U64_MAX) { length == U64_MAX) {
if (virt_addr != start)
return ERR_PTR(-EINVAL);
if (!(access_flags & IB_ACCESS_ON_DEMAND) || if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
@ -1257,7 +1258,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return &mr->ibmr; return &mr->ibmr;
} }
err = mr_umem_get(dev, udata, start, length, access_flags, &umem, err = mr_umem_get(dev, start, length, access_flags, &umem,
&npages, &page_shift, &ncont, &order); &npages, &page_shift, &ncont, &order);
if (err < 0) if (err < 0)
@ -1424,9 +1425,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
flags |= IB_MR_REREG_TRANS; flags |= IB_MR_REREG_TRANS;
ib_umem_release(mr->umem); ib_umem_release(mr->umem);
mr->umem = NULL; mr->umem = NULL;
err = mr_umem_get(dev, udata, addr, len, access_flags, err = mr_umem_get(dev, addr, len, access_flags, &mr->umem,
&mr->umem, &npages, &page_shift, &ncont, &npages, &page_shift, &ncont, &order);
&order);
if (err) if (err)
goto err; goto err;
} }

View file

@ -497,7 +497,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
struct mlx5_ib_mr *imr; struct mlx5_ib_mr *imr;
int err; int err;
umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
if (IS_ERR(umem_odp)) if (IS_ERR(umem_odp))
return ERR_CAST(umem_odp); return ERR_CAST(umem_odp);
@ -624,11 +624,10 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
unsigned long current_seq; unsigned long current_seq;
u64 access_mask; u64 access_mask;
u64 start_idx, page_mask; u64 start_idx;
page_shift = odp->page_shift; page_shift = odp->page_shift;
page_mask = ~(BIT(page_shift) - 1); start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift;
access_mask = ODP_READ_ALLOWED_BIT; access_mask = ODP_READ_ALLOWED_BIT;
if (odp->umem.writable && !downgrade) if (odp->umem.writable && !downgrade)
@ -767,11 +766,19 @@ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
{ {
struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
if (unlikely(io_virt < mr->mmkey.iova))
return -EFAULT;
if (!odp->is_implicit_odp) { if (!odp->is_implicit_odp) {
if (unlikely(io_virt < ib_umem_start(odp) || u64 user_va;
ib_umem_end(odp) - io_virt < bcnt))
if (check_add_overflow(io_virt - mr->mmkey.iova,
(u64)odp->umem.address, &user_va))
return -EFAULT; return -EFAULT;
return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, if (unlikely(user_va >= ib_umem_end(odp) ||
ib_umem_end(odp) - user_va < bcnt))
return -EFAULT;
return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
flags); flags);
} }
return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
@ -1237,15 +1244,15 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
wqe = wqe_start; wqe = wqe_start;
qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL; qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
if (qp && sq) { if (qp && sq) {
ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE, ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied); &bytes_copied);
if (ret) if (ret)
goto read_user; goto read_user;
ret = mlx5_ib_mr_initiator_pfault_handler( ret = mlx5_ib_mr_initiator_pfault_handler(
dev, pfault, qp, &wqe, &wqe_end, bytes_copied); dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
} else if (qp && !sq) { } else if (qp && !sq) {
ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE, ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
&bytes_copied); &bytes_copied);
if (ret) if (ret)
goto read_user; goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_rq( ret = mlx5_ib_mr_responder_pfault_handler_rq(
@ -1253,8 +1260,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
} else if (!qp) { } else if (!qp) {
struct mlx5_ib_srq *srq = res_to_srq(res); struct mlx5_ib_srq *srq = res_to_srq(res);
ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE, ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
&bytes_copied); &bytes_copied);
if (ret) if (ret)
goto read_user; goto read_user;
ret = mlx5_ib_mr_responder_pfault_handler_srq( ret = mlx5_ib_mr_responder_pfault_handler_srq(

View file

@ -129,14 +129,10 @@ static int is_sqp(enum ib_qp_type qp_type)
* *
* Return: zero on success, or an error code. * Return: zero on success, or an error code.
*/ */
static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem, void *buffer,
void *buffer, size_t buflen, int wqe_index,
u32 buflen, int wq_offset, int wq_wqe_cnt,
int wqe_index, int wq_wqe_shift, int bcnt,
int wq_offset,
int wq_wqe_cnt,
int wq_wqe_shift,
int bcnt,
size_t *bytes_copied) size_t *bytes_copied)
{ {
size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift); size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
@ -160,11 +156,43 @@ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
return 0; return 0;
} }
int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, static int mlx5_ib_read_kernel_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
int wqe_index, void *buffer, size_t buflen, size_t *bc)
void *buffer, {
int buflen, struct mlx5_wqe_ctrl_seg *ctrl;
size_t *bc) size_t bytes_copied = 0;
size_t wqe_length;
void *p;
int ds;
wqe_index = wqe_index & qp->sq.fbc.sz_m1;
/* read the control segment first */
p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
ctrl = p;
ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
wqe_length = ds * MLX5_WQE_DS_UNITS;
/* read rest of WQE if it spreads over more than one stride */
while (bytes_copied < wqe_length) {
size_t copy_length =
min_t(size_t, buflen - bytes_copied, MLX5_SEND_WQE_BB);
if (!copy_length)
break;
memcpy(buffer + bytes_copied, p, copy_length);
bytes_copied += copy_length;
wqe_index = (wqe_index + 1) & qp->sq.fbc.sz_m1;
p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, wqe_index);
}
*bc = bytes_copied;
return 0;
}
static int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{ {
struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem; struct ib_umem *umem = base->ubuffer.umem;
@ -176,18 +204,10 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
int ret; int ret;
int ds; int ds;
if (buflen < sizeof(*ctrl))
return -EINVAL;
/* at first read as much as possible */ /* at first read as much as possible */
ret = mlx5_ib_read_user_wqe_common(umem, ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
buffer, wq->offset, wq->wqe_cnt,
buflen, wq->wqe_shift, buflen,
wqe_index,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
buflen,
&bytes_copied); &bytes_copied);
if (ret) if (ret)
return ret; return ret;
@ -210,13 +230,9 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
* so read the remaining bytes starting * so read the remaining bytes starting
* from wqe_index 0 * from wqe_index 0
*/ */
ret = mlx5_ib_read_user_wqe_common(umem, ret = mlx5_ib_read_user_wqe_common(umem, buffer + bytes_copied,
buffer + bytes_copied, buflen - bytes_copied, 0, wq->offset,
buflen - bytes_copied, wq->wqe_cnt, wq->wqe_shift,
0,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
wqe_length - bytes_copied, wqe_length - bytes_copied,
&bytes_copied2); &bytes_copied2);
@ -226,11 +242,24 @@ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
return 0; return 0;
} }
int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int mlx5_ib_read_wqe_sq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int wqe_index, size_t buflen, size_t *bc)
void *buffer, {
int buflen, struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
size_t *bc) struct ib_umem *umem = base->ubuffer.umem;
if (buflen < sizeof(struct mlx5_wqe_ctrl_seg))
return -EINVAL;
if (!umem)
return mlx5_ib_read_kernel_wqe_sq(qp, wqe_index, buffer,
buflen, bc);
return mlx5_ib_read_user_wqe_sq(qp, wqe_index, buffer, buflen, bc);
}
static int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{ {
struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
struct ib_umem *umem = base->ubuffer.umem; struct ib_umem *umem = base->ubuffer.umem;
@ -238,14 +267,9 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
size_t bytes_copied; size_t bytes_copied;
int ret; int ret;
ret = mlx5_ib_read_user_wqe_common(umem, ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index,
buffer, wq->offset, wq->wqe_cnt,
buflen, wq->wqe_shift, buflen,
wqe_index,
wq->offset,
wq->wqe_cnt,
wq->wqe_shift,
buflen,
&bytes_copied); &bytes_copied);
if (ret) if (ret)
@ -254,25 +278,33 @@ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
return 0; return 0;
} }
int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int mlx5_ib_read_wqe_rq(struct mlx5_ib_qp *qp, int wqe_index, void *buffer,
int wqe_index, size_t buflen, size_t *bc)
void *buffer, {
int buflen, struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
size_t *bc) struct ib_umem *umem = base->ubuffer.umem;
struct mlx5_ib_wq *wq = &qp->rq;
size_t wqe_size = 1 << wq->wqe_shift;
if (buflen < wqe_size)
return -EINVAL;
if (!umem)
return -EOPNOTSUPP;
return mlx5_ib_read_user_wqe_rq(qp, wqe_index, buffer, buflen, bc);
}
static int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
void *buffer, size_t buflen, size_t *bc)
{ {
struct ib_umem *umem = srq->umem; struct ib_umem *umem = srq->umem;
size_t bytes_copied; size_t bytes_copied;
int ret; int ret;
ret = mlx5_ib_read_user_wqe_common(umem, ret = mlx5_ib_read_user_wqe_common(umem, buffer, buflen, wqe_index, 0,
buffer, srq->msrq.max, srq->msrq.wqe_shift,
buflen, buflen, &bytes_copied);
wqe_index,
0,
srq->msrq.max,
srq->msrq.wqe_shift,
buflen,
&bytes_copied);
if (ret) if (ret)
return ret; return ret;
@ -280,6 +312,21 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
return 0; return 0;
} }
int mlx5_ib_read_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index, void *buffer,
size_t buflen, size_t *bc)
{
struct ib_umem *umem = srq->umem;
size_t wqe_size = 1 << srq->msrq.wqe_shift;
if (buflen < wqe_size)
return -EINVAL;
if (!umem)
return -EOPNOTSUPP;
return mlx5_ib_read_user_wqe_srq(srq, wqe_index, buffer, buflen, bc);
}
static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
{ {
struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
@ -749,7 +796,7 @@ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
{ {
int err; int err;
*umem = ib_umem_get(udata, addr, size, 0); *umem = ib_umem_get(&dev->ib_dev, addr, size, 0);
if (IS_ERR(*umem)) { if (IS_ERR(*umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n"); mlx5_ib_dbg(dev, "umem_get failed\n");
return PTR_ERR(*umem); return PTR_ERR(*umem);
@ -806,7 +853,7 @@ static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
if (!ucmd->buf_addr) if (!ucmd->buf_addr)
return -EINVAL; return -EINVAL;
rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0); rwq->umem = ib_umem_get(&dev->ib_dev, ucmd->buf_addr, rwq->buf_size, 0);
if (IS_ERR(rwq->umem)) { if (IS_ERR(rwq->umem)) {
mlx5_ib_dbg(dev, "umem_get failed\n"); mlx5_ib_dbg(dev, "umem_get failed\n");
err = PTR_ERR(rwq->umem); err = PTR_ERR(rwq->umem);

View file

@ -80,7 +80,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0); srq->umem = ib_umem_get(pd->device, ucmd.buf_addr, buf_size, 0);
if (IS_ERR(srq->umem)) { if (IS_ERR(srq->umem)) {
mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size); mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
err = PTR_ERR(srq->umem); err = PTR_ERR(srq->umem);

View file

@ -880,7 +880,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (!mr) if (!mr)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
mr->umem = ib_umem_get(udata, start, length, acc); mr->umem = ib_umem_get(pd->device, start, length, acc);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
err = PTR_ERR(mr->umem); err = PTR_ERR(mr->umem);
goto err; goto err;

View file

@ -869,7 +869,7 @@ struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr = kzalloc(sizeof(*mr), GFP_KERNEL); mr = kzalloc(sizeof(*mr), GFP_KERNEL);
if (!mr) if (!mr)
return ERR_PTR(status); return ERR_PTR(status);
mr->umem = ib_umem_get(udata, start, len, acc); mr->umem = ib_umem_get(ibpd->device, start, len, acc);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
status = -EFAULT; status = -EFAULT;
goto umem_err; goto umem_err;

View file

@ -772,7 +772,7 @@ static inline int qedr_init_user_queue(struct ib_udata *udata,
q->buf_addr = buf_addr; q->buf_addr = buf_addr;
q->buf_len = buf_len; q->buf_len = buf_len;
q->umem = ib_umem_get(udata, q->buf_addr, q->buf_len, access); q->umem = ib_umem_get(&dev->ibdev, q->buf_addr, q->buf_len, access);
if (IS_ERR(q->umem)) { if (IS_ERR(q->umem)) {
DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n", DP_ERR(dev, "create user queue: failed ib_umem_get, got %ld\n",
PTR_ERR(q->umem)); PTR_ERR(q->umem));
@ -1415,9 +1415,8 @@ static int qedr_init_srq_user_params(struct ib_udata *udata,
if (rc) if (rc)
return rc; return rc;
srq->prod_umem = srq->prod_umem = ib_umem_get(srq->ibsrq.device, ureq->prod_pair_addr,
ib_umem_get(udata, ureq->prod_pair_addr, sizeof(struct rdma_srq_producers), access);
sizeof(struct rdma_srq_producers), access);
if (IS_ERR(srq->prod_umem)) { if (IS_ERR(srq->prod_umem)) {
qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl); qedr_free_pbl(srq->dev, &srq->usrq.pbl_info, srq->usrq.pbl_tbl);
ib_umem_release(srq->usrq.umem); ib_umem_release(srq->usrq.umem);
@ -2839,7 +2838,7 @@ struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
mr->type = QEDR_MR_USER; mr->type = QEDR_MR_USER;
mr->umem = ib_umem_get(udata, start, len, acc); mr->umem = ib_umem_get(ibpd->device, start, len, acc);
if (IS_ERR(mr->umem)) { if (IS_ERR(mr->umem)) {
rc = -EFAULT; rc = -EFAULT;
goto err0; goto err0;

View file

@ -135,7 +135,7 @@ int pvrdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
goto err_cq; goto err_cq;
} }
cq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, ucmd.buf_size,
IB_ACCESS_LOCAL_WRITE); IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(cq->umem)) { if (IS_ERR(cq->umem)) {
ret = PTR_ERR(cq->umem); ret = PTR_ERR(cq->umem);

View file

@ -126,7 +126,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
} }
umem = ib_umem_get(udata, start, length, access_flags); umem = ib_umem_get(pd->device, start, length, access_flags);
if (IS_ERR(umem)) { if (IS_ERR(umem)) {
dev_warn(&dev->pdev->dev, dev_warn(&dev->pdev->dev,
"could not get umem for mem region\n"); "could not get umem for mem region\n");

View file

@ -276,8 +276,9 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
if (!is_srq) { if (!is_srq) {
/* set qp->sq.wqe_cnt, shift, buf_size.. */ /* set qp->sq.wqe_cnt, shift, buf_size.. */
qp->rumem = ib_umem_get(udata, ucmd.rbuf_addr, qp->rumem =
ucmd.rbuf_size, 0); ib_umem_get(pd->device, ucmd.rbuf_addr,
ucmd.rbuf_size, 0);
if (IS_ERR(qp->rumem)) { if (IS_ERR(qp->rumem)) {
ret = PTR_ERR(qp->rumem); ret = PTR_ERR(qp->rumem);
goto err_qp; goto err_qp;
@ -288,7 +289,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
qp->srq = to_vsrq(init_attr->srq); qp->srq = to_vsrq(init_attr->srq);
} }
qp->sumem = ib_umem_get(udata, ucmd.sbuf_addr, qp->sumem = ib_umem_get(pd->device, ucmd.sbuf_addr,
ucmd.sbuf_size, 0); ucmd.sbuf_size, 0);
if (IS_ERR(qp->sumem)) { if (IS_ERR(qp->sumem)) {
if (!is_srq) if (!is_srq)

View file

@ -146,7 +146,7 @@ int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
goto err_srq; goto err_srq;
} }
srq->umem = ib_umem_get(udata, ucmd.buf_addr, ucmd.buf_size, 0); srq->umem = ib_umem_get(ibsrq->device, ucmd.buf_addr, ucmd.buf_size, 0);
if (IS_ERR(srq->umem)) { if (IS_ERR(srq->umem)) {
ret = PTR_ERR(srq->umem); ret = PTR_ERR(srq->umem);
goto err_srq; goto err_srq;

View file

@ -390,7 +390,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
if (length == 0) if (length == 0)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
umem = ib_umem_get(udata, start, length, mr_access_flags); umem = ib_umem_get(pd->device, start, length, mr_access_flags);
if (IS_ERR(umem)) if (IS_ERR(umem))
return (void *)umem; return (void *)umem;

View file

@ -169,7 +169,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
void *vaddr; void *vaddr;
int err; int err;
umem = ib_umem_get(udata, start, length, access); umem = ib_umem_get(pd->ibpd.device, start, length, access);
if (IS_ERR(umem)) { if (IS_ERR(umem)) {
pr_warn("err %d from rxe_umem_get\n", pr_warn("err %d from rxe_umem_get\n",
(int)PTR_ERR(umem)); (int)PTR_ERR(umem));

View file

@@ -69,7 +69,7 @@ static inline size_t ib_umem_num_pages(struct ib_umem *umem)
 #ifdef CONFIG_INFINIBAND_USER_MEM
 
-struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
+struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
 			    size_t size, int access);
 void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
@@ -83,7 +83,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
 #include <linux/err.h>
 
-static inline struct ib_umem *ib_umem_get(struct ib_udata *udata,
+static inline struct ib_umem *ib_umem_get(struct ib_device *device,
 					  unsigned long addr, size_t size,
 					  int access)
 {

View file

@@ -114,9 +114,9 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_umem_odp *
-ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
 		int access, const struct mmu_interval_notifier_ops *ops);
-struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
+struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
 					       int access);
 struct ib_umem_odp *
 ib_umem_odp_alloc_child(struct ib_umem_odp *root_umem, unsigned long addr,
@@ -134,7 +134,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset,
 #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 static inline struct ib_umem_odp *
-ib_umem_odp_get(struct ib_udata *udata, unsigned long addr, size_t size,
+ib_umem_odp_get(struct ib_device *device, unsigned long addr, size_t size,
 		int access, const struct mmu_interval_notifier_ops *ops)
 {
 	return ERR_PTR(-EINVAL);

View file

@@ -4153,6 +4153,15 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
 	dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
 }
 
+/* ib_reg_user_mr - register a memory region for virtual addresses from kernel
+ * space. This function should be called when 'current' is the owning MM.
+ */
+struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+			     u64 virt_addr, int mr_access_flags);
+
+/* ib_advise_mr - give an advice about an address range in a memory region */
+int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
+		 u32 flags, struct ib_sge *sg_list, u32 num_sge);
 /**
  * ib_dereg_mr_user - Deregisters a memory region and removes it from the
  *   HCA translation table.

View file

@@ -156,6 +156,13 @@ static void rds_ib_add_one(struct ib_device *device)
 	has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr &&
 		   device->ops.map_phys_fmr && device->ops.unmap_fmr);
 	rds_ibdev->use_fastreg = (has_fr && !has_fmr);
+	rds_ibdev->odp_capable =
+		!!(device->attrs.device_cap_flags &
+		   IB_DEVICE_ON_DEMAND_PAGING) &&
+		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+		   IB_ODP_SUPPORT_WRITE) &&
+		!!(device->attrs.odp_caps.per_transport_caps.rc_odp_caps &
+		   IB_ODP_SUPPORT_READ);
 
 	rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
 	rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
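
For context, the capability test added above boils down to a small predicate;
a minimal sketch (the helper name is hypothetical, the flags are the ones the
hunk checks):

	/* True when the device can back RC READ/WRITE with ODP, which is what
	 * RDS needs before it hands out ODP MRs for FS-DAX memory.
	 */
	static bool rds_ib_dev_odp_capable(struct ib_device *device)
	{
		const struct ib_odp_caps *caps = &device->attrs.odp_caps;

		if (!(device->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
			return false;
		return (caps->per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_WRITE) &&
		       (caps->per_transport_caps.rc_odp_caps & IB_ODP_SUPPORT_READ);
	}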

View file

@@ -247,7 +247,8 @@ struct rds_ib_device {
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
 	struct dma_pool		*rid_hdrs_pool; /* RDS headers DMA pool */
-	bool			use_fastreg;
+	u8			use_fastreg:1;
+	u8			odp_capable:1;
 
 	unsigned int		max_mrs;
 	struct rds_ib_mr_pool	*mr_1m_pool;

View file

@@ -67,6 +67,7 @@ struct rds_ib_frmr {
 
 /* This is stored as mr->r_trans_private. */
 struct rds_ib_mr {
+	struct delayed_work	work;
 	struct rds_ib_device	*device;
 	struct rds_ib_mr_pool	*pool;
 	struct rds_ib_connection	*ic;
@@ -81,9 +82,11 @@ struct rds_ib_mr {
 	unsigned int		sg_len;
 	int			sg_dma_len;
 
+	u8			odp:1;
 	union {
 		struct rds_ib_fmr	fmr;
 		struct rds_ib_frmr	frmr;
+		struct ib_mr		*mr;
 	} u;
 };
@@ -122,12 +125,14 @@ void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		    struct rds_sock *rs, u32 *key_ret,
-		    struct rds_connection *conn);
+		    struct rds_connection *conn, u64 start, u64 length,
+		    int need_odp);
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
 int rds_ib_mr_init(void);
 void rds_ib_mr_exit(void);
+u32 rds_ib_get_lkey(void *trans_private);
 
 void __rds_ib_teardown_mr(struct rds_ib_mr *);
 void rds_ib_teardown_mr(struct rds_ib_mr *);

View file

@@ -37,8 +37,15 @@
 
 #include "rds_single_path.h"
 #include "ib_mr.h"
+#include "rds.h"
 
 struct workqueue_struct *rds_ib_mr_wq;
+struct rds_ib_dereg_odp_mr {
+	struct work_struct work;
+	struct ib_mr *mr;
+};
+
+static void rds_ib_odp_mr_worker(struct work_struct *work);
 
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
@@ -213,6 +220,9 @@ void rds_ib_sync_mr(void *trans_private, int direction)
 	struct rds_ib_mr *ibmr = trans_private;
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 
+	if (ibmr->odp)
+		return;
+
 	switch (direction) {
 	case DMA_FROM_DEVICE:
 		ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
@@ -482,6 +492,16 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
+	if (ibmr->odp) {
+		/* A MR created and marked as use_once. We use delayed work,
+		 * because there is a change that we are in interrupt and can't
+		 * call to ib_dereg_mr() directly.
+		 */
+		INIT_DELAYED_WORK(&ibmr->work, rds_ib_odp_mr_worker);
+		queue_delayed_work(rds_ib_mr_wq, &ibmr->work, 0);
+		return;
+	}
+
 	/* Return it to the pool's free list */
 	if (rds_ibdev->use_fastreg)
 		rds_ib_free_frmr_list(ibmr);
@@ -526,9 +546,17 @@ void rds_ib_flush_mrs(void)
 	up_read(&rds_ib_devices_lock);
 }
 
+u32 rds_ib_get_lkey(void *trans_private)
+{
+	struct rds_ib_mr *ibmr = trans_private;
+
+	return ibmr->u.mr->lkey;
+}
+
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		    struct rds_sock *rs, u32 *key_ret,
-		    struct rds_connection *conn)
+		    struct rds_connection *conn,
+		    u64 start, u64 length, int need_odp)
 {
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_mr *ibmr = NULL;
@@ -541,6 +569,51 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		goto out;
 	}
 
+	if (need_odp == ODP_ZEROBASED || need_odp == ODP_VIRTUAL) {
+		u64 virt_addr = need_odp == ODP_ZEROBASED ? 0 : start;
+		int access_flags =
+			(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_ATOMIC |
+			 IB_ACCESS_ON_DEMAND);
+		struct ib_sge sge = {};
+		struct ib_mr *ib_mr;
+
+		if (!rds_ibdev->odp_capable) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
+
+		ib_mr = ib_reg_user_mr(rds_ibdev->pd, start, length, virt_addr,
+				       access_flags);
+		if (IS_ERR(ib_mr)) {
+			rdsdebug("rds_ib_get_user_mr returned %d\n",
+				 IS_ERR(ib_mr));
+			ret = PTR_ERR(ib_mr);
+			goto out;
+		}
+		if (key_ret)
+			*key_ret = ib_mr->rkey;
+
+		ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+		if (!ibmr) {
+			ib_dereg_mr(ib_mr);
+			ret = -ENOMEM;
+			goto out;
+		}
+		ibmr->u.mr = ib_mr;
+		ibmr->odp = 1;
+
+		sge.addr = virt_addr;
+		sge.length = length;
+		sge.lkey = ib_mr->lkey;
+
+		ib_advise_mr(rds_ibdev->pd,
+			     IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+			     IB_UVERBS_ADVISE_MR_FLAG_FLUSH, &sge, 1);
+		return ibmr;
+	}
+
 	if (conn)
 		ic = conn->c_transport_data;
@@ -629,3 +702,12 @@ void rds_ib_mr_exit(void)
 {
 	destroy_workqueue(rds_ib_mr_wq);
 }
+
+static void rds_ib_odp_mr_worker(struct work_struct *work)
+{
+	struct rds_ib_mr *ibmr;
+
+	ibmr = container_of(work, struct rds_ib_mr, work.work);
+	ib_dereg_mr(ibmr->u.mr);
+	kfree(ibmr);
+}

View file

@@ -39,6 +39,7 @@
 #include "rds_single_path.h"
 #include "rds.h"
 #include "ib.h"
+#include "ib_mr.h"
 
 /*
  * Convert IB-specific error message to RDS error message and call core
@@ -635,6 +636,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 
 		send->s_sge[0].addr = ic->i_send_hdrs_dma[pos];
 		send->s_sge[0].length = sizeof(struct rds_header);
+		send->s_sge[0].lkey = ic->i_pd->local_dma_lkey;
 
 		memcpy(ic->i_send_hdrs[pos], &rm->m_inc.i_hdr,
 		       sizeof(struct rds_header));
@@ -650,6 +652,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 			send->s_sge[1].addr = sg_dma_address(scat);
 			send->s_sge[1].addr += rm->data.op_dmaoff;
 			send->s_sge[1].length = len;
+			send->s_sge[1].lkey = ic->i_pd->local_dma_lkey;
 
 			bytes_sent += len;
 			rm->data.op_dmaoff += len;
@@ -858,20 +861,29 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 	int ret;
 	int num_sge;
 	int nr_sig = 0;
+	u64 odp_addr = op->op_odp_addr;
+	u32 odp_lkey = 0;
 
 	/* map the op the first time we see it */
-	if (!op->op_mapped) {
-		op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
-					     op->op_sg, op->op_nents, (op->op_write) ?
-					     DMA_TO_DEVICE : DMA_FROM_DEVICE);
-		rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
-		if (op->op_count == 0) {
-			rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
-			ret = -ENOMEM; /* XXX ? */
-			goto out;
-		}
-
-		op->op_mapped = 1;
+	if (!op->op_odp_mr) {
+		if (!op->op_mapped) {
+			op->op_count =
+				ib_dma_map_sg(ic->i_cm_id->device, op->op_sg,
+					      op->op_nents,
+					      (op->op_write) ? DMA_TO_DEVICE :
+							       DMA_FROM_DEVICE);
+			rdsdebug("ic %p mapping op %p: %d\n", ic, op,
				 op->op_count);
+			if (op->op_count == 0) {
+				rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
+				ret = -ENOMEM; /* XXX ? */
+				goto out;
+			}
+			op->op_mapped = 1;
+		}
+	} else {
+		op->op_count = op->op_nents;
+		odp_lkey = rds_ib_get_lkey(op->op_odp_mr->r_trans_private);
 	}
 
 	/*
@@ -923,14 +935,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
 		for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
 		     scat != &op->op_sg[op->op_count]; j++) {
 			len = sg_dma_len(scat);
-			send->s_sge[j].addr = sg_dma_address(scat);
+			if (!op->op_odp_mr) {
+				send->s_sge[j].addr = sg_dma_address(scat);
+				send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
+			} else {
+				send->s_sge[j].addr = odp_addr;
+				send->s_sge[j].lkey = odp_lkey;
+			}
 			send->s_sge[j].length = len;
-			send->s_sge[j].lkey = ic->i_pd->local_dma_lkey;
 
 			sent += len;
 			rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
 
 			remote_addr += len;
+			odp_addr += len;
 			scat++;
 		}

View file

@@ -156,11 +156,13 @@ void rds_rdma_drop_keys(struct rds_sock *rs)
 static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
 			struct page **pages, int write)
 {
+	unsigned int gup_flags = FOLL_LONGTERM;
 	int ret;
 
-	ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0,
-				  pages);
+	if (write)
+		gup_flags |= FOLL_WRITE;
 
+	ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
 	if (ret >= 0 && ret < nr_pages) {
 		while (ret--)
 			put_page(pages[ret]);
@@ -175,13 +177,14 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 			  struct rds_conn_path *cp)
 {
 	struct rds_mr *mr = NULL, *found;
+	struct scatterlist *sg = NULL;
 	unsigned int nr_pages;
 	struct page **pages = NULL;
-	struct scatterlist *sg;
 	void *trans_private;
 	unsigned long flags;
 	rds_rdma_cookie_t cookie;
-	unsigned int nents;
+	unsigned int nents = 0;
+	int need_odp = 0;
 	long i;
 	int ret;
@@ -195,6 +198,21 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 		goto out;
 	}
 
+	/* If the combination of the addr and size requested for this memory
+	 * region causes an integer overflow, return error.
+	 */
+	if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
+	    PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
+		    (args->vec.addr + args->vec.bytes)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!can_do_mlock()) {
+		ret = -EPERM;
+		goto out;
+	}
+
 	nr_pages = rds_pages_in_vec(&args->vec);
 	if (nr_pages == 0) {
 		ret = -EINVAL;
@@ -248,36 +266,44 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * the zero page.
 	 */
 	ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
-	if (ret < 0)
+	if (ret == -EOPNOTSUPP) {
+		need_odp = 1;
+	} else if (ret <= 0) {
 		goto out;
-
-	nents = ret;
-	sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
-	if (!sg) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	WARN_ON(!nents);
-	sg_init_table(sg, nents);
-
-	/* Stick all pages into the scatterlist */
-	for (i = 0 ; i < nents; i++)
-		sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
-
-	rdsdebug("RDS: trans_private nents is %u\n", nents);
-
+	} else {
+		nents = ret;
+		sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
+		if (!sg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		WARN_ON(!nents);
+		sg_init_table(sg, nents);
+
+		/* Stick all pages into the scatterlist */
+		for (i = 0 ; i < nents; i++)
+			sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
+
+		rdsdebug("RDS: trans_private nents is %u\n", nents);
+	}
 	/* Obtain a transport specific MR. If this succeeds, the
 	 * s/g list is now owned by the MR.
 	 * Note that dma_map() implies that pending writes are
 	 * flushed to RAM, so no dma_sync is needed here. */
-	trans_private = rs->rs_transport->get_mr(sg, nents, rs,
-						 &mr->r_key,
-						 cp ? cp->cp_conn : NULL);
+	trans_private = rs->rs_transport->get_mr(
+		sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
+		args->vec.addr, args->vec.bytes,
+		need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
 
 	if (IS_ERR(trans_private)) {
-		for (i = 0 ; i < nents; i++)
-			put_page(sg_page(&sg[i]));
-		kfree(sg);
+		/* In ODP case, we don't GUP pages, so don't need
+		 * to release anything.
+		 */
+		if (!need_odp) {
+			for (i = 0 ; i < nents; i++)
+				put_page(sg_page(&sg[i]));
+			kfree(sg);
+		}
 		ret = PTR_ERR(trans_private);
 		goto out;
 	}
@@ -291,7 +317,11 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 	 * map page aligned regions. So we keep the offset, and build
 	 * a 64bit cookie containing <R_Key, offset> and pass that
 	 * around. */
-	cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
+	if (need_odp)
+		cookie = rds_rdma_make_cookie(mr->r_key, 0);
+	else
+		cookie = rds_rdma_make_cookie(mr->r_key,
+					      args->vec.addr & ~PAGE_MASK);
 	if (cookie_ret)
 		*cookie_ret = cookie;
@@ -456,22 +486,26 @@ void rds_rdma_free_op(struct rm_rdma_op *ro)
 {
 	unsigned int i;
 
-	for (i = 0; i < ro->op_nents; i++) {
-		struct page *page = sg_page(&ro->op_sg[i]);
-
-		/* Mark page dirty if it was possibly modified, which
-		 * is the case for a RDMA_READ which copies from remote
-		 * to local memory */
-		if (!ro->op_write) {
-			WARN_ON(!page->mapping && irqs_disabled());
-			set_page_dirty(page);
+	if (ro->op_odp_mr) {
+		rds_mr_put(ro->op_odp_mr);
+	} else {
+		for (i = 0; i < ro->op_nents; i++) {
+			struct page *page = sg_page(&ro->op_sg[i]);
+
+			/* Mark page dirty if it was possibly modified, which
+			 * is the case for a RDMA_READ which copies from remote
+			 * to local memory
+			 */
+			if (!ro->op_write)
+				set_page_dirty(page);
+			put_page(page);
 		}
-		put_page(page);
 	}
 
 	kfree(ro->op_notifier);
 	ro->op_notifier = NULL;
 	ro->op_active = 0;
+	ro->op_odp_mr = NULL;
 }
 
 void rds_atomic_free_op(struct rm_atomic_op *ao)
@@ -581,6 +615,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	struct rds_iovec *iovs;
 	unsigned int i, j;
 	int ret = 0;
+	bool odp_supported = true;
 
 	if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
 	    || rm->rdma.op_active)
@@ -602,6 +637,9 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		ret = -EINVAL;
 		goto out_ret;
 	}
+	/* odp-mr is not supported for multiple requests within one message */
+	if (args->nr_local != 1)
+		odp_supported = false;
 
 	iovs = vec->iov;
@@ -623,6 +661,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 	op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
 	op->op_active = 1;
 	op->op_recverr = rs->rs_recverr;
+	op->op_odp_mr = NULL;
+
 	WARN_ON(!nr_pages);
 	op->op_sg = rds_message_alloc_sgs(rm, nr_pages, &ret);
 	if (!op->op_sg)
@@ -672,10 +712,44 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		 * If it's a READ operation, we need to pin the pages for writing.
 		 */
 		ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
-		if (ret < 0)
+		if ((!odp_supported && ret <= 0) ||
+		    (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
 			goto out_pages;
-		else
-			ret = 0;
+
+		if (ret == -EOPNOTSUPP) {
+			struct rds_mr *local_odp_mr;
+
+			if (!rs->rs_transport->get_mr) {
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			local_odp_mr =
+				kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
+			if (!local_odp_mr) {
+				ret = -ENOMEM;
+				goto out_pages;
+			}
+			RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
+			refcount_set(&local_odp_mr->r_refcount, 1);
+			local_odp_mr->r_trans = rs->rs_transport;
+			local_odp_mr->r_sock = rs;
+			local_odp_mr->r_trans_private =
+				rs->rs_transport->get_mr(
+					NULL, 0, rs, &local_odp_mr->r_key, NULL,
+					iov->addr, iov->bytes, ODP_VIRTUAL);
+			if (IS_ERR(local_odp_mr->r_trans_private)) {
+				ret = IS_ERR(local_odp_mr->r_trans_private);
+				rdsdebug("get_mr ret %d %p\"", ret,
+					 local_odp_mr->r_trans_private);
+				kfree(local_odp_mr);
+				ret = -EOPNOTSUPP;
+				goto out_pages;
+			}
+			rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
+				 local_odp_mr, local_odp_mr->r_trans_private);
+			op->op_odp_mr = local_odp_mr;
+			op->op_odp_addr = iov->addr;
+		}
 
 		rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
 			 nr_bytes, nr, iov->bytes, iov->addr);
@@ -691,6 +765,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 				min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 				offset);
 
+			sg_dma_len(sg) = sg->length;
 			rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
 				 sg->offset, sg->length, iov->addr, iov->bytes);
@@ -709,6 +784,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 		goto out_pages;
 	}
 	op->op_bytes = nr_bytes;
+	ret = 0;
 
 out_pages:
 	kfree(pages);
@@ -755,7 +831,8 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 	spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 
 	if (mr) {
-		mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
+		mr->r_trans->sync_mr(mr->r_trans_private,
+				     DMA_TO_DEVICE);
 		rm->rdma.op_rdma_mr = mr;
 	}
 	return err;
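The pivot in this file is rds_pin_pages(): pinning with FOLL_LONGTERM is refused for mappings that cannot be pinned long term (FS-DAX being the case this series targets), so get_user_pages_fast() returns -EOPNOTSUPP instead of a page count, and both __rds_rdma_map() and rds_cmsg_rdma_args() treat that specific error as "switch to an ODP MR" rather than as a failure. A stripped-down illustration of just that decision point follows; the helper name is hypothetical, and partial pins and page release stay with rds_pin_pages() as in the hunks above.

	/* Illustration only: report whether a user buffer has to go through
	 * the ODP path because it cannot be long-term pinned (e.g. FS-DAX).
	 */
	static bool rds_buffer_needs_odp(unsigned long user_addr,
					 unsigned int nr_pages,
					 struct page **pages, int write)
	{
		unsigned int gup_flags = FOLL_LONGTERM;
		int ret;

		if (write)
			gup_flags |= FOLL_WRITE;

		ret = get_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
		if (ret > 0) {
			/* Pinned normally; drop the pins again, this helper
			 * only answers the question.
			 */
			while (ret--)
				put_page(pages[ret]);
			return false;
		}
		return ret == -EOPNOTSUPP;
	}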

View file

@@ -40,7 +40,6 @@
 #ifdef ATOMIC64_INIT
 #define KERNEL_HAS_ATOMIC64
 #endif
-
 #ifdef RDS_DEBUG
 #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
 #else
@@ -478,6 +477,9 @@ struct rds_message {
 			struct rds_notifier	*op_notifier;
 
 			struct rds_mr		*op_rdma_mr;
+
+			u64			op_odp_addr;
+			struct rds_mr		*op_odp_mr;
 		} rdma;
 		struct rm_data_op {
 			unsigned int		op_active:1;
@@ -573,7 +575,8 @@ struct rds_transport {
 	void (*exit)(void);
 	void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
 			struct rds_sock *rs, u32 *key_ret,
-			struct rds_connection *conn);
+			struct rds_connection *conn,
+			u64 start, u64 length, int need_odp);
 	void (*sync_mr)(void *trans_private, int direction);
 	void (*free_mr)(void *trans_private, int invalidate);
 	void (*flush_mrs)(void);
@@ -956,6 +959,12 @@ static inline bool rds_destroy_pending(struct rds_connection *conn)
 	       (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
 }
 
+enum {
+	ODP_NOT_NEEDED,
+	ODP_ZEROBASED,
+	ODP_VIRTUAL
+};
+
 /* stats.c */
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {		\
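The three values describe how the caller wants the region addressed: ODP_NOT_NEEDED keeps the existing pinned-scatterlist path, ODP_ZEROBASED is used by __rds_rdma_map() for MRs advertised to the peer (registered at virtual address 0, so the rkey/offset cookie carries offset 0), and ODP_VIRTUAL is used by rds_cmsg_rdma_args() for local buffers addressed by their original virtual address. A schematic transport-side dispatcher is sketched below; classic_get_mr() and odp_get_mr() are hypothetical stand-ins for the real RDS/IB code earlier in this diff.

	static void *example_get_mr(struct scatterlist *sg, unsigned long nents,
				    struct rds_sock *rs, u32 *key_ret,
				    struct rds_connection *conn,
				    u64 start, u64 length, int need_odp)
	{
		switch (need_odp) {
		case ODP_NOT_NEEDED:
			/* pinned pages: the sg list already describes the buffer */
			return classic_get_mr(sg, nents, rs, key_ret, conn);
		case ODP_ZEROBASED:
			/* advertised MR: register with virt_addr 0 so the
			 * cookie offset starts at 0 */
			return odp_get_mr(rs, key_ret, start, length, 0);
		case ODP_VIRTUAL:
			/* local MR for one RDMA op: keep the user's address */
			return odp_get_mr(rs, key_ret, start, length, start);
		default:
			return ERR_PTR(-EINVAL);
		}
	}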