linux-stable/net/xdp/xdp_umem.c

260 lines
5.3 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
* Copyright(c) 2018 Intel Corporation.
*/
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/idr.h>
#include <linux/vmalloc.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
static DEFINE_IDA(umem_ida);
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true);
kvfree(umem->pgs);
umem->pgs = NULL;
}
static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
if (umem->user) {
atomic_long_sub(umem->npgs, &umem->user->locked_vm);
free_uid(umem->user);
}
}
static void xdp_umem_addr_unmap(struct xdp_umem *umem)
{
vunmap(umem->addrs);
umem->addrs = NULL;
}
static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages,
u32 nr_pages)
{
umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (!umem->addrs)
return -ENOMEM;
return 0;
}
static void xdp_umem_release(struct xdp_umem *umem)
{
umem->zc = false;
ida_free(&umem_ida, umem->id);
xdp_umem_addr_unmap(umem);
xdp_umem_unpin_pages(umem);
xdp_umem_unaccount_pages(umem);
kfree(umem);
}
static void xdp_umem_release_deferred(struct work_struct *work)
{
struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
xdp_umem_release(umem);
}
void xdp_get_umem(struct xdp_umem *umem)
{
refcount_inc(&umem->users);
}
void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
{
if (!umem)
return;
if (refcount_dec_and_test(&umem->users)) {
if (defer_cleanup) {
INIT_WORK(&umem->work, xdp_umem_release_deferred);
schedule_work(&umem->work);
} else {
xdp_umem_release(umem);
}
}
}
static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
{
unsigned int gup_flags = FOLL_WRITE;
long npgs;
int err;
umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN);
if (!umem->pgs)
return -ENOMEM;
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_read_lock(current->mm);
npgs = pin_user_pages(address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0]);
mmap locking API: use coccinelle to convert mmap_sem rwsem call sites This change converts the existing mmap_sem rwsem calls to use the new mmap locking API instead. The change is generated using coccinelle with the following rule: // spatch --sp-file mmap_lock_api.cocci --in-place --include-headers --dir . @@ expression mm; @@ ( -init_rwsem +mmap_init_lock | -down_write +mmap_write_lock | -down_write_killable +mmap_write_lock_killable | -down_write_trylock +mmap_write_trylock | -up_write +mmap_write_unlock | -downgrade_write +mmap_write_downgrade | -down_read +mmap_read_lock | -down_read_killable +mmap_read_lock_killable | -down_read_trylock +mmap_read_trylock | -up_read +mmap_read_unlock ) -(&mm->mmap_sem) +(mm) Signed-off-by: Michel Lespinasse <walken@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com> Reviewed-by: Laurent Dufour <ldufour@linux.ibm.com> Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Davidlohr Bueso <dbueso@suse.de> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jerome Glisse <jglisse@redhat.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Liam Howlett <Liam.Howlett@oracle.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ying Han <yinghan@google.com> Link: http://lkml.kernel.org/r/20200520052908.204642-5-walken@google.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2020-06-09 04:33:25 +00:00
mmap_read_unlock(current->mm);
if (npgs != umem->npgs) {
if (npgs >= 0) {
umem->npgs = npgs;
err = -ENOMEM;
goto out_pin;
}
err = npgs;
goto out_pgs;
}
return 0;
out_pin:
xdp_umem_unpin_pages(umem);
out_pgs:
kvfree(umem->pgs);
umem->pgs = NULL;
return err;
}
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
unsigned long lock_limit, new_npgs, old_npgs;
if (capable(CAP_IPC_LOCK))
return 0;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
umem->user = get_uid(current_user());
do {
old_npgs = atomic_long_read(&umem->user->locked_vm);
new_npgs = old_npgs + umem->npgs;
if (new_npgs > lock_limit) {
free_uid(umem->user);
umem->user = NULL;
return -ENOBUFS;
}
} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
new_npgs) != old_npgs);
return 0;
}
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
u64 addr = mr->addr, size = mr->len;
u32 chunks_rem, npgs_rem;
u64 chunks, npgs;
int err;
xsk: new descriptor addressing scheme Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 11:57:13 +00:00
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
/* Strictly speaking we could support this, if:
* - huge pages, or*
* - using an IOMMU, or
* - making sure the memory area is consecutive
* but for now, we simply say "computer says no".
*/
return -EINVAL;
}
if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return -EINVAL;
if (!unaligned_chunks && !is_power_of_2(chunk_size))
return -EINVAL;
if (!PAGE_ALIGNED(addr)) {
/* Memory area has to be page size aligned. For
* simplicity, this might change.
*/
return -EINVAL;
}
if ((addr + size) < addr)
return -EINVAL;
npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem);
if (npgs_rem)
npgs++;
if (npgs > U32_MAX)
return -EINVAL;
chunks = div_u64_rem(size, chunk_size, &chunks_rem);
if (!chunks || chunks > U32_MAX)
return -EINVAL;
if (!unaligned_chunks && chunks_rem)
return -EINVAL;
if (headroom >= chunk_size - XDP_PACKET_HEADROOM)
return -EINVAL;
umem->size = size;
xsk: new descriptor addressing scheme Currently, AF_XDP only supports a fixed frame-size memory scheme where each frame is referenced via an index (idx). A user passes the frame index to the kernel, and the kernel acts upon the data. Some NICs, however, do not have a fixed frame-size model, instead they have a model where a memory window is passed to the hardware and multiple frames are filled into that window (referred to as the "type-writer" model). By changing the descriptor format from the current frame index addressing scheme, AF_XDP can in the future be extended to support these kinds of NICs. In the index-based model, an idx refers to a frame of size frame_size. Addressing a frame in the UMEM is done by offseting the UMEM starting address by a global offset, idx * frame_size + offset. Communicating via the fill- and completion-rings are done by means of idx. In this commit, the idx is removed in favor of an address (addr), which is a relative address ranging over the UMEM. To convert an idx-based address to the new addr is simply: addr = idx * frame_size + offset. We also stop referring to the UMEM "frame" as a frame. Instead it is simply called a chunk. To transfer ownership of a chunk to the kernel, the addr of the chunk is passed in the fill-ring. Note, that the kernel will mask addr to make it chunk aligned, so there is no need for userspace to do that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or 3000 to the fill-ring will refer to the same chunk. On the completion-ring, the addr will match that of the Tx descriptor, passed to the kernel. Changing the descriptor format to use chunks/addr will allow for future changes to move to a type-writer based model, where multiple frames can reside in one chunk. In this model passing one single chunk into the fill-ring, would potentially result in multiple Rx descriptors. This commit changes the uapi of AF_XDP sockets, and updates the documentation. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 11:57:13 +00:00
umem->headroom = headroom;
xsk: Introduce AF_XDP buffer allocation API In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com
2020-05-20 19:20:53 +00:00
umem->chunk_size = chunk_size;
umem->chunks = chunks;
umem->npgs = npgs;
umem->pgs = NULL;
umem->user = NULL;
umem->flags = mr->flags;
INIT_LIST_HEAD(&umem->xsk_dma_list);
refcount_set(&umem->users, 1);
err = xdp_umem_account_pages(umem);
if (err)
return err;
err = xdp_umem_pin_pages(umem, (unsigned long)addr);
if (err)
goto out_account;
err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs);
if (err)
goto out_unpin;
xsk: Introduce AF_XDP buffer allocation API In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com
2020-05-20 19:20:53 +00:00
return 0;
out_unpin:
xdp_umem_unpin_pages(umem);
out_account:
xdp_umem_unaccount_pages(umem);
return err;
}
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
struct xdp_umem *umem;
int err;
umem = kzalloc(sizeof(*umem), GFP_KERNEL);
if (!umem)
return ERR_PTR(-ENOMEM);
err = ida_alloc(&umem_ida, GFP_KERNEL);
if (err < 0) {
kfree(umem);
return ERR_PTR(err);
}
umem->id = err;
err = xdp_umem_reg(umem, mr);
if (err) {
ida_free(&umem_ida, umem->id);
kfree(umem);
return ERR_PTR(err);
}
return umem;
}