diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ef45b8bd1b35..d34c8433caf9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -372,9 +372,6 @@ struct io_ring_ctx { struct list_head io_buffers_cache; - /* deferred free list, protected by ->uring_lock */ - struct hlist_head io_buf_list; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0ef418faac33..50e859da59e1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_HLIST_HEAD(&ctx->io_buf_list); ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, @@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_pages_unmap(void *ptr, struct page ***pages, - unsigned short *npages) +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages) { bool do_vunmap = false; if (!ptr) return; - if (*npages) { + if (put_pages && *npages) { struct page **to_free = *pages; int i; @@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages, *npages = 0; } -void io_mem_free(void *ptr) -{ - if (!ptr) - return; - - folio_put(virt_to_folio(ptr)); -} - static void io_pages_free(struct page ***pages, int npages) { struct page **page_array = *pages; @@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, + true); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, + true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; @@ -2788,8 +2781,8 @@ static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, return ERR_PTR(-ENOMEM); } -static void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size) +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; struct page **pages; @@ -2819,17 +2812,6 @@ static void *io_pages_map(struct page ***out_pages, unsigned short *npages, return ret; } -void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - void *ret; - - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret) - return ret; - return ERR_PTR(-ENOMEM); -} - static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { @@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } io_rings_free(ctx); - io_kbuf_mmap_list_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file, { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - switch (offset & IORING_OFF_MMAP_MASK) { + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: /* Don't allow mmap if the ring was setup without it */ @@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; + void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); @@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file, return bl; ptr = bl->buf_ring; io_put_bl(ctx, bl); - break; + return ptr; } - default: - return ERR_PTR(-EINVAL); } - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; + return ERR_PTR(-EINVAL); } int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, @@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long pfn; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); @@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); + case IORING_OFF_PBUF_RING: + return io_pbuf_mmap(file, vma); } - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); + return -EINVAL; } static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 75230d914007..dec996a1c789 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -109,8 +109,10 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -void *io_mem_alloc(size_t size); -void io_mem_free(void *ptr); +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size); +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages); enum { IO_EVENTFD_OP_SIGNAL_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 4289f4a92693..820ac599d003 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -32,25 +32,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; - -static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - return xa_load(&ctx->io_bl_xa, bgid); -} - static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) -{ - struct io_buf_free *ibf; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; - } - } - - /* can't happen... */ - WARN_ON_ONCE(1); -} - static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { @@ -220,23 +189,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_buf_ring) { i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. - */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { + if (bl->buf_nr_pages) { int j; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - vunmap(bl->buf_ring); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (!bl->is_mmap) { + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + } + io_pages_unmap(bl->buf_ring, &bl->buf_pages, + &bl->buf_nr_pages, bl->is_mmap); + bl->is_mmap = 0; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); @@ -537,63 +499,18 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, return ret; } -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. - */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; -} - static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_buf_free *ibf; size_t ring_size; - void *ptr; ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); + if (!bl->buf_ring) + return -ENOMEM; - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); - } - ibf->inuse = 1; - bl->buf_ring = ibf->mem; bl->is_buf_ring = 1; bl->is_mmap = 1; return 0; @@ -741,18 +658,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, return ERR_PTR(-EINVAL); } -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. - */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) { - struct io_buf_free *ibf; - struct hlist_node *tmp; + struct io_ring_ctx *ctx = file->private_data; + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; + struct io_buffer_list *bl; + int bgid, ret; - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + io_put_bl(ctx, bl); + return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index df365b8860cf..53c141d9a8b2 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,8 +55,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -64,6 +62,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) {