From 6e76ac595855db27bbdaef337173294a6fd6eb2c Mon Sep 17 00:00:00 2001
From: Josh Triplett
Date: Sat, 29 Apr 2023 01:40:30 +0900
Subject: [PATCH] io_uring: Add io_uring_setup flag to pre-register ring fd and never install it

With IORING_REGISTER_USE_REGISTERED_RING, an application can register
the ring fd and use it via registered index rather than installed fd.
This allows using a registered ring for everything *except* the initial
mmap.

With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the
user, rather than requiring a subsequent mmap.

The combination of the two allows a user to operate *entirely* via a
registered ring fd, making it unnecessary to ever install the fd in the
first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make
io_uring_setup register the fd and return a registered index, without
installing the fd.

This allows an application to avoid touching the fd table at all, and
allows a library to never even momentarily install a file descriptor.

This splits out an io_ring_add_registered_file helper from
io_ring_add_registered_fd, for use by io_uring_setup.

Signed-off-by: Josh Triplett
Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org
Signed-off-by: Jens Axboe
---
 include/uapi/linux/io_uring.h |  7 ++++++
 io_uring/io_uring.c           | 37 ++++++++++++++++++-------------
 io_uring/io_uring.h           |  3 +++
 io_uring/tctx.c               | 41 +++++++++++++++++++++--------------
 4 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 2edba9a274de..f222d263bc55 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -178,6 +178,13 @@ enum {
  */
 #define IORING_SETUP_NO_MMAP		(1U << 14)

+/*
+ * Register the ring fd in itself for use with
+ * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather
+ * than an fd.
+ */
+#define IORING_SETUP_REGISTERED_FD_ONLY	(1U << 15)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 61379cf8e7f5..dab09f568294 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	return 0;
 }

-static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file)
+static int io_uring_install_fd(struct file *file)
 {
-	int ret, fd;
+	int fd;

 	fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
 	if (fd < 0)
 		return fd;
-
-	ret = __io_uring_add_tctx_node(ctx);
-	if (ret) {
-		put_unused_fd(fd);
-		return ret;
-	}
 	fd_install(fd, file);
 	return fd;
 }
@@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 				  struct io_uring_params __user *params)
 {
 	struct io_ring_ctx *ctx;
+	struct io_uring_task *tctx;
 	struct file *file;
 	int ret;

@@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 		entries = IORING_MAX_ENTRIES;
 	}

+	if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+	    && !(p->flags & IORING_SETUP_NO_MMAP))
+		return -EINVAL;
+
 	/*
 	 * Use twice as many entries for the CQ ring. It's possible for the
 	 * application to drive a higher depth than the size of the SQ ring,
@@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 	}

+	ret = __io_uring_add_tctx_node(ctx);
+	if (ret)
+		goto err_fput;
+	tctx = current->io_uring;
+
 	/*
 	 * Install ring fd as the very last thing, so we don't risk someone
 	 * having closed it before we finish setup
 	 */
-	ret = io_uring_install_fd(ctx, file);
-	if (ret < 0) {
-		/* fput will clean it up */
-		fput(file);
-		return ret;
-	}
+	if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY)
+		ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX);
+	else
+		ret = io_uring_install_fd(file);
+	if (ret < 0)
+		goto err_fput;

 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);
 	return ret;
+err_fput:
+	fput(file);
+	return ret;
 }

 /*
@@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
-			IORING_SETUP_NO_MMAP))
+			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY))
 		return -EINVAL;

 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 259bf798a390..9b8dfb3bb2b4 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
 int io_uring_alloc_task_context(struct task_struct *task,
 				struct io_ring_ctx *ctx);

+int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
+				     int start, int end);
+
 int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
 int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 3a8d1dd97e1b..c043fe93a3f2 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -208,29 +208,38 @@ void io_uring_unreg_ringfd(void)
 	}
 }

+int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
+				     int start, int end)
+{
+	int offset;
+	for (offset = start; offset < end; offset++) {
+		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
+		if (tctx->registered_rings[offset])
+			continue;
+
+		tctx->registered_rings[offset] = file;
+		return offset;
+	}
+	return -EBUSY;
+}
+
 static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd,
 				     int start, int end)
 {
 	struct file *file;
 	int offset;

-	for (offset = start; offset < end; offset++) {
-		offset = array_index_nospec(offset, IO_RINGFD_REG_MAX);
-		if (tctx->registered_rings[offset])
-			continue;
-
-		file = fget(fd);
-		if (!file) {
-			return -EBADF;
-		} else if (!io_is_uring_fops(file)) {
-			fput(file);
-			return -EOPNOTSUPP;
-		}
-		tctx->registered_rings[offset] = file;
-		return offset;
+	file = fget(fd);
+	if (!file) {
+		return -EBADF;
+	} else if (!io_is_uring_fops(file)) {
+		fput(file);
+		return -EOPNOTSUPP;
 	}
-
-	return -EBUSY;
+	offset = io_ring_add_registered_file(tctx, file, start, end);
+	if (offset < 0)
+		fput(file);
+	return offset;
 }

 /*