From 0df96fb71a395b4fc9c80180306420c743f395a8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Nov 2023 09:26:13 -0600 Subject: [PATCH 1/7] io_uring/rw: don't attempt to allocate async data if opcode doesn't need it The new read multishot method doesn't need to allocate async data ever, as it doesn't do vectored IO and it must only be used with provided buffers. While it doesn't have ->prep_async() set, it also sets ->async_size to 0, which is different from any other read/write type we otherwise support. If it's used on a file type that isn't pollable, we do try and allocate this async data, and then try and use that data. But since we passed in a size of 0 for the data, we get a NULL back on data allocation. We then proceed to dereference that to copy state, and that obviously won't end well. Add a check in io_setup_async_rw() for this condition, and avoid copying state. Also add a check for whether or not buffer selection is specified in prep while at it. Fixes: fc68fcda0491 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Link: https://bugzilla.kernel.org/show_bug.cgi?id=218101 Signed-off-by: Jens Axboe --- io_uring/rw.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 3398e1d944c2..1c76de483ef6 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -131,6 +131,10 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { int ret; + /* must be used with provided buffers */ + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + ret = io_prep_rw(req, sqe); if (unlikely(ret)) return ret; @@ -542,6 +546,9 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, { if (!force && !io_cold_defs[req->opcode].prep_async) return 0; + /* opcode type doesn't need async data */ + if (!io_cold_defs[req->opcode].async_size) + return 0; if (!req_has_async_data(req)) { struct io_async_rw *iorw; From f8f9ab2d98116e79d220f1d089df7464ad4e026d Mon Sep 17 00:00:00 2001 From: 
Jens Axboe Date: Fri, 3 Nov 2023 10:35:40 -0600 Subject: [PATCH 2/7] io_uring/net: ensure socket is marked connected on connect retry io_uring does non-blocking connection attempts, which can yield some unexpected results if a connect request is re-attempted by an application. This is equivalent to the following sync syscall sequence: sock = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK, IPPROTO_TCP); connect(sock, &addr, sizeof(addr)); ret == -1 and errno == EINPROGRESS expected here. Now poll for POLLOUT on sock, and when that returns, we expect the socket to be connected. But if we follow that procedure with: connect(sock, &addr, sizeof(addr)); you'd expect ret == -1 and errno == EISCONN here, but you actually get ret == 0. If we attempt the connection one more time, then we get EISCONN as expected. io_uring used to do this, but turns out that bluetooth fails with EBADFD if you attempt to re-connect. Also looks like EISCONN _could_ occur with this sequence. Retain the ->in_progress logic, but work around a potential EISCONN or EBADFD error and only in those cases look at the sock_error(). This should work in general and avoid the odd sequence of a repeated connect request returning success when the socket is already connected. This is all a side effect of the socket state being in a CONNECTING state when we get EINPROGRESS, and only a re-connect or other related operation will turn that into CONNECTED. 
Cc: stable@vger.kernel.org Fixes: 3fb1bd688172 ("io_uring/net: handle -EINPROGRESS correct for IORING_OP_CONNECT") Link: https://github.com/axboe/liburing/issues/980 Signed-off-by: Jens Axboe --- io_uring/net.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 7a8e298af81b..75d494dad7e2 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1461,16 +1461,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (connect->in_progress) { - struct socket *socket; - - ret = -ENOTSOCK; - socket = sock_from_file(req->file); - if (socket) - ret = sock_error(socket->sk); - goto out; - } - if (req_has_async_data(req)) { io = req->async_data; } else { @@ -1490,9 +1480,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) && force_nonblock) { if (ret == -EINPROGRESS) { connect->in_progress = true; - return -EAGAIN; - } - if (ret == -ECONNABORTED) { + } else if (ret == -ECONNABORTED) { if (connect->seen_econnaborted) goto out; connect->seen_econnaborted = true; @@ -1506,6 +1494,16 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } + if (connect->in_progress) { + /* + * At least bluetooth will return -EBADFD on a re-connect + * attempt, and it's (supposedly) also valid to get -EISCONN + * which means the previous result is good. For both of these, + * grab the sock_error() and use that for the completion. 
+ */ + if (ret == -EBADFD || ret == -EISCONN) + ret = sock_error(sock_from_file(req->file)->sk); + } if (ret == -ERESTARTSYS) ret = -EINTR; out: From 0e984ec88da9747549227900e5215c5e6a1b65ae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Nov 2023 07:41:17 -0700 Subject: [PATCH 3/7] io_uring/rw: add separate prep handler for readv/writev Rather than sprinkle opcode checks in the generic read/write prep handler, have a separate prep handler for the vectored readv/writev operation. Signed-off-by: Jens Axboe --- io_uring/opdef.c | 4 ++-- io_uring/rw.c | 22 +++++++++++++++------- io_uring/rw.h | 1 + 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 25a3515a177c..0521a26bc6cd 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -66,7 +66,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rw, + .prep = io_prep_rwv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -80,7 +80,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rw, + .prep = io_prep_rwv, .issue = io_write, }, [IORING_OP_FSYNC] = { diff --git a/io_uring/rw.c b/io_uring/rw.c index 1c76de483ef6..63d343bae762 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -110,15 +110,23 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); + return 0; +} - /* Have to do this validation here, as this is in io_read() rw->len might - * have chanaged due to buffer selection +int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = io_prep_rw(req, sqe); + if (unlikely(ret)) + return ret; + + /* + * Have to do this validation here, as this is in io_read() rw->len + * might have chanaged due to buffer selection */ - if (req->opcode == IORING_OP_READV && req->flags & REQ_F_BUFFER_SELECT) { 
- ret = io_iov_buffer_select_prep(req); - if (ret) - return ret; - } + if (req->flags & REQ_F_BUFFER_SELECT) + return io_iov_buffer_select_prep(req); return 0; } diff --git a/io_uring/rw.h b/io_uring/rw.h index c5aed03d42a4..32aa7937513a 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -16,6 +16,7 @@ struct io_async_rw { }; int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); From f688944cfb810986c626cb13d95bc666e5c8a36c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 6 Nov 2023 07:43:16 -0700 Subject: [PATCH 4/7] io_uring/rw: add separate prep handler for fixed read/write Rather than sprinkle opcode checks in the generic read/write prep handler, have a separate prep handler for the fixed read/write operations. Signed-off-by: Jens Axboe --- io_uring/opdef.c | 4 ++-- io_uring/rw.c | 30 ++++++++++++++++++------------ io_uring/rw.h | 1 + 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 0521a26bc6cd..799db44283c7 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -98,7 +98,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_rw_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -111,7 +111,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_rw_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { diff --git a/io_uring/rw.c b/io_uring/rw.c index 63d343bae762..9e3e56b74e35 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -83,18 +83,6 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) /* used for fixed read/write too - just read unconditionally */ 
req->buf_index = READ_ONCE(sqe->buf_index); - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } - ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { ret = ioprio_check_cap(ioprio); @@ -131,6 +119,24 @@ int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + u16 index; + int ret; + + ret = io_prep_rw(req, sqe); + if (unlikely(ret)) + return ret; + + if (unlikely(req->buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); + req->imu = ctx->user_bufs[index]; + io_req_set_rsrc_node(req, ctx, 0); + return 0; +} + /* * Multishot read is prepared just like a normal read/write request, only * difference is that we set the MULTISHOT flag. 
diff --git a/io_uring/rw.h b/io_uring/rw.h index 32aa7937513a..f9e89b4fe4da 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -17,6 +17,7 @@ struct io_async_rw { int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); From 89d528ba2f8281de61163c6b62e598b64d832175 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 6 Nov 2023 20:39:07 +0000 Subject: [PATCH 5/7] io_uring: indicate if io_kbuf_recycle did recycle anything It can be useful to know if io_kbuf_recycle did actually recycle the buffer on the request, or if it left the request alone. Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20231106203909.197089-2-dyudaken@gmail.com Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 6 +++--- io_uring/kbuf.h | 13 ++++++++----- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index fea06810b43d..a1e4239c7d75 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -52,7 +52,7 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } -void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) +bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -65,7 +65,7 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) * multiple use. 
*/ if (req->flags & REQ_F_PARTIAL_IO) - return; + return false; io_ring_submit_lock(ctx, issue_flags); @@ -76,7 +76,7 @@ void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); - return; + return true; } unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index d14345ef61fc..f2d615236b2c 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -53,11 +53,11 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); -void io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid); -static inline void io_kbuf_recycle_ring(struct io_kiocb *req) +static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { /* * We don't need to recycle for REQ_F_BUFFER_RING, we can just clear @@ -80,8 +80,10 @@ static inline void io_kbuf_recycle_ring(struct io_kiocb *req) } else { req->buf_index = req->buf_list->bgid; req->flags &= ~REQ_F_BUFFER_RING; + return true; } } + return false; } static inline bool io_do_buffer_select(struct io_kiocb *req) @@ -91,12 +93,13 @@ static inline bool io_do_buffer_select(struct io_kiocb *req) return !(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)); } -static inline void io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) +static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) { if (req->flags & REQ_F_BUFFER_SELECTED) - io_kbuf_recycle_legacy(req, issue_flags); + return io_kbuf_recycle_legacy(req, issue_flags); if (req->flags & REQ_F_BUFFER_RING) - io_kbuf_recycle_ring(req); + return io_kbuf_recycle_ring(req); + return false; } static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req, From 
49fbe99486786661994a55ced855c31d966bbdf0 Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 6 Nov 2023 20:39:08 +0000 Subject: [PATCH 6/7] io_uring: do not allow multishot read to set addr or len For addr: this field is not used, since buffer select is forced. But by forcing it to be zero it leaves open future uses of the field. len is actually usable, you could imagine that you want to receive multishot up to a certain length. However right now this is not how it is implemented, and it seems safer to force this to be zero. Fixes: fc68fcda0491 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20231106203909.197089-3-dyudaken@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/io_uring/rw.c b/io_uring/rw.c index 9e3e56b74e35..8321e004ab13 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -143,6 +143,7 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) */ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); int ret; /* must be used with provided buffers */ @@ -153,6 +154,9 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ret)) return ret; + if (rw->addr || rw->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; return 0; } From e53759298a7d7e98c3e5c2440d395d19cea7d6bf Mon Sep 17 00:00:00 2001 From: Dylan Yudaken Date: Mon, 6 Nov 2023 20:39:09 +0000 Subject: [PATCH 7/7] io_uring: do not clamp read length for multishot read When doing a multishot read, the code path reuses the old read paths. However this breaks an assumption built into those paths, namely that struct io_rw::len is available for reuse by __io_import_iovec. For multishot this results in len being set for the first receive call, and then subsequent calls are clamped to that buffer length incorrectly. 
Instead keep len as zero after recycling buffers, to reuse the full buffer size of the next selected buffer. Fixes: fc68fcda0491 ("io_uring/rw: add support for IORING_OP_READ_MULTISHOT") Signed-off-by: Dylan Yudaken Link: https://lore.kernel.org/r/20231106203909.197089-4-dyudaken@gmail.com Signed-off-by: Jens Axboe --- io_uring/rw.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 8321e004ab13..64390d4e20c1 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -912,6 +912,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned int cflags = 0; int ret; @@ -928,7 +929,12 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * handling arm it. */ if (ret == -EAGAIN) { - io_kbuf_recycle(req, issue_flags); + /* + * Reset rw->len to 0 again to avoid clamping future mshot + * reads, in case the buffer size varies. + */ + if (io_kbuf_recycle(req, issue_flags)) + rw->len = 0; return -EAGAIN; } @@ -941,6 +947,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * jump to the termination path. This request is then done. */ cflags = io_put_kbuf(req, issue_flags); + rw->len = 0; /* similarly to above, reset len to 0 */ if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER,