mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2024-11-01 08:58:07 +00:00
for-6.12/io_uring-20240922
-----BEGIN PGP SIGNATURE-----

iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmbvv30QHGF4Ym9lQGtl
cm5lbC5kawAKCRD301j7KXHgpj3+EACs346FzM8PlZe1GxBZ6OnQX80blwoldAxC
+Abl5xjoJKUgA7rY3lJVBRNR6olA/4I2VD3g8b3RT6lpd/oKzPFg7FOj5Dc/oN+c
Fo6C7zZdr8caokpL4pfwgyG8ZNssQgRg8e0kRSw8A7AMo1zUazqAXtxjRzeMEOLC
1kWRYGdHCbVjx+hRIyX6KKP427Z5nXvcqFOC0BOpd5jDNYVh9WjNNyUE7trkGJ7o
1cjlpaaOURS0yU/4hue6tRnM8LDjaImyTyISvBWzKfKvpc19K1alOQNvHIoIeiBQ
5MgCNkSpbRmUTrydYVEQXl0Cia2d5+0KQsavUB9nZ8M++NftbRr/i26xT8ReZzXI
NjaedDF+MyOKeJaft2ZeKH8GgWolysMBa4e89CveRxosa/6gwHCkkB4UK9b3gaBB
Fij1zh/7fIVG7Tz8yNUDyGe6DzOEol1bn1KnL35/9nuCCRnSAM0vRPwJSkurlQ8B
PqVUS3BArn+LQZmSZ3HJVKOHv2QAY8etqWizvVmu4DB9Ar+uZ6Ur2uwfMN9JAODP
Fm2qVvxS73QlrvisdbnVbTzqBnqh3Rs4mb5my/gCWO1s67qtu3abSJCSzcnyxQdd
yBMDegJxTNv6DErNjPEF4qDODwSTIzswr//kOeLns1EtDGfrK8nxUfIKPQUwLSTO
Y7h2ru83uA==
=goTY
-----END PGP SIGNATURE-----

Merge tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Mostly just a set of fixes in here, or little changes that didn't get
  included in the initial pull request. This contains:

   - Move the SQPOLL napi polling outside the submission lock (Olivier)

   - Rename of the "copy buffers" API that got added in the 6.12 merge
     window. There's really no copying going on, it's just referencing
     the buffers. After a bit of consideration, decided that it was
     better to simply rename this to avoid potential confusion (me)

   - Shrink struct io_mapped_ubuf from 48 to 32 bytes, by changing it to
     start + len tracking rather than having start / end in there, and
     by removing the caching of folio_mask when we can just calculate it
     from folio_shift when we need it (me)

   - Fixes for the SQPOLL affinity checking (me, Felix)

   - Fix for how cqring waiting checks for the presence of task_work.
     Just check it directly rather than check for a specific
     notification mechanism (me)

   - Tweak to how request linking is represented in tracing (me)

   - Fix a syzbot report that deliberately sets up a huge list of
     overflow entries, and then hits rcu stalls when flushing this list.
     Just check for the need to preempt, and drop/reacquire locks in the
     loop. There's no state maintained over the loop itself, and each
     entry is yanked from head-of-list (me)"

* tag 'for-6.12/io_uring-20240922' of git://git.kernel.dk/linux:
  io_uring: check if we need to reschedule during overflow flush
  io_uring: improve request linking trace
  io_uring: check for presence of task_work rather than TIF_NOTIFY_SIGNAL
  io_uring/sqpoll: do the napi busy poll outside the submission block
  io_uring: clean up a type in io_uring_register_get_file()
  io_uring/sqpoll: do not put cpumask on stack
  io_uring/sqpoll: retain test for whether the CPU is valid
  io_uring/rsrc: change ubuf->ubuf_end to length tracking
  io_uring/rsrc: get rid of io_mapped_ubuf->folio_mask
  io_uring: rename "copy buffers" to "clone buffers"
This commit is contained in commit 3147a0689d.
8 changed files with 55 additions and 35 deletions
include/uapi/linux/io_uring.h

@@ -609,8 +609,8 @@ enum io_uring_register_op {
 
 	IORING_REGISTER_CLOCK			= 29,
 
-	/* copy registered buffers from source ring to current ring */
-	IORING_REGISTER_COPY_BUFFERS		= 30,
+	/* clone registered buffers from source ring to current ring */
+	IORING_REGISTER_CLONE_BUFFERS		= 30,
 
 	/* this goes last */
 	IORING_REGISTER_LAST,

@@ -701,7 +701,7 @@ enum {
 	IORING_REGISTER_SRC_REGISTERED = 1,
 };
 
-struct io_uring_copy_buffers {
+struct io_uring_clone_buffers {
 	__u32	src_fd;
 	__u32	flags;
 	__u32	pad[6];
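The UAPI hunk above only renames the opcode and the struct; the semantics are unchanged. As context, here is a minimal userspace sketch of cloning the buffers registered on one ring into another through the raw io_uring_register(2) syscall. The helper name is illustrative, and it assumes kernel headers new enough to carry IORING_REGISTER_CLONE_BUFFERS and struct io_uring_clone_buffers:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Clone the fixed buffers registered on src_ring_fd into dst_ring_fd.
 * flags stays 0 because src_ring_fd is a normal ring fd here; set
 * IORING_REGISTER_SRC_REGISTERED in flags if it is a registered ring index.
 */
static int clone_registered_buffers(int dst_ring_fd, int src_ring_fd)
{
	struct io_uring_clone_buffers buf;

	memset(&buf, 0, sizeof(buf));
	buf.src_fd = src_ring_fd;

	/* nr_args must be 1, matching the check in __io_uring_register() */
	return syscall(__NR_io_uring_register, dst_ring_fd,
		       IORING_REGISTER_CLONE_BUFFERS, &buf, 1);
}

On the kernel side the destination ring only takes additional references on the source buffers, no data is copied, which is the reason for the rename from "copy" to "clone".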
io_uring/fdinfo.c

@@ -177,9 +177,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
 		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
-		unsigned int len = buf->ubuf_end - buf->ubuf;
 
-		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, len);
+		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
 	if (has_lock && !xa_empty(&ctx->personalities)) {
 		unsigned long index;
io_uring/io_uring.c

@@ -635,6 +635,21 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
 		}
 		list_del(&ocqe->list);
 		kfree(ocqe);
+
+		/*
+		 * For silly syzbot cases that deliberately overflow by huge
+		 * amounts, check if we need to resched and drop and
+		 * reacquire the locks if so. Nothing real would ever hit this.
+		 * Ideally we'd have a non-posting unlock for this, but hard
+		 * to care for a non-real case.
+		 */
+		if (need_resched()) {
+			io_cq_unlock_post(ctx);
+			mutex_unlock(&ctx->uring_lock);
+			cond_resched();
+			mutex_lock(&ctx->uring_lock);
+			io_cq_lock(ctx);
+		}
 	}
 
 	if (list_empty(&ctx->cq_overflow_list)) {

@@ -2164,7 +2179,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	 * conditions are true (normal request), then just queue it.
 	 */
 	if (unlikely(link->head)) {
-		trace_io_uring_link(req, link->head);
+		trace_io_uring_link(req, link->last);
 		link->last->link = req;
 		link->last = req;
 

@@ -2472,7 +2487,7 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
 		return 1;
 	if (unlikely(!llist_empty(&ctx->work_llist)))
 		return 1;
-	if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL)))
+	if (unlikely(task_work_pending(current)))
 		return 1;
 	if (unlikely(task_sigpending(current)))
 		return -EINTR;

@@ -2579,9 +2594,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		 * If we got woken because of task_work being processed, run it
 		 * now rather than let the caller do another wait loop.
 		 */
-		io_run_task_work();
 		if (!llist_empty(&ctx->work_llist))
 			io_run_local_work(ctx, nr_wait);
+		io_run_task_work();
 
 		/*
 		 * Non-local task_work will be run on exit to userspace, but
io_uring/register.c

@@ -542,11 +542,11 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clock(ctx, arg);
 		break;
-	case IORING_REGISTER_COPY_BUFFERS:
+	case IORING_REGISTER_CLONE_BUFFERS:
 		ret = -EINVAL;
 		if (!arg || nr_args != 1)
 			break;
-		ret = io_register_copy_buffers(ctx, arg);
+		ret = io_register_clone_buffers(ctx, arg);
 		break;
 	default:
 		ret = -EINVAL;

@@ -561,7 +561,7 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
  * true, then the registered index is used. Otherwise, the normal fd table.
  * Caller must call fput() on the returned file, unless it's an ERR_PTR.
  */
-struct file *io_uring_register_get_file(int fd, bool registered)
+struct file *io_uring_register_get_file(unsigned int fd, bool registered)
 {
 	struct file *file;
 
io_uring/register.h

@@ -4,6 +4,6 @@
 
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
-struct file *io_uring_register_get_file(int fd, bool registered);
+struct file *io_uring_register_get_file(unsigned int fd, bool registered);
 
 #endif
io_uring/rsrc.c

@@ -38,7 +38,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 static const struct io_mapped_ubuf dummy_ubuf = {
 	/* set invalid range, so io_import_fixed() fails meeting it */
 	.ubuf = -1UL,
-	.ubuf_end = 0,
+	.len = UINT_MAX,
 };
 
 int __io_account_mem(struct user_struct *user, unsigned long nr_pages)

@@ -991,16 +991,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->len = iov->iov_len;
 	imu->nr_bvecs = nr_pages;
 	imu->folio_shift = PAGE_SHIFT;
-	imu->folio_mask = PAGE_MASK;
-	if (coalesced) {
+	if (coalesced)
 		imu->folio_shift = data.folio_shift;
-		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
-	}
 	refcount_set(&imu->refs, 1);
-	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
+	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
 	*pimu = imu;
 	ret = 0;
 

@@ -1100,7 +1097,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end)))
 		return -EFAULT;
 	/* not inside the mapped region */
-	if (unlikely(buf_addr < imu->ubuf || buf_end > imu->ubuf_end))
+	if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len)))
 		return -EFAULT;
 
 	/*

@@ -1143,14 +1140,14 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~imu->folio_mask;
+			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
 		}
 	}
 
 	return 0;
 }
 
-static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
 	struct io_mapped_ubuf **user_bufs;
 	struct io_rsrc_data *data;

@@ -1214,9 +1211,9 @@ static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
  *
  * Since the memory is already accounted once, don't account it again.
  */
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 {
-	struct io_uring_copy_buffers buf;
+	struct io_uring_clone_buffers buf;
 	bool registered_src;
 	struct file *file;
 	int ret;

@@ -1234,7 +1231,7 @@ int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	file = io_uring_register_get_file(buf.src_fd, registered_src);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
-	ret = io_copy_buffers(ctx, file->private_data);
+	ret = io_clone_buffers(ctx, file->private_data);
 	if (!registered_src)
 		fput(file);
 	return ret;
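The rsrc.c hunks above stop caching folio_mask in struct io_mapped_ubuf and instead derive the intra-folio offset from folio_shift at the point of use. A small standalone sketch, with illustrative values rather than kernel code, of the identity that makes the two forms interchangeable:

#include <assert.h>
#include <stdio.h>

int main(void)
{
	/* illustrative values: a 16K folio and an arbitrary user address */
	unsigned int folio_shift = 14;
	unsigned long addr = 0x7f3a12345678UL;

	/* what the old code cached per buffer ... */
	unsigned long folio_mask = ~((1UL << folio_shift) - 1);
	/* ... and the two ways of getting the offset within the folio */
	unsigned long off_old = addr & ~folio_mask;
	unsigned long off_new = addr & ((1UL << folio_shift) - 1);

	assert(off_old == off_new);
	printf("intra-folio offset: %#lx\n", off_new);
	return 0;
}

Since the mask is trivially reconstructed from the shift, dropping the cached field is effectively free on the fast path and saves 8 bytes per registered buffer.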
io_uring/rsrc.h

@@ -42,12 +42,11 @@ struct io_rsrc_node {
 
 struct io_mapped_ubuf {
 	u64		ubuf;
-	u64		ubuf_end;
+	unsigned int	len;
 	unsigned int	nr_bvecs;
 	unsigned int	folio_shift;
-	unsigned long	acct_pages;
-	unsigned long	folio_mask;
 	refcount_t	refs;
+	unsigned long	acct_pages;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };
 

@@ -68,7 +67,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		    struct io_mapped_ubuf *imu,
 		    u64 buf_addr, size_t len);
 
-int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
+int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
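The rsrc.h hunk above is where the 48 to 32 byte shrink mentioned in the pull message comes from: ubuf_end becomes a 32-bit len, folio_mask goes away, and moving refs ahead of acct_pages removes the tail padding. A userspace sketch with stand-in types (not the kernel definitions) that reproduces the two sizes on a typical LP64 target:

#include <stdint.h>
#include <stdio.h>

/* refcount_t is modelled as a 32-bit counter and the flexible bvec[]
 * array is omitted, since it does not contribute to sizeof().
 */
struct old_io_mapped_ubuf {
	uint64_t	ubuf;
	uint64_t	ubuf_end;
	unsigned int	nr_bvecs;
	unsigned int	folio_shift;
	unsigned long	acct_pages;
	unsigned long	folio_mask;
	uint32_t	refs;		/* 4 bytes of tail padding follow */
};

struct new_io_mapped_ubuf {
	uint64_t	ubuf;
	unsigned int	len;
	unsigned int	nr_bvecs;
	unsigned int	folio_shift;
	uint32_t	refs;
	unsigned long	acct_pages;
};

int main(void)
{
	/* on x86-64 this prints "old: 48, new: 32" */
	printf("old: %zu, new: %zu\n",
	       sizeof(struct old_io_mapped_ubuf),
	       sizeof(struct new_io_mapped_ubuf));
	return 0;
}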
io_uring/sqpoll.c

@@ -196,9 +196,6 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 		ret = io_submit_sqes(ctx, to_submit);
 		mutex_unlock(&ctx->uring_lock);
 
-		if (io_napi(ctx))
-			ret += io_napi_sqpoll_busy_poll(ctx);
-
 		if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
 			wake_up(&ctx->sqo_sq_wait);
 	if (creds)

@@ -323,6 +320,10 @@ static int io_sq_thread(void *data)
 		if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
 			sqt_spin = true;
 
+		list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+			if (io_napi(ctx))
+				io_napi_sqpoll_busy_poll(ctx);
+
 		if (sqt_spin || !time_after(jiffies, timeout)) {
 			if (sqt_spin) {
 				io_sq_update_worktime(sqd, &start);

@@ -461,13 +462,22 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		return 0;
 
 	if (p->flags & IORING_SETUP_SQ_AFF) {
-		struct cpumask allowed_mask;
+		cpumask_var_t allowed_mask;
 		int cpu = p->sq_thread_cpu;
 
 		ret = -EINVAL;
-		cpuset_cpus_allowed(current, &allowed_mask);
-		if (!cpumask_test_cpu(cpu, &allowed_mask))
+		if (cpu >= nr_cpu_ids || !cpu_online(cpu))
 			goto err_sqpoll;
+		ret = -ENOMEM;
+		if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+			goto err_sqpoll;
+		ret = -EINVAL;
+		cpuset_cpus_allowed(current, allowed_mask);
+		if (!cpumask_test_cpu(cpu, allowed_mask)) {
+			free_cpumask_var(allowed_mask);
+			goto err_sqpoll;
+		}
+		free_cpumask_var(allowed_mask);
 		sqd->sq_cpu = cpu;
 	} else {
 		sqd->sq_cpu = -1;