iov_iter.3-5.15-2021-09-17

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmFEikcQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpmG4D/93W/CdNgw88WFkYPfjwICKHOcSDZhGqMzh
 Ug1cp4BP8lPkiCvyC8VfM3XMBUWf9j8Ijb4X7b+wjuBWaNQdJHlcb1XSEQj4sh8/
 w6MUGUz76/z1z6DE0HzzPHRZyrdog+oW9jZ+qpKCjguVBcs4eu3NdY3LbDcrVvzV
 xzi3o52NbvpHdgWl6LuQqJiIq0twG/6RiguKfqZDfxZxPq6m3cSgjWRLquAV9nUJ
 +S6/wyGkaRK3qPMTtphWyL9TM1pr+od8K5tfKYlgdjsAoCkqIzpIJUR62rTKz3Be
 jjPLxkP0TkE3YPRCjyvZR1Eb7ZwgfuyCszWnGtmBmOt5/JXDUPXEqiQPCg7rVj47
 6x2JGe/bglCnSTWwYSvOQNJDqRVBiXBr59jOvSWNTFO2Tj5v9Q0dk2etgMYwA9oS
 k5vdDhFLNW5T4aibNbpJFJctZaHu9N1rFkzvW4DTdur7lj64ePRMtugaU2F9PhBt
 VwQlkjcuvz5GBjpwS6QdZ78ro0oUSgGOhYiRHJ8JUHJOqDv4SChyC3Tf9sD7ELzZ
 /JJNviD8/iv8ZpHNKGlbwFdive4CxqXIrOYaTycrDJ32/oQkYnEWIaLMmGHaF/F+
 hasiUdS5D277DVz2/R2e0e2s8YXhkmRipoHjEdq57zk7PqRolheVQdaqYuCSmtwH
 MjcJi1hi6g==
 =TnwU
 -----END PGP SIGNATURE-----

Merge tag 'iov_iter.3-5.15-2021-09-17' of git://git.kernel.dk/linux-block

Pull io_uring iov_iter retry fixes from Jens Axboe:
 "This adds a helper to save/restore iov_iter state, and modifies
  io_uring to use it.

  After that is done, we can now kill the iter->truncated addition that
  we added for this release. The io_uring change is being overly
  cautious with the save/restore/advance, but better safe than sorry and
  we can always improve that and reduce the overhead if it proves to be
  of concern. The only case to be worried about in this regard is huge
  IO, where iteration can take a while to iterate segments.

  I spent some time writing test cases, and expanded the coverage quite
  a bit from the last posting of this. liburing carries this regression
  test case now:

      https://git.kernel.dk/cgit/liburing/tree/test/file-verify.c

  which exercises all of this. It now also supports provided buffers,
  and explicitly tests for end-of-file/device truncation as well.

  On top of that, Pavel sanitized the IOPOLL retry path to follow the
  exact same pattern as normal IO"

* tag 'iov_iter.3-5.15-2021-09-17' of git://git.kernel.dk/linux-block:
  io_uring: move iopoll reissue into regular IO path
  Revert "iov_iter: track truncated size"
  io_uring: use iov_iter state save/restore helpers
  iov_iter: add helper to save iov_iter state
This commit is contained in:
Linus Torvalds 2021-09-17 09:23:44 -07:00
commit ddf21bd8ab
3 changed files with 128 additions and 45 deletions

View File

@ -712,6 +712,7 @@ struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
const struct iovec *free_iovec;
struct iov_iter iter;
struct iov_iter_state iter_state;
size_t bytes_done;
struct wait_page_queue wpq;
};
@ -735,7 +736,6 @@ enum {
REQ_F_BUFFER_SELECTED_BIT,
REQ_F_COMPLETE_INLINE_BIT,
REQ_F_REISSUE_BIT,
REQ_F_DONT_REISSUE_BIT,
REQ_F_CREDS_BIT,
REQ_F_REFCOUNT_BIT,
REQ_F_ARM_LTIMEOUT_BIT,
@ -782,8 +782,6 @@ enum {
REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
/* don't attempt request reissue, see io_rw_reissue() */
REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT),
/* supports async reads */
REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT),
/* supports async writes */
@ -2444,13 +2442,6 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
req = list_first_entry(done, struct io_kiocb, inflight_entry);
list_del(&req->inflight_entry);
if (READ_ONCE(req->result) == -EAGAIN &&
!(req->flags & REQ_F_DONT_REISSUE)) {
req->iopoll_completed = 0;
io_req_task_queue_reissue(req);
continue;
}
__io_cqring_fill_event(ctx, req->user_data, req->result,
io_put_rw_kbuf(req));
(*nr_events)++;
@ -2613,8 +2604,7 @@ static bool io_resubmit_prep(struct io_kiocb *req)
if (!rw)
return !io_req_prep_async(req);
/* may have left rw->iter inconsistent on -EIOCBQUEUED */
iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter));
iov_iter_restore(&rw->iter, &rw->iter_state);
return true;
}
@ -2714,10 +2704,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
if (unlikely(res != req->result)) {
if (!(res == -EAGAIN && io_rw_should_reissue(req) &&
io_resubmit_prep(req))) {
req_set_fail(req);
req->flags |= REQ_F_DONT_REISSUE;
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE;
return;
}
}
@ -2937,7 +2926,6 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
struct io_async_rw *io = req->async_data;
bool check_reissue = kiocb->ki_complete == io_complete_rw;
/* add previously done IO, if any */
if (io && io->bytes_done > 0) {
@ -2949,19 +2937,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (ret >= 0 && check_reissue)
if (ret >= 0 && (kiocb->ki_complete == io_complete_rw))
__io_complete_rw(req, ret, 0, issue_flags);
else
io_rw_done(kiocb, ret);
if (check_reissue && (req->flags & REQ_F_REISSUE)) {
if (req->flags & REQ_F_REISSUE) {
req->flags &= ~REQ_F_REISSUE;
if (io_resubmit_prep(req)) {
io_req_task_queue_reissue(req);
} else {
unsigned int cflags = io_put_rw_kbuf(req);
struct io_ring_ctx *ctx = req->ctx;
req_set_fail(req);
__io_req_complete(req, issue_flags, ret,
io_put_rw_kbuf(req));
if (issue_flags & IO_URING_F_NONBLOCK) {
mutex_lock(&ctx->uring_lock);
__io_req_complete(req, issue_flags, ret, cflags);
mutex_unlock(&ctx->uring_lock);
} else {
__io_req_complete(req, issue_flags, ret, cflags);
}
}
}
}
@ -3324,12 +3320,17 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec,
if (!force && !io_op_defs[req->opcode].needs_async_setup)
return 0;
if (!req->async_data) {
struct io_async_rw *iorw;
if (io_alloc_async_data(req)) {
kfree(iovec);
return -ENOMEM;
}
io_req_map_rw(req, iovec, fast_iov, iter);
iorw = req->async_data;
/* we've copied and mapped the iter, ensure state is saved */
iov_iter_save_state(&iorw->iter, &iorw->iter_state);
}
return 0;
}
@ -3348,6 +3349,7 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
iorw->free_iovec = iov;
if (iov)
req->flags |= REQ_F_NEED_CLEANUP;
iov_iter_save_state(&iorw->iter, &iorw->iter_state);
return 0;
}
@ -3451,19 +3453,28 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
if (rw) {
iter = &rw->iter;
state = &rw->iter_state;
/*
* We come here from an earlier attempt, restore our state to
* match in case it doesn't. It's cheap enough that we don't
* need to make this conditional.
*/
iov_iter_restore(iter, state);
iovec = NULL;
} else {
ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
state = &__state;
iov_iter_save_state(iter, state);
}
io_size = iov_iter_count(iter);
req->result = io_size;
req->result = iov_iter_count(iter);
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@ -3477,7 +3488,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return ret ?: -EAGAIN;
}
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size);
ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret)) {
kfree(iovec);
return ret;
@ -3493,30 +3504,49 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
/* no retry on NONBLOCK nor RWF_NOWAIT */
if (req->flags & REQ_F_NOWAIT)
goto done;
/* some cases will consume bytes even on error returns */
iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
ret = 0;
} else if (ret == -EIOCBQUEUED) {
goto out_free;
} else if (ret <= 0 || ret == io_size || !force_nonblock ||
} else if (ret <= 0 || ret == req->result || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
/* read all, failed, already did sync or don't want to retry */
goto done;
}
/*
* Don't depend on the iter state matching what was consumed, or being
* untouched in case of error. Restore it and we'll advance it
* manually if we need to.
*/
iov_iter_restore(iter, state);
ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true);
if (ret2)
return ret2;
iovec = NULL;
rw = req->async_data;
/* now use our persistent iterator, if we aren't already */
iter = &rw->iter;
/*
* Now use our persistent iterator and state, if we aren't already.
* We've restored and mapped the iter to match.
*/
if (iter != &rw->iter) {
iter = &rw->iter;
state = &rw->iter_state;
}
do {
io_size -= ret;
/*
* We end up here because of a partial read, either from
* above or inside this loop. Advance the iter by the bytes
* that were consumed.
*/
iov_iter_advance(iter, ret);
if (!iov_iter_count(iter))
break;
rw->bytes_done += ret;
iov_iter_save_state(iter, state);
/* if we can retry, do so with the callbacks armed */
if (!io_rw_should_retry(req)) {
kiocb->ki_flags &= ~IOCB_WAITQ;
@ -3534,7 +3564,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
return 0;
/* we got some bytes, but not all. retry. */
kiocb->ki_flags &= ~IOCB_WAITQ;
} while (ret > 0 && ret < io_size);
iov_iter_restore(iter, state);
} while (ret > 0);
done:
kiocb_done(kiocb, ret, issue_flags);
out_free:
@ -3557,19 +3588,24 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
struct io_async_rw *rw = req->async_data;
ssize_t ret, ret2, io_size;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
struct iov_iter_state __state, *state;
ssize_t ret, ret2;
if (rw) {
iter = &rw->iter;
state = &rw->iter_state;
iov_iter_restore(iter, state);
iovec = NULL;
} else {
ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
state = &__state;
iov_iter_save_state(iter, state);
}
io_size = iov_iter_count(iter);
req->result = io_size;
req->result = iov_iter_count(iter);
ret2 = 0;
/* Ensure we clear previously set non-block flag */
if (!force_nonblock)
@ -3586,7 +3622,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags)
(req->flags & REQ_F_ISREG))
goto copy_iov;
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size);
ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result);
if (unlikely(ret))
goto out_free;
@ -3633,9 +3669,9 @@ done:
kiocb_done(kiocb, ret2, issue_flags);
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
iov_iter_reexpand(iter, iter->count + iter->truncated);
iov_iter_revert(iter, io_size - iov_iter_count(iter));
iov_iter_restore(iter, state);
if (ret2 > 0)
iov_iter_advance(iter, ret2);
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
return ret ?: -EAGAIN;
}

View File

@ -27,6 +27,12 @@ enum iter_type {
ITER_DISCARD,
};
struct iov_iter_state {
size_t iov_offset;
size_t count;
unsigned long nr_segs;
};
struct iov_iter {
u8 iter_type;
bool data_source;
@ -47,7 +53,6 @@ struct iov_iter {
};
loff_t xarray_start;
};
size_t truncated;
};
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
@ -55,6 +60,14 @@ static inline enum iter_type iov_iter_type(const struct iov_iter *i)
return i->iter_type;
}
static inline void iov_iter_save_state(struct iov_iter *iter,
struct iov_iter_state *state)
{
state->iov_offset = iter->iov_offset;
state->count = iter->count;
state->nr_segs = iter->nr_segs;
}
static inline bool iter_is_iovec(const struct iov_iter *i)
{
return iov_iter_type(i) == ITER_IOVEC;
@ -233,6 +246,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
@ -255,10 +269,8 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
* conversion in assignement is by definition greater than all
* values of size_t, including old i->count.
*/
if (i->count > count) {
i->truncated += i->count - count;
if (i->count > count)
i->count = count;
}
}
/*
@ -267,7 +279,6 @@ static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
*/
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
i->truncated -= count - i->count;
i->count = count;
}

View File

@ -1972,3 +1972,39 @@ int import_single_range(int rw, void __user *buf, size_t len,
return 0;
}
EXPORT_SYMBOL(import_single_range);
/**
* iov_iter_restore() - Restore a &struct iov_iter to the same state as when
* iov_iter_save_state() was called.
*
* @i: &struct iov_iter to restore
* @state: state to restore from
*
* Used after iov_iter_save_state() to bring restore @i, if operations may
* have advanced it.
*
* Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC
*/
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
!iov_iter_is_kvec(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
/*
* For the *vec iters, nr_segs + iov is constant - if we increment
* the vec, then we also decrement the nr_segs count. Hence we don't
* need to track both of these, just one is enough and we can deduct
* the other from that. ITER_KVEC and ITER_IOVEC are the same struct
* size, so we can just increment the iov pointer as they are unionzed.
* ITER_BVEC _may_ be the same size on some archs, but on others it is
* not. Be safe and handle it separately.
*/
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
else
i->iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
}