diff --git a/block/bio.c b/block/bio.c index b9642a41f286..c9223e9d31da 100644 --- a/block/bio.c +++ b/block/bio.c @@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, bio->bi_opf = opf; bio->bi_flags = 0; bio->bi_ioprio = 0; + bio->bi_write_hint = 0; bio->bi_status = 0; bio->bi_iter.bi_sector = 0; bio->bi_iter.bi_size = 0; @@ -813,6 +814,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { bio_set_flag(bio, BIO_CLONED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; if (bio->bi_bdev) { diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index e6468eab2681..b1e7415f8439 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/blk-merge.c b/block/blk-merge.c index 2d470cf2173e..2a06fd33039d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q, if (rq_data_dir(req) != rq_data_dir(next)) return NULL; + /* Don't merge requests with different write hints. */ + if (req->write_hint != next->write_hint) + return NULL; + if (req->ioprio != next->ioprio) return NULL; @@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; + /* Don't merge requests with different write hints. */ + if (rq->write_hint != bio->bi_write_hint) + return false; + if (rq->ioprio != bio_prio(bio)) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index 2dc01551e27c..9c475a89e3af 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2584,6 +2584,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ @@ -3175,6 +3176,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + rq->write_hint = rq_src->write_hint; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; diff --git a/block/bounce.c b/block/bounce.c index 7cfcb242f9a1..d6a5219f29dd 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa..63566a3db186 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; @@ -482,7 +485,7 @@ static void blkdev_readahead(struct readahead_control *rac) } static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, unsigned int len) { loff_t isize = i_size_read(inode); diff --git a/fs/buffer.c b/fs/buffer.c index 9a54077de87d..4f73d23c2c46 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1944,7 +1945,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, + enum rw_hint write_hint, struct writeback_control *wbc) { const enum req_op op = opf & REQ_OP_MASK; @@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_write_hint = write_hint; __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, void submit_bh(blk_opf_t opf, struct buffer_head *bh) { - submit_bh_wbc(opf, bh, NULL); + submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/direct-io.c b/fs/direct-io.c index 60456263a338..62c97ff9e852 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, bio->bi_end_io = dio_bio_end_io; if (dio->is_pinned) bio_set_flag(bio, BIO_PAGE_PINNED); + bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint; + sdio->bio = bio; sdio->logical_offset_in_bio = sdio->cur_page_fs_offset; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dda0aed95464..4a4e60cdac4e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/fs/fcntl.c b/fs/fcntl.c index c3e342eb74af..54cc85d3338e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) } #endif -static bool rw_hint_valid(enum rw_hint hint) +static bool rw_hint_valid(u64 hint) { + BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET); + BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE); + BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT); + BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM); + BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG); + BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME); + switch (hint) { case RWH_WRITE_LIFE_NOT_SET: case RWH_WRITE_LIFE_NONE: @@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint) } } -static long fcntl_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; - enum rw_hint hint; - u64 h; + u64 hint = READ_ONCE(inode->i_write_hint); - switch (cmd) { - case F_GET_RW_HINT: - h = inode->i_write_hint; - if (copy_to_user(argp, &h, sizeof(*argp))) - return -EFAULT; - return 0; - case F_SET_RW_HINT: - if (copy_from_user(&h, argp, sizeof(h))) - return -EFAULT; - hint = (enum rw_hint) h; - if (!rw_hint_valid(hint)) - return -EINVAL; + if (copy_to_user(argp, &hint, sizeof(*argp))) + return -EFAULT; + return 0; +} - inode_lock(inode); - inode->i_write_hint = hint; - inode_unlock(inode); - return 0; - default: +static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inode *inode = file_inode(file); + u64 __user *argp = (u64 __user *)arg; + u64 hint; + + if (copy_from_user(&hint, argp, sizeof(hint))) + return -EFAULT; + if (!rw_hint_valid(hint)) return -EINVAL; - } + + WRITE_ONCE(inode->i_write_hint, hint); + + /* + * file->f_mapping->host may differ from inode. As an example, + * blkdev_open() modifies file->f_mapping. + */ + if (file->f_mapping->host != inode) + WRITE_ONCE(file->f_mapping->host->i_write_hint, hint); + + return 0; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, @@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: + err = fcntl_get_rw_hint(filp, cmd, arg); + break; case F_SET_RW_HINT: - err = fcntl_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, cmd, arg); break; default: break; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index d9ccfd27e4f1..789af5c8fade 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -2465,7 +2465,7 @@ out: } static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, unsigned int len) { int ret; diff --git a/fs/inode.c b/fs/inode.c index 5c8a5250d0ac..d290f007b3d1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 093c4515b22a..4e8e41c8b3c0 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (C) 2016-2019 Christoph Hellwig. + * Copyright (C) 2016-2023 Christoph Hellwig. */ #include #include @@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio, return test_bit(block + blks_per_folio, ifs->state); } +static unsigned ifs_find_dirty_range(struct folio *folio, + struct iomap_folio_state *ifs, u64 *range_start, u64 range_end) +{ + struct inode *inode = folio->mapping->host; + unsigned start_blk = + offset_in_folio(folio, *range_start) >> inode->i_blkbits; + unsigned end_blk = min_not_zero( + offset_in_folio(folio, range_end) >> inode->i_blkbits, + i_blocks_per_folio(inode, folio)); + unsigned nblks = 1; + + while (!ifs_block_is_dirty(folio, ifs, start_blk)) + if (++start_blk == end_blk) + return 0; + + while (start_blk + nblks < end_blk) { + if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks)) + break; + nblks++; + } + + *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits); + return nblks << inode->i_blkbits; +} + +static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start, + u64 range_end) +{ + struct iomap_folio_state *ifs = folio->private; + + if (*range_start >= range_end) + return 0; + + if (ifs) + return ifs_find_dirty_range(folio, ifs, range_start, range_end); + return range_end - *range_start; +} + static void ifs_clear_range_dirty(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { @@ -1454,15 +1492,10 @@ out_unlock: EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, - size_t len, int error) + size_t len) { struct iomap_folio_state *ifs = folio->private; - if (error) { - folio_set_error(folio); - mapping_set_error(inode->i_mapping, error); - } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); @@ -1479,40 +1512,29 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) { struct inode *inode = ioend->io_inode; - struct bio *bio = &ioend->io_inline_bio; - struct bio *last = ioend->io_bio, *next; - u64 start = bio->bi_iter.bi_sector; - loff_t offset = ioend->io_offset; - bool quiet = bio_flagged(bio, BIO_QUIET); + struct bio *bio = &ioend->io_bio; + struct folio_iter fi; u32 folio_count = 0; - for (bio = &ioend->io_inline_bio; bio; bio = next) { - struct folio_iter fi; - - /* - * For the last bio, bi_private points to the ioend, so we - * need to explicitly end the iteration here. - */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; - } - bio_put(bio); - } - /* The ioend has been freed by bio_put() */ - - if (unlikely(error && !quiet)) { - printk_ratelimited(KERN_ERR + if (error) { + mapping_set_error(inode->i_mapping, error); + if (!bio_flagged(bio, BIO_QUIET)) { + pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); + } } + + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + if (error) + folio_set_error(fi.folio); + iomap_finish_folio_write(inode, fi.folio, fi.length); + folio_count++; + } + + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we've marked pages for writeback - * and unlocked them. In this situation, we need to fail the bio instead of - * submitting it. This typically only happens on a filesystem shutdown. + * the submission process has failed after we've marked pages for writeback. + * We cannot cancel ioend directly in that case, so call the bio end I/O handler + * with the error status here to run the normal I/O completion handler to clear + * the writeback bit and let the file system proess the errors. */ -static int -iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - int error) +static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; - - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(ioend, error); - if (error) { - /* - * If we're failing the IO now, just mark the ioend with an - * error and finish it. This will run IO completion immediately - * as there is only one reference to the ioend at this point in - * time. - */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); + if (!wpc->ioend) return error; + + /* + * Let the file systems prepare the I/O submission and hook in an I/O + * comletion handler. This also needs to happen in case after a + * failure happened so that the file system end I/O handler gets called + * to clean up. + */ + if (wpc->ops->prepare_ioend) + error = wpc->ops->prepare_ioend(wpc->ioend, error); + + if (error) { + wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&wpc->ioend->io_bio); + } else { + submit_bio(&wpc->ioend->io_bio); } - submit_bio(ioend->io_bio); - return 0; + wpc->ioend = NULL; + return error; } -static struct iomap_ioend * -iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - loff_t offset, sector_t sector, struct writeback_control *wbc) +static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct inode *inode, loff_t pos) { struct iomap_ioend *ioend; struct bio *bio; @@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); - bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; - ioend->io_folios = 0; - ioend->io_offset = offset; - ioend->io_bio = bio; - ioend->io_sector = sector; + ioend->io_offset = pos; + ioend->io_sector = bio->bi_iter.bi_sector; + + wpc->nr_folios = 0; return ioend; } -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). - */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); - - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; -} - -static bool -iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, - sector_t sector) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != (wpc->ioend->io_flags & IOMAP_F_SHARED)) return false; if (wpc->iomap.type != wpc->ioend->io_type) return false; - if (offset != wpc->ioend->io_offset + wpc->ioend->io_size) + if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (sector != bio_end_sector(wpc->ioend->io_bio)) + if (iomap_sector(&wpc->iomap, pos) != + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This * also prevents long tight loops ending page writeback on all the * folios in the ioend. */ - if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE) + if (wpc->nr_folios >= IOEND_BATCH_SIZE) return false; return true; } @@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to * first; otherwise finish off the current ioend and start another. + * + * If a new ioend is created and cached, the old ioend is submitted to the block + * layer instantly. Batching optimisations are provided by higher level block + * plugging. + * + * At the end of a writeback pass, there will be a cached ioend remaining on the + * writepage context that the caller will need to submit. */ -static void -iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct list_head *iolist) +static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, loff_t pos, unsigned len) { - sector_t sector = iomap_sector(&wpc->iomap, pos); - unsigned len = i_blocksize(inode); + struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) { - if (wpc->ioend) - list_add(&wpc->ioend->io_list, iolist); - wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc); + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: + error = iomap_submit_ioend(wpc, 0); + if (error) + return error; + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); + return 0; } -/* - * We implement an immediate ioend submission policy here to avoid needing to - * chain multiple ioends and hence nest mempool allocations which can violate - * the forward progress guarantees we need to provide. The current ioend we're - * adding blocks to is cached in the writepage context, and if the new block - * doesn't append to the cached ioend, it will create a new ioend and cache that - * instead. - * - * If a new ioend is created and cached, the old ioend is returned and queued - * locally for submission once the entire page is processed or an error has been - * detected. While ioends are submitted immediately after they are completed, - * batching optimisations are provided by higher level block plugging. - * - * At the end of a writeback pass, there will be a cached ioend remaining on the - * writepage context that the caller will need to submit. - */ -static int -iomap_writepage_map(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, - struct folio *folio, u64 end_pos) +static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio, + struct inode *inode, u64 pos, unsigned dirty_len, + unsigned *count) { - struct iomap_folio_state *ifs = folio->private; - struct iomap_ioend *ioend, *next; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); - u64 pos = folio_pos(folio); - int error = 0, count = 0, i; - LIST_HEAD(submit_list); + int error; - WARN_ON_ONCE(end_pos <= pos); + do { + unsigned map_len; - if (!ifs && nblocks > 1) { - ifs = ifs_alloc(inode, folio, 0); - iomap_set_range_dirty(folio, 0, end_pos - pos); - } - - WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); - - /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. - */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; - - error = wpc->ops->map_blocks(wpc, inode, pos); + error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len); if (error) break; - trace_iomap_writepage_map(inode, &wpc->iomap); - if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE)) - continue; - if (wpc->iomap.type == IOMAP_HOLE) - continue; - iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, - &submit_list); - count++; - } - if (count) - wpc->ioend->io_folios++; + trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap); - WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); - WARN_ON_ONCE(!folio_test_locked(folio)); - WARN_ON_ONCE(folio_test_writeback(folio)); - WARN_ON_ONCE(folio_test_dirty(folio)); + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O * completion to mark the error state of the pages under writeback * appropriately. + * + * Just let the file system know what portion of the folio failed to + * map. */ - if (unlikely(error)) { + if (error && wpc->ops->discard_folio) + wpc->ops->discard_folio(folio, pos); + return error; +} + +/* + * Check interaction of the folio with the file end. + * + * If the folio is entirely beyond i_size, return false. If it straddles + * i_size, adjust end_pos and zero all data beyond i_size. + */ +static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, + u64 *end_pos) +{ + u64 isize = i_size_read(inode); + + if (*end_pos > isize) { + size_t poff = offset_in_folio(folio, isize); + pgoff_t end_index = isize >> PAGE_SHIFT; + /* - * Let the filesystem know what portion of the current page - * failed to map. If the page hasn't been added to ioend, it - * won't be affected by I/O completion and we must unlock it - * now. + * If the folio is entirely ouside of i_size, skip it. + * + * This can happen due to a truncate operation that is in + * progress and in that case truncate will finish it off once + * we've dropped the folio lock. + * + * Note that the pgoff_t used for end_index is an unsigned long. + * If the given offset is greater than 16TB on a 32-bit system, + * then if we checked if the folio is fully outside i_size with + * "if (folio->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this folio would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. Instead, we can detect this situation by + * checking if the folio is totally beyond i_size or if its + * offset is just equal to the EOF. */ - if (wpc->ops->discard_folio) - wpc->ops->discard_folio(folio, pos); - if (!count) { - folio_unlock(folio); - goto done; - } + if (folio->index > end_index || + (folio->index == end_index && poff == 0)) + return false; + + /* + * The folio straddles i_size. + * + * It must be zeroed out on each and every writepage invocation + * because it may be mmapped: + * + * A file is mapped in multiples of the page size. For a + * file that is not a multiple of the page size, the + * remaining memory is zeroed when mapped, and writes to that + * region are not written out to the file. + * + * Also adjust the writeback range to skip all blocks entirely + * beyond i_size. + */ + folio_zero_segment(folio, poff, folio_size(folio)); + *end_pos = round_up(isize, i_blocksize(inode)); } + return true; +} + +static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, + struct writeback_control *wbc, struct folio *folio) +{ + struct iomap_folio_state *ifs = folio->private; + struct inode *inode = folio->mapping->host; + u64 pos = folio_pos(folio); + u64 end_pos = pos + folio_size(folio); + unsigned count = 0; + int error = 0; + u32 rlen; + + WARN_ON_ONCE(!folio_test_locked(folio)); + WARN_ON_ONCE(folio_test_dirty(folio)); + WARN_ON_ONCE(folio_test_writeback(folio)); + + trace_iomap_writepage(inode, pos, folio_size(folio)); + + if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) { + folio_unlock(folio); + return 0; + } + WARN_ON_ONCE(end_pos <= pos); + + if (i_blocks_per_folio(inode, folio) > 1) { + if (!ifs) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + /* + * Keep the I/O completion handler from clearing the writeback + * bit until we have submitted all blocks by adding a bias to + * ifs->write_bytes_pending, which is dropped after submitting + * all blocks. + */ + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0); + atomic_inc(&ifs->write_bytes_pending); + } + + /* + * Set the writeback bit ASAP, as the I/O completion for the single + * block per folio case happen hit as soon as we're submitting the bio. + */ + folio_start_writeback(folio); + + /* + * Walk through the folio to find dirty areas to write back. + */ + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); + if (error) + break; + pos += rlen; + } + + if (count) + wpc->nr_folios++; + /* * We can have dirty bits set past end of file in page_mkwrite path * while mapping the last partial folio. Hence it's better to clear * all the dirty bits in the folio here. */ iomap_clear_range_dirty(folio, 0, folio_size(folio)); - folio_start_writeback(folio); + + /* + * Usually the writeback bit is cleared by the I/O completion handler. + * But we may end up either not actually writing any blocks, or (when + * there are multiple blocks in a folio) all I/O might have finished + * already at this point. In that case we need to clear the writeback + * bit ourselves right after unlocking the page. + */ folio_unlock(folio); - - /* - * Preserve the original error if there was one; catch - * submission errors here and propagate into subsequent ioend - * submissions. - */ - list_for_each_entry_safe(ioend, next, &submit_list, io_list) { - int error2; - - list_del_init(&ioend->io_list); - error2 = iomap_submit_ioend(wpc, ioend, error); - if (error2 && !error) - error = error2; + if (ifs) { + if (atomic_dec_and_test(&ifs->write_bytes_pending)) + folio_end_writeback(folio); + } else { + if (!count) + folio_end_writeback(folio); } - - /* - * We can end up here with no error and nothing to write only if we race - * with a partial page truncate on a sub-page block sized filesystem. - */ - if (!count) - folio_end_writeback(folio); -done: mapping_set_error(inode->i_mapping, error); return error; } -/* - * Write out a dirty page. - * - * For delalloc space on the page, we need to allocate space and flush it. - * For unwritten space on the page, we need to start the conversion to - * regular allocated space. - */ static int iomap_do_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { - struct iomap_writepage_ctx *wpc = data; - struct inode *inode = folio->mapping->host; - u64 end_pos, isize; - - trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio)); - - /* - * Refuse to write the folio out if we're called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should never happen except in the case of a VM regression so - * warn about it. - */ - if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == - PF_MEMALLOC)) - goto redirty; - - /* - * Is this folio beyond the end of the file? - * - * The folio index is less than the end_index, adjust the end_pos - * to the highest offset that this folio should represent. - * ----------------------------------------------------- - * | file mapping | | - * ----------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | | - * ^--------------------------------^----------|-------- - * | desired writeback range | see else | - * ---------------------------------^------------------| - */ - isize = i_size_read(inode); - end_pos = folio_pos(folio) + folio_size(folio); - if (end_pos > isize) { - /* - * Check whether the page to write out is beyond or straddles - * i_size or not. - * ------------------------------------------------------- - * | file mapping | | - * ------------------------------------------------------- - * | Page ... | Page N-2 | Page N-1 | Page N | Beyond | - * ^--------------------------------^-----------|--------- - * | | Straddles | - * ---------------------------------^-----------|--------| - */ - size_t poff = offset_in_folio(folio, isize); - pgoff_t end_index = isize >> PAGE_SHIFT; - - /* - * Skip the page if it's fully outside i_size, e.g. - * due to a truncate operation that's in progress. We've - * cleaned this page and truncate will finish things off for - * us. - * - * Note that the end_index is unsigned long. If the given - * offset is greater than 16TB on a 32-bit system then if we - * checked if the page is fully outside i_size with - * "if (page->index >= end_index + 1)", "end_index + 1" would - * overflow and evaluate to 0. Hence this page would be - * redirtied and written out repeatedly, which would result in - * an infinite loop; the user program performing this operation - * would hang. Instead, we can detect this situation by - * checking if the page is totally beyond i_size or if its - * offset is just equal to the EOF. - */ - if (folio->index > end_index || - (folio->index == end_index && poff == 0)) - goto unlock; - - /* - * The page straddles i_size. It must be zeroed out on each - * and every writepage invocation because it may be mmapped. - * "A file is mapped in multiples of the page size. For a file - * that is not a multiple of the page size, the remaining - * memory is zeroed when mapped, and writes to that region are - * not written out to the file." - */ - folio_zero_segment(folio, poff, folio_size(folio)); - end_pos = isize; - } - - return iomap_writepage_map(wpc, wbc, inode, folio, end_pos); - -redirty: - folio_redirty_for_writepage(wbc, folio); -unlock: - folio_unlock(folio); - return 0; + return iomap_writepage_map(data, wbc, folio); } int @@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, { int ret; + /* + * Writeback from reclaim context should never happen except in the case + * of a VM regression so warn about it and refuse to write the data. + */ + if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) == + PF_MEMALLOC)) + return -EIO; + wpc->ops = ops; ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc); - if (!wpc->ioend) - return ret; - return iomap_submit_ioend(wpc, wpc->ioend, ret); + return iomap_submit_ioend(wpc, ret); } EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bcd3f8cf5ea4..f3b43d223a46 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, GFP_KERNEL); bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio->bi_write_hint = inode->i_write_hint; bio->bi_ioprio = dio->iocb->ki_ioprio; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index c16fd55f5595..0a991c4ce87d 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_iter_dstmap); DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -DEFINE_IOMAP_EVENT(iomap_writepage_map); + +TRACE_EVENT(iomap_writepage_map, + TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len, + struct iomap *iomap), + TP_ARGS(inode, pos, dirty_len, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, ino) + __field(u64, pos) + __field(u64, dirty_len) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + __field(u16, type) + __field(u16, flags) + __field(dev_t, bdev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pos = pos; + __entry->dirty_len = dirty_len; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->type = iomap->type; + __entry->flags = iomap->flags; + __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; + ), + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " + "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + MAJOR(__entry->bdev), MINOR(__entry->bdev), + __entry->pos, + __entry->dirty_len, + __entry->addr, + __entry->offset, + __entry->length, + __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) +); TRACE_EVENT(iomap_iter, TP_PROTO(struct iomap_iter *iter, const void *ops, @@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) + __field(s64, processed) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); + __entry->processed = iter->processed; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, + __entry->processed, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/mpage.c b/fs/mpage.c index 738882e0766d..fa8b99a199fa 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,7 @@ alloc_new: GFP_NOFS); bio->bi_iter.bi_sector = first_block << (blkbits - 9); wbc_init_bio(wbc, bio); + bio->bi_write_hint = inode->i_write_hint; } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 813f85156b0c..1698507d1ac7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. */ - error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -276,7 +276,8 @@ static int xfs_map_blocks( struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset) + loff_t offset, + unsigned int len) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -444,7 +445,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index dba5dcb62bef..3b103715acc9 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac) * which implies that the page range can only be within the fixed inode size. */ static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) + struct inode *inode, loff_t offset, + unsigned int len) { struct zonefs_zone *z = zonefs_inode_zone(inode); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..492b0128b5d9 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,6 +8,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -135,6 +136,7 @@ struct request { struct blk_crypto_keyslot *crypt_keyslot; #endif + enum rw_hint write_hint; unsigned short ioprio; enum mq_rq_state state; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..12d87cef2c03 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -10,6 +10,7 @@ #include #include #include +#include struct bio_set; struct bio; @@ -269,6 +270,7 @@ struct bio { */ unsigned short bi_flags; /* BIO_* below */ unsigned short bi_ioprio; + enum rw_hint bi_write_hint; blk_status_t bi_status; atomic_t __bi_remaining; diff --git a/include/linux/fs.h b/include/linux/fs.h index 2ba751d097c1..b26796ccc309 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -310,19 +311,6 @@ struct address_space; struct writeback_control; struct readahead_control; -/* - * Write life time hint values. - * Stored in struct inode as u8. - */ -enum rw_hint { - WRITE_LIFE_NOT_SET = 0, - WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, - WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, - WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, - WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, - WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, -}; - /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC @@ -680,7 +668,7 @@ struct inode { spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; - u8 i_write_hint; + enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 96dd0acbba44..6fc1c858013d 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -293,22 +293,32 @@ struct iomap_ioend { struct list_head io_list; /* next ioend in chain */ u16 io_type; u16 io_flags; /* IOMAP_F_* */ - u32 io_folios; /* folios added to ioend */ struct inode *io_inode; /* file being written to */ size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ + struct bio io_bio; /* MUST BE LAST! */ }; +static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio) +{ + return container_of(bio, struct iomap_ioend, io_bio); +} + struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. + * + * Can return arbitrarily large regions, but we need to call into it at + * least once per folio to allow the file systems to synchronize with + * the write path that could be invalidating mappings. + * + * An existing mapping from a previous call to this method can be reused + * by the file system if it is still valid. */ int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode, - loff_t offset); + loff_t offset, unsigned len); /* * Optional, allows the file systems to perform actions just before @@ -329,6 +339,7 @@ struct iomap_writepage_ctx { struct iomap iomap; struct iomap_ioend *ioend; const struct iomap_writeback_ops *ops; + u32 nr_folios; /* folios added to the ioend */ }; void iomap_finish_ioends(struct iomap_ioend *ioend, int error); diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h new file mode 100644 index 000000000000..309ca72f2dfb --- /dev/null +++ b/include/linux/rw_hint.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_RW_HINT_H +#define _LINUX_RW_HINT_H + +#include +#include +#include + +/* Block storage write lifetime hint values. */ +enum rw_hint { + WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET, + WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, + WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, + WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, + WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, + WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, +} __packed; + +/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */ +#ifndef __CHECKER__ +static_assert(sizeof(enum rw_hint) == 1); +#endif + +#endif /* _LINUX_RW_HINT_H */