ext4: Convert data=journal writeback to use ext4_writepages()

Add support for writeback of journalled data directly into
ext4_writepages() instead of offloading it to write_cache_pages(). This
actually significantly simplifies the code and reduces code duplication.
For checkpointing of committed data we can use ext4_writepages()
rightaway the same way as writeback of ordered data uses it on
transaction commit. For journalling of dirty mapped pages, we need to
add a special case to mpage_prepare_extent_to_map() to add all page
buffers to the journal.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Link: https://lore.kernel.org/r/20230228051319.4085470-8-tytso@mit.edu
This commit is contained in:
Jan Kara 2023-02-28 00:13:19 -05:00 committed by Theodore Ts'o
parent d8be7607de
commit 3f079114bf
2 changed files with 91 additions and 257 deletions

View file

@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
static int __ext4_journalled_writepage(struct page *page, unsigned int len);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@ -1632,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode,
struct buffer_head *bh)
{
return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
}
/*
* ext4_insert_delayed_block - adds a delayed block to the extents status
* tree, incrementing the reserved cluster/block
@ -1870,219 +1863,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return 0;
}
static int __ext4_journalled_writepage(struct page *page,
unsigned int len)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
handle_t *handle = NULL;
int ret = 0, err = 0;
int inline_data = ext4_has_inline_data(inode);
struct buffer_head *inode_bh = NULL;
loff_t size;
ClearPageChecked(page);
if (inline_data) {
BUG_ON(page->index != 0);
BUG_ON(len > ext4_get_max_inline_size(inode));
inode_bh = ext4_journalled_write_inline_data(inode, len, page);
if (inode_bh == NULL)
goto out;
}
/*
* We need to release the page lock before we start the
* journal, so grab a reference so the page won't disappear
* out from under us.
*/
get_page(page);
unlock_page(page);
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
put_page(page);
goto out_no_pagelock;
}
BUG_ON(!ext4_handle_valid(handle));
lock_page(page);
put_page(page);
size = i_size_read(inode);
if (page->mapping != mapping || page_offset(page) > size) {
/* The page got truncated from under us */
ext4_journal_stop(handle);
ret = 0;
goto out;
}
if (inline_data) {
ret = ext4_mark_inode_dirty(handle, inode);
} else {
struct buffer_head *page_bufs = page_buffers(page);
if (page->index == size >> PAGE_SHIFT)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, do_journal_get_write_access);
err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, write_end_fn);
}
if (ret == 0)
ret = err;
err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
unlock_page(page);
out_no_pagelock:
brelse(inode_bh);
return ret;
}
/*
* Note that we don't need to start a transaction unless we're journaling data
* because we should have holes filled from ext4_page_mkwrite(). We even don't
* need to file the inode to the transaction's list in ordered mode because if
* we are writing back data added by write(), the inode is already there and if
* we are writing back data modified via mmap(), no one guarantees in which
* transaction the data will hit the disk. In case we are journaling data, we
* cannot start transaction directly because transaction start ranks above page
* lock so we have to do some magic.
*
* This function can get called via...
* - ext4_writepages after taking page lock (have journal handle)
* - journal_submit_inode_data_buffers (no journal handle)
* - shrink_page_list via the kswapd/direct reclaim (no journal handle)
* - grab_page_cache when doing write_begin (have journal handle)
*
* We don't do any block allocation in this function. If we have page with
* multiple blocks we need to write those buffer_heads that are mapped. This
* is important for mmaped based write. So if we do with blocksize 1K
* truncate(f, 1024);
* a = mmap(f, 0, 4096);
* a[0] = 'a';
* truncate(f, 4096);
* we have in the page first buffer_head mapped via page_mkwrite call back
* but other buffer_heads would be unmapped but dirty (dirty done via the
* do_wp_page). So writepage should write the first block. If we modify
* the mmap area beyond 1024 we will again get a page_fault and the
* page_mkwrite callback will do the block allocation and mark the
* buffer_heads mapped.
*
* We redirty the page if we have any buffer_heads that is either delay or
* unwritten in the page.
*
* We can get recursively called as show below.
*
* ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
* ext4_writepage()
*
* But since we don't do any block allocation we should not deadlock.
* Page also have the dirty flag cleared so we don't get recurive page_lock.
*/
static int ext4_writepage(struct page *page,
struct writeback_control *wbc)
{
struct folio *folio = page_folio(page);
int ret = 0;
loff_t size;
unsigned int len;
struct buffer_head *page_bufs = NULL;
struct inode *inode = page->mapping->host;
struct ext4_io_submit io_submit;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
folio_invalidate(folio, 0, folio_size(folio));
folio_unlock(folio);
return -EIO;
}
trace_ext4_writepage(page);
size = i_size_read(inode);
if (page->index == size >> PAGE_SHIFT &&
!ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
/* Should never happen but for bugs in other kernel subsystems */
if (!page_has_buffers(page)) {
ext4_warning_inode(inode,
"page %lu does not have buffers attached", page->index);
ClearPageDirty(page);
unlock_page(page);
return 0;
}
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
* function. If there are buffers needing that, we have to redirty
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
*
* Also, if there is only one buffer per page (the fs block
* size == the page size), if one buffer needs block
* allocation or needs to modify the extent tree to clear the
* unwritten flag, we know that the page can't be written at
* all, so we might as well refuse the write immediately.
* Unfortunately if the block size != page size, we can't as
* easily detect this case using ext4_walk_page_buffers(), but
* for the extremely common case, this is an optimization that
* skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
if ((current->flags & PF_MEMALLOC) ||
(inode->i_sb->s_blocksize == PAGE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
* from direct reclaim.
*/
WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD))
== PF_MEMALLOC);
unlock_page(page);
return 0;
}
}
if (PageChecked(page) && ext4_should_journal_data(inode))
/*
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
return __ext4_journalled_writepage(page, len);
ext4_io_submit_init(&io_submit, wbc);
io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
if (!io_submit.io_end) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return -ENOMEM;
}
ret = ext4_bio_write_page(&io_submit, page, len);
unlock_page(page);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
return ret;
}
static void mpage_page_done(struct mpage_da_data *mpd, struct page *page)
{
mpd->first_page++;
@ -2563,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page)
return false;
}
static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
int len)
{
struct buffer_head *page_bufs = page_buffers(page);
struct inode *inode = page->mapping->host;
int ret, err;
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, do_journal_get_write_access);
err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
NULL, write_end_fn);
if (ret == 0)
ret = err;
err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
return ret;
}
static int mpage_journal_page_buffers(handle_t *handle,
struct mpage_da_data *mpd,
struct page *page)
{
struct inode *inode = mpd->inode;
loff_t size = i_size_read(inode);
int len;
ClearPageChecked(page);
clear_page_dirty_for_io(page);
mpd->wbc->nr_to_write--;
if (page->index == size >> PAGE_SHIFT &&
!ext4_verity_in_progress(inode))
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
return ext4_journal_page_buffers(handle, page, len);
}
/*
* mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
* needing mapping, submit mapped pages
@ -2595,11 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
int blkbits = mpd->inode->i_blkbits;
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
int bpp = ext4_journal_blocks_per_page(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
if (ext4_should_journal_data(mpd->inode)) {
handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
bpp);
if (IS_ERR(handle))
return PTR_ERR(handle);
}
folio_batch_init(&fbatch);
mpd->map.m_len = 0;
mpd->next_page = index;
@ -2629,6 +2462,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
goto out;
if (handle) {
err = ext4_journal_ensure_credits(handle, bpp,
0);
if (err < 0)
goto out;
}
folio_lock(folio);
/*
* If the page is no longer dirty, or its mapping no
@ -2668,8 +2508,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
mpd->first_page = folio->index;
mpd->next_page = folio->index + folio_nr_pages(folio);
/*
* Writeout for transaction commit where we cannot
* modify metadata is simple. Just submit the page.
* Writeout when we cannot modify metadata is simple.
* Just submit the page. For data=journal mode we
* first handle writeout of the page for checkpoint and
* only after that handle delayed page dirtying. This
* is crutial so that forcing a transaction commit and
* then calling filemap_write_and_wait() guarantees
* current state of data is in its final location. Such
* sequence is used for example by insert/collapse
* range operations before discarding the page cache.
*/
if (!mpd->can_map) {
if (ext4_page_nomap_can_writeout(&folio->page)) {
@ -2677,6 +2524,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
if (err < 0)
goto out;
}
/* Pending dirtying of journalled data? */
if (PageChecked(&folio->page)) {
err = mpage_journal_page_buffers(handle,
mpd, &folio->page);
if (err < 0)
goto out;
}
mpage_page_done(mpd, &folio->page);
} else {
/* Add all dirty buffers to mpd */
@ -2694,18 +2548,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
cond_resched();
}
mpd->scanned_until_end = 1;
if (handle)
ext4_journal_stop(handle);
return 0;
out:
folio_batch_release(&fbatch);
if (handle)
ext4_journal_stop(handle);
return err;
}
static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc,
void *data)
{
return ext4_writepage(&folio->page, wbc);
}
static int ext4_do_writepages(struct mpage_da_data *mpd)
{
struct writeback_control *wbc = mpd->wbc;
@ -2731,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
goto out_writepages;
if (ext4_should_journal_data(inode)) {
blk_start_plug(&plug);
ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL);
blk_finish_plug(&plug);
goto out_writepages;
}
/*
* If the filesystem has aborted, it is read-only, so return
* right away instead of dumping stack traces later on that
@ -2772,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
ext4_journal_stop(handle);
}
/*
* data=journal mode does not do delalloc so we just need to writeout /
* journal already mapped buffers
*/
if (ext4_should_journal_data(inode))
mpd->can_map = 0;
if (ext4_should_dioread_nolock(inode)) {
/*
* We may need to convert up to one extent per block in
@ -3148,9 +3000,8 @@ static int ext4_da_write_end(struct file *file,
* i_disksize since writeback will push i_disksize upto i_size
* eventually. If the end of the current write is > i_size and
* inside an allocated block (ext4_da_should_update_i_disksize()
* check), we need to update i_disksize here as neither
* ext4_writepage() nor certain ext4_writepages() paths not
* allocating blocks update i_disksize.
* check), we need to update i_disksize here as certain
* ext4_writepages() paths not allocating blocks update i_disksize.
*
* Note that we defer inode dirtying to generic_write_end() /
* ext4_da_write_inline_data_end().
@ -5376,7 +5227,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
* If the folio is fully truncated, we don't need to wait for any commit
* (and we even should not as __ext4_journalled_invalidate_folio() may
* strip all buffers from the folio but keep the folio dirty which can then
* confuse e.g. concurrent ext4_writepage() seeing dirty folio without
* confuse e.g. concurrent ext4_writepages() seeing dirty folio without
* buffers). Also we don't need to wait for any commit if all buffers in
* the folio remain valid. This is most beneficial for the common case of
* blocksize == PAGESIZE.
@ -6314,18 +6165,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
err = __block_write_begin(page, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
if (ext4_walk_page_buffers(handle, inode,
page_buffers(page), 0, len, NULL,
do_journal_get_write_access))
if (ext4_journal_page_buffers(handle, page, len))
goto out_error;
if (ext4_walk_page_buffers(handle, inode,
page_buffers(page), 0, len, NULL,
write_end_fn))
goto out_error;
if (ext4_jbd2_inode_add_write(handle, inode,
page_offset(page), len))
goto out_error;
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
} else {
unlock_page(page);
}

View file

@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op,
(unsigned long) __entry->index)
);
DEFINE_EVENT(ext4__page_op, ext4_writepage,
TP_PROTO(struct page *page),
TP_ARGS(page)
);
DEFINE_EVENT(ext4__page_op, ext4_readpage,
TP_PROTO(struct page *page),