linux-stable/fs/xfs/xfs_buf_mem.c

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2023-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_buf.h"
#include "xfs_buf_mem.h"
#include "xfs_trace.h"
#include <linux/shmem_fs.h>
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_error.h"

/*
 * Buffer Cache for In-Memory Files
 * ================================
 *
 * Online fsck wants to create ephemeral ordered recordsets.  The existing
 * btree infrastructure can do this, but we need the buffer cache to target
 * memory instead of block devices.
 *
 * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
 * requirements.  Therefore, the xmbuf mechanism uses an unlinked shmem file
 * to store our staging data.  This file is not installed in the file
 * descriptor table so that user programs cannot access the data, which means
 * that the xmbuf must be freed with xmbuf_free.
 *
 * xmbufs assume that the caller will handle all required concurrency
 * management; standard vfs locks (freezer and inode) are not taken.  Reads
 * and writes are satisfied directly from the page cache.
 *
 * The only supported block size is PAGE_SIZE, and we cannot use highmem.
 */
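
/*
 * A minimal lifecycle sketch (illustrative only): the buftarg comes from
 * xmbuf_alloc(), buffers are then obtained through the regular buffer cache
 * entry points in xfs_buf.c, and everything is torn down with xmbuf_free().
 * The descriptor string and the simplified error handling below are
 * placeholders, not part of this file:
 *
 *	struct xfs_buftarg	*btp;
 *	struct xfs_buf		*bp;
 *	DEFINE_SINGLE_BUF_MAP(map, daddr, BTOBB(XMBUF_BLOCKSIZE));
 *	int			error;
 *
 *	error = xmbuf_alloc(mp, "example staging file", &btp);
 *	if (error)
 *		return error;
 *
 *	error = xfs_buf_get_map(btp, &map, 1, 0, &bp);
 *	if (!error) {
 *		memset(bp->b_addr, 0, XMBUF_BLOCKSIZE);
 *		xfs_buf_relse(bp);
 *	}
 *
 *	xmbuf_free(btp);
 */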

/*
 * shmem files used to back an in-memory buffer cache must not be exposed to
 * userspace.  Upper layers must coordinate access to the one handle returned
 * by the constructor, so establish a separate lock class for xmbufs to avoid
 * confusing lockdep.
 */
static struct lock_class_key xmbuf_i_mutex_key;

/*
 * Allocate a buffer cache target for a memory-backed file and set up the
 * buffer target.
 */
int
xmbuf_alloc(
	struct xfs_mount	*mp,
	const char		*descr,
	struct xfs_buftarg	**btpp)
{
	struct file		*file;
	struct inode		*inode;
	struct xfs_buftarg	*btp;
	int			error;

	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
	if (!btp)
		return -ENOMEM;

	file = shmem_kernel_file_setup(descr, 0, 0);
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto out_free_btp;
	}
	inode = file_inode(file);

	/* private file, private locking */
	lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);

	/*
	 * We don't want to bother with kmapping data during repair, so don't
	 * allow highmem pages to back this mapping.
	 */
	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);

	/* ensure all writes are below EOF to avoid pagecache zeroing */
	i_size_write(inode, inode->i_sb->s_maxbytes);

	error = xfs_buf_cache_init(btp->bt_cache);
	if (error)
		goto out_file;

	/* Initialize buffer target */
	btp->bt_mount = mp;
	btp->bt_dev = (dev_t)-1U;
	btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
	btp->bt_file = file;
	btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
	btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;

	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
	if (error)
		goto out_bcache;

	trace_xmbuf_create(btp);

	*btpp = btp;
	return 0;

out_bcache:
	xfs_buf_cache_destroy(btp->bt_cache);
out_file:
	fput(file);
out_free_btp:
	kfree(btp);
	return error;
}

/* Free a buffer cache target for a memory-backed buffer cache. */
void
xmbuf_free(
	struct xfs_buftarg	*btp)
{
	ASSERT(xfs_buftarg_is_mem(btp));
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);

	trace_xmbuf_free(btp);

	xfs_destroy_buftarg(btp);
	xfs_buf_cache_destroy(btp->bt_cache);
	fput(btp->bt_file);
	kfree(btp);
}

/* Directly map a shmem page into the buffer cache. */
int
xmbuf_map_page(
	struct xfs_buf		*bp)
{
	struct inode		*inode = file_inode(bp->b_target->bt_file);
	struct folio		*folio = NULL;
	struct page		*page;
	loff_t			pos = BBTOB(xfs_buf_daddr(bp));
	int			error;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	if (bp->b_map_count != 1)
		return -ENOMEM;
	if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
		return -ENOMEM;
	if (offset_in_page(pos) != 0) {
		ASSERT(offset_in_page(pos) == 0);
		return -ENOMEM;
	}

	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
	if (error)
		return error;

	if (filemap_check_wb_err(inode->i_mapping, 0)) {
		folio_unlock(folio);
		folio_put(folio);
		return -EIO;
	}

	page = folio_file_page(folio, pos >> PAGE_SHIFT);

	/*
	 * Mark the page dirty so that it won't be reclaimed once we drop the
	 * (potentially last) reference in xmbuf_unmap_page.
	 */
	set_page_dirty(page);
	unlock_page(page);

	bp->b_addr = page_address(page);
	bp->b_pages = bp->b_page_array;
	bp->b_pages[0] = page;
	bp->b_page_count = 1;
	return 0;
}
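
/*
 * A quick worked example of the daddr arithmetic above, assuming the common
 * case of PAGE_SIZE == 4096 (and therefore XMBUF_BLOCKSIZE == 4096):
 * xfs_daddr_t counts 512-byte basic blocks, so BBTOB(daddr) == daddr << 9.
 * A buffer at daddr 8 therefore starts at byte offset 4096, which is page
 * index 1 (pos >> PAGE_SHIFT) in the shmem file, and offset_in_page(pos) is
 * 0 as required.  A buffer at daddr 9 would land at offset 4608, fail the
 * alignment check, and be rejected with -ENOMEM.
 */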

/* Unmap a shmem page that was mapped into the buffer cache. */
void
xmbuf_unmap_page(
	struct xfs_buf		*bp)
{
	struct page		*page = bp->b_pages[0];

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	put_page(page);

	bp->b_addr = NULL;
	bp->b_pages[0] = NULL;
	bp->b_pages = NULL;
	bp->b_page_count = 0;
}

/* Is this a valid daddr within the buftarg? */
bool
xmbuf_verify_daddr(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr)
{
	struct inode		*inode = file_inode(btp->bt_file);

	ASSERT(xfs_buftarg_is_mem(btp));

	return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
}

/* Discard the page backing this buffer. */
static void
xmbuf_stale(
	struct xfs_buf		*bp)
{
	struct inode		*inode = file_inode(bp->b_target->bt_file);
	loff_t			pos;

	ASSERT(xfs_buftarg_is_mem(bp->b_target));

	pos = BBTOB(xfs_buf_daddr(bp));
	shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
}
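
/*
 * For example (again assuming 4096-byte pages), staling a single-block buffer
 * at daddr 8 punches out bytes 4096 through 8191 of the backing shmem file:
 * pos == BBTOB(8) == 4096 and BBTOB(bp->b_length) == 4096, so the truncate
 * range is [4096, 8191] and the backing page can be freed immediately.
 */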

/*
 * Finalize a buffer -- discard the backing page if it's stale, or run the
 * write verifier to detect problems.
 */
int
xmbuf_finalize(
	struct xfs_buf		*bp)
{
	xfs_failaddr_t		fa;
	int			error = 0;

	if (bp->b_flags & XBF_STALE) {
		xmbuf_stale(bp);
		return 0;
	}

	/*
	 * Although this btree is ephemeral, validate the buffer structure so
	 * that we can detect memory corruption errors and software bugs.
	 */
	fa = bp->b_ops->verify_struct(bp);
	if (fa) {
		error = -EFSCORRUPTED;
		xfs_verifier_error(bp, error, fa);
	}

	return error;
}

/*
 * Detach this xmbuf buffer from the transaction by any means necessary.
 * All buffers are direct-mapped, so they do not need bwrite.
 */
void
xmbuf_trans_bdetach(
	struct xfs_trans	*tp,
	struct xfs_buf		*bp)
{
	struct xfs_buf_log_item	*bli = bp->b_log_item;

	ASSERT(bli != NULL);

	bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
			    XFS_BLI_LOGGED | XFS_BLI_STALE);
	clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);

	while (bp->b_log_item != NULL)
		xfs_trans_bdetach(tp, bp);
}
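
/*
 * A rough sketch of how a caller might launder in-memory btree buffers out of
 * a transaction before committing it.  This is illustrative only -- the real
 * helpers live with the xfbtree code, and the iteration and error handling
 * details below are assumptions, not something defined in this file:
 *
 *	struct xfs_log_item	*lip, *n;
 *	int			error;
 *
 *	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
 *		struct xfs_buf_log_item	*bli;
 *		struct xfs_buf		*bp;
 *
 *		if (lip->li_type != XFS_LI_BUF)
 *			continue;
 *
 *		bli = container_of(lip, struct xfs_buf_log_item, bli_item);
 *		bp = bli->bli_buf;
 *		if (bp->b_target != btp)
 *			continue;
 *
 *		xmbuf_trans_bdetach(tp, bp);
 *		error = xmbuf_finalize(bp);
 *		xfs_buf_relse(bp);
 *	}
 */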