xfs: support in-memory buffer cache targets

Allow the buffer cache to target in-memory files by making it possible
to have a buftarg that maps pages from private shmem files.  As the
previous patch alludes, the in-memory buftarg contains its own cache,
points to a shmem file, and does not point to a block_device.

The next few patches will make it possible to construct an xfs_btree in
pageable memory by using this buftarg.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Author: Darrick J. Wong
Date:   2024-02-22 12:43:21 -08:00
Parent: e7b58f7c1b
Commit: 5076a6040c
8 changed files with 369 additions and 46 deletions
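
For orientation, here is a minimal lifecycle sketch (not part of the patch) of what this buftarg enables. It uses the xmbuf_alloc()/xmbuf_free() interfaces added below together with the existing xfs_buf_get(), xfs_buf_relse() and xfs_buftarg_drain() buffer cache calls, which are assumed unchanged; the xmbuf_demo() helper and the "xfs_demo" description string are invented for illustration, and the usual fs/xfs includes (xfs_buf.h, xfs_buf_mem.h) are assumed.

/* Sketch only: exercise one PAGE_SIZE buffer in an in-memory buftarg. */
static int
xmbuf_demo(
	struct xfs_mount	*mp)
{
	struct xfs_buftarg	*btp;
	struct xfs_buf		*bp;
	int			error;

	/* Back a private buffer cache with an unlinked shmem file. */
	error = xmbuf_alloc(mp, "xfs_demo", &btp);
	if (error)
		return error;

	/* Only XMBUF_BLOCKSIZE (== PAGE_SIZE) buffers are supported. */
	error = xfs_buf_get(btp, 0, BTOBB(XMBUF_BLOCKSIZE), &bp);
	if (error)
		goto out_free;

	/* The buffer is mapped straight from the page cache; no bio is issued. */
	memset(bp->b_addr, 0, XMBUF_BLOCKSIZE);
	xfs_buf_relse(bp);

out_free:
	/* Release cached buffers before tearing down the target. */
	xfs_buftarg_drain(btp);
	xmbuf_free(btp);
	return error;
}

Because _xfs_buf_ioapply() short-circuits for in-memory targets, the memset above is the only "IO" that ever happens; the data lives solely in the shmem file's page cache.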

fs/xfs/Kconfig

@@ -128,6 +128,9 @@ config XFS_LIVE_HOOKS
bool
select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
config XFS_MEMORY_BUFS
bool
config XFS_ONLINE_SCRUB
bool "XFS online metadata check support"
default n
@@ -135,6 +138,7 @@ config XFS_ONLINE_SCRUB
depends on TMPFS && SHMEM
select XFS_LIVE_HOOKS
select XFS_DRAIN_INTENTS
select XFS_MEMORY_BUFS
help
If you say Y here you will be able to check metadata on a
mounted XFS filesystem. This feature is intended to reduce

fs/xfs/Makefile

@@ -137,6 +137,7 @@ endif
xfs-$(CONFIG_XFS_DRAIN_INTENTS) += xfs_drain.o
xfs-$(CONFIG_XFS_LIVE_HOOKS) += xfs_hooks.o
xfs-$(CONFIG_XFS_MEMORY_BUFS) += xfs_buf_mem.o
# online scrub/repair
ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)

fs/xfs/xfs_buf.c

@@ -21,6 +21,7 @@
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_ag.h"
#include "xfs_buf_mem.h"
struct kmem_cache *xfs_buf_cache;
@@ -318,7 +319,9 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
if (bp->b_flags & _XBF_PAGES)
if (xfs_buftarg_is_mem(bp->b_target))
xmbuf_unmap_page(bp);
else if (bp->b_flags & _XBF_PAGES)
xfs_buf_free_pages(bp);
else if (bp->b_flags & _XBF_KMEM)
kfree(bp->b_addr);
@@ -634,18 +637,20 @@ xfs_buf_find_insert(
if (error)
goto out_drop_pag;
/*
* For buffers that fit entirely within a single page, first attempt to
* allocate the memory from the heap to minimise memory usage. If we
* can't get heap memory for these small buffers, we fall back to using
* the page allocator.
*/
if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
if (xfs_buftarg_is_mem(new_bp->b_target)) {
error = xmbuf_map_page(new_bp);
} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
xfs_buf_alloc_kmem(new_bp, flags) < 0) {
/*
* For buffers that fit entirely within a single page, first
* attempt to allocate the memory from the heap to minimise
* memory usage. If we can't get heap memory for these small
* buffers, we fall back to using the page allocator.
*/
error = xfs_buf_alloc_pages(new_bp, flags);
if (error)
goto out_free_buf;
}
if (error)
goto out_free_buf;
spin_lock(&bch->bc_lock);
bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
@@ -688,6 +693,8 @@ xfs_buftarg_get_pag(
{
struct xfs_mount *mp = btp->bt_mount;
if (xfs_buftarg_is_mem(btp))
return NULL;
return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
}
@@ -696,7 +703,9 @@ xfs_buftarg_buf_cache(
struct xfs_buftarg *btp,
struct xfs_perag *pag)
{
return &pag->pag_bcache;
if (pag)
return &pag->pag_bcache;
return btp->bt_cache;
}
/*
@@ -926,6 +935,13 @@ xfs_buf_readahead_map(
{
struct xfs_buf *bp;
/*
* Currently we don't have a good means or justification for performing
* xmbuf_map_page asynchronously, so we don't do readahead.
*/
if (xfs_buftarg_is_mem(target))
return;
xfs_buf_read_map(target, map, nmaps,
XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
__this_address);
@@ -991,7 +1007,10 @@ xfs_buf_get_uncached(
if (error)
return error;
error = xfs_buf_alloc_pages(bp, flags);
if (xfs_buftarg_is_mem(bp->b_target))
error = xmbuf_map_page(bp);
else
error = xfs_buf_alloc_pages(bp, flags);
if (error)
goto fail_free_buf;
@@ -1633,6 +1652,12 @@ _xfs_buf_ioapply(
/* we only use the buffer cache for meta-data */
op |= REQ_META;
/* in-memory targets are directly mapped, no IO required. */
if (xfs_buftarg_is_mem(bp->b_target)) {
xfs_buf_ioend(bp);
return;
}
/*
* Walk all the vectors issuing IO on them. Set up the initial offset
* into the buffer and the desired IO size before we start -
@@ -1988,19 +2013,24 @@ xfs_buftarg_shrink_count(
}
void
xfs_free_buftarg(
xfs_destroy_buftarg(
struct xfs_buftarg *btp)
{
shrinker_free(btp->bt_shrinker);
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
percpu_counter_destroy(&btp->bt_io_count);
list_lru_destroy(&btp->bt_lru);
}
void
xfs_free_buftarg(
struct xfs_buftarg *btp)
{
xfs_destroy_buftarg(btp);
fs_put_dax(btp->bt_daxdev, btp->bt_mount);
/* the main block device is closed by kill_block_super */
if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
bdev_release(btp->bt_bdev_handle);
kfree(btp);
}
@@ -2023,6 +2053,45 @@ xfs_setsize_buftarg(
return 0;
}
int
xfs_init_buftarg(
struct xfs_buftarg *btp,
size_t logical_sectorsize,
const char *descr)
{
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = logical_sectorsize;
btp->bt_logical_sectormask = logical_sectorsize - 1;
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
* per 30 seconds so as to not spam logs too much on repeated errors.
*/
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
if (list_lru_init(&btp->bt_lru))
return -ENOMEM;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto out_destroy_lru;
btp->bt_shrinker =
shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
if (!btp->bt_shrinker)
goto out_destroy_io_count;
btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker->private_data = btp;
shrinker_register(btp->bt_shrinker);
return 0;
out_destroy_io_count:
percpu_counter_destroy(&btp->bt_io_count);
out_destroy_lru:
list_lru_destroy(&btp->bt_lru);
return -ENOMEM;
}
struct xfs_buftarg *
xfs_alloc_buftarg(
struct xfs_mount *mp,
@@ -2049,41 +2118,12 @@ xfs_alloc_buftarg(
*/
if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
goto error_free;
/* Set up device logical sector size mask */
btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
/*
* Buffer IO error rate limiting. Limit it to no more than 10 messages
* per 30 seconds so as to not spam logs too much on repeated errors.
*/
ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
DEFAULT_RATELIMIT_BURST);
if (list_lru_init(&btp->bt_lru))
if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
mp->m_super->s_id))
goto error_free;
if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
goto error_lru;
btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
mp->m_super->s_id);
if (!btp->bt_shrinker)
goto error_pcpu;
btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
btp->bt_shrinker->private_data = btp;
shrinker_register(btp->bt_shrinker);
return btp;
error_pcpu:
percpu_counter_destroy(&btp->bt_io_count);
error_lru:
list_lru_destroy(&btp->bt_lru);
error_free:
kfree(btp);
return NULL;

fs/xfs/xfs_buf.h

@@ -109,6 +109,7 @@ struct xfs_buftarg {
struct bdev_handle *bt_bdev_handle;
struct block_device *bt_bdev;
struct dax_device *bt_daxdev;
struct file *bt_file;
u64 bt_dax_part_off;
struct xfs_mount *bt_mount;
unsigned int bt_meta_sectorsize;
@@ -122,6 +123,9 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
#define XB_PAGES 2
@@ -387,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
/* for xfs_buf_mem.c only: */
int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
const char *descr);
void xfs_destroy_buftarg(struct xfs_buftarg *btp);
#endif /* __XFS_BUF_H__ */

fs/xfs/xfs_buf_mem.c (new file, 189 lines)

@@ -0,0 +1,189 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2023-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_buf.h"
#include "xfs_buf_mem.h"
#include "xfs_trace.h"
#include <linux/shmem_fs.h>
/*
* Buffer Cache for In-Memory Files
* ================================
*
* Online fsck wants to create ephemeral ordered recordsets. The existing
* btree infrastructure can do this, but we need the buffer cache to target
* memory instead of block devices.
*
* When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
* requirements. Therefore, the xmbuf mechanism uses an unlinked shmem file to
* store our staging data. This file is not installed in the file descriptor
* table so that user programs cannot access the data, which means that the
* xmbuf must be freed with xmbuf_destroy.
*
* xmbufs assume that the caller will handle all required concurrency
* management; standard vfs locks (freezer and inode) are not taken. Reads
* and writes are satisfied directly from the page cache.
*
* The only supported block size is PAGE_SIZE, and we cannot use highmem.
*/
/*
* shmem files used to back an in-memory buffer cache must not be exposed to
* userspace. Upper layers must coordinate access to the one handle returned
* by the constructor, so establish a separate lock class for xmbufs to avoid
* confusing lockdep.
*/
static struct lock_class_key xmbuf_i_mutex_key;
/*
* Allocate a buffer cache target for a memory-backed file and set up the
* buffer target.
*/
int
xmbuf_alloc(
struct xfs_mount *mp,
const char *descr,
struct xfs_buftarg **btpp)
{
struct file *file;
struct inode *inode;
struct xfs_buftarg *btp;
int error;
btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
if (!btp)
return -ENOMEM;
file = shmem_kernel_file_setup(descr, 0, 0);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto out_free_btp;
}
inode = file_inode(file);
/* private file, private locking */
lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
/*
* We don't want to bother with kmapping data during repair, so don't
* allow highmem pages to back this mapping.
*/
mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
/* ensure all writes are below EOF to avoid pagecache zeroing */
i_size_write(inode, inode->i_sb->s_maxbytes);
trace_xmbuf_create(btp);
error = xfs_buf_cache_init(btp->bt_cache);
if (error)
goto out_file;
/* Initialize buffer target */
btp->bt_mount = mp;
btp->bt_dev = (dev_t)-1U;
btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
btp->bt_file = file;
btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
if (error)
goto out_bcache;
*btpp = btp;
return 0;
out_bcache:
xfs_buf_cache_destroy(btp->bt_cache);
out_file:
fput(file);
out_free_btp:
kfree(btp);
return error;
}
/* Free a buffer cache target for a memory-backed buffer cache. */
void
xmbuf_free(
struct xfs_buftarg *btp)
{
ASSERT(xfs_buftarg_is_mem(btp));
ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
trace_xmbuf_free(btp);
xfs_destroy_buftarg(btp);
xfs_buf_cache_destroy(btp->bt_cache);
fput(btp->bt_file);
kfree(btp);
}
/* Directly map a shmem page into the buffer cache. */
int
xmbuf_map_page(
struct xfs_buf *bp)
{
struct inode *inode = file_inode(bp->b_target->bt_file);
struct folio *folio = NULL;
struct page *page;
loff_t pos = BBTOB(xfs_buf_daddr(bp));
int error;
ASSERT(xfs_buftarg_is_mem(bp->b_target));
if (bp->b_map_count != 1)
return -ENOMEM;
if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
return -ENOMEM;
if (offset_in_page(pos) != 0) {
ASSERT(offset_in_page(pos) == 0);
return -ENOMEM;
}
error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
if (error)
return error;
if (filemap_check_wb_err(inode->i_mapping, 0)) {
folio_unlock(folio);
folio_put(folio);
return -EIO;
}
page = folio_file_page(folio, pos >> PAGE_SHIFT);
/*
* Mark the page dirty so that it won't be reclaimed once we drop the
* (potentially last) reference in xmbuf_unmap_page.
*/
set_page_dirty(page);
unlock_page(page);
bp->b_addr = page_address(page);
bp->b_pages = bp->b_page_array;
bp->b_pages[0] = page;
bp->b_page_count = 1;
return 0;
}
/* Unmap a shmem page that was mapped into the buffer cache. */
void
xmbuf_unmap_page(
struct xfs_buf *bp)
{
struct page *page = bp->b_pages[0];
ASSERT(xfs_buftarg_is_mem(bp->b_target));
put_page(page);
bp->b_addr = NULL;
bp->b_pages[0] = NULL;
bp->b_pages = NULL;
bp->b_page_count = 0;
}

fs/xfs/xfs_buf_mem.h (new file, 30 lines)

@@ -0,0 +1,30 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2023-2024 Oracle. All Rights Reserved.
* Author: Darrick J. Wong <djwong@kernel.org>
*/
#ifndef __XFS_BUF_MEM_H__
#define __XFS_BUF_MEM_H__
#define XMBUF_BLOCKSIZE (PAGE_SIZE)
#define XMBUF_BLOCKSHIFT (PAGE_SHIFT)
#ifdef CONFIG_XFS_MEMORY_BUFS
static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
{
return btp->bt_bdev == NULL;
}
int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
struct xfs_buftarg **btpp);
void xmbuf_free(struct xfs_buftarg *btp);
int xmbuf_map_page(struct xfs_buf *bp);
void xmbuf_unmap_page(struct xfs_buf *bp);
#else
# define xfs_buftarg_is_mem(...) (false)
# define xmbuf_map_page(...) (-ENOMEM)
# define xmbuf_unmap_page(...) ((void)0)
#endif /* CONFIG_XFS_MEMORY_BUFS */
#endif /* __XFS_BUF_MEM_H__ */

fs/xfs/xfs_trace.c

@@ -36,6 +36,7 @@
#include "xfs_error.h"
#include <linux/iomap.h>
#include "xfs_iomap.h"
#include "xfs_buf_mem.h"
/*
* We include this last to have the helpers above available for the trace

fs/xfs/xfs_trace.h

@@ -4514,6 +4514,55 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
#endif /* CONFIG_XFS_DRAIN_INTENTS */
#ifdef CONFIG_XFS_MEMORY_BUFS
TRACE_EVENT(xmbuf_create,
TP_PROTO(struct xfs_buftarg *btp),
TP_ARGS(btp),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(unsigned long, ino)
__array(char, pathname, 256)
),
TP_fast_assign(
char pathname[257];
char *path;
struct file *file = btp->bt_file;
__entry->ino = file_inode(file)->i_ino;
memset(pathname, 0, sizeof(pathname));
path = file_path(file, pathname, sizeof(pathname) - 1);
if (IS_ERR(path))
path = "(unknown)";
strncpy(__entry->pathname, path, sizeof(__entry->pathname));
),
TP_printk("xmino 0x%lx path '%s'",
__entry->ino,
__entry->pathname)
);
TRACE_EVENT(xmbuf_free,
TP_PROTO(struct xfs_buftarg *btp),
TP_ARGS(btp),
TP_STRUCT__entry(
__field(unsigned long, ino)
__field(unsigned long long, bytes)
__field(loff_t, size)
),
TP_fast_assign(
struct file *file = btp->bt_file;
struct inode *inode = file_inode(file);
__entry->size = i_size_read(inode);
__entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
__entry->ino = inode->i_ino;
),
TP_printk("xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
__entry->ino,
__entry->bytes,
__entry->size)
);
#endif /* CONFIG_XFS_MEMORY_BUFS */
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH