bcachefs: Debug mode for c->writes references

This adds a debug mode where we split up the c->writes refcount into
distinct refcounts for every codepath that takes a reference, and adds
sysfs code to print the value of each ref.

This will make it easier to debug shutdown hangs due to refcount leaks.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-02-09 12:21:45 -05:00
parent dd81a060eb
commit d94189ad56
13 changed files with 168 additions and 46 deletions

View file

@ -1113,7 +1113,7 @@ static void bch2_do_discards_work(struct work_struct *work)
if (need_journal_commit * 2 > seen)
bch2_journal_flush_async(&c->journal, NULL);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
bch2_err_str(ret));
@ -1121,9 +1121,9 @@ static void bch2_do_discards_work(struct work_struct *work)
void bch2_do_discards(struct bch_fs *c)
{
if (percpu_ref_tryget_live(&c->writes) &&
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
!queue_work(system_long_wq, &c->discard_work))
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
static int invalidate_one_bucket(struct btree_trans *trans,
@ -1233,14 +1233,14 @@ static void bch2_do_invalidates_work(struct work_struct *work)
}
bch2_trans_exit(&trans);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
void bch2_do_invalidates(struct bch_fs *c)
{
if (percpu_ref_tryget_live(&c->writes) &&
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
!queue_work(system_long_wq, &c->invalidate_work))
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}
static int bucket_freespace_init(struct btree_trans *trans, struct btree_iter *iter,

View file

@ -209,6 +209,10 @@
#include "opts.h"
#include "util.h"
#ifdef CONFIG_BCACHEFS_DEBUG
#define BCH_WRITE_REF_DEBUG
#endif
#define dynamic_fault(...) 0
#define race_fault(...) 0
@ -538,6 +542,7 @@ enum {
/* shutdown: */
BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
BCH_FS_GOING_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
BCH_FS_CLEAN_SHUTDOWN,
@ -627,6 +632,29 @@ typedef struct {
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
/*
 * X-macro listing every codepath that takes a c->writes reference.
 * In debug builds (BCH_WRITE_REF_DEBUG) each entry gets its own
 * atomic_long_t counter so refcount leaks can be attributed to a
 * specific codepath via sysfs; in normal builds they all share one
 * percpu_ref.
 */
#define BCH_WRITE_REFS() \
x(trans) \
x(write) \
x(promote) \
x(node_rewrite) \
x(stripe_create) \
x(stripe_delete) \
x(reflink) \
x(fallocate) \
x(discard) \
x(invalidate) \
x(move) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
x(sysfs)
/* One enumerator per entry above, plus BCH_WRITE_REF_NR as the count. */
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
BCH_WRITE_REFS()
#undef x
BCH_WRITE_REF_NR,
};
struct bch_fs {
struct closure cl;
@ -648,7 +676,11 @@ struct bch_fs {
struct rw_semaphore state_lock;
/* Counts outstanding writes, for clean transition to read-only */
#ifdef BCH_WRITE_REF_DEBUG
atomic_long_t writes[BCH_WRITE_REF_NR];
#else
struct percpu_ref writes;
#endif
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
@ -965,6 +997,46 @@ mempool_t bio_bounce_pages;
struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR];
};
extern struct wait_queue_head bch2_read_only_wait;
/*
 * Unconditionally take a write reference on @c.
 *
 * @ref identifies the calling codepath; it is only used in debug builds,
 * where each codepath has its own counter. Callers must already hold a
 * reference or otherwise know the filesystem is RW (cf. the tryget
 * variant below, which can fail).
 */
static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
/* Debug: bump only this codepath's counter. */
atomic_long_inc(&c->writes[ref]);
#else
percpu_ref_get(&c->writes);
#endif
}
/*
 * Try to take a write reference on @c; fails (returns false) if the
 * filesystem is going read-only.
 *
 * Debug build: the BCH_FS_GOING_RO flag stands in for percpu_ref kill —
 * once it is set, new references are refused. Note the check-then-inc is
 * not atomic; presumably the subsequent per-ref drain in
 * bch2_fs_read_only() tolerates this window — TODO confirm against the
 * read-only path.
 */
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget_live(&c->writes);
#endif
}
/*
 * Drop a write reference on @c.
 *
 * Debug build: when this codepath's counter hits zero, scan all the
 * per-codepath counters; if every one is zero, signal that writes are
 * fully disabled (BCH_FS_WRITE_DISABLE_COMPLETE) and wake waiters in
 * bch2_fs_read_only(). This mirrors what the percpu_ref release
 * callback does in the non-debug build.
 */
static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
long v = atomic_long_dec_return(&c->writes[ref]);
/* Going negative means an unbalanced put — a refcounting bug. */
BUG_ON(v < 0);
if (v)
return;
/* Our counter hit zero; check whether every codepath has drained. */
for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
if (atomic_long_read(&c->writes[i]))
return;
/*
 * All counters zero: only reachable after the initial per-ref
 * references are dropped during read-only transition — presumably
 * GOING_RO is already set here; verify against bch2_fs_read_only().
 */
set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
wake_up(&bch2_read_only_wait);
#else
percpu_ref_put(&c->writes);
#endif
}
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS

View file

@ -2036,7 +2036,7 @@ void async_btree_node_rewrite_work(struct work_struct *work)
bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(&trans, a));
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@ -2044,12 +2044,12 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
struct async_btree_rewrite *a;
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite))
return;
a = kmalloc(sizeof(*a), GFP_NOFS);
if (!a) {
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
return;
}

View file

@ -994,7 +994,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
if (ret)
return ret;
percpu_ref_get(&c->writes);
bch2_write_ref_get(c, BCH_WRITE_REF_trans);
return 0;
}
@ -1043,7 +1043,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
}
if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
unlikely(!percpu_ref_tryget_live(&c->writes))) {
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans);
if (ret)
goto out_reset;
@ -1114,7 +1114,7 @@ int __bch2_trans_commit(struct btree_trans *trans)
bch2_journal_preres_put(&c->journal, &trans->journal_preres);
if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
bch2_trans_reset_updates(trans);

View file

@ -707,14 +707,14 @@ static void ec_stripe_delete_work(struct work_struct *work)
break;
}
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
void bch2_do_stripe_deletes(struct bch_fs *c)
{
if (percpu_ref_tryget_live(&c->writes) &&
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) &&
!schedule_work(&c->ec_stripe_delete_work))
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
/* stripe creation: */
@ -922,7 +922,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
BUG_ON(!s->allocated);
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_create))
goto err;
ec_generate_ec(&s->new_stripe);
@ -964,7 +964,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
spin_unlock(&c->ec_stripes_heap_lock);
err_put_writes:
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create);
err:
bch2_disk_reservation_put(c, &s->res);

View file

@ -3231,7 +3231,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
long ret;
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
return -EROFS;
inode_lock(&inode->v);
@ -3255,7 +3255,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
err:
bch2_pagecache_block_put(inode);
inode_unlock(&inode->v);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
return bch2_err_class(ret);
}

View file

@ -602,7 +602,7 @@ static void bch2_write_done(struct closure *cl)
struct bch_fs *c = op->c;
bch2_disk_reservation_put(c, &op->res);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@ -1417,7 +1417,7 @@ void bch2_write(struct closure *cl)
}
if (c->opts.nochanges ||
!percpu_ref_tryget_live(&c->writes)) {
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
}
@ -1496,7 +1496,7 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
bch_promote_params);
BUG_ON(ret);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
kfree_rcu(op, rcu);
}
@ -1544,7 +1544,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
int ret;
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
return NULL;
op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
@ -1601,7 +1601,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
kfree(*rbio);
*rbio = NULL;
kfree(op);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return NULL;
}

View file

@ -57,7 +57,7 @@ static void move_free(struct moving_io *io)
bch2_data_update_exit(&io->write);
wake_up(&ctxt->wait);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_move);
kfree(io);
}
@ -250,7 +250,7 @@ static int bch2_move_extent(struct btree_trans *trans,
return 0;
}
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
return -BCH_ERR_erofs_no_writes;
/* write path might have to decompress data: */
@ -319,7 +319,7 @@ static int bch2_move_extent(struct btree_trans *trans,
err_free:
kfree(io);
err:
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_move);
trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}

View file

@ -278,7 +278,7 @@ s64 bch2_remap_range(struct bch_fs *c,
u32 dst_snapshot, src_snapshot;
int ret = 0, ret2 = 0;
if (!percpu_ref_tryget_live(&c->writes))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink))
return -BCH_ERR_erofs_no_writes;
bch2_check_set_feature(c, BCH_FEATURE_reflink);
@ -412,7 +412,7 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_bkey_buf_exit(&new_src, c);
bch2_bkey_buf_exit(&new_dst, c);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_reflink);
return dst_done ?: ret ?: ret2;
}

View file

@ -706,16 +706,14 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work)
struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
bch2_delete_dead_snapshots(c);
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
void bch2_delete_dead_snapshots_async(struct bch_fs *c)
{
if (!percpu_ref_tryget_live(&c->writes))
return;
if (!queue_work(system_long_wq, &c->snapshot_delete_work))
percpu_ref_put(&c->writes);
if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
!queue_work(system_long_wq, &c->snapshot_delete_work))
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
}
static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans,
@ -900,7 +898,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work)
darray_exit(&s);
}
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
}
struct subvolume_unlink_hook {
@ -923,11 +921,11 @@ int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans,
if (ret)
return ret;
if (unlikely(!percpu_ref_tryget_live(&c->writes)))
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache))
return -EROFS;
if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work))
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache);
return 0;
}

View file

@ -107,7 +107,7 @@ static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);
static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
@ -235,13 +235,15 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_dev_allocator_remove(c, ca);
}
#ifndef BCH_WRITE_REF_DEBUG
static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
wake_up(&bch_read_only_wait);
wake_up(&bch2_read_only_wait);
}
#endif
void bch2_fs_read_only(struct bch_fs *c)
{
@ -256,7 +258,13 @@ void bch2_fs_read_only(struct bch_fs *c)
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
*/
set_bit(BCH_FS_GOING_RO, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
#else
for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++)
bch2_write_ref_put(c, i);
#endif
/*
* If we're not doing an emergency shutdown, we want to wait on
@ -269,16 +277,17 @@ void bch2_fs_read_only(struct bch_fs *c)
* we do need to wait on them before returning and signalling
* that going RO is complete:
*/
wait_event(bch_read_only_wait,
wait_event(bch2_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
__bch2_fs_read_only(c);
wait_event(bch_read_only_wait,
wait_event(bch2_read_only_wait,
test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
clear_bit(BCH_FS_GOING_RO, &c->flags);
if (!bch2_journal_error(&c->journal) &&
!test_bit(BCH_FS_ERROR, &c->flags) &&
@ -315,7 +324,7 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c)
bch2_journal_halt(&c->journal);
bch2_fs_read_only_async(c);
wake_up(&bch_read_only_wait);
wake_up(&bch2_read_only_wait);
return ret;
}
@ -395,7 +404,14 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
goto err;
}
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
#endif
set_bit(BCH_FS_RW, &c->flags);
set_bit(BCH_FS_WAS_RW, &c->flags);
@ -462,7 +478,9 @@ static void __bch2_fs_free(struct bch_fs *c)
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
mempool_exit(&c->fill_iter);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_exit(&c->writes);
#endif
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
kfree(c->unused_inode_hints);
@ -769,8 +787,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_init(&c->writes, bch2_writes_disabled,
PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
#endif
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_bio, 1,
max(offsetof(struct btree_read_bio, bio),

View file

@ -250,7 +250,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
*/
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
{
if (percpu_ref_is_zero(&c->writes))
if (!test_bit(BCH_FS_RW, &c->flags) &&
!test_bit(BCH_FS_WAS_RW, &c->flags))
bch2_fs_read_write_early(c);
}

View file

@ -195,6 +195,29 @@ read_attribute(stripes_heap);
read_attribute(open_buckets);
read_attribute(write_points);
#ifdef BCH_WRITE_REF_DEBUG
read_attribute(write_refs);
const char * const bch2_write_refs[] = {
#define x(n) #n,
BCH_WRITE_REFS()
#undef x
NULL
};
/*
 * Print the current value of every per-codepath write refcount to @out,
 * one "name<tab>count" line per ref — the sysfs backend for the
 * write_refs attribute. Debug builds only (c->writes is an array here).
 */
static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c)
{
/* Column width for the ref names, so counts line up. */
bch2_printbuf_tabstop_push(out, 24);
for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) {
prt_str(out, bch2_write_refs[i]);
prt_tab(out);
prt_printf(out, "%li", atomic_long_read(&c->writes[i]));
prt_newline(out);
}
}
#endif
read_attribute(internal_uuid);
read_attribute(has_data);
@ -448,6 +471,11 @@ SHOW(bch2_fs)
if (attr == &sysfs_data_jobs)
data_progress_to_text(out, c);
#ifdef BCH_WRITE_REF_DEBUG
if (attr == &sysfs_write_refs)
bch2_write_refs_to_text(out, c);
#endif
return 0;
}
@ -631,6 +659,9 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_stripes_heap,
&sysfs_open_buckets,
&sysfs_write_points,
#ifdef BCH_WRITE_REF_DEBUG
&sysfs_write_refs,
#endif
&sysfs_io_timers_read,
&sysfs_io_timers_write,
@ -682,7 +713,7 @@ STORE(bch2_fs_opts_dir)
* We don't need to take c->writes for correctness, but it eliminates an
* unsightly error message in the dmesg log when we're RO:
*/
if (unlikely(!percpu_ref_tryget_live(&c->writes)))
if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)))
return -EROFS;
tmp = kstrdup(buf, GFP_KERNEL);
@ -712,7 +743,7 @@ STORE(bch2_fs_opts_dir)
ret = size;
err:
percpu_ref_put(&c->writes);
bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
return ret;
}
SYSFS_OPS(bch2_fs_opts_dir);