bcachefs: Improved btree write statistics

This replaces sysfs btree_avg_write_size with btree_write_stats, which
now breaks out statistics by the source of the btree write.

Btree writes that are too small are a source of inefficiency and of
excessive btree resort overhead - this will let us see what's causing
them.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet <kent.overstreet@linux.dev>, 2022-10-28 17:08:41 -04:00
commit 46fee692ee, parent 8852501fe5
9 changed files with 91 additions and 26 deletions

fs/bcachefs/bcachefs.h

@@ -596,6 +596,23 @@ typedef struct {
#define BCACHEFS_ROOT_SUBVOL_INUM \
((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO })
#define BCH_BTREE_WRITE_TYPES() \
x(initial, 0) \
x(init_next_bset, 1) \
x(cache_reclaim, 2) \
x(journal_reclaim, 3) \
x(interior, 4)
enum btree_write_type {
#define x(t, n) BTREE_WRITE_##t,
BCH_BTREE_WRITE_TYPES()
#undef x
BTREE_WRITE_TYPE_NR,
};
#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1)
#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR))
struct bch_fs {
struct closure cl;
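
For reference, the BCH_BTREE_WRITE_TYPES() x-macro above expands to a
plain enum; a sketch of the preprocessor output:

enum btree_write_type {
	BTREE_WRITE_initial,		/* 0 */
	BTREE_WRITE_init_next_bset,	/* 1 */
	BTREE_WRITE_cache_reclaim,	/* 2 */
	BTREE_WRITE_journal_reclaim,	/* 3 */
	BTREE_WRITE_interior,		/* 4 */
	BTREE_WRITE_TYPE_NR,		/* 5 */
};

/*
 * roundup_pow_of_two(5) == 8, so BTREE_WRITE_TYPE_MASK == 7 and
 * BTREE_WRITE_TYPE_BITS == 3: the write type travels in the low three
 * bits of the flags passed to __bch2_btree_node_write().
 */
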
@@ -705,6 +722,13 @@ struct bch_fs {
struct workqueue_struct *btree_interior_update_worker;
struct work_struct btree_interior_update_work;
/* btree_io.c: */
spinlock_t btree_write_error_lock;
struct btree_write_stats {
atomic64_t nr;
atomic64_t bytes;
} btree_write_stats[BTREE_WRITE_TYPE_NR];
/* btree_iter.c: */
struct mutex btree_trans_lock;
struct list_head btree_trans_list;
@@ -880,11 +904,6 @@ mempool_t bio_bounce_pages;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
atomic64_t btree_writes_nr;
atomic64_t btree_writes_sectors;
spinlock_t btree_write_error_lock;
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;

fs/bcachefs/btree_cache.c

@@ -241,9 +241,11 @@ wait_on_io:
* the post write cleanup:
*/
if (bch2_verify_btree_ondisk)
bch2_btree_node_write(c, b, SIX_LOCK_intent, 0);
bch2_btree_node_write(c, b, SIX_LOCK_intent,
BTREE_WRITE_cache_reclaim);
else
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b,
BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
@@ -347,7 +349,7 @@ restart:
six_trylock_read(&b->c.lock)) {
list_move(&bc->live, &b->list);
mutex_unlock(&bc->lock);
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_read(&b->c.lock);
if (touched >= nr)
goto out_nounlock;
@@ -624,6 +626,7 @@ out:
b->flags = 0;
b->written = 0;
b->nsets = 0;
b->write_type = 0;
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
@@ -1067,7 +1070,7 @@ wait_on_io:
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
if (btree_node_dirty(b)) {
__bch2_btree_node_write(c, b, 0);
__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
goto wait_on_io;

fs/bcachefs/btree_io.c

@@ -471,7 +471,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
};
if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
bch2_btree_node_write(c, b, SIX_LOCK_write, 0);
bch2_btree_node_write(c, b, SIX_LOCK_write,
BTREE_WRITE_init_next_bset);
reinit_iter = true;
}
}
@@ -1646,7 +1647,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b)
} while ((v = cmpxchg(&b->flags, old, new)) != old);
if (new & (1U << BTREE_NODE_write_in_flight))
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED);
__bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type);
else
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
@@ -1795,6 +1796,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
bool used_mempool;
unsigned long old, new;
bool validate_before_checksum = false;
enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;
void *data;
int ret;
@@ -1841,6 +1843,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
if (new & (1U << BTREE_NODE_need_write))
return;
do_write:
if ((flags & BTREE_WRITE_ONLY_IF_NEED))
type = b->write_type;
b->write_type = 0;
BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0));
atomic_dec(&c->btree_cache.dirty);
BUG_ON(btree_node_fake(b));
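
In other words: a write's type normally comes from the low bits of the
caller's flags, but deferred writes take the type recorded by whoever
marked the node dirty. A minimal sketch of that selection (hypothetical
helper, mirroring the hunk above rather than verbatim kernel code):

static enum btree_write_type btree_write_type(struct btree *b, unsigned flags)
{
	enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;

	/* Deferred write: use the type stashed at mark-dirty time: */
	if (flags & BTREE_WRITE_ONLY_IF_NEED)
		type = b->write_type;
	b->write_type = 0;	/* consumed */

	return type;
}
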
@@ -2015,8 +2023,8 @@ do_write:
bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written =
cpu_to_le16(b->written);
atomic64_inc(&c->btree_writes_nr);
atomic64_add(sectors_to_write, &c->btree_writes_sectors);
atomic64_inc(&c->btree_write_stats[type].nr);
atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
INIT_WORK(&wbio->work, btree_write_submit);
queue_work(c->io_complete_wq, &wbio->work);
@@ -2144,3 +2152,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c)
{
return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
}
const char * const bch2_btree_write_types[] = {
#define x(t, n) [n] = #t,
BCH_BTREE_WRITE_TYPES()
NULL
};
void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
printbuf_tabstop_push(out, 20);
printbuf_tabstop_push(out, 10);
prt_tab(out);
prt_str(out, "nr");
prt_tab(out);
prt_str(out, "size");
prt_newline(out);
for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) {
u64 nr = atomic64_read(&c->btree_write_stats[i].nr);
u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes);
prt_printf(out, "%s:", bch2_btree_write_types[i]);
prt_tab(out);
prt_u64(out, nr);
prt_tab(out);
prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0);
prt_newline(out);
}
}
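
The resulting sysfs file emits one row per write type: the number of
writes and the average size per write (bytes/nr, formatted by
prt_human_readable_u64()). A hypothetical sample - values invented,
formatting approximate:

                    nr        size
initial:            742       34.2k
init_next_bset:     18        4.5k
cache_reclaim:      112       12.1k
journal_reclaim:    301       7.8k
interior:           27        9.4k
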

fs/bcachefs/btree_io.h

@@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
#define BTREE_WRITE_ONLY_IF_NEED (1U << 0)
#define BTREE_WRITE_ALREADY_STARTED (1U << 1)
enum btree_write_flags {
__BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS,
__BTREE_WRITE_ALREADY_STARTED,
};
#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED)
#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED)
void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
bn->min_key = bpos_nosnap_successor(bn->min_key);
}
void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *);
#endif /* _BCACHEFS_BTREE_IO_H */
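
With this layout, the low BTREE_WRITE_TYPE_BITS bits of the flags word
carry the write type and the boolean flags sit above them, so a single
unsigned carries both. A small sketch of the encoding, assuming the
definitions above (BTREE_WRITE_TYPE_BITS == 3):

unsigned flags = BTREE_WRITE_ONLY_IF_NEED|BTREE_WRITE_journal_reclaim;

enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK;	/* journal_reclaim (3) */
bool only_if_need = flags & BTREE_WRITE_ONLY_IF_NEED;		/* true */
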

fs/bcachefs/btree_types.h

@@ -77,6 +77,7 @@ struct btree {
u8 nsets;
u8 nr_key_bits;
u16 version_ondisk;
u8 write_type;
struct bkey_format format;

fs/bcachefs/btree_update_interior.c

@@ -1308,6 +1308,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
set_btree_node_dirty_acct(c, b);
set_btree_node_need_write(b);
b->write_type = BTREE_WRITE_interior;
printbuf_exit(&buf);
}

fs/bcachefs/btree_update_interior.h

@@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
struct bkey_packed k;
BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
struct bkey *u = (void *) &k;

fs/bcachefs/btree_update_leaf.c

@@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
new |= 1 << BTREE_NODE_need_write;
} while ((v = cmpxchg(&b->flags, old, new)) != old);
b->write_type = BTREE_WRITE_journal_reclaim;
btree_node_write_if_need(c, b, SIX_LOCK_read);
six_unlock_read(&b->c.lock);
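
Note that __btree_node_flush() stashes the type on the node instead of
passing it directly: btree_node_write_if_need() issues the write with
BTREE_WRITE_ONLY_IF_NEED, so the type is read back from b->write_type
at write time (see the type-selection sketch in the btree_io.c section
above).
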

fs/bcachefs/sysfs.c

@@ -183,7 +183,7 @@ read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
read_attribute(btree_avg_write_size);
read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
return ret;
}
static size_t bch2_btree_avg_write_size(struct bch_fs *c)
{
u64 nr = atomic64_read(&c->btree_writes_nr);
u64 sectors = atomic64_read(&c->btree_writes_sectors);
return nr ? div64_u64(sectors, nr) : 0;
}
static long data_progress_to_text(struct printbuf *out, struct bch_fs *c)
{
long ret = 0;
@@ -396,7 +388,9 @@ SHOW(bch2_fs)
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
if (attr == &sysfs_btree_write_stats)
bch2_btree_write_stats_to_text(out, c);
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
@@ -557,7 +551,7 @@ SYSFS_OPS(bch2_fs);
struct attribute *bch2_fs_files[] = {
&sysfs_minor,
&sysfs_btree_cache_size,
&sysfs_btree_avg_write_size,
&sysfs_btree_write_stats,
&sysfs_promote_whole_extents,