From 46fee692eebb850b8478531e185fb5a5f942d3ea Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Oct 2022 17:08:41 -0400 Subject: [PATCH] bcachefs: Improved btree write statistics This replaces sysfs btree_avg_write_size with btree_write_stats, which now breaks out statistics by the source of the btree write. Btree writes that are too small are a source of inefficiency, and excessive btree resort overhead - this will let us see what's causing them. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 29 ++++++++++++++---- fs/bcachefs/btree_cache.c | 11 ++++--- fs/bcachefs/btree_io.c | 46 ++++++++++++++++++++++++++--- fs/bcachefs/btree_io.h | 10 +++++-- fs/bcachefs/btree_types.h | 1 + fs/bcachefs/btree_update_interior.c | 1 + fs/bcachefs/btree_update_interior.h | 1 + fs/bcachefs/btree_update_leaf.c | 2 ++ fs/bcachefs/sysfs.c | 16 ++++------ 9 files changed, 91 insertions(+), 26 deletions(-) diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 544621dd4af4..18fe09cdae4d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -596,6 +596,23 @@ typedef struct { #define BCACHEFS_ROOT_SUBVOL_INUM \ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(BTREE_WRITE_TYPE_MASK) + struct bch_fs { struct closure cl; @@ -705,6 +722,13 @@ struct bch_fs { struct workqueue_struct *btree_interior_update_worker; struct work_struct btree_interior_update_work; + /* btree_io.c: */ + spinlock_t btree_write_error_lock; + struct btree_write_stats { + atomic64_t nr; + atomic64_t bytes; + } btree_write_stats[BTREE_WRITE_TYPE_NR]; + /* btree_iter.c: */ struct mutex btree_trans_lock; struct list_head btree_trans_list; @@ -880,11 +904,6 @@ mempool_t bio_bounce_pages; struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; - - atomic64_t btree_writes_nr; - atomic64_t btree_writes_sectors; - spinlock_t btree_write_error_lock; - /* ERRORS */ struct list_head fsck_errors; struct mutex fsck_error_lock; diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 135c3ea1377d..709453a909fc 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -241,9 +241,11 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) * the post write cleanup: */ if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, 0); + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); else - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -347,7 +349,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, six_trylock_read(&b->c.lock)) { list_move(&bc->live, &b->list); mutex_unlock(&bc->lock); - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_read(&b->c.lock); if (touched >= nr) goto out_nounlock; @@ -624,6 +626,7 @@ struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c, bool pcpu_read_locks) b->flags = 0; b->written = 0; b->nsets = 0; + b->write_type = 0; b->sib_u64s[0] = 0; b->sib_u64s[1] = 0; b->whiteout_u64s = 0; @@ -1067,7 +1070,7 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); if (btree_node_dirty(b)) { - __bch2_btree_node_write(c, b, 0); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index a322a8367688..56f9637d2ca6 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -471,7 +471,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) }; if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write, 0); + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); reinit_iter = true; } } @@ -1646,7 +1647,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|b->write_type); else wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1795,6 +1796,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool used_mempool; unsigned long old, new; bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; int ret; @@ -1841,6 +1843,12 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (new & (1U << BTREE_NODE_need_write)) return; do_write: + if ((flags & BTREE_WRITE_ONLY_IF_NEED)) + type = b->write_type; + b->write_type = 0; + + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2015,8 +2023,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); - atomic64_inc(&c->btree_writes_nr); - atomic64_add(sectors_to_write, &c->btree_writes_sectors); + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->io_complete_wq, &wbio->work); @@ -2144,3 +2152,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) { return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } + +const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); + prt_newline(out); + } +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 8af853642123..4b1810ad7d91 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -139,8 +139,12 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *, bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -#define BTREE_WRITE_ONLY_IF_NEED (1U << 0) -#define BTREE_WRITE_ALREADY_STARTED (1U << 1) +enum btree_write_flags { + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; +#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) +#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); void bch2_btree_node_write(struct bch_fs *, struct btree *, @@ -219,4 +223,6 @@ static inline void compat_btree_node(unsigned level, enum btree_id btree_id, bn->min_key = bpos_nosnap_successor(bn->min_key); } +void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); + #endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ea844dd7a16b..38c4754dbd7e 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -77,6 +77,7 @@ struct btree { u8 nsets; u8 nr_key_bits; u16 version_ondisk; + u8 write_type; struct bkey_format format; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 0150943074fa..e0483abadd72 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -1308,6 +1308,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); set_btree_node_dirty_acct(c, b); set_btree_node_need_write(b); + b->write_type = BTREE_WRITE_interior; printbuf_exit(&buf); } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index dabe81596544..2e6d220c3bcd 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -282,6 +282,7 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b, struct bkey_packed k; BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + EBUG_ON(btree_node_just_written(b)); if (!bkey_pack_pos(&k, pos, b)) { struct bkey *u = (void *) &k; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index fc53958e5619..8cc271030be6 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -181,6 +181,8 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, new |= 1 << BTREE_NODE_need_write; } while ((v = cmpxchg(&b->flags, old, new)) != old); + b->write_type = BTREE_WRITE_journal_reclaim; + btree_node_write_if_need(c, b, SIX_LOCK_read); six_unlock_read(&b->c.lock); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 76301209898f..db3d377ba10c 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -183,7 +183,7 @@ read_attribute(io_latency_stats_read); read_attribute(io_latency_stats_write); read_attribute(congested); -read_attribute(btree_avg_write_size); +read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -250,14 +250,6 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) return ret; } -static size_t bch2_btree_avg_write_size(struct bch_fs *c) -{ - u64 nr = atomic64_read(&c->btree_writes_nr); - u64 sectors = atomic64_read(&c->btree_writes_sectors); - - return nr ? div64_u64(sectors, nr) : 0; -} - static long data_progress_to_text(struct printbuf *out, struct bch_fs *c) { long ret = 0; @@ -396,7 +388,9 @@ SHOW(bch2_fs) sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); - sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c)); + + if (attr == &sysfs_btree_write_stats) + bch2_btree_write_stats_to_text(out, c); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); @@ -557,7 +551,7 @@ SYSFS_OPS(bch2_fs); struct attribute *bch2_fs_files[] = { &sysfs_minor, &sysfs_btree_cache_size, - &sysfs_btree_avg_write_size, + &sysfs_btree_write_stats, &sysfs_promote_whole_extents,