bcachefs: Btree write buffer

This adds a new method of doing btree updates - a straight write buffer,
implemented as a flat, fixed-size array.

This is only useful when we don't need to read from the btree in order
to do the update, and when reading is infrequent - perfect for the LRU
btree.

This will make LRU btree updates fast enough that we'll be able to use
it for persistently indexing buckets by fragmentation, which will be a
massive boost to copygc performance.

Changes:
 - A new btree_insert_type enum for btree_insert_entries, specifying
   whether an update targets the btree, the btree key cache, or the
   btree write buffer.

 - bch2_trans_update_buffered(): updates via the btree write buffer
   don't need a btree path, so they get a new update path (see the
   usage sketch after this list).

 - Transaction commit path changes:
   Updates to the btree write buffer mutate global state and can fail
   if there isn't currently room, so we apply all of a transaction's
   write buffer updates at once; if that fails, we also have to revert
   filesystem usage counter changes.

   If there isn't room we flush the write buffer in the transaction
   commit error path and retry.

 - A new persistent option for specifying the number of entries in the
   write buffer.
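
As a usage sketch, here is how a caller might queue an LRU update
through the write buffer (lru_set_sketch is a hypothetical helper, not
part of this commit; bch2_trans_do, bkey_lru_init and BTREE_ID_lru are
existing interfaces):

	static int lru_set_sketch(struct bch_fs *c, struct bpos pos)
	{
		struct bkey_i_lru lru;

		bkey_lru_init(&lru.k_i);
		lru.k.p = pos;

		/* No btree path: the key is queued in trans->wb_updates and
		 * copied into the flat array at commit time. */
		return bch2_trans_do(c, NULL, NULL, 0,
			bch2_trans_update_buffered(&trans, BTREE_ID_lru, &lru.k_i));
	}

If the commit finds the buffer full it fails with
-BCH_ERR_btree_insert_need_flush_buffer, the error path flushes, and
the transaction restarts - all invisible to the caller.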

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet <kent.overstreet@linux.dev>
Date:   2023-01-04 00:00:50 -05:00
Commit: 920e69bc3d (parent: f2b542ba42)
16 changed files with 677 additions and 21 deletions

fs/bcachefs/Makefile

@@ -16,6 +16,7 @@ bcachefs-y := \
btree_locking.o \
btree_update_interior.o \
btree_update_leaf.o \
btree_write_buffer.o \
buckets.o \
buckets_waiting_for_journal.o \
chardev.o \

fs/bcachefs/bcachefs.h

@@ -391,6 +391,7 @@ enum bch_time_stats {
#include "alloc_types.h"
#include "btree_types.h"
#include "btree_write_buffer_types.h"
#include "buckets_types.h"
#include "buckets_waiting_for_journal_types.h"
#include "clock_types.h"
@@ -575,6 +576,7 @@ struct btree_transaction_stats {
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
unsigned wb_updates_size;
unsigned max_mem;
char *max_paths_text;
};
@@ -789,6 +791,8 @@ struct bch_fs {
struct btree_key_cache btree_key_cache;
unsigned btree_key_cache_btrees;
struct btree_write_buffer btree_write_buffer;
struct workqueue_struct *btree_update_wq;
struct workqueue_struct *btree_io_complete_wq;
/* copygc needs its own workqueue for index updates.. */

fs/bcachefs/bcachefs_format.h

@@ -1404,7 +1404,8 @@ struct bch_sb_field_disk_groups {
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
x(write_super, 73) \
x(trans_restart_would_deadlock_recursion_limit, 74)
x(trans_restart_would_deadlock_recursion_limit, 74) \
x(trans_restart_write_buffer_flush, 75)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
@@ -1633,6 +1634,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54);
/*
* Features:

fs/bcachefs/btree_iter.c

@@ -1374,6 +1374,7 @@ noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
struct btree_insert_entry *i;
struct btree_write_buffered_key *wb;
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
@@ -1398,6 +1399,17 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
trans_for_each_wb_update(trans, wb) {
prt_printf(buf, "update: btree=%s wb=1 %pS",
bch2_btree_ids[wb->btree],
(void *) i->ip_allocated);
prt_newline(buf);
prt_printf(buf, " new ");
bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
prt_newline(buf);
}
printbuf_indent_sub(buf, 2);
}
@@ -2929,8 +2941,11 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_
trans->mem_bytes = expected_mem_bytes;
}
}
if (s)
if (s) {
trans->nr_max_paths = s->nr_max_paths;
trans->wb_updates_size = s->wb_updates_size;
}
trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
trans->srcu_lock_time = jiffies;

fs/bcachefs/btree_types.h

@@ -419,6 +419,8 @@ struct btree_trans {
u8 fn_idx;
u8 nr_sorted;
u8 nr_updates;
u8 nr_wb_updates;
u8 wb_updates_size;
bool used_mempool:1;
bool in_traverse_all:1;
bool paths_sorted:1;
@@ -448,6 +450,7 @@ struct btree_trans {
u8 sorted[BTREE_ITER_MAX + 8];
struct btree_path *paths;
struct btree_insert_entry *updates;
struct btree_write_buffered_key *wb_updates;
/* update path: */
struct btree_trans_commit_hook *hooks;

fs/bcachefs/btree_update.h

@@ -15,6 +15,9 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *,
struct bkey_i *);
void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
enum btree_insert_flags {
/* First two bits for journal watermark: */
__BTREE_INSERT_NOFAIL = 2,
@@ -77,6 +80,8 @@ int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
int __must_check bch2_trans_update_buffered(struct btree_trans *,
enum btree_id, struct bkey_i *);
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
@@ -142,6 +147,11 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
#define trans_for_each_wb_update(_trans, _i) \
for ((_i) = (_trans)->wb_updates; \
(_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
(_i)++)
static inline void bch2_trans_reset_updates(struct btree_trans *trans)
{
struct btree_insert_entry *i;
@@ -151,6 +161,8 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans)
trans->extra_journal_res = 0;
trans->nr_updates = 0;
trans->nr_wb_updates = 0;
trans->wb_updates = NULL;
trans->hooks = NULL;
trans->extra_journal_entries.nr = 0;

fs/bcachefs/btree_update_leaf.c

@@ -8,6 +8,7 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "debug.h"
#include "errcode.h"
@@ -100,9 +101,6 @@ inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
if (path->cached)
return;
if (unlikely(btree_node_just_written(b)) &&
bch2_btree_post_write_cleanup(c, b))
bch2_trans_node_reinit_iter(trans, b);
@@ -252,25 +250,26 @@ inline void bch2_btree_add_journal_pin(struct bch_fs *c,
/**
* btree_insert_key - insert one key into a leaf node
*/
static void btree_insert_key_leaf(struct btree_trans *trans,
struct btree_insert_entry *insert)
inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
struct btree_path *path,
struct bkey_i *insert,
u64 journal_seq)
{
struct bch_fs *c = trans->c;
struct btree *b = insert_l(insert)->b;
struct btree *b = path_l(path)->b;
struct bset_tree *t = bset_tree_last(b);
struct bset *i = bset(b, t);
int old_u64s = bset_u64s(t);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
&insert_l(insert)->iter, insert->k)))
if (unlikely(!bch2_btree_bset_insert_key(trans, path, b,
&path_l(path)->iter, insert)))
return;
i->journal_seq = cpu_to_le64(max(trans->journal_res.seq,
le64_to_cpu(i->journal_seq)));
i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq)));
bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty_acct(c, b);
@@ -288,6 +287,12 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
bch2_trans_node_reinit_iter(trans, b);
}
static void btree_insert_key_leaf(struct btree_trans *trans,
struct btree_insert_entry *insert)
{
bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq);
}
/* Cached btree updates: */
/* Normal update interface: */
@@ -594,6 +599,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
bool marking = false;
@@ -638,6 +644,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
marking = true;
}
if (trans->nr_wb_updates &&
trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
return -BCH_ERR_btree_insert_need_flush_buffer;
/*
* Don't get journal reservation until after we know insert will
* succeed:
@@ -674,17 +684,25 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
if (trans->nr_wb_updates) {
EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
ret = bch2_btree_insert_keys_write_buffer(trans);
if (ret)
goto revert_fs_usage;
}
trans_for_each_update(trans, i)
if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
ret = run_one_mem_trigger(trans, i, i->flags);
if (ret)
return ret;
goto fatal_err;
}
if (unlikely(c->gc_pos.phase)) {
ret = bch2_trans_commit_run_gc_triggers(trans);
if (ret)
return ret;
goto fatal_err;
}
if (unlikely(trans->extra_journal_entries.nr)) {
@@ -697,10 +715,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
trans_for_each_update(trans, i) {
struct journal *j = &c->journal;
struct jset_entry *entry;
struct journal *j = &c->journal;
struct jset_entry *entry;
trans_for_each_update(trans, i) {
if (i->key_cache_already_flushed)
continue;
@@ -725,6 +743,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bkey_copy(&entry->start[0], i->k);
}
trans_for_each_wb_update(trans, wb) {
entry = bch2_journal_add_entry(j, &trans->journal_res,
BCH_JSET_ENTRY_btree_keys,
wb->btree, 0,
wb->k.k.u64s);
bkey_copy(&entry->start[0], &wb->k);
}
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
@@ -742,6 +768,12 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
}
return 0;
fatal_err:
bch2_fatal_error(c);
revert_fs_usage:
if (trans->fs_usage_deltas)
bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas);
return ret;
}
@@ -769,7 +801,8 @@ static inline int trans_lock_write(struct btree_trans *trans)
if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
return trans_lock_write_fail(trans, i);
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
if (!i->cached)
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
return 0;
@@ -778,9 +811,13 @@ static inline int trans_lock_write(struct btree_trans *trans)
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
struct btree_insert_entry *i;
struct btree_write_buffered_key *wb;
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
trans_for_each_wb_update(trans, wb)
bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
#ifdef CONFIG_BCACHEFS_DEBUG
@@ -821,10 +858,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct printbuf buf = PRINTBUF;
int ret, u64s_delta = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
struct printbuf buf = PRINTBUF;
trans_for_each_update(trans, i) {
int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE;
@@ -833,8 +871,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf);
btree_insert_entry_checks(trans, i);
}
#endif
printbuf_exit(&buf);
#endif
trans_for_each_update(trans, i) {
if (i->cached)
@@ -962,6 +1000,30 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
if (ret)
trace_and_count(c, trans_restart_journal_reclaim, trans, trace_ip);
break;
case -BCH_ERR_btree_insert_need_flush_buffer: {
struct btree_write_buffer *wb = &c->btree_write_buffer;
ret = 0;
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_reset_updates(trans);
bch2_trans_unlock(trans);
mutex_lock(&wb->flush_lock);
if (wb->state.nr > wb->size * 3 / 4)
ret = __bch2_btree_write_buffer_flush(trans,
flags|BTREE_INSERT_NOCHECK_RW, true);
else
mutex_unlock(&wb->flush_lock);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
}
break;
}
default:
BUG_ON(ret >= 0);
break;
@@ -1023,10 +1085,12 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i = NULL;
struct btree_write_buffered_key *wb;
unsigned u64s;
int ret = 0;
if (!trans->nr_updates &&
!trans->nr_wb_updates &&
!trans->extra_journal_entries.nr)
goto out_reset;
@@ -1049,6 +1113,20 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
mutex_trylock(&c->btree_write_buffer.flush_lock)) {
bch2_trans_begin(trans);
bch2_trans_unlock(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BTREE_INSERT_NOCHECK_RW, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
goto out;
}
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
@@ -1089,6 +1167,9 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
trans_for_each_wb_update(trans, wb)
trans->journal_u64s += jset_u64s(wb->k.k.u64s);
if (trans->extra_journal_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
trans->extra_journal_res,
@@ -1606,6 +1687,59 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
return bch2_trans_update_by_path(trans, path, k, flags);
}
int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
enum btree_id btree,
struct bkey_i *k)
{
struct btree_write_buffered_key *i;
int ret;
EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
trans_for_each_wb_update(trans, i) {
if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
bkey_copy(&i->k, k);
return 0;
}
}
if (!trans->wb_updates ||
trans->nr_wb_updates == trans->wb_updates_size) {
struct btree_write_buffered_key *u;
if (trans->nr_wb_updates == trans->wb_updates_size) {
struct btree_transaction_stats *s = btree_trans_stats(trans);
BUG_ON(trans->wb_updates_size > U8_MAX / 2);
trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
if (s)
s->wb_updates_size = trans->wb_updates_size;
}
u = bch2_trans_kmalloc_nomemzero(trans,
trans->wb_updates_size *
sizeof(struct btree_write_buffered_key));
ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
if (trans->nr_wb_updates)
memcpy(u, trans->wb_updates, trans->nr_wb_updates *
sizeof(struct btree_write_buffered_key));
trans->wb_updates = u;
}
trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
.btree = btree,
};
bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
trans->nr_wb_updates++;
return 0;
}
void bch2_trans_commit_hook(struct btree_trans *trans,
struct btree_trans_commit_hook *h)
{

fs/bcachefs/btree_write_buffer.c

@@ -0,0 +1,330 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
#include "journal_reclaim.h"
#include <linux/sort.h>
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
return cmp_int(l->btree, r->btree) ?:
bpos_cmp(l->k.k.p, r->k.k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
return cmp_int(l->journal_seq, r->journal_seq);
}
static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_write_buffered_key *wb,
unsigned commit_flags,
bool *write_locked,
size_t *fast)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
path = iter->path;
if (!*write_locked) {
ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
if (ret)
return ret;
bch2_btree_node_prep_for_write(trans, path, path->l[0].b);
*write_locked = true;
}
if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
*write_locked = false;
goto trans_commit;
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
trans_commit:
return bch2_trans_update(trans, iter, &wb->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RECLAIM);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
{
union btree_write_buffer_state old, new;
u64 v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
new.nr = 0;
new.idx++;
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
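/* wait for writers still copying into the old buffer to drop their refs: */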
while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
cpu_relax();
smp_mb();
return old;
}
int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
bool locked)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin;
struct btree_write_buffered_key *i, *dst, *keys;
struct btree_iter iter = { NULL };
size_t nr = 0, skipped = 0, fast = 0;
bool write_locked = false;
union btree_write_buffer_state s;
int ret = 0;
memset(&pin, 0, sizeof(pin));
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
keys = wb->keys[s.idx];
nr = s.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
* then we attempt to flush in sorted btree order, as this is most
* efficient.
*
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal, if we weren't
* passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, we sort them by the order they appeared in the
* journal - after dropping redundant entries - and then restart
* flushing, this time dropping journal pins as we go.
*/
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_key_cmp, NULL);
for (i = keys; i < keys + nr; i++) {
if (i + 1 < keys + nr &&
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
continue;
}
if (write_locked &&
(iter.path->btree_id != i->btree ||
bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
write_locked = false;
}
if (!iter.path || iter.path->btree_id != i->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT);
}
bch2_btree_iter_set_pos(&iter, i->k.k.p);
iter.path->preserve = false;
do {
ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
commit_flags, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (ret)
break;
}
if (write_locked)
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
bch2_trans_iter_exit(trans, &iter);
trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
goto slowpath;
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
out:
bch2_journal_pin_drop(j, &pin);
mutex_unlock(&wb->flush_lock);
return ret;
slowpath:
trace_write_buffer_flush_slowpath(trans, i - keys, nr);
dst = keys;
for (; i < keys + nr; i++) {
if (i + 1 < keys + nr &&
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p))
continue;
*dst = *i;
dst++;
}
nr = dst - keys;
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_journal_cmp,
NULL);
for (i = keys; i < keys + nr; i++) {
if (i->journal_seq > pin.seq) {
struct journal_entry_pin pin2;
memset(&pin2, 0, sizeof(pin2));
bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
bch2_journal_pin_drop(j, &pin);
bch2_journal_pin_copy(j, &pin, &pin2, NULL);
bch2_journal_pin_drop(j, &pin2);
}
ret = commit_do(trans, NULL, NULL,
commit_flags|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_JOURNAL_RECLAIM|
JOURNAL_WATERMARK_reserved,
__bch2_btree_insert(trans, i->btree, &i->k));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
break;
}
goto out;
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
bch2_trans_unlock(trans);
mutex_lock(&trans->c->btree_write_buffer.flush_lock);
return __bch2_btree_write_buffer_flush(trans, 0, true);
}
int bch2_btree_write_buffer_flush(struct btree_trans *trans)
{
return __bch2_btree_write_buffer_flush(trans, 0, false);
}
static int bch2_btree_write_buffer_journal_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_lock(&wb->flush_lock);
return bch2_trans_run(c,
__bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true));
}
static inline u64 btree_write_buffer_ref(int idx)
{
return ((union btree_write_buffer_state) {
.ref0 = idx == 0,
.ref1 = idx == 1,
}).v;
}
int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key *i;
union btree_write_buffer_state old, new;
int ret = 0;
u64 v;
trans_for_each_wb_update(trans, i) {
EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
i->journal_seq = trans->journal_res.seq;
i->journal_offset = trans->journal_res.offset;
}
preempt_disable();
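/* reserve space in the active buffer and take a ref on it, in one cmpxchg: */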
v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
new.v += btree_write_buffer_ref(new.idx);
new.nr += trans->nr_wb_updates;
if (new.nr > wb->size) {
ret = -BCH_ERR_btree_insert_need_flush_buffer;
goto out;
}
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
memcpy(wb->keys[new.idx] + old.nr,
trans->wb_updates,
sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
out:
preempt_enable();
return ret;
}
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
kvfree(wb->keys[1]);
kvfree(wb->keys[0]);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_init(&wb->flush_lock);
wb->size = c->opts.btree_write_buffer_size;
wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
if (!wb->keys[0] || !wb->keys[1])
return -ENOMEM;
return 0;
}

fs/bcachefs/btree_write_buffer.h

@@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
int bch2_btree_write_buffer_flush(struct btree_trans *);
int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */
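
Buffered keys aren't visible to btree iterators until flushed, so the
infrequent readers this design assumes must flush first. A minimal
sketch of that pattern (walk_lru_sketch is a hypothetical caller):

	static int walk_lru_sketch(struct btree_trans *trans)
	{
		/* make all buffered LRU updates visible to iterators: */
		int ret = bch2_btree_write_buffer_flush_sync(trans);

		if (ret)
			return ret;

		/* ... iterate over BTREE_ID_lru as usual ... */
		return 0;
	}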

fs/bcachefs/btree_write_buffer_types.h

@@ -0,0 +1,44 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
struct btree_write_buffered_key {
u64 journal_seq;
unsigned journal_offset;
enum btree_id btree;
__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
};
union btree_write_buffer_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 nr:23;
u64 idx:1;
u64 ref0:20;
u64 ref1:20;
};
};
struct btree_write_buffer {
struct mutex flush_lock;
struct journal_entry_pin journal_pin;
union btree_write_buffer_state state;
size_t size;
struct btree_write_buffered_key *keys[2];
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
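
All of the synchronization lives in that one u64: a writer reserves nr
slots and takes a ref on the active buffer with a single cmpxchg, while
the flush path flips idx, zeroes nr, and waits for the old buffer's
ref count to drain. A standalone userspace sketch of the reserve step
(C11 atomics; wb_reserve is an illustrative name, and relying on the
bitfield layout aliasing the u64 is an assumption, just as in the union
above):

	#include <stdatomic.h>
	#include <stdint.h>

	union wb_state {
		uint64_t v;
		struct {
			uint64_t nr:23;
			uint64_t idx:1;
			uint64_t ref0:20;
			uint64_t ref1:20;
		};
	};

	/* reserve n slots in the active buffer and take a ref on it: */
	static int wb_reserve(_Atomic uint64_t *state, unsigned n,
			      size_t size, union wb_state *old)
	{
		union wb_state new;
		uint64_t v = atomic_load(state);

		do {
			old->v = new.v = v;
			new.nr += n;
			if (new.nr > size)
				return -1; /* full: caller must flush */
			if (new.idx == 0)
				new.ref0++;
			else
				new.ref1++;
		} while (!atomic_compare_exchange_weak(state, &v, new.v));

		/* old->nr is this writer's offset into keys[old->idx]; the
		 * ref (ref0/ref1) is dropped once the copy completes. */
		return 0;
	}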

fs/bcachefs/buckets.c

@@ -1278,6 +1278,47 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
return ret;
}
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
struct bch_fs *c = trans->c;
struct bch_fs_usage *dst;
struct replicas_delta *d, *top = (void *) deltas->d + deltas->used;
s64 added = 0;
unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
dst = fs_usage_ptr(c, trans->journal_res.seq, false);
/* revert changes: */
for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
switch (d->r.data_type) {
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
added += d->delta;
}
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
dst->nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
dst->reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
if (added > 0) {
trans->disk_res->sectors += added;
this_cpu_add(*c->online_reserved, added);
}
preempt_enable();
percpu_up_read(&c->mark_lock);
}
int bch2_trans_fs_usage_apply(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{

fs/bcachefs/buckets.h

@@ -229,6 +229,7 @@ int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct
int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,

fs/bcachefs/errcode.h

@@ -42,6 +42,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\
x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \
x(BCH_ERR_transaction_restart, transaction_restart_split_race) \
x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \
x(BCH_ERR_transaction_restart, transaction_restart_nested) \
x(0, no_btree_node) \
x(BCH_ERR_no_btree_node, no_btree_node_relock) \
@@ -58,6 +59,7 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
x(0, fsck) \

fs/bcachefs/opts.h

@@ -206,6 +206,11 @@ enum opt_type {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(btree_write_buffer_size, u32, \
OPT_FS|OPT_MOUNT, \
OPT_UINT(16, (1U << 20) - 1), \
BCH2_NO_SB_OPT, 1U << 13, \
NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \

fs/bcachefs/super.c

@@ -16,6 +16,7 @@
#include "btree_key_cache.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
@@ -463,6 +464,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
@@ -816,6 +818,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_btree_iter_init(c) ?:
bch2_fs_btree_interior_update_init(c) ?:
bch2_fs_buckets_waiting_for_journal_init(c) ?:
bch2_fs_btree_write_buffer_init(c) ?:
bch2_fs_subvolumes_init(c) ?:
bch2_fs_io_init(c) ?:
bch2_fs_encryption_init(c) ?:

include/trace/events/bcachefs.h

@@ -1111,6 +1111,51 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
__entry->new_u64s)
);
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip),
TP_ARGS(trans, caller_ip)
);
TRACE_EVENT(write_buffer_flush,
TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size),
TP_ARGS(trans, nr, skipped, fast, size),
TP_STRUCT__entry(
__field(size_t, nr )
__field(size_t, skipped )
__field(size_t, fast )
__field(size_t, size )
),
TP_fast_assign(
__entry->nr = nr;
__entry->skipped = skipped;
__entry->fast = fast;
__entry->size = size;
),
TP_printk("%zu/%zu skipped %zu fast %zu",
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
TRACE_EVENT(write_buffer_flush_slowpath,
TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
TP_ARGS(trans, nr, size),
TP_STRUCT__entry(
__field(size_t, nr )
__field(size_t, size )
),
TP_fast_assign(
__entry->nr = nr;
__entry->size = size;
),
TP_printk("%zu/%zu", __entry->nr, __entry->size)
);
#endif /* _TRACE_BCACHEFS_H */
/* This part must be outside protection */