bcachefs: Simplify journal replay

With BTREE_ITER_WITH_JOURNAL, there are no longer any restrictions on
the order in which we have to replay keys from the journal, and we can
also start up journal reclaim right away - and delete a bunch of code.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
commit d8601afca8 (parent 8e432d98a5)
Author: Kent Overstreet, 2021-12-27 23:10:06 -05:00 (committed by Kent Overstreet)
8 changed files with 22 additions and 114 deletions
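
The heart of the change shows up in the recovery.c hunks below:
journal_sort_seq_cmp() now orders keys by journal sequence number alone, so
bch2_journal_replay() makes a single pass over the keys instead of three
(alloc keys, then interior nodes, then leaf nodes), and journal reclaim no
longer needs its own JOURNAL_RECLAIM_STARTED gate. As a rough stand-alone
sketch of that ordering (the struct and the main() driver here are
simplified stand-ins for illustration, not the real bcachefs types):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-in for struct journal_key (illustrative only). */
    struct journal_key {
    	uint64_t	journal_seq;
    	unsigned	btree_id;
    	unsigned	level;
    };

    #define cmp_int(l, r)	((l) > (r) ? 1 : (l) < (r) ? -1 : 0)

    /* After this commit: sort strictly by journal sequence number. */
    static int journal_sort_seq_cmp(const void *_l, const void *_r)
    {
    	const struct journal_key *l = *(const struct journal_key **)_l;
    	const struct journal_key *r = *(const struct journal_key **)_r;

    	return cmp_int(l->journal_seq, r->journal_seq);
    }

    int main(void)
    {
    	struct journal_key keys[] = {
    		{ .journal_seq = 7, .btree_id = 1, .level = 0 },
    		{ .journal_seq = 3, .btree_id = 0, .level = 1 },
    		{ .journal_seq = 5, .btree_id = 2, .level = 0 },
    	};
    	struct journal_key *sorted[] = { &keys[0], &keys[1], &keys[2] };
    	unsigned i;

    	qsort(sorted, 3, sizeof(*sorted), journal_sort_seq_cmp);

    	/*
    	 * Replay is now one pass in sequence order: alloc, interior and
    	 * leaf keys are interleaved rather than replayed in phases.
    	 */
    	for (i = 0; i < 3; i++)
    		printf("replay seq %llu (btree %u, level %u)\n",
    		       (unsigned long long)sorted[i]->journal_seq,
    		       sorted[i]->btree_id, sorted[i]->level);
    	return 0;
    }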

@@ -902,8 +902,7 @@ static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
 static bool allocator_thread_running(struct bch_dev *ca)
 {
 	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags) &&
-		test_bit(BCH_FS_ALLOC_REPLAY_DONE, &ca->fs->flags)
+		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
 		? ALLOCATOR_running
 		: ALLOCATOR_stopped;
 	alloc_thread_set_state(ca, state);

@@ -510,8 +510,6 @@ enum {
 	BCH_FS_INITIAL_GC_DONE,
 	BCH_FS_INITIAL_GC_UNFIXED,
 	BCH_FS_TOPOLOGY_REPAIR_DONE,
-	BCH_FS_ALLOC_REPLAY_DONE,
-	BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
 	BCH_FS_FSCK_DONE,
 	BCH_FS_STARTED,
 	BCH_FS_RW,

@@ -16,8 +16,7 @@ static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
 	size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys);
 	size_t max_dirty = 4096 + (nr_keys * 3) / 4;

-	return nr_dirty > max_dirty &&
-		test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+	return nr_dirty > max_dirty;
 }

 int bch2_btree_key_cache_journal_flush(struct journal *,

@@ -45,7 +45,7 @@ static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)

 	BUG_ON(!b->c.level);

-	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
 		return;

 	bch2_btree_node_iter_init_from_start(&iter, b);
@@ -1851,9 +1851,6 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
 	struct async_btree_rewrite *a;

-	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
-		return;
-
 	if (!percpu_ref_tryget(&c->writes))
 		return;

@@ -206,9 +206,6 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 	int old_live_u64s = b->nr.live_u64s;
 	int live_u64s_added, u64s_added;

-	EBUG_ON(!insert->level &&
-		!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
-
 	if (unlikely(!bch2_btree_bset_insert_key(trans, insert->path, b,
 					&insert_l(insert)->iter, insert->k)))
 		return false;

@@ -489,9 +489,6 @@ static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
 	u64 seq;
 	int err;

-	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-		return 0;
-
 	lockdep_assert_held(&j->reclaim_lock);

 	while (1) {
@@ -689,8 +686,6 @@ static int bch2_journal_reclaim_thread(void *arg)

 	set_freezable();

-	kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
-
 	j->last_flushed = jiffies;

 	while (!ret && !kthread_should_stop()) {

@@ -148,7 +148,6 @@ enum journal_space_from {
 enum {
 	JOURNAL_REPLAY_DONE,
 	JOURNAL_STARTED,
-	JOURNAL_RECLAIM_STARTED,
 	JOURNAL_NEED_WRITE,
 	JOURNAL_MAY_GET_UNRESERVED,
 	JOURNAL_MAY_SKIP_FLUSH,

@@ -474,8 +474,8 @@ static void replay_now_at(struct journal *j, u64 seq)
 	bch2_journal_pin_put(j, j->replay_journal_seq++);
 }

-static int __bch2_journal_replay_key(struct btree_trans *trans,
-				     struct journal_key *k)
+static int bch2_journal_replay_key(struct btree_trans *trans,
+				   struct journal_key *k)
 {
 	struct btree_iter iter;
 	unsigned iter_flags =
@@ -484,7 +484,7 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 	int ret;

 	if (!k->level && k->btree_id == BTREE_ID_alloc)
-		iter_flags |= BTREE_ITER_CACHED|BTREE_ITER_CACHED_NOFILL;
+		iter_flags |= BTREE_ITER_CACHED;

 	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
 				  BTREE_MAX_DEPTH, k->level,
@@ -503,29 +503,12 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 	return ret;
 }

-static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
-{
-	unsigned commit_flags =
-		BTREE_INSERT_LAZY_RW|
-		BTREE_INSERT_NOFAIL|
-		BTREE_INSERT_JOURNAL_RESERVED;
-
-	if (!k->allocated)
-		commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
-
-	return bch2_trans_do(c, NULL, NULL, commit_flags,
-			     __bch2_journal_replay_key(&trans, k));
-}
-
 static int journal_sort_seq_cmp(const void *_l, const void *_r)
 {
 	const struct journal_key *l = *((const struct journal_key **)_l);
 	const struct journal_key *r = *((const struct journal_key **)_r);

-	return cmp_int(r->level, l->level) ?:
-		cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->btree_id, r->btree_id) ?:
-		bpos_cmp(l->k->k.p, r->k->k.p);
+	return cmp_int(l->journal_seq, r->journal_seq);
 }

 static int bch2_journal_replay(struct bch_fs *c)
@@ -533,10 +516,7 @@ static int bch2_journal_replay(struct bch_fs *c)
 	struct journal_keys *keys = &c->journal_keys;
 	struct journal_key **keys_sorted, *k;
 	struct journal *j = &c->journal;
-	struct bch_dev *ca;
-	unsigned idx;
 	size_t i;
-	u64 seq;
 	int ret;

 	keys_sorted = kmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
@@ -555,73 +535,25 @@ static int bch2_journal_replay(struct bch_fs *c)
 		replay_now_at(j, keys->journal_seq_base);
 	}

-	seq = j->replay_journal_seq;
-
-	/*
-	 * First replay updates to the alloc btree - these will only update the
-	 * btree key cache:
-	 */
 	for (i = 0; i < keys->nr; i++) {
 		k = keys_sorted[i];

 		cond_resched();

-		if (!k->level && k->btree_id == BTREE_ID_alloc) {
-			j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
-			ret = bch2_journal_replay_key(c, k);
-			if (ret)
-				goto err;
-		}
-	}
+		if (!k->allocated)
+			replay_now_at(j, keys->journal_seq_base + k->journal_seq);

-	/* Now we can start the allocator threads: */
-	set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
-	for_each_member_device(ca, c, idx)
-		bch2_wake_allocator(ca);
-
-	/*
-	 * Next replay updates to interior btree nodes:
-	 */
-	for (i = 0; i < keys->nr; i++) {
-		k = keys_sorted[i];
-
-		cond_resched();
-
-		if (k->level) {
-			j->replay_journal_seq = keys->journal_seq_base + k->journal_seq;
-			ret = bch2_journal_replay_key(c, k);
-			if (ret)
-				goto err;
-		}
-	}
-
-	/*
-	 * Now that the btree is in a consistent state, we can start journal
-	 * reclaim (which will be flushing entries from the btree key cache back
-	 * to the btree:
-	 */
-	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-	set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
-	journal_reclaim_kick(j);
-
-	j->replay_journal_seq = seq;
-
-	/*
-	 * Now replay leaf node updates:
-	 */
-	for (i = 0; i < keys->nr; i++) {
-		k = keys_sorted[i];
-
-		cond_resched();
-
-		if (k->level || k->btree_id == BTREE_ID_alloc)
-			continue;
-
-		replay_now_at(j, keys->journal_seq_base + k->journal_seq);
-
-		ret = bch2_journal_replay_key(c, k);
-		if (ret)
+		ret = bch2_trans_do(c, NULL, NULL,
+				    BTREE_INSERT_LAZY_RW|
+				    BTREE_INSERT_NOFAIL|
+				    BTREE_INSERT_JOURNAL_RESERVED|
+				    (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0),
+			  bch2_journal_replay_key(&trans, k));
+		if (ret) {
+			bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+				ret, bch2_btree_ids[k->btree_id], k->level);
 			goto err;
+		}
 	}

 	replay_now_at(j, j->replay_journal_seq_end);
@@ -629,14 +561,9 @@ static int bch2_journal_replay(struct bch_fs *c)

 	bch2_journal_set_replay_done(j);
 	bch2_journal_flush_all_pins(j);

-	kfree(keys_sorted);
-	return bch2_journal_error(j);
+	ret = bch2_journal_error(j);
 err:
-	bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
-		ret, bch2_btree_ids[k->btree_id], k->level);
 	kfree(keys_sorted);
 	return ret;
 }
@@ -1215,7 +1142,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 	ret = bch2_journal_replay(c);
 	if (ret)
 		goto err;
-	bch_verbose(c, "journal replay done");
+	if (c->opts.verbose || !c->sb.clean)
+		bch_info(c, "journal replay done");

 	if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
 	    !c->opts.nochanges) {
@@ -1385,10 +1313,6 @@ int bch2_fs_initialize(struct bch_fs *c)
 	for (i = 0; i < BTREE_ID_NR; i++)
 		bch2_btree_root_alloc(c, i);

-	set_bit(BCH_FS_ALLOC_REPLAY_DONE, &c->flags);
-	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
-	set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
-
 	err = "unable to allocate journal buckets";
 	for_each_online_member(ca, c, i) {
 		ret = bch2_dev_journal_alloc(ca);