bcachefs: Rewrite journal_seq_blacklist machinery

Now, we store blacklisted journal sequence numbers in the superblock,
not the journal: this greatly simplifies the code, and more importantly
it's now implemented in a way that doesn't require all btree nodes to be
visited before starting the journal - instead, we unconditionally
blacklist the next 4 journal sequence numbers after an unclean shutdown.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2019-04-04 21:53:12 -04:00 committed by Kent Overstreet
parent ece254b258
commit 1dd7f9d98d
15 changed files with 492 additions and 504 deletions
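
For orientation, the mechanism described above can be seen end to end in the recovery.c hunk near the bottom of this diff. A rough sketch of the new flow, using the functions this commit adds (error handling and the surrounding recovery code elided; this is an illustration, not the commit's code verbatim):

	if (!c->sb.clean) {
		/*
		 * An unclean shutdown may have left bsets in btree nodes with
		 * journal seqs that never made it to the journal - blacklist
		 * the next 4 seqs up front instead of scanning every node:
		 */
		ret = bch2_journal_seq_blacklist_add(c, journal_seq,
						     journal_seq + 4);
		journal_seq += 4;
	}

	/* Build the in-memory lookup table from the new superblock field: */
	ret = bch2_blacklist_table_initialize(c);

	/* Btree node reads then skip bsets whose journal seq is blacklisted */
	/* (bset_journal_seq here stands for the seq recorded in the bset): */
	blacklisted = bch2_journal_seq_is_blacklisted(c, bset_journal_seq, true);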

View file

@ -185,6 +185,7 @@
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/math64.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
@ -486,6 +487,7 @@ enum {
BCH_FS_RW,
/* shutdown: */
BCH_FS_STOPPING,
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
@ -511,6 +513,15 @@ struct bch_fs_pcpu {
u64 sectors_available;
};
struct journal_seq_blacklist_table {
size_t nr;
struct journal_seq_blacklist_table_entry {
u64 start;
u64 end;
bool dirty;
} entries[0];
};
struct bch_fs {
struct closure cl;
@ -646,6 +657,11 @@ struct bch_fs {
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */
struct journal_seq_blacklist_table *
journal_seq_blacklist_table;
struct work_struct journal_seq_blacklist_gc_work;
/* ALLOCATOR */
spinlock_t freelist_lock;
struct closure_waitlist freelist_wait;
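
The journal_seq_blacklist_table added above is the in-memory index over the superblock's blacklist entries; the real lookup (bch2_journal_seq_is_blacklisted(), later in this diff) searches an eytzinger-ordered array. A linear-scan equivalent, shown only to illustrate the semantics of start/end/dirty - ranges are half-open, and dirty records that some btree node still references the range:

	/* Illustration only: linear equivalent of the eytzinger lookup. */
	static bool seq_is_blacklisted(struct journal_seq_blacklist_table *t,
				       u64 seq, bool dirty)
	{
		size_t i;

		if (!t)
			return false;

		for (i = 0; i < t->nr; i++)
			if (seq >= t->entries[i].start &&
			    seq <  t->entries[i].end) {
				if (dirty)
					t->entries[i].dirty = true;
				return true;
			}

		return false;
	}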

View file

@ -909,7 +909,8 @@ struct bch_sb_field {
x(quota, 4) \
x(disk_groups, 5) \
x(clean, 6) \
x(replicas, 7)
x(replicas, 7) \
x(journal_seq_blacklist, 8)
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
@ -1124,6 +1125,20 @@ struct bch_sb_field_clean {
};
};
struct journal_seq_blacklist_entry {
__le64 start;
__le64 end;
};
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
union {
struct journal_seq_blacklist_entry start[0];
__u64 _data[0];
};
};
/* Superblock: */
/*
@ -1279,6 +1294,7 @@ enum bch_sb_features {
BCH_FEATURE_ZSTD = 2,
BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
BCH_FEATURE_EC = 4,
BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
BCH_FEATURE_NR,
};
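
The on-disk field above is variable-length: the number of blacklist ranges is not stored explicitly but implied by the field's size. A sketch of that size arithmetic, assuming only the struct layouts shown above - the helper names here are invented for illustration; the commit's own helpers (sb_blacklist_u64s() and blacklist_nr_entries(), in the journal_seq_blacklist.c hunk) do the same thing via vstruct_end() on the field header:

	/* u64 words needed for the field header plus nr blacklist ranges: */
	static unsigned seq_blacklist_u64s(unsigned nr)
	{
		return (sizeof(struct bch_sb_field_journal_seq_blacklist) +
			sizeof(struct journal_seq_blacklist_entry) * nr) /
			sizeof(__u64);
	}

	/* ...and recovering the entry count from the field's total byte size: */
	static unsigned seq_blacklist_nr_entries(size_t field_bytes)
	{
		return (field_bytes -
			sizeof(struct bch_sb_field_journal_seq_blacklist)) /
			sizeof(struct journal_seq_blacklist_entry);
	}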

View file

@ -770,7 +770,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
bool used_mempool;
bool used_mempool, blacklisted;
unsigned u64s;
int ret, retry_read = 0, write = READ;
@ -844,20 +844,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
b->written += sectors;
ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
if (ret < 0) {
btree_err(BTREE_ERR_FATAL, c, b, i,
"insufficient memory");
goto err;
}
blacklisted = bch2_journal_seq_is_blacklisted(c,
le64_to_cpu(i->journal_seq),
true);
if (ret) {
btree_err_on(first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
if (!first)
continue;
}
btree_err_on(blacklisted && first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
if (blacklisted && !first)
continue;
bch2_btree_node_iter_large_push(iter, b,
i->start,
@ -930,7 +925,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
out:
mempool_free(iter, &c->fill_iter);
return retry_read;
err:
fsck_err:
if (ret == BTREE_RETRY_READ) {
retry_read = 1;

View file

@ -1156,6 +1156,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
if (!btree_iter_node(iter, iter->level))
return NULL;
bch2_trans_cond_resched(iter->trans);
btree_iter_up(iter);
if (!bch2_btree_node_relock(iter, iter->level))

View file

@ -4,8 +4,6 @@
#include "opts.h"
#include <linux/math64.h>
extern const char * const bch2_inode_opts[];
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);

View file

@ -988,27 +988,57 @@ void bch2_fs_journal_stop(struct journal *j)
cancel_delayed_work_sync(&j->reclaim_work);
}
void bch2_fs_journal_start(struct journal *j)
int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
struct list_head *journal_entries)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
u64 blacklist = 0;
struct journal_entry_pin_list *p;
struct journal_replay *i;
u64 last_seq = cur_seq, nr, seq;
list_for_each_entry(bl, &j->seq_blacklist, list)
blacklist = max(blacklist, bl->end);
if (!list_empty(journal_entries))
last_seq = le64_to_cpu(list_last_entry(journal_entries,
struct journal_replay,
list)->j.last_seq);
nr = cur_seq - last_seq;
if (nr + 1 > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
list_for_each_entry(i, journal_entries, list) {
seq = le64_to_cpu(i->j.seq);
BUG_ON(seq < last_seq || seq >= cur_seq);
p = journal_seq_pin(j, seq);
atomic_set(&p->count, 1);
p->devs = i->devs;
}
spin_lock(&j->lock);
set_bit(JOURNAL_STARTED, &j->flags);
while (journal_cur_seq(j) < blacklist)
journal_pin_new_entry(j, 0);
/*
* __journal_entry_close() only inits the next journal entry when it
* closes an open journal entry - the very first journal entry gets
* initialized here:
*/
journal_pin_new_entry(j, 1);
bch2_journal_buf_init(j);
@ -1017,12 +1047,7 @@ void bch2_fs_journal_start(struct journal *j)
bch2_journal_space_available(j);
spin_unlock(&j->lock);
/*
* Adding entries to the next journal entry before allocating space on
* disk for the next journal entry - this is ok, because these entries
* only have to go down with the next journal entry we write:
*/
bch2_journal_seq_blacklist_write(j);
return 0;
}
/* init/exit: */
@ -1090,8 +1115,6 @@ int bch2_fs_journal_init(struct journal *j)
INIT_DELAYED_WORK(&j->write_work, journal_write_work);
INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->blacklist_lock);
INIT_LIST_HEAD(&j->seq_blacklist);
mutex_init(&j->reclaim_lock);
mutex_init(&j->discard_lock);
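
A worked example may help for the pin FIFO setup in the new bch2_fs_journal_start() above; the sequence numbers here are invented:

	/*
	 * Suppose the newest journal entry read reports last_seq == 100 (the
	 * oldest seq still needed) and recovery chose cur_seq == 110 as the
	 * first seq it will write. Then:
	 *
	 *   nr = cur_seq - last_seq = 10 open entries to track
	 *   j->pin covers seqs [100, 110): pin.front == 100, pin.back == 110
	 *   atomic64_read(&j->seq) == 109, so the first new entry is seq 110
	 *
	 * Pins for seqs actually read from disk get count == 1; gaps (e.g.
	 * blacklisted seqs that never reached the journal) stay at count == 0.
	 */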

View file

@ -472,8 +472,10 @@ int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_fs_journal_start(struct journal *);
int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);

View file

@ -10,7 +10,6 @@
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "trace.h"
@ -655,45 +654,11 @@ void bch2_journal_entries_free(struct list_head *list)
}
}
int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
{
struct journal *j = &c->journal;
struct journal_entry_pin_list *p;
u64 seq, nr = end_seq - last_seq + 1;
if (nr > j->pin.size) {
free_fifo(&j->pin);
init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
if (!j->pin.data) {
bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
return -ENOMEM;
}
}
atomic64_set(&j->seq, end_seq);
j->last_seq_ondisk = last_seq;
j->pin.front = last_seq;
j->pin.back = end_seq + 1;
fifo_for_each_entry_ptr(p, &j->pin, seq) {
INIT_LIST_HEAD(&p->list);
INIT_LIST_HEAD(&p->flushed);
atomic_set(&p->count, 0);
p->devs.nr = 0;
}
return 0;
}
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
struct journal *j = &c->journal;
struct journal_list jlist;
struct journal_replay *i;
struct journal_entry_pin_list *p;
struct bch_dev *ca;
u64 cur_seq, end_seq;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
@ -725,17 +690,12 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (jlist.ret)
return jlist.ret;
if (list_empty(list)){
bch_err(c, "no journal entries found");
return BCH_FSCK_REPAIR_IMPOSSIBLE;
}
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
@ -745,6 +705,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
* the devices - this is wrong:
*/
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
@ -755,68 +717,18 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
if (ret)
return ret;
}
}
i = list_last_entry(list, struct journal_replay, list);
ret = bch2_journal_set_seq(c,
le64_to_cpu(i->j.last_seq),
le64_to_cpu(i->j.seq));
if (ret)
return ret;
mutex_lock(&j->blacklist_lock);
list_for_each_entry(i, list, list) {
p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
atomic_set(&p->count, 1);
p->devs = i->devs;
if (bch2_journal_seq_blacklist_read(j, i)) {
mutex_unlock(&j->blacklist_lock);
return -ENOMEM;
}
}
mutex_unlock(&j->blacklist_lock);
cur_seq = journal_last_seq(j);
end_seq = le64_to_cpu(list_last_entry(list,
struct journal_replay, list)->j.seq);
list_for_each_entry(i, list, list) {
struct jset_entry *entry;
struct bkey_i *k, *_n;
bool blacklisted;
mutex_lock(&j->blacklist_lock);
while (cur_seq < le64_to_cpu(i->j.seq) &&
bch2_journal_seq_blacklist_find(j, cur_seq))
cur_seq++;
blacklisted = bch2_journal_seq_blacklist_find(j,
le64_to_cpu(i->j.seq));
mutex_unlock(&j->blacklist_lock);
fsck_err_on(blacklisted, c,
"found blacklisted journal entry %llu",
le64_to_cpu(i->j.seq));
fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
cur_seq, le64_to_cpu(i->j.seq) - 1,
journal_last_seq(j), end_seq);
cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, journal_cur_seq(j));
if (!list_empty(list)) {
i = list_last_entry(list, struct journal_replay, list);
bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
keys, entries, le64_to_cpu(i->j.seq));
}
fsck_err:
return ret;
}

View file

@ -35,7 +35,6 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);

View file

@ -1,13 +1,10 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "btree_iter.h"
#include "eytzinger.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
/*
* journal_seq_blacklist machinery:
@ -37,327 +34,285 @@
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
*
* Blacklisted journal sequence numbers are themselves recorded as entries in
* the journal.
*/
/*
* Called when journal needs to evict a blacklist entry to reclaim space: find
* any btree nodes that refer to the blacklist journal sequence numbers, and
* rewrite them:
*/
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
static unsigned
blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
struct blacklisted_node n;
struct closure cl;
unsigned i;
int ret;
closure_init_stack(&cl);
for (i = 0;; i++) {
struct btree_trans trans;
struct btree_iter *iter;
struct btree *b;
bch2_trans_init(&trans, c);
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
iter = bch2_trans_get_node_iter(&trans, n.btree_id, n.pos,
0, 0, 0);
b = bch2_btree_iter_peek_node(iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
ret = bch2_btree_node_rewrite(c, iter, n.seq, 0);
if (ret) {
bch2_trans_exit(&trans);
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
bch2_trans_exit(&trans);
}
for (i = 0;; i++) {
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
redo_wait:
mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
* being freed? If so, wait for that to finish:
*/
for_each_pending_btree_node_free(c, as, d)
if (n.seq == d->seq &&
n.btree_id == d->btree_id &&
!d->level &&
!bkey_cmp(n.pos, d->key.k.p)) {
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
goto redo_wait;
}
mutex_unlock(&c->btree_interior_update_lock);
}
mutex_lock(&j->blacklist_lock);
bch2_journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
return bl
? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
sizeof(struct journal_seq_blacklist_entry))
: 0;
}
/*
* Determine if a particular sequence number is blacklisted - if so, return
* blacklist entry:
*/
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
static unsigned sb_blacklist_u64s(unsigned nr)
{
struct journal_seq_blacklist *bl;
struct bch_sb_field_journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
}
list_for_each_entry(bl, &j->seq_blacklist, list)
if (seq >= bl->start && seq <= bl->end)
return bl;
static struct bch_sb_field_journal_seq_blacklist *
blacklist_entry_try_merge(struct bch_fs *c,
struct bch_sb_field_journal_seq_blacklist *bl,
unsigned i)
{
unsigned nr = blacklist_nr_entries(bl);
if (le64_to_cpu(bl->start[i].end) >=
le64_to_cpu(bl->start[i + 1].start)) {
bl->start[i].end = bl->start[i + 1].end;
--nr;
memmove(&bl->start[i],
&bl->start[i + 1],
sizeof(bl->start[0]) * (nr - i));
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
sb_blacklist_u64s(nr));
BUG_ON(!bl);
}
return bl;
}
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
{
struct bch_sb_field_journal_seq_blacklist *bl;
unsigned i, nr;
int ret = 0;
mutex_lock(&c->sb_lock);
bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
nr = blacklist_nr_entries(bl);
if (bl) {
for (i = 0; i < nr; i++) {
struct journal_seq_blacklist_entry *e =
bl->start + i;
if (start == le64_to_cpu(e->start) &&
end == le64_to_cpu(e->end))
goto out;
if (start <= le64_to_cpu(e->start) &&
end >= le64_to_cpu(e->end)) {
e->start = cpu_to_le64(start);
e->end = cpu_to_le64(end);
if (i + 1 < nr)
bl = blacklist_entry_try_merge(c,
bl, i);
if (i)
bl = blacklist_entry_try_merge(c,
bl, i - 1);
goto out_write_sb;
}
}
}
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
sb_blacklist_u64s(nr + 1));
if (!bl) {
ret = -ENOMEM;
goto out;
}
bl->start[nr].start = cpu_to_le64(start);
bl->start[nr].end = cpu_to_le64(end);
out_write_sb:
c->disk_sb.sb->features[0] |=
1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3;
ret = bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
return ret;
}
static int journal_seq_blacklist_table_cmp(const void *_l,
const void *_r, size_t size)
{
const struct journal_seq_blacklist_table_entry *l = _l;
const struct journal_seq_blacklist_table_entry *r = _r;
return (l->start > r->start) - (l->start < r->start);
}
bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
bool dirty)
{
struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
struct journal_seq_blacklist_table_entry search = { .start = seq };
int idx;
if (!t)
return false;
idx = eytzinger0_find_le(t->entries, t->nr,
sizeof(t->entries[0]),
journal_seq_blacklist_table_cmp,
&search);
if (idx < 0)
return false;
BUG_ON(t->entries[idx].start > seq);
if (seq >= t->entries[idx].end)
return false;
if (dirty)
t->entries[idx].dirty = true;
return true;
}
int bch2_blacklist_table_initialize(struct bch_fs *c)
{
struct bch_sb_field_journal_seq_blacklist *bl =
bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
struct journal_seq_blacklist_table *t;
unsigned i, nr = blacklist_nr_entries(bl);
BUG_ON(c->journal_seq_blacklist_table);
if (!bl)
return 0;
t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
GFP_KERNEL);
if (!t)
return -ENOMEM;
t->nr = nr;
for (i = 0; i < nr; i++) {
t->entries[i].start = le64_to_cpu(bl->start[i].start);
t->entries[i].end = le64_to_cpu(bl->start[i].end);
}
eytzinger0_sort(t->entries,
t->nr,
sizeof(t->entries[0]),
journal_seq_blacklist_table_cmp,
NULL);
c->journal_seq_blacklist_table = t;
return 0;
}
static const char *
bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
struct journal_seq_blacklist_entry *i;
unsigned nr = blacklist_nr_entries(bl);
for (i = bl->start; i < bl->start + nr; i++) {
if (le64_to_cpu(i->start) >=
le64_to_cpu(i->end))
return "entry start >= end";
if (i + 1 < bl->start + nr &&
le64_to_cpu(i[0].end) >
le64_to_cpu(i[1].start))
return "entries out of order";
}
return NULL;
}
/*
* Allocate a new, in memory blacklist entry:
*/
static struct journal_seq_blacklist *
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
struct bch_sb *sb,
struct bch_sb_field *f)
{
struct journal_seq_blacklist *bl;
struct bch_sb_field_journal_seq_blacklist *bl =
field_to_type(f, journal_seq_blacklist);
struct journal_seq_blacklist_entry *i;
unsigned nr = blacklist_nr_entries(bl);
lockdep_assert_held(&j->blacklist_lock);
for (i = bl->start; i < bl->start + nr; i++) {
if (i != bl->start)
pr_buf(out, " ");
/*
* When we start the journal, bch2_journal_start() will skip over @seq:
*/
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return NULL;
bl->start = start;
bl->end = end;
list_add_tail(&bl->list, &j->seq_blacklist);
return bl;
pr_buf(out, "%llu-%llu",
le64_to_cpu(i->start),
le64_to_cpu(i->end));
}
}
/*
* Returns true if @seq is newer than the most recent journal entry that got
* written, and data corresponding to @seq should be ignored - also marks @seq
* as blacklisted so that on future restarts the corresponding data will still
* be ignored:
*/
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
.validate = bch2_sb_journal_seq_blacklist_validate,
.to_text = bch2_sb_journal_seq_blacklist_to_text
};
void bch2_blacklist_entries_gc(struct work_struct *work)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl = NULL;
struct blacklisted_node *n;
u64 journal_seq;
int ret = 0;
struct bch_fs *c = container_of(work, struct bch_fs,
journal_seq_blacklist_gc_work);
struct journal_seq_blacklist_table *t;
struct bch_sb_field_journal_seq_blacklist *bl;
struct journal_seq_blacklist_entry *src, *dst;
struct btree_trans trans;
unsigned i, nr, new_nr;
int ret;
if (!seq)
return 0;
bch2_trans_init(&trans, c);
spin_lock(&j->lock);
journal_seq = journal_cur_seq(j);
spin_unlock(&j->lock);
for (i = 0; i < BTREE_ID_NR; i++) {
struct btree_iter *iter;
struct btree *b;
/* Interier updates aren't journalled: */
BUG_ON(b->level);
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
/*
* Decrease this back to j->seq + 2 when we next rev the on disk format:
* increasing it temporarily to work around bug in old kernels
*/
fsck_err_on(seq > journal_seq + 4, c,
"bset journal seq too far in the future: %llu > %llu",
seq, journal_seq);
if (seq <= journal_seq &&
list_empty_careful(&j->seq_blacklist))
return 0;
mutex_lock(&j->blacklist_lock);
if (seq <= journal_seq) {
bl = bch2_journal_seq_blacklist_find(j, seq);
if (!bl)
goto out;
} else {
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
if (!j->new_blacklist) {
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
journal_seq + 1,
journal_seq + 1);
if (!j->new_blacklist) {
ret = -ENOMEM;
goto out;
for_each_btree_node(&trans, iter, i, POS_MIN,
BTREE_ITER_PREFETCH, b)
if (test_bit(BCH_FS_STOPPING, &c->flags)) {
bch2_trans_exit(&trans);
return;
}
}
bl = j->new_blacklist;
bl->end = max(bl->end, seq);
bch2_trans_iter_free(&trans, iter);
}
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
if (b->data->keys.seq == n->seq &&
b->btree_id == n->btree_id &&
!bkey_cmp(b->key.k.p, n->pos))
goto found_entry;
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
}
bl->entries = n;
}
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
.seq = b->data->keys.seq,
.btree_id = b->btree_id,
.pos = b->key.k.p,
};
found_entry:
ret = 1;
out:
fsck_err:
mutex_unlock(&j->blacklist_lock);
return ret;
}
static int __bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i,
u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
start, end);
bl = bch2_journal_seq_blacklisted_new(j, start, end);
if (!bl)
return -ENOMEM;
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
journal_seq_blacklist_flush);
return 0;
}
/*
* After reading the journal, find existing journal seq blacklist entries and
* read them into memory:
*/
int bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i)
{
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(&i->j, entry) {
switch (entry->type) {
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->seq),
le64_to_cpu(bl_entry->seq));
break;
}
case BCH_JSET_ENTRY_blacklist_v2: {
struct jset_entry_blacklist_v2 *bl_entry =
container_of(entry, struct jset_entry_blacklist_v2, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->start),
le64_to_cpu(bl_entry->end));
break;
}
}
if (ret)
break;
}
return ret;
}
/*
* After reading the journal and walking the btree, we might have new journal
* sequence numbers to blacklist - add entries to the next journal entry to be
* written:
*/
void bch2_journal_seq_blacklist_write(struct journal *j)
{
struct journal_seq_blacklist *bl = j->new_blacklist;
struct jset_entry_blacklist_v2 *bl_entry;
struct jset_entry *entry;
if (!bl)
ret = bch2_trans_exit(&trans);
if (ret)
return;
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
mutex_lock(&c->sb_lock);
bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
if (!bl)
goto out;
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
bl_entry->start = cpu_to_le64(bl->start);
bl_entry->end = cpu_to_le64(bl->end);
nr = blacklist_nr_entries(bl);
dst = bl->start;
bch2_journal_pin_add(j,
journal_cur_seq(j),
&bl->pin,
journal_seq_blacklist_flush);
t = c->journal_seq_blacklist_table;
BUG_ON(nr != t->nr);
j->new_blacklist = NULL;
for (src = bl->start, i = eytzinger0_first(t->nr);
src < bl->start + nr;
src++, i = eytzinger0_next(i, nr)) {
BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
if (t->entries[i].dirty)
*dst++ = *src;
}
new_nr = dst - bl->start;
bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
if (new_nr != nr) {
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
new_nr ? sb_blacklist_u64s(new_nr) : 0);
BUG_ON(new_nr && !bl);
if (!new_nr)
c->disk_sb.sb->features[0] &=
~(1ULL << BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3);
bch2_write_super(c);
}
out:
mutex_unlock(&c->sb_lock);
}
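
The gc work above compacts the superblock field once a full btree walk has marked still-referenced ranges dirty (via bch2_journal_seq_is_blacklisted(.., true) in the node read path). A simplified restatement of that compaction step - the real loop walks bl->start[] and the eytzinger-ordered table in parallel, whereas here, purely for illustration, the two are assumed to be in the same order:

	static unsigned compact_blacklist(struct journal_seq_blacklist_entry *sb_entries,
					  struct journal_seq_blacklist_table *t)
	{
		struct journal_seq_blacklist_entry *dst = sb_entries;
		size_t i;

		/* Keep only ranges some btree node still refers to: */
		for (i = 0; i < t->nr; i++)
			if (t->entries[i].dirty)
				*dst++ = sb_entries[i];

		/* Caller resizes the superblock field to this many entries: */
		return dst - sb_entries;
	}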

View file

@ -2,13 +2,12 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
struct journal_replay;
bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
int bch2_blacklist_table_initialize(struct bch_fs *);
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);
extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
void bch2_blacklist_entries_gc(struct work_struct *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */

View file

@ -54,24 +54,6 @@ struct journal_entry_pin {
u64 seq;
};
/* corresponds to a btree node with a blacklisted bset: */
struct blacklisted_node {
__le64 seq;
enum btree_id btree_id;
struct bpos pos;
};
struct journal_seq_blacklist {
struct list_head list;
u64 start;
u64 end;
struct journal_entry_pin pin;
struct blacklisted_node *entries;
size_t nr_entries;
};
struct journal_res {
bool ref;
u8 idx;
@ -222,10 +204,6 @@ struct journal {
u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
struct write_point wp;
spinlock_t err_lock;

View file

@ -12,6 +12,7 @@
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "journal_seq_blacklist.h"
#include "quota.h"
#include "recovery.h"
#include "replicas.h"
@ -99,18 +100,49 @@ static int verify_superblock_clean(struct bch_fs *c,
return ret;
}
static int
verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c,
struct list_head *journal)
{
struct journal_replay *i =
list_last_entry(journal, struct journal_replay, list);
u64 start_seq = le64_to_cpu(i->j.last_seq);
u64 end_seq = le64_to_cpu(i->j.seq);
u64 seq = start_seq;
int ret = 0;
list_for_each_entry(i, journal, list) {
fsck_err_on(seq != le64_to_cpu(i->j.seq), c,
"journal entries %llu-%llu missing! (replaying %llu-%llu)",
seq, le64_to_cpu(i->j.seq) - 1,
start_seq, end_seq);
seq = le64_to_cpu(i->j.seq);
fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c,
"found blacklisted journal entry %llu", seq);
do {
seq++;
} while (bch2_journal_seq_is_blacklisted(c, seq, false));
}
fsck_err:
return ret;
}
static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
{
struct bch_sb_field_clean *clean, *sb_clean;
if (!c->sb.clean)
return NULL;
int ret;
mutex_lock(&c->sb_lock);
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
if (!sb_clean) {
if (fsck_err_on(!sb_clean, c,
"superblock marked clean but clean section not present")) {
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
mutex_unlock(&c->sb_lock);
bch_err(c, "superblock marked clean but clean section not present");
return NULL;
}
@ -128,6 +160,9 @@ static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
return clean;
fsck_err:
mutex_unlock(&c->sb_lock);
return ERR_PTR(ret);
}
static int journal_replay_entry_early(struct bch_fs *c,
@ -179,14 +214,32 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
ret = bch2_journal_seq_blacklist_add(c,
le64_to_cpu(bl_entry->seq),
le64_to_cpu(bl_entry->seq) + 1);
break;
}
case BCH_JSET_ENTRY_blacklist_v2: {
struct jset_entry_blacklist_v2 *bl_entry =
container_of(entry, struct jset_entry_blacklist_v2, entry);
ret = bch2_journal_seq_blacklist_add(c,
le64_to_cpu(bl_entry->start),
le64_to_cpu(bl_entry->end) + 1);
break;
}
}
return ret;
}
static int load_journal_metadata(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct list_head *journal)
{
struct jset_entry *entry;
int ret;
@ -300,37 +353,76 @@ static bool journal_empty(struct list_head *journal)
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean;
struct bch_sb_field_clean *clean = NULL;
u64 journal_seq;
LIST_HEAD(journal);
int ret;
clean = read_superblock_clean(c);
if (clean)
if (c->sb.clean)
clean = read_superblock_clean(c);
ret = PTR_ERR_OR_ZERO(clean);
if (ret)
goto err;
if (c->sb.clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
if (!clean || c->opts.fsck) {
if (!c->replicas.entries) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (!c->sb.clean || c->opts.fsck) {
struct jset *j;
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
ret = verify_superblock_clean(c, &clean,
&list_last_entry(&journal, struct journal_replay,
list)->j);
fsck_err_on(c->sb.clean && !journal_empty(&journal), c,
"filesystem marked clean but journal not empty");
if (!c->sb.clean && list_empty(&journal)){
bch_err(c, "no journal entries found");
ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
goto err;
}
j = &list_last_entry(&journal, struct journal_replay, list)->j;
ret = verify_superblock_clean(c, &clean, j);
if (ret)
goto err;
journal_seq = le64_to_cpu(j->seq) + 1;
} else {
ret = bch2_journal_set_seq(c,
le64_to_cpu(clean->journal_seq),
le64_to_cpu(clean->journal_seq));
if (ret)
goto err;
journal_seq = le64_to_cpu(clean->journal_seq) + 1;
}
fsck_err_on(clean && !journal_empty(&journal), c,
"filesystem marked clean but journal not empty");
ret = journal_replay_early(c, clean, &journal);
if (ret)
goto err;
ret = load_journal_metadata(c, clean, &journal);
if (!c->sb.clean) {
ret = bch2_journal_seq_blacklist_add(c,
journal_seq,
journal_seq + 4);
if (ret) {
bch_err(c, "error creating new journal seq blacklist entry");
goto err;
}
journal_seq += 4;
}
ret = bch2_blacklist_table_initialize(c);
ret = verify_journal_entries_not_blacklisted_or_missing(c, &journal);
if (ret)
goto err;
ret = bch2_fs_journal_start(&c->journal, journal_seq, &journal);
if (ret)
goto err;
@ -351,11 +443,6 @@ int bch2_fs_recovery(struct bch_fs *c)
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
if (!c->replicas.entries) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (c->opts.fsck ||
!(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) ||
test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
@ -377,13 +464,6 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->sb.encryption_type && !c->sb.clean)
atomic64_add(1 << 16, &c->key_version);
/*
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_fs_journal_start(&c->journal);
if (c->opts.noreplay)
goto out;
@ -424,6 +504,10 @@ int bch2_fs_recovery(struct bch_fs *c)
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
}
mutex_unlock(&c->sb_lock);
if (c->journal_seq_blacklist_table &&
c->journal_seq_blacklist_table->nr > 128)
queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
out:
bch2_journal_entries_free(&journal);
kfree(clean);
@ -472,7 +556,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal);
bch2_fs_journal_start(&c->journal, 1, &journal);
bch2_journal_set_replay_done(&c->journal);
err = "error going read write";
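
A small worked example for verify_journal_entries_not_blacklisted_or_missing() above (sequence numbers invented):

	/*
	 * Suppose entries 5, 6 and 9 were read from disk and [7, 9) is
	 * blacklisted. Walking the list: after entry 6, seq is advanced past
	 * 7 and 8 because both are blacklisted, so seeing 9 next is fine.
	 *
	 * If [7, 9) were not blacklisted, this would trip the
	 * "journal entries 7-8 missing!" fsck error; if entry 7 had been read
	 * despite being blacklisted, it would trip
	 * "found blacklisted journal entry 7".
	 */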

View file

@ -7,6 +7,7 @@
#include "error.h"
#include "io.h"
#include "journal.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "quota.h"
#include "super-io.h"

View file

@ -30,6 +30,7 @@
#include "io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
@ -468,6 +469,7 @@ static void bch2_fs_free(struct bch_fs *c)
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
kfree(rcu_dereference_protected(c->disk_groups, 1));
kfree(c->journal_seq_blacklist_table);
if (c->journal_reclaim_wq)
destroy_workqueue(c->journal_reclaim_wq);
@ -496,6 +498,10 @@ void bch2_fs_stop(struct bch_fs *c)
bch_verbose(c, "shutting down");
set_bit(BCH_FS_STOPPING, &c->flags);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
for_each_member_device(ca, c, i)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
@ -631,6 +637,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
spin_lock_init(&c->btree_write_error_lock);
INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
INIT_WORK(&c->journal_seq_blacklist_gc_work,
bch2_blacklist_entries_gc);
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);