bcachefs: Fix for bad stripe pointers

The allocator usually doesn't increment bucket gens right away on
buckets that it's about to hand out (for reasons that need to be
documented), instead deferring that to whatever extent update first
references that bucket.

But stripe pointers reference buckets without changing bucket sector
counts, meaning we could end up with a pointer in a stripe with a gen
newer than the bucket it points to.

Fix this by adding a transactional trigger for KEY_TYPE_stripe that just
writes out the keys in the alloc btree for the buckets it points to.

Also - consolidate the code that checks pointer validity.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2020-10-19 22:36:24 -04:00 committed by Kent Overstreet
parent 289980195f
commit 39283c712e
5 changed files with 206 additions and 120 deletions

View file

@ -497,8 +497,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
* commands to the newly free buckets, then puts them on the various freelists.
*/
#define BUCKET_GC_GEN_MAX 96U
/**
* wait_buckets_available - wait on reclaimable buckets
*

View file

@ -13,6 +13,9 @@ struct bkey_alloc_unpacked {
#undef x
};
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
/* returns true if not equal */
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)

View file

@ -591,6 +591,7 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
((1U << BKEY_TYPE_EXTENTS)| \
(1U << BKEY_TYPE_INODES)| \
(1U << BKEY_TYPE_EC)| \
(1U << BKEY_TYPE_REFLINK))
enum btree_trigger_flags {

View file

@ -337,8 +337,9 @@ static inline bool iter_has_trans_triggers(struct btree_iter *iter)
static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
{
return (BTREE_NODE_TYPE_HAS_TRIGGERS &
~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
return (((BTREE_NODE_TYPE_HAS_TRIGGERS &
~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) |
(1U << BTREE_ID_EC)) &
(1U << iter->btree_id);
}

View file

@ -883,19 +883,112 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
p.crc.uncompressed_size);
}
static void bucket_set_stripe(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_fs_usage *fs_usage,
u64 journal_seq,
unsigned flags,
bool enabled)
static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 bucket_data_type,
u16 dirty_sectors, u16 cached_sectors)
{
size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
u16 bucket_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
char buf[200];
if (gen_after(ptr->gen, bucket_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, bucket_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, bucket_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (bucket_gen != ptr->gen && !ptr->cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
ptr->dev, bucket_nr, bucket_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
ptr->gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (bucket_gen != ptr->gen)
return 1;
if (bucket_data_type && ptr_data_type &&
bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, bucket_gen,
bch2_data_types[bucket_data_type],
bch2_data_types[ptr_data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, bucket_gen,
bch2_data_types[bucket_data_type ?: ptr_data_type],
bucket_sectors, sectors,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
return 0;
}
static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
const struct bch_extent_ptr *ptr,
struct bch_fs_usage *fs_usage,
u64 journal_seq,
unsigned flags,
bool enabled)
{
bool gc = flags & BTREE_TRIGGER_GC;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr, gc);
struct bucket_mark new, old;
char buf[200];
int ret;
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
new.dirty_sectors, new.cached_sectors);
if (ret)
return ret;
if (new.stripe && enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
if (!new.stripe && !enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
new.stripe = enabled;
if (journal_seq) {
new.journal_seq_valid = 1;
@ -904,103 +997,26 @@ static void bucket_set_stripe(struct bch_fs *c,
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
/*
* XXX write repair code for these, flag stripe as possibly bad
*/
if (old.gen != ptr->gen)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"stripe with stale pointer");
#if 0
/*
* We'd like to check for these, but these checks don't work
* yet:
*/
if (old.stripe && enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"multiple stripes using same bucket");
if (!old.stripe && !enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"deleting stripe but bucket not marked as stripe bucket");
#endif
return 0;
}
static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
struct extent_ptr_decoded p,
const struct bch_extent_ptr *ptr,
s64 sectors, enum bch_data_type ptr_data_type,
u8 bucket_gen, u8 *bucket_data_type,
u16 *dirty_sectors, u16 *cached_sectors)
{
u16 *dst_sectors = !p.ptr.cached
u16 *dst_sectors = !ptr->cached
? dirty_sectors
: cached_sectors;
u16 orig_sectors = *dst_sectors;
char buf[200];
int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
bucket_gen, *bucket_data_type,
*dirty_sectors, *cached_sectors);
if (gen_after(p.ptr.gen, bucket_gen)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
bucket_gen,
bch2_data_types[*bucket_data_type ?: ptr_data_type],
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (gen_cmp(bucket_gen, p.ptr.gen) > 96U) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
bucket_gen,
bch2_data_types[*bucket_data_type ?: ptr_data_type],
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (bucket_gen != p.ptr.gen && !p.ptr.cached) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
bucket_gen,
bch2_data_types[*bucket_data_type ?: ptr_data_type],
p.ptr.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (bucket_gen != p.ptr.gen)
return 1;
if (*bucket_data_type && *bucket_data_type != ptr_data_type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
bucket_gen,
bch2_data_types[*bucket_data_type],
bch2_data_types[ptr_data_type],
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (checked_add(*dst_sectors, sectors)) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr),
bucket_gen,
bch2_data_types[*bucket_data_type ?: ptr_data_type],
orig_sectors, sectors,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EIO;
}
if (ret)
return ret;
*dst_sectors += sectors;
*bucket_data_type = *dirty_sectors || *cached_sectors
? ptr_data_type : 0;
return 0;
@ -1025,7 +1041,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
new.v.counter = old.v.counter = v;
bucket_data_type = new.data_type;
ret = __mark_pointer(c, k, p, sectors, data_type, new.gen,
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
&bucket_data_type,
&new.dirty_sectors,
&new.cached_sectors);
@ -1189,6 +1205,7 @@ static int bch2_mark_stripe(struct bch_fs *c,
? bkey_s_c_to_stripe(new).v : NULL;
struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
unsigned i;
int ret;
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
@ -1198,9 +1215,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
if (!new_s) {
/* Deleting: */
for (i = 0; i < old_s->nr_blocks; i++)
bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
journal_seq, flags, false);
for (i = 0; i < old_s->nr_blocks; i++) {
ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
if (!gc && m->on_heap) {
spin_lock(&c->ec_stripes_heap_lock);
@ -1219,11 +1239,16 @@ static int bch2_mark_stripe(struct bch_fs *c,
old_s->ptrs + i,
sizeof(struct bch_extent_ptr))) {
if (old_s)
bucket_set_stripe(c, old_s->ptrs + i, fs_usage,
if (old_s) {
bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
journal_seq, flags, false);
bucket_set_stripe(c, new_s->ptrs + i, fs_usage,
journal_seq, flags, true);
if (ret)
return ret;
}
ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage,
journal_seq, flags, true);
if (ret)
return ret;
}
}
@ -1549,23 +1574,21 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k, struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr));
struct btree_iter *iter;
struct bkey_s_c k_a;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
int ret;
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a);
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
u = bch2_alloc_unpack(k_a);
*u = bch2_alloc_unpack(k);
} else {
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
BTREE_ITER_CACHED|
@ -1575,16 +1598,36 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
return PTR_ERR(iter);
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto out;
if (ret) {
bch2_trans_iter_put(trans, iter);
return ret;
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
u = alloc_mem_to_key(g, READ_ONCE(g->mark));
*u = alloc_mem_to_key(g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type,
*_iter = iter;
return 0;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bkey_s_c k, struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
int ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (ret)
return ret;
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
@ -1595,7 +1638,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
goto out;
bkey_alloc_init(&a->k_i);
a->k.p = pos;
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
out:
@ -1716,6 +1759,44 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
return 0;
}
static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct btree_iter *iter;
unsigned i;
int ret = 0;
/*
* The allocator code doesn't necessarily update bucket gens in the
* btree when incrementing them, right before handing out new buckets -
* we just need to persist those updates here along with the new stripe:
*/
for (i = 0; i < s->nr_blocks && !ret; i++) {
ret = bch2_trans_start_alloc_update(trans, &iter,
&s->ptrs[i], &u);
if (ret)
break;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto put_iter;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
put_iter:
bch2_trans_iter_put(trans, iter);
}
return ret;
}
static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
struct bkey_s_c_reflink_p p,
u64 idx, unsigned sectors,
@ -1815,6 +1896,8 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
case KEY_TYPE_reflink_v:
return bch2_trans_mark_extent(trans, k, offset, sectors,
flags, BCH_DATA_user);
case KEY_TYPE_stripe:
return bch2_trans_mark_stripe(trans, k);
case KEY_TYPE_inode:
d = replicas_deltas_realloc(trans, 0);