bcachefs: Erasure coding fixes & refactoring

- Originally, bch_extent_stripe_ptr didn't contain the block index;
  instead, we had to search through the stripe's pointers to figure out
  which pointer matched. When the block field was added to
  bch_extent_stripe_ptr, not all of the code was updated to use it.
  This patch fixes that, and we now also verify that field where it
  makes sense (see the sketch after this list).

- The ec_stripe_buf_init/exit() functions have been improved, and are
  now used by the bch2_ec_read_extent() (recovery read) path; see the
  flow sketch after the ec.c hunks below.

- get_stripe_key() is now used by bch2_ec_read_extent().

- We now have a getter and setter for checksums within a stripe, as we
  previously had for block sector counts; ec_generate_checksums() and
  ec_validate_checksums() are now quite a bit smaller and cleaner (see
  the checksum sketch below).
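
As a rough illustration of the first bullet: with the block index stored
in the stripe pointer, verifying an extent pointer against its stripe is
a direct comparison rather than a search. A minimal, self-contained
sketch with stand-in types (not bcachefs code; compare
__bch2_ptr_matches_stripe() in the ec.h hunk below):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct extent_ptr { uint32_t dev, gen; uint64_t offset; };

struct stripe {
        uint16_t sectors;               /* sectors per stripe block */
        uint8_t  nr_blocks, nr_redundant;
        struct extent_ptr ptrs[8];      /* one pointer per stripe block */
};

/* direct check against the stored block index: */
static bool ptr_matches_stripe_block(const struct stripe *s,
                                     const struct extent_ptr *ptr,
                                     unsigned block)
{
        unsigned nr_data = s->nr_blocks - s->nr_redundant;

        if (block >= nr_data)   /* data pointers never point at parity blocks */
                return false;

        return ptr->dev == s->ptrs[block].dev &&
               ptr->gen == s->ptrs[block].gen &&
               ptr->offset >= s->ptrs[block].offset &&
               ptr->offset <  s->ptrs[block].offset + s->sectors;
}

int main(void)
{
        struct stripe s = {
                .sectors = 128, .nr_blocks = 3, .nr_redundant = 1,
                .ptrs = { { .dev = 0 }, { .dev = 1 }, { .dev = 2 } },
        };
        struct extent_ptr p = { .dev = 1, .gen = 0, .offset = 64 };

        /* previously: loop over every data block; now: check the block field */
        printf("block 1 matches: %d\n", ptr_matches_stripe_block(&s, &p, 1));
        printf("block 0 matches: %d\n", ptr_matches_stripe_block(&s, &p, 0));
        return 0;
}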

ec.c still needs a lot of work, but this patch is slowly moving things
in the right direction.
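
To illustrate the checksum getter/setter (again a standalone sketch with
hypothetical stand-in types, not the kernel code): stripe checksums are
stored packed at the width of the checksum type, so accesses go through
helpers that copy the right number of bytes into and out of a fixed-size
value, much like stripe_csum_get()/stripe_csum_set() in the ec.h hunk
below. The sketch assumes a little-endian layout, as the on-disk format
does.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct csum { uint64_t lo, hi; };

/* bytes stored per checksum type; table contents are illustrative */
static const unsigned crc_bytes[] = { 0, 4, 8, 16 };

struct stripe_csums {
        unsigned csum_type;
        uint8_t  packed[64];    /* checksums packed at their real width */
};

static struct csum stripe_csum_get(const struct stripe_csums *s, unsigned idx)
{
        struct csum ret = { 0 };
        unsigned b = crc_bytes[s->csum_type];

        memcpy(&ret, s->packed + idx * b, b);
        return ret;
}

static void stripe_csum_set(struct stripe_csums *s, unsigned idx, struct csum c)
{
        unsigned b = crc_bytes[s->csum_type];

        memcpy(s->packed + idx * b, &c, b);
}

int main(void)
{
        struct stripe_csums s = { .csum_type = 1 };     /* 4-byte checksums */
        struct csum c = { .lo = 0xdeadbeef };

        stripe_csum_set(&s, 2, c);
        printf("stored and reloaded: %llx\n",
               (unsigned long long) stripe_csum_get(&s, 2).lo);
        return 0;
}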

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet, 2021-01-11 13:51:23 -05:00 (committed by Kent Overstreet)
commit 2a3731e34d, parent b929bbef6f
3 changed files with 194 additions and 183 deletions

diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c

@@ -1656,7 +1656,7 @@ out:
 }
 
 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
-                        struct bch_extent_stripe_ptr p,
+                        struct extent_ptr_decoded p,
                         s64 sectors, enum bch_data_type data_type)
 {
         struct bch_fs *c = trans->c;
@@ -1666,14 +1666,22 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
         struct bch_replicas_padded r;
         int ret = 0;
 
-        ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
+        ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.ec.idx), &iter, &k);
         if (ret < 0)
                 return ret;
 
         if (k.k->type != KEY_TYPE_stripe) {
                 bch2_fs_inconsistent(c,
                         "pointer to nonexistent stripe %llu",
-                        (u64) p.idx);
+                        (u64) p.ec.idx);
+                ret = -EIO;
+                goto out;
+        }
+
+        if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
+                bch2_fs_inconsistent(c,
+                        "stripe pointer doesn't match stripe %llu",
+                        (u64) p.ec.idx);
                 ret = -EIO;
                 goto out;
         }
@@ -1684,8 +1692,8 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
                 goto out;
 
         bkey_reassemble(&s->k_i, k);
-        stripe_blockcount_set(&s->v, p.block,
-                stripe_blockcount_get(&s->v, p.block) +
+        stripe_blockcount_set(&s->v, p.ec.block,
+                stripe_blockcount_get(&s->v, p.ec.block) +
                 sectors);
 
         bch2_trans_update(trans, iter, &s->k_i, 0);
@@ -1736,7 +1744,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
                         dirty_sectors += disk_sectors;
                         r.e.devs[r.e.nr_devs++] = p.ptr.dev;
                 } else {
-                        ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
+                        ret = bch2_trans_mark_stripe_ptr(trans, p,
                                         disk_sectors, data_type);
                         if (ret)
                                 return ret;

diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c

@@ -138,44 +138,18 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
                        stripe_blockcount_get(s, i));
 }
 
-static int ptr_matches_stripe(struct bch_fs *c,
-                              struct bch_stripe *v,
-                              const struct bch_extent_ptr *ptr)
+/* returns blocknr in stripe that we matched: */
+static int bkey_matches_stripe(struct bch_stripe *s,
+                               struct bkey_s_c k)
 {
-        unsigned i;
+        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+        const struct bch_extent_ptr *ptr;
+        unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
 
-        for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
-                const struct bch_extent_ptr *ptr2 = v->ptrs + i;
-
-                if (ptr->dev == ptr2->dev &&
-                    ptr->gen == ptr2->gen &&
-                    ptr->offset >= ptr2->offset &&
-                    ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
-                        return i;
-        }
-
-        return -1;
-}
-
-static int extent_matches_stripe(struct bch_fs *c,
-                                 struct bch_stripe *v,
-                                 struct bkey_s_c k)
-{
-        switch (k.k->type) {
-        case KEY_TYPE_extent: {
-                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-                const struct bch_extent_ptr *ptr;
-                int idx;
-
-                extent_for_each_ptr(e, ptr) {
-                        idx = ptr_matches_stripe(c, v, ptr);
-                        if (idx >= 0)
-                                return idx;
-                }
-                break;
-        }
-        }
+        bkey_for_each_ptr(ptrs, ptr)
+                for (i = 0; i < nr_data; i++)
+                        if (__bch2_ptr_matches_stripe(s, ptr, i))
+                                return i;
 
         return -1;
 }
@@ -202,74 +176,93 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
 
 /* Stripe bufs: */
 
-static void ec_stripe_buf_free(struct ec_stripe_buf *stripe)
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
 {
         unsigned i;
 
-        for (i = 0; i < stripe->key.v.nr_blocks; i++) {
-                kvpfree(stripe->data[i], stripe->size << 9);
-                stripe->data[i] = NULL;
+        for (i = 0; i < buf->key.v.nr_blocks; i++) {
+                kvpfree(buf->data[i], buf->size << 9);
+                buf->data[i] = NULL;
         }
 }
 
-static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe)
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+                              unsigned offset, unsigned size)
 {
+        struct bch_stripe *v = &buf->key.v;
+        unsigned csum_granularity = 1U << v->csum_granularity_bits;
+        unsigned end = offset + size;
         unsigned i;
 
-        memset(stripe->valid, 0xFF, sizeof(stripe->valid));
+        BUG_ON(end > le16_to_cpu(v->sectors));
+
+        offset  = round_down(offset, csum_granularity);
+        end     = min_t(unsigned, le16_to_cpu(v->sectors),
+                        round_up(end, csum_granularity));
+
+        buf->offset     = offset;
+        buf->size       = end - offset;
+
+        memset(buf->valid, 0xFF, sizeof(buf->valid));
 
-        for (i = 0; i < stripe->key.v.nr_blocks; i++) {
-                stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL);
-                if (!stripe->data[i])
+        for (i = 0; i < buf->key.v.nr_blocks; i++) {
+                buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+                if (!buf->data[i])
                         goto err;
         }
 
         return 0;
 err:
-        ec_stripe_buf_free(stripe);
+        ec_stripe_buf_exit(buf);
         return -ENOMEM;
 }
 
 /* Checksumming: */
 
-static void ec_generate_checksums(struct ec_stripe_buf *buf)
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+                                         unsigned block, unsigned offset)
 {
         struct bch_stripe *v = &buf->key.v;
         unsigned csum_granularity = 1 << v->csum_granularity_bits;
-        unsigned csums_per_device = stripe_csums_per_device(v);
-        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
-        unsigned i, j;
+        unsigned end = buf->offset + buf->size;
+        unsigned len = min(csum_granularity, end - offset);
 
-        if (!csum_bytes)
+        BUG_ON(offset >= end);
+        BUG_ON(offset <  buf->offset);
+        BUG_ON(offset & (csum_granularity - 1));
+        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+               (len & (csum_granularity - 1)));
+
+        return bch2_checksum(NULL, v->csum_type,
+                             null_nonce(),
+                             buf->data[block] + ((offset - buf->offset) << 9),
+                             len << 9);
+}
+
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+        struct bch_stripe *v = &buf->key.v;
+        unsigned i, j, csums_per_device = stripe_csums_per_device(v);
+
+        if (!v->csum_type)
                 return;
 
-        for (i = 0; i < v->nr_blocks; i++) {
-                for (j = 0; j < csums_per_device; j++) {
-                        unsigned offset = j << v->csum_granularity_bits;
-                        unsigned len = min(csum_granularity, buf->size - offset);
-
-                        struct bch_csum csum =
-                                bch2_checksum(NULL, v->csum_type,
-                                              null_nonce(),
-                                              buf->data[i] + (offset << 9),
-                                              len << 9);
-
-                        memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
-                }
-        }
+        BUG_ON(buf->offset);
+        BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+        for (i = 0; i < v->nr_blocks; i++)
+                for (j = 0; j < csums_per_device; j++)
+                        stripe_csum_set(v, i, j,
+                                ec_block_checksum(buf, i, j << v->csum_granularity_bits));
 }
 
 static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
 {
         struct bch_stripe *v = &buf->key.v;
         unsigned csum_granularity = 1 << v->csum_granularity_bits;
-        unsigned csum_bytes = bch_crc_bytes[v->csum_type];
         unsigned i;
 
-        if (!csum_bytes)
+        if (!v->csum_type)
                 return;
 
         for (i = 0; i < v->nr_blocks; i++) {
@@ -282,21 +275,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                 while (offset < end) {
                         unsigned j = offset >> v->csum_granularity_bits;
                         unsigned len = min(csum_granularity, end - offset);
-                        struct bch_csum csum;
+                        struct bch_csum want = stripe_csum_get(v, i, j);
+                        struct bch_csum got = ec_block_checksum(buf, i, offset);
 
-                        BUG_ON(offset & (csum_granularity - 1));
-                        BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
-                               ((offset + len) & (csum_granularity - 1)));
-
-                        csum = bch2_checksum(NULL, v->csum_type,
-                                             null_nonce(),
-                                             buf->data[i] + ((offset - buf->offset) << 9),
-                                             len << 9);
-
-                        if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
+                        if (bch2_crc_cmp(want, got)) {
                                 bch_err_ratelimited(c,
-                                        "checksum error while doing reconstruct read (%u:%u)",
-                                        i, j);
+                                        "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
+                                        i, j, v->csum_type,
+                                        want.lo, got.lo);
                                 clear_bit(i, buf->valid);
                                 break;
                         }
@@ -373,6 +359,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
                 ? BCH_DATA_user
                 : BCH_DATA_parity;
 
+        if (ptr_stale(ca, ptr)) {
+                bch_err_ratelimited(c,
+                        "error %s stripe: stale pointer",
+                        rw == READ ? "reading from" : "writing to");
+                clear_bit(idx, buf->valid);
+                return;
+        }
+
         if (!bch2_dev_get_ioref(ca, rw)) {
                 clear_bit(idx, buf->valid);
                 return;
@@ -415,87 +409,77 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
         percpu_ref_put(&ca->io_ref);
 }
 
-/* recovery read path: */
-int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
 {
         struct btree_trans trans;
         struct btree_iter *iter;
+        struct bkey_s_c k;
+        int ret;
+
+        bch2_trans_init(&trans, c, 0, 0);
+        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
+        k = bch2_btree_iter_peek_slot(iter);
+        ret = bkey_err(k);
+        if (ret)
+                goto err;
+        if (k.k->type != KEY_TYPE_stripe) {
+                ret = -ENOENT;
+                goto err;
+        }
+        bkey_reassemble(&stripe->key.k_i, k);
+err:
+        bch2_trans_exit(&trans);
+        return ret;
+}
+
+/* recovery read path: */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
         struct ec_stripe_buf *buf;
         struct closure cl;
-        struct bkey_s_c k;
         struct bch_stripe *v;
-        unsigned stripe_idx;
-        unsigned offset, end;
-        unsigned i, nr_data, csum_granularity;
-        int ret = 0, idx;
+        unsigned i, offset;
+        int ret = 0;
 
         closure_init_stack(&cl);
 
         BUG_ON(!rbio->pick.has_ec);
 
-        stripe_idx = rbio->pick.ec.idx;
-
         buf = kzalloc(sizeof(*buf), GFP_NOIO);
         if (!buf)
                 return -ENOMEM;
 
-        bch2_trans_init(&trans, c, 0, 0);
-
-        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC,
-                                   POS(0, stripe_idx),
-                                   BTREE_ITER_SLOTS);
-        k = bch2_btree_iter_peek_slot(iter);
-        if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) {
+        ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+        if (ret) {
                 bch_err_ratelimited(c,
-                        "error doing reconstruct read: stripe not found");
+                        "error doing reconstruct read: error %i looking up stripe", ret);
                 kfree(buf);
-                return bch2_trans_exit(&trans) ?: -EIO;
+                return -EIO;
         }
 
-        bkey_reassemble(&buf->key.k_i, k);
-        bch2_trans_exit(&trans);
-
         v = &buf->key.v;
 
-        nr_data = v->nr_blocks - v->nr_redundant;
-
-        idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
-        BUG_ON(idx < 0);
-
-        csum_granularity = 1U << v->csum_granularity_bits;
-
-        offset  = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
-        end     = offset + bio_sectors(&rbio->bio);
-
-        BUG_ON(end > le16_to_cpu(v->sectors));
-
-        buf->offset     = round_down(offset, csum_granularity);
-        buf->size       = min_t(unsigned, le16_to_cpu(v->sectors),
-                                round_up(end, csum_granularity)) - buf->offset;
-
-        for (i = 0; i < v->nr_blocks; i++) {
-                buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
-                if (!buf->data[i]) {
-                        ret = -ENOMEM;
-                        goto err;
-                }
+        if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+                bch_err_ratelimited(c,
+                        "error doing reconstruct read: pointer doesn't match stripe");
+                ret = -EIO;
+                goto err;
         }
 
-        memset(buf->valid, 0xFF, sizeof(buf->valid));
+        offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+        if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+                bch_err_ratelimited(c,
+                        "error doing reconstruct read: read is bigger than stripe");
+                ret = -EIO;
+                goto err;
+        }
 
-        for (i = 0; i < v->nr_blocks; i++) {
-                struct bch_extent_ptr *ptr = v->ptrs + i;
-                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-
-                if (ptr_stale(ca, ptr)) {
-                        bch_err_ratelimited(c,
-                                "error doing reconstruct read: stale pointer");
-                        clear_bit(i, buf->valid);
-                        continue;
-                }
+        ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+        if (ret)
+                goto err;
 
+        for (i = 0; i < v->nr_blocks; i++)
                 ec_block_io(c, buf, REQ_OP_READ, i, &cl);
-        }
 
         closure_sync(&cl);
@@ -513,10 +497,9 @@ int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
                 goto err;
 
         memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
-                      buf->data[idx] + ((offset - buf->offset) << 9));
+                      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
 err:
-        for (i = 0; i < v->nr_blocks; i++)
-                kfree(buf->data[i]);
+        ec_stripe_buf_exit(buf);
         kfree(buf);
         return ret;
 }
@@ -784,7 +767,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
         struct bkey_s_c k;
         struct bkey_s_extent e;
         struct bkey_buf sk;
-        int ret = 0, dev, idx;
+        int ret = 0, dev, block;
 
         bch2_bkey_buf_init(&sk);
         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
@@ -805,13 +788,13 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                         continue;
                 }
 
-                idx = extent_matches_stripe(c, &s->key.v, k);
-                if (idx < 0) {
+                block = bkey_matches_stripe(&s->key.v, k);
+                if (block < 0) {
                         bch2_btree_iter_next(iter);
                         continue;
                 }
 
-                dev = s->key.v.ptrs[idx].dev;
+                dev = s->key.v.ptrs[block].dev;
 
                 bch2_bkey_buf_reassemble(&sk, c, k);
                 e = bkey_i_to_s_extent(sk.k);
@@ -820,7 +803,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                 ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
                 BUG_ON(!ec_ptr);
 
-                extent_stripe_ptr_add(e, s, ec_ptr, idx);
+                extent_stripe_ptr_add(e, s, ec_ptr, block);
 
                 bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
                 bch2_trans_update(&trans, iter, sk.k, 0);
@@ -875,7 +858,7 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                         swap(s->new_stripe.data[i],
                              s->existing_stripe.data[i]);
 
-                ec_stripe_buf_free(&s->existing_stripe);
+                ec_stripe_buf_exit(&s->existing_stripe);
         }
 
         BUG_ON(!s->allocated);
@@ -941,8 +924,8 @@ err:
         bch2_keylist_free(&s->keys, s->inline_keys);
 
-        ec_stripe_buf_free(&s->existing_stripe);
-        ec_stripe_buf_free(&s->new_stripe);
+        ec_stripe_buf_exit(&s->existing_stripe);
+        ec_stripe_buf_exit(&s->new_stripe);
         closure_debug_destroy(&s->iodone);
         kfree(s);
 }
@@ -1145,9 +1128,6 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
         bch2_keylist_init(&s->keys, s->inline_keys);
 
-        s->new_stripe.offset    = 0;
-        s->new_stripe.size      = h->blocksize;
-
         ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
                            s->nr_parity, h->blocksize);
@@ -1305,9 +1285,7 @@ err:
 /* XXX: doesn't obey target: */
 static s64 get_existing_stripe(struct bch_fs *c,
-                               unsigned target,
-                               unsigned algo,
-                               unsigned redundancy)
+                               struct ec_stripe_head *head)
 {
         ec_stripes_heap *h = &c->ec_stripes_heap;
         struct stripe *m;
@@ -1325,8 +1303,9 @@ static s64 get_existing_stripe(struct bch_fs *c,
                 stripe_idx = h->data[heap_idx].idx;
                 m = genradix_ptr(&c->stripes[0], stripe_idx);
 
-                if (m->algorithm == algo &&
-                    m->nr_redundant == redundancy &&
+                if (m->algorithm == head->algo &&
+                    m->nr_redundant == head->redundancy &&
+                    m->sectors == head->blocksize &&
                     m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
                         bch2_stripes_heap_del(c, m, stripe_idx);
                         spin_unlock(&c->ec_stripes_heap_lock);
@@ -1338,24 +1317,6 @@ static s64 get_existing_stripe(struct bch_fs *c,
         return -1;
 }
 
-static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
-{
-        struct btree_trans trans;
-        struct btree_iter *iter;
-        struct bkey_s_c k;
-        int ret;
-
-        bch2_trans_init(&trans, c, 0, 0);
-        iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS);
-        k = bch2_btree_iter_peek_slot(iter);
-        ret = bkey_err(k);
-        if (!ret)
-                bkey_reassemble(&stripe->key.k_i, k);
-        bch2_trans_exit(&trans);
-        return ret;
-}
-
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                                unsigned target,
                                                unsigned algo,
@@ -1382,7 +1343,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                 return NULL;
         }
 
-        idx = get_existing_stripe(c, target, algo, redundancy);
+        idx = get_existing_stripe(c, h);
         if (idx >= 0) {
                 h->s->have_existing_stripe = true;
                 ret = get_stripe_key(c, idx, &h->s->existing_stripe);
@@ -1392,7 +1353,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                         return NULL;
                 }
 
-                if (ec_stripe_buf_alloc(&h->s->existing_stripe)) {
+                if (ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize)) {
                         /*
                          * this is a problem: we have deleted from the
                          * stripes heap already
@@ -1411,7 +1372,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                         &h->s->existing_stripe.key.k_i);
         }
 
-        if (ec_stripe_buf_alloc(&h->s->new_stripe)) {
+        if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize)) {
                 BUG();
         }
 }
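
For orientation, the restructured recovery read path above now fails
gracefully instead of hitting BUG_ON(). A condensed, compilable sketch
of the control flow, in which every helper is a stand-in stub rather
than the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct stripe_buf { int nr_blocks; };

/* stand-ins for get_stripe_key(), bch2_ptr_matches_stripe(), etc.: */
static int  lookup_stripe(struct stripe_buf *b)        { (void)b; return 0; }
static bool ptr_matches(const struct stripe_buf *b)    { (void)b; return true; }
static bool read_in_bounds(const struct stripe_buf *b) { (void)b; return true; }
static int  buf_init(struct stripe_buf *b)             { (void)b; return 0; }
static void read_block(struct stripe_buf *b, int i)    { (void)b; printf("read block %d\n", i); }

static int reconstruct_read(struct stripe_buf *buf)
{
        if (lookup_stripe(buf))         /* get_stripe_key(): -ENOENT if missing */
                return -1;
        if (!ptr_matches(buf))          /* new: verify ptr against stripe block */
                return -1;
        if (!read_in_bounds(buf))       /* new: error instead of BUG_ON() */
                return -1;
        if (buf_init(buf))              /* ec_stripe_buf_init(): round + allocate */
                return -1;

        for (int i = 0; i < buf->nr_blocks; i++)
                read_block(buf, i);     /* then reconstruct and copy out */
        return 0;
}

int main(void)
{
        struct stripe_buf buf = { .nr_blocks = 3 };

        return reconstruct_read(&buf);
}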

diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h

@@ -60,9 +60,51 @@ static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
 }
 
 static inline void *stripe_csum(struct bch_stripe *s,
-                                unsigned dev, unsigned csum_idx)
+                                unsigned block, unsigned csum_idx)
 {
-        return (void *) s + stripe_csum_offset(s, dev, csum_idx);
+        EBUG_ON(block >= s->nr_blocks);
+        EBUG_ON(csum_idx >= stripe_csums_per_device(s));
+
+        return (void *) s + stripe_csum_offset(s, block, csum_idx);
+}
+
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
+                                              unsigned block, unsigned csum_idx)
+{
+        struct bch_csum csum = { 0 };
+
+        memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
+        return csum;
+}
+
+static inline void stripe_csum_set(struct bch_stripe *s,
+                                   unsigned block, unsigned csum_idx,
+                                   struct bch_csum csum)
+{
+        memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
+}
+
+static inline bool __bch2_ptr_matches_stripe(const struct bch_stripe *s,
+                                             const struct bch_extent_ptr *ptr,
+                                             unsigned block)
+{
+        unsigned nr_data = s->nr_blocks - s->nr_redundant;
+
+        if (block >= nr_data)
+                return false;
+
+        return ptr->dev == s->ptrs[block].dev &&
+               ptr->gen == s->ptrs[block].gen &&
+               ptr->offset >= s->ptrs[block].offset &&
+               ptr->offset <  s->ptrs[block].offset + le16_to_cpu(s->sectors);
+}
+
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
+                                           struct extent_ptr_decoded p)
+{
+        BUG_ON(!p.has_ec);
+
+        return __bch2_ptr_matches_stripe(s, &p.ptr, p.ec.block);
 }
 
 struct bch_read_bio;