bcachefs: Account for stripe parity sectors separately

Instead of trying to charge EC parity to the data within the stripe
(which is subject to rounding errors), let's charge it to the stripe
itself. Charging for parity blocks up front should also make -ENOSPC
issues easier to deal with, and lets us expose more fine-grained
accounting to the user.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
commit af4d05c46b
parent 39283c712e
Kent Overstreet, 2020-07-09 18:31:51 -04:00

5 changed files with 142 additions and 86 deletions

diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h

@@ -1036,7 +1036,8 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
 	x(journal,	2)	\
 	x(btree,	3)	\
 	x(user,		4)	\
-	x(cached,	5)
+	x(cached,	5)	\
+	x(parity,	6)
 
 enum bch_data_type {
 #define x(t, n) BCH_DATA_##t,

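As background, the data types are generated from an x-macro, so adding x(parity, 6) gives the enum a new BCH_DATA_parity value. A minimal standalone sketch of the expansion (the none/sb entries at 0 and 1 are assumed, since the hunk only shows entries 2 through 6, and the values are pinned with "= n" here while the header relies on ordering):

#include <stdio.h>

#define BCH_DATA_TYPES()	\
	x(none,		0)	\
	x(sb,		1)	\
	x(journal,	2)	\
	x(btree,	3)	\
	x(user,		4)	\
	x(cached,	5)	\
	x(parity,	6)

enum bch_data_type {
#define x(t, n) BCH_DATA_##t = n,
	BCH_DATA_TYPES()
#undef x
	BCH_DATA_NR
};

int main(void)
{
	/* The new parity type lands at the end of the list. */
	printf("BCH_DATA_parity = %d\n", (int) BCH_DATA_parity);
	printf("BCH_DATA_NR     = %d\n", (int) BCH_DATA_NR);
	return 0;
}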
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c

@@ -77,6 +77,26 @@
 #include <linux/preempt.h>
 
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+					      enum bch_data_type data_type,
+					      s64 sectors)
+{
+	switch (data_type) {
+	case BCH_DATA_btree:
+		fs_usage->btree += sectors;
+		break;
+	case BCH_DATA_user:
+	case BCH_DATA_parity:
+		fs_usage->data += sectors;
+		break;
+	case BCH_DATA_cached:
+		fs_usage->cached += sectors;
+		break;
+	default:
+		break;
+	}
+}
+
 /*
  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
  * wraparound:
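The new helper is what lets parity be accounted separately while still rolling up into the familiar base counters: BCH_DATA_user and BCH_DATA_parity both feed fs_usage->data. A toy illustration of that roll-up, with stand-in types rather than the kernel's:

#include <stdint.h>
#include <stdio.h>

typedef int64_t s64;

enum data_type { T_BTREE, T_USER, T_CACHED, T_PARITY };

struct usage { s64 btree, data, cached; };

/* Same shape as fs_usage_data_type_to_base(): user and parity sectors
 * both land in the "data" base counter. */
static void data_type_to_base(struct usage *u, enum data_type t, s64 sectors)
{
	switch (t) {
	case T_BTREE:	u->btree  += sectors; break;
	case T_USER:
	case T_PARITY:	u->data   += sectors; break;
	case T_CACHED:	u->cached += sectors; break;
	}
}

int main(void)
{
	struct usage u = { 0 };

	data_type_to_base(&u, T_USER,   1024);	/* data within the stripe */
	data_type_to_base(&u, T_PARITY,  512);	/* parity, charged to the stripe */
	printf("data = %lld sectors\n", (long long) u.data);	/* 1536 */
	return 0;
}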
@@ -132,17 +152,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 		struct bch_replicas_entry *e =
 			cpu_replicas_entry(&c->replicas, i);
 
-		switch (e->data_type) {
-		case BCH_DATA_btree:
-			usage->btree += usage->replicas[i];
-			break;
-		case BCH_DATA_user:
-			usage->data += usage->replicas[i];
-			break;
-		case BCH_DATA_cached:
-			usage->cached += usage->replicas[i];
-			break;
-		}
+		fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
 	}
 
 	percpu_up_write(&c->mark_lock);
@@ -374,9 +384,14 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
 	return 0;
 }
 
+static inline int is_stripe_data_bucket(struct bucket_mark m)
+{
+	return m.stripe && m.data_type != BCH_DATA_parity;
+}
+
 static inline int bucket_stripe_sectors(struct bucket_mark m)
 {
-	return m.stripe ? m.dirty_sectors : 0;
+	return is_stripe_data_bucket(m) ? m.dirty_sectors : 0;
 }
 
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
@@ -520,17 +535,7 @@ static inline int update_replicas(struct bch_fs *c,
 	if (!fs_usage)
 		return 0;
 
-	switch (r->data_type) {
-	case BCH_DATA_btree:
-		fs_usage->btree += sectors;
-		break;
-	case BCH_DATA_user:
-		fs_usage->data += sectors;
-		break;
-	case BCH_DATA_cached:
-		fs_usage->cached += sectors;
-		break;
-	}
+	fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
 	fs_usage->replicas[idx] += sectors;
 	return 0;
 }
@@ -958,12 +963,15 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
 }
 
 static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
-			     const struct bch_extent_ptr *ptr,
+			     unsigned ptr_idx,
 			     struct bch_fs_usage *fs_usage,
-			     u64 journal_seq,
-			     unsigned flags,
+			     u64 journal_seq, unsigned flags,
 			     bool enabled)
 {
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+	bool parity = ptr_idx >= nr_data;
+	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
 	bool gc = flags & BTREE_TRIGGER_GC;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 	struct bucket *g = PTR_BUCKET(ca, ptr, gc);
@@ -990,6 +998,12 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
 		 (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 
 		new.stripe = enabled;
+
+		if ((flags & BTREE_TRIGGER_GC) && parity) {
+			new.data_type = enabled ? BCH_DATA_parity : 0;
+			new.dirty_sectors = enabled ? le16_to_cpu(s->sectors) : 0;
+		}
+
 		if (journal_seq) {
 			new.journal_seq_valid = 1;
 			new.journal_seq = journal_seq;
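Block layout within a stripe is positional: the first nr_data blocks hold data and the remaining nr_redundant hold parity, which is what the ptr_idx >= nr_data test above encodes. A standalone sketch of that classification and of the up-front charge it implies (toy types, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/* Toy stripe: blocks [0 .. nr_data-1] are data, the rest are parity. */
struct stripe {
	uint8_t  nr_blocks;
	uint8_t  nr_redundant;
	uint16_t sectors;	/* sectors per block */
};

static int block_is_parity(const struct stripe *s, unsigned ptr_idx)
{
	unsigned nr_data = s->nr_blocks - s->nr_redundant;

	return ptr_idx >= nr_data;
}

int main(void)
{
	/* A 4+2 stripe of 128-sector blocks. */
	struct stripe s = { .nr_blocks = 6, .nr_redundant = 2, .sectors = 128 };
	unsigned i;

	for (i = 0; i < s.nr_blocks; i++)
		printf("block %u: %s\n", i,
		       block_is_parity(&s, i) ? "parity" : "data");

	/* Each parity bucket is charged the full per-block sector count as
	 * soon as the stripe exists, regardless of how much data has been
	 * written into the stripe so far. */
	printf("parity charge: %u sectors\n",
	       (unsigned) s.sectors * s.nr_redundant);
	return 0;
}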
@@ -1074,12 +1088,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 				struct bch_extent_stripe_ptr p,
 				enum bch_data_type data_type,
 				struct bch_fs_usage *fs_usage,
-				s64 sectors, unsigned flags,
-				struct bch_replicas_padded *r,
-				unsigned *nr_data,
-				unsigned *nr_parity)
+				s64 sectors, unsigned flags)
 {
 	bool gc = flags & BTREE_TRIGGER_GC;
+	struct bch_replicas_padded r;
 	struct stripe *m;
 	unsigned i, blocks_nonempty = 0;
 
@@ -1094,14 +1106,10 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 		return -EIO;
 	}
 
 	BUG_ON(m->r.e.data_type != data_type);
 
-	*nr_data	= m->nr_blocks - m->nr_redundant;
-	*nr_parity	= m->nr_redundant;
-	*r		= m->r;
-
 	m->block_sectors[p.block] += sectors;
+	r = m->r;
 
 	for (i = 0; i < m->nr_blocks; i++)
 		blocks_nonempty += m->block_sectors[i] != 0;
@@ -1113,6 +1121,9 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 	spin_unlock(&c->ec_stripes_heap_lock);
 
+	r.e.data_type = data_type;
+	update_replicas(c, fs_usage, &r.e, sectors);
+
 	return 0;
 }
@@ -1158,25 +1169,11 @@ static int bch2_mark_extent(struct bch_fs *c,
 			dirty_sectors += disk_sectors;
 			r.e.devs[r.e.nr_devs++] = p.ptr.dev;
 		} else {
-			struct bch_replicas_padded ec_r;
-			unsigned nr_data, nr_parity;
-			s64 parity_sectors;
-
 			ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
-					fs_usage, disk_sectors, flags,
-					&ec_r, &nr_data, &nr_parity);
+					fs_usage, disk_sectors, flags);
 			if (ret)
 				return ret;
 
-			parity_sectors =
-				__ptr_disk_sectors_delta(p.crc.live_size,
-					offset, sectors, flags,
-					p.crc.compressed_size * nr_parity,
-					p.crc.uncompressed_size * nr_data);
-
-			update_replicas(c, fs_usage, &ec_r.e,
-					disk_sectors + parity_sectors);
-
 			/*
 			 * There may be other dirty pointers in this extent, but
 			 * if so they're not required for mounting if we have an
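The deleted computation above is the source of the rounding errors the commit message mentions: parity was estimated per extent as a nr_parity/nr_data proportion of its disk sectors, and integer truncation means the per-extent charges need not sum to the stripe's real parity footprint. A simplified standalone demonstration (the proportional formula here is an approximation of the old calculation, not a verbatim copy):

#include <stdio.h>

int main(void)
{
	/* A 3+2 stripe of 128-sector blocks: actual parity is 2 * 128 = 256. */
	unsigned nr_data = 3, nr_parity = 2, block_sectors = 128;

	/* Five 70-sector extents written into the stripe's data blocks. */
	unsigned extents[] = { 70, 70, 70, 70, 70 };
	unsigned i, charged = 0;

	for (i = 0; i < 5; i++)
		/* Old scheme, roughly: each extent is charged its
		 * proportional share of parity, truncated by integer
		 * division. */
		charged += extents[i] * nr_parity / nr_data;

	printf("per-extent parity charged: %u\n", charged);		/* 230 */
	printf("stripe's actual parity:    %u\n",
	       nr_parity * block_sectors);				/* 256 */
	return 0;
}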
@@ -1216,7 +1213,7 @@ static int bch2_mark_stripe(struct bch_fs *c,
 	if (!new_s) {
 		/* Deleting: */
 		for (i = 0; i < old_s->nr_blocks; i++) {
-			ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
+			ret = bucket_set_stripe(c, old, i, fs_usage,
 						journal_seq, flags, false);
 			if (ret)
 				return ret;
@@ -1228,6 +1225,10 @@ static int bch2_mark_stripe(struct bch_fs *c,
 			spin_unlock(&c->ec_stripes_heap_lock);
 		}
 
+		if (gc)
+			update_replicas(c, fs_usage, &m->r.e,
+					-((s64) m->sectors * m->nr_redundant));
+
 		memset(m, 0, sizeof(*m));
 	} else {
 		BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
@@ -1240,12 +1241,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
 				    sizeof(struct bch_extent_ptr))) {
 
 				if (old_s) {
-					bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage,
+					bucket_set_stripe(c, old, i, fs_usage,
 						journal_seq, flags, false);
 					if (ret)
 						return ret;
 				}
 
-				ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage,
+				ret = bucket_set_stripe(c, new, i, fs_usage,
 						journal_seq, flags, true);
 				if (ret)
 					return ret;
@@ -1258,8 +1259,16 @@ static int bch2_mark_stripe(struct bch_fs *c,
 		m->nr_blocks = new_s->nr_blocks;
 		m->nr_redundant = new_s->nr_redundant;
 
+		if (gc && old_s)
+			update_replicas(c, fs_usage, &m->r.e,
+					-((s64) m->sectors * m->nr_redundant));
+
 		bch2_bkey_to_replicas(&m->r.e, new);
 
+		if (gc)
+			update_replicas(c, fs_usage, &m->r.e,
+					((s64) m->sectors * m->nr_redundant));
+
 		/* gc recalculates these fields: */
 		if (!(flags & BTREE_TRIGGER_GC)) {
 			m->blocks_nonempty = 0;
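When gc re-derives a stripe's replicas entry, the parity charge has to move with it: the old entry is debited by the stripe's full parity footprint (m->sectors * m->nr_redundant) and the new entry credited by the same amount. A toy ledger sketch of that debit/credit pairing, with a hypothetical two-entry table standing in for the real replicas accounting:

#include <stdint.h>
#include <stdio.h>

typedef int64_t s64;

/* Toy replicas ledger: one running parity total per replicas entry. */
static s64 ledger[2];

static void update_replicas(unsigned entry, s64 sectors)
{
	ledger[entry] += sectors;
}

int main(void)
{
	/* Stripe: 256 sectors per block, 2 parity blocks. */
	s64 parity = (s64) 256 * 2;

	update_replicas(0, parity);	/* charged when the stripe is created */

	/* gc sees the stripe under a different device list: debit the old
	 * entry, credit the new one, so the net charge is unchanged. */
	update_replicas(0, -parity);
	update_replicas(1,  parity);

	printf("old entry = %lld, new entry = %lld\n",
	       (long long) ledger[0], (long long) ledger[1]);
	return 0;
}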
@@ -1648,15 +1657,13 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
 
 static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 				      struct bch_extent_stripe_ptr p,
-				      s64 sectors, enum bch_data_type data_type,
-				      struct bch_replicas_padded *r,
-				      unsigned *nr_data,
-				      unsigned *nr_parity)
+				      s64 sectors, enum bch_data_type data_type)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter *iter;
 	struct bkey_s_c k;
 	struct bkey_i_stripe *s;
+	struct bch_replicas_padded r;
 	int ret = 0;
 
 	ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k);
@@ -1677,15 +1684,14 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 		goto out;
 
 	bkey_reassemble(&s->k_i, k);
-
 	stripe_blockcount_set(&s->v, p.block,
 			      stripe_blockcount_get(&s->v, p.block) +
 			      sectors);
-
-	*nr_data	= s->v.nr_blocks - s->v.nr_redundant;
-	*nr_parity	= s->v.nr_redundant;
-	bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i));
 	bch2_trans_update(trans, iter, &s->k_i, 0);
+
+	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+	r.e.data_type = data_type;
+	update_replicas_list(trans, &r.e, sectors);
 out:
 	bch2_trans_iter_put(trans, iter);
 	return ret;
@@ -1730,25 +1736,11 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
 			dirty_sectors += disk_sectors;
 			r.e.devs[r.e.nr_devs++] = p.ptr.dev;
 		} else {
-			struct bch_replicas_padded ec_r;
-			unsigned nr_data, nr_parity;
-			s64 parity_sectors;
-
 			ret = bch2_trans_mark_stripe_ptr(trans, p.ec,
-					disk_sectors, data_type,
-					&ec_r, &nr_data, &nr_parity);
+					disk_sectors, data_type);
 			if (ret)
 				return ret;
 
-			parity_sectors =
-				__ptr_disk_sectors_delta(p.crc.live_size,
-					offset, sectors, flags,
-					p.crc.compressed_size * nr_parity,
-					p.crc.uncompressed_size * nr_data);
-
-			update_replicas_list(trans, &ec_r.e,
-					disk_sectors + parity_sectors);
-
 			r.e.nr_required = 0;
 		}
 	}
@@ -1760,15 +1752,26 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
 }
 
 static int bch2_trans_mark_stripe(struct btree_trans *trans,
-				  struct bkey_s_c k)
+				  struct bkey_s_c k,
+				  unsigned flags)
 {
 	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+	struct bch_replicas_padded r;
 	struct bkey_alloc_unpacked u;
 	struct bkey_i_alloc *a;
 	struct btree_iter *iter;
+	bool deleting = flags & BTREE_TRIGGER_OVERWRITE;
+	s64 sectors = le16_to_cpu(s->sectors);
 	unsigned i;
 	int ret = 0;
 
+	if (deleting)
+		sectors = -sectors;
+
+	bch2_bkey_to_replicas(&r.e, k);
+	update_replicas_list(trans, &r.e, sectors * s->nr_redundant);
+
 	/*
 	 * The allocator code doesn't necessarily update bucket gens in the
 	 * btree when incrementing them, right before handing out new buckets -
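Here the transactional path charges the whole parity footprint in one step: sectors per block times nr_redundant, negated when the stripe key is being overwritten (deleted). A minimal sketch of just that delta calculation (the flag value is a stand-in, not the real BTREE_TRIGGER_OVERWRITE):

#include <stdint.h>
#include <stdio.h>

typedef int64_t s64;

#define TRIGGER_OVERWRITE (1U << 0)	/* stand-in flag for illustration */

/* Replicas delta for a stripe key: positive on insert, negative on delete. */
static s64 stripe_parity_delta(uint16_t sectors_per_block,
			       uint8_t nr_redundant, unsigned flags)
{
	s64 sectors = sectors_per_block;

	if (flags & TRIGGER_OVERWRITE)
		sectors = -sectors;

	return sectors * nr_redundant;
}

int main(void)
{
	printf("insert: %lld\n", (long long) stripe_parity_delta(128, 2, 0));
	printf("delete: %lld\n",
	       (long long) stripe_parity_delta(128, 2, TRIGGER_OVERWRITE));
	return 0;
}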
@@ -1776,11 +1779,20 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
 	 */
 	for (i = 0; i < s->nr_blocks && !ret; i++) {
+		bool parity = i >= nr_data;
+
 		ret = bch2_trans_start_alloc_update(trans, &iter,
 				&s->ptrs[i], &u);
 		if (ret)
 			break;
 
+		if (parity) {
+			u.dirty_sectors += sectors;
+			u.data_type = u.dirty_sectors
+				? BCH_DATA_parity
+				: 0;
+		}
+
 		a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
 		ret = PTR_ERR_OR_ZERO(a);
 		if (ret)
@@ -1897,7 +1909,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
 		return bch2_trans_mark_extent(trans, k, offset, sectors,
 					      flags, BCH_DATA_user);
 	case KEY_TYPE_stripe:
-		return bch2_trans_mark_stripe(trans, k);
+		return bch2_trans_mark_stripe(trans, k, flags);
 	case KEY_TYPE_inode:
 		d = replicas_deltas_realloc(trans, 0);

diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c

@@ -343,12 +343,17 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
 	unsigned offset = 0, bytes = buf->size << 9;
 	struct bch_extent_ptr *ptr = &v->ptrs[idx];
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant
+		? BCH_DATA_user
+		: BCH_DATA_parity;
 
 	if (!bch2_dev_get_ioref(ca, rw)) {
 		clear_bit(idx, buf->valid);
 		return;
 	}
 
+	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
 	while (offset < bytes) {
 		unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS,
 					   DIV_ROUND_UP(bytes, PAGE_SIZE));
@@ -670,6 +675,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
 /* stripe creation: */
 
 static int ec_stripe_bkey_insert(struct bch_fs *c,
+				 struct ec_stripe_new *s,
 				 struct bkey_i_stripe *stripe)
 {
 	struct btree_trans trans;
@@ -711,7 +717,7 @@ static int ec_stripe_bkey_insert(struct bch_fs *c,
 	bch2_trans_update(&trans, iter, &stripe->k_i, 0);
 
-	ret = bch2_trans_commit(&trans, NULL, NULL,
+	ret = bch2_trans_commit(&trans, &s->res, NULL,
 				BTREE_INSERT_NOFAIL);
 err:
 	bch2_trans_iter_put(&trans, iter);
@@ -858,8 +864,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 	ret = s->existing_stripe
 		? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
-				    NULL, NULL, BTREE_INSERT_NOFAIL)
-		: ec_stripe_bkey_insert(c, &s->stripe.key);
+				    &s->res, NULL, BTREE_INSERT_NOFAIL)
+		: ec_stripe_bkey_insert(c, s, &s->stripe.key);
 	if (ret) {
 		bch_err(c, "error creating stripe: error creating stripe key");
 		goto err_put_writes;
@@ -886,6 +892,8 @@ static void ec_stripe_create(struct ec_stripe_new *s)
 err_put_writes:
 	percpu_ref_put(&c->writes);
 err:
+	bch2_disk_reservation_put(c, &s->res);
+
 	open_bucket_for_each(c, &s->blocks, ob, i) {
 		ob->ec = NULL;
 		__bch2_open_bucket_put(c, ob);
@@ -1325,6 +1333,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
 	struct open_bucket *ob;
 	unsigned i, data_idx = 0;
 	s64 idx;
+	int ret;
 
 	closure_init_stack(&cl);
@@ -1356,6 +1365,22 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
 		}
 	}
 
+	if (!h->s->existing_stripe &&
+	    !h->s->res.sectors) {
+		ret = bch2_disk_reservation_get(c, &h->s->res,
+						h->blocksize,
+						h->s->nr_parity, 0);
+		if (ret) {
+			/* What should we do here? */
+			bch_err(c, "unable to create new stripe: %i", ret);
+			bch2_ec_stripe_head_put(c, h);
+			h = NULL;
+			goto out;
+		}
+	}
+
 	if (new_stripe_alloc_buckets(c, h)) {
 		bch2_ec_stripe_head_put(c, h);
 		h = NULL;
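The reservation mirrors the up-front charge: h->blocksize sectors with h->s->nr_parity replicas, i.e. one block's worth of space per parity device. A toy model of the sizing arithmetic (assuming, as the call signature suggests, that a reservation of N sectors at R replicas covers N * R sectors; the real bch2_disk_reservation_get also checks free space and can fail with -ENOSPC):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/* Toy disk reservation: only the sizing is modeled here. */
struct disk_reservation { u64 sectors; };

static int disk_reservation_get(struct disk_reservation *res,
				u64 sectors, unsigned nr_replicas)
{
	res->sectors = sectors * nr_replicas;
	return 0;	/* the real call can fail with -ENOSPC */
}

int main(void)
{
	struct disk_reservation res;

	/* New stripe: a 256-sector blocksize with 2 parity blocks reserves
	 * one block of space per parity device. */
	disk_reservation_get(&res, 256, 2);
	printf("reserved %llu sectors\n", (unsigned long long) res.sectors);
	return 0;
}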

diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h

@@ -3,6 +3,7 @@
 #define _BCACHEFS_EC_H
 
 #include "ec_types.h"
+#include "buckets_types.h"
 #include "keylist_types.h"
 
 const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -105,6 +106,7 @@ struct ec_stripe_new {
 	struct open_buckets	blocks;
 	u8			data_block_idx[EC_STRIPE_MAX];
 	struct open_buckets	parity;
+	struct disk_reservation	res;
 
 	struct keylist		keys;
 	u64			inline_keys[BKEY_U64s * 8];

diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c

@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
 		extent_to_replicas(k, e);
 		break;
 	case KEY_TYPE_stripe:
-		e->data_type = BCH_DATA_user;
+		e->data_type = BCH_DATA_parity;
 		stripe_to_replicas(k, e);
 		break;
 	}
@@ -449,7 +449,23 @@ static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
 	bch2_bkey_to_replicas(&search.e, k);
 
-	return __bch2_mark_replicas(c, &search.e, check);
+	ret = __bch2_mark_replicas(c, &search.e, check);
+	if (ret)
+		return ret;
+
+	if (search.e.data_type == BCH_DATA_parity) {
+		search.e.data_type = BCH_DATA_cached;
+		ret = __bch2_mark_replicas(c, &search.e, check);
+		if (ret)
+			return ret;
+
+		search.e.data_type = BCH_DATA_user;
+		ret = __bch2_mark_replicas(c, &search.e, check);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
 }
 
 bool bch2_bkey_replicas_marked(struct bch_fs *c,
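Since stripe replicas entries now carry BCH_DATA_parity, marking a stripe key also marks matching user and cached entries on the same device set; a plausible reading is that extents pointing into the stripe will be accounted under those entries, so they need to exist in the superblock too. A toy sketch of the three passes (stand-in types, not the kernel's):

#include <stdio.h>

enum data_type { T_USER, T_CACHED, T_PARITY };

struct replicas_entry {
	enum data_type	data_type;
	unsigned	nr_devs;
	unsigned	devs[4];
};

/* Toy stand-in: the real __bch2_mark_replicas records the entry in the
 * superblock's replicas section and can fail. */
static int mark_replicas(const struct replicas_entry *e)
{
	printf("marking data_type=%d on %u devices\n",
	       e->data_type, e->nr_devs);
	return 0;
}

int main(void)
{
	struct replicas_entry e = {
		.data_type	= T_PARITY,
		.nr_devs	= 3,
		.devs		= { 0, 1, 2 },
	};
	int ret;

	ret = mark_replicas(&e);		/* the parity entry itself */
	if (ret)
		return ret;

	if (e.data_type == T_PARITY) {
		e.data_type = T_CACHED;		/* same devices, as cached */
		ret = mark_replicas(&e);
		if (ret)
			return ret;

		e.data_type = T_USER;		/* same devices, as user */
		ret = mark_replicas(&e);
	}
	return ret;
}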