bcachefs: New erasure coding shutdown path

This implements a new shutdown path for erasure coding, which is needed
for the upcoming BCH_WRITE_WAIT_FOR_EC write path.

The process is:
 - Cancel new stripes being built up
 - Close out/cancel open buckets on write points or the partial list
   that are for stripes
 - Shut down rebalance/copygc
 - Then wait for in-flight new stripes to finish

With BCH_WRITE_WAIT_FOR_EC, move ops will be waiting on stripes to fill
up before they complete; the new ec shutdown path is needed for shutting
down copygc/rebalance without deadlocking.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2023-03-13 22:01:47 -04:00
parent b9fa375bab
commit b40901b0f7
10 changed files with 141 additions and 94 deletions

View file

@ -2158,44 +2158,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
*/
bch2_recalc_capacity(c);
/* Next, close write points that point to this device... */
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch2_writepoint_stop(c, ca, &c->write_points[i]);
bch2_writepoint_stop(c, ca, &c->copygc_write_point);
bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
bch2_writepoint_stop(c, ca, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
while (c->btree_reserve_cache_nr) {
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
bch2_open_buckets_put(c, &a->ob);
}
mutex_unlock(&c->btree_reserve_cache_lock);
spin_lock(&c->freelist_lock);
i = 0;
while (i < c->open_buckets_partial_nr) {
struct open_bucket *ob =
c->open_buckets + c->open_buckets_partial[i];
if (ob->dev == ca->dev_idx) {
--c->open_buckets_partial_nr;
swap(c->open_buckets_partial[i],
c->open_buckets_partial[c->open_buckets_partial_nr]);
ob->on_partial_list = false;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
spin_lock(&c->freelist_lock);
} else {
i++;
}
}
spin_unlock(&c->freelist_lock);
bch2_ec_stop_dev(c, ca);
bch2_open_buckets_stop(c, ca, false);
/*
* Wake up threads that were blocked on allocation, so they can notice

View file

@ -1023,43 +1023,94 @@ static int open_bucket_add_buckets(struct btree_trans *trans,
return ret < 0 ? ret : 0;
}
void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
struct open_buckets *obs)
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
struct bch_dev *ca, bool ec)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob, *ob2;
unsigned i, j;
open_bucket_for_each(c, obs, ob, i) {
bool drop = !ca || ob->dev == ca->dev_idx;
if (ec) {
return ob->ec != NULL;
} else if (ca) {
bool drop = ob->dev == ca->dev_idx;
struct open_bucket *ob2;
unsigned i;
if (!drop && ob->ec) {
mutex_lock(&ob->ec->lock);
for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
if (!ob->ec->blocks[j])
for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) {
if (!ob->ec->blocks[i])
continue;
ob2 = c->open_buckets + ob->ec->blocks[j];
ob2 = c->open_buckets + ob->ec->blocks[i];
drop |= ob2->dev == ca->dev_idx;
}
mutex_unlock(&ob->ec->lock);
}
if (drop)
return drop;
} else {
return true;
}
}
static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
bool ec, struct write_point *wp)
{
struct open_buckets ptrs = { .nr = 0 };
struct open_bucket *ob;
unsigned i;
mutex_lock(&wp->lock);
open_bucket_for_each(c, &wp->ptrs, ob, i)
if (should_drop_bucket(ob, c, ca, ec))
bch2_open_bucket_put(c, ob);
else
ob_push(c, &ptrs, ob);
}
*obs = ptrs;
wp->ptrs = ptrs;
mutex_unlock(&wp->lock);
}
void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
bool ec)
{
mutex_lock(&wp->lock);
bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
mutex_unlock(&wp->lock);
unsigned i;
/* Next, close write points that point to this device... */
for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);
bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);
mutex_lock(&c->btree_reserve_cache_lock);
while (c->btree_reserve_cache_nr) {
struct btree_alloc *a =
&c->btree_reserve_cache[--c->btree_reserve_cache_nr];
bch2_open_buckets_put(c, &a->ob);
}
mutex_unlock(&c->btree_reserve_cache_lock);
spin_lock(&c->freelist_lock);
i = 0;
while (i < c->open_buckets_partial_nr) {
struct open_bucket *ob =
c->open_buckets + c->open_buckets_partial[i];
if (should_drop_bucket(ob, c, ca, ec)) {
--c->open_buckets_partial_nr;
swap(c->open_buckets_partial[i],
c->open_buckets_partial[c->open_buckets_partial_nr]);
ob->on_partial_list = false;
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
spin_lock(&c->freelist_lock);
} else {
i++;
}
}
spin_unlock(&c->freelist_lock);
bch2_ec_stop_dev(c, ca);
}
static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
@ -1107,8 +1158,7 @@ static bool try_increase_writepoints(struct bch_fs *c)
return true;
}
static bool try_decrease_writepoints(struct bch_fs *c,
unsigned old_nr)
static bool try_decrease_writepoints(struct bch_fs *c, unsigned old_nr)
{
struct write_point *wp;
@ -1129,7 +1179,7 @@ static bool try_decrease_writepoints(struct bch_fs *c,
hlist_del_rcu(&wp->node);
mutex_unlock(&c->write_points_hash_lock);
bch2_writepoint_stop(c, NULL, wp);
bch2_writepoint_stop(c, NULL, false, wp);
return true;
}

View file

@ -202,11 +202,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i *, unsigned, bool);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
struct open_buckets *);
void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
struct write_point *);
void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool);
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{

View file

@ -655,7 +655,6 @@ typedef struct {
x(fallocate) \
x(discard) \
x(invalidate) \
x(move) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
x(sysfs)
@ -958,14 +957,14 @@ struct bch_fs {
struct list_head ec_stripe_new_list;
struct mutex ec_stripe_new_lock;
wait_queue_head_t ec_stripe_new_wait;
struct work_struct ec_stripe_create_work;
u64 ec_stripe_hint;
struct bio_set ec_bioset;
struct work_struct ec_stripe_delete_work;
struct llist_head ec_stripe_delete_list;
struct bio_set ec_bioset;
/* REFLINK */
u64 reflink_hint;

View file

@ -252,6 +252,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
m->data_opts.btree_insert_flags);
if (!ret) {

View file

@ -989,6 +989,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_offset));
@ -1127,7 +1128,9 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
ret = bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
ret = bch2_trans_do(c, &s->res, NULL,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL,
ec_stripe_key_update(&trans, &s->new_stripe.key,
!s->have_existing_stripe));
if (ret) {
@ -1409,6 +1412,11 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
h = ERR_PTR(-EROFS);
goto found;
}
list_for_each_entry(h, &c->ec_stripe_head_list, list)
if (h->target == target &&
h->algo == algo &&
@ -1753,7 +1761,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans,
return ERR_PTR(ret);
}
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca)
{
struct ec_stripe_head *h;
struct open_bucket *ob;
@ -1761,11 +1769,13 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
mutex_lock(&c->ec_stripe_head_lock);
list_for_each_entry(h, &c->ec_stripe_head_list, list) {
mutex_lock(&h->lock);
if (!h->s)
goto unlock;
if (!ca)
goto found;
for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
if (!h->s->blocks[i])
continue;
@ -1784,6 +1794,32 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
mutex_unlock(&c->ec_stripe_head_lock);
}
/*
 * Per-device entry point for stopping erasure-coding stripe creation.
 *
 * Simply forwards to __bch2_ec_stop() with the device @ca; for a non-NULL
 * @ca that helper walks the blocks of each stripe head's in-progress stripe
 * rather than jumping straight to its "found" path.
 */
void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
{
__bch2_ec_stop(c, ca);
}
/*
 * Filesystem-wide entry point for stopping erasure-coding stripe creation.
 *
 * Calls __bch2_ec_stop() with a NULL device, so every stripe head that has
 * a stripe being built (h->s != NULL) takes the "found" path without any
 * per-device block check.
 * NOTE(review): per the commit description this is the "cancel new stripes
 * being built up" step of the shutdown path; the body of the "found" path
 * is not visible here - confirm.
 */
void bch2_fs_ec_stop(struct bch_fs *c)
{
__bch2_ec_stop(c, NULL);
}
/*
 * Wait condition for bch2_fs_ec_flush(): true once no new stripes remain
 * queued on c->ec_stripe_new_list.
 *
 * The list is sampled while holding c->ec_stripe_new_lock; the result is a
 * snapshot and may be stale as soon as the lock is dropped.
 */
static bool bch2_fs_ec_flush_done(struct bch_fs *c)
{
	bool no_stripes_pending;

	mutex_lock(&c->ec_stripe_new_lock);
	no_stripes_pending = list_empty(&c->ec_stripe_new_list);
	mutex_unlock(&c->ec_stripe_new_lock);

	return no_stripes_pending;
}
/*
 * Block the caller until bch2_fs_ec_flush_done() returns true, i.e. until
 * c->ec_stripe_new_list is observed empty.
 *
 * Sleeps on c->ec_stripe_new_wait using wait_event(), so the wait is not
 * interruptible and has no timeout.
 * NOTE(review): this relies on whoever removes entries from
 * ec_stripe_new_list waking ec_stripe_new_wait; that code is not visible
 * here - confirm.
 */
void bch2_fs_ec_flush(struct bch_fs *c)
{
wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c));
}
int bch2_stripes_read(struct bch_fs *c)
{
struct btree_trans trans;
@ -1915,14 +1951,22 @@ void bch2_fs_ec_exit(struct bch_fs *c)
/*
 * Initialise the erasure-coding locks, lists, wait queue and work items
 * embedded in struct bch_fs. Returns void: no failure is reported.
 *
 * NOTE(review): the "_early" suffix suggests this is meant to run before
 * bch2_fs_ec_init(); the call site is not visible here - confirm.
 */
void bch2_fs_ec_init_early(struct bch_fs *c)
{
/* Locks for the new-stripes bookkeeping and the stripes heap. */
spin_lock_init(&c->ec_stripes_new_lock);
mutex_init(&c->ec_stripes_heap_lock);
/* List of stripe heads and the mutex paired with it. */
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
/*
 * List of new stripes, its mutex, and the wait queue that
 * bch2_fs_ec_flush() sleeps on until the list is empty.
 */
INIT_LIST_HEAD(&c->ec_stripe_new_list);
mutex_init(&c->ec_stripe_new_lock);
init_waitqueue_head(&c->ec_stripe_new_wait);
/* Work items bound to the stripe create/delete handlers. */
INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
}
/*
 * Set up c->ec_bioset for erasure-coding I/O.
 *
 * Returns the result of bioset_init() unchanged (presumably 0 on success
 * and a negative errno on failure - confirm against bioset_init()).
 */
int bch2_fs_ec_init(struct bch_fs *c)
{
/*
 * NOTE(review): ec_stripes_new_lock is also initialised in
 * bch2_fs_ec_init_early(); if that always runs first, this second
 * init looks redundant - confirm before removing.
 */
spin_lock_init(&c->ec_stripes_new_lock);
/*
 * The third argument reserves offsetof(struct ec_bio, bio) bytes ahead
 * of each bio, so the bio is embedded inside a struct ec_bio.
 */
return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
BIOSET_NEED_BVECS);
}

View file

@ -245,8 +245,8 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
}
void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
void bch2_ec_flush_new_stripes(struct bch_fs *);
void bch2_fs_ec_stop(struct bch_fs *);
void bch2_fs_ec_flush(struct bch_fs *);
int bch2_stripes_read(struct bch_fs *);

View file

@ -705,7 +705,8 @@ static void bch2_write_done(struct closure *cl)
struct bch_fs *c = op->c;
bch2_disk_reservation_put(c, &op->res);
bch2_write_ref_put(c, BCH_WRITE_REF_write);
if (!(op->flags & BCH_WRITE_MOVE))
bch2_write_ref_put(c, BCH_WRITE_REF_write);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
@ -1842,7 +1843,12 @@ void bch2_write(struct closure *cl)
goto err;
}
if (c->opts.nochanges ||
if (c->opts.nochanges) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;
}
if (!(op->flags & BCH_WRITE_MOVE) &&
!bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
op->error = -BCH_ERR_erofs_no_writes;
goto err;

View file

@ -59,7 +59,6 @@ struct moving_io {
static void move_free(struct moving_io *io)
{
struct moving_context *ctxt = io->write.ctxt;
struct bch_fs *c = ctxt->c;
if (io->b)
atomic_dec(&io->b->count);
@ -71,7 +70,6 @@ static void move_free(struct moving_io *io)
wake_up(&ctxt->wait);
mutex_unlock(&ctxt->lock);
bch2_write_ref_put(c, BCH_WRITE_REF_move);
kfree(io);
}
@ -280,9 +278,6 @@ static int bch2_move_extent(struct btree_trans *trans,
return 0;
}
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move))
return -BCH_ERR_erofs_no_writes;
/*
* Before memory allocations & taking nocow locks in
* bch2_data_update_init():
@ -378,7 +373,6 @@ static int bch2_move_extent(struct btree_trans *trans,
err_free:
kfree(io);
err:
bch2_write_ref_put(c, BCH_WRITE_REF_move);
trace_and_count(c, move_extent_alloc_mem_fail, k.k);
return ret;
}

View file

@ -205,9 +205,12 @@ static void __bch2_fs_read_only(struct bch_fs *c)
unsigned i, clean_passes = 0;
u64 seq = 0;
bch2_fs_ec_stop(c);
bch2_open_buckets_stop(c, NULL, true);
bch2_rebalance_stop(c);
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
bch2_fs_ec_flush(c);
bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
journal_cur_seq(&c->journal));
@ -700,15 +703,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
INIT_LIST_HEAD(&c->fsck_errors);
mutex_init(&c->fsck_error_lock);
INIT_LIST_HEAD(&c->ec_stripe_head_list);
mutex_init(&c->ec_stripe_head_lock);
INIT_LIST_HEAD(&c->ec_stripe_new_list);
mutex_init(&c->ec_stripe_new_lock);
mutex_init(&c->ec_stripes_heap_lock);
seqcount_init(&c->gc_pos_lock);
seqcount_init(&c->usage_lock);