bcachefs: Don't block on journal reservation with btree locks held

Fixes a deadlock between the allocator thread, when it first starts up,
and journal replay

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2018-11-18 21:35:59 -05:00 committed by Kent Overstreet
parent 129550c4d0
commit f1a79365a7
4 changed files with 108 additions and 59 deletions

View file

@ -638,12 +638,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
int ret;
ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
if (ret < 0)
goto err;
if (!ret) {
if (ret == -EAGAIN) {
continue_at(cl, btree_update_wait_on_journal, system_wq);
return;
}
if (ret < 0)
goto err;
bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
err:

View file

@ -344,19 +344,35 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
trans_for_each_entry(trans, i)
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s);
while ((ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
struct btree_iter *iter = trans->entries[0].iter;
bch2_btree_iter_unlock(iter);
ret = bch2_journal_res_get(&c->journal,
&trans->journal_res, u64s,
JOURNAL_RES_GET_CHECK);
if (ret)
return ret;
if (!bch2_btree_iter_relock(iter)) {
trans_restart(" (iter relock after journal res get blocked)");
return -EINTR;
}
}
if (ret)
return ret;
}
multi_lock_write(c, trans);

View file

@ -335,15 +335,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
}
static int __journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf;
int ret;
retry:
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret)
return ret;
if (journal_res_get_fast(j, res, flags))
return 0;
spin_lock(&j->lock);
/*
@ -351,10 +350,9 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* that just did journal_entry_open() and call journal_entry_close()
* unnecessarily
*/
ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
if (ret) {
if (journal_res_get_fast(j, res, flags)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
/*
@ -377,7 +375,12 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
/*
* The current journal entry is still open, but we failed to get
* a journal reservation because there's not enough space in it,
* and we can't close it and start another because we haven't
* finished writing out the previous entry:
*/
spin_unlock(&j->lock);
trace_journal_entry_full(c);
goto blocked;
@ -408,7 +411,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return 0;
return -EAGAIN;
}
/*
@ -422,14 +425,14 @@ static int __journal_res_get(struct journal *j, struct journal_res *res,
* btree node write locks.
*/
int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned flags)
{
int ret;
wait_event(j->wait,
(ret = __journal_res_get(j, res, u64s_min,
u64s_max)));
return ret < 0 ? ret : 0;
(ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
(flags & JOURNAL_RES_GET_NONBLOCK));
return ret;
}
u64 bch2_journal_last_unwritten_seq(struct journal *j)
@ -453,28 +456,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
* btree root - every journal entry contains the roots of all the btrees, so it
* doesn't need to bother with getting a journal reservation
*/
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
{
int ret;
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool need_reclaim = false;
retry:
spin_lock(&j->lock);
BUG_ON(seq > journal_cur_seq(j));
if (seq < journal_cur_seq(j) ||
journal_entry_is_open(j)) {
spin_unlock(&j->lock);
return 1;
return 0;
}
if (journal_cur_seq(j) < seq) {
switch (journal_buf_switch(j, false)) {
case JOURNAL_ENTRY_ERROR:
spin_unlock(&j->lock);
return -EROFS;
case JOURNAL_ENTRY_INUSE:
/* haven't finished writing out the previous one: */
trace_journal_entry_full(c);
goto blocked;
case JOURNAL_ENTRY_CLOSED:
break;
case JOURNAL_UNLOCKED:
goto retry;
}
}
BUG_ON(journal_cur_seq(j) < seq);
if (!journal_entry_open(j)) {
need_reclaim = true;
goto blocked;
}
ret = journal_entry_open(j);
if (!ret)
closure_wait(&j->async_wait, parent);
spin_unlock(&j->lock);
if (!ret)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return 0;
blocked:
if (!j->res_get_blocked_start)
j->res_get_blocked_start = local_clock() ?: 1;
return ret;
closure_wait(&j->async_wait, cl);
spin_unlock(&j->lock);
if (need_reclaim)
bch2_journal_reclaim_work(&j->reclaim_work.work);
return -EAGAIN;
}
static int journal_seq_error(struct journal *j, u64 seq)
@ -594,11 +624,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
void bch2_journal_meta_async(struct journal *j, struct closure *parent)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
memset(&res, 0, sizeof(res));
bch2_journal_res_get(j, &res, u64s, u64s);
bch2_journal_res_get(j, &res, jset_u64s(0), 0);
bch2_journal_res_put(j, &res);
bch2_journal_flush_seq_async(j, res.seq, parent);
@ -607,12 +636,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
int bch2_journal_meta(struct journal *j)
{
struct journal_res res;
unsigned u64s = jset_u64s(0);
int ret;
memset(&res, 0, sizeof(res));
ret = bch2_journal_res_get(j, &res, u64s, u64s);
ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
if (ret)
return ret;

View file

@ -272,12 +272,14 @@ static inline void bch2_journal_res_put(struct journal *j,
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned, unsigned);
unsigned);
#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
#define JOURNAL_RES_GET_CHECK (1 << 1)
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
unsigned u64s_min,
unsigned u64s_max)
unsigned flags)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
@ -289,42 +291,45 @@ static inline int journal_res_get_fast(struct journal *j,
* Check if there is still room in the current journal
* entry:
*/
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
return 0;
res->offset = old.cur_entry_offset;
res->u64s = min(u64s_max, j->cur_entry_u64s -
old.cur_entry_offset);
if (flags & JOURNAL_RES_GET_CHECK)
return 1;
journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
journal_state_inc(&new);
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
res->ref = true;
res->idx = new.idx;
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
res->ref = true;
res->idx = old.idx;
res->offset = old.cur_entry_offset;
res->seq = le64_to_cpu(j->buf[old.idx].data->seq);
return 1;
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
unsigned u64s, unsigned flags)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(u64s_max < u64s_min);
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
res->u64s = u64s;
if (journal_res_get_fast(j, res, flags))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
ret = bch2_journal_res_get_slowpath(j, res, flags);
if (ret)
return ret;
out:
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
EBUG_ON(!res->ref);
if (!(flags & JOURNAL_RES_GET_CHECK)) {
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
EBUG_ON(!res->ref);
}
return 0;
}