bcachefs: Better approach to write vs. read lock deadlocks

Instead of unconditionally upgrading read locks to intent locks in
do_bch2_trans_commit(), this patch changes the path that takes write
locks to first trylock, and then if trylock fails check if we have a
conflicting read lock, and restart the transaction if necessary.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
This commit is contained in:
Kent Overstreet 2021-09-07 21:25:32 -04:00 committed by Kent Overstreet
parent b301105b48
commit caaa66aa54
2 changed files with 82 additions and 41 deletions

View file

@@ -545,6 +545,54 @@ static inline void normalize_read_intent_locks(struct btree_trans *trans)
bch2_trans_verify_locks(trans);
}
/*
 * Scan the transaction's paths in order and report whether any of them
 * holds locks that are not intent locks (i.e. read locks) — such a path
 * could deadlock against another thread when we go to take write locks.
 */
static inline bool have_conflicting_read_lock(struct btree_trans *trans, struct btree_path *pos)
{
	struct btree_path *p;
	unsigned idx;

	trans_for_each_path_inorder(trans, p, idx) {
		//if (p == pos)
		//	break;

		if (p->nodes_locked != p->nodes_intent_locked)
			return true;
	}

	return false;
}
/*
 * Take write locks on the leaf node of every update in the transaction.
 *
 * For each update (skipping entries that share a leaf with the previous
 * one), first trylock the node's write lock; if the trylock fails and some
 * path in this transaction still holds a read lock, restart the
 * transaction rather than blocking — another thread holding an intent lock
 * on this node would block on our read lock while we block on its write
 * lock. Only block for the lock when no conflicting read lock is held.
 *
 * Returns 0 on success, or the transaction-restart error from
 * btree_trans_restart() on a would-deadlock condition.
 */
static inline int trans_lock_write(struct btree_trans *trans)
{
	struct btree_insert_entry *i;

	trans_for_each_update(trans, i) {
		if (same_leaf_as_prev(trans, i))
			continue;

		if (!six_trylock_write(&insert_l(i)->b->c.lock)) {
			if (have_conflicting_read_lock(trans, i->path))
				goto fail;

			/* No conflicting read locks held: safe to block here */
			__btree_node_lock_type(trans->c, insert_l(i)->b,
					       SIX_LOCK_write);
		}

		bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
	}

	return 0;
fail:
	/*
	 * i points at the entry whose trylock failed; walk backwards and
	 * drop the write locks taken so far, in reverse order:
	 */
	while (--i >= trans->updates) {
		if (same_leaf_as_prev(trans, i))
			continue;

		bch2_btree_node_unlock_write_inlined(trans, i->path, insert_l(i)->b);
	}

	trace_trans_restart_would_deadlock_write(trans->ip);
	return btree_trans_restart(trans);
}
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -554,10 +602,25 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_path *path;
struct bkey_s_c old;
int ret, u64s_delta = 0;
trans_for_each_update(trans, i) {
const char *invalid = bch2_bkey_invalid(c,
bkey_i_to_s_c(i->k), i->bkey_type);
if (invalid) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
buf, (void *) trans->ip,
(void *) i->ip_allocated, invalid);
bch2_fatal_error(c);
return -EINVAL;
}
btree_insert_entry_checks(trans, i);
}
trans_for_each_update(trans, i) {
struct bkey u;
@@ -599,48 +662,11 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
if (unlikely(ret))
return ret;
/*
* Can't be holding any read locks when we go to take write locks:
* another thread could be holding an intent lock on the same node we
* have a read lock on, and it'll block trying to take a write lock
* (because we hold a read lock) and it could be blocking us by holding
* its own read lock (while we're trying to take write locks).
*
* note - this must be done after bch2_trans_journal_preres_get_cold()
* or anything else that might call bch2_trans_relock(), since that
* would just retake the read locks:
*/
trans_for_each_path(trans, path)
if (path->nodes_locked != path->nodes_intent_locked &&
!bch2_btree_path_upgrade(trans, path, path->level + 1)) {
trace_trans_restart_upgrade(trans->ip, trace_ip,
path->btree_id, &path->pos);
return btree_trans_restart(trans);
}
trans_for_each_update(trans, i) {
const char *invalid = bch2_bkey_invalid(c,
bkey_i_to_s_c(i->k), i->bkey_type);
if (invalid) {
char buf[200];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
bch_err(c, "invalid bkey %s on insert from %ps -> %ps: %s\n",
buf, (void *) trans->ip,
(void *) i->ip_allocated, invalid);
bch2_fatal_error(c);
return -EINVAL;
}
btree_insert_entry_checks(trans, i);
}
normalize_read_intent_locks(trans);
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i)) {
btree_node_lock_type(c, insert_l(i)->b, SIX_LOCK_write);
bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
}
ret = trans_lock_write(trans);
if (unlikely(ret))
return ret;
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);

View file

@@ -756,6 +756,21 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_pos_snapshot)
);
/*
 * Tracepoint fired when a transaction is restarted because taking write
 * locks would deadlock against a read lock still held by the transaction.
 * Records only the transaction's instruction pointer, printed as a symbol.
 */
TRACE_EVENT(trans_restart_would_deadlock_write,
	TP_PROTO(unsigned long trans_ip),
	TP_ARGS(trans_ip),

	TP_STRUCT__entry(
		__field(unsigned long,	trans_ip	)
	),

	TP_fast_assign(
		__entry->trans_ip = trans_ip;
	),

	TP_printk("%ps", (void *) __entry->trans_ip)
);
TRACE_EVENT(trans_restart_mem_realloced,
TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
unsigned long bytes),