bcachefs: Run insert triggers before overwrite triggers

Currently, btree triggers are run in natural key order, which presents a
problem for fallocate in INSERT_RANGE mode: since we're moving existing
extents to higher offsets, the trigger for deleting the old extent runs
before the trigger that adds the new extent, potentially leading to
indirect extents being deleted that shouldn't be when the delete causes
the refcount to hit 0.

This changes the order we run triggers so that for a given btree, we run
all insert triggers before overwrite triggers, nicely sidestepping this
issue.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
This commit is contained in:
Kent Overstreet 2021-10-27 12:51:12 -04:00 committed by Kent Overstreet
parent c714614bd0
commit f0c3f88b35
4 changed files with 109 additions and 64 deletions

View file

@ -338,7 +338,8 @@ struct btree_insert_entry {
enum btree_id btree_id:8;
u8 level;
bool cached:1;
bool trans_triggers_run:1;
bool insert_trigger_run:1;
bool overwrite_trigger_run:1;
struct bkey_i *k;
struct btree_path *path;
unsigned long ip_allocated;

View file

@ -816,10 +816,112 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
return 0;
}
/*
 * Run the transactional triggers for every pending update in @trans.
 *
 * Updates are processed one btree ID at a time; within a btree, all insert
 * triggers are run before any overwrite trigger.
 *
 * Returns 0 on success, or the first nonzero value returned by
 * bch2_trans_mark_key(). A return of -EINTR is additionally traced via
 * trace_trans_restart_mark().
 */
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
/* Placeholder key standing in for "no key" on one side of a trigger call: */
struct bkey _deleted = KEY(0, 0, 0);
struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
struct bkey_s_c old;
struct bkey unpacked;
struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
bool trans_trigger_run;
unsigned btree_id = 0;
int ret = 0;
/*
 * For a given btree, this algorithm runs insert triggers before
 * overwrite triggers: this is so that when extents are being moved
 * (e.g. by FALLOC_FL_INSERT_RANGE), we don't drop references before
 * they are re-added.
 */
for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) {
/*
 * Skip forward past entries belonging to lower-numbered btrees.
 * NOTE(review): this assumes trans->updates is ordered by btree_id -
 * confirm against bch2_trans_update().
 */
while (btree_id_start < trans->updates + trans->nr_updates &&
btree_id_start->btree_id < btree_id)
btree_id_start++;
/*
 * Running triggers will append more updates to the list of updates as
 * we're walking it:
 */
/* Insert pass: rescan until a full pass runs no new trigger. */
do {
trans_trigger_run = false;
for (i = btree_id_start;
i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
i++) {
/*
 * Skip entries already handled, entries flagged NORUN, and
 * key types that have no transactional triggers.
 */
if (i->insert_trigger_run ||
(i->flags & BTREE_TRIGGER_NORUN) ||
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
continue;
BUG_ON(i->overwrite_trigger_run);
/* Mark as run before calling the trigger, so a rescan skips it. */
i->insert_trigger_run = true;
trans_trigger_run = true;
old = bch2_btree_path_peek_slot(i->path, &unpacked);
/* Give the placeholder key the position of this update's path. */
_deleted.p = i->path->pos;
if (old.k->type == i->k->k.type &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
/*
 * Same key type and the type wants old and new together:
 * one combined call covers both insert and overwrite, so
 * the overwrite pass below will skip this entry.
 */
i->overwrite_trigger_run = true;
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(i->k),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|i->flags);
} else {
/* Insert only: the old side is the placeholder key. */
ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(i->k),
BTREE_TRIGGER_INSERT|i->flags);
}
if (ret == -EINTR)
trace_trans_restart_mark(trans->ip, _RET_IP_,
i->btree_id, &i->path->pos);
if (ret)
return ret;
}
} while (trans_trigger_run);
/* Overwrite pass: same rescan-until-quiet structure as above. */
do {
trans_trigger_run = false;
for (i = btree_id_start;
i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
i++) {
if (i->overwrite_trigger_run ||
(i->flags & BTREE_TRIGGER_NORUN) ||
!(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)))
continue;
/* Every entry reaching here must already have had its insert trigger run. */
BUG_ON(!i->insert_trigger_run);
i->overwrite_trigger_run = true;
trans_trigger_run = true;
old = bch2_btree_path_peek_slot(i->path, &unpacked);
_deleted.p = i->path->pos;
/* Overwrite only: the new side is the placeholder key. */
ret = bch2_trans_mark_key(trans, old, deleted,
BTREE_TRIGGER_OVERWRITE|i->flags);
if (ret == -EINTR)
trace_trans_restart_mark(trans->ip, _RET_IP_,
i->btree_id, &i->path->pos);
if (ret)
return ret;
}
} while (trans_trigger_run);
}
/*
 * Sanity check: every update that is eligible for triggers must have had
 * both its insert and its overwrite trigger run.
 */
trans_for_each_update(trans, i)
BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) &&
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
(!i->insert_trigger_run || !i->overwrite_trigger_run));
return 0;
}
int __bch2_trans_commit(struct btree_trans *trans)
{
struct btree_insert_entry *i = NULL;
bool trans_trigger_run;
unsigned u64s;
int ret = 0;
@ -854,30 +956,9 @@ int __bch2_trans_commit(struct btree_trans *trans)
i->btree_id, i->k->k.p);
#endif
/*
* Running triggers will append more updates to the list of updates as
* we're walking it:
*/
do {
trans_trigger_run = false;
trans_for_each_update(trans, i) {
if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
!i->trans_triggers_run) {
i->trans_triggers_run = true;
trans_trigger_run = true;
ret = bch2_trans_mark_update(trans, i->path,
i->k, i->flags);
if (unlikely(ret)) {
if (ret == -EINTR)
trace_trans_restart_mark(trans->ip, _RET_IP_,
i->btree_id, &i->path->pos);
goto out;
}
}
}
} while (trans_trigger_run);
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
goto out;
trans_for_each_update(trans, i) {
BUG_ON(!i->path->should_be_locked);
@ -1297,7 +1378,7 @@ int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
if (i < trans->updates + trans->nr_updates &&
!btree_insert_entry_cmp(&n, i)) {
BUG_ON(i->trans_triggers_run);
BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
/*
* This is a hack to ensure that inode creates update the btree,

View file

@ -1882,41 +1882,6 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c old,
}
}
/*
 * Run the transactional trigger(s) for a single update of @new at @path.
 *
 * Returns 0 without doing anything if @flags contains BTREE_TRIGGER_NORUN or
 * if btree_node_type_needs_gc() is false for the path's btree; otherwise
 * returns the result of bch2_trans_mark_key().
 */
int bch2_trans_mark_update(struct btree_trans *trans,
struct btree_path *path,
struct bkey_i *new,
unsigned flags)
{
/* Placeholder key standing in for "no key" on one side of a trigger call: */
struct bkey _deleted = KEY(0, 0, 0);
struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
struct bkey_s_c old;
struct bkey unpacked;
int ret;
/* Give the placeholder key the position of the path being updated. */
_deleted.p = path->pos;
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
if (!btree_node_type_needs_gc(path->btree_id))
return 0;
/* The key currently in the slot that @new will replace: */
old = bch2_btree_path_peek_slot(path, &unpacked);
if (old.k->type == new->k.type &&
((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
/* Same key type and the type wants old and new together: one combined call. */
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
/*
 * Insert trigger first; the overwrite trigger runs only if the
 * insert trigger returned 0 (GNU "a ?: b" evaluates b only when a
 * is zero).
 */
ret = bch2_trans_mark_key(trans, deleted, bkey_i_to_s_c(new),
BTREE_TRIGGER_INSERT|flags) ?:
bch2_trans_mark_key(trans, old, deleted,
BTREE_TRIGGER_OVERWRITE|flags);
}
return ret;
}
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
enum bch_data_type type,

View file

@ -233,8 +233,6 @@ int bch2_mark_update(struct btree_trans *, struct btree_path *,
int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c,
struct bkey_s_c, unsigned);
int bch2_trans_mark_update(struct btree_trans *, struct btree_path *,
struct bkey_i *, unsigned);
void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,