bcachefs: Subvol dirents are now only visible in parent subvol

This changes the on disk format for dirents that point to subvols so
that they also record the subvolid of the parent subvol, so that we can
filter them out in other subvolumes.

This also updates the dirent code to do that filtering, and in
particular tweaks the rename code - we need to ensure that there's only
ever one dirent (counting multiplicities in different snapshots) that
points to a subvolume.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
This commit is contained in:
Kent Overstreet 2021-10-12 12:06:02 -04:00 committed by Kent Overstreet
parent e5fa91d7ac
commit 4db650277d
6 changed files with 233 additions and 130 deletions

View file

@ -793,7 +793,13 @@ struct bch_dirent {
struct bch_val v;
/* Target inode number: */
union {
__le64 d_inum;
struct { /* DT_SUBVOL */
__le32 d_child_subvol;
__le32 d_parent_subvol;
};
};
/*
* Copy of mode bits 12-15 from the target inode - so userspace can get
@ -1268,7 +1274,8 @@ enum bcachefs_metadata_version {
bcachefs_metadata_version_btree_ptr_sectors_written = 14,
bcachefs_metadata_version_snapshot_2 = 15,
bcachefs_metadata_version_reflink_p_fix = 16,
bcachefs_metadata_version_max = 17,
bcachefs_metadata_version_subvol_dirent = 17,
bcachefs_metadata_version_max = 18,
};
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)

View file

@ -64,6 +64,15 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}
/*
 * Visibility filter for dirent keys.
 *
 * A DT_SUBVOL dirent is visible only when its recorded d_parent_subvol equals
 * the subvolume being searched (@inum.subvol); a dirent of any other type is
 * always visible.
 */
static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
{
	struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);

	return dirent.v->d_type != DT_SUBVOL ||
	       le32_to_cpu(dirent.v->d_parent_subvol) == inum.subvol;
}
const struct bch_hash_desc bch2_dirent_hash_desc = {
.btree_id = BTREE_ID_dirents,
.key_type = KEY_TYPE_dirent,
@ -71,6 +80,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.hash_bkey = dirent_hash_bkey,
.cmp_key = dirent_cmp_key,
.cmp_bkey = dirent_cmp_bkey,
.is_visible = dirent_is_visible,
};
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
@ -114,14 +124,18 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
bch_scnmemcpy(out, d.v->d_name,
bch2_dirent_name_bytes(d));
pr_buf(out, " -> %llu type %s", d.v->d_inum,
pr_buf(out, " -> %llu type %s",
d.v->d_type != DT_SUBVOL
? le64_to_cpu(d.v->d_inum)
: le32_to_cpu(d.v->d_child_subvol),
d.v->d_type < BCH_DT_MAX
? bch2_d_types[d.v->d_type]
: "(bad d_type)");
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
u8 type, const struct qstr *name, u64 dst)
subvol_inum dir, u8 type,
const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
@ -137,7 +151,14 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
dirent->v.d_inum = cpu_to_le64(dst);
if (type != DT_SUBVOL) {
dirent->v.d_inum = cpu_to_le64(dst);
} else {
dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol);
dirent->v.d_child_subvol = cpu_to_le32(dst);
}
dirent->v.d_type = type;
memcpy(dirent->v.d_name, name->name, name->len);
@ -159,7 +180,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
struct bkey_i_dirent *dirent;
int ret;
dirent = dirent_create_key(trans, type, name, dst_inum);
dirent = dirent_create_key(trans, dir, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
@ -178,45 +199,30 @@ static void dirent_copy_target(struct bkey_i_dirent *dst,
dst->v.d_type = src.v->d_type;
}
/*
 * NOTE(review): this span appears to be a diff hunk whose +/- markers were
 * lost: lines writing through *subvol / *snapshot / *inum and reading is_fsck
 * belong to the __bch2_dirent_read_target() signature, while lines writing
 * through target-> belong to the bch2_dirent_read_target() signature. Read it
 * as a before/after record rather than as one compilable function.
 */
int __bch2_dirent_read_target(struct btree_trans *trans,
struct bkey_s_c_dirent d,
u32 *subvol, u32 *snapshot, u64 *inum,
bool is_fsck)
/*
 * Resolve the target of dirent @d as seen from directory @dir into *@target.
 *
 * In the target-> lines below: a non-DT_SUBVOL dirent yields
 * { dir.subvol, d_inum }; a DT_SUBVOL dirent yields d_child_subvol plus the
 * inode number read from that subvolume via bch2_subvolume_get(). Returns 1
 * when a DT_SUBVOL dirent's d_parent_subvol differs from dir.subvol, otherwise
 * the bch2_subvolume_get() result (0 when that call is not made).
 */
static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
struct bch_subvolume s;
int ret = 0;
*subvol = 0;
*snapshot = d.k->p.snapshot;
/*
 * NOTE(review): d_parent_subvol is declared __le32 in struct bch_dirent, yet
 * it is compared with the CPU-endian dir.subvol here without le32_to_cpu(),
 * whereas dirent_is_visible() converts it first. Looks like this test
 * misfires on big-endian hosts - confirm and wrap it in le32_to_cpu().
 */
if (d.v->d_type == DT_SUBVOL &&
d.v->d_parent_subvol != dir.subvol)
return 1;
if (likely(d.v->d_type != DT_SUBVOL)) {
*inum = le64_to_cpu(d.v->d_inum);
target->subvol = dir.subvol;
target->inum = le64_to_cpu(d.v->d_inum);
} else {
*subvol = le64_to_cpu(d.v->d_inum);
target->subvol = le32_to_cpu(d.v->d_child_subvol);
ret = bch2_subvolume_get(trans, *subvol, !is_fsck, BTREE_ITER_CACHED, &s);
ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s);
/*
 * NOTE(review): s is read below without checking ret first; presumably
 * callers discard the outputs on error - confirm s is never consumed
 * uninitialized.
 */
*snapshot = le32_to_cpu(s.snapshot);
*inum = le64_to_cpu(s.inode);
target->inum = le64_to_cpu(s.inode);
}
return ret;
}
/*
 * Wrapper form of bch2_dirent_read_target(): calls
 * __bch2_dirent_read_target() with is_fsck == false and substitutes
 * dir.subvol when the helper left target->subvol at 0.
 */
static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
u32 snapshot;
int ret = 0;
ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot,
&target->inum, false);
if (!target->subvol)
target->subvol = dir.subvol;
return ret;
}
int bch2_dirent_rename(struct btree_trans *trans,
subvol_inum src_dir, struct bch_hash_info *src_hash,
subvol_inum dst_dir, struct bch_hash_info *dst_hash,
@ -230,6 +236,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos =
POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
int ret = 0;
if (src_dir.subvol != dst_dir.subvol)
@ -238,36 +245,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
memset(src_inum, 0, sizeof(*src_inum));
memset(dst_inum, 0, sizeof(*dst_inum));
/*
* Lookup dst:
*
* Note that in BCH_RENAME mode, we're _not_ checking if
* the target already exists - we're relying on the VFS
* to do that check for us for correctness:
*/
ret = mode == BCH_RENAME
? bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name)
: bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name,
BTREE_ITER_INTENT);
if (ret)
goto out;
old_dst = bch2_btree_iter_peek_slot(&dst_iter);
ret = bkey_err(old_dst);
if (ret)
goto out;
if (mode != BCH_RENAME) {
ret = bch2_dirent_read_target(trans, dst_dir,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
}
if (mode != BCH_RENAME_EXCHANGE)
*src_offset = dst_iter.pos.offset;
/* Lookup src: */
ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
src_hash, src_dir, src_name,
@ -285,8 +262,51 @@ int bch2_dirent_rename(struct btree_trans *trans,
if (ret)
goto out;
src_type = bkey_s_c_to_dirent(old_src).v->d_type;
if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
return -EOPNOTSUPP;
/* Lookup dst: */
if (mode == BCH_RENAME) {
/*
* Note that we're _not_ checking if the target already exists -
* we're relying on the VFS to do that check for us for
* correctness:
*/
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name);
if (ret)
goto out;
} else {
ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
dst_hash, dst_dir, dst_name,
BTREE_ITER_INTENT);
if (ret)
goto out;
old_dst = bch2_btree_iter_peek_slot(&dst_iter);
ret = bkey_err(old_dst);
if (ret)
goto out;
ret = bch2_dirent_read_target(trans, dst_dir,
bkey_s_c_to_dirent(old_dst), dst_inum);
if (ret)
goto out;
dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
if (dst_type == DT_SUBVOL)
return -EOPNOTSUPP;
}
if (mode != BCH_RENAME_EXCHANGE)
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
@ -296,7 +316,7 @@ int bch2_dirent_rename(struct btree_trans *trans,
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(trans, 0, src_name, 0);
new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
@ -326,10 +346,9 @@ int bch2_dirent_rename(struct btree_trans *trans,
* If we're not overwriting, we can just insert
* new_dst at the src position:
*/
new_dst->k.p = src_iter.pos;
bch2_trans_update(trans, &src_iter,
&new_dst->k_i, 0);
goto out_set_offset;
new_src = new_dst;
new_src->k.p = src_iter.pos;
goto out_set_src;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
@ -350,9 +369,25 @@ int bch2_dirent_rename(struct btree_trans *trans,
}
}
bch2_trans_update(trans, &src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
out_set_offset:
out_set_src:
/*
* If we're deleting a subvolume, we need to really delete the dirent,
* not just emit a whiteout in the current snapshot:
*/
if (src_type == DT_SUBVOL) {
bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
ret = bch2_btree_iter_traverse(&src_iter);
if (ret)
goto out;
new_src->k.p = src_iter.pos;
src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
}
bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
if (mode == BCH_RENAME_EXCHANGE)
*src_offset = new_src->k.p.offset;
*dst_offset = new_dst->k.p.offset;
@ -393,6 +428,8 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans,
d = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(trans, dir, d, inum);
if (ret > 0)
ret = -ENOENT;
if (ret)
bch2_trans_iter_exit(trans, iter);
@ -453,6 +490,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
subvol_inum target;
u32 snapshot;
int ret;
@ -474,6 +512,12 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
dirent = bkey_s_c_to_dirent(k);
ret = bch2_dirent_read_target(&trans, inum, dirent, &target);
if (ret < 0)
break;
if (ret)
continue;
/*
* XXX: dir_emit() can fault and block, while we're holding
* locks
@ -481,7 +525,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
ctx->pos = dirent.k->p.offset;
if (!dir_emit(ctx, dirent.v->d_name,
bch2_dirent_name_bytes(dirent),
le64_to_cpu(dirent.v->d_inum),
target.inum,
vfs_d_type(dirent.v->d_type)))
break;
ctx->pos = dirent.k->p.offset + 1;

View file

@ -33,9 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *, int);
int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent,
u32 *, u32 *, u64 *, bool);
static inline unsigned vfs_d_type(unsigned type)
{
return type == DT_SUBVOL ? DT_DIR : type;

View file

@ -134,10 +134,11 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (ret)
goto err;
*snapshot = iter.pos.snapshot;
ret = k.k->type == KEY_TYPE_inode
? bch2_inode_unpack(bkey_s_c_to_inode(k), inode)
: -ENOENT;
if (!ret)
*snapshot = iter.pos.snapshot;
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -1045,46 +1046,60 @@ static int fix_overlapping_extent(struct btree_trans *trans,
}
#endif
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos pos)
{
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0);
k = bch2_btree_iter_peek_slot(iter);
ret = bkey_err(k);
if (!ret && k.k->type != KEY_TYPE_dirent)
ret = -ENOENT;
if (ret) {
bch2_trans_iter_exit(trans, iter);
return (struct bkey_s_c_dirent) { .k = ERR_PTR(ret) };
}
return bkey_s_c_to_dirent(k);
}
/*
 * True when @inode's back pointer (bi_dir, bi_dir_offset) names the position
 * of dirent @d.
 */
static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
				   struct bkey_s_c_dirent d)
{
	if (inode->bi_dir != d.k->p.inode)
		return false;

	return inode->bi_dir_offset == d.k->p.offset;
}
/*
 * True when dirent @d refers to @inode: a DT_SUBVOL dirent is matched by
 * child subvolume ID against bi_subvol, every other type by inode number
 * against bi_inum.
 */
static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
				   struct bch_inode_unpacked *inode)
{
	if (d.v->d_type == DT_SUBVOL)
		return le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol;

	return le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
static int inode_backpointer_exists(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c k;
u32 target_subvol, target_snapshot;
u64 target_inum;
struct bkey_s_c_dirent d;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents,
SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot), 0);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
d = dirent_get_by_pos(trans, &iter,
SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
ret = bkey_err(d.s_c);
if (ret)
goto out;
if (k.k->type != KEY_TYPE_dirent)
goto out;
return ret;
ret = __bch2_dirent_read_target(trans, bkey_s_c_to_dirent(k),
&target_subvol,
&target_snapshot,
&target_inum,
true);
if (ret)
goto out;
ret = target_inum == inode->bi_inum;
out:
ret = dirent_points_to_inode(d, inode);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
/*
 * True when the position of dirent @d equals the (bi_dir, bi_dir_offset)
 * back pointer stored in @inode.
 */
static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
				      struct bch_inode_unpacked *inode)
{
	if (d.k->p.inode != inode->bi_dir)
		return false;

	return d.k->p.offset == inode->bi_dir_offset;
}
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
@ -1326,7 +1341,7 @@ static int check_dirent_target(struct btree_trans *trans,
goto err;
}
if (!inode_backpointer_matches(d, target)) {
if (!inode_points_to_dirent(target, d)) {
ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
if (ret < 0)
goto err;
@ -1394,8 +1409,34 @@ static int check_dirent_target(struct btree_trans *trans,
BTREE_INSERT_LAZY_RW,
bch2_trans_update(trans, iter, &n->k_i, 0));
kfree(n);
if (ret)
return ret ?: -EINTR;
}
if (d.v->d_type == DT_SUBVOL &&
target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
(c->sb.version < bcachefs_metadata_version_subvol_dirent ||
fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u",
le32_to_cpu(d.v->d_parent_subvol),
target->bi_parent_subvol))) {
struct bkey_i_dirent *n;
n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto err;
}
bkey_reassemble(&n->k_i, d.s_c);
n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
ret = __bch2_trans_do(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW,
bch2_trans_update(trans, iter, &n->k_i, 0));
kfree(n);
return ret ?: -EINTR;
}
err:
fsck_err:
@ -1412,9 +1453,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_s_c k;
struct bkey_s_c_dirent d;
struct inode_walker_entry *i;
u32 target_snapshot;
u32 target_subvol;
u64 target_inum;
char buf[200];
int ret;
@ -1482,21 +1520,21 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
d = bkey_s_c_to_dirent(k);
ret = __bch2_dirent_read_target(trans, d,
&target_subvol,
&target_snapshot,
&target_inum,
true);
if (ret && ret != -ENOENT)
return ret;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
le64_to_cpu(d.v->d_inum)))
return remove_dirent(trans, d.k->p);
if (target_subvol) {
if (d.v->d_type == DT_SUBVOL) {
struct bch_inode_unpacked subvol_root;
u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
u32 target_snapshot;
u64 target_inum;
ret = __subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && ret != -ENOENT)
return ret;
if (fsck_err_on(ret, c,
"dirent points to missing subvolume %llu",
le64_to_cpu(d.v->d_child_subvol)))
return remove_dirent(trans, d.k->p);
ret = __lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
@ -1526,7 +1564,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
return ret;
} else {
ret = __get_visible_inodes(trans, target, s, target_inum);
ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum));
if (ret)
return ret;
@ -1786,9 +1824,11 @@ static int check_path(struct btree_trans *trans,
while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
u32 parent_snapshot = snapshot;
if (inode->bi_parent_subvol) {
if (inode->bi_subvol) {
u64 inum;
ret = subvol_lookup(trans, inode->bi_parent_subvol,
@ -1798,11 +1838,18 @@ static int check_path(struct btree_trans *trans,
}
ret = lockrestart_do(trans,
inode_backpointer_exists(trans, inode, parent_snapshot));
if (ret < 0)
PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
SPOS(inode->bi_dir, inode->bi_dir_offset,
parent_snapshot))).k));
if (ret && ret != -ENOENT)
break;
if (!ret) {
if (!ret && !dirent_points_to_inode(d, inode)) {
bch2_trans_iter_exit(trans, &dirent_iter);
ret = -ENOENT;
}
if (ret == -ENOENT) {
if (fsck_err(c, "unreachable inode %llu:%u, type %u nlink %u backptr %llu:%llu",
inode->bi_inum, snapshot,
mode_to_type(inode->bi_mode),
@ -1812,7 +1859,8 @@ static int check_path(struct btree_trans *trans,
ret = reattach_inode(trans, inode, snapshot);
break;
}
ret = 0;
bch2_trans_iter_exit(trans, &dirent_iter);
if (!S_ISDIR(inode->bi_mode))
break;

View file

@ -1086,8 +1086,8 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.version_upgrade = true;
c->opts.fsck = true;
c->opts.fix_errors = FSCK_OPT_YES;
} else if (c->sb.version < bcachefs_metadata_version_reflink_p_fix) {
bch_info(c, "filesystem version is prior to reflink_p fix - upgrading");
} else if (c->sb.version < bcachefs_metadata_version_subvol_dirent) {
bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
c->opts.version_upgrade = true;
c->opts.fsck = true;
}

View file

@ -138,8 +138,15 @@ struct bch_hash_desc {
u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
bool (*cmp_key)(struct bkey_s_c, const void *);
bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
bool (*is_visible)(subvol_inum inum, struct bkey_s_c);
};
/*
 * True when @k has the key type described by @desc and passes @desc's
 * optional is_visible() filter for @inum. A descriptor with no filter
 * installed accepts every key of the right type.
 */
static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k)
{
	if (k.k->type != desc.key_type)
		return false;

	if (!desc.is_visible)
		return true;

	return desc.is_visible(inum, k);
}
static __always_inline int
bch2_hash_lookup(struct btree_trans *trans,
struct btree_iter *iter,
@ -162,7 +169,7 @@ bch2_hash_lookup(struct btree_trans *trans,
if (iter->pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_key(k, key))
return 0;
} else if (k.k->type == KEY_TYPE_hash_whiteout) {
@ -198,7 +205,7 @@ bch2_hash_hole(struct btree_trans *trans,
if (iter->pos.inode != inum.inum)
break;
if (k.k->type != desc.key_type)
if (!is_visible_key(desc, inum, k))
return 0;
}
bch2_trans_iter_exit(trans, iter);
@ -261,7 +268,7 @@ int bch2_hash_set(struct btree_trans *trans,
if (iter.pos.inode != inum.inum)
break;
if (k.k->type == desc.key_type) {
if (is_visible_key(desc, inum, k)) {
if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
goto found;