bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2017-03-16 22:18:50 -08:00 committed by Kent Overstreet
parent 0d29a833b7
commit 1c6fdbd8f2
122 changed files with 57147 additions and 0 deletions

fs/Kconfig

@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig"
source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig"
endif # BLOCK

fs/Makefile

@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/

fs/bcachefs/Kconfig (new file, 52 lines)

@@ -0,0 +1,52 @@
config BCACHEFS_FS
tristate "bcachefs filesystem support"
depends on BLOCK
select EXPORTFS
select CLOSURES
select LIBCRC32C
select FS_POSIX_ACL
select LZ4_COMPRESS
select LZ4_DECOMPRESS
select ZLIB_DEFLATE
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
select CRYPTO_SHA256
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
select KEYS
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
config BCACHEFS_QUOTA
bool "bcachefs quota support"
depends on BCACHEFS_FS
select QUOTACTL
config BCACHEFS_POSIX_ACL
bool "bcachefs POSIX ACL support"
depends on BCACHEFS_FS
select FS_POSIX_ACL
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
help
Enables many extra debugging checks and assertions.
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
help
Include some unit and performance tests for the core btree code
config BCACHEFS_NO_LATENCY_ACCT
bool "disable latency accounting and time stats"
depends on BCACHEFS_FS
help
This disables device latency tracking and time stats, only for performance testing

fs/bcachefs/Makefile (new file, 53 lines)

@@ -0,0 +1,53 @@
obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
acl.o \
alloc.o \
bkey.o \
bkey_methods.o \
bset.o \
btree_cache.o \
btree_gc.o \
btree_io.o \
btree_iter.o \
btree_update_interior.o \
btree_update_leaf.o \
buckets.o \
chardev.o \
checksum.o \
clock.o \
compress.o \
debug.o \
dirent.o \
disk_groups.o \
error.o \
extents.o \
fs.o \
fs-ioctl.o \
fs-io.o \
fsck.o \
inode.o \
io.o \
journal.o \
journal_io.o \
journal_reclaim.o \
journal_seq_blacklist.o \
keylist.o \
migrate.o \
move.o \
movinggc.o \
opts.o \
quota.o \
rebalance.o \
recovery.o \
replicas.o \
siphash.o \
six.o \
super.o \
super-io.o \
sysfs.o \
tests.o \
trace.o \
util.o \
xattr.o

fs/bcachefs/acl.c (new file, 387 lines)

@@ -0,0 +1,387 @@
// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#include "bcachefs.h"
#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include "acl.h"
#include "fs.h"
#include "xattr.h"
static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
return sizeof(bch_acl_header) +
sizeof(bch_acl_entry_short) * nr_short +
sizeof(bch_acl_entry) * nr_long;
}
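/*
 * Illustrative example (hypothetical, not part of this file): an access ACL
 * with one named user carries the four short entries (ACL_USER_OBJ,
 * ACL_GROUP_OBJ, ACL_MASK, ACL_OTHER) plus one long entry, so it packs to
 * 4 + 4 * 4 + 1 * 8 = 28 bytes on disk:
 */
#if 0	/* example only */
static void bch2_acl_size_example(void)
{
	BUG_ON(bch2_acl_size(4, 1) !=
	       sizeof(bch_acl_header) +
	       4 * sizeof(bch_acl_entry_short) +
	       1 * sizeof(bch_acl_entry));		/* == 28 */
}
#endif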
static inline int acl_to_xattr_type(int type)
{
switch (type) {
case ACL_TYPE_ACCESS:
return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
case ACL_TYPE_DEFAULT:
return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
default:
BUG();
}
}
/*
* Convert from filesystem to in-memory representation.
*/
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
const void *p, *end = value + size;
struct posix_acl *acl;
struct posix_acl_entry *out;
unsigned count = 0;
if (!value)
return NULL;
if (size < sizeof(bch_acl_header))
goto invalid;
if (((bch_acl_header *)value)->a_version !=
cpu_to_le32(BCH_ACL_VERSION))
goto invalid;
p = value + sizeof(bch_acl_header);
while (p < end) {
const bch_acl_entry *entry = p;
if (p + sizeof(bch_acl_entry_short) > end)
goto invalid;
switch (le16_to_cpu(entry->e_tag)) {
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
p += sizeof(bch_acl_entry_short);
break;
case ACL_USER:
case ACL_GROUP:
p += sizeof(bch_acl_entry);
break;
default:
goto invalid;
}
count++;
}
if (p > end)
goto invalid;
if (!count)
return NULL;
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
return ERR_PTR(-ENOMEM);
out = acl->a_entries;
p = value + sizeof(bch_acl_header);
while (p < end) {
const bch_acl_entry *in = p;
out->e_tag = le16_to_cpu(in->e_tag);
out->e_perm = le16_to_cpu(in->e_perm);
switch (out->e_tag) {
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
p += sizeof(bch_acl_entry_short);
break;
case ACL_USER:
out->e_uid = make_kuid(&init_user_ns,
le32_to_cpu(in->e_id));
p += sizeof(bch_acl_entry);
break;
case ACL_GROUP:
out->e_gid = make_kgid(&init_user_ns,
le32_to_cpu(in->e_id));
p += sizeof(bch_acl_entry);
break;
}
out++;
}
BUG_ON(out != acl->a_entries + acl->a_count);
return acl;
invalid:
pr_err("invalid acl entry");
return ERR_PTR(-EINVAL);
}
#define acl_for_each_entry(acl, acl_e) \
for (acl_e = acl->a_entries; \
acl_e < acl->a_entries + acl->a_count; \
acl_e++)
/*
* Convert from in-memory to filesystem representation.
*/
static struct bkey_i_xattr *
bch2_acl_to_xattr(struct btree_trans *trans,
const struct posix_acl *acl,
int type)
{
struct bkey_i_xattr *xattr;
bch_acl_header *acl_header;
const struct posix_acl_entry *acl_e;
void *outptr;
unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
acl_for_each_entry(acl, acl_e) {
switch (acl_e->e_tag) {
case ACL_USER:
case ACL_GROUP:
nr_long++;
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
nr_short++;
break;
default:
return ERR_PTR(-EINVAL);
}
}
acl_len = bch2_acl_size(nr_short, nr_long);
u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
if (u64s > U8_MAX)
return ERR_PTR(-E2BIG);
xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
if (IS_ERR(xattr))
return xattr;
bkey_xattr_init(&xattr->k_i);
xattr->k.u64s = u64s;
xattr->v.x_type = acl_to_xattr_type(type);
xattr->v.x_name_len = 0;
xattr->v.x_val_len = cpu_to_le16(acl_len);
acl_header = xattr_val(&xattr->v);
acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
outptr = (void *) acl_header + sizeof(*acl_header);
acl_for_each_entry(acl, acl_e) {
bch_acl_entry *entry = outptr;
entry->e_tag = cpu_to_le16(acl_e->e_tag);
entry->e_perm = cpu_to_le16(acl_e->e_perm);
switch (acl_e->e_tag) {
case ACL_USER:
entry->e_id = cpu_to_le32(
from_kuid(&init_user_ns, acl_e->e_uid));
outptr += sizeof(bch_acl_entry);
break;
case ACL_GROUP:
entry->e_id = cpu_to_le32(
from_kgid(&init_user_ns, acl_e->e_gid));
outptr += sizeof(bch_acl_entry);
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
outptr += sizeof(bch_acl_entry_short);
break;
}
}
BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
return xattr;
}
struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct dentry *dentry, int type)
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
bch2_trans_init(&trans, c);
retry:
bch2_trans_begin(&trans);
iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
&inode->ei_str_hash, inode->v.i_ino,
&X_SEARCH(acl_to_xattr_type(type), "", 0),
0);
if (IS_ERR(iter)) {
if (PTR_ERR(iter) == -EINTR)
goto retry;
if (PTR_ERR(iter) != -ENOENT)
acl = ERR_CAST(iter);
goto out;
}
xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
if (!IS_ERR(acl))
set_cached_acl(&inode->v, type, acl);
out:
bch2_trans_exit(&trans);
return acl;
}
int bch2_set_acl_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
int ret;
if (type == ACL_TYPE_DEFAULT &&
!S_ISDIR(inode_u->bi_mode))
return acl ? -EACCES : 0;
if (acl) {
struct bkey_i_xattr *xattr =
bch2_acl_to_xattr(trans, acl, type);
if (IS_ERR(xattr))
return PTR_ERR(xattr);
ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
inode_u->bi_inum, &xattr->k_i, 0);
} else {
struct xattr_search_key search =
X_SEARCH(acl_to_xattr_type(type), "", 0);
ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
inode_u->bi_inum, &search);
}
return ret == -ENOENT ? 0 : ret;
}
static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct timespec64 now = current_time(&inode->v);
umode_t mode = (unsigned long) p;
bi->bi_ctime = timespec_to_bch2_time(c, now);
bi->bi_mode = mode;
return 0;
}
int bch2_set_acl(struct mnt_idmap *idmap,
struct dentry *dentry,
struct posix_acl *acl, int type)
{
struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct btree_trans trans;
struct bch_inode_unpacked inode_u;
umode_t mode = inode->v.i_mode;
int ret;
if (type == ACL_TYPE_ACCESS && acl) {
ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
if (ret)
return ret;
}
bch2_trans_init(&trans, c);
retry:
bch2_trans_begin(&trans);
ret = bch2_set_acl_trans(&trans,
&inode->ei_inode,
&inode->ei_str_hash,
acl, type) ?:
bch2_write_inode_trans(&trans, inode, &inode_u,
inode_update_for_set_acl_fn,
(void *)(unsigned long) mode) ?:
bch2_trans_commit(&trans, NULL, NULL,
&inode->ei_journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOUNLOCK);
if (ret == -EINTR)
goto retry;
if (unlikely(ret))
goto err;
bch2_inode_update_after_write(c, inode, &inode_u,
ATTR_CTIME|ATTR_MODE);
set_cached_acl(&inode->v, type, acl);
err:
bch2_trans_exit(&trans);
return ret;
}
int bch2_acl_chmod(struct btree_trans *trans,
struct bch_inode_info *inode,
umode_t mode,
struct posix_acl **new_acl)
{
struct btree_iter *iter;
struct bkey_s_c_xattr xattr;
struct bkey_i_xattr *new;
struct posix_acl *acl;
int ret = 0;
iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
&inode->ei_str_hash, inode->v.i_ino,
&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
acl = bch2_acl_from_disk(xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
if (IS_ERR_OR_NULL(acl))
return PTR_ERR(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
if (ret)
goto err;
new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto err;
}
bch2_trans_update(trans, iter, &new->k_i, 0);
*new_acl = acl;
acl = NULL;
err:
kfree(acl);
return ret;
}
#endif /* CONFIG_BCACHEFS_POSIX_ACL */

fs/bcachefs/acl.h (new file, 59 lines)

@@ -0,0 +1,59 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H
struct bch_inode_unpacked;
struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#define BCH_ACL_VERSION 0x0001
typedef struct {
__le16 e_tag;
__le16 e_perm;
__le32 e_id;
} bch_acl_entry;
typedef struct {
__le16 e_tag;
__le16 e_perm;
} bch_acl_entry_short;
typedef struct {
__le32 a_version;
} bch_acl_header;
struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
int bch2_set_acl_trans(struct btree_trans *,
struct bch_inode_unpacked *,
const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
umode_t, struct posix_acl **);
#else
static inline int bch2_set_acl_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
static inline int bch2_acl_chmod(struct btree_trans *trans,
struct bch_inode_info *inode,
umode_t mode,
struct posix_acl **new_acl)
{
return 0;
}
#endif /* CONFIG_BCACHEFS_POSIX_ACL */
#endif /* _BCACHEFS_ACL_H */

fs/bcachefs/alloc.c (new file, 2205 lines)

File diff suppressed because it is too large.

fs/bcachefs/alloc.h (new file, 141 lines)

@@ -0,0 +1,141 @@
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H
#include "bcachefs.h"
#include "alloc_types.h"
struct bkey;
struct bch_dev;
struct bch_fs;
struct bch_devs_list;
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_alloc_ops (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
struct write_point *,
struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
struct write_point *);
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
enum bucket_alloc_ret {
ALLOC_SUCCESS = 0,
OPEN_BUCKETS_EMPTY = -1,
FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
NO_DEVICES = -3, /* -EROFS */
};
long bch2_bucket_alloc_new_fs(struct bch_dev *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
#define __writepoint_for_each_ptr(_wp, _ob, _i, _start) \
for ((_i) = (_start); \
(_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \
(_i)++)
#define writepoint_for_each_ptr_all(_wp, _ob, _i) \
__writepoint_for_each_ptr(_wp, _ob, _i, 0)
#define writepoint_for_each_ptr(_wp, _ob, _i) \
__writepoint_for_each_ptr(_wp, _ob, _i, wp->first_ptr)
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
if (atomic_dec_and_test(&ob->pin))
__bch2_open_bucket_put(c, ob);
}
static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
unsigned i;
for (i = 0; i < *nr; i++)
bch2_open_bucket_put(c, c->open_buckets + refs[i]);
*nr = 0;
}
static inline void bch2_open_bucket_get(struct bch_fs *c,
struct write_point *wp,
u8 *nr, u8 *refs)
{
struct open_bucket *ob;
unsigned i;
writepoint_for_each_ptr(wp, ob, i) {
atomic_inc(&ob->pin);
refs[(*nr)++] = ob - c->open_buckets;
}
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
rcu_read_lock();
p = rcu_dereference(ca->alloc_thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
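/*
 * A write point can be specified either by hashing some caller-provided value
 * (writepoint_hashed()) or by the address of a specific struct write_point
 * (writepoint_ptr()). The hashed form sets the low bit, presumably so that it
 * can never collide with the (aligned) address of a real write_point:
 */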
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
return (struct write_point_specifier) { .v = v | 1 };
}
static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
return (struct write_point_specifier) { .v = (unsigned long) wp };
}
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
static inline void writepoint_init(struct write_point *wp,
enum bch_data_type type)
{
mutex_init(&wp->lock);
wp->type = type;
}
int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_H */

fs/bcachefs/alloc_types.h (new file, 90 lines)

@@ -0,0 +1,90 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include "clock_types.h"
#include "fifo.h"
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
*
* Goes with the bucket read/write prios: when we read or write to a
* bucket we reset the bucket's prio to the current hand; thus hand -
* prio = time since bucket was last read/written.
*
* The units are some amount (bytes/sectors) of data read/written, and
* the units can change on the fly if we need to rescale to fit
* everything in a u16 - your only guarantee is that the units are
* consistent.
*/
u16 hand;
u16 max_last_io;
int rw;
struct io_timer rescale;
struct mutex lock;
};
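/*
 * Illustrative sketch (hypothetical helper, not part of this file): per the
 * comment above, "hand - prio" gives a bucket's age in IO time, where prio is
 * the clock value recorded when the bucket was last read or written:
 */
#if 0	/* example only */
static inline u16 bucket_io_age(const struct bucket_clock *clock, u16 prio)
{
	return clock->hand - prio;
}
#endif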
/* There is one reserve for each type of btree, one for prios and gens
* and one for moving GC */
enum alloc_reserve {
RESERVE_ALLOC = -1,
RESERVE_BTREE = 0,
RESERVE_MOVINGGC = 1,
RESERVE_NONE = 2,
RESERVE_NR = 3,
};
typedef FIFO(long) alloc_fifo;
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
bool valid;
bool on_partial_list;
unsigned sectors_free;
struct bch_extent_ptr ptr;
};
struct write_point {
struct hlist_node node;
struct mutex lock;
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
u8 nr_ptrs;
u8 first_ptr;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
struct write_point_specifier {
unsigned long v;
};
struct alloc_heap_entry {
size_t bucket;
size_t nr;
unsigned long key;
};
typedef HEAP(struct alloc_heap_entry) alloc_heap;
#endif /* _BCACHEFS_ALLOC_TYPES_H */

fs/bcachefs/bcachefs.h (new file, 785 lines)

@@ -0,0 +1,785 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_H
#define _BCACHEFS_H
/*
* SOME HIGH LEVEL CODE DOCUMENTATION:
*
* Bcache mostly works with cache sets, cache devices, and backing devices.
*
* Support for multiple cache devices hasn't quite been finished off yet, but
* it's about 95% plumbed through. A cache set and its cache devices are sort of
* like an md raid array and its component devices. Most of the code doesn't care
* about individual cache devices, the main abstraction is the cache set.
*
* Multiple cache devices is intended to give us the ability to mirror dirty
* cached data and metadata, without mirroring clean cached data.
*
* Backing devices are different, in that they have a lifetime independent of a
* cache set. When you register a newly formatted backing device it'll come up
* in passthrough mode, and then you can attach and detach a backing device from
* a cache set at runtime - while it's mounted and in use. Detaching implicitly
* invalidates any cached data for that backing device.
*
* A cache set can have multiple (many) backing devices attached to it.
*
* There's also flash only volumes - this is the reason for the distinction
* between struct cached_dev and struct bcache_device. A flash only volume
* works much like a bcache device that has a backing device, except the
* "cached" data is always dirty. The end result is that we get thin
* provisioning with very little additional code.
*
* Flash only volumes work but they're not production ready because the moving
* garbage collector needs more work. More on that later.
*
* BUCKETS/ALLOCATION:
*
* Bcache is primarily designed for caching, which means that in normal
* operation all of our available space will be allocated. Thus, we need an
* efficient way of deleting things from the cache so we can write new things to
* it.
*
* To do this, we first divide the cache device up into buckets. A bucket is the
* unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
* works efficiently.
*
* Each bucket has a 16 bit priority, and an 8 bit generation associated with
* it. The gens and priorities for all the buckets are stored contiguously and
* packed on disk (in a linked list of buckets - aside from the superblock, all
* of bcache's metadata is stored in buckets).
*
* The priority is used to implement an LRU. We reset a bucket's priority when
* we allocate it or on a cache hit, and every so often we decrement the priority
* of each bucket. It could be used to implement something more sophisticated,
* if anyone ever gets around to it.
*
* The generation is used for invalidating buckets. Each pointer also has an 8
* bit generation embedded in it; for a pointer to be considered valid, its gen
* must match the gen of the bucket it points into. Thus, to reuse a bucket all
* we have to do is increment its gen (and write its new gen to disk; we batch
* this up).
*
* Bcache is entirely COW - we never write twice to a bucket, even buckets that
* contain metadata (including btree nodes).
*
* THE BTREE:
*
* Bcache is in large part designed around the btree.
*
* At a high level, the btree is just an index of key -> ptr tuples.
*
* Keys represent extents, and thus have a size field. Keys also have a variable
* number of pointers attached to them (potentially zero, which is handy for
* invalidating the cache).
*
* The key itself is an inode:offset pair. The inode number corresponds to a
* backing device or a flash only volume. The offset is the ending offset of the
* extent within the inode - not the starting offset; this makes lookups
* slightly more convenient.
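*
* For example, an extent covering sectors 0 through 7 of inode 42 is indexed
* at 42:8 with size 8; a lookup for offset 3 in that inode just searches for
* the first key whose position is strictly greater than 42:3.
*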
*
* Pointers contain the cache device id, the offset on that device, and an 8 bit
* generation number. More on the gen later.
*
* Index lookups are not fully abstracted - cache lookups in particular are
* still somewhat mixed in with the btree code, but things are headed in that
* direction.
*
* Updates are fairly well abstracted, though. There are two different ways of
* updating the btree; insert and replace.
*
* BTREE_INSERT will just take a list of keys and insert them into the btree -
* overwriting (possibly only partially) any extents they overlap with. This is
* used to update the index after a write.
*
* BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
* overwriting a key that matches another given key. This is used for inserting
* data into the cache after a cache miss, and for background writeback, and for
* the moving garbage collector.
*
* There is no "delete" operation; deleting things from the index is
* accomplished either by invalidating pointers (by incrementing a bucket's
* gen) or by inserting a key with 0 pointers - which will overwrite anything
* previously present at that location in the index.
*
* This means that there are always stale/invalid keys in the btree. They're
* filtered out by the code that iterates through a btree node, and removed when
* a btree node is rewritten.
*
* BTREE NODES:
*
* Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are.
*
* (If buckets are really big we'll only use part of the bucket for a btree node
* - no less than 1/4th - but a bucket still contains no more than a single
* btree node. I'd actually like to change this, but for now we rely on the
* bucket's gen for deleting btree nodes when we rewrite/split a node.)
*
* Anyways, btree nodes are big - big enough to be inefficient with a textbook
* btree implementation.
*
* The way this is solved is that btree nodes are internally log structured; we
* can append new keys to an existing btree node without rewriting it. This
* means each set of keys we write is sorted, but the node is not.
*
* We maintain this log structure in memory - keeping 1Mb of keys sorted would
* be expensive, and we have to distinguish between the keys we have written and
* the keys we haven't. So to do a lookup in a btree node, we have to search
* each sorted set. But we do merge written sets together lazily, so the cost of
* these extra searches is quite low (normally most of the keys in a btree node
* will be in one big set, and then there'll be one or two sets that are much
* smaller).
*
* This log structure makes bcache's btree more of a hybrid between a
* conventional btree and a compacting data structure, with some of the
* advantages of both.
*
* GARBAGE COLLECTION:
*
* We can't just invalidate any bucket - it might contain dirty data or
* metadata. If it once contained dirty data, other writes might overwrite it
* later, leaving no valid pointers into that bucket in the index.
*
* Thus, the primary purpose of garbage collection is to find buckets to reuse.
* It also counts how much valid data each bucket currently contains, so that
* allocation can reuse buckets sooner when they've been mostly overwritten.
*
* It also does some things that are really internal to the btree
* implementation. If a btree node contains pointers that are stale by more than
* some threshold, it rewrites the btree node to avoid the bucket's generation
* wrapping around. It also merges adjacent btree nodes if they're empty enough.
*
* THE JOURNAL:
*
* Bcache's journal is not necessary for consistency; we always strictly
* order metadata writes so that the btree and everything else is consistent on
* disk in the event of an unclean shutdown, and in fact bcache had writeback
* caching (with recovery from unclean shutdown) before journalling was
* implemented.
*
* Rather, the journal is purely a performance optimization; we can't complete a
* write until we've updated the index on disk, otherwise the cache would be
* inconsistent in the event of an unclean shutdown. This means that without the
* journal, on random write workloads we constantly have to update all the leaf
* nodes in the btree, and those writes will be mostly empty (appending at most
* a few keys each) - highly inefficient in terms of amount of metadata writes,
* and it puts more strain on the various btree resorting/compacting code.
*
* The journal is just a log of keys we've inserted; on startup we just reinsert
* all the keys in the open journal entries. That means that when we're updating
* a node in the btree, we can wait until a 4k block of keys fills up before
* writing them out.
*
* For simplicity, we only journal updates to leaf nodes; updates to parent
* nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
* the complexity to deal with journalling them (in particular, journal replay)
* - updates to non leaf nodes just happen synchronously (see btree_split()).
*/
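/*
 * Illustrative sketch of the generation check described above (hypothetical
 * helper, not part of this header): a pointer into a bucket is only valid
 * while its embedded gen matches the bucket's current gen, so incrementing
 * the bucket's gen invalidates every pointer into it at once:
 */
#if 0	/* example only */
static inline bool example_ptr_is_stale(u8 bucket_gen, u8 ptr_gen)
{
	return bucket_gen != ptr_gen;
}
#endif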
#undef pr_fmt
#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
#include <linux/backing-dev-defs.h>
#include <linux/bug.h>
#include <linux/bio.h>
#include <linux/closure.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/seqlock.h>
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "fifo.h"
#include "opts.h"
#include "util.h"
#define dynamic_fault(...) 0
#define race_fault(...) 0
#define bch2_fs_init_fault(name) \
dynamic_fault("bcachefs:bch_fs_init:" name)
#define bch2_meta_read_fault(name) \
dynamic_fault("bcachefs:meta:read:" name)
#define bch2_meta_write_fault(name) \
dynamic_fault("bcachefs:meta:write:" name)
#ifdef __KERNEL__
#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
#else
#define bch2_fmt(_c, fmt) fmt "\n"
#endif
#define bch_info(c, fmt, ...) \
printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_verbose(c, fmt, ...) \
do { \
if ((c)->opts.verbose_recovery) \
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
#define pr_verbose_init(opts, fmt, ...) \
do { \
if (opt_get(opts, verbose_init)) \
pr_info(fmt, ##__VA_ARGS__); \
} while (0)
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
"Disables merging of extents") \
BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
"Causes mark and sweep to compact and rewrite every " \
"btree node it traverses") \
BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
"Disables rewriting of btree nodes during mark and sweep")\
BCH_DEBUG_PARAM(btree_shrinker_disabled, \
"Disables the shrinker callback for the btree node cache")
/* Parameters that should only be compiled in in debug mode: */
#define BCH_DEBUG_PARAMS_DEBUG() \
BCH_DEBUG_PARAM(expensive_debug_checks, \
"Enables various runtime debugging checks that " \
"significantly affect performance") \
BCH_DEBUG_PARAM(debug_check_bkeys, \
"Run bkey_debugcheck (primarily checking GC/allocation "\
"information) when iterating over keys") \
BCH_DEBUG_PARAM(verify_btree_ondisk, \
"Reread btree nodes at various points to verify the " \
"mergesort in the read path against modifications " \
"done in memory") \
BCH_DEBUG_PARAM(journal_seq_verify, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery") \
BCH_DEBUG_PARAM(inject_invalid_keys, \
"Store the journal sequence number in the version " \
"number of every btree key, and verify that btree " \
"update ordering is preserved during recovery") \
#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
#ifdef CONFIG_BCACHEFS_DEBUG
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
#else
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
#define BCH_TIME_STATS() \
x(btree_node_mem_alloc) \
x(btree_gc) \
x(btree_split) \
x(btree_sort) \
x(btree_read) \
x(btree_lock_contended_read) \
x(btree_lock_contended_intent) \
x(btree_lock_contended_write) \
x(data_write) \
x(data_read) \
x(data_promote) \
x(journal_write) \
x(journal_delay) \
x(journal_blocked) \
x(journal_flush_seq)
enum bch_time_stats {
#define x(name) BCH_TIME_##name,
BCH_TIME_STATS()
#undef x
BCH_TIME_STAT_NR
};
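/*
 * The BCH_TIME_STATS() x-macro above can be expanded more than once; e.g. a
 * matching table of names (a sketch for illustration - not necessarily how
 * the rest of the code does it) could be generated the same way:
 */
#if 0	/* example only */
static const char * const bch2_time_stat_names[] = {
#define x(name)	#name,
	BCH_TIME_STATS()
#undef x
};
#endif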
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "quota_types.h"
#include "rebalance_types.h"
#include "super_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
/* Maximum number of nodes we might need to allocate atomically: */
#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
/* Size of the freelist we allocate btree nodes from: */
#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
struct btree;
enum gc_phase {
GC_PHASE_START,
GC_PHASE_SB,
#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
DEFINE_BCH_BTREE_IDS()
#undef DEF_BTREE_ID
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
struct gc_pos {
enum gc_phase phase;
struct bpos pos;
unsigned level;
};
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
struct bch_dev {
struct kobject kobj;
struct percpu_ref ref;
struct completion ref_completion;
struct percpu_ref io_ref;
struct completion io_ref_completion;
struct bch_fs *fs;
u8 dev_idx;
/*
* Cached version of this device's member info from superblock
* Committed by bch2_write_super() -> bch_fs_mi_update()
*/
struct bch_member_cpu mi;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
struct bch_sb_handle disk_sb;
int sb_write_error;
struct bch_devs_mask self;
/* biosets used in cloned bios for writing multiple replicas */
struct bio_set replica_set;
/*
* Buckets:
* Per-bucket arrays are protected by c->usage_lock, bucket_lock and
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets;
unsigned long *buckets_dirty;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_percpu;
struct bch_dev_usage usage_cached;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
/*
* free: Buckets that are ready to be used
*
* free_inc: Incoming buckets - these are buckets that currently have
* cached data in them, and we can't reuse them until after we write
* their new gen to disk. After prio_write() finishes writing the new
* gens/prios, they'll be moved to the free list (and possibly discarded
* in the process)
*/
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 max_last_bucket_io[2];
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
u64 allocator_journal_seq_flush;
bool allocator_invalidating_data;
bool allocator_blocked;
alloc_heap alloc_heap;
/* Copying GC: */
struct task_struct *copygc_thread;
copygc_heap copygc_heap;
struct bch_pd_controller copygc_pd;
struct write_point copygc_write_point;
atomic64_t rebalance_work;
struct journal_device journal;
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic64_t cur_latency[2];
struct bch2_time_stats io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
u64 congested_last;
struct io_count __percpu *io_done;
};
/*
* Flag bits for what phase of startup/shutdown the cache set is at, how we're
* shutting down, etc.:
*
* BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
* all the backing devices first (their cached data gets invalidated, and they
* won't automatically reattach).
*/
enum {
/* startup: */
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
/* shutdown: */
BCH_FS_EMERGENCY_RO,
BCH_FS_WRITE_DISABLE_COMPLETE,
/* errors: */
BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
BCH_FS_FSCK_FIXED_ERRORS,
BCH_FS_FIXED_GENS,
BCH_FS_REBUILD_REPLICAS,
BCH_FS_HOLD_BTREE_WRITES,
};
struct btree_debug {
unsigned id;
struct dentry *btree;
struct dentry *btree_format;
struct dentry *failed;
};
enum bch_fs_state {
BCH_FS_STARTING = 0,
BCH_FS_STOPPING,
BCH_FS_RO,
BCH_FS_RW,
};
struct bch_fs {
struct closure cl;
struct list_head list;
struct kobject kobj;
struct kobject internal;
struct kobject opts_dir;
struct kobject time_stats;
unsigned long flags;
int minor;
struct device *chardev;
struct super_block *vfs_sb;
char name[40];
/* ro/rw, add/remove devices: */
struct mutex state_lock;
enum bch_fs_state state;
/* Counts outstanding writes, for clean transition to read-only */
struct percpu_ref writes;
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
struct bch_replicas_cpu __rcu *replicas;
struct bch_replicas_cpu __rcu *replicas_gc;
struct mutex replicas_gc_lock;
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
/* Updated by bch2_sb_update():*/
struct {
__uuid_t uuid;
__uuid_t user_uuid;
u16 encoded_extent_max;
u8 nr_devices;
u8 clean;
u8 encryption_type;
u64 time_base_lo;
u32 time_base_hi;
u32 time_precision;
u64 features;
} sb;
struct bch_sb_handle disk_sb;
unsigned short block_bits; /* ilog2(block_size) */
u16 btree_foreground_merge_threshold;
struct closure sb_write;
struct mutex sb_lock;
/* BTREE CACHE */
struct bio_set btree_bio;
struct btree_root btree_roots[BTREE_ID_NR];
bool btree_roots_dirty;
struct mutex btree_root_lock;
struct btree_cache btree_cache;
mempool_t btree_reserve_pool;
/*
* Cache of allocated btree nodes - if we allocate a btree node and
* don't use it, if we free it that space can't be reused until going
* _all_ the way through the allocator (which exposes us to a livelock
* when allocating btree reserves fail halfway through) - instead, we
* can stick them here:
*/
struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
unsigned btree_reserve_cache_nr;
struct mutex btree_reserve_cache_lock;
mempool_t btree_interior_update_pool;
struct list_head btree_interior_update_list;
struct mutex btree_interior_update_lock;
struct closure_waitlist btree_interior_update_wait;
struct workqueue_struct *wq;
/* copygc needs its own workqueue for index updates.. */
struct workqueue_struct *copygc_wq;
/* ALLOCATION */
struct delayed_work pd_controllers_update;
unsigned pd_controllers_update_seconds;
struct bch_devs_mask rw_devs[BCH_DATA_NR];
u64 capacity; /* sectors */
/*
* When capacity _decreases_ (due to a disk being removed), we
* increment capacity_gen - this invalidates outstanding reservations
* and forces them to be revalidated
*/
u32 capacity_gen;
atomic64_t sectors_available;
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];
/* ALLOCATOR */
spinlock_t freelist_lock;
u8 open_buckets_freelist;
u8 open_buckets_nr_free;
struct closure_waitlist open_buckets_wait;
struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
struct write_point btree_write_point;
struct write_point rebalance_write_point;
struct write_point write_points[WRITE_POINT_COUNT];
struct hlist_head write_points_hash[WRITE_POINT_COUNT];
struct mutex write_points_hash_lock;
/* GARBAGE COLLECTION */
struct task_struct *gc_thread;
atomic_t kick_gc;
unsigned long gc_count;
/*
* Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
* has been marked by GC.
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
* currently running, and gc marks are currently valid
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/
seqcount_t gc_pos_lock;
struct gc_pos gc_pos;
/*
* The allocation code needs gc_mark in struct bucket to be correct, but
* it's not while a gc is in progress.
*/
struct rw_semaphore gc_lock;
/* IO PATH */
struct bio_set bio_read;
struct bio_set bio_read_split;
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_NR];
mempool_t decompress_workspace;
ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
struct crypto_sync_skcipher *chacha20;
struct crypto_shash *poly1305;
atomic64_t key_version;
/* REBALANCE */
struct bch_fs_rebalance rebalance;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
struct bio_set dio_write_bioset;
struct bio_set dio_read_bioset;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;
/* ERRORS */
struct list_head fsck_errors;
struct mutex fsck_error_lock;
bool fsck_alloc_err;
/* FILESYSTEM */
atomic_long_t nr_inodes;
/* QUOTAS */
struct bch_memquota_type quotas[QTYP_NR];
/* DEBUG JUNK */
struct dentry *debug;
struct btree_debug btree_debug[BTREE_ID_NR];
#ifdef CONFIG_BCACHEFS_DEBUG
struct btree *verify_data;
struct btree_node *verify_ondisk;
struct mutex verify_lock;
#endif
u64 unused_inode_hint;
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - have to dynamically allocate them
*/
mempool_t fill_iter;
mempool_t btree_bounce_pool;
struct journal journal;
unsigned bucket_journal_seq;
/* The rest of this all shows up in sysfs */
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
struct bch2_time_stats times[BCH_TIME_STAT_NR];
};
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
if (c->vfs_sb)
c->vfs_sb->s_bdi->ra_pages = ra_pages;
#endif
}
static inline bool bch2_fs_running(struct bch_fs *c)
{
return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
}
static inline unsigned bucket_bytes(const struct bch_dev *ca)
{
return ca->mi.bucket_size << 9;
}
static inline unsigned block_bytes(const struct bch_fs *c)
{
return c->opts.block_size << 9;
}
static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time)
{
return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo);
}
static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
{
s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo;
if (c->sb.time_precision == 1)
return ns;
return div_s64(ns, c->sb.time_precision);
}
static inline s64 bch2_current_time(struct bch_fs *c)
{
struct timespec64 now;
ktime_get_real_ts64(&now);
return timespec_to_bch2_time(c, now);
}
#endif /* _BCACHEFS_H */

File diff suppressed because it is too large.

fs/bcachefs/bcachefs_ioctl.h (new file, 310 lines)

@@ -0,0 +1,310 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IOCTL_H
#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
/*
* Flags common to multiple ioctls:
*/
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
/*
* If cleared, ioctls that refer to a device pass it as a pointer to a pathname
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
* filesystem:
*/
#define BCH_BY_INDEX (1 << 4)
/*
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
* wide superblock:
*/
#define BCH_READ_DEV (1 << 5)
/* global control dev: */
/* These are currently broken, and probably unnecessary: */
#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
struct bch_ioctl_assemble {
__u32 flags;
__u32 nr_devs;
__u64 pad;
__u64 devs[];
};
struct bch_ioctl_incremental {
__u32 flags;
__u64 pad;
__u64 dev;
};
#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
/* These only make sense when we also have incremental assembly */
#if 0
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
* Returns user visible UUID, not internal UUID (which may not ever be changed);
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
* this UUID.
*/
struct bch_ioctl_query_uuid {
__uuid_t uuid;
};
#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
* The specified device must not be open or in use. On success, the new device
* will be an online member of the filesystem just like any other member.
*
* The device must first be prepared by userspace by formatting with a bcachefs
* superblock, which is only used for passing in superblock options/parameters
* for that device (in struct bch_member). The new device's superblock should
* not claim to be a member of any existing filesystem - UUIDs on it will be
* ignored.
*/
/*
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
*
* Any data present on @dev will be permanently deleted, and @dev will be
* removed from its slot in the filesystem's list of member devices. The device
* may be either offline or online.
*
* Will fail if removing @dev would leave us with insufficient read write devices
* or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
* set.
*/
/*
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
* but is not open (e.g. because we started in degraded mode), bring it online
*
* all existing data on @dev will be available once the device is online,
* exactly as if @dev was present when the filesystem was first mounted
*/
/*
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
* block device, without removing it from the filesystem (so it can be brought
* back online later)
*
* Data present on @dev will be unavailable while @dev is offline (unless
* replicated), but will still be intact and untouched if @dev is brought back
* online
*
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
* leave us with insufficient read write devices or degraded/unavailable data,
* unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk {
__u32 flags;
__u32 pad;
__u64 dev;
};
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
* @new_state - one of the bch_member_state states (rw, ro, failed,
* spare)
*
* Will refuse to change member state if we would then have insufficient devices
* to write to, or if it would result in degraded data (when @new_state is
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
__u8 pad[3];
__u64 dev;
};
enum bch_data_ops {
BCH_DATA_OP_SCRUB = 0,
BCH_DATA_OP_REREPLICATE = 1,
BCH_DATA_OP_MIGRATE = 2,
BCH_DATA_OP_NR = 3,
};
/*
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
* scrub, rereplicate, migrate).
*
* This ioctl kicks off a job in the background, and returns a file descriptor.
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
* indicating current progress, and closing the file descriptor will stop the
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
__u32 op;
__u32 flags;
struct bpos start;
struct bpos end;
union {
struct {
__u32 dev;
__u32 pad;
} migrate;
struct {
__u64 pad[8];
};
};
} __attribute__((packed, aligned(8)));
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
/* XXX: add an event for reporting errors */
BCH_DATA_EVENT_NR = 1,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
__u8 pad[2];
struct bpos pos;
__u64 sectors_done;
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
struct bch_ioctl_data_event {
__u8 type;
__u8 pad[7];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
} __attribute__((packed, aligned(8)));
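/*
 * Illustrative userspace sketch (not part of this header): kick off a
 * rereplicate job over the whole key space and poll its progress by reading
 * events from the returned file descriptor. Assumes fs_fd is an open
 * descriptor that accepts these filesystem ioctls, that POS_MIN/POS_MAX come
 * from bcachefs_format.h, and that <sys/ioctl.h>, <unistd.h> and <stdio.h>
 * are included:
 */
#if 0	/* example only */
static int example_rereplicate(int fs_fd)
{
	struct bch_ioctl_data op = {
		.op	= BCH_DATA_OP_REREPLICATE,
		.start	= POS_MIN,
		.end	= POS_MAX,
	};
	struct bch_ioctl_data_event e;
	int data_fd = ioctl(fs_fd, BCH_IOCTL_DATA, &op);

	if (data_fd < 0)
		return -1;

	while (read(data_fd, &e, sizeof(e)) == sizeof(e))
		if (e.type == BCH_DATA_EVENT_PROGRESS)
			printf("%llu/%llu sectors done\n",
			       (unsigned long long) e.p.sectors_done,
			       (unsigned long long) e.p.sectors_total);

	close(data_fd);
	return 0;
}
#endif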
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
__u8 pad[6];
__u32 dev;
__u32 bucket_size;
__u64 nr_buckets;
__u64 buckets[BCH_DATA_NR];
__u64 sectors[BCH_DATA_NR];
};
struct bch_ioctl_fs_usage {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u64 persistent_reserved[BCH_REPLICAS_MAX];
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
/*
* BCH_IOCTL_USAGE: query filesystem disk space usage
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @nr_devices - number of devices userspace allocated space for in @devs
*
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
* will indicate if a device was present in that slot
*
* Returns -ERANGE if @nr_devices was too small
*/
struct bch_ioctl_usage {
__u16 nr_devices;
__u16 pad[3];
struct bch_ioctl_fs_usage fs;
struct bch_ioctl_dev_usage devs[0];
};
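/*
 * Illustrative userspace sketch (not part of this header): the caller sizes
 * the flexible devs[] array itself, sets nr_devices to match, and gets back
 * errno == ERANGE if that wasn't enough. Assumes fs_fd accepts these ioctls
 * and that <stdlib.h> and <sys/ioctl.h> are included:
 */
#if 0	/* example only */
static struct bch_ioctl_usage *example_query_usage(int fs_fd, unsigned nr_devs)
{
	struct bch_ioctl_usage *u =
		calloc(1, sizeof(*u) + nr_devs * sizeof(u->devs[0]));

	if (!u)
		return NULL;

	u->nr_devices = nr_devs;

	if (ioctl(fs_fd, BCH_IOCTL_USAGE, u)) {
		free(u);
		return NULL;
	}

	return u;
}
#endif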
/*
* BCH_IOCTL_READ_SUPER: read filesystem superblock
*
* Equivalent to reading the superblock directly from the block device, except
* avoids racing with the kernel writing the superblock or having to figure out
* which block device to read
*
* @sb - buffer to read into
* @size - size of userspace allocated buffer
* @dev - device to read superblock for, if BCH_READ_DEV flag is
* specified
*
* Returns -ERANGE if buffer provided is too small
*/
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 size;
__u64 sb;
};
/*
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
* determine if disk is a (online) member - if so, returns device's index
*
* Returns -ENOENT if not found
*/
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
*
* @dev - member to resize
* @nbuckets - new number of buckets
*/
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 nbuckets;
};
#endif /* _BCACHEFS_IOCTL_H */

fs/bcachefs/bkey.c (new file, 1164 lines)

File diff suppressed because it is too large.

fs/bcachefs/bkey.h (new file, 627 lines)

@@ -0,0 +1,627 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_H
#define _BCACHEFS_BKEY_H
#include <linux/bug.h>
#include "bcachefs_format.h"
#include "util.h"
#include "vstructs.h"
#if 0
/*
* compiled unpack functions are disabled, pending a new interface for
* dynamically allocating executable memory:
*/
#ifdef CONFIG_X86_64
#define HAVE_BCACHEFS_COMPILED_UNPACK 1
#endif
#endif
void bch2_to_binary(char *, const u64 *, unsigned);
/* bkey with split value, const */
struct bkey_s_c {
const struct bkey *k;
const struct bch_val *v;
};
/* bkey with split value */
struct bkey_s {
union {
struct {
struct bkey *k;
struct bch_val *v;
};
struct bkey_s_c s_c;
};
};
#define bkey_next(_k) vstruct_next(_k)
static inline unsigned bkey_val_u64s(const struct bkey *k)
{
return k->u64s - BKEY_U64s;
}
static inline size_t bkey_val_bytes(const struct bkey *k)
{
return bkey_val_u64s(k) * sizeof(u64);
}
static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
{
k->u64s = BKEY_U64s + val_u64s;
}
static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
{
k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
}
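/*
 * Worked example (illustration only): after set_bkey_val_bytes(k, 20), a
 * 20 byte value rounds up to 3 u64s, so k->u64s == BKEY_U64s + 3,
 * bkey_val_u64s(k) == 3 and bkey_val_bytes(k) == 24.
 */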
#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_DELETED)
#define bkey_whiteout(_k) \
((_k)->type == KEY_TYPE_DELETED || (_k)->type == KEY_TYPE_DISCARD)
#define bkey_packed_typecheck(_k) \
({ \
BUILD_BUG_ON(!type_is(_k, struct bkey *) && \
!type_is(_k, struct bkey_packed *)); \
type_is(_k, struct bkey_packed *); \
})
enum bkey_lr_packed {
BKEY_PACKED_BOTH,
BKEY_PACKED_RIGHT,
BKEY_PACKED_LEFT,
BKEY_PACKED_NONE,
};
#define bkey_lr_packed_typecheck(_l, _r) \
(!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1))
#define bkey_lr_packed(_l, _r) \
((_l)->format + ((_r)->format << 1))
#define bkey_copy(_dst, _src) \
do { \
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
!type_is(_dst, struct bkey_packed *)); \
BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
!type_is(_src, struct bkey_packed *)); \
EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
(u64 *) (_dst) < (u64 *) (_src) + \
((struct bkey *) (_src))->u64s); \
\
__memmove_u64s_down((_dst), (_src), \
((struct bkey *) (_src))->u64s); \
} while (0)
struct btree;
struct bkey_format_state {
u64 field_min[BKEY_NR_FIELDS];
u64 field_max[BKEY_NR_FIELDS];
};
void bch2_bkey_format_init(struct bkey_format_state *);
void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
const char *bch2_bkey_format_validate(struct bkey_format *);
__pure
unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
const struct bkey_packed *,
const struct bkey_packed *);
__pure
unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
__pure
int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
const struct bkey_packed *,
const struct btree *);
__pure
int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
const struct bkey_packed *,
const struct bpos *);
__pure
int __bch2_bkey_cmp_packed(const struct bkey_packed *,
const struct bkey_packed *,
const struct btree *);
__pure
int __bch2_bkey_cmp_left_packed(const struct btree *,
const struct bkey_packed *,
const struct bpos *);
static inline __pure
int bkey_cmp_left_packed(const struct btree *b,
const struct bkey_packed *l, const struct bpos *r)
{
return __bch2_bkey_cmp_left_packed(b, l, r);
}
/*
* we prefer to pass bpos by ref, but it's often enough terribly convenient to
* pass it by val... as much as I hate c++, const ref would be nice here:
*/
__pure __flatten
static inline int bkey_cmp_left_packed_byval(const struct btree *b,
const struct bkey_packed *l,
struct bpos r)
{
return bkey_cmp_left_packed(b, l, &r);
}
/*
* If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to
* skip dispatching on k->format:
*/
#define bkey_cmp_packed(_b, _l, _r) \
({ \
int _cmp; \
\
switch (bkey_lr_packed_typecheck(_l, _r)) { \
case BKEY_PACKED_NONE: \
_cmp = bkey_cmp(((struct bkey *) (_l))->p, \
((struct bkey *) (_r))->p); \
break; \
case BKEY_PACKED_LEFT: \
_cmp = bkey_cmp_left_packed((_b), \
(struct bkey_packed *) (_l), \
&((struct bkey *) (_r))->p); \
break; \
case BKEY_PACKED_RIGHT: \
_cmp = -bkey_cmp_left_packed((_b), \
(struct bkey_packed *) (_r), \
&((struct bkey *) (_l))->p); \
break; \
case BKEY_PACKED_BOTH: \
_cmp = __bch2_bkey_cmp_packed((void *) (_l), \
(void *) (_r), (_b)); \
break; \
} \
_cmp; \
})
#if 1
static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
{
if (l.inode != r.inode)
return l.inode < r.inode ? -1 : 1;
if (l.offset != r.offset)
return l.offset < r.offset ? -1 : 1;
if (l.snapshot != r.snapshot)
return l.snapshot < r.snapshot ? -1 : 1;
return 0;
}
#else
int bkey_cmp(struct bpos l, struct bpos r);
#endif
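/*
 * Keys thus sort by inode first, then offset, then snapshot - e.g. position
 * 1:100 sorts before 2:0, because the inode field is compared first.
 */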
static inline struct bpos bpos_min(struct bpos l, struct bpos r)
{
return bkey_cmp(l, r) < 0 ? l : r;
}
void bch2_bpos_swab(struct bpos *);
void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
{
return (l.hi > r.hi) - (l.hi < r.hi) ?:
(l.lo > r.lo) - (l.lo < r.lo);
}
#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
static __always_inline int bversion_zero(struct bversion v)
{
return !bversion_cmp(v, ZERO_VERSION);
}
#ifdef CONFIG_BCACHEFS_DEBUG
/* statement expressions confusing unlikely()? */
#define bkey_packed(_k) \
({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
(_k)->format != KEY_FORMAT_CURRENT; })
#else
#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
#endif
/*
* It's safe to treat an unpacked bkey as a packed one, but not the reverse
*/
static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
{
return (struct bkey_packed *) k;
}
static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
{
return (const struct bkey_packed *) k;
}
static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
{
return bkey_packed(k) ? NULL : (struct bkey_i *) k;
}
static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
{
return bkey_packed(k) ? NULL : (const struct bkey *) k;
}
static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
{
return format->bits_per_field[BKEY_FIELD_INODE] +
format->bits_per_field[BKEY_FIELD_OFFSET] +
format->bits_per_field[BKEY_FIELD_SNAPSHOT];
}
static inline struct bpos bkey_successor(struct bpos p)
{
struct bpos ret = p;
if (!++ret.offset)
BUG_ON(!++ret.inode);
return ret;
}
static inline struct bpos bkey_predecessor(struct bpos p)
{
struct bpos ret = p;
if (!ret.offset--)
BUG_ON(!ret.inode--);
return ret;
}
static inline u64 bkey_start_offset(const struct bkey *k)
{
return k->p.offset - k->size;
}
static inline struct bpos bkey_start_pos(const struct bkey *k)
{
return (struct bpos) {
.inode = k->p.inode,
.offset = bkey_start_offset(k),
.snapshot = k->p.snapshot,
};
}
/* Packed helpers */
static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
EBUG_ON(k->u64s < ret);
return ret;
}
static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
const struct bkey_packed *k)
{
return bkeyp_key_u64s(format, k) * sizeof(u64);
}
static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
const struct bkey_packed *k)
{
return k->u64s - bkeyp_key_u64s(format, k);
}
static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
const struct bkey_packed *k)
{
return bkeyp_val_u64s(format, k) * sizeof(u64);
}
static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
struct bkey_packed *k, unsigned val_u64s)
{
k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
}
#define bkeyp_val(_format, _k) \
((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
extern const struct bkey_format bch2_bkey_format_current;
bool bch2_bkey_transform(const struct bkey_format *,
struct bkey_packed *,
const struct bkey_format *,
const struct bkey_packed *);
struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
const struct bkey_packed *);
#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
struct bpos __bkey_unpack_pos(const struct bkey_format *,
const struct bkey_packed *);
#endif
bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
const struct bkey_format *);
enum bkey_pack_pos_ret {
BKEY_PACK_POS_EXACT,
BKEY_PACK_POS_SMALLER,
BKEY_PACK_POS_FAIL,
};
enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
const struct btree *);
static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
const struct btree *b)
{
return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
}
void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
const struct bkey_packed *);
bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
const struct bkey_format *);
static inline u64 bkey_field_max(const struct bkey_format *f,
enum bch_bkey_fields nr)
{
return f->bits_per_field[nr] < 64
? (le64_to_cpu(f->field_offset[nr]) +
~(~0ULL << f->bits_per_field[nr]))
: U64_MAX;
}
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
int bch2_compile_bkey_format(const struct bkey_format *, void *);
#else
static inline int bch2_compile_bkey_format(const struct bkey_format *format,
void *out) { return 0; }
#endif
static inline void bkey_reassemble(struct bkey_i *dst,
struct bkey_s_c src)
{
BUG_ON(bkey_packed(src.k));
dst->k = *src.k;
memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
}
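/*
 * Illustrative sketch (not part of the original interface): a caller that
 * only needs the key fields can take the free cast when the key is already
 * stored in the current format, and otherwise unpack through the format.
 * The function name below is made up for the example.
 */
static inline struct bpos example_bkey_p(const struct bkey_format *f,
					 const struct bkey_packed *k)
{
	const struct bkey *u = packed_to_bkey_c(k);

	/* cheap path: key is stored unpacked, the cast is free */
	if (u)
		return u->p;

	/* slow path: decode the packed fields via the format */
	return __bch2_bkey_unpack_key(f, k).p;
}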
#define bkey_s_null ((struct bkey_s) { .k = NULL })
#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
static inline struct bkey_s bkey_to_s(struct bkey *k)
{
return (struct bkey_s) { .k = k, .v = NULL };
}
static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
{
return (struct bkey_s_c) { .k = k, .v = NULL };
}
static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
{
return (struct bkey_s) { .k = &k->k, .v = &k->v };
}
static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
{
return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
}
/*
* For a given type of value (e.g. struct bch_extent), generates the types for
* bkey + bch_extent - inline, split, split const - and also all the conversion
* functions, which also check that the value is of the correct type.
*
* We use anonymous unions for upcasting - e.g. converting from e.g. a
* bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
* functions.
*/
#define __BKEY_VAL_ACCESSORS(name, nr, _assert) \
struct bkey_s_c_##name { \
union { \
struct { \
const struct bkey *k; \
const struct bch_##name *v; \
}; \
struct bkey_s_c s_c; \
}; \
}; \
\
struct bkey_s_##name { \
union { \
struct { \
struct bkey *k; \
struct bch_##name *v; \
}; \
struct bkey_s_c_##name c; \
struct bkey_s s; \
struct bkey_s_c s_c; \
}; \
}; \
\
static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
{ \
_assert(k->k.type, nr); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline const struct bkey_i_##name * \
bkey_i_to_##name##_c(const struct bkey_i *k) \
{ \
_assert(k->k.type, nr); \
return container_of(&k->k, struct bkey_i_##name, k); \
} \
\
static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
{ \
_assert(k.k->type, nr); \
return (struct bkey_s_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
{ \
_assert(k.k->type, nr); \
return (struct bkey_s_c_##name) { \
.k = k.k, \
.v = container_of(k.v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
{ \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_c_##name \
name##_i_to_s_c(const struct bkey_i_##name *k) \
{ \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = &k->v, \
}; \
} \
\
static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
{ \
_assert(k->k.type, nr); \
return (struct bkey_s_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bkey_s_c_##name \
bkey_i_to_s_c_##name(const struct bkey_i *k) \
{ \
_assert(k->k.type, nr); \
return (struct bkey_s_c_##name) { \
.k = &k->k, \
.v = container_of(&k->v, struct bch_##name, v), \
}; \
} \
\
static inline struct bch_##name * \
bkey_p_##name##_val(const struct bkey_format *f, \
struct bkey_packed *k) \
{ \
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
} \
\
static inline const struct bch_##name * \
bkey_p_c_##name##_val(const struct bkey_format *f, \
const struct bkey_packed *k) \
{ \
return container_of(bkeyp_val(f, k), struct bch_##name, v); \
} \
\
static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
{ \
struct bkey_i_##name *k = \
container_of(&_k->k, struct bkey_i_##name, k); \
\
bkey_init(&k->k); \
memset(&k->v, 0, sizeof(k->v)); \
k->k.type = nr; \
set_bkey_val_bytes(&k->k, sizeof(k->v)); \
\
return k; \
}
#define __BKEY_VAL_ASSERT(_type, _nr) EBUG_ON(_type != _nr)
#define BKEY_VAL_ACCESSORS(name, _nr) \
static inline void __bch_##name##_assert(u8 type, u8 nr) \
{ \
EBUG_ON(type != _nr); \
} \
\
__BKEY_VAL_ACCESSORS(name, _nr, __bch_##name##_assert)
BKEY_VAL_ACCESSORS(cookie, KEY_TYPE_COOKIE);
static inline void __bch2_extent_assert(u8 type, u8 nr)
{
EBUG_ON(type != BCH_EXTENT && type != BCH_EXTENT_CACHED);
}
__BKEY_VAL_ACCESSORS(extent, BCH_EXTENT, __bch2_extent_assert);
BKEY_VAL_ACCESSORS(reservation, BCH_RESERVATION);
BKEY_VAL_ACCESSORS(inode, BCH_INODE_FS);
BKEY_VAL_ACCESSORS(inode_blockdev, BCH_INODE_BLOCKDEV);
BKEY_VAL_ACCESSORS(inode_generation, BCH_INODE_GENERATION);
BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT);
BKEY_VAL_ACCESSORS(xattr, BCH_XATTR);
BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC);
BKEY_VAL_ACCESSORS(quota, BCH_QUOTA);
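/*
 * Illustrative use of the generated accessors (the helper below is made up;
 * the accessors themselves are real): the __BKEY_VAL_ACCESSORS(extent, ...)
 * invocation above produces bkey_s_c_to_extent(), bkey_i_to_extent(),
 * extent_i_to_s_c() and friends, so a downcast from a generic key looks like:
 */
static inline const struct bch_extent *
example_extent_val(struct bkey_s_c k)
{
	/* asserts (under CONFIG_BCACHEFS_DEBUG) that k really is an extent: */
	struct bkey_s_c_extent e = bkey_s_c_to_extent(k);

	return e.v;
}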
/* byte order helpers */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return f->key_u64s - 1;
}
#define high_bit_offset 0
#define nth_word(p, n) ((p) - (n))
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
static inline unsigned high_word_offset(const struct bkey_format *f)
{
return 0;
}
#define high_bit_offset KEY_PACKED_BITS_START
#define nth_word(p, n) ((p) + (n))
#else
#error edit for your odd byteorder.
#endif
#define high_word(f, k) ((k)->_data + high_word_offset(f))
#define next_word(p) nth_word(p, 1)
#define prev_word(p) nth_word(p, -1)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_bkey_pack_test(void);
#else
static inline void bch2_bkey_pack_test(void) {}
#endif
#endif /* _BCACHEFS_BKEY_H */

192
fs/bcachefs/bkey_methods.c Normal file

@ -0,0 +1,192 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "quota.h"
#include "xattr.h"
const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = &bch2_bkey_ops[type];
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
return NULL;
case KEY_TYPE_ERROR:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
: NULL;
case KEY_TYPE_COOKIE:
return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
? "incorrect value size"
: NULL;
default:
if (k.k->type < KEY_TYPE_GENERIC_NR)
return "invalid type";
return ops->key_invalid(c, k);
}
}
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = &bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
if (k.k->p.snapshot)
return "nonzero snapshot";
if (type != BKEY_TYPE_BTREE &&
!bkey_cmp(k.k->p, POS_MAX))
return "POS_MAX key";
return NULL;
}
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
return __bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_val_invalid(c, type, k);
}
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
return "key before start of btree node";
if (bkey_cmp(k.k->p, b->data->max_key) > 0)
return "key past end of btree node";
return NULL;
}
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
enum bkey_type type = btree_node_type(b);
const struct bkey_ops *ops = &bch2_bkey_ops[type];
const char *invalid;
BUG_ON(!k.k->u64s);
invalid = bch2_bkey_invalid(c, type, k) ?:
bch2_bkey_in_btree_node(b, k);
if (invalid) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
return;
}
if (k.k->type >= KEY_TYPE_GENERIC_NR &&
ops->key_debugcheck)
ops->key_debugcheck(c, b, k);
}
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
char *out = buf, *end = buf + size;
p("u64s %u type %u ", k->u64s, k->type);
if (bkey_cmp(k->p, POS_MAX))
p("%llu:%llu", k->p.inode, k->p.offset);
else
p("POS_MAX");
p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
return out - buf;
}
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = &bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
switch (k.k->type) {
case KEY_TYPE_DELETED:
p(" deleted");
break;
case KEY_TYPE_DISCARD:
p(" discard");
break;
case KEY_TYPE_ERROR:
p(" error");
break;
case KEY_TYPE_COOKIE:
p(" cookie");
break;
default:
if (k.k->type >= KEY_TYPE_GENERIC_NR && ops->val_to_text)
ops->val_to_text(c, buf, size, k);
break;
}
return out - buf;
}
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
out += bch2_bkey_to_text(out, end - out, k.k);
out += scnprintf(out, end - out, ": ");
out += bch2_val_to_text(c, type, out, end - out, k);
return out - buf;
}
void bch2_bkey_swab(enum bkey_type type,
const struct bkey_format *f,
struct bkey_packed *k)
{
const struct bkey_ops *ops = &bch2_bkey_ops[type];
bch2_bkey_swab_key(f, k);
if (ops->swab)
ops->swab(f, k);
}

87
fs/bcachefs/bkey_methods.h Normal file

@ -0,0 +1,87 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BKEY_METHODS_H
#define _BCACHEFS_BKEY_METHODS_H
#include "bkey.h"
#define DEF_BTREE_ID(kwd, val, name) BKEY_TYPE_##kwd = val,
enum bkey_type {
DEFINE_BCH_BTREE_IDS()
BKEY_TYPE_BTREE,
};
#undef DEF_BTREE_ID
/* Type of a key in btree @id at level @level: */
static inline enum bkey_type bkey_type(unsigned level, enum btree_id id)
{
return level ? BKEY_TYPE_BTREE : (enum bkey_type) id;
}
static inline bool btree_type_has_ptrs(enum bkey_type type)
{
switch (type) {
case BKEY_TYPE_BTREE:
case BKEY_TYPE_EXTENTS:
return true;
default:
return false;
}
}
struct bch_fs;
struct btree;
struct bkey;
enum merge_result {
BCH_MERGE_NOMERGE,
/*
* The keys were mergeable, but would have overflowed size - so instead
* l was changed to the maximum size, and both keys were modified:
*/
BCH_MERGE_PARTIAL,
BCH_MERGE_MERGE,
};
typedef bool (*key_filter_fn)(struct bch_fs *, struct btree *,
struct bkey_s);
typedef enum merge_result (*key_merge_fn)(struct bch_fs *,
struct btree *,
struct bkey_i *, struct bkey_i *);
struct bkey_ops {
/* Returns reason for being invalid if invalid, else NULL: */
const char * (*key_invalid)(const struct bch_fs *,
struct bkey_s_c);
void (*key_debugcheck)(struct bch_fs *, struct btree *,
struct bkey_s_c);
void (*val_to_text)(struct bch_fs *, char *,
size_t, struct bkey_s_c);
void (*swab)(const struct bkey_format *, struct bkey_packed *);
key_filter_fn key_normalize;
key_merge_fn key_merge;
bool is_extents;
};
const char *bch2_bkey_val_invalid(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
const char *__bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_invalid(struct bch_fs *, enum bkey_type, struct bkey_s_c);
const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
int bch2_bkey_to_text(char *, size_t, const struct bkey *);
int bch2_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
char *, size_t, struct bkey_s_c);
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
extern const struct bkey_ops bch2_bkey_ops[];
#endif /* _BCACHEFS_BKEY_METHODS_H */

1849
fs/bcachefs/bset.c Normal file

File diff suppressed because it is too large

668
fs/bcachefs/bset.h Normal file

@ -0,0 +1,668 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BSET_H
#define _BCACHEFS_BSET_H
#include <linux/kernel.h>
#include <linux/types.h>
#include "bcachefs_format.h"
#include "bkey.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "util.h" /* for time_stats */
#include "vstructs.h"
/*
* BKEYS:
*
* A bkey contains a key, a size field, a variable number of pointers, and some
* ancillary flag bits.
*
* We use two different functions for validating bkeys, bkey_invalid and
* bkey_deleted().
*
* The one exception to the rule that ptr_invalid() filters out invalid keys is
* that it also filters out keys of size 0 - these are keys that have been
* completely overwritten. It'd be safe to delete these in memory while leaving
* them on disk, just unnecessary work - so we filter them out when resorting
* instead.
*
* We can't filter out stale keys when we're resorting, because garbage
* collection needs to find them to ensure bucket gens don't wrap around -
* unless we're rewriting the btree node those stale keys still exist on disk.
*
* We also implement functions here for removing some number of sectors from the
* front or the back of a bkey - this is mainly used for fixing overlapping
* extents, by removing the overlapping sectors from the older key.
*
* BSETS:
*
* A bset is an array of bkeys laid out contiguously in memory in sorted order,
* along with a header. A btree node is made up of a number of these, written at
* different times.
*
* There could be many of them on disk, but we never allow there to be more than
* 4 in memory - we lazily resort as needed.
*
* We implement code here for creating and maintaining auxiliary search trees
* (described below) for searching an individual bset, and on top of that we
* implement a btree iterator.
*
* BTREE ITERATOR:
*
* Most of the code in bcache doesn't care about an individual bset - it needs
* to search entire btree nodes and iterate over them in sorted order.
*
* The btree iterator code serves both functions; it iterates through the keys
* in a btree node in sorted order, starting from either keys after a specific
* point (if you pass it a search key) or the start of the btree node.
*
* AUXILIARY SEARCH TREES:
*
* Since keys are variable length, we can't use a binary search on a bset - we
* wouldn't be able to find the start of the next key. But binary searches are
* slow anyways, due to terrible cache behaviour; bcache originally used binary
* searches and that code topped out at under 50k lookups/second.
*
* So we need to construct some sort of lookup table. Since we only insert keys
* into the last (unwritten) set, most of the keys within a given btree node are
* usually in sets that are mostly constant. We use two different types of
* lookup tables to take advantage of this.
*
* Both lookup tables share in common that they don't index every key in the
* set; they index one key every BSET_CACHELINE bytes, and then a linear search
* is used for the rest.
*
* For sets that have been written to disk and are no longer being inserted
* into, we construct a binary search tree in an array - traversing a binary
* search tree in an array gives excellent locality of reference and is very
* fast, since both children of any node are adjacent to each other in memory
* (and their grandchildren, and great grandchildren...) - this means
* prefetching can be used to great effect.
*
* It's quite useful performance wise to keep these nodes small - not just
* because they're more likely to be in L2, but also because we can prefetch
* more nodes on a single cacheline and thus prefetch more iterations in advance
* when traversing this tree.
*
* Nodes in the auxiliary search tree must contain both a key to compare against
* (we don't want to fetch the key from the set, that would defeat the purpose),
* and a pointer to the key. We use a few tricks to compress both of these.
*
* To compress the pointer, we take advantage of the fact that one node in the
* search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
* a function (to_inorder()) that takes the index of a node in a binary tree and
* returns what its index would be in an inorder traversal, so we only have to
* store the low bits of the offset.
*
* The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
* compress that, we take advantage of the fact that when we're traversing the
* search tree at every iteration we know that both our search key and the key
* we're looking for lie within some range - bounded by our previous
* comparisons. (We special case the start of a search so that this is true even
* at the root of the tree).
*
* So if we know the key we're looking for is between a and b, and a and b don't
* differ higher than bit 50, we don't need to check anything higher than bit
* 50.
*
* We don't usually need the rest of the bits, either; we only need enough bits
* to partition the key range we're currently checking. Consider key n - the
* key our auxiliary search tree node corresponds to, and key p, the key
* immediately preceding n. The lowest bit we need to store in the auxiliary
* search tree is the highest bit that differs between n and p.
*
* Note that this could be bit 0 - we might sometimes need all 80 bits to do the
* comparison. But we'd really like our nodes in the auxiliary search tree to be
* of fixed size.
*
* The solution is to make them fixed size, and when we're constructing a node
* check if p and n differed in the bits we needed them to. If they don't we
* flag that node, and when doing lookups we fallback to comparing against the
* real key. As long as this doesn't happen too often (and it seems to reliably
* happen a bit less than 1% of the time), we win - even on failures, that key
* is then more likely to be in cache than if we were doing binary searches all
* the way, since we're touching so much less memory.
*
* The keys in the auxiliary search tree are stored in (software) floating
* point, with an exponent and a mantissa. The exponent needs to be big enough
* to address all the bits in the original key, but the number of bits in the
* mantissa is somewhat arbitrary; more bits just gets us fewer failures.
*
* We need 7 bits for the exponent and 3 bits for the key's offset (since keys
* are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
* We need one node per 128 bytes in the btree node, which means the auxiliary
* search trees take up 3% as much memory as the btree itself.
*
* Constructing these auxiliary search trees is moderately expensive, and we
* don't want to be constantly rebuilding the search tree for the last set
* whenever we insert another key into it. For the unwritten set, we use a much
* simpler lookup table - it's just a flat array, so index i in the lookup table
* corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
* within each byte range works the same as with the auxiliary search trees.
*
* These are much easier to keep up to date when we insert a key - we do it
* somewhat lazily; when we shift a key up we usually just increment the pointer
* to it, only when it would overflow do we go to the trouble of finding the
* first key in that range of bytes again.
*/
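/*
 * Worked example of the sizing described above, using the numbers already
 * quoted in this comment: each auxiliary search tree node packs a 7 bit
 * exponent + 3 bit key offset + 22 bit mantissa = 32 bits = 4 bytes, and with
 * one node per BSET_CACHELINE (128) bytes of keys the tree costs
 * 4 / 128 = ~3% of the btree node, which is the 3% figure above.
 */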
extern bool bch2_expensive_debug_checks;
static inline bool btree_keys_expensive_checks(const struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
return bch2_expensive_debug_checks || *b->expensive_debug_checks;
#else
return false;
#endif
}
enum bset_aux_tree_type {
BSET_NO_AUX_TREE,
BSET_RO_AUX_TREE,
BSET_RW_AUX_TREE,
};
#define BSET_TREE_NR_TYPES 3
#define BSET_NO_AUX_TREE_VAL (U16_MAX)
#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
{
switch (t->extra) {
case BSET_NO_AUX_TREE_VAL:
EBUG_ON(t->size);
return BSET_NO_AUX_TREE;
case BSET_RW_AUX_TREE_VAL:
EBUG_ON(!t->size);
return BSET_RW_AUX_TREE;
default:
EBUG_ON(!t->size);
return BSET_RO_AUX_TREE;
}
}
typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
static inline void
__bkey_unpack_key_format_checked(const struct btree *b,
struct bkey *dst,
const struct bkey_packed *src)
{
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
{
compiled_unpack_fn unpack_fn = b->aux_data;
unpack_fn(dst, src);
if (btree_keys_expensive_checks(b)) {
struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
/*
* hack around a harmless race when compacting whiteouts
* for a write:
*/
dst2.needs_whiteout = dst->needs_whiteout;
BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
}
}
#else
*dst = __bch2_bkey_unpack_key(&b->format, src);
#endif
}
static inline struct bkey
bkey_unpack_key_format_checked(const struct btree *b,
const struct bkey_packed *src)
{
struct bkey dst;
__bkey_unpack_key_format_checked(b, &dst, src);
return dst;
}
static inline void __bkey_unpack_key(const struct btree *b,
struct bkey *dst,
const struct bkey_packed *src)
{
if (likely(bkey_packed(src)))
__bkey_unpack_key_format_checked(b, dst, src);
else
*dst = *packed_to_bkey_c(src);
}
/**
* bkey_unpack_key -- unpack just the key, not the value
*/
static inline struct bkey bkey_unpack_key(const struct btree *b,
const struct bkey_packed *src)
{
return likely(bkey_packed(src))
? bkey_unpack_key_format_checked(b, src)
: *packed_to_bkey_c(src);
}
static inline struct bpos
bkey_unpack_pos_format_checked(const struct btree *b,
const struct bkey_packed *src)
{
#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
return bkey_unpack_key_format_checked(b, src).p;
#else
return __bkey_unpack_pos(&b->format, src);
#endif
}
static inline struct bpos bkey_unpack_pos(const struct btree *b,
const struct bkey_packed *src)
{
return likely(bkey_packed(src))
? bkey_unpack_pos_format_checked(b, src)
: packed_to_bkey_c(src)->p;
}
/* Disassembled bkeys */
static inline struct bkey_s_c bkey_disassemble(struct btree *b,
const struct bkey_packed *k,
struct bkey *u)
{
__bkey_unpack_key(b, u, k);
return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
}
/* non const version: */
static inline struct bkey_s __bkey_disassemble(struct btree *b,
struct bkey_packed *k,
struct bkey *u)
{
__bkey_unpack_key(b, u, k);
return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
}
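/*
 * Typical use of bkey_disassemble() (a sketch; the helper name is made up):
 * the caller supplies a struct bkey on the stack for the unpacked key, and
 * gets back a bkey_s_c whose value pointer still points into the btree node.
 */
static inline u64 example_disassembled_offset(struct btree *b,
					      const struct bkey_packed *src)
{
	struct bkey uk;
	struct bkey_s_c k = bkey_disassemble(b, src, &uk);

	return k.k->p.offset;
}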
#define for_each_bset(_b, _t) \
for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
}
static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
{
return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
}
static inline void bch2_bset_set_no_aux_tree(struct btree *b,
struct bset_tree *t)
{
BUG_ON(t < b->set);
for (; t < b->set + ARRAY_SIZE(b->set); t++) {
t->size = 0;
t->extra = BSET_NO_AUX_TREE_VAL;
t->aux_data_offset = U16_MAX;
}
}
static inline void btree_node_set_format(struct btree *b,
struct bkey_format f)
{
int len;
b->format = f;
b->nr_key_bits = bkey_format_key_bits(&f);
len = bch2_compile_bkey_format(&b->format, b->aux_data);
BUG_ON(len < 0 || len > U8_MAX);
b->unpack_fn_len = len;
bch2_bset_set_no_aux_tree(b, b->set);
}
static inline struct bset *bset_next_set(struct btree *b,
unsigned block_bytes)
{
struct bset *i = btree_bset_last(b);
EBUG_ON(!is_power_of_2(block_bytes));
return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
}
void bch2_btree_keys_free(struct btree *);
int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t);
void bch2_btree_keys_init(struct btree *, bool *);
void bch2_bset_init_first(struct btree *, struct bset *);
void bch2_bset_init_next(struct bch_fs *, struct btree *,
struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_fix_invalidated_key(struct btree *, struct bset_tree *,
struct bkey_packed *);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
struct bkey_packed *, struct bkey_i *, unsigned);
void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
/* Bkey utility code */
/* packed or unpacked */
static inline int bkey_cmp_p_or_unp(const struct btree *b,
const struct bkey_packed *l,
const struct bkey_packed *r_packed,
struct bpos *r)
{
EBUG_ON(r_packed && !bkey_packed(r_packed));
if (unlikely(!bkey_packed(l)))
return bkey_cmp(packed_to_bkey_c(l)->p, *r);
if (likely(r_packed))
return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
}
/* Returns true if @k is after iterator position @pos */
static inline bool btree_iter_pos_cmp_packed(const struct btree *b,
struct bpos *pos,
const struct bkey_packed *k,
bool strictly_greater)
{
int cmp = bkey_cmp_left_packed(b, k, pos);
return cmp > 0 ||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
}
static inline bool btree_iter_pos_cmp_p_or_unp(const struct btree *b,
struct bpos pos,
const struct bkey_packed *pos_packed,
const struct bkey_packed *k,
bool strictly_greater)
{
int cmp = bkey_cmp_p_or_unp(b, k, pos_packed, &pos);
return cmp > 0 ||
(cmp == 0 && !strictly_greater && !bkey_deleted(k));
}
struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
struct bkey_packed *, unsigned);
static inline struct bkey_packed *
bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
{
return bch2_bkey_prev_filter(b, t, k, 0);
}
static inline struct bkey_packed *
bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
{
return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_DISCARD + 1);
}
enum bch_extent_overlap {
BCH_EXTENT_OVERLAP_ALL = 0,
BCH_EXTENT_OVERLAP_BACK = 1,
BCH_EXTENT_OVERLAP_FRONT = 2,
BCH_EXTENT_OVERLAP_MIDDLE = 3,
};
/* Returns how k overlaps with m */
static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
const struct bkey *m)
{
int cmp1 = bkey_cmp(k->p, m->p) < 0;
int cmp2 = bkey_cmp(bkey_start_pos(k),
bkey_start_pos(m)) > 0;
return (cmp1 << 1) + cmp2;
}
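/*
 * Worked example (illustrative): treating extents as half open ranges
 * [start, p), take k = 5..15 and m = 0..10.  Then bkey_cmp(k->p, m->p) >= 0
 * so cmp1 = 0, and bkey_start_pos(k) > bkey_start_pos(m) so cmp2 = 1, giving
 * (0 << 1) + 1 = BCH_EXTENT_OVERLAP_BACK: k overlaps the back of m.
 * Swapping k and m gives BCH_EXTENT_OVERLAP_FRONT.
 */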
/* Btree key iteration */
static inline void __bch2_btree_node_iter_init(struct btree_node_iter *iter,
bool is_extents)
{
iter->is_extents = is_extents;
memset(iter->data, 0, sizeof(iter->data));
}
void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
const struct bkey_packed *,
const struct bkey_packed *);
void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
struct bpos, bool, bool);
void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
struct btree *, bool);
struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
struct btree *,
struct bset_tree *);
void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
struct btree_node_iter_set *);
void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
#define btree_node_iter_for_each(_iter, _set) \
for (_set = (_iter)->data; \
_set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
(_set)->k != (_set)->end; \
_set++)
static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
unsigned i)
{
return iter->data[i].k == iter->data[i].end;
}
static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
{
return __btree_node_iter_set_end(iter, 0);
}
static inline int __btree_node_iter_cmp(bool is_extents,
struct btree *b,
struct bkey_packed *l,
struct bkey_packed *r)
{
/*
* For non extents, when keys compare equal the deleted keys have to
* come first - so that bch2_btree_node_iter_next_check() can detect
* duplicate nondeleted keys (and possibly other reasons?)
*
* For extents, bkey_deleted() is used as a proxy for k->size == 0, so
* deleted keys have to sort last.
*/
return bkey_cmp_packed(b, l, r)
?: (is_extents
? (int) bkey_deleted(l) - (int) bkey_deleted(r)
: (int) bkey_deleted(r) - (int) bkey_deleted(l))
?: (l > r) - (l < r);
}
static inline int btree_node_iter_cmp(struct btree_node_iter *iter,
struct btree *b,
struct btree_node_iter_set l,
struct btree_node_iter_set r)
{
return __btree_node_iter_cmp(iter->is_extents, b,
__btree_node_offset_to_key(b, l.k),
__btree_node_offset_to_key(b, r.k));
}
static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
struct btree *b,
const struct bkey_packed *k,
const struct bkey_packed *end)
{
if (k != end) {
struct btree_node_iter_set *pos;
btree_node_iter_for_each(iter, pos)
;
BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
*pos = (struct btree_node_iter_set) {
__btree_node_key_to_offset(b, k),
__btree_node_key_to_offset(b, end)
};
}
}
static inline struct bkey_packed *
__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
struct btree *b)
{
return __btree_node_offset_to_key(b, iter->data->k);
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter,
struct btree *b,
unsigned min_key_type)
{
while (!bch2_btree_node_iter_end(iter)) {
struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b);
if (k->type >= min_key_type)
return k;
bch2_btree_node_iter_advance(iter, b);
}
return NULL;
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
struct btree *b)
{
return bch2_btree_node_iter_peek_filter(iter, b, 0);
}
static inline struct bkey_packed *
bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
{
return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_DISCARD + 1);
}
static inline struct bkey_packed *
bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
{
struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
if (ret)
bch2_btree_node_iter_advance(iter, b);
return ret;
}
struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *,
struct btree *, unsigned);
static inline struct bkey_packed *
bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree *b)
{
return bch2_btree_node_iter_prev_filter(iter, b, 0);
}
static inline struct bkey_packed *
bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b)
{
return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_DISCARD + 1);
}
/*
* Iterates over all _live_ keys - skipping deleted (and potentially
* overlapping) keys
*/
#define for_each_btree_node_key(b, k, iter, _is_extents) \
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
((k) = bch2_btree_node_iter_peek(iter, b)); \
bch2_btree_node_iter_advance(iter, b))
struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
struct btree *,
struct bkey *);
#define for_each_btree_node_key_unpack(b, k, iter, _is_extents, unpacked)\
for (bch2_btree_node_iter_init_from_start((iter), (b), (_is_extents));\
(k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
bch2_btree_node_iter_advance(iter, b))
/* Accounting: */
static inline void btree_keys_account_key(struct btree_nr_keys *n,
unsigned bset,
struct bkey_packed *k,
int sign)
{
n->live_u64s += k->u64s * sign;
n->bset_u64s[bset] += k->u64s * sign;
if (bkey_packed(k))
n->packed_keys += sign;
else
n->unpacked_keys += sign;
}
#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, 1)
#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
btree_keys_account_key(_nr, _bset_idx, _k, -1)
struct bset_stats {
struct {
size_t nr, bytes;
} sets[BSET_TREE_NR_TYPES];
size_t floats;
size_t failed_unpacked;
size_t failed_prev;
size_t failed_overflow;
};
void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
int bch2_bkey_print_bfloat(struct btree *, struct bkey_packed *,
char *, size_t);
/* Debug stuff */
void bch2_dump_bset(struct btree *, struct bset *, unsigned);
void bch2_dump_btree_node(struct btree *);
void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
#ifdef CONFIG_BCACHEFS_DEBUG
void __bch2_verify_btree_nr_keys(struct btree *);
void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
void bch2_verify_key_order(struct btree *, struct btree_node_iter *,
struct bkey_packed *);
#else
static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
struct btree *b) {}
static inline void bch2_verify_key_order(struct btree *b,
struct btree_node_iter *iter,
struct bkey_packed *where) {}
#endif
static inline void bch2_verify_btree_nr_keys(struct btree *b)
{
if (btree_keys_expensive_checks(b))
__bch2_verify_btree_nr_keys(b);
}
#endif /* _BCACHEFS_BSET_H */

941
fs/bcachefs/btree_cache.c Normal file

@ -0,0 +1,941 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "trace.h"
#include <linux/prefetch.h>
#define DEF_BTREE_ID(kwd, val, name) name,
const char * const bch2_btree_ids[] = {
DEFINE_BCH_BTREE_IDS()
NULL
};
#undef DEF_BTREE_ID
void bch2_recalc_btree_reserve(struct bch_fs *c)
{
unsigned i, reserve = 16;
if (!c->btree_roots[0].b)
reserve += 8;
for (i = 0; i < BTREE_ID_NR; i++)
if (c->btree_roots[i].b)
reserve += min_t(unsigned, 1,
c->btree_roots[i].b->level) * 8;
c->btree_cache.reserve = reserve;
}
static inline unsigned btree_cache_can_free(struct btree_cache *bc)
{
return max_t(int, 0, bc->used - bc->reserve);
}
static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
{
EBUG_ON(btree_node_write_in_flight(b));
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
bch2_btree_keys_free(b);
}
static void btree_node_data_free(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
__btree_node_data_free(c, b);
bc->used--;
list_move(&b->list, &bc->freed);
}
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct btree *b = obj;
const u64 *v = arg->key;
return PTR_HASH(&b->key) == *v ? 0 : 1;
}
static const struct rhashtable_params bch_btree_cache_params = {
.head_offset = offsetof(struct btree, hash),
.key_offset = offsetof(struct btree, key.v),
.key_len = sizeof(struct bch_extent_ptr),
.obj_cmpfn = bch2_btree_cache_cmp_fn,
};
static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
struct btree_cache *bc = &c->btree_cache;
b->data = kvpmalloc(btree_bytes(c), gfp);
if (!b->data)
goto err;
if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp))
goto err;
bc->used++;
list_move(&b->list, &bc->freeable);
return;
err:
kvpfree(b->data, btree_bytes(c));
b->data = NULL;
list_move(&b->list, &bc->freed);
}
static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
{
struct btree *b = kzalloc(sizeof(struct btree), gfp);
if (!b)
return NULL;
bkey_extent_init(&b->key);
six_lock_init(&b->lock);
lockdep_set_novalidate_class(&b->lock);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
btree_node_data_alloc(c, b, gfp);
return b->data ? b : NULL;
}
/* Btree in memory cache - hash table */
void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
/* Cause future lookups for this node to fail: */
bkey_i_to_extent(&b->key)->v._data[0] = 0;
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
}
int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
unsigned level, enum btree_id id)
{
int ret;
b->level = level;
b->btree_id = id;
mutex_lock(&bc->lock);
ret = __bch2_btree_node_hash_insert(bc, b);
if (!ret)
list_add(&b->list, &bc->live);
mutex_unlock(&bc->lock);
return ret;
}
__flatten
static inline struct btree *btree_cache_find(struct btree_cache *bc,
const struct bkey_i *k)
{
return rhashtable_lookup_fast(&bc->table, &PTR_HASH(k),
bch_btree_cache_params);
}
/*
* this version is for btree nodes that have already been freed (we're not
* reaping a real btree node)
*/
static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
{
struct btree_cache *bc = &c->btree_cache;
int ret = 0;
lockdep_assert_held(&bc->lock);
if (!six_trylock_intent(&b->lock))
return -ENOMEM;
if (!six_trylock_write(&b->lock))
goto out_unlock_intent;
if (btree_node_noevict(b))
goto out_unlock;
if (!btree_node_may_write(b))
goto out_unlock;
if (btree_node_dirty(b) ||
btree_node_write_in_flight(b) ||
btree_node_read_in_flight(b)) {
if (!flush)
goto out_unlock;
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
/*
* Using the underscore version because we don't want to compact
* bsets after the write, since this node is about to be evicted
* - unless btree verify mode is enabled, since it runs out of
* the post write cleanup:
*/
if (verify_btree_ondisk(c))
bch2_btree_node_write(c, b, SIX_LOCK_intent);
else
__bch2_btree_node_write(c, b, SIX_LOCK_read);
/* wait for any in flight btree write */
btree_node_wait_on_io(b);
}
out:
if (PTR_HASH(&b->key) && !ret)
trace_btree_node_reap(c, b);
return ret;
out_unlock:
six_unlock_write(&b->lock);
out_unlock_intent:
six_unlock_intent(&b->lock);
ret = -ENOMEM;
goto out;
}
static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
{
return __btree_node_reclaim(c, b, false);
}
static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
{
return __btree_node_reclaim(c, b, true);
}
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
unsigned long can_free;
unsigned long touched = 0;
unsigned long freed = 0;
unsigned i;
if (btree_shrinker_disabled(c))
return SHRINK_STOP;
/* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_IO)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
return -1;
/*
* It's _really_ critical that we don't free too many btree nodes - we
* have to always leave ourselves a reserve. The reserve is how we
* guarantee that allocating memory for a new btree node can always
* succeed, so that inserting keys into the btree can always succeed and
* IO can always make forward progress:
*/
nr /= btree_pages(c);
can_free = btree_cache_can_free(bc);
nr = min_t(unsigned long, nr, can_free);
i = 0;
list_for_each_entry_safe(b, t, &bc->freeable, list) {
touched++;
if (freed >= nr)
break;
if (++i > 3 &&
!btree_node_reclaim(c, b)) {
btree_node_data_free(c, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
freed++;
}
}
restart:
list_for_each_entry_safe(b, t, &bc->live, list) {
touched++;
if (freed >= nr) {
/* Save position */
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
break;
}
if (!btree_node_accessed(b) &&
!btree_node_reclaim(c, b)) {
/* can't call bch2_btree_node_hash_remove under lock */
freed++;
if (&t->list != &bc->live)
list_move_tail(&bc->live, &t->list);
btree_node_data_free(c, b);
mutex_unlock(&bc->lock);
bch2_btree_node_hash_remove(bc, b);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
if (freed >= nr)
goto out;
if (sc->gfp_mask & __GFP_IO)
mutex_lock(&bc->lock);
else if (!mutex_trylock(&bc->lock))
goto out;
goto restart;
} else
clear_btree_node_accessed(b);
}
mutex_unlock(&bc->lock);
out:
return (unsigned long) freed * btree_pages(c);
}
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct btree_cache *bc = &c->btree_cache;
if (btree_shrinker_disabled(c))
return 0;
return btree_cache_can_free(bc) * btree_pages(c);
}
void bch2_fs_btree_cache_exit(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
unsigned i;
if (bc->shrink.list.next)
unregister_shrinker(&bc->shrink);
mutex_lock(&bc->lock);
#ifdef CONFIG_BCACHEFS_DEBUG
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
kvpfree(c->verify_ondisk, btree_bytes(c));
#endif
for (i = 0; i < BTREE_ID_NR; i++)
if (c->btree_roots[i].b)
list_add(&c->btree_roots[i].b->list, &bc->live);
list_splice(&bc->freeable, &bc->live);
while (!list_empty(&bc->live)) {
b = list_first_entry(&bc->live, struct btree, list);
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
if (btree_node_dirty(b))
bch2_btree_complete_write(c, b, btree_current_write(b));
clear_btree_node_dirty(b);
btree_node_data_free(c, b);
}
while (!list_empty(&bc->freed)) {
b = list_first_entry(&bc->freed, struct btree, list);
list_del(&b->list);
kfree(b);
}
mutex_unlock(&bc->lock);
if (bc->table_init_done)
rhashtable_destroy(&bc->table);
}
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
unsigned i;
int ret = 0;
pr_verbose_init(c->opts, "");
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
goto out;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
ret = -ENOMEM;
goto out;
}
list_splice_init(&bc->live, &bc->freeable);
#ifdef CONFIG_BCACHEFS_DEBUG
mutex_init(&c->verify_lock);
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
if (!c->verify_ondisk) {
ret = -ENOMEM;
goto out;
}
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
if (!c->verify_data) {
ret = -ENOMEM;
goto out;
}
list_del_init(&c->verify_data->list);
#endif
bc->shrink.count_objects = bch2_btree_cache_count;
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
register_shrinker(&bc->shrink, "%s/btree_cache", c->name);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
mutex_init(&bc->lock);
INIT_LIST_HEAD(&bc->live);
INIT_LIST_HEAD(&bc->freeable);
INIT_LIST_HEAD(&bc->freed);
}
/*
* We can only have one thread cannibalizing other cached btree nodes at a time,
* or we'll deadlock. We use an open coded mutex to ensure that, which
* bch2_btree_cache_cannibalize_lock() will take. This means every time we
* unlock the root of the btree, we need to release this lock if we have it
* held.
*/
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
trace_btree_node_cannibalize_unlock(c);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
{
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current)
goto success;
if (!cl) {
trace_btree_node_cannibalize_lock_fail(c);
return -ENOMEM;
}
closure_wait(&bc->alloc_wait, cl);
/* Try again, after adding ourselves to waitlist */
old = cmpxchg(&bc->alloc_lock, NULL, current);
if (old == NULL || old == current) {
/* We raced */
closure_wake_up(&bc->alloc_wait);
goto success;
}
trace_btree_node_cannibalize_lock_fail(c);
return -EAGAIN;
success:
trace_btree_node_cannibalize_lock(c);
return 0;
}
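/*
 * Sketch of the intended call pattern (illustrative; the real callers live in
 * the btree interior update code, and may propagate -EAGAIN to their caller
 * rather than blocking):
 *
 *	struct closure cl;
 *
 *	closure_init_stack(&cl);
 *	while (bch2_btree_cache_cannibalize_lock(c, &cl))
 *		closure_sync(&cl);
 *
 *	b = bch2_btree_node_mem_alloc(c);
 *
 *	bch2_btree_cache_cannibalize_unlock(c);
 */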
static struct btree *btree_node_cannibalize(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_reclaim(c, b))
return b;
while (1) {
list_for_each_entry_reverse(b, &bc->live, list)
if (!btree_node_write_and_reclaim(c, b))
return b;
/*
* Rare case: all nodes were intent-locked.
* Just busy-wait.
*/
WARN_ONCE(1, "btree cache cannibalize failed\n");
cond_resched();
}
}
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
u64 start_time = local_clock();
mutex_lock(&bc->lock);
/*
* btree_free() doesn't free memory; it sticks the node on the end of
* the list. Check if there's any freed nodes there:
*/
list_for_each_entry(b, &bc->freeable, list)
if (!btree_node_reclaim(c, b))
goto out_unlock;
/*
* We never free struct btree itself, just the memory that holds the on
* disk node. Check the freed list before allocating a new one:
*/
list_for_each_entry(b, &bc->freed, list)
if (!btree_node_reclaim(c, b)) {
btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_NOIO);
if (b->data)
goto out_unlock;
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
goto err;
}
b = btree_node_mem_alloc(c, __GFP_NOWARN|GFP_NOIO);
if (!b)
goto err;
BUG_ON(!six_trylock_intent(&b->lock));
BUG_ON(!six_trylock_write(&b->lock));
out_unlock:
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_write_in_flight(b));
list_del_init(&b->list);
mutex_unlock(&bc->lock);
out:
b->flags = 0;
b->written = 0;
b->nsets = 0;
b->sib_u64s[0] = 0;
b->sib_u64s[1] = 0;
b->whiteout_u64s = 0;
b->uncompacted_whiteout_u64s = 0;
bch2_btree_keys_init(b, &c->expensive_debug_checks);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
start_time);
return b;
err:
/* Try to cannibalize another cached btree node: */
if (bc->alloc_lock == current) {
b = btree_node_cannibalize(c);
list_del_init(&b->list);
mutex_unlock(&bc->lock);
bch2_btree_node_hash_remove(bc, b);
trace_btree_node_cannibalize(c);
goto out;
}
mutex_unlock(&bc->lock);
return ERR_PTR(-ENOMEM);
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_iter *iter,
const struct bkey_i *k,
unsigned level,
enum six_lock_type lock_type,
bool sync)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
/*
* Parent node must be locked, else we could read in a btree node that's
* been freed:
*/
BUG_ON(!btree_node_locked(iter, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
b = bch2_btree_node_mem_alloc(c);
if (IS_ERR(b))
return b;
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */
bkey_i_to_extent(&b->key)->v._data[0] = 0;
mutex_lock(&bc->lock);
list_add(&b->list, &bc->freeable);
mutex_unlock(&bc->lock);
six_unlock_write(&b->lock);
six_unlock_intent(&b->lock);
return NULL;
}
/*
* If the btree node wasn't cached, we can't drop our lock on
* the parent until after it's added to the cache - because
* otherwise we could race with a btree_split() freeing the node
* we're trying to lock.
*
* But the deadlock described below doesn't exist in this case,
* so it's safe to not drop the parent lock until here:
*/
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
bch2_btree_node_read(c, b, sync);
six_unlock_write(&b->lock);
if (!sync) {
six_unlock_intent(&b->lock);
return NULL;
}
if (lock_type == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
return b;
}
/**
* bch2_btree_node_get - find a btree node in the cache and lock it, reading it
* in from disk if necessary.
*
* If IO is necessary and running under generic_make_request, returns -EAGAIN.
*
* The btree node will have either a read or an intent lock held, depending on
* the @lock_type parameter.
*/
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
enum six_lock_type lock_type,
bool may_drop_locks)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
struct bset_tree *t;
/*
* XXX: locking optimization
*
* we can make the locking looser here - caller can drop lock on parent
* node before locking child node (and potentially blocking): we just
* have to have bch2_btree_node_fill() call relock on the parent and
* return -EINTR if that fails
*/
EBUG_ON(!btree_node_locked(iter, level + 1));
EBUG_ON(level >= BTREE_MAX_DEPTH);
retry:
rcu_read_lock();
b = btree_cache_find(bc, k);
rcu_read_unlock();
if (unlikely(!b)) {
/*
* We must have the parent locked to call bch2_btree_node_fill(),
* else we could read in a btree node from disk that's been
* freed:
*/
b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
/* We raced and found the btree node in the cache */
if (!b)
goto retry;
if (IS_ERR(b))
return b;
} else {
/*
* There's a potential deadlock with splits and insertions into
* interior nodes we have to avoid:
*
* The other thread might be holding an intent lock on the node
* we want, and they want to update its parent node so they're
* going to upgrade their intent lock on the parent node to a
* write lock.
*
* But if we're holding a read lock on the parent, and we're
* trying to get the intent lock they're holding, we deadlock.
*
* So to avoid this we drop the read locks on parent nodes when
* we're starting to take intent locks - and handle the race.
*
* The race is that they might be about to free the node we
* want, and dropping our read lock on the parent node lets them
* update the parent marking the node we want as freed, and then
* free it:
*
* To guard against this, btree nodes are evicted from the cache
* when they're freed - and PTR_HASH() is zeroed out, which we
* check for after we lock the node.
*
* Then, bch2_btree_node_relock() on the parent will fail - because
* the parent was modified, when the pointer to the node we want
* was removed - and we'll bail out:
*/
if (btree_node_read_locked(iter, level + 1))
btree_node_unlock(iter, level + 1);
if (!btree_node_lock(b, k->k.p, level, iter,
lock_type, may_drop_locks))
return ERR_PTR(-EINTR);
if (unlikely(PTR_HASH(&b->key) != PTR_HASH(k) ||
b->level != level ||
race_fault())) {
six_unlock_type(&b->lock, lock_type);
if (bch2_btree_node_relock(iter, level + 1))
goto retry;
return ERR_PTR(-EINTR);
}
}
wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
TASK_UNINTERRUPTIBLE);
prefetch(b->aux_data);
for_each_bset(b, t) {
void *p = (u64 *) b->aux_data + t->aux_data_offset;
prefetch(p + L1_CACHE_BYTES * 0);
prefetch(p + L1_CACHE_BYTES * 1);
prefetch(p + L1_CACHE_BYTES * 2);
}
/* avoid atomic set bit if it's not needed: */
if (!btree_node_accessed(b))
set_btree_node_accessed(b);
if (unlikely(btree_node_read_error(b))) {
six_unlock_type(&b->lock, lock_type);
return ERR_PTR(-EIO);
}
EBUG_ON(b->btree_id != iter->btree_id ||
BTREE_NODE_LEVEL(b->data) != level ||
bkey_cmp(b->data->max_key, k->k.p));
return b;
}
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
bool may_drop_locks,
enum btree_node_sibling sib)
{
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
struct btree *ret = NULL;
unsigned level = b->level;
parent = btree_iter_node(iter, level + 1);
if (!parent)
return NULL;
if (!bch2_btree_node_relock(iter, level + 1))
goto out_upgrade;
node_iter = iter->l[parent->level].iter;
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
k = sib == btree_prev_sib
? bch2_btree_node_iter_prev(&node_iter, parent)
: (bch2_btree_node_iter_advance(&node_iter, parent),
bch2_btree_node_iter_peek(&node_iter, parent));
if (!k)
goto out;
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
SIX_LOCK_intent, may_drop_locks);
if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
struct btree_iter *linked;
if (!bch2_btree_node_relock(iter, level + 1))
goto out_upgrade;
/*
* We might have got -EINTR because trylock failed, and we're
* holding other locks that would cause us to deadlock:
*/
for_each_linked_btree_iter(iter, linked)
if (btree_iter_cmp(iter, linked) < 0)
__bch2_btree_iter_unlock(linked);
if (sib == btree_prev_sib)
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level,
SIX_LOCK_intent, may_drop_locks);
/*
* before btree_iter_relock() calls btree_iter_verify_locks():
*/
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
btree_node_unlock(iter, level + 1);
if (!bch2_btree_node_relock(iter, level)) {
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
if (!IS_ERR(ret)) {
six_unlock_intent(&ret->lock);
ret = ERR_PTR(-EINTR);
}
}
bch2_btree_iter_relock(iter);
}
out:
if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
btree_node_unlock(iter, level + 1);
bch2_btree_iter_verify_locks(iter);
BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
(iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
!btree_node_locked(iter, level)));
if (!IS_ERR_OR_NULL(ret)) {
struct btree *n1 = ret, *n2 = b;
if (sib != btree_prev_sib)
swap(n1, n2);
BUG_ON(bkey_cmp(btree_type_successor(n1->btree_id,
n1->key.k.p),
n2->data->min_key));
}
return ret;
out_upgrade:
if (may_drop_locks)
bch2_btree_iter_upgrade(iter, level + 2, true);
ret = ERR_PTR(-EINTR);
goto out;
}
void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level)
{
struct btree_cache *bc = &c->btree_cache;
struct btree *b;
BUG_ON(!btree_node_locked(iter, level + 1));
BUG_ON(level >= BTREE_MAX_DEPTH);
rcu_read_lock();
b = btree_cache_find(bc, k);
rcu_read_unlock();
if (b)
return;
bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
}
int bch2_print_btree_node(struct bch_fs *c, struct btree *b,
char *buf, size_t len)
{
const struct bkey_format *f = &b->format;
struct bset_stats stats;
char ptrs[100];
memset(&stats, 0, sizeof(stats));
bch2_val_to_text(c, BKEY_TYPE_BTREE, ptrs, sizeof(ptrs),
bkey_i_to_s_c(&b->key));
bch2_btree_keys_stats(b, &stats);
return scnprintf(buf, len,
"l %u %llu:%llu - %llu:%llu:\n"
" ptrs: %s\n"
" format: u64s %u fields %u %u %u %u %u\n"
" unpack fn len: %u\n"
" bytes used %zu/%zu (%zu%% full)\n"
" sib u64s: %u, %u (merge threshold %zu)\n"
" nr packed keys %u\n"
" nr unpacked keys %u\n"
" floats %zu\n"
" failed unpacked %zu\n"
" failed prev %zu\n"
" failed overflow %zu\n",
b->level,
b->data->min_key.inode,
b->data->min_key.offset,
b->data->max_key.inode,
b->data->max_key.offset,
ptrs,
f->key_u64s,
f->bits_per_field[0],
f->bits_per_field[1],
f->bits_per_field[2],
f->bits_per_field[3],
f->bits_per_field[4],
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
btree_bytes(c) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
BTREE_FOREGROUND_MERGE_THRESHOLD(c),
b->nr.packed_keys,
b->nr.unpacked_keys,
stats.floats,
stats.failed_unpacked,
stats.failed_prev,
stats.failed_overflow);
}

91
fs/bcachefs/btree_cache.h Normal file

@ -0,0 +1,91 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_CACHE_H
#define _BCACHEFS_BTREE_CACHE_H
#include "bcachefs.h"
#include "btree_types.h"
#include "extents.h"
struct btree_iter;
extern const char * const bch2_btree_ids[];
void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
enum six_lock_type, bool);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *, bool,
enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned);
void bch2_fs_btree_cache_exit(struct bch_fs *);
int bch2_fs_btree_cache_init(struct bch_fs *);
void bch2_fs_btree_cache_init_early(struct btree_cache *);
#define PTR_HASH(_k) (bkey_i_to_extent_c(_k)->v._data[0])
/* is btree node in hash table? */
static inline bool btree_node_hashed(struct btree *b)
{
return bkey_extent_is_data(&b->key.k) && PTR_HASH(&b->key);
}
#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
&(_c)->btree_cache.table), \
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
static inline size_t btree_bytes(struct bch_fs *c)
{
return c->opts.btree_node_size << 9;
}
static inline size_t btree_max_u64s(struct bch_fs *c)
{
return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
}
static inline size_t btree_page_order(struct bch_fs *c)
{
return get_order(btree_bytes(c));
}
static inline size_t btree_pages(struct bch_fs *c)
{
return 1 << btree_page_order(c);
}
static inline unsigned btree_blocks(struct bch_fs *c)
{
return c->opts.btree_node_size >> c->block_bits;
}
#define BTREE_SPLIT_THRESHOLD(c) (btree_blocks(c) * 3 / 4)
#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
(BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2))
#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->btree_id].b)
int bch2_print_btree_node(struct bch_fs *, struct btree *,
char *, size_t);
#endif /* _BCACHEFS_BTREE_CACHE_H */
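The sizing helpers above are plain arithmetic on opts.btree_node_size, which is given in 512-byte sectors. Below is a minimal user-space sketch of that arithmetic; the 256KiB node size and the 64-byte constant standing in for sizeof(struct btree_node) are assumptions for illustration only, not values taken from this commit.

/* Standalone model of btree_bytes()/btree_max_u64s() above; values illustrative. */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define ASSUMED_NODE_HEADER_BYTES 64	/* placeholder for sizeof(struct btree_node) */

static size_t model_btree_bytes(unsigned btree_node_size_sectors)
{
	/* opts.btree_node_size is in 512-byte sectors, hence << 9 */
	return (size_t) btree_node_size_sectors << 9;
}

static size_t model_btree_max_u64s(unsigned btree_node_size_sectors)
{
	return (model_btree_bytes(btree_node_size_sectors) -
		ASSUMED_NODE_HEADER_BYTES) / sizeof(uint64_t);
}

int main(void)
{
	unsigned sectors = 512;			/* assumed 256KiB node */
	size_t max_u64s = model_btree_max_u64s(sectors);

	printf("node bytes:      %zu\n", model_btree_bytes(sectors));
	printf("max u64s:        %zu\n", max_u64s);
	/* BTREE_FOREGROUND_MERGE_THRESHOLD(c) is one third of max u64s: */
	printf("merge threshold: %zu u64s\n", max_u64s / 3);
	return 0;
}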

1099
fs/bcachefs/btree_gc.c Normal file

File diff suppressed because it is too large

113
fs/bcachefs/btree_gc.h Normal file

@ -0,0 +1,113 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_GC_H
#define _BCACHEFS_BTREE_GC_H
#include "btree_types.h"
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
void bch2_gc(struct bch_fs *);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c);
int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type,
struct bkey_s_c);
void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
/*
* For concurrent mark and sweep (with other index updates), we define a total
* ordering of _all_ references GC walks:
*
* Note that some references will have the same GC position as others - e.g.
* everything within the same btree node; in those cases we're relying on
* whatever locking exists for where those references live, i.e. the write lock
* on a btree node.
*
* That locking is also required to ensure GC doesn't pass the updater in
* between the updater adding/removing the reference and updating the GC marks;
* without that, we would at best double count sometimes.
*
* That part is important - whenever calling bch2_mark_pointers(), a lock _must_
* be held that prevents GC from passing the position the updater is at.
*
* (What about the start of gc, when we're clearing all the marks? GC clears the
* mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
* position inside its cmpxchg loop, so crap magically works).
*/
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
return (struct gc_pos) {
.phase = phase,
.pos = POS_MIN,
.level = 0,
};
}
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
if (l.phase != r.phase)
return l.phase < r.phase ? -1 : 1;
if (bkey_cmp(l.pos, r.pos))
return bkey_cmp(l.pos, r.pos);
if (l.level != r.level)
return l.level < r.level ? -1 : 1;
return 0;
}
static inline struct gc_pos gc_pos_btree(enum btree_id id,
struct bpos pos, unsigned level)
{
return (struct gc_pos) {
.phase = GC_PHASE_BTREE_EXTENTS + id,
.pos = pos,
.level = level,
};
}
/*
* GC position of the pointers within a btree node: note, _not_ for &b->key
* itself, that lives in the parent node:
*/
static inline struct gc_pos gc_pos_btree_node(struct btree *b)
{
return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
}
/*
* GC position of the pointer to a btree root: we don't use
* gc_pos_pointer_to_btree_node() here to avoid a potential race with
* btree_split() increasing the tree depth - the new root will have level > the
* old root and thus have a greater gc position than the old root, but that
* would be incorrect since once gc has marked the root it's not coming back.
*/
static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
{
return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
}
static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
{
return (struct gc_pos) {
.phase = GC_PHASE_ALLOC,
.pos = POS(ob ? ob - c->open_buckets : 0, 0),
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(c->gc_pos, pos) < 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;
}
#endif /* _BCACHEFS_BTREE_GC_H */
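gc_pos_cmp() above defines a lexicographic (phase, pos, level) ordering, and gc_will_visit() reads c->gc_pos under a seqcount to decide whether GC has already passed a position. The standalone sketch below models only the comparison; the model_* types and the example positions are invented stand-ins for the kernel structures.

/* Standalone model of the (phase, pos, level) ordering used by gc_pos_cmp(). */
#include <stdio.h>
#include <stdint.h>

struct model_bpos	{ uint64_t inode, offset; };
struct model_gc_pos	{ unsigned phase; struct model_bpos pos; unsigned level; };

static int model_bkey_cmp(struct model_bpos l, struct model_bpos r)
{
	if (l.inode != r.inode)
		return l.inode < r.inode ? -1 : 1;
	if (l.offset != r.offset)
		return l.offset < r.offset ? -1 : 1;
	return 0;
}

static int model_gc_pos_cmp(struct model_gc_pos l, struct model_gc_pos r)
{
	if (l.phase != r.phase)
		return l.phase < r.phase ? -1 : 1;
	if (model_bkey_cmp(l.pos, r.pos))
		return model_bkey_cmp(l.pos, r.pos);
	if (l.level != r.level)
		return l.level < r.level ? -1 : 1;
	return 0;
}

int main(void)
{
	struct model_gc_pos updater = { .phase = 2, .pos = { 4096, 0 }, .level = 0 };
	struct model_gc_pos gc      = { .phase = 2, .pos = { 1024, 0 }, .level = 0 };

	/* gc hasn't reached the updater's position yet, so GC will still visit it: */
	printf("gc will visit updater's position: %d\n",
	       model_gc_pos_cmp(gc, updater) < 0);
	return 0;
}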

2095
fs/bcachefs/btree_io.c Normal file

File diff suppressed because it is too large

197
fs/bcachefs/btree_io.h Normal file

@ -0,0 +1,197 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_IO_H
#define _BCACHEFS_BTREE_IO_H
#include "bset.h"
#include "extents.h"
#include "io_types.h"
struct bch_fs;
struct btree_write;
struct btree;
struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
u64 start_time;
unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;
};
struct btree_write_bio {
void *data;
struct work_struct work;
struct bch_write_bio wbio;
};
static inline void btree_node_io_unlock(struct btree *b)
{
EBUG_ON(!btree_node_write_in_flight(b));
clear_btree_node_write_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
}
static inline void btree_node_io_lock(struct btree *b)
{
wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
static inline void btree_node_wait_on_io(struct btree *b)
{
wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
TASK_UNINTERRUPTIBLE);
}
static inline bool btree_node_may_write(struct btree *b)
{
return list_empty_careful(&b->write_blocked) &&
!b->will_make_reachable;
}
enum compact_mode {
COMPACT_LAZY,
COMPACT_WRITTEN,
COMPACT_WRITTEN_NO_WRITE_LOCK,
};
bool __bch2_compact_whiteouts(struct bch_fs *, struct btree *, enum compact_mode);
static inline unsigned should_compact_bset_lazy(struct btree *b, struct bset_tree *t)
{
unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
return dead_u64s > 128 && dead_u64s * 3 > bset_u64s;
}
static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t;
for_each_bset(b, t)
if (should_compact_bset_lazy(b, t))
return __bch2_compact_whiteouts(c, b, COMPACT_LAZY);
return false;
}
void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct bch_fs *, struct btree *,
struct btree_iter *);
int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool);
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);
void bch2_btree_write_error_work(struct work_struct *);
void __bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
void bch2_btree_node_write(struct bch_fs *, struct btree *,
enum six_lock_type);
/*
* btree_node_dirty() can be cleared with only a read lock,
* and for bch2_btree_node_write_cond() we want to set need_write iff it's
* still dirty:
*/
static inline void set_btree_node_need_write_if_dirty(struct btree *b)
{
unsigned long old, new, v = READ_ONCE(b->flags);
do {
old = new = v;
if (!(old & (1 << BTREE_NODE_dirty)))
return;
new |= (1 << BTREE_NODE_need_write);
} while ((v = cmpxchg(&b->flags, old, new)) != old);
}
#define bch2_btree_node_write_cond(_c, _b, cond) \
do { \
while ((_b)->written && btree_node_dirty(_b) && (cond)) { \
if (!btree_node_may_write(_b)) { \
set_btree_node_need_write_if_dirty(_b); \
break; \
} \
\
if (!btree_node_write_in_flight(_b)) { \
bch2_btree_node_write(_c, _b, SIX_LOCK_read); \
break; \
} \
\
six_unlock_read(&(_b)->lock); \
btree_node_wait_on_io(_b); \
btree_node_lock_type((_c), (_b), SIX_LOCK_read); \
} \
} while (0)
void bch2_btree_flush_all_reads(struct bch_fs *);
void bch2_btree_flush_all_writes(struct bch_fs *);
void bch2_btree_verify_flushed(struct bch_fs *);
ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
/* Sorting */
struct btree_node_iter_large {
u8 is_extents;
u16 used;
struct btree_node_iter_set data[MAX_BSETS];
};
static inline void
__bch2_btree_node_iter_large_init(struct btree_node_iter_large *iter,
bool is_extents)
{
iter->used = 0;
iter->is_extents = is_extents;
}
void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *,
struct btree *);
void bch2_btree_node_iter_large_push(struct btree_node_iter_large *,
struct btree *,
const struct bkey_packed *,
const struct bkey_packed *);
static inline bool bch2_btree_node_iter_large_end(struct btree_node_iter_large *iter)
{
return !iter->used;
}
static inline struct bkey_packed *
bch2_btree_node_iter_large_peek_all(struct btree_node_iter_large *iter,
struct btree *b)
{
return bch2_btree_node_iter_large_end(iter)
? NULL
: __btree_node_offset_to_key(b, iter->data->k);
}
static inline struct bkey_packed *
bch2_btree_node_iter_large_next_all(struct btree_node_iter_large *iter,
struct btree *b)
{
struct bkey_packed *ret = bch2_btree_node_iter_large_peek_all(iter, b);
if (ret)
bch2_btree_node_iter_large_advance(iter, b);
return ret;
}
#endif /* _BCACHEFS_BTREE_IO_H */
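set_btree_node_need_write_if_dirty() above relies on a cmpxchg loop so the need_write bit is only ever set while the dirty bit is still observed set. Below is a hedged user-space sketch of the same pattern using C11 atomics; the MODEL_* flag bits are made up for illustration and do not correspond to the real flag layout.

/* User-space sketch of the "set need_write only if still dirty" cmpxchg loop. */
#include <stdatomic.h>
#include <stdio.h>

#define MODEL_DIRTY		(1ul << 0)
#define MODEL_NEED_WRITE	(1ul << 1)

static void set_need_write_if_dirty(_Atomic unsigned long *flags)
{
	unsigned long old = atomic_load(flags), new;

	do {
		if (!(old & MODEL_DIRTY))
			return;		/* lost the race with whoever cleared dirty */
		new = old | MODEL_NEED_WRITE;
		/* on failure, compare_exchange reloads 'old' and we retry: */
	} while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
	_Atomic unsigned long flags = MODEL_DIRTY;

	set_need_write_if_dirty(&flags);
	printf("flags after: %#lx\n", (unsigned long) atomic_load(&flags));
	return 0;
}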

1844
fs/bcachefs/btree_iter.c Normal file

File diff suppressed because it is too large

314
fs/bcachefs/btree_iter.h Normal file

@ -0,0 +1,314 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_ITER_H
#define _BCACHEFS_BTREE_ITER_H
#include "btree_types.h"
static inline void btree_iter_set_dirty(struct btree_iter *iter,
enum btree_iter_uptodate u)
{
iter->uptodate = max_t(unsigned, iter->uptodate, u);
}
static inline struct btree *btree_iter_node(struct btree_iter *iter,
unsigned level)
{
return level < BTREE_MAX_DEPTH ? iter->l[level].b : NULL;
}
static inline struct btree *btree_node_parent(struct btree_iter *iter,
struct btree *b)
{
return btree_iter_node(iter, b->level + 1);
}
static inline bool btree_iter_linked(const struct btree_iter *iter)
{
return iter->next != iter;
}
static inline bool __iter_has_node(const struct btree_iter *iter,
const struct btree *b)
{
/*
* We don't compare the low bits of the lock sequence numbers because
* @iter might have taken a write lock on @b, and we don't want to skip
* the linked iterator if the sequence numbers were equal before taking
* that write lock. The lock sequence number is incremented by taking
* and releasing write locks and is even when unlocked:
*/
return iter->l[b->level].b == b &&
iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
}
static inline struct btree_iter *
__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
{
return linked->next != iter ? linked->next : NULL;
}
static inline struct btree_iter *
__next_iter_with_node(struct btree_iter *iter, struct btree *b,
struct btree_iter *linked)
{
while (linked && !__iter_has_node(linked, b))
linked = __next_linked_iter(iter, linked);
return linked;
}
/**
* for_each_btree_iter - iterate over all iterators linked with @_iter,
* including @_iter
*/
#define for_each_btree_iter(_iter, _linked) \
for ((_linked) = (_iter); (_linked); \
(_linked) = __next_linked_iter(_iter, _linked))
/**
* for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
* that also point to @_b
*
* @_b is assumed to be locked by @_iter
*
* Filters out iterators that don't have a valid btree_node iterator for @_b -
* i.e. iterators for which bch2_btree_node_relock() would not succeed.
*/
#define for_each_btree_iter_with_node(_iter, _b, _linked) \
for ((_linked) = (_iter); \
((_linked) = __next_iter_with_node(_iter, _b, _linked)); \
(_linked) = __next_linked_iter(_iter, _linked))
/**
* for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
* _not_ including @_iter
*/
#define for_each_linked_btree_iter(_iter, _linked) \
for ((_linked) = (_iter)->next; \
(_linked) != (_iter); \
(_linked) = (_linked)->next)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
void bch2_btree_iter_verify_locks(struct btree_iter *);
#else
static inline void bch2_btree_iter_verify(struct btree_iter *iter,
struct btree *b) {}
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif
void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bset_tree *,
struct bkey_packed *, unsigned, unsigned);
int bch2_btree_iter_unlock(struct btree_iter *);
bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned);
static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
unsigned new_locks_want,
bool may_drop_locks)
{
new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
return iter->locks_want < new_locks_want
? (may_drop_locks
? __bch2_btree_iter_upgrade(iter, new_locks_want)
: __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want))
: iter->uptodate <= BTREE_ITER_NEED_PEEK;
}
void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
{
if (iter->locks_want > ((iter->flags & BTREE_ITER_INTENT) ? 1 : 0))
__bch2_btree_iter_downgrade(iter, 0);
}
void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
int __must_check bch2_btree_iter_traverse(struct btree_iter *);
struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
struct btree *bch2_btree_iter_next_node(struct btree_iter *, unsigned);
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos);
void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos);
void __bch2_btree_iter_init(struct btree_iter *, struct bch_fs *,
enum btree_id, struct bpos,
unsigned, unsigned, unsigned);
static inline void bch2_btree_iter_init(struct btree_iter *iter,
struct bch_fs *c, enum btree_id btree_id,
struct bpos pos, unsigned flags)
{
__bch2_btree_iter_init(iter, c, btree_id, pos,
flags & BTREE_ITER_INTENT ? 1 : 0, 0,
(btree_id == BTREE_ID_EXTENTS
? BTREE_ITER_IS_EXTENTS : 0)|flags);
}
void bch2_btree_iter_link(struct btree_iter *, struct btree_iter *);
void bch2_btree_iter_unlink(struct btree_iter *);
void bch2_btree_iter_copy(struct btree_iter *, struct btree_iter *);
static inline struct bpos btree_type_successor(enum btree_id id,
struct bpos pos)
{
if (id == BTREE_ID_INODES) {
pos.inode++;
pos.offset = 0;
} else if (id != BTREE_ID_EXTENTS) {
pos = bkey_successor(pos);
}
return pos;
}
static inline struct bpos btree_type_predecessor(enum btree_id id,
struct bpos pos)
{
if (id == BTREE_ID_INODES) {
--pos.inode;
pos.offset = 0;
} else /* if (id != BTREE_ID_EXTENTS) */ {
pos = bkey_predecessor(pos);
}
return pos;
}
static inline int __btree_iter_cmp(enum btree_id id,
struct bpos pos,
const struct btree_iter *r)
{
if (id != r->btree_id)
return id < r->btree_id ? -1 : 1;
return bkey_cmp(pos, r->pos);
}
static inline int btree_iter_cmp(const struct btree_iter *l,
const struct btree_iter *r)
{
return __btree_iter_cmp(l->btree_id, l->pos, r);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
if (need_resched()) {
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
bch2_btree_iter_unlock(iter);
}
}
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
_locks_want, _depth, _flags, _b) \
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
_locks_want, _depth, \
_flags|BTREE_ITER_NODES), \
_b = bch2_btree_iter_peek_node(_iter); \
(_b); \
(_b) = bch2_btree_iter_next_node(_iter, _depth))
#define for_each_btree_node(_iter, _c, _btree_id, _start, _flags, _b) \
__for_each_btree_node(_iter, _c, _btree_id, _start, 0, 0, _flags, _b)
static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
unsigned flags)
{
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_peek_slot(iter)
: bch2_btree_iter_peek(iter);
}
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
unsigned flags)
{
bch2_btree_iter_cond_resched(iter);
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_next_slot(iter)
: bch2_btree_iter_next(iter);
}
#define for_each_btree_key(_iter, _c, _btree_id, _start, _flags, _k) \
for (bch2_btree_iter_init((_iter), (_c), (_btree_id), \
(_start), (_flags)), \
(_k) = __bch2_btree_iter_peek(_iter, _flags); \
!IS_ERR_OR_NULL((_k).k); \
(_k) = __bch2_btree_iter_next(_iter, _flags))
#define for_each_btree_key_continue(_iter, _flags, _k) \
for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \
!IS_ERR_OR_NULL((_k).k); \
(_k) = __bch2_btree_iter_next(_iter, _flags))
static inline int btree_iter_err(struct bkey_s_c k)
{
return PTR_ERR_OR_ZERO(k.k);
}
/* new multiple iterator interface: */
int bch2_trans_preload_iters(struct btree_trans *);
void bch2_trans_iter_free(struct btree_trans *,
struct btree_iter *);
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
struct bpos, unsigned, u64);
struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
struct btree_iter *, u64);
static __always_inline u64 __btree_iter_id(void)
{
u64 ret = 0;
ret <<= 32;
ret |= _RET_IP_ & U32_MAX;
ret <<= 32;
ret |= _THIS_IP_ & U32_MAX;
return ret;
}
static __always_inline struct btree_iter *
bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
struct bpos pos, unsigned flags)
{
return __bch2_trans_get_iter(trans, btree_id, pos, flags,
__btree_iter_id());
}
static __always_inline struct btree_iter *
bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
{
return __bch2_trans_copy_iter(trans, src, __btree_iter_id());
}
void *bch2_trans_kmalloc(struct btree_trans *, size_t);
int bch2_trans_unlock(struct btree_trans *);
void bch2_trans_begin(struct btree_trans *);
void bch2_trans_init(struct btree_trans *, struct bch_fs *);
int bch2_trans_exit(struct btree_trans *);
#endif /* _BCACHEFS_BTREE_ITER_H */
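As a rough illustration of how the iteration macros above are meant to be used, here is a hedged sketch that walks the extent keys of one inode. It would only compile inside fs/bcachefs/, and the function name, inode argument and per-key work are placeholders rather than code from this commit.

/* Hedged usage sketch: count the extent keys belonging to one inode. */
static int example_count_extents(struct bch_fs *c, u64 inum, u64 *nr)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret;

	*nr = 0;

	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inum, 0),
			   BTREE_ITER_PREFETCH, k) {
		if (k.k->p.inode != inum)
			break;
		(*nr)++;
	}
	ret = btree_iter_err(k);

	/* always drop locks; bch2_btree_iter_unlock() returns any deferred error */
	return bch2_btree_iter_unlock(&iter) ?: ret;
}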

196
fs/bcachefs/btree_locking.h Normal file

@ -0,0 +1,196 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_LOCKING_H
#define _BCACHEFS_BTREE_LOCKING_H
/*
* Only for internal btree use:
*
* The btree iterator tracks what locks it wants to take, and what locks it
* currently has - here we have wrappers for locking/unlocking btree nodes and
* updating the iterator state
*/
#include "btree_iter.h"
#include "btree_io.h"
#include "six.h"
/* matches six lock types */
enum btree_node_locked_type {
BTREE_NODE_UNLOCKED = -1,
BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
};
static inline int btree_node_locked_type(struct btree_iter *iter,
unsigned level)
{
/*
* We're relying on the fact that if nodes_intent_locked is set
* nodes_locked must be set as well, so that we can compute without
* branches:
*/
return BTREE_NODE_UNLOCKED +
((iter->nodes_locked >> level) & 1) +
((iter->nodes_intent_locked >> level) & 1);
}
static inline bool btree_node_intent_locked(struct btree_iter *iter,
unsigned level)
{
return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
}
static inline bool btree_node_read_locked(struct btree_iter *iter,
unsigned level)
{
return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
}
static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
{
return iter->nodes_locked & (1 << level);
}
static inline void mark_btree_node_unlocked(struct btree_iter *iter,
unsigned level)
{
iter->nodes_locked &= ~(1 << level);
iter->nodes_intent_locked &= ~(1 << level);
}
static inline void mark_btree_node_locked(struct btree_iter *iter,
unsigned level,
enum six_lock_type type)
{
/* relying on this to avoid a branch */
BUILD_BUG_ON(SIX_LOCK_read != 0);
BUILD_BUG_ON(SIX_LOCK_intent != 1);
iter->nodes_locked |= 1 << level;
iter->nodes_intent_locked |= type << level;
}
static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
unsigned level)
{
mark_btree_node_locked(iter, level, SIX_LOCK_intent);
}
static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
{
return level < iter->locks_want
? SIX_LOCK_intent
: SIX_LOCK_read;
}
static inline enum btree_node_locked_type
btree_lock_want(struct btree_iter *iter, int level)
{
if (level < iter->level)
return BTREE_NODE_UNLOCKED;
if (level < iter->locks_want)
return BTREE_NODE_INTENT_LOCKED;
if (level == iter->level)
return BTREE_NODE_READ_LOCKED;
return BTREE_NODE_UNLOCKED;
}
static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
{
int lock_type = btree_node_locked_type(iter, level);
EBUG_ON(level >= BTREE_MAX_DEPTH);
if (lock_type != BTREE_NODE_UNLOCKED)
six_unlock_type(&iter->l[level].b->lock, lock_type);
mark_btree_node_unlocked(iter, level);
}
static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
}
static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
{
switch (type) {
case SIX_LOCK_read:
return BCH_TIME_btree_lock_contended_read;
case SIX_LOCK_intent:
return BCH_TIME_btree_lock_contended_intent;
case SIX_LOCK_write:
return BCH_TIME_btree_lock_contended_write;
default:
BUG();
}
}
/*
* wrapper around six locks that just traces lock contended time
*/
static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
u64 start_time = local_clock();
six_lock_type(&b->lock, type, NULL, NULL);
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time);
}
static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
enum six_lock_type type)
{
if (!six_trylock_type(&b->lock, type))
__btree_node_lock_type(c, b, type);
}
bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
struct btree_iter *, enum six_lock_type, bool);
static inline bool btree_node_lock(struct btree *b, struct bpos pos,
unsigned level,
struct btree_iter *iter,
enum six_lock_type type,
bool may_drop_locks)
{
EBUG_ON(level >= BTREE_MAX_DEPTH);
return likely(six_trylock_type(&b->lock, type)) ||
__bch2_btree_node_lock(b, pos, level, iter,
type, may_drop_locks);
}
bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
static inline bool bch2_btree_node_relock(struct btree_iter *iter,
unsigned level)
{
EBUG_ON(btree_node_locked(iter, level) &&
btree_node_locked_type(iter, level) !=
__btree_lock_want(iter, level));
return likely(btree_node_locked(iter, level)) ||
__bch2_btree_node_relock(iter, level);
}
bool bch2_btree_iter_relock(struct btree_iter *);
void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
EBUG_ON(iter->l[b->level].b != b);
EBUG_ON(iter->lock_seq[b->level] != b->lock.state.seq);
if (!six_trylock_write(&b->lock))
__bch2_btree_node_lock_write(b, iter);
}
#endif /* _BCACHEFS_BTREE_LOCKING_H */
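btree_node_locked_type() above computes the lock state without branching by summing the per-level bits of nodes_locked and nodes_intent_locked. A small standalone model of that trick, with simplified names, is sketched below.

/* Standalone model: summing the two per-level bits yields -1/0/1. */
#include <assert.h>
#include <stdio.h>

enum { MODEL_UNLOCKED = -1, MODEL_READ = 0, MODEL_INTENT = 1 };

static int model_locked_type(unsigned nodes_locked,
			     unsigned nodes_intent_locked, unsigned level)
{
	/* intent-locked implies locked, so the sum is -1, 0 or 1: */
	return MODEL_UNLOCKED +
		((nodes_locked >> level) & 1) +
		((nodes_intent_locked >> level) & 1);
}

int main(void)
{
	/* level 0 read locked, level 1 intent locked, level 2 unlocked: */
	unsigned locked = 0x3, intent = 0x2;

	assert(model_locked_type(locked, intent, 0) == MODEL_READ);
	assert(model_locked_type(locked, intent, 1) == MODEL_INTENT);
	assert(model_locked_type(locked, intent, 2) == MODEL_UNLOCKED);
	printf("branchless lock type checks passed\n");
	return 0;
}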

479
fs/bcachefs/btree_types.h Normal file

@ -0,0 +1,479 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_TYPES_H
#define _BCACHEFS_BTREE_TYPES_H
#include <linux/list.h>
#include <linux/rhashtable.h>
#include "bkey_methods.h"
#include "journal_types.h"
#include "six.h"
struct open_bucket;
struct btree_update;
#define MAX_BSETS 3U
struct btree_nr_keys {
/*
* Amount of live metadata (i.e. size of node after a compaction) in
* units of u64s
*/
u16 live_u64s;
u16 bset_u64s[MAX_BSETS];
/* live keys only: */
u16 packed_keys;
u16 unpacked_keys;
};
struct bset_tree {
/*
* We construct a binary tree in an array as if the array
* started at 1, so that things line up on the same cachelines
* better: see comments in bset.c at cacheline_to_bkey() for
* details
*/
/* size of the binary tree and prev array */
u16 size;
/* function of size - precalculated for to_inorder() */
u16 extra;
u16 data_offset;
u16 aux_data_offset;
u16 end_offset;
struct bpos max_key;
};
struct btree_write {
struct journal_entry_pin journal;
struct closure_waitlist wait;
};
struct btree_ob_ref {
u8 nr;
u8 refs[BCH_REPLICAS_MAX];
};
struct btree_alloc {
struct btree_ob_ref ob;
BKEY_PADDED(k);
};
struct btree {
/* Hottest entries first */
struct rhash_head hash;
/* Key/pointer for this btree node */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
struct six_lock lock;
unsigned long flags;
u16 written;
u8 level;
u8 btree_id;
u8 nsets;
u8 nr_key_bits;
struct bkey_format format;
struct btree_node *data;
void *aux_data;
/*
* Sets of sorted keys - the real btree node - plus a binary search tree
*
* set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
* to the memory we have allocated for this btree node. Additionally,
* set[0]->data points to the entire btree node as it exists on disk.
*/
struct bset_tree set[MAX_BSETS];
struct btree_nr_keys nr;
u16 sib_u64s[2];
u16 whiteout_u64s;
u16 uncompacted_whiteout_u64s;
u8 page_order;
u8 unpack_fn_len;
/*
* XXX: add a delete sequence number, so when bch2_btree_node_relock()
* fails because the lock sequence number has changed - i.e. the
* contents were modified - we can still relock the node if it's still
* the one we want, without redoing the traversal
*/
/*
* For asynchronous splits/interior node updates:
* When we do a split, we allocate new child nodes and update the parent
* node to point to them: we update the parent in memory immediately,
* but then we must wait until the children have been written out before
* the update to the parent can be written - this is a list of the
* btree_updates that are blocking this node from being
* written:
*/
struct list_head write_blocked;
/*
* Also for asynchronous splits/interior node updates:
* If a btree node isn't reachable yet, we don't want to kick off
* another write - because that write also won't yet be reachable and
* marking it as completed before it's reachable would be incorrect:
*/
unsigned long will_make_reachable;
struct btree_ob_ref ob;
/* lru list */
struct list_head list;
struct btree_write writes[2];
#ifdef CONFIG_BCACHEFS_DEBUG
bool *expensive_debug_checks;
#endif
};
struct btree_cache {
struct rhashtable table;
bool table_init_done;
/*
* We never free a struct btree, except on shutdown - we just put it on
* the btree_cache_freed list and reuse it later. This simplifies the
* code, and it doesn't cost us much memory as the memory usage is
* dominated by buffers that hold the actual btree node data and those
* can be freed - and the number of struct btrees allocated is
* effectively bounded.
*
* btree_cache_freeable effectively is a small cache - we use it because
* high order page allocations can be rather expensive, and it's quite
* common to delete and allocate btree nodes in quick succession. It
* should never grow past ~2-3 nodes in practice.
*/
struct mutex lock;
struct list_head live;
struct list_head freeable;
struct list_head freed;
/* Number of elements in live + freeable lists */
unsigned used;
unsigned reserve;
struct shrinker shrink;
/*
* If we need to allocate memory for a new btree node and that
* allocation fails, we can cannibalize another node in the btree cache
* to satisfy the allocation - lock to guarantee only one thread does
* this at a time:
*/
struct task_struct *alloc_lock;
struct closure_waitlist alloc_wait;
};
struct btree_node_iter {
u8 is_extents;
struct btree_node_iter_set {
u16 k, end;
} data[MAX_BSETS];
};
enum btree_iter_type {
BTREE_ITER_KEYS,
BTREE_ITER_SLOTS,
BTREE_ITER_NODES,
};
#define BTREE_ITER_TYPE ((1 << 2) - 1)
#define BTREE_ITER_INTENT (1 << 2)
#define BTREE_ITER_PREFETCH (1 << 3)
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
#define BTREE_ITER_IS_EXTENTS (1 << 4)
/*
* indicates we need to call bch2_btree_iter_traverse() to revalidate iterator:
*/
#define BTREE_ITER_AT_END_OF_LEAF (1 << 5)
#define BTREE_ITER_ERROR (1 << 6)
enum btree_iter_uptodate {
BTREE_ITER_UPTODATE = 0,
BTREE_ITER_NEED_PEEK = 1,
BTREE_ITER_NEED_RELOCK = 2,
BTREE_ITER_NEED_TRAVERSE = 3,
};
/*
* @pos - iterator's current position
* @level - current btree depth
* @locks_want - btree level below which we start taking intent locks
* @nodes_locked - bitmask indicating which nodes in @nodes are locked
* @nodes_intent_locked - bitmask indicating which locks are intent locks
*/
struct btree_iter {
struct bch_fs *c;
struct bpos pos;
u8 flags;
enum btree_iter_uptodate uptodate:4;
enum btree_id btree_id:4;
unsigned level:4,
locks_want:4,
nodes_locked:4,
nodes_intent_locked:4;
struct btree_iter_level {
struct btree *b;
struct btree_node_iter iter;
} l[BTREE_MAX_DEPTH];
u32 lock_seq[BTREE_MAX_DEPTH];
/*
* Current unpacked key - so that bch2_btree_iter_next()/
* bch2_btree_iter_next_slot() can correctly advance pos.
*/
struct bkey k;
/*
* Circular linked list of linked iterators: linked iterators share
* locks (e.g. two linked iterators may have the same node intent
* locked, or read and write locked, at the same time), and insertions
* through one iterator won't invalidate the other linked iterators.
*/
/* Must come last: */
struct btree_iter *next;
};
#define BTREE_ITER_MAX 8
struct btree_insert_entry {
struct btree_iter *iter;
struct bkey_i *k;
unsigned extra_res;
/*
* true if entire key was inserted - can only be false for
* extents
*/
bool done;
};
struct btree_trans {
struct bch_fs *c;
u8 nr_iters;
u8 iters_live;
u8 iters_linked;
u8 nr_updates;
unsigned mem_top;
unsigned mem_bytes;
void *mem;
struct btree_iter *iters;
u64 iter_ids[BTREE_ITER_MAX];
struct btree_insert_entry updates[BTREE_ITER_MAX];
struct btree_iter iters_onstack[2];
};
#define BTREE_FLAG(flag) \
static inline bool btree_node_ ## flag(struct btree *b) \
{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
\
static inline void clear_btree_node_ ## flag(struct btree *b) \
{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
enum btree_flags {
BTREE_NODE_read_in_flight,
BTREE_NODE_read_error,
BTREE_NODE_dirty,
BTREE_NODE_need_write,
BTREE_NODE_noevict,
BTREE_NODE_write_idx,
BTREE_NODE_accessed,
BTREE_NODE_write_in_flight,
BTREE_NODE_just_written,
BTREE_NODE_dying,
BTREE_NODE_fake,
};
BTREE_FLAG(read_in_flight);
BTREE_FLAG(read_error);
BTREE_FLAG(dirty);
BTREE_FLAG(need_write);
BTREE_FLAG(noevict);
BTREE_FLAG(write_idx);
BTREE_FLAG(accessed);
BTREE_FLAG(write_in_flight);
BTREE_FLAG(just_written);
BTREE_FLAG(dying);
BTREE_FLAG(fake);
static inline struct btree_write *btree_current_write(struct btree *b)
{
return b->writes + btree_node_write_idx(b);
}
static inline struct btree_write *btree_prev_write(struct btree *b)
{
return b->writes + (btree_node_write_idx(b) ^ 1);
}
static inline struct bset_tree *bset_tree_last(struct btree *b)
{
EBUG_ON(!b->nsets);
return b->set + b->nsets - 1;
}
static inline struct bset *bset(const struct btree *b,
const struct bset_tree *t)
{
return (void *) b->data + t->data_offset * sizeof(u64);
}
static inline struct bset *btree_bset_first(struct btree *b)
{
return bset(b, b->set);
}
static inline struct bset *btree_bset_last(struct btree *b)
{
return bset(b, bset_tree_last(b));
}
static inline u16
__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
{
size_t ret = (u64 *) k - (u64 *) b->data - 1;
EBUG_ON(ret > U16_MAX);
return ret;
}
static inline struct bkey_packed *
__btree_node_offset_to_key(const struct btree *b, u16 k)
{
return (void *) ((u64 *) b->data + k + 1);
}
#define btree_bkey_first(_b, _t) (bset(_b, _t)->start)
#define btree_bkey_last(_b, _t) \
({ \
EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
vstruct_last(bset(_b, _t))); \
\
__btree_node_offset_to_key(_b, (_t)->end_offset); \
})
static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
{
t->end_offset =
__btree_node_key_to_offset(b, vstruct_last(bset(b, t)));
btree_bkey_last(b, t);
}
static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
const struct bset *i)
{
t->data_offset = (u64 *) i - (u64 *) b->data;
EBUG_ON(bset(b, t) != i);
set_btree_bset_end(b, t);
}
static inline unsigned bset_byte_offset(struct btree *b, void *i)
{
return i - (void *) b->data;
}
/* Type of keys @b contains: */
static inline enum bkey_type btree_node_type(struct btree *b)
{
return b->level ? BKEY_TYPE_BTREE : b->btree_id;
}
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
return &bch2_bkey_ops[btree_node_type(b)];
}
static inline bool btree_node_has_ptrs(struct btree *b)
{
return btree_type_has_ptrs(btree_node_type(b));
}
static inline bool btree_node_is_extents(struct btree *b)
{
return btree_node_type(b) == BKEY_TYPE_EXTENTS;
}
struct btree_root {
struct btree *b;
struct btree_update *as;
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
u8 level;
u8 alive;
};
/*
* Optional hook that will be called just prior to a btree node update, when
* we're holding the write lock and we know what key is about to be overwritten:
*/
struct btree_iter;
struct btree_node_iter;
enum btree_insert_ret {
BTREE_INSERT_OK,
/* extent spanned multiple leaf nodes: have to traverse to next node: */
BTREE_INSERT_NEED_TRAVERSE,
/* write lock held for too long */
BTREE_INSERT_NEED_RESCHED,
/* leaf node needs to be split */
BTREE_INSERT_BTREE_NODE_FULL,
BTREE_INSERT_JOURNAL_RES_FULL,
BTREE_INSERT_ENOSPC,
BTREE_INSERT_NEED_GC_LOCK,
};
struct extent_insert_hook {
enum btree_insert_ret
(*fn)(struct extent_insert_hook *, struct bpos, struct bpos,
struct bkey_s_c, const struct bkey_i *);
};
enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_RESERVE_GET,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};
enum btree_node_sibling {
btree_prev_sib,
btree_next_sib,
};
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
struct btree *,
struct btree_node_iter *);
#endif /* _BCACHEFS_BTREE_TYPES_H */
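The BTREE_FLAG() macro above stamps out test/set/clear helpers for each bit in struct btree's flags word. The standalone sketch below shows the shape of what it generates; plain bit operations stand in for the kernel's atomic test_bit()/set_bit()/clear_bit(), and the bit number is arbitrary.

/* Standalone sketch of the accessors BTREE_FLAG() generates. */
#include <stdbool.h>
#include <stdio.h>

struct model_btree { unsigned long flags; };

#define MODEL_BTREE_FLAG(nr, name)					\
static bool model_btree_node_##name(struct model_btree *b)		\
{ return b->flags & (1ul << (nr)); }					\
static void model_set_btree_node_##name(struct model_btree *b)		\
{ b->flags |= 1ul << (nr); }						\
static void model_clear_btree_node_##name(struct model_btree *b)	\
{ b->flags &= ~(1ul << (nr)); }

MODEL_BTREE_FLAG(2, dirty)	/* bit number chosen arbitrarily for the model */

int main(void)
{
	struct model_btree b = { 0 };

	model_set_btree_node_dirty(&b);
	printf("dirty: %d\n", model_btree_node_dirty(&b));
	model_clear_btree_node_dirty(&b);
	printf("dirty: %d\n", model_btree_node_dirty(&b));
	return 0;
}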

168
fs/bcachefs/btree_update.h Normal file

@ -0,0 +1,168 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_H
#define _BCACHEFS_BTREE_UPDATE_H
#include "btree_iter.h"
#include "journal.h"
struct bch_fs;
struct btree;
struct btree_insert;
void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
struct bkey_i *);
/* Normal update interface: */
struct btree_insert {
struct bch_fs *c;
struct disk_reservation *disk_res;
struct journal_res journal_res;
u64 *journal_seq;
struct extent_insert_hook *hook;
unsigned flags;
bool did_work;
unsigned short nr;
struct btree_insert_entry *entries;
};
int __bch2_btree_insert_at(struct btree_insert *);
#define BTREE_INSERT_ENTRY(_iter, _k) \
((struct btree_insert_entry) { \
.iter = (_iter), \
.k = (_k), \
.done = false, \
})
#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra) \
((struct btree_insert_entry) { \
.iter = (_iter), \
.k = (_k), \
.extra_res = (_extra), \
.done = false, \
})
/**
* bch2_btree_insert_at - insert one or more keys at iterator positions
* @iter: btree iterator
* @insert_key: key to insert
* @disk_res: disk reservation
* @hook: extent insert callback
*
* Return values:
* -EINTR: locking changed, this function should be called again. Only returned
* if passed BTREE_INSERT_ATOMIC.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
#define bch2_btree_insert_at(_c, _disk_res, _hook, \
_journal_seq, _flags, ...) \
__bch2_btree_insert_at(&(struct btree_insert) { \
.c = (_c), \
.disk_res = (_disk_res), \
.journal_seq = (_journal_seq), \
.hook = (_hook), \
.flags = (_flags), \
.nr = COUNT_ARGS(__VA_ARGS__), \
.entries = (struct btree_insert_entry[]) { \
__VA_ARGS__ \
}})
enum {
__BTREE_INSERT_ATOMIC,
__BTREE_INSERT_NOUNLOCK,
__BTREE_INSERT_NOFAIL,
__BTREE_INSERT_USE_RESERVE,
__BTREE_INSERT_USE_ALLOC_RESERVE,
__BTREE_INSERT_JOURNAL_REPLAY,
__BTREE_INSERT_NOWAIT,
__BTREE_INSERT_GC_LOCK_HELD,
__BCH_HASH_SET_MUST_CREATE,
__BCH_HASH_SET_MUST_REPLACE,
};
/*
* Don't drop/retake locks before doing btree update, instead return -EINTR if
* we had to drop locks for any reason
*/
#define BTREE_INSERT_ATOMIC (1 << __BTREE_INSERT_ATOMIC)
/*
* Don't drop locks _after_ successfully updating btree:
*/
#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
/* Don't check for -ENOSPC: */
#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
/* for copygc, or when merging btree nodes */
#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
/*
* Insert is for journal replay: don't get journal reservations, or mark extents
* (bch_mark_key)
*/
#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
/* Don't block on allocation failure (for new btree nodes): */
#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
int bch2_btree_delete_at(struct btree_iter *, unsigned);
int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, unsigned);
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, int flags);
int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
struct bpos, struct bpos, struct bversion,
struct disk_reservation *,
struct extent_insert_hook *, u64 *);
int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
__le64, unsigned);
int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
struct btree *, struct bkey_i_extent *);
/* new transactional interface: */
void bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, unsigned);
int bch2_trans_commit(struct btree_trans *,
struct disk_reservation *,
struct extent_insert_hook *,
u64 *, unsigned);
#define bch2_trans_do(_c, _journal_seq, _flags, _do) \
({ \
struct btree_trans trans; \
int _ret; \
\
bch2_trans_init(&trans, (_c)); \
\
do { \
bch2_trans_begin(&trans); \
\
_ret = (_do) ?: bch2_trans_commit(&trans, NULL, NULL, \
(_journal_seq), (_flags)); \
} while (_ret == -EINTR); \
\
bch2_trans_exit(&trans); \
_ret; \
})
#endif /* _BCACHEFS_BTREE_UPDATE_H */
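The bch2_trans_do() macro above captures the usual calling convention for the transactional interface: begin, build up updates, commit, and retry the whole sequence when the commit returns -EINTR. The sketch below spells that loop out by hand; it would only compile in-tree, and the choice of btree, flags and error handling is illustrative rather than taken from this commit.

/* Hedged sketch of the retry loop that bch2_trans_do() wraps. */
static int example_insert_key(struct bch_fs *c, struct bkey_i *k, u64 *journal_seq)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	int ret;

	bch2_trans_init(&trans, c);

	do {
		bch2_trans_begin(&trans);

		iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
					   bkey_start_pos(&k->k),
					   BTREE_ITER_INTENT);
		if (IS_ERR(iter)) {
			ret = PTR_ERR(iter);
			break;
		}

		bch2_trans_update(&trans, iter, k, 0);

		ret = bch2_trans_commit(&trans, NULL, NULL, journal_seq,
					BTREE_INSERT_ATOMIC);
	} while (ret == -EINTR);

	bch2_trans_exit(&trans);
	return ret;
}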

fs/bcachefs/btree_update_interior.c Normal file

File diff suppressed because it is too large

374
fs/bcachefs/btree_update_interior.h Normal file

@ -0,0 +1,374 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
#include "btree_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
*
* When we split/rewrite a node, we do all the updates in memory without
* waiting for any writes to complete - we allocate the new node(s) and update
* the parent node, possibly recursively up to the root.
*
* The end result is that we have one or more new nodes being written -
* possibly several, if there were multiple splits - and then a write (updating
* an interior node) which will make all these new nodes visible.
*
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
* nodes can't be freed (their space on disk can't be reclaimed) until the
* update to the interior node that makes the new node visible completes -
* until then, the old nodes are still reachable on disk.
*
*/
struct btree_update {
struct closure cl;
struct bch_fs *c;
struct list_head list;
/* What kind of update are we doing? */
enum {
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
unsigned must_rewrite:1;
unsigned nodes_written:1;
enum btree_id btree_id;
struct btree_reserve *reserve;
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
* from writing by putting this btree_interior update on the
* @b->write_blocked list with @write_blocked_list:
*/
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
* we're now blocking another btree_update
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_update operation, and release it when the new node(s)
* are all persistent and reachable:
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
* Enough room for btree_split's keys without realloc - btree node
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *,
unsigned);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
unsigned, unsigned, enum btree_node_sibling);
static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
struct btree_iter *iter,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
struct btree *b;
/*
* iterators are inconsistent when they hit end of leaf, until
* traversed again
*
* XXX inconsistent how?
*/
if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
return;
if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
return;
if (!bch2_btree_node_relock(iter, level))
return;
b = iter->l[level].b;
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
return;
__bch2_foreground_maybe_merge(c, iter, level, flags, sib);
}
static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
unsigned level,
unsigned flags)
{
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
btree_prev_sib);
bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
btree_next_sib);
}
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
unsigned depth = btree_node_root(c, b)->level + 1;
/*
* Number of nodes we might have to allocate in a worst case btree
* split operation - we split all the way up to the root, then allocate
* a new root, unless we're already at max depth:
*/
if (depth < BTREE_MAX_DEPTH)
return (depth - b->level) * 2 + 1;
else
return (depth - b->level) * 2 - 1;
}
static inline void btree_node_reset_sib_u64s(struct btree *b)
{
b->sib_u64s[0] = b->nr.live_u64s;
b->sib_u64s[1] = b->nr.live_u64s;
}
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
return (void *) b->data + btree_bytes(c);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
{
return btree_data_end(c, b);
}
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
}
static inline bool bset_written(struct btree *b, struct bset *i)
{
return (void *) i < write_block(b);
}
static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
return (void *) i > write_block(b);
}
static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
struct btree *b,
void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
ssize_t total = c->opts.btree_node_size << 6;
return total - used;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
ssize_t remaining = __bch_btree_u64s_remaining(c, b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
if (bset_written(b, btree_bset_last(b)))
return 0;
return remaining;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
return 4 << 10;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
__bch_btree_u64s_remaining(c, b, &bne->keys.start[0]);
if (unlikely(bset_written(b, i))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
return bne;
} else {
if (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
return bne;
}
return NULL;
}
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
EBUG_ON(b->uncompacted_whiteout_u64s <
bkeyp_key_u64s(&b->format, k));
b->uncompacted_whiteout_u64s -=
bkeyp_key_u64s(&b->format, k);
}
}
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
BUG_ON(!k->needs_whiteout);
b->uncompacted_whiteout_u64s +=
bkeyp_key_u64s(&b->format, k);
}
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_fake(b)))
return false;
if (btree_node_is_extents(b)) {
/* The insert key might split an existing key
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case):
*/
u64s += BKEY_EXTENT_U64s_MAX;
}
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
static inline bool journal_res_insert_fits(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
unsigned u64s = 0;
struct btree_insert_entry *i;
/*
* If we didn't get a journal reservation, we're in journal replay and
* we're not journalling updates:
*/
if (!trans->journal_res.ref)
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
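btree_update_reserve_required() above bounds the number of nodes a worst-case split can allocate: two per level from the split point up to the current root, plus one for a new root unless the tree is already at maximum depth. A standalone model of that arithmetic follows; the maximum depth of 4 is an assumption used only for the printout.

/* Standalone model of the worst-case reserve calculation above. */
#include <stdio.h>

#define MODEL_MAX_DEPTH 4	/* assumed, for illustration */

static unsigned model_reserve_required(unsigned root_level, unsigned level)
{
	unsigned depth = root_level + 1;

	return depth < MODEL_MAX_DEPTH
		? (depth - level) * 2 + 1	/* may also grow a new root */
		: (depth - level) * 2 - 1;	/* tree can't grow any further */
}

int main(void)
{
	printf("leaf split, root at level 1: %u nodes\n",
	       model_reserve_required(1, 0));
	printf("leaf split, root at level 3: %u nodes\n",
	       model_reserve_required(3, 0));
	return 0;
}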

737
fs/bcachefs/btree_update_leaf.c Normal file

@ -0,0 +1,737 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "trace.h"
#include <linux/sort.h>
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
{
const struct bkey_format *f = &b->format;
struct bkey_packed *k;
struct bset_tree *t;
unsigned clobber_u64s;
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && !bkey_cmp_packed(b, k, &insert->k)) {
BUG_ON(bkey_whiteout(k));
t = bch2_bkey_to_bset(b, k);
if (bset_unwritten(b, bset(b, t)) &&
bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k) &&
!bkey_whiteout(&insert->k)) {
k->type = insert->k.type;
memcpy_u64s(bkeyp_val(f, k), &insert->v,
bkey_val_u64s(&insert->k));
return true;
}
insert->k.needs_whiteout = k->needs_whiteout;
btree_keys_account_key_drop(&b->nr, t - b->set, k);
if (t == bset_tree_last(b)) {
clobber_u64s = k->u64s;
/*
* If we're deleting, and the key we're deleting doesn't
* need a whiteout (it wasn't overwriting a key that had
* been written to disk) - just delete it:
*/
if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
bch2_bset_delete(b, k, clobber_u64s);
bch2_btree_node_iter_fix(iter, b, node_iter, t,
k, clobber_u64s, 0);
return true;
}
goto overwrite;
}
k->type = KEY_TYPE_DELETED;
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
k->u64s, k->u64s);
if (bkey_whiteout(&insert->k)) {
reserve_whiteout(b, t, k);
return true;
} else {
k->needs_whiteout = false;
}
} else {
/*
* Deleting, but the key to delete wasn't found - nothing to do:
*/
if (bkey_whiteout(&insert->k))
return false;
insert->k.needs_whiteout = false;
}
t = bset_tree_last(b);
k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
clobber_u64s, k->u64s);
return true;
}
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
btree_node_lock_type(c, b, SIX_LOCK_read);
bch2_btree_node_write_cond(c, b,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));
six_unlock_read(&b->lock);
}
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
void bch2_btree_journal_key(struct btree_insert *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter->l[0].b;
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
iter->btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = seq;
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
if (unlikely(!journal_pin_active(&w->journal))) {
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);
}
static enum btree_insert_ret
bch2_insert_fixup_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct btree_iter *iter = insert->iter;
struct btree_iter_level *l = &iter->l[0];
EBUG_ON(iter->level);
EBUG_ON(insert->k->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, l->b));
if (bch2_btree_bset_insert_key(iter, l->b, &l->iter,
insert->k))
bch2_btree_journal_key(trans, iter, insert->k);
trans->did_work = true;
return BTREE_INSERT_OK;
}
/**
* btree_insert_key_leaf - insert one key into a leaf node
*/
static enum btree_insert_ret
btree_insert_key_leaf(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->l[0].b;
enum btree_insert_ret ret;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
ret = !btree_node_is_extents(b)
? bch2_insert_fixup_key(trans, insert)
: bch2_insert_fixup_extent(trans, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_btree_iter_reinit_node(iter, b);
trace_btree_insert_key(c, b, insert->k);
return ret;
}
#define trans_for_each_entry(trans, i) \
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
/*
* We sort transaction entries so that if multiple iterators point to the same
* leaf node they'll be adjacent:
*/
static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
return i != trans->entries &&
i[0].iter->l[0].b == i[-1].iter->l[0].b;
}
static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
struct btree_insert_entry *i)
{
struct btree *b = i->iter->l[0].b;
do {
i++;
} while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
return i;
}
#define trans_for_each_leaf(trans, i) \
for ((i) = (trans)->entries; \
(i) < (trans)->entries + (trans)->nr; \
(i) = trans_next_leaf(trans, i))
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
bch2_btree_node_lock_write(b, iter);
if (btree_node_just_written(b) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
if (want_new_bset(c, b))
bch2_btree_init_next(c, b, iter);
}
static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_leaf(trans, i)
bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_leaf(trans, i)
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
static inline int btree_trans_cmp(struct btree_insert_entry l,
struct btree_insert_entry r)
{
return btree_iter_cmp(l.iter, r.iter);
}
/* Normal update interface: */
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
static inline int do_btree_insert_at(struct btree_insert *trans,
struct btree_iter **split,
bool *cycle_gc_lock)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
unsigned u64s;
int ret;
trans_for_each_entry(trans, i) {
BUG_ON(i->done);
BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
}
u64s = 0;
trans_for_each_entry(trans, i)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
return ret;
multi_lock_write(c, trans);
if (race_fault()) {
ret = -EINTR;
goto out;
}
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
/*
* bch2_btree_node_insert_fits() must be called under write lock:
* with only an intent lock, another thread can still call
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
i->iter->l[0].b, u64s)) {
ret = -EINTR;
*split = i->iter;
goto out;
}
}
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
if (journal_seq_verify(c))
trans_for_each_entry(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
else if (inject_invalid_keys(c))
trans_for_each_entry(trans, i)
i->k->k.version = MAX_VERSION;
}
trans_for_each_entry(trans, i) {
switch (btree_insert_key_leaf(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
case BTREE_INSERT_NEED_RESCHED:
ret = -EINTR;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
ret = -EINTR;
*split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
ret = -EINTR;
*cycle_gc_lock = true;
break;
default:
BUG();
}
/*
* If we did some work (i.e. inserted part of an extent),
* we have to do all the other updates as well:
*/
if (!trans->did_work && (ret || *split))
break;
}
out:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
static inline void btree_insert_entry_checks(struct bch_fs *c,
struct btree_insert_entry *i)
{
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
BUG_ON(debug_check_bkeys(c) &&
!bkey_deleted(&i->k->k) &&
bch2_bkey_invalid(c, (enum bkey_type) i->iter->btree_id,
bkey_i_to_s_c(i->k)));
}
/**
 * __bch2_btree_insert_at - insert keys at given iterator positions
 *
 * This is the main entry point for btree updates.
*
* Return values:
* -EINTR: locking changed, this function should be called again. Only returned
* if passed BTREE_INSERT_ATOMIC.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
int __bch2_btree_insert_at(struct btree_insert *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *linked, *split = NULL;
bool cycle_gc_lock = false;
unsigned flags;
int ret;
BUG_ON(!trans->nr);
for_each_btree_iter(trans->entries[0].iter, linked)
bch2_btree_iter_verify_locks(linked);
/* for the sake of sanity: */
BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
trans_for_each_entry(trans, i)
btree_insert_entry_checks(c, i);
bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry:
split = NULL;
cycle_gc_lock = false;
trans_for_each_entry(trans, i) {
if (!bch2_btree_iter_upgrade(i->iter, 1, true)) {
ret = -EINTR;
goto err;
}
if (i->iter->flags & BTREE_ITER_ERROR) {
ret = -EIO;
goto err;
}
}
ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
if (unlikely(ret))
goto err;
trans_for_each_leaf(trans, i)
bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
trans_for_each_entry(trans, i)
bch2_btree_iter_downgrade(i->iter);
out:
percpu_ref_put(&c->writes);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
/* make sure we didn't drop or screw up locks: */
for_each_btree_iter(trans->entries[0].iter, linked) {
bch2_btree_iter_verify_locks(linked);
BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
trans->did_work &&
linked->uptodate >= BTREE_ITER_NEED_RELOCK);
}
/* make sure we didn't lose an error: */
if (!ret)
trans_for_each_entry(trans, i)
BUG_ON(!i->done);
}
BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
return ret;
err:
flags = trans->flags;
/*
* BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
* update; if we haven't done anything yet it doesn't apply
*/
if (!trans->did_work)
flags &= ~BTREE_INSERT_NOUNLOCK;
if (split) {
ret = bch2_btree_split_leaf(c, split, flags);
/*
* if the split succeeded without dropping locks the insert will
* still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
* caller peeked() and is overwriting won't have changed)
*/
#if 0
/*
* XXX:
* split -> btree node merging (of parent node) might still drop
* locks when we're not passing it BTREE_INSERT_NOUNLOCK
*/
if (!ret && !trans->did_work)
goto retry;
#endif
/*
* don't care if we got ENOSPC because we told split it
* couldn't block:
*/
if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
ret = -EINTR;
}
if (cycle_gc_lock) {
if (!down_read_trylock(&c->gc_lock)) {
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
bch2_btree_iter_unlock(trans->entries[0].iter);
down_read(&c->gc_lock);
}
up_read(&c->gc_lock);
}
if (ret == -EINTR) {
if (flags & BTREE_INSERT_NOUNLOCK)
goto out;
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
goto out;
}
BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
}
/*
 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
* dropped locks:
*/
if (!(flags & BTREE_INSERT_ATOMIC))
goto retry;
}
goto out;
}
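/*
 * Editorial sketch, not part of this commit: the retry pattern implied by the
 * -EINTR semantics documented above __bch2_btree_insert_at(). With
 * BTREE_INSERT_ATOMIC, -EINTR means locks were dropped, so the caller must
 * re-peek (and rebuild whatever it derived from the old key) before retrying.
 * The helper name is hypothetical; for brevity it simply deletes what it peeked.
 */
static int example_atomic_update(struct bch_fs *c, struct btree_iter *iter)
{
	int ret = 0;

	do {
		struct bkey_s_c old = bch2_btree_iter_peek(iter);
		struct bkey_i delete;

		if (!old.k || (ret = btree_iter_err(old)))
			break;

		/* rebuild the update from what we just peeked: */
		bkey_init(&delete.k);
		delete.k.p = iter->pos;

		ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
					   BTREE_INSERT_ATOMIC,
					   BTREE_INSERT_ENTRY(iter, &delete));
	} while (ret == -EINTR);

	bch2_btree_iter_unlock(iter);
	return ret;
}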
void bch2_trans_update(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *k,
unsigned extra_journal_res)
{
struct btree_insert_entry *i;
BUG_ON(trans->nr_updates >= ARRAY_SIZE(trans->updates));
i = &trans->updates[trans->nr_updates++];
*i = (struct btree_insert_entry) {
.iter = iter,
.k = k,
.extra_res = extra_journal_res,
};
btree_insert_entry_checks(trans->c, i);
}
int bch2_trans_commit(struct btree_trans *trans,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq,
unsigned flags)
{
struct btree_insert insert = {
.c = trans->c,
.disk_res = disk_res,
.journal_seq = journal_seq,
.flags = flags,
.nr = trans->nr_updates,
.entries = trans->updates,
};
if (!trans->nr_updates)
return 0;
trans->nr_updates = 0;
return __bch2_btree_insert_at(&insert);
}
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
{
struct bkey_i k;
bkey_init(&k.k);
k.k.p = iter->pos;
return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|flags,
BTREE_INSERT_ENTRY(iter, &k));
}
int bch2_btree_insert_list_at(struct btree_iter *iter,
struct keylist *keys,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, unsigned flags)
{
BUG_ON(flags & BTREE_INSERT_ATOMIC);
BUG_ON(bch2_keylist_empty(keys));
bch2_verify_keylist_sorted(keys);
while (!bch2_keylist_empty(keys)) {
int ret = bch2_btree_insert_at(iter->c, disk_res, hook,
journal_seq, flags,
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
if (ret)
return ret;
bch2_keylist_pop_front(keys);
}
return 0;
}
/**
 * bch2_btree_insert - insert a single key into a btree
 * @c:			pointer to struct bch_fs
 * @id:			btree to insert into
 * @k:			key to insert
 * @hook:		insert callback
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
struct bkey_i *k,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, int flags)
{
struct btree_iter iter;
int ret;
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
return ret;
}
/*
 * bch2_btree_delete_range - delete everything within a given range
*
* Range is a half open interval - [start, end)
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start,
struct bpos end,
struct bversion version,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_btree_iter_init(&iter, c, id, start,
BTREE_ITER_INTENT);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
/* really shouldn't be using a bare, unpadded bkey_i */
struct bkey_i delete;
if (bkey_cmp(iter.pos, end) >= 0)
break;
bkey_init(&delete.k);
/*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
* same). It's important that we delete starting from iter.pos
* because the range we want to delete could start in the middle
* of k.
*
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
* bkey_start_pos(k.k)).
*/
delete.k.p = iter.pos;
delete.k.version = version;
if (iter.flags & BTREE_ITER_IS_EXTENTS) {
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
}
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete));
if (ret)
break;
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
return ret;
}

975
fs/bcachefs/buckets.c Normal file
View File

@ -0,0 +1,975 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Code for manipulating bucket marks for garbage collection.
*
* Copyright 2014 Datera, Inc.
*
* Bucket states:
* - free bucket: mark == 0
* The bucket contains no data and will not be read
*
* - allocator bucket: owned_by_allocator == 1
* The bucket is on a free list, or it is an open bucket
*
* - cached bucket: owned_by_allocator == 0 &&
* dirty_sectors == 0 &&
* cached_sectors > 0
* The bucket contains data but may be safely discarded as there are
* enough replicas of the data on other cache devices, or it has been
* written back to the backing device
*
* - dirty bucket: owned_by_allocator == 0 &&
* dirty_sectors > 0
 * The bucket contains data that we must not discard (either the only copy,
* or one of the 'main copies' for data requiring multiple replicas)
*
* - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
* This is a btree node, journal or gen/prio bucket
*
* Lifecycle:
*
* bucket invalidated => bucket on freelist => open bucket =>
* [dirty bucket =>] cached bucket => bucket invalidated => ...
*
* Note that cache promotion can skip the dirty bucket step, as data
* is copied from a deeper tier to a shallower tier, onto a cached
* bucket.
* Note also that a cached bucket can spontaneously become dirty --
* see below.
*
* Only a traversal of the key space can determine whether a bucket is
* truly dirty or cached.
*
* Transitions:
*
* - free => allocator: bucket was invalidated
* - cached => allocator: bucket was invalidated
*
* - allocator => dirty: open bucket was filled up
* - allocator => cached: open bucket was filled up
* - allocator => metadata: metadata was allocated
*
* - dirty => cached: dirty sectors were copied to a deeper tier
* - dirty => free: dirty sectors were overwritten or moved (copy gc)
* - cached => free: cached sectors were overwritten
*
* - metadata => free: metadata was freed
*
* Oddities:
* - cached => dirty: a device was removed so formerly replicated data
* is no longer sufficiently replicated
* - free => cached: cannot happen
* - free => dirty: cannot happen
* - free => metadata: cannot happen
*/
#include "bcachefs.h"
#include "alloc.h"
#include "btree_gc.h"
#include "buckets.h"
#include "error.h"
#include "movinggc.h"
#include "trace.h"
#include <linux/preempt.h>
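/*
 * Editorial sketch, not part of this commit: the bucket states described in
 * the header comment above, expressed as a classifier over struct bucket_mark
 * (defined in buckets_types.h). The enum and helper are hypothetical;
 * "metadata" is approximated by data_type, since there is no literal
 * is_metadata field in the mark.
 */
enum example_bucket_state {
	EXAMPLE_BUCKET_FREE,
	EXAMPLE_BUCKET_ALLOCATOR,
	EXAMPLE_BUCKET_CACHED,
	EXAMPLE_BUCKET_DIRTY,
	EXAMPLE_BUCKET_METADATA,
};

static inline enum example_bucket_state
example_classify_bucket(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return EXAMPLE_BUCKET_ALLOCATOR;
	if (m.data_type == BCH_DATA_BTREE || m.data_type == BCH_DATA_JOURNAL)
		return EXAMPLE_BUCKET_METADATA;
	if (m.dirty_sectors)
		return EXAMPLE_BUCKET_DIRTY;
	if (m.cached_sectors)
		return EXAMPLE_BUCKET_CACHED;
	return EXAMPLE_BUCKET_FREE;
}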
#ifdef DEBUG_BUCKETS
#define lg_local_lock lg_global_lock
#define lg_local_unlock lg_global_unlock
static void bch2_fs_stats_verify(struct bch_fs *c)
{
struct bch_fs_usage stats =
__bch2_fs_usage_read(c);
unsigned i;
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
if ((s64) stats.s[i].data[S_META] < 0)
panic("replicas %u meta underflow: %lli\n",
i + 1, stats.s[i].data[S_META]);
if ((s64) stats.s[i].data[S_DIRTY] < 0)
panic("replicas %u dirty underflow: %lli\n",
i + 1, stats.s[i].data[S_DIRTY]);
if ((s64) stats.s[i].persistent_reserved < 0)
panic("replicas %u reserved underflow: %lli\n",
i + 1, stats.s[i].persistent_reserved);
}
if ((s64) stats.online_reserved < 0)
panic("sectors_online_reserved underflow: %lli\n",
stats.online_reserved);
}
static void bch2_dev_stats_verify(struct bch_dev *ca)
{
struct bch_dev_usage stats =
__bch2_dev_usage_read(ca);
u64 n = ca->mi.nbuckets - ca->mi.first_bucket;
unsigned i;
for (i = 0; i < ARRAY_SIZE(stats.buckets); i++)
BUG_ON(stats.buckets[i] > n);
BUG_ON(stats.buckets_alloc > n);
BUG_ON(stats.buckets_unavailable > n);
}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags)
{
if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) {
u64 used = __bch2_fs_sectors_used(c);
u64 cached = 0;
u64 avail = atomic64_read(&c->sectors_available);
int cpu;
for_each_possible_cpu(cpu)
cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache;
if (used + avail + cached > c->capacity)
panic("used %llu avail %llu cached %llu capacity %llu\n",
used, avail, cached, c->capacity);
}
}
#else
static void bch2_fs_stats_verify(struct bch_fs *c) {}
static void bch2_dev_stats_verify(struct bch_dev *ca) {}
static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {}
#endif
/*
* Clear journal_seq_valid for buckets for which it's not needed, to prevent
* wraparound:
*/
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
u16 last_seq_ondisk = c->journal.last_seq_ondisk;
struct bch_dev *ca;
struct bucket_array *buckets;
struct bucket *g;
struct bucket_mark m;
unsigned i;
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for_each_bucket(g, buckets) {
bucket_cmpxchg(g, m, ({
if (!m.journal_seq_valid ||
bucket_needs_journal_commit(m, last_seq_ondisk))
break;
m.journal_seq_valid = 0;
}));
}
up_read(&ca->bucket_lock);
}
}
#define bch2_usage_add(_acc, _stats) \
do { \
typeof(_acc) _a = (_acc), _s = (_stats); \
unsigned i; \
\
for (i = 0; i < sizeof(*_a) / sizeof(u64); i++) \
((u64 *) (_a))[i] += ((u64 *) (_s))[i]; \
} while (0)
#define bch2_usage_read_raw(_stats) \
({ \
typeof(*this_cpu_ptr(_stats)) _acc; \
int cpu; \
\
memset(&_acc, 0, sizeof(_acc)); \
\
for_each_possible_cpu(cpu) \
bch2_usage_add(&_acc, per_cpu_ptr((_stats), cpu)); \
\
_acc; \
})
#define bch2_usage_read_cached(_c, _cached, _uncached) \
({ \
typeof(_cached) _ret; \
unsigned _seq; \
\
do { \
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
? bch2_usage_read_raw(_uncached) \
: (_cached); \
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
\
_ret; \
})
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
{
return bch2_usage_read_raw(ca->usage_percpu);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
}
struct bch_fs_usage
__bch2_fs_usage_read(struct bch_fs *c)
{
return bch2_usage_read_raw(c->usage_percpu);
}
struct bch_fs_usage
bch2_fs_usage_read(struct bch_fs *c)
{
return bch2_usage_read_cached(c,
c->usage_cached,
c->usage_percpu);
}
struct fs_usage_sum {
u64 data;
u64 reserved;
};
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
{
struct fs_usage_sum sum = { 0 };
unsigned i;
for (i = 0; i < ARRAY_SIZE(stats.s); i++) {
sum.data += (stats.s[i].data[S_META] +
stats.s[i].data[S_DIRTY]) * (i + 1);
sum.reserved += stats.s[i].persistent_reserved * (i + 1);
}
sum.reserved += stats.online_reserved;
return sum;
}
#define RESERVE_FACTOR 6
static u64 reserve_factor(u64 r)
{
return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}
static u64 avail_factor(u64 r)
{
	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
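/*
 * Editorial note, not part of this commit: a quick worked example of the two
 * helpers above. With RESERVE_FACTOR = 6, reserve_factor(1000) =
 * 1000 + (round_up(1000, 64) >> 6) = 1000 + 16 = 1016, and
 * avail_factor(1016) = (1016 << 6) / 65 = 1000; i.e. avail_factor()
 * approximately inverts reserve_factor().
 */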
u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
{
struct fs_usage_sum sum = __fs_usage_sum(stats);
return sum.data + reserve_factor(sum.reserved);
}
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats)
{
return min(c->capacity, __bch2_fs_sectors_used(c, stats));
}
u64 bch2_fs_sectors_free(struct bch_fs *c, struct bch_fs_usage stats)
{
return avail_factor(c->capacity - bch2_fs_sectors_used(c, stats));
}
static inline int is_unavailable_bucket(struct bucket_mark m)
{
return !is_available_bucket(m);
}
static inline int is_fragmented_bucket(struct bucket_mark m,
struct bch_dev *ca)
{
if (!m.owned_by_allocator &&
m.data_type == BCH_DATA_USER &&
bucket_sectors_used(m))
return max_t(int, 0, (int) ca->mi.bucket_size -
bucket_sectors_used(m));
return 0;
}
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
? BCH_DATA_CACHED
: m.data_type;
}
static bool bucket_became_unavailable(struct bch_fs *c,
struct bucket_mark old,
struct bucket_mark new)
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
(!c || c->gc_pos.phase == GC_PHASE_DONE);
}
void bch2_fs_usage_apply(struct bch_fs *c,
struct bch_fs_usage *stats,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
{
struct fs_usage_sum sum = __fs_usage_sum(*stats);
s64 added = sum.data + sum.reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
BUG_ON(added > (s64) (disk_res ? disk_res->sectors : 0));
if (added > 0) {
disk_res->sectors -= added;
stats->online_reserved -= added;
}
percpu_down_read(&c->usage_lock);
preempt_disable();
/* online_reserved not subject to gc: */
this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
stats->online_reserved = 0;
if (!gc_will_visit(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
bch2_fs_stats_verify(c);
preempt_enable();
percpu_up_read(&c->usage_lock);
memset(stats, 0, sizeof(*stats));
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bucket_mark old, struct bucket_mark new)
{
struct bch_dev_usage *dev_usage;
if (c)
percpu_rwsem_assert_held(&c->usage_lock);
if (old.data_type && new.data_type &&
old.data_type != new.data_type) {
BUG_ON(!c);
bch2_fs_inconsistent(c,
"different types of data in same bucket: %s, %s",
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
}
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->buckets[bucket_type(old)]--;
dev_usage->buckets[bucket_type(new)]++;
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_unavailable +=
is_unavailable_bucket(new) - is_unavailable_bucket(old);
dev_usage->sectors[old.data_type] -= old.dirty_sectors;
dev_usage->sectors[new.data_type] += new.dirty_sectors;
dev_usage->sectors[BCH_DATA_CACHED] +=
(int) new.cached_sectors - (int) old.cached_sectors;
dev_usage->sectors_fragmented +=
is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
bch2_dev_stats_verify(ca);
}
#define bucket_data_cmpxchg(c, ca, g, new, expr) \
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(c, ca, _old, new); \
_old; \
})
bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
struct bucket *g;
struct bucket_mark new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, g, new, ({
if (!is_available_bucket(new))
return false;
new.owned_by_allocator = 1;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.gen++;
}));
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
return true;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
struct bucket *g;
struct bucket_mark old, new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
old = bucket_data_cmpxchg(c, ca, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
c->gc_pos.phase == GC_PHASE_DONE);
}
#define saturated_add(ca, dst, src, max) \
do { \
BUG_ON((int) (dst) + (src) < 0); \
if ((dst) == (max)) \
; \
else if ((dst) + (src) <= (max)) \
dst += (src); \
else { \
dst = (max); \
trace_sectors_saturated(ca); \
} \
} while (0)
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
struct bucket *g;
struct bucket_mark old, new;
BUG_ON(!type);
if (likely(c)) {
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
}
rcu_read_lock();
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
new.data_type = type;
}));
rcu_read_unlock();
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
}
/* Reverting this until the copygc + compression issue is fixed: */
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
{
if (!sectors)
return 0;
return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size,
crc.uncompressed_size));
}
/*
* Checking against gc's position has to be done here, inside the cmpxchg()
* loop, to avoid racing with the start of gc clearing all the marks - GC does
* that with the gc pos seqlock held.
*/
static void bch2_mark_pointer(struct bch_fs *c,
struct bkey_s_c_extent e,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
s64 sectors, enum s_alloc type,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
struct bucket_mark old, new;
unsigned saturated;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g = PTR_BUCKET(ca, ptr);
enum bch_data_type data_type = type == S_META
? BCH_DATA_BTREE : BCH_DATA_USER;
u64 v;
if (crc.compression_type) {
unsigned old_sectors, new_sectors;
if (sectors > 0) {
old_sectors = 0;
new_sectors = sectors;
} else {
old_sectors = e.k->size;
new_sectors = e.k->size + sectors;
}
sectors = -__disk_sectors(crc, old_sectors)
+__disk_sectors(crc, new_sectors);
}
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
return;
}
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
saturated = 0;
/*
* Check this after reading bucket mark to guard against
* the allocator invalidating a bucket after we've already
* checked the gen
*/
if (gen_after(new.gen, ptr->gen)) {
BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
EBUG_ON(!ptr->cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
return;
}
if (!ptr->cached &&
new.dirty_sectors == GC_MAX_SECTORS_USED &&
sectors < 0)
saturated = -sectors;
if (ptr->cached)
saturated_add(ca, new.cached_sectors, sectors,
GC_MAX_SECTORS_USED);
else
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
if (!new.dirty_sectors &&
!new.cached_sectors) {
new.data_type = 0;
if (journal_seq) {
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}
} else {
new.data_type = data_type;
}
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
g->_mark = new;
break;
}
} while ((v = atomic64_cmpxchg(&g->_mark.v,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, old, new);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
if (saturated &&
atomic_long_add_return(saturated,
&ca->saturated_count) >=
bucket_to_sector(ca, ca->free_inc.size)) {
if (c->gc_thread) {
trace_gc_sectors_saturated(c);
wake_up_process(c->gc_thread);
}
}
}
void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
/*
* synchronization w.r.t. GC:
*
* Normally, bucket sector counts/marks are updated on the fly, as
* references are added/removed from the btree, the lists of buckets the
* allocator owns, other metadata buckets, etc.
*
* When GC is in progress and going to mark this reference, we do _not_
* mark this reference here, to avoid double counting - GC will count it
* when it gets to it.
*
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
percpu_down_read(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
if (!stats)
stats = this_cpu_ptr(c->usage_percpu);
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED: {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, ptr, crc, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
if (replicas) {
BUG_ON(replicas - 1 > ARRAY_SIZE(stats->s));
stats->s[replicas - 1].data[type] += sectors;
}
break;
}
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
if (r.v->nr_replicas) {
BUG_ON(r.v->nr_replicas - 1 > ARRAY_SIZE(stats->s));
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
}
break;
}
}
percpu_up_read(&c->usage_lock);
}
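/*
 * Editorial sketch, not part of this commit: the shape of the comparison
 * described in the comment in bch2_mark_key() above. The real helper is
 * gc_will_visit(), used throughout this file; gc_pos_cmp() is assumed here to
 * be an ordinary three-way comparator over struct gc_pos.
 */
static inline bool example_gc_will_visit(struct bch_fs *c, struct gc_pos pos)
{
	/* GC will mark this reference itself iff it hasn't reached @pos yet: */
	return gc_pos_cmp(c->gc_pos, pos) < 0;
}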
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
return bch2_fs_sectors_free(c, bch2_fs_usage_read(c));
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_down_write(&c->usage_lock);
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
percpu_up_write(&c->usage_lock);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
percpu_up_read(&c->usage_lock);
res->sectors = 0;
}
#define SECTORS_CACHE 1024
int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
unsigned sectors, int flags)
{
struct bch_fs_usage *stats;
u64 old, v, get;
s64 sectors_available;
int ret;
percpu_down_read(&c->usage_lock);
preempt_disable();
stats = this_cpu_ptr(c->usage_percpu);
if (sectors <= stats->available_cache)
goto out;
v = atomic64_read(&c->sectors_available);
do {
old = v;
get = min((u64) sectors + SECTORS_CACHE, old);
if (get < sectors) {
preempt_enable();
percpu_up_read(&c->usage_lock);
goto recalculate;
}
} while ((v = atomic64_cmpxchg(&c->sectors_available,
old, old - get)) != old);
stats->available_cache += get;
out:
stats->available_cache -= sectors;
stats->online_reserved += sectors;
res->sectors += sectors;
bch2_disk_reservations_verify(c, flags);
bch2_fs_stats_verify(c);
preempt_enable();
percpu_up_read(&c->usage_lock);
return 0;
recalculate:
/*
* GC recalculates sectors_available when it starts, so that hopefully
* we don't normally end up blocking here:
*/
/*
* Piss fuck, we can be called from extent_insert_fixup() with btree
* locks held:
*/
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
down_read(&c->gc_lock);
else if (!down_read_trylock(&c->gc_lock))
return -EINTR;
}
percpu_down_write(&c->usage_lock);
sectors_available = __recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
max_t(s64, 0, sectors_available - sectors));
stats->online_reserved += sectors;
res->sectors += sectors;
ret = 0;
bch2_disk_reservations_verify(c, flags);
} else {
atomic64_set(&c->sectors_available, sectors_available);
ret = -ENOSPC;
}
bch2_fs_stats_verify(c);
percpu_up_write(&c->usage_lock);
if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
up_read(&c->gc_lock);
return ret;
}
/* Startup/shutdown: */
static void buckets_free_rcu(struct rcu_head *rcu)
{
struct bucket_array *buckets =
container_of(rcu, struct bucket_array, rcu);
kvpfree(buckets,
sizeof(struct bucket_array) +
buckets->nbuckets * sizeof(struct bucket));
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
struct bucket_array *buckets = NULL, *old_buckets = NULL;
unsigned long *buckets_dirty = NULL;
u8 *oldest_gens = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
copygc_heap copygc_heap;
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / c->opts.btree_node_size);
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 16, ca->mi.nbuckets >> 7);
size_t free_inc_reserve = copygc_reserve / 2;
bool resize = ca->buckets != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
unsigned i;
memset(&free, 0, sizeof(free));
memset(&free_inc, 0, sizeof(free_inc));
memset(&alloc_heap, 0, sizeof(alloc_heap));
memset(&copygc_heap, 0, sizeof(copygc_heap));
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO)) ||
!(oldest_gens = kvpmalloc(nbuckets * sizeof(u8),
GFP_KERNEL|__GFP_ZERO)) ||
!(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO)) ||
!init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_MOVINGGC],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&alloc_heap, free_inc_reserve, GFP_KERNEL) ||
!init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL))
goto err;
buckets->first_bucket = ca->mi.first_bucket;
buckets->nbuckets = nbuckets;
bch2_copygc_stop(ca);
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
percpu_down_write(&c->usage_lock);
}
old_buckets = bucket_array(ca);
if (resize) {
size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
memcpy(buckets->b,
old_buckets->b,
n * sizeof(struct bucket));
memcpy(oldest_gens,
ca->oldest_gens,
n * sizeof(u8));
memcpy(buckets_dirty,
ca->buckets_dirty,
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets, buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
if (resize)
percpu_up_write(&c->usage_lock);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
fifo_move(&free[i], &ca->free[i]);
swap(ca->free[i], free[i]);
}
fifo_move(&free_inc, &ca->free_inc);
swap(ca->free_inc, free_inc);
spin_unlock(&c->freelist_lock);
/* with gc lock held, alloc_heap can't be in use: */
swap(ca->alloc_heap, alloc_heap);
/* and we shut down copygc: */
swap(ca->copygc_heap, copygc_heap);
nbuckets = ca->mi.nbuckets;
if (resize) {
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
if (start_copygc &&
bch2_copygc_start(c, ca))
bch_err(ca, "error restarting copygc thread");
ret = 0;
err:
free_heap(&copygc_heap);
free_heap(&alloc_heap);
free_fifo(&free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&free[i]);
kvpfree(buckets_dirty,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
kvpfree(oldest_gens,
nbuckets * sizeof(u8));
if (buckets)
call_rcu(&old_buckets->rcu, buckets_free_rcu);
return ret;
}
void bch2_dev_buckets_free(struct bch_dev *ca)
{
unsigned i;
free_heap(&ca->copygc_heap);
free_heap(&ca->alloc_heap);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
return -ENOMEM;
	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}

276
fs/bcachefs/buckets.h Normal file
View File

@ -0,0 +1,276 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Code for manipulating bucket marks for garbage collection.
*
* Copyright 2014 Datera, Inc.
*/
#ifndef _BUCKETS_H
#define _BUCKETS_H
#include "buckets_types.h"
#include "super.h"
#define for_each_bucket(_b, _buckets) \
for (_b = (_buckets)->b + (_buckets)->first_bucket; \
_b < (_buckets)->b + (_buckets)->nbuckets; _b++)
#define bucket_cmpxchg(g, new, expr) \
({ \
u64 _v = atomic64_read(&(g)->_mark.v); \
struct bucket_mark _old; \
\
do { \
(new).v.counter = _old.v.counter = _v; \
expr; \
} while ((_v = atomic64_cmpxchg(&(g)->_mark.v, \
_old.v.counter, \
(new).v.counter)) != _old.v.counter);\
_old; \
})
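/*
 * Editorial usage sketch, not part of this commit: atomically account newly
 * written sectors to a bucket with the macro above. "new" is filled in from
 * the bucket's current mark before the expression runs, and the cmpxchg loop
 * retries on concurrent modification. The helper name is hypothetical.
 */
static inline void example_add_dirty_sectors(struct bucket *g, u16 sectors)
{
	struct bucket_mark new;

	bucket_cmpxchg(g, new, new.dirty_sectors += sectors);
}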
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets,
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
struct bucket_array *buckets = bucket_array(ca);
BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
return buckets->b + b;
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
{
return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
return sector_to_bucket(ca, ptr->offset);
}
static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
return bucket(ca, PTR_BUCKET_NR(ca, ptr));
}
static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
struct bucket_mark m;
rcu_read_lock();
m = READ_ONCE(bucket(ca, PTR_BUCKET_NR(ca, ptr))->mark);
rcu_read_unlock();
return m;
}
static inline int gen_cmp(u8 a, u8 b)
{
return (s8) (a - b);
}
static inline int gen_after(u8 a, u8 b)
{
int r = gen_cmp(a, b);
return r > 0 ? r : 0;
}
/**
* ptr_stale() - check if a pointer points into a bucket that has been
* invalidated.
*/
static inline u8 ptr_stale(struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
{
return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen);
}
/* bucket gc marks */
/* The dirty and cached sector counts saturate. If this occurs,
* reference counting alone will not free the bucket, and a btree
* GC must be performed. */
#define GC_MAX_SECTORS_USED ((1U << 15) - 1)
static inline unsigned bucket_sectors_used(struct bucket_mark mark)
{
return mark.dirty_sectors + mark.cached_sectors;
}
static inline bool bucket_unused(struct bucket_mark mark)
{
return !mark.owned_by_allocator &&
!mark.data_type &&
!bucket_sectors_used(mark);
}
/* Device usage: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
return total - stats.buckets_unavailable;
}
/*
* Number of reclaimable buckets - only for use by the allocator thread:
*/
static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca));
}
static inline u64 __dev_buckets_free(struct bch_dev *ca,
struct bch_dev_usage stats)
{
return __dev_buckets_available(ca, stats) +
fifo_used(&ca->free[RESERVE_NONE]) +
fifo_used(&ca->free_inc);
}
static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
{
return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca));
}
/* Filesystem usage: */
static inline enum bch_data_type s_alloc_to_data_type(enum s_alloc s)
{
switch (s) {
case S_META:
return BCH_DATA_BTREE;
case S_DIRTY:
return BCH_DATA_USER;
default:
BUG();
}
}
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
u64 bch2_fs_sectors_free(struct bch_fs *, struct bch_fs_usage);
static inline bool is_available_bucket(struct bucket_mark mark)
{
return (!mark.owned_by_allocator &&
!mark.dirty_sectors &&
!mark.nouse);
}
static inline bool bucket_needs_journal_commit(struct bucket_mark m,
u16 last_seq_ondisk)
{
return m.journal_seq_valid &&
((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
}
void bch2_bucket_seq_cleanup(struct bch_fs *);
bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *,
size_t, struct bucket_mark *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *,
size_t, bool, struct gc_pos, unsigned);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_recalc_sectors_available(struct bch_fs *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,
struct disk_reservation *res)
{
if (res->sectors)
__bch2_disk_reservation_put(c, res);
}
#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
#define BCH_DISK_RESERVATION_GC_LOCK_HELD (1 << 1)
#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD (1 << 2)
int bch2_disk_reservation_add(struct bch_fs *,
struct disk_reservation *,
unsigned, int);
static inline struct disk_reservation
bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
{
return (struct disk_reservation) {
.sectors = 0,
#if 0
/* not used yet: */
.gen = c->capacity_gen,
#endif
.nr_replicas = nr_replicas,
};
}
static inline int bch2_disk_reservation_get(struct bch_fs *c,
struct disk_reservation *res,
unsigned sectors,
unsigned nr_replicas,
int flags)
{
*res = bch2_disk_reservation_init(c, nr_replicas);
return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags);
}
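/*
 * Editorial usage sketch, not part of this commit: take a reservation before
 * doing a write, and release it if the write is abandoned. The helper name is
 * hypothetical; a real caller would pass &res down to the btree insert path so
 * that bch2_fs_usage_apply() consumes it.
 */
static inline int example_reserve_for_write(struct bch_fs *c, unsigned sectors)
{
	struct disk_reservation res;
	int ret;

	ret = bch2_disk_reservation_get(c, &res, sectors, 1, 0);
	if (ret)
		return ret;

	/* ... do the write ... */

	bch2_disk_reservation_put(c, &res);
	return 0;
}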
int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
void bch2_dev_buckets_free(struct bch_dev *);
int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
#endif /* _BUCKETS_H */

96
fs/bcachefs/buckets_types.h Normal file
View File

@ -0,0 +1,96 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BUCKETS_TYPES_H
#define _BUCKETS_TYPES_H
#include "util.h"
struct bucket_mark {
union {
struct {
atomic64_t v;
};
struct {
u8 gen;
u8 data_type:3,
gen_valid:1,
owned_by_allocator:1,
nouse:1,
journal_seq_valid:1;
u16 dirty_sectors;
u16 cached_sectors;
/*
* low bits of journal sequence number when this bucket was most
* recently modified: if journal_seq_valid is set, this bucket
* can't be reused until the journal sequence number written to
* disk is >= the bucket's journal sequence number:
*/
u16 journal_seq;
};
};
};
struct bucket {
union {
struct bucket_mark _mark;
const struct bucket_mark mark;
};
u16 io_time[2];
};
struct bucket_array {
struct rcu_head rcu;
u16 first_bucket;
size_t nbuckets;
struct bucket b[];
};
struct bch_dev_usage {
u64 buckets[BCH_DATA_NR];
u64 buckets_alloc;
u64 buckets_unavailable;
/* _compressed_ sectors: */
u64 sectors[BCH_DATA_NR];
u64 sectors_fragmented;
};
/* kill, switch to bch_data_type? */
enum s_alloc {
S_META,
S_DIRTY,
S_ALLOC_NR,
};
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
/* _uncompressed_ sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
};
/*
* A reservation for space on disk:
*/
struct disk_reservation {
u64 sectors;
u32 gen;
unsigned nr_replicas;
};
struct copygc_heap_entry {
u8 gen;
u32 sectors;
u64 offset;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;
#endif /* _BUCKETS_TYPES_H */

663
fs/bcachefs/chardev.c Normal file
View File

@ -0,0 +1,663 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_CHARDEV
#include "bcachefs.h"
#include "alloc.h"
#include "bcachefs_ioctl.h"
#include "buckets.h"
#include "chardev.h"
#include "move.h"
#include "super.h"
#include "super-io.h"
#include <linux/anon_inodes.h>
#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
#include <linux/kthread.h>
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
{
struct bch_dev *ca;
if (flags & BCH_BY_INDEX) {
if (dev >= c->sb.nr_devices)
return ERR_PTR(-EINVAL);
rcu_read_lock();
ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
if (!ca)
return ERR_PTR(-EINVAL);
} else {
char *path;
path = strndup_user((const char __user *)
(unsigned long) dev, PATH_MAX);
if (IS_ERR(path))
return ERR_CAST(path);
ca = bch2_dev_lookup(c, path);
kfree(path);
}
return ca;
}
#if 0
static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
{
struct bch_ioctl_assemble arg;
struct bch_fs *c;
u64 *user_devs = NULL;
char **devs = NULL;
unsigned i;
int ret = -EFAULT;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
if (arg.flags || arg.pad)
return -EINVAL;
user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
if (!user_devs)
return -ENOMEM;
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, user_arg->devs,
sizeof(u64) * arg.nr_devs))
goto err;
for (i = 0; i < arg.nr_devs; i++) {
devs[i] = strndup_user((const char __user *)(unsigned long)
user_devs[i],
PATH_MAX);
if (!devs[i]) {
ret = -ENOMEM;
goto err;
}
}
c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
ret = PTR_ERR_OR_ZERO(c);
if (!ret)
closure_put(&c->cl);
err:
if (devs)
for (i = 0; i < arg.nr_devs; i++)
kfree(devs[i]);
kfree(devs);
return ret;
}
static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
{
struct bch_ioctl_incremental arg;
const char *err;
char *path;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
if (arg.flags || arg.pad)
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
if (!path)
return -ENOMEM;
err = bch2_fs_open_incremental(path);
kfree(path);
if (err) {
pr_err("Could not register bcachefs devices: %s", err);
return -EINVAL;
}
return 0;
}
#endif
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
return bch2_ioctl_assemble(arg);
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
default:
return -ENOTTY;
}
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
return copy_to_user(&user_arg->uuid,
&c->sb.user_uuid,
sizeof(c->sb.user_uuid));
}
#if 0
static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
{
if (arg.flags || arg.pad)
return -EINVAL;
return bch2_fs_start(c) ? -EIO : 0;
}
static long bch2_ioctl_stop(struct bch_fs *c)
{
bch2_fs_stop(c);
return 0;
}
#endif
static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
{
char *path;
int ret;
if (arg.flags || arg.pad)
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
if (!path)
return -ENOMEM;
ret = bch2_dev_add(c, path);
kfree(path);
return ret;
}
static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
{
struct bch_dev *ca;
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
BCH_BY_INDEX)) ||
arg.pad)
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
return bch2_dev_remove(c, ca, arg.flags);
}
static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
{
char *path;
int ret;
if (arg.flags || arg.pad)
return -EINVAL;
path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
if (!path)
return -ENOMEM;
ret = bch2_dev_online(c, path);
kfree(path);
return ret;
}
static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
{
struct bch_dev *ca;
int ret;
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
BCH_BY_INDEX)) ||
arg.pad)
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
ret = bch2_dev_offline(c, ca, arg.flags);
percpu_ref_put(&ca->ref);
return ret;
}
static long bch2_ioctl_disk_set_state(struct bch_fs *c,
struct bch_ioctl_disk_set_state arg)
{
struct bch_dev *ca;
int ret;
if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
BCH_FORCE_IF_METADATA_LOST|
BCH_FORCE_IF_DEGRADED|
BCH_BY_INDEX)) ||
arg.pad[0] || arg.pad[1] || arg.pad[2])
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags);
percpu_ref_put(&ca->ref);
return ret;
}
struct bch_data_ctx {
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
int ret;
struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
struct bch_data_ctx *ctx = arg;
ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
struct bch_data_ctx *ctx = file->private_data;
kthread_stop(ctx->thread);
put_task_struct(ctx->thread);
kfree(ctx);
return 0;
}
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
struct bch_data_ctx *ctx = file->private_data;
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
.p.data_type = ctx->stats.data_type,
.p.btree_id = ctx->stats.iter.btree_id,
.p.pos = ctx->stats.iter.pos,
.p.sectors_done = atomic64_read(&ctx->stats.sectors_seen),
.p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
};
if (len < sizeof(e))
return -EINVAL;
return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
.release = bch2_data_job_release,
.read = bch2_data_job_read,
.llseek = no_llseek,
};
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
struct bch_data_ctx *ctx = NULL;
struct file *file = NULL;
unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
int ret, fd = -1;
if (arg.op >= BCH_DATA_OP_NR || arg.flags)
return -EINVAL;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx)
return -ENOMEM;
ctx->c = c;
ctx->arg = arg;
ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
if (IS_ERR(ctx->thread)) {
ret = PTR_ERR(ctx->thread);
goto err;
}
ret = get_unused_fd_flags(flags);
if (ret < 0)
goto err;
fd = ret;
file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
goto err;
}
fd_install(fd, file);
get_task_struct(ctx->thread);
wake_up_process(ctx->thread);
return fd;
err:
if (fd >= 0)
put_unused_fd(fd);
if (!IS_ERR_OR_NULL(ctx->thread))
kthread_stop(ctx->thread);
kfree(ctx);
return ret;
}
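/*
 * Editorial sketch, not part of this commit: how userspace might consume the
 * file descriptor returned by BCH_IOCTL_DATA, reading progress as
 * struct bch_ioctl_data_event records (assumed to be defined in
 * bcachefs_ioctl.h, matching what bch2_data_job_read() above fills in). The
 * fd is opened O_NONBLOCK, so a real consumer would poll() between reads:
 *
 *	struct bch_ioctl_data_event e;
 *	int fd = ioctl(fs_fd, BCH_IOCTL_DATA, &arg);
 *
 *	while (read(fd, &e, sizeof(e)) == sizeof(e) &&
 *	       e.p.data_type != U8_MAX)
 *		printf("%llu/%llu sectors done\n",
 *		       (unsigned long long) e.p.sectors_done,
 *		       (unsigned long long) e.p.sectors_total);
 *	close(fd);
 */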
static long bch2_ioctl_usage(struct bch_fs *c,
struct bch_ioctl_usage __user *user_arg)
{
struct bch_ioctl_usage arg;
struct bch_dev *ca;
unsigned i, j;
int ret;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
for (i = 0; i < arg.nr_devices; i++) {
struct bch_ioctl_dev_usage dst = { .alive = 0 };
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
if (ret)
return ret;
}
{
struct bch_fs_usage src = bch2_fs_usage_read(c);
struct bch_ioctl_fs_usage dst = {
.capacity = c->capacity,
.used = bch2_fs_sectors_used(c, src),
.online_reserved = src.online_reserved,
};
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
dst.persistent_reserved[i] =
src.s[i].persistent_reserved;
for (j = 0; j < S_ALLOC_NR; j++)
dst.sectors[s_alloc_to_data_type(j)][i] =
src.s[i].data[j];
}
ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
if (ret)
return ret;
}
for_each_member_device(ca, c, i) {
struct bch_dev_usage src = bch2_dev_usage_read(c, ca);
struct bch_ioctl_dev_usage dst = {
.alive = 1,
.state = ca->mi.state,
.bucket_size = ca->mi.bucket_size,
.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket,
};
if (ca->dev_idx >= arg.nr_devices) {
percpu_ref_put(&ca->ref);
return -ERANGE;
}
if (percpu_ref_tryget(&ca->io_ref)) {
dst.dev = huge_encode_dev(ca->disk_sb.bdev->bd_dev);
percpu_ref_put(&ca->io_ref);
}
for (j = 0; j < BCH_DATA_NR; j++) {
dst.buckets[j] = src.buckets[j];
dst.sectors[j] = src.sectors[j];
}
ret = copy_to_user(&user_arg->devs[i], &dst, sizeof(dst));
if (ret)
return ret;
}
return 0;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
struct bch_ioctl_read_super arg)
{
struct bch_dev *ca = NULL;
struct bch_sb *sb;
int ret = 0;
if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
arg.pad)
return -EINVAL;
mutex_lock(&c->sb_lock);
if (arg.flags & BCH_READ_DEV) {
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca)) {
ret = PTR_ERR(ca);
goto err;
}
sb = ca->disk_sb.sb;
} else {
sb = c->disk_sb.sb;
}
if (vstruct_bytes(sb) > arg.size) {
ret = -ERANGE;
goto err;
}
ret = copy_to_user((void __user *)(unsigned long)arg.sb,
sb, vstruct_bytes(sb));
err:
if (ca)
percpu_ref_put(&ca->ref);
mutex_unlock(&c->sb_lock);
return ret;
}
static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_ioctl_disk_get_idx arg)
{
dev_t dev = huge_decode_dev(arg.dev);
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
if (ca->disk_sb.bdev->bd_dev == dev) {
percpu_ref_put(&ca->io_ref);
return i;
}
return -ENOENT;
}
static long bch2_ioctl_disk_resize(struct bch_fs *c,
struct bch_ioctl_disk_resize arg)
{
struct bch_dev *ca;
int ret;
if ((arg.flags & ~BCH_BY_INDEX) ||
arg.pad)
return -EINVAL;
ca = bch2_device_lookup(c, arg.dev, arg.flags);
if (IS_ERR(ca))
return PTR_ERR(ca);
ret = bch2_dev_resize(c, ca, arg.nbuckets);
percpu_ref_put(&ca->ref);
return ret;
}
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
\
if (copy_from_user(&i, arg, sizeof(i))) \
return -EFAULT; \
return bch2_ioctl_##_name(c, i); \
} while (0)
long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
{
/* ioctls that don't require admin cap: */
switch (cmd) {
case BCH_IOCTL_QUERY_UUID:
return bch2_ioctl_query_uuid(c, arg);
case BCH_IOCTL_USAGE:
return bch2_ioctl_usage(c, arg);
}
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
#endif
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
/* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_REMOVE:
BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_ONLINE:
BCH_IOCTL(disk_online, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_OFFLINE:
BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_SET_STATE:
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
default:
return -ENOTTY;
}
}
static DEFINE_IDR(bch_chardev_minor);
static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
{
unsigned minor = iminor(file_inode(filp));
struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
void __user *arg = (void __user *) v;
return c
? bch2_fs_ioctl(c, cmd, arg)
: bch2_global_ioctl(cmd, arg);
}
static const struct file_operations bch_chardev_fops = {
.owner = THIS_MODULE,
.unlocked_ioctl = bch2_chardev_ioctl,
.open = nonseekable_open,
};
static int bch_chardev_major;
static struct class *bch_chardev_class;
static struct device *bch_chardev;
void bch2_fs_chardev_exit(struct bch_fs *c)
{
if (!IS_ERR_OR_NULL(c->chardev))
device_unregister(c->chardev);
if (c->minor >= 0)
idr_remove(&bch_chardev_minor, c->minor);
}
int bch2_fs_chardev_init(struct bch_fs *c)
{
c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL);
if (c->minor < 0)
return c->minor;
c->chardev = device_create(bch_chardev_class, NULL,
MKDEV(bch_chardev_major, c->minor), c,
"bcachefs%u-ctl", c->minor);
if (IS_ERR(c->chardev))
return PTR_ERR(c->chardev);
return 0;
}
void bch2_chardev_exit(void)
{
if (!IS_ERR_OR_NULL(bch_chardev_class))
device_destroy(bch_chardev_class,
MKDEV(bch_chardev_major, U8_MAX));
if (!IS_ERR_OR_NULL(bch_chardev_class))
class_destroy(bch_chardev_class);
if (bch_chardev_major > 0)
unregister_chrdev(bch_chardev_major, "bcachefs");
}
int __init bch2_chardev_init(void)
{
bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
if (bch_chardev_major < 0)
return bch_chardev_major;
bch_chardev_class = class_create("bcachefs");
if (IS_ERR(bch_chardev_class))
return PTR_ERR(bch_chardev_class);
bch_chardev = device_create(bch_chardev_class, NULL,
MKDEV(bch_chardev_major, U8_MAX),
NULL, "bcachefs-ctl");
if (IS_ERR(bch_chardev))
return PTR_ERR(bch_chardev);
return 0;
}
#endif /* NO_BCACHEFS_CHARDEV */

31
fs/bcachefs/chardev.h Normal file
View File

@ -0,0 +1,31 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CHARDEV_H
#define _BCACHEFS_CHARDEV_H
#ifndef NO_BCACHEFS_FS
long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
void bch2_fs_chardev_exit(struct bch_fs *);
int bch2_fs_chardev_init(struct bch_fs *);
void bch2_chardev_exit(void);
int __init bch2_chardev_init(void);
#else
static inline long bch2_fs_ioctl(struct bch_fs *c,
unsigned cmd, void __user * arg)
{
return -ENOSYS;
}
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
static inline void bch2_chardev_exit(void) {}
static inline int __init bch2_chardev_init(void) { return 0; }
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_CHARDEV_H */

753
fs/bcachefs/checksum.c Normal file
View File

@ -0,0 +1,753 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
#include "super.h"
#include "super-io.h"
#include <linux/crc32c.h>
#include <linux/crypto.h>
#include <linux/key.h>
#include <linux/random.h>
#include <linux/scatterlist.h>
#include <crypto/algapi.h>
#include <crypto/chacha.h>
#include <crypto/hash.h>
#include <crypto/poly1305.h>
#include <crypto/skcipher.h>
#include <keys/user-type.h>
/*
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (any
 * use permitted, subject to the terms of the PostgreSQL license).
 *
 * A 64-bit CRC looks just like the usual sort of 32-bit implementation, only
 * with a wider register. (See Ross Williams' excellent introduction
 * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
 * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
*
* The present implementation is a normal (not "reflected", in Williams'
* terms) 64-bit CRC, using initial all-ones register contents and a final
* bit inversion. The chosen polynomial is borrowed from the DLT1 spec
* (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
*
* x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
* x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
* x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
* x^7 + x^4 + x + 1
*/
static const u64 crc_table[256] = {
0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL,
0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL,
0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL,
0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL,
0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL,
0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL,
0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL,
0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL,
0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL,
0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL,
0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL,
0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL,
0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL,
0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL,
0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL,
0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL,
0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL,
0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL,
0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL,
0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL,
0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL,
0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL,
0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL,
0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL,
0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL,
0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL,
0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL,
0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL,
0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL,
0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL,
0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL,
0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL,
0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL,
0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL,
0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL,
0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL,
0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL,
0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL,
0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL,
0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL,
0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL,
0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL,
0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL,
0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL,
0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL,
0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL,
0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL,
0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL,
0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL,
0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL,
0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL,
0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL,
0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL,
0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL,
0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL,
0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL,
0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL,
0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL,
0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL,
0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL,
0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL,
0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL,
0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL,
0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL,
0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL,
0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL,
0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL,
0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL,
0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL,
0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL,
0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL,
0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL,
0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL,
0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL,
0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL,
0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL,
0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL,
0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL,
0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL,
0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL,
0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL,
0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL,
0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL,
0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL,
0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL,
0x9AFCE626CE85B507ULL,
};
u64 bch2_crc64_update(u64 crc, const void *_data, size_t len)
{
const unsigned char *data = _data;
while (len--) {
int i = ((int) (crc >> 56) ^ *data++) & 0xFF;
crc = crc_table[i] ^ (crc << 8);
}
return crc;
}
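/*
 * Illustrative sketch (crc64_table_entry() is not used by bcachefs; it's only
 * here to document how crc_table[] relates to the ECMA-182 polynomial): each
 * table entry is the CRC of a single byte fed through the bitwise algorithm,
 * e.g. crc64_table_entry(1) == 0x42F0E1EBA9EA3693ULL == crc_table[1].
 */
static __maybe_unused u64 crc64_table_entry(unsigned i)
{
	const u64 poly = 0x42F0E1EBA9EA3693ULL;	/* ECMA-182, normal form */
	u64 crc = (u64) i << 56;
	unsigned bit;

	for (bit = 0; bit < 8; bit++)
		crc = (crc & (1ULL << 63)) ? (crc << 1) ^ poly : crc << 1;

	return crc;
}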
static u64 bch2_checksum_init(unsigned type)
{
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C_NONZERO:
return U32_MAX;
case BCH_CSUM_CRC64_NONZERO:
return U64_MAX;
case BCH_CSUM_CRC32C:
return 0;
case BCH_CSUM_CRC64:
return 0;
default:
BUG();
}
}
static u64 bch2_checksum_final(unsigned type, u64 crc)
{
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C_NONZERO:
return crc ^ U32_MAX;
case BCH_CSUM_CRC64_NONZERO:
return crc ^ U64_MAX;
case BCH_CSUM_CRC32C:
return crc;
case BCH_CSUM_CRC64:
return crc;
default:
BUG();
}
}
static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
{
switch (type) {
case BCH_CSUM_NONE:
return 0;
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC32C:
return crc32c(crc, data, len);
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC64:
return bch2_crc64_update(crc, data, len);
default:
BUG();
}
}
static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
struct scatterlist *sg, size_t len)
{
SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
skcipher_request_set_sync_tfm(req, tfm);
skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
ret = crypto_skcipher_encrypt(req);
BUG_ON(ret);
}
static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
struct nonce nonce,
void *buf, size_t len)
{
struct scatterlist sg;
sg_init_one(&sg, buf, len);
do_encrypt_sg(tfm, nonce, &sg, len);
}
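/*
 * Note: ChaCha20 is a stream cipher - "encryption" is just an xor with the
 * keystream - so do_encrypt() is used for both encryption and decryption;
 * there is deliberately no separate do_decrypt().
 */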
int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
void *buf, size_t len)
{
struct crypto_sync_skcipher *chacha20 =
crypto_alloc_sync_skcipher("chacha20", 0, 0);
int ret;
if (IS_ERR(chacha20)) {
pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
return PTR_ERR(chacha20);
}
ret = crypto_skcipher_setkey(&chacha20->base,
(void *) key, sizeof(*key));
if (ret) {
pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
}
do_encrypt(chacha20, nonce, buf, len);
err:
crypto_free_sync_skcipher(chacha20);
return ret;
}
static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
struct nonce nonce)
{
u8 key[POLY1305_KEY_SIZE];
nonce.d[3] ^= BCH_NONCE_POLY;
memset(key, 0, sizeof(key));
do_encrypt(c->chacha20, nonce, key, sizeof(key));
desc->tfm = c->poly1305;
crypto_shash_init(desc);
crypto_shash_update(desc, key, sizeof(key));
}
struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
struct nonce nonce, const void *data, size_t len)
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
crc = bch2_checksum_update(type, crc, data, len);
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
case BCH_CSUM_CHACHA20_POLY1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
gen_poly_key(c, desc, nonce);
crypto_shash_update(desc, data, len);
crypto_shash_final(desc, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
}
default:
BUG();
}
}
void bch2_encrypt(struct bch_fs *c, unsigned type,
struct nonce nonce, void *data, size_t len)
{
if (!bch2_csum_type_is_encryption(type))
return;
do_encrypt(c->chacha20, nonce, data, len);
}
static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio,
struct bvec_iter *iter)
{
struct bio_vec bv;
switch (type) {
case BCH_CSUM_NONE:
return (struct bch_csum) { 0 };
case BCH_CSUM_CRC32C_NONZERO:
case BCH_CSUM_CRC64_NONZERO:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64: {
u64 crc = bch2_checksum_init(type);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crc = bch2_checksum_update(type,
crc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crc = bch2_checksum_update(type, crc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crc = bch2_checksum_final(type, crc);
return (struct bch_csum) { .lo = cpu_to_le64(crc) };
}
case BCH_CSUM_CHACHA20_POLY1305_80:
case BCH_CSUM_CHACHA20_POLY1305_128: {
SHASH_DESC_ON_STACK(desc, c->poly1305);
u8 digest[POLY1305_DIGEST_SIZE];
struct bch_csum ret = { 0 };
gen_poly_key(c, desc, nonce);
#ifdef CONFIG_HIGHMEM
__bio_for_each_segment(bv, bio, *iter, *iter) {
void *p = kmap_atomic(bv.bv_page) + bv.bv_offset;
crypto_shash_update(desc, p, bv.bv_len);
kunmap_atomic(p);
}
#else
__bio_for_each_contig_segment(bv, bio, *iter, *iter)
crypto_shash_update(desc,
page_address(bv.bv_page) + bv.bv_offset,
bv.bv_len);
#endif
crypto_shash_final(desc, digest);
memcpy(&ret, digest, bch_crc_bytes[type]);
return ret;
}
default:
BUG();
}
}
struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bvec_iter iter = bio->bi_iter;
return __bch2_checksum_bio(c, type, nonce, bio, &iter);
}
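/*
 * Encrypt/decrypt a bio in place: bvecs are batched into an on-stack
 * scatterlist of up to 16 entries; whenever it fills up we encrypt what we
 * have so far and advance the nonce by the number of bytes consumed.
 */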
void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
struct nonce nonce, struct bio *bio)
{
struct bio_vec bv;
struct bvec_iter iter;
struct scatterlist sgl[16], *sg = sgl;
size_t bytes = 0;
if (!bch2_csum_type_is_encryption(type))
return;
sg_init_table(sgl, ARRAY_SIZE(sgl));
bio_for_each_segment(bv, bio, iter) {
if (sg == sgl + ARRAY_SIZE(sgl)) {
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
nonce = nonce_add(nonce, bytes);
bytes = 0;
sg_init_table(sgl, ARRAY_SIZE(sgl));
sg = sgl;
}
sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset);
bytes += bv.bv_len;
}
sg_mark_end(sg - 1);
do_encrypt_sg(c->chacha20, nonce, sgl, bytes);
}
static inline bool bch2_checksum_mergeable(unsigned type)
{
switch (type) {
case BCH_CSUM_NONE:
case BCH_CSUM_CRC32C:
case BCH_CSUM_CRC64:
return true;
default:
return false;
}
}
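/*
 * Checksum merging: to get the checksum of the concatenation A+B, extend A's
 * checksum with b_len zero bytes (fed in a page at a time from ZERO_PAGE) and
 * then xor in B's checksum. This only works for checksums with a zero initial
 * value and no final xor, which is why the _NONZERO variants aren't mergeable.
 */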
static struct bch_csum bch2_checksum_merge(unsigned type,
struct bch_csum a,
struct bch_csum b, size_t b_len)
{
BUG_ON(!bch2_checksum_mergeable(type));
while (b_len) {
unsigned b = min_t(unsigned, b_len, PAGE_SIZE);
a.lo = bch2_checksum_update(type, a.lo,
page_address(ZERO_PAGE(0)), b);
b_len -= b;
}
a.lo ^= b.lo;
a.hi ^= b.hi;
return a;
}
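/*
 * Recompute checksums for up to three consecutive sub-ranges of a bio whose
 * whole-extent checksum is crc_old: len_a sectors for crc_a, len_b sectors
 * for crc_b, plus whatever remains. If the old and new checksum types match
 * and are mergeable, the per-range checksums are merged and verified against
 * crc_old; otherwise the whole bio is re-checksummed with the old type.
 * Returns -EIO if the existing checksum fails to verify.
 */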
int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
struct bversion version,
struct bch_extent_crc_unpacked crc_old,
struct bch_extent_crc_unpacked *crc_a,
struct bch_extent_crc_unpacked *crc_b,
unsigned len_a, unsigned len_b,
unsigned new_csum_type)
{
struct bvec_iter iter = bio->bi_iter;
struct nonce nonce = extent_nonce(version, crc_old);
struct bch_csum merged = { 0 };
struct crc_split {
struct bch_extent_crc_unpacked *crc;
unsigned len;
unsigned csum_type;
struct bch_csum csum;
} splits[3] = {
{ crc_a, len_a, new_csum_type },
{ crc_b, len_b, new_csum_type },
{ NULL, bio_sectors(bio) - len_a - len_b, new_csum_type },
}, *i;
bool mergeable = crc_old.csum_type == new_csum_type &&
bch2_checksum_mergeable(new_csum_type);
unsigned crc_nonce = crc_old.nonce;
BUG_ON(len_a + len_b > bio_sectors(bio));
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
BUG_ON(crc_old.compression_type);
BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
bch2_csum_type_is_encryption(new_csum_type));
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
iter.bi_size = i->len << 9;
if (mergeable || i->crc)
i->csum = __bch2_checksum_bio(c, i->csum_type,
nonce, bio, &iter);
else
bio_advance_iter(bio, &iter, i->len << 9);
nonce = nonce_add(nonce, i->len << 9);
}
if (mergeable)
for (i = splits; i < splits + ARRAY_SIZE(splits); i++)
merged = bch2_checksum_merge(new_csum_type, merged,
i->csum, i->len << 9);
else
merged = bch2_checksum_bio(c, crc_old.csum_type,
extent_nonce(version, crc_old), bio);
if (bch2_crc_cmp(merged, crc_old.csum))
return -EIO;
for (i = splits; i < splits + ARRAY_SIZE(splits); i++) {
if (i->crc)
*i->crc = (struct bch_extent_crc_unpacked) {
.csum_type = i->csum_type,
.compressed_size = i->len,
.uncompressed_size = i->len,
.offset = 0,
.live_size = i->len,
.nonce = crc_nonce,
.csum = i->csum,
};
if (bch2_csum_type_is_encryption(new_csum_type))
crc_nonce += i->len;
}
return 0;
}
#ifdef __KERNEL__
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{
char key_description[60];
struct key *keyring_key;
const struct user_key_payload *ukp;
int ret;
snprintf(key_description, sizeof(key_description),
"bcachefs:%pUb", &sb->user_uuid);
keyring_key = request_key(&key_type_logon, key_description, NULL);
if (IS_ERR(keyring_key))
return PTR_ERR(keyring_key);
down_read(&keyring_key->sem);
ukp = dereference_key_locked(keyring_key);
if (ukp->datalen == sizeof(*key)) {
memcpy(key, ukp->data, ukp->datalen);
ret = 0;
} else {
ret = -EINVAL;
}
up_read(&keyring_key->sem);
key_put(keyring_key);
return ret;
}
#else
#include <keyutils.h>
#include <uuid/uuid.h>
int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
{
key_serial_t key_id;
char key_description[60];
char uuid[40];
uuid_unparse_lower(sb->user_uuid.b, uuid);
sprintf(key_description, "bcachefs:%s", uuid);
key_id = request_key("user", key_description, NULL,
KEY_SPEC_USER_KEYRING);
if (key_id < 0)
return -errno;
if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key))
return -1;
return 0;
}
#endif
int bch2_decrypt_sb_key(struct bch_fs *c,
struct bch_sb_field_crypt *crypt,
struct bch_key *key)
{
struct bch_encrypted_key sb_key = crypt->key;
struct bch_key user_key;
int ret = 0;
/* is key encrypted? */
if (!bch2_key_is_encrypted(&sb_key))
goto out;
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
/* decrypt real key: */
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
&sb_key, sizeof(sb_key));
if (ret)
goto err;
if (bch2_key_is_encrypted(&sb_key)) {
bch_err(c, "incorrect encryption key");
ret = -EINVAL;
goto err;
}
out:
*key = sb_key.key;
err:
memzero_explicit(&sb_key, sizeof(sb_key));
memzero_explicit(&user_key, sizeof(user_key));
return ret;
}
static int bch2_alloc_ciphers(struct bch_fs *c)
{
if (!c->chacha20)
c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
if (IS_ERR(c->chacha20)) {
bch_err(c, "error requesting chacha20 module: %li",
PTR_ERR(c->chacha20));
return PTR_ERR(c->chacha20);
}
if (!c->poly1305)
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
if (IS_ERR(c->poly1305)) {
bch_err(c, "error requesting poly1305 module: %li",
PTR_ERR(c->poly1305));
return PTR_ERR(c->poly1305);
}
return 0;
}
int bch2_disable_encryption(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
struct bch_key key;
int ret = -EINVAL;
mutex_lock(&c->sb_lock);
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
/* is key encrypted? */
ret = 0;
if (bch2_key_is_encrypted(&crypt->key))
goto out;
ret = bch2_decrypt_sb_key(c, crypt, &key);
if (ret)
goto out;
crypt->key.magic = BCH_KEY_MAGIC;
crypt->key.key = key;
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_enable_encryption(struct bch_fs *c, bool keyed)
{
struct bch_encrypted_key key;
struct bch_key user_key;
struct bch_sb_field_crypt *crypt;
int ret = -EINVAL;
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
if (bch2_sb_get_crypt(c->disk_sb.sb))
goto err;
ret = bch2_alloc_ciphers(c);
if (ret)
goto err;
key.magic = BCH_KEY_MAGIC;
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
&key, sizeof(key));
if (ret)
goto err;
}
ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto err;
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
}
crypt->key = key;
/* write superblock */
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
memzero_explicit(&user_key, sizeof(user_key));
memzero_explicit(&key, sizeof(key));
return ret;
}
void bch2_fs_encryption_exit(struct bch_fs *c)
{
if (!IS_ERR_OR_NULL(c->poly1305))
crypto_free_shash(c->poly1305);
if (!IS_ERR_OR_NULL(c->chacha20))
crypto_free_sync_skcipher(c->chacha20);
if (!IS_ERR_OR_NULL(c->sha256))
crypto_free_shash(c->sha256);
}
int bch2_fs_encryption_init(struct bch_fs *c)
{
struct bch_sb_field_crypt *crypt;
struct bch_key key;
int ret = 0;
pr_verbose_init(c->opts, "");
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
if (IS_ERR(c->sha256)) {
bch_err(c, "error requesting sha256 module");
ret = PTR_ERR(c->sha256);
goto out;
}
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
ret = bch2_alloc_ciphers(c);
if (ret)
goto out;
ret = bch2_decrypt_sb_key(c, crypt, &key);
if (ret)
goto out;
ret = crypto_skcipher_setkey(&c->chacha20->base,
(void *) &key.key, sizeof(key.key));
if (ret)
goto out;
out:
memzero_explicit(&key, sizeof(key));
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}

184
fs/bcachefs/checksum.h Normal file
View File

@ -0,0 +1,184 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CHECKSUM_H
#define _BCACHEFS_CHECKSUM_H
#include "bcachefs.h"
#include "extents_types.h"
#include "super-io.h"
#include <crypto/chacha.h>
u64 bch2_crc64_update(u64, const void *, size_t);
#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28)
#define BCH_NONCE_BTREE cpu_to_le32(2 << 28)
#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28)
#define BCH_NONCE_PRIO cpu_to_le32(4 << 28)
#define BCH_NONCE_POLY cpu_to_le32(1U << 31)
struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
const void *, size_t);
/*
* This is used for various on disk data structures - bch_sb, prio_set, bset,
* jset: The checksum is _always_ the first field of these structs
*/
#define csum_vstruct(_c, _type, _nonce, _i) \
({ \
const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \
const void *end = vstruct_end(_i); \
\
bch2_checksum(_c, _type, _nonce, start, end - start); \
})
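/*
 * Usage sketch (assumes a superblock @sb whose csum field comes first, as in
 * super-io.c):
 *
 *	struct bch_csum csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
 *					    null_nonce(), sb);
 */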
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
void *data, size_t);
struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
struct bch_extent_crc_unpacked,
struct bch_extent_crc_unpacked *,
struct bch_extent_crc_unpacked *,
unsigned, unsigned, unsigned);
void bch2_encrypt_bio(struct bch_fs *, unsigned,
struct nonce, struct bio *);
int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
struct bch_key *);
int bch2_disable_encryption(struct bch_fs *);
int bch2_enable_encryption(struct bch_fs *, bool);
void bch2_fs_encryption_exit(struct bch_fs *);
int bch2_fs_encryption_init(struct bch_fs *);
static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
bool data)
{
switch (type) {
case BCH_CSUM_OPT_NONE:
return BCH_CSUM_NONE;
case BCH_CSUM_OPT_CRC32C:
return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO;
case BCH_CSUM_OPT_CRC64:
return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO;
default:
BUG();
}
}
static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
unsigned opt)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_CHACHA20_POLY1305_128
: BCH_CSUM_CHACHA20_POLY1305_80;
return bch2_csum_opt_to_type(opt, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)
return BCH_CSUM_CHACHA20_POLY1305_128;
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
static const unsigned bch2_compression_opt_to_type[] = {
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
BCH_COMPRESSION_TYPES()
#undef x
};
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
unsigned type)
{
if (type >= BCH_CSUM_NR)
return false;
if (bch2_csum_type_is_encryption(type) && !c->chacha20)
return false;
return true;
}
/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{
/*
* XXX: need some way of preventing the compiler from optimizing this
* into a form that isn't constant time..
*/
return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0;
}
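/*
 * (A sketch of one option would be crypto_memneq(&l, &r, sizeof(l)), the
 * kernel's constant-time comparison helper.)
 */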
/* for skipping ahead and encrypting/decrypting at an offset: */
static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
{
EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE);
return nonce;
}
static inline struct nonce null_nonce(void)
{
struct nonce ret;
memset(&ret, 0, sizeof(ret));
return ret;
}
static inline struct nonce extent_nonce(struct bversion version,
struct bch_extent_crc_unpacked crc)
{
unsigned size = crc.compression_type ? crc.uncompressed_size : 0;
struct nonce nonce = (struct nonce) {{
[0] = cpu_to_le32(size << 22),
[1] = cpu_to_le32(version.lo),
[2] = cpu_to_le32(version.lo >> 32),
[3] = cpu_to_le32(version.hi |
(crc.compression_type << 24)) ^ BCH_NONCE_EXTENT,
}};
return nonce_add(nonce, crc.nonce << 9);
}
static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
{
return le64_to_cpu(key->magic) != BCH_KEY_MAGIC;
}
static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
{
__le64 magic = __bch2_sb_magic(sb);
return (struct nonce) {{
[0] = 0,
[1] = 0,
[2] = ((__le32 *) &magic)[0],
[3] = ((__le32 *) &magic)[1],
}};
}
static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
{
__le64 magic = bch2_sb_magic(c);
return (struct nonce) {{
[0] = 0,
[1] = 0,
[2] = ((__le32 *) &magic)[0],
[3] = ((__le32 *) &magic)[1],
}};
}
#endif /* _BCACHEFS_CHECKSUM_H */

180
fs/bcachefs/clock.c Normal file
View File

@ -0,0 +1,180 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "clock.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/preempt.h>
static inline long io_timer_cmp(io_timer_heap *h,
struct io_timer *l,
struct io_timer *r)
{
return l->expire - r->expire;
}
void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
{
size_t i;
spin_lock(&clock->timer_lock);
for (i = 0; i < clock->timers.used; i++)
if (clock->timers.data[i] == timer)
goto out;
BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp));
out:
spin_unlock(&clock->timer_lock);
}
void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
{
size_t i;
spin_lock(&clock->timer_lock);
for (i = 0; i < clock->timers.used; i++)
if (clock->timers.data[i] == timer) {
heap_del(&clock->timers, i, io_timer_cmp);
break;
}
spin_unlock(&clock->timer_lock);
}
struct io_clock_wait {
struct io_timer io_timer;
struct timer_list cpu_timer;
struct task_struct *task;
int expired;
};
static void io_clock_wait_fn(struct io_timer *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, io_timer);
wait->expired = 1;
wake_up_process(wait->task);
}
static void io_clock_cpu_timeout(struct timer_list *timer)
{
struct io_clock_wait *wait = container_of(timer,
struct io_clock_wait, cpu_timer);
wait->expired = 1;
wake_up_process(wait->task);
}
void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
{
struct io_clock_wait wait;
/* XXX: calculate sleep time rigorously */
wait.io_timer.expire = until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
schedule();
bch2_io_timer_del(clock, &wait.io_timer);
}
void bch2_kthread_io_clock_wait(struct io_clock *clock,
unsigned long io_until,
unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
wait.io_timer.expire = io_until;
wait.io_timer.fn = io_clock_wait_fn;
wait.task = current;
wait.expired = 0;
bch2_io_timer_add(clock, &wait.io_timer);
timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
if (wait.expired)
break;
schedule();
try_to_freeze();
}
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);
destroy_timer_on_stack(&wait.cpu_timer);
bch2_io_timer_del(clock, &wait.io_timer);
}
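/*
 * Usage sketch (illustrative, the values are made up): a background thread
 * that wants to run again after roughly another 1024 sectors of IO would do
 *
 *	unsigned long now = atomic_long_read(&clock->now);
 *	bch2_kthread_io_clock_wait(clock, now + 1024, MAX_SCHEDULE_TIMEOUT);
 *
 * i.e. the timeout is expressed in sectors of IO done, not wall clock time.
 */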
static struct io_timer *get_expired_timer(struct io_clock *clock,
unsigned long now)
{
struct io_timer *ret = NULL;
spin_lock(&clock->timer_lock);
if (clock->timers.used &&
time_after_eq(now, clock->timers.data[0]->expire))
heap_pop(&clock->timers, ret, io_timer_cmp);
spin_unlock(&clock->timer_lock);
return ret;
}
void bch2_increment_clock(struct bch_fs *c, unsigned sectors, int rw)
{
struct io_clock *clock = &c->io_clock[rw];
struct io_timer *timer;
unsigned long now;
/* Buffer up IO in the percpu counter; only bump the global clock once IO_CLOCK_PCPU_SECTORS have accumulated */
preempt_disable();
if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
IO_CLOCK_PCPU_SECTORS)) {
preempt_enable();
return;
}
sectors = this_cpu_xchg(*clock->pcpu_buf, 0);
preempt_enable();
now = atomic_long_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
}
void bch2_io_clock_exit(struct io_clock *clock)
{
free_heap(&clock->timers);
free_percpu(clock->pcpu_buf);
}
int bch2_io_clock_init(struct io_clock *clock)
{
atomic_long_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
if (!clock->pcpu_buf)
return -ENOMEM;
if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
return -ENOMEM;
return 0;
}

25
fs/bcachefs/clock.h Normal file
View File

@ -0,0 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CLOCK_H
#define _BCACHEFS_CLOCK_H
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_timeout(wq, condition, timeout); \
__ret; \
})
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
#endif /* _BCACHEFS_CLOCK_H */

36
fs/bcachefs/clock_types.h Normal file
View File

@ -0,0 +1,36 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CLOCK_TYPES_H
#define _BCACHEFS_CLOCK_TYPES_H
#include "util.h"
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
/*
* Clocks/timers in units of sectors of IO:
*
* Note - they use percpu batching, so they're only approximate.
*/
struct io_timer;
typedef void (*io_timer_fn)(struct io_timer *);
struct io_timer {
io_timer_fn fn;
unsigned long expire;
};
/* Amount to buffer up on a percpu counter */
#define IO_CLOCK_PCPU_SECTORS 128
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
atomic_long_t now;
u16 __percpu *pcpu_buf;
spinlock_t timer_lock;
io_timer_heap timers;
};
#endif /* _BCACHEFS_CLOCK_TYPES_H */

621
fs/bcachefs/compress.c Normal file
View File

@ -0,0 +1,621 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "checksum.h"
#include "compress.h"
#include "extents.h"
#include "io.h"
#include "super-io.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
#include <linux/zstd.h>
/* Bounce buffer: */
struct bbuf {
void *b;
enum {
BB_NONE,
BB_VMAP,
BB_KMALLOC,
BB_VMALLOC,
BB_MEMPOOL,
} type;
int rw;
};
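/*
 * Bounce buffer allocation strategy: try a plain kmalloc() first, then a
 * non-blocking grab from the preallocated mempool, then vmalloc(), and only
 * block on the mempool as a last resort.
 */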
static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
{
void *b;
BUG_ON(size > c->sb.encoded_extent_max << 9);
b = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
if (b)
return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw };
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOWAIT);
b = b ? page_address(b) : NULL;
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
b = vmalloc(size);
if (b)
return (struct bbuf) { .b = b, .type = BB_VMALLOC, .rw = rw };
b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
b = b ? page_address(b) : NULL;
if (b)
return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw };
BUG();
}
static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
struct bvec_iter start, int rw)
{
struct bbuf ret;
struct bio_vec bv;
struct bvec_iter iter;
unsigned nr_pages = 0;
struct page *stack_pages[16];
struct page **pages = NULL;
bool first = true;
unsigned prev_end = PAGE_SIZE;
void *data;
BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
#ifndef CONFIG_HIGHMEM
__bio_for_each_contig_segment(bv, bio, iter, start) {
if (bv.bv_len == start.bi_size)
return (struct bbuf) {
.b = page_address(bv.bv_page) + bv.bv_offset,
.type = BB_NONE, .rw = rw
};
}
#endif
__bio_for_each_segment(bv, bio, iter, start) {
if ((!first && bv.bv_offset) ||
prev_end != PAGE_SIZE)
goto bounce;
prev_end = bv.bv_offset + bv.bv_len;
nr_pages++;
}
BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages);
pages = nr_pages > ARRAY_SIZE(stack_pages)
? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO)
: stack_pages;
if (!pages)
goto bounce;
nr_pages = 0;
__bio_for_each_segment(bv, bio, iter, start)
pages[nr_pages++] = bv.bv_page;
data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
if (pages != stack_pages)
kfree(pages);
if (data)
return (struct bbuf) {
.b = data + bio_iter_offset(bio, start),
.type = BB_VMAP, .rw = rw
};
bounce:
ret = __bounce_alloc(c, start.bi_size, rw);
if (rw == READ)
memcpy_from_bio(ret.b, bio, start);
return ret;
}
static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
{
return __bio_map_or_bounce(c, bio, bio->bi_iter, rw);
}
static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
{
switch (buf.type) {
case BB_NONE:
break;
case BB_VMAP:
vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
break;
case BB_KMALLOC:
kfree(buf.b);
break;
case BB_VMALLOC:
vfree(buf.b);
break;
case BB_MEMPOOL:
mempool_free(virt_to_page(buf.b),
&c->compression_bounce[buf.rw]);
break;
}
}
static inline void zlib_set_workspace(z_stream *strm, void *workspace)
{
#ifdef __KERNEL__
strm->workspace = workspace;
#endif
}
static int __bio_uncompress(struct bch_fs *c, struct bio *src,
void *dst_data, struct bch_extent_crc_unpacked crc)
{
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
void *workspace;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
switch (crc.compression_type) {
case BCH_COMPRESSION_LZ4_OLD:
case BCH_COMPRESSION_LZ4:
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
if (ret != dst_len)
goto err;
break;
case BCH_COMPRESSION_GZIP: {
z_stream strm = {
.next_in = src_data.b,
.avail_in = src_len,
.next_out = dst_data,
.avail_out = dst_len,
};
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
ret = zlib_inflate(&strm, Z_FINISH);
mempool_free(workspace, &c->decompress_workspace);
if (ret != Z_STREAM_END)
goto err;
break;
}
case BCH_COMPRESSION_ZSTD: {
ZSTD_DCtx *ctx;
size_t len;
workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound());
src_len = le32_to_cpup(src_data.b);
len = zstd_decompress_dctx(ctx,
dst_data, dst_len,
src_data.b + 4, src_len);
mempool_free(workspace, &c->decompress_workspace);
if (len != dst_len)
goto err;
break;
}
default:
BUG();
}
ret = 0;
out:
bio_unmap_or_unbounce(c, src_data);
return ret;
err:
ret = -EIO;
goto out;
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
struct bch_extent_crc_unpacked *crc)
{
struct bbuf data = { NULL };
size_t dst_len = crc->uncompressed_size << 9;
/* bio must own its pages: */
BUG_ON(!bio->bi_vcnt);
BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
if (crc->uncompressed_size > c->sb.encoded_extent_max ||
crc->compressed_size > c->sb.encoded_extent_max) {
bch_err(c, "error rewriting existing data: extent too big");
return -EIO;
}
data = __bounce_alloc(c, dst_len, WRITE);
if (__bio_uncompress(c, bio, data.b, *crc)) {
bch_err(c, "error rewriting existing data: decompression error");
bio_unmap_or_unbounce(c, data);
return -EIO;
}
/*
* might have to free existing pages and retry allocation from mempool -
* do this _after_ decompressing:
*/
bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9);
memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
crc->csum_type = 0;
crc->compression_type = 0;
crc->compressed_size = crc->live_size;
crc->uncompressed_size = crc->live_size;
crc->offset = 0;
crc->csum = (struct bch_csum) { 0, 0 };
bio_unmap_or_unbounce(c, data);
return 0;
}
int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
struct bio *dst, struct bvec_iter dst_iter,
struct bch_extent_crc_unpacked crc)
{
struct bbuf dst_data = { NULL };
size_t dst_len = crc.uncompressed_size << 9;
int ret = -ENOMEM;
if (crc.uncompressed_size > c->sb.encoded_extent_max ||
crc.compressed_size > c->sb.encoded_extent_max)
return -EIO;
dst_data = dst_len == dst_iter.bi_size
? __bio_map_or_bounce(c, dst, dst_iter, WRITE)
: __bounce_alloc(c, dst_len, WRITE);
ret = __bio_uncompress(c, src, dst_data.b, crc);
if (ret)
goto err;
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
err:
bio_unmap_or_unbounce(c, dst_data);
return ret;
}
static int attempt_compress(struct bch_fs *c,
void *workspace,
void *dst, size_t dst_len,
void *src, size_t src_len,
unsigned compression_type)
{
switch (compression_type) {
case BCH_COMPRESSION_LZ4: {
int len = src_len;
int ret = LZ4_compress_destSize(
src, dst,
&len, dst_len,
workspace);
if (len < src_len)
return -len;
return ret;
}
case BCH_COMPRESSION_GZIP: {
z_stream strm = {
.next_in = src,
.avail_in = src_len,
.next_out = dst,
.avail_out = dst_len,
};
zlib_set_workspace(&strm, workspace);
zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
Z_DEFAULT_STRATEGY);
if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
return 0;
if (zlib_deflateEnd(&strm) != Z_OK)
return 0;
return strm.total_out;
}
case BCH_COMPRESSION_ZSTD: {
ZSTD_CCtx *ctx = zstd_init_cctx(workspace,
zstd_cctx_workspace_bound(&c->zstd_params.cParams));
size_t len = zstd_compress_cctx(ctx,
dst + 4, dst_len - 4,
src, src_len,
&c->zstd_params);
if (zstd_is_error(len))
return 0;
*((__le32 *) dst) = cpu_to_le32(len);
return len + 4;
}
default:
BUG();
}
}
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
void *workspace;
unsigned pad;
int ret = 0;
BUG_ON(compression_type >= BCH_COMPRESSION_NR);
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
/* If it's only one block, don't bother trying to compress: */
if (bio_sectors(src) <= c->opts.block_size)
return 0;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
*src_len = src->bi_iter.bi_size;
*dst_len = dst->bi_iter.bi_size;
/*
* XXX: this algorithm sucks when the compression code doesn't tell us
* how much would fit, like LZ4 does:
*/
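/*
 * Retry strategy: attempt to compress the whole input; if the output doesn't
 * fit, shrink src_len - either to the amount the compressor said would fit
 * (lz4 returns that as a negative hint) or by halving the overshoot - keep
 * src_len block aligned, and try again.
 */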
while (1) {
if (*src_len <= block_bytes(c)) {
ret = -1;
break;
}
ret = attempt_compress(c, workspace,
dst_data.b, *dst_len,
src_data.b, *src_len,
compression_type);
if (ret > 0) {
*dst_len = ret;
ret = 0;
break;
}
/* Didn't fit: should we retry with a smaller amount? */
if (*src_len <= *dst_len) {
ret = -1;
break;
}
/*
* If ret is negative, it's a hint as to how much data would fit
*/
BUG_ON(-ret >= *src_len);
if (ret < 0)
*src_len = -ret;
else
*src_len -= (*src_len - *dst_len) / 2;
*src_len = round_down(*src_len, block_bytes(c));
}
mempool_free(workspace, &c->compress_workspace[compression_type]);
if (ret)
goto err;
/* Didn't get smaller: */
if (round_up(*dst_len, block_bytes(c)) >= *src_len)
goto err;
pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
memset(dst_data.b + *dst_len, 0, pad);
*dst_len += pad;
if (dst_data.type != BB_NONE)
memcpy_to_bio(dst, dst->bi_iter, dst_data.b);
BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
BUG_ON(*dst_len & (block_bytes(c) - 1));
BUG_ON(*src_len & (block_bytes(c) - 1));
out:
bio_unmap_or_unbounce(c, src_data);
bio_unmap_or_unbounce(c, dst_data);
return compression_type;
err:
compression_type = 0;
goto out;
}
unsigned bch2_bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned compression_type)
{
unsigned orig_dst = dst->bi_iter.bi_size;
unsigned orig_src = src->bi_iter.bi_size;
/* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
c->sb.encoded_extent_max << 9);
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
if (compression_type == BCH_COMPRESSION_LZ4_OLD)
compression_type = BCH_COMPRESSION_LZ4;
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
dst->bi_iter.bi_size = orig_dst;
src->bi_iter.bi_size = orig_src;
return compression_type;
}
static int __bch2_fs_compress_init(struct bch_fs *, u64);
#define BCH_FEATURE_NONE 0
static const unsigned bch2_compression_opt_to_feature[] = {
#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
BCH_COMPRESSION_TYPES()
#undef x
};
#undef BCH_FEATURE_NONE
static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
{
int ret = 0;
if ((c->sb.features & f) == f)
return 0;
mutex_lock(&c->sb_lock);
if ((c->sb.features & f) == f) {
mutex_unlock(&c->sb_lock);
return 0;
}
ret = __bch2_fs_compress_init(c, c->sb.features|f);
if (ret) {
mutex_unlock(&c->sb_lock);
return ret;
}
c->disk_sb.sb->features[0] |= cpu_to_le64(f);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch2_check_set_has_compressed_data(struct bch_fs *c,
unsigned compression_type)
{
BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
return compression_type
? __bch2_check_set_has_compressed_data(c,
1ULL << bch2_compression_opt_to_feature[compression_type])
: 0;
}
void bch2_fs_compress_exit(struct bch_fs *c)
{
unsigned i;
mempool_exit(&c->decompress_workspace);
for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
mempool_exit(&c->compression_bounce[READ]);
}
static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
{
size_t max_extent = c->sb.encoded_extent_max << 9;
size_t order = get_order(max_extent);
size_t decompress_workspace_size = 0;
bool decompress_workspace_needed;
ZSTD_parameters params = zstd_get_params(0, max_extent);
struct {
unsigned feature;
unsigned type;
size_t compress_workspace;
size_t decompress_workspace;
} compression_types[] = {
{ BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
{ BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
zlib_inflate_workspacesize(), },
{ BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
zstd_cctx_workspace_bound(&params.cParams),
zstd_dctx_workspace_bound() },
}, *i;
int ret = 0;
pr_verbose_init(c->opts, "");
c->zstd_params = params;
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++)
if (features & (1 << i->feature))
goto have_compressed;
goto out;
have_compressed:
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_page_pool(&c->compression_bounce[READ],
1, order);
if (ret)
goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
1, order);
if (ret)
goto out;
}
for (i = compression_types;
i < compression_types + ARRAY_SIZE(compression_types);
i++) {
decompress_workspace_size =
max(decompress_workspace_size, i->decompress_workspace);
if (!(features & (1 << i->feature)))
continue;
if (i->decompress_workspace)
decompress_workspace_needed = true;
if (mempool_initialized(&c->compress_workspace[i->type]))
continue;
ret = mempool_init_kvpmalloc_pool(
&c->compress_workspace[i->type],
1, i->compress_workspace);
if (ret)
goto out;
}
ret = mempool_init_kmalloc_pool(
&c->decompress_workspace,
1, decompress_workspace_size);
if (ret)
goto out;
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}
int bch2_fs_compress_init(struct bch_fs *c)
{
u64 f = c->sb.features;
if (c->opts.compression)
f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
if (c->opts.background_compression)
f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
return __bch2_fs_compress_init(c, f);
}

18
fs/bcachefs/compress.h Normal file
View File

@ -0,0 +1,18 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
#include "extents_types.h"
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned);
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
#endif /* _BCACHEFS_COMPRESS_H */

425
fs/bcachefs/debug.c Normal file
View File

@ -0,0 +1,425 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Assorted bcachefs debug code
*
* Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
* Copyright 2012 Google, Inc.
*/
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extents.h"
#include "fsck.h"
#include "inode.h"
#include "io.h"
#include "super.h"
#include <linux/console.h>
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/seq_file.h>
static struct dentry *bch_debug;
#ifdef CONFIG_BCACHEFS_DEBUG
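/*
 * Read a btree node back from disk into c->verify_data, re-run the read path
 * on it, and compare the result against the in-memory copy; on any mismatch,
 * dump both versions and panic.
 */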
void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
struct btree *v = c->verify_data;
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
struct bch_dev *ca;
struct bio *bio;
if (c->opts.nochanges)
return;
btree_node_io_lock(b);
mutex_lock(&c->verify_lock);
n_ondisk = c->verify_ondisk;
n_sorted = c->verify_data->data;
n_inmemory = b->data;
bkey_copy(&v->key, &b->key);
v->written = 0;
v->level = b->level;
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
goto out;
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (!bch2_dev_get_ioref(ca, READ))
goto out;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
buf_pages(n_sorted, btree_bytes(c)),
REQ_OP_READ|REQ_META,
GFP_NOIO,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bch2_bio_map(bio, n_sorted);
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));
if (bch2_btree_node_read_done(c, v, false))
goto out;
n_sorted = c->verify_data->data;
sorted = &n_sorted->keys;
inmemory = &n_inmemory->keys;
if (inmemory->u64s != sorted->u64s ||
memcmp(inmemory->start,
sorted->start,
vstruct_end(inmemory) - (void *) inmemory->start)) {
unsigned offset = 0, sectors;
struct bset *i;
unsigned j;
console_lock();
printk(KERN_ERR "*** in memory:\n");
bch2_dump_bset(b, inmemory, 0);
printk(KERN_ERR "*** read back in:\n");
bch2_dump_bset(v, sorted, 0);
while (offset < b->written) {
if (!offset) {
i = &n_ondisk->keys;
sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
c->block_bits;
} else {
struct btree_node_entry *bne =
(void *) n_ondisk + (offset << 9);
i = &bne->keys;
sectors = vstruct_blocks(bne, c->block_bits) <<
c->block_bits;
}
printk(KERN_ERR "*** on disk block %u:\n", offset);
bch2_dump_bset(b, i, offset);
offset += sectors;
}
printk(KERN_ERR "*** block %u/%u not written\n",
offset >> c->block_bits, btree_blocks(c));
for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
if (inmemory->_data[j] != sorted->_data[j])
break;
printk(KERN_ERR "b->written %u\n", b->written);
console_unlock();
panic("verify failed at %u\n", j);
}
out:
mutex_unlock(&c->verify_lock);
btree_node_io_unlock(b);
}
#endif
#ifdef CONFIG_DEBUG_FS
/* XXX: bch_fs refcounting */
struct dump_iter {
struct bpos from;
struct bch_fs *c;
enum btree_id id;
char buf[PAGE_SIZE];
size_t bytes; /* what's currently in buf */
char __user *ubuf; /* destination user buffer */
size_t size; /* size of requested read */
ssize_t ret; /* bytes read so far */
};
static int flush_buf(struct dump_iter *i)
{
if (i->bytes) {
size_t bytes = min(i->bytes, i->size);
if (copy_to_user(i->ubuf, i->buf, bytes))
return -EFAULT;
i->ret += bytes;
i->ubuf += bytes;
i->size -= bytes;
i->bytes -= bytes;
memmove(i->buf, i->buf + bytes, i->bytes);
}
return 0;
}
static int bch2_dump_open(struct inode *inode, struct file *file)
{
struct btree_debug *bd = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
if (!i)
return -ENOMEM;
file->private_data = i;
i->from = POS_MIN;
i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
i->id = bd->id;
return 0;
}
static int bch2_dump_release(struct inode *inode, struct file *file)
{
kfree(file->private_data);
return 0;
}
static ssize_t bch2_read_btree(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct btree_iter iter;
struct bkey_s_c k;
int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
err = flush_buf(i);
if (err)
return err;
if (!i->size)
return i->ret;
bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = btree_iter_err(k))) {
bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
BUG_ON(i->bytes >= PAGE_SIZE);
i->buf[i->bytes] = '\n';
i->bytes++;
k = bch2_btree_iter_next(&iter);
i->from = iter.pos;
err = flush_buf(i);
if (err)
break;
if (!i->size)
break;
}
bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}
static const struct file_operations btree_debug_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
.read = bch2_read_btree,
};
static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct btree_iter iter;
struct btree *b;
int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
err = flush_buf(i);
if (err)
return err;
if (!i->size || !bkey_cmp(POS_MAX, i->from))
return i->ret;
for_each_btree_node(&iter, i->c, i->id, i->from, 0, b) {
i->bytes = bch2_print_btree_node(i->c, b, i->buf,
sizeof(i->buf));
err = flush_buf(i);
if (err)
break;
/*
* can't easily correctly restart a btree node traversal across
* all nodes, meh
*/
i->from = bkey_cmp(POS_MAX, b->key.k.p)
? bkey_successor(b->key.k.p)
: b->key.k.p;
if (!i->size)
break;
}
bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}
static const struct file_operations btree_format_debug_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
.read = bch2_read_btree_formats,
};
static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct btree_iter iter;
struct bkey_s_c k;
struct btree *prev_node = NULL;
int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
err = flush_buf(i);
if (err)
return err;
if (!i->size)
return i->ret;
bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(err = btree_iter_err(k))) {
struct btree_iter_level *l = &iter.l[0];
struct bkey_packed *_k =
bch2_btree_node_iter_peek(&l->iter, l->b);
if (l->b != prev_node) {
i->bytes = bch2_print_btree_node(i->c, l->b, i->buf,
sizeof(i->buf));
err = flush_buf(i);
if (err)
break;
}
prev_node = l->b;
i->bytes = bch2_bkey_print_bfloat(l->b, _k, i->buf,
sizeof(i->buf));
err = flush_buf(i);
if (err)
break;
bch2_btree_iter_next(&iter);
i->from = iter.pos;
err = flush_buf(i);
if (err)
break;
if (!i->size)
break;
}
bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}
static const struct file_operations bfloat_failed_debug_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
.read = bch2_read_bfloat_failed,
};
void bch2_fs_debug_exit(struct bch_fs *c)
{
if (!IS_ERR_OR_NULL(c->debug))
debugfs_remove_recursive(c->debug);
}
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
char name[100];
if (IS_ERR_OR_NULL(bch_debug))
return;
snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
c->debug = debugfs_create_dir(name, bch_debug);
if (IS_ERR_OR_NULL(c->debug))
return;
for (bd = c->btree_debug;
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
0400, c->debug, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_ids[bd->id]);
bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_ids[bd->id]);
bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
&bfloat_failed_debug_ops);
}
}
#endif
void bch2_debug_exit(void)
{
if (!IS_ERR_OR_NULL(bch_debug))
debugfs_remove_recursive(bch_debug);
}
int __init bch2_debug_init(void)
{
int ret = 0;
bch_debug = debugfs_create_dir("bcachefs", NULL);
return ret;
}

63
fs/bcachefs/debug.h Normal file
View File

@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DEBUG_H
#define _BCACHEFS_DEBUG_H
#include "bcachefs.h"
struct bio;
struct btree;
struct bch_fs;
#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
#define BCH_DEBUG_PARAM(name, description) \
static inline bool name(struct bch_fs *c) \
{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_ALWAYS()
#undef BCH_DEBUG_PARAM
#ifdef CONFIG_BCACHEFS_DEBUG
#define BCH_DEBUG_PARAM(name, description) \
static inline bool name(struct bch_fs *c) \
{ return bch2_##name || c->name; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM
void __bch2_btree_verify(struct bch_fs *, struct btree *);
#define bypass_torture_test(d) ((d)->bypass_torture_test)
#else /* DEBUG */
#define BCH_DEBUG_PARAM(name, description) \
static inline bool name(struct bch_fs *c) { return false; }
BCH_DEBUG_PARAMS_DEBUG()
#undef BCH_DEBUG_PARAM
static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
#define bypass_torture_test(d) 0
#endif
static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
{
if (verify_btree_ondisk(c))
__bch2_btree_verify(c, b);
}
#ifdef CONFIG_DEBUG_FS
void bch2_fs_debug_exit(struct bch_fs *);
void bch2_fs_debug_init(struct bch_fs *);
#else
static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
static inline void bch2_fs_debug_init(struct bch_fs *c) {}
#endif
void bch2_debug_exit(void);
int bch2_debug_init(void);
#endif /* _BCACHEFS_DEBUG_H */

426
fs/bcachefs/dirent.c Normal file
View File

@ -0,0 +1,426 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "extents.h"
#include "dirent.h"
#include "fs.h"
#include "keylist.h"
#include "str_hash.h"
#include <linux/dcache.h>
unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
unsigned len = bkey_val_bytes(d.k) -
offsetof(struct bch_dirent, d_name);
while (len && !d.v->d_name[len - 1])
--len;
return len;
}
static unsigned dirent_val_u64s(unsigned len)
{
return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
sizeof(u64));
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
const struct qstr *name)
{
struct bch_str_hash_ctx ctx;
bch2_str_hash_init(&ctx, info);
bch2_str_hash_update(&ctx, info, name->name, name->len);
/* [0,2) reserved for dots */
return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
}
static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
{
return bch2_dirent_hash(info, key);
}
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
return bch2_dirent_hash(info, &name);
}
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
int len = bch2_dirent_name_bytes(l);
const struct qstr *r = _r;
return len - r->len ?: memcmp(l.v->d_name, r->name, len);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
int l_len = bch2_dirent_name_bytes(l);
int r_len = bch2_dirent_name_bytes(r);
return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
}
const struct bch_hash_desc bch2_dirent_hash_desc = {
.btree_id = BTREE_ID_DIRENTS,
.key_type = BCH_DIRENT,
.whiteout_type = BCH_DIRENT_WHITEOUT,
.hash_key = dirent_hash_key,
.hash_bkey = dirent_hash_bkey,
.cmp_key = dirent_cmp_key,
.cmp_bkey = dirent_cmp_bkey,
};
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
unsigned len;
switch (k.k->type) {
case BCH_DIRENT:
if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
return "value too small";
d = bkey_s_c_to_dirent(k);
len = bch2_dirent_name_bytes(d);
if (!len)
return "empty name";
/*
* older versions of bcachefs were buggy and created dirent
* keys that were bigger than necessary:
*/
if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7))
return "value too big";
if (len > BCH_NAME_MAX)
return "dirent name too big";
if (memchr(d.v->d_name, '/', len))
return "dirent name has invalid characters";
return NULL;
case BCH_DIRENT_WHITEOUT:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
: NULL;
default:
return "invalid type";
}
}
void bch2_dirent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
size_t n = 0;
switch (k.k->type) {
case BCH_DIRENT:
d = bkey_s_c_to_dirent(k);
n += bch_scnmemcpy(buf + n, size - n, d.v->d_name,
bch2_dirent_name_bytes(d));
n += scnprintf(buf + n, size - n, " -> %llu", d.v->d_inum);
break;
case BCH_DIRENT_WHITEOUT:
scnprintf(buf, size, "whiteout");
break;
}
}
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
u8 type, const struct qstr *name, u64 dst)
{
struct bkey_i_dirent *dirent;
unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
BUG_ON(u64s > U8_MAX);
dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
if (IS_ERR(dirent))
return dirent;
bkey_dirent_init(&dirent->k_i);
dirent->k.u64s = u64s;
dirent->v.d_inum = cpu_to_le64(dst);
dirent->v.d_type = type;
memcpy(dirent->v.d_name, name->name, name->len);
memset(dirent->v.d_name + name->len, 0,
bkey_val_bytes(&dirent->k) -
offsetof(struct bch_dirent, d_name) -
name->len);
EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
return dirent;
}
int __bch2_dirent_create(struct btree_trans *trans,
u64 dir_inum, const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
int flags)
{
struct bkey_i_dirent *dirent;
int ret;
dirent = dirent_create_key(trans, type, name, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
return __bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
dir_inum, &dirent->k_i, flags);
}
int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
u64 *journal_seq, int flags)
{
return bch2_trans_do(c, journal_seq, flags,
__bch2_dirent_create(&trans, dir_inum, hash_info,
type, name, dst_inum, flags));
}
static void dirent_copy_target(struct bkey_i_dirent *dst,
struct bkey_s_c_dirent src)
{
dst->v.d_inum = src.v->d_inum;
dst->v.d_type = src.v->d_type;
}
static struct bpos bch2_dirent_pos(struct bch_inode_info *inode,
const struct qstr *name)
{
return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name));
}
int bch2_dirent_rename(struct btree_trans *trans,
struct bch_inode_info *src_dir, const struct qstr *src_name,
struct bch_inode_info *dst_dir, const struct qstr *dst_name,
enum bch_rename_mode mode)
{
struct btree_iter *src_iter, *dst_iter;
struct bkey_s_c old_src, old_dst;
struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
int ret;
/*
* Lookup dst:
*
* Note that in BCH_RENAME mode, we're _not_ checking if
* the target already exists - we're relying on the VFS
* to do that check for us for correctness:
*/
dst_iter = mode == BCH_RENAME
? bch2_hash_hole(trans, bch2_dirent_hash_desc,
&dst_dir->ei_str_hash,
dst_dir->v.i_ino, dst_name)
: bch2_hash_lookup(trans, bch2_dirent_hash_desc,
&dst_dir->ei_str_hash,
dst_dir->v.i_ino, dst_name,
BTREE_ITER_INTENT);
if (IS_ERR(dst_iter))
return PTR_ERR(dst_iter);
old_dst = bch2_btree_iter_peek_slot(dst_iter);
/* Lookup src: */
src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
&src_dir->ei_str_hash,
src_dir->v.i_ino, src_name,
BTREE_ITER_INTENT);
if (IS_ERR(src_iter))
return PTR_ERR(src_iter);
old_src = bch2_btree_iter_peek_slot(src_iter);
/* Create new dst key: */
new_dst = dirent_create_key(trans, 0, dst_name, 0);
if (IS_ERR(new_dst))
return PTR_ERR(new_dst);
dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
new_dst->k.p = dst_iter->pos;
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
new_src = dirent_create_key(trans, 0, src_name, 0);
if (IS_ERR(new_src))
return PTR_ERR(new_src);
dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
new_src->k.p = src_iter->pos;
} else {
new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
if (IS_ERR(new_src))
return PTR_ERR(new_src);
bkey_init(&new_src->k);
new_src->k.p = src_iter->pos;
if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
/*
* We have a hash collision for the new dst key,
* and new_src - the key we're deleting - is between
* new_dst's hashed slot and the slot we're going to be
* inserting it into - oops. This will break the hash
* table if we don't deal with it:
*/
if (mode == BCH_RENAME) {
/*
* If we're not overwriting, we can just insert
* new_dst at the src position:
*/
new_dst->k.p = src_iter->pos;
bch2_trans_update(trans, src_iter, &new_dst->k_i, 0);
return 0;
} else {
/* If we're overwriting, we can't insert new_dst
* at a different slot because it has to
* overwrite old_dst - just make sure to use a
* whiteout when deleting src:
*/
new_src->k.type = BCH_DIRENT_WHITEOUT;
}
} else {
/* Check if we need a whiteout to delete src: */
ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
&src_dir->ei_str_hash,
src_iter);
if (ret < 0)
return ret;
if (ret)
new_src->k.type = BCH_DIRENT_WHITEOUT;
}
}
bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
return 0;
}
int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum,
const struct bch_hash_info *hash_info,
const struct qstr *name)
{
return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info,
dir_inum, name);
}
int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum,
const struct bch_hash_info *hash_info,
const struct qstr *name,
u64 *journal_seq)
{
return bch2_trans_do(c, journal_seq,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
__bch2_dirent_delete(&trans, dir_inum, hash_info, name));
}
u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
const struct bch_hash_info *hash_info,
const struct qstr *name)
{
struct btree_trans trans;
struct btree_iter *iter;
struct bkey_s_c k;
u64 inum = 0;
bch2_trans_init(&trans, c);
iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc,
hash_info, dir_inum, name, 0);
if (IS_ERR(iter)) {
BUG_ON(PTR_ERR(iter) == -EINTR);
goto out;
}
k = bch2_btree_iter_peek_slot(iter);
inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
out:
bch2_trans_exit(&trans);
return inum;
}
int bch2_empty_dir(struct bch_fs *c, u64 dir_inum)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS(dir_inum, 0), 0, k) {
if (k.k->p.inode > dir_inum)
break;
if (k.k->type == BCH_DIRENT) {
ret = -ENOTEMPTY;
break;
}
}
bch2_btree_iter_unlock(&iter);
return ret;
}
int bch2_readdir(struct bch_fs *c, struct file *file,
struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_s_c_dirent dirent;
unsigned len;
if (!dir_emit_dots(file, ctx))
return 0;
for_each_btree_key(&iter, c, BTREE_ID_DIRENTS,
POS(inode->v.i_ino, ctx->pos), 0, k) {
if (k.k->type != BCH_DIRENT)
continue;
dirent = bkey_s_c_to_dirent(k);
if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0)
continue;
if (k.k->p.inode > inode->v.i_ino)
break;
len = bch2_dirent_name_bytes(dirent);
/*
* XXX: dir_emit() can fault and block, while we're holding
* locks
*/
if (!dir_emit(ctx, dirent.v->d_name, len,
le64_to_cpu(dirent.v->d_inum),
dirent.v->d_type))
break;
ctx->pos = k.k->p.offset + 1;
}
bch2_btree_iter_unlock(&iter);
return 0;
}
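
The whiteout handling in bch2_dirent_rename() above is easiest to see with a toy open-addressing table. The sketch below is illustrative only and is not bcachefs code: lookups probe forward from the hash slot and stop at the first truly empty slot, so removing an entry that sits between another key's hash slot and the slot it actually occupies must leave a deleted marker (a whiteout) rather than an empty slot, or later lookups would terminate early.

enum toy_slot { TOY_EMPTY, TOY_WHITEOUT, TOY_USED };

struct toy_entry {
	enum toy_slot	state;
	u64		key;
};

/* returns the slot holding @key, or -1; probing stops only at empty slots */
static int toy_lookup(const struct toy_entry *t, unsigned size,
		      unsigned hash, u64 key)
{
	unsigned i = hash & (size - 1), n;

	for (n = 0; n < size; n++, i = (i + 1) & (size - 1)) {
		if (t[i].state == TOY_EMPTY)
			return -1;
		if (t[i].state == TOY_USED && t[i].key == key)
			return i;
		/* TOY_WHITEOUT: keep probing */
	}
	return -1;
}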

55
fs/bcachefs/dirent.h Normal file
View File

@ -0,0 +1,55 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DIRENT_H
#define _BCACHEFS_DIRENT_H
#include "str_hash.h"
extern const struct bch_hash_desc bch2_dirent_hash_desc;
const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_dirent_ops (struct bkey_ops) { \
.key_invalid = bch2_dirent_invalid, \
.val_to_text = bch2_dirent_to_text, \
}
struct qstr;
struct file;
struct dir_context;
struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
int __bch2_dirent_create(struct btree_trans *, u64,
const struct bch_hash_info *, u8,
const struct qstr *, u64, int);
int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *,
u8, const struct qstr *, u64, u64 *, int);
int __bch2_dirent_delete(struct btree_trans *, u64,
const struct bch_hash_info *,
const struct qstr *);
int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *, u64 *);
enum bch_rename_mode {
BCH_RENAME,
BCH_RENAME_OVERWRITE,
BCH_RENAME_EXCHANGE,
};
int bch2_dirent_rename(struct btree_trans *,
struct bch_inode_info *, const struct qstr *,
struct bch_inode_info *, const struct qstr *,
enum bch_rename_mode);
u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
const struct qstr *);
int bch2_empty_dir(struct bch_fs *, u64);
int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *);
#endif /* _BCACHEFS_DIRENT_H */
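
A minimal usage sketch of the lookup API above (example_dirent_lookup() is hypothetical, not part of this commit), assuming the caller already has the parent directory's struct bch_hash_info:

static u64 example_dirent_lookup(struct bch_fs *c, u64 dir_inum,
				 const struct bch_hash_info *dir_hash,
				 const char *str)
{
	struct qstr name = QSTR_INIT(str, strlen(str));

	/* returns the target inode number, or 0 if no such dirent exists */
	return bch2_dirent_lookup(c, dir_inum, dir_hash, &name);
}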

494
fs/bcachefs/disk_groups.c Normal file
View File

@ -0,0 +1,494 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "disk_groups.h"
#include "super-io.h"
#include <linux/sort.h>
static int group_cmp(const void *_l, const void *_r)
{
const struct bch_disk_group *l = _l;
const struct bch_disk_group *r = _r;
return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
(BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
strncmp(l->label, r->label, sizeof(l->label));
}
static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
struct bch_sb_field_members *mi;
struct bch_member *m;
unsigned i, nr_groups, len;
const char *err = NULL;
mi = bch2_sb_get_members(sb);
groups = bch2_sb_get_disk_groups(sb);
nr_groups = disk_groups_nr(groups);
for (m = mi->members;
m < mi->members + sb->nr_devices;
m++) {
unsigned g;
if (!BCH_MEMBER_GROUP(m))
continue;
g = BCH_MEMBER_GROUP(m) - 1;
if (g >= nr_groups ||
BCH_GROUP_DELETED(&groups->entries[g]))
return "disk has invalid group";
}
if (!nr_groups)
return NULL;
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
err = "group with empty label";
goto err;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
return "cannot allocate memory";
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
for (i = 0; i + 1 < nr_groups; i++)
if (!BCH_GROUP_DELETED(sorted + i) &&
!group_cmp(sorted + i, sorted + i + 1)) {
err = "duplicate groups";
goto err;
}
err = NULL;
err:
kfree(sorted);
return err;
}
static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
struct bch_sb *sb,
struct bch_sb_field *f)
{
char *out = buf, *end = buf + size;
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g;
unsigned nr_groups = disk_groups_nr(groups);
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
out += scnprintf(out, end - out, " ");
if (BCH_GROUP_DELETED(g))
out += scnprintf(out, end - out, "[deleted]");
else
out += scnprintf(out, end - out,
"[parent %llu name %s]",
BCH_GROUP_PARENT(g),
g->label);
}
return out - buf;
}
const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
.validate = bch2_sb_disk_groups_validate,
.to_text = bch2_sb_disk_groups_to_text
};
int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *groups;
struct bch_disk_groups_cpu *cpu_g, *old_g;
unsigned i, g, nr_groups;
lockdep_assert_held(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
nr_groups = disk_groups_nr(groups);
if (!groups)
return 0;
cpu_g = kzalloc(sizeof(*cpu_g) +
sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
if (!cpu_g)
return -ENOMEM;
cpu_g->nr = nr_groups;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *src = &groups->entries[i];
struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
struct bch_member *m = mi->members + i;
struct bch_disk_group_cpu *dst =
&cpu_g->entries[BCH_MEMBER_GROUP(m)];
if (!bch2_member_exists(m))
continue;
g = BCH_MEMBER_GROUP(m);
while (g) {
dst = &cpu_g->entries[g - 1];
__set_bit(i, dst->devs.d);
g = dst->parent;
}
}
old_g = rcu_dereference_protected(c->disk_groups,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
return 0;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return NULL;
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
return t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
}
default:
BUG();
}
}
bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g;
const struct bch_devs_mask *m;
bool ret;
rcu_read_lock();
g = rcu_dereference(c->disk_groups);
m = t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
}
default:
BUG();
}
}
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)
{
unsigned i, nr_groups = disk_groups_nr(groups);
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *g = groups->entries + i;
if (BCH_GROUP_DELETED(g))
continue;
if (!BCH_GROUP_DELETED(g) &&
BCH_GROUP_PARENT(g) == parent &&
strnlen(g->label, sizeof(g->label)) == namelen &&
!memcmp(name, g->label, namelen))
return i;
}
return -1;
}
static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
const char *name, unsigned namelen)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
unsigned i, nr_groups = disk_groups_nr(groups);
struct bch_disk_group *g;
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0;
i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
i++)
;
if (i == nr_groups) {
unsigned u64s =
(sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
groups = bch2_sb_resize_disk_groups(sb, u64s);
if (!groups)
return -ENOSPC;
nr_groups = disk_groups_nr(groups);
}
BUG_ON(i >= nr_groups);
g = &groups->entries[i];
memcpy(g->label, name, namelen);
if (namelen < sizeof(g->label))
g->label[namelen] = '\0';
SET_BCH_GROUP_DELETED(g, 0);
SET_BCH_GROUP_PARENT(g, parent);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
return i;
}
int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
v = __bch2_disk_group_find(groups, v + 1, name, len);
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups;
unsigned parent = 0;
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
groups = bch2_sb_get_disk_groups(sb->sb);
v = __bch2_disk_group_find(groups, parent, name, len);
if (v < 0)
v = __bch2_disk_group_add(sb, parent, name, len);
if (v < 0)
return v;
parent = v + 1;
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_print(struct bch_sb_handle *sb,
char *buf, size_t len, unsigned v)
{
char *out = buf, *end = out + len;
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
while (1) {
if (nr == ARRAY_SIZE(path))
goto inval;
if (v >= disk_groups_nr(groups))
goto inval;
g = groups->entries + v;
if (BCH_GROUP_DELETED(g))
goto inval;
path[nr++] = v;
if (!BCH_GROUP_PARENT(g))
break;
v = BCH_GROUP_PARENT(g) - 1;
}
while (nr) {
unsigned b = 0;
v = path[--nr];
g = groups->entries + v;
if (end != out)
b = min_t(size_t, end - out,
strnlen(g->label, sizeof(g->label)));
memcpy(out, g->label, b);
if (b < end - out)
out[b] = '\0';
out += b;
if (nr)
out += scnprintf(out, end - out, ".");
}
return out - buf;
inval:
return scnprintf(buf, len, "invalid group %u", v);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
mutex_lock(&c->sb_lock);
if (!strlen(name) || !strcmp(name, "none"))
goto write_sb;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0) {
mutex_unlock(&c->sb_lock);
return v;
}
write_sb:
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
{
struct bch_dev *ca;
int g;
if (!strlen(buf) || !strcmp(buf, "none")) {
*v = 0;
return 0;
}
/* Is it a device? */
ca = bch2_dev_lookup(c, buf);
if (!IS_ERR(ca)) {
*v = dev_to_target(ca->dev_idx);
percpu_ref_put(&ca->ref);
return 0;
}
mutex_lock(&c->sb_lock);
g = bch2_disk_path_find(&c->disk_sb, buf);
mutex_unlock(&c->sb_lock);
if (g >= 0) {
*v = group_to_target(g);
return 0;
}
return -EINVAL;
}
int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
struct target t = target_decode(v);
int ret;
switch (t.type) {
case TARGET_NULL:
return scnprintf(buf, len, "none");
case TARGET_DEV: {
struct bch_dev *ca;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
ret = scnprintf(buf, len, "/dev/%pg",
ca->disk_sb.bdev);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
ret = scnprintf(buf, len, "offline device %u", t.dev);
} else {
ret = scnprintf(buf, len, "invalid device %u", t.dev);
}
rcu_read_unlock();
break;
}
case TARGET_GROUP:
mutex_lock(&c->sb_lock);
ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
mutex_unlock(&c->sb_lock);
break;
default:
BUG();
}
return ret;
}
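
A usage sketch of the path helpers above (example_create_label() is hypothetical, not in this commit): label lookups take the superblock lock, and creating a nested label such as "ssd.fast" allocates one bch_disk_group per dotted component, mirroring what bch2_dev_group_set() does.

static int example_create_label(struct bch_fs *c, const char *path)
{
	int v;

	mutex_lock(&c->sb_lock);
	v = bch2_disk_path_find_or_create(&c->disk_sb, path);
	if (v >= 0)
		bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	/* on success, v is the group index; encode it with group_to_target(v) */
	return v < 0 ? v : 0;
}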

74
fs/bcachefs/disk_groups.h Normal file
View File

@ -0,0 +1,74 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
return groups
? (vstruct_end(&groups->field) -
(void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
: 0;
}
struct target {
enum {
TARGET_NULL,
TARGET_DEV,
TARGET_GROUP,
} type;
union {
unsigned dev;
unsigned group;
};
};
#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)
static inline u16 dev_to_target(unsigned dev)
{
return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
if (target >= TARGET_GROUP_START)
return (struct target) {
.type = TARGET_GROUP,
.group = target - TARGET_GROUP_START
};
if (target >= TARGET_DEV_START)
return (struct target) {
.type = TARGET_DEV,
.dev = target - TARGET_DEV_START
};
return (struct target) { .type = TARGET_NULL };
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
struct bch_sb_field *);
#endif /* _BCACHEFS_DISK_GROUPS_H */
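
To make the target encoding concrete: targets share one small integer namespace where 0 means "none", [TARGET_DEV_START, TARGET_GROUP_START) holds device indices and everything above that holds group indices. A short sketch (hypothetical, using only the inline helpers declared here):

static void example_target_roundtrip(void)
{
	struct target d = target_decode(dev_to_target(3));	/* encodes to 4 */
	struct target g = target_decode(group_to_target(0));	/* encodes to 257 */

	BUG_ON(d.type != TARGET_DEV || d.dev != 3);
	BUG_ON(g.type != TARGET_GROUP || g.group != 0);
	BUG_ON(target_decode(0).type != TARGET_NULL);
}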

159
fs/bcachefs/error.c Normal file
View File

@ -0,0 +1,159 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "error.h"
#include "io.h"
#include "super.h"
bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_CONTINUE:
return false;
case BCH_ON_ERROR_RO:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
return true;
case BCH_ON_ERROR_PANIC:
panic(bch2_fmt(c, "panic after error"));
return true;
default:
BUG();
}
}
void bch2_fatal_error(struct bch_fs *c)
{
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
}
void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
bool dev;
mutex_lock(&c->state_lock);
dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED);
if (dev
? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO,
BCH_FORCE_IF_DEGRADED)
: bch2_fs_emergency_read_only(c))
bch_err(ca,
"too many IO errors, setting %s RO",
dev ? "device" : "filesystem");
mutex_unlock(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca)
{
//queue_work(system_long_wq, &ca->io_error_work);
}
#ifdef __KERNEL__
#define ask_yn() false
#else
#include "tools-util.h"
#endif
enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
const char *fmt, ...)
{
struct fsck_err_state *s;
va_list args;
bool fix = false, print = true, suppressing = false;
char _buf[sizeof(s->buf)], *buf = _buf;
mutex_lock(&c->fsck_error_lock);
if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
goto print;
list_for_each_entry(s, &c->fsck_errors, list)
if (s->fmt == fmt)
goto found;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s) {
if (!c->fsck_alloc_err)
bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
c->fsck_alloc_err = true;
buf = _buf;
goto print;
}
INIT_LIST_HEAD(&s->list);
s->fmt = fmt;
found:
list_move(&s->list, &c->fsck_errors);
s->nr++;
suppressing = s->nr == 10;
print = s->nr <= 10;
buf = s->buf;
print:
va_start(args, fmt);
vscnprintf(buf, sizeof(_buf), fmt, args);
va_end(args);
if (c->opts.fix_errors == FSCK_OPT_EXIT) {
bch_err(c, "%s, exiting", buf);
mutex_unlock(&c->fsck_error_lock);
return FSCK_ERR_EXIT;
}
if (flags & FSCK_CAN_FIX) {
if (c->opts.fix_errors == FSCK_OPT_ASK) {
printk(KERN_ERR "%s: fix?", buf);
fix = ask_yn();
} else if (c->opts.fix_errors == FSCK_OPT_YES ||
(c->opts.nochanges &&
!(flags & FSCK_CAN_IGNORE))) {
if (print)
bch_err(c, "%s, fixing", buf);
fix = true;
} else {
if (print)
bch_err(c, "%s, not fixing", buf);
fix = false;
}
} else if (flags & FSCK_NEED_FSCK) {
if (print)
bch_err(c, "%s (run fsck to correct)", buf);
} else {
if (print)
bch_err(c, "%s (repair unimplemented)", buf);
}
if (suppressing)
bch_err(c, "Ratelimiting new instances of previous error");
mutex_unlock(&c->fsck_error_lock);
if (fix)
set_bit(BCH_FS_FSCK_FIXED_ERRORS, &c->flags);
return fix ? FSCK_ERR_FIX
: flags & FSCK_CAN_IGNORE ? FSCK_ERR_IGNORE
: FSCK_ERR_EXIT;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_lock);
set_bit(BCH_FS_FSCK_DONE, &c->flags);
list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
if (s->nr > 10)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
list_del(&s->list);
kfree(s);
}
mutex_unlock(&c->fsck_error_lock);
}

229
fs/bcachefs/error.h Normal file
View File

@ -0,0 +1,229 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ERROR_H
#define _BCACHEFS_ERROR_H
#include <linux/list.h>
#include <linux/printk.h>
struct bch_dev;
struct bch_fs;
struct work_struct;
/*
* XXX: separate out errors that indicate on disk data is inconsistent, and flag
* superblock as such
*/
/* Error messages: */
/*
* Very fatal logic/inconsistency errors: these indicate that we've majorly
* screwed up at runtime, i.e. it's not likely that it was just caused by the
* data on disk being inconsistent. These BUG():
*
* XXX: audit and convert to inconsistent() checks
*/
#define bch2_fs_bug(c, ...) \
do { \
bch_err(c, __VA_ARGS__); \
BUG(); \
} while (0)
#define bch2_fs_bug_on(cond, c, ...) \
do { \
if (cond) \
bch2_fs_bug(c, __VA_ARGS__); \
} while (0)
/*
* Inconsistency errors: The on disk data is inconsistent. If these occur during
* initial recovery, they don't indicate a bug in the running code - we walk all
* the metadata before modifying anything. If they occur at runtime, they
* indicate either a bug in the running code or (less likely) data is being
* silently corrupted under us.
*
* XXX: audit all inconsistent errors and make sure they're all recoverable, in
* BCH_ON_ERROR_CONTINUE mode
*/
bool bch2_inconsistent_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
})
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \
int _ret = !!(cond); \
\
if (_ret) \
bch2_fs_inconsistent(c, __VA_ARGS__); \
_ret; \
})
/*
* Later we might want to mark only the particular device inconsistent, not the
* entire filesystem:
*/
#define bch2_dev_inconsistent(ca, ...) \
do { \
bch_err(ca, __VA_ARGS__); \
bch2_inconsistent_error((ca)->fs); \
} while (0)
#define bch2_dev_inconsistent_on(cond, ca, ...) \
({ \
int _ret = !!(cond); \
\
if (_ret) \
bch2_dev_inconsistent(ca, __VA_ARGS__); \
_ret; \
})
/*
* Fsck errors: inconsistency errors we detect at mount time, and should ideally
* be able to repair:
*/
enum {
BCH_FSCK_OK = 0,
BCH_FSCK_ERRORS_NOT_FIXED = 1,
BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
BCH_FSCK_UNKNOWN_VERSION = 4,
};
enum fsck_err_opts {
FSCK_OPT_EXIT,
FSCK_OPT_YES,
FSCK_OPT_NO,
FSCK_OPT_ASK,
};
enum fsck_err_ret {
FSCK_ERR_IGNORE = 0,
FSCK_ERR_FIX = 1,
FSCK_ERR_EXIT = 2,
};
struct fsck_err_state {
struct list_head list;
const char *fmt;
u64 nr;
char buf[512];
};
#define FSCK_CAN_FIX (1 << 0)
#define FSCK_CAN_IGNORE (1 << 1)
#define FSCK_NEED_FSCK (1 << 2)
enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
unsigned, const char *, ...);
void bch2_flush_fsck_errs(struct bch_fs *);
#define __fsck_err(c, _flags, msg, ...) \
({ \
int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
\
if (_fix == FSCK_ERR_EXIT) { \
bch_err(c, "Unable to continue, halting"); \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
\
_fix; \
})
/* These macros return true if error should be fixed: */
/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
#define __fsck_err_on(cond, c, _flags, ...) \
((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
#define need_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define need_fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
#define mustfix_fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define mustfix_fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
#define fsck_err(c, ...) \
__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
#define fsck_err_on(cond, c, ...) \
__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
/*
* Fatal errors: these don't indicate a bug, but we can't continue running in RW
* mode - pretty much just due to metadata IO errors:
*/
void bch2_fatal_error(struct bch_fs *);
#define bch2_fs_fatal_error(c, ...) \
do { \
bch_err(c, __VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
#define bch2_fs_fatal_err_on(cond, c, ...) \
({ \
int _ret = !!(cond); \
\
if (_ret) \
bch2_fs_fatal_error(c, __VA_ARGS__); \
_ret; \
})
/*
* IO errors: either recoverable metadata IO (because we have replicas), or data
* IO - we need to log it and print out a message, but we don't (necessarily)
* want to shut down the fs:
*/
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_io_error(struct bch_dev *);
/* Logs message and handles the error: */
#define bch2_dev_io_error(ca, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"IO error on %s for " fmt), \
(ca)->name, ##__VA_ARGS__); \
bch2_io_error(ca); \
} while (0)
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) \
bch2_dev_io_error(ca, __VA_ARGS__); \
_ret; \
})
/* kill? */
#define __bcache_io_error(c, fmt, ...) \
printk_ratelimited(KERN_ERR bch2_fmt(c, \
"IO error: " fmt), ##__VA_ARGS__)
#define bcache_io_error(c, bio, fmt, ...) \
do { \
__bcache_io_error(c, fmt, ##__VA_ARGS__); \
(bio)->bi_status = BLK_STS_IOERR; \
} while (0)
#endif /* _BCACHEFS_ERROR_H */
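
One non-obvious convention above: __fsck_err() expects its caller to declare an int ret and to provide a fsck_err: label to jump to when the error policy says to stop. A minimal sketch of that calling convention (the check itself is hypothetical):

static int example_fsck_check(struct bch_fs *c, u64 seen, u64 expected)
{
	int ret = 0;

	if (fsck_err_on(seen != expected, c,
			"hypothetical counter mismatch: got %llu, want %llu",
			seen, expected)) {
		/* error was reported and the policy says to fix it here */
	}
fsck_err:
	return ret;
}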

2395
fs/bcachefs/extents.c Normal file

File diff suppressed because it is too large

539
fs/bcachefs/extents.h Normal file
View File

@ -0,0 +1,539 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_H
#define _BCACHEFS_EXTENTS_H
#include "bcachefs.h"
#include "bkey.h"
#include "extents_types.h"
struct bch_fs;
struct journal_res;
struct btree_node_iter;
struct btree_node_iter_large;
struct btree_insert;
struct btree_insert_entry;
struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
struct bkey_s_c);
void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
#define bch2_bkey_btree_ops (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
.key_debugcheck = bch2_btree_ptr_debugcheck, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
}
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
#define bch2_bkey_extent_ops (struct bkey_ops) { \
.key_invalid = bch2_extent_invalid, \
.key_debugcheck = bch2_extent_debugcheck, \
.val_to_text = bch2_extent_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_ptr_normalize, \
.key_merge = bch2_extent_merge, \
.is_extents = true, \
}
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *,
struct btree_node_iter_large *);
struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct bset *,
struct btree *,
struct btree_node_iter_large *);
int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *);
int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_devs_mask *,
struct extent_pick_ptr *);
enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
unsigned, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
unsigned bch2_extent_is_compressed(struct bkey_s_c);
unsigned bch2_extent_ptr_durability(struct bch_fs *,
const struct bch_extent_ptr *);
unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
struct bch_extent_ptr, u64);
static inline bool bkey_extent_is_data(const struct bkey *k)
{
switch (k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
return true;
default:
return false;
}
}
static inline bool bkey_extent_is_allocation(const struct bkey *k)
{
switch (k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
case BCH_RESERVATION:
return true;
default:
return false;
}
}
static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k)
{
return bkey_extent_is_allocation(k.k) &&
!bch2_extent_is_compressed(k);
}
static inline bool bkey_extent_is_cached(const struct bkey *k)
{
return k->type == BCH_EXTENT_CACHED;
}
static inline void bkey_extent_set_cached(struct bkey *k, bool cached)
{
EBUG_ON(k->type != BCH_EXTENT &&
k->type != BCH_EXTENT_CACHED);
k->type = cached ? BCH_EXTENT_CACHED : BCH_EXTENT;
}
static inline unsigned
__extent_entry_type(const union bch_extent_entry *e)
{
return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
}
static inline enum bch_extent_entry_type
extent_entry_type(const union bch_extent_entry *e)
{
int ret = __ffs(e->type);
EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
return ret;
}
static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
{
switch (extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_crc32:
return sizeof(struct bch_extent_crc32);
case BCH_EXTENT_ENTRY_crc64:
return sizeof(struct bch_extent_crc64);
case BCH_EXTENT_ENTRY_crc128:
return sizeof(struct bch_extent_crc128);
case BCH_EXTENT_ENTRY_ptr:
return sizeof(struct bch_extent_ptr);
default:
BUG();
}
}
static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
{
return extent_entry_bytes(entry) / sizeof(u64);
}
static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
{
return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
}
static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
{
return !extent_entry_is_ptr(e);
}
union bch_extent_crc {
u8 type;
struct bch_extent_crc32 crc32;
struct bch_extent_crc64 crc64;
struct bch_extent_crc128 crc128;
};
/* downcast, preserves const */
#define to_entry(_entry) \
({ \
BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
!type_is(_entry, struct bch_extent_ptr *)); \
\
__builtin_choose_expr( \
(type_is_exact(_entry, const union bch_extent_crc *) || \
type_is_exact(_entry, const struct bch_extent_ptr *)), \
(const union bch_extent_entry *) (_entry), \
(union bch_extent_entry *) (_entry)); \
})
#define __entry_to_crc(_entry) \
__builtin_choose_expr( \
type_is_exact(_entry, const union bch_extent_entry *), \
(const union bch_extent_crc *) (_entry), \
(union bch_extent_crc *) (_entry))
#define entry_to_crc(_entry) \
({ \
EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
\
__entry_to_crc(_entry); \
})
#define entry_to_ptr(_entry) \
({ \
EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
\
__builtin_choose_expr( \
type_is_exact(_entry, const union bch_extent_entry *), \
(const struct bch_extent_ptr *) (_entry), \
(struct bch_extent_ptr *) (_entry)); \
})
/* checksum entries: */
enum bch_extent_crc_type {
BCH_EXTENT_CRC_NONE,
BCH_EXTENT_CRC32,
BCH_EXTENT_CRC64,
BCH_EXTENT_CRC128,
};
static inline enum bch_extent_crc_type
__extent_crc_type(const union bch_extent_crc *crc)
{
if (!crc)
return BCH_EXTENT_CRC_NONE;
switch (extent_entry_type(to_entry(crc))) {
case BCH_EXTENT_ENTRY_crc32:
return BCH_EXTENT_CRC32;
case BCH_EXTENT_ENTRY_crc64:
return BCH_EXTENT_CRC64;
case BCH_EXTENT_ENTRY_crc128:
return BCH_EXTENT_CRC128;
default:
BUG();
}
}
#define extent_crc_type(_crc) \
({ \
BUILD_BUG_ON(!type_is(_crc, struct bch_extent_crc32 *) && \
!type_is(_crc, struct bch_extent_crc64 *) && \
!type_is(_crc, struct bch_extent_crc128 *) && \
!type_is(_crc, union bch_extent_crc *)); \
\
type_is(_crc, struct bch_extent_crc32 *) ? BCH_EXTENT_CRC32 \
: type_is(_crc, struct bch_extent_crc64 *) ? BCH_EXTENT_CRC64 \
: type_is(_crc, struct bch_extent_crc128 *) ? BCH_EXTENT_CRC128 \
: __extent_crc_type((union bch_extent_crc *) _crc); \
})
static inline struct bch_extent_crc_unpacked
bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
{
#define common_fields(_crc) \
.csum_type = _crc.csum_type, \
.compression_type = _crc.compression_type, \
.compressed_size = _crc._compressed_size + 1, \
.uncompressed_size = _crc._uncompressed_size + 1, \
.offset = _crc.offset, \
.live_size = k->size
switch (extent_crc_type(crc)) {
case BCH_EXTENT_CRC_NONE:
return (struct bch_extent_crc_unpacked) {
.compressed_size = k->size,
.uncompressed_size = k->size,
.live_size = k->size,
};
case BCH_EXTENT_CRC32: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc32),
};
*((__le32 *) &ret.csum.lo) = crc->crc32.csum;
memcpy(&ret.csum.lo, &crc->crc32.csum,
sizeof(crc->crc32.csum));
return ret;
}
case BCH_EXTENT_CRC64: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc64),
.nonce = crc->crc64.nonce,
.csum.lo = (__force __le64) crc->crc64.csum_lo,
};
*((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
return ret;
}
case BCH_EXTENT_CRC128: {
struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
common_fields(crc->crc128),
.nonce = crc->crc128.nonce,
.csum = crc->crc128.csum,
};
return ret;
}
default:
BUG();
}
#undef common_fields
}
/* Extent entry iteration: */
#define extent_entry_next(_entry) \
((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
#define extent_entry_last(_e) \
vstruct_idx((_e).v, bkey_val_u64s((_e).k))
/* Iterate over all entries: */
#define extent_for_each_entry_from(_e, _entry, _start) \
for ((_entry) = _start; \
(_entry) < extent_entry_last(_e); \
(_entry) = extent_entry_next(_entry))
#define extent_for_each_entry(_e, _entry) \
extent_for_each_entry_from(_e, _entry, (_e).v->start)
/* Iterate over crcs only: */
#define __extent_crc_next(_e, _p) \
({ \
typeof(&(_e).v->start[0]) _entry = _p; \
\
while ((_entry) < extent_entry_last(_e) && \
!extent_entry_is_crc(_entry)) \
(_entry) = extent_entry_next(_entry); \
\
entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \
})
#define __extent_for_each_crc(_e, _crc) \
for ((_crc) = __extent_crc_next(_e, (_e).v->start); \
(_crc); \
(_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc))))
#define extent_crc_next(_e, _crc, _iter) \
({ \
extent_for_each_entry_from(_e, _iter, _iter) \
if (extent_entry_is_crc(_iter)) { \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\
break; \
} \
\
(_iter) < extent_entry_last(_e); \
})
#define extent_for_each_crc(_e, _crc, _iter) \
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_iter) = (_e).v->start; \
extent_crc_next(_e, _crc, _iter); \
(_iter) = extent_entry_next(_iter))
/* Iterate over pointers, with crcs: */
#define extent_ptr_crc_next(_e, _ptr, _crc) \
({ \
__label__ out; \
typeof(&(_e).v->start[0]) _entry; \
\
extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \
if (extent_entry_is_crc(_entry)) { \
(_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\
} else { \
_ptr = entry_to_ptr(_entry); \
goto out; \
} \
\
_ptr = NULL; \
out: \
_ptr; \
})
#define extent_for_each_ptr_crc(_e, _ptr, _crc) \
for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \
(_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \
(_ptr)++)
/* Iterate over pointers only, and from a given position: */
#define extent_ptr_next(_e, _ptr) \
({ \
struct bch_extent_crc_unpacked _crc; \
\
extent_ptr_crc_next(_e, _ptr, _crc); \
})
#define extent_for_each_ptr(_e, _ptr) \
for ((_ptr) = &(_e).v->start->ptr; \
((_ptr) = extent_ptr_next(_e, _ptr)); \
(_ptr)++)
#define extent_ptr_prev(_e, _ptr) \
({ \
typeof(&(_e).v->start->ptr) _p; \
typeof(&(_e).v->start->ptr) _prev = NULL; \
\
extent_for_each_ptr(_e, _p) { \
if (_p == (_ptr)) \
break; \
_prev = _p; \
} \
\
_prev; \
})
/*
* Use this when you'll be dropping pointers as you iterate. Quadratic,
* unfortunately:
*/
#define extent_for_each_ptr_backwards(_e, _ptr) \
for ((_ptr) = extent_ptr_prev(_e, NULL); \
(_ptr); \
(_ptr) = extent_ptr_prev(_e, _ptr))
void bch2_extent_crc_append(struct bkey_i_extent *,
struct bch_extent_crc_unpacked);
static inline void __extent_entry_push(struct bkey_i_extent *e)
{
union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e));
EBUG_ON(bkey_val_u64s(&e->k) + extent_entry_u64s(entry) >
BKEY_EXTENT_VAL_U64s_MAX);
e->k.u64s += extent_entry_u64s(entry);
}
static inline void extent_ptr_append(struct bkey_i_extent *e,
struct bch_extent_ptr ptr)
{
ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
extent_entry_last(extent_i_to_s(e))->ptr = ptr;
__extent_entry_push(e);
}
static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
ret.devs[ret.nr++] = ptr->dev;
return ret;
}
static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent e)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (!ptr->cached)
ret.devs[ret.nr++] = ptr->dev;
return ret;
}
static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (ptr->cached)
ret.devs[ret.nr++] = ptr->dev;
return ret;
}
static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
return bch2_extent_devs(bkey_s_c_to_extent(k));
default:
return (struct bch_devs_list) { .nr = 0 };
}
}
static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
default:
return (struct bch_devs_list) { .nr = 0 };
}
}
static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
default:
return (struct bch_devs_list) { .nr = 0 };
}
}
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent);
void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *);
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);
void bch2_key_resize(struct bkey *, unsigned);
int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
#endif /* _BCACHEFS_EXTENTS_H */


27
fs/bcachefs/extents_types.h Normal file
View File

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_EXTENTS_TYPES_H
#define _BCACHEFS_EXTENTS_TYPES_H
#include "bcachefs_format.h"
struct bch_extent_crc_unpacked {
u8 csum_type;
u8 compression_type;
u16 compressed_size;
u16 uncompressed_size;
u16 offset;
u16 live_size;
u16 nonce;
struct bch_csum csum;
};
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */

283
fs/bcachefs/eytzinger.h Normal file
View File

@ -0,0 +1,283 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _EYTZINGER_H
#define _EYTZINGER_H
#include <linux/bitops.h>
#include <linux/log2.h>
#include "util.h"
/*
* Traversal for trees in eytzinger layout - a full binary tree laid out in an
* array
*/
/*
* One based indexing version:
*
* With one based indexing each level of the tree starts at a power of two -
* good for cacheline alignment:
*
* Size parameter is treated as if we were using 0 based indexing, however:
* valid nodes, and inorder indices, are in the range [1..size) - that is, there
* are actually size - 1 elements
*/
static inline unsigned eytzinger1_child(unsigned i, unsigned child)
{
EBUG_ON(child > 1);
return (i << 1) + child;
}
static inline unsigned eytzinger1_left_child(unsigned i)
{
return eytzinger1_child(i, 0);
}
static inline unsigned eytzinger1_right_child(unsigned i)
{
return eytzinger1_child(i, 1);
}
static inline unsigned eytzinger1_first(unsigned size)
{
return rounddown_pow_of_two(size - 1);
}
static inline unsigned eytzinger1_last(unsigned size)
{
return rounddown_pow_of_two(size) - 1;
}
/*
* eytzinger1_next() and eytzinger1_prev() have the nice properties that
*
* eytzinger1_next(0) == eytzinger1_first()
* eytzinger1_prev(0) == eytzinger1_last()
*
* eytzinger1_prev(eytzinger1_first()) == 0
* eytzinger1_next(eytzinger1_last()) == 0
*/
static inline unsigned eytzinger1_next(unsigned i, unsigned size)
{
EBUG_ON(i >= size);
if (eytzinger1_right_child(i) < size) {
i = eytzinger1_right_child(i);
i <<= __fls(size) - __fls(i);
i >>= i >= size;
} else {
i >>= ffz(i) + 1;
}
return i;
}
static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
{
EBUG_ON(i >= size);
if (eytzinger1_left_child(i) < size) {
i = eytzinger1_left_child(i) + 1;
i <<= __fls(size) - __fls(i);
i -= 1;
i >>= i >= size;
} else {
i >>= __ffs(i) + 1;
}
return i;
}
static inline unsigned eytzinger1_extra(unsigned size)
{
return (size - rounddown_pow_of_two(size - 1)) << 1;
}
static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
unsigned b = __fls(i);
unsigned shift = __fls(size - 1) - b;
int s;
EBUG_ON(!i || i >= size);
i ^= 1U << b;
i <<= 1;
i |= 1;
i <<= shift;
/*
* sign bit trick:
*
* if (i > extra)
* i -= (i - extra) >> 1;
*/
s = extra - i;
i += (s >> 1) & (s >> 31);
return i;
}
static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
unsigned extra)
{
unsigned shift;
int s;
EBUG_ON(!i || i >= size);
/*
* sign bit trick:
*
* if (i > extra)
* i += i - extra;
*/
s = extra - i;
i -= s & (s >> 31);
shift = __ffs(i);
i >>= shift + 1;
i |= 1U << (__fls(size - 1) - shift);
return i;
}
static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
{
return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
}
static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
{
return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
}
#define eytzinger1_for_each(_i, _size) \
for ((_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
/* Zero based indexing version: */
static inline unsigned eytzinger0_child(unsigned i, unsigned child)
{
EBUG_ON(child > 1);
return (i << 1) + 1 + child;
}
static inline unsigned eytzinger0_left_child(unsigned i)
{
return eytzinger0_child(i, 0);
}
static inline unsigned eytzinger0_right_child(unsigned i)
{
return eytzinger0_child(i, 1);
}
static inline unsigned eytzinger0_first(unsigned size)
{
return eytzinger1_first(size + 1) - 1;
}
static inline unsigned eytzinger0_last(unsigned size)
{
return eytzinger1_last(size + 1) - 1;
}
static inline unsigned eytzinger0_next(unsigned i, unsigned size)
{
return eytzinger1_next(i + 1, size + 1) - 1;
}
static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
{
return eytzinger1_prev(i + 1, size + 1) - 1;
}
static inline unsigned eytzinger0_extra(unsigned size)
{
return eytzinger1_extra(size + 1);
}
static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
unsigned extra)
{
return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
}
static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
unsigned extra)
{
return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
}
static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
{
return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
}
static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
{
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
}
#define eytzinger0_for_each(_i, _size) \
for ((_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
{
unsigned i, n = 0;
if (!nr)
return -1;
do {
i = n;
n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
} while (n < nr);
if (n & 1) {
/* @i was greater than @search, return previous node: */
if (i == eytzinger0_first(nr))
return -1;
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
eytzinger_cmp_fn cmp, const void *search)
{
size_t i = 0;
int res;
while (i < nr &&
(res = cmp(search, base + i * size, size)))
i = eytzinger0_child(i, res > 0);
return i;
}
void eytzinger0_sort(void *, size_t, size_t,
int (*cmp_func)(const void *, const void *, size_t),
void (*swap_func)(void *, void *, size_t));
#endif /* _EYTZINGER_H */
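
A self-contained sketch of how the zero-based helpers fit together (hypothetical code, not part of this commit): place an already-sorted array into eytzinger order so that eytzinger0_find() can locate elements with a simple root-to-leaf descent.

/* eytz[] must have room for nr elements; sorted[] is in ascending order */
static void example_to_eytzinger0(const u32 *sorted, u32 *eytz, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)
		eytz[inorder_to_eytzinger0(i, nr)] = sorted[i];
}

static int example_cmp_u32(const void *l, const void *r, size_t size)
{
	u32 a = *(const u32 *) l, b = *(const u32 *) r;

	return a < b ? -1 : a > b ? 1 : 0;
}

/* returns the eytzinger index of @key, or an index >= nr if not present */
static size_t example_search(u32 *eytz, size_t nr, u32 key)
{
	return eytzinger0_find(eytz, nr, sizeof(*eytz), example_cmp_u32, &key);
}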

125
fs/bcachefs/fifo.h Normal file
View File

@ -0,0 +1,125 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FIFO_H
#define _BCACHEFS_FIFO_H
#include "util.h"
#define FIFO(type) \
struct { \
size_t front, back, size, mask; \
type *data; \
}
#define DECLARE_FIFO(type, name) FIFO(type) name
#define fifo_buf_size(fifo) \
(roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]))
#define init_fifo(fifo, _size, _gfp) \
({ \
(fifo)->front = (fifo)->back = 0; \
(fifo)->size = (_size); \
(fifo)->mask = (fifo)->size \
? roundup_pow_of_two((fifo)->size) - 1 \
: 0; \
(fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
})
#define free_fifo(fifo) \
do { \
kvpfree((fifo)->data, fifo_buf_size(fifo)); \
(fifo)->data = NULL; \
} while (0)
#define fifo_swap(l, r) \
do { \
swap((l)->front, (r)->front); \
swap((l)->back, (r)->back); \
swap((l)->size, (r)->size); \
swap((l)->mask, (r)->mask); \
swap((l)->data, (r)->data); \
} while (0)
#define fifo_move(dest, src) \
do { \
typeof(*((dest)->data)) _t; \
while (!fifo_full(dest) && \
fifo_pop(src, _t)) \
fifo_push(dest, _t); \
} while (0)
#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
#define fifo_entry_idx_abs(fifo, p) \
((((p) >= &fifo_peek_front(fifo) \
? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
(((p) - (fifo)->data)))
#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
#define fifo_push_back_ref(f) \
(fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
#define fifo_push_front_ref(f) \
(fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
#define fifo_push_back(fifo, new) \
({ \
typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
if (_r) \
*_r = (new); \
_r != NULL; \
})
#define fifo_push_front(fifo, new) \
({ \
typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
if (_r) \
*_r = (new); \
_r != NULL; \
})
#define fifo_pop_front(fifo, i) \
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
(i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
_r; \
})
#define fifo_pop_back(fifo, i) \
({ \
bool _r = !fifo_empty((fifo)); \
if (_r) \
(i) = (fifo)->data[--(fifo)->back & (fifo)->mask];	\
_r; \
})
#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
#define fifo_peek(fifo) fifo_peek_front(fifo)
#define fifo_for_each_entry(_entry, _fifo, _iter) \
for (((void) (&(_iter) == &(_fifo)->front)), \
_iter = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
_iter++)
#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
for (((void) (&(_iter) == &(_fifo)->front)), \
_iter = (_fifo)->front; \
((_iter != (_fifo)->back) && \
(_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
_iter++)
#endif /* _BCACHEFS_FIFO_H */
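
A minimal usage sketch of the FIFO macros above (example_fifo() is hypothetical, not part of this commit): the backing buffer is rounded up to a power of two, capacity stays at the requested size, and push/pop return false when the ring is full/empty.

static int example_fifo(void)
{
	DECLARE_FIFO(u64, fifo);
	u64 v;
	unsigned i;

	if (!init_fifo(&fifo, 8, GFP_KERNEL))
		return -ENOMEM;

	for (i = 0; i < 8; i++)
		BUG_ON(!fifo_push(&fifo, i));	/* a 9th push would fail */

	while (fifo_pop(&fifo, v))
		;				/* drains 0..7 in FIFO order */

	free_fifo(&fifo);
	return 0;
}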

2862
fs/bcachefs/fs-io.c Normal file

File diff suppressed because it is too large

47
fs/bcachefs/fs-io.h Normal file
View File

@ -0,0 +1,47 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IO_H
#define _BCACHEFS_FS_IO_H
#ifndef NO_BCACHEFS_FS
#include "buckets.h"
#include "io_types.h"
#include <linux/uio.h>
bool bch2_dirty_folio(struct address_space *, struct folio *);
int bch2_writepage(struct page *, struct writeback_control *);
int bch2_read_folio(struct file *, struct folio *);
int bch2_writepages(struct address_space *, struct writeback_control *);
void bch2_readahead(struct readahead_control *);
int bch2_write_begin(struct file *, struct address_space *, loff_t,
unsigned, struct page **, void **);
int bch2_write_end(struct file *, struct address_space *, loff_t,
unsigned, unsigned, struct page *, void *);
ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
int bch2_fsync(struct file *, loff_t, loff_t, int);
int bch2_truncate(struct bch_inode_info *, struct iattr *);
long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
loff_t bch2_llseek(struct file *, loff_t, int);
vm_fault_t bch2_page_fault(struct vm_fault *);
vm_fault_t bch2_page_mkwrite(struct vm_fault *);
void bch2_invalidate_folio(struct folio *, size_t, size_t);
bool bch2_release_folio(struct folio *, gfp_t);
void bch2_fs_fsio_exit(struct bch_fs *);
int bch2_fs_fsio_init(struct bch_fs *);
#else
static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
#endif
#endif /* _BCACHEFS_FS_IO_H */
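
For orientation, the declarations above are the address_space-level hooks; the tables that actually wire them into the VFS live elsewhere in this commit (fs.c) and are not shown in this hunk. A hedged sketch of what such a table looks like, assuming the folio-era aops signatures these prototypes match:

static const struct address_space_operations example_bch_address_space_ops = {
	.read_folio		= bch2_read_folio,
	.writepages		= bch2_writepages,
	.readahead		= bch2_readahead,
	.dirty_folio		= bch2_dirty_folio,
	.write_begin		= bch2_write_begin,
	.write_end		= bch2_write_end,
	.invalidate_folio	= bch2_invalidate_folio,
	.release_folio		= bch2_release_folio,
};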

312
fs/bcachefs/fs-ioctl.c Normal file
View File

@ -0,0 +1,312 @@
// SPDX-License-Identifier: GPL-2.0
#ifndef NO_BCACHEFS_FS
#include "bcachefs.h"
#include "chardev.h"
#include "fs.h"
#include "fs-ioctl.h"
#include "quota.h"
#include <linux/compat.h>
#include <linux/mount.h>
#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
/* Inode flags: */
/* bcachefs inode flags -> vfs inode flags: */
static const unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_SYNC] = S_SYNC,
[__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
[__BCH_INODE_APPEND] = S_APPEND,
[__BCH_INODE_NOATIME] = S_NOATIME,
};
/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
static const unsigned bch_flags_to_uflags[] = {
[__BCH_INODE_SYNC] = FS_SYNC_FL,
[__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
[__BCH_INODE_APPEND] = FS_APPEND_FL,
[__BCH_INODE_NODUMP] = FS_NODUMP_FL,
[__BCH_INODE_NOATIME] = FS_NOATIME_FL,
};
/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
static const unsigned bch_flags_to_xflags[] = {
[__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
[__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
[__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
[__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
[__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
//[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
};
#define set_flags(_map, _in, _out) \
do { \
unsigned _i; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & (1 << _i)) \
(_out) |= _map[_i]; \
else \
(_out) &= ~_map[_i]; \
} while (0)
#define map_flags(_map, _in) \
({ \
unsigned _out = 0; \
\
set_flags(_map, _in, _out); \
_out; \
})
#define map_flags_rev(_map, _in) \
({ \
unsigned _i, _out = 0; \
\
for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
if ((_in) & _map[_i]) { \
(_out) |= 1 << _i; \
(_in) &= ~_map[_i]; \
} \
(_out); \
})
#define map_defined(_map) \
({ \
unsigned _in = ~0; \
\
map_flags_rev(_map, _in); \
})
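/*
* Editor's note with a worked example (not in the original): given
* bch_flags_to_uflags above,
*
*	map_flags(bch_flags_to_uflags, BCH_INODE_SYNC|BCH_INODE_APPEND)
*
* evaluates to FS_SYNC_FL|FS_APPEND_FL; map_flags_rev() maps the other way,
* clearing the bits it consumed from its input so leftover (unsupported) uapi
* flags can be detected; and map_defined() is the mask of bcachefs inode flag
* bits that have any mapping at all.
*/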
/* Set VFS inode flags from bcachefs inode: */
void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
{
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
}
struct flags_set {
unsigned mask;
unsigned flags;
unsigned projid;
};
static int bch2_inode_flags_set(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
/*
* We're relying on btree locking here for exclusion with other ioctl
* calls - use the flags in the btree (@bi), not inode->i_flags:
*/
struct flags_set *s = p;
unsigned newflags = s->flags;
unsigned oldflags = bi->bi_flags & s->mask;
if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
if (!S_ISREG(inode->v.i_mode) &&
!S_ISDIR(inode->v.i_mode) &&
(newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
return -EINVAL;
bi->bi_flags &= ~s->mask;
bi->bi_flags |= newflags;
inode_set_ctime_current(&inode->v);
return 0;
}
static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
{
unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
return put_user(flags, arg);
}
static int bch2_ioc_setflags(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
void __user *arg)
{
struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
unsigned uflags;
int ret;
if (get_user(uflags, (int __user *) arg))
return -EFAULT;
s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
if (uflags)
return -EOPNOTSUPP;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(&inode->v);
if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto setflags_out;
}
mutex_lock(&inode->ei_update_lock);
ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0);
if (!ret)
bch2_inode_flags_to_vfs(inode);
mutex_unlock(&inode->ei_update_lock);
setflags_out:
inode_unlock(&inode->v);
mnt_drop_write_file(file);
return ret;
}
static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
struct fsxattr __user *arg)
{
struct fsxattr fa = { 0 };
fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
return copy_to_user(arg, &fa, sizeof(fa));
}
static int bch2_set_projid(struct bch_fs *c,
struct bch_inode_info *inode,
u32 projid)
{
struct bch_qid qid = inode->ei_qid;
int ret;
if (projid == inode->ei_qid.q[QTYP_PRJ])
return 0;
qid.q[QTYP_PRJ] = projid;
ret = bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid,
inode->v.i_blocks +
inode->ei_quota_reserved);
if (ret)
return ret;
inode->ei_qid.q[QTYP_PRJ] = projid;
return 0;
}
static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct flags_set *s = p;
bi->bi_project = s->projid;
return bch2_inode_flags_set(inode, bi, p);
}
static int bch2_ioc_fssetxattr(struct bch_fs *c,
struct file *file,
struct bch_inode_info *inode,
struct fsxattr __user *arg)
{
struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
struct fsxattr fa;
int ret;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
if (fa.fsx_xflags)
return -EOPNOTSUPP;
s.projid = fa.fsx_projid;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(&inode->v);
if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) {
ret = -EACCES;
goto err;
}
mutex_lock(&inode->ei_update_lock);
ret = bch2_set_projid(c, inode, fa.fsx_projid);
if (ret)
goto err_unlock;
ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0);
if (!ret)
bch2_inode_flags_to_vfs(inode);
err_unlock:
mutex_unlock(&inode->ei_update_lock);
err:
inode_unlock(&inode->v);
mnt_drop_write_file(file);
return ret;
}
long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct super_block *sb = inode->v.i_sb;
struct bch_fs *c = sb->s_fs_info;
switch (cmd) {
case FS_IOC_GETFLAGS:
return bch2_ioc_getflags(inode, (int __user *) arg);
case FS_IOC_SETFLAGS:
return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
case FS_IOC_FSGETXATTR:
return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
case FS_IOC_FSSETXATTR:
return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg);
case FS_IOC_GETVERSION:
return -ENOTTY;
case FS_IOC_SETVERSION:
return -ENOTTY;
case FS_IOC_GOINGDOWN:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
down_write(&sb->s_umount);
sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
up_write(&sb->s_umount);
return 0;
default:
return bch2_fs_ioctl(c, cmd, (void __user *) arg);
}
}
#ifdef CONFIG_COMPAT
long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
/* These are just misnamed, they actually get/put from/to user an int */
switch (cmd) {
case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
default:
return -ENOIOCTLCMD;
}
return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif
#endif /* NO_BCACHEFS_FS */

10
fs/bcachefs/fs-ioctl.h Normal file
View File

@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
void bch2_inode_flags_to_vfs(struct bch_inode_info *);
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
#endif /* _BCACHEFS_FS_IOCTL_H */

1773
fs/bcachefs/fs.c Normal file

File diff suppressed because it is too large Load Diff

99
fs/bcachefs/fs.h Normal file
View File

@ -0,0 +1,99 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_H
#define _BCACHEFS_FS_H
#include "opts.h"
#include "str_hash.h"
#include "quota_types.h"
#include <linux/seqlock.h>
#include <linux/stat.h>
/*
* Two-state lock - can be taken for add or block - both states are shared,
* like read side of rwsem, but conflict with other state:
*/
struct pagecache_lock {
atomic_long_t v;
wait_queue_head_t wait;
};
static inline void pagecache_lock_init(struct pagecache_lock *lock)
{
atomic_long_set(&lock->v, 0);
init_waitqueue_head(&lock->wait);
}
void bch2_pagecache_add_put(struct pagecache_lock *);
void bch2_pagecache_add_get(struct pagecache_lock *);
void bch2_pagecache_block_put(struct pagecache_lock *);
void bch2_pagecache_block_get(struct pagecache_lock *);
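/*
* Editor's illustrative sketch (an assumption, not taken from this commit):
* the buffered IO path would take this lock in "add" mode around anything that
* can add pages to the page cache, while operations like truncate/fallocate
* take it in "block" mode to keep new pages out, e.g.:
*
*	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
*	... fault in or dirty pages ...
*	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
*
* Holders of the same mode share the lock; the two modes exclude each other.
*/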
struct bch_inode_info {
struct inode v;
struct mutex ei_update_lock;
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
struct pagecache_lock ei_pagecache_lock;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
struct bch_hash_info ei_str_hash;
/* copy of inode in btree: */
struct bch_inode_unpacked ei_inode;
};
#define to_bch_ei(_inode) \
container_of_or_null(_inode, struct bch_inode_info, v)
static inline struct bch_inode_info *file_bch_inode(struct file *file)
{
return to_bch_ei(file_inode(file));
}
static inline u8 mode_to_type(umode_t mode)
{
return (mode >> 12) & 15;
}
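/*
* Editor's worked example: this yields the DT_* file type, e.g. S_IFDIR is
* 0040000, so (S_IFDIR >> 12) & 15 == 4 == DT_DIR, and S_IFREG (0100000)
* gives 8 == DT_REG.
*/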
static inline unsigned nlink_bias(umode_t mode)
{
return S_ISDIR(mode) ? 2 : 1;
}
struct bch_inode_unpacked;
#ifndef NO_BCACHEFS_FS
/* returns 0 if we want to do the update, or error is passed up */
typedef int (*inode_set_fn)(struct bch_inode_info *,
struct bch_inode_unpacked *, void *);
void bch2_inode_update_after_write(struct bch_fs *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
unsigned);
int __must_check bch2_write_inode_trans(struct btree_trans *,
struct bch_inode_info *,
struct bch_inode_unpacked *,
inode_set_fn, void *);
int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned);
int __must_check bch2_write_inode(struct bch_fs *,
struct bch_inode_info *);
void bch2_vfs_exit(void);
int bch2_vfs_init(void);
#else
static inline void bch2_vfs_exit(void) {}
static inline int bch2_vfs_init(void) { return 0; }
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_FS_H */

1306
fs/bcachefs/fsck.c Normal file

File diff suppressed because it is too large Load Diff

8
fs/bcachefs/fsck.h Normal file
View File

@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FSCK_H
#define _BCACHEFS_FSCK_H
s64 bch2_count_inode_sectors(struct bch_fs *, u64);
int bch2_fsck(struct bch_fs *);
#endif /* _BCACHEFS_FSCK_H */

517
fs/bcachefs/inode.c Normal file
View File

@ -0,0 +1,517 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_update.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io.h"
#include "keylist.h"
#include <linux/random.h>
#include <asm/unaligned.h>
#define FIELD_BYTES()
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static const u8 bits_table[8] = {
1 * 8 - 1,
2 * 8 - 2,
3 * 8 - 3,
4 * 8 - 4,
6 * 8 - 5,
8 * 8 - 6,
10 * 8 - 7,
13 * 8 - 8,
};
static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
{
__be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
unsigned shift, bytes, bits = likely(!hi)
? fls64(lo)
: fls64(hi) + 64;
for (shift = 1; shift <= 8; shift++)
if (bits < bits_table[shift - 1])
goto got_shift;
BUG();
got_shift:
bytes = byte_table[shift - 1];
BUG_ON(out + bytes > end);
memcpy(out, (u8 *) in + 16 - bytes, bytes);
*out |= (1 << 8) >> shift;
return bytes;
}
static int inode_decode_field(const u8 *in, const u8 *end,
u64 out[2], unsigned *out_bits)
{
__be64 be[2] = { 0, 0 };
unsigned bytes, shift;
u8 *p;
if (in >= end)
return -1;
if (!*in)
return -1;
/*
* position of highest set bit indicates number of bytes:
* shift = number of bits to remove in high byte:
*/
shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
bytes = byte_table[shift - 1];
if (in + bytes > end)
return -1;
p = (u8 *) be + 16 - bytes;
memcpy(p, in, bytes);
*p ^= (1 << 8) >> shift;
out[0] = be64_to_cpu(be[0]);
out[1] = be64_to_cpu(be[1]);
*out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
return bytes;
}
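/*
* Editor's worked example of the encoding above (not in the original):
* encoding the value 1000. fls64(1000) == 10 bits; the first bits_table entry
* that fits is bits_table[1] == 14, so shift == 2 and byte_table[1] == 2 bytes
* are used. The two low big-endian bytes of 1000 are 0x03 0xe8, and OR-ing in
* the length marker (1 << 8) >> 2 == 0x40 gives the encoding { 0x43, 0xe8 }.
* Decoding finds the top set bit of 0x43 at position 6, so shift == 8 - 6 == 2,
* strips the 0x40 marker and recovers 1000.
*/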
void bch2_inode_pack(struct bkey_inode_buf *packed,
const struct bch_inode_unpacked *inode)
{
u8 *out = packed->inode.v.fields;
u8 *end = (void *) &packed[1];
u8 *last_nonzero_field = out;
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
bkey_inode_init(&packed->inode.k_i);
packed->inode.k.p.inode = inode->bi_inum;
packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
#define BCH_INODE_FIELD(_name, _bits) \
out += inode_encode_field(out, end, 0, inode->_name); \
nr_fields++; \
\
if (inode->_name) { \
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
}
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
out = last_nonzero_field;
nr_fields = last_nonzero_fieldnr;
set_bkey_val_bytes(&packed->inode.k, out - (u8 *) &packed->inode.v);
memset(out, 0,
(u8 *) &packed->inode.v +
bkey_val_bytes(&packed->inode.k) - out);
SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields);
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct bch_inode_unpacked unpacked;
int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
&unpacked);
BUG_ON(ret);
BUG_ON(unpacked.bi_inum != inode->bi_inum);
BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
BUG_ON(unpacked.bi_mode != inode->bi_mode);
#define BCH_INODE_FIELD(_name, _bits) BUG_ON(unpacked._name != inode->_name);
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
}
}
int bch2_inode_unpack(struct bkey_s_c_inode inode,
struct bch_inode_unpacked *unpacked)
{
const u8 *in = inode.v->fields;
const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k);
u64 field[2];
unsigned fieldnr = 0, field_bits;
int ret;
unpacked->bi_inum = inode.k->p.inode;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
#define BCH_INODE_FIELD(_name, _bits) \
if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
memset((void *) unpacked + offset, 0, \
sizeof(*unpacked) - offset); \
return 0; \
} \
\
ret = inode_decode_field(in, end, field, &field_bits); \
if (ret < 0) \
return ret; \
\
if (field_bits > sizeof(unpacked->_name) * 8) \
return -1; \
\
unpacked->_name = field[1]; \
in += ret;
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
/* XXX: signal if there were more fields than expected? */
return 0;
}
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.offset)
return "nonzero offset";
switch (k.k->type) {
case BCH_INODE_FS: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
struct bch_inode_unpacked unpacked;
if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
return "incorrect value size";
if (k.k->p.inode < BLOCKDEV_INODE_MAX)
return "fs inode in blockdev range";
if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
return "invalid str hash type";
if (bch2_inode_unpack(inode, &unpacked))
return "invalid variable length fields";
if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
return "invalid data checksum type";
if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
return "invalid data checksum type";
if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
unpacked.bi_nlink != 0)
return "flagged as unlinked but bi_nlink != 0";
return NULL;
}
case BCH_INODE_BLOCKDEV:
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_blockdev))
return "incorrect value size";
if (k.k->p.inode >= BLOCKDEV_INODE_MAX)
return "blockdev inode in fs range";
return NULL;
case BCH_INODE_GENERATION:
if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
return "incorrect value size";
return NULL;
default:
return "invalid type";
}
}
void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
struct bch_inode_unpacked unpacked;
switch (k.k->type) {
case BCH_INODE_FS:
inode = bkey_s_c_to_inode(k);
if (bch2_inode_unpack(inode, &unpacked)) {
out += scnprintf(out, end - out, "(unpack error)");
break;
}
#define BCH_INODE_FIELD(_name, _bits) \
out += scnprintf(out, end - out, #_name ": %llu ", (u64) unpacked._name);
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
break;
}
}
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)
{
s64 now = bch2_current_time(c);
memset(inode_u, 0, sizeof(*inode_u));
/* ick */
inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET;
get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
inode_u->bi_mode = mode;
inode_u->bi_uid = uid;
inode_u->bi_gid = gid;
inode_u->bi_dev = rdev;
inode_u->bi_atime = now;
inode_u->bi_mtime = now;
inode_u->bi_ctime = now;
inode_u->bi_otime = now;
if (parent) {
#define BCH_INODE_FIELD(_name) inode_u->_name = parent->_name;
BCH_INODE_FIELDS_INHERIT()
#undef BCH_INODE_FIELD
}
}
static inline u32 bkey_generation(struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_INODE_BLOCKDEV:
case BCH_INODE_FS:
BUG();
case BCH_INODE_GENERATION:
return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
default:
return 0;
}
}
int __bch2_inode_create(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
u64 min, u64 max, u64 *hint)
{
struct bch_fs *c = trans->c;
struct bkey_inode_buf *inode_p;
struct btree_iter *iter;
u64 start;
int ret;
if (!max)
max = ULLONG_MAX;
if (c->opts.inodes_32bit)
max = min_t(u64, max, U32_MAX);
start = READ_ONCE(*hint);
if (start >= max || start < min)
start = min;
inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
if (IS_ERR(inode_p))
return PTR_ERR(inode_p);
iter = bch2_trans_get_iter(trans,
BTREE_ID_INODES, POS(start, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
if (IS_ERR(iter))
return PTR_ERR(iter);
again:
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
ret = btree_iter_err(k);
if (ret)
return ret;
switch (k.k->type) {
case BCH_INODE_BLOCKDEV:
case BCH_INODE_FS:
/* slot used */
if (iter->pos.inode >= max)
goto out;
bch2_btree_iter_next_slot(iter);
break;
default:
*hint = k.k->p.inode;
inode_u->bi_inum = k.k->p.inode;
inode_u->bi_generation = bkey_generation(k);
bch2_inode_pack(inode_p, inode_u);
bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
return 0;
}
}
out:
if (start != min) {
/* Retry from start */
start = min;
bch2_btree_iter_set_pos(iter, POS(start, 0));
goto again;
}
return -ENOSPC;
}
int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
u64 min, u64 max, u64 *hint)
{
return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC,
__bch2_inode_create(&trans, inode_u, min, max, hint));
}
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
struct extent_insert_hook *hook, u64 *journal_seq)
{
return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
POS(inode_nr, new_size),
POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, hook,
journal_seq);
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)
{
struct btree_iter iter;
struct bkey_i_inode_generation delete;
int ret;
ret = bch2_inode_truncate(c, inode_nr, 0, NULL, NULL);
if (ret < 0)
return ret;
ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS,
POS(inode_nr, 0),
POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, NULL, NULL);
if (ret < 0)
return ret;
/*
* If this was a directory, there shouldn't be any real dirents left -
* but there could be whiteouts (from hash collisions) that we should
* delete:
*
* XXX: the dirent code could ideally delete whiteouts when they're no
* longer needed
*/
ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
POS(inode_nr, 0),
POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, NULL, NULL);
if (ret < 0)
return ret;
bch2_btree_iter_init(&iter, c, BTREE_ID_INODES, POS(inode_nr, 0),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
do {
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
u32 bi_generation = 0;
ret = btree_iter_err(k);
if (ret) {
bch2_btree_iter_unlock(&iter);
return ret;
}
bch2_fs_inconsistent_on(k.k->type != BCH_INODE_FS, c,
"inode %llu not found when deleting",
inode_nr);
switch (k.k->type) {
case BCH_INODE_FS: {
struct bch_inode_unpacked inode_u;
if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u))
bi_generation = inode_u.bi_generation + 1;
break;
}
case BCH_INODE_GENERATION: {
struct bkey_s_c_inode_generation g =
bkey_s_c_to_inode_generation(k);
bi_generation = le32_to_cpu(g.v->bi_generation);
break;
}
}
if (!bi_generation) {
bkey_init(&delete.k);
delete.k.p.inode = inode_nr;
} else {
bkey_inode_generation_init(&delete.k_i);
delete.k.p.inode = inode_nr;
delete.v.bi_generation = cpu_to_le32(bi_generation);
}
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete.k_i));
} while (ret == -EINTR);
bch2_btree_iter_unlock(&iter);
return ret;
}
int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = -ENOENT;
for_each_btree_key(&iter, c, BTREE_ID_INODES,
POS(inode_nr, 0),
BTREE_ITER_SLOTS, k) {
switch (k.k->type) {
case BCH_INODE_FS:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
break;
default:
/* hole, not found */
break;
}
break;
}
return bch2_btree_iter_unlock(&iter) ?: ret;
}
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void)
{
struct bch_inode_unpacked *u, test_inodes[] = {
{
.bi_atime = U64_MAX,
.bi_ctime = U64_MAX,
.bi_mtime = U64_MAX,
.bi_otime = U64_MAX,
.bi_size = U64_MAX,
.bi_sectors = U64_MAX,
.bi_uid = U32_MAX,
.bi_gid = U32_MAX,
.bi_nlink = U32_MAX,
.bi_generation = U32_MAX,
.bi_dev = U32_MAX,
},
};
for (u = test_inodes;
u < test_inodes + ARRAY_SIZE(test_inodes);
u++) {
struct bkey_inode_buf p;
bch2_inode_pack(&p, u);
}
}
#endif

101
fs/bcachefs/inode.h Normal file
View File

@ -0,0 +1,101 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_INODE_H
#define _BCACHEFS_INODE_H
#include "opts.h"
#include <linux/math64.h>
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_inode_ops (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
}
struct bch_inode_unpacked {
u64 bi_inum;
__le64 bi_hash_seed;
u32 bi_flags;
u16 bi_mode;
#define BCH_INODE_FIELD(_name, _bits) u##_bits _name;
BCH_INODE_FIELDS()
#undef BCH_INODE_FIELD
};
struct bkey_inode_buf {
struct bkey_i_inode inode;
#define BCH_INODE_FIELD(_name, _bits) + 8 + _bits / 8
u8 _pad[0 + BCH_INODE_FIELDS()];
#undef BCH_INODE_FIELD
} __attribute__((packed, aligned(8)));
void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *);
int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
uid_t, gid_t, umode_t, dev_t,
struct bch_inode_unpacked *);
int __bch2_inode_create(struct btree_trans *,
struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *,
u64, u64, u64 *);
int bch2_inode_truncate(struct bch_fs *, u64, u64,
struct extent_insert_hook *, u64 *);
int bch2_inode_rm(struct bch_fs *, u64);
int bch2_inode_find_by_inum(struct bch_fs *, u64,
struct bch_inode_unpacked *);
static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (inode->bi_##_name) \
opt_set(ret, _name, inode->bi_##_name - 1);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
static inline void __bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
switch (id) {
#define BCH_INODE_OPT(_name, ...) \
case Opt_##_name: \
inode->bi_##_name = v; \
break;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
default:
BUG();
}
}
static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
enum bch_opt_id id, u64 v)
{
return __bch2_inode_opt_set(inode, id, v + 1);
}
static inline void bch2_inode_opt_clear(struct bch_inode_unpacked *inode,
enum bch_opt_id id)
{
return __bch2_inode_opt_set(inode, id, 0);
}
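/*
* Editor's note, an assumed example of the +1 bias above: per-inode options
* are stored off by one so that 0 means "unset, fall back to the filesystem
* option". Thus
*
*	bch2_inode_opt_set(inode_u, Opt_compression, 2);
*
* stores bi_compression == 3, bch2_inode_opts_get() hands back 2, and
* bch2_inode_opt_clear() stores 0 again.
*/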
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_inode_pack_test(void);
#else
static inline void bch2_inode_pack_test(void) {}
#endif
#endif /* _BCACHEFS_INODE_H */

1875
fs/bcachefs/io.c Normal file

File diff suppressed because it is too large Load Diff

144
fs/bcachefs/io.h Normal file
View File

@ -0,0 +1,144 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IO_H
#define _BCACHEFS_IO_H
#include "alloc.h"
#include "checksum.h"
#include "io_types.h"
#define to_wbio(_bio) \
container_of((_bio), struct bch_write_bio, bio)
#define to_rbio(_bio) \
container_of((_bio), struct bch_read_bio, bio)
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void bch2_latency_acct(struct bch_dev *, u64, int);
#else
static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {}
#endif
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
#define BLK_STS_REMOVED ((__force blk_status_t)128)
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_ENCODED = (1 << 3),
BCH_WRITE_PAGES_STABLE = (1 << 4),
BCH_WRITE_PAGES_OWNED = (1 << 5),
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
BCH_WRITE_NOPUT_RESERVATION = (1 << 7),
BCH_WRITE_NOMARK_REPLICAS = (1 << 8),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
{
return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
? op->journal_seq_p : &op->journal_seq;
}
static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
{
op->journal_seq_p = journal_seq;
op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
}
static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
{
return op->alloc_reserve == RESERVE_MOVINGGC
? op->c->copygc_wq
: op->c->wq;
}
int bch2_write_index_default(struct bch_write_op *);
static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
struct bch_io_opts opts)
{
op->c = c;
op->io_wq = index_update_wq(op);
op->flags = 0;
op->written = 0;
op->error = 0;
op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
op->compression_type = bch2_compression_opt_to_type[opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
op->open_buckets_nr = 0;
op->devs_have.nr = 0;
op->target = 0;
op->opts = opts;
op->pos = POS_MAX;
op->version = ZERO_VERSION;
op->write_point = (struct write_point_specifier) { 0 };
op->res = (struct disk_reservation) { 0 };
op->journal_seq = 0;
op->index_update_fn = bch2_write_index_default;
}
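/*
* Editor's illustrative sketch of issuing a write (the exact fields a caller
* fills in, and c->opts.data_replicas, are assumptions rather than taken from
* this commit):
*
*	struct bch_write_op op;
*
*	bch2_write_op_init(&op, c, opts);
*	op.pos		= POS(inum, sector);
*	op.nr_replicas	= c->opts.data_replicas;
*	op_journal_seq_set(&op, &journal_seq);
*	... point op.wbio.bio at the data to be written ...
*	closure_call(&op.cl, bch2_write, index_update_wq(&op), parent_cl);
*/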
void bch2_write(struct closure *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)
{
struct bch_write_bio *wbio = to_wbio(bio);
memset(&wbio->wbio, 0, sizeof(wbio->wbio));
return wbio;
}
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c, struct bch_devs_mask *, unsigned);
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
BCH_READ_LAST_FRAGMENT = 1 << 4,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 5,
BCH_READ_MUST_CLONE = 1 << 6,
BCH_READ_IN_RETRY = 1 << 7,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c k,
unsigned flags)
{
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
struct bch_io_opts opts)
{
struct bch_read_bio *rbio = to_rbio(bio);
rbio->_state = 0;
rbio->promote = NULL;
rbio->opts = opts;
return rbio;
}
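/*
* Editor's sketch of the read side (assumed; in particular, the u64 passed to
* bch2_read() is presumed to be the inode number being read):
*
*	struct bch_read_bio *rbio = rbio_init(bio, io_opts);
*
*	bch2_read(c, rbio, inum);
*/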
void bch2_fs_io_exit(struct bch_fs *);
int bch2_fs_io_init(struct bch_fs *);
#endif /* _BCACHEFS_IO_H */

148
fs/bcachefs/io_types.h Normal file
View File

@ -0,0 +1,148 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IO_TYPES_H
#define _BCACHEFS_IO_TYPES_H
#include "alloc_types.h"
#include "btree_types.h"
#include "buckets_types.h"
#include "extents_types.h"
#include "keylist_types.h"
#include "opts.h"
#include "super_types.h"
#include <linux/llist.h>
#include <linux/workqueue.h>
struct bch_read_bio {
struct bch_fs *c;
u64 start_time;
u64 submit_time;
/*
* Reads will often have to be split, and if the extent being read from
* was checksummed or compressed we'll also have to allocate bounce
* buffers and copy the data back into the original bio.
*
* If we didn't have to split, we have to save and restore the original
* bi_end_io - @split below indicates which:
*/
union {
struct bch_read_bio *parent;
bio_end_io_t *end_io;
};
/*
* Saved copy of bio->bi_iter, from submission time - allows us to
* resubmit on IO error, and also to copy data back to the original bio
* when we're bouncing:
*/
struct bvec_iter bvec_iter;
u16 flags;
union {
struct {
u16 bounce:1,
split:1,
kmalloc:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
retry:2,
context:2;
};
u16 _state;
};
struct bch_devs_list devs_have;
struct extent_pick_ptr pick;
/* start pos of data we read (may not be pos of data we want) */
struct bpos pos;
struct bversion version;
struct promote_op *promote;
struct bch_io_opts opts;
struct work_struct work;
struct bio bio;
};
struct bch_write_bio {
struct_group(wbio,
struct bch_fs *c;
struct bch_write_bio *parent;
u64 submit_time;
struct bch_devs_list failed;
u8 order;
u8 dev;
unsigned split:1,
bounce:1,
put_bio:1,
have_ioref:1,
used_mempool:1;
);
struct bio bio;
};
struct bch_write_op {
struct closure cl;
struct bch_fs *c;
struct workqueue_struct *io_wq;
u64 start_time;
unsigned written; /* sectors */
u16 flags;
s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
unsigned csum_type:4;
unsigned compression_type:4;
unsigned nr_replicas:4;
unsigned nr_replicas_required:4;
unsigned alloc_reserve:4;
u8 open_buckets_nr;
struct bch_devs_list devs_have;
u16 target;
u16 nonce;
struct bch_io_opts opts;
struct bpos pos;
struct bversion version;
/* For BCH_WRITE_DATA_ENCODED: */
struct bch_extent_crc_unpacked crc;
struct write_point_specifier write_point;
struct disk_reservation res;
u8 open_buckets[16];
/*
* If caller wants to flush but hasn't passed us a journal_seq ptr, we
* still need to stash the journal_seq somewhere:
*/
union {
u64 *journal_seq_p;
u64 journal_seq;
};
int (*index_update_fn)(struct bch_write_op *);
struct bch_devs_mask failed;
struct keylist insert_keys;
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
/* Must be last: */
struct bch_write_bio wbio;
};
#endif /* _BCACHEFS_IO_TYPES_H */

1140
fs/bcachefs/journal.c Normal file

File diff suppressed because it is too large Load Diff

383
fs/bcachefs/journal.h Normal file
View File

@ -0,0 +1,383 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_H
#define _BCACHEFS_JOURNAL_H
/*
* THE JOURNAL:
*
* The primary purpose of the journal is to log updates (insertions) to the
* b-tree, to avoid having to do synchronous updates to the b-tree on disk.
*
* Without the journal, the b-tree is always internally consistent on
* disk - and in fact, in the earliest incarnations bcache didn't have a journal
* but did handle unclean shutdowns by doing all index updates synchronously
* (with coalescing).
*
* Updates to interior nodes still happen synchronously and without the journal
* (for simplicity) - this may change eventually but updates to interior nodes
* are rare enough it's not a huge priority.
*
* This means the journal is relatively separate from the b-tree; it consists of
* just a list of keys and journal replay consists of just redoing those
* insertions in the same order that they appear in the journal.
*
* PERSISTENCE:
*
* For synchronous updates (where we're waiting on the index update to hit
* disk), the journal entry will be written out immediately (or as soon as
* possible, if the write for the previous journal entry was still in flight).
*
* Synchronous updates are specified by passing a closure (@flush_cl) to
* bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
* down to the journalling code. That closure will wait on the journal
* write to complete (via closure_wait()).
*
* If the index update wasn't synchronous, the journal entry will be
* written out after 10 ms have elapsed, by default (the delay_ms field
* in struct journal).
*
* JOURNAL ENTRIES:
*
* A journal entry is variable size (struct jset), it's got a fixed length
* header and then a variable number of struct jset_entry entries.
*
* Journal entries are identified by monotonically increasing 64 bit sequence
* numbers - jset->seq; other places in the code refer to this sequence number.
*
* A jset_entry entry contains one or more bkeys (which is what gets inserted
* into the b-tree). We need a container to indicate which b-tree the key is
* for; also, the roots of the various b-trees are stored in jset_entry entries
* (one for each b-tree) - this lets us add new b-tree types without changing
* the on disk format.
*
* We also keep some things in the journal header that are logically part of the
* superblock - all the things that are frequently updated. This is for future
* bcache on raw flash support; the superblock (which will become another
* journal) can't be moved or wear leveled, so it contains just enough
* information to find the main journal, and the superblock only has to be
* rewritten when we want to move/wear level the main journal.
*
* JOURNAL LAYOUT ON DISK:
*
* The journal is written to a ringbuffer of buckets (which is kept in the
* superblock); the individual buckets are not necessarily contiguous on disk
* which means that journal entries are not allowed to span buckets, but also
* that we can resize the journal at runtime if desired (unimplemented).
*
* The journal buckets exist in the same pool as all the other buckets that are
* managed by the allocator and garbage collection - garbage collection marks
* the journal buckets as metadata buckets.
*
* OPEN/DIRTY JOURNAL ENTRIES:
*
* Open/dirty journal entries are journal entries that contain b-tree updates
* that have not yet been written out to the b-tree on disk. We have to track
* which journal entries are dirty, and we also have to avoid wrapping around
* the journal and overwriting old but still dirty journal entries with new
* journal entries.
*
* On disk, this is represented with the "last_seq" field of struct jset;
* last_seq is the first sequence number that journal replay has to replay.
*
* To avoid overwriting dirty journal entries on disk, we keep a mapping (in
* journal_device->seq) from each journal bucket to the highest sequence number
* of any journal entry it contains. Then, by comparing that against last_seq we
* can determine whether that journal bucket contains dirty journal entries or
* not.
*
* To track which journal entries are dirty, we maintain a fifo of refcounts
* (where each entry corresponds to a specific sequence number) - when a ref
* goes to 0, that journal entry is no longer dirty.
*
* Journalling of index updates is done at the same time as the b-tree itself is
* being modified (see btree_insert_key()); when we add the key to the journal
* the pending b-tree write takes a ref on the journal entry the key was added
* to. If a pending b-tree write would need to take refs on multiple dirty
* journal entries, it only keeps the ref on the oldest one (since a newer
* journal entry will still be replayed if an older entry was dirty).
*
* JOURNAL FILLING UP:
*
* There are two ways the journal could fill up; either we could run out of
* space to write to, or we could have too many open journal entries and run out
* of room in the fifo of refcounts. Since those refcounts are decremented
* without any locking we can't safely resize that fifo, so we handle it the
* same way.
*
* If the journal fills up, we start flushing dirty btree nodes until we can
* allocate space for a journal write again - preferentially flushing btree
* nodes that are pinning the oldest journal entries first.
*/
#include <linux/hash.h>
#include "journal_types.h"
struct bch_fs;
static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
}
static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
return j->buf + !j->reservations.idx;
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
{
return j->pin.front;
}
static inline u64 journal_cur_seq(struct journal *j)
{
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
u64 bch2_inode_journal_seq(struct journal *, u64);
static inline int journal_state_count(union journal_res_state s, int idx)
{
return idx == 0 ? s.buf0_count : s.buf1_count;
}
static inline void journal_state_inc(union journal_res_state *s)
{
s->buf0_count += s->idx == 0;
s->buf1_count += s->idx == 1;
}
static inline void bch2_journal_set_has_inode(struct journal *j,
struct journal_res *res,
u64 inum)
{
struct journal_buf *buf = &j->buf[res->idx];
unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8));
/* avoid atomic op if possible */
if (unlikely(!test_bit(bit, buf->has_inode)))
set_bit(bit, buf->has_inode);
}
/*
* Amount of space that will be taken up by some keys in the journal (i.e.
* including the jset header)
*/
static inline unsigned jset_u64s(unsigned u64s)
{
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
struct jset *jset = buf->data;
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
return entry;
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
unsigned type, enum btree_id id,
unsigned level,
const void *data, unsigned u64s)
{
struct journal_buf *buf = &j->buf[res->idx];
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
res->offset += actual;
res->u64s -= actual;
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->level = level;
entry->type = type;
entry->pad[0] = 0;
entry->pad[1] = 0;
entry->pad[2] = 0;
memcpy_u64s(entry->_data, data, u64s);
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
enum btree_id id, const struct bkey_i *k)
{
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
id, 0, k, k->k.u64s);
}
void bch2_journal_buf_put_slowpath(struct journal *, bool);
static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
bool need_write_just_set)
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
}).v, &j->reservations.counter);
EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
/*
* Do not initiate a journal write if the journal is in an error state
* (previous journal entry write may have failed)
*/
if (s.idx != idx &&
!journal_state_count(s, idx) &&
s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
bch2_journal_buf_put_slowpath(j, need_write_just_set);
}
/*
* This function releases the journal write structure so other threads can
* then proceed to add their keys as well.
*/
static inline void bch2_journal_res_put(struct journal *j,
struct journal_res *res)
{
if (!res->ref)
return;
lock_release(&j->res_map, _RET_IP_);
while (res->u64s)
bch2_journal_add_entry(j, res,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
res->ref = 0;
}
int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
unsigned, unsigned);
static inline int journal_res_get_fast(struct journal *j,
struct journal_res *res,
unsigned u64s_min,
unsigned u64s_max)
{
union journal_res_state old, new;
u64 v = atomic64_read(&j->reservations.counter);
do {
old.v = new.v = v;
/*
* Check if there is still room in the current journal
* entry:
*/
if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
return 0;
res->offset = old.cur_entry_offset;
res->u64s = min(u64s_max, j->cur_entry_u64s -
old.cur_entry_offset);
journal_state_inc(&new);
new.cur_entry_offset += res->u64s;
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
res->ref = true;
res->idx = new.idx;
res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
return 1;
}
static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
unsigned u64s_min, unsigned u64s_max)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(u64s_max < u64s_min);
EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
if (journal_res_get_fast(j, res, u64s_min, u64s_max))
goto out;
ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
if (ret)
return ret;
out:
lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_);
EBUG_ON(!res->ref);
return 0;
}
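/*
* Editor's illustrative sketch (not part of the original commit) of the
* reserve/add/release pattern these helpers implement, for a single key @k
* destined for btree @id:
*
*	struct journal_res res = { 0 };
*	unsigned u64s = jset_u64s(k->k.u64s);
*	int ret = bch2_journal_res_get(j, &res, u64s, u64s);
*
*	if (ret)
*		return ret;
*
*	bch2_journal_add_keys(j, &res, id, k);
*	bch2_journal_res_put(j, &res);
*/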
u64 bch2_journal_last_unwritten_seq(struct journal *);
int bch2_journal_open_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *);
void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
void bch2_journal_halt(struct journal *);
static inline int bch2_journal_error(struct journal *j)
{
return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL
? -EIO : 0;
}
struct bch_dev;
static inline bool journal_flushes_device(struct bch_dev *ca)
{
return true;
}
int bch2_journal_mark(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
set_bit(JOURNAL_REPLAY_DONE, &j->flags);
}
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
unsigned nr);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_fs_journal_start(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);
int bch2_fs_journal_init(struct journal *);
#endif /* _BCACHEFS_JOURNAL_H */

1392
fs/bcachefs/journal_io.c Normal file

File diff suppressed because it is too large Load Diff

44
fs/bcachefs/journal_io.h Normal file
View File

@ -0,0 +1,44 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
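/*
* Editor's usage sketch (assumed): walking every key in the journal entries
* that bch2_journal_read() collected into a list; the bkey_i type of @k/@_n is
* an assumption:
*
*	struct journal_replay *i;
*	struct jset_entry *entry;
*	struct bkey_i *k, *_n;
*
*	list_for_each_entry(i, &journal_entries, list)
*		for_each_jset_key(k, _n, entry, &i->j)
*			pr_debug("key for btree %u", entry->btree_id);
*/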
int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */

402
fs/bcachefs/journal_reclaim.c Normal file
View File

@ -0,0 +1,402 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
static inline u64 journal_pin_seq(struct journal *j,
struct journal_entry_pin_list *pin_list)
{
return fifo_entry_idx_abs(&j->pin, pin_list);
}
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
{
u64 ret = 0;
spin_lock(&j->lock);
if (journal_pin_active(pin))
ret = journal_pin_seq(j, pin->pin_list);
spin_unlock(&j->lock);
return ret;
}
static inline void __journal_pin_add(struct journal *j,
struct journal_entry_pin_list *pin_list,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
BUG_ON(journal_pin_active(pin));
BUG_ON(!atomic_read(&pin_list->count));
atomic_inc(&pin_list->count);
pin->pin_list = pin_list;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
spin_unlock(&j->lock);
}
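/*
* Editor's illustrative sketch (my_flush/my_pin are hypothetical): a subsystem
* with dirty state referencing journal sequence number @seq keeps that entry
* pinned until its flush callback has written the state back:
*
*	static void my_flush(struct journal *j,
*			struct journal_entry_pin *pin, u64 seq)
*	{
*		... write out whatever was pinning @seq, then ...
*		bch2_journal_pin_drop(j, pin);
*	}
*
*	bch2_journal_pin_add(j, seq, &my_pin, my_flush);
*/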
static inline void __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
struct journal_entry_pin_list *pin_list = pin->pin_list;
if (!journal_pin_active(pin))
return;
pin->pin_list = NULL;
list_del_init(&pin->list);
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
if (atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin))
bch2_journal_reclaim_fast(j);
}
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
spin_lock(&j->lock);
__journal_pin_drop(j, pin);
spin_unlock(&j->lock);
}
void bch2_journal_pin_add_if_older(struct journal *j,
struct journal_entry_pin *src_pin,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
if (journal_pin_active(src_pin) &&
(!journal_pin_active(pin) ||
journal_pin_seq(j, src_pin->pin_list) <
journal_pin_seq(j, pin->pin_list))) {
__journal_pin_drop(j, pin);
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock(&j->lock);
}
/*
* Journal reclaim: flush references to open journal entries to reclaim space in
* the journal
*
* May be done by the journal code in the background as needed to free up space
* for more journal entries, or as part of doing a clean shutdown, or to migrate
* data off of a specific device:
*/
/**
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
*
* Called from IO submission context, does not block. Cleans up after btree
* write completions by advancing the journal pin and each cache's last_idx,
* kicking off discards and background reclaim as necessary.
*/
void bch2_journal_reclaim_fast(struct journal *j)
{
struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
if (popped)
journal_wake(j);
}
static struct journal_entry_pin *
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret;
u64 iter;
/* no need to iterate over empty fifo entries: */
bch2_journal_reclaim_fast(j);
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
if (iter > seq_to_flush)
break;
ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list);
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_move(&ret->list, &pin_list->flushed);
*seq = iter;
return ret;
}
}
return NULL;
}
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin *ret;
spin_lock(&j->lock);
ret = __journal_get_next_pin(j, seq_to_flush, seq);
spin_unlock(&j->lock);
return ret;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
(ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
spin_unlock(&j->lock);
return ret;
}
/**
* bch2_journal_reclaim_work - free up journal buckets
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
*
* High watermarks for triggering background reclaim:
* - FIFO has fewer than 512 entries left
* - fewer than 25% journal buckets free
*
* Background reclaim runs until low watermarks are reached:
* - FIFO has more than 1024 entries left
* - more than 50% journal buckets free
*
* As long as a reclaim can complete in the time it takes to fill up
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
void bch2_journal_reclaim_work(struct work_struct *work)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
/*
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (should_discard_bucket(j, ja)) {
if (!reclaim_lock_held) {
/*
* ugh:
* might be called from __journal_res_get()
* under wait_event() - have to go back to
* TASK_RUNNING before doing something that
* would block, but only if we're doing work:
*/
__set_current_state(TASK_RUNNING);
mutex_lock(&j->reclaim_lock);
reclaim_lock_held = true;
/* recheck under reclaim_lock: */
continue;
}
if (ca->mi.discard &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
journal_wake(j);
}
/*
* Write out enough btree nodes to free up 50% journal
* buckets
*/
spin_lock(&j->lock);
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
}
if (reclaim_lock_held)
mutex_unlock(&j->reclaim_lock);
/* Also flush if the pin fifo is more than half full */
spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
need_flush = time_after(jiffies, next_flush);
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
: seq_to_flush, &seq))) {
__set_current_state(TASK_RUNNING);
pin->flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
}
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
struct journal_entry_pin **pin,
u64 *pin_seq)
{
int ret;
*pin = NULL;
ret = bch2_journal_error(j);
if (ret)
return ret;
spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
(fifo_used(&j->pin) == 1 &&
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
spin_unlock(&j->lock);
return ret;
}
void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct journal_entry_pin *pin;
u64 pin_seq;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return;
while (1) {
wait_event(j->wait, journal_flush_done(j, seq_to_flush,
&pin, &pin_seq));
if (!pin)
break;
pin->flush(j, pin, pin_seq);
}
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (dev_idx >= 0
? bch2_dev_list_has_dev(p->devs, dev_idx)
: p->devs.nr < c->opts.metadata_replicas)
seq = iter;
spin_unlock(&j->lock);
bch2_journal_flush_pins(j, seq);
ret = bch2_journal_error(j);
if (ret)
return ret;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
seq = max(seq, journal_last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}

42
fs/bcachefs/journal_reclaim.h Normal file
View File

@ -0,0 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
BUG_ON(seq < j->pin.front || seq >= j->pin.back);
return &j->pin.data[seq & j->pin.mask];
}
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
void bch2_journal_flush_pins(struct journal *, u64);
static inline void bch2_journal_flush_all_pins(struct journal *j)
{
bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *, int);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */

360
fs/bcachefs/journal_seq_blacklist.c Normal file
View File

@ -0,0 +1,360 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
/*
* journal_seq_blacklist machinery:
*
* To guarantee order of btree updates after a crash, we need to detect when a
* btree node entry (bset) is newer than the newest journal entry that was
* successfully written, and ignore it - effectively ignoring any btree updates
* that didn't make it into the journal.
*
* If we didn't do this, we might have two btree nodes, a and b, both with
* updates that weren't written to the journal yet: if b was updated after a,
* but b was flushed and not a - oops; on recovery we'll find that the updates
* to b happened, but not the updates to a that happened before it.
*
* Ignoring bsets that are newer than the newest journal entry is always safe,
* because everything they contain will also have been journalled - and must
* still be present in the journal on disk until a journal entry has been
* written _after_ that bset was written.
*
* To accomplish this, bsets record the newest journal sequence number they
* contain updates for; then, on startup, the btree code queries the journal
* code to ask "Is this sequence number newer than the newest journal entry? If
* so, ignore it."
*
* When this happens, we must blacklist that journal sequence number: the
* journal must not write any entries with that sequence number, and it must
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
*
* Blacklisted journal sequence numbers are themselves recorded as entries in
* the journal.
*/
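/*
* Editor's sketch of the check described above (the bset field name is an
* assumption): when reading a btree node, each bset's embedded journal
* sequence number is passed to bch2_journal_seq_should_ignore(); a positive
* return means that sequence number is blacklisted and the bset's keys are
* dropped:
*
*	ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
*	if (ret > 0)
*		... ignore this bset's keys ...
*/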
/*
* Called when journal needs to evict a blacklist entry to reclaim space: find
* any btree nodes that refer to the blacklisted journal sequence numbers, and
* rewrite them:
*/
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
struct blacklisted_node n;
struct closure cl;
unsigned i;
int ret;
closure_init_stack(&cl);
for (i = 0;; i++) {
struct btree_iter iter;
struct btree *b;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos,
0, 0, BTREE_ITER_NODES);
b = bch2_btree_iter_peek_node(&iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
if (ret) {
bch2_btree_iter_unlock(&iter);
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
bch2_btree_iter_unlock(&iter);
}
for (i = 0;; i++) {
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
redo_wait:
mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
* being freed? If so, wait for that to finish:
*/
for_each_pending_btree_node_free(c, as, d)
if (n.seq == d->seq &&
n.btree_id == d->btree_id &&
!d->level &&
!bkey_cmp(n.pos, d->key.k.p)) {
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
goto redo_wait;
}
mutex_unlock(&c->btree_interior_update_lock);
}
mutex_lock(&j->blacklist_lock);
bch2_journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
}
/*
* Determine if a particular sequence number is blacklisted - if so, return
* blacklist entry:
*/
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
list_for_each_entry(bl, &j->seq_blacklist, list)
if (seq >= bl->start && seq <= bl->end)
return bl;
return NULL;
}
/*
* Allocate a new, in memory blacklist entry:
*/
static struct journal_seq_blacklist *
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
/*
* When we start the journal, bch2_journal_start() will skip over the
* blacklisted sequence numbers @start through @end:
*/
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return NULL;
bl->start = start;
bl->end = end;
list_add_tail(&bl->list, &j->seq_blacklist);
return bl;
}
/*
* Returns true if @seq is newer than the most recent journal entry that got
* written, and data corresponding to @seq should be ignored - also marks @seq
* as blacklisted so that on future restarts the corresponding data will still
* be ignored:
*/
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl = NULL;
struct blacklisted_node *n;
u64 journal_seq;
int ret = 0;
if (!seq)
return 0;
spin_lock(&j->lock);
journal_seq = journal_cur_seq(j);
spin_unlock(&j->lock);
/* Interior node updates aren't journalled: */
BUG_ON(b->level);
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
/*
* Decrease this back to j->seq + 2 when we next rev the on-disk format:
* it was increased temporarily to work around a bug in old kernels
*/
fsck_err_on(seq > journal_seq + 4, c,
"bset journal seq too far in the future: %llu > %llu",
seq, journal_seq);
if (seq <= journal_seq &&
list_empty_careful(&j->seq_blacklist))
return 0;
mutex_lock(&j->blacklist_lock);
if (seq <= journal_seq) {
bl = bch2_journal_seq_blacklist_find(j, seq);
if (!bl)
goto out;
} else {
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
if (!j->new_blacklist) {
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
journal_seq + 1,
journal_seq + 1);
if (!j->new_blacklist) {
ret = -ENOMEM;
goto out;
}
}
bl = j->new_blacklist;
bl->end = max(bl->end, seq);
}
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
if (b->data->keys.seq == n->seq &&
b->btree_id == n->btree_id &&
!bkey_cmp(b->key.k.p, n->pos))
goto found_entry;
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max_t(size_t, bl->nr_entries * 2, 8) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
}
bl->entries = n;
}
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
.seq = b->data->keys.seq,
.btree_id = b->btree_id,
.pos = b->key.k.p,
};
found_entry:
ret = 1;
out:
fsck_err:
mutex_unlock(&j->blacklist_lock);
return ret;
}
static int __bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i,
u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
start, end);
bl = bch2_journal_seq_blacklisted_new(j, start, end);
if (!bl)
return -ENOMEM;
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
journal_seq_blacklist_flush);
return 0;
}
/*
* After reading the journal, find existing journal seq blacklist entries and
* read them into memory:
*/
int bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i)
{
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(&i->j, entry) {
switch (entry->type) {
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->seq),
le64_to_cpu(bl_entry->seq));
break;
}
case BCH_JSET_ENTRY_blacklist_v2: {
struct jset_entry_blacklist_v2 *bl_entry =
container_of(entry, struct jset_entry_blacklist_v2, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->start),
le64_to_cpu(bl_entry->end));
break;
}
}
if (ret)
break;
}
return ret;
}
/*
* After reading the journal and walking the btree, we might have new journal
* sequence numbers to blacklist - add entries to the next journal entry to be
* written:
*/
void bch2_journal_seq_blacklist_write(struct journal *j)
{
struct journal_seq_blacklist *bl = j->new_blacklist;
struct jset_entry_blacklist_v2 *bl_entry;
struct jset_entry *entry;
if (!bl)
return;
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
bl_entry->start = cpu_to_le64(bl->start);
bl_entry->end = cpu_to_le64(bl->end);
bch2_journal_pin_add(j,
journal_cur_seq(j),
&bl->pin,
journal_seq_blacklist_flush);
j->new_blacklist = NULL;
}

14
fs/bcachefs/journal_seq_blacklist.h Normal file
View File

@ -0,0 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
struct journal_replay;
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */

242
fs/bcachefs/journal_types.h Normal file
View File

@ -0,0 +1,242 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_TYPES_H
#define _BCACHEFS_JOURNAL_TYPES_H
#include <linux/cache.h>
#include <linux/workqueue.h>
#include "alloc_types.h"
#include "super_types.h"
#include "fifo.h"
struct journal_res;
/*
* We put two of these in struct journal; we use them for writes to the
* journal that are being staged or in flight.
*/
struct journal_buf {
struct jset *data;
BKEY_PADDED(key);
struct closure_waitlist wait;
unsigned size;
unsigned disk_sectors;
/* bloom filter: */
unsigned long has_inode[1024 / sizeof(unsigned long)];
};
/*
* Something that makes a journal entry dirty - i.e. a btree node that has to be
* flushed:
*/
struct journal_entry_pin_list {
struct list_head list;
struct list_head flushed;
atomic_t count;
struct bch_devs_list devs;
};
struct journal;
struct journal_entry_pin;
typedef void (*journal_pin_flush_fn)(struct journal *j,
struct journal_entry_pin *, u64);
struct journal_entry_pin {
struct list_head list;
journal_pin_flush_fn flush;
struct journal_entry_pin_list *pin_list;
};
/* corresponds to a btree node with a blacklisted bset: */
struct blacklisted_node {
__le64 seq;
enum btree_id btree_id;
struct bpos pos;
};
struct journal_seq_blacklist {
struct list_head list;
u64 start;
u64 end;
struct journal_entry_pin pin;
struct blacklisted_node *entries;
size_t nr_entries;
};
struct journal_res {
bool ref;
u8 idx;
u16 u64s;
u32 offset;
u64 seq;
};
union journal_res_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 cur_entry_offset:20,
idx:1,
prev_buf_unwritten:1,
buf0_count:21,
buf1_count:21;
};
};
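/*
* Note: the bitfields above sum to 20 + 1 + 1 + 21 + 21 = 64 bits, so the
* whole reservation state fits in a single 64-bit word and can be read and
* updated atomically via the counter/v members of the union.
*/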
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
/*
* We stash some journal state as sentinel values in cur_entry_offset:
* note - cur_entry_offset is in units of u64s
*/
#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
/*
* JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
* either because something's waiting on the write to complete or because it's
* been dirty too long and the timer's expired.
*/
enum {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_NEED_WRITE,
JOURNAL_NOT_EMPTY,
};
/* Embedded in struct bch_fs */
struct journal {
/* Fastpath stuff up front: */
unsigned long flags;
union journal_res_state reservations;
unsigned cur_entry_u64s;
unsigned prev_buf_sectors;
unsigned cur_buf_sectors;
unsigned buf_size_want;
/*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
struct journal_buf buf[2];
spinlock_t lock;
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
struct closure io;
struct delayed_work write_work;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
/* last_seq from the most recent journal entry written */
u64 last_seq_ondisk;
/*
* FIFO of journal entries whose btree updates have not yet been
* written out.
*
* Each entry is a reference count. The position in the FIFO is the
* entry's sequence number relative to @seq.
*
* The journal entry itself holds a reference count, put when the
* journal entry is written out. Each btree node modified by the journal
* entry also holds a reference count, put when the btree node is
* written.
*
* When a reference count reaches zero, the journal entry is no longer
* needed. When all journal entries in the oldest journal bucket are no
* longer needed, the bucket can be discarded and reused.
*/
struct {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;
spinlock_t err_lock;
struct delayed_work reclaim_work;
unsigned long last_flushed;
/* protects advancing ja->last_idx: */
struct mutex reclaim_lock;
unsigned write_delay_ms;
unsigned reclaim_delay_ms;
u64 res_get_blocked_start;
u64 need_write_time;
u64 write_start_time;
struct bch2_time_stats *write_time;
struct bch2_time_stats *delay_time;
struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map res_map;
#endif
};
/*
* Embedded in struct bch_dev. First three fields refer to the array of journal
* buckets, in bch_sb.
*/
struct journal_device {
/*
* For each journal bucket, contains the max sequence number of the
* journal writes it contains - so we know when a bucket can be reused.
*/
u64 *bucket_seq;
unsigned sectors_free;
/* Journal bucket we're currently writing to */
unsigned cur_idx;
/* Last journal bucket that still contains an open journal entry */
/*
* j->lock and j->reclaim_lock must both be held to modify, j->lock
* sufficient to read:
*/
unsigned last_idx;
unsigned nr;
u64 *buckets;
/* Bio for journal reads/writes to this device */
struct bio *bio;
/* for bch2_journal_read_device */
struct closure read;
};
#endif /* _BCACHEFS_JOURNAL_TYPES_H */

67
fs/bcachefs/keylist.c Normal file
View File

@ -0,0 +1,67 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "keylist.h"
int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
size_t nr_inline_u64s, size_t new_u64s)
{
size_t oldsize = bch_keylist_u64s(l);
size_t newsize = oldsize + new_u64s;
u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
u64 *new_keys;
newsize = roundup_pow_of_two(newsize);
if (newsize <= nr_inline_u64s ||
(old_buf && roundup_pow_of_two(oldsize) == newsize))
return 0;
new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
if (!new_keys)
return -ENOMEM;
if (!old_buf)
memcpy_u64s(new_keys, inline_u64s, oldsize);
l->keys_p = new_keys;
l->top_p = new_keys + oldsize;
return 0;
}
void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
{
struct bkey_i *where;
for_each_keylist_key(l, where)
if (bkey_cmp(insert->k.p, where->k.p) < 0)
break;
memmove_u64s_up((u64 *) where + insert->k.u64s,
where,
((u64 *) l->top) - ((u64 *) where));
l->top_p += insert->k.u64s;
bkey_copy(where, insert);
}
void bch2_keylist_pop_front(struct keylist *l)
{
l->top_p -= bch2_keylist_front(l)->k.u64s;
memmove_u64s_down(l->keys,
bkey_next(l->keys),
bch_keylist_u64s(l));
}
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
struct bkey_i *k;
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
}
#endif

76
fs/bcachefs/keylist.h Normal file
View File

@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_H
#define _BCACHEFS_KEYLIST_H
#include "keylist_types.h"
int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
void bch2_keylist_pop_front(struct keylist *);
static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
{
l->top_p = l->keys_p = inline_keys;
}
static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
{
if (l->keys_p != inline_keys)
kfree(l->keys_p);
bch2_keylist_init(l, inline_keys);
}
static inline void bch2_keylist_push(struct keylist *l)
{
l->top = bkey_next(l->top);
}
static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
{
bkey_copy(l->top, k);
bch2_keylist_push(l);
}
static inline bool bch2_keylist_empty(struct keylist *l)
{
return l->top == l->keys;
}
static inline size_t bch_keylist_u64s(struct keylist *l)
{
return l->top_p - l->keys_p;
}
static inline size_t bch2_keylist_bytes(struct keylist *l)
{
return bch_keylist_u64s(l) * sizeof(u64);
}
static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
{
return l->keys;
}
#define for_each_keylist_key(_keylist, _k) \
for (_k = (_keylist)->keys; \
_k != (_keylist)->top; \
_k = bkey_next(_k))
static inline u64 keylist_sectors(struct keylist *keys)
{
struct bkey_i *k;
u64 ret = 0;
for_each_keylist_key(keys, k)
ret += k->k.size;
return ret;
}
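/*
* Usage sketch, added for illustration (not part of the original header); the
* array size, the key being added and process() are arbitrary placeholders:
*
*	u64 inline_keys[32];
*	struct keylist keys;
*
*	bch2_keylist_init(&keys, inline_keys);
*	bch2_keylist_add(&keys, &insert->k_i);	// copies the key onto the end
*
*	while (!bch2_keylist_empty(&keys)) {
*		process(bch2_keylist_front(&keys));
*		bch2_keylist_pop_front(&keys);
*	}
*
*	bch2_keylist_free(&keys, inline_keys);
*/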
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif
#endif /* _BCACHEFS_KEYLIST_H */

16
fs/bcachefs/keylist_types.h Normal file
View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_KEYLIST_TYPES_H
#define _BCACHEFS_KEYLIST_TYPES_H
struct keylist {
union {
struct bkey_i *keys;
u64 *keys_p;
};
union {
struct bkey_i *top;
u64 *top_p;
};
};
#endif /* _BCACHEFS_KEYLIST_TYPES_H */

178
fs/bcachefs/migrate.c Normal file
View File

@ -0,0 +1,178 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Code for moving data off a device.
*/
#include "bcachefs.h"
#include "btree_update.h"
#include "buckets.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "keylist.h"
#include "migrate.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
unsigned nr_good;
bch2_extent_drop_device(e, dev_idx);
nr_good = bch2_extent_durability(c, e.c);
if ((!nr_good && !(flags & lost)) ||
(nr_good < replicas && !(flags & degraded)))
return -EINVAL;
return 0;
}
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct bkey_s_c k;
struct bkey_s_extent e;
BKEY_PADDED(key) tmp;
struct btree_iter iter;
int ret = 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
bch2_btree_iter_next(&iter);
continue;
}
bkey_reassemble(&tmp.key, k);
e = bkey_i_to_s_extent(&tmp.key);
ret = drop_dev_ptrs(c, e, dev_idx, flags, false);
if (ret)
break;
/*
* If the new extent no longer has any pointers, bch2_extent_normalize()
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_ERROR key, or simply discarding it if it was a cached extent):
*/
bch2_extent_normalize(c, e.s);
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
bkey_i_to_s_c(&tmp.key));
if (ret)
break;
iter.pos = bkey_start_pos(&tmp.key.k);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &tmp.key));
/*
* don't want to leave ret == -EINTR, since if we raced and
* something else overwrote the key we could spuriously return
* -EINTR below:
*/
if (ret == -EINTR)
ret = 0;
if (ret)
break;
}
bch2_btree_iter_unlock(&iter);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_iter iter;
struct closure cl;
struct btree *b;
unsigned id;
int ret;
/* don't handle this yet: */
if (flags & BCH_FORCE_IF_METADATA_LOST)
return -EINVAL;
closure_init_stack(&cl);
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
retry:
if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
dev_idx)) {
/*
* we might have found a btree node key we needed
* to update and tried to update it, only to get
* -EINTR after upgrading the iter; we then raced
* and the node is now gone:
*/
bch2_btree_iter_downgrade(&iter);
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
bkey_i_to_s_c(&b->key));
if (ret)
goto err;
} else {
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
ret = drop_dev_ptrs(c, extent_i_to_s(new_key),
dev_idx, flags, true);
if (ret)
goto err;
ret = bch2_btree_node_update_key(c, &iter, b, new_key);
if (ret == -EINTR) {
b = bch2_btree_iter_peek_node(&iter);
goto retry;
}
if (ret)
goto err;
}
}
bch2_btree_iter_unlock(&iter);
}
ret = 0;
out:
ret = bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
err:
bch2_btree_iter_unlock(&iter);
goto out;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
bch2_dev_metadata_drop(c, dev_idx, flags);
}

7
fs/bcachefs/migrate.h Normal file
View File

@ -0,0 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */

761
fs/bcachefs/move.c Normal file
View File

@ -0,0 +1,761 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
#include <linux/ioprio.h>
#include <linux/kthread.h>
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
struct moving_io {
struct list_head list;
struct closure cl;
bool read_completed;
unsigned read_sectors;
unsigned write_sectors;
struct bch_read_bio rbio;
struct migrate_write write;
/* Must be last since it is variable size */
struct bio_vec bi_inline_vecs[0];
};
struct moving_context {
/* Closure for waiting on all reads and writes to complete */
struct closure cl;
struct bch_move_stats *stats;
struct list_head reads;
/* in flight sectors: */
atomic_t read_sectors;
atomic_t write_sectors;
wait_queue_head_t wait;
};
static int bch2_migrate_index_update(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct migrate_write *m =
container_of(op, struct migrate_write, op);
struct keylist *keys = &op->insert_keys;
struct btree_iter iter;
int ret = 0;
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
bkey_start_pos(&bch2_keylist_front(keys)->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
while (1) {
struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
struct bkey_i_extent *insert, *new =
bkey_i_to_extent(bch2_keylist_front(keys));
BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
bool did_work = false;
int nr;
if (btree_iter_err(k)) {
ret = bch2_btree_iter_unlock(&iter);
break;
}
if (bversion_cmp(k.k->version, new->k.version) ||
!bkey_extent_is_data(k.k) ||
!bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k),
m->ptr, m->offset))
goto nomatch;
if (m->data_cmd == DATA_REWRITE &&
!bch2_extent_has_device(bkey_s_c_to_extent(k),
m->data_opts.rewrite_dev))
goto nomatch;
bkey_reassemble(&_insert.k, k);
insert = bkey_i_to_extent(&_insert.k);
bkey_copy(&_new.k, bch2_keylist_front(keys));
new = bkey_i_to_extent(&_new.k);
bch2_cut_front(iter.pos, &insert->k_i);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
if (m->data_cmd == DATA_REWRITE) {
ptr = (struct bch_extent_ptr *)
bch2_extent_has_device(extent_i_to_s_c(insert),
m->data_opts.rewrite_dev);
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
}
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
/*
* raced with another move op? extent already
* has a pointer to the device we just wrote
* data to
*/
continue;
}
bch2_extent_crc_append(insert, crc);
extent_ptr_append(insert, *ptr);
did_work = true;
}
if (!did_work)
goto nomatch;
bch2_extent_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
op->opts.background_target,
op->opts.data_replicas);
/*
* It's possible we race, and for whatever reason the extent now
* has fewer replicas than when we last looked at it - meaning
* we need to get a disk reservation here:
*/
nr = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
(bch2_extent_nr_dirty_ptrs(k) + m->nr_ptrs_reserved);
if (nr > 0) {
/*
* can't call bch2_disk_reservation_add() with btree
* locks held, at least not without a song and dance
*/
bch2_btree_iter_unlock(&iter);
ret = bch2_disk_reservation_add(c, &op->res,
keylist_sectors(keys) * nr, 0);
if (ret)
goto out;
m->nr_ptrs_reserved += nr;
goto next;
}
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
extent_i_to_s_c(insert).s_c);
if (ret)
break;
ret = bch2_btree_insert_at(c, &op->res,
NULL, op_journal_seq(op),
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|
m->data_opts.btree_insert_flags,
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
if (ret == -EINTR)
ret = 0;
if (ret)
break;
next:
while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) {
bch2_keylist_pop_front(keys);
if (bch2_keylist_empty(keys))
goto out;
}
bch2_cut_front(iter.pos, bch2_keylist_front(keys));
continue;
nomatch:
if (m->ctxt)
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->ctxt->stats->sectors_raced);
atomic_long_inc(&c->extent_migrate_raced);
trace_move_race(&new->k);
bch2_btree_iter_next_slot(&iter);
goto next;
}
out:
bch2_btree_iter_unlock(&iter);
return ret;
}
void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
m->ptr = rbio->pick.ptr;
m->offset = rbio->pos.offset - rbio->pick.crc.offset;
m->op.devs_have = rbio->devs_have;
m->op.pos = rbio->pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
m->op.csum_type = m->op.crc.csum_type;
}
if (m->data_cmd == DATA_REWRITE)
bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
}
int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
enum data_cmd data_cmd,
struct data_opts data_opts,
struct bkey_s_c k)
{
int ret;
m->data_cmd = data_cmd;
m->data_opts = data_opts;
m->nr_ptrs_reserved = 0;
bch2_write_op_init(&m->op, c, io_opts);
m->op.compression_type =
bch2_compression_opt_to_type[io_opts.background_compression ?:
io_opts.compression];
m->op.target = data_opts.target;
m->op.write_point = wp;
if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
BCH_WRITE_PAGES_STABLE|
BCH_WRITE_PAGES_OWNED|
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_NOMARK_REPLICAS;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
switch (data_cmd) {
case DATA_ADD_REPLICAS: {
int nr = (int) io_opts.data_replicas -
bch2_extent_nr_dirty_ptrs(k);
if (nr > 0) {
m->op.nr_replicas = m->nr_ptrs_reserved = nr;
ret = bch2_disk_reservation_get(c, &m->op.res,
k.k->size, m->op.nr_replicas, 0);
if (ret)
return ret;
}
break;
}
case DATA_REWRITE:
break;
case DATA_PROMOTE:
m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
m->op.flags |= BCH_WRITE_CACHED;
break;
default:
BUG();
}
return 0;
}
static void move_free(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->write.ctxt;
struct bvec_iter_all iter;
struct bio_vec *bv;
bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
if (bv->bv_page)
__free_page(bv->bv_page);
wake_up(&ctxt->wait);
kfree(io);
}
static void move_write_done(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
closure_return_with_destructor(cl, move_free);
}
static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
closure_return_with_destructor(cl, move_free);
return;
}
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
continue_at(cl, move_write_done, NULL);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
{
struct moving_io *io =
list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
return io && io->read_completed ? io : NULL;
}
static void move_read_endio(struct bio *bio)
{
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
if (next_pending_write(ctxt))
wake_up(&ctxt->wait);
closure_put(&ctxt->cl);
}
static void do_pending_writes(struct moving_context *ctxt)
{
struct moving_io *io;
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
closure_call(&io->cl, move_write, NULL, &ctxt->cl);
}
}
#define move_ctxt_wait_event(_ctxt, _cond) \
do { \
do_pending_writes(_ctxt); \
\
if (_cond) \
break; \
__wait_event((_ctxt)->wait, \
next_pending_write(_ctxt) || (_cond)); \
} while (1)
static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
{
unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
move_ctxt_wait_event(ctxt,
!atomic_read(&ctxt->write_sectors) ||
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
static int bch2_move_extent(struct bch_fs *c,
struct moving_context *ctxt,
struct write_point_specifier wp,
struct bch_io_opts io_opts,
struct bkey_s_c_extent e,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
unsigned sectors = e.k->size, pages;
int ret = -ENOMEM;
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->write_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->read_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
extent_for_each_ptr_crc(e, ptr, crc)
sectors = max_t(unsigned, sectors, crc.uncompressed_size);
pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
io = kzalloc(sizeof(struct moving_io) +
sizeof(struct bio_vec) * pages, GFP_KERNEL);
if (!io)
goto err;
io->write.ctxt = ctxt;
io->read_sectors = e.k->size;
io->write_sectors = e.k->size;
bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0);
bio_set_prio(&io->write.op.wbio.bio,
IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
GFP_KERNEL))
goto err_free;
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
io->rbio.bio.bi_opf = REQ_OP_READ;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
io->rbio.bio.bi_end_io = move_read_endio;
ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
data_cmd, data_opts, e.s_c);
if (ret)
goto err_free_pages;
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
trace_move_extent(e.k);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
/*
* dropped by move_read_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
bch2_read_extent(c, &io->rbio, e.s_c,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
err_free_pages:
bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
trace_move_alloc_fail(e.k);
return ret;
}
int bch2_move_data(struct bch_fs *c,
struct bch_ratelimit *rate,
struct write_point_specifier wp,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
struct bch_move_stats *stats)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt = { .stats = stats };
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
struct bkey_s_c_extent e;
struct data_opts data_opts;
enum data_cmd data_cmd;
u64 cur_inum = U64_MAX;
int ret = 0, ret2;
closure_init_stack(&ctxt.cl);
INIT_LIST_HEAD(&ctxt.reads);
init_waitqueue_head(&ctxt.wait);
stats->data_type = BCH_DATA_USER;
bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
BTREE_ITER_PREFETCH);
if (rate)
bch2_ratelimit_reset(rate);
while (!kthread || !(ret = kthread_should_stop())) {
if (rate &&
bch2_ratelimit_delay(rate) &&
(bch2_btree_iter_unlock(&stats->iter),
(ret = bch2_ratelimit_wait_freezable_stoppable(rate))))
break;
peek:
k = bch2_btree_iter_peek(&stats->iter);
if (!k.k)
break;
ret = btree_iter_err(k);
if (ret)
break;
if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
break;
if (!bkey_extent_is_data(k.k))
goto next_nondata;
e = bkey_s_c_to_extent(k);
if (cur_inum != k.k->p.inode) {
struct bch_inode_unpacked inode;
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&stats->iter);
io_opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
&io_opts, &data_opts))) {
case DATA_SKIP:
goto next;
case DATA_SCRUB:
BUG();
case DATA_ADD_REPLICAS:
case DATA_REWRITE:
case DATA_PROMOTE:
break;
default:
BUG();
}
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&stats->iter);
ret2 = bch2_move_extent(c, &ctxt, wp, io_opts,
bkey_s_c_to_extent(k),
data_cmd, data_opts);
if (ret2) {
if (ret2 == -ENOMEM) {
/* memory allocation failure, wait for some IO to finish */
bch2_move_ctxt_wait_for_io(&ctxt);
continue;
}
/* XXX signal failure */
goto next;
}
if (rate)
bch2_ratelimit_increment(rate, k.k->size);
next:
atomic64_add(k.k->size * bch2_extent_nr_dirty_ptrs(k),
&stats->sectors_seen);
next_nondata:
bch2_btree_iter_next(&stats->iter);
bch2_btree_iter_cond_resched(&stats->iter);
}
bch2_btree_iter_unlock(&stats->iter);
move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
closure_sync(&ctxt.cl);
EBUG_ON(atomic_read(&ctxt.write_sectors));
trace_move_data(c,
atomic64_read(&stats->sectors_moved),
atomic64_read(&stats->keys_moved));
return ret;
}
static int bch2_gc_data_replicas(struct bch_fs *c)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH, k) {
ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
}
ret = bch2_btree_iter_unlock(&iter) ?: ret;
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
static int bch2_gc_btree_replicas(struct bch_fs *c)
{
struct btree_iter iter;
struct btree *b;
unsigned id;
int ret = 0;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
bkey_i_to_s_c(&b->key));
bch2_btree_iter_cond_resched(&iter);
}
ret = bch2_btree_iter_unlock(&iter) ?: ret;
}
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}
static int bch2_move_btree(struct bch_fs *c,
move_pred_fn pred,
void *arg,
struct bch_move_stats *stats)
{
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree *b;
unsigned id;
struct data_opts data_opts;
enum data_cmd cmd;
int ret = 0;
stats->data_type = BCH_DATA_BTREE;
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
bkey_i_to_s_c_extent(&b->key),
&io_opts,
&data_opts))) {
case DATA_SKIP:
goto next;
case DATA_SCRUB:
BUG();
case DATA_ADD_REPLICAS:
case DATA_REWRITE:
break;
default:
BUG();
}
ret = bch2_btree_node_rewrite(c, &stats->iter,
b->data->keys.seq, 0) ?: ret;
next:
bch2_btree_iter_cond_resched(&stats->iter);
}
ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
}
return ret;
}
#if 0
static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
return DATA_SCRUB;
}
#endif
static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
unsigned nr_good = bch2_extent_durability(c, e);
unsigned replicas = type == BKEY_TYPE_BTREE
? c->opts.metadata_replicas
: io_opts->data_replicas;
if (!nr_good || nr_good >= replicas)
return DATA_SKIP;
data_opts->target = 0;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
struct bch_ioctl_data *op = arg;
if (!bch2_extent_has_device(e, op->migrate.dev))
return DATA_SKIP;
data_opts->target = 0;
data_opts->btree_insert_flags = 0;
data_opts->rewrite_dev = op->migrate.dev;
return DATA_REWRITE;
}
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
int ret = 0;
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,
writepoint_hashed((unsigned long) current),
op.start,
op.end,
rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_data_replicas(c) ?: ret;
break;
case BCH_DATA_OP_MIGRATE:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL,
writepoint_hashed((unsigned long) current),
op.start,
op.end,
migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_data_replicas(c) ?: ret;
break;
default:
ret = -EINVAL;
}
return ret;
}

63
fs/bcachefs/move.h Normal file
View File

@ -0,0 +1,63 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_H
#define _BCACHEFS_MOVE_H
#include "btree_iter.h"
#include "buckets.h"
#include "io_types.h"
#include "move_types.h"
struct bch_read_bio;
struct moving_context;
enum data_cmd {
DATA_SKIP,
DATA_SCRUB,
DATA_ADD_REPLICAS,
DATA_REWRITE,
DATA_PROMOTE,
};
struct data_opts {
u16 target;
unsigned rewrite_dev;
int btree_insert_flags;
};
struct migrate_write {
enum data_cmd data_cmd;
struct data_opts data_opts;
unsigned nr_ptrs_reserved;
struct moving_context *ctxt;
/* what we read: */
struct bch_extent_ptr ptr;
u64 offset;
struct bch_write_op op;
};
void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
struct write_point_specifier,
struct bch_io_opts,
enum data_cmd, struct data_opts,
struct bkey_s_c);
typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
enum bkey_type, struct bkey_s_c_extent,
struct bch_io_opts *, struct data_opts *);
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
struct write_point_specifier,
struct bpos, struct bpos,
move_pred_fn, void *,
struct bch_move_stats *);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
#endif /* _BCACHEFS_MOVE_H */

15
fs/bcachefs/move_types.h Normal file
View File

@ -0,0 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVE_TYPES_H
#define _BCACHEFS_MOVE_TYPES_H
struct bch_move_stats {
enum bch_data_type data_type;
struct btree_iter iter;
atomic64_t keys_moved;
atomic64_t sectors_moved;
atomic64_t sectors_seen;
atomic64_t sectors_raced;
};
#endif /* _BCACHEFS_MOVE_TYPES_H */

309
fs/bcachefs/movinggc.c Normal file
View File

@ -0,0 +1,309 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Moving/copying garbage collector
*
* Copyright 2012 Google, Inc.
*/
#include "bcachefs.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
#include "keylist.h"
#include "move.h"
#include "movinggc.h"
#include "super-io.h"
#include "trace.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <linux/wait.h>
/*
* We can't use the entire copygc reserve in one iteration of copygc: we may
* need the buckets we're freeing up to go back into the copygc reserve to make
* forward progress, but if the copygc reserve is full they'll be available for
* any allocation - and it's possible that in a given iteration, we free up most
* of the buckets we're going to free before we allocate most of the buckets
* we're going to allocate.
*
* If we only use half of the reserve per iteration, then in steady state we'll
* always have room in the reserve for the buckets we're going to need in the
* next iteration:
*/
#define COPYGC_BUCKETS_PER_ITER(ca) \
((ca)->free[RESERVE_MOVINGGC].size / 2)
/*
* Max sectors to move per iteration: we have to take into account internal
* fragmentation from the multiple write points for each generation:
*/
#define COPYGC_SECTORS_PER_ITER(ca) \
((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
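/*
* Worked example with hypothetical numbers: a 128 bucket RESERVE_MOVINGGC
* freelist and 1024 sector buckets give COPYGC_BUCKETS_PER_ITER == 64 and
* COPYGC_SECTORS_PER_ITER == 65536 sectors (32M) moved per pass at most.
*/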
static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
return (l.sectors > r.sectors) - (l.sectors < r.sectors);
}
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
{
const struct copygc_heap_entry *l = _l;
const struct copygc_heap_entry *r = _r;
return (l->offset > r->offset) - (l->offset < r->offset);
}
static bool __copygc_pred(struct bch_dev *ca,
struct bkey_s_c_extent e)
{
copygc_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr =
bch2_extent_has_device(e, ca->dev_idx);
if (ptr) {
struct copygc_heap_entry search = { .offset = ptr->offset };
ssize_t i = eytzinger0_find_le(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, &search);
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].gen);
}
return false;
}
static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
struct bch_dev *ca = arg;
if (!__copygc_pred(ca, e))
return DATA_SKIP;
data_opts->target = dev_to_target(ca->dev_idx);
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = ca->dev_idx;
return DATA_REWRITE;
}
static bool have_copygc_reserve(struct bch_dev *ca)
{
bool ret;
spin_lock(&ca->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
ca->allocator_blocked;
spin_unlock(&ca->freelist_lock);
return ret;
}
static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
{
copygc_heap *h = &ca->copygc_heap;
struct copygc_heap_entry e, *i;
struct bucket_array *buckets;
struct bch_move_stats move_stats;
u64 sectors_to_move = 0, sectors_not_moved = 0;
u64 buckets_to_move, buckets_not_moved = 0;
size_t b;
int ret;
memset(&move_stats, 0, sizeof(move_stats));
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
/*
* Find buckets with lowest sector counts, skipping completely
* empty buckets, by building a maxheap sorted by sector count,
* and repeatedly replacing the maximum element until all
* buckets have been visited.
*/
h->used = 0;
/*
* We need bucket marks to be up to date - gc can't be recalculating
* them:
*/
down_read(&c->gc_lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
struct copygc_heap_entry e;
if (m.owned_by_allocator ||
m.data_type != BCH_DATA_USER ||
!bucket_sectors_used(m) ||
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
e = (struct copygc_heap_entry) {
.gen = m.gen,
.sectors = bucket_sectors_used(m),
.offset = bucket_to_sector(ca, b),
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
up_read(&ca->bucket_lock);
up_read(&c->gc_lock);
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += i->sectors;
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= e.sectors;
}
buckets_to_move = h->used;
if (!buckets_to_move)
return;
eytzinger0_sort(h->data, h->used,
sizeof(h->data[0]),
bucket_offset_cmp, NULL);
ret = bch2_move_data(c, &ca->copygc_pd.rate,
writepoint_ptr(&ca->copygc_write_point),
POS_MIN, POS_MAX,
copygc_pred, ca,
&move_stats);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (i = h->data; i < h->data + h->used; i++) {
size_t b = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
if (i->gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
}
up_read(&ca->bucket_lock);
if (sectors_not_moved && !ret)
bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved",
sectors_not_moved, sectors_to_move,
buckets_not_moved, buckets_to_move);
trace_copygc(ca,
atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
buckets_to_move, buckets_not_moved);
}
static int bch2_copygc_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
struct io_clock *clock = &c->io_clock[WRITE];
struct bch_dev_usage usage;
unsigned long last;
u64 available, fragmented, reserve, next;
set_freezable();
while (!kthread_should_stop()) {
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
last = atomic_long_read(&clock->now);
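/*
* half the gc reserve, in sectors: gc_reserve_percent is a
* percentage of usable capacity, so dividing by 200 yields
* (capacity * gc_reserve_percent / 100) / 2:
*/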
reserve = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) *
ca->mi.bucket_size *
c->opts.gc_reserve_percent, 200);
usage = bch2_dev_usage_read(c, ca);
/*
* don't start copygc until less than half the gc reserve is
* available:
*/
available = __dev_buckets_available(ca, usage) *
ca->mi.bucket_size;
if (available > reserve) {
next = last + available - reserve;
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}
/*
* don't start copygc until there's more than half the copygc
* reserve of fragmented space:
*/
fragmented = usage.sectors_fragmented;
if (fragmented < reserve) {
next = last + reserve - fragmented;
bch2_kthread_io_clock_wait(clock, next,
MAX_SCHEDULE_TIMEOUT);
continue;
}
bch2_copygc(c, ca);
}
return 0;
}
void bch2_copygc_stop(struct bch_dev *ca)
{
ca->copygc_pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&ca->copygc_pd.rate);
if (ca->copygc_thread) {
kthread_stop(ca->copygc_thread);
put_task_struct(ca->copygc_thread);
}
ca->copygc_thread = NULL;
}
int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
{
struct task_struct *t;
BUG_ON(ca->copygc_thread);
if (c->opts.nochanges)
return 0;
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, ca,
"bch_copygc[%s]", ca->name);
if (IS_ERR(t))
return PTR_ERR(t);
get_task_struct(t);
ca->copygc_thread = t;
wake_up_process(ca->copygc_thread);
return 0;
}
void bch2_dev_copygc_init(struct bch_dev *ca)
{
bch2_pd_controller_init(&ca->copygc_pd);
ca->copygc_pd.d_term = 0;
}

9
fs/bcachefs/movinggc.h Normal file
View File

@ -0,0 +1,9 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_MOVINGGC_H
#define _BCACHEFS_MOVINGGC_H
void bch2_copygc_stop(struct bch_dev *);
int bch2_copygc_start(struct bch_fs *, struct bch_dev *);
void bch2_dev_copygc_init(struct bch_dev *);
#endif /* _BCACHEFS_MOVINGGC_H */

381
fs/bcachefs/opts.c Normal file
View File

@ -0,0 +1,381 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include "bcachefs.h"
#include "disk_groups.h"
#include "opts.h"
#include "super-io.h"
#include "util.h"
const char * const bch2_error_actions[] = {
"continue",
"remount-ro",
"panic",
NULL
};
const char * const bch2_csum_types[] = {
"none",
"crc32c",
"crc64",
NULL
};
const char * const bch2_compression_types[] = {
"none",
"lz4",
"gzip",
"zstd",
NULL
};
const char * const bch2_str_hash_types[] = {
"crc32c",
"crc64",
"siphash",
NULL
};
const char * const bch2_data_types[] = {
"none",
"sb",
"journal",
"btree",
"data",
"cached",
NULL
};
const char * const bch2_cache_replacement_policies[] = {
"lru",
"fifo",
"random",
NULL
};
/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch2_cache_modes[] = {
"default",
"writethrough",
"writeback",
"writearound",
"none",
NULL
};
const char * const bch2_dev_state[] = {
"readwrite",
"readonly",
"failed",
"spare",
NULL
};
void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
{
#define BCH_OPT(_name, ...) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_OPTS()
#undef BCH_OPT
}
bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opt_defined(*opts, _name);
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
return opts->_name;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
{
switch (id) {
#define BCH_OPT(_name, ...) \
case Opt_##_name: \
opt_set(*opts, _name, v); \
break;
BCH_OPTS()
#undef BCH_OPT
default:
BUG();
}
}
/*
* Initial options from superblock - here we don't want any options undefined;
* any options the superblock doesn't specify are set to 0:
*/
struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
{
struct bch_opts opts = bch2_opts_empty();
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
if (_sb_opt != NO_SB_OPT) \
opt_set(opts, _name, _sb_opt(sb));
BCH_OPTS()
#undef BCH_OPT
return opts;
}
const struct bch_option bch2_opt_table[] = {
#define OPT_BOOL() .type = BCH_OPT_BOOL
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
#define OPT_FN(_fn) .type = BCH_OPT_FN, \
.parse = _fn##_parse, \
.print = _fn##_print
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
[Opt_##_name] = { \
.attr = { \
.name = #_name, \
.mode = _mode == OPT_RUNTIME ? 0644 : 0444, \
}, \
.mode = _mode, \
.set_sb = SET_##_sb_opt, \
_type \
},
BCH_OPTS()
#undef BCH_OPT
};
int bch2_opt_lookup(const char *name)
{
const struct bch_option *i;
for (i = bch2_opt_table;
i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
i++)
if (!strcmp(name, i->attr.name))
return i - bch2_opt_table;
return -1;
}
struct synonym {
const char *s1, *s2;
};
static const struct synonym bch_opt_synonyms[] = {
{ "quota", "usrquota" },
};
static int bch2_mount_opt_lookup(const char *name)
{
const struct synonym *i;
for (i = bch_opt_synonyms;
i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
i++)
if (!strcmp(name, i->s1))
name = i->s2;
return bch2_opt_lookup(name);
}
int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
const char *val, u64 *res)
{
ssize_t ret;
switch (opt->type) {
case BCH_OPT_BOOL:
ret = kstrtou64(val, 10, res);
if (ret < 0)
return ret;
if (*res > 1)
return -ERANGE;
break;
case BCH_OPT_UINT:
ret = kstrtou64(val, 10, res);
if (ret < 0)
return ret;
if (*res < opt->min || *res >= opt->max)
return -ERANGE;
break;
case BCH_OPT_STR:
ret = match_string(opt->choices, -1, val);
if (ret < 0)
return ret;
*res = ret;
break;
case BCH_OPT_FN:
if (!c)
return -EINVAL;
return opt->parse(c, val, res);
}
return 0;
}
int bch2_opt_to_text(struct bch_fs *c, char *buf, size_t len,
const struct bch_option *opt, u64 v,
unsigned flags)
{
char *out = buf, *end = buf + len;
if (flags & OPT_SHOW_MOUNT_STYLE) {
if (opt->type == BCH_OPT_BOOL)
return scnprintf(out, end - out, "%s%s",
v ? "" : "no",
opt->attr.name);
out += scnprintf(out, end - out, "%s=", opt->attr.name);
}
switch (opt->type) {
case BCH_OPT_BOOL:
case BCH_OPT_UINT:
out += scnprintf(out, end - out, "%lli", v);
break;
case BCH_OPT_STR:
out += (flags & OPT_SHOW_FULL_LIST)
? bch2_scnprint_string_list(out, end - out, opt->choices, v)
: scnprintf(out, end - out, "%s", opt->choices[v]);
break;
case BCH_OPT_FN:
return opt->print(c, out, end - out, v);
default:
BUG();
}
return out - buf;
}
int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
{
char *opt, *name, *val;
int ret, id;
u64 v;
while ((opt = strsep(&options, ",")) != NULL) {
name = strsep(&opt, "=");
val = opt;
if (val) {
id = bch2_mount_opt_lookup(name);
if (id < 0)
goto bad_opt;
ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v);
if (ret < 0)
goto bad_val;
} else {
id = bch2_mount_opt_lookup(name);
v = 1;
if (id < 0 &&
!strncmp("no", name, 2)) {
id = bch2_mount_opt_lookup(name + 2);
v = 0;
}
if (id < 0)
goto bad_opt;
if (bch2_opt_table[id].type != BCH_OPT_BOOL)
goto no_val;
}
if (bch2_opt_table[id].mode < OPT_MOUNT)
goto bad_opt;
if (id == Opt_acl &&
!IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
goto bad_opt;
if ((id == Opt_usrquota ||
id == Opt_grpquota) &&
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
goto bad_opt;
bch2_opt_set_by_id(opts, id, v);
}
return 0;
bad_opt:
pr_err("Bad mount option %s", name);
return -1;
bad_val:
pr_err("Invalid value %s for mount option %s", val, name);
return -1;
no_val:
pr_err("Mount option %s requires a value", name);
return -1;
}
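/*
* Example, for illustration only: given the option string
* "degraded,compression=lz4", this sets opts->degraded = 1 and
* opts->compression to the index of "lz4" in bch2_compression_types;
* "nodegraded" would instead set opts->degraded = 0.
*/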
/* io opts: */
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
{
struct bch_io_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
{
struct bch_opts ret = { 0 };
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(ret, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
return ret;
}
void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
{
#define BCH_INODE_OPT(_name, _bits) \
if (opt_defined(src, _name)) \
opt_set(*dst, _name, src._name);
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
}
bool bch2_opt_is_inode_opt(enum bch_opt_id id)
{
static const enum bch_opt_id inode_opt_list[] = {
#define BCH_INODE_OPT(_name, _bits) Opt_##_name,
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
unsigned i;
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
if (inode_opt_list[i] == id)
return true;
return false;
}

296
fs/bcachefs/opts.h Normal file
View File

@ -0,0 +1,296 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_OPTS_H
#define _BCACHEFS_OPTS_H
#include <linux/bug.h>
#include <linux/log2.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include "bcachefs_format.h"
extern const char * const bch2_error_actions[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_compression_types[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_data_types[];
extern const char * const bch2_cache_replacement_policies[];
extern const char * const bch2_cache_modes[];
extern const char * const bch2_dev_state[];
/*
* Mount options; we also store defaults in the superblock.
*
* Also exposed via sysfs: if an option is writeable, and it's also stored in
* the superblock, changing it via sysfs (currently? might change this) also
* updates the superblock.
*
* We store options as signed integers, where -1 means undefined. This means we
* can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
* apply the options from that struct that are defined.
*/
/* dummy option, for options that aren't stored in the superblock */
LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
enum opt_mode {
OPT_INTERNAL,
OPT_FORMAT,
OPT_MOUNT,
OPT_RUNTIME,
};
enum opt_type {
BCH_OPT_BOOL,
BCH_OPT_UINT,
BCH_OPT_STR,
BCH_OPT_FN,
};
/**
* BCH_OPT(name, in mem type, mode, type, sb_opt, default)
*
* @name - name of mount option, sysfs attribute, and struct bch_opts
* member
*
* @mode - when opt may be set
*
* @sb_opt - name of corresponding superblock option
*
* @type - one of OPT_BOOL, OPT_UINT, OPT_STR, OPT_FN
*/
/*
* XXX: add fields for
* - default value
* - helptext
*/
#define BCH_OPTS() \
BCH_OPT(block_size, u16, OPT_FORMAT, \
OPT_UINT(1, 128), \
BCH_SB_BLOCK_SIZE, 8) \
BCH_OPT(btree_node_size, u16, OPT_FORMAT, \
OPT_UINT(1, 128), \
BCH_SB_BTREE_NODE_SIZE, 512) \
BCH_OPT(errors, u8, OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_WANT, 1) \
BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_WANT, 1) \
BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_REQ, 1) \
BCH_OPT(data_replicas_required, u8, OPT_MOUNT, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_REQ, 1) \
BCH_OPT(metadata_checksum, u8, OPT_RUNTIME, \
OPT_STR(bch2_csum_types), \
BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
BCH_OPT(data_checksum, u8, OPT_RUNTIME, \
OPT_STR(bch2_csum_types), \
BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C) \
BCH_OPT(compression, u8, OPT_RUNTIME, \
OPT_STR(bch2_compression_types), \
BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_NONE)\
BCH_OPT(background_compression, u8, OPT_RUNTIME, \
OPT_STR(bch2_compression_types), \
BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_NONE)\
BCH_OPT(str_hash, u8, OPT_RUNTIME, \
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_SIPHASH) \
BCH_OPT(foreground_target, u16, OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_FOREGROUND_TARGET, 0) \
BCH_OPT(background_target, u16, OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_BACKGROUND_TARGET, 0) \
BCH_OPT(promote_target, u16, OPT_RUNTIME, \
OPT_FN(bch2_opt_target), \
BCH_SB_PROMOTE_TARGET, 0) \
BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_INODE_32BIT, false) \
BCH_OPT(gc_reserve_percent, u8, OPT_MOUNT, \
OPT_UINT(5, 21), \
BCH_SB_GC_RESERVE, 8) \
BCH_OPT(root_reserve_percent, u8, OPT_MOUNT, \
OPT_UINT(0, 100), \
BCH_SB_ROOT_RESERVE, 0) \
BCH_OPT(wide_macs, u8, OPT_RUNTIME, \
OPT_BOOL(), \
BCH_SB_128_BIT_MACS, false) \
BCH_OPT(acl, u8, OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_POSIX_ACL, true) \
BCH_OPT(usrquota, u8, OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_USRQUOTA, false) \
BCH_OPT(grpquota, u8, OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_GRPQUOTA, false) \
BCH_OPT(prjquota, u8, OPT_MOUNT, \
OPT_BOOL(), \
BCH_SB_PRJQUOTA, false) \
BCH_OPT(degraded, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(discard, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(verbose_init, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(nofsck, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(fix_errors, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(nochanges, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(noreplay, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(norecovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(noexcl, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(sb, u64, OPT_MOUNT, \
OPT_UINT(0, S64_MAX), \
NO_SB_OPT, BCH_SB_SECTOR) \
BCH_OPT(read_only, u8, OPT_INTERNAL, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(nostart, u8, OPT_INTERNAL, \
OPT_BOOL(), \
NO_SB_OPT, false) \
BCH_OPT(no_data_io, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false)
struct bch_opts {
#define BCH_OPT(_name, _bits, ...) unsigned _name##_defined:1;
BCH_OPTS()
#undef BCH_OPT
#define BCH_OPT(_name, _bits, ...) _bits _name;
BCH_OPTS()
#undef BCH_OPT
};
static const struct bch_opts bch2_opts_default = {
#define BCH_OPT(_name, _bits, _mode, _type, _sb_opt, _default) \
._name##_defined = true, \
._name = _default, \
BCH_OPTS()
#undef BCH_OPT
};
#define opt_defined(_opts, _name) ((_opts)._name##_defined)
#define opt_get(_opts, _name) \
(opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
#define opt_set(_opts, _name, _v) \
do { \
(_opts)._name##_defined = true; \
(_opts)._name = _v; \
} while (0)
static inline struct bch_opts bch2_opts_empty(void)
{
return (struct bch_opts) { 0 };
}
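/*
 * Illustrative sketch (not part of the bcachefs sources): how the accessors
 * above combine.  The hypothetical helper below builds an options struct with
 * one explicitly set option; opt_get() returns the explicit value where one
 * was set and falls back to bch2_opts_default everywhere else.
 */
static inline void example_opts_usage(void)
{
	struct bch_opts opts = bch2_opts_empty();

	opt_set(opts, metadata_replicas, 2);

	/* explicitly set above, so this evaluates to 2: */
	(void) opt_get(opts, metadata_replicas);

	/* never set, so this falls back to bch2_opts_default (1): */
	(void) opt_get(opts, data_replicas);
}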
void bch2_opts_apply(struct bch_opts *, struct bch_opts);
enum bch_opt_id {
#define BCH_OPT(_name, ...) Opt_##_name,
BCH_OPTS()
#undef BCH_OPT
bch2_opts_nr
};
struct bch_fs;
struct bch_option {
struct attribute attr;
void (*set_sb)(struct bch_sb *, u64);
enum opt_mode mode;
enum opt_type type;
union {
struct {
u64 min, max;
};
struct {
const char * const *choices;
};
struct {
int (*parse)(struct bch_fs *, const char *, u64 *);
int (*print)(struct bch_fs *, char *, size_t, u64);
};
};
};
extern const struct bch_option bch2_opt_table[];
bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
struct bch_opts bch2_opts_from_sb(struct bch_sb *);
int bch2_opt_lookup(const char *);
int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
#define OPT_SHOW_FULL_LIST (1 << 0)
#define OPT_SHOW_MOUNT_STYLE (1 << 1)
int bch2_opt_to_text(struct bch_fs *, char *, size_t,
const struct bch_option *, u64, unsigned);
int bch2_parse_mount_opts(struct bch_opts *, char *);
/* inode opts: */
#define BCH_INODE_OPTS() \
BCH_INODE_OPT(data_checksum, 8) \
BCH_INODE_OPT(compression, 8) \
BCH_INODE_OPT(background_compression, 8) \
BCH_INODE_OPT(data_replicas, 8) \
BCH_INODE_OPT(promote_target, 16) \
BCH_INODE_OPT(foreground_target, 16) \
BCH_INODE_OPT(background_target, 16)
struct bch_io_opts {
#define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
#define BCH_INODE_OPT(_name, _bits) u##_bits _name;
BCH_INODE_OPTS()
#undef BCH_INODE_OPT
};
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
#endif /* _BCACHEFS_OPTS_H */

790
fs/bcachefs/quota.c Normal file

@ -0,0 +1,790 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "inode.h"
#include "quota.h"
#include "super-io.h"
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
if (vstruct_bytes(&q->field) != sizeof(*q))
return "invalid field quota: wrong size";
return NULL;
}
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.validate = bch2_sb_validate_quota,
};
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_quota dq;
if (k.k->p.inode >= QTYP_NR)
return "invalid quota type";
switch (k.k->type) {
case BCH_QUOTA: {
dq = bkey_s_c_to_quota(k);
if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
return "incorrect value size";
return NULL;
}
default:
return "invalid type";
}
}
static const char * const bch2_quota_counters[] = {
"space",
"inodes",
};
void bch2_quota_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
struct bkey_s_c_quota dq;
unsigned i;
switch (k.k->type) {
case BCH_QUOTA:
dq = bkey_s_c_to_quota(k);
for (i = 0; i < Q_COUNTERS; i++)
out += scnprintf(out, end - out, "%s hardlimit %llu softlimit %llu",
bch2_quota_counters[i],
le64_to_cpu(dq.v->c[i].hardlimit),
le64_to_cpu(dq.v->c[i].softlimit));
break;
}
}
#ifdef CONFIG_BCACHEFS_QUOTA
#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/quota.h>
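/*
 * __next_qtype() returns the index of the next enabled quota type at or after
 * position i in the qtypes bitmask (or QTYP_NR if there is none), so
 * for_each_set_qtype() below visits only the enabled types in ascending
 * order: e.g. with qtypes == (1U << QTYP_USR)|(1U << QTYP_PRJ) the loop body
 * runs for QTYP_USR, then QTYP_PRJ, and skips QTYP_GRP.
 */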
static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
{
qtypes >>= i;
return qtypes ? i + __ffs(qtypes) : QTYP_NR;
}
#define for_each_set_qtype(_c, _i, _q, _qtypes) \
for (_i = 0; \
(_i = __next_qtype(_i, _qtypes), \
_q = &(_c)->quotas[_i], \
_i < QTYP_NR); \
_i++)
static bool ignore_hardlimit(struct bch_memquota_type *q)
{
if (capable(CAP_SYS_RESOURCE))
return true;
#if 0
struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
return capable(CAP_SYS_RESOURCE) &&
(info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
!(info->dqi_flags & DQF_ROOT_SQUASH));
#endif
return false;
}
enum quota_msg {
SOFTWARN, /* Softlimit reached */
SOFTLONGWARN, /* Grace time expired */
HARDWARN, /* Hardlimit reached */
HARDBELOW, /* Usage got below inode hardlimit */
SOFTBELOW, /* Usage got below inode softlimit */
};
static int quota_nl[][Q_COUNTERS] = {
[HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
[SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
[SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
[HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
[SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
[HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
[SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
[SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
[HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
[SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
};
struct quota_msgs {
u8 nr;
struct {
u8 qtype;
u8 msg;
} m[QTYP_NR * Q_COUNTERS];
};
static void prepare_msg(unsigned qtype,
enum quota_counters counter,
struct quota_msgs *msgs,
enum quota_msg msg_type)
{
BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m));
msgs->m[msgs->nr].qtype = qtype;
msgs->m[msgs->nr].msg = quota_nl[msg_type][counter];
msgs->nr++;
}
static void prepare_warning(struct memquota_counter *qc,
unsigned qtype,
enum quota_counters counter,
struct quota_msgs *msgs,
enum quota_msg msg_type)
{
if (qc->warning_issued & (1 << msg_type))
return;
prepare_msg(qtype, counter, msgs, msg_type);
}
static void flush_warnings(struct bch_qid qid,
struct super_block *sb,
struct quota_msgs *msgs)
{
unsigned i;
for (i = 0; i < msgs->nr; i++)
quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]),
sb->s_dev, msgs->m[i].msg);
}
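/*
 * Check whether applying delta @v to @counter of @mq would cross a limit:
 * releasing (v <= 0) clears previously issued warnings and queues "back below
 * limit" messages, while allocating queues hard/soft limit warnings and, in
 * BCH_QUOTA_PREALLOC mode, fails with -EDQUOT rather than exceed a limit.
 */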
static int bch2_quota_check_limit(struct bch_fs *c,
unsigned qtype,
struct bch_memquota *mq,
struct quota_msgs *msgs,
enum quota_counters counter,
s64 v,
enum quota_acct_mode mode)
{
struct bch_memquota_type *q = &c->quotas[qtype];
struct memquota_counter *qc = &mq->c[counter];
u64 n = qc->v + v;
BUG_ON((s64) n < 0);
if (mode == BCH_QUOTA_NOCHECK)
return 0;
if (v <= 0) {
if (n < qc->hardlimit &&
(qc->warning_issued & (1 << HARDWARN))) {
qc->warning_issued &= ~(1 << HARDWARN);
prepare_msg(qtype, counter, msgs, HARDBELOW);
}
if (n < qc->softlimit &&
(qc->warning_issued & (1 << SOFTWARN))) {
qc->warning_issued &= ~(1 << SOFTWARN);
prepare_msg(qtype, counter, msgs, SOFTBELOW);
}
qc->warning_issued = 0;
return 0;
}
if (qc->hardlimit &&
qc->hardlimit < n &&
!ignore_hardlimit(q)) {
if (mode == BCH_QUOTA_PREALLOC)
return -EDQUOT;
prepare_warning(qc, qtype, counter, msgs, HARDWARN);
}
if (qc->softlimit &&
qc->softlimit < n &&
qc->timer &&
ktime_get_real_seconds() >= qc->timer &&
!ignore_hardlimit(q)) {
if (mode == BCH_QUOTA_PREALLOC)
return -EDQUOT;
prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN);
}
if (qc->softlimit &&
qc->softlimit < n &&
qc->timer == 0) {
if (mode == BCH_QUOTA_PREALLOC)
return -EDQUOT;
prepare_warning(qc, qtype, counter, msgs, SOFTWARN);
/* XXX is this the right one? */
qc->timer = ktime_get_real_seconds() +
q->limits[counter].warnlimit;
}
return 0;
}
int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
enum quota_counters counter, s64 v,
enum quota_acct_mode mode)
{
unsigned qtypes = enabled_qtypes(c);
struct bch_memquota_type *q;
struct bch_memquota *mq[QTYP_NR];
struct quota_msgs msgs;
unsigned i;
int ret = 0;
memset(&msgs, 0, sizeof(msgs));
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS);
if (!mq[i]) {
ret = -ENOMEM;
goto err;
}
ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode);
if (ret)
goto err;
}
for_each_set_qtype(c, i, q, qtypes)
mq[i]->c[counter].v += v;
err:
for_each_set_qtype(c, i, q, qtypes)
mutex_unlock(&q->lock);
flush_warnings(qid, c->vfs_sb, &msgs);
return ret;
}
static void __bch2_quota_transfer(struct bch_memquota *src_q,
struct bch_memquota *dst_q,
enum quota_counters counter, s64 v)
{
BUG_ON(v > src_q->c[counter].v);
BUG_ON(v + dst_q->c[counter].v < v);
src_q->c[counter].v -= v;
dst_q->c[counter].v += v;
}
int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
struct bch_qid dst,
struct bch_qid src, u64 space)
{
struct bch_memquota_type *q;
struct bch_memquota *src_q[3], *dst_q[3];
struct quota_msgs msgs;
unsigned i;
int ret = 0;
qtypes &= enabled_qtypes(c);
memset(&msgs, 0, sizeof(msgs));
for_each_set_qtype(c, i, q, qtypes)
mutex_lock_nested(&q->lock, i);
for_each_set_qtype(c, i, q, qtypes) {
src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
if (!src_q[i] || !dst_q[i]) {
ret = -ENOMEM;
goto err;
}
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
dst_q[i]->c[Q_SPC].v + space,
BCH_QUOTA_PREALLOC);
if (ret)
goto err;
ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
dst_q[i]->c[Q_INO].v + 1,
BCH_QUOTA_PREALLOC);
if (ret)
goto err;
}
for_each_set_qtype(c, i, q, qtypes) {
__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
}
err:
for_each_set_qtype(c, i, q, qtypes)
mutex_unlock(&q->lock);
flush_warnings(dst, c->vfs_sb, &msgs);
return ret;
}
static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_quota dq;
struct bch_memquota_type *q;
struct bch_memquota *mq;
unsigned i;
BUG_ON(k.k->p.inode >= QTYP_NR);
switch (k.k->type) {
case BCH_QUOTA:
dq = bkey_s_c_to_quota(k);
q = &c->quotas[k.k->p.inode];
mutex_lock(&q->lock);
mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL);
if (!mq) {
mutex_unlock(&q->lock);
return -ENOMEM;
}
for (i = 0; i < Q_COUNTERS; i++) {
mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit);
mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit);
}
mutex_unlock(&q->lock);
}
return 0;
}
static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
for_each_btree_key(&iter, c, BTREE_ID_QUOTAS, POS(type, 0),
BTREE_ITER_PREFETCH, k) {
if (k.k->p.inode != type)
break;
ret = __bch2_quota_set(c, k);
if (ret)
break;
}
return bch2_btree_iter_unlock(&iter) ?: ret;
}
void bch2_fs_quota_exit(struct bch_fs *c)
{
unsigned i;
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
genradix_free(&c->quotas[i].table);
}
void bch2_fs_quota_init(struct bch_fs *c)
{
unsigned i;
for (i = 0; i < ARRAY_SIZE(c->quotas); i++)
mutex_init(&c->quotas[i].lock);
}
static void bch2_sb_quota_read(struct bch_fs *c)
{
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota)
return;
for (i = 0; i < QTYP_NR; i++) {
struct bch_memquota_type *q = &c->quotas[i];
for (j = 0; j < Q_COUNTERS; j++) {
q->limits[j].timelimit =
le32_to_cpu(sb_quota->q[i].c[j].timelimit);
q->limits[j].warnlimit =
le32_to_cpu(sb_quota->q[i].c[j].warnlimit);
}
}
}
int bch2_fs_quota_read(struct bch_fs *c)
{
unsigned i, qtypes = enabled_qtypes(c);
struct bch_memquota_type *q;
struct btree_iter iter;
struct bch_inode_unpacked u;
struct bkey_s_c k;
int ret = 0;
mutex_lock(&c->sb_lock);
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
for_each_set_qtype(c, i, q, qtypes) {
ret = bch2_quota_init_type(c, i);
if (ret)
return ret;
}
for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN,
BTREE_ITER_PREFETCH, k) {
switch (k.k->type) {
case BCH_INODE_FS:
ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
if (ret)
return ret;
bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors,
BCH_QUOTA_NOCHECK);
bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
BCH_QUOTA_NOCHECK);
}
}
return bch2_btree_iter_unlock(&iter) ?: ret;
}
/* Enable/disable/delete quotas for an entire filesystem: */
static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
/* Accounting must be enabled at mount time: */
if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
return -EINVAL;
/* Can't enable enforcement without accounting: */
if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota)
return -EINVAL;
if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
return -EINVAL;
if ((uflags & FS_QUOTA_PDQ_ENFD) && !c->opts.prjquota)
return -EINVAL;
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
{
struct bch_fs *c = sb->s_fs_info;
int ret;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (uflags & FS_USER_QUOTA) {
if (c->opts.usrquota)
return -EINVAL;
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
POS(QTYP_USR, 0),
POS(QTYP_USR + 1, 0),
ZERO_VERSION, NULL, NULL, NULL);
if (ret)
return ret;
}
if (uflags & FS_GROUP_QUOTA) {
if (c->opts.grpquota)
return -EINVAL;
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
POS(QTYP_GRP, 0),
POS(QTYP_GRP + 1, 0),
ZERO_VERSION, NULL, NULL, NULL);
if (ret)
return ret;
}
if (uflags & FS_PROJ_QUOTA) {
if (c->opts.prjquota)
return -EINVAL;
ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS,
POS(QTYP_PRJ, 0),
POS(QTYP_PRJ + 1, 0),
ZERO_VERSION, NULL, NULL, NULL);
if (ret)
return ret;
}
return 0;
}
/*
* Return quota status information, such as enforcements, quota file inode
* numbers etc.
*/
static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
{
struct bch_fs *c = sb->s_fs_info;
unsigned qtypes = enabled_qtypes(c);
unsigned i;
memset(state, 0, sizeof(*state));
for (i = 0; i < QTYP_NR; i++) {
state->s_state[i].flags |= QCI_SYSFILE;
if (!(qtypes & (1 << i)))
continue;
state->s_state[i].flags |= QCI_ACCT_ENABLED;
state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit;
state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit;
state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit;
state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit;
}
return 0;
}
/*
* Adjust quota timers & warnings
*/
static int bch2_quota_set_info(struct super_block *sb, int type,
struct qc_info *info)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_sb_field_quota *sb_quota;
struct bch_memquota_type *q;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
if (type >= QTYP_NR)
return -EINVAL;
if (!((1 << type) & enabled_qtypes(c)))
return -ESRCH;
if (info->i_fieldmask &
~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
return -EINVAL;
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota) {
sb_quota = bch2_sb_resize_quota(&c->disk_sb,
sizeof(*sb_quota) / sizeof(u64));
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
}
if (info->i_fieldmask & QC_SPC_TIMER)
sb_quota->q[type].c[Q_SPC].timelimit =
cpu_to_le32(info->i_spc_timelimit);
if (info->i_fieldmask & QC_SPC_WARNS)
sb_quota->q[type].c[Q_SPC].warnlimit =
cpu_to_le32(info->i_spc_warnlimit);
if (info->i_fieldmask & QC_INO_TIMER)
sb_quota->q[type].c[Q_INO].timelimit =
cpu_to_le32(info->i_ino_timelimit);
if (info->i_fieldmask & QC_INO_WARNS)
sb_quota->q[type].c[Q_INO].warnlimit =
cpu_to_le32(info->i_ino_warnlimit);
bch2_sb_quota_read(c);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Get/set individual quotas: */
static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
{
dst->d_space = src->c[Q_SPC].v << 9;
dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9;
dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9;
dst->d_spc_timer = src->c[Q_SPC].timer;
dst->d_spc_warns = src->c[Q_SPC].warns;
dst->d_ino_count = src->c[Q_INO].v;
dst->d_ino_hardlimit = src->c[Q_INO].hardlimit;
dst->d_ino_softlimit = src->c[Q_INO].softlimit;
dst->d_ino_timer = src->c[Q_INO].timer;
dst->d_ino_warns = src->c[Q_INO].warns;
}
static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
struct qc_dqblk *qdq)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_memquota_type *q = &c->quotas[kqid.type];
qid_t qid = from_kqid(&init_user_ns, kqid);
struct bch_memquota *mq;
memset(qdq, 0, sizeof(*qdq));
mutex_lock(&q->lock);
mq = genradix_ptr(&q->table, qid);
if (mq)
__bch2_quota_get(qdq, mq);
mutex_unlock(&q->lock);
return 0;
}
static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
struct qc_dqblk *qdq)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_memquota_type *q = &c->quotas[kqid->type];
qid_t qid = from_kqid(&init_user_ns, *kqid);
struct genradix_iter iter = genradix_iter_init(&q->table, qid);
struct bch_memquota *mq;
int ret = 0;
mutex_lock(&q->lock);
while ((mq = genradix_iter_peek(&iter, &q->table))) {
if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) {
__bch2_quota_get(qdq, mq);
*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
goto found;
}
genradix_iter_advance(&iter, &q->table);
}
ret = -ENOENT;
found:
mutex_unlock(&q->lock);
return ret;
}
static int bch2_set_quota(struct super_block *sb, struct kqid qid,
struct qc_dqblk *qdq)
{
struct bch_fs *c = sb->s_fs_info;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_i_quota new_quota;
int ret;
if (sb->s_flags & SB_RDONLY)
return -EROFS;
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
bch2_btree_iter_init(&iter, c, BTREE_ID_QUOTAS, new_quota.k.p,
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(&iter);
ret = btree_iter_err(k);
if (unlikely(ret))
return ret;
switch (k.k->type) {
case BCH_QUOTA:
new_quota.v = *bkey_s_c_to_quota(k).v;
break;
}
if (qdq->d_fieldmask & QC_SPC_SOFT)
new_quota.v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
if (qdq->d_fieldmask & QC_SPC_HARD)
new_quota.v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
if (qdq->d_fieldmask & QC_INO_SOFT)
new_quota.v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
if (qdq->d_fieldmask & QC_INO_HARD)
new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
BTREE_INSERT_ENTRY(&iter, &new_quota.k_i));
bch2_btree_iter_unlock(&iter);
if (ret)
return ret;
ret = __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
return ret;
}
const struct quotactl_ops bch2_quotactl_operations = {
.quota_enable = bch2_quota_enable,
.quota_disable = bch2_quota_disable,
.rm_xquota = bch2_quota_remove,
.get_state = bch2_quota_get_state,
.set_info = bch2_quota_set_info,
.get_dqblk = bch2_get_quota,
.get_nextdqblk = bch2_get_next_quota,
.set_dqblk = bch2_set_quota,
};
#endif /* CONFIG_BCACHEFS_QUOTA */

76
fs/bcachefs/quota.h Normal file

@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_QUOTA_H
#define _BCACHEFS_QUOTA_H
#include "inode.h"
#include "quota_types.h"
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_quota_ops (struct bkey_ops) { \
.key_invalid = bch2_quota_invalid, \
.val_to_text = bch2_quota_to_text, \
}
enum quota_acct_mode {
BCH_QUOTA_PREALLOC,
BCH_QUOTA_WARN,
BCH_QUOTA_NOCHECK,
};
static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
{
return (struct bch_qid) {
.q[QTYP_USR] = u->bi_uid,
.q[QTYP_GRP] = u->bi_gid,
.q[QTYP_PRJ] = u->bi_project,
};
}
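/*
 * Bitmask of the quota types enabled via mount options: e.g. mounting with
 * usrquota and prjquota but not grpquota yields
 * (1U << QTYP_USR)|(1U << QTYP_PRJ).
 */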
static inline unsigned enabled_qtypes(struct bch_fs *c)
{
return ((c->opts.usrquota << QTYP_USR)|
(c->opts.grpquota << QTYP_GRP)|
(c->opts.prjquota << QTYP_PRJ));
}
#ifdef CONFIG_BCACHEFS_QUOTA
int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
s64, enum quota_acct_mode);
int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
struct bch_qid, u64);
void bch2_fs_quota_exit(struct bch_fs *);
void bch2_fs_quota_init(struct bch_fs *);
int bch2_fs_quota_read(struct bch_fs *);
extern const struct quotactl_ops bch2_quotactl_operations;
#else
static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
enum quota_counters counter, s64 v,
enum quota_acct_mode mode)
{
return 0;
}
static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
struct bch_qid dst,
struct bch_qid src, u64 space)
{
return 0;
}
static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
static inline void bch2_fs_quota_init(struct bch_fs *c) {}
static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
#endif
#endif /* _BCACHEFS_QUOTA_H */

37
fs/bcachefs/quota_types.h Normal file

@ -0,0 +1,37 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_QUOTA_TYPES_H
#define _BCACHEFS_QUOTA_TYPES_H
#include <linux/generic-radix-tree.h>
struct bch_qid {
u32 q[QTYP_NR];
};
struct memquota_counter {
u64 v;
u64 hardlimit;
u64 softlimit;
s64 timer;
int warns;
int warning_issued;
};
struct bch_memquota {
struct memquota_counter c[Q_COUNTERS];
};
typedef GENRADIX(struct bch_memquota) bch_memquota_table;
struct quota_limit {
u32 timelimit;
u32 warnlimit;
};
struct bch_memquota_type {
struct quota_limit limits[Q_COUNTERS];
bch_memquota_table table;
struct mutex lock;
};
#endif /* _BCACHEFS_QUOTA_TYPES_H */

342
fs/bcachefs/rebalance.c Normal file
View File

@ -0,0 +1,342 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc.h"
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"
#include "rebalance.h"
#include "super-io.h"
#include "trace.h"
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched/cputime.h>
static inline bool rebalance_ptr_pred(struct bch_fs *c,
const struct bch_extent_ptr *ptr,
struct bch_extent_crc_unpacked crc,
struct bch_io_opts *io_opts)
{
if (io_opts->background_target &&
!bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
!ptr->cached)
return true;
if (io_opts->background_compression &&
crc.compression_type !=
bch2_compression_opt_to_type[io_opts->background_compression])
return true;
return false;
}
void bch2_rebalance_add_key(struct bch_fs *c,
struct bkey_s_c k,
struct bch_io_opts *io_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bkey_s_c_extent e;
if (!bkey_extent_is_data(k.k))
return;
if (!io_opts->background_target &&
!io_opts->background_compression)
return;
e = bkey_s_c_to_extent(k);
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts)) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (atomic64_add_return(crc.compressed_size,
&ca->rebalance_work) ==
crc.compressed_size)
rebalance_wakeup(c);
}
}
void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
{
if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
sectors)
rebalance_wakeup(c);
}
static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
enum bkey_type type,
struct bkey_s_c_extent e,
struct bch_io_opts *io_opts,
struct data_opts *data_opts)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
/* Make sure we have room to add a new pointer: */
if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX >
BKEY_EXTENT_VAL_U64s_MAX)
return DATA_SKIP;
extent_for_each_ptr_crc(e, ptr, crc)
if (rebalance_ptr_pred(c, ptr, crc, io_opts))
goto found;
return DATA_SKIP;
found:
data_opts->target = io_opts->background_target;
data_opts->btree_insert_flags = 0;
return DATA_ADD_REPLICAS;
}
struct rebalance_work {
int dev_most_full_idx;
unsigned dev_most_full_percent;
u64 dev_most_full_work;
u64 dev_most_full_capacity;
u64 total_work;
};
static void rebalance_work_accumulate(struct rebalance_work *w,
u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
{
unsigned percent_full;
u64 work = dev_work + unknown_dev;
if (work < dev_work || work < unknown_dev)
work = U64_MAX;
work = min(work, capacity);
percent_full = div_u64(work * 100, capacity);
if (percent_full >= w->dev_most_full_percent) {
w->dev_most_full_idx = idx;
w->dev_most_full_percent = percent_full;
w->dev_most_full_work = work;
w->dev_most_full_capacity = capacity;
}
if (w->total_work + dev_work >= w->total_work &&
w->total_work + dev_work >= dev_work)
w->total_work += dev_work;
}
static struct rebalance_work rebalance_work(struct bch_fs *c)
{
struct bch_dev *ca;
struct rebalance_work ret = { .dev_most_full_idx = -1 };
u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
unsigned i;
for_each_online_member(ca, c, i)
rebalance_work_accumulate(&ret,
atomic64_read(&ca->rebalance_work),
unknown_dev,
bucket_to_sector(ca, ca->mi.nbuckets -
ca->mi.first_bucket),
i);
rebalance_work_accumulate(&ret,
unknown_dev, 0, c->capacity, -1);
return ret;
}
static void rebalance_work_reset(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_online_member(ca, c, i)
atomic64_set(&ca->rebalance_work, 0);
atomic64_set(&c->rebalance.work_unknown_dev, 0);
}
static unsigned long curr_cputime(void)
{
u64 utime, stime;
task_cputime_adjusted(current, &utime, &stime);
return nsecs_to_jiffies(utime + stime);
}
static int bch2_rebalance_thread(void *arg)
{
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct io_clock *clock = &c->io_clock[WRITE];
struct rebalance_work w, p;
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
while (!kthread_wait_freezable(r->enabled)) {
start = jiffies;
cputime = curr_cputime();
prev_run_time = start - prev_start;
prev_run_cputime = cputime - prev_cputime;
w = rebalance_work(c);
BUG_ON(!w.dev_most_full_capacity);
if (!w.total_work) {
r->state = REBALANCE_WAITING;
kthread_wait_freezable(rebalance_work(c).total_work);
continue;
}
/*
* If there isn't much work to do, throttle cpu usage:
*/
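/*
 * prev_run_cputime / prev_run_time approximates the fraction of a cpu used
 * last iteration; aim for that fraction to match dev_most_full_percent.
 * Solving cputime / (run_time + throttle) = percent / 100 for throttle gives
 * the expression below.  Illustrative numbers: 10 jiffies of cputime over
 * 100 jiffies of wall time with the fullest device 5% full gives a 200 jiffy
 * target, i.e. sleep for another 100 jiffies.
 */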
throttle = prev_run_cputime * 100 /
max(1U, w.dev_most_full_percent) -
prev_run_time;
if (w.dev_most_full_percent < 20 && throttle > 0) {
r->state = REBALANCE_THROTTLED;
r->throttled_until_iotime = io_start +
div_u64(w.dev_most_full_capacity *
(20 - w.dev_most_full_percent),
50);
r->throttled_until_cputime = start + throttle;
bch2_kthread_io_clock_wait(clock,
r->throttled_until_iotime,
throttle);
continue;
}
/* minimum 1 mb/sec: */
r->pd.rate.rate =
max_t(u64, 1 << 11,
r->pd.rate.rate *
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
r->state = REBALANCE_RUNNING;
memset(&r->move_stats, 0, sizeof(r->move_stats));
rebalance_work_reset(c);
bch2_move_data(c,
/* ratelimiting disabled for now */
NULL, /* &r->pd.rate, */
writepoint_ptr(&c->rebalance_write_point),
POS_MIN, POS_MAX,
rebalance_pred, NULL,
&r->move_stats);
}
return 0;
}
ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf)
{
char *out = buf, *end = out + PAGE_SIZE;
struct bch_fs_rebalance *r = &c->rebalance;
struct rebalance_work w = rebalance_work(c);
char h1[21], h2[21];
bch2_hprint(h1, w.dev_most_full_work << 9);
bch2_hprint(h2, w.dev_most_full_capacity << 9);
out += scnprintf(out, end - out,
"fullest_dev (%i):\t%s/%s\n",
w.dev_most_full_idx, h1, h2);
bch2_hprint(h1, w.total_work << 9);
bch2_hprint(h2, c->capacity << 9);
out += scnprintf(out, end - out,
"total work:\t\t%s/%s\n",
h1, h2);
out += scnprintf(out, end - out,
"rate:\t\t\t%u\n",
r->pd.rate.rate);
switch (r->state) {
case REBALANCE_WAITING:
out += scnprintf(out, end - out, "waiting\n");
break;
case REBALANCE_THROTTLED:
bch2_hprint(h1,
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
out += scnprintf(out, end - out,
"throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);
break;
case REBALANCE_RUNNING:
out += scnprintf(out, end - out, "running\n");
out += scnprintf(out, end - out, "pos %llu:%llu\n",
r->move_stats.iter.pos.inode,
r->move_stats.iter.pos.offset);
break;
}
return out - buf;
}
void bch2_rebalance_stop(struct bch_fs *c)
{
struct task_struct *p;
c->rebalance.pd.rate.rate = UINT_MAX;
bch2_ratelimit_reset(&c->rebalance.pd.rate);
p = rcu_dereference_protected(c->rebalance.thread, 1);
c->rebalance.thread = NULL;
if (p) {
/* for synchronizing with rebalance_wakeup() */
synchronize_rcu();
kthread_stop(p);
put_task_struct(p);
}
}
int bch2_rebalance_start(struct bch_fs *c)
{
struct task_struct *p;
if (c->opts.nochanges)
return 0;
p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
wake_up_process(p);
return 0;
}
void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
}

28
fs/bcachefs/rebalance.h Normal file

@ -0,0 +1,28 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_H
#define _BCACHEFS_REBALANCE_H
#include "rebalance_types.h"
static inline void rebalance_wakeup(struct bch_fs *c)
{
struct task_struct *p;
rcu_read_lock();
p = rcu_dereference(c->rebalance.thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
struct bch_io_opts *);
void bch2_rebalance_add_work(struct bch_fs *, u64);
ssize_t bch2_rebalance_work_show(struct bch_fs *, char *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);
#endif /* _BCACHEFS_REBALANCE_H */

27
fs/bcachefs/rebalance_types.h Normal file

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REBALANCE_TYPES_H
#define _BCACHEFS_REBALANCE_TYPES_H
#include "move_types.h"
enum rebalance_state {
REBALANCE_WAITING,
REBALANCE_THROTTLED,
REBALANCE_RUNNING,
};
struct bch_fs_rebalance {
struct task_struct __rcu *thread;
struct bch_pd_controller pd;
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;
unsigned enabled:1;
};
#endif /* _BCACHEFS_REBALANCE_TYPES_H */

377
fs/bcachefs/recovery.c Normal file

@ -0,0 +1,377 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "dirent.h"
#include "error.h"
#include "fsck.h"
#include "journal_io.h"
#include "quota.h"
#include "recovery.h"
#include "super-io.h"
#include <linux/stat.h>
#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
struct bkey_i *btree_root_find(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct jset *j,
enum btree_id id, unsigned *level)
{
struct bkey_i *k;
struct jset_entry *entry, *start, *end;
if (clean) {
start = clean->start;
end = vstruct_end(&clean->field);
} else {
start = j->start;
end = vstruct_last(j);
}
for (entry = start; entry < end; entry = vstruct_next(entry))
if (entry->type == BCH_JSET_ENTRY_btree_root &&
entry->btree_id == id)
goto found;
return NULL;
found:
if (!entry->u64s)
return ERR_PTR(-EINVAL);
k = entry->start;
*level = entry->level;
return k;
}
static int verify_superblock_clean(struct bch_fs *c,
struct bch_sb_field_clean *clean,
struct jset *j)
{
unsigned i;
int ret = 0;
if (!clean || !j)
return 0;
if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
"superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
le64_to_cpu(clean->journal_seq),
le64_to_cpu(j->seq)))
bch2_fs_mark_clean(c, false);
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock doesn't match journal after clean shutdown");
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock read clock doesn't match journal after clean shutdown");
for (i = 0; i < BTREE_ID_NR; i++) {
struct bkey_i *k1, *k2;
unsigned l1 = 0, l2 = 0;
k1 = btree_root_find(c, clean, NULL, i, &l1);
k2 = btree_root_find(c, NULL, j, i, &l2);
if (!k1 && !k2)
continue;
mustfix_fsck_err_on(!k1 || !k2 ||
IS_ERR(k1) ||
IS_ERR(k2) ||
k1->k.u64s != k2->k.u64s ||
memcmp(k1, k2, bkey_bytes(k1)) ||
l1 != l2, c,
"superblock btree root doesn't match journal after clean shutdown");
}
fsck_err:
return ret;
}
static bool journal_empty(struct list_head *journal)
{
struct journal_replay *i;
struct jset_entry *entry;
if (list_empty(journal))
return true;
i = list_last_entry(journal, struct journal_replay, list);
if (i->j.last_seq != i->j.seq)
return false;
list_for_each_entry(i, journal, list) {
vstruct_for_each(&i->j, entry) {
if (entry->type == BCH_JSET_ENTRY_btree_root)
continue;
if (entry->type == BCH_JSET_ENTRY_btree_keys &&
!entry->u64s)
continue;
return false;
}
}
return true;
}
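/*
 * Recovery sequence: read the journal (or, after a clean shutdown, trust the
 * superblock's clean section), cross-check the clean section against the
 * journal, read btree roots and allocation info, run the initial
 * mark-and-sweep gc, then replay the journal, run fsck, and finally read
 * quotas if any quota types are enabled.
 */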
int bch2_fs_recovery(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
LIST_HEAD(journal);
struct jset *j = NULL;
unsigned i;
int ret;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}
if (c->sb.clean)
sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
if (sb_clean) {
clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
GFP_KERNEL);
if (!clean) {
ret = -ENOMEM;
mutex_unlock(&c->sb_lock);
goto err;
}
}
mutex_unlock(&c->sb_lock);
if (clean)
bch_info(c, "recovering from clean shutdown, journal seq %llu",
le64_to_cpu(clean->journal_seq));
if (!clean || !c->opts.nofsck) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
} else {
ret = bch2_journal_set_seq(c,
le64_to_cpu(clean->journal_seq),
le64_to_cpu(clean->journal_seq));
BUG_ON(ret);
}
ret = verify_superblock_clean(c, clean, j);
if (ret)
goto err;
fsck_err_on(clean && !journal_empty(&journal), c,
"filesystem marked clean but journal not empty");
if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
} else {
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
}
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
struct bkey_i *k;
k = btree_root_find(c, clean, j, i, &level);
if (!k)
continue;
err = "invalid btree root pointer";
if (IS_ERR(k))
goto err;
err = "error reading btree root";
if (bch2_btree_root_read(c, i, k, level)) {
if (i != BTREE_ID_ALLOC)
goto err;
mustfix_fsck_err(c, "error reading btree root");
}
}
for (i = 0; i < BTREE_ID_NR; i++)
if (!c->btree_roots[i].b)
bch2_btree_root_alloc(c, i);
err = "error reading allocation information";
ret = bch2_alloc_read(c, &journal);
if (ret)
goto err;
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
bch_verbose(c, "starting mark and sweep:");
err = "error in recovery";
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
bch_verbose(c, "mark and sweep done");
if (c->opts.noreplay)
goto out;
/*
* Mark dirty before journal replay, fsck:
* XXX: after a clean shutdown, this could be done lazily only when fsck
* finds an error
*/
bch2_fs_mark_clean(c, false);
/*
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
ret = bch2_fs_allocator_start(c);
if (ret)
goto err;
bch_verbose(c, "starting journal replay:");
err = "journal replay failed";
ret = bch2_journal_replay(c, &journal);
if (ret)
goto err;
bch_verbose(c, "journal replay done");
if (c->opts.norecovery)
goto out;
err = "error in fsck";
ret = bch2_fsck(c);
if (ret)
goto err;
if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
bch_verbose(c, "quotas done");
}
out:
bch2_journal_entries_free(&journal);
kfree(clean);
return ret;
err:
fsck_err:
BUG_ON(!ret);
goto out;
}
int bch2_fs_initialize(struct bch_fs *c)
{
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct bch_hash_info root_hash_info;
struct qstr lostfound = QSTR("lost+found");
const char *err = "cannot allocate memory";
struct bch_dev *ca;
LIST_HEAD(journal);
unsigned i;
int ret;
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
/*
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
ret = bch2_fs_allocator_start(c);
if (ret)
goto err;
bch2_inode_init(c, &root_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
root_inode.bi_inum = BCACHEFS_ROOT_INO;
root_inode.bi_nlink++; /* lost+found */
bch2_inode_pack(&packed_inode, &root_inode);
err = "error creating root directory";
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
NULL, NULL, NULL, 0);
if (ret)
goto err;
bch2_inode_init(c, &lostfound_inode, 0, 0,
S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0,
&root_inode);
lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1;
bch2_inode_pack(&packed_inode, &lostfound_inode);
err = "error creating lost+found";
ret = bch2_btree_insert(c, BTREE_ID_INODES,
&packed_inode.inode.k_i,
NULL, NULL, NULL, 0);
if (ret)
goto err;
root_hash_info = bch2_hash_info_init(c, &root_inode);
ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR,
&lostfound, lostfound_inode.bi_inum, NULL,
BTREE_INSERT_NOFAIL);
if (ret)
goto err;
atomic_long_set(&c->nr_inodes, 2);
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
}
err = "error writing first journal entry";
ret = bch2_journal_meta(&c->journal);
if (ret)
goto err;
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
err:
BUG_ON(!ret);
return ret;
}

8
fs/bcachefs/recovery.h Normal file

@ -0,0 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
int bch2_fs_recovery(struct bch_fs *);
int bch2_fs_initialize(struct bch_fs *);
#endif /* _BCACHEFS_RECOVERY_H */

698
fs/bcachefs/replicas.c Normal file

@ -0,0 +1,698 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "replicas.h"
#include "super-io.h"
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
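/*
 * Each cpu replicas entry carries a bitmap with one bit per device index:
 * device 10 lives in devs[1], bit 2 (10 >> 3 == 1, 10 & 7 == 2).
 */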
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
e->devs[dev >> 3] |= 1 << (dev & 7);
}
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
return (r->entry_size -
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_cpu_entry *e;
bool first = true;
unsigned i;
for_each_cpu_replicas_entry(r, e) {
bool first_e = true;
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i)) {
if (!first_e)
out += scnprintf(out, end - out, " ");
first_e = false;
out += scnprintf(out, end - out, "%u", i);
}
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
static inline void devlist_to_replicas(struct bch_devs_list devs,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
unsigned i;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
for (i = 0; i < devs.nr; i++) {
*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
replicas_set_dev(r, devs.devs[i]);
}
}
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
if (!replicas_has_entry(old_r, new_entry, max_dev)) {
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
}
/* allocations done, now commit: */
if (new_r)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
if (new_r) {
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
}
mutex_unlock(&c->sb_lock);
return 0;
err:
mutex_unlock(&c->sb_lock);
kfree(new_gc);
kfree(new_r);
return ret;
}
int bch2_mark_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
unsigned max_dev;
bool marked;
if (!devs.nr)
return 0;
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_mark_replicas_slowpath(c, search, max_dev);
}
int bch2_mark_bkey_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
int ret;
for (i = 0; i < cached.nr; i++)
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]))))
return ret;
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
struct bch_replicas_cpu *new_r, *old_r;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas_gc, NULL);
if (ret)
goto err;
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
out:
mutex_unlock(&c->sb_lock);
return ret;
err:
kfree_rcu(new_r, rcu);
goto out;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
unsigned *max_dev)
{
struct bch_replicas_entry *i;
unsigned j;
*nr = 0;
*bytes = sizeof(*r);
*max_dev = 0;
if (!r)
return;
for_each_replicas_entry(r, i) {
for (j = 0; j < i->nr; j++)
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
(*nr)++;
}
*bytes = (void *) i - (void *) r;
}
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
struct bch_replicas_cpu *cpu_r;
unsigned i, nr, bytes, max_dev, entry_size;
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!cpu_r)
return NULL;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
if (nr) {
struct bch_replicas_cpu_entry *dst =
cpu_replicas_entry(cpu_r, 0);
struct bch_replicas_entry *src = sb_r->entries;
while (dst < cpu_replicas_entry(cpu_r, nr)) {
dst->data_type = src->data_type;
for (i = 0; i < src->nr; i++)
replicas_set_dev(dst, src->devs[i]);
src = replicas_entry_next(src);
dst = (void *) dst + entry_size;
}
}
bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
return 0;
}
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, e) {
bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
sb_e = sb_r->entries;
for_each_cpu_replicas_entry(r, e) {
sb_e->data_type = e->data_type;
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
sb_e = replicas_entry_next(sb_e);
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
}
return 0;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
.validate = bch2_sb_validate_replicas,
};
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_entry *e;
bool first = true;
unsigned i;
if (!r) {
out += scnprintf(out, end - out, "(no replicas section found)");
return out - buf;
}
for_each_replicas_entry(r, e) {
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < e->nr; i++)
out += scnprintf(out, end - out,
i ? " %u" : "%u", e->devs[i]);
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
/* Query replicas: */
bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret;
if (!devs.nr)
return true;
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
rcu_read_unlock();
return ret;
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
for (i = 0; i < cached.nr; i++)
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i])))
return false;
return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
struct replicas_status ret;
memset(&ret, 0, sizeof(ret));
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb.sb);
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = replicas_dev_slots(r);
for_each_cpu_replicas_entry(r, e) {
if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
for (dev = 0; dev < dev_slots; dev++) {
if (!replicas_test_dev(e, dev))
continue;
BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
if (test_bit(dev, online_devs.d))
nr_online++;
else
nr_offline++;
}
ret.replicas[e->data_type].nr_online =
min(ret.replicas[e->data_type].nr_online,
nr_online);
ret.replicas[e->data_type].nr_offline =
max(ret.replicas[e->data_type].nr_offline,
nr_offline);
}
rcu_read_unlock();
return ret;
}
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
return __bch2_replicas_status(c, bch2_online_devs(c));
}
static bool have_enough_devs(struct replicas_status s,
enum bch_data_type type,
bool force_if_degraded,
bool force_if_lost)
{
return (!s.replicas[type].nr_offline || force_if_degraded) &&
(s.replicas[type].nr_online || force_if_lost);
}
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
return (have_enough_devs(s, BCH_DATA_JOURNAL,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_BTREE,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_USER,
flags & BCH_FORCE_IF_DATA_DEGRADED,
flags & BCH_FORCE_IF_DATA_LOST));
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
struct replicas_status s = bch2_replicas_status(c);
return meta
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_BTREE].nr_online)
: s.replicas[BCH_DATA_USER].nr_online;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx))
ret |= 1 << e->data_type;
out:
rcu_read_unlock();
return ret;
}

Some files were not shown because too many files have changed in this diff.