bcachefs: gc now operates on second set of bucket marks

This means we can now use gc to verify the allocation information -
important for testing persistent alloc info

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
Kent Overstreet 2018-07-23 05:32:01 -04:00 committed by Kent Overstreet
parent e647369168
commit 9ca53b55f7
11 changed files with 506 additions and 394 deletions

View file

@ -930,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
pr_debug("free_inc now empty");
do {
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@ -1293,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
bool invalidating_data = false;
int ret = 0;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return -1;
if (test_alloc_startup(c)) {
invalidating_data = true;
goto not_enough;
@ -1321,9 +1312,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
continue;
bch2_mark_alloc_bucket(c, ca, bu, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
gc_pos_alloc(c, NULL), 0);
fifo_push(&ca->free_inc, bu);

View file

@ -347,7 +347,6 @@ enum gc_phase {
GC_PHASE_PENDING_DELETE,
GC_PHASE_ALLOC,
GC_PHASE_DONE
};
struct gc_pos {
@ -392,15 +391,14 @@ struct bch_dev {
* gc_lock, for device resize - holding any is sufficient for access:
* Or rcu_read_lock(), but only for ptr_stale():
*/
struct bucket_array __rcu *buckets;
struct bucket_array __rcu *buckets[2];
unsigned long *buckets_dirty;
unsigned long *buckets_written;
/* most out of date gen in the btree */
u8 *oldest_gens;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage_percpu;
struct bch_dev_usage usage_cached;
struct bch_dev_usage __percpu *usage[2];
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -478,7 +476,6 @@ enum {
/* errors: */
BCH_FS_ERROR,
BCH_FS_GC_FAILURE,
/* misc: */
BCH_FS_BDEV_MOUNTED,
@ -614,8 +611,8 @@ struct bch_fs {
atomic64_t sectors_available;
struct bch_fs_usage __percpu *usage_percpu;
struct bch_fs_usage usage_cached;
struct bch_fs_usage __percpu *usage[2];
struct percpu_rw_semaphore usage_lock;
struct closure_waitlist freelist_wait;
@ -656,9 +653,6 @@ struct bch_fs {
*
* gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
*
* gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
* currently running, and gc marks are currently valid
*
* Protected by gc_pos_lock. Only written to by GC thread, so GC thread
* can read without a lock.
*/

View file

@ -260,8 +260,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
{
struct gc_pos pos = { 0 };
unsigned flags =
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD|
BCH_BUCKET_MARK_GC|
(initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
int ret = 0;
@ -484,9 +483,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
if (c)
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@ -495,7 +491,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
}
if (c) {
spin_unlock(&c->journal.lock);
percpu_up_read(&c->usage_lock);
} else {
preempt_enable();
@ -511,9 +506,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
gc_pos_set(c, gc_phase(GC_PHASE_SB));
for_each_online_member(ca, c, i)
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
mutex_unlock(&c->sb_lock);
}
@ -521,7 +514,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct gc_pos pos = { 0 };
struct bch_fs_usage stats = { 0 };
struct btree_update *as;
struct pending_btree_node_free *d;
@ -533,13 +525,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
true, 0,
pos, &stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
pos, NULL, 0,
BCH_BUCKET_MARK_GC);
mutex_unlock(&c->btree_interior_update_lock);
}
@ -560,8 +547,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free_inc, iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
@ -569,8 +555,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
fifo_for_each_entry(i, &ca->free[j], iter)
bch2_mark_alloc_bucket(c, ca, i, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&c->freelist_lock);
@ -584,8 +569,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
gc_pos_alloc(c, ob),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
BCH_BUCKET_MARK_GC);
}
spin_unlock(&ob->lock);
}
@ -593,122 +577,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
percpu_up_read(&c->usage_lock);
}
static void bch2_gc_start(struct bch_fs *c)
/*
 * bch2_gc_free - release the secondary (index 1) bucket arrays and usage
 * counters.
 *
 * For every member device this frees ca->buckets[1] and ca->usage[1], and
 * then frees the filesystem-wide c->usage[1]. Each pointer is reset to NULL
 * once freed; bch2_gc_start() BUG_ON()s if any of them is still non-NULL
 * when it allocates the next set.
 */
static void bch2_gc_free(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
for_each_member_device(ca, c, i) {
/* Size expression mirrors the kvpmalloc() call in bch2_gc_start() */
kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
}
free_percpu(c->usage[1]);
c->usage[1] = NULL;
}
/*
 * bch2_gc_done_nocheck - overwrite the primary (index 0) bucket marks and
 * usage counters with the index 1 copies, without comparing the two or
 * logging any differences.
 *
 * bch2_gc_done() calls this when its @initial argument is set, while holding
 * c->usage_lock for write.
 *
 * NOTE(review): 'buckets', 'new' and 'b' are declared but never referenced in
 * this body - possibly leftovers; confirm before removing.
 */
static void bch2_gc_done_nocheck(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_array *buckets;
struct bucket_mark new;
unsigned i;
size_t b;
int cpu;
/* Bucket marks: copy the whole index 1 array (header + entries) over index 0 */
for_each_member_device(ca, c, i) {
struct bucket_array *src = __bucket_array(ca, 1);
memcpy(__bucket_array(ca, 0), src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * src->nbuckets);
};
/*
 * Per-device usage: zero every possible CPU's index 0 slot, then store the
 * value read from the index 1 counters into the current CPU's slot.
 */
for_each_member_device(ca, c, i) {
struct bch_dev_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
*this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
preempt_enable();
}
/*
 * Filesystem usage: same scheme, but only the bytes that precede the
 * online_reserved member are cleared and copied; online_reserved and
 * anything after it in index 0 are left as they are.
 */
{
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
memcpy(this_cpu_ptr(c->usage[0]),
&src,
offsetof(typeof(*p), online_reserved));
preempt_enable();
}
}
static void bch2_gc_done(struct bch_fs *c, bool initial)
{
struct bch_dev *ca;
unsigned i;
int cpu;
#define copy_field(_f, _msg, ...) \
if (dst._f != src._f) { \
pr_info(_msg ": got %llu, should be %llu, fixing" \
, ##__VA_ARGS__, dst._f, src._f); \
dst._f = src._f; \
}
#define copy_bucket_field(_f) \
if (dst->b[b].mark._f != src->b[b].mark._f) { \
pr_info("dev %u bucket %zu has wrong " #_f \
": got %u, should be %u, fixing", \
i, b, dst->b[b].mark._f, src->b[b].mark._f); \
dst->b[b]._mark._f = src->b[b].mark._f; \
}
#define copy_dev_field(_f, _msg, ...) \
copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
#define copy_fs_field(_f, _msg, ...) \
copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
percpu_down_write(&c->usage_lock);
/*
* Indicates to buckets code that gc is now in progress - done under
* usage_lock to avoid racing with bch2_mark_key():
*/
__gc_pos_set(c, gc_phase(GC_PHASE_START));
if (initial) {
bch2_gc_done_nocheck(c);
goto out;
}
/* Save a copy of the existing bucket stats while we recompute them: */
for_each_member_device(ca, c, i) {
ca->usage_cached = __bch2_dev_usage_read(ca);
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
if (initial) {
memcpy(dst, src,
sizeof(struct bucket_array) +
sizeof(struct bucket) * dst->nbuckets);
}
for (b = 0; b < src->nbuckets; b++) {
copy_bucket_field(gen);
copy_bucket_field(data_type);
copy_bucket_field(owned_by_allocator);
copy_bucket_field(stripe);
copy_bucket_field(dirty_sectors);
copy_bucket_field(cached_sectors);
}
};
for_each_member_device(ca, c, i) {
struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
struct bch_dev_usage *p;
unsigned b;
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
copy_dev_field(buckets_alloc, "buckets_alloc");
copy_dev_field(buckets_ec, "buckets_ec");
for (b = 0; b < BCH_DATA_NR; b++)
copy_dev_field(sectors[b],
"sectors[%s]", bch2_data_types[b]);
copy_dev_field(sectors_fragmented,
"sectors_fragmented");
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
p = per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
preempt_disable();
p = this_cpu_ptr(ca->usage[0]);
*p = dst;
preempt_enable();
}
{
struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
struct bch_fs_usage *p;
unsigned r, b;
for (r = 0; r < BCH_REPLICAS_MAX; r++) {
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(replicas[r].data[b],
"replicas[%i].data[%s]",
r, bch2_data_types[b]);
copy_fs_field(replicas[r].ec_data,
"replicas[%i].ec_data", r);
copy_fs_field(replicas[r].persistent_reserved,
"replicas[%i].persistent_reserved", r);
}
for (b = 0; b < BCH_DATA_NR; b++)
copy_fs_field(buckets[b],
"buckets[%s]", bch2_data_types[b]);
for_each_possible_cpu(cpu) {
p = per_cpu_ptr(c->usage[0], cpu);
memset(p, 0, offsetof(typeof(*p), online_reserved));
}
preempt_disable();
p = this_cpu_ptr(c->usage[0]);
memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
preempt_enable();
}
out:
percpu_up_write(&c->usage_lock);
#undef copy_field
#undef copy_fs_field
#undef copy_dev_field
#undef copy_bucket_field
}
static int bch2_gc_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
BUG_ON(c->usage[1]);
c->usage[1] = alloc_percpu(struct bch_fs_usage);
if (!c->usage[1])
return -ENOMEM;
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!ca->buckets[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
}
c->usage_cached = __bch2_fs_usage_read(c);
for_each_possible_cpu(cpu) {
struct bch_fs_usage *p =
per_cpu_ptr(c->usage_percpu, cpu);
percpu_down_write(&c->usage_lock);
memset(p->replicas, 0, sizeof(p->replicas));
memset(p->buckets, 0, sizeof(p->buckets));
}
for_each_member_device(ca, c, i) {
struct bucket_array *dst = __bucket_array(ca, 1);
struct bucket_array *src = __bucket_array(ca, 0);
size_t b;
dst->first_bucket = src->first_bucket;
dst->nbuckets = src->nbuckets;
for (b = 0; b < src->nbuckets; b++)
dst->b[b]._mark.gen = src->b[b].mark.gen;
};
percpu_up_write(&c->usage_lock);
/* Clear bucket marks: */
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
bucket_cmpxchg(buckets->b + b, new, ({
new.owned_by_allocator = 0;
new.data_type = 0;
new.cached_sectors = 0;
new.dirty_sectors = 0;
new.stripe = 0;
}));
ca->oldest_gens[b] = new.gen;
}
up_read(&ca->bucket_lock);
}
return 0;
}
/**
* bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
* bch2_gc - walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
void bch2_gc(struct bch_fs *c)
int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
{
struct bch_dev *ca;
u64 start_time = local_clock();
unsigned i;
unsigned i, iter = 0;
int ret;
/*
* Walk _all_ references to buckets, and recompute them:
*
* Order matters here:
* - Concurrent GC relies on the fact that we have a total ordering for
* everything that GC walks - see gc_will_visit_node(),
* gc_will_visit_root()
*
* - also, references move around in the course of index updates and
* various other crap: everything needs to agree on the ordering
* references are allowed to move around in - e.g., we're allowed to
* start with a reference owned by an open_bucket (the allocator) and
* move it to the btree, but not the reverse.
*
* This is necessary to ensure that gc doesn't miss references that
* move around - if references move backwards in the ordering GC
* uses, GC could skip past them
*/
trace_gc_start(c);
/*
* Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
* gc_lock if sectors_available goes to 0:
*/
bch2_recalc_sectors_available(c);
down_write(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
again:
ret = bch2_gc_start(c);
if (ret)
goto out;
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, NULL, false);
if (ret) {
bch_err(c, "btree gc failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
ret = bch2_gc_btrees(c, journal, initial);
if (ret)
goto out;
}
bch2_mark_pending_btree_node_frees(c);
bch2_mark_allocator_buckets(c);
/* Indicates that gc is no longer in progress: */
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
c->gc_count++;
out:
if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
/*
* XXX: make sure gens we fixed got saved
*/
if (iter++ <= 2) {
bch_info(c, "Fixed gens, restarting mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
}
if (!ret)
bch2_gc_done(c, initial);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_START));
bch2_gc_free(c);
up_write(&c->gc_lock);
if (!ret && initial)
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
trace_gc_end(c);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
@ -724,6 +896,7 @@ void bch2_gc(struct bch_fs *c)
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
return ret;
}
/* Btree coalescing */
@ -1039,9 +1212,6 @@ void bch2_coalesce(struct bch_fs *c)
{
enum btree_id id;
if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
return;
down_read(&c->gc_lock);
trace_gc_coalesce_start(c);
@ -1053,7 +1223,6 @@ void bch2_coalesce(struct bch_fs *c)
if (ret) {
if (ret != -ESHUTDOWN)
bch_err(c, "btree coalescing failed: %d", ret);
set_bit(BCH_FS_GC_FAILURE, &c->flags);
return;
}
}
@ -1068,6 +1237,7 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
set_freezable();
@ -1101,7 +1271,9 @@ static int bch2_gc_thread(void *arg)
last = atomic_long_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
bch2_gc(c);
ret = bch2_gc(c, NULL, false);
if (ret)
bch_err(c, "btree gc failed: %i", ret);
debug_check_no_locks_held();
}
@ -1142,30 +1314,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
{
unsigned iter = 0;
int ret = 0;
down_write(&c->gc_lock);
again:
bch2_gc_start(c);
bch2_mark_superblocks(c);
ret = bch2_gc_btrees(c, journal, true);
if (ret)
goto err;
if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
ret = -EINVAL;
goto err;
}
bch_info(c, "Fixed gens, restarting initial mark and sweep:");
clear_bit(BCH_FS_FIXED_GENS, &c->flags);
goto again;
}
int ret = bch2_gc(c, journal, true);
/*
* Skip past versions that might have possibly been used (as nonces),
@ -1174,9 +1323,5 @@ int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
if (c->sb.encryption_type)
atomic64_add(1 << 16, &c->key_version);
gc_pos_set(c, gc_phase(GC_PHASE_DONE));
set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
err:
up_write(&c->gc_lock);
return ret;
}

View file

@ -7,7 +7,7 @@
enum bkey_type;
void bch2_coalesce(struct bch_fs *);
void bch2_gc(struct bch_fs *);
int bch2_gc(struct bch_fs *, struct list_head *, bool);
void bch2_gc_thread_stop(struct bch_fs *);
int bch2_gc_thread_start(struct bch_fs *);
int bch2_initial_gc(struct bch_fs *, struct list_head *);
@ -105,14 +105,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
};
}
static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
bool ret;
do {
seq = read_seqcount_begin(&c->gc_pos_lock);
ret = gc_pos_cmp(c->gc_pos, pos) < 0;
ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
} while (read_seqcount_retry(&c->gc_pos_lock, seq));
return ret;

View file

@ -160,7 +160,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
{
struct bch_fs *c = as->c;
struct pending_btree_node_free *d;
unsigned replicas;
/*
* btree_update lock is only needed here to avoid racing with
@ -178,15 +177,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
BUG_ON(d->index_update_done);
d->index_update_done = true;
/*
* Btree nodes are accounted as freed in bch_alloc_stats when they're
* freed from the index:
*/
replicas = bch2_extent_nr_dirty_ptrs(k);
if (replicas)
stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
c->opts.btree_node_size * replicas;
/*
* We're dropping @k from the btree, but it's still live until the
* index update is persistent so we need to keep a reference around for
@ -208,15 +198,16 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
* bch2_mark_key() compares the current gc pos to the pos we're
* moving this reference from, hence one comparison here:
*/
if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct bch_fs_usage tmp = { 0 };
if (gc_pos_cmp(c->gc_pos, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id)) >= 0 &&
gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
struct gc_pos pos = { 0 };
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&d->key),
false, 0, b
? gc_pos_btree_node(b)
: gc_pos_btree_root(as->btree_id),
&tmp, 0, 0);
false, 0, pos,
NULL, 0, BCH_BUCKET_MARK_GC);
/*
* Don't apply tmp - pending deletes aren't tracked in
* bch_alloc_stats:
@ -287,19 +278,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
static void bch2_btree_node_free_ondisk(struct bch_fs *c,
struct pending_btree_node_free *pending)
{
struct bch_fs_usage stats = { 0 };
BUG_ON(!pending->index_update_done);
bch2_mark_key(c, BKEY_TYPE_BTREE,
bkey_i_to_s_c(&pending->key),
false, 0,
gc_phase(GC_PHASE_PENDING_DELETE),
&stats, 0, 0);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
*/
NULL, 0, 0);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@ -1939,6 +1924,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
btree_interior_update_add_node_reference(as, b);
/*
* XXX: the rest of the update path treats this like we're actually
* inserting a new node and deleting the existing node, so the
* reservation needs to include enough space for @b
*
* that is actually sketch as fuck though and I am surprised the code
* seems to work like that, definitely need to go back and rework it
* into something saner.
*
* (I think @b is just getting double counted until the btree update
* finishes and "deletes" @b on disk)
*/
ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
c->opts.btree_node_size *
bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)),
BCH_DISK_RESERVATION_NOFAIL|
BCH_DISK_RESERVATION_GC_LOCK_HELD);
BUG_ON(ret);
parent = btree_node_parent(iter, b);
if (parent) {
if (new_hash) {

View file

@ -85,8 +85,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
static void bch2_fs_stats_verify(struct bch_fs *c)
{
struct bch_fs_usage stats =
__bch2_fs_usage_read(c);
struct bch_fs_usage stats =_bch2_fs_usage_read(c);
unsigned i, j;
for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@ -209,43 +208,24 @@ do { \
_acc; \
})
#define bch2_usage_read_cached(_c, _cached, _uncached) \
({ \
typeof(_cached) _ret; \
unsigned _seq; \
\
do { \
_seq = read_seqcount_begin(&(_c)->gc_pos_lock); \
_ret = (_c)->gc_pos.phase == GC_PHASE_DONE \
? bch2_usage_read_raw(_uncached) \
: (_cached); \
} while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \
\
_ret; \
})
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
{
return bch2_usage_read_raw(ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[gc]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
return bch2_usage_read_raw(ca->usage[0]);
}
struct bch_fs_usage
__bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
{
return bch2_usage_read_raw(c->usage_percpu);
return bch2_usage_read_raw(c->usage[gc]);
}
struct bch_fs_usage
bch2_fs_usage_read(struct bch_fs *c)
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
{
return bch2_usage_read_cached(c,
c->usage_cached,
c->usage_percpu);
return bch2_usage_read_raw(c->usage[0]);
}
struct fs_usage_sum {
@ -327,13 +307,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
: m.data_type;
}
static bool bucket_became_unavailable(struct bch_fs *c,
struct bucket_mark old,
static bool bucket_became_unavailable(struct bucket_mark old,
struct bucket_mark new)
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
(!c || c->gc_pos.phase == GC_PHASE_DONE);
!is_available_bucket(new);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@ -364,11 +342,13 @@ void bch2_fs_usage_apply(struct bch_fs *c,
percpu_down_read(&c->usage_lock);
preempt_disable();
/* online_reserved not subject to gc: */
this_cpu_add(c->usage_percpu->online_reserved, stats->online_reserved);
this_cpu_add(c->usage[0]->online_reserved, stats->online_reserved);
stats->online_reserved = 0;
if (!gc_will_visit(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
if (gc_visited(c, gc_pos))
bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
bch2_fs_stats_verify(c);
preempt_enable();
@ -378,8 +358,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *stats,
struct bucket_mark old, struct bucket_mark new)
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
{
struct bch_dev_usage *dev_usage;
@ -391,14 +372,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_data_types[old.data_type],
bch2_data_types[new.data_type]);
stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage = this_cpu_ptr(ca->usage[gc]);
dev_usage->buckets[bucket_type(old)]--;
dev_usage->buckets[bucket_type(new)]++;
if (bucket_type(old) != bucket_type(new)) {
if (bucket_type(old)) {
fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
dev_usage->buckets[bucket_type(old)]--;
} else {
fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
dev_usage->buckets[bucket_type(new)]++;
}
}
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
@ -425,21 +410,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
({ \
struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
\
bch2_dev_usage_update(c, ca, stats, _old, new); \
bch2_dev_usage_update(c, ca, stats, _old, new, gc); \
_old; \
})
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
*old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
BUG_ON(!is_available_bucket(new));
@ -450,38 +432,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
new.gen++;
}));
/*
* This isn't actually correct yet, since fs usage is still
* uncompressed sectors:
*/
stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
}
/*
 * bch2_invalidate_bucket - invalidate bucket @b of @ca in the primary
 * (non-gc) set of marks.
 *
 * @old receives the bucket mark as it was before the update. The caller must
 * hold c->usage_lock (asserted below).
 */
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark *old)
{
percpu_rwsem_assert_held(&c->usage_lock);
/* gc == false: operate on the index 0 bucket array and usage counters */
__bch2_invalidate_bucket(c, ca, b, old, false);
/* Trace only when the previous mark was not allocator-owned and had cached sectors */
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, bucket_to_sector(ca, b),
old->cached_sectors);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
bool gc)
{
struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
struct bucket *g;
struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
percpu_rwsem_assert_held(&c->usage_lock);
g = bucket(ca, b);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
c->gc_pos.phase == GC_PHASE_DONE);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
}
/*
 * bch2_mark_alloc_bucket - set or clear the owned_by_allocator mark on bucket
 * @b of @ca.
 *
 * Which copy of the marks is updated depends on @flags and @pos:
 *  - without BCH_BUCKET_MARK_GC, the primary (gc == false) copy is updated;
 *  - the gc (gc == true) copy is updated when BCH_BUCKET_MARK_GC is set, or
 *    when gc_visited(c, pos) returns true.
 * So a call without BCH_BUCKET_MARK_GC may update both copies, while a call
 * with it updates only the gc copy.
 *
 * The caller must hold c->usage_lock (asserted below).
 */
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator,
struct gc_pos pos, unsigned flags)
{
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
}
#define checked_add(a, b) \
@ -491,37 +484,49 @@ do { \
BUG_ON((a) != _res); \
} while (0)
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, bool gc)
{
struct bch_fs_usage *stats;
struct bucket *g;
struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark old, new;
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
fs_usage->replicas[0].data[type] += sectors;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, enum bch_data_type type,
unsigned sectors, struct gc_pos pos,
unsigned flags)
{
BUG_ON(type != BCH_DATA_SB &&
type != BCH_DATA_JOURNAL);
preempt_disable();
if (likely(c)) {
percpu_rwsem_assert_held(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
return;
preempt_disable();
stats = this_cpu_ptr(c->usage_percpu);
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
new.data_type = type;
checked_add(new.dirty_sectors, sectors);
}));
stats->replicas[0].data[type] += sectors;
preempt_enable();
if (!(flags & BCH_BUCKET_MARK_GC))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
false);
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos))
__bch2_mark_metadata_bucket(c, ca, b, type, sectors,
true);
} else {
struct bucket *g;
struct bucket_mark old, new;
rcu_read_lock();
g = bucket(ca, b);
@ -533,8 +538,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
rcu_read_unlock();
}
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
preempt_enable();
}
static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@ -579,23 +583,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct extent_ptr_decoded p,
s64 sectors, enum bch_data_type data_type,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
struct bucket_mark old, new;
struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
struct bucket *g = PTR_BUCKET(ca, &p.ptr);
size_t b = PTR_BUCKET_NR(ca, &p.ptr);
struct bucket *g = __bucket(ca, b, gc);
u64 v;
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.journal_seq_valid = 1;
new.journal_seq = journal_seq;
}));
return;
}
v = atomic64_read(&g->_mark.v);
do {
new.v.counter = old.v.counter = v;
@ -637,10 +633,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
BUG_ON(!gc && bucket_became_unavailable(old, new));
}
static void bch2_mark_stripe_ptr(struct bch_fs *c,
@ -688,9 +683,9 @@ static void bch2_mark_stripe_ptr(struct bch_fs *c,
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, enum bch_data_type data_type,
struct gc_pos pos,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
BUG_ON(!sectors);
@ -712,7 +707,7 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
s64 adjusted_disk_sectors = disk_sectors;
bch2_mark_pointer(c, e, p, disk_sectors, data_type,
stats, journal_seq, flags);
stats, journal_seq, flags, gc);
if (!p.ptr.cached)
for (i = 0; i < p.ec_nr; i++)
@ -758,21 +753,20 @@ static void bucket_set_stripe(struct bch_fs *c,
const struct bch_stripe *v,
bool enabled,
struct bch_fs_usage *fs_usage,
u64 journal_seq)
u64 journal_seq,
bool gc)
{
unsigned i;
for (i = 0; i < v->nr_blocks; i++) {
const struct bch_extent_ptr *ptr = v->ptrs + i;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
struct bucket *g;
size_t b = PTR_BUCKET_NR(ca, ptr);
struct bucket *g = __bucket(ca, b, gc);
struct bucket_mark new, old;
BUG_ON(ptr_stale(ca, ptr));
rcu_read_lock();
g = PTR_BUCKET(ca, ptr);
old = bucket_cmpxchg(g, new, ({
new.stripe = enabled;
if (journal_seq) {
@ -780,18 +774,18 @@ static void bucket_set_stripe(struct bch_fs *c,
new.journal_seq = journal_seq;
}
}));
rcu_read_unlock();
BUG_ON(old.stripe == enabled);
bch2_dev_usage_update(c, ca, fs_usage, old, new);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
}
}
static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
bool inserting, struct gc_pos pos,
bool inserting,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags)
u64 journal_seq, unsigned flags,
bool gc)
{
switch (k.k->type) {
case BCH_STRIPE: {
@ -820,12 +814,40 @@ static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
else
bch2_stripes_heap_del(c, m, idx);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0);
bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
break;
}
}
}
/*
 * __bch2_mark_key - dispatch a key to the marking helper for its type.
 *
 * @type selects the helper:
 *  - BKEY_TYPE_BTREE:   bch2_mark_extent() as BCH_DATA_BTREE; the sector count
 *                       is +c->opts.btree_node_size when @inserting and
 *                       -c->opts.btree_node_size otherwise (@sectors is not
 *                       used for this type)
 *  - BKEY_TYPE_EXTENTS: bch2_mark_extent() as BCH_DATA_USER with @sectors
 *  - BKEY_TYPE_EC:      bch2_mark_stripe() with @inserting
 *  - anything else:     nothing is done
 *
 * @stats, @journal_seq, @flags and @gc are forwarded unchanged to the helper.
 */
static void __bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags,
bool gc)
{
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
stats, journal_seq, flags, gc);
break;
case BKEY_TYPE_EC:
bch2_mark_stripe(c, k, inserting,
stats, journal_seq, flags, gc);
break;
default:
break;
}
}
void bch2_mark_key(struct bch_fs *c,
enum bkey_type type, struct bkey_s_c k,
bool inserting, s64 sectors,
@ -833,61 +855,23 @@ void bch2_mark_key(struct bch_fs *c,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
/*
* synchronization w.r.t. GC:
*
* Normally, bucket sector counts/marks are updated on the fly, as
* references are added/removed from the btree, the lists of buckets the
* allocator owns, other metadata buckets, etc.
*
* When GC is in progress and going to mark this reference, we do _not_
* mark this reference here, to avoid double counting - GC will count it
* when it gets to it.
*
* To know whether we should mark a given reference (GC either isn't
* running, or has already marked references at this position) we
* construct a total order for everything GC walks. Then, we can simply
* compare the position of the reference we're marking - @pos - with
* GC's current position. If GC is going to mark this reference, GC's
* current position will be less than @pos; if GC's current position is
* greater than @pos GC has either already walked this position, or
* isn't running.
*
* To avoid racing with GC's position changing, we have to deal with
* - GC's position being set to GC_POS_MIN when GC starts:
* usage_lock guards against this
* - GC's position overtaking @pos: we guard against this with
* whatever lock protects the data structure the reference lives in
* (e.g. the btree node lock, or the relevant allocator lock).
*/
percpu_down_read(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos))
flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
if (!stats)
stats = this_cpu_ptr(c->usage_percpu);
if (!(flags & BCH_BUCKET_MARK_GC)) {
if (!stats)
stats = this_cpu_ptr(c->usage[0]);
switch (type) {
case BKEY_TYPE_BTREE:
bch2_mark_extent(c, k, inserting
? c->opts.btree_node_size
: -c->opts.btree_node_size,
BCH_DATA_BTREE,
pos, stats, journal_seq, flags);
break;
case BKEY_TYPE_EXTENTS:
bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
pos, stats, journal_seq, flags);
break;
case BKEY_TYPE_EC:
bch2_mark_stripe(c, k, inserting,
pos, stats, journal_seq, flags);
break;
default:
break;
__bch2_mark_key(c, type, k, inserting, sectors,
stats, journal_seq, flags, false);
}
if ((flags & BCH_BUCKET_MARK_GC) ||
gc_visited(c, pos)) {
__bch2_mark_key(c, type, k, inserting, sectors,
this_cpu_ptr(c->usage[1]),
journal_seq, flags, true);
}
percpu_up_read(&c->usage_lock);
}
@ -963,28 +947,20 @@ void bch2_mark_update(struct btree_insert *trans,
/* Disk reservations: */
static u64 __recalc_sectors_available(struct bch_fs *c)
static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
}
/* Used by gc when it's starting: */
void bch2_recalc_sectors_available(struct bch_fs *c)
{
percpu_down_write(&c->usage_lock);
atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
percpu_up_write(&c->usage_lock);
}
void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
percpu_down_read(&c->usage_lock);
this_cpu_sub(c->usage_percpu->online_reserved,
this_cpu_sub(c->usage[0]->online_reserved,
res->sectors);
bch2_fs_stats_verify(c);
@ -1005,7 +981,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
percpu_down_read(&c->usage_lock);
preempt_disable();
stats = this_cpu_ptr(c->usage_percpu);
stats = this_cpu_ptr(c->usage[0]);
if (sectors <= stats->available_cache)
goto out;
@ -1055,7 +1031,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
}
percpu_down_write(&c->usage_lock);
sectors_available = __recalc_sectors_available(c);
sectors_available = bch2_recalc_sectors_available(c);
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
@ -1110,7 +1086,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve);
bool resize = ca->buckets != NULL,
bool resize = ca->buckets[0] != NULL,
start_copygc = ca->copygc_thread != NULL;
int ret = -ENOMEM;
unsigned i;
@ -1170,7 +1146,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
BITS_TO_LONGS(n) * sizeof(unsigned long));
}
rcu_assign_pointer(ca->buckets, buckets);
rcu_assign_pointer(ca->buckets[0], buckets);
buckets = old_buckets;
swap(ca->oldest_gens, oldest_gens);
@ -1239,16 +1215,16 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
kvpfree(ca->buckets_dirty,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
kvpfree(rcu_dereference_protected(ca->buckets, 1),
kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage_percpu);
free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
return -ENOMEM;
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;

View file

@ -29,23 +29,34 @@
_old; \
})
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
bool gc)
{
return rcu_dereference_check(ca->buckets,
return rcu_dereference_check(ca->buckets[gc],
!ca->fs ||
percpu_rwsem_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
}
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
struct bucket_array *buckets = bucket_array(ca);
return __bucket_array(ca, false);
}
/*
 * Return a pointer to bucket number @b of device @ca, looked up in the
 * bucket array that __bucket_array() returns for @gc.
 *
 * BUGs if @b lies outside [first_bucket, nbuckets) of that array.
 */
static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
{
	struct bucket_array *arr = __bucket_array(ca, gc);

	BUG_ON(b >= arr->nbuckets || b < arr->first_bucket);

	return &arr->b[b];
}
/*
 * Return a pointer to bucket number @b of device @ca; shorthand for
 * __bucket() with its gc argument set to false.
 */
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
	const bool gc = false;

	return __bucket(ca, b, gc);
}
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
@ -129,7 +140,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
/* Device usage: */
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
@ -168,7 +179,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
/* Filesystem usage: */
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
@ -207,17 +218,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
struct gc_pos, unsigned);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2)
#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3)
#define BCH_BUCKET_MARK_GC (1 << 1)
void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
bool, s64, struct gc_pos,
struct bch_fs_usage *, u64, unsigned);
void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
void bch2_recalc_sectors_available(struct bch_fs *);
void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
static inline void bch2_disk_reservation_put(struct bch_fs *c,

View file

@ -64,8 +64,6 @@ struct bch_dev_usage {
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 online_reserved;
u64 available_cache;
struct {
u64 data[BCH_DATA_NR];
@ -74,6 +72,10 @@ struct bch_fs_usage {
} replicas[BCH_REPLICAS_MAX];
u64 buckets[BCH_DATA_NR];
/* fields starting here aren't touched by gc: */
u64 online_reserved;
u64 available_cache;
};
/*

View file

@ -782,9 +782,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
0);
if (c) {
spin_unlock(&c->journal.lock);

View file

@ -374,7 +374,7 @@ static void bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
percpu_free_rwsem(&c->usage_lock);
free_percpu(c->usage_percpu);
free_percpu(c->usage[0]);
mempool_exit(&c->btree_iters_pool);
mempool_exit(&c->btree_bounce_pool);
bioset_exit(&c->btree_bio);
@ -606,7 +606,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
!(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
percpu_init_rwsem(&c->usage_lock) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
btree_bytes(c)) ||
@ -1028,8 +1028,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
return ret;
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
mutex_unlock(&c->sb_lock);
bch2_dev_sysfs_online(c, ca);
@ -1314,7 +1313,7 @@ static void dev_usage_clear(struct bch_dev *ca)
for_each_possible_cpu(cpu) {
struct bch_dev_usage *p =
per_cpu_ptr(ca->usage_percpu, cpu);
per_cpu_ptr(ca->usage[0], cpu);
memset(p, 0, sizeof(*p));
}
@ -1375,8 +1374,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
* allocate the journal, reset all the marks, then remark after we
* attach...
*/
bch2_mark_dev_superblock(ca->fs, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(ca->fs, ca, 0);
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
@ -1435,8 +1433,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_mark_dev_superblock(c, ca,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
bch2_mark_dev_superblock(c, ca, 0);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

View file

@ -478,7 +478,7 @@ STORE(__bch2_fs)
bch2_coalesce(c);
if (attr == &sysfs_trigger_gc)
bch2_gc(c);
bch2_gc(c, NULL, false);
if (attr == &sysfs_prune_cache) {
struct shrink_control sc;