From 72c270612bd33192fa836ad0f2939af1ca218292 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 5 Jun 2013 06:24:39 -0700 Subject: [PATCH] bcache: Write out full stripes Now that we're tracking dirty data per stripe, we can add two optimizations for raid5/6: * If a stripe is already dirty, force writes to that stripe to writeback mode - to help build up full stripes of dirty data * When flushing dirty data, preferentially write out full stripes first if there are any. Signed-off-by: Kent Overstreet --- drivers/md/bcache/bcache.h | 3 +-- drivers/md/bcache/btree.c | 19 ++++++++------- drivers/md/bcache/btree.h | 9 +++---- drivers/md/bcache/debug.c | 4 ++-- drivers/md/bcache/movinggc.c | 5 ++-- drivers/md/bcache/request.c | 23 ++++++------------ drivers/md/bcache/sysfs.c | 8 +++++++ drivers/md/bcache/writeback.c | 44 +++++++++++++++++++++++++++++++++-- drivers/md/bcache/writeback.h | 43 ++++++++++++++++++++++++++++++++++ 9 files changed, 121 insertions(+), 37 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index dbddef0cdb59..342ba86c6e4f 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -387,8 +387,6 @@ struct keybuf_key { typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); struct keybuf { - keybuf_pred_fn *key_predicate; - struct bkey last_scanned; spinlock_t lock; @@ -532,6 +530,7 @@ struct cached_dev { unsigned sequential_merge:1; unsigned verify:1; + unsigned partial_stripes_expensive:1; unsigned writeback_metadata:1; unsigned writeback_running:1; unsigned char writeback_percent; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index b93cf56260a4..09fb8a2f43da 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -2252,7 +2252,8 @@ static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, } static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, - struct keybuf *buf, struct bkey *end) + struct keybuf *buf, struct bkey *end, + keybuf_pred_fn *pred) { struct btree_iter iter; bch_btree_iter_init(b, &iter, &buf->last_scanned); @@ -2271,7 +2272,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (bkey_cmp(&buf->last_scanned, end) >= 0) break; - if (buf->key_predicate(buf, k)) { + if (pred(buf, k)) { struct keybuf_key *w; spin_lock(&buf->lock); @@ -2290,7 +2291,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, if (!k) break; - btree(refill_keybuf, k, b, op, buf, end); + btree(refill_keybuf, k, b, op, buf, end, pred); /* * Might get an error here, but can't really do anything * and it'll get logged elsewhere. Just read what we @@ -2308,7 +2309,7 @@ static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct btree_op op; @@ -2316,7 +2317,7 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, cond_resched(); - btree_root(refill_keybuf, c, &op, buf, end); + btree_root(refill_keybuf, c, &op, buf, end, pred); closure_sync(&op.cl); pr_debug("found %s keys from %llu:%llu to %llu:%llu", @@ -2402,7 +2403,8 @@ struct keybuf_key *bch_keybuf_next(struct keybuf *buf) struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, - struct bkey *end) + struct bkey *end, + keybuf_pred_fn *pred) { struct keybuf_key *ret; @@ -2416,15 +2418,14 @@ struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, break; } - bch_refill_keybuf(c, buf, end); + bch_refill_keybuf(c, buf, end, pred); } return ret; } -void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +void bch_keybuf_init(struct keybuf *buf) { - buf->key_predicate = fn; buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 2b016b93cad4..f66d69a7baf1 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -391,13 +391,14 @@ void bch_moving_gc(struct closure *); int bch_btree_check(struct cache_set *, struct btree_op *); uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); -void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); -void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +void bch_keybuf_init(struct keybuf *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *, + keybuf_pred_fn *); bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, struct bkey *); void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); -struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, - struct keybuf *, struct bkey *); +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, + struct bkey *, keybuf_pred_fn *); #endif diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index 82e3a07771ec..1c8fd319846e 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -357,7 +357,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf, if (i->bytes) break; - w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred); if (!w) break; @@ -380,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file) file->private_data = i; i->c = c; - bch_keybuf_init(&i->keys, dump_pred); + bch_keybuf_init(&i->keys); i->keys.last_scanned = KEY(0, 0, 0); return 0; diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 04f6b97ffda6..a241e9fd4f7f 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -136,7 +136,8 @@ static void read_moving(struct closure *cl) /* XXX: if we error, background writeback could stall indefinitely */ while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { - w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, + &MAX_KEY, moving_pred); if (!w) break; @@ -248,5 +249,5 @@ void bch_moving_gc(struct closure *cl) void bch_moving_init_cache_set(struct cache_set *c) { - bch_keybuf_init(&c->moving_gc_keys, moving_pred); + bch_keybuf_init(&c->moving_gc_keys); } diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 017c95fced8e..17bd59704eba 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -22,8 +22,6 @@ #define CUTOFF_CACHE_ADD 95 #define CUTOFF_CACHE_READA 90 -#define CUTOFF_WRITEBACK 50 -#define CUTOFF_WRITEBACK_SYNC 75 struct kmem_cache *bch_search_cache; @@ -998,17 +996,6 @@ static void cached_dev_write_complete(struct closure *cl) cached_dev_bio_complete(cl); } -static bool should_writeback(struct cached_dev *dc, struct bio *bio) -{ - unsigned threshold = (bio->bi_rw & REQ_SYNC) - ? CUTOFF_WRITEBACK_SYNC - : CUTOFF_WRITEBACK; - - return !atomic_read(&dc->disk.detaching) && - cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && - dc->disk.c->gc_stats.in_use < threshold; -} - static void request_write(struct cached_dev *dc, struct search *s) { struct closure *cl = &s->cl; @@ -1030,12 +1017,16 @@ static void request_write(struct cached_dev *dc, struct search *s) if (bio->bi_rw & REQ_DISCARD) goto skip; + if (should_writeback(dc, s->orig_bio, + cache_mode(dc, bio), + s->op.skip)) { + s->op.skip = false; + s->writeback = true; + } + if (s->op.skip) goto skip; - if (should_writeback(dc, s->orig_bio)) - s->writeback = true; - trace_bcache_write(s->orig_bio, s->writeback, s->op.skip); if (!s->writeback) { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index cf8d91ec3238..70c6dff0d0cd 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse); rw_attribute(writeback_rate_d_smooth); read_attribute(writeback_rate_debug); +read_attribute(stripe_size); +read_attribute(partial_stripes_expensive); + rw_attribute(synchronous); rw_attribute(journal_delay_ms); rw_attribute(discard); @@ -147,6 +150,9 @@ SHOW(__bch_cached_dev) sysfs_hprint(dirty_data, bcache_dev_sectors_dirty(&dc->disk) << 9); + sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9); + var_printf(partial_stripes_expensive, "%u"); + var_printf(sequential_merge, "%i"); var_hprint(sequential_cutoff); var_hprint(readahead); @@ -286,6 +292,8 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_rate_d_smooth, &sysfs_writeback_rate_debug, &sysfs_dirty_data, + &sysfs_stripe_size, + &sysfs_partial_stripes_expensive, &sysfs_sequential_cutoff, &sysfs_sequential_merge, &sysfs_clear_stats, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index dd815475c524..d81ee5ccc726 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -108,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k) return KEY_DIRTY(k); } +static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k) +{ + uint64_t stripe; + unsigned nr_sectors = KEY_SIZE(k); + struct cached_dev *dc = container_of(buf, struct cached_dev, + writeback_keys); + unsigned stripe_size = 1 << dc->disk.stripe_size_bits; + + if (!KEY_DIRTY(k)) + return false; + + stripe = KEY_START(k) >> dc->disk.stripe_size_bits; + while (1) { + if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) != + stripe_size) + return false; + + if (nr_sectors <= stripe_size) + return true; + + nr_sectors -= stripe_size; + stripe++; + } +} + static void dirty_init(struct keybuf_key *w) { struct dirty_io *io = w->private; @@ -152,7 +177,22 @@ static void refill_dirty(struct closure *cl) searched_from_start = true; } - bch_refill_keybuf(dc->disk.c, buf, &end); + if (dc->partial_stripes_expensive) { + uint64_t i; + + for (i = 0; i < dc->disk.nr_stripes; i++) + if (atomic_read(dc->disk.stripe_sectors_dirty + i) == + 1 << dc->disk.stripe_size_bits) + goto full_stripes; + + goto normal_refill; +full_stripes: + bch_refill_keybuf(dc->disk.c, buf, &end, + dirty_full_stripe_pred); + } else { +normal_refill: + bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); + } if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { /* Searched the entire btree - delay awhile */ @@ -446,7 +486,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) closure_init_unlocked(&dc->writeback); init_rwsem(&dc->writeback_lock); - bch_keybuf_init(&dc->writeback_keys, dirty_pred); + bch_keybuf_init(&dc->writeback_keys); dc->writeback_metadata = true; dc->writeback_running = true; diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 5ce9771df047..c91f61bb95b6 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -1,6 +1,9 @@ #ifndef _BCACHE_WRITEBACK_H #define _BCACHE_WRITEBACK_H +#define CUTOFF_WRITEBACK 40 +#define CUTOFF_WRITEBACK_SYNC 70 + static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) { uint64_t i, ret = 0; @@ -11,6 +14,46 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline bool bcache_dev_stripe_dirty(struct bcache_device *d, + uint64_t offset, + unsigned nr_sectors) +{ + uint64_t stripe = offset >> d->stripe_size_bits; + + while (1) { + if (atomic_read(d->stripe_sectors_dirty + stripe)) + return true; + + if (nr_sectors <= 1 << d->stripe_size_bits) + return false; + + nr_sectors -= 1 << d->stripe_size_bits; + stripe++; + } +} + +static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, + unsigned cache_mode, bool would_skip) +{ + unsigned in_use = dc->disk.c->gc_stats.in_use; + + if (cache_mode != CACHE_MODE_WRITEBACK || + atomic_read(&dc->disk.detaching) || + in_use > CUTOFF_WRITEBACK_SYNC) + return false; + + if (dc->partial_stripes_expensive && + bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector, + bio_sectors(bio))) + return true; + + if (would_skip) + return false; + + return bio->bi_rw & REQ_SYNC || + in_use <= CUTOFF_WRITEBACK; +} + void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); void bch_writeback_queue(struct cached_dev *); void bch_writeback_add(struct cached_dev *);