dm bio prison v1: improve concurrent IO performance

Split the bio prison into multiple regions, with a separate rbtree and
associated lock for each region.
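
For illustration only (not part of the patch): a minimal userspace model of the
key-to-region mapping the patch introduces. The real structures in the
dm-bio-prison-v1.c diff below hold a spinlock_t and an rb_root per region; here
a region is just an index.

#include <stdint.h>
#include <stdio.h>

#define BIO_PRISON_MAX_RANGE_SHIFT 10	/* same constants as the patch */
#define NR_LOCKS 64
#define LOCK_MASK (NR_LOCKS - 1)

/* Mirrors the patch's lock_nr(): a key is routed purely by its block_begin. */
static unsigned lock_nr(uint64_t block_begin)
{
	return (block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
}

int main(void)
{
	/*
	 * Blocks in the same 1024-block chunk share a region (and its lock);
	 * the next chunk maps to the next region, wrapping after 64 chunks.
	 */
	printf("%u %u %u %u\n",
	       lock_nr(0), lock_nr(1023), lock_nr(1024), lock_nr(64 * 1024));
	return 0;
}

This prints "0 0 1 0": concurrent IO to different chunks takes different locks,
which is where the scalability win comes from.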

To keep bio prison locking fast without hurting discard performance
too much, the bio prison now stipulates that discards should not
cross a BIO_PRISON_MAX_RANGE boundary.
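
As a concrete example of the restriction, consider a discard covering blocks
[1000, 1100): it starts in one 1024-block chunk and ends in the next, so its
cell could not be guarded by a single region lock. The standalone sketch below
applies the same two conditions the patch's check_range() enforces;
range_is_valid() is a hypothetical helper name (check_range() BUG_ONs instead
of returning a value).

#include <assert.h>
#include <stdint.h>

#define BIO_PRISON_MAX_RANGE       1024
#define BIO_PRISON_MAX_RANGE_SHIFT 10

/* The same two conditions check_range() enforces in the patch. */
static int range_is_valid(uint64_t block_begin, uint64_t block_end)
{
	if (block_end - block_begin > BIO_PRISON_MAX_RANGE)
		return 0;
	if ((block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) !=
	    ((block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT))
		return 0;
	return 1;
}

int main(void)
{
	assert(range_is_valid(0, 1024));	/* exactly one boundary-aligned chunk */
	assert(!range_is_valid(1000, 1100));	/* only 100 blocks, but crosses a boundary */
	return 0;
}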

Because the range of a key (block_end - block_begin) must not exceed
BIO_PRISON_MAX_RANGE, break_up_discard_bio() now ensures the data
range reflected in a PHYSICAL key doesn't exceed BIO_PRISON_MAX_RANGE.
Splitting the thin target's discards (handled with a VIRTUAL key) is
achieved by updating dm-thin.c to set limits->max_discard_sectors in
terms of BIO_PRISON_MAX_RANGE _and_ setting the thin and thin-pool
targets' max_discard_granularity to true.
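
For illustration, a userspace sketch of the boundary-aligned walk that
break_up_discard_bio() now performs over a mapped data range (full version in
the dm-thin.c hunk below); the pool, mapping and bio plumbing are omitted and
the block numbers are made up.

#include <stdint.h>
#include <stdio.h>

#define BIO_PRISON_MAX_RANGE_SHIFT 10

int main(void)
{
	uint64_t data_begin = 1000, data_end = 3000;	/* arbitrary example range */
	uint64_t len, next_boundary;

	while (data_begin < data_end) {
		/* Same arithmetic as the patch: stop at the next 1024-block boundary. */
		next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
				<< BIO_PRISON_MAX_RANGE_SHIFT;
		len = data_end - data_begin < next_boundary - data_begin ?
			data_end - data_begin : next_boundary - data_begin;

		printf("discard [%llu, %llu)\n",
		       (unsigned long long)data_begin,
		       (unsigned long long)(data_begin + len));
		data_begin += len;
	}
	return 0;
}

With this example range it prints [1000, 1024), [1024, 2048) and [2048, 3000):
each piece stays inside one BIO_PRISON_MAX_RANGE chunk, so each PHYSICAL key
passes check_range() and takes exactly one region lock.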

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@kernel.org>
Joe Thornber authored on 2023-03-02 09:35:37 +00:00; committed by Mike Snitzer
parent 06961c487a
commit e2dd8aca2d
3 changed files with 121 additions and 68 deletions

drivers/md/dm-bio-prison-v1.c

@@ -16,11 +16,17 @@
 /*----------------------------------------------------------------*/
 
+#define NR_LOCKS 64
+#define LOCK_MASK (NR_LOCKS - 1)
 #define MIN_CELLS 1024
 
-struct dm_bio_prison {
+struct prison_region {
 	spinlock_t lock;
-	struct rb_root cells;
+	struct rb_root cell;
+} ____cacheline_aligned_in_smp;
+
+struct dm_bio_prison {
+	struct prison_region regions[NR_LOCKS];
 	mempool_t cell_pool;
 };
 
@@ -34,13 +40,17 @@ static struct kmem_cache *_cell_cache;
  */
 struct dm_bio_prison *dm_bio_prison_create(void)
 {
-	struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
 	int ret;
+	unsigned i;
+	struct dm_bio_prison *prison = kzalloc(sizeof(*prison), GFP_KERNEL);
 
 	if (!prison)
 		return NULL;
 
-	spin_lock_init(&prison->lock);
+	for (i = 0; i < NR_LOCKS; i++) {
+		spin_lock_init(&prison->regions[i].lock);
+		prison->regions[i].cell = RB_ROOT;
+	}
 
 	ret = mempool_init_slab_pool(&prison->cell_pool, MIN_CELLS, _cell_cache);
 	if (ret) {
@@ -48,8 +58,6 @@ struct dm_bio_prison *dm_bio_prison_create(void)
 		return NULL;
 	}
 
-	prison->cells = RB_ROOT;
-
 	return prison;
 }
 EXPORT_SYMBOL_GPL(dm_bio_prison_create);
@@ -107,14 +115,26 @@ static int cmp_keys(struct dm_cell_key *lhs,
 	return 0;
 }
 
-static int __bio_detain(struct dm_bio_prison *prison,
+static unsigned lock_nr(struct dm_cell_key *key)
+{
+	return (key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) & LOCK_MASK;
+}
+
+static void check_range(struct dm_cell_key *key)
+{
+	BUG_ON(key->block_end - key->block_begin > BIO_PRISON_MAX_RANGE);
+	BUG_ON((key->block_begin >> BIO_PRISON_MAX_RANGE_SHIFT) !=
+	       ((key->block_end - 1) >> BIO_PRISON_MAX_RANGE_SHIFT));
+}
+
+static int __bio_detain(struct rb_root *root,
 			struct dm_cell_key *key,
 			struct bio *inmate,
 			struct dm_bio_prison_cell *cell_prealloc,
 			struct dm_bio_prison_cell **cell_result)
 {
 	int r;
-	struct rb_node **new = &prison->cells.rb_node, *parent = NULL;
+	struct rb_node **new = &root->rb_node, *parent = NULL;
 
 	while (*new) {
 		struct dm_bio_prison_cell *cell =
@@ -139,7 +159,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
 
 	*cell_result = cell_prealloc;
 	rb_link_node(&cell_prealloc->node, parent, new);
-	rb_insert_color(&cell_prealloc->node, &prison->cells);
+	rb_insert_color(&cell_prealloc->node, root);
 
 	return 0;
 }
@@ -151,10 +171,12 @@ static int bio_detain(struct dm_bio_prison *prison,
 		      struct dm_bio_prison_cell **cell_result)
 {
 	int r;
+	unsigned l = lock_nr(key);
 
-	spin_lock_irq(&prison->lock);
-	r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
-	spin_unlock_irq(&prison->lock);
+	check_range(key);
+	spin_lock_irq(&prison->regions[l].lock);
+	r = __bio_detain(&prison->regions[l].cell, key, inmate, cell_prealloc, cell_result);
+	spin_unlock_irq(&prison->regions[l].lock);
 
 	return r;
 }
@@ -181,11 +203,11 @@ EXPORT_SYMBOL_GPL(dm_get_cell);
 /*
  * @inmates must have been initialised prior to this call
  */
-static void __cell_release(struct dm_bio_prison *prison,
+static void __cell_release(struct rb_root *root,
			   struct dm_bio_prison_cell *cell,
			   struct bio_list *inmates)
 {
-	rb_erase(&cell->node, &prison->cells);
+	rb_erase(&cell->node, root);
 
 	if (inmates) {
 		if (cell->holder)
@@ -198,20 +220,22 @@ void dm_cell_release(struct dm_bio_prison *prison,
 		     struct dm_bio_prison_cell *cell,
 		     struct bio_list *bios)
 {
-	spin_lock_irq(&prison->lock);
-	__cell_release(prison, cell, bios);
-	spin_unlock_irq(&prison->lock);
+	unsigned l = lock_nr(&cell->key);
+
+	spin_lock_irq(&prison->regions[l].lock);
+	__cell_release(&prison->regions[l].cell, cell, bios);
+	spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release);
 
 /*
  * Sometimes we don't want the holder, just the additional bios.
  */
-static void __cell_release_no_holder(struct dm_bio_prison *prison,
+static void __cell_release_no_holder(struct rb_root *root,
				     struct dm_bio_prison_cell *cell,
				     struct bio_list *inmates)
 {
-	rb_erase(&cell->node, &prison->cells);
+	rb_erase(&cell->node, root);
 	bio_list_merge(inmates, &cell->bios);
 }
 
@@ -219,11 +243,12 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
			       struct dm_bio_prison_cell *cell,
			       struct bio_list *inmates)
 {
+	unsigned l = lock_nr(&cell->key);
 	unsigned long flags;
 
-	spin_lock_irqsave(&prison->lock, flags);
-	__cell_release_no_holder(prison, cell, inmates);
-	spin_unlock_irqrestore(&prison->lock, flags);
+	spin_lock_irqsave(&prison->regions[l].lock, flags);
+	__cell_release_no_holder(&prison->regions[l].cell, cell, inmates);
+	spin_unlock_irqrestore(&prison->regions[l].lock, flags);
 }
 EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
 
@@ -248,18 +273,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
			   void *context,
			   struct dm_bio_prison_cell *cell)
 {
-	spin_lock_irq(&prison->lock);
+	unsigned l = lock_nr(&cell->key);
+
+	spin_lock_irq(&prison->regions[l].lock);
 	visit_fn(context, cell);
-	rb_erase(&cell->node, &prison->cells);
-	spin_unlock_irq(&prison->lock);
+	rb_erase(&cell->node, &prison->regions[l].cell);
+	spin_unlock_irq(&prison->regions[l].lock);
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
-static int __promote_or_release(struct dm_bio_prison *prison,
+static int __promote_or_release(struct rb_root *root,
				struct dm_bio_prison_cell *cell)
 {
 	if (bio_list_empty(&cell->bios)) {
-		rb_erase(&cell->node, &prison->cells);
+		rb_erase(&cell->node, root);
 		return 1;
 	}
@@ -271,10 +297,11 @@ int dm_cell_promote_or_release(struct dm_bio_prison *prison,
			       struct dm_bio_prison_cell *cell)
 {
 	int r;
+	unsigned l = lock_nr(&cell->key);
 
-	spin_lock_irq(&prison->lock);
-	r = __promote_or_release(prison, cell);
-	spin_unlock_irq(&prison->lock);
+	spin_lock_irq(&prison->regions[l].lock);
+	r = __promote_or_release(&prison->regions[l].cell, cell);
+	spin_unlock_irq(&prison->regions[l].lock);
 
 	return r;
 }

drivers/md/dm-bio-prison-v1.h

@@ -34,6 +34,16 @@ struct dm_cell_key {
 	dm_block_t block_begin, block_end;
 };
 
+/*
+ * The range of a key (block_end - block_begin) must not
+ * exceed BIO_PRISON_MAX_RANGE.  Also the range must not
+ * cross a similarly sized boundary.
+ *
+ * Must be a power of 2.
+ */
+#define BIO_PRISON_MAX_RANGE 1024
+#define BIO_PRISON_MAX_RANGE_SHIFT 10
+
 /*
  * Treat this as opaque, only in header so callers can manage allocation
  * themselves.

drivers/md/dm-thin.c

@@ -1674,54 +1674,69 @@ static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t
 	struct dm_cell_key data_key;
 	struct dm_bio_prison_cell *data_cell;
 	struct dm_thin_new_mapping *m;
-	dm_block_t virt_begin, virt_end, data_begin;
+	dm_block_t virt_begin, virt_end, data_begin, data_end;
+	dm_block_t len, next_boundary;
 
 	while (begin != end) {
-		r = ensure_next_mapping(pool);
-		if (r)
-			/* we did our best */
-			return;
-
 		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
					      &data_begin, &maybe_shared);
-		if (r)
+		if (r) {
 			/*
 			 * Silently fail, letting any mappings we've
 			 * created complete.
 			 */
 			break;
-
-		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
-		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
-			/* contention, we'll give up with this range */
-			begin = virt_end;
-			continue;
 		}
 
-		/*
-		 * IO may still be going to the destination block. We must
-		 * quiesce before we can do the removal.
-		 */
-		m = get_next_mapping(pool);
-		m->tc = tc;
-		m->maybe_shared = maybe_shared;
-		m->virt_begin = virt_begin;
-		m->virt_end = virt_end;
-		m->data_block = data_begin;
-		m->cell = data_cell;
-		m->bio = bio;
+		data_end = data_begin + (virt_end - virt_begin);
 
 		/*
-		 * The parent bio must not complete before sub discard bios are
-		 * chained to it (see end_discard's bio_chain)!
-		 *
-		 * This per-mapping bi_remaining increment is paired with
-		 * the implicit decrement that occurs via bio_endio() in
-		 * end_discard().
+		 * Make sure the data region obeys the bio prison restrictions.
 		 */
-		bio_inc_remaining(bio);
-		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-			pool->process_prepared_discard(m);
+		while (data_begin < data_end) {
+			r = ensure_next_mapping(pool);
+			if (r)
+				return; /* we did our best */
+
+			next_boundary = ((data_begin >> BIO_PRISON_MAX_RANGE_SHIFT) + 1)
+				<< BIO_PRISON_MAX_RANGE_SHIFT;
+			len = min_t(sector_t, data_end - data_begin, next_boundary - data_begin);
+
+			build_key(tc->td, PHYSICAL, data_begin, data_begin + len, &data_key);
+			if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+				/* contention, we'll give up with this range */
+				data_begin += len;
+				continue;
+			}
+
+			/*
+			 * IO may still be going to the destination block. We must
+			 * quiesce before we can do the removal.
+			 */
+			m = get_next_mapping(pool);
+			m->tc = tc;
+			m->maybe_shared = maybe_shared;
+			m->virt_begin = virt_begin;
+			m->virt_end = virt_begin + len;
+			m->data_block = data_begin;
+			m->cell = data_cell;
+			m->bio = bio;
+
+			/*
+			 * The parent bio must not complete before sub discard bios are
+			 * chained to it (see end_discard's bio_chain)!
+			 *
+			 * This per-mapping bi_remaining increment is paired with
+			 * the implicit decrement that occurs via bio_endio() in
+			 * end_discard().
+			 */
+			bio_inc_remaining(bio);
+			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+				pool->process_prepared_discard(m);
+
+			virt_begin += len;
+			data_begin += len;
+		}
 
 		begin = virt_end;
 	}
@@ -3380,13 +3395,13 @@ static int pool_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	 */
 	if (pf.discard_enabled && pf.discard_passdown) {
 		ti->num_discard_bios = 1;
-
 		/*
 		 * Setting 'discards_supported' circumvents the normal
 		 * stacking of discard limits (this keeps the pool and
 		 * thin devices' discard limits consistent).
 		 */
 		ti->discards_supported = true;
+		ti->max_discard_granularity = true;
 	}
 
 	ti->private = pt;
@@ -4096,7 +4111,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
		    DM_TARGET_IMMUTABLE,
-	.version = {1, 22, 0},
+	.version = {1, 23, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -4261,6 +4276,7 @@ static int thin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
+		ti->max_discard_granularity = true;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -4476,12 +4492,12 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 		return;
 
 	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+	limits->max_discard_sectors = pool->sectors_per_block * BIO_PRISON_MAX_RANGE;
 }
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 22, 0},
+	.version = {1, 23, 0},
 	.module = THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,