Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
dm mpath: eliminate use of spinlock in IO fast-paths
The primary motivation of this commit is to improve the scalability of
DM multipath on large NUMA systems, where contention on the m->lock
spinlock has proven to be a serious bottleneck on really fast storage.

The ability to atomically read a pointer, using lockless_dereference(),
is leveraged in this commit. But all pointer writes are still protected
by the m->lock spinlock (which is fine since these all now occur in the
slow-path).

The following functions no longer require the m->lock spinlock in their
fast-path: multipath_busy(), __multipath_map(), and do_end_io().

And choose_pgpath() is modified to _not_ update m->current_pgpath unless
it also switches the path-group. This is done to avoid needing to take
the m->lock every time __multipath_map() calls choose_pgpath(). But
m->current_pgpath will be reset if it is failed via fail_path().

Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
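To make the locking split concrete, here is a minimal userspace C sketch of the pattern this commit applies (an illustration only, not kernel code: the mpath_stub/pgpath_stub names are hypothetical, and C11 atomics plus a pthread mutex stand in for lockless_dereference() and the m->lock spinlock). Fast-path readers load the shared pointer atomically without taking the lock; every writer still serializes on the lock before publishing a new value.

/* Sketch only: userspace analogue of the lockless-read/locked-write
 * split. In the kernel, lockless_dereference() plays the role of the
 * atomic load and the m->lock spinlock the role of the mutex. */
#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct pgpath_stub { int id; };            /* hypothetical stand-in */

struct mpath_stub {
	_Atomic(struct pgpath_stub *) current_pgpath; /* read locklessly */
	pthread_mutex_t lock;              /* still guards all writes */
};

/* Fast path: no lock taken. acquire ordering is a conservative
 * stand-in for the dependency-ordered load that
 * lockless_dereference() provides. */
static struct pgpath_stub *fast_path_read(struct mpath_stub *m)
{
	return atomic_load_explicit(&m->current_pgpath,
				    memory_order_acquire);
}

/* Slow path: pointer updates remain serialized behind the lock. */
static void slow_path_update(struct mpath_stub *m, struct pgpath_stub *p)
{
	pthread_mutex_lock(&m->lock);
	atomic_store_explicit(&m->current_pgpath, p,
			      memory_order_release);
	pthread_mutex_unlock(&m->lock);
}

int main(void)
{
	static struct pgpath_stub path = { .id = 1 };
	struct mpath_stub m = { NULL, PTHREAD_MUTEX_INITIALIZER };

	slow_path_update(&m, &path);       /* writer: takes the lock */
	return fast_path_read(&m)->id;     /* reader: lock-free */
}

The same split is visible in the diff below: __multipath_map() now reads m->current_pgpath with lockless_dereference(), while choose_path_in_pg() takes m->lock only on the rare path-group switch.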
parent 20800cb345
commit 2da1610ae2
1 changed file with 93 additions and 77 deletions
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m)
 	return atomic_read(&m->pg_init_in_progress);
 }
 
-static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+static int pg_init_all_paths(struct multipath *m)
 {
-	m->current_pg = pgpath->pg;
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+	r = __pg_init_all_paths(m);
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return r;
+}
+
+static void __switch_pg(struct multipath *m, struct priority_group *pg)
+{
+	m->current_pg = pg;
 
 	/* Must we initialise the PG first, and queue I/O till it's ready? */
 	if (m->hw_handler_name) {
@@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 	atomic_set(&m->pg_init_count, 0);
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
-			       size_t nr_bytes)
+static struct pgpath *choose_path_in_pg(struct multipath *m,
+					struct priority_group *pg,
+					size_t nr_bytes)
 {
+	unsigned long flags;
 	struct dm_path *path;
+	struct pgpath *pgpath;
 
 	path = pg->ps.type->select_path(&pg->ps, nr_bytes);
 	if (!path)
-		return -ENXIO;
+		return ERR_PTR(-ENXIO);
 
-	m->current_pgpath = path_to_pgpath(path);
+	pgpath = path_to_pgpath(path);
 
-	if (m->current_pg != pg)
-		__switch_pg(m, m->current_pgpath);
+	if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+		/* Only update current_pgpath if pg changed */
+		spin_lock_irqsave(&m->lock, flags);
+		m->current_pgpath = pgpath;
+		__switch_pg(m, pg);
+		spin_unlock_irqrestore(&m->lock, flags);
+	}
 
-	return 0;
+	return pgpath;
 }
 
-static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
+	unsigned long flags;
 	struct priority_group *pg;
+	struct pgpath *pgpath;
 	bool bypassed = true;
 
 	if (!atomic_read(&m->nr_valid_paths)) {
@@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 	}
 
 	/* Were we instructed to switch PG? */
-	if (m->next_pg) {
+	if (lockless_dereference(m->next_pg)) {
+		spin_lock_irqsave(&m->lock, flags);
 		pg = m->next_pg;
+		if (!pg) {
+			spin_unlock_irqrestore(&m->lock, flags);
+			goto check_current_pg;
+		}
 		m->next_pg = NULL;
-		if (!__choose_path_in_pg(m, pg, nr_bytes))
-			return;
+		spin_unlock_irqrestore(&m->lock, flags);
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
-		return;
+check_current_pg:
+	pg = lockless_dereference(m->current_pg);
+	if (pg) {
+		pgpath = choose_path_in_pg(m, pg, nr_bytes);
+		if (!IS_ERR_OR_NULL(pgpath))
+			return pgpath;
+	}
 
 	/*
 	 * Loop through priority groups until we find a valid path.
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+			pgpath = choose_path_in_pg(m, pg, nr_bytes);
+			if (!IS_ERR_OR_NULL(pgpath)) {
 				if (!bypassed)
 					set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
-				return;
+				return pgpath;
 			}
 		}
 	} while (bypassed--);
 
 failed:
+	spin_lock_irqsave(&m->lock, flags);
 	m->current_pgpath = NULL;
 	m->current_pg = NULL;
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return NULL;
 }
 
 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
  *
- * m->lock must be held on entry.
- *
  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
  * same value then we are not between multipath_presuspend()
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int __must_push_back(struct multipath *m)
+static int must_push_back(struct multipath *m)
 {
 	return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
 		((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
@@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	struct block_device *bdev;
 	struct dm_mpath_io *mpio;
 
-	spin_lock_irq(&m->lock);
-
 	/* Do we need to select a new pgpath? */
-	if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
-		__choose_pgpath(m, nr_bytes);
-
-	pgpath = m->current_pgpath;
+	pgpath = lockless_dereference(m->current_pgpath);
+	if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+		pgpath = choose_pgpath(m, nr_bytes);
 
 	if (!pgpath) {
-		if (!__must_push_back(m))
+		if (!must_push_back(m))
 			r = -EIO;	/* Failed */
-		goto out_unlock;
+		return r;
 	} else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
 		   test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-		__pg_init_all_paths(m);
-		goto out_unlock;
+		pg_init_all_paths(m);
+		return r;
 	}
 
 	mpio = set_mpio(m, map_context);
 	if (!mpio)
 		/* ENOMEM, requeue */
-		goto out_unlock;
+		return r;
 
 	mpio->pgpath = pgpath;
 	mpio->nr_bytes = nr_bytes;
 
 	bdev = pgpath->path.dev->bdev;
 
-	spin_unlock_irq(&m->lock);
-
 	if (clone) {
 		/*
 		 * Old request-based interface: allocated clone is passed in.
@@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 					      &pgpath->path,
 					      nr_bytes);
 	return DM_MAPIO_REMAPPED;
-
-out_unlock:
-	spin_unlock_irq(&m->lock);
-
-	return r;
 }
 
 static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	 * clone bios for it and resubmit it later.
 	 */
 	int r = DM_ENDIO_REQUEUE;
-	unsigned long flags;
 
 	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
@@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
-	spin_lock_irqsave(&m->lock, flags);
 	if (!atomic_read(&m->nr_valid_paths)) {
 		if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-			if (!__must_push_back(m))
+			if (!must_push_back(m))
 				r = -EIO;
 		} else {
 			if (error == -EBADE)
 				r = error;
 		}
 	}
-	spin_unlock_irqrestore(&m->lock, flags);
 
 	return r;
 }
@@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 				   struct block_device **bdev, fmode_t *mode)
 {
 	struct multipath *m = ti->private;
-	unsigned long flags;
+	struct pgpath *current_pgpath;
 	int r;
 
-	spin_lock_irqsave(&m->lock, flags);
+	current_pgpath = lockless_dereference(m->current_pgpath);
+	if (!current_pgpath)
+		current_pgpath = choose_pgpath(m, 0);
 
-	if (!m->current_pgpath)
-		__choose_pgpath(m, 0);
-
-	if (m->current_pgpath) {
+	if (current_pgpath) {
 		if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
-			*bdev = m->current_pgpath->path.dev->bdev;
-			*mode = m->current_pgpath->path.dev->mode;
+			*bdev = current_pgpath->path.dev->bdev;
+			*mode = current_pgpath->path.dev->mode;
 			r = 0;
 		} else {
 			/* pg_init has not started or completed */
@@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
 			r = -EIO;
 	}
 
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	if (r == -ENOTCONN) {
-		spin_lock_irqsave(&m->lock, flags);
-		if (!m->current_pg) {
+		if (!lockless_dereference(m->current_pg)) {
 			/* Path status changed, redo selection */
-			__choose_pgpath(m, 0);
+			(void) choose_pgpath(m, 0);
 		}
 		if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
-			__pg_init_all_paths(m);
-		spin_unlock_irqrestore(&m->lock, flags);
+			pg_init_all_paths(m);
 		dm_table_run_md_queue_async(m->ti->table);
 	}
 
@@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
 {
 	bool busy = false, has_active = false;
 	struct multipath *m = ti->private;
-	struct priority_group *pg;
+	struct priority_group *pg, *next_pg;
 	struct pgpath *pgpath;
-	unsigned long flags;
-
-	spin_lock_irqsave(&m->lock, flags);
 
 	/* pg_init in progress or no paths available */
 	if (atomic_read(&m->pg_init_in_progress) ||
-	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
-		busy = true;
-		goto out;
-	}
+	    (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+		return true;
+
 	/* Guess which priority_group will be used at next mapping time */
-	if (unlikely(!m->current_pgpath && m->next_pg))
-		pg = m->next_pg;
-	else if (likely(m->current_pg))
-		pg = m->current_pg;
-	else
+	pg = lockless_dereference(m->current_pg);
+	next_pg = lockless_dereference(m->next_pg);
+	if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+		pg = next_pg;
+
+	if (!pg) {
 		/*
 		 * We don't know which pg will be used at next mapping time.
-		 * We don't call __choose_pgpath() here to avoid to trigger
+		 * We don't call choose_pgpath() here to avoid to trigger
 		 * pg_init just by busy checking.
 		 * So we don't know whether underlying devices we will be using
 		 * at next mapping time are busy or not. Just try mapping.
 		 */
-		goto out;
+		return busy;
+	}
 
 	/*
 	 * If there is one non-busy active path at least, the path selector
 	 * will be able to select it. So we consider such a pg as not busy.
 	 */
 	busy = true;
-	list_for_each_entry(pgpath, &pg->pgpaths, list)
+	list_for_each_entry(pgpath, &pg->pgpaths, list) {
 		if (pgpath->is_active) {
 			has_active = true;
 			if (!pgpath_busy(pgpath)) {
@@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
 				break;
 			}
 		}
+	}
 
-	if (!has_active)
+	if (!has_active) {
 		/*
 		 * No active path in this pg, so this pg won't be used and
 		 * the current_pg will be changed at next mapping time.
 		 * We need to try mapping to determine it.
 		 */
 		busy = false;
-
-out:
-	spin_unlock_irqrestore(&m->lock, flags);
+	}
 
 	return busy;
 }