dm mpath: eliminate use of spinlock in IO fast-paths

The primary motivation of this commit is to improve the scalability of
DM multipath on large NUMA systems, where contention on the m->lock
spinlock has proven to be a serious bottleneck on really fast storage.

This commit leverages the ability to atomically read a pointer using
lockless_dereference().  All pointer writes are still protected by the
m->lock spinlock, which is fine because they now occur only in the
slow-path.
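
In other words, reads of the shared pointers become plain
dependency-ordered loads while updates stay serialized.  A minimal
sketch of that read-mostly pattern, using a made-up "foo" structure
rather than the real dm-mpath code:

  struct foo_path;

  struct foo {
          spinlock_t lock;                /* serializes pointer writers */
          struct foo_path *cur_path;      /* read locklessly in the fast-path */
  };

  /* Fast-path reader: dependency-ordered load, no spinlock taken. */
  static struct foo_path *foo_get_path(struct foo *f)
  {
          return lockless_dereference(f->cur_path);
  }

  /* Slow-path writer: pointer updates remain under f->lock. */
  static void foo_set_path(struct foo *f, struct foo_path *p)
  {
          unsigned long flags;

          spin_lock_irqsave(&f->lock, flags);
          f->cur_path = p;
          spin_unlock_irqrestore(&f->lock, flags);
  }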

The following functions no longer require the m->lock spinlock in their
fast-path: multipath_busy(), __multipath_map(), and do_end_io().

And choose_pgpath() is modified to _not_ update m->current_pgpath unless
it also switches the path-group.  This avoids needing to take the
m->lock every time __multipath_map() calls choose_pgpath().
But m->current_pgpath will still be reset if the path it points to is
failed via fail_path().
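
For reference, the resulting fast-path shape in __multipath_map()
(condensed from the hunk below; error handling trimmed) is:

  /* Read the cached path without m->lock; fall into the slow-path only if needed. */
  pgpath = lockless_dereference(m->current_pgpath);
  if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
          pgpath = choose_pgpath(m, nr_bytes);

  if (!pgpath) {
          if (!must_push_back(m))
                  r = -EIO;       /* failed */
          return r;
  }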

Suggested-by: Jeff Moyer <jmoyer@redhat.com>
Reviewed-by: Hannes Reinecke <hare@suse.com>
Tested-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Mike Snitzer 2016-03-17 18:38:17 -04:00
parent 20800cb345
commit 2da1610ae2
1 changed file with 93 additions and 77 deletions

drivers/md/dm-mpath.c

@@ -305,9 +305,21 @@ static int __pg_init_all_paths(struct multipath *m)
         return atomic_read(&m->pg_init_in_progress);
 }
 
-static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
+static int pg_init_all_paths(struct multipath *m)
 {
-        m->current_pg = pgpath->pg;
+        int r;
+        unsigned long flags;
+
+        spin_lock_irqsave(&m->lock, flags);
+        r = __pg_init_all_paths(m);
+        spin_unlock_irqrestore(&m->lock, flags);
+
+        return r;
+}
+
+static void __switch_pg(struct multipath *m, struct priority_group *pg)
+{
+        m->current_pg = pg;
 
         /* Must we initialise the PG first, and queue I/O till it's ready? */
         if (m->hw_handler_name) {
@@ -321,26 +333,36 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
         atomic_set(&m->pg_init_count, 0);
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
-                               size_t nr_bytes)
+static struct pgpath *choose_path_in_pg(struct multipath *m,
+                                        struct priority_group *pg,
+                                        size_t nr_bytes)
 {
+        unsigned long flags;
         struct dm_path *path;
+        struct pgpath *pgpath;
 
         path = pg->ps.type->select_path(&pg->ps, nr_bytes);
         if (!path)
-                return -ENXIO;
+                return ERR_PTR(-ENXIO);
 
-        m->current_pgpath = path_to_pgpath(path);
+        pgpath = path_to_pgpath(path);
 
-        if (m->current_pg != pg)
-                __switch_pg(m, m->current_pgpath);
+        if (unlikely(lockless_dereference(m->current_pg) != pg)) {
+                /* Only update current_pgpath if pg changed */
+                spin_lock_irqsave(&m->lock, flags);
+                m->current_pgpath = pgpath;
+                __switch_pg(m, pg);
+                spin_unlock_irqrestore(&m->lock, flags);
+        }
 
-        return 0;
+        return pgpath;
 }
 
-static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
+static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
+        unsigned long flags;
         struct priority_group *pg;
+        struct pgpath *pgpath;
         bool bypassed = true;
 
         if (!atomic_read(&m->nr_valid_paths)) {
@@ -349,16 +371,28 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
         }
 
         /* Were we instructed to switch PG? */
-        if (m->next_pg) {
+        if (lockless_dereference(m->next_pg)) {
+                spin_lock_irqsave(&m->lock, flags);
                 pg = m->next_pg;
+                if (!pg) {
+                        spin_unlock_irqrestore(&m->lock, flags);
+                        goto check_current_pg;
+                }
                 m->next_pg = NULL;
-                if (!__choose_path_in_pg(m, pg, nr_bytes))
-                        return;
+                spin_unlock_irqrestore(&m->lock, flags);
+                pgpath = choose_path_in_pg(m, pg, nr_bytes);
+                if (!IS_ERR_OR_NULL(pgpath))
+                        return pgpath;
         }
 
         /* Don't change PG until it has no remaining paths */
-        if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
-                return;
+check_current_pg:
+        pg = lockless_dereference(m->current_pg);
+        if (pg) {
+                pgpath = choose_path_in_pg(m, pg, nr_bytes);
+                if (!IS_ERR_OR_NULL(pgpath))
+                        return pgpath;
+        }
 
         /*
          * Loop through priority groups until we find a valid path.
@@ -370,31 +404,34 @@ static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
                 list_for_each_entry(pg, &m->priority_groups, list) {
                         if (pg->bypassed == bypassed)
                                 continue;
-                        if (!__choose_path_in_pg(m, pg, nr_bytes)) {
+                        pgpath = choose_path_in_pg(m, pg, nr_bytes);
+                        if (!IS_ERR_OR_NULL(pgpath)) {
                                 if (!bypassed)
                                         set_bit(MPATHF_PG_INIT_DELAY_RETRY, &m->flags);
-                                return;
+                                return pgpath;
                         }
                 }
         } while (bypassed--);
 
 failed:
+        spin_lock_irqsave(&m->lock, flags);
         m->current_pgpath = NULL;
         m->current_pg = NULL;
+        spin_unlock_irqrestore(&m->lock, flags);
+
+        return NULL;
 }
 
 /*
  * Check whether bios must be queued in the device-mapper core rather
  * than here in the target.
  *
- * m->lock must be held on entry.
- *
  * If m->queue_if_no_path and m->saved_queue_if_no_path hold the
  * same value then we are not between multipath_presuspend()
  * and multipath_resume() calls and we have no need to check
  * for the DMF_NOFLUSH_SUSPENDING flag.
  */
-static int __must_push_back(struct multipath *m)
+static int must_push_back(struct multipath *m)
 {
         return (test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) ||
                 ((test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags) !=
@@ -416,36 +453,31 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
         struct block_device *bdev;
         struct dm_mpath_io *mpio;
 
-        spin_lock_irq(&m->lock);
-
         /* Do we need to select a new pgpath? */
-        if (!m->current_pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
-                __choose_pgpath(m, nr_bytes);
-
-        pgpath = m->current_pgpath;
+        pgpath = lockless_dereference(m->current_pgpath);
+        if (!pgpath || !test_bit(MPATHF_QUEUE_IO, &m->flags))
+                pgpath = choose_pgpath(m, nr_bytes);
 
         if (!pgpath) {
-                if (!__must_push_back(m))
+                if (!must_push_back(m))
                         r = -EIO;       /* Failed */
-                goto out_unlock;
+                return r;
         } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) ||
                    test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) {
-                __pg_init_all_paths(m);
-                goto out_unlock;
+                pg_init_all_paths(m);
+                return r;
         }
 
         mpio = set_mpio(m, map_context);
         if (!mpio)
                 /* ENOMEM, requeue */
-                goto out_unlock;
+                return r;
 
         mpio->pgpath = pgpath;
         mpio->nr_bytes = nr_bytes;
 
         bdev = pgpath->path.dev->bdev;
 
-        spin_unlock_irq(&m->lock);
-
         if (clone) {
                 /*
                  * Old request-based interface: allocated clone is passed in.
@@ -477,11 +509,6 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
                                               &pgpath->path,
                                               nr_bytes);
         return DM_MAPIO_REMAPPED;
-
-out_unlock:
-        spin_unlock_irq(&m->lock);
-        return r;
 }
 
 static int multipath_map(struct dm_target *ti, struct request *clone,
@@ -1308,7 +1335,6 @@ static int do_end_io(struct multipath *m, struct request *clone,
          * clone bios for it and resubmit it later.
          */
         int r = DM_ENDIO_REQUEUE;
-        unsigned long flags;
 
         if (!error && !clone->errors)
                 return 0;       /* I/O complete */
@@ -1319,17 +1345,15 @@ static int do_end_io(struct multipath *m, struct request *clone,
         if (mpio->pgpath)
                 fail_path(mpio->pgpath);
 
-        spin_lock_irqsave(&m->lock, flags);
         if (!atomic_read(&m->nr_valid_paths)) {
                 if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) {
-                        if (!__must_push_back(m))
+                        if (!must_push_back(m))
                                 r = -EIO;
                 } else {
                         if (error == -EBADE)
                                 r = error;
                 }
         }
-        spin_unlock_irqrestore(&m->lock, flags);
 
         return r;
 }
@@ -1586,18 +1610,17 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
                                    struct block_device **bdev, fmode_t *mode)
 {
         struct multipath *m = ti->private;
-        unsigned long flags;
+        struct pgpath *current_pgpath;
         int r;
 
-        spin_lock_irqsave(&m->lock, flags);
+        current_pgpath = lockless_dereference(m->current_pgpath);
+        if (!current_pgpath)
+                current_pgpath = choose_pgpath(m, 0);
 
-        if (!m->current_pgpath)
-                __choose_pgpath(m, 0);
-
-        if (m->current_pgpath) {
+        if (current_pgpath) {
                 if (!test_bit(MPATHF_QUEUE_IO, &m->flags)) {
-                        *bdev = m->current_pgpath->path.dev->bdev;
-                        *mode = m->current_pgpath->path.dev->mode;
+                        *bdev = current_pgpath->path.dev->bdev;
+                        *mode = current_pgpath->path.dev->mode;
                         r = 0;
                 } else {
                         /* pg_init has not started or completed */
@@ -1611,17 +1634,13 @@ static int multipath_prepare_ioctl(struct dm_target *ti,
                         r = -EIO;
         }
 
-        spin_unlock_irqrestore(&m->lock, flags);
-
         if (r == -ENOTCONN) {
-                spin_lock_irqsave(&m->lock, flags);
-                if (!m->current_pg) {
+                if (!lockless_dereference(m->current_pg)) {
                         /* Path status changed, redo selection */
-                        __choose_pgpath(m, 0);
+                        (void) choose_pgpath(m, 0);
                 }
                 if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags))
-                        __pg_init_all_paths(m);
-                spin_unlock_irqrestore(&m->lock, flags);
+                        pg_init_all_paths(m);
+
                 dm_table_run_md_queue_async(m->ti->table);
         }
@@ -1672,39 +1691,37 @@ static int multipath_busy(struct dm_target *ti)
 {
         bool busy = false, has_active = false;
         struct multipath *m = ti->private;
-        struct priority_group *pg;
+        struct priority_group *pg, *next_pg;
         struct pgpath *pgpath;
-        unsigned long flags;
-
-        spin_lock_irqsave(&m->lock, flags);
 
         /* pg_init in progress or no paths available */
         if (atomic_read(&m->pg_init_in_progress) ||
-            (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) {
-                busy = true;
-                goto out;
-        }
+            (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)))
+                return true;
+
         /* Guess which priority_group will be used at next mapping time */
-        if (unlikely(!m->current_pgpath && m->next_pg))
-                pg = m->next_pg;
-        else if (likely(m->current_pg))
-                pg = m->current_pg;
-        else
+        pg = lockless_dereference(m->current_pg);
+        next_pg = lockless_dereference(m->next_pg);
+        if (unlikely(!lockless_dereference(m->current_pgpath) && next_pg))
+                pg = next_pg;
+
+        if (!pg) {
                 /*
                  * We don't know which pg will be used at next mapping time.
-                 * We don't call __choose_pgpath() here to avoid to trigger
+                 * We don't call choose_pgpath() here to avoid to trigger
                  * pg_init just by busy checking.
                  * So we don't know whether underlying devices we will be using
                  * at next mapping time are busy or not. Just try mapping.
                  */
-                goto out;
+                return busy;
+        }
 
         /*
          * If there is one non-busy active path at least, the path selector
          * will be able to select it. So we consider such a pg as not busy.
          */
         busy = true;
-        list_for_each_entry(pgpath, &pg->pgpaths, list)
+        list_for_each_entry(pgpath, &pg->pgpaths, list) {
                 if (pgpath->is_active) {
                         has_active = true;
                         if (!pgpath_busy(pgpath)) {
@@ -1712,17 +1729,16 @@ static int multipath_busy(struct dm_target *ti)
                                 break;
                         }
                 }
+        }
 
-        if (!has_active)
+        if (!has_active) {
                 /*
                  * No active path in this pg, so this pg won't be used and
                  * the current_pg will be changed at next mapping time.
                  * We need to try mapping to determine it.
                  */
                 busy = false;
+        }
 
-out:
-        spin_unlock_irqrestore(&m->lock, flags);
         return busy;
 }