From 0bca799b92807ee9be0890690f5dde7d8c6a8e25 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 5 Apr 2018 00:35:21 +0800
Subject: [PATCH 01/28] blk-mq: order getting budget and driver tag

This patch orders getting the budget and the driver tag by making sure the
driver tag is acquired only after the budget has been obtained. This avoids
the following race:

1) before dispatching a request from the scheduler queue, get one budget
   first, then dequeue a request, call it request A.

2) in another IO path, dispatching request B from hctx->dispatch, the driver
   tag is acquired, then blk_mq_dispatch_rq_list() tries to get a budget,
   but the budget is held by request A.

3) meantime blk_mq_dispatch_rq_list() is called to dispatch request A, and
   tries to get a driver tag first, but no driver tag is available because
   it is held by request B.

4) neither IO path can make progress, and an IO stall results.

This issue can be observed when running dbench on USB storage.

This patch fixes the issue by always getting the budget before getting the
driver tag.

Cc: stable@vger.kernel.org
Fixes: de1482974080ec9e ("blk-mq: introduce .get_budget and .put_budget in blk_mq_ops")
Cc: Christoph Hellwig
Cc: Bart Van Assche
Cc: Omar Sandoval
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f5c7dbcb954f..90f869a083a4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1180,7 +1180,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 		struct blk_mq_queue_data bd;

 		rq = list_first_entry(list, struct request, queuelist);
-		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
+
+		hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+		if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+			break;
+
+		if (!blk_mq_get_driver_tag(rq, NULL, false)) {
 			/*
 			 * The initial allocation attempt failed, so we need to
 			 * rerun the hardware queue when a tag is freed. The
@@ -1189,8 +1194,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			 * we'll re-run it below.
 			 */
 			if (!blk_mq_mark_tag_wait(&hctx, rq)) {
-				if (got_budget)
-					blk_mq_put_dispatch_budget(hctx);
+				blk_mq_put_dispatch_budget(hctx);
 				/*
 				 * For non-shared tags, the RESTART check
 				 * will suffice.
@@ -1201,11 +1205,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
 			}
 		}

-		if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
-			blk_mq_put_driver_tag(rq);
-			break;
-		}
-
 		list_del_init(&rq->queuelist);

 		bd.rq = rq;
@@ -1804,11 +1803,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	if (q->elevator && !bypass_insert)
 		goto insert;

-	if (!blk_mq_get_driver_tag(rq, NULL, false))
+	if (!blk_mq_get_dispatch_budget(hctx))
 		goto insert;

-	if (!blk_mq_get_dispatch_budget(hctx)) {
-		blk_mq_put_driver_tag(rq);
+	if (!blk_mq_get_driver_tag(rq, NULL, false)) {
+		blk_mq_put_dispatch_budget(hctx);
 		goto insert;
 	}

From 1e047eaab3bb5564f25b41e9cd3a053009f4e789 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Fri, 6 Apr 2018 10:03:17 +0900
Subject: [PATCH 02/28] block/loop: fix deadlock after loop_set_status

syzbot is reporting deadlocks at __blkdev_get() [1].
----------------------------------------
[ 92.493919] systemd-udevd D12696 525 1 0x00000000
[ 92.495891] Call Trace:
[ 92.501560]  schedule+0x23/0x80
[ 92.502923]  schedule_preempt_disabled+0x5/0x10
[ 92.504645]  __mutex_lock+0x416/0x9e0
[ 92.510760]  __blkdev_get+0x73/0x4f0
[ 92.512220]  blkdev_get+0x12e/0x390
[ 92.518151]  do_dentry_open+0x1c3/0x2f0
[ 92.519815]  path_openat+0x5d9/0xdc0
[ 92.521437]  do_filp_open+0x7d/0xf0
[ 92.527365]  do_sys_open+0x1b8/0x250
[ 92.528831]  do_syscall_64+0x6e/0x270
[ 92.530341]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 92.931922] 1 lock held by systemd-udevd/525:
[ 92.933642]  #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x73/0x4f0
----------------------------------------

The cause of the deadlock turned out to be that wait_event_interruptible()
in blk_queue_enter() got stuck, with bdev->bd_mutex held at __blkdev_put(),
due to q->mq_freeze_depth == 1.

----------------------------------------
[ 92.787172] a.out S12584 634 633 0x80000002
[ 92.789120] Call Trace:
[ 92.796693]  schedule+0x23/0x80
[ 92.797994]  blk_queue_enter+0x3cb/0x540
[ 92.803272]  generic_make_request+0xf0/0x3d0
[ 92.807970]  submit_bio+0x67/0x130
[ 92.810928]  submit_bh_wbc+0x15e/0x190
[ 92.812461]  __block_write_full_page+0x218/0x460
[ 92.815792]  __writepage+0x11/0x50
[ 92.817209]  write_cache_pages+0x1ae/0x3d0
[ 92.825585]  generic_writepages+0x5a/0x90
[ 92.831865]  do_writepages+0x43/0xd0
[ 92.836972]  __filemap_fdatawrite_range+0xc1/0x100
[ 92.838788]  filemap_write_and_wait+0x24/0x70
[ 92.840491]  __blkdev_put+0x69/0x1e0
[ 92.841949]  blkdev_close+0x16/0x20
[ 92.843418]  __fput+0xda/0x1f0
[ 92.844740]  task_work_run+0x87/0xb0
[ 92.846215]  do_exit+0x2f5/0xba0
[ 92.850528]  do_group_exit+0x34/0xb0
[ 92.852018]  SyS_exit_group+0xb/0x10
[ 92.853449]  do_syscall_64+0x6e/0x270
[ 92.854944]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
[ 92.943530] 1 lock held by a.out/634:
[ 92.945105]  #0: 00000000a2849e25 (&bdev->bd_mutex){+.+.}, at: __blkdev_put+0x3c/0x1e0
----------------------------------------

The reason q->mq_freeze_depth == 1 turned out to be that loop_set_status()
forgot to call blk_mq_unfreeze_queue() on the error paths for the
info->lo_encrypt_type != NULL case.

----------------------------------------
[ 37.509497] CPU: 2 PID: 634 Comm: a.out Tainted: G W 4.16.0+ #457
[ 37.513608] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 05/19/2017
[ 37.518832] RIP: 0010:blk_freeze_queue_start+0x17/0x40
[ 37.521778] RSP: 0018:ffffb0c2013e7c60 EFLAGS: 00010246
[ 37.524078] RAX: 0000000000000000 RBX: ffff8b07b1519798 RCX: 0000000000000000
[ 37.527015] RDX: 0000000000000002 RSI: ffffb0c2013e7cc0 RDI: ffff8b07b1519798
[ 37.529934] RBP: ffffb0c2013e7cc0 R08: 0000000000000008 R09: 47a189966239b898
[ 37.532684] R10: dad78b99b278552f R11: 9332dca72259d5ef R12: ffff8b07acd73678
[ 37.535452] R13: 0000000000004c04 R14: 0000000000000000 R15: ffff8b07b841e940
[ 37.538186] FS: 00007fede33b9740(0000) GS:ffff8b07b8e80000(0000) knlGS:0000000000000000
[ 37.541168] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 37.543590] CR2: 00000000206fdf18 CR3: 0000000130b30006 CR4: 00000000000606e0
[ 37.546410] Call Trace:
[ 37.547902]  blk_freeze_queue+0x9/0x30
[ 37.549968]  loop_set_status+0x67/0x3c0 [loop]
[ 37.549975]  loop_set_status64+0x3b/0x70 [loop]
[ 37.549986]  lo_ioctl+0x223/0x810 [loop]
[ 37.549995]  blkdev_ioctl+0x572/0x980
[ 37.550003]  block_ioctl+0x34/0x40
[ 37.550006]  do_vfs_ioctl+0xa7/0x6d0
[ 37.550017]  ksys_ioctl+0x6b/0x80
[ 37.573076]  SyS_ioctl+0x5/0x10
[ 37.574831]  do_syscall_64+0x6e/0x270
[ 37.576769]  entry_SYSCALL_64_after_hwframe+0x42/0xb7
----------------------------------------

[1] https://syzkaller.appspot.com/bug?id=cd662bc3f6022c0979d01a262c318fab2ee9b56f

Signed-off-by: Tetsuo Handa
Reported-by: syzbot
Fixes: ecdd09597a572513 ("block/loop: fix race between I/O and set_status")
Cc: Ming Lei
Cc: Dmitry Vyukov
Cc: stable
Cc: Jens Axboe
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 264abaaff662..e5fc020cceda 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1103,11 +1103,15 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)

 	if (info->lo_encrypt_type) {
 		unsigned int type = info->lo_encrypt_type;

-		if (type >= MAX_LO_CRYPT)
-			return -EINVAL;
+		if (type >= MAX_LO_CRYPT) {
+			err = -EINVAL;
+			goto exit;
+		}
 		xfer = xfer_funcs[type];
-		if (xfer == NULL)
-			return -EINVAL;
+		if (xfer == NULL) {
+			err = -EINVAL;
+			goto exit;
+		}
 	} else
 		xfer = NULL;

From bdac616db9bbadb90b7d6a406144571015e138f7 Mon Sep 17 00:00:00 2001
From: Omar Sandoval
Date: Fri, 6 Apr 2018 09:57:03 -0700
Subject: [PATCH 03/28] loop: fix LOOP_GET_STATUS lock imbalance

Commit 2d1d4c1e591f made loop_get_status() drop lo_ctl_mutex before
returning, but the loop_get_status_old(), loop_get_status64(), and
loop_get_status_compat() wrappers don't call loop_get_status() if the
passed argument is NULL. The callers expect that the lock is dropped, so
make sure we drop it in that case, too.
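The subtlety is an asymmetric locking contract: loop_get_status() takes
ownership of lo_ctl_mutex and drops it before returning, so any wrapper path
that skips the call must drop the mutex itself. A minimal sketch of the
pattern (illustrative only; get_status_wrapper() is a made-up name, and the
diff below applies the same shape to all three real wrappers):

	static int
	get_status_wrapper(struct loop_device *lo, void __user *arg)
	{
		struct loop_info64 info64;

		/*
		 * lo_ctl_mutex is held on entry. loop_get_status()
		 * unlocks it, so an early-exit path must unlock too,
		 * or the next ioctl deadlocks on the mutex.
		 */
		if (!arg) {
			mutex_unlock(&lo->lo_ctl_mutex);
			return -EINVAL;
		}
		return loop_get_status(lo, &info64);	/* drops the mutex */
	}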
Reported-by: syzbot+31e8daa8b3fc129e75f2@syzkaller.appspotmail.com
Fixes: 2d1d4c1e591f ("loop: don't call into filesystem while holding lo_ctl_mutex")
Signed-off-by: Omar Sandoval
Signed-off-by: Jens Axboe
---
 drivers/block/loop.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index e5fc020cceda..c9d04497a415 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1287,12 +1287,13 @@ static int
 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg)
 {
 	struct loop_info info;
 	struct loop_info64 info64;
-	int err = 0;
+	int err;

-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_old(&info64, &info);
 	if (!err && copy_to_user(arg, &info, sizeof(info)))
@@ -1304,12 +1305,13 @@ loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
 static int
 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg)
 {
 	struct loop_info64 info64;
-	int err = 0;
+	int err;

-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
 		err = -EFAULT;
@@ -1533,12 +1535,13 @@
 loop_get_status_compat(struct loop_device *lo,
 		       struct compat_loop_info __user *arg)
 {
 	struct loop_info64 info64;
-	int err = 0;
+	int err;

-	if (!arg)
-		err = -EINVAL;
-	if (!err)
-		err = loop_get_status(lo, &info64);
+	if (!arg) {
+		mutex_unlock(&lo->lo_ctl_mutex);
+		return -EINVAL;
+	}
+	err = loop_get_status(lo, &info64);
 	if (!err)
 		err = loop_info64_to_compat(&info64, arg);
 	return err;

From a1c735fb790745f94a359df45c11df4a69760389 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:07 +0800
Subject: [PATCH 04/28] blk-mq: make sure that correct hctx->next_cpu is set

Since commit 20e4d81393196 ("blk-mq: simplify queue mapping & schedule
with each possisble CPU"), one hctx can be mapped to only offline CPUs,
and hctx->next_cpu can then be set incorrectly.

This patch fixes the issue by making hctx->next_cpu point to the first
CPU in hctx->cpumask if all CPUs in hctx->cpumask are offline.
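The fix relies on cpumask_first_and() returning nr_cpu_ids when the two
masks have an empty intersection. A minimal sketch of the resulting
selection logic (illustrative only; patch 07 in this series later factors
exactly this pattern into a blk_mq_first_mapped_cpu() helper):

	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	/*
	 * All CPUs mapped to this hctx are offline: fall back to the
	 * first mapped CPU so hctx->next_cpu stays a valid CPU id for
	 * the workqueue machinery instead of nr_cpu_ids.
	 */
	if (cpu >= nr_cpu_ids)
		cpu = cpumask_first(hctx->cpumask);
	hctx->next_cpu = cpu;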
Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Fixes: 20e4d81393196 ("blk-mq: simplify queue mapping & schedule with each possisble CPU")
Cc: stable@vger.kernel.org
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 90f869a083a4..f489ec920807 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2431,6 +2431,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		 */
 		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
 				cpu_online_mask);
+		if (hctx->next_cpu >= nr_cpu_ids)
+			hctx->next_cpu = cpumask_first(hctx->cpumask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 }

From bffa9909a6b48d8ca3398dec601bc9162a4020c4 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:08 +0800
Subject: [PATCH 05/28] blk-mq: don't keep offline CPUs mapped to hctx 0

Since commit 4b855ad37194 ("blk-mq: Create hctx for each present CPU"),
blk-mq doesn't remap the queue after the CPU topology is changed. That
means when some of these offline CPUs become online, they are still
mapped to hctx 0, and hctx 0 may then become the bottleneck of IO
dispatch and completion.

This patch sets up the mapping from the beginning, matching the queue
mapping done for PCI devices (blk_mq_pci_map_queues()).

Cc: Stefan Haberland
Cc: Keith Busch
Cc: stable@vger.kernel.org
Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU")
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq-cpumap.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 9f8cffc8a701..3eb169f15842 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -16,11 +16,6 @@

 static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
 {
-	/*
-	 * Non present CPU will be mapped to queue index 0.
-	 */
-	if (!cpu_present(cpu))
-		return 0;
 	return cpu % nr_queues;
 }

From 476f8c98a9bccccbb97866974ffc80879adf2bbb Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:09 +0800
Subject: [PATCH 06/28] blk-mq: avoid to write intermediate result to hctx->next_cpu

This patch figures out the final selected CPU first and then writes it to
hctx->next_cpu once, so that an intermediate next_cpu value is never
observed from other dispatch paths.

Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f489ec920807..db178c577068 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1344,26 +1344,24 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
 	bool tried = false;
+	int next_cpu = hctx->next_cpu;

 	if (hctx->queue->nr_hw_queues == 1)
 		return WORK_CPU_UNBOUND;

 	if (--hctx->next_cpu_batch <= 0) {
-		int next_cpu;
 select_cpu:
-		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
 				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
+			next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

 		/*
 		 * No online CPU is found, so have to make sure hctx->next_cpu
 		 * is set correctly for not breaking workqueue.
		 */
 		if (next_cpu >= nr_cpu_ids)
-			hctx->next_cpu = cpumask_first(hctx->cpumask);
-		else
-			hctx->next_cpu = next_cpu;
+			next_cpu = cpumask_first(hctx->cpumask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}

@@ -1371,7 +1369,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 	 * Do unbound schedule if we can't find a online CPU for this hctx,
 	 * and it should only happen in the path of handling CPU DEAD.
 	 */
-	if (!cpu_online(hctx->next_cpu)) {
+	if (!cpu_online(next_cpu)) {
 		if (!tried) {
 			tried = true;
 			goto select_cpu;
 		}

@@ -1381,10 +1379,13 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 		 * Make sure to re-select CPU next time once after CPUs
 		 * in hctx->cpumask become online again.
 		 */
+		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = 1;
 		return WORK_CPU_UNBOUND;
 	}
-	return hctx->next_cpu;
+
+	hctx->next_cpu = next_cpu;
+	return next_cpu;
 }

 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,

From f82ddf1923b90f89665d08cf219287c8f9deb739 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:10 +0800
Subject: [PATCH 07/28] blk-mq: introduce blk_mq_hw_queue_first_cpu() to figure out first cpu

This patch introduces the helper blk_mq_first_mapped_cpu() for figuring
out a hctx's first mapped CPU, avoiding code duplication.

Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index db178c577068..e05bd10d5c84 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1335,6 +1335,15 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	hctx_unlock(hctx, srcu_idx);
 }

+static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
+
+	if (cpu >= nr_cpu_ids)
+		cpu = cpumask_first(hctx->cpumask);
+	return cpu;
+}
+
 /*
  * It'd be great if the workqueue API had a way to pass
  * in a mask and had some smarts for more clever placement.
@@ -1354,14 +1363,7 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 		next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
 				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);
-
-		/*
-		 * No online CPU is found, so have to make sure hctx->next_cpu
-		 * is set correctly for not breaking workqueue.
-		 */
-		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first(hctx->cpumask);
+			next_cpu = blk_mq_first_mapped_cpu(hctx);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}

@@ -2430,10 +2432,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		/*
 		 * Initialize batch roundrobin counts
 		 */
-		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
-				cpu_online_mask);
-		if (hctx->next_cpu >= nr_cpu_ids)
-			hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 }

From 15fe8a90bb45b953ca36f074194fcb519a05fdec Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:11 +0800
Subject: [PATCH 08/28] blk-mq: remove blk_mq_delay_queue()

No driver uses this interface any more, so remove it.
Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c         | 30 ++----------------------------
 include/linux/blk-mq.h |  2 --
 3 files changed, 2 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 58b3b79cbe83..3080e18cb859 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -235,7 +235,6 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
-	HCTX_STATE_NAME(START_ON_RUN),
 };
 #undef HCTX_STATE_NAME

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e05bd10d5c84..c2c6d276da3a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1562,40 +1562,14 @@ static void blk_mq_run_work_fn(struct work_struct *work)
 	hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);

 	/*
-	 * If we are stopped, don't run the queue. The exception is if
-	 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear
-	 * the STOPPED bit and run it.
+	 * If we are stopped, don't run the queue.
 	 */
-	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) {
-		if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state))
-			return;
-
-		clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
+	if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
 		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
-	}

 	__blk_mq_run_hw_queue(hctx);
 }
-
-void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
-{
-	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-		return;
-
-	/*
-	 * Stop the hw queue, then modify currently delayed work.
-	 * This should prevent us from running the queue prematurely.
-	 * Mark the queue as auto-clearing STOPPED when it runs.
-	 */
-	blk_mq_stop_hw_queue(hctx);
-	set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state);
-	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
-					&hctx->run_work,
-					msecs_to_jiffies(msecs));
-}
-EXPORT_SYMBOL(blk_mq_delay_queue);
-
 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 					    struct request *rq,
 					    bool at_head)

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8efcf49796a3..e3986f4b3461 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -183,7 +183,6 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
-	BLK_MQ_S_START_ON_RUN	= 3,

 	BLK_MQ_MAX_DEPTH	= 10240,

@@ -270,7 +269,6 @@
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
-void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);

From efea8450c3d2d3918029b36f59ef612be57d91ae Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:12 +0800
Subject: [PATCH 09/28] blk-mq: don't check queue mapped in __blk_mq_delay_run_hw_queue()

There are several reasons for removing the check:

1) blk_mq_hw_queue_mapped() always returns true now, since each hctx is
   mapped by at least one CPU

2) when there isn't any online CPU mapped to this hctx, there won't be
   any IO queued to this CPU; blk_mq_run_hw_queue() only runs the queue
   if there is IO queued to this hctx

3) if __blk_mq_delay_run_hw_queue() is called by
   blk_mq_delay_run_hw_queue(), which is run from
   blk_mq_dispatch_rq_list() or scsi_mq_get_budget(), the hctx to be
   handled has to be mapped.

Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c2c6d276da3a..0ee9d8e964b3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1393,9 +1393,6 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
 					unsigned long msecs)
 {
-	if (WARN_ON_ONCE(!blk_mq_hw_queue_mapped(hctx)))
-		return;
-
 	if (unlikely(blk_mq_hctx_stopped(hctx)))
 		return;

From 127276c6ce5a30fcc806b7fe53015f4f89b62956 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:13 +0800
Subject: [PATCH 10/28] blk-mq: reimplement blk_mq_hw_queue_mapped

Now the actual meaning of "queue mapped" is that there is at least one
online CPU mapped to this hctx, so implement blk_mq_hw_queue_mapped()
in this way.
Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 88c558f71819..502af371b83b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -181,7 +181,7 @@ static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)

 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 {
-	return hctx->nr_ctx && hctx->tags;
+	return cpumask_first_and(hctx->cpumask, cpu_online_mask) < nr_cpu_ids;
 }

 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,

From 37c7c6c76d431dd7ef9c29d95f6052bd425f004c Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sun, 8 Apr 2018 17:48:14 +0800
Subject: [PATCH 11/28] blk-mq: remove code for dealing with remapping queue

Firstly, since commit 4b855ad37194 ("blk-mq: Create hctx for each
present CPU"), blk-mq doesn't remap the queue any more after the CPU
topology is changed.

Secondly, set->nr_hw_queues can't be bigger than nr_cpu_ids, and now we
map all possible CPUs to hw queues, so at least one CPU is mapped to
each hctx.

So queue mapping has become static and fixed, just like a percpu
variable, and we don't need to handle queue remapping any more.

Cc: Stefan Haberland
Tested-by: Christian Borntraeger
Reviewed-by: Christoph Hellwig
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 34 +++------------------------------
 1 file changed, 3 insertions(+), 31 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0ee9d8e964b3..0dc9e341c2a7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2329,7 +2329,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,

 static void blk_mq_map_swqueue(struct request_queue *q)
 {
-	unsigned int i, hctx_idx;
+	unsigned int i;
 	struct blk_mq_hw_ctx *hctx;
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_tag_set *set = q->tag_set;
@@ -2346,23 +2346,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)

 	/*
 	 * Map software to hardware queues.
-	 *
-	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
 	for_each_possible_cpu(i) {
-		hctx_idx = q->mq_map[i];
-		/* unmapped hw queue can be remapped after CPU topo changed */
-		if (!set->tags[hctx_idx] &&
-		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
-			/*
-			 * If tags initialization fail for some hctx,
-			 * that hctx won't be brought online. In this
-			 * case, remap the current ctx to hctx[0] which
-			 * is guaranteed to always have tags allocated
-			 */
-			q->mq_map[i] = 0;
-		}
-
 		ctx = per_cpu_ptr(q->queue_ctx, i);
 		hctx = blk_mq_map_queue(q, i);

@@ -2374,21 +2359,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	mutex_unlock(&q->sysfs_lock);

 	queue_for_each_hw_ctx(q, hctx, i) {
-		/*
-		 * If no software queues are mapped to this hardware queue,
-		 * disable it and free the request entries.
-		 */
-		if (!hctx->nr_ctx) {
-			/* Never unmap queue 0.  We need it as a
-			 * fallback in case of a new remap fails
-			 * allocation
-			 */
-			if (i && set->tags[i])
-				blk_mq_free_map_and_requests(set, i);
-
-			hctx->tags = NULL;
-			continue;
-		}
+		/* every hctx should get mapped by at least one CPU */
+		WARN_ON(!hctx->nr_ctx);

 		hctx->tags = set->tags[i];
 		WARN_ON(!hctx->tags);

From a93f00b3762026dd8231f473fae9346bda07db03 Mon Sep 17 00:00:00 2001
From: Mathieu Malaterre
Date: Fri, 6 Apr 2018 22:14:51 +0200
Subject: [PATCH 12/28] backing: silence compiler warning using __printf
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

__printf marker was added in commit d2cc4dde9206 ("bdi_register: add
__printf verification, fix arg mismatch") for function `bdi_register`
since it is useful to verify format and arguments. Apply the equivalent
gcc attribute to `bdi_register_va`.

Remove warning triggered with W=1:

  mm/backing-dev.c:881:2: warning: function might be possible candidate for ‘gnu_printf’ format attribute [-Wsuggest-attribute=format]

Reviewed-by: Jan Kara
Signed-off-by: Mathieu Malaterre
Signed-off-by: Jens Axboe
---
 include/linux/backing-dev.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 3e4ce54d84ab..0e9c0f71f726 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -28,6 +28,7 @@ void bdi_put(struct backing_dev_info *bdi);

 __printf(2, 3)
 int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+__printf(2, 0)
 int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
 		    va_list args);
 int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner);

From 37f9579f4c31a6d698dbf3016d7bf132f9288d30 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Tue, 10 Apr 2018 17:02:40 -0600
Subject: [PATCH 13/28] blk-mq: Avoid that submitting a bio concurrently with device removal triggers a crash

Because blkcg_exit_queue() is now called from inside blk_cleanup_queue()
it is no longer safe to access cgroup information during or after the
blk_cleanup_queue() call. Hence protect the generic_make_request_checks()
call with blk_queue_enter() / blk_queue_exit().

Reported-by: Ming Lei
Fixes: a063057d7c73 ("block: Fix a race between request queue removal and the block cgroup controller")
Signed-off-by: Bart Van Assche
Cc: Ming Lei
Cc: Joseph Qi
Signed-off-by: Jens Axboe
---
 block/blk-core.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index abcb8684ba67..806ce2442819 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2385,8 +2385,20 @@ blk_qc_t generic_make_request(struct bio *bio)
 	 * yet.
 	 */
 	struct bio_list bio_list_on_stack[2];
+	blk_mq_req_flags_t flags = 0;
+	struct request_queue *q = bio->bi_disk->queue;
 	blk_qc_t ret = BLK_QC_T_NONE;

+	if (bio->bi_opf & REQ_NOWAIT)
+		flags = BLK_MQ_REQ_NOWAIT;
+	if (blk_queue_enter(q, flags) < 0) {
+		if (!blk_queue_dying(q) && (bio->bi_opf & REQ_NOWAIT))
+			bio_wouldblock_error(bio);
+		else
+			bio_io_error(bio);
+		return ret;
+	}
+
 	if (!generic_make_request_checks(bio))
 		goto out;

@@ -2423,11 +2435,22 @@ blk_qc_t generic_make_request(struct bio *bio)
 	bio_list_init(&bio_list_on_stack[0]);
 	current->bio_list = bio_list_on_stack;
 	do {
-		struct request_queue *q = bio->bi_disk->queue;
-		blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
-			BLK_MQ_REQ_NOWAIT : 0;
+		bool enter_succeeded = true;

-		if (likely(blk_queue_enter(q, flags) == 0)) {
+		if (unlikely(q != bio->bi_disk->queue)) {
+			if (q)
+				blk_queue_exit(q);
+			q = bio->bi_disk->queue;
+			flags = 0;
+			if (bio->bi_opf & REQ_NOWAIT)
+				flags = BLK_MQ_REQ_NOWAIT;
+			if (blk_queue_enter(q, flags) < 0) {
+				enter_succeeded = false;
+				q = NULL;
+			}
+		}
+
+		if (enter_succeeded) {
 			struct bio_list lower, same;

 			/* Create a fresh bio_list for all subordinate requests */
@@ -2435,8 +2458,6 @@ blk_qc_t generic_make_request(struct bio *bio)
 			bio_list_init(&bio_list_on_stack[0]);
 			ret = q->make_request_fn(q, bio);

-			blk_queue_exit(q);
-
 			/* sort new bios into those for a lower level
 			 * and those for the same level */
@@ -2463,6 +2484,8 @@ blk_qc_t generic_make_request(struct bio *bio)
 	current->bio_list = NULL; /* deactivate */

 out:
+	if (q)
+		blk_queue_exit(q);
 	return ret;
 }
 EXPORT_SYMBOL(generic_make_request);

From 2434af79c85d45d41d0c286fedf6e0556888a54c Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Wed, 11 Apr 2018 18:47:44 +0800
Subject: [PATCH 14/28] blk-mq: Revert "blk-mq: reimplement blk_mq_hw_queue_mapped"

This reverts commit 127276c6ce5a30fcc806b7fe53015f4f89b62956.

When all CPUs of one hw queue become offline, there may still be IOs
not completed from this hctx. But blk_mq_hw_queue_mapped() is called in
blk_mq_queue_tag_busy_iter(), which is used for iterating requests in
the timeout handler, so the timeout event will be missed on the inactive
hctx, and requests may never be completed.

Also the reimplementation of blk_mq_hw_queue_mapped() doesn't match the
helper's name any more; it should have been named
blk_mq_hw_queue_active(). Other callers would also need further
verification against this reimplementation.

So revert this patch now, and we can improve hw queue
activate/inactivate events after adequate research and testing.

Cc: Stefan Haberland
Cc: Christian Borntraeger
Cc: Christoph Hellwig
Reported-by: Jens Axboe
Fixes: 127276c6ce5a30fcc ("blk-mq: reimplement blk_mq_hw_queue_mapped")
Reviewed-by: Sagi Grimberg
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 block/blk-mq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index 502af371b83b..88c558f71819 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -181,7 +181,7 @@ static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)

 static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 {
-	return cpumask_first_and(hctx->cpumask, cpu_online_mask) < nr_cpu_ids;
+	return hctx->nr_ctx && hctx->tags;
 }

 void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,

From 2d097c50212e137e7b53ffe3b37561153eeba87d Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 11 Apr 2018 11:26:09 -0600
Subject: [PATCH 15/28] sr: get/drop reference to device in revalidate and check_events

We can't just use scsi_cd() to get the scsi_cd structure, we have to
grab a live reference to the device. For both callbacks, we're not
inside an open where we already hold a reference to the device.

This fixes device removal/addition under concurrent device access,
which otherwise could result in the below oops.
NULL pointer dereference at 0000000000000010
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP
Modules linked in: sr 12:0:0:0: [sr2] scsi-1 drive
 scsi_debug crc_t10dif crct10dif_generic crct10dif_common nvme nvme_core sb_edac xl
sr 12:0:0:0: Attached scsi CD-ROM sr2
 sr_mod cdrom btrfs xor zstd_decompress zstd_compress xxhash lzo_compress zlib_defc
sr 12:0:0:0: Attached scsi generic sg7 type 5
 igb ahci libahci i2c_algo_bit libata dca [last unloaded: crc_t10dif]
CPU: 43 PID: 4629 Comm: systemd-udevd Not tainted 4.16.0+ #650
Hardware name: Dell Inc. PowerEdge T630/0NT78X, BIOS 2.3.4 11/09/2016
RIP: 0010:sr_block_revalidate_disk+0x23/0x190 [sr_mod]
RSP: 0018:ffff883ff357bb58 EFLAGS: 00010292
RAX: ffffffffa00b07d0 RBX: ffff883ff3058000 RCX: ffff883ff357bb66
RDX: 0000000000000003 RSI: 0000000000007530 RDI: ffff881fea631000
RBP: 0000000000000000 R08: ffff881fe4d38400 R09: 0000000000000000
R10: 0000000000000000 R11: 00000000000001b6 R12: 000000000800005d
R13: 000000000800005d R14: ffff883ffd9b3790 R15: 0000000000000000
FS: 00007f7dc8e6d8c0(0000) GS:ffff883fff340000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000010 CR3: 0000003ffda98005 CR4: 00000000003606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 ? __invalidate_device+0x48/0x60
 check_disk_change+0x4c/0x60
 sr_block_open+0x16/0xd0 [sr_mod]
 __blkdev_get+0xb9/0x450
 ? iget5_locked+0x1c0/0x1e0
 blkdev_get+0x11e/0x320
 ? bdget+0x11d/0x150
 ? _raw_spin_unlock+0xa/0x20
 ? bd_acquire+0xc0/0xc0
 do_dentry_open+0x1b0/0x320
 ? inode_permission+0x24/0xc0
 path_openat+0x4e6/0x1420
 ? cpumask_any_but+0x1f/0x40
 ? flush_tlb_mm_range+0xa0/0x120
 do_filp_open+0x8c/0xf0
 ? __seccomp_filter+0x28/0x230
 ? _raw_spin_unlock+0xa/0x20
 ? __handle_mm_fault+0x7d6/0x9b0
 ? list_lru_add+0xa8/0xc0
 ? _raw_spin_unlock+0xa/0x20
 ? __alloc_fd+0xaf/0x160
 ? do_sys_open+0x1a6/0x230
 do_sys_open+0x1a6/0x230
 do_syscall_64+0x5a/0x100
 entry_SYSCALL_64_after_hwframe+0x3d/0xa2

Reviewed-by: Lee Duncan
Reviewed-by: Jan Kara
Signed-off-by: Jens Axboe
---
 drivers/scsi/sr.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0cf25d789d05..3f3cb72e0c0c 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -587,18 +587,28 @@ static int sr_block_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 static unsigned int sr_block_check_events(struct gendisk *disk,
 					  unsigned int clearing)
 {
-	struct scsi_cd *cd = scsi_cd(disk);
+	unsigned int ret = 0;
+	struct scsi_cd *cd;

-	if (atomic_read(&cd->device->disk_events_disable_depth))
+	cd = scsi_cd_get(disk);
+	if (!cd)
 		return 0;

-	return cdrom_check_events(&cd->cdi, clearing);
+	if (!atomic_read(&cd->device->disk_events_disable_depth))
+		ret = cdrom_check_events(&cd->cdi, clearing);
+
+	scsi_cd_put(cd);
+	return ret;
 }

 static int sr_block_revalidate_disk(struct gendisk *disk)
 {
-	struct scsi_cd *cd = scsi_cd(disk);
 	struct scsi_sense_hdr sshdr;
+	struct scsi_cd *cd;
+
+	cd = scsi_cd_get(disk);
+	if (!cd)
+		return -ENXIO;

 	/* if the unit is not ready, nothing more to do */
 	if (scsi_test_unit_ready(cd->device, SR_TIMEOUT, MAX_RETRIES, &sshdr))
@@ -607,6 +617,7 @@ static int sr_block_revalidate_disk(struct gendisk *disk)
 	sr_cd_check(&cd->cdi);
 	get_sectorsize(cd);
 out:
+	scsi_cd_put(cd);
 	return 0;
 }

From 7ec6074ff005e5f6cd2cf186a9ec7496c3db04f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Matias=20Bj=C3=B8rling?=
Date: Thu, 12 Apr 2018 09:16:03 -0600
Subject: [PATCH 16/28] nvme: enforce 64bit offset for nvme_get_log_ext fn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compiling on a 32-bit system produces a warning about the shift width
when shifting a 32-bit integer by 64 bits. Make sure that the offset is
always 64-bit, and use macros for retrieving the lower and upper 32
bits of the offset.
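The warning arises because, with a 32-bit size_t offset, "offset >> 32"
shifts by the full width of the type, which is undefined. The
lower_32_bits()/upper_32_bits() helpers from linux/kernel.h take a u64, so
both halves are well defined on 32-bit and 64-bit builds alike. A small
before/after sketch (illustrative only; lpol/lpou stand in for the
command fields):

	u64 offset;

	/* Before: undefined shift when offset is a 32-bit size_t */
	lpol = cpu_to_le32(offset & ((1ULL << 32) - 1));
	lpou = cpu_to_le32(offset >> 32ULL);

	/* After: helpers mask and shift a value widened to 64 bits */
	lpol = cpu_to_le32(lower_32_bits(offset));
	lpou = cpu_to_le32(upper_32_bits(offset));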
Signed-off-by: Matias Bjørling
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 6 +++---
 drivers/nvme/host/nvme.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 197a6ba9700f..1bdd010a0cf5 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2220,7 +2220,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)

 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		     u8 log_page, void *log,
-		     size_t size, size_t offset)
+		     size_t size, u64 offset)
 {
 	struct nvme_command c = { };
 	unsigned long dwlen = size / 4 - 1;
@@ -2235,8 +2235,8 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	c.get_log_page.lid = log_page;
 	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
 	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
-	c.get_log_page.lpol = cpu_to_le32(offset & ((1ULL << 32) - 1));
-	c.get_log_page.lpou = cpu_to_le32(offset >> 32ULL);
+	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
+	c.get_log_page.lpou = cpu_to_le32(upper_32_bits(offset));

 	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
 }

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cf93690b3ffc..09d47b1937ff 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -430,7 +430,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
 int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);

 int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-		u8 log_page, void *log, size_t size, size_t offset);
+		u8 log_page, void *log, size_t size, u64 offset);

 extern const struct attribute_group nvme_ns_id_attr_group;
 extern const struct block_device_operations nvme_ns_head_ops;

From 11d9ea6f2ca69237d35d6c55755beba3e006b106 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 12 Apr 2018 09:16:04 -0600
Subject: [PATCH 17/28] nvme-loop: fix kernel oops in case of unhandled command

When nvmet_req_init() fails, __nvmet_req_complete() is called to handle
the target request via .queue_response(), so nvme_loop_queue_response()
shouldn't be called again to handle the failure.
This patch fixes the case in the following way:

- move blk_mq_start_request() before nvmet_req_init(), so
  nvme_loop_queue_response() may work well to complete this host request

- don't call nvme_cleanup_cmd(), which is done in nvme_loop_complete_rq()

- don't call nvme_loop_queue_response(), which is done via
  .queue_response()

Signed-off-by: Ming Lei
Reviewed-by: Christoph Hellwig
[trimmed changelog]
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/target/loop.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index a350765d2d5c..b9d5b69d8548 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -174,15 +174,12 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (ret)
 		return ret;

+	blk_mq_start_request(req);
 	iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
 	iod->req.port = nvmet_loop_port;
 	if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
-			&queue->nvme_sq, &nvme_loop_ops)) {
-		nvme_cleanup_cmd(req);
-		blk_mq_start_request(req);
-		nvme_loop_queue_response(&iod->req);
+			&queue->nvme_sq, &nvme_loop_ops))
 		return BLK_STS_OK;
-	}

 	if (blk_rq_payload_bytes(req)) {
 		iod->sg_table.sgl = iod->first_sgl;
@@ -196,8 +193,6 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		iod->req.transfer_len = blk_rq_payload_bytes(req);
 	}

-	blk_mq_start_request(req);
-
 	schedule_work(&iod->work);
 	return BLK_STS_OK;
 }

From 00b683dbabc34599b96a935aeee791f9af3ae02e Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Thu, 12 Apr 2018 09:16:05 -0600
Subject: [PATCH 18/28] nvme: unexport nvme_start_keep_alive

nvme_start_keep_alive() isn't used outside core.c, so unexport it and
make it static.

Signed-off-by: Johannes Thumshirn
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 3 +--
 drivers/nvme/host/nvme.h | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 1bdd010a0cf5..695e52ae22fd 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -826,7 +826,7 @@ static void nvme_keep_alive_work(struct work_struct *work)
 	}
 }

-void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
+static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 {
 	if (unlikely(ctrl->kato == 0))
 		return;
@@ -836,7 +836,6 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 	schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
 }
-EXPORT_SYMBOL_GPL(nvme_start_keep_alive);

 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
 {

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 09d47b1937ff..08c4cff79cde 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -422,7 +422,6 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 		unsigned timeout, int qid, int at_head,
 		blk_mq_req_flags_t flags);
 int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
-void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);

From 74c6c71530847808d4e3be7b205719270efee80c Mon Sep 17 00:00:00 2001
From: Johannes Thumshirn
Date: Thu, 12 Apr 2018 09:16:06 -0600
Subject: [PATCH 19/28] nvme: don't send keep-alives to the discovery controller
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
NVMe over Fabrics 1.0 Section 5.2 "Discovery Controller Properties and
Command Support" Figure 31 "Discovery Controller – Admin Commands"
explicitly lists all commands but "Get Log Page" and "Identify" as
reserved, but NetApp reports that the Linux host is sending Keep Alive
commands to the discovery controller, which is a violation of the spec.

We're already checking for discovery controllers when configuring the
keep-alive timeout, but when creating a discovery controller we're not
hard-wiring the keep-alive timeout to 0 and thus remain on
NVME_DEFAULT_KATO for the discovery controller.

This can be easily reproduced when issuing a direct connect to the
discovery subsystem using:
'nvme connect [...] --nqn=nqn.2014-08.org.nvmexpress.discovery'

Signed-off-by: Johannes Thumshirn
Fixes: 07bfcd09a288 ("nvme-fabrics: add a generic NVMe over Fabrics library")
Reported-by: Martin George
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/fabrics.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 8f0f34d06d46..3583f9492a45 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -608,8 +608,10 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			opts->discovery_nqn =
 				!(strcmp(opts->subsysnqn,
 					 NVME_DISC_SUBSYS_NAME));
-			if (opts->discovery_nqn)
+			if (opts->discovery_nqn) {
+				opts->kato = 0;
 				opts->nr_io_queues = 0;
+			}
 			break;
 		case NVMF_OPT_TRADDR:
 			p = match_strdup(args);

From 6038aa532a224da68c478f34f4dbce33c47169e6 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 12 Apr 2018 09:16:07 -0600
Subject: [PATCH 20/28] nvme: target: fix buffer overflow

nvmet_execute_get_disc_log_page() passes a fixed-length string into
nvmet_format_discovery_entry(), which then does a longer memcpy() on
it, as pointed out by gcc-8:

In function 'nvmet_format_discovery_entry',
    inlined from 'nvmet_execute_get_disc_log_page' at drivers/nvme/target/discovery.c:126:4:
drivers/nvme/target/discovery.c:62:2: error: 'memcpy' forming offset [38, 223] is out of the bounds [0, 37] [-Werror=array-bounds]
  memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);

Using strncpy() will make this well-defined, filling the rest of the
buffer with zeroes, under the assumption that the input is either a
NUL-terminated string, or a byte sequence containing no zeroes. If the
input is a string that is longer than NVMF_NQN_SIZE, we continue to
have no NUL-termination in the output.
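The difference matters because the source buffer here is shorter than
NVMF_NQN_SIZE: memcpy() of the full destination size reads past the end of
the source, while strncpy() stops at the source's NUL and zero-fills the
remainder. A quick illustration (a sketch; the names and sizes are made up
for the example):

	char src[16] = "nqn.example";	/* shorter than the field */
	char dst[32];

	memcpy(dst, src, sizeof(dst));	/* reads 16 bytes past src: UB */
	strncpy(dst, src, sizeof(dst));	/* copies up to src's NUL, then
					 * pads dst with zero bytes */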
Fixes: a07b4970f464 ("nvmet: add a generic NVMe target")
Signed-off-by: Arnd Bergmann
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/target/discovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index a72425d8bce0..231e04e0a496 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -59,7 +59,7 @@ static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr,
 	memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE);
 	memcpy(e->traddr, traddr, NVMF_TRADDR_SIZE);
 	memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE);
-	memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
+	strncpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE);
 }

 /*

From 64ee0ac0527704c47170316fa58dbde50edaaf70 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 12 Apr 2018 09:16:08 -0600
Subject: [PATCH 21/28] nvme-pci: Skip queue deletion if there are no queues

A user reported that the controller always retains CSTS.RDY as 1,
which makes controller disabling fail when resetting the controller.
This also happens before the admin queue is allocated, and trying to
disable an unallocated queue results in a NULL dereference.

Reported-by: Alex Gagniuc
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 295fbec1e5f2..22403aa7dc60 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2201,7 +2201,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)

 	nvme_stop_queues(&dev->ctrl);

-	if (!dead) {
+	if (!dead && dev->ctrl.queue_count > 0) {
 		/*
 		 * If the controller is still alive tell it to stop using the
 		 * host memory buffer.  In theory the shutdown / reset should

From a6ff7262c26c190f2480721703211cb12d66d45a Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 12 Apr 2018 09:16:09 -0600
Subject: [PATCH 22/28] nvme-pci: Remove unused queue parameter

All the queue memory is allocated up front. We don't take the node
into consideration when creating queues anymore, so remove the unused
parameter.
Signed-off-by: Keith Busch
Reviewed-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 22403aa7dc60..0b3b4d9fd423 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1380,8 +1380,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 	return 0;
 }

-static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
-		int depth, int node)
+static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
 {
 	struct nvme_queue *nvmeq = &dev->queues[qid];

@@ -1596,8 +1595,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
 	if (result < 0)
 		return result;

-	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
-			dev_to_node(dev->dev));
+	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
 	if (result)
 		return result;

@@ -1630,9 +1628,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	int ret = 0;

 	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
-		/* vector == qid - 1, match nvme_create_queue */
-		if (nvme_alloc_queue(dev, i, dev->q_depth,
-		     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
+		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
 			ret = -ENOMEM;
 			break;
 		}

From 22b5560195bd66bc43359b71821dc78cc9de56c6 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 12 Apr 2018 09:16:10 -0600
Subject: [PATCH 23/28] nvme-pci: Separate IO and admin queue IRQ vectors

The admin and first IO queues shared the first irq vector, which has an
affinity mask including cpu0. If a system allows cpu0 to be offlined,
the admin queue may not be usable if no other CPUs in the affinity mask
are online. This is a problem since, unlike IO queues, there is only
one admin queue that always needs to be usable.

To fix, this patch allocates one pre_vector for the admin queue that
is assigned all CPUs, so it will always be accessible. The IO queues
are assigned the remaining managed vectors.

In case a controller has only one interrupt vector available, the admin
and IO queues will share the pre_vector with all CPUs assigned.

Cc: Jianchao Wang
Cc: Ming Lei
Signed-off-by: Keith Busch
Reviewed-by: Christoph Hellwig
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/pci.c | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0b3b4d9fd423..fbc71fac6f1e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -84,6 +84,7 @@ struct nvme_dev {
 	struct dma_pool *prp_small_pool;
 	unsigned online_queues;
 	unsigned max_qid;
+	unsigned int num_vecs;
 	int q_depth;
 	u32 db_stride;
 	void __iomem *bar;
@@ -414,7 +415,8 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
 {
 	struct nvme_dev *dev = set->driver_data;

-	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev), 0);
+	return blk_mq_pci_map_queues(set, to_pci_dev(dev->dev),
+			dev->num_vecs > 1 ? 1 /* admin queue */ : 0);
 }

 /**
@@ -1456,7 +1458,11 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
 		nvmeq->sq_cmds_io = dev->cmb + offset;
 	}

-	nvmeq->cq_vector = qid - 1;
+	/*
+	 * A queue's vector matches the queue identifier unless the controller
+	 * has only one vector available.
+	 */
+	nvmeq->cq_vector = dev->num_vecs == 1 ? 0 : qid;
 	result = adapter_alloc_cq(dev, qid, nvmeq);
 	if (result < 0)
 		goto release_vector;

@@ -1910,6 +1916,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	int result, nr_io_queues;
 	unsigned long size;

+	struct irq_affinity affd = {
+		.pre_vectors = 1
+	};
+
 	nr_io_queues = num_possible_cpus();
 	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
 	if (result < 0)
@@ -1945,11 +1955,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
 	 * setting up the full range we need.
 	 */
 	pci_free_irq_vectors(pdev);
-	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
-			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
-	if (nr_io_queues <= 0)
+	result = pci_alloc_irq_vectors_affinity(pdev, 1, nr_io_queues + 1,
+			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+	if (result <= 0)
 		return -EIO;
-	dev->max_qid = nr_io_queues;
+	dev->num_vecs = result;
+	dev->max_qid = max(result - 1, 1);

 	/*
 	 * Should investigate if there's a performance win from allocating

From 543c09c89fdc007c2990aa9d2abcc62e0dfa1311 Mon Sep 17 00:00:00 2001
From: "Rodrigo R. Galvao"
Date: Thu, 12 Apr 2018 09:16:11 -0600
Subject: [PATCH 24/28] nvmet: Fix nvmet_execute_write_zeroes sector count

We have to increment the number of logical blocks to a 1's-based value
in the native format prior to converting it to 512b units.

Signed-off-by: Rodrigo R. Galvao
[changelog]
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/target/io-cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c
index 28bbdff4a88b..cd2344179673 100644
--- a/drivers/nvme/target/io-cmd.c
+++ b/drivers/nvme/target/io-cmd.c
@@ -173,8 +173,8 @@ static void nvmet_execute_write_zeroes(struct nvmet_req *req)

 	sector = le64_to_cpu(write_zeroes->slba) <<
 		(req->ns->blksize_shift - 9);
-	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length)) <<
-		(req->ns->blksize_shift - 9)) + 1;
+	nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length) + 1) <<
+		(req->ns->blksize_shift - 9));

 	if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector,
 				GFP_KERNEL, &bio, 0))

From fd92c77f58257ae5eb5180afe36e86094e4910f6 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Thu, 12 Apr 2018 09:16:12 -0600
Subject: [PATCH 25/28] nvme: check return value of init_srcu_struct function

Also add an error path in case the SRCU initialization function fails.
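The fix also illustrates the usual kernel unwind convention: error labels
undo work in reverse order of what has succeeded so far, so a failing
init_srcu_struct() must jump past cleanup_srcu_struct() and only undo the
ida allocation. A minimal sketch of the pattern (obj/obj_alloc/obj_ida are
made-up names, not the driver code):

	static struct obj *obj_alloc(void)
	{
		struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);
		int ret;

		if (!o)
			return NULL;

		ret = ida_simple_get(&obj_ida, 1, 0, GFP_KERNEL);
		if (ret < 0)
			goto out_free;
		o->instance = ret;

		/* init_srcu_struct() allocates per-CPU state and can fail */
		ret = init_srcu_struct(&o->srcu);
		if (ret)
			goto out_ida_remove;	/* nothing SRCU-related to clean up */

		return o;

	out_ida_remove:
		ida_simple_remove(&obj_ida, o->instance);
	out_free:
		kfree(o);
		return NULL;
	}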
Signed-off-by: Max Gurtovoy
Reviewed-by: Christoph Hellwig
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 695e52ae22fd..b09940c556d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2832,7 +2832,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 		goto out_free_head;
 	head->instance = ret;
 	INIT_LIST_HEAD(&head->list);
-	init_srcu_struct(&head->srcu);
+	ret = init_srcu_struct(&head->srcu);
+	if (ret)
+		goto out_ida_remove;
 	head->subsys = ctrl->subsys;
 	head->ns_id = nsid;
 	kref_init(&head->ref);
@@ -2854,6 +2856,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	return head;
 out_cleanup_srcu:
 	cleanup_srcu_struct(&head->srcu);
+out_ida_remove:
 	ida_simple_remove(&ctrl->subsys->ns_ida, head->instance);
 out_free_head:
 	kfree(head);

From c73996984902516745bc587d5e8a0b2e034aea05 Mon Sep 17 00:00:00 2001
From: Daniel Verkamp
Date: Thu, 12 Apr 2018 09:16:13 -0600
Subject: [PATCH 26/28] nvmet: fix space padding in serial number

Commit 42de82a8b544 previously attempted to fix this, and it did
correctly pad the MN and FR fields with spaces, but the SN field still
contains 0 bytes. The current code fills out the first 16 bytes with
hex2bin, leaving the last 4 bytes zeroed. Rather than adding a lot of
error-prone math to avoid overwriting SN twice, just set the whole
thing to spaces up front (it's only 20 bytes).

Fixes: 42de82a8b544 ("nvmet: don't report 0-bytes in serial number")
Signed-off-by: Daniel Verkamp
Reviewed-by: Martin Wilck
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/target/admin-cmd.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 90dcdc40ac71..5e0e9fcc0d4d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -178,6 +178,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 	id->vid = 0;
 	id->ssvid = 0;

+	memset(id->sn, ' ', sizeof(id->sn));
 	bin2hex(id->sn, &ctrl->subsys->serial,
 		min(sizeof(ctrl->subsys->serial), sizeof(id->sn) / 2));
 	memcpy_and_pad(id->mn, sizeof(id->mn), model, sizeof(model) - 1, ' ');

From 62843c2e4226057c83f520c74fe9c81a1891c331 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Thu, 12 Apr 2018 09:16:14 -0600
Subject: [PATCH 27/28] nvme: Use admin command effects for admin commands

Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b09940c556d0..aac3c1d2b2a2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1102,7 +1102,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	}

 	if (ctrl->effects)
-		effects = le32_to_cpu(ctrl->effects->iocs[opcode]);
+		effects = le32_to_cpu(ctrl->effects->acs[opcode]);
 	else
 		effects = nvme_known_admin_effects(opcode);

From bb06ec31452fb2da1594f88035c2ecea4e0652f4 Mon Sep 17 00:00:00 2001
From: James Smart
Date: Thu, 12 Apr 2018 09:16:15 -0600
Subject: [PATCH 28/28] nvme: expand nvmf_check_if_ready checks

The nvmf_check_if_ready() checks that were added are very simplistic.
As such, the routine allows many cases to fail IOs during windows of
reset or re-connection.
In cases where no multipath option is present, the error goes back to
the caller - the filesystem or application. Not good.

The common routine was rewritten and the calling syntax slightly
expanded so that per-transport is_ready routines don't need to be
present. The transports now call the routine directly. The routine is
now a fabrics routine rather than an inline function.

The routine now looks at controller state to decide the action to
take. Some states mandate IO failure. Others define the condition where
a command can be accepted. When the decision is unclear, a generic
queue-or-reject check is made to look for failfast or multipath IOs,
and the IO is failed only if it is so marked. Otherwise, the IO will be
queued and wait for the controller state to resolve.

Admin commands issued via ioctl share a live admin queue with commands
from the transport for controller init. The ioctls could be intermixed
with the initialization commands. It's possible for an ioctl cmd to be
issued prior to the controller being enabled. To block this, the ioctl
admin commands need to be distinguished from admin commands used for
controller init. Added a USERCMD nvme_req(req)->flags bit to reflect
this division, and set it on ioctl requests. As the
nvmf_check_if_ready() routine is called prior to nvme_setup_cmd(),
ensure that commands allocated by the ioctl path (actually anything in
core.c) prep the nvme_req(req) before starting the IO. This preserves
the USERCMD flag during execution and/or retry.

Signed-off-by: James Smart
Reviewed-by: Sagi Grimberg
Reviewed-by: Johannes Thumshirn
Signed-off-by: Keith Busch
Signed-off-by: Jens Axboe
---
 drivers/nvme/host/core.c    | 17 +++++---
 drivers/nvme/host/fabrics.c | 79 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/fabrics.h | 33 +---------------
 drivers/nvme/host/fc.c      | 12 ++----
 drivers/nvme/host/nvme.h    |  1 +
 drivers/nvme/host/rdma.c    | 14 +------
 drivers/nvme/target/loop.c  | 11 +-----
 7 files changed, 101 insertions(+), 66 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index aac3c1d2b2a2..9df4f71e58ca 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -376,6 +376,15 @@ static void nvme_put_ns(struct nvme_ns *ns)
 	kref_put(&ns->kref, nvme_free_ns);
 }

+static inline void nvme_clear_nvme_request(struct request *req)
+{
+	if (!(req->rq_flags & RQF_DONTPREP)) {
+		nvme_req(req)->retries = 0;
+		nvme_req(req)->flags = 0;
+		req->rq_flags |= RQF_DONTPREP;
+	}
+}
+
 struct request *nvme_alloc_request(struct request_queue *q,
 		struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid)
 {
@@ -392,6 +401,7 @@ struct request *nvme_alloc_request(struct request_queue *q,
 		return req;

 	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	nvme_clear_nvme_request(req);
 	nvme_req(req)->cmd = cmd;

 	return req;
@@ -608,11 +618,7 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
 {
 	blk_status_t ret = BLK_STS_OK;

-	if (!(req->rq_flags & RQF_DONTPREP)) {
-		nvme_req(req)->retries = 0;
-		nvme_req(req)->flags = 0;
-		req->rq_flags |= RQF_DONTPREP;
-	}
+	nvme_clear_nvme_request(req);

 	switch (req_op(req)) {
 	case REQ_OP_DRV_IN:
@@ -742,6 +748,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		return PTR_ERR(req);

 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
 	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+	nvme_req(req)->flags |= NVME_REQ_USERCMD;
 
 	if (ubuffer && bufflen) {
 		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 3583f9492a45..124c458806df 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -536,6 +536,85 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
 	return NULL;
 }
 
+blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq,
+		bool queue_live, bool is_connected)
+{
+	struct nvme_command *cmd = nvme_req(rq)->cmd;
+
+	if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected))
+		return BLK_STS_OK;
+
+	switch (ctrl->state) {
+	case NVME_CTRL_DELETING:
+		goto reject_io;
+
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_CONNECTING:
+		if (!is_connected)
+			/*
+			 * This is the case of starting a new
+			 * association but connectivity was lost
+			 * before it was fully created. We need to
+			 * error the commands used to initialize the
+			 * controller so the reconnect can go into a
+			 * retry attempt. The commands should all be
+			 * marked REQ_FAILFAST_DRIVER, which will hit
+			 * the reject path below. Anything else will
+			 * be queued while the state settles.
+			 */
+			goto reject_or_queue_io;
+
+		if ((queue_live &&
+		     !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) ||
+		    (!queue_live && blk_rq_is_passthrough(rq) &&
+		     cmd->common.opcode == nvme_fabrics_command &&
+		     cmd->fabrics.fctype == nvme_fabrics_type_connect))
+			/*
+			 * If queue is live, allow only commands that
+			 * are internally generated to pass through.
+			 * These are commands on the admin queue to
+			 * initialize the controller. This will reject
+			 * any ioctl admin cmds received while
+			 * initializing.
+			 *
+			 * If the queue is not live, allow only a
+			 * connect command. This will reject any ioctl
+			 * admin cmd as well as initialization commands
+			 * if the controller reverted the queue to
+			 * non-live.
+			 */
+			return BLK_STS_OK;
+
+		/*
+		 * fall-thru to the reject_or_queue_io clause
+		 */
+		break;
+
+	/* these cases fall-thru
+	 * case NVME_CTRL_LIVE:
+	 * case NVME_CTRL_RESETTING:
+	 */
+	default:
+		break;
+	}
+
+reject_or_queue_io:
+	/*
+	 * Any other new io is something we're not in a state to send
+	 * to the device. Default action is to busy it and retry it
+	 * after the controller state is recovered. However, anything
+	 * marked for failfast or nvme multipath is immediately failed.
+	 * Note: commands used to initialize the controller will be
+	 * marked for failfast.
+	 * Note: nvme cli/ioctl commands are marked for failfast.
+	 */
+	if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+		return BLK_STS_RESOURCE;
+
+reject_io:
+	nvme_req(rq)->status = NVME_SC_ABORT_REQ;
+	return BLK_STS_IOERR;
+}
+EXPORT_SYMBOL_GPL(nvmf_check_if_ready);
+
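/*
 * Note, not part of the diff: of the three callers converted below, only
 * FC can observe transport connectivity at this level, so it passes the
 * remote port state as is_connected; the rdma and loop transports
 * hardwire is_connected to true.
 */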
 static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TRANSPORT,	"transport=%s"	},
 	{ NVMF_OPT_TRADDR,	"traddr=%s"	},
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index a3145d90c1d2..ef46c915b7b5 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -157,36 +157,7 @@ void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
 void nvmf_free_options(struct nvmf_ctrl_options *opts);
 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
-
-static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl,
-		struct request *rq)
-{
-	struct nvme_command *cmd = nvme_req(rq)->cmd;
-
-	/*
-	 * We cannot accept any other command until the connect command has
-	 * completed, so only allow connect to pass.
-	 */
-	if (!blk_rq_is_passthrough(rq) ||
-	    cmd->common.opcode != nvme_fabrics_command ||
-	    cmd->fabrics.fctype != nvme_fabrics_type_connect) {
-		/*
-		 * Connecting state means transport disruption or initial
-		 * establishment, which can take a long time and even might
-		 * fail permanently, fail fast to give upper layers a chance
-		 * to failover.
-		 * Deleting state means that the ctrl will never accept commands
-		 * again, fail it permanently.
-		 */
-		if (ctrl->state == NVME_CTRL_CONNECTING ||
-		    ctrl->state == NVME_CTRL_DELETING) {
-			nvme_req(rq)->status = NVME_SC_ABORT_REQ;
-			return BLK_STS_IOERR;
-		}
-		return BLK_STS_RESOURCE; /* try again later */
-	}
-
-	return BLK_STS_OK;
-}
+blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl,
+	struct request *rq, bool queue_live, bool is_connected);
 
 #endif /* _NVME_FABRICS_H */
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index c6e719b2f3ca..6cb26bcf6ec0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2277,14 +2277,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 	return BLK_STS_OK;
 }
 
-static inline blk_status_t nvme_fc_is_ready(struct nvme_fc_queue *queue,
-		struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_FC_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t
 nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
@@ -2300,7 +2292,9 @@ nvme_fc_queue_rq(struct blk_mq_hw_ctx *hctx,
 	u32 data_len;
 	blk_status_t ret;
 
-	ret = nvme_fc_is_ready(queue, rq);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq,
+		test_bit(NVME_FC_Q_LIVE, &queue->flags),
+		ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 08c4cff79cde..061fecfd44f5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -105,6 +105,7 @@ struct nvme_request {
 
 enum {
 	NVME_REQ_CANCELLED	= (1 << 0),
+	NVME_REQ_USERCMD	= (1 << 1),
 };
 
 static inline struct nvme_request *nvme_req(struct request *req)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 758537e9ba07..1eb4438a8763 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1601,17 +1601,6 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
 	return BLK_EH_HANDLED;
 }
 
-/*
- * We cannot accept any other command until the Connect command has completed.
- */
-static inline blk_status_t
-nvme_rdma_is_ready(struct nvme_rdma_queue *queue, struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -1627,7 +1616,8 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 
 	WARN_ON_ONCE(rq->tag < 0);
 
-	ret = nvme_rdma_is_ready(queue, rq);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq,
+		test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true);
 	if (unlikely(ret))
 		return ret;
 
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index b9d5b69d8548..31fdfba556a8 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -149,14 +149,6 @@ nvme_loop_timeout(struct request *rq, bool reserved)
 	return BLK_EH_HANDLED;
 }
 
-static inline blk_status_t nvme_loop_is_ready(struct nvme_loop_queue *queue,
-		struct request *rq)
-{
-	if (unlikely(!test_bit(NVME_LOOP_Q_LIVE, &queue->flags)))
-		return nvmf_check_init_req(&queue->ctrl->ctrl, rq);
-	return BLK_STS_OK;
-}
-
 static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -166,7 +158,8 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
 	struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret;
 
-	ret = nvme_loop_is_ready(queue, req);
+	ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req,
+		test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true);
 	if (unlikely(ret))
 		return ret;
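Taken together, the rewritten check reduces to a small decision table:
LIVE and connected passes, DELETING rejects, NEW/CONNECTING admits only
init-time or connect commands, and everything else is queued unless the
io is marked failfast or multipath. The following stand-alone C sketch
models that control flow with plain enums and bools; the names are
illustrative, not the kernel API, and BLK_STS_OK/RESOURCE/IOERR map to
IO_OK/IO_REQUEUE/IO_REJECT:

    #include <stdbool.h>
    #include <stdio.h>

    enum ctrl_state { CTRL_NEW, CTRL_LIVE, CTRL_RESETTING,
                      CTRL_CONNECTING, CTRL_DELETING };
    enum verdict { IO_OK, IO_REQUEUE, IO_REJECT };

    static enum verdict check_if_ready(enum ctrl_state state,
                                       bool is_connected, bool queue_live,
                                       bool is_connect_cmd, bool is_user_cmd,
                                       bool failfast_or_mpath)
    {
        if (state == CTRL_LIVE && is_connected)
            return IO_OK;                  /* the fast path */

        switch (state) {
        case CTRL_DELETING:
            return IO_REJECT;              /* ctrl will never accept ios */
        case CTRL_NEW:
        case CTRL_CONNECTING:
            if (!is_connected)
                break;                     /* lost connectivity mid-init */
            if (queue_live && !is_user_cmd)
                return IO_OK;              /* internally generated init cmd */
            if (!queue_live && is_connect_cmd)
                return IO_OK;              /* only connect may pass */
            break;
        default:                           /* LIVE but not connected, RESETTING */
            break;
        }

        /* queue-or-reject: fail only ios explicitly marked for it */
        return failfast_or_mpath ? IO_REJECT : IO_REQUEUE;
    }

    int main(void)
    {
        /* a user admin ioctl racing controller init on a live admin
         * queue is now queued instead of failed */
        printf("%d\n", check_if_ready(CTRL_CONNECTING, true, true,
                                      false, true, false)); /* 1 == IO_REQUEUE */
        return 0;
    }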