From ef1661ba6d2e9c8eecd13ee04067bdcc59f7aac6 Mon Sep 17 00:00:00 2001 From: Jean Sacren Date: Fri, 29 Oct 2021 14:29:45 -0600 Subject: [PATCH 01/19] blk-mq: fix redundant check of !e expression In the if branch, e is checked. In the else branch, ->dispatch_busy is merely a number and has no effect on !e. We should remove the check of !e since it is always true. Signed-off-by: Jean Sacren Link: https://lore.kernel.org/r/20211029202945.3052-1-sakiwit@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index c62b966dfaba..4a6789e4398b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -497,7 +497,7 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) { + if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_try_issue_list_directly(hctx, list); if (list_empty(list)) goto out; From a22c00be90de188d36f4772ef7b268aa48d7010d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Nov 2021 06:56:09 -0600 Subject: [PATCH 02/19] block: assign correct tag before doing prefetch of request Ensure that current tag is correctly assigned before attempting to prefetch the first cacheline of the request. Fixes: 92aff191cc5b ("block: prefetch request to be initialized") Reported-and-tested-by: syzbot+cd20829ac44b92bf6ed0@syzkaller.appspotmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 221d1b7d10d6..4787d5b74aa3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -405,8 +405,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; - prefetch(tags->static_rqs[tag]); tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); rq_list_add(data->cached_rq, rq); From b22809092c70099f4d8c3b6f3d34c5bc89b300ea Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 1 Nov 2021 13:40:12 -0600 Subject: [PATCH 03/19] block: replace always false argument with 'false' A previous commit fixed up the condition for doing direct issue, but that left the 'from_schedule' argument dead inside the branch. Replace it with 'false'. 
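For illustration only, a minimal userspace sketch of why the argument is dead (the names and types here are invented for the demo, not the kernel's): once the branch is guarded by !from_schedule, the flag can only be false inside it, so the call site may as well pass the constant.

	#include <stdbool.h>
	#include <stdio.h>

	static void issue_direct(bool from_schedule)
	{
		printf("direct issue, from_schedule=%d\n", from_schedule);
	}

	static void flush_plug(bool multiple_queues, bool from_schedule)
	{
		if (!multiple_queues && !from_schedule)
			issue_direct(false);	/* from_schedule is provably false here */
	}

	int main(void)
	{
		flush_plug(false, false);	/* takes the direct-issue branch */
		flush_plug(false, true);	/* skips the branch entirely */
		return 0;
	}
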
Fixes: ff1552232b36 ("blk-mq: don't issue request directly in case that current is to be blocked") Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4787d5b74aa3..8aed6cea3a34 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2227,7 +2227,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { - blk_mq_plug_issue_direct(plug, from_schedule); + blk_mq_plug_issue_direct(plug, false); if (rq_list_empty(plug->mq_list)) return; } From a1c2f7e7f25c9d35d3bf046f99682c5373b20fa2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 21 Oct 2021 22:59:18 +0800 Subject: [PATCH 04/19] dm: don't stop request queue after the dm device is suspended For fixing queue quiesce race between driver and block layer(elevator switch, update nr_requests, ...), we need to support concurrent quiesce and unquiesce, which requires the two call to be balanced. __bind() is only called from dm_swap_table() in which dm device has been suspended already, so not necessary to stop queue again. With this way, request queue quiesce and unquiesce can be balanced. Reported-by: Yi Zhang Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce") Signed-off-by: Ming Lei Acked-by: Mike Snitzer Tested-by: Yi Zhang Link: https://lore.kernel.org/r/20211021145918.2691762-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/md/dm.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8b91f4f0e053..be0eb2e1dd11 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1927,16 +1927,6 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, dm_table_event_callback(t, event_callback, md); - /* - * The queue hasn't been stopped yet, if the old table type wasn't - * for request-based during suspension. So stop it to prevent - * I/O mapping before resume. - * This must be done before setting the queue restrictions, - * because request-based dm may be run just after the setting. - */ - if (request_based) - dm_stop_queue(q); - if (request_based) { /* * Leverage the fact that request-based DM targets are From 781dd830ec4f4d56b99d5d0c64bacda4c3ee3cfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 2 Nov 2021 08:34:09 -0600 Subject: [PATCH 05/19] block: move RQF_ELV setting into allocators It's not safe to do this before blk_queue_enter(), as the scheduler state could have changed in between. Hence move the RQF_ELV setting into the allocators, where we know the queue is already entered. 
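For illustration only, a minimal userspace sketch of the ordering rule (the types and helpers are stand-ins, not the kernel's): the elevator can be switched right up until the queue has been entered, so the RQF_ELV decision belongs inside the allocator, after that point.

	#include <stdbool.h>
	#include <stdio.h>

	#define RQF_ELV		(1u << 0)

	struct queue {
		bool entered;		/* stand-in for holding q_usage_counter */
		void *elevator;		/* may be switched while not entered */
	};

	static unsigned int alloc_request_flags(struct queue *q)
	{
		unsigned int rq_flags = 0;

		if (!q->entered)
			return 0;		/* too early: the scheduler could still change */
		if (q->elevator)
			rq_flags |= RQF_ELV;	/* decided only after entering the queue */
		return rq_flags;
	}

	int main(void)
	{
		static int dummy_elevator;
		struct queue q = { .entered = true, .elevator = &dummy_elevator };

		printf("rq_flags=%#x\n", alloc_request_flags(&q));
		return 0;
	}
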
Suggested-by: Ming Lei Reported-by: Yi Zhang Reported-by: Steffen Maier Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 8aed6cea3a34..00263d896843 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -419,7 +419,6 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; @@ -431,7 +430,11 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; - if (e) { + if (q->elevator) { + struct elevator_queue *e = q->elevator; + + data->rq_flags |= RQF_ELV; + /* * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the @@ -447,7 +450,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!e) + if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); /* @@ -490,7 +493,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, .q = q, .flags = flags, .cmd_flags = op, - .rq_flags = q->elevator ? RQF_ELV : 0, .nr_tags = 1, }; struct request *rq; @@ -520,7 +522,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .q = q, .flags = flags, .cmd_flags = op, - .rq_flags = q->elevator ? RQF_ELV : 0, .nr_tags = 1, }; u64 alloc_time_ns = 0; @@ -561,6 +562,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!q->elevator) blk_mq_tag_busy(data.hctx); + else + data.rq_flags |= RQF_ELV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); @@ -2515,7 +2518,6 @@ void blk_mq_submit_bio(struct bio *bio) .q = q, .nr_tags = 1, .cmd_flags = bio->bi_opf, - .rq_flags = q->elevator ? RQF_ELV : 0, }; if (plug) { From a1cb65377e707500819b2c2c34064e5ceb32798b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Nov 2021 21:35:00 +0800 Subject: [PATCH 06/19] blk-mq: only try to run plug merge if request has same queue with incoming bio It is obvious that io merge can't be done between two different queues, so just try to run io merge in case of same queue. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211102133502.3619184-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index df69f4bb7717..893c1a60b701 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -1101,9 +1101,11 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, * the same queue, there should be only one such rq in a queue */ *same_queue_rq = true; + + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; } - if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK) - return true; return false; } From 62ba0c008f5d46006b71b2757e2db29e0ce7e68b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Nov 2021 21:35:01 +0800 Subject: [PATCH 07/19] blk-mq: add RQF_ELV debug entry Looks it is missed so add it. 
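For illustration only, a minimal userspace sketch of the RQF_NAME() flag-name table pattern used by blk-mq-debugfs (the enum values are invented for the demo): a flag without a name entry cannot be printed symbolically, which is what the one-line diff below fixes for RQF_ELV.

	#include <stdio.h>

	enum { __RQF_STARTED, __RQF_ELV, __RQF_NR };

	#define RQF_NAME(name) [__RQF_##name] = #name
	static const char *const rqf_name[] = {
		RQF_NAME(STARTED),
		RQF_NAME(ELV),		/* the entry the diff below adds */
	};
	#undef RQF_NAME

	static void print_rq_flags(unsigned int flags)
	{
		for (int i = 0; i < __RQF_NR; i++)
			if (flags & (1u << i))
				printf("%s ", rqf_name[i]);
		printf("\n");
	}

	int main(void)
	{
		print_rq_flags((1u << __RQF_STARTED) | (1u << __RQF_ELV));
		return 0;
	}
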
Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211102133502.3619184-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 0f8c60e9c719..4cdce8b98557 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -309,6 +309,7 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_POLL_SLEPT), + RQF_NAME(ELV), }; #undef RQF_NAME From 3b87c6ea671a18fb77709240d658f4201904f8e4 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Nov 2021 23:36:19 +0800 Subject: [PATCH 08/19] blk-mq: update hctx->nr_active in blk_mq_end_request_batch() In case of shared tags and none io sched, batched completion still may be run into, and hctx->nr_active is accounted when getting driver tag, so it has to be updated in blk_mq_end_request_batch(). Otherwise, hctx->nr_active may become same with queue depth, then hctx_may_queue() always return false, then io hang is caused. Fixes the issue by updating the counter in batched way. Reported-by: Shinichiro Kawasaki Fixes: f794f3351f26 ("block: add support for blk_mq_end_request_batch()") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20211102153619.3627505-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 +++++++ block/blk-mq.h | 12 +++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 00263d896843..c68aa0a332e1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -818,6 +818,13 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; + /* + * All requests should have been marked as RQF_MQ_INFLIGHT, so + * update hctx->nr_active in batch + */ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, nr_tags); + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } diff --git a/block/blk-mq.h b/block/blk-mq.h index 28859fc5faee..cb0b5482ca5e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -225,12 +225,18 @@ static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) atomic_inc(&hctx->nr_active); } -static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) { if (blk_mq_is_shared_tags(hctx->flags)) - atomic_dec(&hctx->queue->nr_active_requests_shared_tags); + atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); else - atomic_dec(&hctx->nr_active); + atomic_sub(val, &hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_sub_active_requests(hctx, 1); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) From c5fc7b93173661336b9cc4e32fd5082a95e12b94 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:49:07 -0600 Subject: [PATCH 09/19] block: have plug stored requests hold references to the queue Requests that were stored in the cache deliberately didn't hold an enter reference to the queue, instead we grabbed one every time we pulled a request out of there. That made for awkward logic on freeing the remainder of the cached list, if needed, where we had to artificially raise the queue usage count before each free. Grab references up front for cached plug requests. That's safer, and also more efficient. 
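For illustration only, a minimal sketch of the new reference scheme (a plain counter stands in for q->q_usage_counter): the submitter already holds one queue reference, the batch allocator takes nr - 1 more for the cached requests, and each cached request freed later drops exactly one, so the count stays balanced without re-grabbing a reference at free time.

	#include <assert.h>
	#include <stdio.h>

	static int q_usage_counter;		/* stand-in for the percpu ref */

	static int alloc_request_batch(int nr)
	{
		q_usage_counter += nr - 1;	/* caller already holds one reference */
		return nr;			/* pretend nr requests were cached */
	}

	static void free_cached_request(void)
	{
		q_usage_counter--;		/* one put per freed request */
	}

	int main(void)
	{
		int cached;

		q_usage_counter = 1;		/* bio_queue_enter() by the submitter */
		cached = alloc_request_batch(4);
		while (cached--)
			free_cached_request();
		assert(q_usage_counter == 0);	/* balanced: a queue freeze can make progress */
		printf("references balanced\n");
		return 0;
	}
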
Fixes: 47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +++++++- block/blk-mq.c | 7 ++++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index fd389a16013c..35a87c06276e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1643,7 +1643,13 @@ void blk_flush_plug(struct blk_plug *plug, bool from_schedule) flush_plug_callbacks(plug, from_schedule); if (!rq_list_empty(plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); - if (unlikely(!from_schedule && plug->cached_rq)) + /* + * Unconditionally flush out cached requests, even if the unplug + * event came from schedule. Since we know hold references to the + * queue for cached requests, we don't want a blocked task holding + * up a queue freeze/quiesce event. + */ + if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); } diff --git a/block/blk-mq.c b/block/blk-mq.c index c68aa0a332e1..5498454c2164 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -410,7 +410,10 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); rq_list_add(data->cached_rq, rq); + nr++; } + /* caller already holds a reference, add for remainder */ + percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rq); @@ -630,10 +633,8 @@ void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; - while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { - percpu_ref_get(&rq->q->q_usage_counter); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) blk_mq_free_request(rq); - } } static void req_bio_endio(struct request *rq, struct bio *bio, From 71539717c10521114403d27e171c9cbe35dcd900 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:52:45 -0600 Subject: [PATCH 10/19] block: split request allocation components into helpers This is in preparation for a fix, but serves as a cleanup as well moving the cached vs regular alloc logic out of blk_mq_submit_bio(). Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 71 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5498454c2164..dcb413297a96 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2478,6 +2478,51 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static struct request *blk_mq_get_new_requests(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio) +{ + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + }; + struct request *rq; + + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; + } + + rq = __blk_mq_alloc_requests(&data); + if (rq) + return rq; + + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + return NULL; +} + +static inline struct request *blk_mq_get_request(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio) +{ + if (plug) { + struct request *rq; + + rq = rq_list_peek(&plug->cached_rq); + if (rq) { + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + return rq; + } + } + + return blk_mq_get_new_requests(q, plug, bio); +} + /** * blk_mq_submit_bio - Create and send a request to block device. 
* @bio: Bio pointer. @@ -2518,29 +2563,9 @@ void blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); plug = blk_mq_plug(q, bio); - if (plug && plug->cached_rq) { - rq = rq_list_pop(&plug->cached_rq); - INIT_LIST_HEAD(&rq->queuelist); - } else { - struct blk_mq_alloc_data data = { - .q = q, - .nr_tags = 1, - .cmd_flags = bio->bi_opf, - }; - - if (plug) { - data.nr_tags = plug->nr_ios; - plug->nr_ios = 1; - data.cached_rq = &plug->cached_rq; - } - rq = __blk_mq_alloc_requests(&data); - if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; - } - } + rq = blk_mq_get_request(q, plug, bio); + if (unlikely(!rq)) + goto queue_exit; trace_block_getrq(bio); From c98cb5bbdab10d187aff9b4e386210eb2332af96 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Nov 2021 12:45:51 -0600 Subject: [PATCH 11/19] block: make bio_queue_enter() fast-path available inline Just a prep patch for shifting the queue enter logic. This moves the expected fast path inline, and leaves __bio_queue_enter() as an out-of-line function call. We don't want to inline the latter, as it's mostly slow path code. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 28 +--------------------------- block/blk.h | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 27 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 35a87c06276e..9ca3ddd154d4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -386,30 +386,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static bool blk_try_enter_queue(struct request_queue *q, bool pm) -{ - rcu_read_lock(); - if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) - goto fail; - - /* - * The code that increments the pm_only counter must ensure that the - * counter is globally visible before the queue is unfrozen. - */ - if (blk_queue_pm_only(q) && - (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) - goto fail_put; - - rcu_read_unlock(); - return true; - -fail_put: - blk_queue_exit(q); -fail: - rcu_read_unlock(); - return false; -} - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -442,10 +418,8 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return 0; } -static inline int bio_queue_enter(struct bio *bio) +int __bio_queue_enter(struct request_queue *q, struct bio *bio) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - while (!blk_try_enter_queue(q, false)) { struct gendisk *disk = bio->bi_bdev->bd_disk; diff --git a/block/blk.h b/block/blk.h index 7afffd548daf..814d9632d43e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,6 +55,40 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); +int __bio_queue_enter(struct request_queue *q, struct bio *bio); + +static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. 
+ */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + blk_queue_exit(q); +fail: + rcu_read_unlock(); + return false; +} + +static inline int bio_queue_enter(struct bio *bio) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (blk_try_enter_queue(q, false)) + return 0; + return __bio_queue_enter(q, bio); +} #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, From 900e080752025f0016128f07c9ed4c50eba3654b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 3 Nov 2021 05:47:09 -0600 Subject: [PATCH 12/19] block: move queue enter logic into blk_mq_submit_bio() Retain the old logic for the fops based submit, but for our internal blk_mq_submit_bio(), move the queue entering logic into the core function itself. We need to be a bit careful if going into the scheduler, as a scheduler or queue mappings can arbitrarily change before we have entered the queue. Have the bio scheduler mapping do that separately, it's a very cheap operation compared to actually doing merging locking and lookups. Reviewed-by: Christoph Hellwig [axboe: update to check merge post submit_bio_checks() doing remap...] Signed-off-by: Jens Axboe --- block/blk-core.c | 25 +++++++++--------- block/blk-mq-sched.c | 13 +++++++--- block/blk-mq.c | 60 ++++++++++++++++++++++++++++++-------------- block/blk.h | 1 + 4 files changed, 65 insertions(+), 34 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 9ca3ddd154d4..4366056e14c4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -744,7 +744,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) +noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); @@ -862,22 +862,23 @@ end_io: return false; } +static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) +{ + if (unlikely(bio_queue_enter(bio) != 0)) + return; + if (submit_bio_checks(bio) && blk_crypto_bio_prep(&bio)) + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); +} + static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - if (unlikely(bio_queue_enter(bio) != 0)) - return; - - if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) - goto queue_exit; - if (!disk->fops->submit_bio) { + if (!disk->fops->submit_bio) blk_mq_submit_bio(bio); - return; - } - disk->fops->submit_bio(bio); -queue_exit: - blk_queue_exit(disk->queue); + else + __submit_bio_fops(disk, bio); } /* diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4a6789e4398b..4be652fa38e7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -370,15 +370,20 @@ bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(q, bio, nr_segs); + if (bio_queue_enter(bio)) + return false; + + if (e && e->type->ops.bio_merge) { + ret = e->type->ops.bio_merge(q, bio, nr_segs); + goto out_put; + } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) - return false; + goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); @@ -391,6 +396,8 @@ bool blk_mq_sched_bio_merge(struct 
request_queue *q, struct bio *bio, ret = true; spin_unlock(&ctx->lock); +out_put: + blk_queue_exit(q); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index dcb413297a96..5fe40c85a308 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2478,9 +2478,23 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static bool blk_attempt_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, bool *same_queue_rq) +{ + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs, same_queue_rq)) + return true; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + return true; + } + return false; +} + static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) { struct blk_mq_alloc_data data = { .q = q, @@ -2489,6 +2503,15 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; + if (unlikely(bio_queue_enter(bio))) + return NULL; + if (unlikely(!submit_bio_checks(bio))) + goto put_exit; + if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + goto put_exit; + + rq_qos_throttle(q, bio); + if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; @@ -2502,25 +2525,34 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); +put_exit: + blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_request(struct request_queue *q, struct blk_plug *plug, - struct bio *bio) + struct bio *bio, + unsigned int nsegs, + bool *same_queue_rq) { if (plug) { struct request *rq; rq = rq_list_peek(&plug->cached_rq); if (rq) { + if (unlikely(!submit_bio_checks(bio))) + return NULL; + if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) + return NULL; plug->cached_rq = rq_list_next(rq); INIT_LIST_HEAD(&rq->queuelist); + rq_qos_throttle(q, bio); return rq; } } - return blk_mq_get_new_requests(q, plug, bio); + return blk_mq_get_new_requests(q, plug, bio, nsegs, same_queue_rq); } /** @@ -2546,26 +2578,20 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; + blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) - goto queue_exit; - - if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { - if (blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; - } - - rq_qos_throttle(q, bio); + return; plug = blk_mq_plug(q, bio); - rq = blk_mq_get_request(q, plug, bio); + rq = blk_mq_get_request(q, plug, bio, nr_segs, &same_queue_rq); if (unlikely(!rq)) - goto queue_exit; + return; trace_block_getrq(bio); @@ -2646,10 +2672,6 @@ void blk_mq_submit_bio(struct bio *bio) /* Default case. 
*/ blk_mq_sched_insert_request(rq, false, true, true); } - - return; -queue_exit: - blk_queue_exit(q); } static size_t order_to_size(unsigned int order) diff --git a/block/blk.h b/block/blk.h index 814d9632d43e..b4fed2033e48 100644 --- a/block/blk.h +++ b/block/blk.h @@ -56,6 +56,7 @@ void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); +bool submit_bio_checks(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { From 10c47870155b5d9a8597eff3345d244e2fe1847f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 4 Nov 2021 11:54:47 -0600 Subject: [PATCH 13/19] block: ensure cached plug request matches the current queue If we're driving multiple devices, we could have pre-populated the cache for a different device. Ensure that the empty request matches the current queue. Fixes: 47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch") Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5fe40c85a308..bbe1fb2dd58d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2540,7 +2540,7 @@ static inline struct request *blk_mq_get_request(struct request_queue *q, struct request *rq; rq = rq_list_peek(&plug->cached_rq); - if (rq) { + if (rq && rq->q == q) { if (unlikely(!submit_bio_checks(bio))) return NULL; if (blk_attempt_bio_merge(q, bio, nsegs, same_queue_rq)) From fe7d064fa3faec5d8157029fb8720b4fddc9e1e8 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Wed, 3 Nov 2021 09:40:23 -0700 Subject: [PATCH 14/19] block: fix device_add_disk() kobject_create_and_add() error handling Commit 83cbce957446 ("block: add error handling for device_add_disk / add_disk") added error handling to device_add_disk(), however the goto label for the kobject_create_and_add() failure did not set the return value correctly, and so we can end up in a situation where kobject_create_and_add() fails but we report success. 
Fixes: 83cbce957446 ("block: add error handling for device_add_disk / add_disk") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211103164023.1384821-1-mcgrof@kernel.org [axboe: fold in followup fix from Wu Bo ] Signed-off-by: Jens Axboe --- block/genhd.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 64f83c4aee99..8ea818aa88ca 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -468,11 +468,15 @@ int device_add_disk(struct device *parent, struct gendisk *disk, disk->part0->bd_holder_dir = kobject_create_and_add("holders", &ddev->kobj); - if (!disk->part0->bd_holder_dir) + if (!disk->part0->bd_holder_dir) { + ret = -ENOMEM; goto out_del_integrity; + } disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); - if (!disk->slave_dir) + if (!disk->slave_dir) { + ret = -ENOMEM; goto out_put_holder_dir; + } ret = bd_register_pending_holders(disk); if (ret < 0) From a846a8e6c9a5949582c5a6a8bbc83a7d27fd891e Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 8 Nov 2021 15:40:19 +0800 Subject: [PATCH 15/19] blk-mq: don't free tags if the tag_set is used by other device in queue initialztion We got UAF report on v5.10 as follows: [ 1446.674930] ================================================================== [ 1446.675970] BUG: KASAN: use-after-free in blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.676902] Read of size 8 at addr ffff8880185afd10 by task kworker/1:2/12348 [ 1446.677851] [ 1446.678073] CPU: 1 PID: 12348 Comm: kworker/1:2 Not tainted 5.10.0-10177-gc9c81b1e346a #2 [ 1446.679168] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1446.680692] Workqueue: kthrotld blk_throtl_dispatch_work_fn [ 1446.681448] Call Trace: [ 1446.681800] dump_stack+0x9b/0xce [ 1446.682916] print_address_description.constprop.6+0x3e/0x60 [ 1446.685999] kasan_report.cold.9+0x22/0x3a [ 1446.687186] blk_mq_get_driver_tag+0x9a4/0xa90 [ 1446.687785] blk_mq_dispatch_rq_list+0x21a/0x1d40 [ 1446.692576] __blk_mq_do_dispatch_sched+0x394/0x830 [ 1446.695758] __blk_mq_sched_dispatch_requests+0x398/0x4f0 [ 1446.698279] blk_mq_sched_dispatch_requests+0xdf/0x140 [ 1446.698967] __blk_mq_run_hw_queue+0xc0/0x270 [ 1446.699561] __blk_mq_delay_run_hw_queue+0x4cc/0x550 [ 1446.701407] blk_mq_run_hw_queue+0x13b/0x2b0 [ 1446.702593] blk_mq_sched_insert_requests+0x1de/0x390 [ 1446.703309] blk_mq_flush_plug_list+0x4b4/0x760 [ 1446.705408] blk_flush_plug_list+0x2c5/0x480 [ 1446.708471] blk_finish_plug+0x55/0xa0 [ 1446.708980] blk_throtl_dispatch_work_fn+0x23b/0x2e0 [ 1446.711236] process_one_work+0x6d4/0xfe0 [ 1446.711778] worker_thread+0x91/0xc80 [ 1446.713400] kthread+0x32d/0x3f0 [ 1446.714362] ret_from_fork+0x1f/0x30 [ 1446.714846] [ 1446.715062] Allocated by task 1: [ 1446.715509] kasan_save_stack+0x19/0x40 [ 1446.716026] __kasan_kmalloc.constprop.1+0xc1/0xd0 [ 1446.716673] blk_mq_init_tags+0x6d/0x330 [ 1446.717207] blk_mq_alloc_rq_map+0x50/0x1c0 [ 1446.717769] __blk_mq_alloc_map_and_request+0xe5/0x320 [ 1446.718459] blk_mq_alloc_tag_set+0x679/0xdc0 [ 1446.719050] scsi_add_host_with_dma.cold.3+0xa0/0x5db [ 1446.719736] virtscsi_probe+0x7bf/0xbd0 [ 1446.720265] virtio_dev_probe+0x402/0x6c0 [ 1446.720808] really_probe+0x276/0xde0 [ 1446.721320] driver_probe_device+0x267/0x3d0 [ 1446.721892] device_driver_attach+0xfe/0x140 [ 1446.722491] __driver_attach+0x13a/0x2c0 [ 1446.723037] 
bus_for_each_dev+0x146/0x1c0 [ 1446.723603] bus_add_driver+0x3fc/0x680 [ 1446.724145] driver_register+0x1c0/0x400 [ 1446.724693] init+0xa2/0xe8 [ 1446.725091] do_one_initcall+0x9e/0x310 [ 1446.725626] kernel_init_freeable+0xc56/0xcb9 [ 1446.726231] kernel_init+0x11/0x198 [ 1446.726714] ret_from_fork+0x1f/0x30 [ 1446.727212] [ 1446.727433] Freed by task 26992: [ 1446.727882] kasan_save_stack+0x19/0x40 [ 1446.728420] kasan_set_track+0x1c/0x30 [ 1446.728943] kasan_set_free_info+0x1b/0x30 [ 1446.729517] __kasan_slab_free+0x111/0x160 [ 1446.730084] kfree+0xb8/0x520 [ 1446.730507] blk_mq_free_map_and_requests+0x10b/0x1b0 [ 1446.731206] blk_mq_realloc_hw_ctxs+0x8cb/0x15b0 [ 1446.731844] blk_mq_init_allocated_queue+0x374/0x1380 [ 1446.732540] blk_mq_init_queue_data+0x7f/0xd0 [ 1446.733155] scsi_mq_alloc_queue+0x45/0x170 [ 1446.733730] scsi_alloc_sdev+0x73c/0xb20 [ 1446.734281] scsi_probe_and_add_lun+0x9a6/0x2d90 [ 1446.734916] __scsi_scan_target+0x208/0xc50 [ 1446.735500] scsi_scan_channel.part.3+0x113/0x170 [ 1446.736149] scsi_scan_host_selected+0x25a/0x360 [ 1446.736783] store_scan+0x290/0x2d0 [ 1446.737275] dev_attr_store+0x55/0x80 [ 1446.737782] sysfs_kf_write+0x132/0x190 [ 1446.738313] kernfs_fop_write_iter+0x319/0x4b0 [ 1446.738921] new_sync_write+0x40e/0x5c0 [ 1446.739429] vfs_write+0x519/0x720 [ 1446.739877] ksys_write+0xf8/0x1f0 [ 1446.740332] do_syscall_64+0x2d/0x40 [ 1446.740802] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1446.741462] [ 1446.741670] The buggy address belongs to the object at ffff8880185afd00 [ 1446.741670] which belongs to the cache kmalloc-256 of size 256 [ 1446.743276] The buggy address is located 16 bytes inside of [ 1446.743276] 256-byte region [ffff8880185afd00, ffff8880185afe00) [ 1446.744765] The buggy address belongs to the page: [ 1446.745416] page:ffffea0000616b00 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x185ac [ 1446.746694] head:ffffea0000616b00 order:2 compound_mapcount:0 compound_pincount:0 [ 1446.747719] flags: 0x1fffff80010200(slab|head) [ 1446.748337] raw: 001fffff80010200 ffffea00006a3208 ffffea000061bf08 ffff88801004f240 [ 1446.749404] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 [ 1446.750455] page dumped because: kasan: bad access detected [ 1446.751227] [ 1446.751445] Memory state around the buggy address: [ 1446.752102] ffff8880185afc00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.753090] ffff8880185afc80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.754079] >ffff8880185afd00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.755065] ^ [ 1446.755589] ffff8880185afd80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 1446.756574] ffff8880185afe00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc [ 1446.757566] ================================================================== Flag 'BLK_MQ_F_TAG_QUEUE_SHARED' will be set if the second device on the same host initializes it's queue successfully. However, if the second device failed to allocate memory in blk_mq_alloc_and_init_hctx() from blk_mq_realloc_hw_ctxs() from blk_mq_init_allocated_queue(), __blk_mq_free_map_and_rqs() will be called on error path, and if 'BLK_MQ_TAG_HCTX_SHARED' is not set, 'tag_set->tags' will be freed while it's still used by the first device. To fix this issue we move release newly allocated hardware context from blk_mq_realloc_hw_ctxs to __blk_mq_update_nr_hw_queues. As there is needn't to release hardware context in blk_mq_init_allocated_queue. 
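For illustration only, a minimal sketch of the ownership rule behind the fix (the structures are invented for the demo): the tag maps belong to the shared tag_set, not to any one queue, so a queue whose hctx initialization fails must only tear down its own state and leave set->tags for the tag_set-level fallback path to release.

	#include <stdio.h>
	#include <stdlib.h>

	struct tag_set {
		int *tags;		/* shared by every queue on the host */
		int nr_queues;
	};

	static void queue_init_failed(struct tag_set *set)
	{
		/* per-queue teardown: do NOT free set->tags here */
		set->nr_queues--;
	}

	static void tag_set_free(struct tag_set *set)
	{
		free(set->tags);	/* freed exactly once, by its owner */
		set->tags = NULL;
	}

	int main(void)
	{
		struct tag_set set = { .tags = calloc(64, sizeof(int)), .nr_queues = 2 };

		queue_init_failed(&set);	/* second device fails to initialize */
		printf("first device still sees valid tags: %p\n", (void *)set.tags);
		set.nr_queues--;		/* first device goes away normally */
		tag_set_free(&set);		/* only now may the shared tags be released */
		return 0;
	}
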
Fixes: 868f2f0b7206 ("blk-mq: dynamic h/w context count") Signed-off-by: Ye Bin Signed-off-by: Yu Kuai Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20211108074019.1058843-1-yebin10@huawei.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index bbe1fb2dd58d..5a9cd9fe8da3 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3657,7 +3657,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - __blk_mq_free_map_and_rqs(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -4165,8 +4164,13 @@ fallback: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); + set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; From 9ef4d0209cbadb63656a7aa29fde49c27ab2b9bf Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:41 +0800 Subject: [PATCH 16/19] blk-mq: add one API for waiting until quiesce is done Some drivers(NVMe, SCSI) need to call quiesce and unquiesce in pair, but it is hard to switch to this style, so these drivers need one atomic flag for helping to balance quiesce and unquiesce. When quiesce is in-progress, the driver still needs to wait until the quiesce is done, so add API of blk_mq_wait_quiesce_done() for these drivers. Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211109071144.181581-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 38 +++++++++++++++++++++++++------------- include/linux/blk-mq.h | 1 + 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5a9cd9fe8da3..d3e5fcbc943b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -250,6 +250,30 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); +/** + * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done + * @q: request queue. + * + * Note: it is driver's responsibility for making sure that quiesce has + * been started. + */ +void blk_mq_wait_quiesce_done(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(hctx->srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); + /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. 
@@ -261,20 +285,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_quiesce_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - blk_mq_quiesce_queue_nowait(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) - synchronize_rcu(); + blk_mq_wait_quiesce_done(q); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index b4039fdf1b04..d53ee59ba131 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -803,6 +803,7 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); +void blk_mq_wait_quiesce_done(struct request_queue *q); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); From d2b9f12b0f7cf95c43f5fd4a18688d958d39e423 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:42 +0800 Subject: [PATCH 17/19] scsi: avoid to quiesce sdev->request_queue two times For fixing queue quiesce race between driver and block layer(elevator switch, update nr_requests, ...), we need to support concurrent quiesce and unquiesce, which requires the two to be balanced. blk_mq_quiesce_queue() calls blk_mq_quiesce_queue_nowait() for updating quiesce depth and marking the flag, then scsi_internal_device_block() calls blk_mq_quiesce_queue_nowait() two times actually. Fix the double quiesce and keep quiesce and unquiesce balanced. Reported-by: Yi Zhang Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce") Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211109071144.181581-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 30f7d0b4eb73..51fcd46be265 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2630,6 +2630,14 @@ scsi_target_resume(struct scsi_target *starget) } EXPORT_SYMBOL(scsi_target_resume); +static int __scsi_internal_device_block_nowait(struct scsi_device *sdev) +{ + if (scsi_device_set_state(sdev, SDEV_BLOCK)) + return scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); + + return 0; +} + /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block @@ -2646,24 +2654,16 @@ EXPORT_SYMBOL(scsi_target_resume); */ int scsi_internal_device_block_nowait(struct scsi_device *sdev) { - struct request_queue *q = sdev->request_queue; - int err = 0; - - err = scsi_device_set_state(sdev, SDEV_BLOCK); - if (err) { - err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); - - if (err) - return err; - } + int ret = __scsi_internal_device_block_nowait(sdev); /* * The device has transitioned to SDEV_BLOCK. Stop the * block layer from calling the midlayer with this device's * request queue. 
*/ - blk_mq_quiesce_queue_nowait(q); - return 0; + if (!ret) + blk_mq_quiesce_queue_nowait(sdev->request_queue); + return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -2684,13 +2684,12 @@ EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); */ static int scsi_internal_device_block(struct scsi_device *sdev) { - struct request_queue *q = sdev->request_queue; int err; mutex_lock(&sdev->state_mutex); - err = scsi_internal_device_block_nowait(sdev); + err = __scsi_internal_device_block_nowait(sdev); if (err == 0) - blk_mq_quiesce_queue(q); + blk_mq_quiesce_queue(sdev->request_queue); mutex_unlock(&sdev->state_mutex); return err; From 93542fbfa7b726d053c01a9399577c03968c4f6b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:43 +0800 Subject: [PATCH 18/19] scsi: make sure that request queue queiesce and unquiesce balanced For fixing queue quiesce race between driver and block layer(elevator switch, update nr_requests, ...), we need to support concurrent quiesce and unquiesce, which requires the two call balanced. It isn't easy to audit that in all scsi drivers, especially the two may be called from different contexts, so do it in scsi core with one per-device atomic variable to balance quiesce and unquiesce. Reported-by: Yi Zhang Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce") Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211109071144.181581-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 37 ++++++++++++++++++++++++++++--------- include/scsi/scsi_device.h | 1 + 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 51fcd46be265..e1929bcc4e53 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2638,6 +2638,32 @@ static int __scsi_internal_device_block_nowait(struct scsi_device *sdev) return 0; } +void scsi_start_queue(struct scsi_device *sdev) +{ + if (cmpxchg(&sdev->queue_stopped, 1, 0)) + blk_mq_unquiesce_queue(sdev->request_queue); +} + +static void scsi_stop_queue(struct scsi_device *sdev, bool nowait) +{ + /* + * The atomic variable of ->queue_stopped covers that + * blk_mq_quiesce_queue* is balanced with blk_mq_unquiesce_queue. + * + * However, we still need to wait until quiesce is done + * in case that queue has been stopped. + */ + if (!cmpxchg(&sdev->queue_stopped, 0, 1)) { + if (nowait) + blk_mq_quiesce_queue_nowait(sdev->request_queue); + else + blk_mq_quiesce_queue(sdev->request_queue); + } else { + if (!nowait) + blk_mq_wait_quiesce_done(sdev->request_queue); + } +} + /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block @@ -2662,7 +2688,7 @@ int scsi_internal_device_block_nowait(struct scsi_device *sdev) * request queue. 
*/ if (!ret) - blk_mq_quiesce_queue_nowait(sdev->request_queue); + scsi_stop_queue(sdev, true); return ret; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); @@ -2689,19 +2715,12 @@ static int scsi_internal_device_block(struct scsi_device *sdev) mutex_lock(&sdev->state_mutex); err = __scsi_internal_device_block_nowait(sdev); if (err == 0) - blk_mq_quiesce_queue(sdev->request_queue); + scsi_stop_queue(sdev, false); mutex_unlock(&sdev->state_mutex); return err; } -void scsi_start_queue(struct scsi_device *sdev) -{ - struct request_queue *q = sdev->request_queue; - - blk_mq_unquiesce_queue(q); -} - /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 430b73bd02ac..d1c6fc83b1e3 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -207,6 +207,7 @@ struct scsi_device { * creation time */ unsigned ignore_media_change:1; /* Ignore MEDIA CHANGE on resume */ + unsigned int queue_stopped; /* request queue is quiesced */ bool offline_already; /* Device offline message logged */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ From 26af1cd00364ce20dbec66b93ef42f9d42dc6953 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 9 Nov 2021 15:11:44 +0800 Subject: [PATCH 19/19] nvme: wait until quiesce is done NVMe uses one atomic flag to check if quiesce is needed. If quiesce is started, the helper returns immediately. This way is wrong, since we have to wait until quiesce is done. Fixes: e70feb8b3e68 ("blk-mq: support concurrent queue quiesce/unquiesce") Reviewed-by: Keith Busch Signed-off-by: Ming Lei Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20211109071144.181581-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index eb284f45fc44..2fc5ea9a7793 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4476,6 +4476,8 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns) { if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags)) blk_mq_quiesce_queue(ns->queue); + else + blk_mq_wait_quiesce_done(ns->queue); } /* @@ -4595,6 +4597,8 @@ void nvme_stop_admin_queue(struct nvme_ctrl *ctrl) { if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags)) blk_mq_quiesce_queue(ctrl->admin_q); + else + blk_mq_wait_quiesce_done(ctrl->admin_q); } EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
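Taken together, the last four patches converge on one pattern, sketched below in plain single-threaded C (the flag and helpers are stand-ins for NVMe's NVME_NS_STOPPED / SCSI's queue_stopped and the corresponding blk_mq_* calls, not the real implementation): the first caller to flip the flag performs the quiesce, later callers only wait for it to finish, and unquiesce runs once when the flag is cleared, so quiesce and unquiesce stay balanced.

	#include <stdbool.h>
	#include <stdio.h>

	static bool queue_stopped;	/* stand-in for the per-device atomic flag */
	static int quiesce_depth;	/* stand-in for the queue's quiesce depth */

	static void stop_queue(void)
	{
		if (!queue_stopped) {
			queue_stopped = true;
			quiesce_depth++;	/* blk_mq_quiesce_queue() */
			printf("quiesced, depth=%d\n", quiesce_depth);
		} else {
			/* someone else started it: blk_mq_wait_quiesce_done() */
			printf("already quiescing, waiting only\n");
		}
	}

	static void start_queue(void)
	{
		if (queue_stopped) {
			queue_stopped = false;
			quiesce_depth--;	/* blk_mq_unquiesce_queue() */
			printf("unquiesced, depth=%d\n", quiesce_depth);
		}
	}

	int main(void)
	{
		stop_queue();
		stop_queue();	/* second stop must not nest another quiesce */
		start_queue();	/* depth returns to 0: balanced */
		return 0;
	}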