From ec51f8ee1e63498e9f521ec0e5a6d04622bb2c67 Mon Sep 17 00:00:00 2001
From: Mike Marshall
Date: Tue, 5 Feb 2019 14:13:35 -0500
Subject: [PATCH 01/10] aio: initialize kiocb private in case any filesystems
 expect it.

A recent optimization had left private uninitialized.

Fixes: 2bc4ca9bb600 ("aio: don't zero entire aio_kiocb in aio_get_req()")
Reviewed-by: Christoph Hellwig
Signed-off-by: Mike Marshall
Signed-off-by: Jens Axboe
---
 fs/aio.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/aio.c b/fs/aio.c
index b906ff70c90f..aaaaf4d12c73 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1436,6 +1436,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	if (unlikely(!req->ki_filp))
 		return -EBADF;
 	req->ki_complete = aio_complete_rw;
+	req->private = NULL;
 	req->ki_pos = iocb->aio_offset;
 	req->ki_flags = iocb_flags(req->ki_filp);
 	if (iocb->aio_flags & IOCB_FLAG_RESFD)

From e7ad43c3eda6a1690c4c3c341f95dc1c6898da83 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 28 Jan 2019 09:46:07 -0700
Subject: [PATCH 02/10] nvme: lock NS list changes while handling command
 effects

If a controller supports the NS Change Notification, the namespace
scan_work is automatically triggered after attaching a new namespace.

Occasionally the namespace scan_work may append the new namespace to the
list before the admin command effects handling is completed. The effects
handling unfreezes namespaces, but if it unfreezes the newly attached
namespace, its request_queue freeze depth will be off and we'll hit the
warning in blk_mq_unfreeze_queue(). On the next namespace add, we will
fail to freeze that queue due to the previous bad accounting and deadlock
waiting for the freeze to complete.

Fix that by preventing scan work from altering the namespace list while
command effects handling needs to pair freeze with unfreeze.

Reported-by: Wen Xiong
Tested-by: Wen Xiong
Signed-off-by: Keith Busch
Reviewed-by: Chaitanya Kulkarni
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 8 +++++++-
 drivers/nvme/host/nvme.h | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 150e49723c15..6a9dd68c0f4f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1253,6 +1253,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	 * effects say only one namespace is affected.
 	 */
 	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
+		mutex_lock(&ctrl->scan_lock);
 		nvme_start_freeze(ctrl);
 		nvme_wait_freeze(ctrl);
 	}
@@ -1281,8 +1282,10 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
 	 */
 	if (effects & NVME_CMD_EFFECTS_LBCC)
 		nvme_update_formats(ctrl);
-	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK))
+	if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) {
 		nvme_unfreeze(ctrl);
+		mutex_unlock(&ctrl->scan_lock);
+	}
 	if (effects & NVME_CMD_EFFECTS_CCC)
 		nvme_init_identify(ctrl);
 	if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC))
@@ -3401,6 +3404,7 @@ static void nvme_scan_work(struct work_struct *work)
 	if (nvme_identify_ctrl(ctrl, &id))
 		return;
 
+	mutex_lock(&ctrl->scan_lock);
 	nn = le32_to_cpu(id->nn);
 	if (ctrl->vs >= NVME_VS(1, 1, 0) &&
 	    !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
@@ -3409,6 +3413,7 @@ static void nvme_scan_work(struct work_struct *work)
 	}
 	nvme_scan_ns_sequential(ctrl, nn);
 out_free_id:
+	mutex_unlock(&ctrl->scan_lock);
 	kfree(id);
 	down_write(&ctrl->namespaces_rwsem);
 	list_sort(NULL, &ctrl->namespaces, ns_cmp);
@@ -3652,6 +3657,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->state = NVME_CTRL_NEW;
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ab961bdeea89..c4a1bb41abf0 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -154,6 +154,7 @@ struct nvme_ctrl {
 	enum nvme_ctrl_state state;
 	bool identified;
 	spinlock_t lock;
+	struct mutex scan_lock;
 	const struct nvme_ctrl_ops *ops;
 	struct request_queue *admin_q;
 	struct request_queue *connect_q;

From 5c959d73dba6495ec01d04c206ee679d61ccb2b0 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Wed, 23 Jan 2019 18:46:11 -0700
Subject: [PATCH 03/10] nvme-pci: fix rapid add remove sequence

A surprise removal may fail to tear down request queues if it is racing
with the initial asynchronous probe. If that happens, the remove path
won't see the queue resources to tear down, and the controller reset
path may create a new request queue on a removed device, but will not be
able to make forward progress, deadlocking the pci removal.

Protect setting up non-blocking resources from a shutdown by holding the
same mutex, and transition to the CONNECTING state after these resources
are initialized so the probe path may see the dead controller state
before dispatching new IO.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=202081
Reported-by: Alex Gagniuc
Signed-off-by: Keith Busch
Tested-by: Alex Gagniuc
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9bc585415d9b..022ea1ee63f8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2557,16 +2557,7 @@ static void nvme_reset_work(struct work_struct *work)
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
 
-	/*
-	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
-	 * initializing procedure here.
-	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
-		dev_warn(dev->ctrl.device,
-			"failed to mark controller CONNECTING\n");
-		goto out;
-	}
-
+	mutex_lock(&dev->shutdown_lock);
 	result = nvme_pci_enable(dev);
 	if (result)
 		goto out;
@@ -2585,6 +2576,17 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
 	dev->ctrl.max_segments = NVME_MAX_SEGS;
+	mutex_unlock(&dev->shutdown_lock);
+
+	/*
+	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
+	 * initializing procedure here.
+	 */
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+		dev_warn(dev->ctrl.device,
+			"failed to mark controller CONNECTING\n");
+		goto out;
+	}
 
 	result = nvme_init_identify(&dev->ctrl);
 	if (result)

From dd957493baa586f1431490f97f9c7c45eaf8ab10 Mon Sep 17 00:00:00 2001
From: Hans de Goede
Date: Sun, 3 Feb 2019 10:02:07 +0100
Subject: [PATCH 04/10] libata: Add NOLPM quirk for SAMSUNG MZ7TE512HMHP-000L1
 SSD

We've received a bug report that using LPM with a SAMSUNG
MZ7TE512HMHP-000L1 SSD leads to system instability. We already have a
quirk for the MZ7TD256HAFV-000L9, which is also a Samsung EVO 840 /
PM851 OEM model, so it seems some of these models have an LPM issue.

This commit adds a NOLPM quirk for the model string from the new bug
report, to avoid the reported stability issues.

Cc: stable@vger.kernel.org
BugLink: https://bugzilla.redhat.com/show_bug.cgi?id=1571330
Signed-off-by: Hans de Goede
Signed-off-by: Jens Axboe
---
 drivers/ata/libata-core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index b8c3f9e6af89..adf28788cab5 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -4554,6 +4554,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = {
 	{ "SAMSUNG MZMPC128HBFU-000MV", "CXM14M1Q", ATA_HORKAGE_NOLPM, },
 	{ "SAMSUNG SSD PM830 mSATA *",  "CXM13D1Q", ATA_HORKAGE_NOLPM, },
 	{ "SAMSUNG MZ7TD256HAFV-000L9", NULL,       ATA_HORKAGE_NOLPM, },
+	{ "SAMSUNG MZ7TE512HMHP-000L1", "EXT06L0Q", ATA_HORKAGE_NOLPM, },
 
 	/* devices that don't properly handle queued TRIM commands */
 	{ "Micron_M500IT_*",		"MU01",	ATA_HORKAGE_NO_NCQ_TRIM |

From 30363d6506d0d202bb14f4dac36d9b4b0714ad8d Mon Sep 17 00:00:00 2001
From: Chengguang Xu
Date: Fri, 1 Feb 2019 11:26:02 +0800
Subject: [PATCH 05/10] m68k: set proper major_num when specifying module
 param major_num

When calling register_blkdev() with a specified major device number, the
return code is 0 on success, so it is not correct to assign the return
code directly to major_num in this case.
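As an illustration of the convention the fix relies on (a sketch of
register_blkdev()'s documented return values, not part of the patch
itself):

	/* major == 0: dynamic allocation, a positive return is the new major */
	ret = register_blkdev(0, "nfhd");

	/* major != 0: fixed major, 0 is returned on success */
	ret = register_blkdev(major_num, "nfhd");

In both cases a negative return is an error code, which is why the fix
below checks for ret < 0 and only copies ret into major_num when a
dynamic major was requested.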
Tested-by: Michael Schmitz
Reviewed-by: Geert Uytterhoeven
Signed-off-by: Chengguang Xu
Signed-off-by: Jens Axboe
---
 arch/m68k/emu/nfblock.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index 38049357d6d3..40712e49381b 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -155,18 +155,22 @@ out:
 static int __init nfhd_init(void)
 {
 	u32 blocks, bsize;
+	int ret;
 	int i;
 
 	nfhd_id = nf_get_id("XHDI");
 	if (!nfhd_id)
 		return -ENODEV;
 
-	major_num = register_blkdev(major_num, "nfhd");
-	if (major_num <= 0) {
+	ret = register_blkdev(major_num, "nfhd");
+	if (ret < 0) {
 		pr_warn("nfhd: unable to get major number\n");
-		return major_num;
+		return ret;
 	}
 
+	if (!major_num)
+		major_num = ret;
+
 	for (i = NFHD_DEV_OFFSET; i < 24; i++) {
 		if (nfhd_get_capacity(i, 0, &blocks, &bsize))
 			continue;

From 43636c804df0126da669c261fc820fb22f62bfc2 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Mon, 21 Jan 2019 22:49:37 +0900
Subject: [PATCH 06/10] fs: ratelimit __find_get_block_slow() failure message.

When something lets __find_get_block_slow() hit the all_mapped path, it
calls printk() 100+ times per second. But there is no need to print the
same message at such a high frequency; it is just asking for a stall
warning, or at least bloating log files.

[ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
[ 399.873324][T15342] b_state=0x00000029, b_size=512
[ 399.878403][T15342] device loop0 blocksize: 4096
[ 399.883296][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
[ 399.890400][T15342] b_state=0x00000029, b_size=512
[ 399.895595][T15342] device loop0 blocksize: 4096
[ 399.900556][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8
[ 399.907471][T15342] b_state=0x00000029, b_size=512
[ 399.912506][T15342] device loop0 blocksize: 4096

This patch reduces the frequency to at most once per second, in addition
to concatenating the three lines into one.

[ 399.866302][T15342] __find_get_block_slow() failed. block=1, b_blocknr=8, b_state=0x00000029, b_size=512, device loop0 blocksize: 4096

Signed-off-by: Tetsuo Handa
Reviewed-by: Jan Kara
Cc: Dmitry Vyukov
Signed-off-by: Jens Axboe
---
 fs/buffer.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..48318fb74938 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -200,6 +200,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	struct buffer_head *head;
 	struct page *page;
 	int all_mapped = 1;
+	static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
 
 	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
 	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
@@ -227,15 +228,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	 * file io on the block device and getblk. It gets dealt with
 	 * elsewhere, don't buffer_error if we had some unmapped buffers
 	 */
-	if (all_mapped) {
-		printk("__find_get_block_slow() failed. "
-			"block=%llu, b_blocknr=%llu\n",
-			(unsigned long long)block,
-			(unsigned long long)bh->b_blocknr);
-		printk("b_state=0x%08lx, b_size=%zu\n",
-			bh->b_state, bh->b_size);
-		printk("device %pg blocksize: %d\n", bdev,
-			1 << bd_inode->i_blkbits);
+	ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
+	if (all_mapped && __ratelimit(&last_warned)) {
+		printk("__find_get_block_slow() failed. block=%llu, "
+		       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
+		       "device %pg blocksize: %d\n",
+		       (unsigned long long)block,
+		       (unsigned long long)bh->b_blocknr,
+		       bh->b_state, bh->b_size, bdev,
+		       1 << bd_inode->i_blkbits);
 	}
 out_unlock:
 	spin_unlock(&bd_mapping->private_lock);

From 0803de78049fe1b0baf44bcddc727b036fb9139b Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 7 Feb 2019 11:55:39 +0100
Subject: [PATCH 07/10] blktrace: Show requests without sector

Currently, blktrace will not show requests that don't have any data, as
rq->__sector is initialized to -1, which is out of the device range and
thus discarded by act_log_check(). This is most notably the case for
cache flush requests sent to the device. Fix the problem by making
blk_rq_trace_sector() return 0 for requests without an initialized
sector.

Reviewed-by: Johannes Thumshirn
Signed-off-by: Jan Kara
Signed-off-by: Jens Axboe
---
 include/linux/blktrace_api.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 8804753805ac..7bb2d8de9f30 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -116,7 +116,13 @@ extern void blk_fill_rwbs(char *rwbs, unsigned int op, int bytes);
 
 static inline sector_t blk_rq_trace_sector(struct request *rq)
 {
-	return blk_rq_is_passthrough(rq) ? 0 : blk_rq_pos(rq);
+	/*
+	 * Tracing should ignore starting sector for passthrough requests and
+	 * requests where starting sector didn't get set.
+	 */
+	if (blk_rq_is_passthrough(rq) || blk_rq_pos(rq) == (sector_t)-1)
+		return 0;
+	return blk_rq_pos(rq);
 }
 
 static inline unsigned int blk_rq_trace_nr_sectors(struct request *rq)

From 8c772a9bfc7c07c76f4a58b58910452fbb20843b Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Fri, 25 Jan 2019 08:12:47 +0800
Subject: [PATCH 08/10] blk-iolatency: fix IO hang due to negative inflight
 counter

Our test reported the following stack, and vmcore showed that the
->inflight counter is -1.
[ffffc9003fcc38d0] __schedule at ffffffff8173d95d
[ffffc9003fcc3958] schedule at ffffffff8173de26
[ffffc9003fcc3970] io_schedule at ffffffff810bb6b6
[ffffc9003fcc3988] blkcg_iolatency_throttle at ffffffff813911cb
[ffffc9003fcc3a20] rq_qos_throttle at ffffffff813847f3
[ffffc9003fcc3a48] blk_mq_make_request at ffffffff8137468a
[ffffc9003fcc3b08] generic_make_request at ffffffff81368b49
[ffffc9003fcc3b68] submit_bio at ffffffff81368d7d
[ffffc9003fcc3bb8] ext4_io_submit at ffffffffa031be00 [ext4]
[ffffc9003fcc3c00] ext4_writepages at ffffffffa03163de [ext4]
[ffffc9003fcc3d68] do_writepages at ffffffff811c49ae
[ffffc9003fcc3d78] __filemap_fdatawrite_range at ffffffff811b6188
[ffffc9003fcc3e30] filemap_write_and_wait_range at ffffffff811b6301
[ffffc9003fcc3e60] ext4_sync_file at ffffffffa030cee8 [ext4]
[ffffc9003fcc3ea8] vfs_fsync_range at ffffffff8128594b
[ffffc9003fcc3ee8] do_fsync at ffffffff81285abd
[ffffc9003fcc3f18] sys_fsync at ffffffff81285d50
[ffffc9003fcc3f28] do_syscall_64 at ffffffff81003c04
[ffffc9003fcc3f50] entry_SYSCALL_64_after_swapgs at ffffffff81742b8e

The ->inflight counter may be negative (-1) if

1) blk-iolatency was disabled when the IO was issued,
2) blk-iolatency was enabled before this IO reached its endio, and
3) the ->inflight counter is decreased from 0 to -1 in endio().

In fact, the hang can easily be reproduced by the below script,

H=/sys/fs/cgroup/unified/
P=/sys/fs/cgroup/unified/test

echo "+io" > $H/cgroup.subtree_control
mkdir -p $P

echo $$ > $P/cgroup.procs

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

echo "`cat /sys/block/sdg/dev` target=1000000" > $P/io.latency

xfs_io -f -d -c "pwrite 0 4k" /dev/sdg

This fixes the problem by freezing the queue so that there is no
inflight rq running while iolatency is being enabled/disabled.

Note that quiescing the queue is not needed here, as this only updates
the iolatency configuration, which the request_queue's dispatch path
doesn't care about.

Signed-off-by: Liu Bo
Signed-off-by: Jens Axboe
---
 block/blk-iolatency.c | 52 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index fc714ef402a6..1893686a9c1f 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -72,6 +72,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "blk-rq-qos.h"
 #include "blk-stat.h"
@@ -601,6 +602,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 		return;
 
 	enabled = blk_iolatency_enabled(iolat->blkiolat);
+	if (!enabled)
+		return;
+
 	while (blkg && blkg->parent) {
 		iolat = blkg_to_lat(blkg);
 		if (!iolat) {
@@ -610,7 +614,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 
 		rqw = &iolat->rq_wait;
 		atomic_dec(&rqw->inflight);
-		if (!enabled || iolat->min_lat_nsec == 0)
+		if (iolat->min_lat_nsec == 0)
 			goto next;
 		iolatency_record_time(iolat, &bio->bi_issue, now,
 				      issue_as_root);
@@ -754,10 +758,13 @@ int blk_iolatency_init(struct request_queue *q)
 	return 0;
 }
 
-static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+/*
+ * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
+ * return 0.
+ */
+static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 {
 	struct iolatency_grp *iolat = blkg_to_lat(blkg);
-	struct blk_iolatency *blkiolat = iolat->blkiolat;
 	u64 oldval = iolat->min_lat_nsec;
 
 	iolat->min_lat_nsec = val;
@@ -766,9 +773,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
 			    BLKIOLATENCY_MAX_WIN_SIZE);
 
 	if (!oldval && val)
-		atomic_inc(&blkiolat->enabled);
+		return 1;
 	if (oldval && !val)
-		atomic_dec(&blkiolat->enabled);
+		return -1;
+	return 0;
 }
 
 static void iolatency_clear_scaling(struct blkcg_gq *blkg)
@@ -800,6 +808,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	u64 lat_val = 0;
 	u64 oldval;
 	int ret;
+	int enable = 0;
 
 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
 	if (ret)
@@ -834,7 +843,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	blkg = ctx.blkg;
 	oldval = iolat->min_lat_nsec;
 
-	iolatency_set_min_lat_nsec(blkg, lat_val);
+	enable = iolatency_set_min_lat_nsec(blkg, lat_val);
+	if (enable) {
+		WARN_ON_ONCE(!blk_get_queue(blkg->q));
+		blkg_get(blkg);
+	}
+
 	if (oldval != iolat->min_lat_nsec) {
 		iolatency_clear_scaling(blkg);
 	}
@@ -842,6 +856,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
 	ret = 0;
 out:
 	blkg_conf_finish(&ctx);
+	if (ret == 0 && enable) {
+		struct iolatency_grp *tmp = blkg_to_lat(blkg);
+		struct blk_iolatency *blkiolat = tmp->blkiolat;
+
+		blk_mq_freeze_queue(blkg->q);
+
+		if (enable == 1)
+			atomic_inc(&blkiolat->enabled);
+		else if (enable == -1)
+			atomic_dec(&blkiolat->enabled);
+		else
+			WARN_ON_ONCE(1);
+
+		blk_mq_unfreeze_queue(blkg->q);
+
+		blkg_put(blkg);
+		blk_put_queue(blkg->q);
+	}
 	return ret ?: nbytes;
 }
 
@@ -977,8 +1009,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
 {
 	struct iolatency_grp *iolat = pd_to_lat(pd);
 	struct blkcg_gq *blkg = lat_to_blkg(iolat);
+	struct blk_iolatency *blkiolat = iolat->blkiolat;
+	int ret;
 
-	iolatency_set_min_lat_nsec(blkg, 0);
+	ret = iolatency_set_min_lat_nsec(blkg, 0);
+	if (ret == 1)
+		atomic_inc(&blkiolat->enabled);
+	if (ret == -1)
+		atomic_dec(&blkiolat->enabled);
 	iolatency_clear_scaling(blkg);
 }

From 391f552af213985d3d324c60004475759a7030c5 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Fri, 25 Jan 2019 08:12:48 +0800
Subject: [PATCH 09/10] blk-iolatency: warn on negative inflight IO counter

This is to catch any unexpected negative value of the inflight IO
counter.
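The check is the usual decrement-and-test idiom (a sketch mirroring the
hunk below): atomic_dec() returns void, so the new value has to be read
back with atomic_dec_return() before it can be tested for underflow.

	inflight = atomic_dec_return(&rqw->inflight);
	WARN_ON_ONCE(inflight < 0);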
Signed-off-by: Liu Bo
Signed-off-by: Jens Axboe
---
 block/blk-iolatency.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 1893686a9c1f..2620baa1f699 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -592,6 +592,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 	u64 now = ktime_to_ns(ktime_get());
 	bool issue_as_root = bio_issue_as_root_blkg(bio);
 	bool enabled = false;
+	int inflight = 0;
 
 	blkg = bio->bi_blkg;
 	if (!blkg || !bio_flagged(bio, BIO_TRACKED))
@@ -613,7 +614,8 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
 		}
 		rqw = &iolat->rq_wait;
 
-		atomic_dec(&rqw->inflight);
+		inflight = atomic_dec_return(&rqw->inflight);
+		WARN_ON_ONCE(inflight < 0);
 		if (iolat->min_lat_nsec == 0)
 			goto next;
 		iolatency_record_time(iolat, &bio->bi_issue, now,

From 2698484178ca5cbfdde189b1d8809e1528f82a10 Mon Sep 17 00:00:00 2001
From: Liu Bo
Date: Fri, 25 Jan 2019 08:12:49 +0800
Subject: [PATCH 10/10] blk-mq: remove duplicated definition of
 blk_mq_freeze_queue

As the prototype is already defined in "include/linux/blk-mq.h", the
one in "block/blk-mq.h" can be removed.

Signed-off-by: Liu Bo
Signed-off-by: Jens Axboe
---
 block/blk-mq.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-mq.h b/block/blk-mq.h
index d943d46b0785..d0b3dd54ef8d 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -36,7 +36,6 @@ struct blk_mq_ctx {
 	struct kobject kobj;
 } ____cacheline_aligned_in_smp;
 
-void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
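For reference, the declaration that remains is the one in
"include/linux/blk-mq.h" (quoted here as a single-line sketch; the
surrounding lines of that header are not part of this series):

	void blk_mq_freeze_queue(struct request_queue *q);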