From d554b5e1ca64d23e4f839e6531490fee8479fbaf Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 27 Jun 2017 22:27:57 -0400 Subject: [PATCH 01/25] nvme: Quirks for PM1725 controllers PM1725 controllers have a couple of quirks that need to be handled in the driver: - I/O queue depth must be limited to 64 entries on controllers that do not report MQES. - The host interface registers go offline briefly while resetting the chip. Thus a delay is needed before checking whether the controller is ready. Note that the admin queue depth is also limited to 64 on older versions of this board. Since our NVME_AQ_DEPTH is now 32 that is no longer an issue. Signed-off-by: Martin K. Petersen Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32a98e2740ad..343263bcb49a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1908,6 +1908,12 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev_warn(dev->ctrl.device, "detected Apple NVMe controller, " "set queue depth=%u to work around controller resets\n", dev->q_depth); + } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && + (pdev->device == 0xa821 || pdev->device == 0xa822) && + NVME_CAP_MQES(cap) == 0) { + dev->q_depth = 64; + dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " + "set queue depth=%u\n", dev->q_depth); } /* @@ -2454,6 +2460,10 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE(0x1c5f, 0x0540), /* Memblaze Pblaze4 adapter */ .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa821), /* Samsung PM1725 */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, + { PCI_DEVICE(0x144d, 0xa822), /* Samsung PM1725a */ + .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, From d858e5f04e58a42a6e0c8ec74ea15e3ea4bb45d0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 24 Apr 2017 10:58:29 +0300 Subject: [PATCH 02/25] nvme: move queue_count to the nvme_ctrl All all transports use the queue_count in exactly the same, so move it to the generic struct nvme_ctrl. In the future it will also be maintained by the core. Reviewed-by: Christoph Hellwig Reviewed-By: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 35 +++++++++++++++++----------------- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 15 +++++++-------- drivers/nvme/host/rdma.c | 39 +++++++++++++++++++------------------- drivers/nvme/target/loop.c | 15 +++++++-------- 5 files changed, 51 insertions(+), 54 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index ed87214fdc0e..7eb006427caf 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -148,7 +148,6 @@ struct nvme_fc_ctrl { struct device *dev; struct nvme_fc_lport *lport; struct nvme_fc_rport *rport; - u32 queue_count; u32 cnum; u64 association_id; @@ -1614,7 +1613,7 @@ nvme_fc_free_io_queues(struct nvme_fc_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_fc_free_queue(&ctrl->queues[i]); } @@ -1635,10 +1634,10 @@ __nvme_fc_create_hw_queue(struct nvme_fc_ctrl *ctrl, static void nvme_fc_delete_hw_io_queues(struct nvme_fc_ctrl *ctrl) { - struct nvme_fc_queue *queue = &ctrl->queues[ctrl->queue_count - 1]; + struct nvme_fc_queue *queue = &ctrl->queues[ctrl->ctrl.queue_count - 1]; int i; - for (i = ctrl->queue_count - 1; i >= 1; i--, queue--) + for (i = ctrl->ctrl.queue_count - 1; i >= 1; i--, queue--) __nvme_fc_delete_hw_queue(ctrl, queue, i); } @@ -1648,7 +1647,7 @@ nvme_fc_create_hw_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) struct nvme_fc_queue *queue = &ctrl->queues[1]; int i, ret; - for (i = 1; i < ctrl->queue_count; i++, queue++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++, queue++) { ret = __nvme_fc_create_hw_queue(ctrl, queue, i, qsize); if (ret) goto delete_queues; @@ -1667,7 +1666,7 @@ nvme_fc_connect_io_queues(struct nvme_fc_ctrl *ctrl, u16 qsize) { int i, ret = 0; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_fc_connect_queue(ctrl, &ctrl->queues[i], qsize, (qsize / 5)); if (ret) @@ -1685,7 +1684,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); } @@ -2187,7 +2186,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) return ret; } - ctrl->queue_count = opts->nr_io_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; if (!opts->nr_io_queues) return 0; @@ -2204,7 +2203,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) sizeof(struct scatterlist)) + ctrl->lport->ops->fcprqst_priv_sz; ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ret = blk_mq_alloc_tag_set(&ctrl->tag_set); @@ -2258,7 +2257,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) } /* check for io queues existing */ - if (ctrl->queue_count == 1) + if (ctrl->ctrl.queue_count == 1) return 0; nvme_fc_init_io_queues(ctrl); @@ -2381,7 +2380,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the io queues */ - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { if (ctrl->ctrl.state == NVME_CTRL_NEW) ret = nvme_fc_create_io_queues(ctrl); else @@ -2395,7 +2394,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.nr_reconnects = 0; - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_start_queues(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl); nvme_queue_async_events(&ctrl->ctrl); @@ -2447,7 +2446,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * io requests back to the block layer as part of normal completions * (but with error status). */ - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2702,18 +2701,18 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, spin_lock_init(&ctrl->lock); /* io queue count */ - ctrl->queue_count = min_t(unsigned int, + ctrl->ctrl.queue_count = min_t(unsigned int, opts->nr_io_queues, lport->ops->max_hw_queues); - opts->nr_io_queues = ctrl->queue_count; /* so opts has valid value */ - ctrl->queue_count++; /* +1 for admin queue */ + opts->nr_io_queues = ctrl->ctrl.queue_count; /* so opts has valid value */ + ctrl->ctrl.queue_count++; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(struct nvme_fc_queue), - GFP_KERNEL); + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, + sizeof(struct nvme_fc_queue), GFP_KERNEL); if (!ctrl->queues) goto out_free_ida; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d70ff0fdd36b..6c51d92b7fab 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -142,6 +142,7 @@ struct nvme_ctrl { u16 cntlid; u32 ctrl_config; + u32 queue_count; u32 page_size; u32 max_hw_sectors; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 343263bcb49a..6b50c9096fe4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -74,7 +74,6 @@ struct nvme_dev { struct device *dev; struct dma_pool *prp_page_pool; struct dma_pool *prp_small_pool; - unsigned queue_count; unsigned online_queues; unsigned max_qid; int q_depth; @@ -1099,9 +1098,9 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) { int i; - for (i = dev->queue_count - 1; i >= lowest; i--) { + for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { struct nvme_queue *nvmeq = dev->queues[i]; - dev->queue_count--; + dev->ctrl.queue_count--; dev->queues[i] = NULL; nvme_free_queue(nvmeq); } @@ -1221,7 +1220,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->qid = qid; nvmeq->cq_vector = -1; dev->queues[qid] = nvmeq; - dev->queue_count++; + dev->ctrl.queue_count++; return nvmeq; @@ -1441,7 +1440,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) unsigned i, max; int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) { + for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ if (!nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { @@ -1450,7 +1449,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) } } - max = min(dev->max_qid, dev->queue_count - 1); + max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { ret = nvme_create_queue(dev->queues[i], i); if (ret) @@ -2001,7 +2000,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) nvme_stop_queues(&dev->ctrl); queues = dev->online_queues - 1; - for (i = dev->queue_count - 1; i > 0; i--) + for (i = dev->ctrl.queue_count - 1; i > 0; i--) nvme_suspend_queue(dev->queues[i]); if (dead) { @@ -2009,7 +2008,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * probe, before the admin queue is configured. Thus, * queue_count can be 0 here. */ - if (dev->queue_count) + if (dev->ctrl.queue_count) nvme_suspend_queue(dev->queues[0]); } else { nvme_disable_io_queues(dev, queues); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6d4119dfbdaa..c4bacad3fe23 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -103,7 +103,6 @@ struct nvme_rdma_queue { struct nvme_rdma_ctrl { /* read only in the hot path */ struct nvme_rdma_queue *queues; - u32 queue_count; /* other member variables */ struct blk_mq_tag_set tag_set; @@ -349,7 +348,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_rdma_ctrl *ctrl = data; struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1]; - BUG_ON(hctx_idx >= ctrl->queue_count); + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; @@ -587,7 +586,7 @@ static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvme_rdma_stop_and_free_queue(&ctrl->queues[i]); } @@ -595,7 +594,7 @@ static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl) { int i, ret = 0; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) { dev_info(ctrl->ctrl.device, @@ -623,14 +622,14 @@ static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl) if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (ctrl->ctrl.queue_count < 2) return 0; dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", nr_io_queues); - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvme_rdma_init_queue(ctrl, i, ctrl->ctrl.opts->queue_size); if (ret) { @@ -705,7 +704,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_rdma_free_io_queues(ctrl); ret = blk_mq_reinit_tagset(&ctrl->tag_set); @@ -735,7 +734,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) nvme_start_keep_alive(&ctrl->ctrl); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_init_io_queues(ctrl); if (ret) goto requeue; @@ -749,7 +748,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_queue_scan(&ctrl->ctrl); nvme_queue_async_events(&ctrl->ctrl); } @@ -772,15 +771,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_stop_keep_alive(&ctrl->ctrl); - for (i = 0; i < ctrl->queue_count; i++) + for (i = 0; i < ctrl->ctrl.queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) nvme_stop_queues(&ctrl->ctrl); blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->queue_count > 1) + if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, @@ -1624,7 +1623,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); @@ -1716,7 +1715,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) goto del_dead_ctrl; } - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { ret = blk_mq_reinit_tagset(&ctrl->tag_set); if (ret) goto del_dead_ctrl; @@ -1733,7 +1732,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_start_queues(&ctrl->ctrl); nvme_queue_scan(&ctrl->ctrl); nvme_queue_async_events(&ctrl->ctrl); @@ -1785,7 +1784,7 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl) ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) + SG_CHUNK_SIZE * sizeof(struct scatterlist); ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ret = blk_mq_alloc_tag_set(&ctrl->tag_set); @@ -1863,12 +1862,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); - ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ + ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; ret = -ENOMEM; - ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues), + ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues), GFP_KERNEL); if (!ctrl->queues) goto out_uninit_ctrl; @@ -1925,7 +1924,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - if (opts->nr_io_queues) { + if (ctrl->ctrl.queue_count > 1) { nvme_queue_scan(&ctrl->ctrl); nvme_queue_async_events(&ctrl->ctrl); } diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 5f55c683b338..edf0e2ab19e3 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -44,7 +44,6 @@ struct nvme_loop_iod { struct nvme_loop_ctrl { struct nvme_loop_queue *queues; - u32 queue_count; struct blk_mq_tag_set admin_tag_set; @@ -241,7 +240,7 @@ static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_loop_ctrl *ctrl = data; struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; - BUG_ON(hctx_idx >= ctrl->queue_count); + BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); hctx->driver_data = queue; return 0; @@ -307,7 +306,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) { int i; - for (i = 1; i < ctrl->queue_count; i++) + for (i = 1; i < ctrl->ctrl.queue_count; i++) nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); } @@ -330,7 +329,7 @@ static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl) if (ret) goto out_destroy_queues; - ctrl->queue_count++; + ctrl->ctrl.queue_count++; } return 0; @@ -344,7 +343,7 @@ static int nvme_loop_connect_io_queues(struct nvme_loop_ctrl *ctrl) { int i, ret; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = 1; i < ctrl->ctrl.queue_count; i++) { ret = nvmf_connect_io_queue(&ctrl->ctrl, i); if (ret) return ret; @@ -372,7 +371,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); if (error) return error; - ctrl->queue_count = 1; + ctrl->ctrl.queue_count = 1; error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (error) @@ -426,7 +425,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { nvme_stop_keep_alive(&ctrl->ctrl); - if (ctrl->queue_count > 1) { + if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); @@ -559,7 +558,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + SG_CHUNK_SIZE * sizeof(struct scatterlist); ctrl->tag_set.driver_data = ctrl; - ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1; ctrl->tag_set.timeout = NVME_IO_TIMEOUT; ctrl->ctrl.tagset = &ctrl->tag_set; From 20d0dfe65afd3fb59d14720570a6921eb6bf5c1f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 27 Jun 2017 22:16:38 +0300 Subject: [PATCH 03/25] nvme: move ctrl cap to struct nvme_ctrl All transports use either a private cache of controller cap or an on-stack copy, move it to the generic struct nvme_ctrl. In the future it will also be maintained by the core. Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 8 +++----- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 20 +++++++++----------- drivers/nvme/host/rdma.c | 10 +++++----- drivers/nvme/target/loop.c | 7 +++---- 5 files changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 7eb006427caf..2f990d979037 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -152,8 +152,6 @@ struct nvme_fc_ctrl { u64 association_id; - u64 cap; - struct list_head ctrl_list; /* rport->ctrl_list */ struct blk_mq_tag_set admin_tag_set; @@ -2328,7 +2326,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * prior connection values */ - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (ret) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -2336,9 +2334,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap) + 1, ctrl->ctrl.sqsize); - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (ret) goto out_disconnect_admin_queue; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6c51d92b7fab..e0b83311d5de 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -144,6 +144,7 @@ struct nvme_ctrl { u32 ctrl_config; u32 queue_count; + u64 cap; u32 page_size; u32 max_hw_sectors; u16 oncs; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6b50c9096fe4..fe3907a082ba 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1144,8 +1144,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else - nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( - dev->bar + NVME_REG_CAP)); + nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1388,7 +1387,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; result = nvme_remap_bar(dev, db_bar_size(dev, 0)); @@ -1396,13 +1394,13 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ? - NVME_CAP_NSSRC(cap) : 0; + NVME_CAP_NSSRC(dev->ctrl.cap) : 0; if (dev->subsystem && (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl, cap); + result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); if (result < 0) return result; @@ -1421,7 +1419,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(&dev->ctrl, cap); + result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); if (result) return result; @@ -1865,7 +1863,6 @@ static int nvme_dev_add(struct nvme_dev *dev) static int nvme_pci_enable(struct nvme_dev *dev) { - u64 cap; int result = -ENOMEM; struct pci_dev *pdev = to_pci_dev(dev->dev); @@ -1892,10 +1889,11 @@ static int nvme_pci_enable(struct nvme_dev *dev) if (result < 0) return result; - cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); - dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); - dev->db_stride = 1 << NVME_CAP_STRIDE(cap); + dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, + NVME_Q_DEPTH); + dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; /* @@ -1909,7 +1907,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth); } else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG && (pdev->device == 0xa821 || pdev->device == 0xa822) && - NVME_CAP_MQES(cap) == 0) { + NVME_CAP_MQES(dev->ctrl.cap) == 0) { dev->q_depth = 64; dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, " "set queue depth=%u\n", dev->q_depth); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index c4bacad3fe23..2ab0cdb4d881 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -118,7 +118,6 @@ struct nvme_rdma_ctrl { struct blk_mq_tag_set admin_tag_set; struct nvme_rdma_device *device; - u64 cap; u32 max_fr_pages; struct sockaddr_storage addr; @@ -728,7 +727,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (ret) goto requeue; @@ -1573,7 +1572,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags); - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, + &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -1581,9 +1581,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) goto out_cleanup_queue; diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index edf0e2ab19e3..568ed8625696 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -48,7 +48,6 @@ struct nvme_loop_ctrl { struct blk_mq_tag_set admin_tag_set; struct list_head list; - u64 cap; struct blk_mq_tag_set tag_set; struct nvme_loop_iod async_event_iod; struct nvme_ctrl ctrl; @@ -387,7 +386,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) if (error) goto out_cleanup_queue; - error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); if (error) { dev_err(ctrl->ctrl.device, "prop_get NVME_REG_CAP failed\n"); @@ -395,9 +394,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) } ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->ctrl.sqsize); + min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); if (error) goto out_cleanup_queue; From 01ad0990467eaa17ae17db7376a4f02739f558c0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 1 May 2017 00:27:17 +0300 Subject: [PATCH 04/25] nvme-pci: rename to nvme_pci_configure_admin_queue we are going to need the name for the core routine... Reviewed-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index fe3907a082ba..eb729ff70e7d 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1383,7 +1383,7 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) return 0; } -static int nvme_configure_admin_queue(struct nvme_dev *dev) +static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; @@ -2096,7 +2096,7 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - result = nvme_configure_admin_queue(dev); + result = nvme_pci_configure_admin_queue(dev); if (result) goto out; From 7314183d1d0c200def4d0f5a6d978d3b29d28813 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 29 Jun 2017 11:16:49 +0300 Subject: [PATCH 05/25] nvme-fc: don't override opts->nr_io_queues Its what the user passed, so its probably a better idea to keep it intact. Also, limit the number of I/O queues to max online cpus and the lport maximum hw queues. Reviewed-by: Johannes Thumshirn Reviewed-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2f990d979037..996720043b3a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2175,17 +2175,20 @@ static int nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { dev_info(ctrl->ctrl.device, "set_queue_count failed: %d\n", ret); return ret; } - ctrl->ctrl.queue_count = opts->nr_io_queues + 1; - if (!opts->nr_io_queues) + ctrl->ctrl.queue_count = nr_io_queues + 1; + if (!nr_io_queues) return 0; nvme_fc_init_io_queues(ctrl); @@ -2245,15 +2248,19 @@ static int nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + unsigned int nr_io_queues; int ret; - ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + nr_io_queues = min(min(opts->nr_io_queues, num_online_cpus()), + ctrl->lport->ops->max_hw_queues); + ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues); if (ret) { dev_info(ctrl->ctrl.device, "set_queue_count failed: %d\n", ret); return ret; } + ctrl->ctrl.queue_count = nr_io_queues + 1; /* check for io queues existing */ if (ctrl->ctrl.queue_count == 1) return 0; @@ -2702,7 +2709,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->ctrl.queue_count = min_t(unsigned int, opts->nr_io_queues, lport->ops->max_hw_queues); - opts->nr_io_queues = ctrl->ctrl.queue_count; /* so opts has valid value */ ctrl->ctrl.queue_count++; /* +1 for admin queue */ ctrl->ctrl.sqsize = opts->queue_size - 1; From 4c8b99f6b1ff086c1745fcb0511800fba5c4fb34 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 29 Jun 2017 11:10:44 +0300 Subject: [PATCH 06/25] nvme-rdma: update tagset nr_hw_queues after reconnecting/resetting We might have more/less queues once we reconnect/reset. For example due to cpu going online/offline or controller constraints. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2ab0cdb4d881..cfb22531fc16 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -741,6 +741,9 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto requeue; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -1727,6 +1730,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) ret = nvme_rdma_connect_io_queues(ctrl); if (ret) goto del_dead_ctrl; + + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); From 4368c39bf6e6a2bfffb7a72d78912c68f156e8da Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 29 Jun 2017 11:13:43 +0300 Subject: [PATCH 07/25] nvme-loop: update tagset nr_hw_queues after reconnecting/resetting We might have more/less queues once we reconnect/reset. For example due to cpu going online/offline Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/loop.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 568ed8625696..3d51341e62ee 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -508,6 +508,9 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) if (ret) goto out_destroy_io; + blk_mq_update_nr_hw_queues(&ctrl->tag_set, + ctrl->ctrl.queue_count - 1); + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); From cda5fd1ac5c2c42d1b2e1847aa0438b229c5c068 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 29 Jun 2017 11:20:10 +0300 Subject: [PATCH 08/25] nvme-fc: update tagset nr_hw_queues after queues reinit We might have more/less queues once we reconnect/reset. For example due to cpu going online/offline or controller constraints. Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 996720043b3a..1e859ee2b565 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2279,6 +2279,8 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queues; + blk_mq_update_nr_hw_queues(&ctrl->tag_set, nr_io_queues); + return 0; out_delete_hw_queues: From e5859d3a0ea07bf948f4538d47fe6547f6e115be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 4 Jul 2017 10:18:50 +0300 Subject: [PATCH 09/25] nvme-fc: use blk_mq_delay_run_hw_queue instead of open-coding it Cc: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 1e859ee2b565..50cc3b2b0e11 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1966,10 +1966,9 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue, if (ret != -EBUSY) return BLK_STS_IOERR; - if (op->rq) { - blk_mq_stop_hw_queues(op->rq->q); - blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY); - } + if (op->rq) + blk_mq_delay_run_hw_queue(queue->hctx, NVMEFC_QUEUE_DELAY); + return BLK_STS_RESOURCE; } From 5e599d73c1c1816af07f94ddba879499aa39b43c Mon Sep 17 00:00:00 2001 From: Marta Rybczynska Date: Tue, 6 Jun 2017 13:27:21 +0200 Subject: [PATCH 10/25] nvme-rdma: remove race conditions from IB signalling This patch improves the way the RDMA IB signalling is done by using atomic operations for the signalling variable. This avoids race conditions on sig_count. The signalling interval changes slightly and is now the largest power of two not larger than queue depth / 2. ilog() usage idea by Bart Van Assche. Signed-off-by: Marta Rybczynska Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org --- drivers/nvme/host/rdma.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index cfb22531fc16..0ab163e6e640 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -86,7 +86,7 @@ enum nvme_rdma_queue_flags { struct nvme_rdma_queue { struct nvme_rdma_qe *rsp_ring; - u8 sig_count; + atomic_t sig_count; int queue_size; size_t cmnd_capsule_len; struct nvme_rdma_ctrl *ctrl; @@ -523,6 +523,7 @@ static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl, queue->cmnd_capsule_len = sizeof(struct nvme_command); queue->queue_size = queue_size; + atomic_set(&queue->sig_count, 0); queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue, RDMA_PS_TCP, IB_QPT_RC); @@ -1009,17 +1010,16 @@ static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) nvme_rdma_wr_error(cq, wc, "SEND"); } -static inline int nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) +/* + * We want to signal completion at least every queue depth/2. This returns the + * largest power of two that is not above half of (queue size + 1) to optimize + * (avoid divisions). + */ +static inline bool nvme_rdma_queue_sig_limit(struct nvme_rdma_queue *queue) { - int sig_limit; + int limit = 1 << ilog2((queue->queue_size + 1) / 2); - /* - * We signal completion every queue depth/2 and also handle the - * degenerated case of a device with queue_depth=1, where we - * would need to signal every message. - */ - sig_limit = max(queue->queue_size / 2, 1); - return (++queue->sig_count % sig_limit) == 0; + return (atomic_inc_return(&queue->sig_count) & (limit - 1)) == 0; } static int nvme_rdma_post_send(struct nvme_rdma_queue *queue, From fb051339727cd8134dd6ef0305a120e6e265dded Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 2 Jul 2017 15:33:32 +0300 Subject: [PATCH 11/25] nvme-rdma: quiesce/unquiesce admin_q instead of start/stop its hw queues unlike blk_mq_stop_hw_queues and blk_mq_start_stopped_hw_queues quiescing/unquiescing respects the submission path rcu grace. Also make sure to kick the requeue list when appropriate. Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 0ab163e6e640..6f8f9ffcbc00 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -779,7 +779,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) nvme_stop_queues(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); /* We must take care of fastfail/requeue all our inflight requests */ if (ctrl->ctrl.queue_count > 1) @@ -792,7 +792,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) * queues are not a live anymore, so restart the queues to fail fast * new IO */ - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); nvme_rdma_reconnect_or_remove(ctrl); @@ -1636,9 +1636,10 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) if (test_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags)) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl); } From f9c5af5f8ff14a31468546b9b1a876d537019e9a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 2 Jul 2017 15:39:34 +0300 Subject: [PATCH 12/25] nvme-fc: quiesce/unquiesce admin_q instead of start/stop its hw queues unlike blk_mq_stop_hw_queues and blk_mq_start_stopped_hw_queues quiescing/unquiescing respects the submission path rcu grace. Also, make sure to unquiesce before cleanup the admin queue. Reviewed-by: Ming Lei Reviewed-By: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 50cc3b2b0e11..8d55e7827932 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1703,6 +1703,7 @@ nvme_fc_ctrl_free(struct kref *ref) list_del(&ctrl->ctrl_list); spin_unlock_irqrestore(&ctrl->rport->lock, flags); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); @@ -2321,7 +2322,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) goto out_delete_hw_queue; if (ctrl->ctrl.state != NVME_CTRL_NEW) - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) @@ -2475,7 +2476,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); From c1c0ffff3a99caf40c3bd21447135db459194123 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 2 Jul 2017 15:40:17 +0300 Subject: [PATCH 13/25] nvme-loop: quiesce/unquiesce admin_q instead of start/stop its hw queues unlike blk_mq_stop_hw_queues and blk_mq_start_stopped_hw_queues quiescing/unquiescing respects the submission path rcu grace. Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/nvme/target/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 3d51341e62ee..6a0b70685e77 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -434,9 +434,10 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) if (ctrl->ctrl.state == NVME_CTRL_LIVE) nvme_shutdown_ctrl(&ctrl->ctrl); - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_loop_destroy_admin_queue(ctrl); } From c81545f991a6612d3bdab18a71b3487023ec6b69 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 2 Jul 2017 15:53:27 +0300 Subject: [PATCH 14/25] nvme-pci: quiesce/unquiesce admin_q instead of start/stop its hw queues unlike blk_mq_stop_hw_queues and blk_mq_start_stopped_hw_queues quiescing/unquiescing respects the submission path rcu grace. Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index eb729ff70e7d..d9c0010a9bbc 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1125,7 +1125,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) - blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); + blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q); pci_free_irq(to_pci_dev(nvmeq->dev->dev), vector, nvmeq); @@ -1315,7 +1315,7 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev) * user requests may be waiting on a stopped queue. Start the * queue to flush these to completion. */ - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + blk_mq_unquiesce_queue(dev->ctrl.admin_q); blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } @@ -1352,7 +1352,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) return -ENODEV; } } else - blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); + blk_mq_unquiesce_queue(dev->ctrl.admin_q); return 0; } From 8d7b8fafad87c3404f72ce2d36c79c48be1129a6 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 4 Jul 2017 18:16:58 +0300 Subject: [PATCH 15/25] nvme: kick requeue list when requeueing a request instead of when starting the queues When we requeue a request, we can always insert the request back to the scheduler instead of doing it when restarting the queues and kicking the requeue work, so get rid of the requeue kick in nvme (core and drivers). Also, now there is no need start hw queues in nvme_kill_queues We don't stop the hw queues anymore, so no need to start them. Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d70df1d0072d..48cafaa6fbc5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -131,7 +131,7 @@ void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { nvme_req(req)->retries++; - blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q)); + blk_mq_requeue_request(req, true); return; } @@ -2694,9 +2694,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ctrl->admin_q); - /* Forcibly start all queues to avoid having stuck requests */ - blk_mq_start_hw_queues(ctrl->admin_q); - list_for_each_entry(ns, &ctrl->namespaces, list) { /* * Revalidating a dead namespace sets capacity to 0. This will @@ -2709,16 +2706,6 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) /* Forcibly unquiesce queues to avoid blocking dispatch */ blk_mq_unquiesce_queue(ns->queue); - - /* - * Forcibly start all queues to avoid having stuck requests. - * Note that we must ensure the queues are not stopped - * when the final removal happens. - */ - blk_mq_start_hw_queues(ns->queue); - - /* draining requests in requeue list */ - blk_mq_kick_requeue_list(ns->queue); } mutex_unlock(&ctrl->namespaces_mutex); } @@ -2787,10 +2774,8 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) struct nvme_ns *ns; mutex_lock(&ctrl->namespaces_mutex); - list_for_each_entry(ns, &ctrl->namespaces, list) { + list_for_each_entry(ns, &ctrl->namespaces, list) blk_mq_unquiesce_queue(ns->queue); - blk_mq_kick_requeue_list(ns->queue); - } mutex_unlock(&ctrl->namespaces_mutex); } EXPORT_SYMBOL_GPL(nvme_start_queues); From b52c2e92546ee794a5bbab4d8ea435c1de85a8cb Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 4 Jul 2017 09:57:09 +0300 Subject: [PATCH 16/25] nbd: quiesce request queues to make sure no submissions are inflight Unlike blk_mq_stop_hw_queues, blk_mq_quiesce_queue respects the submission path rcu grace. quiesce the queue before iterating on live tags. Reviewed-by: Ming Lei Acked-by: Josef Bacik Signed-off-by: Sagi Grimberg --- drivers/block/nbd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 977ec960dd2f..dea7d85134ee 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -661,9 +661,9 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved) static void nbd_clear_que(struct nbd_device *nbd) { - blk_mq_stop_hw_queues(nbd->disk->queue); + blk_mq_quiesce_queue(nbd->disk->queue); blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); - blk_mq_start_hw_queues(nbd->disk->queue); + blk_mq_unquiesce_queue(nbd->disk->queue); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } From 436c15ab6596b12cfee7618ecaa69a4341ac3c51 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 4 Jul 2017 10:00:41 +0300 Subject: [PATCH 17/25] mtip32xx: quiesce request queues to make sure no submissions are inflight Unlike blk_mq_stop_hw_queues, blk_mq_quiesce_queue respects the submission path rcu grace. quiesce the queue before iterating on live tags, or performing device io quiescing. While were at it, verify that the request started in mtip_abort_cmd amd mtip_queue_cmd tag iteration calls. Reviewed-by: Ming Lei Signed-off-by: Sagi Grimberg --- drivers/block/mtip32xx/mtip32xx.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 61b046f256ca..87717a1a5c89 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -950,7 +950,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) unsigned long to; bool active = true; - blk_mq_stop_hw_queues(port->dd->queue); + blk_mq_quiesce_queue(port->dd->queue); to = jiffies + msecs_to_jiffies(timeout); do { @@ -970,10 +970,10 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) break; } while (time_before(jiffies, to)); - blk_mq_start_stopped_hw_queues(port->dd->queue, true); + blk_mq_unquiesce_queue(port->dd->queue); return active ? -EBUSY : 0; err_fault: - blk_mq_start_stopped_hw_queues(port->dd->queue, true); + blk_mq_unquiesce_queue(port->dd->queue); return -EFAULT; } @@ -2737,6 +2737,9 @@ static void mtip_abort_cmd(struct request *req, void *data, struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; + if (!blk_mq_request_started(req)) + return; + dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag); clear_bit(req->tag, dd->port->cmds_to_issue); @@ -2749,6 +2752,9 @@ static void mtip_queue_cmd(struct request *req, void *data, { struct driver_data *dd = data; + if (!blk_mq_request_started(req)) + return; + set_bit(req->tag, dd->port->cmds_to_issue); blk_abort_request(req); } @@ -2814,6 +2820,8 @@ static int mtip_service_thread(void *data) dev_warn(&dd->pdev->dev, "Completion workers still active!"); + blk_mq_quiesce_queue(dd->queue); + spin_lock(dd->queue->queue_lock); blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd); @@ -2826,6 +2834,8 @@ static int mtip_service_thread(void *data) mtip_abort_cmd, dd); clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags); + + blk_mq_unquiesce_queue(dd->queue); } if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { @@ -3995,8 +4005,9 @@ static int mtip_block_remove(struct driver_data *dd) dd->disk->disk_name); blk_freeze_queue_start(dd->queue); - blk_mq_stop_hw_queues(dd->queue); + blk_mq_quiesce_queue(dd->queue); blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); + blk_mq_unquiesce_queue(dd->queue); /* * Delete our gendisk structure. This also removes the device From 9b3e99058453399b506087b3ac99d67a80343333 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 4 Jul 2017 10:03:03 +0300 Subject: [PATCH 18/25] virtio_blk: quiesce/unquiesce live IO when entering PM states Without it its not guaranteed that no .queue_rq is inflight. Reviewed-by: Ming Lei Acked-by: Michael S. Tsirkin Cc: virtio-dev@lists.oasis-open.org Cc: Jason Wang Signed-off-by: Sagi Grimberg --- drivers/block/virtio_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 0297ad7c1452..4e02aa5fdac0 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -840,7 +840,7 @@ static int virtblk_freeze(struct virtio_device *vdev) /* Make sure no work handler is accessing the device. */ flush_work(&vblk->config_work); - blk_mq_stop_hw_queues(vblk->disk->queue); + blk_mq_quiesce_queue(vblk->disk->queue); vdev->config->del_vqs(vdev); return 0; @@ -857,7 +857,7 @@ static int virtblk_restore(struct virtio_device *vdev) virtio_device_ready(vdev); - blk_mq_start_stopped_hw_queues(vblk->disk->queue, true); + blk_mq_unquiesce_queue(vblk->disk->queue); return 0; } #endif From d09f2b45f346f0a9e5e1b5fcea531b1b393671dc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 2 Jul 2017 10:56:43 +0300 Subject: [PATCH 19/25] nvme: split nvme_uninit_ctrl into stop and uninit Usually before we teardown the controller we want to: 1. complete/cancel any ctrl inflight works 2. remove ctrl namespaces (only for removal though, resets shouldn't remove any namespaces). but we do not want to destroy the controller device as we might use it for logging during the teardown stage. This patch adds nvme_start_ctrl() which queues inflight controller works (aen, ns scan, queue start and keep-alive if kato is set) and nvme_stop_ctrl() which cancels the works namespace removal is left to the callers to handle. Move nvme_uninit_ctrl after we are done with the controller device. Reviewed-by: Keith Busch Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 21 +++++++++++++++++++-- drivers/nvme/host/fc.c | 16 ++++------------ drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 16 ++++------------ drivers/nvme/host/rdma.c | 29 ++++++++--------------------- drivers/nvme/target/loop.c | 19 ++++++------------- 6 files changed, 43 insertions(+), 60 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 48cafaa6fbc5..cb96f4a7ae3a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2591,12 +2591,29 @@ static void nvme_release_instance(struct nvme_ctrl *ctrl) spin_unlock(&dev_list_lock); } -void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { + nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); flush_work(&ctrl->scan_work); - nvme_remove_namespaces(ctrl); +} +EXPORT_SYMBOL_GPL(nvme_stop_ctrl); +void nvme_start_ctrl(struct nvme_ctrl *ctrl) +{ + if (ctrl->kato) + nvme_start_keep_alive(ctrl); + + if (ctrl->queue_count > 1) { + nvme_queue_scan(ctrl); + nvme_queue_async_events(ctrl); + nvme_start_queues(ctrl); + } +} +EXPORT_SYMBOL_GPL(nvme_start_ctrl); + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) +{ device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); spin_lock(&dev_list_lock); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 8d55e7827932..d666ada39a9b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2232,7 +2232,6 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl) out_delete_hw_queues: nvme_fc_delete_hw_io_queues(ctrl); out_cleanup_blk_queue: - nvme_stop_keep_alive(&ctrl->ctrl); blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: blk_mq_free_tag_set(&ctrl->tag_set); @@ -2366,8 +2365,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) goto out_disconnect_admin_queue; } - nvme_start_keep_alive(&ctrl->ctrl); - /* FC-NVME supports normal SGL Data Block Descriptors */ if (opts->queue_size > ctrl->ctrl.maxcmd) { @@ -2401,17 +2398,12 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ctrl->ctrl.nr_reconnects = 0; - if (ctrl->ctrl.queue_count > 1) { - nvme_start_queues(&ctrl->ctrl); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ out_term_aen_ops: nvme_fc_term_aen_ops(ctrl); - nvme_stop_keep_alive(&ctrl->ctrl); out_disconnect_admin_queue: /* send a Disconnect(association) LS to fc-nvme target */ nvme_fc_xmt_disconnect_assoc(ctrl); @@ -2434,8 +2426,6 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { unsigned long flags; - nvme_stop_keep_alive(&ctrl->ctrl); - spin_lock_irqsave(&ctrl->lock, flags); ctrl->flags |= FCCTRL_TERMIO; ctrl->iocnt = 0; @@ -2517,7 +2507,8 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); - + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); /* * kill the association on the link side. this will block * waiting for io to terminate @@ -2612,6 +2603,7 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) container_of(work, struct nvme_fc_ctrl, ctrl.reset_work); int ret; + nvme_stop_ctrl(&ctrl->ctrl); /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index e0b83311d5de..8f2a168ddc01 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -280,6 +280,8 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_start_ctrl(struct nvme_ctrl *ctrl); +void nvme_stop_ctrl(struct nvme_ctrl *ctrl); void nvme_put_ctrl(struct nvme_ctrl *ctrl); int nvme_init_identify(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index d9c0010a9bbc..882ed3677117 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2134,15 +2134,6 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; - /* - * A controller that can not execute IO typically requires user - * intervention to correct. For such degraded controllers, the driver - * should not submit commands the user did not request, so skip - * registering for asynchronous event notification on this condition. - */ - if (dev->online_queues > 1) - nvme_queue_async_events(&dev->ctrl); - /* * Keep the controller around but remove all namespaces if we don't have * any working I/O queue. @@ -2163,8 +2154,7 @@ static void nvme_reset_work(struct work_struct *work) goto out; } - if (dev->online_queues > 1) - nvme_queue_scan(&dev->ctrl); + nvme_start_ctrl(&dev->ctrl); return; out: @@ -2341,11 +2331,13 @@ static void nvme_remove(struct pci_dev *pdev) } flush_work(&dev->ctrl.reset_work); - nvme_uninit_ctrl(&dev->ctrl); + nvme_stop_ctrl(&dev->ctrl); + nvme_remove_namespaces(&dev->ctrl); nvme_dev_disable(dev, true); nvme_free_host_mem(dev); nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); + nvme_uninit_ctrl(&dev->ctrl); nvme_release_prp_pools(dev); nvme_dev_unmap(dev); nvme_put_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 6f8f9ffcbc00..ccdbd99d5a1c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -732,8 +732,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ret) goto requeue; - nvme_start_keep_alive(&ctrl->ctrl); - if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_init_io_queues(ctrl); if (ret) @@ -751,10 +749,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - if (ctrl->ctrl.queue_count > 1) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); @@ -772,7 +767,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) struct nvme_rdma_ctrl, err_work); int i; - nvme_stop_keep_alive(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); for (i = 0; i < ctrl->ctrl.queue_count; i++) clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags); @@ -1603,8 +1598,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) if (error) goto out_cleanup_queue; - nvme_start_keep_alive(&ctrl->ctrl); - return 0; out_cleanup_queue: @@ -1622,7 +1615,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl) static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) { - nvme_stop_keep_alive(&ctrl->ctrl); cancel_work_sync(&ctrl->err_work); cancel_delayed_work_sync(&ctrl->reconnect_work); @@ -1645,10 +1637,12 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl) static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) { - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); if (shutdown) nvme_rdma_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); if (ctrl->ctrl.tagset) { blk_cleanup_queue(ctrl->ctrl.connect_q); blk_mq_free_tag_set(&ctrl->tag_set); @@ -1710,6 +1704,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) int ret; bool changed; + nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl); ret = nvme_rdma_configure_admin_queue(ctrl); @@ -1739,11 +1734,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - if (ctrl->ctrl.queue_count > 1) { - nvme_start_queues(&ctrl->ctrl); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return; @@ -1931,15 +1922,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); mutex_unlock(&nvme_rdma_ctrl_mutex); - if (ctrl->ctrl.queue_count > 1) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return &ctrl->ctrl; out_remove_admin_queue: - nvme_stop_keep_alive(&ctrl->ctrl); nvme_rdma_destroy_admin_queue(ctrl); out_kfree_queues: kfree(ctrl->queues); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 6a0b70685e77..717ed7ddb2f6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -407,8 +407,6 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) if (error) goto out_cleanup_queue; - nvme_start_keep_alive(&ctrl->ctrl); - return 0; out_cleanup_queue: @@ -422,8 +420,6 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { - nvme_stop_keep_alive(&ctrl->ctrl); - if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, @@ -446,8 +442,10 @@ static void nvme_loop_del_ctrl_work(struct work_struct *work) struct nvme_loop_ctrl *ctrl = container_of(work, struct nvme_loop_ctrl, delete_work); - nvme_uninit_ctrl(&ctrl->ctrl); + nvme_stop_ctrl(&ctrl->ctrl); + nvme_remove_namespaces(&ctrl->ctrl); nvme_loop_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); nvme_put_ctrl(&ctrl->ctrl); } @@ -495,6 +493,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) bool changed; int ret; + nvme_stop_ctrl(&ctrl->ctrl); nvme_loop_shutdown_ctrl(ctrl); ret = nvme_loop_configure_admin_queue(ctrl); @@ -515,10 +514,7 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work) changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - - nvme_start_queues(&ctrl->ctrl); + nvme_start_ctrl(&ctrl->ctrl); return; @@ -653,10 +649,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); mutex_unlock(&nvme_loop_ctrl_mutex); - if (opts->nr_io_queues) { - nvme_queue_scan(&ctrl->ctrl); - nvme_queue_async_events(&ctrl->ctrl); - } + nvme_start_ctrl(&ctrl->ctrl); return &ctrl->ctrl; From 842594c8775b585c58459e044708c0335b6aa6b7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 5 Jul 2017 09:56:13 +0300 Subject: [PATCH 20/25] nvme-rdma: unconditionally recycle the request mr When our RDMA queue-pair is torn down with high load of I/O traffic, we have no way of knowing if the memory region was actually registered by the reg_mr work request as it completion flushes with error (hw might have done it or not). So in order to not deal with all this uncertanty, we simply recycle the MR in reinit_request. Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/rdma.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ccdbd99d5a1c..da04df1af231 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -272,9 +272,6 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; - if (!req->mr->need_inval) - goto out; - ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, From d1438ad8f3eec7207618b8e01f9f3eec7b6f67c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 7 Jul 2017 18:08:25 -0700 Subject: [PATCH 21/25] nvme_fc/nvmet_fc: revise Create Association descriptor length Revises the Create Association LS for the amount of pad expected in 1.16. Add defines for the minimum lengths that a target can accept (e.g. variable pad lengths) Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- include/linux/nvme-fc.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index bc711a10be05..21c37e39e41a 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -17,6 +17,7 @@ /* * This file contains definitions relative to FC-NVME r1.14 (16-020vB). + * The fcnvme_lsdesc_cr_assoc_cmd struct reflects expected r1.16 content. */ #ifndef _NVME_FC_H @@ -193,9 +194,21 @@ struct fcnvme_lsdesc_cr_assoc_cmd { uuid_t hostid; u8 hostnqn[FCNVME_ASSOC_HOSTNQN_LEN]; u8 subnqn[FCNVME_ASSOC_SUBNQN_LEN]; - u8 rsvd632[384]; + __be32 rsvd584[108]; /* pad to 1016 bytes, + * which makes overall LS rqst + * payload 1024 bytes + */ }; +#define FCNVME_LSDESC_CRA_CMD_DESC_MINLEN \ + offsetof(struct fcnvme_lsdesc_cr_assoc_cmd, rsvd584) + +#define FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN \ + (FCNVME_LSDESC_CRA_CMD_DESC_MINLEN - \ + offsetof(struct fcnvme_lsdesc_cr_assoc_cmd, ersp_ratio)) + + + /* FCNVME_LSDESC_CREATE_CONN_CMD */ struct fcnvme_lsdesc_cr_conn_cmd { __be32 desc_tag; /* FCNVME_LSDESC_xxx */ @@ -273,6 +286,14 @@ struct fcnvme_ls_cr_assoc_rqst { struct fcnvme_lsdesc_cr_assoc_cmd assoc_cmd; }; +#define FCNVME_LSDESC_CRA_RQST_MINLEN \ + (offsetof(struct fcnvme_ls_cr_assoc_rqst, assoc_cmd) + \ + FCNVME_LSDESC_CRA_CMD_DESC_MINLEN) + +#define FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN \ + FCNVME_LSDESC_CRA_CMD_DESC_MINLEN + + struct fcnvme_ls_cr_assoc_acc { struct fcnvme_ls_acc_hdr hdr; struct fcnvme_lsdesc_assoc_id associd; From 4cb7ca8073e1f226487168155c9a887079d605e4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 7 Jul 2017 18:08:26 -0700 Subject: [PATCH 22/25] nvmet_fc: Accept variable pad lengths on Create Association LS Target validation of the Create Association LS revised to accept any LS as long as all non-pad data has been received. This allows a (newer) target to accept the LS from older initiators with varying pad lengths. Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/target/fc.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 7692a96c9065..1e6dcc241b3c 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -1164,18 +1164,24 @@ nvmet_fc_ls_create_association(struct nvmet_fc_tgtport *tgtport, memset(acc, 0, sizeof(*acc)); - if (iod->rqstdatalen < sizeof(struct fcnvme_ls_cr_assoc_rqst)) + /* + * FC-NVME spec changes. There are initiators sending different + * lengths as padding sizes for Create Association Cmd descriptor + * was incorrect. + * Accept anything of "minimum" length. Assume format per 1.15 + * spec (with HOSTID reduced to 16 bytes), ignore how long the + * trailing pad length is. + */ + if (iod->rqstdatalen < FCNVME_LSDESC_CRA_RQST_MINLEN) ret = VERR_CR_ASSOC_LEN; - else if (rqst->desc_list_len != - fcnvme_lsdesc_len( - sizeof(struct fcnvme_ls_cr_assoc_rqst))) + else if (rqst->desc_list_len < + cpu_to_be32(FCNVME_LSDESC_CRA_RQST_MIN_LISTLEN)) ret = VERR_CR_ASSOC_RQST_LEN; else if (rqst->assoc_cmd.desc_tag != cpu_to_be32(FCNVME_LSDESC_CREATE_ASSOC_CMD)) ret = VERR_CR_ASSOC_CMD; - else if (rqst->assoc_cmd.desc_len != - fcnvme_lsdesc_len( - sizeof(struct fcnvme_lsdesc_cr_assoc_cmd))) + else if (rqst->assoc_cmd.desc_len < + cpu_to_be32(FCNVME_LSDESC_CRA_CMD_DESC_MIN_DESCLEN)) ret = VERR_CR_ASSOC_CMD_LEN; else if (!rqst->assoc_cmd.ersp_ratio || (be16_to_cpu(rqst->assoc_cmd.ersp_ratio) >= From 2ee0e4ed5ca24c0642a7b72d75b4fe6dfc0a8db8 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Jul 2017 12:26:52 +0300 Subject: [PATCH 23/25] nvme-pci: compile warnings in nvme_alloc_host_mem() "i" should be signed or it could cause a forever loop on the cleanup path. "size" can be used uninitialized. Fixes: 87ad72a59a38 ("nvme-pci: implement host memory buffer support") Signed-off-by: Dan Carpenter Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 882ed3677117..73fddf2c99f9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1582,9 +1582,10 @@ static void nvme_free_host_mem(struct nvme_dev *dev) static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { struct nvme_host_mem_buf_desc *descs; - u32 chunk_size, max_entries, i = 0; + u32 chunk_size, max_entries; + int i = 0; void **bufs; - u64 size, tmp; + u64 size = 0, tmp; /* start big and work our way down */ chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER); From b27c1e683d2c8cd666a042b02096d18237911a37 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Mon, 10 Jul 2017 16:46:59 +0800 Subject: [PATCH 24/25] nvme-pci: add module parameter for io queue depth Adjust io queue depth more easily, and make sure io queue depth >= 2. Signed-off-by: weiping zhang Signed-off-by: Sagi Grimberg --- drivers/nvme/host/pci.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 73fddf2c99f9..48d3ed3d48d1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,7 +35,6 @@ #include "nvme.h" -#define NVME_Q_DEPTH 1024 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) @@ -57,6 +56,16 @@ module_param(max_host_mem_size_mb, uint, 0444); MODULE_PARM_DESC(max_host_mem_size_mb, "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); +static int io_queue_depth_set(const char *val, const struct kernel_param *kp); +static const struct kernel_param_ops io_queue_depth_ops = { + .set = io_queue_depth_set, + .get = param_get_int, +}; + +static int io_queue_depth = 1024; +module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644); +MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2"); + struct nvme_dev; struct nvme_queue; @@ -104,6 +113,17 @@ struct nvme_dev { void **host_mem_desc_bufs; }; +static int io_queue_depth_set(const char *val, const struct kernel_param *kp) +{ + int n = 0, ret; + + ret = kstrtoint(val, 10, &n); + if (ret != 0 || n < 2) + return -EINVAL; + + return param_set_int(val, kp); +} + static inline unsigned int sq_idx(unsigned int qid, u32 stride) { return qid * 2 * stride; @@ -1893,7 +1913,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP); dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, - NVME_Q_DEPTH); + io_queue_depth); dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; From c2f30f08c115a6bb9061afb4ba314f7e41f20686 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 10 Jul 2017 17:24:02 +0300 Subject: [PATCH 25/25] nvmet: avoid unneeded assignment of submit_bio return value We actually using the cookie returned from the last submit_bio call. Signed-off-by: Max Gurtovoy Reviewed-by: Christoph Hellwig Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 40128793e613..3b4d47a6abdb 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -85,7 +85,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) bio_set_op_attrs(bio, op, op_flags); bio_chain(bio, prev); - cookie = submit_bio(prev); + submit_bio(prev); } sector += sg->length >> 9;