NVMe: Convert to blk-mq

This converts the NVMe driver to a blk-mq request-based driver.

The NVMe driver is currently bio-based and implements queue logic within
itself.  By using blk-mq, a lot of these responsibilities can be moved
and simplified.

The patch is divided into the following blocks:

 * Per-command data and cmdid have been moved into the struct request
   field. The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id
   maintenance are now handled by blk-mq through the rq->tag field.

 * The logic for splitting bio's has been moved into the blk-mq layer.
   The driver instead notifies the block layer about limited gap support in
   SG lists.

 * blk-mq handles timeouts and is reimplemented within nvme_timeout().
   This both includes abort handling and command cancelation.

 * Assignment of nvme queues to CPUs are replaced with the blk-mq
   version. The current blk-mq strategy is to assign the number of
   mapped queues and CPUs to provide synergy, while the nvme driver
   assign as many nvme hw queues as possible. This can be implemented in
   blk-mq if needed.

 * NVMe queues are merged with the tags structure of blk-mq.

 * blk-mq takes care of setup/teardown of nvme queues and guards invalid
   accesses. Therefore, RCU-usage for nvme queues can be removed.

 * IO tracing and accounting are handled by blk-mq and therefore removed.

 * Queue suspension logic is replaced with the logic from the block
   layer.

Contributions in this patch from:

  Sam Bradshaw <sbradshaw@micron.com>
  Jens Axboe <axboe@fb.com>
  Keith Busch <keith.busch@intel.com>
  Robert Nelson <rlnelson@google.com>

Acked-by: Keith Busch <keith.busch@intel.com>
Acked-by: Jens Axboe <axboe@fb.com>

Updated for new ->queue_rq() prototype.

Signed-off-by: Jens Axboe <axboe@fb.com>
This commit is contained in:
Matias Bjørling 2014-11-04 08:20:14 -07:00 committed by Jens Axboe
parent 9dbbfab7d5
commit a4aea5623d
3 changed files with 579 additions and 828 deletions

File diff suppressed because it is too large Load diff

View file

@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
nvme_offset += unit_num_blocks;
nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
if (nvme_sc != NVME_SC_SUCCESS) {
nvme_unmap_user_pages(dev,
(is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
c.common.opcode = nvme_cmd_flush;
c.common.nsid = cpu_to_le32(ns->ns_id);
nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
if (res)
goto out;
@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
c.common.opcode = nvme_cmd_flush;
c.common.nsid = cpu_to_le32(ns->ns_id);
nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL);
nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
if (res)
@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
c.dsm.nr = cpu_to_le32(ndesc - 1);
c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
nvme_sc = nvme_submit_io_cmd(dev, &c, NULL);
nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
res = nvme_trans_status_code(hdr, nvme_sc);
dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),

View file

@ -19,6 +19,7 @@
#include <linux/pci.h>
#include <linux/miscdevice.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
struct nvme_bar {
__u64 cap; /* Controller Capabilities */
@ -71,8 +72,10 @@ extern unsigned char nvme_io_timeout;
*/
struct nvme_dev {
struct list_head node;
struct nvme_queue __rcu **queues;
unsigned short __percpu *io_queue;
struct nvme_queue **queues;
struct request_queue *admin_q;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct pci_dev *pci_dev;
struct dma_pool *prp_page_pool;
@ -91,7 +94,6 @@ struct nvme_dev {
struct miscdevice miscdev;
work_func_t reset_workfn;
struct work_struct reset_work;
struct work_struct cpu_work;
char name[12];
char serial[20];
char model[40];
@ -135,7 +137,6 @@ struct nvme_iod {
int offset; /* Of PRP list */
int nents; /* Used in scatterlist */
int length; /* Of data, in bytes */
unsigned long start_time;
dma_addr_t first_dma;
struct list_head node;
struct scatterlist sg[0];
@ -153,12 +154,14 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
*/
void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t);
int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
unsigned long addr, unsigned length);
void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
struct nvme_iod *iod);
int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *);
int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *,
struct nvme_command *, u32 *);
int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
u32 *result);
int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,