From 38a8c4d1d45006841f0643f4cb29b5e50758837c Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Fri, 31 Mar 2023 11:00:56 -0700
Subject: [PATCH 1/5] blk-mq: directly poll requests

Polling needs a bio with a valid bi_bdev, but neither is guaranteed for
polled driver requests. Make request-based polling directly use blk-mq's
polling function instead.

When executing a request from a polled hctx, we know the request's
cookie and that it's from a live blk-mq queue that supports polling, so
we can safely skip everything that bio_poll provides.

Cc: stable@kernel.org
Reported-by: Martin Belanger
Reported-by: Daniel Wagner
Signed-off-by: Keith Busch
Tested-by: Daniel Wagner
Reviewed-by: Daniel Wagner
Reviewed-by: Chaitanya Kulkarni
Reviewed-by: Sagi Grimberg
Reviewed-by: Christoph Hellwig
Tested-by: Shin'ichiro Kawasaki
Link: https://lore.kernel.org/r/20230331180056.1155862-1-kbusch@meta.com
Signed-off-by: Jens Axboe
---
 block/blk-mq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index cf1a39adf9a5..f0ea9dcfb966 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1359,8 +1359,6 @@ bool blk_rq_is_poll(struct request *rq)
 		return false;
 	if (rq->mq_hctx->type != HCTX_TYPE_POLL)
 		return false;
-	if (WARN_ON_ONCE(!rq->bio))
-		return false;
 	return true;
 }
 EXPORT_SYMBOL_GPL(blk_rq_is_poll);
@@ -1368,7 +1366,7 @@ EXPORT_SYMBOL_GPL(blk_rq_is_poll);
 static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
 {
 	do {
-		bio_poll(rq->bio, NULL, 0);
+		blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0);
 		cond_resched();
 	} while (!completion_done(wait));
 }

From d3205ab75e99a47539ec91ef85ba488f4ddfeaa9 Mon Sep 17 00:00:00 2001
From: Keith Busch
Date: Mon, 3 Apr 2023 13:09:25 -0700
Subject: [PATCH 2/5] nvme: fix discard support without oncs

The device can report discard support without setting the ONCS DSM bit.
When not set, the driver clears max_discard_size expecting it to be set
later. We don't know the size until we have the namespace format,
though, so setting it is deferred until one is configured, and because
of that initial clearing the driver was abandoning the discard settings
entirely.

Move the max_discard_size calculation above the check for a '0' discard
size.
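
To make the ordering problem concrete, here is a small stand-alone model
of the two orderings. It is only a sketch with hypothetical names
(model_ctrl, max_discard), not the driver code; the actual change is the
reordering in the nvme_config_discard() hunks below.

#include <stdio.h>

/*
 * Hypothetical model, not driver code: "dmrsl" stands in for the DMRSL
 * value reported by the controller and "max_discard" for
 * ctrl->max_discard_sectors, which starts out at 0 when the ONCS DSM bit
 * is not set.
 */
struct model_ctrl {
	unsigned int dmrsl;
	unsigned int max_discard;
};

/* Old order: the zero check runs before DMRSL is ever considered. */
static unsigned int limit_old_order(struct model_ctrl *c)
{
	if (c->max_discard == 0)
		return 0;			/* discard abandoned here */
	if (c->dmrsl)
		c->max_discard = c->dmrsl;
	return c->max_discard;
}

/* New order: derive the limit from DMRSL first, then check for zero. */
static unsigned int limit_new_order(struct model_ctrl *c)
{
	if (c->dmrsl)
		c->max_discard = c->dmrsl;
	if (c->max_discard == 0)
		return 0;
	return c->max_discard;
}

int main(void)
{
	struct model_ctrl a = { .dmrsl = 1024, .max_discard = 0 };
	struct model_ctrl b = a;

	printf("old order: %u\n", limit_old_order(&a));	/* 0: limit lost */
	printf("new order: %u\n", limit_new_order(&b));	/* 1024: limit applied */
	return 0;
}
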
Fixes: 1a86924e4f46475 ("nvme: fix interpretation of DMRSL")
Reported-by: Laurence Oberman
Signed-off-by: Keith Busch
Reviewed-by: Niklas Cassel
Reviewed-by: Sagi Grimberg
Tested-by: Laurence Oberman
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 53ef028596c6..d6a9bac91a4c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1674,6 +1674,9 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 	struct request_queue *queue = disk->queue;
 	u32 size = queue_logical_block_size(queue);
 
+	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
+		ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
+
 	if (ctrl->max_discard_sectors == 0) {
 		blk_queue_max_discard_sectors(queue, 0);
 		return;
@@ -1688,9 +1691,6 @@ static void nvme_config_discard(struct gendisk *disk, struct nvme_ns *ns)
 	if (queue->limits.max_discard_sectors)
 		return;
 
-	if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns, UINT_MAX))
-		ctrl->max_discard_sectors = nvme_lba_to_sect(ns, ctrl->dmrsl);
-
 	blk_queue_max_discard_sectors(queue, ctrl->max_discard_sectors);
 	blk_queue_max_discard_segments(queue, ctrl->max_discard_segments);
 

From 8c68ae3b22fa6fb2dbe83ef955ff10936503d28e Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 5 Apr 2023 20:00:46 -0600
Subject: [PATCH 3/5] ublk: read any SQE values upfront

Since SQE memory is shared with userspace, we should only be reading it
once. We cannot read it multiple times, particularly when it's read once
for validation and then read again for the actual use.

ublk_ch_uring_cmd() is safe when called as a retry operation, as the
memory backing is stable at that point. But for normal issue, we want to
ensure that we only read ublksrv_io_cmd once. Wrap the function in a
helper that reads the value into an on-stack copy of the struct.

Cc: stable@vger.kernel.org # 6.0+
Reviewed-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c73cc57ec547..a36934ee9739 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1261,9 +1261,10 @@ static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
 	ublk_queue_cmd(ubq, req);
 }
 
-static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
+			       unsigned int issue_flags,
+			       struct ublksrv_io_cmd *ub_cmd)
 {
-	struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd;
 	struct ublk_device *ub = cmd->file->private_data;
 	struct ublk_queue *ubq;
 	struct ublk_io *io;
@@ -1362,6 +1363,23 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	return -EIOCBQUEUED;
 }
 
+static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd;
+	struct ublksrv_io_cmd ub_cmd;
+
+	/*
+	 * Not necessary for async retry, but let's keep it simple and always
+	 * copy the values to avoid any potential reuse.
+	 */
+	ub_cmd.q_id = READ_ONCE(ub_src->q_id);
+	ub_cmd.tag = READ_ONCE(ub_src->tag);
+	ub_cmd.result = READ_ONCE(ub_src->result);
+	ub_cmd.addr = READ_ONCE(ub_src->addr);
+
+	return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
+}
+
 static const struct file_operations ublk_ch_fops = {
 	.owner = THIS_MODULE,
 	.open = ublk_ch_open,

From 1d1665279a845d16c93687389e364386e3fe0f38 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Thu, 6 Apr 2023 20:40:59 +0800
Subject: [PATCH 4/5] block: ublk: make sure that block size is set correctly

The block size is a key block layer setting, and a bad block size can
easily panic the kernel. Make sure that the block size is set correctly.

Meanwhile, if ublk_validate_params() fails, clear ub->params so that the
disk is prevented from being added.

Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver")
Reported-and-tested-by: Breno Leitao
Signed-off-by: Ming Lei
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index a36934ee9739..604c1a13c76e 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -246,7 +246,7 @@ static int ublk_validate_params(const struct ublk_device *ub)
 	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
 		const struct ublk_param_basic *p = &ub->params.basic;
 
-		if (p->logical_bs_shift > PAGE_SHIFT)
+		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
 			return -EINVAL;
 
 		if (p->logical_bs_shift > p->physical_bs_shift)
@@ -1970,6 +1970,8 @@ static int ublk_ctrl_set_params(struct ublk_device *ub,
 		/* clear all we don't support yet */
 		ub->params.types &= UBLK_PARAM_TYPE_ALL;
 		ret = ublk_validate_params(ub);
+		if (ret)
+			ub->params.types = 0;
 	}
 	mutex_unlock(&ub->mutex);
 

From 3723091ea1884d599cc8b8bf719d6f42e8d4d8b1 Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Wed, 22 Mar 2023 11:59:26 +0800
Subject: [PATCH 5/5] block: don't set GD_NEED_PART_SCAN if scan partition failed

Currently, if disk_scan_partitions() fails, GD_NEED_PART_SCAN is still
set, and the partition scan will proceed again when blkdev_get_by_dev()
is called. However, this causes a problem: re-assembling a partitioned
raid device will create partitions for the underlying disks.

Test procedure:

mdadm -CR /dev/md0 -l 1 -n 2 /dev/sda /dev/sdb -e 1.0
sgdisk -n 0:0:+100MiB /dev/md0
blockdev --rereadpt /dev/sda
blockdev --rereadpt /dev/sdb
mdadm -S /dev/md0
mdadm -A /dev/md0 /dev/sda /dev/sdb

Test result: the underlying disk partitions and the raid partition can
be observed at the same time.

Note that this can still happen in some corner cases where
GD_NEED_PART_SCAN can be set for the underlying disk while re-assembling
the raid device.
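
To illustrate why the placement of the flag matters, here is a small
stand-alone model of the old and new flows. It is only a sketch with
hypothetical names (model_disk, need_scan, claim_ok), not kernel code;
the actual change is in the disk_scan_partitions() hunks below.

#include <stdbool.h>
#include <stdio.h>

/*
 * Hypothetical model, not kernel code: "need_scan" stands in for
 * GD_NEED_PART_SCAN and "claim_ok" for whether bd_prepare_to_claim()
 * succeeds.
 */
struct model_disk {
	bool need_scan;
};

/* Old flow: the flag is set before a step that can still fail. */
static int scan_old(struct model_disk *d, bool claim_ok)
{
	d->need_scan = true;
	if (!claim_ok)
		return -1;	/* error path leaves need_scan set behind */
	/* ... open the device; the partition scan consumes the flag ... */
	d->need_scan = false;
	return 0;
}

/* New flow: set the flag only after the claim, clear it unconditionally. */
static int scan_new(struct model_disk *d, bool claim_ok)
{
	if (!claim_ok)
		return -1;
	d->need_scan = true;
	/* ... open the device; the open itself may still fail ... */
	d->need_scan = false;	/* never left set for a later opener */
	return 0;
}

int main(void)
{
	struct model_disk a = { false };
	struct model_disk b = { false };

	scan_old(&a, false);
	scan_new(&b, false);
	printf("flag left behind: old=%d new=%d\n", a.need_scan, b.need_scan);
	return 0;
}
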
Fixes: e5cfefa97bcc ("block: fix scan partition for exclusively open device again")
Reviewed-by: Jan Kara
Reviewed-by: Ming Lei
Signed-off-by: Yu Kuai
Signed-off-by: Jens Axboe
---
 block/genhd.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/genhd.c b/block/genhd.c
index 02d9cfb9e077..7f874737af68 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -368,7 +368,6 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
 	if (disk->open_partitions)
 		return -EBUSY;
 
-	set_bit(GD_NEED_PART_SCAN, &disk->state);
 	/*
 	 * If the device is opened exclusively by current thread already, it's
 	 * safe to scan partitons, otherwise, use bd_prepare_to_claim() to
@@ -381,12 +380,19 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
 		return ret;
 	}
 
+	set_bit(GD_NEED_PART_SCAN, &disk->state);
 	bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL);
 	if (IS_ERR(bdev))
 		ret = PTR_ERR(bdev);
 	else
 		blkdev_put(bdev, mode & ~FMODE_EXCL);
 
+	/*
+	 * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
+	 * and this will cause a re-assembled partitioned raid device to create
+	 * partitions for the underlying disk.
+	 */
+	clear_bit(GD_NEED_PART_SCAN, &disk->state);
 	if (!(mode & FMODE_EXCL))
 		bd_abort_claiming(disk->part0, disk_scan_partitions);
 	return ret;