From d32d3d0b47f7e34560ae3c55ddfcf68694813501 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 14 Jun 2021 13:17:34 +0200 Subject: [PATCH 01/17] nvme-multipath: set QUEUE_FLAG_NOWAIT The nvme multipathing code just dispatches bios to one of the blk-mq based paths and never blocks on its own, so set QUEUE_FLAG_NOWAIT to support REQ_NOWAIT bios. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/host/multipath.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 37ce3e8b1db2..d9a82adb856f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -465,6 +465,8 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) ctrl->subsys->instance, head->instance); blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue); + /* set to a default value of 512 until the disk is validated */ blk_queue_logical_block_size(head->disk->queue, 512); blk_set_stacking_limits(&head->disk->queue->limits); From e7d65803e2bb5bc739548b67a5fc72c626cf7e3b Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 24 Aug 2021 16:57:42 +0200 Subject: [PATCH 02/17] nvme-multipath: revalidate paths during rescan When triggering a rescan due to a namespace resize we will be receiving AENs on every controller, triggering a rescan of all attached namespaces. If multipath is active only the current path and the ns_head disk will be updated, the other paths will still refer to the old size until AENs for the remaining controllers are received. If I/O comes in before that it might be routed to one of the old paths, triggering an I/O failure with 'access beyond end of device'. With this patch the old paths are skipped from multipath path selection until the controller serving these paths has been rescanned. Signed-off-by: Hannes Reinecke [dwagner: - introduce NVME_NS_READY flag instead of NVME_NS_INVALIDATE - use 'revalidate' instead of 'invalidate' which follows the zoned device code path. 
- clear NVME_NS_READY before clearing current_path] Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 +++ drivers/nvme/host/multipath.c | 17 ++++++++++++++++- drivers/nvme/host/nvme.h | 5 +++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8679a108f571..f5502ee45b96 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1874,6 +1874,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) goto out_unfreeze; } + set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { @@ -1885,6 +1886,7 @@ static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_id_ns *id) if (nvme_ns_head_multipath(ns->head)) { blk_mq_freeze_queue(ns->head->disk->queue); nvme_update_disk_info(ns->head->disk, ns, id); + nvme_mpath_revalidate_paths(ns); blk_stack_limits(&ns->head->disk->queue->limits, &ns->queue->limits, 0); disk_update_readahead(ns->head->disk); @@ -3795,6 +3797,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; + clear_bit(NVME_NS_READY, &ns->flags); set_capacity(ns->disk, 0); nvme_fault_inject_fini(&ns->fault_inject); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index d9a82adb856f..5d7bc58a27bd 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -147,6 +147,21 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) mutex_unlock(&ctrl->scan_lock); } +void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + sector_t capacity = get_capacity(head->disk); + int node; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (capacity != get_capacity(ns->disk)) + clear_bit(NVME_NS_READY, &ns->flags); + } + + for_each_node(node) + rcu_assign_pointer(head->current_path[node], NULL); +} + static bool nvme_path_is_disabled(struct nvme_ns *ns) { /* @@ -158,7 +173,7 @@ static bool nvme_path_is_disabled(struct nvme_ns *ns) ns->ctrl->state != NVME_CTRL_DELETING) return true; if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) || - test_bit(NVME_NS_REMOVING, &ns->flags)) + !test_bit(NVME_NS_READY, &ns->flags)) return true; return false; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a2e1f298b217..8fd30ef19757 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -456,6 +456,7 @@ struct nvme_ns { #define NVME_NS_DEAD 1 #define NVME_NS_ANA_PENDING 2 #define NVME_NS_FORCE_RO 3 +#define NVME_NS_READY 4 struct cdev cdev; struct device cdev_device; @@ -748,6 +749,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl); void nvme_mpath_uninit(struct nvme_ctrl *ctrl); void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); +void nvme_mpath_revalidate_paths(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); void nvme_mpath_shutdown_disk(struct nvme_ns_head *head); @@ -795,6 +797,9 @@ static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns) { return false; } +static inline void nvme_mpath_revalidate_paths(struct nvme_ns *ns) +{ +} static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) { } From 43dc987828eabf915858b1ac37a8e67fa8004349 Mon Sep 17 00:00:00 2001 From: Adam Manzanares Date: Thu, 26 Aug 2021 21:15:45 +0000 Subject: [PATCH 03/17] nvme: move nvme_multi_css into nvme.h Preparatory patch in order to 
reuse nvme_multi_css in the nvme target code. Signed-off-by: Adam Manzanares Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 5 ----- drivers/nvme/host/nvme.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f5502ee45b96..4be4845638e6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1302,11 +1302,6 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) return error; } -static bool nvme_multi_css(struct nvme_ctrl *ctrl) -{ - return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; -} - static int nvme_process_ns_desc(struct nvme_ctrl *ctrl, struct nvme_ns_ids *ids, struct nvme_ns_id_desc *cur, bool *csi_seen) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8fd30ef19757..9871c0c9374c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -892,4 +892,9 @@ struct nvme_ctrl *nvme_ctrl_from_file(struct file *file); struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid); void nvme_put_ns(struct nvme_ns *ns); +static inline bool nvme_multi_css(struct nvme_ctrl *ctrl) +{ + return (ctrl->ctrl_config & NVME_CC_CSS_MASK) == NVME_CC_CSS_CSI; +} + #endif /* _NVME_H */ From 77d651a65569a5e60f314b768500e94fcb936311 Mon Sep 17 00:00:00 2001 From: Adam Manzanares Date: Thu, 26 Aug 2021 21:15:45 +0000 Subject: [PATCH 04/17] nvmet: look at the passthrough controller when initializing CAP For a passthru controller, make CAP initialization dependent on the CAP of the passthru controller, given that multiple Command Set support can only be advertised if the underlying controller supports it. To allow that, move the initialization of CAP later so that it can use the fully initialized nvmet_ctrl structure. 
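For context (not part of the patch): the bit cleared in nvmet_passthrough_override_cap() below is bit 43 of the CAP register, i.e. bit 6 of the CAP.CSS field (bits 44:37), which advertises support for multiple I/O Command Sets selected via the CSI. A sketch of the same logic with a hypothetical named constant instead of the open-coded shift, assuming that layout:

        /* CAP.CSS occupies bits 44:37; CSS bit 6 = multiple I/O Command Sets (CSI) */
        #define NVMET_CAP_CSS_CSI_BIT   43      /* hypothetical name, for illustration */

        if (!nvme_multi_css(ctrl->subsys->passthru_ctrl))
                ctrl->cap &= ~(1ULL << NVMET_CAP_CSS_CSI_BIT);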
Fixes: ab5d0b38c047 (nvmet: add Command Set Identifier support) Signed-off-by: Adam Manzanares Reviewed-by: Keith Busch [hch: refactored the code a bit to keep it more contained in passthru.c] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 6 ++++-- drivers/nvme/target/nvmet.h | 2 ++ drivers/nvme/target/passthru.c | 10 ++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 66d05eecc2a9..11c44706dc2d 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1206,6 +1206,9 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= (15ULL << 24); /* maximum queue entries supported: */ ctrl->cap |= NVMET_QUEUE_SIZE - 1; + + if (nvmet_passthru_ctrl(ctrl->subsys)) + nvmet_passthrough_override_cap(ctrl); } struct nvmet_ctrl *nvmet_ctrl_find_get(const char *subsysnqn, @@ -1363,8 +1366,6 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, goto out_put_subsystem; mutex_init(&ctrl->lock); - nvmet_init_cap(ctrl); - ctrl->port = req->port; INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); @@ -1378,6 +1379,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, kref_init(&ctrl->ref); ctrl->subsys = subsys; + nvmet_init_cap(ctrl); WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); ctrl->changed_ns_list = kmalloc_array(NVME_MAX_CHANGED_NAMESPACES, diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 06dd3d537f07..183119607968 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -613,6 +613,8 @@ nvmet_req_passthru_ctrl(struct nvmet_req *req) return nvmet_passthru_ctrl(nvmet_req_subsys(req)); } +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl); + u16 errno_to_nvme_status(struct nvmet_req *req, int errno); u16 nvmet_report_invalid_opcode(struct nvmet_req *req); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 225cd1ffbe45..8784c487e462 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -20,6 +20,16 @@ MODULE_IMPORT_NS(NVME_TARGET_PASSTHRU); */ static DEFINE_XARRAY(passthru_subsystems); +void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl) +{ + /* + * Multiple command set support can only be declared if the underlying + * controller actually supports it. + */ + if (!nvme_multi_css(ctrl->subsys->passthru_ctrl)) + ctrl->cap &= ~(1ULL << 43); +} + static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) { struct nvmet_ctrl *ctrl = req->sq->ctrl; From ab7a2737ac5acd7d485ca45d8772497717fbc781 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Aug 2021 08:11:12 +0200 Subject: [PATCH 05/17] nvmet: return bool from nvmet_passthru_ctrl and nvmet_is_passthru_req The target core code never needs the host-side nvme_ctrl structure. Open code two uses of nvmet_is_passthru_req in passthru.c, and then switch the helpers used by the core to return bool. 
Also rename the functions to better match their usage: nvmet_passthru_ctrl -> nvmet_is_passthru_subsys nvmet_req_passthru_ctrl -> nvmet_is_passthru_req Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/configfs.c | 2 +- drivers/nvme/target/core.c | 6 +++--- drivers/nvme/target/nvmet.h | 9 ++++----- drivers/nvme/target/passthru.c | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 0cb98f2bbc8c..aa6d84d8848e 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1015,7 +1015,7 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; - if (nvmet_req_passthru_ctrl(req)) + if (nvmet_is_passthru_req(req)) return nvmet_parse_passthru_admin_cmd(req); switch (cmd->common.opcode) { diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 273555127188..f74485c705ff 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1028,7 +1028,7 @@ nvmet_subsys_attr_version_store_locked(struct nvmet_subsys *subsys, } /* passthru subsystems use the underlying controller's version */ - if (nvmet_passthru_ctrl(subsys)) + if (nvmet_is_passthru_subsys(subsys)) return -EINVAL; ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary); diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 11c44706dc2d..b8425fa34300 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -553,7 +553,7 @@ int nvmet_ns_enable(struct nvmet_ns *ns) mutex_lock(&subsys->lock); ret = 0; - if (nvmet_passthru_ctrl(subsys)) { + if (nvmet_is_passthru_subsys(subsys)) { pr_info("cannot enable both passthru and regular namespaces for a single subsystem"); goto out_unlock; } @@ -869,7 +869,7 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req) if (unlikely(ret)) return ret; - if (nvmet_req_passthru_ctrl(req)) + if (nvmet_is_passthru_req(req)) return nvmet_parse_passthru_io_cmd(req); ret = nvmet_req_find_ns(req); @@ -1207,7 +1207,7 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) /* maximum queue entries supported: */ ctrl->cap |= NVMET_QUEUE_SIZE - 1; - if (nvmet_passthru_ctrl(ctrl->subsys)) + if (nvmet_is_passthru_subsys(ctrl->subsys)) nvmet_passthrough_override_cap(ctrl); } diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 183119607968..7143c7fa7464 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -582,7 +582,7 @@ int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys); void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys); u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req); u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req); -static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) { return subsys->passthru_ctrl; } @@ -601,16 +601,15 @@ static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req) { return 0; } -static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys) +static inline bool nvmet_is_passthru_subsys(struct nvmet_subsys *subsys) { return NULL; } #endif /* CONFIG_NVME_TARGET_PASSTHRU */ -static inline struct nvme_ctrl * -nvmet_req_passthru_ctrl(struct nvmet_req *req) +static inline bool nvmet_is_passthru_req(struct nvmet_req *req) { - return nvmet_passthru_ctrl(nvmet_req_subsys(req)); + return nvmet_is_passthru_subsys(nvmet_req_subsys(req)); } void nvmet_passthrough_override_cap(struct nvmet_ctrl *ctrl); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 8784c487e462..f0efb3537989 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -228,7 +228,7 @@ static int nvmet_passthru_map_sg(struct nvmet_req *req, struct request *rq) static void nvmet_passthru_execute_cmd(struct nvmet_req *req) { - struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; struct request_queue *q = ctrl->admin_q; struct nvme_ns *ns = NULL; struct request *rq = NULL; @@ -309,7 +309,7 @@ static void nvmet_passthru_execute_cmd(struct nvmet_req *req) */ static void nvmet_passthru_set_host_behaviour(struct nvmet_req *req) { - struct nvme_ctrl *ctrl = nvmet_req_passthru_ctrl(req); + struct nvme_ctrl *ctrl = nvmet_req_subsys(req)->passthru_ctrl; struct nvme_feat_host_behavior *host; u16 status = NVME_SC_INTERNAL; int ret; From f04064814c2a15c22ed9c803f9b634ef34f91092 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 6 Sep 2021 09:04:03 +0200 Subject: [PATCH 06/17] nvmet: fixup buffer overrun in nvmet_subsys_attr_serial() The serial number is copied into the buffer via memcpy_and_pad() with the length NVMET_SN_MAX_SIZE. So when printing out we also need to take just that length as anything beyond that will be uninitialized. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index f74485c705ff..d784f3c200b4 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1067,7 +1067,8 @@ static ssize_t nvmet_subsys_attr_serial_show(struct config_item *item, { struct nvmet_subsys *subsys = to_subsys(item); - return snprintf(page, PAGE_SIZE, "%s\n", subsys->serial); + return snprintf(page, PAGE_SIZE, "%*s\n", + NVMET_SN_MAX_SIZE, subsys->serial); } static ssize_t From b58da2d270dbcc67db73f15028774d27c85e16d7 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 30 Aug 2021 15:36:26 +0200 Subject: [PATCH 07/17] nvme-tcp: Do not reset transport on data digest errors The spec says in section 7.4.6.1 (Digest Error handling): When a host detects a data digest error in a C2HData PDU, that host shall continue processing C2HData PDUs associated with the command and when the command processing has completed, if a successful status was returned by the controller, the host shall fail the command with a non-fatal transport error. Currently the transport is reset when a data digest error is detected. Instead, when a digest error is detected, mark the final status as NVME_SC_DATA_XFER_ERROR and let the upper layer handle the error. In order to keep track of the final result, maintain a status field in nvme_tcp_request object and use it to overwrite the completion queue status (which might be successful even though a digest error has been detected) when completing the request. 
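In other words, req->status acts as a "first error wins" latch: it is initialized to NVME_SC_SUCCESS when the command PDU is set up, a digest mismatch overwrites it with NVME_SC_DATA_XFER_ERROR while reception continues, and the final completion only adopts the controller's CQE status if no error was recorded earlier. Condensed from the hunks below, for illustration only:

        req->status = cpu_to_le16(NVME_SC_SUCCESS);             /* at command setup */

        /* on a C2HData digest mismatch: record the error, keep receiving */
        req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);

        /* at completion: a recorded error takes precedence over the CQE status */
        if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
                req->status = cqe->status;
        if (!nvme_try_complete_req(rq, req->status, cqe->result))
                nvme_complete_rq(rq);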
Signed-off-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 645025620154..e2ab12f3f51c 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -45,6 +45,7 @@ struct nvme_tcp_request { u32 pdu_len; u32 pdu_sent; u16 ttag; + __le16 status; struct list_head entry; struct llist_node lentry; __le32 ddgst; @@ -485,6 +486,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, struct nvme_completion *cqe) { + struct nvme_tcp_request *req; struct request *rq; rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); @@ -496,7 +498,11 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, return -EINVAL; } - if (!nvme_try_complete_req(rq, cqe->status, cqe->result)) + req = blk_mq_rq_to_pdu(rq); + if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) + req->status = cqe->status; + + if (!nvme_try_complete_req(rq, req->status, cqe->result)) nvme_complete_rq(rq); queue->nr_cqe++; @@ -758,7 +764,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { - nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + nvme_tcp_end_request(rq, + le16_to_cpu(req->status)); queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -788,18 +795,24 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return 0; if (queue->recv_ddgst != queue->exp_ddgst) { + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR); + dev_err(queue->ctrl->ctrl.device, "data digest error: recv %#x expected %#x\n", le32_to_cpu(queue->recv_ddgst), le32_to_cpu(queue->exp_ddgst)); - return -EIO; } if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + nvme_tcp_end_request(rq, le16_to_cpu(req->status)); queue->nr_cqe++; } @@ -2293,6 +2306,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return ret; req->state = NVME_TCP_SEND_CMD_PDU; + req->status = cpu_to_le16(NVME_SC_SUCCESS); req->offset = 0; req->data_sent = 0; req->pdu_len = 0; From b58da2d270dbcc67db73f15028774d27c85e16d7 Mon Sep 17 00:00:00 2001 From: Tatsuya Sasaki Date: Wed, 1 Sep 2021 08:23:42 +0000 Subject: [PATCH 08/17] nvme: update keep alive interval when kato is modified Currently the connection between host and NVMe-oF target gets disconnected by keep-alive timeout when a user connects to a target with a relatively large kato value and then sets the smaller kato with a set features command (e.g. connects with 60 seconds kato value and then sets 10 seconds kato value). The cause is that keep alive command interval on the host, which is defined as unsigned int kato in nvme_ctrl structure, does not follow the kato value changes. This patch updates the keep alive interval in the following steps when the kato is modified by a set features command: stops the keep alive work queue, then sets the kato as new timer value and re-start the queue. 
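As a worked example of the conversion done in nvme_update_keep_alive() below: Set Features carries the new KATO in milliseconds in CDW11, while ctrl->kato is kept in seconds, and the dev_info added by the patch reports the keep-alive interval as kato * 1000 / 2 ms. So for a connect with kato = 60 s followed by a Set Features with cdw11 = 10000:

        new_kato = DIV_ROUND_UP(10000, 1000) = 10      /* seconds */
        old interval reported: 60 * 1000 / 2 = 30000 ms
        new interval reported: 10 * 1000 / 2 =  5000 ms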
Signed-off-by: Tatsuya Sasaki Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 42 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4be4845638e6..1c98a2f590ea 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -116,6 +116,8 @@ static struct class *nvme_ns_chr_class; static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, unsigned nsid); +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd); /* * Prepare a queue for teardown. @@ -1152,7 +1154,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return effects; } -static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects, + struct nvme_command *cmd, int status) { if (effects & NVME_CMD_EFFECTS_CSE_MASK) { nvme_unfreeze(ctrl); @@ -1167,6 +1170,26 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) nvme_queue_scan(ctrl); flush_work(&ctrl->scan_work); } + + switch (cmd->common.opcode) { + case nvme_admin_set_features: + switch (le32_to_cpu(cmd->common.cdw10) & 0xFF) { + case NVME_FEAT_KATO: + /* + * Keep alive commands interval on the host should be + * updated when KATO is modified by Set Features + * commands. + */ + if (!status) + nvme_update_keep_alive(ctrl, cmd); + break; + default: + break; + } + break; + default: + break; + } } int nvme_execute_passthru_rq(struct request *rq) @@ -1181,7 +1204,7 @@ int nvme_execute_passthru_rq(struct request *rq) effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode); ret = nvme_execute_rq(disk, rq, false); if (effects) /* nothing to be done for zero cmd effects */ - nvme_passthru_end(ctrl, effects); + nvme_passthru_end(ctrl, effects, cmd, ret); return ret; } @@ -1269,6 +1292,21 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_stop_keep_alive); +static void nvme_update_keep_alive(struct nvme_ctrl *ctrl, + struct nvme_command *cmd) +{ + unsigned int new_kato = + DIV_ROUND_UP(le32_to_cpu(cmd->common.cdw11), 1000); + + dev_info(ctrl->device, + "keep alive interval updated from %u ms to %u ms\n", + ctrl->kato * 1000 / 2, new_kato * 1000 / 2); + + nvme_stop_keep_alive(ctrl); + ctrl->kato = new_kato; + nvme_start_keep_alive(ctrl); +} + /* * In NVMe 1.0 the CNS field was just a binary controller or namespace * flag, thus sending any new CNS opcodes has a big chance of not working. From 041bd1a1fc737cd73b0b8a9f74909191a8acc9fe Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Wed, 1 Sep 2021 11:25:24 +0200 Subject: [PATCH 09/17] nvme: only call synchronize_srcu when clearing current path The function nmve_mpath_clear_current_path returns true if the current path has changed. In this case we have to wait for all concurrent submissions to finish. But if we didn't change the current path, there is no point in waiting for another RCU period to finish. 
Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1c98a2f590ea..4a3a33f5f11c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3838,9 +3838,12 @@ static void nvme_ns_remove(struct nvme_ns *ns) list_del_rcu(&ns->siblings); mutex_unlock(&ns->ctrl->subsys->lock); - synchronize_rcu(); /* guarantee not available in head->list */ - nvme_mpath_clear_current_path(ns); - synchronize_srcu(&ns->head->srcu); /* wait for concurrent submissions */ + /* guarantee not available in head->list */ + synchronize_rcu(); + + /* wait for concurrent submissions */ + if (nvme_mpath_clear_current_path(ns)) + synchronize_srcu(&ns->head->srcu); if (!nvme_ns_head_multipath(ns->head)) nvme_cdev_del(&ns->cdev, &ns->cdev_device); From ab3994f6efba95e0832dc9e68c088b2d7ae764b8 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Mon, 30 Aug 2021 14:25:33 -0700 Subject: [PATCH 10/17] nvme: add error handling support for add_disk() We never checked for errors on add_disk() as this function returned void. Now that this is fixed, use the shiny new error handling. Signed-off-by: Luis Chamberlain Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4a3a33f5f11c..7efb31b87f37 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3798,7 +3798,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, nvme_get_ctrl(ctrl); - device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups); + if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups)) + goto out_cleanup_ns_from_list; + if (!nvme_ns_head_multipath(ns->head)) nvme_add_ns_cdev(ns); @@ -3808,6 +3810,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid, return; + out_cleanup_ns_from_list: + nvme_put_ctrl(ctrl); + down_write(&ctrl->namespaces_rwsem); + list_del_init(&ns->list); + up_write(&ctrl->namespaces_rwsem); out_unlink_ns: mutex_lock(&ctrl->subsys->lock); list_del_rcu(&ns->siblings); From aff959c2840858d55d9ee155d555b3aa7e068b32 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Sun, 5 Sep 2021 19:26:01 -0700 Subject: [PATCH 11/17] nvme: update MAINTAINERS email address Update the MAINTAINERS file email address. Signed-off-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2f298429a5e9..64bf202c8c62 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13367,7 +13367,7 @@ F: include/linux/nvme-fc.h NVM EXPRESS TARGET DRIVER M: Christoph Hellwig M: Sagi Grimberg -M: Chaitanya Kulkarni +M: Chaitanya Kulkarni L: linux-nvme@lists.infradead.org S: Supported W: http://git.infradead.org/nvme.git From dfbb3409b27fa42b96f5727a80d3ceb6a8663991 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 7 Sep 2021 20:52:13 +0900 Subject: [PATCH 12/17] block: genhd: don't call blkdev_show() with major_names_lock held If CONFIG_BLK_DEV_LOOP && CONFIG_MTD (at least; there might be other combinations), lockdep complains circular locking dependency at __loop_clr_fd(), for major_names_lock serves as a locking dependency aggregating hub across multiple block modules. 
====================================================== WARNING: possible circular locking dependency detected 5.14.0+ #757 Tainted: G E ------------------------------------------------------ systemd-udevd/7568 is trying to acquire lock: ffff88800f334d48 ((wq_completion)loop0){+.+.}-{0:0}, at: flush_workqueue+0x70/0x560 but task is already holding lock: ffff888014a7d4a0 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x4d/0x400 [loop] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (&lo->lo_mutex){+.+.}-{3:3}: lock_acquire+0xbe/0x1f0 __mutex_lock_common+0xb6/0xe10 mutex_lock_killable_nested+0x17/0x20 lo_open+0x23/0x50 [loop] blkdev_get_by_dev+0x199/0x540 blkdev_open+0x58/0x90 do_dentry_open+0x144/0x3a0 path_openat+0xa57/0xda0 do_filp_open+0x9f/0x140 do_sys_openat2+0x71/0x150 __x64_sys_openat+0x78/0xa0 do_syscall_64+0x3d/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #5 (&disk->open_mutex){+.+.}-{3:3}: lock_acquire+0xbe/0x1f0 __mutex_lock_common+0xb6/0xe10 mutex_lock_nested+0x17/0x20 bd_register_pending_holders+0x20/0x100 device_add_disk+0x1ae/0x390 loop_add+0x29c/0x2d0 [loop] blk_request_module+0x5a/0xb0 blkdev_get_no_open+0x27/0xa0 blkdev_get_by_dev+0x5f/0x540 blkdev_open+0x58/0x90 do_dentry_open+0x144/0x3a0 path_openat+0xa57/0xda0 do_filp_open+0x9f/0x140 do_sys_openat2+0x71/0x150 __x64_sys_openat+0x78/0xa0 do_syscall_64+0x3d/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #4 (major_names_lock){+.+.}-{3:3}: lock_acquire+0xbe/0x1f0 __mutex_lock_common+0xb6/0xe10 mutex_lock_nested+0x17/0x20 blkdev_show+0x19/0x80 devinfo_show+0x52/0x60 seq_read_iter+0x2d5/0x3e0 proc_reg_read_iter+0x41/0x80 vfs_read+0x2ac/0x330 ksys_read+0x6b/0xd0 do_syscall_64+0x3d/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #3 (&p->lock){+.+.}-{3:3}: lock_acquire+0xbe/0x1f0 __mutex_lock_common+0xb6/0xe10 mutex_lock_nested+0x17/0x20 seq_read_iter+0x37/0x3e0 generic_file_splice_read+0xf3/0x170 splice_direct_to_actor+0x14e/0x350 do_splice_direct+0x84/0xd0 do_sendfile+0x263/0x430 __se_sys_sendfile64+0x96/0xc0 do_syscall_64+0x3d/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae -> #2 (sb_writers#3){.+.+}-{0:0}: lock_acquire+0xbe/0x1f0 lo_write_bvec+0x96/0x280 [loop] loop_process_work+0xa68/0xc10 [loop] process_one_work+0x293/0x480 worker_thread+0x23d/0x4b0 kthread+0x163/0x180 ret_from_fork+0x1f/0x30 -> #1 ((work_completion)(&lo->rootcg_work)){+.+.}-{0:0}: lock_acquire+0xbe/0x1f0 process_one_work+0x280/0x480 worker_thread+0x23d/0x4b0 kthread+0x163/0x180 ret_from_fork+0x1f/0x30 -> #0 ((wq_completion)loop0){+.+.}-{0:0}: validate_chain+0x1f0d/0x33e0 __lock_acquire+0x92d/0x1030 lock_acquire+0xbe/0x1f0 flush_workqueue+0x8c/0x560 drain_workqueue+0x80/0x140 destroy_workqueue+0x47/0x4f0 __loop_clr_fd+0xb4/0x400 [loop] blkdev_put+0x14a/0x1d0 blkdev_close+0x1c/0x20 __fput+0xfd/0x220 task_work_run+0x69/0xc0 exit_to_user_mode_prepare+0x1ce/0x1f0 syscall_exit_to_user_mode+0x26/0x60 do_syscall_64+0x4c/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae other info that might help us debug this: Chain exists of: (wq_completion)loop0 --> &disk->open_mutex --> &lo->lo_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&lo->lo_mutex); lock(&disk->open_mutex); lock(&lo->lo_mutex); lock((wq_completion)loop0); *** DEADLOCK *** 2 locks held by systemd-udevd/7568: #0: ffff888012554128 (&disk->open_mutex){+.+.}-{3:3}, at: blkdev_put+0x4c/0x1d0 #1: ffff888014a7d4a0 (&lo->lo_mutex){+.+.}-{3:3}, at: __loop_clr_fd+0x4d/0x400 [loop] stack backtrace: CPU: 0 PID: 7568 Comm: systemd-udevd Tainted: 
G E 5.14.0+ #757 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020 Call Trace: dump_stack_lvl+0x79/0xbf print_circular_bug+0x5d6/0x5e0 ? stack_trace_save+0x42/0x60 ? save_trace+0x3d/0x2d0 check_noncircular+0x10b/0x120 validate_chain+0x1f0d/0x33e0 ? __lock_acquire+0x953/0x1030 ? __lock_acquire+0x953/0x1030 __lock_acquire+0x92d/0x1030 ? flush_workqueue+0x70/0x560 lock_acquire+0xbe/0x1f0 ? flush_workqueue+0x70/0x560 flush_workqueue+0x8c/0x560 ? flush_workqueue+0x70/0x560 ? sched_clock_cpu+0xe/0x1a0 ? drain_workqueue+0x41/0x140 drain_workqueue+0x80/0x140 destroy_workqueue+0x47/0x4f0 ? blk_mq_freeze_queue_wait+0xac/0xd0 __loop_clr_fd+0xb4/0x400 [loop] ? __mutex_unlock_slowpath+0x35/0x230 blkdev_put+0x14a/0x1d0 blkdev_close+0x1c/0x20 __fput+0xfd/0x220 task_work_run+0x69/0xc0 exit_to_user_mode_prepare+0x1ce/0x1f0 syscall_exit_to_user_mode+0x26/0x60 do_syscall_64+0x4c/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f0fd4c661f7 Code: 00 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 c3 48 83 ec 18 89 7c 24 0c e8 13 fc ff ff RSP: 002b:00007ffd1c9e9fd8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 00007f0fd46be6c8 RCX: 00007f0fd4c661f7 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000006 RBP: 0000000000000006 R08: 000055fff1eaf400 R09: 0000000000000000 R10: 00007f0fd46be6c8 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000002f08 R15: 00007ffd1c9ea050 Commit 1c500ad706383f1a ("loop: reduce the loop_ctl_mutex scope") is for breaking "loop_ctl_mutex => &lo->lo_mutex" dependency chain. But enabling a different block module results in forming circular locking dependency due to shared major_names_lock mutex. The simplest fix is to call probe function without holding major_names_lock [1], but Christoph Hellwig does not like such idea. Therefore, instead of holding major_names_lock in blkdev_show(), introduce a different lock for blkdev_show() in order to break "sb_writers#$N => &p->lock => major_names_lock" dependency chain. 
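The resulting scheme, sketched here for clarity (the real hunks follow), keeps the existing major_names_lock mutex for __register_blkdev()/unregister_blkdev(), which may sleep in the probe path, and adds a spinlock taken around the actual major_names[] manipulation and by blkdev_show(), so the /proc/devices read path no longer depends on the mutex that sits in the middle of the reported chain:

        /* register/unregister: mutex as before, spinlock only around the list update */
        mutex_lock(&major_names_lock);
        spin_lock(&major_names_spinlock);
        /* ... insert into or remove from major_names[] ... */
        spin_unlock(&major_names_spinlock);
        mutex_unlock(&major_names_lock);

        /* blkdev_show(): spinlock only, no major_names_lock dependency any more */
        spin_lock(&major_names_spinlock);
        /* ... walk major_names[] and seq_printf() ... */
        spin_unlock(&major_names_spinlock);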
Link: https://lkml.kernel.org/r/b2af8a5b-3c1b-204e-7f56-bea0b15848d6@i-love.sakura.ne.jp [1] Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/18a02da2-0bf3-550e-b071-2b4ab13c49f0@i-love.sakura.ne.jp Signed-off-by: Jens Axboe --- block/genhd.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 567549a011d1..7b6e5e1cf956 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -183,6 +183,7 @@ static struct blk_major_name { void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); +static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -195,11 +196,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&major_names_lock); + spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ @@ -271,6 +272,7 @@ int __register_blkdev(unsigned int major, const char *name, p->next = NULL; index = major_to_index(major); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; @@ -279,6 +281,7 @@ int __register_blkdev(unsigned int major, const char *name, *n = p; else ret = -EBUSY; + spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", @@ -298,6 +301,7 @@ void unregister_blkdev(unsigned int major, const char *name) int index = major_to_index(major); mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -307,6 +311,7 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } + spin_unlock(&major_names_spinlock); mutex_unlock(&major_names_lock); kfree(p); } From 884f0e84f1e3195b801319c8ec3d5774e9bf2710 Mon Sep 17 00:00:00 2001 From: Li Jinlin Date: Tue, 7 Sep 2021 20:12:42 +0800 Subject: [PATCH 13/17] blk-throttle: fix UAF by deleteing timer in blk_throtl_exit() The pending timer has been set up in blk_throtl_init(). However, the timer is not deleted in blk_throtl_exit(). This means that the timer handler may still be running after freeing the timer, which would result in a use-after-free. Fix by calling del_timer_sync() to delete the timer in blk_throtl_exit(). 
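This is the standard rule for timers embedded in dynamically allocated objects: the handler may still be running on another CPU, so the owner has to call del_timer_sync(), which waits for a running handler to finish, before the memory is freed. A generic sketch of the pattern (hypothetical struct, not blk-throttle code):

        struct foo {
                struct timer_list timer;
                /* ... */
        };

        static void foo_timer_fn(struct timer_list *t)
        {
                struct foo *foo = from_timer(foo, t, timer);
                /* ... */
        }

        timer_setup(&foo->timer, foo_timer_fn, 0);      /* init */

        /* teardown: wait out a concurrently running handler, then free */
        del_timer_sync(&foo->timer);
        kfree(foo);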
Signed-off-by: Li Jinlin Link: https://lore.kernel.org/r/20210907121242.2885564-1-lijinlin3@huawei.com Signed-off-by: Jens Axboe --- block/blk-throttle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 55c49015e533..7c4e7993ba97 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2458,6 +2458,7 @@ int blk_throtl_init(struct request_queue *q) void blk_throtl_exit(struct request_queue *q) { BUG_ON(!q->td); + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(q, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); From cd82cca7ebfe9c6c74a8e5b28382ba88177ba5f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 16:13:02 +0200 Subject: [PATCH 14/17] block: split out operations on block special files Add a new block/fops.c for all the file and address_space operations that provide the block special file support. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210907141303.1371844-2-hch@lst.de [axboe: correct trailing whitespace while at it] Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk.h | 2 + block/fops.c | 640 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/block_dev.c | 637 ------------------------------------------------ 4 files changed, 643 insertions(+), 638 deletions(-) create mode 100644 block/fops.c diff --git a/block/Makefile b/block/Makefile index 1d0d466f2182..a4d8d149670b 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ diff --git a/block/blk.h b/block/blk.h index 8c96b0c90c48..7d2a0ba7ed21 100644 --- a/block/blk.h +++ b/block/blk.h @@ -373,4 +373,6 @@ static inline void bio_clear_hipri(struct bio *bio) bio->bi_opf &= ~REQ_HIPRI; } +extern const struct address_space_operations def_blk_aops; + #endif /* BLK_INTERNAL_H */ diff --git a/block/fops.c b/block/fops.c new file mode 100644 index 000000000000..ffce6f6c68dd --- /dev/null +++ b/block/fops.c @@ -0,0 +1,640 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk.h" + +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +static unsigned int dio_bio_write_op(struct kiocb *iocb) +{ + unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + + /* avoid the need for a I/O completion work item */ + if (iocb->ki_flags & IOCB_DSYNC) + op |= REQ_FUA; + return op; +} + +#define DIO_INLINE_BIO_VECS 4 + +static void blkdev_bio_end_io_simple(struct bio *bio) +{ + struct task_struct *waiter = bio->bi_private; + + WRITE_ONCE(bio->bi_private, NULL); + blk_wake_io_task(waiter); +} + +static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, + struct iov_iter *iter, unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct block_device 
*bdev = I_BDEV(bdev_file_inode(file)); + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; + loff_t pos = iocb->ki_pos; + bool should_dirty = false; + struct bio bio; + ssize_t ret; + blk_qc_t qc; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + if (nr_pages <= DIO_INLINE_BIO_VECS) + vecs = inline_vecs; + else { + vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + if (!vecs) + return -ENOMEM; + } + + bio_init(&bio, vecs, nr_pages); + bio_set_dev(&bio, bdev); + bio.bi_iter.bi_sector = pos >> 9; + bio.bi_write_hint = iocb->ki_hint; + bio.bi_private = current; + bio.bi_end_io = blkdev_bio_end_io_simple; + bio.bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(&bio, iter); + if (unlikely(ret)) + goto out; + ret = bio.bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio.bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio.bi_opf = dio_bio_write_op(iocb); + task_io_account_write(ret); + } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; + if (iocb->ki_flags & IOCB_HIPRI) + bio_set_polled(&bio, iocb); + + qc = submit_bio(&bio); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) + break; + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(bdev), qc, true)) + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + bio_release_pages(&bio, should_dirty); + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + +out: + if (vecs != inline_vecs) + kfree(vecs); + + bio_uninit(&bio); + + return ret; +} + +struct blkdev_dio { + union { + struct kiocb *iocb; + struct task_struct *waiter; + }; + size_t size; + atomic_t ref; + bool multi_bio : 1; + bool should_dirty : 1; + bool is_sync : 1; + struct bio bio; +}; + +static struct bio_set blkdev_dio_pool; + +static int blkdev_iopoll(struct kiocb *kiocb, bool wait) +{ + struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); + struct request_queue *q = bdev_get_queue(bdev); + + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); +} + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { + if (!dio->is_sync) { + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (likely(!dio->bio.bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); + } + + dio->iocb->ki_complete(iocb, ret, 0); + if (dio->multi_bio) + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + blk_wake_io_task(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + struct blk_plug plug; + struct blkdev_dio *dio; + struct bio *bio; + bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; + bool is_read = (iov_iter_rw(iter) == READ), is_sync; + loff_t pos = iocb->ki_pos; + blk_qc_t qc = BLK_QC_T_NONE; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + 
return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + + dio = container_of(bio, struct blkdev_dio, bio); + dio->is_sync = is_sync = is_sync_kiocb(iocb); + if (dio->is_sync) { + dio->waiter = current; + bio_get(bio); + } else { + dio->iocb = iocb; + } + + dio->size = 0; + dio->multi_bio = false; + dio->should_dirty = is_read && iter_is_iovec(iter); + + /* + * Don't plug for HIPRI/polled IO, as those should go straight + * to issue + */ + if (!is_poll) + blk_start_plug(&plug); + + for (;;) { + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); + if (!nr_pages) { + bool polled = false; + + if (iocb->ki_flags & IOCB_HIPRI) { + bio_set_polled(bio, iocb); + polled = true; + } + + qc = submit_bio(bio); + + if (polled) + WRITE_ONCE(iocb->ki_cookie, qc); + break; + } + + if (!dio->multi_bio) { + /* + * AIO needs an extra reference to ensure the dio + * structure which is embedded into the first bio + * stays around. + */ + if (!is_sync) + bio_get(bio); + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } + + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + + if (!is_poll) + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_poll(bdev_get_queue(bdev), qc, true)) + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} + +static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + unsigned int nr_pages; + + if (!iov_iter_count(iter)) + return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); +} + +static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, blkdev_get_block, wbc); +} + +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + mpage_readahead(rac, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + return block_write_begin(mapping, pos, len, flags, pagep, + blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, 
copied, page, fsdata); + + unlock_page(page); + put_page(page); + + return ret; +} + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return generic_writepages(mapping, wbc); +} + +const struct address_space_operations def_blk_aops = { + .set_page_dirty = __set_page_dirty_buffers, + .readpage = blkdev_readpage, + .readahead = blkdev_readahead, + .writepage = blkdev_writepage, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .writepages = blkdev_writepages, + .direct_IO = blkdev_direct_IO, + .migratepage = buffer_migrate_page_norefs, + .is_dirty_writeback = buffer_check_dirty_writeback, +}; + +/* + * for a block special file file_inode(file)->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *bd_inode = bdev_file_inode(file); + loff_t retval; + + inode_lock(bd_inode); + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); + inode_unlock(bd_inode); + return retval; +} + +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bd_inode = bdev_file_inode(filp); + struct block_device *bdev = I_BDEV(bd_inode); + int error; + + error = file_write_and_wait_range(filp, start, end); + if (error) + return error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + error = blkdev_issue_flush(bdev); + if (error == -EOPNOTSUPP) + error = 0; + + return error; +} + +static int blkdev_open(struct inode *inode, struct file *filp) +{ + struct block_device *bdev; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. + */ + filp->f_flags |= O_LARGEFILE; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + + if (filp->f_flags & O_NDELAY) + filp->f_mode |= FMODE_NDELAY; + if (filp->f_flags & O_EXCL) + filp->f_mode |= FMODE_EXCL; + if ((filp->f_flags & O_ACCMODE) == 3) + filp->f_mode |= FMODE_WRITE_IOCTL; + + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + return 0; +} + +static int blkdev_close(struct inode *inode, struct file *filp) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); + + blkdev_put(bdev, filp->f_mode); + return 0; +} + +static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + fmode_t mode = file->f_mode; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + return blkdev_ioctl(bdev, mode, cmd, arg); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. 
+ */ +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + struct blk_plug plug; + size_t shorted = 0; + ssize_t ret; + + if (bdev_read_only(I_BDEV(bd_inode))) + return -EPERM; + + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) + return -EOPNOTSUPP; + + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } + + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); + blk_finish_plug(&plug); + return ret; +} + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *bd_inode = bdev_file_inode(file); + loff_t size = i_size_read(bd_inode); + loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; + + if (pos >= size) + return 0; + + size -= pos; + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; +} + +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = i_size_read(bdev->bd_inode); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages. */ + error = truncate_bdev_range(bdev, file->f_mode, start, end); + if (error) + return error; + + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + error = blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, 0); + break; + default: + return -EOPNOTSUPP; + } + if (error) + return error; + + /* + * Invalidate the page cache again; if someone wandered in and dirtied + * a page, we just discard it - userspace has no way of knowing whether + * the write happened before or after discard completing... 
+ */ + return truncate_bdev_range(bdev, file->f_mode, start, end); +} + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_close, + .llseek = blkdev_llseek, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, + .iopoll = blkdev_iopoll, + .mmap = generic_file_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = block_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, +}; + +static __init int blkdev_init(void) +{ + return bioset_init(&blkdev_dio_pool, 4, + offsetof(struct blkdev_dio, bio), + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); +} +module_init(blkdev_init); diff --git a/fs/block_dev.c b/fs/block_dev.c index 45df6cbccf12..84c97e276047 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -7,12 +7,10 @@ #include #include -#include #include #include #include #include -#include #include #include #include @@ -20,20 +18,14 @@ #include #include #include -#include #include -#include #include #include #include #include -#include #include -#include -#include #include #include -#include #include "internal.h" #include "../block/blk.h" @@ -42,8 +34,6 @@ struct bdev_inode { struct inode vfs_inode; }; -static const struct address_space_operations def_blk_aops; - static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); @@ -194,332 +184,6 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -static int -blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - -static struct inode *bdev_file_inode(struct file *file) -{ - return file->f_mapping->host; -} - -static unsigned int dio_bio_write_op(struct kiocb *iocb) -{ - unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; - - /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) - op |= REQ_FUA; - return op; -} - -#define DIO_INLINE_BIO_VECS 4 - -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - -static ssize_t -__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; - loff_t pos = iocb->ki_pos; - bool should_dirty = false; - struct bio bio; - ssize_t ret; - blk_qc_t qc; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - if (nr_pages <= DIO_INLINE_BIO_VECS) - vecs = inline_vecs; - else { - vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - if (!vecs) - return -ENOMEM; - } - - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; - bio.bi_write_hint = iocb->ki_hint; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; - bio.bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(&bio, iter); - if (unlikely(ret)) - goto out; - ret = bio.bi_iter.bi_size; - - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); - task_io_account_write(ret); - } 
- if (iocb->ki_flags & IOCB_NOWAIT) - bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - - qc = submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - bio_release_pages(&bio, should_dirty); - if (unlikely(bio.bi_status)) - ret = blk_status_to_errno(bio.bi_status); - -out: - if (vecs != inline_vecs) - kfree(vecs); - - bio_uninit(&bio); - - return ret; -} - -struct blkdev_dio { - union { - struct kiocb *iocb; - struct task_struct *waiter; - }; - size_t size; - atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; -}; - -static struct bio_set blkdev_dio_pool; - -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - -static void blkdev_bio_end_io(struct bio *bio) -{ - struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { - struct kiocb *iocb = dio->iocb; - ssize_t ret; - - if (likely(!dio->bio.bi_status)) { - ret = dio->size; - iocb->ki_pos += ret; - } else { - ret = blk_status_to_errno(dio->bio.bi_status); - } - - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); - } else { - struct task_struct *waiter = dio->waiter; - - WRITE_ONCE(dio->waiter, NULL); - blk_wake_io_task(waiter); - } - } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); - struct blk_plug plug; - struct blkdev_dio *dio; - struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; - bool is_read = (iov_iter_rw(iter) == READ), is_sync; - loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; - int ret = 0; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); - - dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { - dio->waiter = current; - bio_get(bio); - } else { - dio->iocb = iocb; - } - - dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); - - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); - - for (;;) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; - bio->bi_write_hint = iocb->ki_hint; - bio->bi_private = dio; - bio->bi_end_io = blkdev_bio_end_io; - bio->bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - break; - } - - if (is_read) { - bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) - bio_set_pages_dirty(bio); - } else { - bio->bi_opf = dio_bio_write_op(iocb); - 
task_io_account_write(bio->bi_iter.bi_size); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; - - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); - if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); - break; - } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. - */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - - submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - } - - if (!is_poll) - blk_finish_plug(&plug); - - if (!is_sync) - return -EIOCBQUEUED; - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(dio->waiter)) - break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - if (!ret) - ret = blk_status_to_errno(dio->bio.bi_status); - if (likely(!ret)) - ret = dio->size; - - bio_put(&dio->bio); - return ret; -} - -static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int nr_pages; - - if (!iov_iter_count(iter)) - return 0; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); -} - -static __init int blkdev_init(void) -{ - return bioset_init(&blkdev_dio_pool, 4, - offsetof(struct blkdev_dio, bio), - BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); -} -module_init(blkdev_init); - int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) @@ -637,81 +301,6 @@ int thaw_bdev(struct block_device *bdev) } EXPORT_SYMBOL(thaw_bdev); -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, blkdev_get_block, wbc); -} - -static int blkdev_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, blkdev_get_block); -} - -static void blkdev_readahead(struct readahead_control *rac) -{ - mpage_readahead(rac, blkdev_get_block); -} - -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - return block_write_begin(mapping, pos, len, flags, pagep, - blkdev_get_block); -} - -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - unlock_page(page); - put_page(page); - - return ret; -} - -/* - * private llseek: - * for a block special file file_inode(file)->i_size is zero - * so we compute the size by hand (just as in block_read/write above) - */ -static loff_t block_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *bd_inode = bdev_file_inode(file); - loff_t retval; - - inode_lock(bd_inode); - retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - inode_unlock(bd_inode); - return retval; -} - -static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bd_inode = 
bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); - int error; - - error = file_write_and_wait_range(filp, start, end); - if (error) - return error; - - /* - * There is no need to serialise calls to blkdev_issue_flush with - * i_mutex and doing so causes performance issues with concurrent - * O_SYNC writers to a block device. - */ - error = blkdev_issue_flush(bdev); - if (error == -EOPNOTSUPP) - error = 0; - - return error; -} - /** * bdev_read_page() - Start reading a page from a block device * @bdev: The device to read the page from @@ -1305,35 +894,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -static int blkdev_open(struct inode * inode, struct file * filp) -{ - struct block_device *bdev; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. - */ - filp->f_flags |= O_LARGEFILE; - - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; - if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - filp->f_mapping = bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return 0; -} - void blkdev_put(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; @@ -1397,203 +957,6 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) } EXPORT_SYMBOL(blkdev_put); -static int blkdev_close(struct inode * inode, struct file * filp) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); - blkdev_put(bdev, filp->f_mode); - return 0; -} - -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - -/* - * Write data to the block device. Only intended for the block device itself - * and the raw driver which basically is a fake block device. - * - * Does not take i_mutex for the write and thus is not for general purpose - * use. 
- */ -static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - struct blk_plug plug; - size_t shorted = 0; - ssize_t ret; - - if (bdev_read_only(I_BDEV(bd_inode))) - return -EPERM; - - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if (iocb->ki_pos >= size) - return -ENOSPC; - - if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) - return -EOPNOTSUPP; - - size -= iocb->ki_pos; - if (iov_iter_count(from) > size) { - shorted = iov_iter_count(from) - size; - iov_iter_truncate(from, size); - } - - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); - return ret; -} - -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - loff_t pos = iocb->ki_pos; - size_t shorted = 0; - ssize_t ret; - - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); - } - - ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); - return ret; -} - -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - -static const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = blkdev_readpage, - .readahead = blkdev_readahead, - .writepage = blkdev_writepage, - .write_begin = blkdev_write_begin, - .write_end = blkdev_write_end, - .writepages = blkdev_writepages, - .direct_IO = blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, - .is_dirty_writeback = buffer_check_dirty_writeback, -}; - -#define BLKDEV_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) - -static long blkdev_fallocate(struct file *file, int mode, loff_t start, - loff_t len) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - loff_t end = start + len - 1; - loff_t isize; - int error; - - /* Fail if we don't recognize the flags. */ - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - - /* Don't go off the end of the device. */ - isize = i_size_read(bdev->bd_inode); - if (start >= isize) - return -EINVAL; - if (end >= isize) { - if (mode & FALLOC_FL_KEEP_SIZE) { - len = isize - start; - end = start + len - 1; - } else - return -EINVAL; - } - - /* - * Don't allow IO that isn't aligned to logical block size. - */ - if ((start | len) & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - /* Invalidate the page cache, including dirty pages. 
*/ - error = truncate_bdev_range(bdev, file->f_mode, start, end); - if (error) - return error; - - switch (mode) { - case FALLOC_FL_ZERO_RANGE: - case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); - break; - default: - return -EOPNOTSUPP; - } - if (error) - return error; - - /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... - */ - return truncate_bdev_range(bdev, file->f_mode, start, end); -} - -const struct file_operations def_blk_fops = { - .open = blkdev_open, - .release = blkdev_close, - .llseek = block_llseek, - .read_iter = blkdev_read_iter, - .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, - .mmap = generic_file_mmap, - .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = compat_blkdev_ioctl, -#endif - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = blkdev_fallocate, -}; - /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device From 0dca4462ed0681649fdcd5700a6ddfbaa65fa178 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Sep 2021 16:13:03 +0200 Subject: [PATCH 15/17] block: move fs/block_dev.c to block/bdev.c Move it together with the rest of the block layer. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210907141303.1371844-3-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/core-api/kernel-api.rst | 3 +++ Documentation/filesystems/api-summary.rst | 3 --- MAINTAINERS | 1 - block/Makefile | 2 +- fs/block_dev.c => block/bdev.c | 4 ++-- fs/Makefile | 2 +- fs/internal.h | 2 +- 7 files changed, 8 insertions(+), 9 deletions(-) rename fs/block_dev.c => block/bdev.c (99%) diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst index 2a7444e3a4c2..2e7186805148 100644 --- a/Documentation/core-api/kernel-api.rst +++ b/Documentation/core-api/kernel-api.rst @@ -315,6 +315,9 @@ Block Devices .. kernel-doc:: block/genhd.c :export: +.. kernel-doc:: block/bdev.c + :export: + Char devices ============ diff --git a/Documentation/filesystems/api-summary.rst b/Documentation/filesystems/api-summary.rst index 7e5c04c98619..98db2ea5fa12 100644 --- a/Documentation/filesystems/api-summary.rst +++ b/Documentation/filesystems/api-summary.rst @@ -71,9 +71,6 @@ Other Functions .. kernel-doc:: fs/fs-writeback.c :export: -.. kernel-doc:: fs/block_dev.c - :export: - .. 
kernel-doc:: fs/anon_inodes.c :export: diff --git a/MAINTAINERS b/MAINTAINERS index 64bf202c8c62..2f0d830d549f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3300,7 +3300,6 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git F: block/ F: drivers/block/ -F: fs/block_dev.c F: include/linux/blk* F: kernel/trace/blktrace.c F: lib/sbitmap.c diff --git a/block/Makefile b/block/Makefile index a4d8d149670b..c03b92d8a4bc 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ diff --git a/fs/block_dev.c b/block/bdev.c similarity index 99% rename from fs/block_dev.c rename to block/bdev.c index 84c97e276047..cf2780cb44a7 100644 --- a/fs/block_dev.c +++ b/block/bdev.c @@ -26,8 +26,8 @@ #include #include #include -#include "internal.h" -#include "../block/blk.h" +#include "../fs/internal.h" +#include "blk.h" struct bdev_inode { struct block_device bdev; diff --git a/fs/Makefile b/fs/Makefile index 354e2ba3ee67..a1b89b1dc8fc 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o block_dev.o direct-io.o mpage.o +obj-y += buffer.o direct-io.o mpage.o else obj-y += no-block.o endif diff --git a/fs/internal.h b/fs/internal.h index 68a2ae029a27..3cd065c8a66b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -18,7 +18,7 @@ struct user_namespace; struct pipe_inode_info; /* - * block_dev.c + * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); From 7f2a6a69f7ced6db8220298e0497cf60482a9d4b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 7 Sep 2021 16:03:38 -0700 Subject: [PATCH 16/17] blk-mq: allow 4x BLK_MAX_REQUEST_COUNT at blk_plug for multiple_queues Limiting number of request to BLK_MAX_REQUEST_COUNT at blk_plug hurts performance for large md arrays. [1] shows resync speed of md array drops for md array with more than 16 HDDs. Fix this by allowing more request at plug queue. The multiple_queue flag is used to only apply higher limit to multiple queue cases. [1] https://lore.kernel.org/linux-raid/CAFDAVznS71BXW8Jxv6k9dXc2iR3ysX3iZRBww_rzA8WifBFxGg@mail.gmail.com/ Tested-by: Marcin Wanat Signed-off-by: Song Liu Signed-off-by: Jens Axboe --- block/blk-mq.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 944049982e6e..6b5a509fbaff 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2135,6 +2135,18 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } } +/* + * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 4; + return BLK_MAX_REQUEST_COUNT; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
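/*
 * Illustrative sketch, not part of the patch above: how the raised plug
 * limit is exercised from a submitter's point of view.  A caller batches
 * bios under a blk_plug; blk_mq_submit_bio() appends requests to
 * plug->mq_list and only flushes the list once blk_plug_max_rq_count()
 * requests have accumulated -- BLK_MAX_REQUEST_COUNT for a single-queue
 * plug, four times that once plug->multiple_queues is set (e.g. an md
 * resync fanning writes out to many member disks).  The helper name and
 * the bio array below are hypothetical; the plug API calls are the real
 * ones used throughout the block layer.
 */
static void example_batched_submit(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* queued on plug->mq_list until the per-plug limit hits */
	blk_finish_plug(&plug);	/* flushes whatever is still plugged */
}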
@@ -2231,7 +2243,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) else last = list_entry_rq(plug->mq_list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && + if (request_count >= blk_plug_max_rq_count(plug) || (last && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_flush_plug_list(plug, false); trace_block_plug(q); From 221e8360834c59f0c9952630fa5904a94ebd2bb8 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 9 Sep 2021 17:06:08 +0800 Subject: [PATCH 17/17] n64cart: fix return value check in n64cart_probe() In case of error, the function devm_platform_ioremap_resource() returns ERR_PTR() and never returns NULL. The NULL test in the return value check should be replaced with IS_ERR(). Fixes: d9b2a2bbbb4d ("block: Add n64 cart driver") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20210909090608.2989716-1-yangyingliang@huawei.com Signed-off-by: Jens Axboe --- drivers/block/n64cart.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index c84be0028f63..26798da661bd 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -129,8 +129,8 @@ static int __init n64cart_probe(struct platform_device *pdev) } reg_base = devm_platform_ioremap_resource(pdev, 0); - if (!reg_base) - return -EINVAL; + if (IS_ERR(reg_base)) + return PTR_ERR(reg_base); disk = blk_alloc_disk(NUMA_NO_NODE); if (!disk)
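/*
 * Illustrative sketch, not part of the n64cart patch above, of the
 * error-handling convention the fix relies on: devm_platform_ioremap_resource()
 * reports failure through an ERR_PTR()-encoded pointer and never returns
 * NULL, so callers must test with IS_ERR() and propagate PTR_ERR().
 * The probe function and register write below are hypothetical.
 */
static int example_probe(struct platform_device *pdev)
{
	void __iomem *base;

	base = devm_platform_ioremap_resource(pdev, 0);
	if (IS_ERR(base))		/* e.g. -EINVAL/-ENOMEM encoded in the pointer */
		return PTR_ERR(base);

	writel(0, base);		/* mapping is valid from here on */
	return 0;
}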