From 34485c37ea931d4a086701de1d61a986b9707b7d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 7 Mar 2024 08:55:42 -0800 Subject: [PATCH 01/15] nvme: change shutdown timeout setting message User visible messages containing the word "timeout" can be alarming. This one from nvme is just reporting a potentially informative device configuration, and everything is working as designed. Change the text to report the less concerning "D3 entry latency", which is where this value comes from anyway. Reported-by: Len Brown Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2baf5786a92f..0dcaf3973dc4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3233,7 +3233,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) if (ctrl->shutdown_timeout != shutdown_timeout) dev_info(ctrl->device, - "Shutdown timeout set to %u seconds\n", + "D3 entry latency set to %u seconds\n", ctrl->shutdown_timeout); } else ctrl->shutdown_timeout = shutdown_timeout; From 0889d13b9e1cbef49e802ae09f3b516911ad82a1 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 8 Mar 2024 08:11:05 +0100 Subject: [PATCH 02/15] nvmet-tcp: do not continue for invalid icreq When the length check for an icreq sqe fails we should not continue processing but rather return immediately as all other contents of that sqe cannot be relied on. Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/tcp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index c8655fc5aa5b..022d17bd36bf 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -898,6 +898,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) pr_err("bad nvme-tcp pdu length (%d)\n", le32_to_cpu(icreq->hdr.plen)); nvmet_tcp_fatal_error(queue); + return -EPROTO; } if (icreq->pfv != NVME_TCP_PFV_1_0) { From 1843671f86e93311e12b7dde72736167a07158fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 8 Mar 2024 09:51:10 +0100 Subject: [PATCH 03/15] nvme-apple: Convert to platform remove callback returning void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The .remove() callback for a platform driver returns an int which makes many driver authors wrongly assume it's possible to do error handling by returning an error code. However the value returned is ignored (apart from emitting a warning) and this typically results in resource leaks. To improve here there is a quest to make the remove callback return void. In the first step of this quest all drivers are converted to .remove_new(), which already returns void. Eventually after all drivers are converted, .remove_new() will be renamed to .remove(). Trivially convert this driver from always returning zero in the remove callback to the void returning variant. Signed-off-by: Uwe Kleine-König Signed-off-by: Keith Busch --- drivers/nvme/host/apple.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index a480cdeac288..dd6ec0865141 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1532,7 +1532,7 @@ put_dev: return ret; } -static int apple_nvme_remove(struct platform_device *pdev) +static void apple_nvme_remove(struct platform_device *pdev) { struct apple_nvme *anv = platform_get_drvdata(pdev); @@ -1547,8 +1547,6 @@ static int apple_nvme_remove(struct platform_device *pdev) apple_rtkit_shutdown(anv->rtk); apple_nvme_detach_genpd(anv); - - return 0; } static void apple_nvme_shutdown(struct platform_device *pdev) @@ -1598,7 +1596,7 @@ static struct platform_driver apple_nvme_driver = { .pm = pm_sleep_ptr(&apple_nvme_pm_ops), }, .probe = apple_nvme_probe, - .remove = apple_nvme_remove, + .remove_new = apple_nvme_remove, .shutdown = apple_nvme_shutdown, }; module_platform_driver(apple_nvme_driver); From 8fc3b0f1f47b8b6dd2801deeccae59a3b46c4c39 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 28 Feb 2024 16:37:54 +0800 Subject: [PATCH 04/15] nvmet: add tracing of authentication commands Add nvme_fabrics_type_auth_send and nvme_fabrics_type_auth_receive to the nvme target's tracing facility. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index 6ee1f3db81d0..ca537fae3730 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -176,6 +176,34 @@ static const char *nvmet_trace_fabrics_property_get(struct trace_seq *p, return ret; } +static const char *nvmet_trace_fabrics_auth_send(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 tl = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, tl=%u", + spsp0, spsp1, secp, tl); + trace_seq_putc(p, 0); + return ret; +} + +static const char *nvmet_trace_fabrics_auth_receive(struct trace_seq *p, u8 *spc) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 spsp0 = spc[1]; + u8 spsp1 = spc[2]; + u8 secp = spc[3]; + u32 al = get_unaligned_le32(spc + 4); + + trace_seq_printf(p, "spsp0=%02x, spsp1=%02x, secp=%02x, al=%u", + spsp0, spsp1, secp, al); + trace_seq_putc(p, 0); + return ret; +} + static const char *nvmet_trace_fabrics_common(struct trace_seq *p, u8 *spc) { const char *ret = trace_seq_buffer_ptr(p); @@ -195,6 +223,10 @@ const char *nvmet_trace_parse_fabrics_cmd(struct trace_seq *p, return nvmet_trace_fabrics_connect(p, spc); case nvme_fabrics_type_property_get: return nvmet_trace_fabrics_property_get(p, spc); + case nvme_fabrics_type_auth_send: + return nvmet_trace_fabrics_auth_send(p, spc); + case nvme_fabrics_type_auth_receive: + return nvmet_trace_fabrics_auth_receive(p, spc); default: return nvmet_trace_fabrics_common(p, spc); } From 2bc91743096756d7c97d10c7079617192211369b Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 28 Feb 2024 16:37:55 +0800 Subject: [PATCH 05/15] nvmet: add tracing of zns commands Add nvme_cmd_zone_append, nvme_cmd_zone_mgmt_send and nvme_cmd_zone_mgmt_recv parse to nvme target tracing. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/trace.c | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/nvme/target/trace.c b/drivers/nvme/target/trace.c index ca537fae3730..8d1806a82887 100644 --- a/drivers/nvme/target/trace.c +++ b/drivers/nvme/target/trace.c @@ -119,6 +119,67 @@ const char *nvmet_trace_parse_admin_cmd(struct trace_seq *p, } } +static const char *nvmet_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) +{ + static const char * const zsa_strs[] = { + [0x01] = "close zone", + [0x02] = "finish zone", + [0x03] = "open zone", + [0x04] = "reset zone", + [0x05] = "offline zone", + [0x10] = "set zone descriptor extension" + }; + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + const char *zsa_str; + u8 zsa = cdw10[12]; + u8 all = cdw10[13]; + + if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa]) + zsa_str = zsa_strs[zsa]; + else + zsa_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u", + slba, zsa, zsa_str, all); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvmet_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) +{ + static const char * const zrasf_strs[] = { + [0x00] = "list all zones", + [0x01] = "list the zones in the ZSE: Empty state", + [0x02] = "list the zones in the ZSIO: Implicitly Opened state", + [0x03] = "list the zones in the ZSEO: Explicitly Opened state", + [0x04] = "list the zones in the ZSC: Closed state", + [0x05] = "list the zones in the ZSF: Full state", + [0x06] = "list the zones in the ZSRO: Read Only state", + [0x07] = "list the zones in the ZSO: Offline state", + [0x09] = "list the zones that have the zone attribute" + }; + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 numd = get_unaligned_le32(&cdw10[8]); + u8 zra = cdw10[12]; + u8 zrasf = cdw10[13]; + const char *zrasf_str; + u8 pr = cdw10[14]; + + if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf]) + zrasf_str = zrasf_strs[zrasf]; + else + zrasf_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u", + slba, numd, zra, zrasf, zrasf_str, pr); + trace_seq_putc(p, 0); + + return ret; +} + const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode, u8 *cdw10) { @@ -126,9 +187,14 @@ const char *nvmet_trace_parse_nvm_cmd(struct trace_seq *p, case nvme_cmd_read: case nvme_cmd_write: case nvme_cmd_write_zeroes: + case nvme_cmd_zone_append: return nvmet_trace_read_write(p, cdw10); case nvme_cmd_dsm: return nvmet_trace_dsm(p, cdw10); + case nvme_cmd_zone_mgmt_send: + return nvmet_trace_zone_mgmt_send(p, cdw10); + case nvme_cmd_zone_mgmt_recv: + return nvmet_trace_zone_mgmt_recv(p, cdw10); default: return nvmet_trace_common(p, cdw10); } From de105068fead55ed5c07ade75e9c8e7f86a00d1d Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Mon, 11 Mar 2024 10:09:27 +0800 Subject: [PATCH 06/15] nvme: fix reconnection fail due to reserved tag allocation We found a issue on production environment while using NVMe over RDMA, admin_q reconnect failed forever while remote target and network is ok. After dig into it, we found it may caused by a ABBA deadlock due to tag allocation. In my case, the tag was hold by a keep alive request waiting inside admin_q, as we quiesced admin_q while reset ctrl, so the request maked as idle and will not process before reset success. As fabric_q shares tagset with admin_q, while reconnect remote target, we need a tag for connect command, but the only one reserved tag was held by keep alive command which waiting inside admin_q. As a result, we failed to reconnect admin_q forever. In order to fix this issue, I think we should keep two reserved tags for admin queue. Fixes: ed01fee283a0 ("nvme-fabrics: only reserve a single tag") Signed-off-by: Chunguang Xu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 6 ++++-- drivers/nvme/host/fabrics.h | 7 ------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0dcaf3973dc4..fb55b6ac0385 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4385,7 +4385,8 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, set->ops = ops; set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; if (ctrl->ops->flags & NVME_F_FABRICS) - set->reserved_tags = NVMF_RESERVED_TAGS; + /* Reserved for fabric connect and keep alive */ + set->reserved_tags = 2; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_NO_SCHED; if (ctrl->ops->flags & NVME_F_BLOCKING) @@ -4454,7 +4455,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS) set->reserved_tags = NVME_AQ_DEPTH; else if (ctrl->ops->flags & NVME_F_FABRICS) - set->reserved_tags = NVMF_RESERVED_TAGS; + /* Reserved for fabric connect */ + set->reserved_tags = 1; set->numa_node = ctrl->numa_node; set->flags = BLK_MQ_F_SHOULD_MERGE; if (ctrl->ops->flags & NVME_F_BLOCKING) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 06cc54851b1b..37c974c38dcb 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -18,13 +18,6 @@ /* default is -1: the fail fast mechanism is disabled */ #define NVMF_DEF_FAIL_FAST_TMO -1 -/* - * Reserved one command for internal usage. This command is used for sending - * the connect command, as well as for the keep alive command on the admin - * queue once live. - */ -#define NVMF_RESERVED_TAGS 1 - /* * Define a host as seen by the target. We allocate one at boot, but also * allow the override it when creating controllers. This is both to provide From dcad6f5f4303a224ac7a5802e7deffd46cf6f6cb Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 13 Mar 2024 10:29:05 +0800 Subject: [PATCH 07/15] nvme: use nvme_disk_is_ns_head helper Use nvme_disk_is_ns_head helper instead of check fops directly, and also drop CONFIG_NVME_MULTIPATH check. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pr.c | 3 +-- drivers/nvme/host/sysfs.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c index fc3eed00f9ff..e05571b2a1b0 100644 --- a/drivers/nvme/host/pr.c +++ b/drivers/nvme/host/pr.c @@ -97,8 +97,7 @@ static int nvme_sc_to_pr_err(int nvme_sc) static int nvme_send_pr_command(struct block_device *bdev, struct nvme_command *c, void *data, unsigned int data_len) { - if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - nvme_disk_is_ns_head(bdev->bd_disk)) + if (nvme_disk_is_ns_head(bdev->bd_disk)) return nvme_send_ns_head_pr_command(bdev, c, data, data_len); return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c, data, diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 6c7f1d5c056f..243ebc4d471a 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -236,8 +236,7 @@ static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, struct block_device *bdev = disk->part0; int ret; - if (IS_ENABLED(CONFIG_NVME_MULTIPATH) && - bdev->bd_disk->fops == &nvme_ns_head_ops) + if (nvme_disk_is_ns_head(bdev->bd_disk)) ret = ns_head_update_nuse(head); else ret = ns_update_nuse(bdev->bd_disk->private_data); From 8d539f755c316dd38c8c43c0c25d1b9d26a3b003 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Tue, 12 Mar 2024 15:52:43 +0800 Subject: [PATCH 08/15] nvme: parse zns command's zsa and zrasf to string Parse zone mgmt send commands's zsa and receive command's zrasf to string to make the trace log more human-readable. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 1c36fcedea20..5e94b3c6d58a 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -164,12 +164,27 @@ static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) { + static const char * const zsa_strs[] = { + [0x01] = "close zone", + [0x02] = "finish zone", + [0x03] = "open zone", + [0x04] = "reset zone", + [0x05] = "offline zone", + [0x10] = "set zone descriptor extension" + }; const char *ret = trace_seq_buffer_ptr(p); u64 slba = get_unaligned_le64(cdw10); + const char *zsa_str; u8 zsa = cdw10[12]; u8 all = cdw10[13]; - trace_seq_printf(p, "slba=%llu, zsa=%u, all=%u", slba, zsa, all); + if (zsa < ARRAY_SIZE(zsa_strs) && zsa_strs[zsa]) + zsa_str = zsa_strs[zsa]; + else + zsa_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, zsa=%u:%s, all=%u", + slba, zsa, zsa_str, all); trace_seq_putc(p, 0); return ret; @@ -177,15 +192,32 @@ static const char *nvme_trace_zone_mgmt_send(struct trace_seq *p, u8 *cdw10) static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) { + static const char * const zrasf_strs[] = { + [0x00] = "list all zones", + [0x01] = "list the zones in the ZSE: Empty state", + [0x02] = "list the zones in the ZSIO: Implicitly Opened state", + [0x03] = "list the zones in the ZSEO: Explicitly Opened state", + [0x04] = "list the zones in the ZSC: Closed state", + [0x05] = "list the zones in the ZSF: Full state", + [0x06] = "list the zones in the ZSRO: Read Only state", + [0x07] = "list the zones in the ZSO: Offline state", + [0x09] = "list the zones that have the zone attribute" + }; const char *ret = trace_seq_buffer_ptr(p); u64 slba = get_unaligned_le64(cdw10); u32 numd = get_unaligned_le32(cdw10 + 8); u8 zra = cdw10[12]; u8 zrasf = cdw10[13]; + const char *zrasf_str; u8 pr = cdw10[14]; - trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u, pr=%u", - slba, numd, zra, zrasf, pr); + if (zrasf < ARRAY_SIZE(zrasf_strs) && zrasf_strs[zrasf]) + zrasf_str = zrasf_strs[zrasf]; + else + zrasf_str = "reserved"; + + trace_seq_printf(p, "slba=%llu, numd=%u, zra=%u, zrasf=%u:%s, pr=%u", + slba, numd, zra, zrasf, zrasf_str, pr); trace_seq_putc(p, 0); return ret; From 6a0164f9f4a0b629fd7a5414d6c7d21999141b5e Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 31 Jan 2024 17:12:20 +0800 Subject: [PATCH 09/15] nvme: add tracing of reservation commands Add detailed parsing of reservation commands to make the trace log more consistent and human-readable. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 62 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 5e94b3c6d58a..7e6719a3b624 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -223,6 +223,60 @@ static const char *nvme_trace_zone_mgmt_recv(struct trace_seq *p, u8 *cdw10) return ret; } +static const char *nvme_trace_resv_reg(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 rrega = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 ptpl = (cdw10[3] >> 6) & 0x3; + + trace_seq_printf(p, "rrega=%u, iekey=%u, ptpl=%u", + rrega, iekey, ptpl); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_acq(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 racqa = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + + trace_seq_printf(p, "racqa=%u, iekey=%u, rtype=%u", + racqa, iekey, rtype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_rel(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u8 rrela = cdw10[0] & 0x7; + u8 iekey = (cdw10[0] >> 3) & 0x1; + u8 rtype = cdw10[1]; + + trace_seq_printf(p, "rrela=%u, iekey=%u, rtype=%u", + rrela, iekey, rtype); + trace_seq_putc(p, 0); + + return ret; +} + +static const char *nvme_trace_resv_report(struct trace_seq *p, u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u32 numd = get_unaligned_le32(cdw10); + u8 eds = cdw10[4] & 0x1; + + trace_seq_printf(p, "numd=%u, eds=%u", numd, eds); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -275,6 +329,14 @@ const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, return nvme_trace_zone_mgmt_send(p, cdw10); case nvme_cmd_zone_mgmt_recv: return nvme_trace_zone_mgmt_recv(p, cdw10); + case nvme_cmd_resv_register: + return nvme_trace_resv_reg(p, cdw10); + case nvme_cmd_resv_acquire: + return nvme_trace_resv_acq(p, cdw10); + case nvme_cmd_resv_release: + return nvme_trace_resv_rel(p, cdw10); + case nvme_cmd_resv_report: + return nvme_trace_resv_report(p, cdw10); default: return nvme_trace_common(p, cdw10); } From 798edad968ace3c15c3180ba72e427630022e2d9 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 31 Jan 2024 17:12:22 +0800 Subject: [PATCH 10/15] nvme: parse format command's lbafu when tracing Add the parse of format command's lbafu to calculate lbaf. Signed-off-by: Guixin Liu Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 7e6719a3b624..0288315f0050 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -119,7 +119,10 @@ static const char *nvme_trace_get_lba_status(struct trace_seq *p, static const char *nvme_trace_admin_format_nvm(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); - u8 lbaf = cdw10[0] & 0xF; + /* + * lbafu(bit 13:12) is already in the upper 4 bits, lbafl: bit 03:00. + */ + u8 lbaf = (cdw10[1] & 0x30) | (cdw10[0] & 0xF); u8 mset = (cdw10[0] >> 4) & 0x1; u8 pi = (cdw10[0] >> 5) & 0x7; u8 pil = cdw10[1] & 0x1; From e89086c43f0500bc7c4ce225495b73b8ce234c1f Mon Sep 17 00:00:00 2001 From: "Jiawei Fu (iBug)" Date: Sat, 16 Mar 2024 03:27:49 +0800 Subject: [PATCH 11/15] drivers/nvme: Add quirks for device 126f:2262 This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for device [126f:2262], which appears to be a generic VID:PID pair used for many SSDs based on the Silicon Motion SM2262/SM2262EN controller. Two of my SSDs with this VID:PID pair exhibit the same behavior: * They frequently have trouble exiting the deepest power state (5), resulting in the entire disk unresponsive. Verified by setting nvme_core.default_ps_max_latency_us=10000 and observing them behaving normally. * They produce all-zero nguid and eui64 with `nvme id-ns` command. The offending products are: * HP SSD EX950 1TB * HIKVISION C2000Pro 2TB Signed-off-by: Jiawei Fu Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6267a6aa380..8e0bb9692685 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3363,6 +3363,9 @@ static const struct pci_device_id nvme_id_table[] = { NVME_QUIRK_BOGUS_NID, }, { PCI_VDEVICE(REDHAT, 0x0010), /* Qemu emulated controller */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x126f, 0x2262), /* Silicon Motion generic */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS | + NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x126f, 0x2263), /* Silicon Motion unidentified */ .driver_data = NVME_QUIRK_NO_NS_DESC_LIST | NVME_QUIRK_BOGUS_NID, }, From ec58afb49e90d6fd468b0e21d2de324dff1a711c Mon Sep 17 00:00:00 2001 From: Li Feng Date: Wed, 13 Mar 2024 20:38:09 +0800 Subject: [PATCH 12/15] nvme-tcp: Export the nvme_tcp_wq to sysfs Make the workqueue userspace visible for easy viewing and configuration. Signed-off-by: Li Feng Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index a6d596e05602..2ec1186db0a3 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2800,7 +2800,7 @@ static int __init nvme_tcp_init_module(void) BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0); if (!nvme_tcp_wq) return -ENOMEM; From 0c29f9fa46bbe4fdc218134823d80cf9934ef231 Mon Sep 17 00:00:00 2001 From: Li Feng Date: Wed, 13 Mar 2024 20:38:10 +0800 Subject: [PATCH 13/15] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq The default nvme_tcp_wq will use all CPUs to process tasks. Sometimes it is necessary to set CPU affinity to improve performance. A new module parameter wq_unbound is added here. If set to true, users can configure cpu affinity through /sys/devices/virtual/workqueue/nvme_tcp_wq/cpumask. Signed-off-by: Li Feng Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2ec1186db0a3..34a882b2ec53 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -36,6 +36,14 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); +/* + * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity + * from sysfs. + */ +static bool wq_unbound; +module_param(wq_unbound, bool, 0644); +MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)"); + /* * TLS handshake timeout */ @@ -1551,7 +1559,10 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) else if (nvme_tcp_poll_queue(queue)) n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - ctrl->io_queues[HCTX_TYPE_READ] - 1; - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + if (wq_unbound) + queue->io_cpu = WORK_CPU_UNBOUND; + else + queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); } static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) @@ -2790,6 +2801,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { static int __init nvme_tcp_init_module(void) { + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); @@ -2799,8 +2812,10 @@ static int __init nvme_tcp_init_module(void) BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); - nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0); + if (wq_unbound) + wq_flags |= WQ_UNBOUND; + + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); if (!nvme_tcp_wq) return -ENOMEM; From 1e1c4bd16e385f0b1b39920ce3aa16bb33fcdb27 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Mar 2024 17:19:49 +0800 Subject: [PATCH 14/15] nvme: remove redundant BUILD_BUG_ON check Remove redundant BUILD_BUG_ON check of struct nvme_dsm_range, it's already checked in nvme_init_ctrl(). Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index fb55b6ac0385..212005933782 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1803,9 +1803,6 @@ static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) { struct nvme_ctrl *ctrl = ns->ctrl; - BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < - NVME_DSM_MAX_RANGES); - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) lim->max_hw_discard_sectors = nvme_lba_to_sect(ns->head, ctrl->dmrsl); From 910934da9444dbb102294796481ab05e4419d311 Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Wed, 20 Mar 2024 19:08:21 +0800 Subject: [PATCH 15/15] nvmet-rdma: remove NVMET_RDMA_REQ_INVALIDATE_RKEY flag We can simply use invalidate_rkey to check instead of adding a flag. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index f2bb9d95ecf4..5b8c63e74639 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -53,7 +53,6 @@ struct nvmet_rdma_cmd { enum { NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), - NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), }; struct nvmet_rdma_rsp { @@ -722,7 +721,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req) struct rdma_cm_id *cm_id = rsp->queue->cm_id; struct ib_send_wr *first_wr; - if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + if (rsp->invalidate_rkey) { rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; } else { @@ -905,10 +904,8 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, goto error_out; rsp->n_rdma += ret; - if (invalidate) { + if (invalidate) rsp->invalidate_rkey = key; - rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; - } return 0; @@ -1047,6 +1044,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) rsp->req.cmd = cmd->nvme_cmd; rsp->req.port = queue->port; rsp->n_rdma = 0; + rsp->invalidate_rkey = 0; if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { unsigned long flags;