From 4b8dbdfbc5f650095a8e105998e7a84b4d212495 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Apr 2022 15:46:01 -0400 Subject: [PATCH 01/46] SUNRPC: Fix an RPC/RDMA performance regression Use the standard gfp mask instead of using GFP_NOWAIT. The latter causes issues when under memory pressure. Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 11 ++++------- net/sunrpc/sched.c | 1 + net/sunrpc/xprtrdma/transport.c | 6 +----- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index de7e5b41ab8f..a31a27816cc0 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1340,14 +1340,11 @@ gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) /* * Lookup RPCSEC_GSS cred for the current process */ -static struct rpc_cred * -gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +static struct rpc_cred *gss_lookup_cred(struct rpc_auth *auth, + struct auth_cred *acred, int flags) { - gfp_t gfp = GFP_KERNEL; - - if (flags & RPCAUTH_LOOKUP_ASYNC) - gfp = GFP_NOWAIT | __GFP_NOWARN; - return rpcauth_lookup_credcache(auth, acred, flags, gfp); + return rpcauth_lookup_credcache(auth, acred, flags, + rpc_task_gfp_mask()); } static struct rpc_cred * diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 7f70c1e608b7..25b9221950ff 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -63,6 +63,7 @@ gfp_t rpc_task_gfp_mask(void) return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; return GFP_KERNEL; } +EXPORT_SYMBOL_GPL(rpc_task_gfp_mask); unsigned long rpc_task_timeout(const struct rpc_task *task) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 6b7e10e5a141..bcb37b51adf6 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -571,11 +571,7 @@ xprt_rdma_allocate(struct rpc_task *task) struct rpc_rqst *rqst = task->tk_rqstp; struct rpcrdma_xprt *r_xprt 
= rpcx_to_rdmax(rqst->rq_xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - gfp_t flags; - - flags = RPCRDMA_DEF_GFP; - if (RPC_IS_ASYNC(task)) - flags = GFP_NOWAIT | __GFP_NOWARN; + gfp_t flags = rpc_task_gfp_mask(); if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize, flags)) From 9597152d98840c2517230740952df97cfcc07e2f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 16:37:56 -0400 Subject: [PATCH 02/46] Revert "pNFS: nfs3_set_ds_client should set NFS_CS_NOPING" This reverts commit c6eb58435b98bd843d3179664a0195ff25adb2c3. If a transport is down, then we want to fail over to other transports if they are listed in the GETDEVICEINFO reply. Fixes: c6eb58435b98 ("pNFS: nfs3_set_ds_client should set NFS_CS_NOPING") Cc: stable@vger.kernel.org # 5.11.x Signed-off-by: Trond Myklebust --- fs/nfs/nfs3client.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index 5601e47360c2..b49359afac88 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -108,7 +108,6 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); - __set_bit(NFS_CS_NOPING, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); /* Use the MDS nfs_client cl_ipaddr. */ From 7836d75467e9d214bdf5c693b32721de729a6e38 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 16:09:06 -0400 Subject: [PATCH 03/46] pNFS/flexfiles: Report RDMA connection errors to the server The RPC/RDMA driver will return -EPROTO and -ENODEV as connection errors under certain circumstances. Make sure that we handle them and report them to the server. If not, we can end up cycling forever in a LAYOUTGET/LAYOUTRETURN loop. 
Fixes: a12f996d3413 ("NFSv4/pNFS: Use connections to a DS that are all of the same protocol family") Cc: stable@vger.kernel.org # 5.11.x Signed-off-by: Trond Myklebust --- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 604be402ae13..7d285561e59f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1131,6 +1131,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_delete_deviceid(devid->ld, devid->nfs_client, @@ -1236,6 +1238,8 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ENOBUFS: case -EPIPE: case -EPERM: + case -EPROTO: + case -ENODEV: *op_status = status = NFS4ERR_NXIO; break; case -EACCES: From 431794e67e238e6fd170499a14fd2abf0a16b5bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 May 2022 17:08:58 -0400 Subject: [PATCH 04/46] pNFS/files: Handle RDMA connection errors correctly The RPC/RDMA driver will return -EPROTO and -ENODEV as connection errors under certain circumstances. Make sure that we handle them correctly and avoid cycling forever in a LAYOUTGET/LAYOUTRETURN loop. 
Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 2b2661582bbe..ad34a33b0737 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -181,6 +181,8 @@ static int filelayout_async_handle_error(struct rpc_task *task, case -EIO: case -ETIMEDOUT: case -EPIPE: + case -EPROTO: + case -ENODEV: dprintk("%s DS connection error %d\n", __func__, task->tk_status); nfs4_mark_deviceid_unavailable(devid); From 6ca0a6f834ed06b2b4c6d1f7f162f2b0d3e196cf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 27 Jun 2022 16:04:02 -0400 Subject: [PATCH 05/46] NFS: Fix case insensitive renames For filesystems that are case insensitive and case preserving, we need to be able to rename from one case folded variant of the filename to another. Currently, if we have looked up the target filename before the call to rename, then we may have a hashed dentry with that target name in the dcache, causing the vfs to optimise away the rename. To avoid that, let's drop the target dentry, and leave it to the server to optimise away the rename if that is the correct thing to do. 
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0c4e8dd6aa96..d9d277d7fa84 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1739,6 +1739,10 @@ nfs_do_lookup_revalidate(struct inode *dir, struct dentry *dentry, goto out_bad; } + if ((flags & LOOKUP_RENAME_TARGET) && d_count(dentry) < 2 && + nfs_server_capable(dir, NFS_CAP_CASE_INSENSITIVE)) + goto out_bad; + if (nfs_verifier_is_delegated(dentry)) return nfs_lookup_revalidate_delegated(dir, dentry, inode); From f07a5d2427fc113dc50c5c818eba8929bc27b8ca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2022 09:16:04 -0400 Subject: [PATCH 06/46] NFSv4.1: Don't decrease the value of seq_nr_highest_sent When we're trying to figure out what the server may or may not have seen in terms of request numbers, do not assume that requests with a larger number were missed, just because we saw a reply to a request with a smaller number. Fixes: 3453d5708b33 ("NFSv4.1: Avoid false retries when RPC calls are interrupted") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index bb0e84a46d61..628471d06947 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -784,10 +784,9 @@ static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot, if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0) slot->seq_nr_highest_sent = seqnr; } -static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, - u32 seqnr) +static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr) { - slot->seq_nr_highest_sent = seqnr; + nfs4_slot_sequence_record_sent(slot, seqnr); slot->seq_nr_last_acked = seqnr; } From 7ccafd4b2b9f34e6d8185f796f151c47424e273e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 12 Jul 2022 09:22:40 -0400 Subject: [PATCH 07/46] NFSv4.1: Handle NFS4ERR_DELAY replies to OP_SEQUENCE correctly Don't assume that 
the NFS4ERR_DELAY means that the server is processing this slot id. Fixes: 3453d5708b33 ("NFSv4.1: Avoid false retries when RPC calls are interrupted") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 628471d06947..4e0dcc19ca71 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -853,7 +853,6 @@ static int nfs41_sequence_process(struct rpc_task *task, __func__, slot->slot_nr, slot->seq_nr); - nfs4_slot_sequence_acked(slot, slot->seq_nr); goto out_retry; case -NFS4ERR_RETRY_UNCACHED_REP: case -NFS4ERR_SEQ_FALSE_RETRY: From f931d8374cad3dc09d0f6e3f76689fdb3f104c1a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Jun 2022 15:58:22 +0200 Subject: [PATCH 08/46] nfs/blocklayout: refactor block device opening Deduplicate the helpers to open a device node by passing a name prefix argument and using the same helper for both kinds of paths. Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 42 +++++++++++----------------------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5e56da748b2a..fea5f8821da5 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -301,18 +301,14 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -/* - * Try to open the udev path for the WWN. At least on Debian the udev - * by-id path will always point to the dm-multipath device if one exists. 
- */ static struct block_device * -bl_open_udev_path(struct pnfs_block_volume *v) +bl_open_path(struct pnfs_block_volume *v, const char *prefix) { struct block_device *bdev; const char *devname; - devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN", - v->scsi.designator_len, v->scsi.designator); + devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", + prefix, v->scsi.designator_len, v->scsi.designator); if (!devname) return ERR_PTR(-ENOMEM); @@ -326,28 +322,6 @@ bl_open_udev_path(struct pnfs_block_volume *v) return bdev; } -/* - * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the - * wwn- links will only point to the first discovered SCSI device there. - */ -static struct block_device * -bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v) -{ - struct block_device *bdev; - const char *devname; - - devname = kasprintf(GFP_KERNEL, - "/dev/disk/by-id/dm-uuid-mpath-%d%*phN", - v->scsi.designator_type, - v->scsi.designator_len, v->scsi.designator); - if (!devname) - return ERR_PTR(-ENOMEM); - - bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL); - kfree(devname); - return bdev; -} - static int bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) @@ -360,9 +334,15 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, if (!bl_validate_designator(v)) return -EINVAL; - bdev = bl_open_dm_mpath_udev_path(v); + /* + * Try to open the RH/Fedora specific dm-mpath udev path first, as the + * wwn- links will only point to the first discovered SCSI device there. + * On other distributions like Debian, the default SCSI by-id path will + * point to the dm-multipath device if one exists. 
+ */ + bdev = bl_open_path(v, "dm-uuid-mpath-0x"); if (IS_ERR(bdev)) - bdev = bl_open_udev_path(v); + bdev = bl_open_path(v, "wwn-0x"); if (IS_ERR(bdev)) return PTR_ERR(bdev); d->bdev = bdev; From 064109db53ecc5d88621d02f36da9f33ca0d64bd Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Thu, 23 Jun 2022 09:58:58 +0800 Subject: [PATCH 09/46] NFS: remove redundant code in nfs_file_write() filemap_fdatawait_range() will always return 0, after patch 6c984083ec24 ("NFS: Use of mapping_set_error() results in spurious errors"), it will not save the wb err in struct address_space->flags: result = filemap_fdatawait_range(file->f_mapping, ...) = 0 filemap_check_errors(mapping) = 0 test_bit(..., &mapping->flags) // flags is 0 Signed-off-by: ChenXiaoSong Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d72b1b7ed74..54237a231687 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -663,8 +663,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) result = filemap_fdatawait_range(file->f_mapping, iocb->ki_pos - written, iocb->ki_pos - 1); - if (result < 0) - goto out; } result = generic_write_sync(iocb, written); if (result < 0) From c77c738c37d0fa8380a671613630298d71099180 Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Tue, 28 Jun 2022 20:24:26 +0200 Subject: [PATCH 10/46] nfs: Replace kmap() with kmap_local_page() The use of kmap() is being deprecated in favor of kmap_local_page(). With kmap_local_page(), the mapping is per thread, CPU local and not globally visible. Furthermore, the mapping can be acquired from any context (including interrupts). Therefore, use kmap_local_page() in nfs_do_filldir() because this mapping is per thread, CPU local, and not globally visible. Suggested-by: Ira Weiny Signed-off-by: Fabio M. 
De Francesco Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9d277d7fa84..7b2230297f6b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1084,7 +1084,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, struct nfs_cache_array *array; unsigned int i; - array = kmap(desc->page); + array = kmap_local_page(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -1110,7 +1110,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, if (array->page_is_eof) desc->eof = !desc->eob; - kunmap(desc->page); + kunmap_local(array); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %llu\n", (unsigned long long)desc->dir_cookie); } From 8b4e87a1d68f5ae440c42c15c238fd964fd381d0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Sat, 2 Jul 2022 08:23:47 +0800 Subject: [PATCH 11/46] nfs: fix port value parsing The valid values of nfs options port and mountport are 0 to USHRT_MAX. The fs parser will return a fail for port values that are negative and the sloppy option handling then returns success. But the sloppy option handling is meant to return success for invalid options not valid options with invalid values. Restricting the sloppy option override to handle failure returns for invalid options only is sufficient to resolve this problem. Changes: v2: utilize the return value from fs_parse() to resolve this problem instead of changing the parameter definitions. 
Suggested-by: Trond Myklebust Signed-off-by: Ian Kent Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9a16897e8dc6..8f1f9b4af89d 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -484,7 +484,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) - return ctx->sloppy ? 1 : opt; + return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt; if (fc->security) ctx->has_sec_mnt_opts = 1; From f1bafa7375c01ff71fb7cb97c06caadfcfe815f3 Mon Sep 17 00:00:00 2001 From: Dan Aloni Date: Mon, 4 Jul 2022 15:56:57 +0300 Subject: [PATCH 12/46] sunrpc: fix expiry of auth creds Before this commit, with a large enough LRU of expired items (100), the loop skipped all the expired items and was entirely ineffectual in trimming the LRU list. Fixes: 95cd623250ad ('SUNRPC: Clean up the AUTH cache code') Signed-off-by: Dan Aloni Signed-off-by: Trond Myklebust --- net/sunrpc/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 682fcd24bf43..2324d1e58f21 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -445,7 +445,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (!time_in_range(cred->cr_expire, expired, jiffies)) + if (time_in_range(cred->cr_expire, expired, jiffies)) continue; if (!rpcauth_unhash_cred(cred)) continue; From 940261a195080cf1cdcd56948d363fe363b69da1 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Fri, 17 Jun 2022 16:23:36 -0400 Subject: [PATCH 13/46] NFS: Allow setting rsize / wsize to a multiple of PAGE_SIZE Previously, we required this value to be a power of 2 for UDP related reasons. 
This patch keeps the power of 2 rule for UDP but allows more flexibility for TCP and RDMA. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 +++++++------ fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 ++++-- fs/nfs/internal.h | 18 ++++++++++++++++++ fs/nfs/nfs4client.c | 4 ++-- 4 files changed, 31 insertions(+), 10 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index e828504cc396..da8da5cdbbc1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -708,9 +708,9 @@ static int nfs_init_server(struct nfs_server *server, } if (ctx->rsize) - server->rsize = nfs_block_size(ctx->rsize, NULL); + server->rsize = nfs_io_size(ctx->rsize, clp->cl_proto); if (ctx->wsize) - server->wsize = nfs_block_size(ctx->wsize, NULL); + server->wsize = nfs_io_size(ctx->wsize, clp->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; @@ -755,18 +755,19 @@ static int nfs_init_server(struct nfs_server *server, static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo *fsinfo) { + struct nfs_client *clp = server->nfs_client; unsigned long max_rpc_payload, raw_max_rpc_payload; /* Work out a lot of parameters */ if (server->rsize == 0) - server->rsize = nfs_block_size(fsinfo->rtpref, NULL); + server->rsize = nfs_io_size(fsinfo->rtpref, clp->cl_proto); if (server->wsize == 0) - server->wsize = nfs_block_size(fsinfo->wtpref, NULL); + server->wsize = nfs_io_size(fsinfo->wtpref, clp->cl_proto); if (fsinfo->rtmax >= 512 && server->rsize > fsinfo->rtmax) - server->rsize = nfs_block_size(fsinfo->rtmax, NULL); + server->rsize = nfs_io_size(fsinfo->rtmax, clp->cl_proto); if (fsinfo->wtmax >= 512 && server->wsize > fsinfo->wtmax) - server->wsize = nfs_block_size(fsinfo->wtmax, NULL); + server->wsize = nfs_io_size(fsinfo->wtmax, clp->cl_proto); raw_max_rpc_payload = rpc_max_payload(server->client); max_rpc_payload = nfs_block_size(raw_max_rpc_payload, NULL); diff --git 
a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bfa7202ca7be..e028f5a0ef5f 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -113,8 +113,10 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_drain_dsaddrs; ds_versions[i].version = be32_to_cpup(p++); ds_versions[i].minor_version = be32_to_cpup(p++); - ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); - ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); + ds_versions[i].rsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); + ds_versions[i].wsize = nfs_io_size(be32_to_cpup(p++), + server->nfs_client->cl_proto); ds_versions[i].tightly_coupled = be32_to_cpup(p); if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 8f8cd6e2d4db..af6d261241ff 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -704,6 +704,24 @@ unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) return nfs_block_bits(bsize, nrbitsp); } +/* + * Compute and set NFS server rsize / wsize + */ +static inline +unsigned long nfs_io_size(unsigned long iosize, enum xprt_transports proto) +{ + if (iosize < NFS_MIN_FILE_IO_SIZE) + iosize = NFS_DEF_FILE_IO_SIZE; + else if (iosize >= NFS_MAX_FILE_IO_SIZE) + iosize = NFS_MAX_FILE_IO_SIZE; + else + iosize = iosize & PAGE_MASK; + + if (proto == XPRT_TRANSPORT_UDP) + return nfs_block_bits(iosize, NULL); + return iosize; +} + /* * Determine the maximum file size for a superblock */ diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 47a6cf892c95..3c5678aec006 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -1161,9 +1161,9 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) return error; if (ctx->rsize) - server->rsize = nfs_block_size(ctx->rsize, NULL); + server->rsize = nfs_io_size(ctx->rsize, 
server->nfs_client->cl_proto); if (ctx->wsize) - server->wsize = nfs_block_size(ctx->wsize, NULL); + server->wsize = nfs_io_size(ctx->wsize, server->nfs_client->cl_proto); server->acregmin = ctx->acregmin * HZ; server->acregmax = ctx->acregmax * HZ; From 51fd2eb52c0ca8275a906eed81878ef50ae94eb0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 13 Jul 2022 17:46:52 -0400 Subject: [PATCH 14/46] NFSv4: Fix races in the legacy idmapper upcall nfs_idmap_instantiate() will cause the process that is waiting in request_key_with_auxdata() to wake up and exit. If there is a second process waiting for the idmap->idmap_mutex, then it may wake up and start a new call to request_key_with_auxdata(). If the call to idmap_pipe_downcall() from the first process has not yet finished calling nfs_idmap_complete_pipe_upcall_locked(), then we may end up triggering the WARN_ON_ONCE() in nfs_idmap_prepare_pipe_upcall(). The fix is to ensure that we clear idmap->idmap_upcall_data before calling nfs_idmap_instantiate(). 
Fixes: e9ab41b620e4 ("NFSv4: Clean up the legacy idmapper upcall") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4idmap.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index f331866dd418..ec6afd3c4bca 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -561,22 +561,20 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, return true; } -static void -nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) +static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, + int ret) { - struct key *authkey = idmap->idmap_upcall_data->authkey; - - kfree(idmap->idmap_upcall_data); - idmap->idmap_upcall_data = NULL; - complete_request_key(authkey, ret); - key_put(authkey); + complete_request_key(data->authkey, ret); + key_put(data->authkey); + kfree(data); } -static void -nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) +static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, + struct idmap_legacy_upcalldata *data, + int ret) { - if (idmap->idmap_upcall_data != NULL) - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) + nfs_idmap_complete_pipe_upcall(data, ret); } static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) @@ -613,7 +611,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) - nfs_idmap_abort_pipe_upcall(idmap, ret); + nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: @@ -669,6 +667,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_legacy_upcalldata *data; struct key *authkey; struct idmap_msg im; size_t namelen_in; @@ -678,10 +677,11 @@ idmap_pipe_downcall(struct 
file *filp, const char __user *src, size_t mlen) * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. */ - if (idmap->idmap_upcall_data == NULL) + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data == NULL) goto out_noupcall; - authkey = idmap->idmap_upcall_data->authkey; + authkey = data->authkey; rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { @@ -703,18 +703,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; -} + } - ret = nfs_idmap_read_and_verify_message(&im, - &idmap->idmap_upcall_data->idmap_msg, - rka->target_key, authkey); + ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, + rka->target_key, authkey); if (ret >= 0) { key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: - nfs_idmap_complete_pipe_upcall_locked(idmap, ret); + nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } @@ -728,7 +727,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct idmap *idmap = data->idmap; if (msg->errno) - nfs_idmap_abort_pipe_upcall(idmap, msg->errno); + nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void @@ -736,8 +735,11 @@ idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; + struct idmap_legacy_upcalldata *data; - nfs_idmap_abort_pipe_upcall(idmap, -EPIPE); + data = xchg(&idmap->idmap_upcall_data, NULL); + if (data) + nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) From ba8ec7a607e98e8491a1fcf924a2e6c96ac9d413 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 23 Jul 2022 14:47:28 -0400 Subject: [PATCH 15/46] SUNRPC: Shrink size of struct rpc_task Move the field 'tk_rpc_status' so that we eliminate a 4 
byte hole in the structure. For x86_64, this shrinks the size of the struct by 8 bytes. 'pahole' output before the change: /* size: 232, cachelines: 4, members: 27 */ /* sum members: 222, holes: 1, sum holes: 4 */ /* sum bitfield members: 8 bits (1 bytes) */ /* padding: 5 */ /* last cacheline: 40 bytes */ 'pahole' output after the change: /* size: 224, cachelines: 4, members: 27 */ /* padding: 1 */ /* last cacheline: 32 bytes */ Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 1d7a3e51b795..acc62647317c 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -61,8 +61,6 @@ struct rpc_task { struct rpc_wait tk_wait; /* RPC wait */ } u; - int tk_rpc_status; /* Result of last RPC operation */ - /* * RPC call state */ @@ -82,6 +80,8 @@ struct rpc_task { ktime_t tk_start; /* RPC task init timestamp */ pid_t tk_owner; /* Process id for batching tasks */ + + int tk_rpc_status; /* Result of last RPC operation */ unsigned short tk_flags; /* misc flags */ unsigned short tk_timeouts; /* maj timeouts */ From 8efc4bbe84a8bdd26e848ed93a8900fad1b44ca2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:18 -0400 Subject: [PATCH 16/46] nfs: add new nfs_direct_req tracepoint events Add some new tracepoints to the DIO write code. 
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 45 +++++++++---------------------- fs/nfs/internal.h | 33 +++++++++++++++++++++++ fs/nfs/nfstrace.h | 69 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 33 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4eb2a8380a28..ad40e81857ee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -60,44 +60,12 @@ #include "iostat.h" #include "pnfs.h" #include "fscache.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS static struct kmem_cache *nfs_direct_cachep; -struct nfs_direct_req { - struct kref kref; /* release manager */ - - /* I/O parameters */ - struct nfs_open_context *ctx; /* file open context info */ - struct nfs_lock_context *l_ctx; /* Lock context info */ - struct kiocb * iocb; /* controlling i/o request */ - struct inode * inode; /* target file of i/o */ - - /* completion state */ - atomic_t io_count; /* i/os we're waiting for */ - spinlock_t lock; /* protect completion state */ - - loff_t io_start; /* Start offset for I/O */ - ssize_t count, /* bytes actually processed */ - max_count, /* max expected count */ - bytes_left, /* bytes left to be sent */ - error; /* any reported error */ - struct completion completion; /* wait for i/o completion */ - - /* commit state */ - struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ - struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ - struct work_struct work; - int flags; - /* for write */ -#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ -#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ - /* for read */ -#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ -#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ -}; - static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; static void 
nfs_direct_write_complete(struct nfs_direct_req *dreq); @@ -595,6 +563,8 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; + trace_nfs_direct_commit_complete(dreq); + if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; @@ -631,6 +601,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, { struct nfs_direct_req *dreq = cinfo->dreq; + trace_nfs_direct_resched_write(dreq); + spin_lock(&dreq->lock); if (dreq->flags != NFS_ODIRECT_DONE) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; @@ -695,6 +667,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { + trace_nfs_direct_write_complete(dreq); queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */ } @@ -705,6 +678,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) struct nfs_page *req = nfs_list_entry(hdr->pages.next); int flags = NFS_ODIRECT_DONE; + trace_nfs_direct_write_completion(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); @@ -759,6 +734,8 @@ static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + trace_nfs_direct_write_reschedule_io(dreq); + spin_lock(&dreq->lock); if (dreq->error == 0) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; @@ -799,6 +776,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + trace_nfs_direct_write_schedule_iovec(dreq); + nfs_pageio_init_write(&desc, inode, ioflags, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index af6d261241ff..0cabc7f07d9a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -877,3 +877,36 @@ static inline void nfs_set_port(struct sockaddr 
*sap, int *port, rpc_set_port(sap, *port); } + +struct nfs_direct_req { + struct kref kref; /* release manager */ + + /* I/O parameters */ + struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ + struct kiocb * iocb; /* controlling i/o request */ + struct inode * inode; /* target file of i/o */ + + /* completion state */ + atomic_t io_count; /* i/os we're waiting for */ + spinlock_t lock; /* protect completion state */ + + loff_t io_start; /* Start offset for I/O */ + ssize_t count, /* bytes actually processed */ + max_count, /* max expected count */ + bytes_left, /* bytes left to be sent */ + error; /* any reported error */ + struct completion completion; /* wait for i/o completion */ + + /* commit state */ + struct nfs_mds_commit_info mds_cinfo; /* Storage for cinfo */ + struct pnfs_ds_commit_info ds_cinfo; /* Storage for cinfo */ + struct work_struct work; + int flags; + /* for write */ +#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */ +#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */ + /* for read */ +#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */ +#define NFS_ODIRECT_DONE INT_MAX /* write verification failed */ +}; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 012bd7339862..65388e4a0cd7 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1576,6 +1576,75 @@ TRACE_EVENT(nfs_commit_done, ) ); +#define nfs_show_direct_req_flags(v) \ + __print_flags(v, "|", \ + { NFS_ODIRECT_DO_COMMIT, "DO_COMMIT" }, \ + { NFS_ODIRECT_RESCHED_WRITES, "RESCHED_WRITES" }, \ + { NFS_ODIRECT_SHOULD_DIRTY, "SHOULD DIRTY" }, \ + { NFS_ODIRECT_DONE, "DONE" } ) + +DECLARE_EVENT_CLASS(nfs_direct_req_class, + TP_PROTO( + const struct nfs_direct_req *dreq + ), + + TP_ARGS(dreq), + + TP_STRUCT__entry( + __field(const struct nfs_direct_req *, dreq) + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(int, ref) + 
__field(loff_t, io_start) + __field(ssize_t, count) + __field(ssize_t, bytes_left) + __field(ssize_t, error) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = dreq->inode; + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dreq = dreq; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->ref = kref_read(&dreq->kref); + __entry->io_start = dreq->io_start; + __entry->count = dreq->count; + __entry->bytes_left = dreq->bytes_left; + __entry->error = dreq->error; + __entry->flags = dreq->flags; + ), + + TP_printk( + "dreq=%p fileid=%02x:%02x:%llu fhandle=0x%08x ref=%d " + "io_start=%lld count=%zd bytes_left=%zd error=%zd flags=%s", + __entry->dreq, MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->ref, + __entry->io_start, __entry->count, __entry->bytes_left, + __entry->error, nfs_show_direct_req_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_DIRECT_REQ_EVENT(name) \ + DEFINE_EVENT(nfs_direct_req_class, name, \ + TP_PROTO( \ + const struct nfs_direct_req *dreq \ + ), \ + TP_ARGS(dreq)) + +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_commit_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_resched_write); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_complete); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); +DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, From 55051c0ced7d322a169f8603d306ee6ec079f8ae Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:19 -0400 Subject: [PATCH 17/46] nfs: always check dreq->error after a commit When the client gets back a short DIO write, it will then attempt to issue another write to finish the DIO request. 
If that write then fails (as is often the case in an -ENOSPC situation), then we still may need to issue a COMMIT if the earlier short write was unstable. If that COMMIT then succeeds, then we don't want the client to reschedule the write requests, and to instead just return a short write. Otherwise, we can end up looping over the same DIO write forever. Always consult dreq->error after a successful RPC, even when the flag state is not NFS_ODIRECT_DONE. Link: https://bugzilla.redhat.com/show_bug.cgi?id=2028370 Reported-by: Boyang Xue Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad40e81857ee..a47d13296194 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -571,8 +571,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) dreq->max_count = 0; dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; - } else if (dreq->flags == NFS_ODIRECT_DONE) + } else { status = dreq->error; + } nfs_init_cinfo_from_dreq(&cinfo, dreq); From 69d966510d9f5de81588b37d23a9ee8ccc477b23 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 22 Jul 2022 14:12:20 -0400 Subject: [PATCH 18/46] nfs: only issue commit in DIO codepath if we have uncommitted data Currently, we try to determine whether to issue a commit based on nfs_write_need_commit which looks at the current verifier. In the case where we got a short write and then tried to follow it up with one that failed, the verifier can't be trusted. What we really want to know is whether the pgio request had any successful writes that came back as UNSTABLE. Add a new flag to the pgio request, and use that to indicate that we've had a successful unstable write. Only issue a commit if that flag is set. 
Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 2 +- fs/nfs/write.c | 48 +++++++++++++++++++++++++---------------- include/linux/nfs_xdr.h | 1 + 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index a47d13296194..86df66bb14c5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -690,7 +690,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1c706465d090..16d166bc4099 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1576,25 +1576,37 @@ static int nfs_writeback_done(struct rpc_task *task, nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); trace_nfs_writeback_done(task, hdr); - if (hdr->res.verf->committed < hdr->args.stable && - task->tk_status >= 0) { - /* We tried a write call, but the server did not - * commit data to stable storage even though we - * requested it. - * Note: There is a known bug in Tru64 < 5.0 in which - * the server reports NFS_DATA_SYNC, but performs - * NFS_FILE_SYNC. We therefore implement this checking - * as a dprintk() in order to avoid filling syslog. 
- */ - static unsigned long complain; + if (task->tk_status >= 0) { + enum nfs3_stable_how committed = hdr->res.verf->committed; - /* Note this will print the MDS for a DS write */ - if (time_before(complain, jiffies)) { - dprintk("NFS: faulty NFS server %s:" - " (committed = %d) != (stable = %d)\n", - NFS_SERVER(inode)->nfs_client->cl_hostname, - hdr->res.verf->committed, hdr->args.stable); - complain = jiffies + 300 * HZ; + if (committed == NFS_UNSTABLE) { + /* + * We have some uncommitted data on the server at + * this point, so ensure that we keep track of that + * fact irrespective of what later writes do. + */ + set_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags); + } + + if (committed < hdr->args.stable) { + /* We tried a write call, but the server did not + * commit data to stable storage even though we + * requested it. + * Note: There is a known bug in Tru64 < 5.0 in which + * the server reports NFS_DATA_SYNC, but performs + * NFS_FILE_SYNC. We therefore implement this checking + * as a dprintk() in order to avoid filling syslog. + */ + static unsigned long complain; + + /* Note this will print the MDS for a DS write */ + if (time_before(complain, jiffies)) { + dprintk("NFS: faulty NFS server %s:" + " (committed = %d) != (stable = %d)\n", + NFS_SERVER(inode)->nfs_client->cl_hostname, + committed, hdr->args.stable); + complain = jiffies + 300 * HZ; + } } } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0e3aa0f5f324..e86cf6642d21 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1600,6 +1600,7 @@ enum { NFS_IOHDR_STAT, NFS_IOHDR_RESEND_PNFS, NFS_IOHDR_RESEND_MDS, + NFS_IOHDR_UNSTABLE_WRITES, }; struct nfs_io_completion; From 0701214cd6e66585a999b132eb72ae0489beb724 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Jul 2022 15:08:04 -0400 Subject: [PATCH 19/46] SUNRPC: Fail faster on bad verifier A bad verifier is not a garbage argument, it's an authentication failure. 
Retrying it doesn't make the problem go away, and delays upper layer recovery steps. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b6781ada3aa8..a97d4e06cae3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2650,7 +2650,7 @@ rpc_decode_header(struct rpc_task *task, struct xdr_stream *xdr) out_verifier: trace_rpc_bad_verifier(task); - goto out_garbage; + goto out_err; out_msg_denied: error = -EACCES; From f67939e4b045e1c8e857055463c0b5a88eca4844 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Jul 2022 15:08:17 -0400 Subject: [PATCH 20/46] SUNRPC: Replace dprintk() call site in xs_data_ready Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 20 ++++++++++++++++++++ net/sunrpc/xprtsock.c | 6 ++++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b61d9c90fa26..21068ad61db8 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1266,6 +1266,26 @@ TRACE_EVENT(xprt_reserve, ) ); +TRACE_EVENT(xs_data_ready, + TP_PROTO( + const struct rpc_xprt *xprt + ), + + TP_ARGS(xprt), + + TP_STRUCT__entry( + __string(addr, xprt->address_strings[RPC_DISPLAY_ADDR]) + __string(port, xprt->address_strings[RPC_DISPLAY_PORT]) + ), + + TP_fast_assign( + __assign_str(addr, xprt->address_strings[RPC_DISPLAY_ADDR]); + __assign_str(port, xprt->address_strings[RPC_DISPLAY_PORT]); + ), + + TP_printk("peer=[%s]:%s", __get_str(addr), __get_str(port)) +); + TRACE_EVENT(xs_stream_read_data, TP_PROTO(struct rpc_xprt *xprt, ssize_t err, size_t total), diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fcdd0fca408e..eba1be9984f8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1378,7 +1378,7 @@ static void 
xs_udp_data_receive_workfn(struct work_struct *work) } /** - * xs_data_ready - "data ready" callback for UDP sockets + * xs_data_ready - "data ready" callback for sockets * @sk: socket with data to read * */ @@ -1386,11 +1386,13 @@ static void xs_data_ready(struct sock *sk) { struct rpc_xprt *xprt; - dprintk("RPC: xs_data_ready...\n"); xprt = xprt_from_sock(sk); if (xprt != NULL) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); + + trace_xs_data_ready(xprt); + transport->old_data_ready(sk); /* Any data means we had a useful conversation, so * then we don't need to delay the next reconnect From 33ce83ef0bb048be259ff8ae92ad212918f1ef35 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 22 Jul 2022 15:08:24 -0400 Subject: [PATCH 21/46] NFS: Replace fs_context-related dprintk() call sites with tracepoints Contributed as part of the long patch series that converts NFS from using dprintk to tracepoints for observability. Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/fs_context.c | 24 ++++++++++-------- fs/nfs/nfstrace.h | 59 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 8f1f9b4af89d..4da701fd1424 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -21,6 +21,8 @@ #include "nfs.h" #include "internal.h" +#include "nfstrace.h" + #define NFSDBG_FACILITY NFSDBG_MOUNT #if IS_ENABLED(CONFIG_NFS_V3) @@ -284,7 +286,6 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } - dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } @@ -378,7 +379,7 @@ static int nfs_parse_security_flavors(struct fs_context *fc, char *string = param->string, *p; int ret; - dfprintk(MOUNT, "NFS: parsing %s=%s option\n", param->key, param->string); + trace_nfs_mount_assign(param->key, string); while ((p = strsep(&string, ":")) != NULL) { if (!*p) @@ -480,7 +481,7 @@ static int 
nfs_fs_context_parse_param(struct fs_context *fc, unsigned int len; int ret, opt; - dfprintk(MOUNT, "NFS: parsing nfs mount option '%s'\n", param->key); + trace_nfs_mount_option(param); opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) @@ -683,6 +684,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, return ret; break; case Opt_vers: + trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) return ret; @@ -694,6 +696,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_proto: + trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: @@ -729,6 +732,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_mountproto: + trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: @@ -751,6 +755,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, break; case Opt_addr: + trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address)); @@ -759,16 +764,19 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, ctx->nfs_server.addrlen = len; break; case Opt_clientaddr: + trace_nfs_mount_assign(param->key, param->string); kfree(ctx->client_address); ctx->client_address = param->string; param->string = NULL; break; case Opt_mounthost: + trace_nfs_mount_assign(param->key, param->string); kfree(ctx->mount_server.hostname); ctx->mount_server.hostname = param->string; param->string = NULL; break; case Opt_mountaddr: + trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->mount_server.address, sizeof(ctx->mount_server._address)); @@ -846,7 +854,6 
@@ static int nfs_fs_context_parse_param(struct fs_context *fc, */ case Opt_sloppy: ctx->sloppy = true; - dfprintk(MOUNT, "NFS: relaxing parsing rules\n"); break; } @@ -879,10 +886,8 @@ static int nfs_parse_source(struct fs_context *fc, size_t len; const char *end; - if (unlikely(!dev_name || !*dev_name)) { - dfprintk(MOUNT, "NFS: device name not specified\n"); + if (unlikely(!dev_name || !*dev_name)) return -EINVAL; - } /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { @@ -922,7 +927,7 @@ static int nfs_parse_source(struct fs_context *fc, if (!ctx->nfs_server.export_path) goto out_nomem; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", ctx->nfs_server.export_path); + trace_nfs_mount_path(ctx->nfs_server.export_path); return 0; out_bad_devname: @@ -1116,7 +1121,6 @@ static int nfs23_parse_monolithic(struct fs_context *fc, return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS"); out_nomem: - dfprintk(MOUNT, "NFS: not enough memory to handle mount options"); return -ENOMEM; out_no_address: @@ -1248,7 +1252,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.export_path = c; - dfprintk(MOUNT, "NFS: MNTPATH: '%s'\n", c); + trace_nfs_mount_path(c); c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 65388e4a0cd7..8bd0c13a7c4b 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1678,6 +1678,65 @@ TRACE_EVENT(nfs_fh_to_dentry, ) ); +TRACE_EVENT(nfs_mount_assign, + TP_PROTO( + const char *option, + const char *value + ), + + TP_ARGS(option, value), + + TP_STRUCT__entry( + __string(option, option) + __string(value, value) + ), + + TP_fast_assign( + __assign_str(option, option); + __assign_str(value, value); + ), + + TP_printk("option %s=%s", + __get_str(option), __get_str(value) + ) +); + +TRACE_EVENT(nfs_mount_option, + TP_PROTO( + const struct fs_parameter *param + ), + + TP_ARGS(param), + + 
TP_STRUCT__entry( + __string(option, param->key) + ), + + TP_fast_assign( + __assign_str(option, param->key); + ), + + TP_printk("option %s", __get_str(option)) +); + +TRACE_EVENT(nfs_mount_path, + TP_PROTO( + const char *path + ), + + TP_ARGS(path), + + TP_STRUCT__entry( + __string(path, path) + ), + + TP_fast_assign( + __assign_str(path, path); + ), + + TP_printk("path='%s'", __get_str(path)) +); + DECLARE_EVENT_CLASS(nfs_xdr_event, TP_PROTO( const struct xdr_stream *xdr, From 4f5f3b6028343d687d0533329b130e4b8280ab32 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:31 -0400 Subject: [PATCH 22/46] SUNRPC: Introduce xdr_stream_move_subsegment() I do this by creating an xdr subsegment for the range we will be operating over. This lets me shift data to the correct place without potentially overwriting anything already there. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 ++ net/sunrpc/xdr.c | 59 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 5860f32e3958..7dcc6c31fe29 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -262,6 +262,8 @@ extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, uns extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int len); +extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, + unsigned int target, unsigned int length); /** * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 5d2b3e6979fb..8ba11a754297 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -775,6 +775,34 @@ static void xdr_buf_pages_shift_left(const struct xdr_buf *buf, xdr_buf_tail_copy_left(buf, 0, len - buf->page_len, shift); } +static void xdr_buf_head_shift_left(const struct xdr_buf *buf, + unsigned int base, unsigned int len, + unsigned int shift) +{ + const struct kvec *head = buf->head; + unsigned int bytes; + + if (!shift || !len) + return; + + if (shift > base) { + bytes = (shift - base); + if (bytes >= len) + return; + base += bytes; + len -= bytes; + } + + if (base < head->iov_len) { + bytes = min_t(unsigned int, len, head->iov_len - base); + memmove(head->iov_base + (base - shift), + head->iov_base + base, bytes); + base += bytes; + len -= bytes; + } + xdr_buf_pages_shift_left(buf, base - head->iov_len, len, shift); +} + /** * xdr_shrink_bufhead * @buf: xdr_buf @@ -1680,6 +1708,37 @@ bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, } EXPORT_SYMBOL_GPL(xdr_stream_subsegment); +/** + * xdr_stream_move_subsegment - Move part of a stream to another position + * @xdr: the source xdr_stream + * @offset: the source offset of the segment + * @target: the target offset of the segment + * @length: the number of bytes to move + * + * Moves @length bytes from @offset to @target in the xdr_stream, overwriting + * anything in its space. Returns @length on success, or 0 on failure.
+ */ +unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, + unsigned int target, unsigned int length) +{ + struct xdr_buf buf; + unsigned int shift; + + if (offset < target) { + shift = target - offset; + if (xdr_buf_subsegment(xdr->buf, &buf, offset, shift + length) < 0) + return 0; + xdr_buf_head_shift_right(&buf, 0, length, shift); + } else if (offset > target) { + shift = offset - target; + if (xdr_buf_subsegment(xdr->buf, &buf, target, shift + length) < 0) + return 0; + xdr_buf_head_shift_left(&buf, shift, length, shift); + } + return length; +} +EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment); + /** * xdr_buf_trim - lop at most "len" bytes off the end of "buf" * @buf: buf to be trimmed From 7c4cd5f4d2dd4a028a46bfb696b0cd387caadf33 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:32 -0400 Subject: [PATCH 23/46] SUNRPC: Add a function for directly setting the xdr page len We need to do this step during READ_PLUS decoding so that we know pages are the right length and any extra data has been preserved in the tail. 
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/xdr.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 7dcc6c31fe29..8cd38a9994ca 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -258,6 +258,7 @@ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes); extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); +extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len); extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 8ba11a754297..e4ac700ca554 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1500,6 +1500,36 @@ unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_read_pages); +/** + * xdr_set_pagelen - Sets the length of the XDR pages + * @xdr: pointer to xdr_stream struct + * @len: new length of the XDR page data + * + * Either grows or shrinks the length of the xdr pages by setting pagelen to + * @len bytes. When shrinking, any extra data is moved into buf->tail, whereas + * when growing any data beyond the current pointer is moved into the tail. + * + * This function does not return a value.
+ */ +void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + size_t remaining = xdr_stream_remaining(xdr); + size_t base = 0; + + if (len < buf->page_len) { + base = buf->page_len - len; + xdr_shrink_pagelen(buf, len); + } else { + xdr_buf_head_shift_right(buf, xdr_stream_pos(xdr), + buf->page_len, remaining); + if (len > buf->page_len) + xdr_buf_try_expand(buf, len - buf->page_len); + } + xdr_set_tail_base(xdr, base, remaining); +} +EXPORT_SYMBOL_GPL(xdr_set_pagelen); + unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, unsigned int length) { From e1bd87608d4b6f87813f79b91e834de610f1049b Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:33 -0400 Subject: [PATCH 24/46] SUNRPC: Add a function for zeroing out a portion of an xdr_stream This will be used during READ_PLUS decoding for handling HOLE segments. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 ++ net/sunrpc/xdr.c | 23 +++++++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 8cd38a9994ca..f0ab06acab61 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -265,6 +265,8 @@ extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf unsigned int len); extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, unsigned int target, unsigned int length); +extern unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, + unsigned int length); /** * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data. 
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e4ac700ca554..f09a7ab1a82b 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1769,6 +1769,29 @@ unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int off } EXPORT_SYMBOL_GPL(xdr_stream_move_subsegment); +/** + * xdr_stream_zero - zero out a portion of an xdr_stream + * @xdr: an xdr_stream to zero out + * @offset: the starting point in the stream + * @length: the number of bytes to zero + */ +unsigned int xdr_stream_zero(struct xdr_stream *xdr, unsigned int offset, + unsigned int length) +{ + struct xdr_buf buf; + + if (xdr_buf_subsegment(xdr->buf, &buf, offset, length) < 0) + return 0; + if (buf.head[0].iov_len) + xdr_buf_iov_zero(buf.head, 0, buf.head[0].iov_len); + if (buf.page_len > 0) + xdr_buf_pages_zero(&buf, 0, buf.page_len); + if (buf.tail[0].iov_len) + xdr_buf_iov_zero(buf.tail, 0, buf.tail[0].iov_len); + return length; +} +EXPORT_SYMBOL_GPL(xdr_stream_zero); + /** * xdr_buf_trim - lop at most "len" bytes off the end of "buf" * @buf: buf to be trimmed From d3b00a802c845a6021148ce2e669b5a0b5729959 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:34 -0400 Subject: [PATCH 25/46] NFS: Replace the READ_PLUS decoding code We now take a 2-step process that allows us to place data and hole segments directly at their final position in the xdr_stream without needing to do a bunch of redundant copies to expand holes. Due to the variable lengths of each segment, the xdr metadata might cross page boundaries which I account for by setting a small scratch buffer so xdr_inline_decode() won't fail. 
Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/nfs42xdr.c | 166 ++++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 80 deletions(-) diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 271e5f92ed01..b56f05113d36 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1025,73 +1025,84 @@ static int decode_deallocate(struct xdr_stream *xdr, struct nfs42_falloc_res *re return decode_op_hdr(xdr, OP_DEALLOCATE); } -static int decode_read_plus_data(struct xdr_stream *xdr, - struct nfs_pgio_args *args, - struct nfs_pgio_res *res) -{ - uint32_t count, recvd; +struct read_plus_segment { + enum data_content4 type; uint64_t offset; + union { + struct { + uint64_t length; + } hole; + + struct { + uint32_t length; + unsigned int from; + } data; + }; +}; + +static inline uint64_t read_plus_segment_length(struct read_plus_segment *seg) +{ + return seg->type == NFS4_CONTENT_DATA ? seg->data.length : seg->hole.length; +} + +static int decode_read_plus_segment(struct xdr_stream *xdr, + struct read_plus_segment *seg) +{ __be32 *p; - p = xdr_inline_decode(xdr, 8 + 4); + p = xdr_inline_decode(xdr, 4); if (!p) - return 1; + return -EIO; + seg->type = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &offset); - count = be32_to_cpup(p); - recvd = xdr_align_data(xdr, res->count, xdr_align_size(count)); - if (recvd > count) - recvd = count; - if (res->count + recvd > args->count) { - if (args->count > res->count) - res->count += args->count - res->count; - return 1; - } - res->count += recvd; - if (count > recvd) - return 1; + p = xdr_inline_decode(xdr, seg->type == NFS4_CONTENT_DATA ? 
12 : 16); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &seg->offset); + + if (seg->type == NFS4_CONTENT_DATA) { + struct xdr_buf buf; + uint32_t len = be32_to_cpup(p); + + seg->data.length = len; + seg->data.from = xdr_stream_pos(xdr); + + if (!xdr_stream_subsegment(xdr, &buf, xdr_align_size(len))) + return -EIO; + } else if (seg->type == NFS4_CONTENT_HOLE) { + xdr_decode_hyper(p, &seg->hole.length); + } else + return -EINVAL; return 0; } -static int decode_read_plus_hole(struct xdr_stream *xdr, - struct nfs_pgio_args *args, - struct nfs_pgio_res *res, uint32_t *eof) +static int process_read_plus_segment(struct xdr_stream *xdr, + struct nfs_pgio_args *args, + struct nfs_pgio_res *res, + struct read_plus_segment *seg) { - uint64_t offset, length, recvd; - __be32 *p; + unsigned long offset = seg->offset; + unsigned long length = read_plus_segment_length(seg); + unsigned int bufpos; - p = xdr_inline_decode(xdr, 8 + 8); - if (!p) - return 1; - - p = xdr_decode_hyper(p, &offset); - p = xdr_decode_hyper(p, &length); - if (offset != args->offset + res->count) { - /* Server returned an out-of-sequence extent */ - if (offset > args->offset + res->count || - offset + length < args->offset + res->count) { - dprintk("NFS: server returned out of sequence extent: " - "offset/size = %llu/%llu != expected %llu\n", - (unsigned long long)offset, - (unsigned long long)length, - (unsigned long long)(args->offset + - res->count)); - return 1; - } - length -= args->offset + res->count - offset; + if (offset + length < args->offset) + return 0; + else if (offset > args->offset + args->count) { + res->eof = 0; + return 0; + } else if (offset < args->offset) { + length -= (args->offset - offset); + offset = args->offset; + } else if (offset + length > args->offset + args->count) { + length = (args->offset + args->count) - offset; + res->eof = 0; } - if (length + res->count > args->count) { - *eof = 0; - if (unlikely(res->count >= args->count)) - return 1; - length = args->count - 
res->count; - } - recvd = xdr_expand_hole(xdr, res->count, length); - res->count += recvd; - if (recvd < length) - return 1; - return 0; + bufpos = xdr->buf->head[0].iov_len + (offset - args->offset); + if (seg->type == NFS4_CONTENT_HOLE) + return xdr_stream_zero(xdr, bufpos, length); + else + return xdr_stream_move_subsegment(xdr, seg->data.from, bufpos, length); } static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) @@ -1099,8 +1110,10 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) struct nfs_pgio_header *hdr = container_of(res, struct nfs_pgio_header, res); struct nfs_pgio_args *args = &hdr->args; - uint32_t eof, segments, type; + uint32_t segments; + struct read_plus_segment *segs; int status, i; + char scratch_buf[16]; __be32 *p; status = decode_op_hdr(xdr, OP_READ_PLUS); @@ -1112,38 +1125,31 @@ static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res *res) return -EIO; res->count = 0; - eof = be32_to_cpup(p++); + res->eof = be32_to_cpup(p++); segments = be32_to_cpup(p++); if (segments == 0) - goto out; + return status; + segs = kmalloc_array(segments, sizeof(*segs), GFP_KERNEL); + if (!segs) + return -ENOMEM; + + xdr_set_scratch_buffer(xdr, &scratch_buf, 32); + status = -EIO; for (i = 0; i < segments; i++) { - p = xdr_inline_decode(xdr, 4); - if (!p) - goto early_out; - - type = be32_to_cpup(p++); - if (type == NFS4_CONTENT_DATA) - status = decode_read_plus_data(xdr, args, res); - else if (type == NFS4_CONTENT_HOLE) - status = decode_read_plus_hole(xdr, args, res, &eof); - else - return -EINVAL; - + status = decode_read_plus_segment(xdr, &segs[i]); if (status < 0) - return status; - if (status > 0) - goto early_out; + goto out; } + xdr_set_pagelen(xdr, xdr_align_size(args->count)); + for (i = segments; i > 0; i--) + res->count += process_read_plus_segment(xdr, args, res, &segs[i-1]); + status = 0; + out: - res->eof = eof; - return 0; -early_out: - if (unlikely(!i)) - return -EIO; - 
res->eof = 0; - return 0; + kfree(segs); + return status; } static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) From 29946fbcb2c31a2a367887dc58a2e7e5b012e285 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Thu, 21 Jul 2022 14:21:35 -0400 Subject: [PATCH 26/46] SUNRPC: Remove xdr_align_data() and xdr_expand_hole() These functions are no longer needed now that the NFS client places data and hole segments directly. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 2 -- net/sunrpc/xdr.c | 66 -------------------------------------- 2 files changed, 68 deletions(-) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index f0ab06acab61..f38c97f45354 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -259,8 +259,6 @@ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len); extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len); extern int xdr_process_buf(const struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data); extern void xdr_set_pagelen(struct xdr_stream *, unsigned int len); -extern unsigned int xdr_align_data(struct xdr_stream *, unsigned int offset, unsigned int length); -extern unsigned int xdr_expand_hole(struct xdr_stream *, unsigned int offset, unsigned int length); extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf, unsigned int len); extern unsigned int xdr_stream_move_subsegment(struct xdr_stream *xdr, unsigned int offset, diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index f09a7ab1a82b..482586c23fdd 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1530,72 +1530,6 @@ void xdr_set_pagelen(struct xdr_stream *xdr, unsigned int len) } EXPORT_SYMBOL_GPL(xdr_set_pagelen); -unsigned int xdr_align_data(struct xdr_stream *xdr, unsigned int offset, - unsigned int length) -{ - struct xdr_buf *buf = xdr->buf; - unsigned int 
from, bytes, len; - unsigned int shift; - - xdr_realign_pages(xdr); - from = xdr_page_pos(xdr); - - if (from >= buf->page_len + buf->tail->iov_len) - return 0; - if (from + buf->head->iov_len >= buf->len) - return 0; - - len = buf->len - buf->head->iov_len; - - /* We only shift data left! */ - if (WARN_ONCE(from < offset, "SUNRPC: misaligned data src=%u dst=%u\n", - from, offset)) - return 0; - if (WARN_ONCE(offset > buf->page_len, - "SUNRPC: buffer overflow. offset=%u, page_len=%u\n", - offset, buf->page_len)) - return 0; - - /* Move page data to the left */ - shift = from - offset; - xdr_buf_pages_shift_left(buf, from, len, shift); - - bytes = xdr_stream_remaining(xdr); - if (length > bytes) - length = bytes; - bytes -= length; - - xdr->buf->len -= shift; - xdr_set_page(xdr, offset + length, bytes); - return length; -} -EXPORT_SYMBOL_GPL(xdr_align_data); - -unsigned int xdr_expand_hole(struct xdr_stream *xdr, unsigned int offset, - unsigned int length) -{ - struct xdr_buf *buf = xdr->buf; - unsigned int from, to, shift; - - xdr_realign_pages(xdr); - from = xdr_page_pos(xdr); - to = xdr_align_size(offset + length); - - /* Could the hole be behind us? */ - if (to > from) { - unsigned int buflen = buf->len - buf->head->iov_len; - shift = to - from; - xdr_buf_try_expand(buf, shift); - xdr_buf_pages_shift_right(buf, from, buflen, shift); - xdr_set_page(xdr, to, xdr_stream_remaining(xdr)); - } else if (to != from) - xdr_align_data(xdr, to, 0); - xdr_buf_pages_zero(buf, offset, length); - - return length; -} -EXPORT_SYMBOL_GPL(xdr_expand_hole); - /** * xdr_enter_page - decode data from the XDR page * @xdr: pointer to xdr_stream struct From 7ffcdaa670164a2ad3844a5ef6df5423782ba290 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:21 -0400 Subject: [PATCH 27/46] SUNRPC expose functions for offline remote xprt functionality Re-arrange the code that make offline transport and delete transport callable functions. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +++ net/sunrpc/sysfs.c | 28 +++++----------------------- net/sunrpc/xprt.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 522bbf937957..0d51b9f9ea37 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -505,4 +505,7 @@ static inline int xprt_test_and_set_binding(struct rpc_xprt *xprt) return test_and_set_bit(XPRT_BINDING, &xprt->state); } +void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); +void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); +void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps); #endif /* _LINUX_SUNRPC_XPRT_H */ diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index a3a2f8aeb80e..7330eb9a70cf 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -314,32 +314,14 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, goto release_tasks; } if (offline) { - if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { - spin_lock(&xps->xps_lock); - xps->xps_nactive--; - spin_unlock(&xps->xps_lock); - } + xprt_set_offline_locked(xprt, xps); } else if (online) { - if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { - spin_lock(&xps->xps_lock); - xps->xps_nactive++; - spin_unlock(&xps->xps_lock); - } + xprt_set_online_locked(xprt, xps); } else if (remove) { - if (test_bit(XPRT_OFFLINE, &xprt->state)) { - if (!test_and_set_bit(XPRT_REMOVE, &xprt->state)) { - xprt_force_disconnect(xprt); - if (test_bit(XPRT_CONNECTED, &xprt->state)) { - if (!xprt->sending.qlen && - !xprt->pending.qlen && - !xprt->backlog.qlen && - !atomic_long_read(&xprt->queuelen)) - rpc_xprt_switch_remove_xprt(xps, xprt); - } - } - } else { + if (test_bit(XPRT_OFFLINE, &xprt->state)) + xprt_delete_locked(xprt, xps); + else count = -EINVAL; - } } 
release_tasks: diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 86d62cffba0d..8f8e3c952f24 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -2152,3 +2152,35 @@ void xprt_put(struct rpc_xprt *xprt) kref_put(&xprt->kref, xprt_destroy_kref); } EXPORT_SYMBOL_GPL(xprt_put); + +void xprt_set_offline_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (!test_and_set_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } +} + +void xprt_set_online_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (test_and_clear_bit(XPRT_OFFLINE, &xprt->state)) { + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } +} + +void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) +{ + if (test_and_set_bit(XPRT_REMOVE, &xprt->state)) + return; + + xprt_force_disconnect(xprt); + if (!test_bit(XPRT_CONNECTED, &xprt->state)) + return; + + if (!xprt->sending.qlen && !xprt->pending.qlen && + !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt); +} From 895245ccea251ff54ea19bc364c9a49007918115 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:22 -0400 Subject: [PATCH 28/46] SUNRPC add function to offline remove trunkable transports Iterate through the available transports in the xprt_switch and, for all trunkable transports, offline and possibly remove them as well.
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 90501404fa49..d14333f4947a 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -234,6 +234,7 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, struct rpc_xprt_switch *, struct rpc_xprt *, void *); +void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *); const char *rpc_proc_name(const struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a97d4e06cae3..2b079c4d8af1 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -3000,6 +3000,52 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +static int rpc_xprt_offline(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + void *data) +{ + struct rpc_xprt *main_xprt; + struct rpc_xprt_switch *xps; + int err = 0; + + xprt_get(xprt); + + rcu_read_lock(); + main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + err = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, + (struct sockaddr *)&main_xprt->addr); + rcu_read_unlock(); + xprt_put(main_xprt); + if (err) + goto out; + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + err = -EINTR; + goto out; + } + xprt_set_offline_locked(xprt, xps); + + xprt_release_write(xprt, NULL); +out: + xprt_put(xprt); + xprt_switch_put(xps); + return err; +} + +/* rpc_clnt_manage_trunked_xprts -- offline trunked transports + * @clnt rpc_clnt structure + * + * For each active transport found in the rpc_clnt structure call + * the function rpc_xprt_offline() which will identify trunked transports + * and will mark them offline. 
+ */ +void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *clnt) +{ + rpc_clnt_iterate_for_each_xprt(clnt, rpc_xprt_offline, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_manage_trunked_xprts); + struct connect_timeout_data { unsigned long connect_timeout; unsigned long reconnect_timeout; From 88363d3e9db66e697fd0198cc2c1785377f2459a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:23 -0400 Subject: [PATCH 29/46] NFSv4.1 offline trunkable transports on DESTROY_SESSION When the session is destroyed, some of the transports might no longer be valid trunks for the new session. Offline existing transports. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e0dcc19ca71..3f4e84e9646e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9291,6 +9291,7 @@ int nfs4_proc_destroy_session(struct nfs4_session *session, if (status) dprintk("NFS: Got error %d from the server on DESTROY_SESSION. " "Session has been destroyed regardless...\n", status); + rpc_clnt_manage_trunked_xprts(session->clp->cl_rpcclient); return status; } From 95d0d30c66b855f614e677b8cd0455eed0765a6f Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:24 -0400 Subject: [PATCH 30/46] SUNRPC create an iterator to list only OFFLINE xprts Create a new iterator helper that will go through all the transports in the switch and return transports that are marked OFFLINE.
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 3 + net/sunrpc/clnt.c | 11 +++- net/sunrpc/xprtmultipath.c | 99 +++++++++++++++++++++++++--- 3 files changed, 101 insertions(+), 12 deletions(-) diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index bbb8a5fa0816..688ca7eb1d01 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -63,6 +63,9 @@ extern void xprt_iter_init(struct rpc_xprt_iter *xpi, extern void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps); +extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps); + extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi); extern struct rpc_xprt_switch *xprt_iter_xchg_switch( diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2b079c4d8af1..68021b70340d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -786,7 +786,8 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_switch_client_transport); static -int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) +int _rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi, + void func(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps)) { struct rpc_xprt_switch *xps; @@ -795,11 +796,17 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) rcu_read_unlock(); if (xps == NULL) return -EAGAIN; - xprt_iter_init_listall(xpi, xps); + func(xpi, xps); xprt_switch_put(xps); return 0; } +static +int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) +{ + return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall); +} + /** * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports * @clnt: pointer to client diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 1693f81aae37..8def8423fc0a 100644 --- 
a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -27,6 +27,7 @@ typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps, static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular; static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin; static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall; +static const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline; static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt) @@ -248,6 +249,18 @@ struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head) return NULL; } +static +struct rpc_xprt *xprt_switch_find_first_entry_offline(struct list_head *head) +{ + struct rpc_xprt *pos; + + list_for_each_entry_rcu(pos, head, xprt_switch) { + if (!xprt_is_active(pos)) + return pos; + } + return NULL; +} + static struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) { @@ -259,8 +272,9 @@ struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi) } static -struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, - const struct rpc_xprt *cur) +struct rpc_xprt *_xprt_switch_find_current_entry(struct list_head *head, + const struct rpc_xprt *cur, + bool find_active) { struct rpc_xprt *pos; bool found = false; @@ -268,14 +282,25 @@ struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == pos) found = true; - if (found && xprt_is_active(pos)) + if (found && ((find_active && xprt_is_active(pos)) || + (!find_active && !xprt_is_active(pos)))) return pos; } return NULL; } static -struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) +struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head, + const struct rpc_xprt *cur) +{ + return _xprt_switch_find_current_entry(head, cur, true); +} + +static +struct rpc_xprt * _xprt_iter_current_entry(struct rpc_xprt_iter *xpi, + struct rpc_xprt *first_entry(struct list_head
*head), + struct rpc_xprt *current_entry(struct list_head *head, + const struct rpc_xprt *cur)) { struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch); struct list_head *head; @@ -284,8 +309,30 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) return NULL; head = &xps->xps_xprt_list; if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2) - return xprt_switch_find_first_entry(head); - return xprt_switch_find_current_entry(head, xpi->xpi_cursor); + return first_entry(head); + return current_entry(head, xpi->xpi_cursor); +} + +static +struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi) +{ + return _xprt_iter_current_entry(xpi, xprt_switch_find_first_entry, + xprt_switch_find_current_entry); +} + +static +struct rpc_xprt *xprt_switch_find_current_entry_offline(struct list_head *head, + const struct rpc_xprt *cur) +{ + return _xprt_switch_find_current_entry(head, cur, false); +} + +static +struct rpc_xprt *xprt_iter_current_entry_offline(struct rpc_xprt_iter *xpi) +{ + return _xprt_iter_current_entry(xpi, + xprt_switch_find_first_entry_offline, + xprt_switch_find_current_entry_offline); } bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, @@ -310,7 +357,7 @@ bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, static struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, - const struct rpc_xprt *cur) + const struct rpc_xprt *cur, bool check_active) { struct rpc_xprt *pos, *prev = NULL; bool found = false; @@ -318,7 +365,12 @@ struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head, list_for_each_entry_rcu(pos, head, xprt_switch) { if (cur == prev) found = true; - if (found && xprt_is_active(pos)) + /* for request to return active transports return only + * active, for request to return offline transports + * return only offline + */ + if (found && ((check_active && xprt_is_active(pos)) || + (!check_active && !xprt_is_active(pos)))) return pos; prev = pos; } @@ -355,7 +407,7 @@ struct 
rpc_xprt *__xprt_switch_find_next_entry_roundrobin(struct list_head *head { struct rpc_xprt *ret; - ret = xprt_switch_find_next_entry(head, cur); + ret = xprt_switch_find_next_entry(head, cur, true); if (ret != NULL) return ret; return xprt_switch_find_first_entry(head); @@ -397,7 +449,14 @@ static struct rpc_xprt *xprt_switch_find_next_entry_all(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur) { - return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur); + return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, true); +} + +static +struct rpc_xprt *xprt_switch_find_next_entry_offline(struct rpc_xprt_switch *xps, + const struct rpc_xprt *cur) +{ + return xprt_switch_find_next_entry(&xps->xps_xprt_list, cur, false); } static @@ -407,6 +466,13 @@ struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi) xprt_switch_find_next_entry_all); } +static +struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi) +{ + return xprt_iter_next_entry_multiple(xpi, + xprt_switch_find_next_entry_offline); +} + /* * xprt_iter_rewind - Resets the xprt iterator * @xpi: pointer to rpc_xprt_iter @@ -460,6 +526,12 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall); } +void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi, + struct rpc_xprt_switch *xps) +{ + __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listoffline); +} + /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter @@ -574,3 +646,10 @@ const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = { .xpi_xprt = xprt_iter_current_entry, .xpi_next = xprt_iter_next_entry_all, }; + +static +const struct rpc_xprt_iter_ops rpc_xprt_iter_listoffline = { + .xpi_rewind = xprt_iter_default_rewind, + .xpi_xprt = xprt_iter_current_entry_offline, + .xpi_next = xprt_iter_next_entry_offline, +}; From 9368fd6c75053630e95a6dbd17c9522e82101276 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 
Jul 2022 09:32:25 -0400 Subject: [PATCH 31/46] SUNRPC enable back offline transports in trunking discovery When we are adding a transport to a xprt_switch that's already on the list but has been marked OFFLINE, then make the state ONLINE since it's been tested now. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + net/sunrpc/clnt.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index d14333f4947a..71a3a1dd7e81 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -242,6 +242,7 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); +void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); void rpc_cleanup_clids(void); static inline int rpc_reply_expected(struct rpc_task *task) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 68021b70340d..9dbce3b0d3a2 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -3095,8 +3095,22 @@ void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); +void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + rcu_read_unlock(); + xprt_set_online_locked(xprt, xps); +} + void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { + if (rpc_clnt_xprt_switch_has_addr(clnt, + (const struct sockaddr *)&xprt->addr)) { + return rpc_clnt_xprt_set_online(clnt, xprt); + } rcu_read_lock(); rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), xprt); From 497e6464d6adcee64f071b18fc826e63cfd2f0a5 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:26 -0400 Subject: [PATCH 32/46] 
SUNRPC create an rpc function that allows xprt removal from rpc_clnt Expose a function that allows a removal of xprt from the rpc_clnt. When called from NFS that's running a trunked transport then don't decrement the active transport counter. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 1 + include/linux/sunrpc/xprtmultipath.h | 2 +- net/sunrpc/clnt.c | 16 +++++++++++++++- net/sunrpc/xprt.c | 2 +- net/sunrpc/xprtmultipath.c | 11 ++++++----- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 71a3a1dd7e81..7a43fd514398 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -240,6 +240,7 @@ const char *rpc_proc_name(const struct rpc_task *task); void rpc_clnt_xprt_switch_put(struct rpc_clnt *); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *); +void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *, struct rpc_xprt *); bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap); void rpc_clnt_xprt_set_online(struct rpc_clnt *clnt, struct rpc_xprt *xprt); diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index 688ca7eb1d01..9fff0768d942 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -55,7 +55,7 @@ extern void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps); extern void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, struct rpc_xprt *xprt); extern void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, - struct rpc_xprt *xprt); + struct rpc_xprt *xprt, bool offline); extern void xprt_iter_init(struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *xps); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9dbce3b0d3a2..26f3102500bb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2144,7 +2144,8 @@ call_connect_status(struct rpc_task *task) 
xprt_release(task); value = atomic_long_dec_return(&xprt->queuelen); if (value == 0) - rpc_xprt_switch_remove_xprt(xps, saved); + rpc_xprt_switch_remove_xprt(xps, saved, + true); xprt_put(saved); task->tk_xprt = NULL; task->tk_action = call_start; @@ -3118,6 +3119,19 @@ void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); +void rpc_clnt_xprt_switch_remove_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) +{ + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); + rpc_xprt_switch_remove_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), + xprt, 0); + xps->xps_nunique_destaddr_xprts--; + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_remove_xprt); + bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, const struct sockaddr *sap) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8f8e3c952f24..44348c9f4b00 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -2182,5 +2182,5 @@ void xprt_delete_locked(struct rpc_xprt *xprt, struct rpc_xprt_switch *xps) if (!xprt->sending.qlen && !xprt->pending.qlen && !xprt->backlog.qlen && !atomic_long_read(&xprt->queuelen)) - rpc_xprt_switch_remove_xprt(xps, xprt); + rpc_xprt_switch_remove_xprt(xps, xprt, true); } diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 8def8423fc0a..55da01730311 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -62,11 +62,11 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, - struct rpc_xprt *xprt) + struct rpc_xprt *xprt, bool offline) { if (unlikely(xprt == NULL)) return; - if (!test_bit(XPRT_OFFLINE, &xprt->state)) + if (!test_bit(XPRT_OFFLINE, &xprt->state) && offline) xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) @@ -79,14 +79,15 @@ static void xprt_switch_remove_xprt_locked(struct 
rpc_xprt_switch *xps, * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch * @xps: pointer to struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt + * @offline: indicates if the xprt that's being removed is in an offline state * * Removes xprt from the list of struct rpc_xprt in xps. */ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, - struct rpc_xprt *xprt) + struct rpc_xprt *xprt, bool offline) { spin_lock(&xps->xps_lock); - xprt_switch_remove_xprt_locked(xps, xprt); + xprt_switch_remove_xprt_locked(xps, xprt, offline); spin_unlock(&xps->xps_lock); xprt_put(xprt); } @@ -155,7 +156,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) xprt = list_first_entry(&xps->xps_xprt_list, struct rpc_xprt, xprt_switch); - xprt_switch_remove_xprt_locked(xps, xprt); + xprt_switch_remove_xprt_locked(xps, xprt, true); spin_unlock(&xps->xps_lock); xprt_put(xprt); spin_lock(&xps->xps_lock); From e818bd085baf18cc3271c0f5549d9f5a7069efba Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:27 -0400 Subject: [PATCH 33/46] NFSv4.1 remove xprt from xprt_switch if session trunking test fails If we are doing a session trunking test and it fails for the transport, then remove this transport from the xprt_switch group. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3f4e84e9646e..4850e29904e6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8922,6 +8922,9 @@ void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, if (status == 0) rpc_clnt_xprt_switch_add_xprt(clnt, xprt); + else if (rpc_clnt_xprt_switch_has_addr(clnt, + (struct sockaddr *)&xprt->addr)) + rpc_clnt_xprt_switch_remove_xprt(clnt, xprt); rpc_put_task(task); } From 7960aa9e4d09504a0a3aff9e34329230a5d0aa9b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:28 -0400 Subject: [PATCH 34/46] SUNRPC restructure rpc_clnt_setup_test_and_add_xprt In preparation for code re-use, pull out the part of the rpc_clnt_setup_test_and_add_xprt() portion that sends a NULL rpc and then calls a session trunking function into a helper function. Re-organize the end of the function for code re-use. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 52 ++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 26f3102500bb..9c9712274ca8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2874,6 +2874,30 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt); +static int rpc_clnt_add_xprt_helper(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + struct rpc_add_xprt_test *data) +{ + struct rpc_task *task; + int status = -EADDRINUSE; + + /* Test the connection */ + task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + + status = task->tk_status; + rpc_put_task(task); + + if (status < 0) + return status; + + /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ + data->add_xprt_test(clnt, xprt, data->data); + + return 0; +} + /** * rpc_clnt_setup_test_and_add_xprt() * @@ -2897,8 +2921,6 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct rpc_task *task; - struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; int status = -EADDRINUSE; xprt = xprt_get(xprt); @@ -2907,31 +2929,19 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr)) goto out_err; - /* Test the connection */ - task = rpc_call_null_helper(clnt, xprt, NULL, 0, NULL, NULL); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out_err; - } - status = task->tk_status; - rpc_put_task(task); - + status = rpc_clnt_add_xprt_helper(clnt, xprt, data); if (status < 0) goto out_err; - /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ - xtest->add_xprt_test(clnt, xprt, xtest->data); - - xprt_put(xprt); - xprt_switch_put(xps); - - /* so that rpc_clnt_add_xprt does not call 
rpc_xprt_switch_add_xprt */ - return 1; + status = 1; out_err: xprt_put(xprt); xprt_switch_put(xps); - pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not added\n", - status, xprt->address_strings[RPC_DISPLAY_ADDR]); + if (status < 0) + pr_info("RPC: rpc_clnt_test_xprt failed: %d addr %s not " + "added\n", status, + xprt->address_strings[RPC_DISPLAY_ADDR]); + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ return status; } EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt); From 273d6aed9e5a1859dda15256f45561315c3d237a Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:29 -0400 Subject: [PATCH 35/46] SUNRPC export xprt_iter_rewind function Make xprt_iter_rewind callable outside of xprtmultipath.c Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 2 ++ net/sunrpc/xprtmultipath.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index 9fff0768d942..c0514c684b2c 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -68,6 +68,8 @@ extern void xprt_iter_init_listoffline(struct rpc_xprt_iter *xpi, extern void xprt_iter_destroy(struct rpc_xprt_iter *xpi); +extern void xprt_iter_rewind(struct rpc_xprt_iter *xpi); + extern struct rpc_xprt_switch *xprt_iter_xchg_switch( struct rpc_xprt_iter *xpi, struct rpc_xprt_switch *newswitch); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 55da01730311..685db598acbe 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -481,7 +481,6 @@ struct rpc_xprt *xprt_iter_next_entry_offline(struct rpc_xprt_iter *xpi) * Resets xpi to ensure that it points to the first entry in the list * of transports. 
*/ -static void xprt_iter_rewind(struct rpc_xprt_iter *xpi) { rcu_read_lock(); From 92cc04f60ab4ae199eee507e5cd4d5aa6c722e9c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:30 -0400 Subject: [PATCH 36/46] SUNRPC create a function that probes only offline transports For only offline transports, attempt to check connectivity via a NULL call and, if that succeeds, call a provided session trunking detection function. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 65 +++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 7a43fd514398..75eea5ebb179 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -235,6 +235,8 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *, struct rpc_xprt *, void *); void rpc_clnt_manage_trunked_xprts(struct rpc_clnt *); +void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *, + struct rpc_add_xprt_test *); const char *rpc_proc_name(const struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9c9712274ca8..bbfc47f03480 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -807,6 +807,13 @@ int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi) return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listall); } +static +int rpc_clnt_xprt_iter_offline_init(struct rpc_clnt *clnt, + struct rpc_xprt_iter *xpi) +{ + return _rpc_clnt_xprt_iter_init(clnt, xpi, xprt_iter_init_listoffline); +} + /** * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports * @clnt: pointer to client @@ -3018,6 +3025,64 @@ int rpc_clnt_add_xprt(struct rpc_clnt *clnt, } EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt); +static int rpc_xprt_probe_trunked(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, + struct rpc_add_xprt_test *data) +{ + struct rpc_xprt_switch *xps; + struct rpc_xprt 
*main_xprt; + int status = 0; + + xprt_get(xprt); + + rcu_read_lock(); + main_xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + status = rpc_cmp_addr_port((struct sockaddr *)&xprt->addr, + (struct sockaddr *)&main_xprt->addr); + rcu_read_unlock(); + xprt_put(main_xprt); + if (status || !test_bit(XPRT_OFFLINE, &xprt->state)) + goto out; + + status = rpc_clnt_add_xprt_helper(clnt, xprt, data); +out: + xprt_put(xprt); + xprt_switch_put(xps); + return status; +} + +/* rpc_clnt_probe_trunked_xprt -- probe offlined transport for session trunking + * @clnt rpc_clnt structure + * + * For each offlined transport found in the rpc_clnt structure call + * the function rpc_xprt_probe_trunked() which will determine if this + * transport still belongs to the trunking group. + */ +void rpc_clnt_probe_trunked_xprts(struct rpc_clnt *clnt, + struct rpc_add_xprt_test *data) +{ + struct rpc_xprt_iter xpi; + int ret; + + ret = rpc_clnt_xprt_iter_offline_init(clnt, &xpi); + if (ret) + return; + for (;;) { + struct rpc_xprt *xprt = xprt_iter_get_next(&xpi); + + if (!xprt) + break; + ret = rpc_xprt_probe_trunked(clnt, xprt, data); + xprt_put(xprt); + if (ret < 0) + break; + xprt_iter_rewind(&xpi); + } + xprt_iter_destroy(&xpi); +} +EXPORT_SYMBOL_GPL(rpc_clnt_probe_trunked_xprts); + static int rpc_xprt_offline(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) From f201bdfd7c87967480000db8974f683c14aa6eb2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Mon, 25 Jul 2022 09:32:31 -0400 Subject: [PATCH 37/46] NFSv4.1 probe offline transports for trunking on session creation Once the session is established call into the SUNRPC layer to check if any offlined trunking connections should be re-enabled. 
Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4850e29904e6..5f59de55ac84 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9249,6 +9249,13 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred) int status; unsigned *ptr; struct nfs4_session *session = clp->cl_session; + struct nfs4_add_xprt_data xprtdata = { + .clp = clp, + }; + struct rpc_add_xprt_test rpcdata = { + .add_xprt_test = clp->cl_mvops->session_trunk, + .data = &xprtdata, + }; dprintk("--> %s clp=%p session=%p\n", __func__, clp, session); @@ -9265,6 +9272,7 @@ int nfs4_proc_create_session(struct nfs_client *clp, const struct cred *cred) ptr = (unsigned *)&session->sess_id.data[0]; dprintk("%s client>seqid %d sessionid %u:%u:%u:%u\n", __func__, clp->cl_seqid, ptr[0], ptr[1], ptr[2], ptr[3]); + rpc_clnt_probe_trunked_xprts(clp->cl_rpcclient, &rpcdata); out: return status; } From e35a5e782f67ed76a65ad0f23a484444a95f000f Mon Sep 17 00:00:00 2001 From: Zhang Xianwei Date: Wed, 27 Jul 2022 18:01:07 +0800 Subject: [PATCH 38/46] NFSv4.1: RECLAIM_COMPLETE must handle EACCES A client should be able to handle getting an EACCES error while doing a mount operation to reclaim state due to NFS4CLNT_RECLAIM_REBOOT being set. If the server returns RPC_AUTH_BADCRED because authentication failed when we execute "exportfs -au", then RECLAIM_COMPLETE will go the wrong way. After mount succeeds, all OPEN calls will fail due to an NFS4ERR_GRACE error being returned. This patch fixes it by resending the RPC request.
Signed-off-by: Zhang Xianwei Signed-off-by: Yi Wang Fixes: aa5190d0ed7d ("NFSv4: Kill nfs4_async_handle_error() abuses by NFSv4.1") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5f59de55ac84..2d7c14ade193 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9487,6 +9487,9 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf rpc_delay(task, NFS4_POLL_RETRY_MAX); fallthrough; case -NFS4ERR_RETRY_UNCACHED_REP: + case -EACCES: + dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n", + __func__, task->tk_status, clp->cl_hostname); return -EAGAIN; case -NFS4ERR_BADSESSION: case -NFS4ERR_DEADSESSION: From 6622e3a73112fc336c1c2c582428fb5ef18e456a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 27 Jul 2022 12:27:54 -0400 Subject: [PATCH 39/46] SUNRPC: Reinitialise the backchannel request buffers before reuse When we're reusing the backchannel requests instead of freeing them, then we should reinitialise any values of the send/receive xdr_bufs so that they reflect the available space. 
Fixes: 0d2a970d0ae5 ("SUNRPC: Fix a backchannel race") Signed-off-by: Trond Myklebust --- net/sunrpc/backchannel_rqst.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 5a6b61dcdf2d..ad8ef1fb08b4 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -64,6 +64,17 @@ static void xprt_free_allocation(struct rpc_rqst *req) kfree(req); } +static void xprt_bc_reinit_xdr_buf(struct xdr_buf *buf) +{ + buf->head[0].iov_len = PAGE_SIZE; + buf->tail[0].iov_len = 0; + buf->pages = NULL; + buf->page_len = 0; + buf->flags = 0; + buf->len = 0; + buf->buflen = PAGE_SIZE; +} + static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags) { struct page *page; @@ -292,6 +303,9 @@ void xprt_free_bc_rqst(struct rpc_rqst *req) */ spin_lock_bh(&xprt->bc_pa_lock); if (xprt_need_to_requeue(xprt)) { + xprt_bc_reinit_xdr_buf(&req->rq_snd_buf); + xprt_bc_reinit_xdr_buf(&req->rq_rcv_buf); + req->rq_rcv_buf.len = PAGE_SIZE; list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list); xprt->bc_alloc_count++; atomic_inc(&xprt->bc_slot_count); From 72691a269f0baad6d5f4aa7af97c29081b86d70f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 27 Jul 2022 13:02:27 -0400 Subject: [PATCH 40/46] SUNRPC: Don't reuse bvec on retransmission of the request If a request is re-encoded and then retransmitted, we need to make sure that we also re-encode the bvec, in case the page lists have changed. 
Fixes: ff053dbbaffe ("SUNRPC: Move the call to xprt_send_pagedata() out of xprt_sock_sendmsg()") Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 ++- net/sunrpc/clnt.c | 1 - net/sunrpc/xprt.c | 27 ++++++++++++++++++--------- net/sunrpc/xprtsock.c | 12 ++---------- 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 0d51b9f9ea37..b9f59aabee53 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -144,7 +144,8 @@ struct rpc_xprt_ops { unsigned short (*get_srcport)(struct rpc_xprt *xprt); int (*buf_alloc)(struct rpc_task *task); void (*buf_free)(struct rpc_task *task); - int (*prepare_request)(struct rpc_rqst *req); + int (*prepare_request)(struct rpc_rqst *req, + struct xdr_buf *buf); int (*send_request)(struct rpc_rqst *req); void (*wait_for_reply_request)(struct rpc_task *task); void (*timer)(struct rpc_xprt *xprt, struct rpc_task *task); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index bbfc47f03480..b098e707ad41 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1870,7 +1870,6 @@ rpc_xdr_encode(struct rpc_task *task) req->rq_snd_buf.head[0].iov_len = 0; xdr_init_encode(&xdr, &req->rq_snd_buf, req->rq_snd_buf.head[0].iov_base, req); - xdr_free_bvec(&req->rq_snd_buf); if (rpc_encode_header(task, &xdr)) return; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 44348c9f4b00..d71eec494826 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -73,7 +73,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_destroy(struct rpc_xprt *xprt); static void xprt_request_init(struct rpc_task *task); -static int xprt_request_prepare(struct rpc_rqst *req); +static int xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf); static DEFINE_SPINLOCK(xprt_list_lock); static LIST_HEAD(xprt_list); @@ -1149,7 +1149,7 @@ xprt_request_enqueue_receive(struct 
rpc_task *task) if (!xprt_request_need_enqueue_receive(task, req)) return 0; - ret = xprt_request_prepare(task->tk_rqstp); + ret = xprt_request_prepare(task->tk_rqstp, &req->rq_rcv_buf); if (ret) return ret; spin_lock(&xprt->queue_lock); @@ -1179,8 +1179,11 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { xprt_request_rb_remove(req->rq_xprt, req); + xdr_free_bvec(&req->rq_rcv_buf); + req->rq_private_buf.bvec = NULL; + } } /** @@ -1336,8 +1339,14 @@ xprt_request_enqueue_transmit(struct rpc_task *task) { struct rpc_rqst *pos, *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + int ret; if (xprt_request_need_enqueue_transmit(task, req)) { + ret = xprt_request_prepare(task->tk_rqstp, &req->rq_snd_buf); + if (ret) { + task->tk_status = ret; + return; + } req->rq_bytes_sent = 0; spin_lock(&xprt->queue_lock); /* @@ -1397,6 +1406,7 @@ xprt_request_dequeue_transmit_locked(struct rpc_task *task) } else list_del(&req->rq_xmit2); atomic_long_dec(&req->rq_xprt->xmit_queuelen); + xdr_free_bvec(&req->rq_snd_buf); } /** @@ -1433,8 +1443,6 @@ xprt_request_dequeue_xprt(struct rpc_task *task) test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || xprt_is_pinned_rqst(req)) { spin_lock(&xprt->queue_lock); - xprt_request_dequeue_transmit_locked(task); - xprt_request_dequeue_receive_locked(task); while (xprt_is_pinned_rqst(req)) { set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); spin_unlock(&xprt->queue_lock); @@ -1442,6 +1450,8 @@ xprt_request_dequeue_xprt(struct rpc_task *task) spin_lock(&xprt->queue_lock); clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); } + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); } } @@ -1449,18 +1459,19 @@ xprt_request_dequeue_xprt(struct rpc_task *task) /** * xprt_request_prepare - 
prepare an encoded request for transport * @req: pointer to rpc_rqst + * @buf: pointer to send/rcv xdr_buf * * Calls into the transport layer to do whatever is needed to prepare * the request for transmission or receive. * Returns error, or zero. */ static int -xprt_request_prepare(struct rpc_rqst *req) +xprt_request_prepare(struct rpc_rqst *req, struct xdr_buf *buf) { struct rpc_xprt *xprt = req->rq_xprt; if (xprt->ops->prepare_request) - return xprt->ops->prepare_request(req); + return xprt->ops->prepare_request(req, buf); return 0; } @@ -1961,8 +1972,6 @@ void xprt_release(struct rpc_task *task) spin_unlock(&xprt->transport_lock); if (req->rq_buffer) xprt->ops->buf_free(task); - xdr_free_bvec(&req->rq_rcv_buf); - xdr_free_bvec(&req->rq_snd_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); if (req->rq_release_snd_buf) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index eba1be9984f8..e976007f4fd0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -822,17 +822,9 @@ static int xs_stream_nospace(struct rpc_rqst *req, bool vm_wait) return ret; } -static int -xs_stream_prepare_request(struct rpc_rqst *req) +static int xs_stream_prepare_request(struct rpc_rqst *req, struct xdr_buf *buf) { - gfp_t gfp = rpc_task_gfp_mask(); - int ret; - - ret = xdr_alloc_bvec(&req->rq_snd_buf, gfp); - if (ret < 0) - return ret; - xdr_free_bvec(&req->rq_rcv_buf); - return xdr_alloc_bvec(&req->rq_rcv_buf, gfp); + return xdr_alloc_bvec(buf, rpc_task_gfp_mask()); } /* From b1a28f2eb9ea7a5a1763fe53fe699aa0feae4231 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 1 Aug 2022 14:16:51 -0400 Subject: [PATCH 41/46] NFS: nfs_async_write_reschedule_io must not recurse into the writeback code It is not safe to call filemap_fdatawrite_range() from nfs_async_write_reschedule_io(), since we're often calling from a page reclaim context. Just let fsync() redrive the writeback for us. 
Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 16d166bc4099..4adf2b488da1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1444,8 +1444,6 @@ static void nfs_async_write_error(struct list_head *head, int error) static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) { nfs_async_write_error(&hdr->pages, 0); - filemap_fdatawrite_range(hdr->inode->i_mapping, hdr->args.offset, - hdr->args.offset + hdr->args.count - 1); } static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { From 2135e5d56278ffdb1c2e6d325dc6b87f669b9dac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 2 Aug 2022 15:48:50 -0400 Subject: [PATCH 42/46] NFSv4/pnfs: Fix a use-after-free bug in open If someone cancels the open RPC call, then we must not try to free either the open slot or the layoutget operation arguments, since they are likely still in use by the hung RPC call. Fixes: 6949493884fe ("NFSv4: Don't hold the layoutget locks across multiple RPC calls") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2d7c14ade193..3ed14a2a84a4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3096,12 +3096,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, } out: - if (opendata->lgp) { - nfs4_lgopen_release(opendata->lgp); - opendata->lgp = NULL; - } - if (!opendata->cancelled) + if (!opendata->cancelled) { + if (opendata->lgp) { + nfs4_lgopen_release(opendata->lgp); + opendata->lgp = NULL; + } nfs4_sequence_free_slot(&opendata->o_res.seq_res); + } return ret; } From 3c59366c207e4c6c6569524af606baf017a55c61 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Aug 2022 10:33:34 +1000 Subject: [PATCH 43/46] NFS: don't unhash dentry during unlink/rename NFS unlink() (and rename over existing target) must determine if 
the file is open, and must perform a "silly rename" instead of an unlink (or before rename) if it is. Otherwise the client might hold a file open which has been removed on the server. Consequently if it determines that the file isn't open, it must block any subsequent opens until the unlink/rename has been completed on the server. This is currently achieved by unhashing the dentry. This forces any open attempt to the slow-path for lookup which will block on i_rwsem on the directory until the unlink/rename completes. A future patch will change the VFS to only get a shared lock on i_rwsem for unlink, so this will no longer work. Instead we introduce an explicit interlock. A special value is stored in dentry->d_fsdata while the unlink/rename is running and ->d_revalidate blocks while that value is present. When ->d_revalidate unblocks, the dentry will be invalid. This closes the race without requiring exclusion on i_rwsem. d_fsdata is already used in two different ways. 1/ an IS_ROOT directory dentry might have a "devname" stored in d_fsdata. Such a dentry doesn't have a name and so cannot be the target of unlink or rename. For safety we check if an old devname is still stored, and remove it if it is. 2/ a dentry with DCACHE_NFSFS_RENAMED set will have a 'struct nfs_unlinkdata' stored in d_fsdata. While this is set maydelete() will fail, so an unlink or rename will never proceed on such a dentry. Neither of these can be in effect when a dentry is the target of unlink or rename. So we can expect d_fsdata to be NULL, and store a special value ((void*)1) which is given the name NFS_FSDATA_BLOCKED to indicate that any lookup will be blocked. The d_count() is incremented under d_lock() when a lookup finds the dentry, so we check d_count() is low, and set NFS_FSDATA_BLOCKED under the same lock to avoid any races. 
Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 72 +++++++++++++++++++++++++++++++----------- include/linux/nfs_fs.h | 9 ++++++ 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7b2230297f6b..dbab3caa15ed 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1782,6 +1782,8 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, int ret; if (flags & LOOKUP_RCU) { + if (dentry->d_fsdata == NFS_FSDATA_BLOCKED) + return -ECHILD; parent = READ_ONCE(dentry->d_parent); dir = d_inode_rcu(parent); if (!dir) @@ -1790,6 +1792,9 @@ __nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags, if (parent != READ_ONCE(dentry->d_parent)) return -ECHILD; } else { + /* Wait for unlink to complete */ + wait_var_event(&dentry->d_fsdata, + dentry->d_fsdata != NFS_FSDATA_BLOCKED); parent = dget_parent(dentry); ret = reval(d_inode(parent), dentry, flags); dput(parent); @@ -2458,7 +2463,6 @@ static int nfs_safe_remove(struct dentry *dentry) int nfs_unlink(struct inode *dir, struct dentry *dentry) { int error; - int need_rehash = 0; dfprintk(VFS, "NFS: unlink(%s/%lu, %pd)\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -2473,15 +2477,25 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) error = nfs_sillyrename(dir, dentry); goto out; } - if (!d_unhashed(dentry)) { - __d_drop(dentry); - need_rehash = 1; - } + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. 
+ */ + error = -ETXTBSY; + if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + goto out; + if (dentry->d_fsdata) + /* old devname */ + kfree(dentry->d_fsdata); + dentry->d_fsdata = NFS_FSDATA_BLOCKED; + spin_unlock(&dentry->d_lock); error = nfs_safe_remove(dentry); nfs_dentry_remove_handle_error(dir, dentry, error); - if (need_rehash) - d_rehash(dentry); + dentry->d_fsdata = NULL; + wake_up_var(&dentry->d_fsdata); out: trace_nfs_unlink_exit(dir, dentry, error); return error; @@ -2588,6 +2602,15 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL_GPL(nfs_link); +static void +nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) +{ + struct dentry *new_dentry = data->new_dentry; + + new_dentry->d_fsdata = NULL; + wake_up_var(&new_dentry->d_fsdata); +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2618,8 +2641,9 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct dentry *dentry = NULL, *rehash = NULL; + struct dentry *dentry = NULL; struct rpc_task *task; + bool must_unblock = false; int error = -EBUSY; if (flags) @@ -2637,18 +2661,27 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode)) { - /* - * To prevent any new references to the target during the - * rename, we unhash the dentry in advance. + /* We must prevent any concurrent open until the unlink + * completes. ->d_revalidate will wait for ->d_fsdata + * to clear. We set it here to ensure no lookup succeeds until + * the unlink is complete on the server. 
*/ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; + error = -ETXTBSY; + if (WARN_ON(new_dentry->d_flags & DCACHE_NFSFS_RENAMED) || + WARN_ON(new_dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + goto out; + if (new_dentry->d_fsdata) { + /* old devname */ + kfree(new_dentry->d_fsdata); + new_dentry->d_fsdata = NULL; } + spin_lock(&new_dentry->d_lock); if (d_count(new_dentry) > 2) { int err; + spin_unlock(&new_dentry->d_lock); + /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, &new_dentry->d_name); @@ -2661,14 +2694,19 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, goto out; new_dentry = dentry; - rehash = NULL; new_inode = NULL; + } else { + new_dentry->d_fsdata = NFS_FSDATA_BLOCKED; + must_unblock = true; + spin_unlock(&new_dentry->d_lock); } + } if (S_ISREG(old_inode->i_mode)) nfs_sync_inode(old_inode); - task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, + must_unblock ? nfs_unblock_rename : NULL); if (IS_ERR(task)) { error = PTR_ERR(task); goto out; @@ -2692,8 +2730,6 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, spin_unlock(&old_inode->i_lock); } out: - if (rehash) - d_rehash(rehash); trace_nfs_rename_exit(old_dir, old_dentry, new_dir, new_dentry, error); if (!error) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index a17c337dbdf1..b32ed68e7dc4 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -617,6 +617,15 @@ nfs_fileid_to_ino_t(u64 fileid) #define NFS_JUKEBOX_RETRY_TIME (5 * HZ) +/* We need to block new opens while a file is being unlinked. + * If it is opened *before* we decide to unlink, we will silly-rename + * instead. If it is opened *after*, then we need to create or will fail. + * If we allow the two to race, we could end up with a file that is open + * but deleted on the server resulting in ESTALE. 
+ * So use ->d_fsdata to record when the unlink is happening + * and block dentry revalidation while it is set. + */ +#define NFS_FSDATA_BLOCKED ((void*)1) # undef ifdebug # ifdef NFS_DEBUG From af887e437bb298752b2edc5834048b8151b8aea0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 12:50:28 -0400 Subject: [PATCH 44/46] NFS: Improve write error tracing Don't leak request pointers, but use the "device:inode" labelling that is used by all the other trace points. Furthermore, replace use of page indexes with an offset, again in order to align behaviour with other NFS trace points. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 36 +++++++++++++++++++++--------------- fs/nfs/write.c | 8 +++++--- include/linux/nfs_page.h | 3 +-- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8bd0c13a7c4b..731eecfdf49a 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1447,44 +1447,50 @@ TRACE_EVENT(nfs_writeback_done, DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( + const struct inode *inode, const struct nfs_page *req, int error ), - TP_ARGS(req, error), + TP_ARGS(inode, req, error), TP_STRUCT__entry( - __field(const void *, req) - __field(pgoff_t, index) - __field(unsigned int, offset) - __field(unsigned int, pgbase) - __field(unsigned int, bytes) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(loff_t, offset) + __field(unsigned int, count) __field(int, error) ), TP_fast_assign( - __entry->req = req; - __entry->index = req->wb_index; - __entry->offset = req->wb_offset; - __entry->pgbase = req->wb_pgbase; - __entry->bytes = req->wb_bytes; + const struct nfs_inode *nfsi = NFS_I(inode); + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; __entry->error = error; ), TP_printk( - "req=%p index=%lu offset=%u pgbase=%u 
bytes=%u error=%d", - __entry->req, __entry->index, __entry->offset, - __entry->pgbase, __entry->bytes, __entry->error + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u", __entry->error, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, + __entry->count ) ); #define DEFINE_NFS_PAGEERR_EVENT(name) \ DEFINE_EVENT(nfs_page_error_class, name, \ TP_PROTO( \ + const struct inode *inode, \ const struct nfs_page *req, \ int error \ ), \ - TP_ARGS(req, error)) + TP_ARGS(inode, req, error)) DEFINE_NFS_PAGEERR_EVENT(nfs_write_error); DEFINE_NFS_PAGEERR_EVENT(nfs_comp_error); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4adf2b488da1..4a3796811b4b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -592,7 +592,8 @@ nfs_lock_and_join_requests(struct page *page) static void nfs_write_error(struct nfs_page *req, int error) { - trace_nfs_write_error(req, error); + trace_nfs_write_error(page_file_mapping(req->wb_page)->host, req, + error); nfs_mapping_set_error(req->wb_page, error); nfs_inode_remove_request(req); nfs_end_page_writeback(req); @@ -1000,7 +1001,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) nfs_list_remove_request(req); if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes < bytes)) { - trace_nfs_comp_error(req, hdr->error); + trace_nfs_comp_error(hdr->inode, req, hdr->error); nfs_mapping_set_error(req->wb_page, hdr->error); goto remove_req; } @@ -1882,7 +1883,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) (long long)req_offset(req)); if (status < 0) { if (req->wb_page) { - trace_nfs_commit_error(req, status); + trace_nfs_commit_error(data->inode, req, + status); nfs_mapping_set_error(req->wb_page, status); nfs_inode_remove_request(req); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index f0373a6cb5fb..ba7e2e4b0926 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -202,8 
+202,7 @@ nfs_list_entry(struct list_head *head) return list_entry(head, struct nfs_page, wb_list); } -static inline -loff_t req_offset(struct nfs_page *req) +static inline loff_t req_offset(const struct nfs_page *req) { return (((loff_t)req->wb_index) << PAGE_SHIFT) + req->wb_offset; } From b313eb91521872284c0e395773fc6e9827fb1446 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 13:46:41 -0400 Subject: [PATCH 45/46] NFS: Improve O_DIRECT tracing Switch the formatting to match the other NFS tracepoints. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 731eecfdf49a..8e87cf8e5e78 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1597,12 +1597,10 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, TP_ARGS(dreq), TP_STRUCT__entry( - __field(const struct nfs_direct_req *, dreq) __field(dev_t, dev) __field(u64, fileid) __field(u32, fhandle) - __field(int, ref) - __field(loff_t, io_start) + __field(loff_t, offset) __field(ssize_t, count) __field(ssize_t, bytes_left) __field(ssize_t, error) @@ -1614,12 +1612,10 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, const struct nfs_inode *nfsi = NFS_I(inode); const struct nfs_fh *fh = &nfsi->fh; - __entry->dreq = dreq; __entry->dev = inode->i_sb->s_dev; __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(fh); - __entry->ref = kref_read(&dreq->kref); - __entry->io_start = dreq->io_start; + __entry->offset = dreq->io_start; __entry->count = dreq->count; __entry->bytes_left = dreq->bytes_left; __entry->error = dreq->error; @@ -1627,13 +1623,14 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class, ), TP_printk( - "dreq=%p fileid=%02x:%02x:%llu fhandle=0x%08x ref=%d " - "io_start=%lld count=%zd bytes_left=%zd error=%zd flags=%s", - __entry->dreq, MAJOR(__entry->dev), MINOR(__entry->dev), + "error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd 
bytes_left=%zd flags=%s", + __entry->error, MAJOR(__entry->dev), + MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle, __entry->ref, - __entry->io_start, __entry->count, __entry->bytes_left, - __entry->error, nfs_show_direct_req_flags(__entry->flags) + __entry->fhandle, __entry->offset, + __entry->count, __entry->bytes_left, + nfs_show_direct_req_flags(__entry->flags) ) ); From 3fa5cbdc44de190f2c5605ba7db015ae0d26f668 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 9 Aug 2022 13:59:09 -0400 Subject: [PATCH 46/46] NFS: Improve readpage/writepage tracing Switch formatting to better match that used by other NFS tracepoints. Signed-off-by: Trond Myklebust --- fs/nfs/nfstrace.h | 54 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 8e87cf8e5e78..8c6cc58679ff 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -1137,7 +1137,7 @@ TRACE_EVENT(nfs_readpage_done, __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1146,7 +1146,7 @@ TRACE_EVENT(nfs_readpage_done, const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1157,14 +1157,13 @@ TRACE_EVENT(nfs_readpage_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, - __entry->eof ? " eof" : "" + __entry->res_count, __entry->eof ? 
" eof" : "" ) ); @@ -1184,7 +1183,7 @@ TRACE_EVENT(nfs_readpage_short, __field(u32, arg_count) __field(u32, res_count) __field(bool, eof) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1193,7 +1192,7 @@ TRACE_EVENT(nfs_readpage_short, const struct nfs_fh *fh = hdr->args.fh ? hdr->args.fh : &nfsi->fh; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1204,14 +1203,13 @@ TRACE_EVENT(nfs_readpage_short, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, - __entry->eof ? " eof" : "" + __entry->res_count, __entry->eof ? " eof" : "" ) ); @@ -1323,7 +1321,7 @@ TRACE_EVENT(nfs_pgio_error, __field(u32, arg_count) __field(u32, res_count) __field(loff_t, pos) - __field(int, status) + __field(int, error) ), TP_fast_assign( @@ -1332,7 +1330,7 @@ TRACE_EVENT(nfs_pgio_error, const struct nfs_fh *fh = hdr->args.fh ? 
hdr->args.fh : &nfsi->fh; - __entry->status = error; + __entry->error = error; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1341,12 +1339,12 @@ TRACE_EVENT(nfs_pgio_error, __entry->fhandle = nfs_fhandle_hash(fh); ), - TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u pos=%llu status=%d", + TP_printk("error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u pos=%llu", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, __entry->res_count, - __entry->pos, __entry->status + __entry->pos ) ); @@ -1406,7 +1404,7 @@ TRACE_EVENT(nfs_writeback_done, __field(loff_t, offset) __field(u32, arg_count) __field(u32, res_count) - __field(int, status) + __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), @@ -1418,7 +1416,7 @@ TRACE_EVENT(nfs_writeback_done, hdr->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = hdr->res.verf; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = hdr->args.offset; __entry->arg_count = hdr->args.count; __entry->res_count = hdr->res.count; @@ -1432,14 +1430,14 @@ TRACE_EVENT(nfs_writeback_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%u res=%u status=%d stable=%s " - "verifier=%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%u res=%u stable=%s " + "verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, __entry->arg_count, - __entry->res_count, __entry->status, + __entry->res_count, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) ) @@ -1547,7 +1545,7 @@ TRACE_EVENT(nfs_commit_done, __field(u32, fhandle) __field(u64, fileid) __field(loff_t, offset) 
- __field(int, status) + __field(int, error) __field(unsigned long, stable) __array(char, verifier, NFS4_VERIFIER_SIZE) ), @@ -1559,7 +1557,7 @@ TRACE_EVENT(nfs_commit_done, data->args.fh : &nfsi->fh; const struct nfs_writeverf *verf = data->res.verf; - __entry->status = task->tk_status; + __entry->error = task->tk_status; __entry->offset = data->args.offset; __entry->stable = verf->committed; memcpy(__entry->verifier, @@ -1571,12 +1569,12 @@ TRACE_EVENT(nfs_commit_done, ), TP_printk( - "fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld status=%d stable=%s verifier=%s", + "error=%d fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld stable=%s verifier=%s", __entry->error, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, - (long long)__entry->offset, __entry->status, + (long long)__entry->offset, show_nfs_stable_how(__entry->stable), show_nfs4_verifier(__entry->verifier) )