diff --git a/MAINTAINERS b/MAINTAINERS index c660e9be8ce6..f31f987bda42 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8172,6 +8172,7 @@ F: include/uapi/scsi/fc/ FILE LOCKING (flock() and fcntl()/lockf()) M: Jeff Layton M: Chuck Lever +R: Alexander Aring L: linux-fsdevel@vger.kernel.org S: Maintained F: fs/fcntl.c diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index ce5862482097..ab8042a5b895 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = { #endif }; -static struct svc_stat nlmsvc_stats; - #define NLM_NRVERS ARRAY_SIZE(nlmsvc_version) static struct svc_program nlmsvc_program = { .pg_prog = NLM_PROGRAM, /* program number */ @@ -719,7 +717,6 @@ static struct svc_program nlmsvc_program = { .pg_vers = nlmsvc_version, /* version table */ .pg_name = "lockd", /* service name */ .pg_class = "nfsd", /* share authentication with nfsd */ - .pg_stats = &nlmsvc_stats, /* stats table */ .pg_authenticate = &lockd_authenticate, /* export authentication */ .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 760d27dd7225..8adfcd4c8c1a 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = { [4] = &nfs4_callback_version4, }; -static struct svc_stat nfs4_callback_stats; - static struct svc_program nfs4_callback_program = { .pg_prog = NFS4_CALLBACK, /* RPC service number */ .pg_nvers = ARRAY_SIZE(nfs4_callback_version), /* Number of entries */ .pg_vers = nfs4_callback_version, /* version table */ .pg_name = "NFSv4 callback", /* service name */ .pg_class = "nfs", /* authentication class */ - .pg_stats = &nfs4_callback_stats, .pg_authenticate = nfs_callback_authenticate, .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 46fd74d91ea9..3c040c81c77d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode, } static void -nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) +nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index 4cbe0434cbb8..66a05fefae98 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -80,8 +80,6 @@ enum { int nfsd_drc_slab_create(void); void nfsd_drc_slab_free(void); -int nfsd_net_reply_cache_init(struct nfsd_net *nn); -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn); int nfsd_reply_cache_init(struct nfsd_net *); void nfsd_reply_cache_shutdown(struct nfsd_net *); int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index b86d8494052c..ddd3e0d9cfa6 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age); static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions); struct nfsd_fcache_disposal { - struct work_struct work; spinlock_t lock; struct list_head freeme; }; -static struct workqueue_struct *nfsd_filecache_wq __read_mostly; - static struct kmem_cache *nfsd_file_slab; static struct kmem_cache *nfsd_file_mark_slab; static struct list_lru nfsd_file_lru; @@ -283,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf) nfsd_file_mark_put(nf->nf_mark); if (nf->nf_file) { nfsd_file_check_write_error(nf); - filp_close(nf->nf_file, NULL); + nfsd_filp_close(nf->nf_file); } /* @@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose) spin_lock(&l->lock); list_move_tail(&nf->nf_lru, &l->freeme); spin_unlock(&l->lock); - queue_work(nfsd_filecache_wq, &l->work); + svc_wake_up(nn->nfsd_serv); + } +} + +/** + * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed. + * @nn: nfsd_net in which to find files to be disposed. + * + * When files held open for nfsv3 are removed from the filecache, whether + * due to memory pressure or garbage collection, they are queued to + * a per-net-ns queue. This function completes the disposal, either + * directly or by waking another nfsd thread to help with the work. + */ +void nfsd_file_net_dispose(struct nfsd_net *nn) +{ + struct nfsd_fcache_disposal *l = nn->fcache_disposal; + + if (!list_empty(&l->freeme)) { + LIST_HEAD(dispose); + int i; + + spin_lock(&l->lock); + for (i = 0; i < 8 && !list_empty(&l->freeme); i++) + list_move(l->freeme.next, &dispose); + spin_unlock(&l->lock); + if (!list_empty(&l->freeme)) + /* Wake up another thread to share the work + * *before* doing any actual disposing. + */ + svc_wake_up(nn->nfsd_serv); + nfsd_file_dispose_list(&dispose); } } @@ -631,28 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode) list_del_init(&nf->nf_lru); nfsd_file_free(nf); } - flush_delayed_fput(); -} - -/** - * nfsd_file_delayed_close - close unused nfsd_files - * @work: dummy - * - * Scrape the freeme list for this nfsd_net, and then dispose of them - * all. - */ -static void -nfsd_file_delayed_close(struct work_struct *work) -{ - LIST_HEAD(head); - struct nfsd_fcache_disposal *l = container_of(work, - struct nfsd_fcache_disposal, work); - - spin_lock(&l->lock); - list_splice_init(&l->freeme, &head); - spin_unlock(&l->lock); - - nfsd_file_dispose_list(&head); } static int @@ -717,25 +722,18 @@ nfsd_file_cache_init(void) return ret; ret = -ENOMEM; - nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0); - if (!nfsd_filecache_wq) - goto out; - - nfsd_file_slab = kmem_cache_create("nfsd_file", - sizeof(struct nfsd_file), 0, 0, NULL); + nfsd_file_slab = KMEM_CACHE(nfsd_file, 0); if (!nfsd_file_slab) { pr_err("nfsd: unable to create nfsd_file_slab\n"); goto out_err; } - nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", - sizeof(struct nfsd_file_mark), 0, 0, NULL); + nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0); if (!nfsd_file_mark_slab) { pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); goto out_err; } - ret = list_lru_init(&nfsd_file_lru); if (ret) { pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); @@ -785,8 +783,6 @@ out_err: nfsd_file_slab = NULL; kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); goto out; } @@ -832,7 +828,6 @@ nfsd_alloc_fcache_disposal(void) l = kmalloc(sizeof(*l), GFP_KERNEL); if (!l) return NULL; - INIT_WORK(&l->work, nfsd_file_delayed_close); spin_lock_init(&l->lock); INIT_LIST_HEAD(&l->freeme); return l; @@ -841,7 +836,6 @@ nfsd_alloc_fcache_disposal(void) static void nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l) { - cancel_work_sync(&l->work); nfsd_file_dispose_list(&l->freeme); kfree(l); } @@ -910,8 +904,6 @@ nfsd_file_cache_shutdown(void) fsnotify_wait_marks_destroyed(); kmem_cache_destroy(nfsd_file_mark_slab); nfsd_file_mark_slab = NULL; - destroy_workqueue(nfsd_filecache_wq); - nfsd_filecache_wq = NULL; rhltable_destroy(&nfsd_file_rhltable); for_each_possible_cpu(i) { diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index e54165a3224f..c61884def906 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); +void nfsd_file_net_dispose(struct nfsd_net *nn); bool nfsd_file_is_cached(struct inode *inode); __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned int may_flags, struct nfsd_file **nfp); diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 74b4360779a1..d4be519b5734 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -11,8 +11,10 @@ #include #include #include +#include #include #include +#include /* Hash tables for nfs4_clientid state */ #define CLIENT_HASH_BITS 4 @@ -26,10 +28,22 @@ struct nfsd4_client_tracking_ops; enum { /* cache misses due only to checksum comparison failures */ - NFSD_NET_PAYLOAD_MISSES, + NFSD_STATS_PAYLOAD_MISSES, /* amount of memory (in bytes) currently consumed by the DRC */ - NFSD_NET_DRC_MEM_USAGE, - NFSD_NET_COUNTERS_NUM + NFSD_STATS_DRC_MEM_USAGE, + NFSD_STATS_RC_HITS, /* repcache hits */ + NFSD_STATS_RC_MISSES, /* repcache misses */ + NFSD_STATS_RC_NOCACHE, /* uncached reqs */ + NFSD_STATS_FH_STALE, /* FH stale error */ + NFSD_STATS_IO_READ, /* bytes returned to read requests */ + NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ +#ifdef CONFIG_NFSD_V4 + NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ + NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, +#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) + NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ +#endif + NFSD_STATS_COUNTERS_NUM }; /* @@ -164,7 +178,10 @@ struct nfsd_net { atomic_t num_drc_entries; /* Per-netns stats counters */ - struct percpu_counter counter[NFSD_NET_COUNTERS_NUM]; + struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; + + /* sunrpc svc stats */ + struct svc_stat nfsd_svcstats; /* longest hash chain seen */ unsigned int longest_chain; @@ -192,6 +209,10 @@ struct nfsd_net { atomic_t nfsd_courtesy_clients; struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; + + /* last time an admin-revoke happened for NFSv4.0 */ + time64_t nfs40_last_revoke; + }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index b78eceebd945..dfcc957e460d 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -71,13 +71,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp) struct nfsd_attrs attrs = { .na_iattr = &argp->attrs, }; + const struct timespec64 *guardtime = NULL; dprintk("nfsd: SETATTR(3) %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, - argp->check_guard, argp->guardtime); + if (argp->check_guard) + guardtime = &argp->guardtime; + resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime); return rpc_success; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f32128955ec8..a7a07470c1f8 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -295,17 +295,14 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr, static bool svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args) { - __be32 *p; u32 check; if (xdr_stream_decode_bool(xdr, &check) < 0) return false; if (check) { - p = xdr_inline_decode(xdr, XDR_UNIT * 2); - if (!p) + if (!svcxdr_decode_nfstime3(xdr, &args->guardtime)) return false; args->check_guard = 1; - args->guardtime = be32_to_cpup(p); } else args->check_guard = 0; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 32d23ef3e5de..87c9547989f6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -45,7 +45,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); +static void nfsd4_mark_cb_fault(struct nfs4_client *clp); #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -85,7 +85,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n) static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, size_t len) { - WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); + xdr_stream_encode_uint32_array(xdr, bitmap, len); +} + +static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_cb_fattr *fattr) +{ + fattr->ncf_cb_change = 0; + fattr->ncf_cb_fsize = 0; + if (bitmap[0] & FATTR4_WORD0_CHANGE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) + return -NFSERR_BAD_XDR; + if (bitmap[0] & FATTR4_WORD0_SIZE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) + return -NFSERR_BAD_XDR; + return 0; } static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op) @@ -333,6 +347,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr, hdr->nops++; } +/* + * CB_GETATTR4args + * struct CB_GETATTR4args { + * nfs_fh4 fh; + * bitmap4 attr_request; + * }; + * + * The size and change attributes are the only one + * guaranteed to be serviced by the client. + */ +static void +encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, + struct nfs4_cb_fattr *fattr) +{ + struct nfs4_delegation *dp = + container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + + encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); + encode_nfs_fh4(xdr, fh); + encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap)); + hdr->nops++; +} + /* * CB_SEQUENCE4args * @@ -468,6 +506,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, xdr_reserve_space(xdr, 0); } +/* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_getattr4args(xdr, &hdr, ncf); + encode_cb_nops(&hdr); +} + /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ @@ -523,6 +581,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, return 0; } +/* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + u32 bitmap[3] = {0}; + u32 attrlen; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); + if (status) + return status; + if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) + return -NFSERR_BAD_XDR; + if (xdr_stream_decode_u32(xdr, &attrlen) < 0) + return -NFSERR_BAD_XDR; + if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))) + return -NFSERR_BAD_XDR; + status = decode_cb_fattr4(xdr, bitmap, ncf); + return status; +} + /* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ @@ -831,6 +925,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), + PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; @@ -887,7 +982,16 @@ static struct workqueue_struct *callback_wq; static bool nfsd4_queue_cb(struct nfsd4_callback *cb) { - return queue_work(callback_wq, &cb->cb_work); + trace_nfsd_cb_queue(cb->cb_clp, cb); + return queue_delayed_work(callback_wq, &cb->cb_work, 0); +} + +static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb, + unsigned long msecs) +{ + trace_nfsd_cb_queue(cb->cb_clp, cb); + queue_delayed_work(callback_wq, &cb->cb_work, + msecs_to_jiffies(msecs)); } static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) @@ -999,18 +1103,18 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate) { if (clp->cl_cb_state != newstate) { clp->cl_cb_state = newstate; - trace_nfsd_cb_state(clp); + trace_nfsd_cb_new_state(clp); } } -static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_down(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN); } -static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) +static void nfsd4_mark_cb_fault(struct nfs4_client *clp) { if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) return; @@ -1022,7 +1126,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata) struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null); if (task->tk_status) - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); else nfsd4_mark_cb_state(clp, NFSD4_CB_UP); } @@ -1106,6 +1210,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; + trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); @@ -1158,6 +1263,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback if (!cb->cb_holds_slot) goto need_restart; + /* This is the operation status code for CB_SEQUENCE */ + trace_nfsd_cb_seq_status(task, cb); switch (cb->cb_seq_status) { case 0: /* @@ -1171,13 +1278,23 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback break; case -ESERVERFAULT: ++session->se_cb_seq_nr; - fallthrough; - case 1: - case -NFS4ERR_BADSESSION: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); ret = false; break; + case 1: + /* + * cb_seq_status remains 1 if an RPC Reply was never + * received. NFSD can't know if the client processed + * the CB_SEQUENCE operation. Ask the client to send a + * DESTROY_SESSION to recover. + */ + fallthrough; + case -NFS4ERR_BADSESSION: + nfsd4_mark_cb_fault(cb->cb_clp); + ret = false; + goto need_restart; case -NFS4ERR_DELAY: + cb->cb_seq_status = 1; if (!rpc_restart_call(task)) goto out; @@ -1192,14 +1309,11 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback } break; default: - nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status); - dprintk("%s: unprocessed error %d\n", __func__, - cb->cb_seq_status); + nfsd4_mark_cb_fault(cb->cb_clp); } - nfsd41_cb_release_slot(cb); - dprintk("%s: freed slot, new seqid=%d\n", __func__, - clp->cl_cb_session->se_cb_seq_nr); + + trace_nfsd_cb_free_slot(task, cb); if (RPC_SIGNALLED(task)) goto need_restart; @@ -1211,6 +1325,7 @@ retry_nowait: goto out; need_restart: if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); task->tk_status = 0; cb->cb_need_restart = true; } @@ -1240,7 +1355,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) case -EIO: case -ETIMEDOUT: case -EACCES: - nfsd4_mark_cb_down(clp, task->tk_status); + nfsd4_mark_cb_down(clp); } break; default: @@ -1295,12 +1410,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) nfsd41_cb_inflight_wait_complete(clp); } -/* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { struct nfsd4_session *s; struct nfsd4_conn *c; + lockdep_assert_held(&clp->cl_lock); + list_for_each_entry(s, &clp->cl_sessions, se_perclnt) { list_for_each_entry(c, &s->se_conns, cn_persession) { if (c->cn_flags & NFS4_CDFC4_BACK) @@ -1324,11 +1440,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) struct nfsd4_conn *c; int err; + trace_nfsd_cb_bc_update(clp, cb); + /* * This is either an update, or the client dying; in either case, * kill the old client: */ if (clp->cl_cb_client) { + trace_nfsd_cb_bc_shutdown(clp, cb); rpc_shutdown_client(clp->cl_cb_client); clp->cl_cb_client = NULL; put_cred(clp->cl_cb_cred); @@ -1340,13 +1459,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) return; + spin_lock(&clp->cl_lock); /* * Only serialized callback code is allowed to clear these * flags; main nfsd code can only set them: */ - BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); + WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)); clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); + memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn)); c = __nfsd4_find_backchannel(clp); if (c) { @@ -1358,7 +1479,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) err = setup_callback_client(clp, &conn, ses); if (err) { - nfsd4_mark_cb_down(clp, err); + nfsd4_mark_cb_down(clp); if (c) svc_xprt_put(c->cn_xprt); return; @@ -1369,25 +1490,28 @@ static void nfsd4_run_cb_work(struct work_struct *work) { struct nfsd4_callback *cb = - container_of(work, struct nfsd4_callback, cb_work); + container_of(work, struct nfsd4_callback, cb_work.work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; int flags; - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); - } + trace_nfsd_cb_start(clp); if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; if (!clnt) { - /* Callback channel broken, or client killed; give up: */ - nfsd41_destroy_cb(cb); + if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) + nfsd41_destroy_cb(cb); + else { + /* + * XXX: Ideally, we could wait for the client to + * reconnect, but I haven't figured out how + * to do that yet. + */ + nfsd4_queue_cb_delayed(cb, 25); + } return; } @@ -1400,6 +1524,12 @@ nfsd4_run_cb_work(struct work_struct *work) return; } + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } cb->cb_msg.rpc_cred = clp->cl_cb_cred; flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, @@ -1414,8 +1544,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; - INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - cb->cb_seq_status = 1; + INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; cb->cb_need_restart = false; cb->cb_holds_slot = false; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 4c0d00bdfbb1..4f3072b5979a 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #endif } +void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ + struct nfsd_file *fl; + + spin_lock(&ls->ls_stid.sc_file->fi_lock); + fl = ls->ls_file; + ls->ls_file = NULL; + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + + if (fl) { + if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) + kernel_setlease(fl->nf_file, F_UNLCK, NULL, + (void **)&ls); + nfsd_file_put(fl); + } +} + static void nfsd4_free_layout_stateid(struct nfs4_stid *stid) { @@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) list_del_init(&ls->ls_perfile); spin_unlock(&fp->fi_lock); - if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - kernel_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); - nfsd_file_put(ls->ls_file); + nfsd4_close_layout(ls); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -235,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops, NFSPROC4_CLNT_CB_LAYOUT); - if (parent->sc_type == NFS4_DELEG_STID) + if (parent->sc_type == SC_TYPE_DELEG) ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); @@ -249,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, } spin_lock(&clp->cl_lock); - stp->sc_type = NFS4_LAYOUT_STID; + stp->sc_type = SC_TYPE_LAYOUT; list_add(&ls->ls_perclnt, &clp->cl_lo_states); spin_unlock(&clp->cl_lock); @@ -268,13 +283,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, { struct nfs4_layout_stateid *ls; struct nfs4_stid *stid; - unsigned char typemask = NFS4_LAYOUT_STID; + unsigned short typemask = SC_TYPE_LAYOUT; __be32 status; if (create) - typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID); + typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG); - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid, + status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid, net_generic(SVC_NET(rqstp), nfsd_net_id)); if (status) goto out; @@ -285,7 +300,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp, goto out_put_stid; } - if (stid->sc_type != NFS4_LAYOUT_STID) { + if (stid->sc_type != SC_TYPE_LAYOUT) { ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type); nfs4_put_stid(stid); @@ -517,7 +532,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); - nfs4_unhash_stid(&ls->ls_stid); + ls->ls_stid.sc_status |= SC_STATUS_CLOSED; lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); @@ -604,7 +619,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) } static void -nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) +nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file) { struct nfs4_client *clp = ls->ls_stid.sc_client; char addr_str[INET6_ADDRSTRLEN]; @@ -626,7 +641,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; + argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, @@ -656,6 +671,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) struct nfsd_net *nn; ktime_t now, cutoff; const struct nfsd4_layout_ops *ops; + struct nfsd_file *fl; trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task); switch (task->tk_status) { @@ -687,12 +703,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) * Unknown error or non-responding client, we'll need to fence. */ trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid); - - ops = nfsd4_layout_ops[ls->ls_layout_type]; - if (ops->fence_client) - ops->fence_client(ls); - else - nfsd4_cb_layout_fail(ls); + rcu_read_lock(); + fl = nfsd_file_get(ls->ls_file); + rcu_read_unlock(); + if (fl) { + ops = nfsd4_layout_ops[ls->ls_layout_type]; + if (ops->fence_client) + ops->fence_client(ls, fl); + else + nfsd4_cb_layout_fail(ls, fl); + nfsd_file_put(fl); + } return 1; case -NFS4ERR_NOMATCHING_LAYOUT: trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid); @@ -755,13 +776,11 @@ nfsd4_init_pnfs(void) for (i = 0; i < DEVID_HASH_SIZE; i++) INIT_LIST_HEAD(&nfsd_devid_hash[i]); - nfs4_layout_cache = kmem_cache_create("nfs4_layout", - sizeof(struct nfs4_layout), 0, 0, NULL); + nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0); if (!nfs4_layout_cache) return -ENOMEM; - nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid", - sizeof(struct nfs4_layout_stateid), 0, 0, NULL); + nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0); if (!nfs4_layout_stateid_cache) { kmem_cache_destroy(nfs4_layout_cache); return -ENOMEM; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 14712fa08f76..2927b1263f08 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1143,6 +1143,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, }; struct inode *inode; __be32 status = nfs_ok; + bool save_no_wcc; int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { @@ -1168,8 +1169,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, - 0, (time64_t)0); + save_no_wcc = cstate->current_fh.fh_no_wcc; + cstate->current_fh.fh_no_wcc = true; + status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL); + cstate->current_fh.fh_no_wcc = save_no_wcc; if (!status) status = nfserrno(attrs.na_labelerr); if (!status) @@ -2490,10 +2493,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp) return rpc_success; } -static inline void nfsd4_increment_op_stats(u32 opnum) +static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum) { if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP) - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]); + percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]); } static const struct nfsd4_operation nfsd4_ops[]; @@ -2768,7 +2771,7 @@ encode_op: status, nfsd4_op_name(op->opnum)); nfsd4_cstate_clear_replay(cstate); - nfsd4_increment_op_stats(op->opnum); + nfsd4_increment_op_stats(nn, op->opnum); } fh_put(current_fh); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 9257425cbd1a..1a93c7fcf76c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -87,6 +87,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); void nfsd4_end_grace(struct nfsd_net *nn); static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps); static void nfsd4_file_hash_remove(struct nfs4_file *fi); +static void deleg_reaper(struct nfsd_net *nn); /* Locking: */ @@ -127,6 +128,7 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; @@ -318,6 +320,7 @@ free_nbl(struct kref *kref) struct nfsd4_blocked_lock *nbl; nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref); + locks_release_private(&nbl->nbl_lock); kfree(nbl); } @@ -325,7 +328,6 @@ static void free_blocked_lock(struct nfsd4_blocked_lock *nbl) { locks_delete_block(&nbl->nbl_lock); - locks_release_private(&nbl->nbl_lock); kref_put(&nbl->nbl_kref, free_nbl); } @@ -1189,6 +1191,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, + &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); + dp->dl_cb_fattr.ncf_file_modified = false; + dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -1210,6 +1216,8 @@ nfs4_put_stid(struct nfs4_stid *s) return; } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); nfs4_free_cpntf_statelist(clp->net, s); spin_unlock(&clp->cl_lock); s->sc_free(s); @@ -1260,11 +1268,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp) nfs4_put_stid(&dp->dl_stid); } -void nfs4_unhash_stid(struct nfs4_stid *s) -{ - s->sc_type = 0; -} - /** * nfs4_delegation_exists - Discover if this delegation already exists * @clp: a pointer to the nfs4_client we're granting a delegation to @@ -1312,11 +1315,12 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) lockdep_assert_held(&state_lock); lockdep_assert_held(&fp->fi_lock); + lockdep_assert_held(&clp->cl_lock); if (nfs4_delegation_exists(clp, fp)) return -EAGAIN; refcount_inc(&dp->dl_stid.sc_count); - dp->dl_stid.sc_type = NFS4_DELEG_STID; + dp->dl_stid.sc_type = SC_TYPE_DELEG; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); return 0; @@ -1328,7 +1332,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp) } static bool -unhash_delegation_locked(struct nfs4_delegation *dp) +unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1337,7 +1341,13 @@ unhash_delegation_locked(struct nfs4_delegation *dp) if (!delegation_hashed(dp)) return false; - dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID; + if (statusmask == SC_STATUS_REVOKED && + dp->dl_stid.sc_client->cl_minorversion == 0) + statusmask = SC_STATUS_CLOSED; + dp->dl_stid.sc_status |= statusmask; + if (statusmask & SC_STATUS_ADMIN_REVOKED) + atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked); + /* Ensure that deleg break won't try to requeue it */ ++dp->dl_time; spin_lock(&fp->fi_lock); @@ -1353,7 +1363,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) bool unhashed; spin_lock(&state_lock); - unhashed = unhash_delegation_locked(dp); + unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED); spin_unlock(&state_lock); if (unhashed) destroy_unhashed_deleg(dp); @@ -1367,9 +1377,9 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); - if (clp->cl_minorversion) { + if (dp->dl_stid.sc_status & + (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) { spin_lock(&clp->cl_lock); - dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); @@ -1377,8 +1387,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) destroy_unhashed_deleg(dp); } -/* - * SETCLIENTID state +/* + * SETCLIENTID state */ static unsigned int clientid_hashval(u32 id) @@ -1531,6 +1541,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, } idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id); + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + atomic_dec(&s->sc_client->cl_admin_revoked); list_add(&stp->st_locks, reaplist); } @@ -1541,7 +1553,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) if (!unhash_ol_stateid(stp)) return false; list_del_init(&stp->st_locks); - nfs4_unhash_stid(&stp->st_stid); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; return true; } @@ -1599,7 +1611,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp, while (!list_empty(&open_stp->st_locks)) { stp = list_entry(open_stp->st_locks.next, struct nfs4_ol_stateid, st_locks); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, reaplist); } } @@ -1620,6 +1632,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp) LIST_HEAD(reaplist); spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; if (unhash_open_stateid(stp, &reaplist)) put_ol_stateid_locked(stp, &reaplist); spin_unlock(&stp->st_stid.sc_client->cl_lock); @@ -1675,6 +1688,136 @@ static void release_openowner(struct nfs4_openowner *oo) nfs4_put_stateowner(&oo->oo_owner); } +static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp, + struct super_block *sb, + unsigned int sc_types) +{ + unsigned long id, tmp; + struct nfs4_stid *stid; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if ((stid->sc_type & sc_types) && + stid->sc_status == 0 && + stid->sc_file->fi_inode->i_sb == sb) { + refcount_inc(&stid->sc_count); + break; + } + spin_unlock(&clp->cl_lock); + return stid; +} + +/** + * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem + * @net: used to identify instance of nfsd (there is one per net namespace) + * @sb: super_block used to identify target filesystem + * + * All nfs4 states (open, lock, delegation, layout) held by the server instance + * and associated with a file on the given filesystem will be revoked resulting + * in any files being closed and so all references from nfsd to the filesystem + * being released. Thus nfsd will no longer prevent the filesystem from being + * unmounted. + * + * The clients which own the states will subsequently being notified that the + * states have been "admin-revoked". + */ +void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unsigned int idhashval; + unsigned int sc_types; + + sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT; + + spin_lock(&nn->client_lock); + for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) { + struct list_head *head = &nn->conf_id_hashtbl[idhashval]; + struct nfs4_client *clp; + retry: + list_for_each_entry(clp, head, cl_idhash) { + struct nfs4_stid *stid = find_one_sb_stid(clp, sb, + sc_types); + if (stid) { + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + struct nfs4_layout_stateid *ls; + + spin_unlock(&nn->client_lock); + switch (stid->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + OPEN_STATEID_MUTEX); + + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_LOCK: + stp = openlockstateid(stid); + mutex_lock_nested(&stp->st_mutex, + LOCK_STATEID_MUTEX); + spin_lock(&clp->cl_lock); + if (stid->sc_status == 0) { + struct nfs4_lockowner *lo = + lockowner(stp->st_stateowner); + struct nfsd_file *nf; + + stid->sc_status |= + SC_STATUS_ADMIN_REVOKED; + atomic_inc(&clp->cl_admin_revoked); + spin_unlock(&clp->cl_lock); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, + (fl_owner_t)lo); + nfsd_file_put(nf); + } + release_all_access(stp); + } else + spin_unlock(&clp->cl_lock); + mutex_unlock(&stp->st_mutex); + break; + case SC_TYPE_DELEG: + dp = delegstateid(stid); + spin_lock(&state_lock); + if (!unhash_delegation_locked( + dp, SC_STATUS_ADMIN_REVOKED)) + dp = NULL; + spin_unlock(&state_lock); + if (dp) + revoke_delegation(dp); + break; + case SC_TYPE_LAYOUT: + ls = layoutstateid(stid); + nfsd4_close_layout(ls); + break; + } + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + if (clp->cl_minorversion == 0) + /* Allow cleanup after a lease period. + * store_release ensures cleanup will + * see any newly revoked states if it + * sees the time updated. + */ + nn->nfs40_last_revoke = + ktime_get_boottime_seconds(); + goto retry; + } + } + } + spin_unlock(&nn->client_lock); +} + static inline int hash_sessionid(struct nfs4_sessionid *sessionid) { @@ -2228,7 +2371,7 @@ __destroy_client(struct nfs4_client *clp) spin_lock(&state_lock); while (!list_empty(&clp->cl_delegations)) { dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -2460,14 +2603,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t) } static struct nfs4_stid * -find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) +find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, + unsigned short typemask, unsigned short ok_states) { struct nfs4_stid *s; spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, t); if (s != NULL) { - if (typemask & s->sc_type) + if ((s->sc_status & ~ok_states) == 0 && + (typemask & s->sc_type)) refcount_inc(&s->sc_count); else s = NULL; @@ -2487,9 +2632,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode) static void seq_quote_mem(struct seq_file *m, char *data, int len) { - seq_printf(m, "\""); + seq_puts(m, "\""); seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\"); - seq_printf(m, "\""); + seq_puts(m, "\""); } static const char *cb_state2str(int state) @@ -2530,20 +2675,22 @@ static int client_info_show(struct seq_file *m, void *v) seq_puts(m, "status: unconfirmed\n"); seq_printf(m, "seconds from last renew: %lld\n", ktime_get_boottime_seconds() - clp->cl_time); - seq_printf(m, "name: "); + seq_puts(m, "name: "); seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len); seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion); if (clp->cl_nii_domain.data) { - seq_printf(m, "Implementation domain: "); + seq_puts(m, "Implementation domain: "); seq_quote_mem(m, clp->cl_nii_domain.data, clp->cl_nii_domain.len); - seq_printf(m, "\nImplementation name: "); + seq_puts(m, "\nImplementation name: "); seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len); seq_printf(m, "\nImplementation time: [%lld, %ld]\n", clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec); } seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state)); seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr); + seq_printf(m, "admin-revoked states: %d\n", + atomic_read(&clp->cl_admin_revoked)); drop_client(clp); return 0; @@ -2602,7 +2749,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo) { - seq_printf(s, "owner: "); + seq_puts(s, "owner: "); seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len); } @@ -2620,20 +2767,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) struct nfs4_stateowner *oo; unsigned int access, deny; - if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID) - return 0; /* XXX: or SEQ_SKIP? */ ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = find_any_file_locked(nf); - if (!file) - goto out; - - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: open, "); + seq_puts(s, ": { type: open, "); access = bmap_to_share_mode(ols->st_access_bmap); deny = bmap_to_share_mode(ols->st_deny_bmap); @@ -2645,14 +2785,19 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) deny & NFS4_SHARE_ACCESS_READ ? "r" : "-", deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-"); - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); - nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = find_any_file_locked(nf); + if (file) { + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } spin_unlock(&nf->fi_lock); + nfs4_show_owner(s, oo); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2666,30 +2811,31 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) ols = openlockstateid(st); oo = ols->st_stateowner; nf = st->sc_file; + + seq_puts(s, "- "); + nfs4_show_stateid(s, &st->sc_stateid); + seq_puts(s, ": { type: lock, "); + spin_lock(&nf->fi_lock); file = find_any_file_locked(nf); - if (!file) - goto out; + if (file) { + /* + * Note: a lock stateid isn't really the same thing as a lock, + * it's the locking state held by one owner on a file, and there + * may be multiple (or no) lock ranges associated with it. + * (Same for the matter is true of open stateids.) + */ - seq_printf(s, "- "); - nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: lock, "); - - /* - * Note: a lock stateid isn't really the same thing as a lock, - * it's the locking state held by one owner on a file, and there - * may be multiple (or no) lock ranges associated with it. - * (Same for the matter is true of open stateids.) - */ - - nfs4_show_superblock(s, file); - /* XXX: open stateid? */ - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, ", "); + nfs4_show_superblock(s, file); + /* XXX: open stateid? */ + seq_puts(s, ", "); + nfs4_show_fname(s, file); + seq_puts(s, ", "); + } nfs4_show_owner(s, oo); - seq_printf(s, " }\n"); -out: + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); spin_unlock(&nf->fi_lock); return 0; } @@ -2702,27 +2848,28 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) ds = delegstateid(st); nf = st->sc_file; - spin_lock(&nf->fi_lock); - file = nf->fi_deleg_file; - if (!file) - goto out; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: deleg, "); + seq_puts(s, ": { type: deleg, "); - /* Kinda dead code as long as we only support read delegs: */ - seq_printf(s, "access: %s, ", - ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); + seq_printf(s, "access: %s", + ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w"); /* XXX: lease time, whether it's being recalled. */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); -out: + spin_lock(&nf->fi_lock); + file = nf->fi_deleg_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } spin_unlock(&nf->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2732,18 +2879,25 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); - file = ls->ls_file; - seq_printf(s, "- "); + seq_puts(s, "- "); nfs4_show_stateid(s, &st->sc_stateid); - seq_printf(s, ": { type: layout, "); + seq_puts(s, ": { type: layout"); /* XXX: What else would be useful? */ - nfs4_show_superblock(s, file); - seq_printf(s, ", "); - nfs4_show_fname(s, file); - seq_printf(s, " }\n"); + spin_lock(&ls->ls_stid.sc_file->fi_lock); + file = ls->ls_file; + if (file) { + seq_puts(s, ", "); + nfs4_show_superblock(s, file); + seq_puts(s, ", "); + nfs4_show_fname(s, file); + } + spin_unlock(&ls->ls_stid.sc_file->fi_lock); + if (st->sc_status & SC_STATUS_ADMIN_REVOKED) + seq_puts(s, ", admin-revoked"); + seq_puts(s, " }\n"); return 0; } @@ -2753,13 +2907,13 @@ static int states_show(struct seq_file *s, void *v) struct nfs4_stid *st = v; switch (st->sc_type) { - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: return nfs4_show_open(s, st); - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: return nfs4_show_lock(s, st); - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: return nfs4_show_deleg(s, st); - case NFS4_LAYOUT_STID: + case SC_TYPE_LAYOUT: return nfs4_show_layout(s, st); default: return 0; /* XXX: or SEQ_SKIP? */ @@ -2896,11 +3050,59 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) spin_unlock(&nn->client_lock); } +static int +nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + ncf->ncf_cb_status = task->tk_status; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_getattr_release(struct nfsd4_callback *cb) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + nfs4_put_stid(&dp->dl_stid); + clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); + wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); +} + static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, }; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { + .done = nfsd4_cb_getattr_done, + .release = nfsd4_cb_getattr_release, +}; + +static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) +{ + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + return; + /* set to proper status when nfsd4_cb_getattr_done runs */ + ncf->ncf_cb_status = NFS4ERR_IO; + + refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&ncf->ncf_getattr); +} + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -3414,6 +3616,9 @@ out_new: new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0]; new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1]; + /* Contrived initial CREATE_SESSION response */ + new->cl_cs_slot.sl_status = nfserr_seq_misordered; + add_to_unconfirmed(new); swap(new, conf); out_copy: @@ -3584,10 +3789,10 @@ nfsd4_create_session(struct svc_rqst *rqstp, struct nfsd4_create_session *cr_ses = &u->create_session; struct sockaddr *sa = svc_addr(rqstp); struct nfs4_client *conf, *unconf; + struct nfsd4_clid_slot *cs_slot; struct nfs4_client *old = NULL; struct nfsd4_session *new; struct nfsd4_conn *conn; - struct nfsd4_clid_slot *cs_slot = NULL; __be32 status = 0; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); @@ -3611,53 +3816,60 @@ nfsd4_create_session(struct svc_rqst *rqstp, goto out_free_session; spin_lock(&nn->client_lock); + + /* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */ unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn); conf = find_confirmed_client(&cr_ses->clientid, true, nn); - WARN_ON_ONCE(conf && unconf); + if (!conf && !unconf) { + status = nfserr_stale_clientid; + goto out_free_conn; + } + /* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */ + if (conf) + cs_slot = &conf->cl_cs_slot; + else + cs_slot = &unconf->cl_cs_slot; + status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); + if (status) { + if (status == nfserr_replay_cache) { + status = nfsd4_replay_create_session(cr_ses, cs_slot); + goto out_free_conn; + } + goto out_cache_error; + } + cs_slot->sl_seqid++; + cr_ses->seqid = cs_slot->sl_seqid; + + /* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */ if (conf) { status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(conf, rqstp)) - goto out_free_conn; - cs_slot = &conf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - if (status == nfserr_replay_cache) - status = nfsd4_replay_create_session(cr_ses, cs_slot); - goto out_free_conn; - } - } else if (unconf) { + goto out_cache_error; + } else { status = nfserr_clid_inuse; if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) || !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) { trace_nfsd_clid_cred_mismatch(unconf, rqstp); - goto out_free_conn; + goto out_cache_error; } status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(unconf, rqstp)) - goto out_free_conn; - cs_slot = &unconf->cl_cs_slot; - status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0); - if (status) { - /* an unconfirmed replay returns misordered */ - status = nfserr_seq_misordered; - goto out_free_conn; - } + goto out_cache_error; old = find_confirmed_client_by_name(&unconf->cl_name, nn); if (old) { status = mark_client_expired_locked(old); if (status) { old = NULL; - goto out_free_conn; + goto out_cache_error; } trace_nfsd_clid_replaced(&old->cl_clientid); } move_to_confirmed(unconf); conf = unconf; - } else { - status = nfserr_stale_clientid; - goto out_free_conn; } + + /* RFC 8881 Section 18.36.4 Phase 4: Session creation. */ status = nfs_ok; /* Persistent sessions are not supported */ cr_ses->flags &= ~SESSION4_PERSIST; @@ -3669,8 +3881,6 @@ nfsd4_create_session(struct svc_rqst *rqstp, memcpy(cr_ses->sessionid.data, new->se_sessionid.data, NFS4_MAX_SESSIONID_LEN); - cs_slot->sl_seqid++; - cr_ses->seqid = cs_slot->sl_seqid; /* cache solo and embedded create sessions under the client_lock */ nfsd4_cache_create_session(cr_ses, cs_slot, status); @@ -3683,6 +3893,9 @@ nfsd4_create_session(struct svc_rqst *rqstp, if (old) expire_client(old); return status; + +out_cache_error: + nfsd4_cache_create_session(cr_ses, cs_slot, status); out_free_conn: spin_unlock(&nn->client_lock); free_conn(conn); @@ -4058,6 +4271,9 @@ out: } if (!list_empty(&clp->cl_revoked)) seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED; + if (atomic_read(&clp->cl_admin_revoked)) + seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED; + trace_nfsd_seq4_status(rqstp, seq); out_no_session: if (conn) free_conn(conn); @@ -4352,32 +4568,25 @@ nfsd4_free_slabs(void) int nfsd4_init_slabs(void) { - client_slab = kmem_cache_create("nfsd4_clients", - sizeof(struct nfs4_client), 0, 0, NULL); + client_slab = KMEM_CACHE(nfs4_client, 0); if (client_slab == NULL) goto out; - openowner_slab = kmem_cache_create("nfsd4_openowners", - sizeof(struct nfs4_openowner), 0, 0, NULL); + openowner_slab = KMEM_CACHE(nfs4_openowner, 0); if (openowner_slab == NULL) goto out_free_client_slab; - lockowner_slab = kmem_cache_create("nfsd4_lockowners", - sizeof(struct nfs4_lockowner), 0, 0, NULL); + lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0); if (lockowner_slab == NULL) goto out_free_openowner_slab; - file_slab = kmem_cache_create("nfsd4_files", - sizeof(struct nfs4_file), 0, 0, NULL); + file_slab = KMEM_CACHE(nfs4_file, 0); if (file_slab == NULL) goto out_free_lockowner_slab; - stateid_slab = kmem_cache_create("nfsd4_stateids", - sizeof(struct nfs4_ol_stateid), 0, 0, NULL); + stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0); if (stateid_slab == NULL) goto out_free_file_slab; - deleg_slab = kmem_cache_create("nfsd4_delegations", - sizeof(struct nfs4_delegation), 0, 0, NULL); + deleg_slab = KMEM_CACHE(nfs4_delegation, 0); if (deleg_slab == NULL) goto out_free_stateid_slab; - odstate_slab = kmem_cache_create("nfsd4_odstate", - sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0); if (odstate_slab == NULL) goto out_free_deleg_slab; return 0; @@ -4531,7 +4740,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) continue; if (local->st_stateowner != &oo->oo_owner) continue; - if (local->st_stid.sc_type == NFS4_OPEN_STID) { + if (local->st_stid.sc_type == SC_TYPE_OPEN && + !local->st_stid.sc_status) { ret = local; refcount_inc(&ret->st_stid.sc_count); break; @@ -4540,22 +4750,75 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) return ret; } +static void nfsd4_drop_revoked_stid(struct nfs4_stid *s) + __releases(&s->sc_client->cl_lock) +{ + struct nfs4_client *cl = s->sc_client; + LIST_HEAD(reaplist); + struct nfs4_ol_stateid *stp; + struct nfs4_delegation *dp; + bool unhashed; + + switch (s->sc_type) { + case SC_TYPE_OPEN: + stp = openlockstateid(s); + if (unhash_open_stateid(stp, &reaplist)) + put_ol_stateid_locked(stp, &reaplist); + spin_unlock(&cl->cl_lock); + free_ol_stateid_reaplist(&reaplist); + break; + case SC_TYPE_LOCK: + stp = openlockstateid(s); + unhashed = unhash_lock_stateid(stp); + spin_unlock(&cl->cl_lock); + if (unhashed) + nfs4_put_stid(s); + break; + case SC_TYPE_DELEG: + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + break; + default: + spin_unlock(&cl->cl_lock); + } +} + +static void nfsd40_drop_revoked_stid(struct nfs4_client *cl, + stateid_t *stid) +{ + /* NFSv4.0 has no way for the client to tell the server + * that it can forget an admin-revoked stateid. + * So we keep it around until the first time that the + * client uses it, and drop it the first time + * nfserr_admin_revoked is returned. + * For v4.1 and later we wait until explicitly told + * to free the stateid. + */ + if (cl->cl_minorversion == 0) { + struct nfs4_stid *st; + + spin_lock(&cl->cl_lock); + st = find_stateid_locked(cl, stid); + if (st) + nfsd4_drop_revoked_stid(st); + else + spin_unlock(&cl->cl_lock); + } +} + static __be32 nfsd4_verify_open_stid(struct nfs4_stid *s) { __be32 ret = nfs_ok; - switch (s->sc_type) { - default: - break; - case 0: - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: - ret = nfserr_bad_stateid; - break; - case NFS4_REVOKED_DELEG_STID: + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) + ret = nfserr_admin_revoked; + else if (s->sc_status & SC_STATUS_REVOKED) ret = nfserr_deleg_revoked; - } + else if (s->sc_status & SC_STATUS_CLOSED) + ret = nfserr_bad_stateid; return ret; } @@ -4567,6 +4830,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(stp->st_stid.sc_client, + &stp->st_stid.sc_stateid); + if (ret != nfs_ok) mutex_unlock(&stp->st_mutex); return ret; @@ -4641,7 +4908,7 @@ retry: open->op_stp = NULL; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_OPEN_STID; + stp->st_stid.sc_type = SC_TYPE_OPEN; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); @@ -4868,9 +5135,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task); - if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID || - dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) - return 1; + if (dp->dl_stid.sc_status) + /* CLOSED or REVOKED */ + return 1; switch (task->tk_status) { case 0: @@ -5113,12 +5380,12 @@ static int share_access_to_flags(u32 share_access) return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE; } -static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s) +static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, + stateid_t *s) { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, - NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); + ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED); if (!ret) return NULL; return delegstateid(ret); @@ -5141,10 +5408,15 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; - if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) { nfs4_put_stid(&deleg->dl_stid); - if (cl->cl_minorversion) - status = nfserr_deleg_revoked; + status = nfserr_admin_revoked; + goto out; + } + if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) { + nfs4_put_stid(&deleg->dl_stid); + nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid); + status = nfserr_deleg_revoked; goto out; } flags = share_access_to_flags(open->op_share_access); @@ -5189,7 +5461,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, return 0; if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE)) return nfserr_inval; - return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0); + return nfsd_setattr(rqstp, fh, &attrs, NULL); } static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, @@ -5560,9 +5832,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, goto out_unlock; spin_lock(&state_lock); + spin_lock(&clp->cl_lock); spin_lock(&fp->fi_lock); status = hash_delegation_locked(dp, fp); spin_unlock(&fp->fi_lock); + spin_unlock(&clp->cl_lock); spin_unlock(&state_lock); if (status) @@ -5634,6 +5908,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent = NULL; int cb_up; int status = 0; + struct kstat stat; + struct path path; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); open->op_recall = false; @@ -5671,6 +5947,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + path.mnt = currentfh->fh_export->ex_path.mnt; + path.dentry = currentfh->fh_dentry; + if (vfs_getattr(&path, &stat, + (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + AT_STATX_SYNC_AS_STAT)) { + nfs4_put_stid(&dp->dl_stid); + destroy_delegation(dp); + goto out_no_deleg; + } + dp->dl_cb_fattr.ncf_cur_fsize = stat.size; + dp->dl_cb_fattr.ncf_initial_cinfo = + nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry)); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); @@ -5774,7 +6062,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf } else { status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true); if (status) { - stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); mutex_unlock(&stp->st_mutex); goto out; @@ -6128,6 +6415,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist) } } +static void nfs40_clean_admin_revoked(struct nfsd_net *nn, + struct laundry_time *lt) +{ + struct nfs4_client *clp; + + spin_lock(&nn->client_lock); + if (nn->nfs40_last_revoke == 0 || + nn->nfs40_last_revoke > lt->cutoff) { + spin_unlock(&nn->client_lock); + return; + } + nn->nfs40_last_revoke = 0; + +retry: + list_for_each_entry(clp, &nn->client_lru, cl_lru) { + unsigned long id, tmp; + struct nfs4_stid *stid; + + if (atomic_read(&clp->cl_admin_revoked) == 0) + continue; + + spin_lock(&clp->cl_lock); + idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id) + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + refcount_inc(&stid->sc_count); + spin_unlock(&nn->client_lock); + /* this function drops ->cl_lock */ + nfsd4_drop_revoked_stid(stid); + nfs4_put_stid(stid); + spin_lock(&nn->client_lock); + goto retry; + } + spin_unlock(&clp->cl_lock); + } + spin_unlock(&nn->client_lock); +} + static time64_t nfs4_laundromat(struct nfsd_net *nn) { @@ -6161,12 +6485,14 @@ nfs4_laundromat(struct nfsd_net *nn) nfs4_get_client_reaplist(nn, &reaplist, <); nfs4_process_client_reaplist(&reaplist); + nfs40_clean_admin_revoked(nn, <); + spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); if (!state_expired(<, dp->dl_time)) break; - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_REVOKED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -6225,6 +6551,8 @@ nfs4_laundromat(struct nfsd_net *nn) /* service the server-to-server copy delayed unmount list */ nfsd4_ssc_expire_umount(nn); #endif + if (atomic_long_read(&num_delegations) >= max_delegations) + deleg_reaper(nn); out: return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT); } @@ -6286,6 +6614,8 @@ deleg_reaper(struct nfsd_net *nn) list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG); + clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | + BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } @@ -6379,6 +6709,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti if (ret == nfs_ok) ret = check_stateid_generation(in, &s->sc_stateid, has_session); spin_unlock(&s->sc_lock); + if (ret == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(s->sc_client, + &s->sc_stateid); return ret; } @@ -6405,32 +6738,33 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; + status = nfsd4_verify_open_stid(s); + if (status) + goto out_unlock; + switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs_ok; break; - case NFS4_REVOKED_DELEG_STID: - status = nfserr_deleg_revoked; - break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); - fallthrough; - case NFS4_CLOSED_STID: - case NFS4_CLOSED_DELEG_STID: status = nfserr_bad_stateid; } out_unlock: spin_unlock(&cl->cl_lock); + if (status == nfserr_admin_revoked) + nfsd40_drop_revoked_stid(cl, stateid); return status; } __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; @@ -6441,10 +6775,15 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, * only return revoked delegations if explicitly asked. * otherwise we report revoked or bad_stateid status. */ - if (typemask & NFS4_REVOKED_DELEG_STID) + if (statusmask & SC_STATUS_REVOKED) return_revoked = true; - else if (typemask & NFS4_DELEG_STID) - typemask |= NFS4_REVOKED_DELEG_STID; + if (typemask & SC_TYPE_DELEG) + /* Always allow REVOKED for DELEG so we can + * retturn the appropriate error. + */ + statusmask |= SC_STATUS_REVOKED; + + statusmask |= SC_STATUS_ADMIN_REVOKED; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -6457,14 +6796,17 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, } if (status) return status; - stid = find_stateid_by_type(cstate->clp, stateid, typemask); + stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask); if (!stid) return nfserr_bad_stateid; - if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) { nfs4_put_stid(stid); - if (cstate->minorversion) - return nfserr_deleg_revoked; - return nfserr_bad_stateid; + return nfserr_deleg_revoked; + } + if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd40_drop_revoked_stid(cstate->clp, stateid); + nfs4_put_stid(stid); + return nfserr_admin_revoked; } *s = stid; return nfs_ok; @@ -6475,17 +6817,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags) { struct nfsd_file *ret = NULL; - if (!s) + if (!s || s->sc_status) return NULL; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: spin_lock(&s->sc_file->fi_lock); ret = nfsd_file_get(s->sc_file->fi_deleg_file); spin_unlock(&s->sc_file->fi_lock); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: if (flags & RD_STATE) ret = find_readable_file(s->sc_file); else @@ -6598,7 +6940,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st, goto out; *stid = find_stateid_by_type(found, &cps->cp_p_stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0); if (*stid) status = nfs_ok; else @@ -6655,8 +6998,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, } status = nfsd4_lookup_stateid(cstate, stateid, - NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, - &s, nn); + SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK, + 0, &s, nn); if (status == nfserr_bad_stateid) status = find_cpntf_state(nn, stateid, &s); if (status) @@ -6667,16 +7010,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, goto out; switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: status = nfs4_check_delegmode(delegstateid(s), flags); break; - case NFS4_OPEN_STID: - case NFS4_LOCK_STID: + case SC_TYPE_OPEN: + case SC_TYPE_LOCK: status = nfs4_check_olstateid(openlockstateid(s), flags); break; - default: - status = nfserr_bad_stateid; - break; } if (status) goto out; @@ -6755,34 +7095,39 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_lock(&cl->cl_lock); s = find_stateid_locked(cl, stateid); - if (!s) + if (!s || s->sc_status & SC_STATUS_CLOSED) goto out_unlock; + if (s->sc_status & SC_STATUS_ADMIN_REVOKED) { + nfsd4_drop_revoked_stid(s); + ret = nfs_ok; + goto out; + } spin_lock(&s->sc_lock); switch (s->sc_type) { - case NFS4_DELEG_STID: + case SC_TYPE_DELEG: + if (s->sc_status & SC_STATUS_REVOKED) { + spin_unlock(&s->sc_lock); + dp = delegstateid(s); + list_del_init(&dp->dl_recall_lru); + spin_unlock(&cl->cl_lock); + nfs4_put_stid(s); + ret = nfs_ok; + goto out; + } ret = nfserr_locks_held; break; - case NFS4_OPEN_STID: + case SC_TYPE_OPEN: ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) break; ret = nfserr_locks_held; break; - case NFS4_LOCK_STID: + case SC_TYPE_LOCK: spin_unlock(&s->sc_lock); refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; - case NFS4_REVOKED_DELEG_STID: - spin_unlock(&s->sc_lock); - dp = delegstateid(s); - list_del_init(&dp->dl_recall_lru); - spin_unlock(&cl->cl_lock); - nfs4_put_stid(s); - ret = nfs_ok; - goto out; - /* Default falls through and returns nfserr_bad_stateid */ } spin_unlock(&s->sc_lock); out_unlock: @@ -6824,6 +7169,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ * @seqid: seqid (provided by client) * @stateid: stateid (provided by client) * @typemask: mask of allowable types for this operation + * @statusmask: mask of allowed states: 0 or STID_CLOSED * @stpp: return pointer for the stateid found * @nn: net namespace for request * @@ -6833,7 +7179,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ */ static __be32 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, - stateid_t *stateid, char typemask, + stateid_t *stateid, + unsigned short typemask, unsigned short statusmask, struct nfs4_ol_stateid **stpp, struct nfsd_net *nn) { @@ -6844,7 +7191,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid, trace_nfsd_preprocess(seqid, stateid); *stpp = NULL; - status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, + typemask, statusmask, &s, nn); if (status) return status; stp = openlockstateid(s); @@ -6866,7 +7214,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs struct nfs4_ol_stateid *stp; status = nfs4_preprocess_seqid_op(cstate, seqid, stateid, - NFS4_OPEN_STID, &stp, nn); + SC_TYPE_OPEN, 0, &stp, nn); if (status) return status; oo = openowner(stp->st_stateowner); @@ -6897,8 +7245,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; status = nfs4_preprocess_seqid_op(cstate, - oc->oc_seqid, &oc->oc_req_stateid, - NFS4_OPEN_STID, &stp, nn); + oc->oc_seqid, &oc->oc_req_stateid, + SC_TYPE_OPEN, 0, &stp, nn); if (status) goto out; oo = openowner(stp->st_stateowner); @@ -7028,18 +7376,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("NFSD: nfsd4_close on file %pd\n", + dprintk("NFSD: nfsd4_close on file %pd\n", cstate->current_fh.fh_dentry); status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid, - &close->cl_stateid, - NFS4_OPEN_STID|NFS4_CLOSED_STID, - &stp, nn); + &close->cl_stateid, + SC_TYPE_OPEN, SC_STATUS_CLOSED, + &stp, nn); nfsd4_bump_seqid(cstate, status); if (status) - goto out; + goto out; - stp->st_stid.sc_type = NFS4_CLOSED_STID; + spin_lock(&stp->st_stid.sc_client->cl_lock); + stp->st_stid.sc_status |= SC_STATUS_CLOSED; + spin_unlock(&stp->st_stid.sc_client->cl_lock); /* * Technically we don't _really_ have to increment or copy it, since @@ -7081,7 +7431,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7348,7 +7698,7 @@ retry: if (retstp) goto out_found; refcount_inc(&stp->st_stid.sc_count); - stp->st_stid.sc_type = NFS4_LOCK_STID; + stp->st_stid.sc_type = SC_TYPE_LOCK; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; @@ -7535,9 +7885,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &lock_stp, &new); } else { status = nfs4_preprocess_seqid_op(cstate, - lock->lk_old_lock_seqid, - &lock->lk_old_lock_stateid, - NFS4_LOCK_STID, &lock_stp, nn); + lock->lk_old_lock_seqid, + &lock->lk_old_lock_stateid, + SC_TYPE_LOCK, 0, &lock_stp, + nn); } if (status) goto out; @@ -7850,8 +8201,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_inval; status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid, - &locku->lu_stateid, NFS4_LOCK_STID, - &stp, nn); + &locku->lu_stateid, SC_TYPE_LOCK, 0, + &stp, nn); if (status) goto out; nf = find_any_file(stp->st_stid.sc_file); @@ -7996,7 +8347,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, stp = list_first_entry(&lo->lo_owner.so_stateids, struct nfs4_ol_stateid, st_perstateowner); - WARN_ON(!unhash_lock_stateid(stp)); + unhash_lock_stateid(stp); put_ol_stateid_locked(stp, &reaplist); } spin_unlock(&clp->cl_lock); @@ -8289,7 +8640,7 @@ nfs4_state_shutdown_net(struct net *net) spin_lock(&state_lock); list_for_each_safe(pos, next, &nn->del_recall_lru) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); - WARN_ON(!unhash_delegation_locked(dp)); + unhash_delegation_locked(dp, SC_STATUS_CLOSED); list_add(&dp->dl_recall_lru, &reaplist); } spin_unlock(&state_lock); @@ -8431,6 +8782,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @inode: file to be checked for a conflict + * @modified: return true if file was modified + * @size: new size of file if modified is true * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server @@ -8439,21 +8792,22 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * - * The current implementation does not support CB_GETATTR yet. However - * this can avoid recalling the delegation could be added in follow up - * work. - * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, + bool *modified, u64 *size) { __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file_lock_context *ctx; struct file_lease *fl; struct nfs4_delegation *dp; + struct iattr attrs; + struct nfs4_cb_fattr *ncf; + *modified = false; ctx = locks_inode_context(inode); if (!ctx) return 0; @@ -8480,12 +8834,38 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) return 0; } break_lease: + nfsd_stats_wdeleg_getattr_inc(nn); + dp = fl->c.flc_owner; + ncf = &dp->dl_cb_fattr; + nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); - nfsd_stats_wdeleg_getattr_inc(); - status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); - if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) - return status; + wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, + TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + if (ncf->ncf_cb_status) { + /* Recall delegation only if client didn't respond */ + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + } + if (!ncf->ncf_file_modified && + (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || + ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) + ncf->ncf_file_modified = true; + if (ncf->ncf_file_modified) { + /* + * Per section 10.4.3 of RFC 8881, the server would + * not update the file's metadata with the client's + * modified size + */ + attrs.ia_mtime = attrs.ia_ctime = current_time(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; + setattr_copy(&nop_mnt_idmap, inode, &attrs); + mark_inode_dirty(inode); + ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; + *size = ncf->ncf_cur_fsize; + *modified = true; + } return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c719c475a068..fac938f563ad 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3507,6 +3507,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, unsigned long mask[2]; } u; unsigned long bit; + bool file_modified = false; + u64 size = 0; WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); @@ -3533,7 +3535,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, } args.size = 0; if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + &file_modified, &size); if (status) goto out; } @@ -3543,7 +3546,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; - args.size = args.stat.size; + if (file_modified) + args.size = size; + else + args.size = args.stat.size; if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ @@ -5386,16 +5392,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs, /* * If the cookie is larger than the maximum number we can fit - * in either the buffer we just got back from vfs_listxattr, or, - * XDR-encoded, in the return buffer, it's invalid. + * in the buffer we just got back from vfs_listxattr, it's invalid. */ if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2)) return nfserr_badcookie; - if (cookie > (listxattrs->lsxa_maxcount / - (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4))) - return nfserr_badcookie; - *offsetp = (u32)cookie; return 0; } @@ -5412,6 +5413,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, u64 cookie; char *sp; __be32 status, tmp; + __be64 wire_cookie; __be32 *p; u32 nuser; @@ -5427,7 +5429,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, */ cookie_offset = xdr->buf->len; count_offset = cookie_offset + 8; - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) { status = nfserr_resource; goto out; @@ -5438,7 +5440,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, sp = listxattrs->lsxa_buf; nuser = 0; - xdrleft = listxattrs->lsxa_maxcount; + /* Bytes left is maxcount - 8 (cookie) - 4 (array count) */ + xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3; while (left > 0 && xdrleft > 0) { slen = strlen(sp); @@ -5451,7 +5454,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr, slen -= XATTR_USER_PREFIX_LEN; xdrlen = 4 + ((slen + 3) & ~3); - if (xdrlen > xdrleft) { + /* Check if both entry and eof can fit in the XDR buffer */ + if (xdrlen + XDR_UNIT > xdrleft) { if (count == 0) { /* * Can't even fit the first attribute name. @@ -5503,7 +5507,8 @@ wreof: cookie = offset + count; - write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8); + wire_cookie = cpu_to_be64(cookie); + write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8); tmp = cpu_to_be32(count); write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4); out: @@ -5727,27 +5732,24 @@ release: rqstp->rq_next_page = xdr->page_ptr + 1; } -/* - * Encode the reply stored in the stateowner reply cache - * - * XDR note: do not encode rp->rp_buflen: the buffer contains the - * previously sent already encoded operation. +/** + * nfsd4_encode_replay - encode a result stored in the stateowner reply cache + * @xdr: send buffer's XDR stream + * @op: operation being replayed + * + * @op->replay->rp_buf contains the previously-sent already-encoded result. */ -void -nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) +void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op) { - __be32 *p; struct nfs4_replay *rp = op->replay; - p = xdr_reserve_space(xdr, 8 + rp->rp_buflen); - if (!p) { - WARN_ON_ONCE(1); - return; - } - *p++ = cpu_to_be32(op->opnum); - *p++ = rp->rp_status; /* already xdr'ed */ + trace_nfsd_stateowner_replay(op->opnum, rp); - p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen); + if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT) + return; + if (xdr_stream_encode_be32(xdr, rp->rp_status) != XDR_UNIT) + return; + xdr_stream_encode_opaque_fixed(xdr, rp->rp_buf, rp->rp_buflen); } void nfsd4_release_compoundargs(struct svc_rqst *rqstp) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 5c1a4a0aa605..ba9d326b3de6 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -166,8 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp, int nfsd_drc_slab_create(void) { - drc_slab = kmem_cache_create("nfsd_drc", - sizeof(struct nfsd_cacherep), 0, 0, NULL); + drc_slab = KMEM_CACHE(nfsd_cacherep, 0); return drc_slab ? 0: -ENOMEM; } @@ -176,27 +175,6 @@ void nfsd_drc_slab_free(void) kmem_cache_destroy(drc_slab); } -/** - * nfsd_net_reply_cache_init - per net namespace reply cache set-up - * @nn: nfsd_net being initialized - * - * Returns zero on succes; otherwise a negative errno is returned. - */ -int nfsd_net_reply_cache_init(struct nfsd_net *nn) -{ - return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM); -} - -/** - * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down - * @nn: nfsd_net being freed - * - */ -void nfsd_net_reply_cache_destroy(struct nfsd_net *nn) -{ - nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM); -} - int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; @@ -501,7 +479,7 @@ out: int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, unsigned int len, struct nfsd_cacherep **cacherep) { - struct nfsd_net *nn; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd_cacherep *rp, *found; __wsum csum; struct nfsd_drc_bucket *b; @@ -510,7 +488,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, int rtn = RC_DOIT; if (type == RC_NOCACHE) { - nfsd_stats_rc_nocache_inc(); + nfsd_stats_rc_nocache_inc(nn); goto out; } @@ -520,7 +498,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, * Since the common case is a cache miss followed by an insert, * preallocate an entry. */ - nn = net_generic(SVC_NET(rqstp), nfsd_net_id); rp = nfsd_cacherep_alloc(rqstp, csum, nn); if (!rp) goto out; @@ -537,7 +514,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, nfsd_cacherep_dispose(&dispose); - nfsd_stats_rc_misses_inc(); + nfsd_stats_rc_misses_inc(nn); atomic_inc(&nn->num_drc_entries); nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp)); goto out; @@ -545,7 +522,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start, found_entry: /* We found a matching entry which is either in progress or done. */ nfsd_reply_cache_free_locked(NULL, rp, nn); - nfsd_stats_rc_hits_inc(); + nfsd_stats_rc_hits_inc(nn); rtn = RC_DROPIT; rp = found; @@ -687,15 +664,15 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) atomic_read(&nn->num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits); seq_printf(m, "mem usage: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE])); seq_printf(m, "cache hits: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS])); seq_printf(m, "cache misses: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES])); seq_printf(m, "not cached: %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE])); seq_printf(m, "payload misses: %lld\n", - percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES])); seq_printf(m, "longest chain len: %u\n", nn->longest_chain); seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize); return 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index f206ca32e7f5..ecd18bffeebc 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) * 3. Is that directory the root of an exported file system? */ error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb); + nfsd4_revoke_states(netns(file), path.dentry->d_sb); path_put(&path); return error; @@ -1671,14 +1672,17 @@ static __net_init int nfsd_net_init(struct net *net) retval = nfsd_idmap_init(net); if (retval) goto out_idmap_error; - retval = nfsd_net_reply_cache_init(nn); + retval = nfsd_stat_counters_init(nn); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); + nn->nfsd_svcstats.program = &nfsd_program; nn->nfsd_versions = NULL; nn->nfsd4_minorversions = NULL; nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); + nfsd_proc_stat_init(net); return 0; @@ -1699,7 +1703,8 @@ static __net_exit void nfsd_net_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - nfsd_net_reply_cache_destroy(nn); + nfsd_proc_stat_shutdown(net); + nfsd_stat_counters_destroy(nn); nfsd_idmap_shutdown(net); nfsd_export_shutdown(net); nfsd_netns_free_versions(nn); @@ -1722,12 +1727,9 @@ static int __init init_nfsd(void) retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; - retval = nfsd_stat_init(); /* Statistics */ - if (retval) - goto out_free_pnfs; retval = nfsd_drc_slab_create(); if (retval) - goto out_free_stat; + goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ retval = create_proc_exports_entry(); if (retval) @@ -1761,8 +1763,6 @@ out_free_exports: out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); -out_free_stat: - nfsd_stat_shutdown(); out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: @@ -1780,7 +1780,6 @@ static void __exit exit_nfsd(void) nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); - nfsd_stat_shutdown(); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 304e9728b929..16c5a05f340e 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -86,6 +86,7 @@ extern struct mutex nfsd_mutex; extern spinlock_t nfsd_drc_lock; extern unsigned long nfsd_drc_max_mem; extern unsigned long nfsd_drc_mem_used; +extern atomic_t nfsd_th_cnt; /* number of available threads */ extern const struct seq_operations nfs_exports_op; @@ -274,6 +275,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_no_grace cpu_to_be32(NFSERR_NO_GRACE) #define nfserr_reclaim_bad cpu_to_be32(NFSERR_RECLAIM_BAD) #define nfserr_badname cpu_to_be32(NFSERR_BADNAME) +#define nfserr_admin_revoked cpu_to_be32(NFS4ERR_ADMIN_REVOKED) #define nfserr_cb_path_down cpu_to_be32(NFSERR_CB_PATH_DOWN) #define nfserr_locked cpu_to_be32(NFSERR_LOCKED) #define nfserr_wrongsec cpu_to_be32(NFSERR_WRONGSEC) @@ -365,6 +367,7 @@ void nfsd_lockd_shutdown(void); #define NFSD_CLIENT_MAX_TRIM_PER_RUN 128 #define NFS4_CLIENTS_PER_GB 1024 #define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */ +#define NFSD_CB_GETATTR_TIMEOUT NFSD_DELEGRETURN_TIMEOUT /* * The following attributes are currently not supported by the NFSv4 server: diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index dbfa0ac13564..40fecf7b224f 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -327,6 +327,7 @@ out: __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) { + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct svc_export *exp = NULL; struct dentry *dentry; __be32 error; @@ -395,7 +396,7 @@ skip_pseudoflavor_check: out: trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error); if (error == nfserr_stale) - nfsd_stats_fh_stale_inc(exp); + nfsd_stats_fh_stale_inc(nn, exp); return error; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a7315928a760..36370b957b63 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -103,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) } } - resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0); + resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL); if (resp->status != nfs_ok) goto out; @@ -390,8 +390,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ attr->ia_valid &= ATTR_SIZE; if (attr->ia_valid) - resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0, - (time64_t)0); + resp->status = nfsd_setattr(rqstp, newfhp, &attrs, + NULL); } out_unlock: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a667802e08e7..c0d17b92b249 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -34,6 +34,7 @@ #define NFSDDBG_FACILITY NFSDDBG_SVC +atomic_t nfsd_th_cnt = ATOMIC_INIT(0); extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) @@ -80,7 +81,6 @@ unsigned long nfsd_drc_max_mem; unsigned long nfsd_drc_mem_used; #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; static const struct svc_version *nfsd_acl_version[] = { # if defined(CONFIG_NFSD_V2_ACL) [2] = &nfsd_acl_version2, @@ -99,15 +99,11 @@ static struct svc_program nfsd_acl_program = { .pg_vers = nfsd_acl_version, .pg_name = "nfsacl", .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, .pg_authenticate = &svc_set_client, .pg_init_request = nfsd_acl_init_request, .pg_rpcbind_set = nfsd_acl_rpcbind_set, }; -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ static const struct svc_version *nfsd_version[] = { @@ -132,7 +128,6 @@ struct svc_program nfsd_program = { .pg_vers = nfsd_version, /* version table */ .pg_name = "nfsd", /* program name */ .pg_class = "nfsd", /* authentication class */ - .pg_stats = &nfsd_svcstats, /* version table */ .pg_authenticate = &svc_set_client, /* export authentication */ .pg_init_request = nfsd_init_request, .pg_rpcbind_set = nfsd_rpcbind_set, @@ -666,7 +661,8 @@ int nfsd_create_serv(struct net *net) if (nfsd_max_blksize == 0) nfsd_max_blksize = nfsd_get_default_max_blksize(); nfsd_reset_versions(nn); - serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd); + serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats, + nfsd_max_blksize, nfsd); if (serv == NULL) return -ENOMEM; @@ -929,7 +925,7 @@ nfsd(void *vrqstp) current->fs->umask = 0; - atomic_inc(&nfsdstats.th_cnt); + atomic_inc(&nfsd_th_cnt); set_freezable(); @@ -941,9 +937,11 @@ nfsd(void *vrqstp) rqstp->rq_server->sv_maxconn = nn->max_connections; svc_recv(rqstp); + + nfsd_file_net_dispose(nn); } - atomic_dec(&nfsdstats.th_cnt); + atomic_dec(&nfsd_th_cnt); out: /* Release the thread */ diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index de1e0dfed06a..925817f66917 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -37,7 +37,8 @@ struct nfsd4_layout_ops { __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); - void (*fence_client)(struct nfs4_layout_stateid *ls); + void (*fence_client)(struct nfs4_layout_stateid *ls, + struct nfsd_file *file); }; extern const struct nfsd4_layout_ops *nfsd4_layout_ops[]; @@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp); void nfsd4_return_all_client_layouts(struct nfs4_client *); void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp); +void nfsd4_close_layout(struct nfs4_layout_stateid *ls); int nfsd4_init_pnfs(void); void nfsd4_exit_pnfs(void); #else struct nfs4_client; struct nfs4_file; +struct nfs4_layout_stateid; static inline void nfsd4_setup_layout_type(struct svc_export *exp) { @@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp) { } +static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls) +{ +} static inline void nfsd4_exit_pnfs(void) { } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 41bdc913fa71..01c6f3445646 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,7 +68,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; const struct nfsd4_callback_ops *cb_ops; - struct work_struct cb_work; + struct delayed_work cb_work; int cb_seq_status; int cb_status; bool cb_need_restart; @@ -88,17 +88,34 @@ struct nfsd4_callback_ops { */ struct nfs4_stid { refcount_t sc_count; -#define NFS4_OPEN_STID 1 -#define NFS4_LOCK_STID 2 -#define NFS4_DELEG_STID 4 -/* For an open stateid kept around *only* to process close replays: */ -#define NFS4_CLOSED_STID 8 + + /* A new stateid is added to the cl_stateids idr early before it + * is fully initialised. Its sc_type is then zero. After + * initialisation the sc_type it set under cl_lock, and then + * never changes. + */ +#define SC_TYPE_OPEN BIT(0) +#define SC_TYPE_LOCK BIT(1) +#define SC_TYPE_DELEG BIT(2) +#define SC_TYPE_LAYOUT BIT(3) + unsigned short sc_type; + +/* state_lock protects sc_status for delegation stateids. + * ->cl_lock protects sc_status for open and lock stateids. + * ->st_mutex also protect sc_status for open stateids. + * ->ls_lock protects sc_status for layout stateids. + */ +/* + * For an open stateid kept around *only* to process close replays. + * For deleg stateid, kept in idr until last reference is dropped. + */ +#define SC_STATUS_CLOSED BIT(0) /* For a deleg stateid kept around only to process free_stateid's: */ -#define NFS4_REVOKED_DELEG_STID 16 -#define NFS4_CLOSED_DELEG_STID 32 -#define NFS4_LAYOUT_STID 64 +#define SC_STATUS_REVOKED BIT(1) +#define SC_STATUS_ADMIN_REVOKED BIT(2) + unsigned short sc_status; + struct list_head sc_cp_list; - unsigned char sc_type; stateid_t sc_stateid; spinlock_t sc_lock; struct nfs4_client *sc_client; @@ -117,6 +134,24 @@ struct nfs4_cpntf_state { time64_t cpntf_time; /* last time stateid used */ }; +struct nfs4_cb_fattr { + struct nfsd4_callback ncf_getattr; + u32 ncf_cb_status; + u32 ncf_cb_bmap[1]; + + /* from CB_GETATTR reply */ + u64 ncf_cb_change; + u64 ncf_cb_fsize; + + unsigned long ncf_cb_flags; + bool ncf_file_modified; + u64 ncf_initial_cinfo; + u64 ncf_cur_fsize; +}; + +/* bits for ncf_cb_flags */ +#define CB_GETATTR_BUSY 0 + /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -150,6 +185,9 @@ struct nfs4_delegation { int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + + /* for CB_GETATTR */ + struct nfs4_cb_fattr dl_cb_fattr; }; #define cb_to_delegation(cb) \ @@ -317,8 +355,9 @@ enum { * 0. If they are not renewed within a lease period, they become eligible for * destruction by the laundromat. * - * These objects can also be destroyed prematurely by the fault injection code, - * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates. + * These objects can also be destroyed if the client sends certain forms of + * SETCLIENTID or EXCHANGE_ID operations. + * * Care is taken *not* to do this however when the objects have an elevated * refcount. * @@ -326,7 +365,7 @@ enum { * * o Each nfs4_clients is also hashed by name (the opaque quantity initially * sent by the client to identify itself). - * + * * o cl_perclient list is used to ensure no dangling stateowner references * when we expire the nfs4_client */ @@ -351,6 +390,7 @@ struct nfs4_client { clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ u32 cl_minorversion; + atomic_t cl_admin_revoked; /* count of admin-revoked states */ /* NFSv4.1 client implementation id: */ struct xdr_netobj cl_nii_domain; struct xdr_netobj cl_nii_name; @@ -640,6 +680,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, NFSPROC4_CLNT_CB_RECALL_ANY, + NFSPROC4_CLNT_CB_GETATTR, }; /* Returns true iff a is later than b: */ @@ -672,15 +713,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **filp, struct nfs4_stid **cstid); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, - stateid_t *stateid, unsigned char typemask, - struct nfs4_stid **s, struct nfsd_net *nn); + stateid_t *stateid, unsigned short typemask, + unsigned short statusmask, + struct nfs4_stid **s, struct nfsd_net *nn); struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, void (*sc_free)(struct nfs4_stid *)); int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy); void nfs4_free_copy_state(struct nfsd4_copy *copy); struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn, struct nfs4_stid *p_stid); -void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); @@ -714,6 +755,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi) } struct nfsd_file *find_any_file(struct nfs4_file *f); +#ifdef CONFIG_NFSD_V4 +void nfsd4_revoke_states(struct net *net, struct super_block *sb); +#else +static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb) +{ +} +#endif + /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); @@ -732,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode); + struct inode *inode, bool *file_modified, u64 *size); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 12d79f5d4eb1..be52fb1e928e 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -27,25 +27,22 @@ #include "nfsd.h" -struct nfsd_stats nfsdstats; -struct svc_stat nfsd_svcstats = { - .program = &nfsd_program, -}; - static int nfsd_show(struct seq_file *seq, void *v) { + struct net *net = pde_data(file_inode(seq->file)); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); int i; seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]), - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]), + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE])); /* thread usage: */ - seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt)); + seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt)); /* deprecated thread usage histogram stats */ for (i = 0; i < 10; i++) @@ -55,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n"); /* show my rpc info */ - svc_seq_show(seq, &nfsd_svcstats); + svc_seq_show(seq, &nn->nfsd_svcstats); #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ @@ -63,10 +60,10 @@ static int nfsd_show(struct seq_file *seq, void *v) seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)])); } seq_printf(seq, "\nwdeleg_getattr %lld", - percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR])); + percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR])); seq_putc(seq, '\n'); #endif @@ -108,31 +105,24 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num) percpu_counter_destroy(&counters[i]); } -static int nfsd_stat_counters_init(void) +int nfsd_stat_counters_init(struct nfsd_net *nn) { - return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM); } -static void nfsd_stat_counters_destroy(void) +void nfsd_stat_counters_destroy(struct nfsd_net *nn) { - nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM); + nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM); } -int nfsd_stat_init(void) +void nfsd_proc_stat_init(struct net *net) { - int err; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); - err = nfsd_stat_counters_init(); - if (err) - return err; - - svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops); - - return 0; + svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } -void nfsd_stat_shutdown(void) +void nfsd_proc_stat_shutdown(struct net *net) { - nfsd_stat_counters_destroy(); - svc_proc_unregister(&init_net, "nfsd"); + svc_proc_unregister(net, "nfsd"); } diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 14f50c660b61..d2753e975dfd 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,94 +10,72 @@ #include #include - -enum { - NFSD_STATS_RC_HITS, /* repcache hits */ - NFSD_STATS_RC_MISSES, /* repcache misses */ - NFSD_STATS_RC_NOCACHE, /* uncached reqs */ - NFSD_STATS_FH_STALE, /* FH stale error */ - NFSD_STATS_IO_READ, /* bytes returned to read requests */ - NFSD_STATS_IO_WRITE, /* bytes passed in write requests */ -#ifdef CONFIG_NFSD_V4 - NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */ - NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP, -#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op)) - NFSD_STATS_WDELEG_GETATTR, /* count of getattr conflict with wdeleg */ -#endif - NFSD_STATS_COUNTERS_NUM -}; - -struct nfsd_stats { - struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM]; - - atomic_t th_cnt; /* number of available threads */ -}; - -extern struct nfsd_stats nfsdstats; - -extern struct svc_stat nfsd_svcstats; - int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); -int nfsd_stat_init(void); -void nfsd_stat_shutdown(void); +int nfsd_stat_counters_init(struct nfsd_net *nn); +void nfsd_stat_counters_destroy(struct nfsd_net *nn); +void nfsd_proc_stat_init(struct net *net); +void nfsd_proc_stat_shutdown(struct net *net); -static inline void nfsd_stats_rc_hits_inc(void) +static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]); } -static inline void nfsd_stats_rc_misses_inc(void) +static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]); } -static inline void nfsd_stats_rc_nocache_inc(void) +static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]); } -static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) +static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn, + struct svc_export *exp) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); + percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]); if (exp && exp->ex_stats) percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } -static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_read_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } -static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) +static inline void nfsd_stats_io_write_add(struct nfsd_net *nn, + struct svc_export *exp, s64 amount) { - percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount); if (exp && exp->ex_stats) percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]); + percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]); } static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount) { - percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount) { - percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount); + percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount); } #ifdef CONFIG_NFSD_V4 -static inline void nfsd_stats_wdeleg_getattr_inc(void) +static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn) { - percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]); + percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]); } #endif #endif /* _NFSD_STATS_H */ diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index d1e8cf079b0f..e545e92c4408 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -9,8 +9,10 @@ #define _NFSD_TRACE_H #include +#include #include #include +#include #include "export.h" #include "nfsfh.h" @@ -641,23 +643,18 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \ DEFINE_STATESEQID_EVENT(preprocess); DEFINE_STATESEQID_EVENT(open_confirm); -TRACE_DEFINE_ENUM(NFS4_OPEN_STID); -TRACE_DEFINE_ENUM(NFS4_LOCK_STID); -TRACE_DEFINE_ENUM(NFS4_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_STID); -TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID); -TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID); - #define show_stid_type(x) \ __print_flags(x, "|", \ - { NFS4_OPEN_STID, "OPEN" }, \ - { NFS4_LOCK_STID, "LOCK" }, \ - { NFS4_DELEG_STID, "DELEG" }, \ - { NFS4_CLOSED_STID, "CLOSED" }, \ - { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \ - { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \ - { NFS4_LAYOUT_STID, "LAYOUT" }) + { SC_TYPE_OPEN, "OPEN" }, \ + { SC_TYPE_LOCK, "LOCK" }, \ + { SC_TYPE_DELEG, "DELEG" }, \ + { SC_TYPE_LAYOUT, "LAYOUT" }) + +#define show_stid_status(x) \ + __print_flags(x, "|", \ + { SC_STATUS_CLOSED, "CLOSED" }, \ + { SC_STATUS_REVOKED, "REVOKED" }, \ + { SC_STATUS_ADMIN_REVOKED, "ADMIN_REVOKED" }) DECLARE_EVENT_CLASS(nfsd_stid_class, TP_PROTO( @@ -666,6 +663,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, TP_ARGS(stid), TP_STRUCT__entry( __field(unsigned long, sc_type) + __field(unsigned long, sc_status) __field(int, sc_count) __field(u32, cl_boot) __field(u32, cl_id) @@ -676,16 +674,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class, const stateid_t *stp = &stid->sc_stateid; __entry->sc_type = stid->sc_type; + __entry->sc_status = stid->sc_status; __entry->sc_count = refcount_read(&stid->sc_count); __entry->cl_boot = stp->si_opaque.so_clid.cl_boot; __entry->cl_id = stp->si_opaque.so_clid.cl_id; __entry->si_id = stp->si_opaque.so_id; __entry->si_generation = stp->si_generation; ), - TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s", + TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s", __entry->cl_boot, __entry->cl_id, __entry->si_id, __entry->si_generation, - __entry->sc_count, show_stid_type(__entry->sc_type) + __entry->sc_count, show_stid_type(__entry->sc_type), + show_stid_status(__entry->sc_status) ) ); @@ -696,6 +696,59 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \ DEFINE_STID_EVENT(revoke); +TRACE_EVENT(nfsd_stateowner_replay, + TP_PROTO( + u32 opnum, + const struct nfs4_replay *rp + ), + TP_ARGS(opnum, rp), + TP_STRUCT__entry( + __field(unsigned long, status) + __field(u32, opnum) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(rp->rp_status); + __entry->opnum = opnum; + ), + TP_printk("opnum=%u status=%lu", + __entry->opnum, __entry->status) +); + +TRACE_EVENT_CONDITION(nfsd_seq4_status, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct nfsd4_sequence *sequence + ), + TP_ARGS(rqstp, sequence), + TP_CONDITION(sequence->status_flags), + TP_STRUCT__entry( + __field(unsigned int, netns_ino) + __field(u32, xid) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(unsigned long, status_flags) + ), + TP_fast_assign( + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&sequence->sessionid; + + __entry->netns_ino = SVC_NET(rqstp)->ns.inum; + __entry->xid = be32_to_cpu(rqstp->rq_xid); + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->status_flags = sequence->status_flags; + ), + TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s", + __entry->xid, __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + show_nfs4_seq4_status(__entry->status_flags) + ) +); + DECLARE_EVENT_CLASS(nfsd_clientid_class, TP_PROTO(const clientid_t *clid), TP_ARGS(clid), @@ -1334,7 +1387,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \ TP_PROTO(const struct nfs4_client *clp), \ TP_ARGS(clp)) -DEFINE_NFSD_CB_EVENT(state); +DEFINE_NFSD_CB_EVENT(start); +DEFINE_NFSD_CB_EVENT(new_state); DEFINE_NFSD_CB_EVENT(probe); DEFINE_NFSD_CB_EVENT(lost); DEFINE_NFSD_CB_EVENT(shutdown); @@ -1405,6 +1459,128 @@ TRACE_EVENT(nfsd_cb_setup_err, __entry->error) ); +DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, + TP_PROTO( + const struct nfs4_client *clp, + const struct nfsd4_callback *cb + ), + TP_ARGS(clp, cb), + TP_STRUCT__entry( + __field(u32, cl_boot) + __field(u32, cl_id) + __field(const void *, cb) + __field(bool, need_restart) + __sockaddr(addr, clp->cl_cb_conn.cb_addrlen) + ), + TP_fast_assign( + __entry->cl_boot = clp->cl_clientid.cl_boot; + __entry->cl_id = clp->cl_clientid.cl_id; + __entry->cb = cb; + __entry->need_restart = cb->cb_need_restart; + __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, + clp->cl_cb_conn.cb_addrlen) + ), + TP_printk("addr=%pISpc client %08x:%08x cb=%p%s", + __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, + __entry->cb, __entry->need_restart ? + " (need restart)" : " (first try)" + ) +); + +#define DEFINE_NFSD_CB_LIFETIME_EVENT(name) \ +DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name, \ + TP_PROTO( \ + const struct nfs4_client *clp, \ + const struct nfsd4_callback *cb \ + ), \ + TP_ARGS(clp, cb)) + +DEFINE_NFSD_CB_LIFETIME_EVENT(queue); +DEFINE_NFSD_CB_LIFETIME_EVENT(destroy); +DEFINE_NFSD_CB_LIFETIME_EVENT(restart); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update); +DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown); + +TRACE_EVENT(nfsd_cb_seq_status, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(int, tk_status) + __field(int, seq_status) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->tk_status = task->tk_status; + __entry->seq_status = cb->cb_seq_status; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->tk_status, __entry->seq_status + ) +); + +TRACE_EVENT(nfsd_cb_free_slot, + TP_PROTO( + const struct rpc_task *task, + const struct nfsd4_callback *cb + ), + TP_ARGS(task, cb), + TP_STRUCT__entry( + __field(unsigned int, task_id) + __field(unsigned int, client_id) + __field(u32, cl_boot) + __field(u32, cl_id) + __field(u32, seqno) + __field(u32, reserved) + __field(u32, slot_seqno) + ), + TP_fast_assign( + const struct nfs4_client *clp = cb->cb_clp; + const struct nfsd4_session *session = clp->cl_cb_session; + const struct nfsd4_sessionid *sid = + (struct nfsd4_sessionid *)&session->se_sessionid; + + __entry->task_id = task->tk_pid; + __entry->client_id = task->tk_client ? + task->tk_client->cl_clid : -1; + __entry->cl_boot = sid->clientid.cl_boot; + __entry->cl_id = sid->clientid.cl_id; + __entry->seqno = sid->sequence; + __entry->reserved = sid->reserved; + __entry->slot_seqno = session->se_cb_seq_nr; + ), + TP_printk(SUNRPC_TRACE_TASK_SPECIFIER + " sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n", + __entry->task_id, __entry->client_id, + __entry->cl_boot, __entry->cl_id, + __entry->seqno, __entry->reserved, + __entry->slot_seqno + ) +); + TRACE_EVENT_CONDITION(nfsd_cb_recall, TP_PROTO( const struct nfs4_stid *stid diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index b7c7a9273ea0..6a4c506038e0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -476,7 +476,6 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) * @rqstp: controlling RPC transaction * @fhp: filehandle of target * @attr: attributes to set - * @check_guard: set to 1 if guardtime is a valid timestamp * @guardtime: do not act if ctime.tv_sec does not match this timestamp * * This call may adjust the contents of @attr (in particular, this @@ -488,8 +487,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) */ __be32 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct nfsd_attrs *attr, - int check_guard, time64_t guardtime) + struct nfsd_attrs *attr, const struct timespec64 *guardtime) { struct dentry *dentry; struct inode *inode; @@ -497,7 +495,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int accmode = NFSD_MAY_SATTR; umode_t ftype = 0; __be32 err; - int host_err; + int host_err = 0; bool get_write_count; bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; @@ -538,9 +536,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode_get_ctime_sec(inode)) - return nfserr_notsync; - /* * The size case is special, it changes the file in addition to the * attributes, and file systems don't expect it to be mixed with @@ -555,6 +550,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, } inode_lock(inode); + err = fh_fill_pre_attrs(fhp); + if (err) + goto out_unlock; + + if (guardtime) { + struct timespec64 ctime = inode_get_ctime(inode); + if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec || + guardtime->tv_nsec != ctime.tv_nsec) { + err = nfserr_notsync; + goto out_fill_attrs; + } + } + for (retries = 1;;) { struct iattr attrs; @@ -582,13 +590,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, attr->na_aclerr = set_posix_acl(&nop_mnt_idmap, dentry, ACL_TYPE_DEFAULT, attr->na_dpacl); +out_fill_attrs: + /* + * RFC 1813 Section 3.3.2 does not mandate that an NFS server + * returns wcc_data for SETATTR. Some client implementations + * depend on receiving wcc_data, however, to sort out partial + * updates (eg., the client requested that size and mode be + * modified, but the server changed only the file mode). + */ + fh_fill_post_attrs(fhp); +out_unlock: inode_unlock(inode); if (size_change) put_write_access(inode); out: if (!host_err) host_err = commit_metadata(fhp); - return nfserrno(host_err); + return err != 0 ? err : nfserrno(host_err); } #if defined(CONFIG_NFSD_V4) @@ -1002,7 +1020,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { - nfsd_stats_io_read_add(fhp->fh_export, host_err); + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + + nfsd_stats_io_read_add(nn, fhp->fh_export, host_err); *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); @@ -1185,7 +1205,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, goto out_nfserr; } *cnt = host_err; - nfsd_stats_io_write_add(exp, *cnt); + nfsd_stats_io_write_add(nn, exp, *cnt); fsnotify_modify(file); host_err = filemap_check_wb_err(file->f_mapping, since); if (host_err < 0) @@ -1404,7 +1424,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, * if the attributes have not changed. */ if (iap->ia_valid) - status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0); + status = nfsd_setattr(rqstp, resfhp, attrs, NULL); else status = nfserrno(commit_metadata(resfhp)); @@ -1906,10 +1926,10 @@ out_unlock: fh_drop_write(ffhp); /* - * If the target dentry has cached open files, then we need to try to - * close them prior to doing the rename. Flushing delayed fput - * shouldn't be done with locks held however, so we delay it until this - * point and then reattempt the whole shebang. + * If the target dentry has cached open files, then we need to + * try to close them prior to doing the rename. Final fput + * shouldn't be done with locks held however, so we delay it + * until this point and then reattempt the whole shebang. */ if (close_cached) { close_cached = false; @@ -2177,11 +2197,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - fput(file); + nfsd_filp_close(file); out: return err; } +/** + * nfsd_filp_close: close a file synchronously + * @fp: the file to close + * + * nfsd_filp_close() is similar in behaviour to filp_close(). + * The difference is that if this is the final close on the + * file, the that finalisation happens immediately, rather then + * being handed over to a work_queue, as it the case for + * filp_close(). + * When a user-space process closes a file (even when using + * filp_close() the finalisation happens before returning to + * userspace, so it is effectively synchronous. When a kernel thread + * uses file_close(), on the other hand, the handling is completely + * asynchronous. This means that any cost imposed by that finalisation + * is not imposed on the nfsd thread, and nfsd could potentually + * close files more quickly than the work queue finalises the close, + * which would lead to unbounded growth in the queue. + * + * In some contexts is it not safe to synchronously wait for + * close finalisation (see comment for __fput_sync()), but nfsd + * does not match those contexts. In partcilarly it does not, at the + * time that this function is called, hold and locks and no finalisation + * of any file, socket, or device driver would have any cause to wait + * for nfsd to make progress. + */ +void nfsd_filp_close(struct file *fp) +{ + get_file(fp); + filp_close(fp, NULL); + __fput_sync(fp); +} + /* * Get file system stats * N.B. After this call fhp needs an fh_put diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 702fbc4483bf..c60fdb6200fd 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -69,7 +69,7 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *, const char *, unsigned int, struct svc_export **, struct dentry **); __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *, - struct nfsd_attrs *, int, time64_t); + struct nfsd_attrs *, const struct timespec64 *); int nfsd_mountpoint(struct dentry *, struct svc_export *); #ifdef CONFIG_NFSD_V4 __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, @@ -148,6 +148,8 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); +void nfsd_filp_close(struct file *fp); + static inline int fh_want_write(struct svc_fh *fh) { int ret; diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 03fe4e21306c..522067b7fd75 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -14,7 +14,7 @@ struct nfsd3_sattrargs { struct svc_fh fh; struct iattr attrs; int check_guard; - time64_t guardtime; + struct timespec64 guardtime; }; struct nfsd3_diropargs { diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 0d39af1b00a0..e8b00309c449 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -54,3 +54,21 @@ #define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +/* + * 1: CB_GETATTR opcode (32-bit) + * N: file_handle + * 1: number of entry in attribute array (32-bit) + * 1: entry 0 in attribute array (32-bit) + */ +#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_nfs4_fh_sz + 1 + 1) +/* + * 4: fattr_bitmap_maxsz + * 1: attribute array len + * 2: change attr (64-bit) + * 2: size (64-bit) + */ +#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz) diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 67cf1c9efd80..23617da0e565 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -339,7 +339,6 @@ struct svc_program { const struct svc_version **pg_vers; /* version array */ char * pg_name; /* service name */ char * pg_class; /* class name: services sharing authentication */ - struct svc_stat * pg_stats; /* rpc statistics */ enum svc_auth_status (*pg_authenticate)(struct svc_rqst *rqstp); __be32 (*pg_init_request)(struct svc_rqst *, const struct svc_program *, @@ -411,7 +410,9 @@ bool svc_rqst_replace_page(struct svc_rqst *rqstp, void svc_rqst_release_pages(struct svc_rqst *rqstp); void svc_rqst_free(struct svc_rqst *); void svc_exit_thread(struct svc_rqst *); -struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, +struct svc_serv * svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, + unsigned int bufsize, int (*threadfn)(void *data)); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_info *si, struct file *file); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index e7595ae62fe2..24cd199dd6f3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -203,6 +203,30 @@ struct svc_rdma_recv_ctxt { struct page *rc_pages[RPCSVC_MAXPAGES]; }; +/* + * State for sending a Write chunk. + * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + struct svcxprt_rdma *wi_rdma; + struct list_head wi_list; + + const struct svc_rdma_chunk *wi_chunk; + + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + + /* SGL constructor arguments */ + const struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; + struct work_struct wi_work; +}; + struct svc_rdma_send_ctxt { struct llist_node sc_node; struct rpc_rdma_cid sc_cid; @@ -210,9 +234,15 @@ struct svc_rdma_send_ctxt { struct svcxprt_rdma *sc_rdma; struct ib_send_wr sc_send_wr; + struct ib_send_wr *sc_wr_chain; + int sc_sqecount; struct ib_cqe sc_cqe; struct xdr_buf sc_hdrbuf; struct xdr_stream sc_stream; + + struct list_head sc_write_info_list; + struct svc_rdma_write_info sc_reply_info; + void *sc_xprt_buf; int sc_page_count; int sc_cur_sge_no; @@ -236,18 +266,27 @@ extern void svc_rdma_release_ctxt(struct svc_xprt *xprt, void *ctxt); extern int svc_rdma_recvfrom(struct svc_rqst *); /* svc_rdma_rw.c */ +extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); extern void svc_rdma_cc_init(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc); extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir); -extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr); -extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr); +extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); +extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); +extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); +extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr); extern int svc_rdma_process_read_list(struct svcxprt_rdma *rdma, struct svc_rqst *rqstp, struct svc_rdma_recv_ctxt *head); @@ -258,8 +297,8 @@ extern struct svc_rdma_send_ctxt * svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma); extern void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt); -extern int svc_rdma_send(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt); +extern int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt); extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_pcl *write_pcl, diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 110c1475c527..027ac3ab457d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -2118,6 +2118,10 @@ DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_write); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_flush); DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_write_err); +DEFINE_SIMPLE_CID_EVENT(svcrdma_wc_reply); +DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_reply_flush); +DEFINE_SEND_FLUSH_EVENT(svcrdma_wc_reply_err); + TRACE_EVENT(svcrdma_qp_error, TP_PROTO( const struct ib_event *event, diff --git a/include/trace/misc/nfs.h b/include/trace/misc/nfs.h index 0d9d48dca38a..64ab5dac59ce 100644 --- a/include/trace/misc/nfs.h +++ b/include/trace/misc/nfs.h @@ -385,3 +385,37 @@ TRACE_DEFINE_ENUM(IOMODE_ANY); { SEQ4_STATUS_RESTART_RECLAIM_NEEDED, "RESTART_RECLAIM_NEEDED" }, \ { SEQ4_STATUS_CB_PATH_DOWN_SESSION, "CB_PATH_DOWN_SESSION" }, \ { SEQ4_STATUS_BACKCHANNEL_FAULT, "BACKCHANNEL_FAULT" }) + +TRACE_DEFINE_ENUM(OP_CB_GETATTR); +TRACE_DEFINE_ENUM(OP_CB_RECALL); +TRACE_DEFINE_ENUM(OP_CB_LAYOUTRECALL); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY); +TRACE_DEFINE_ENUM(OP_CB_PUSH_DELEG); +TRACE_DEFINE_ENUM(OP_CB_RECALL_ANY); +TRACE_DEFINE_ENUM(OP_CB_RECALLABLE_OBJ_AVAIL); +TRACE_DEFINE_ENUM(OP_CB_RECALL_SLOT); +TRACE_DEFINE_ENUM(OP_CB_SEQUENCE); +TRACE_DEFINE_ENUM(OP_CB_WANTS_CANCELLED); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_LOCK); +TRACE_DEFINE_ENUM(OP_CB_NOTIFY_DEVICEID); +TRACE_DEFINE_ENUM(OP_CB_OFFLOAD); +TRACE_DEFINE_ENUM(OP_CB_ILLEGAL); + +#define show_nfs4_cb_op(x) \ + __print_symbolic(x, \ + { 0, "CB_NULL" }, \ + { 1, "CB_COMPOUND" }, \ + { OP_CB_GETATTR, "CB_GETATTR" }, \ + { OP_CB_RECALL, "CB_RECALL" }, \ + { OP_CB_LAYOUTRECALL, "CB_LAYOUTRECALL" }, \ + { OP_CB_NOTIFY, "CB_NOTIFY" }, \ + { OP_CB_PUSH_DELEG, "CB_PUSH_DELEG" }, \ + { OP_CB_RECALL_ANY, "CB_RECALL_ANY" }, \ + { OP_CB_RECALLABLE_OBJ_AVAIL, "CB_RECALLABLE_OBJ_AVAIL" }, \ + { OP_CB_RECALL_SLOT, "CB_RECALL_SLOT" }, \ + { OP_CB_SEQUENCE, "CB_SEQUENCE" }, \ + { OP_CB_WANTS_CANCELLED, "CB_WANTS_CANCELLED" }, \ + { OP_CB_NOTIFY_LOCK, "CB_NOTIFY_LOCK" }, \ + { OP_CB_NOTIFY_DEVICEID, "CB_NOTIFY_DEVICEID" }, \ + { OP_CB_OFFLOAD, "CB_OFFLOAD" }, \ + { OP_CB_ILLEGAL, "CB_ILLEGAL" }) diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index d2b02710ab07..b2c1b683a88e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -921,6 +921,8 @@ out_err: * Caller provides the truncation length of the output token (h) in * cksumout.len. * + * Note that for RPCSEC, the "initial cipher state" is always all zeroes. + * * Return values: * %GSS_S_COMPLETE: Digest computed, @cksumout filled in * %GSS_S_FAILURE: Call failed @@ -931,22 +933,19 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, int body_offset, struct xdr_netobj *cksumout) { unsigned int ivsize = crypto_sync_skcipher_ivsize(cipher); + static const u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; struct ahash_request *req; struct scatterlist sg[1]; - u8 *iv, *checksumdata; int err = -ENOMEM; + u8 *checksumdata; checksumdata = kmalloc(crypto_ahash_digestsize(tfm), GFP_KERNEL); if (!checksumdata) return GSS_S_FAILURE; - /* For RPCSEC, the "initial cipher state" is always all zeroes. */ - iv = kzalloc(ivsize, GFP_KERNEL); - if (!iv) - goto out_free_mem; req = ahash_request_alloc(tfm, GFP_KERNEL); if (!req) - goto out_free_mem; + goto out_free_cksumdata; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); err = crypto_ahash_init(req); if (err) @@ -970,8 +969,7 @@ u32 krb5_etm_checksum(struct crypto_sync_skcipher *cipher, out_free_ahash: ahash_request_free(req); -out_free_mem: - kfree(iv); +out_free_cksumdata: kfree_sensitive(checksumdata); return err ? GSS_S_FAILURE : GSS_S_COMPLETE; } diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 64cff717c3d9..3366505bc669 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -398,6 +398,7 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, u64 seq_send64; int keylen; u32 time32; + int ret; p = simple_get_bytes(p, end, &ctx->flags, sizeof(ctx->flags)); if (IS_ERR(p)) @@ -450,8 +451,16 @@ gss_import_v2_context(const void *p, const void *end, struct krb5_ctx *ctx, } ctx->mech_used.len = gss_kerberos_mech.gm_oid.len; - return gss_krb5_import_ctx_v2(ctx, gfp_mask); + ret = gss_krb5_import_ctx_v2(ctx, gfp_mask); + if (ret) { + p = ERR_PTR(ret); + goto out_free; + } + return 0; + +out_free: + kfree(ctx->mech_used.data); out_err: return PTR_ERR(p); } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index d79f12c2550a..cb32ab9a8395 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -250,8 +250,8 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, creds = kzalloc(sizeof(struct svc_cred), GFP_KERNEL); if (!creds) { - kfree(oa->data); - return -ENOMEM; + err = -ENOMEM; + goto free_oa; } oa->data[0].option.data = CREDS_VALUE; @@ -265,29 +265,40 @@ static int gssx_dec_option_array(struct xdr_stream *xdr, /* option buffer */ p = xdr_inline_decode(xdr, 4); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } length = be32_to_cpup(p); p = xdr_inline_decode(xdr, length); - if (unlikely(p == NULL)) - return -ENOSPC; + if (unlikely(p == NULL)) { + err = -ENOSPC; + goto free_creds; + } if (length == sizeof(CREDS_VALUE) && memcmp(p, CREDS_VALUE, sizeof(CREDS_VALUE)) == 0) { /* We have creds here. parse them */ err = gssx_dec_linux_creds(xdr, creds); if (err) - return err; + goto free_creds; oa->data[0].value.len = 1; /* presence */ } else { /* consume uninteresting buffer */ err = gssx_dec_buffer(xdr, &dummy); if (err) - return err; + goto free_creds; } } return 0; + +free_creds: + kfree(creds); +free_oa: + kfree(oa->data); + oa->data = NULL; + return err; } static int gssx_dec_status(struct xdr_stream *xdr, diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c index 65fc1297c6df..383860cb1d5b 100644 --- a/net/sunrpc/stats.c +++ b/net/sunrpc/stats.c @@ -314,7 +314,7 @@ EXPORT_SYMBOL_GPL(rpc_proc_unregister); struct proc_dir_entry * svc_proc_register(struct net *net, struct svc_stat *statp, const struct proc_ops *proc_ops) { - return do_register(net, statp->program->pg_name, statp, proc_ops); + return do_register(net, statp->program->pg_name, net, proc_ops); } EXPORT_SYMBOL_GPL(svc_proc_register); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b969e505c7b7..b33e429336fb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -451,8 +451,8 @@ __svc_init_bc(struct svc_serv *serv) * Create an RPC service */ static struct svc_serv * -__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - int (*threadfn)(void *data)) +__svc_create(struct svc_program *prog, struct svc_stat *stats, + unsigned int bufsize, int npools, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int vers; @@ -463,7 +463,7 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, return NULL; serv->sv_name = prog->pg_name; serv->sv_program = prog; - serv->sv_stats = prog->pg_stats; + serv->sv_stats = stats; if (bufsize > RPCSVC_MAXPAYLOAD) bufsize = RPCSVC_MAXPAYLOAD; serv->sv_max_payload = bufsize? bufsize : 4096; @@ -529,26 +529,28 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize, int (*threadfn)(void *data)) { - return __svc_create(prog, bufsize, 1, threadfn); + return __svc_create(prog, NULL, bufsize, 1, threadfn); } EXPORT_SYMBOL_GPL(svc_create); /** * svc_create_pooled - Create an RPC service with pooled threads * @prog: the RPC program the new service will handle + * @stats: the stats struct if desired * @bufsize: maximum message size for @prog * @threadfn: a function to service RPC requests for @prog * * Returns an instantiated struct svc_serv object or NULL. */ struct svc_serv *svc_create_pooled(struct svc_program *prog, + struct svc_stat *stats, unsigned int bufsize, int (*threadfn)(void *data)) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, threadfn); + serv = __svc_create(prog, stats, bufsize, npools, threadfn); if (!serv) goto out_err; return serv; @@ -1375,7 +1377,8 @@ svc_process_common(struct svc_rqst *rqstp) goto err_bad_proc; /* Syntactic check complete */ - serv->sv_stats->rpccnt++; + if (serv->sv_stats) + serv->sv_stats->rpccnt++; trace_svc_process(rqstp, progp->pg_name); aoffset = xdr_stream_pos(xdr); @@ -1427,7 +1430,8 @@ err_short_len: goto close_xprt; err_bad_rpc: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); xdr_stream_encode_u32(xdr, RPC_MISMATCH); /* Only RPCv2 supported */ @@ -1438,7 +1442,8 @@ err_bad_rpc: err_bad_auth: dprintk("svc: authentication failed (%d)\n", be32_to_cpu(rqstp->rq_auth_stat)); - serv->sv_stats->rpcbadauth++; + if (serv->sv_stats) + serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of reply status: */ xdr_truncate_encode(xdr, XDR_UNIT * 2); xdr_stream_encode_u32(xdr, RPC_MSG_DENIED); @@ -1448,7 +1453,8 @@ err_bad_auth: err_bad_prog: dprintk("svc: unknown program %d\n", rqstp->rq_prog); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_unavail; goto sendit; @@ -1456,7 +1462,8 @@ err_bad_vers: svc_printk(rqstp, "unknown version (%d for prog %d, %s)\n", rqstp->rq_vers, rqstp->rq_prog, progp->pg_name); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_prog_mismatch; /* @@ -1470,19 +1477,22 @@ err_bad_vers: err_bad_proc: svc_printk(rqstp, "unknown procedure (%d)\n", rqstp->rq_proc); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_proc_unavail; goto sendit; err_garbage_args: svc_printk(rqstp, "failed to decode RPC header\n"); - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_garbage_args; goto sendit; err_system_err: - serv->sv_stats->rpcbadfmt++; + if (serv->sv_stats) + serv->sv_stats->rpcbadfmt++; *rqstp->rq_accept_statp = rpc_system_err; goto sendit; } @@ -1534,7 +1544,8 @@ void svc_process(struct svc_rqst *rqstp) out_baddir: svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", be32_to_cpu(*p)); - rqstp->rq_server->sv_stats->rpcbadfmt++; + if (rqstp->rq_server->sv_stats) + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); } @@ -1612,7 +1623,6 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) WARN_ON_ONCE(atomic_read(&task->tk_count) != 1); rpc_put_task(task); } -EXPORT_SYMBOL_GPL(svc_process_bc); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index c9be6778643b..e5a78b761012 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -90,7 +90,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, */ get_page(virt_to_page(rqst->rq_buffer)); sctxt->sc_send_wr.opcode = IB_WR_SEND; - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /* Server-side transport endpoint wants a whole page for its send diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index c00fcce61d1e..f2a100c4c81f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -197,28 +197,6 @@ void svc_rdma_cc_release(struct svcxprt_rdma *rdma, llist_add_batch(first, last, &rdma->sc_rw_ctxts); } -/* State for sending a Write or Reply chunk. - * - Tracks progress of writing one chunk over all its segments - * - Stores arguments for the SGL constructor functions - */ -struct svc_rdma_write_info { - struct svcxprt_rdma *wi_rdma; - - const struct svc_rdma_chunk *wi_chunk; - - /* write state of this chunk */ - unsigned int wi_seg_off; - unsigned int wi_seg_no; - - /* SGL constructor arguments */ - const struct xdr_buf *wi_xdr; - unsigned char *wi_base; - unsigned int wi_next_off; - - struct svc_rdma_chunk_ctxt wi_cc; - struct work_struct wi_work; -}; - static struct svc_rdma_write_info * svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, const struct svc_rdma_chunk *chunk) @@ -252,6 +230,71 @@ static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) queue_work(svcrdma_wq, &info->wi_work); } +/** + * svc_rdma_write_chunk_release - Release Write chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_write_info *info; + struct svc_rdma_chunk_ctxt *cc; + + while (!list_empty(&ctxt->sc_write_info_list)) { + info = list_first_entry(&ctxt->sc_write_info_list, + struct svc_rdma_write_info, wi_list); + list_del(&info->wi_list); + + cc = &info->wi_cc; + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); + svc_rdma_write_info_free(info); + } +} + +/** + * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources + * @rdma: controlling transport + * @ctxt: Send context that is being released + */ +void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) +{ + struct svc_rdma_chunk_ctxt *cc = &ctxt->sc_reply_info.wi_cc; + + if (!cc->cc_sqecount) + return; + svc_rdma_cc_release(rdma, cc, DMA_TO_DEVICE); +} + +/** + * svc_rdma_reply_done - Reply chunk Write completion handler + * @cq: controlling Completion Queue + * @wc: Work Completion report + * + * Pages under I/O are released by a subsequent Send completion. + */ +static void svc_rdma_reply_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cq->cq_context; + + switch (wc->status) { + case IB_WC_SUCCESS: + trace_svcrdma_wc_reply(&cc->cc_cid); + return; + case IB_WC_WR_FLUSH_ERR: + trace_svcrdma_wc_reply_flush(wc, &cc->cc_cid); + break; + default: + trace_svcrdma_wc_reply_err(wc, &cc->cc_cid); + } + + svc_xprt_deferred_close(&rdma->sc_xprt); +} + /** * svc_rdma_write_done - Write chunk completion * @cq: controlling Completion Queue @@ -265,13 +308,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct svc_rdma_chunk_ctxt *cc = container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); - struct svc_rdma_write_info *info = - container_of(cc, struct svc_rdma_write_info, wi_cc); switch (wc->status) { case IB_WC_SUCCESS: trace_svcrdma_wc_write(&cc->cc_cid); - break; + return; case IB_WC_WR_FLUSH_ERR: trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); break; @@ -279,12 +320,11 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write_err(wc, &cc->cc_cid); } - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); - - if (unlikely(wc->status != IB_WC_SUCCESS)) - svc_xprt_deferred_close(&rdma->sc_xprt); - - svc_rdma_write_info_free(info); + /* The RDMA Write has flushed, so the client won't get + * some of the outgoing RPC message. Signal the loss + * to the client by closing the connection. + */ + svc_xprt_deferred_close(&rdma->sc_xprt); } /** @@ -580,41 +620,54 @@ static int svc_rdma_xb_write(const struct xdr_buf *xdr, void *data) return xdr->len; } -/** - * svc_rdma_send_write_chunk - Write all segments in a Write chunk - * @rdma: controlling RDMA transport - * @chunk: Write chunk provided by the client - * @xdr: xdr_buf containing the data payload - * - * Returns a non-negative number of bytes the chunk consumed, or - * %-E2BIG if the payload was larger than the Write chunk, - * %-EINVAL if client provided too many segments, - * %-ENOMEM if rdma_rw context pool was exhausted, - * %-ENOTCONN if posting failed (connection is lost), - * %-EIO if rdma_rw initialization failed (DMA mapping, etc). +/* Link Write WRs for @chunk onto @sctxt's WR chain. */ -int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_chunk *chunk, - const struct xdr_buf *xdr) +static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_chunk *chunk, + const struct xdr_buf *xdr) { struct svc_rdma_write_info *info; struct svc_rdma_chunk_ctxt *cc; + struct ib_send_wr *first_wr; + struct xdr_buf payload; + struct list_head *pos; + struct ib_cqe *cqe; int ret; + if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, + chunk->ch_payload_length)) + return -EMSGSIZE; + info = svc_rdma_write_info_alloc(rdma, chunk); if (!info) return -ENOMEM; cc = &info->wi_cc; - ret = svc_rdma_xb_write(xdr, info); - if (ret != xdr->len) + ret = svc_rdma_xb_write(&payload, info); + if (ret != payload.len) goto out_err; - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) + ret = -EINVAL; + if (unlikely(cc->cc_sqecount > rdma->sc_sq_depth)) goto out_err; - return xdr->len; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; + list_add(&info->wi_list, &sctxt->sc_write_info_list); + + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); + return 0; out_err: svc_rdma_write_info_free(info); @@ -622,9 +675,39 @@ out_err: } /** - * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list * @rdma: controlling RDMA transport - * @rctxt: Write and Reply chunks from client + * @write_pcl: Write list provisioned by the client + * @sctxt: Send WR resources + * @xdr: xdr_buf containing an RPC Reply message + * + * Returns zero on success, or a negative errno if one or more + * Write chunks could not be sent. + */ +int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) +{ + struct svc_rdma_chunk *chunk; + int ret; + + pcl_for_each_chunk(chunk, write_pcl) { + if (!chunk->ch_payload_length) + break; + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); + if (ret < 0) + return ret; + } + return 0; +} + +/** + * svc_rdma_prepare_reply_chunk - Construct WR chain for writing the Reply chunk + * @rdma: controlling RDMA transport + * @write_pcl: Write chunk list provided by client + * @reply_pcl: Reply chunk provided by client + * @sctxt: Send WR resources * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -634,39 +717,45 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, - const struct svc_rdma_recv_ctxt *rctxt, - const struct xdr_buf *xdr) +int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_pcl *write_pcl, + const struct svc_rdma_pcl *reply_pcl, + struct svc_rdma_send_ctxt *sctxt, + const struct xdr_buf *xdr) { - struct svc_rdma_write_info *info; - struct svc_rdma_chunk_ctxt *cc; - struct svc_rdma_chunk *chunk; + struct svc_rdma_write_info *info = &sctxt->sc_reply_info; + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct ib_send_wr *first_wr; + struct list_head *pos; + struct ib_cqe *cqe; int ret; - if (pcl_is_empty(&rctxt->rc_reply_pcl)) - return 0; + info->wi_rdma = rdma; + info->wi_chunk = pcl_first_chunk(reply_pcl); + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_cc.cc_cqe.done = svc_rdma_reply_done; - chunk = pcl_first_chunk(&rctxt->rc_reply_pcl); - info = svc_rdma_write_info_alloc(rdma, chunk); - if (!info) - return -ENOMEM; - cc = &info->wi_cc; - - ret = pcl_process_nonpayloads(&rctxt->rc_write_pcl, xdr, + ret = pcl_process_nonpayloads(write_pcl, xdr, svc_rdma_xb_write, info); if (ret < 0) - goto out_err; + return ret; + + first_wr = sctxt->sc_wr_chain; + cqe = &cc->cc_cqe; + list_for_each(pos, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *rwc; + + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + sctxt->sc_wr_chain = first_wr; + sctxt->sc_sqecount += cc->cc_sqecount; trace_svcrdma_post_reply_chunk(&cc->cc_cid, cc->cc_sqecount); - ret = svc_rdma_post_chunk_ctxt(rdma, cc); - if (ret < 0) - goto out_err; - return xdr->len; - -out_err: - svc_rdma_write_info_free(info); - return ret; } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 1a49b7f02041..dfca39abd16c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -142,6 +142,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.sg_list = ctxt->sc_sges; ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; + INIT_LIST_HEAD(&ctxt->sc_write_info_list); ctxt->sc_xprt_buf = buffer; xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, rdma->sc_max_req_size); @@ -205,9 +206,13 @@ out: xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, NULL); + svc_rdma_cc_init(rdma, &ctxt->sc_reply_info.wi_cc); ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; + ctxt->sc_wr_chain = &ctxt->sc_send_wr; + ctxt->sc_sqecount = 1; + return ctxt; out_empty: @@ -223,6 +228,9 @@ static void svc_rdma_send_ctxt_release(struct svcxprt_rdma *rdma, struct ib_device *device = rdma->sc_cm_id->device; unsigned int i; + svc_rdma_write_chunk_release(rdma, ctxt); + svc_rdma_reply_chunk_release(rdma, ctxt); + if (ctxt->sc_page_count) release_pages(ctxt->sc_pages, ctxt->sc_page_count); @@ -293,7 +301,7 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) struct svc_rdma_send_ctxt *ctxt = container_of(cqe, struct svc_rdma_send_ctxt, sc_cqe); - svc_rdma_wake_send_waiters(rdma, 1); + svc_rdma_wake_send_waiters(rdma, ctxt->sc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) goto flushed; @@ -312,51 +320,76 @@ flushed: } /** - * svc_rdma_send - Post a single Send WR - * @rdma: transport on which to post the WR - * @ctxt: send ctxt with a Send WR ready to post + * svc_rdma_post_send - Post a WR chain to the Send Queue + * @rdma: transport context + * @ctxt: WR chain to post * - * Returns zero if the Send WR was posted successfully. Otherwise, a - * negative errno is returned. + * Copy fields in @ctxt to stack variables in order to guarantee + * that these values remain available after the ib_post_send() call. + * In some error flow cases, svc_rdma_wc_send() releases @ctxt. + * + * Note there is potential for starvation when the Send Queue is + * full because there is no order to when waiting threads are + * awoken. The transport is typically provisioned with a deep + * enough Send Queue that SQ exhaustion should be a rare event. + * + * Return values: + * %0: @ctxt's WR chain was posted successfully + * %-ENOTCONN: The connection was lost */ -int svc_rdma_send(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt) +int svc_rdma_post_send(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *ctxt) { - struct ib_send_wr *wr = &ctxt->sc_send_wr; - int ret; + struct ib_send_wr *first_wr = ctxt->sc_wr_chain; + struct ib_send_wr *send_wr = &ctxt->sc_send_wr; + const struct ib_send_wr *bad_wr = first_wr; + struct rpc_rdma_cid cid = ctxt->sc_cid; + int ret, sqecount = ctxt->sc_sqecount; might_sleep(); /* Sync the transport header buffer */ ib_dma_sync_single_for_device(rdma->sc_pd->device, - wr->sg_list[0].addr, - wr->sg_list[0].length, + send_wr->sg_list[0].addr, + send_wr->sg_list[0].length, DMA_TO_DEVICE); /* If the SQ is full, wait until an SQ entry is available */ - while (1) { - if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { + while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) { + if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) { + svc_rdma_wake_send_waiters(rdma, sqecount); + + /* When the transport is torn down, assume + * ib_drain_sq() will trigger enough Send + * completions to wake us. The XPT_CLOSE test + * above should then cause the while loop to + * exit. + */ percpu_counter_inc(&svcrdma_stat_sq_starve); - trace_svcrdma_sq_full(rdma, &ctxt->sc_cid); - atomic_inc(&rdma->sc_sq_avail); + trace_svcrdma_sq_full(rdma, &cid); wait_event(rdma->sc_send_wait, - atomic_read(&rdma->sc_sq_avail) > 1); - if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) - return -ENOTCONN; - trace_svcrdma_sq_retry(rdma, &ctxt->sc_cid); + atomic_read(&rdma->sc_sq_avail) > 0); + trace_svcrdma_sq_retry(rdma, &cid); continue; } trace_svcrdma_post_send(ctxt); - ret = ib_post_send(rdma->sc_qp, wr, NULL); - if (ret) - break; + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) { + trace_svcrdma_sq_post_err(rdma, &cid, ret); + svc_xprt_deferred_close(&rdma->sc_xprt); + + /* If even one WR was posted, there will be a + * Send completion that bumps sc_sq_avail. + */ + if (bad_wr == first_wr) { + svc_rdma_wake_send_waiters(rdma, sqecount); + break; + } + } return 0; } - - trace_svcrdma_sq_post_err(rdma, &ctxt->sc_cid, ret); - svc_xprt_deferred_close(&rdma->sc_xprt); - wake_up(&rdma->sc_send_wait); - return ret; + return -ENOTCONN; } /** @@ -839,16 +872,10 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * in sc_sges[0], and the RPC xdr_buf is prepared in following sges. * * Depending on whether a Write list or Reply chunk is present, - * the server may send all, a portion of, or none of the xdr_buf. + * the server may Send all, a portion of, or none of the xdr_buf. * In the latter case, only the transport header (sc_sges[0]) is * transmitted. * - * RDMA Send is the last step of transmitting an RPC reply. Pages - * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the sctxt's page array. These pages are - * DMA unmapped by each Write completion, but the subsequent Send - * completion finally releases these pages. - * * Assumptions: * - The Reply's transport header will never be larger than a page. */ @@ -857,6 +884,7 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, const struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp) { + struct ib_send_wr *send_wr = &sctxt->sc_send_wr; int ret; ret = svc_rdma_map_reply_msg(rdma, sctxt, &rctxt->rc_write_pcl, @@ -864,16 +892,19 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, if (ret < 0) return ret; + /* Transfer pages involved in RDMA Writes to the sctxt's + * page array. Completion handling releases these pages. + */ svc_rdma_save_io_pages(rqstp, sctxt); if (rctxt->rc_inv_rkey) { - sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; - sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = rctxt->rc_inv_rkey; } else { - sctxt->sc_send_wr.opcode = IB_WR_SEND; + send_wr->opcode = IB_WR_SEND; } - return svc_rdma_send(rdma, sctxt); + return svc_rdma_post_send(rdma, sctxt); } /** @@ -937,7 +968,7 @@ void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, sctxt->sc_send_wr.num_sge = 1; sctxt->sc_send_wr.opcode = IB_WR_SEND; sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - if (svc_rdma_send(rdma, sctxt)) + if (svc_rdma_post_send(rdma, sctxt)) goto put_ctxt; return; @@ -984,10 +1015,20 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) if (!p) goto put_ctxt; - ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); + ret = svc_rdma_prepare_write_list(rdma, &rctxt->rc_write_pcl, sctxt, + &rqstp->rq_res); if (ret < 0) - goto reply_chunk; - rc_size = ret; + goto put_ctxt; + + rc_size = 0; + if (!pcl_is_empty(&rctxt->rc_reply_pcl)) { + ret = svc_rdma_prepare_reply_chunk(rdma, &rctxt->rc_write_pcl, + &rctxt->rc_reply_pcl, sctxt, + &rqstp->rq_res); + if (ret < 0) + goto reply_chunk; + rc_size = ret; + } *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); @@ -1030,45 +1071,33 @@ drop_connection: /** * svc_rdma_result_payload - special processing for a result payload - * @rqstp: svc_rqst to operate on - * @offset: payload's byte offset in @xdr + * @rqstp: RPC transaction context + * @offset: payload's byte offset in @rqstp->rq_res * @length: size of payload, in bytes * + * Assign the passed-in result payload to the current Write chunk, + * and advance to cur_result_payload to the next Write chunk, if + * there is one. + * * Return values: * %0 if successful or nothing needed to be done - * %-EMSGSIZE on XDR buffer overflow * %-E2BIG if the payload was larger than the Write chunk - * %-EINVAL if client provided too many segments - * %-ENOMEM if rdma_rw context pool was exhausted - * %-ENOTCONN if posting failed (connection is lost) - * %-EIO if rdma_rw initialization failed (DMA mapping, etc) */ int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset, unsigned int length) { struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; struct svc_rdma_chunk *chunk; - struct svcxprt_rdma *rdma; - struct xdr_buf subbuf; - int ret; chunk = rctxt->rc_cur_result_payload; if (!length || !chunk) return 0; rctxt->rc_cur_result_payload = pcl_next_chunk(&rctxt->rc_write_pcl, chunk); + if (length > chunk->ch_length) return -E2BIG; - chunk->ch_position = offset; chunk->ch_payload_length = length; - - if (xdr_buf_subsegment(&rqstp->rq_res, &subbuf, offset, length)) - return -EMSGSIZE; - - rdma = container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - ret = svc_rdma_send_write_chunk(rdma, chunk, &subbuf); - if (ret < 0) - return ret; return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4f27325ace4a..2b1c16b9547d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -415,15 +415,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) newxprt->sc_max_send_sges = dev->attrs.max_send_sge; rq_depth = newxprt->sc_max_requests + newxprt->sc_max_bc_requests + - newxprt->sc_recv_batch; + newxprt->sc_recv_batch + 1 /* drain */; if (rq_depth > dev->attrs.max_qp_wr) { rq_depth = dev->attrs.max_qp_wr; newxprt->sc_recv_batch = 1; newxprt->sc_max_requests = rq_depth - 2; newxprt->sc_max_bc_requests = 2; } - ctxts = rdma_rw_mr_factor(dev, newxprt->sc_port_num, RPCSVC_MAXPAGES); - ctxts *= newxprt->sc_max_requests; + + /* Arbitrarily estimate the number of rw_ctxs needed for + * this transport. This is enough rw_ctxs to make forward + * progress even if the client is using one rkey per page + * in each Read chunk. + */ + ctxts = 3 * RPCSVC_MAXPAGES; newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; @@ -460,12 +465,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr); dprintk(" cap.max_send_sge = %d, cap.max_recv_sge = %d\n", qp_attr.cap.max_send_sge, qp_attr.cap.max_recv_sge); - + dprintk(" send CQ depth = %u, recv CQ depth = %u\n", + newxprt->sc_sq_depth, rq_depth); ret = rdma_create_qp(newxprt->sc_cm_id, newxprt->sc_pd, &qp_attr); if (ret) { trace_svcrdma_qp_err(newxprt, ret); goto errout; } + newxprt->sc_max_send_sges = qp_attr.cap.max_send_sge; newxprt->sc_qp = newxprt->sc_cm_id->qp; if (!(dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 58f3dc8d0d71..d92c13e78a56 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2987,20 +2987,11 @@ static int bc_send_request(struct rpc_rqst *req) return len; } -/* - * The close routine. Since this is client initiated, we do nothing - */ - static void bc_close(struct rpc_xprt *xprt) { xprt_disconnect_done(xprt); } -/* - * The xprt destroy routine. Again, because this connection is client - * initiated, we do nothing - */ - static void bc_destroy(struct rpc_xprt *xprt) { dprintk("RPC: bc_destroy xprt %p\n", xprt);