From ab7d763e477c5be33ac9cffc68e808bbd69371f7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 10:32:11 -0500 Subject: [PATCH 01/18] pNFS: Ensure nfs4_layoutget_prepare returns the correct error If we're unable to perform the layoutget due to an invalid open stateid or a bulk recall, ensure that we return the error so that the caller can decide on an appropriate action. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 18d862db15b6..fcd7a9039020 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7760,6 +7760,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); + int ret; dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -7770,12 +7771,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; - if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, &lgp->args.range, - lgp->args.ctx->state)) { - rpc_exit(task, NFS4_OK); - } + lgp->args.ctx->state); + if (ret < 0) + rpc_exit(task, ret); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) From 0654cc726fc6eed6dca915fb65ba7975716ea080 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 11:48:14 -0500 Subject: [PATCH 02/18] NFSv4.1/pNFS: Add a helper to mark the layout as returned This ensures that we don't reuse the stateid if a layout return or implied layout return means that we've returned all layout segments Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 1 + fs/nfs/nfs4proc.c | 3 ++- fs/nfs/pnfs.c | 1 + fs/nfs/pnfs.h | 13 +++++++++++++ 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 807eb6ef4f91..716cbff24450 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -192,6 +192,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } + pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fcd7a9039020..883da29b9ace 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8049,9 +8049,10 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6095a8d42766..b3fb6bb02275 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1149,6 +1149,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; + pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d1990e90e7a0..be24a759b655 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -556,6 +556,19 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } +/** + * pnfs_mark_layout_returned_if_empty - marks the layout as returned + * @lo: layout header + * + * Note: Caller must hold inode->i_lock + */ +static inline void +pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) +{ + if (list_empty(&lo->plh_segs)) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG From fc7ff36747b991d1be0d68987066ea87eedbb43e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 10:28:59 -0500 Subject: [PATCH 03/18] pNFS: If we have to delay the layout callback, mark the layout for return If the client needs to delay the layout callback, then speed up the recall process by marking the remaining layout segments to be actively returned by the client. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 14 ++++++++++++-- fs/nfs/pnfs.c | 4 +++- fs/nfs/pnfs.h | 3 +++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 716cbff24450..34852ece4057 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -181,9 +181,19 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, + /* + * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) + */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + rv = NFS4ERR_DELAY; + goto unlock; + } + + if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &args->cbl_range)) { + pnfs_mark_matching_lsegs_return(lo, + &free_me_list, + &args->cbl_range); rv = NFS4ERR_DELAY; goto unlock; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index b3fb6bb02275..360fe95c97b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1756,7 +1756,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } -static void +void pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *return_range) @@ -1768,6 +1768,8 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, if (list_empty(&lo->plh_segs)) return; + assert_spin_locked(&lo->plh_inode->i_lock); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(&lseg->pls_range, return_range)) { dprintk("%s: marking lseg %p iomode %d " diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index be24a759b655..d93c2ebc0fd3 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -266,6 +266,9 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); +void pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); From 41c9127d6d588138c66b2614ac596ea63313acca Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 12:57:53 -0500 Subject: [PATCH 04/18] NFSv4.1/pNFS: Ensure we enforce RFC5661 Section 12.5.5.2.1 The RFC requires us to check if the server is recalling a stateid that we haven't yet received. If so, tell it to wait. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 34852ece4057..1b24ad07d4f5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -160,6 +160,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, return lo; } +/* + * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) + */ +static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + + if (newseq > oldseq + 1) + return false; + return true; +} + static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { @@ -175,6 +191,10 @@ static u32 initiate_file_draining(struct nfs_client *clp, ino = lo->plh_inode; spin_lock(&ino->i_lock); + if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { + rv = NFS4ERR_DELAY; + goto unlock; + } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); From e0d9243048fd18da695b8d3fe331176eac60866e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 12:21:50 -0500 Subject: [PATCH 05/18] NFSv4.1/pNFS: Don't return NFS4ERR_DELAY unnecessarily in CB_LAYOUTRECALL If the client is promising to return the layout ASAP, then there is no need to return DELAY and have the server retry. Instead default to the normal procedure described in RFC5661. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 1b24ad07d4f5..724a9b756ab0 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -214,7 +214,7 @@ static u32 initiate_file_draining(struct nfs_client *clp, pnfs_mark_matching_lsegs_return(lo, &free_me_list, &args->cbl_range); - rv = NFS4ERR_DELAY; + rv = NFS4_OK; goto unlock; } From e07db907eb80525874b7707c62cc6f5e975ef130 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 28 Dec 2015 14:07:23 -0500 Subject: [PATCH 06/18] NFSv4: List stateid information in the callback tracepoints The stateid is extremely valuable when debugging. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 16 +++++++--- fs/nfs/nfs4trace.h | 69 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 79 insertions(+), 6 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 724a9b756ab0..e4dbab5dce6e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, + &args->stateid, -ntohl(res)); goto out; + } /* Set up a helper thread to actually return the delegation */ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: @@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } - trace_nfs4_recall_delegation(inode, -ntohl(res)); + trace_nfs4_cb_recall(cps->clp, &args->fh, inode, + &args->stateid, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -185,8 +189,11 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); - if (!lo) + if (!lo) { + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, + &args->cbl_stateid, -rv); goto out; + } ino = lo->plh_inode; @@ -227,7 +234,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); - trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, + &args->cbl_stateid, -rv); iput(ino); out: return rv; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index d08d0c84b778..88b6b14ce71b 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -982,7 +982,6 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, TP_PROTO( @@ -1145,8 +1144,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), \ TP_ARGS(clp, fhandle, inode, error)) DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); -DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); +DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(clp, fhandle, inode, stateid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, stateid, error)) +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( From dc602dd706cb64036132a7903ead1c67d9a7bcb9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 31 Dec 2015 11:44:06 -0500 Subject: [PATCH 07/18] NFS/pNFS: Fix up pNFS write reschedule layering violations and bugs The flexfiles layout in particular, seems to want to poke around in the O_DIRECT flags when retransmitting. This patch sets up an interface to allow it to call back into O_DIRECT to handle retransmission correctly. It also fixes a potential bug whereby we could change the behaviour of O_DIRECT if an error is already pending. Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 21 +++++++++++++++------ fs/nfs/flexfilelayout/flexfilelayout.c | 13 +------------ fs/nfs/internal.h | 1 - fs/nfs/write.c | 6 ++++++ include/linux/nfs_xdr.h | 1 + 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4b1d08f56aba..e73693f75dee 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) -{ - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -} -EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); - static void nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) { @@ -839,10 +833,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head) } } +static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + + spin_lock(&dreq->lock); + if (dreq->error == 0) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = hdr->args.count; + } + spin_unlock(&dreq->lock); +} + static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .error_cleanup = nfs_write_sync_pgio_error, .init_hdr = nfs_direct_pgio_init, .completion = nfs_direct_write_completion, + .reschedule_io = nfs_direct_write_reschedule_io, }; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 03516c80855a..df475d42df77 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -912,18 +912,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); - if (!hdr->dreq) { - struct nfs_open_context *ctx; - - ctx = nfs_list_entry(hdr->pages.next)->wb_context; - set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - hdr->completion_ops->error_cleanup(&hdr->pages); - } else { - nfs_direct_set_resched_writes(hdr->dreq); - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + hdr->completion_ops->reschedule_io(hdr); return; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 313d55402238..99a2919047e9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -519,7 +519,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); -extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406930..0aa3e6b3db70 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1326,9 +1326,15 @@ static void nfs_async_write_error(struct list_head *head) } } +static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + nfs_async_write_error(&hdr->pages); +} + static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .error_cleanup = nfs_async_write_error, .completion = nfs_write_completion, + .reschedule_io = nfs_async_write_reschedule_io, }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 11bbae44f4cb..e89dbb14138c 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1460,6 +1460,7 @@ struct nfs_pgio_completion_ops { void (*error_cleanup)(struct list_head *head); void (*init_hdr)(struct nfs_pgio_header *hdr); void (*completion)(struct nfs_pgio_header *hdr); + void (*reschedule_io)(struct nfs_pgio_header *hdr); }; struct nfs_unlinkdata { From af7cf057933f01dc7f33ddfb5e436ad598ed17ad Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 29 Sep 2015 20:34:05 -0400 Subject: [PATCH 08/18] NFS: Allow multiple commit requests in flight per file Allow synchronous RPC calls to wait for pending RPC calls to finish, but also allow asynchronous ones to just fire off another commit. With this patch, the xfstests generic/074 test completes in 226s instead of 242s Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 6 ---- fs/nfs/file.c | 2 +- fs/nfs/nfstrace.h | 1 - fs/nfs/pnfs_nfs.c | 5 +-- fs/nfs/write.c | 74 +++++++++++++++++++---------------------- include/linux/nfs_fs.h | 1 - include/linux/nfs_xdr.h | 1 - 7 files changed, 37 insertions(+), 53 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e73693f75dee..14f77df79c25 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -721,14 +721,8 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } -static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) -{ - /* There is no lock to clear */ -} - static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, - .error_cleanup = nfs_direct_error_cleanup, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93e236429c5d..e6ef80ec699c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page, * so it will not block due to pages that will shortly be freeable. */ nfsi = NFS_I(mapping->host); - if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + if (atomic_read(&nfsi->commit_info.rpcs_out)) { *writeback = true; return; } diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 59f838cdc009..9f80a086b612 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_COMMIT, "COMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 24655b807d44..3c8e3a44e6ea 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } else { nfs_retry_commit(mds_pages, NULL, cinfo, 0); pnfs_generic_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); + if (nreq == 0) goto out; - } atomic_add(nreq, &cinfo->mds->rpcs_out); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0aa3e6b3db70..ae29f082c9c2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include @@ -1535,27 +1537,29 @@ static void nfs_writeback_result(struct rpc_task *task, } } - -static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +static int nfs_wait_atomic_killable(atomic_t *key) { - int ret; - - if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) - return 1; - if (!may_wait) - return 0; - ret = out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - return (ret < 0) ? ret : 1; + if (fatal_signal_pending(current)) + return -ERESTARTSYS; + freezable_schedule_unsafe(); + return 0; } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_atomic(); - wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); + return wait_on_atomic_t(&cinfo->rpcs_out, + nfs_wait_atomic_killable, TASK_KILLABLE); +} + +static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +{ + atomic_inc(&cinfo->rpcs_out); +} + +static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) +{ + if (atomic_dec_and_test(&cinfo->rpcs_out)) + wake_up_atomic_t(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1693,7 +1697,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); out_bad: nfs_retry_commit(head, NULL, cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1755,8 +1758,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commit_end(cinfo.mds); } static void nfs_commit_release(void *calldata) @@ -1775,7 +1777,6 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, - .error_cleanup = nfs_commit_clear_lock, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1794,30 +1795,25 @@ int nfs_commit_inode(struct inode *inode, int how) LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; + int error = 0; int res; - res = nfs_commit_set_lock(NFS_I(inode), may_wait); - if (res <= 0) - goto out_mark_dirty; nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_commit_begin(cinfo.mds); res = nfs_scan_commit(inode, &head, &cinfo); - if (res) { - int error; - + if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); - if (error < 0) - return error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_bit_action(&NFS_I(inode)->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - if (error < 0) - return error; - } else - nfs_commit_clear_lock(NFS_I(inode)); + nfs_commit_end(cinfo.mds); + if (error < 0) + goto out_error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_commit(cinfo.mds); + if (error < 0) + return error; return res; +out_error: + res = error; /* Note: If we exit without ensuring that the commit is complete, * we must mark the inode as dirty. Otherwise, future calls to * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index c0e961474a52..ebf0bd72a42b 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -216,7 +216,6 @@ struct nfs_inode { #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ -#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index e89dbb14138c..a8905b7d4d7f 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1423,7 +1423,6 @@ struct nfs_mds_commit_info { struct nfs_commit_data; struct nfs_inode; struct nfs_commit_completion_ops { - void (*error_cleanup) (struct nfs_inode *nfsi); void (*completion) (struct nfs_commit_data *data); }; From b20135d0b2431900a3a5395970ffb7e4f3767c8b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 31 Dec 2015 09:28:06 -0500 Subject: [PATCH 09/18] NFSv4.1/pNFS: Don't queue up a new commit if the layout segment is invalid If the layout segment is invalid, then we should not be adding more write requests to the commit list. Instead, those writes should be replayed after requesting a new layout. Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 2 ++ fs/nfs/direct.c | 12 ++++++++++++ fs/nfs/pnfs.c | 3 +++ fs/nfs/pnfs.h | 6 ++++++ fs/nfs/pnfs_nfs.c | 5 +++++ fs/nfs/write.c | 9 +++++++++ include/linux/nfs_xdr.h | 2 ++ 7 files changed, 39 insertions(+) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e4dbab5dce6e..2be8b252e3b1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -233,6 +233,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 14f77df79c25..a9a93927fe3e 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -721,8 +721,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } +static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + struct nfs_direct_req *dreq = cinfo->dreq; + + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_mark_request_commit(req, NULL, cinfo, 0); +} + static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, + .resched_write = nfs_direct_resched_write, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 360fe95c97b5..6593be7c0129 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -703,6 +703,8 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, ret = -EAGAIN; spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); iput(inode); } @@ -1811,6 +1813,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, pnfs_mark_matching_lsegs_return(lo, &free_me, &range); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&free_me); + nfs_commit_inode(inode, 0); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d93c2ebc0fd3..4bd7faf9ce50 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -412,6 +412,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) return lseg; } +static inline bool +pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg) +{ + return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 3c8e3a44e6ea..81ac6480f9e7 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -868,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { + if (!pnfs_is_valid_lseg(lseg)) { + spin_unlock(cinfo->lock); + cinfo->completion_ops->resched_write(cinfo, req); + return; + } /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ae29f082c9c2..0aa8d6f23b4c 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1676,6 +1676,13 @@ void nfs_retry_commit(struct list_head *page_list, } EXPORT_SYMBOL_GPL(nfs_retry_commit); +static void +nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + /* * Commit dirty pages */ @@ -1777,6 +1784,7 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, + .resched_write = nfs_commit_resched_write, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1823,6 +1831,7 @@ int nfs_commit_inode(struct inode *inode, int how) __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } +EXPORT_SYMBOL_GPL(nfs_commit_inode); int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index a8905b7d4d7f..bee3e60a7006 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1420,10 +1420,12 @@ struct nfs_mds_commit_info { struct list_head list; }; +struct nfs_commit_info; struct nfs_commit_data; struct nfs_inode; struct nfs_commit_completion_ops { void (*completion) (struct nfs_commit_data *data); + void (*resched_write) (struct nfs_commit_info *, struct nfs_page *); }; struct nfs_commit_info { From 138a2935dc9783b131d9647c3bddb22ae5c84d77 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Oct 2015 17:17:06 -0400 Subject: [PATCH 10/18] NFS: Relax requirements in nfs_flush_incompatible If two processes share the same credentials and NFSv4 open stateid, then allow them both to dirty the same page, even if their nfs_open_context differs. Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 6 ++++++ fs/nfs/pagelist.c | 6 ------ fs/nfs/write.c | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 99a2919047e9..870e2ba7ba49 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -264,6 +264,12 @@ static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) return desc->pg_mirror_count > 1; } +static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba0d8..c3a78450a239 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -903,12 +903,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) pgio->pg_mirrors_dynamic = NULL; } -static bool nfs_match_open_context(const struct nfs_open_context *ctx1, - const struct nfs_open_context *ctx2) -{ - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; -} - static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0aa8d6f23b4c..2c26e04d9396 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1130,7 +1130,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || + !nfs_match_open_context(req->wb_context, ctx); /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; if (l_ctx && flctx && From ed429d6b934d44e25f23f8102375a103c6fc3996 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 12:30:55 -0500 Subject: [PATCH 11/18] NFSv4.1/pNFS: Don't pass stateids by value to pnfs_send_layoutreturn() A stateid is a structure, pass it as a pointer. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 6593be7c0129..7a452895169f 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync); /* Return the registered pnfs layout driver module matching given id */ @@ -391,7 +391,7 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, spin_unlock(&inode->i_lock); if (send) { /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + pnfs_send_layoutreturn(lo, &stateid, iomode, false); } } else spin_unlock(&inode->i_lock); @@ -947,7 +947,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) } static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; @@ -964,7 +964,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, goto out; } - lrp->args.stateid = stateid; + nfs4_stateid_copy(&lrp->args.stateid, stateid); lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; lrp->args.range.iomode = iomode; @@ -1035,7 +1035,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: @@ -1126,7 +1126,7 @@ bool pnfs_roc(struct inode *ino) pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); if (layoutreturn) - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); return roc; } From 50f563ef5d418127a75ca9b7116232672bbd8aaf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 12:22:46 -0500 Subject: [PATCH 12/18] NFSv4.1/pNFS: Use nfs4_stateid_copy for copying stateids Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7a452895169f..7e0f2b9a9b10 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -385,7 +385,7 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, enum pnfs_iomode iomode; bool send; - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); iomode = lo->plh_return_iomode; send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); @@ -1007,7 +1007,7 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - stateid = nfsi->layout->plh_stateid; + nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1098,7 +1098,7 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags)) From 5c97f5de2c7cd9e2a5f71bc7c53125d9a2833ca9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 11:53:04 -0500 Subject: [PATCH 13/18] NFSv4.1/pNFS: pnfs_mark_matching_lsegs_return() should set the iomode If pnfs_mark_matching_lsegs_return() needs to mark a layout segment for return, then it must also set the return iomode. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 7e0f2b9a9b10..449c4782cab3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1758,6 +1758,16 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } +static void +pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +{ + if (lo->plh_return_iomode == iomode) + return; + if (lo->plh_return_iomode != 0) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; +} + void pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, @@ -1780,6 +1790,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.offset, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + pnfs_set_plh_return_iomode(lo, return_range->iomode); mark_lseg_invalid(lseg, tmp_list); set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); @@ -1801,10 +1812,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - if (lo->plh_return_iomode == 0) - lo->plh_return_iomode = range.iomode; - else if (lo->plh_return_iomode != range.iomode) - lo->plh_return_iomode = IOMODE_ANY; + pnfs_set_plh_return_iomode(lo, range.iomode); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() From 10335556c9e6ed2e1949fb595b7775f475299832 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 11:23:52 -0500 Subject: [PATCH 14/18] NFSv4.1/pNFS: pnfs_error_mark_layout_for_return() must always return layout Fix a bug whereby if all the layout segments could be immediately freed, the call to pnfs_error_mark_layout_for_return() would never result in a layoutreturn. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 26 ++++++++++++++++++++------ fs/nfs/pnfs.h | 2 +- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 449c4782cab3..8e1d4229bf2d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1768,17 +1768,18 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) lo->plh_return_iomode = iomode; } -void +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) - return; + return 0; assert_spin_locked(&lo->plh_inode->i_lock); @@ -1791,10 +1792,12 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg->pls_range.length); set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); pnfs_set_plh_return_iomode(lo, return_range->iomode); - mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); } + return remaining; } void pnfs_error_mark_layout_for_return(struct inode *inode, @@ -1808,6 +1811,7 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, .length = NFS4_MAX_UINT64, }; LIST_HEAD(free_me); + bool return_now = false; spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ @@ -1818,10 +1822,20 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - pnfs_mark_matching_lsegs_return(lo, &free_me, &range); - spin_unlock(&inode->i_lock); + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode = lo->plh_return_iomode; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + return_now = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (return_now) + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } else { + spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); + } pnfs_free_lseg_list(&free_me); - nfs_commit_inode(inode, 0); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 4bd7faf9ce50..3d0f513a4a77 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -266,7 +266,7 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); -void pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); From 4b0934baf9317e05c7568da1366a1d65f151d81f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 11:28:11 -0500 Subject: [PATCH 15/18] NFSv4.1/pNFS: Fix a race in initiate_file_draining() Peng Tao points out that the call to pnfs_mark_matching_lsegs_return() could race with pnfs_put_lseg(), in which case the layout segment is cleared, but no layoutreturn will be sent. Fix is to replace the call to pnfs_mark_matching_lsegs_invalid(). Reported-by: Peng Tao Signed-off-by: Trond Myklebust --- fs/nfs/callback_proc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2be8b252e3b1..f0939d097406 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -216,11 +216,8 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto unlock; } - if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, + if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, &args->cbl_range)) { - pnfs_mark_matching_lsegs_return(lo, - &free_me_list, - &args->cbl_range); rv = NFS4_OK; goto unlock; } From 71b39854a500be0b80cb3bc05546a7962f387b5b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 12:41:15 -0500 Subject: [PATCH 16/18] NFSv4.1/pNFS: Cleanup pnfs_mark_matching_lsegs_invalid() Make it more obvious what we're returning... Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 8e1d4229bf2d..f86f060f853d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -569,7 +569,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; - int invalid = 0, removed = 0; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -582,11 +582,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - invalid++; - removed += mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; } - dprintk("%s:Return %i\n", __func__, invalid - removed); - return invalid - removed; + dprintk("%s:Return %i\n", __func__, remaining); + return remaining; } /* note free_me must contain lsegs from a single layout_hdr */ From e144e5391cf0881c9d64750dca8c592f6b5f0378 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 12:52:53 -0500 Subject: [PATCH 17/18] NFSv4.1/pnfs: Cleanup copying of pnfs_layout_range structures Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 4 ++-- fs/nfs/pnfs.h | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f86f060f853d..ebf896b54d9d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -863,7 +863,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -896,7 +896,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.minlength = i_size - range->offset; } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; + pnfs_copy_range(&lgp->args.range, range); lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 3d0f513a4a77..dcc76335fd4b 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -578,6 +578,13 @@ pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); } +static inline void +pnfs_copy_range(struct pnfs_layout_range *dst, + const struct pnfs_layout_range *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG From 506c0d68269e90d354b3cbfc7523611b026c88d0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Jan 2016 13:04:47 -0500 Subject: [PATCH 18/18] NFSv4.1/pNFS: Cleanup constify struct pnfs_layout_range arguments Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 6 +++--- fs/nfs/pnfs.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ebf896b54d9d..04db6d951b99 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -566,7 +566,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; @@ -828,7 +828,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; @@ -1771,7 +1771,7 @@ pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; int remaining = 0; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dcc76335fd4b..78df618a1596 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,14 +261,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);