diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 4e0654b56aef..238bd211f365 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2434,12 +2434,6 @@ and gids from such clients. This is intended to ease migration from NFSv2/v3. - objlayoutdriver.osd_login_prog= - [NFS] [OBJLAYOUT] sets the pathname to the program which - is used to automatically discover and login into new - osd-targets. Please see: - Documentation/filesystems/pnfs.txt for more explanations - nmi_debug= [KNL,SH] Specify one or more actions to take when a NMI is triggered. Format: [state][,regs][,debounce][,die] diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt index 8de578a98222..80dc0bdc302a 100644 --- a/Documentation/filesystems/nfs/pnfs.txt +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -64,46 +64,9 @@ table which are called by the nfs-client pnfs-core to implement the different layout types. Files-layout-driver code is in: fs/nfs/filelayout/.. directory -Objects-layout-driver code is in: fs/nfs/objlayout/.. directory Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory Flexfiles-layout-driver code is in: fs/nfs/flexfilelayout/.. directory -objects-layout setup --------------------- - -As part of the full STD implementation the objlayoutdriver.ko needs, at times, -to automatically login to yet undiscovered iscsi/osd devices. For this the -driver makes up-calles to a user-mode script called *osd_login* - -The path_name of the script to use is by default: - /sbin/osd_login. -This name can be overridden by the Kernel module parameter: - objlayoutdriver.osd_login_prog - -If Kernel does not find the osd_login_prog path it will zero it out -and will not attempt farther logins. An admin can then write new value -to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it. - -The /sbin/osd_login is part of the nfs-utils package, and should usually -be installed on distributions that support this Kernel version. - -The API to the login script is as follows: - Usage: $0 -u -o -s - Options: - -u target uri e.g. iscsi://: - (always exists) - (More protocols can be defined in the future. - The client does not interpret this string it is - passed unchanged as received from the Server) - -o osdname of the requested target OSD - (Might be empty) - (A string which denotes the OSD name, there is a - limit of 64 chars on this string) - -s systemid of the requested target OSD - (Might be empty) - (This string, if not empty is always an hex - representation of the 20 bytes osd_system_id) - blocks-layout setup ------------------- diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa93f09ae6e6..3ee4fdc3da9e 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2177,7 +2177,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; if (pid && pid_nr == 0) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7..27d577dbe51a 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933..066ac313ae5c 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ out: return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ die: } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b..af2031a1fcff 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2504,7 +2504,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c6..69d02cf8cf37 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -123,11 +123,6 @@ config PNFS_BLOCK depends on NFS_V4_1 && BLK_DEV_DM default NFS_V4 -config PNFS_OBJLAYOUT - tristate - depends on NFS_V4_1 && SCSI_OSD_ULD - default NFS_V4 - config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6abdda209642..98f4e5728a67 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f073a6d2c6a5..52479f180ea1 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -131,10 +131,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -170,10 +171,11 @@ restart: if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 res; - - dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); if (args->cbl_recall_type == RETURN_FILE) - res = initiate_file_draining(clp, args); - else - res = initiate_bulk_draining(clp, args); - dprintk("%s returning %i\n", __func__, res); - return res; - + return initiate_file_draining(clp, args); + return initiate_bulk_draining(clp, args); } __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps) { - u32 res; - - dprintk("%s: -->\n", __func__); + u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) res = do_callback_layoutrecall(cps->clp, args); - else - res = NFS4ERR_OP_NOT_IN_SESSION; - - dprintk("%s: exit with status = %d\n", __func__, res); return cpu_to_be32(res); } @@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; - dprintk("%s: -->\n", __func__); - if (!clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; @@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, goto found; } rcu_read_unlock(); - dprintk("%s: layout type %u not found\n", - __func__, dev->cbd_layout_type); continue; } @@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, out: kfree(args->devs); - dprintk("%s: exit with status = %u\n", - __func__, be32_to_cpu(res)); return res; } @@ -417,16 +400,11 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { - dprintk("%s enter. slotid %u seqid %u, slot table seqid: %u\n", - __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr); - if (args->csa_slotid > tbl->server_highest_slotid) return htonl(NFS4ERR_BADSLOT); /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %u is a replay\n", - __func__, args->csa_sequenceid); if (nfs4_test_locked_slot(tbl, slot->slot_nr)) return htonl(NFS4ERR_DELAY); /* Signal process_op to set this error on next op */ @@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp, for (j = 0; j < rclist->rcl_nrefcalls; j++) { ref = &rclist->rcl_refcalls[j]; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " - "slotid %u\n", __func__, - ((u32 *)&rclist->rcl_sessionid.data)[0], - ((u32 *)&rclist->rcl_sessionid.data)[1], - ((u32 *)&rclist->rcl_sessionid.data)[2], - ((u32 *)&rclist->rcl_sessionid.data)[3], - ref->rc_sequenceid, ref->rc_slotid); - status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, ref->rc_sequenceid, HZ >> 1) < 0; if (status) @@ -593,8 +562,6 @@ out: res->csr_status = status; trace_nfs4_cb_sequence(args, res, status); - dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, - ntohl(status), ntohl(res->csr_status)); return status; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d051fc3583a9..c14758e08d73 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } hdr->nops = ntohl(*p); - dprintk("%s: minorversion %d nops %d\n", __func__, - hdr->minorversion, hdr->nops); return 0; } @@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) - goto out; - status = decode_bitmap(xdr, args->bitmap); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_bitmap(xdr, args->bitmap); } static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) @@ -206,17 +201,12 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 4); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_RESOURCE); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); - status = decode_fh(xdr, &args->fh); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return decode_fh(xdr, &args->fh); } #if defined(CONFIG_NFS_V4_1) @@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, uint32_t iomode; p = read_buf(xdr, 4 * sizeof(uint32_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); args->cbl_layout_type = ntohl(*p++); /* Depite the spec's xdr, iomode really belongs in the FILE switch, @@ -252,37 +240,23 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, args->cbl_range.iomode = iomode; status = decode_fh(xdr, &args->cbl_fh); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_layout_stateid(xdr, &args->cbl_stateid); - if (unlikely(status != 0)) - goto out; + return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); p = xdr_decode_hyper(p, &args->cbl_fsid.minor); - } else if (args->cbl_recall_type != RETURN_ALL) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } - dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", - __func__, - args->cbl_layout_type, iomode, - args->cbl_layoutchanged, args->cbl_recall_type); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + } else if (args->cbl_recall_type != RETURN_ALL) + return htonl(NFS4ERR_BADXDR); + return 0; } static @@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, status = decode_sessionid(xdr, &args->csa_sessionid); if (status) - goto out; + return status; - status = htonl(NFS4ERR_RESOURCE); p = read_buf(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); args->csa_addr = svc_addr(rqstp); args->csa_sequenceid = ntohl(*p++); @@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, sizeof(*args->csa_rclists), GFP_KERNEL); if (unlikely(args->csa_rclists == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); @@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, } } } - status = 0; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " - "highestslotid %u cachethis %d nrclists %u\n", - __func__, - ((u32 *)&args->csa_sessionid)[0], - ((u32 *)&args->csa_sessionid)[1], - ((u32 *)&args->csa_sessionid)[2], - ((u32 *)&args->csa_sessionid)[3], - args->csa_sequenceid, args->csa_slotid, - args->csa_highestslotid, args->csa_cachethis, - args->csa_nrclists); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; out_free: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - goto out; + return status; } static __be32 decode_recallany_args(struct svc_rqst *rqstp, @@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream status = decode_fh(xdr, &args->cbnl_fh); if (unlikely(status != 0)) - goto out; - status = decode_lockowner(xdr, args); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_lockowner(xdr, args); } #endif /* CONFIG_NFS_V4_1 */ @@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, __be32 status = res->csr_status; if (unlikely(status != 0)) - goto out; + return status; status = encode_sessionid(xdr, &res->csr_sessionid); if (status) - goto out; + return status; p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) @@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, *p++ = htonl(res->csr_slotid); *p++ = htonl(res->csr_highestslotid); *p++ = htonl(res->csr_target_highestslotid); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; } static __be32 @@ -871,14 +824,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, long maxlen; __be32 res; - dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); if (unlikely(status)) return status; - dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, cps->minorversion, nop, op_nr); - switch (cps->minorversion) { case 0: status = preprocess_nfs4_op(op_nr, &op); @@ -917,7 +866,6 @@ encode_hdr: return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -937,8 +885,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r }; unsigned int nops = 0; - dprintk("%s: start\n", __func__); - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); @@ -977,7 +923,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); - dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; out_invalidcred: diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 04d15a0045e3..ee5ddbd36088 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ @@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp) kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree(clp); - - dprintk("<-- nfs_free_client()\n"); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp) if (!clp) return; - dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); nn = net_generic(clp->cl_net, nfs_net_id); if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { @@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, } smp_rmb(); - - dprintk("<-- %s found nfs_client %p for %s\n", - __func__, clp, cl_init->hostname ?: ""); return clp; } @@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) return NULL; } - dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname, rpc_ops->version); - /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); @@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); - dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", - cl_init->hostname, PTR_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); @@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) @@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, { int error; - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is already initialised */ - dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + /* the client is already initialised */ + if (clp->cl_cons_state == NFS_CS_READY) return clp; - } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - nfs_mark_client_ready(clp, NFS_CS_READY); + nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); + if (error < 0) { + nfs_put_client(clp); + clp = ERR_PTR(error); + } return clp; - -error: - nfs_mark_client_ready(clp, error); - nfs_put_client(clp); - dprintk("<-- nfs_init_client() = xerror %d\n", error); - return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_init_client); @@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server, struct nfs_client *clp; int error; - dprintk("--> nfs_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) @@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + if (IS_ERR(clp)) return PTR_ERR(clp); - } server->nfs_client = clp; @@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server, server->mountd_protocol = data->mount_server.protocol; server->namelen = data->namlen; - dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; error: server->nfs_client = NULL; nfs_put_client(clp); - dprintk("<-- nfs_init_server() = xerror %d\n", error); return error; } @@ -798,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs struct nfs_client *clp = server->nfs_client; int error; - dprintk("--> nfs_probe_fsinfo()\n"); - if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) - goto out_error; + return error; } fsinfo.fattr = fattr; @@ -811,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) - goto out_error; + return error; nfs_server_set_fsinfo(server, &fsinfo); @@ -826,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs server->namelen = pathinfo.max_namelen; } - dprintk("<-- nfs_probe_fsinfo() = 0\n"); return 0; - -out_error: - dprintk("nfs_probe_fsinfo: error = %d\n", -error); - return error; } EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); @@ -927,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server); */ void nfs_free_server(struct nfs_server *server) { - dprintk("--> nfs_free_server()\n"); - nfs_server_remove_lists(server); if (server->destroy != NULL) @@ -946,7 +914,6 @@ void nfs_free_server(struct nfs_server *server) nfs_free_iostats(server->io_stats); kfree(server); nfs_release_automount_timer(); - dprintk("<-- nfs_free_server()\n"); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -1026,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fattr *fattr_fsinfo; int error; - dprintk("--> nfs_clone_server(,%llx:%llx,)\n", - (unsigned long long) fattr->fsid.major, - (unsigned long long) fattr->fsid.minor); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1061,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - dprintk("Cloned FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - error = nfs_start_lockd(server); if (error < 0) goto out_free_server; @@ -1073,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); - dprintk("<-- nfs_clone_server() = %p\n", server); return server; out_free_server: nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); - dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f92ba8d6c556..32ccd7754f8a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate_shared = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,7 +145,6 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -170,27 +169,6 @@ typedef struct { unsigned int eof:1; } nfs_readdir_descriptor_t; -/* - * The caller is responsible for calling nfs_readdir_release_array(page) - */ -static -struct nfs_cache_array *nfs_readdir_get_array(struct page *page) -{ - void *ptr; - if (page == NULL) - return ERR_PTR(-EIO); - ptr = kmap(page); - if (ptr == NULL) - return ERR_PTR(-ENOMEM); - return ptr; -} - -static -void nfs_readdir_release_array(struct page *page) -{ - kunmap(page); -} - /* * we are freeing strings created by nfs_add_to_readdir_array() */ @@ -201,20 +179,11 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - if (atomic_dec_and_test(&array->refcount)) - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); } -static bool grab_page(struct page *page) -{ - struct nfs_cache_array *array = kmap_atomic(page); - bool res = atomic_inc_not_zero(&array->refcount); - kunmap_atomic(array); - return res; -} - /* * the caller is responsible for freeing qstr.name * when called by nfs_readdir_add_to_array, the strings will be freed in @@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array = kmap(page); struct nfs_cache_array_entry *cache_entry; int ret; - if (IS_ERR(array)) - return PTR_ERR(array); - cache_entry = &array->array[array->size]; /* Check that this entry lies within the page bounds */ @@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) array->eof_index = array->size; out: - nfs_readdir_release_array(page); + kunmap(page); return ret; } @@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out; - } + array = kmap(desc->page); if (*desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); return status; } @@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = nfs_readdir_get_array(page); - if (!IS_ERR(array)) { - array->eof_index = array->size; - status = 0; - nfs_readdir_release_array(page); - } else - status = PTR_ERR(array); + array = kmap(page); + array->eof_index = array->size; + status = 0; + kunmap(page); } put_page(scratch); @@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = nfs_readdir_get_array(page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out_label_free; - } + array = kmap(page); memset(array, 0, sizeof(struct nfs_cache_array)); - atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_pages(pages, array_size); out_release_array: - nfs_readdir_release_array(page); -out_label_free: + kunmap(page); nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); @@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - nfs_readdir_clear_array(desc->page); + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); put_page(desc->page); desc->page = NULL; } @@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - - for (;;) { - page = read_cache_page(desc->file->f_mapping, + return read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page) || grab_page(page)) - break; - put_page(page); - } - return page; } /* @@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - res = PTR_ERR(array); - goto out; - } - + array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (array->eof_index >= 0) desc->eof = 1; - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); cache_page_release(desc); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); @@ -966,11 +905,13 @@ out: static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { + struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - return -EINVAL; + offset = -EINVAL; + goto out; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } +out: + inode_unlock(inode); return offset; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c1b5fed7c863..6fb9fad2d1e6 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) nfs_direct_req_release(dreq); } -static void nfs_direct_readpage_release(struct nfs_page *req) -{ - dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), - req->wb_bytes, - (long long)req_offset(req)); - nfs_release_request(req); -} - static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); - nfs_direct_readpage_release(req); + nfs_release_request(req); } out_put: if (put_dreq(dreq)) @@ -700,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) int status = data->task.tk_status; nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0) { - dprintk("NFS: %5u commit failed with error %d.\n", - data->task.tk_pid, status); + if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { - dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } - dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 668213984d68..5713eb32a45e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!IS_ERR(l_ctx)) { status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); - if (status < 0) + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + if (status < 0 && !(fl->fl_flags & FL_CLOSE)) return status; } - /* NOTE: special case - * If we're signalled while cleaning up locks on process exit, we - * still need to complete the unlock. - */ /* * Use local locking if mounted with "-onolock" or with appropriate * "-olocal_lock=" @@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + /* + * VFS doesn't require the open mode to match a flock() lock's type. + * NFS, however, may simulate flock() locking with posix locking which + * requires the open mode to match the lock type. + */ + switch (fl->fl_type) { + case F_UNLCK: return do_unlk(filp, cmd, fl, is_local); + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index acd30baca461..1cf85d65b748 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino, fl = FILELAYOUT_LSEG(lseg); status = filelayout_check_deviceid(lo, fl, gfp_flags); - if (status) - lseg = ERR_PTR(status); -out: - if (IS_ERR(lseg)) + if (status) { pnfs_put_lseg(lseg); + lseg = ERR_PTR(status); + } +out: return lseg; } @@ -933,6 +933,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 42dedf2d625f..f5714ee01000 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -846,6 +846,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: + pnfs_generic_pg_check_layout(pgio); /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -894,6 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int status; retry: + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -1800,16 +1802,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); if (!ds_cred) - return PNFS_NOT_ATTEMPTED; + goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1839,6 +1841,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) sync, RPC_TASK_SOFTCONN); put_rpccred(ds_cred); return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_avoid_mds_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -2354,10 +2361,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return 0; } +static int +ff_layout_set_layoutdriver(struct nfs_server *server, + const struct nfs_fh *dummy) +{ +#if IS_ENABLED(CONFIG_NFS_V4_2) + server->caps |= NFS_CAP_LAYOUTSTATS; +#endif + return 0; +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = ff_layout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 457cfeb1d5c1..6df7a0cf5660 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; - if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + /* + * check for valid major/minor combination. + * currently we support dataserver which talk: + * v3, v4.0, v4.1, v4.2 + */ + if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) || + (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) { dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, i, ds_versions[i].version, ds_versions[i].minor_version); @@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ - if (ds->ds_clp) { + if (!status) { max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f489a5a71bd5..1de93ba78dc9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat, if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(path->dentry); + if (!(server->flags & NFS_MOUNT_NOAC)) + nfs_readdirplus_parent_cache_miss(path->dentry); + else + nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); } else nfs_readdirplus_parent_cache_hit(path->dentry); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e03..e9b4c3320e37 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -495,7 +495,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -756,9 +755,13 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EACCES: + case -EDQUOT: + case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: + case -ESTALE: case -E2BIG: return true; default: diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 786f17580582..1a224a33a6c2 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; - dprintk("--> nfs_d_automount()\n"); - - mnt = ERR_PTR(-ESTALE); if (IS_ROOT(path->dentry)) - goto out_nofree; + return ERR_PTR(-ESTALE); mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); @@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fh == NULL || fattr == NULL) goto out; - dprintk("%s: enter\n", __func__); - mnt = server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; - dprintk("%s: done, success\n", __func__); mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); @@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path) out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); -out_nofree: - if (IS_ERR(mnt)) - dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); - else - dprintk("<-- %s() = %p\n", __func__, mnt); return mnt; } @@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, .fattr = fattr, .authflavor = authflavor, }; - struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct vfsmount *mnt; char *page = (char *) __get_free_page(GFP_USER); char *devname; - dprintk("--> nfs_do_submount()\n"); - - dprintk("%s: submounting on %pd2\n", __func__, - dentry); if (page == NULL) - goto out; - devname = nfs_devname(dentry, page, PAGE_SIZE); - mnt = (struct vfsmount *)devname; - if (IS_ERR(devname)) - goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); -free_page: - free_page((unsigned long)page); -out: - dprintk("%s: done\n", __func__); + return ERR_PTR(-ENOMEM); - dprintk("<-- nfs_do_submount() = %p\n", mnt); + devname = nfs_devname(dentry, page, PAGE_SIZE); + if (IS_ERR(devname)) + mnt = (struct vfsmount *)devname; + else + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); + + free_page((unsigned long)page); return mnt; } EXPORT_SYMBOL_GPL(nfs_do_submount); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32..0c07b567118d 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } +static void nfs3_nlm_alloc_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + get_nfs_open_context(l_ctx->open_context); + nfs_get_lock_context(l_ctx->open_context); + } +} + +static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) + return nfs_async_iocounter_wait(task, l_ctx); + return false; + +} + +static void nfs3_nlm_release_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + struct nfs_open_context *ctx; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + ctx = l_ctx->open_context; + nfs_put_lock_context(l_ctx); + put_nfs_open_context(ctx); + } +} + +const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { + .nlmclnt_alloc_call = nfs3_nlm_alloc_call, + .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, + .nlmclnt_release_call = nfs3_nlm_release_call, +}; + static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); + struct nfs_lock_context *l_ctx = NULL; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + int status; - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + if (fl->fl_flags & FL_CLOSE) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + l_ctx = NULL; + else + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); + } + + status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx); + + if (l_ctx) + nfs_put_lock_context(l_ctx); + + return status; } static int nfs3_have_delegation(struct inode *inode, fmode_t flags) @@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, .file_ops = &nfs_file_operations, + .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, .getroot = nfs3_proc_get_root, .submount = nfs_submount, .try_mount = nfs_try_mount, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1e486c73ec94..929d09a5310a 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (!nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); } else spin_unlock(&inode->i_lock); break; @@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } - - dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc..528362f69cc1 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) + decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); + encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8346ccbf2d52..692a7a8bfc7a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, struct nfs_client *old; int error; - if (clp->cl_cons_state == NFS_CS_READY) { + if (clp->cl_cons_state == NFS_CS_READY) /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); return clp; - } /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; @@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error: nfs_mark_client_ready(clp, error); nfs_put_client(clp); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); return ERR_PTR(error); } @@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; } +static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, + struct nfs_client **prev, struct nfs_net *nn) +{ + int status; + + if (pos->rpc_ops != new->rpc_ops) + return 1; + + if (pos->cl_minorversion != new->cl_minorversion) + return 1; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(*prev); + *prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + + if (status < 0) + return status; + } + + if (pos->cl_cons_state != NFS_CS_READY) + return 1; + + if (pos->cl_clientid != new->cl_clientid) + return 1; + + /* NFSv4.1 always uses the uniform string, however someone + * might switch the uniquifier string on us. + */ + if (!nfs4_match_client_owner_id(pos, new)) + return 1; + + return 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos" */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - if (status < 0) - goto out; - status = -NFS4ERR_STALE_CLIENTID; - spin_lock(&nn->nfs_client_lock); - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (pos->cl_clientid != new->cl_clientid) - continue; - - if (!nfs4_match_client_owner_id(pos, new)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out_unlock; + if (status != 0) continue; /* * We just sent a new SETCLIENTID, which should have @@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = NULL; *result = pos; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); goto out; case -ERESTARTSYS: case -ETIMEDOUT: @@ -567,36 +582,22 @@ int nfs40_walk_client_list(struct nfs_client *new, */ nfs4_schedule_path_down_recovery(pos); default: + spin_lock(&nn->nfs_client_lock); goto out; } spin_lock(&nn->nfs_client_lock); } +out_unlock: spin_unlock(&nn->nfs_client_lock); /* No match found. The server lost our clientid */ out: nfs_put_client(prev); - dprintk("NFS: <-- %s status = %d\n", __func__, status); return status; } #ifdef CONFIG_NFS_V4_1 -/* - * Returns true if the client IDs match - */ -static bool nfs4_match_clientids(u64 a, u64 b) -{ - if (a != b) { - dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a, b); - return false; - } - dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a, b); - return true; -} - /* * Returns true if the server major ids match */ @@ -605,36 +606,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { if (o1->major_id_sz != o2->major_id_sz) - goto out_major_mismatch; - if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) - goto out_major_mismatch; - - dprintk("NFS: --> %s server owner major IDs match\n", __func__); - return true; - -out_major_mismatch: - dprintk("NFS: --> %s server owner major IDs do not match\n", - __func__); - return false; -} - -/* - * Returns true if server minor ids match - */ -static bool -nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, - struct nfs41_server_owner *o2) -{ - /* Check eir_server_owner so_minor_id */ - if (o1->minor_id != o2->minor_id) - goto out_minor_mismatch; - - dprintk("NFS: --> %s server owner minor IDs match\n", __func__); - return true; - -out_minor_mismatch: - dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); - return false; + return false; + return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0; } /* @@ -645,18 +618,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, struct nfs41_server_scope *s2) { if (s1->server_scope_sz != s2->server_scope_sz) - goto out_scope_mismatch; - if (memcmp(s1->server_scope, s2->server_scope, - s1->server_scope_sz) != 0) - goto out_scope_mismatch; - - dprintk("NFS: --> %s server scopes match\n", __func__); - return true; - -out_scope_mismatch: - dprintk("NFS: --> %s server scopes do not match\n", - __func__); - return false; + return false; + return memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) == 0; } /** @@ -680,7 +644,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, struct rpc_xprt *xprt) { /* Check eir_clientid */ - if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + if (clp->cl_clientid != res->clientid) goto out_err; /* Check eir_server_owner so_major_id */ @@ -689,8 +653,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, goto out_err; /* Check eir_server_owner so_minor_id */ - if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, - res->server_owner)) + if (clp->cl_serverowner->minor_id != res->server_owner->minor_id) goto out_err; /* Check eir_server_scope */ @@ -739,33 +702,10 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos == new) goto found; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos", especially the client - * ID and serverowner fields. Wait for CREATE_SESSION - * to finish. */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); - if (status < 0) - break; - status = -NFS4ERR_STALE_CLIENTID; - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out; + if (status != 0) continue; /* @@ -777,23 +717,15 @@ int nfs41_walk_client_list(struct nfs_client *new, new->cl_serverowner)) continue; - /* Unlike NFSv4.0, we know that NFSv4.1 always uses the - * uniform string, however someone might switch the - * uniquifier string on us. - */ - if (!nfs4_match_client_owner_id(pos, new)) - continue; found: atomic_inc(&pos->cl_count); *result = pos; status = 0; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); break; } +out: spin_unlock(&nn->nfs_client_lock); - dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); return status; } @@ -916,9 +848,6 @@ static int nfs4_set_client(struct nfs_server *server, .timeparms = timeparms, }; struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -927,15 +856,11 @@ static int nfs4_set_client(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } + if (IS_ERR(clp)) + return PTR_ERR(clp); - if (server->nfs_client == clp) { - error = -ELOOP; - goto error; - } + if (server->nfs_client == clp) + return -ELOOP; /* * Query for the lease time on clientid setup or renewal @@ -947,11 +872,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; } /* @@ -982,7 +903,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, }; - struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) @@ -998,10 +918,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init); - - dprintk("<-- %s %p\n", __func__, clp); - return clp; + return nfs_get_client(&cl_init); } EXPORT_SYMBOL_GPL(nfs4_set_ds_client); @@ -1098,8 +1015,6 @@ static int nfs4_init_server(struct nfs_server *server, struct rpc_timeout timeparms; int error; - dprintk("--> nfs4_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); @@ -1127,7 +1042,7 @@ static int nfs4_init_server(struct nfs_server *server, data->minorversion, data->net); if (error < 0) - goto error; + return error; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1138,16 +1053,10 @@ static int nfs4_init_server(struct nfs_server *server, server->acregmax = data->acregmax * HZ; server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; - server->port = data->nfs_server.port; - - error = nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; + return nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); } /* @@ -1163,8 +1072,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, bool auth_probe; int error; - dprintk("--> nfs4_create_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1180,12 +1087,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (error < 0) goto error; - dprintk("<-- nfs4_create_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); } @@ -1200,8 +1105,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, bool auth_probe; int error; - dprintk("--> nfs4_create_referral_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1235,12 +1138,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } @@ -1300,31 +1201,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr *localaddr = (struct sockaddr *)&address; int error; - dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, - (unsigned long long)server->fsid.major, - (unsigned long long)server->fsid.minor, - hostname); - error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); - if (error != 0) { - dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; error = rpc_localaddr(clnt, localaddr, sizeof(address)); - if (error != 0) { - dprintk("<-- %s(): rpc_localaddr returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; - error = -EAFNOSUPPORT; - if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { - dprintk("<-- %s(): rpc_ntop returned %d\n", - __func__, error); - goto out; - } + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) + return -EAFNOSUPPORT; nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, @@ -1333,21 +1219,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); - dprintk("<-- %s(): nfs4_set_client returned %d\n", - __func__, error); - goto out; + return error; } if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); nfs_server_insert_lists(server); - error = nfs_probe_destination(server); - if (error < 0) - goto out; - - dprintk("<-- %s() succeeded\n", __func__); - -out: - return error; + return nfs_probe_destination(server); } diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 039b3eb6d834..ac8406018962 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p struct nfs_fsinfo fsinfo; int ret = -ENOMEM; - dprintk("--> nfs4_get_rootfh()\n"); - fsinfo.fattr = nfs_alloc_fattr(); if (fsinfo.fattr == NULL) goto out; @@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); - dprintk("<-- nfs4_get_rootfh() = %d\n", ret); return ret; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d8b040bd9814..7d531da1bae3 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __func__); return mnt; } @@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: enter\n", __func__); - page = alloc_page(GFP_KERNEL); if (page == NULL) - goto out; + return mnt; fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (fs_locations == NULL) @@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * out_free: __free_page(page); kfree(fs_locations); -out: - dprintk("%s: done\n", __func__); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 201ca3f2c4ba..c08c46a3b8cd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; if (slot->interrupted) { - slot->interrupted = 0; + if (res->sr_status != -NFS4ERR_DELAY) + slot->interrupted = 0; interrupted = true; } @@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + nfs4_sequence_free_slot(&o_res->seq_res); nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + } return 0; } @@ -3265,6 +3268,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f .rpc_resp = &res, }; int status; + int i; bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_FH_EXPIRE_TYPE | @@ -3330,8 +3334,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + + /* Avoid a regression due to buggy server */ + for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++) + res.exclcreat_bitmask[i] &= res.attr_bitmask[i]; memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, sizeof(server->exclcreat_bitmask)); + server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -4610,7 +4619,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; @@ -4804,8 +4813,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); - if (data == NULL) + if (data == NULL) { + nfs_put_client(clp); return -ENOMEM; + } data->client = clp; data->timestamp = jiffies; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, @@ -5782,6 +5793,7 @@ struct nfs4_unlockdata { struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; struct file_lock fl; struct nfs_server *server; unsigned long timestamp; @@ -5806,6 +5818,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); + p->l_ctx = nfs_get_lock_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5816,6 +5829,7 @@ static void nfs4_locku_release_calldata(void *data) struct nfs4_unlockdata *calldata = data; nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); + nfs_put_lock_context(calldata->l_ctx); put_nfs_open_context(calldata->ctx); kfree(calldata); } @@ -5857,6 +5871,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) && + nfs_async_iocounter_wait(task, calldata->l_ctx)) + return; + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); @@ -5908,6 +5926,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, * canceled lock is passed in, and it won't be an unlock. */ fl->fl_type = F_UNLCK; + if (fl->fl_flags & FL_CLOSE) + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); if (data == NULL) { @@ -6445,9 +6465,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) ctx = nfs_file_open_context(filp); state = ctx->state; - if (request->fl_start < 0 || request->fl_end < 0) - return -EINVAL; - if (IS_GETLK(cmd)) { if (state != NULL) return nfs4_proc_getlk(state, F_GETLK, request); @@ -6470,20 +6487,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; - /* - * Don't rely on the VFS having checked the file open mode, - * since it won't do this for flock() locks. - */ - switch (request->fl_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -7155,8 +7158,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, }; struct rpc_task *task; - dprintk("--> %s\n", __func__); - nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) args.dir = NFS4_CDFC4_FORE; @@ -7176,24 +7177,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); - status = -EIO; - goto out; + return -EIO; } if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); - status = -EIO; - goto out; + return -EIO; } if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); - status = -EIO; - goto out; + return -EIO; } } -out: - dprintk("<-- %s status= %d\n", __func__, status); + return status; } @@ -7459,15 +7456,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; - int status = -EIO; + int status; if (!atomic_inc_not_zero(&clp->cl_count)) - goto out; + return -EIO; - status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) - goto out; + if (!calldata) { + nfs_put_client(clp); + return -ENOMEM; + } if (!xprt) nfs4_init_boot_verifier(clp, &verifier); @@ -7476,10 +7474,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (status) goto out_calldata; - dprintk("NFS call exchange_id auth=%s, '%s'\n", - clp->cl_rpcclient->cl_auth->au_ops->au_name, - clp->cl_owner_id); - calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); status = -ENOMEM; @@ -7545,13 +7539,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, rpc_put_task(task); out: - if (clp->cl_implid != NULL) - dprintk("NFS reply exchange_id: Server Implementation ID: " - "domain: %s, name: %s, date: %llu,%u\n", - clp->cl_implid->domain, clp->cl_implid->name, - clp->cl_implid->date.seconds, - clp->cl_implid->date.nseconds); - dprintk("NFS reply exchange_id: %d\n", status); return status; out_impl_id: @@ -7769,17 +7756,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); nfs4_set_sequence_privileged(&args.la_seq_args); - dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); if (IS_ERR(task)) - status = PTR_ERR(task); - else { - status = task->tk_status; - rpc_put_task(task); - } - dprintk("<-- %s return %d\n", __func__, status); + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); return status; } @@ -8180,6 +8163,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + break; default: nfs4_schedule_lease_recovery(clp); } @@ -8258,7 +8247,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, if (status == 0) status = task->tk_status; rpc_put_task(task); - return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8357,6 +8345,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, */ pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); pnfs_free_lseg_list(&head); status = -EAGAIN; goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8156bad6b441..b34de036501b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } -static void nfs4_reclaim_complete(struct nfs_client *clp, +static int nfs4_reclaim_complete(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops, struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp, cred); + return ops->reclaim_complete(clp, cred); + return 0; } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { const struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; + int err; if (!nfs4_state_clear_reclaim_reboot(clp)) return; ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); - nfs4_reclaim_complete(clp, ops, cred); + err = nfs4_reclaim_complete(clp, ops, cred); put_rpccred(cred); + if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 80ce289eea05..3aebfdc82b30 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, + const umode_t *umask, const struct nfs_server *server, - bool excl_check, const umode_t *umask) + const uint32_t attrmask[]) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. */ - if (iap->ia_valid & ATTR_SIZE) { + if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } - if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) - umask = NULL; if (iap->ia_valid & ATTR_MODE) { - if (umask) { + if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) { bmval[2] |= FATTR4_WORD2_MODE_UMASK; len += 8; - } else { + } else if (attrmask[1] & FATTR4_WORD1_MODE) { bmval[1] |= FATTR4_WORD1_MODE; len += 4; } } - if (iap->ia_valid & ATTR_UID) { + if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", @@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } - if (iap->ia_valid & ATTR_GID) { + if ((iap->ia_valid & ATTR_GID) && + (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) { owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", @@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; - } else if (iap->ia_valid & ATTR_ATIME) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 4; + if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 16; + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 4; + } } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; - } else if (iap->ia_valid & ATTR_MTIME) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 4; + if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 16; + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 4; + } } - if (excl_check) { - const u32 *excl_bmval = server->exclcreat_bitmask; - bmval[0] &= excl_bmval[0]; - bmval[1] &= excl_bmval[1]; - bmval[2] &= excl_bmval[2]; - - if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) - label = NULL; - } - - if (label) { + if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } @@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false, - &create->umask); + encode_attrs(xdr, create->attrs, create->label, &create->umask, + create->server, create->server->attr_bitmask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->exclcreat_bitmask); } } @@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); + encode_attrs(xdr, arg->iap, arg->label, NULL, server, + server->attr_bitmask); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { - NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( - NFS_I(inode)->layout, xdr, args); - } else { - encode_uint32(xdr, args->layoutupdate_len); - if (args->layoutupdate_pages) { - xdr_write_pages(xdr, args->layoutupdate_pages, 0, - args->layoutupdate_len); - } - } + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); return 0; } @@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { - const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_unlock(&args->inode->i_lock); if (args->ld_private->ops && args->ld_private->ops->encode) args->ld_private->ops->encode(xdr, args, args->ld_private); - else if (lr_ops->encode_layoutreturn) - lr_ops->encode_layoutreturn(xdr, args); else encode_uint32(xdr, 0); } @@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) unsigned int i; p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; bitmap_words = be32_to_cpup(p++); if (bitmap_words > NFS4_OP_MAP_NUM_WORDS) return -EIO; diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild deleted file mode 100644 index ed30ea072bb8..000000000000 --- a/fs/nfs/objlayout/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the pNFS Objects Layout Driver kernel module -# -objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c deleted file mode 100644 index 049c1b1f2932..000000000000 --- a/fs/nfs/objlayout/objio_osd.c +++ /dev/null @@ -1,675 +0,0 @@ -/* - * pNFS Objects layout implementation over open-osd initiator library - * - * Copyright (C) 2009 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include - -#include "objlayout.h" -#include "../internal.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -struct objio_dev_ent { - struct nfs4_deviceid_node id_node; - struct ore_dev od; -}; - -static void -objio_free_deviceid_node(struct nfs4_deviceid_node *d) -{ - struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - - dprintk("%s: free od=%p\n", __func__, de->od.od); - osduld_put_device(de->od.od); - kfree_rcu(d, rcu); -} - -struct objio_segment { - struct pnfs_layout_segment lseg; - - struct ore_layout layout; - struct ore_components oc; -}; - -static inline struct objio_segment * -OBJIO_LSEG(struct pnfs_layout_segment *lseg) -{ - return container_of(lseg, struct objio_segment, lseg); -} - -struct objio_state { - /* Generic layer */ - struct objlayout_io_res oir; - - bool sync; - /*FIXME: Support for extra_bytes at ore_get_rw_state() */ - struct ore_io_state *ios; -}; - -/* Send and wait for a get_device_info of devices in the layout, - then look them up with the osd_initiator library */ -struct nfs4_deviceid_node * -objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, - gfp_t gfp_flags) -{ - struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode = NULL; - struct osd_dev *od; - struct osd_dev_info odi; - bool retry_flag = true; - __be32 *p; - int err; - - deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); - if (!deviceaddr) - return NULL; - - p = page_address(pdev->pages[0]); - pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); - - odi.systemid_len = deviceaddr->oda_systemid.len; - if (odi.systemid_len > sizeof(odi.systemid)) { - dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", - __func__, sizeof(odi.systemid)); - err = -EINVAL; - goto out; - } else if (odi.systemid_len) - memcpy(odi.systemid, deviceaddr->oda_systemid.data, - odi.systemid_len); - odi.osdname_len = deviceaddr->oda_osdname.len; - odi.osdname = (u8 *)deviceaddr->oda_osdname.data; - - if (!odi.osdname_len && !odi.systemid_len) { - dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", - __func__); - err = -ENODEV; - goto out; - } - -retry_lookup: - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - err = PTR_ERR(od); - dprintk("%s: osduld_info_lookup => %d\n", __func__, err); - if (err == -ENODEV && retry_flag) { - err = objlayout_autologin(deviceaddr); - if (likely(!err)) { - retry_flag = false; - goto retry_lookup; - } - } - goto out; - } - - dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); - - ode = kzalloc(sizeof(*ode), gfp_flags); - if (!ode) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - goto out; - } - - nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); - kfree(deviceaddr); - - ode->od.od = od; - return &ode->id_node; - -out: - kfree(deviceaddr); - return NULL; -} - -static void copy_single_comp(struct ore_components *oc, unsigned c, - struct pnfs_osd_object_cred *src_comp) -{ - struct ore_comp *ocomp = &oc->comps[c]; - - WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ - WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - - ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; - ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - - memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); -} - -static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, - struct objio_segment **pseg) -{ -/* This is the in memory structure of the objio_segment - * - * struct __alloc_objio_segment { - * struct objio_segment olseg; - * struct ore_dev *ods[numdevs]; - * struct ore_comp comps[numdevs]; - * } *aolseg; - * NOTE: The code as above compiles and runs perfectly. It is elegant, - * type safe and compact. At some Past time Linus has decided he does not - * like variable length arrays, For the sake of this principal we uglify - * the code as below. - */ - struct objio_segment *lseg; - size_t lseg_size = sizeof(*lseg) + - numdevs * sizeof(lseg->oc.ods[0]) + - numdevs * sizeof(*lseg->oc.comps); - - lseg = kzalloc(lseg_size, gfp_flags); - if (unlikely(!lseg)) { - dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, - numdevs, lseg_size); - return -ENOMEM; - } - - lseg->oc.numdevs = numdevs; - lseg->oc.single_comp = EC_MULTPLE_COMPS; - lseg->oc.ods = (void *)(lseg + 1); - lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); - - *pseg = lseg; - return 0; -} - -int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags) -{ - struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); - struct objio_segment *objio_seg; - struct pnfs_osd_xdr_decode_layout_iter iter; - struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred src_comp; - unsigned cur_comp; - int err; - - err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); - if (unlikely(err)) - return err; - - err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); - if (unlikely(err)) - return err; - - objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; - objio_seg->layout.group_width = layout.olo_map.odm_group_width; - objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; - objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - - err = ore_verify_layout(layout.olo_map.odm_num_comps, - &objio_seg->layout); - if (unlikely(err)) - goto err; - - objio_seg->oc.first_dev = layout.olo_comps_index; - cur_comp = 0; - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { - struct nfs4_deviceid_node *d; - struct objio_dev_ent *ode; - - copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - - d = nfs4_find_get_deviceid(server, - &src_comp.oc_object_id.oid_device_id, - pnfslay->plh_lc_cred, gfp_flags); - if (!d) { - err = -ENXIO; - goto err; - } - - ode = container_of(d, struct objio_dev_ent, id_node); - objio_seg->oc.ods[cur_comp++] = &ode->od; - } - /* pnfs_osd_xdr_decode_layout_comp returns false on error */ - if (unlikely(err)) - goto err; - - *outp = &objio_seg->lseg; - return 0; - -err: - kfree(objio_seg); - dprintk("%s: Error: return %d\n", __func__, err); - *outp = NULL; - return err; -} - -void objio_free_lseg(struct pnfs_layout_segment *lseg) -{ - int i; - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - - for (i = 0; i < objio_seg->oc.numdevs; i++) { - struct ore_dev *od = objio_seg->oc.ods[i]; - struct objio_dev_ent *ode; - - if (!od) - break; - ode = container_of(od, typeof(*ode), od); - nfs4_put_deviceid_node(&ode->id_node); - } - kfree(objio_seg); -} - -static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, - struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, - loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, - struct objio_state **outp) -{ - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct ore_io_state *ios; - int ret; - struct __alloc_objio_state { - struct objio_state objios; - struct pnfs_osd_ioerr ioerrs[objio_seg->oc.numdevs]; - } *aos; - - aos = kzalloc(sizeof(*aos), gfp_flags); - if (unlikely(!aos)) - return -ENOMEM; - - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, - aos->ioerrs, rpcdata, pnfs_layout_type); - - ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, - offset, count, &ios); - if (unlikely(ret)) { - kfree(aos); - return ret; - } - - ios->pages = pages; - ios->pgbase = pgbase; - ios->private = aos; - BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - - aos->objios.sync = 0; - aos->objios.ios = ios; - *outp = &aos->objios; - return 0; -} - -void objio_free_result(struct objlayout_io_res *oir) -{ - struct objio_state *objios = container_of(oir, struct objio_state, oir); - - ore_put_io_state(objios->ios); - kfree(objios); -} - -static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) -{ - switch (oep) { - case OSD_ERR_PRI_NO_ERROR: - return (enum pnfs_osd_errno)0; - - case OSD_ERR_PRI_CLEAR_PAGES: - BUG_ON(1); - return 0; - - case OSD_ERR_PRI_RESOURCE: - return PNFS_OSD_ERR_RESOURCE; - case OSD_ERR_PRI_BAD_CRED: - return PNFS_OSD_ERR_BAD_CRED; - case OSD_ERR_PRI_NO_ACCESS: - return PNFS_OSD_ERR_NO_ACCESS; - case OSD_ERR_PRI_UNREACHABLE: - return PNFS_OSD_ERR_UNREACHABLE; - case OSD_ERR_PRI_NOT_FOUND: - return PNFS_OSD_ERR_NOT_FOUND; - case OSD_ERR_PRI_NO_SPACE: - return PNFS_OSD_ERR_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case OSD_ERR_PRI_EIO: - return PNFS_OSD_ERR_EIO; - } -} - -static void __on_dev_error(struct ore_io_state *ios, - struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, - u64 dev_offset, u64 dev_len) -{ - struct objio_state *objios = ios->private; - struct pnfs_osd_objid pooid; - struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); - /* FIXME: what to do with more-then-one-group layouts. We need to - * translate from ore_io_state index to oc->comps index - */ - unsigned comp = dev_index; - - pooid.oid_device_id = ode->id_node.deviceid; - pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; - pooid.oid_object_id = ios->oc->comps[comp].obj.id; - - objlayout_io_set_result(&objios->oir, comp, - &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, !ios->reading); -} - -/* - * read - */ -static void _read_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) - status = ios->length; - else - status = ret; - - objlayout_read_done(&objios->oir, status, objios->sync); -} - -int objio_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, - GFP_KERNEL, &objios); - if (unlikely(ret)) - return ret; - - objios->ios->done = _read_done; - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_read(objios->ios); - if (unlikely(ret)) - objio_free_result(&objios->oir); - return ret; -} - -/* - * write - */ -static void _write_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) { - /* FIXME: should be based on the OSD's persistence model - * See OSD2r05 Section 4.13 Data persistence model */ - objios->oir.committed = NFS_FILE_SYNC; - status = ios->length; - } else { - status = ret; - } - - objlayout_write_done(&objios->oir, status, objios->sync); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct objio_state *objios = priv; - struct nfs_pgio_header *hdr = objios->oir.rpcdata; - struct address_space *mapping = hdr->inode->i_mapping; - pgoff_t index = offset / PAGE_SIZE; - struct page *page; - loff_t i_size = i_size_read(hdr->inode); - - if (offset >= i_size) { - *uptodate = true; - dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); - return ZERO_PAGE(0); - } - - page = find_get_page(mapping, index); - if (!page) { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s: grab_cache_page Failed index=0x%lx\n", - __func__, index); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); - return page; -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - dprintk("%s: index=0x%lx\n", __func__, - (page == ZERO_PAGE(0)) ? -1UL : page->index); - if (ZERO_PAGE(0) != page) - put_page(page); - return; -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, - &objios); - if (unlikely(ret)) - return ret; - - objios->sync = 0 != (how & FLUSH_SYNC); - objios->ios->r4w = &_r4w_op; - - if (!objios->sync) - objios->ios->done = _write_done; - - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_write(objios->ios); - if (unlikely(ret)) { - objio_free_result(&objios->oir); - return ret; - } - - if (objios->sync) - _write_done(objios->ios, objios); - - return 0; -} - -/* - * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number - * of bytes (maximum @req->wb_bytes) that can be coalesced. - */ -static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, struct nfs_page *req) -{ - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); - unsigned int size; - - size = pnfs_generic_pg_test(pgio, prev, req); - - if (!size || mirror->pg_count + req->wb_bytes > - (unsigned long)pgio->pg_layout_private) - return 0; - - return min(size, req->wb_bytes); -} - -static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - pnfs_generic_pg_init_read(pgio, req); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; -} - -static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, - unsigned long *stripe_end) -{ - u32 stripe_off; - unsigned stripe_size; - - if (layout->raid_algorithm == PNFS_OSD_RAID_0) - return true; - - stripe_size = layout->stripe_unit * - (layout->group_width - layout->parity); - - div_u64_rem(offset, stripe_size, &stripe_off); - if (!stripe_off) - return true; - - *stripe_end = stripe_size - stripe_off; - return false; -} - -static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - unsigned long stripe_end = 0; - u64 wb_size; - - if (pgio->pg_dreq == NULL) - wb_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - if (req->wb_offset || - !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, - &OBJIO_LSEG(pgio->pg_lseg)->layout, - &stripe_end)) { - pgio->pg_layout_private = (void *)stripe_end; - } else { - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; - } -} - -static const struct nfs_pageio_ops objio_pg_read_ops = { - .pg_init = objio_init_read, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_readpages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static const struct nfs_pageio_ops objio_pg_write_ops = { - .pg_init = objio_init_write, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_writepages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static struct pnfs_layoutdriver_type objlayout_type = { - .id = LAYOUT_OSD2_OBJECTS, - .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR | - PNFS_LAYOUTRET_ON_ERROR, - - .max_deviceinfo_size = PAGE_SIZE, - .owner = THIS_MODULE, - .alloc_layout_hdr = objlayout_alloc_layout_hdr, - .free_layout_hdr = objlayout_free_layout_hdr, - - .alloc_lseg = objlayout_alloc_lseg, - .free_lseg = objlayout_free_lseg, - - .read_pagelist = objlayout_read_pagelist, - .write_pagelist = objlayout_write_pagelist, - .pg_read_ops = &objio_pg_read_ops, - .pg_write_ops = &objio_pg_write_ops, - - .sync = pnfs_generic_sync, - - .free_deviceid_node = objio_free_deviceid_node, - - .encode_layoutcommit = objlayout_encode_layoutcommit, - .encode_layoutreturn = objlayout_encode_layoutreturn, -}; - -MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); -MODULE_AUTHOR("Benny Halevy "); -MODULE_LICENSE("GPL"); - -static int __init -objlayout_init(void) -{ - int ret = pnfs_register_layoutdriver(&objlayout_type); - - if (ret) - printk(KERN_INFO - "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", - __func__, ret); - else - printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", - __func__); - return ret; -} - -static void __exit -objlayout_exit(void) -{ - pnfs_unregister_layoutdriver(&objlayout_type); - printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", - __func__); -} - -MODULE_ALIAS("nfs-layouttype4-2"); - -module_init(objlayout_init); -module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c deleted file mode 100644 index 8f3d2acb81c3..000000000000 --- a/fs/nfs/objlayout/objlayout.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * pNFS Objects layout driver high level definitions - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include -#include -#include -#include -#include "objlayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD -/* - * Create a objlayout layout structure for the given inode and return it. - */ -struct pnfs_layout_hdr * -objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) -{ - struct objlayout *objlay; - - objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (!objlay) - return NULL; - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - dprintk("%s: Return %p\n", __func__, objlay); - return &objlay->pnfs_layout; -} - -/* - * Free an objlayout layout structure - */ -void -objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) -{ - struct objlayout *objlay = OBJLAYOUT(lo); - - dprintk("%s: objlay %p\n", __func__, objlay); - - WARN_ON(!list_empty(&objlay->err_list)); - kfree(objlay); -} - -/* - * Unmarshall layout and store it in pnfslay. - */ -struct pnfs_layout_segment * -objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - int status = -ENOMEM; - struct xdr_stream stream; - struct xdr_buf buf = { - .pages = lgr->layoutp->pages, - .page_len = lgr->layoutp->len, - .buflen = lgr->layoutp->len, - .len = lgr->layoutp->len, - }; - struct page *scratch; - struct pnfs_layout_segment *lseg; - - dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); - - scratch = alloc_page(gfp_flags); - if (!scratch) - goto err_nofree; - - xdr_init_decode(&stream, &buf, NULL); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); - if (unlikely(status)) { - dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, - status); - goto err; - } - - __free_page(scratch); - - dprintk("%s: Return %p\n", __func__, lseg); - return lseg; - -err: - __free_page(scratch); -err_nofree: - dprintk("%s: Err Return=>%d\n", __func__, status); - return ERR_PTR(status); -} - -/* - * Free a layout segement - */ -void -objlayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - dprintk("%s: freeing layout segment %p\n", __func__, lseg); - - if (unlikely(!lseg)) - return; - - objio_free_lseg(lseg); -} - -/* - * I/O Operations - */ -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - -static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, - struct page ***p_pages, unsigned *p_pgbase, - u64 offset, unsigned long count) -{ - u64 lseg_end_offset; - - BUG_ON(offset < lseg->pls_range.offset); - lseg_end_offset = end_offset(lseg->pls_range.offset, - lseg->pls_range.length); - BUG_ON(offset >= lseg_end_offset); - WARN_ON(offset + count > lseg_end_offset); - - if (*p_pgbase > PAGE_SIZE) { - dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); - *p_pages += *p_pgbase >> PAGE_SHIFT; - *p_pgbase &= ~PAGE_MASK; - } -} - -/* - * I/O done common code - */ -static void -objlayout_iodone(struct objlayout_io_res *oir) -{ - if (likely(oir->status >= 0)) { - objio_free_result(oir); - } else { - struct objlayout *objlay = oir->objlay; - - spin_lock(&objlay->lock); - objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &oir->err_list); - spin_unlock(&objlay->lock); - } -} - -/* - * objlayout_io_set_result - Set an osd_error code on a specific osd comp. - * - * The @index component IO failed (error returned from target). Register - * the error for later reporting at layout-return. - */ -void -objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, - struct pnfs_osd_objid *pooid, int osd_error, - u64 offset, u64 length, bool is_write) -{ - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - - BUG_ON(index >= oir->num_comps); - if (osd_error) { - ioerr->oer_component = *pooid; - ioerr->oer_comp_offset = offset; - ioerr->oer_comp_length = length; - ioerr->oer_iswrite = is_write; - ioerr->oer_errno = osd_error; - - dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " - "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, index, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - } else { - /* User need not call if no error is reported */ - ioerr->oer_errno = 0; - } -} - -/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_read_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_read_done(hdr); -} - -void -objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) - hdr->res.count = status; - else - hdr->pnfs_error = status; - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, - status, hdr->res.eof, sync); - - if (sync) - pnfs_ld_read_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async reads. - */ -enum pnfs_try_status -objlayout_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct inode *inode = hdr->inode; - loff_t offset = hdr->args.offset; - size_t count = hdr->args.count; - int err; - loff_t eof; - - eof = i_size_read(inode); - if (unlikely(offset + count > eof)) { - if (offset >= eof) { - err = 0; - hdr->res.count = 0; - hdr->res.eof = 1; - /*FIXME: do we need to call pnfs_ld_read_done() */ - goto out; - } - count = eof - offset; - } - - hdr->res.eof = (offset + count) >= eof; - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n", - __func__, inode->i_ino, offset, count, hdr->res.eof); - - err = objio_read_pagelist(hdr); - out: - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_write_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_write_done(hdr); -} - -void -objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) { - hdr->res.count = status; - hdr->verf.committed = oir->committed; - } else { - hdr->pnfs_error = status; - } - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, - status, hdr->verf.committed, sync); - - if (sync) - pnfs_ld_write_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async writes. - */ -enum pnfs_try_status -objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - int err; - - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - err = objio_write_pagelist(hdr, how); - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -void -objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args) -{ - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct pnfs_osd_layoutupdate lou; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - - spin_lock(&objlay->lock); - lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); - lou.dsu_delta = objlay->delta_space_used; - objlay->delta_space_used = 0; - objlay->delta_space_valid = OBJ_DSU_INIT; - lou.olu_ioerr_flag = !list_empty(&objlay->err_list); - spin_unlock(&objlay->lock); - - start = xdr_reserve_space(xdr, 4); - - BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - - dprintk("%s: Return delta_space_used %lld err %d\n", __func__, - lou.dsu_delta, lou.olu_ioerr_flag); -} - -static int -err_prio(u32 oer_errno) -{ - switch (oer_errno) { - case 0: - return 0; - - case PNFS_OSD_ERR_RESOURCE: - return OSD_ERR_PRI_RESOURCE; - case PNFS_OSD_ERR_BAD_CRED: - return OSD_ERR_PRI_BAD_CRED; - case PNFS_OSD_ERR_NO_ACCESS: - return OSD_ERR_PRI_NO_ACCESS; - case PNFS_OSD_ERR_UNREACHABLE: - return OSD_ERR_PRI_UNREACHABLE; - case PNFS_OSD_ERR_NOT_FOUND: - return OSD_ERR_PRI_NOT_FOUND; - case PNFS_OSD_ERR_NO_SPACE: - return OSD_ERR_PRI_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case PNFS_OSD_ERR_EIO: - return OSD_ERR_PRI_EIO; - } -} - -static void -merge_ioerr(struct pnfs_osd_ioerr *dest_err, - const struct pnfs_osd_ioerr *src_err) -{ - u64 dest_end, src_end; - - if (!dest_err->oer_errno) { - *dest_err = *src_err; - /* accumulated device must be blank */ - memset(&dest_err->oer_component.oid_device_id, 0, - sizeof(dest_err->oer_component.oid_device_id)); - - return; - } - - if (dest_err->oer_component.oid_partition_id != - src_err->oer_component.oid_partition_id) - dest_err->oer_component.oid_partition_id = 0; - - if (dest_err->oer_component.oid_object_id != - src_err->oer_component.oid_object_id) - dest_err->oer_component.oid_object_id = 0; - - if (dest_err->oer_comp_offset > src_err->oer_comp_offset) - dest_err->oer_comp_offset = src_err->oer_comp_offset; - - dest_end = end_offset(dest_err->oer_comp_offset, - dest_err->oer_comp_length); - src_end = end_offset(src_err->oer_comp_offset, - src_err->oer_comp_length); - if (dest_end < src_end) - dest_end = src_end; - - dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; - - if ((src_err->oer_iswrite == dest_err->oer_iswrite) && - (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { - dest_err->oer_errno = src_err->oer_errno; - } else if (src_err->oer_iswrite) { - dest_err->oer_iswrite = true; - dest_err->oer_errno = src_err->oer_errno; - } -} - -static void -encode_accumulated_error(struct objlayout *objlay, __be32 *p) -{ - struct objlayout_io_res *oir, *tmp; - struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - unsigned i; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " - "is_write=%d dev(%llx:%llx) par=0x%llx " - "obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - merge_ioerr(&accumulated_err, ioerr); - } - list_del(&oir->err_list); - objio_free_result(oir); - } - - pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); -} - -void -objlayout_encode_layoutreturn(struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct pnfs_layout_hdr *pnfslay = args->layout; - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_res *oir, *tmp; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - - spin_lock(&objlay->lock); - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - __be32 *last_xdr = NULL, *p; - unsigned i; - int res = 0; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - dprintk("%s: err[%d]: errno=%d is_write=%d " - "dev(%llx:%llx) par=0x%llx obj=0x%llx " - "offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - p = pnfs_osd_xdr_ioerr_reserve_space(xdr); - if (unlikely(!p)) { - res = -E2BIG; - break; /* accumulated_error */ - } - - last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); - } - - /* TODO: use xdr_write_pages */ - if (unlikely(res)) { - /* no space for even one error descriptor */ - BUG_ON(!last_xdr); - - /* we've encountered a situation with lots and lots of - * errors and no space to encode them all. Use the last - * available slot to report the union of all the - * remaining errors. - */ - encode_accumulated_error(objlay, last_xdr); - goto loop_done; - } - list_del(&oir->err_list); - objio_free_result(oir); - } -loop_done: - spin_unlock(&objlay->lock); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - dprintk("%s: Return\n", __func__); -} - -enum { - OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, - OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, - OSD_LOGIN_UPCALL_PATHLEN = 256 -}; - -static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; - -module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), - 0600); -MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); - -struct __auto_login { - char uri[OBJLAYOUT_MAX_URI_LEN]; - char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; - char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; -}; - -static int __objlayout_upcall(struct __auto_login *login) -{ - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL - }; - char *argv[8]; - int ret; - - if (unlikely(!osd_login_prog[0])) { - dprintk("%s: osd_login_prog is disabled\n", __func__); - return -EACCES; - } - - dprintk("%s uri: %s\n", __func__, login->uri); - dprintk("%s osdname %s\n", __func__, login->osdname); - dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); - - argv[0] = (char *)osd_login_prog; - argv[1] = "-u"; - argv[2] = login->uri; - argv[3] = "-o"; - argv[4] = login->osdname; - argv[5] = "-s"; - argv[6] = login->systemid_hex; - argv[7] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - /* - * Disable the upcall mechanism if we're getting an ENOENT or - * EACCES error. The admin can re-enable it on the fly by using - * sysfs to set the objlayoutdriver.osd_login_prog module parameter once - * the problem has been fixed. - */ - if (ret == -ENOENT || ret == -EACCES) { - printk(KERN_ERR "PNFS-OBJ: %s was not found please set " - "objlayoutdriver.osd_login_prog kernel parameter!\n", - osd_login_prog); - osd_login_prog[0] = '\0'; - } - dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); - - return ret; -} - -/* Assume dest is all zeros */ -static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, - char *dest, int max_len, - const char *var_name) -{ - if (!s.len) - return; - - if (s.len >= max_len) { - pr_warn_ratelimited( - "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", - var_name, s.len, max_len); - s.len = max_len - 1; /* space for null terminator */ - } - - memcpy(dest, s.data, s.len); -} - -/* Assume sysid is all zeros */ -static void _sysid_2_hex(struct nfs4_string s, - char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) -{ - int i; - char *cur; - - if (!s.len) - return; - - if (s.len != OSD_SYSTEMID_LEN) { - pr_warn_ratelimited( - "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", - s.len); - if (s.len > OSD_SYSTEMID_LEN) - s.len = OSD_SYSTEMID_LEN; - } - - cur = sysid; - for (i = 0; i < s.len; i++) - cur = hex_byte_pack(cur, s.data[i]); -} - -int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) -{ - int rc; - struct __auto_login login; - - if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) - return -ENODEV; - - memset(&login, 0, sizeof(login)); - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_targetaddr.ota_netaddr.r_addr, - login.uri, sizeof(login.uri), "URI"); - - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_osdname, - login.osdname, sizeof(login.osdname), "OSDNAME"); - - _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); - - rc = __objlayout_upcall(&login); - if (rc > 0) /* script returns positive values */ - rc = -ENODEV; - - return rc; -} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h deleted file mode 100644 index fc94a5872ed4..000000000000 --- a/fs/nfs/objlayout/objlayout.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Data types and function declerations for interfacing with the - * pNFS standard object layout driver. - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _OBJLAYOUT_H -#define _OBJLAYOUT_H - -#include -#include -#include "../pnfs.h" - -/* - * per-inode layout - */ -struct objlayout { - struct pnfs_layout_hdr pnfs_layout; - - /* for layout_commit */ - enum osd_delta_space_valid_enum { - OBJ_DSU_INIT = 0, - OBJ_DSU_VALID, - OBJ_DSU_INVALID, - } delta_space_valid; - s64 delta_space_used; /* consumed by write ops */ - - /* for layout_return */ - spinlock_t lock; - struct list_head err_list; -}; - -static inline struct objlayout * -OBJLAYOUT(struct pnfs_layout_hdr *lo) -{ - return container_of(lo, struct objlayout, pnfs_layout); -} - -/* - * per-I/O operation state - * embedded in objects provider io_state data structure - */ -struct objlayout_io_res { - struct objlayout *objlay; - - void *rpcdata; - int status; /* res */ - int committed; /* res */ - - /* Error reporting (layout_return) */ - struct list_head err_list; - unsigned num_comps; - /* Pointer to array of error descriptors of size num_comps. - * It should contain as many entries as devices in the osd_layout - * that participate in the I/O. It is up to the io_engine to allocate - * needed space and set num_comps. - */ - struct pnfs_osd_ioerr *ioerrs; -}; - -static inline -void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, - struct pnfs_osd_ioerr *ioerrs, void *rpcdata, - struct pnfs_layout_hdr *pnfs_layout_type) -{ - oir->objlay = OBJLAYOUT(pnfs_layout_type); - oir->rpcdata = rpcdata; - INIT_LIST_HEAD(&oir->err_list); - oir->num_comps = num_comps; - oir->ioerrs = ioerrs; -} - -/* - * Raid engine I/O API - */ -extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags); -extern void objio_free_lseg(struct pnfs_layout_segment *lseg); - -/* objio_free_result will free these @oir structs received from - * objlayout_{read,write}_done - */ -extern void objio_free_result(struct objlayout_io_res *oir); - -extern int objio_read_pagelist(struct nfs_pgio_header *rdata); -extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); - -/* - * callback API - */ -extern void objlayout_io_set_result(struct objlayout_io_res *oir, - unsigned index, struct pnfs_osd_objid *pooid, - int osd_error, u64 offset, u64 length, bool is_write); - -static inline void -objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) -{ - /* If one of the I/Os errored out and the delta_space_used was - * invalid we render the complete report as invalid. Protocol mandate - * the DSU be accurate or not reported. - */ - spin_lock(&objlay->lock); - if (objlay->delta_space_valid != OBJ_DSU_INVALID) { - objlay->delta_space_valid = OBJ_DSU_VALID; - objlay->delta_space_used += space_used; - } - spin_unlock(&objlay->lock); -} - -extern void objlayout_read_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); - -/* - * exported generic objects function vectors - */ - -extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); -extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); - -extern struct pnfs_layout_segment *objlayout_alloc_lseg( - struct pnfs_layout_hdr *, - struct nfs4_layoutget_res *, - gfp_t gfp_flags); -extern void objlayout_free_lseg(struct pnfs_layout_segment *); - -extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_pgio_header *); - -extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_pgio_header *, - int how); - -extern void objlayout_encode_layoutcommit( - struct pnfs_layout_hdr *, - struct xdr_stream *, - const struct nfs4_layoutcommit_args *); - -extern void objlayout_encode_layoutreturn( - struct xdr_stream *, - const struct nfs4_layoutreturn_args *); - -extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); - -#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c deleted file mode 100644 index f093c7ec983b..000000000000 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Object-Based pNFS Layout XDR layer - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* - * The following implementation is based on RFC5664 - */ - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static __be32 * -_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) -{ - p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, - sizeof(objid->oid_device_id.data)); - - p = xdr_decode_hyper(p, &objid->oid_partition_id); - p = xdr_decode_hyper(p, &objid->oid_object_id); - return p; -} -/* - * struct pnfs_osd_opaque_cred { - * u32 cred_len; - * void *cred; - * }; // xdr size [variable] - * The return pointers are from the xdr buffer - */ -static int -_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 1); - - if (!p) - return -EINVAL; - - opaque_cred->cred_len = be32_to_cpu(*p++); - - p = xdr_inline_decode(xdr, opaque_cred->cred_len); - if (!p) - return -EINVAL; - - opaque_cred->cred = p; - return 0; -} - -/* - * struct pnfs_osd_object_cred { - * struct pnfs_osd_objid oc_object_id; - * u32 oc_osd_version; - * u32 oc_cap_key_sec; - * struct pnfs_osd_opaque_cred oc_cap_key - * struct pnfs_osd_opaque_cred oc_cap; - * }; // xdr size 32 + 4 + 4 + [variable] + [variable] - */ -static int -_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); - int ret; - - if (!p) - return -EIO; - - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p); - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); - if (unlikely(ret)) - return ret; - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); - return ret; -} - -/* - * struct pnfs_osd_data_map { - * u32 odm_num_comps; - * u64 odm_stripe_unit; - * u32 odm_group_width; - * u32 odm_group_depth; - * u32 odm_mirror_cnt; - * u32 odm_raid_algorithm; - * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 - */ -static inline int -_osd_data_map_xdr_sz(void) -{ - return 4 + 8 + 4 + 4 + 4 + 4; -} - -static __be32 * -_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) -{ - data_map->odm_num_comps = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); - data_map->odm_group_width = be32_to_cpup(p++); - data_map->odm_group_depth = be32_to_cpup(p++); - data_map->odm_mirror_cnt = be32_to_cpup(p++); - data_map->odm_raid_algorithm = be32_to_cpup(p++); - dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " - "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", - __func__, - data_map->odm_num_comps, - (unsigned long long)data_map->odm_stripe_unit, - data_map->odm_group_width, - data_map->odm_group_depth, - data_map->odm_mirror_cnt, - data_map->odm_raid_algorithm); - return p; -} - -int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) -{ - __be32 *p; - - memset(iter, 0, sizeof(*iter)); - - p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); - if (unlikely(!p)) - return -EINVAL; - - p = _osd_xdr_decode_data_map(p, &layout->olo_map); - layout->olo_comps_index = be32_to_cpup(p++); - layout->olo_num_comps = be32_to_cpup(p++); - dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, - layout->olo_comps_index, layout->olo_num_comps); - - iter->total_comps = layout->olo_num_comps; - return 0; -} - -bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, - int *err) -{ - BUG_ON(iter->decoded_comps > iter->total_comps); - if (iter->decoded_comps == iter->total_comps) - return false; - - *err = _osd_xdr_decode_object_cred(comp, xdr); - if (unlikely(*err)) { - dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " - "total_comps=%d\n", __func__, *err, - iter->decoded_comps, iter->total_comps); - return false; /* stop the loop */ - } - dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " - "key_len=%u cap_len=%u\n", - __func__, - _DEVID_LO(&comp->oc_object_id.oid_device_id), - _DEVID_HI(&comp->oc_object_id.oid_device_id), - comp->oc_object_id.oid_partition_id, - comp->oc_object_id.oid_object_id, - comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); - - iter->decoded_comps++; - return true; -} - -/* - * Get Device Information Decoding - * - * Note: since Device Information is currently done synchronously, all - * variable strings fields are left inside the rpc buffer and are only - * pointed to by the pnfs_osd_deviceaddr members. So the read buffer - * should not be freed while the returned information is in use. - */ -/* - *struct nfs4_string { - * unsigned int len; - * char *data; - *}; // size [variable] - * NOTE: Returned string points to inside the XDR buffer - */ -static __be32 * -__read_u8_opaque(__be32 *p, struct nfs4_string *str) -{ - str->len = be32_to_cpup(p++); - str->data = (char *)p; - - p += XDR_QUADLEN(str->len); - return p; -} - -/* - * struct pnfs_osd_targetid { - * u32 oti_type; - * struct nfs4_string oti_scsi_device_id; - * };// size 4 + [variable] - */ -static __be32 * -__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) -{ - u32 oti_type; - - oti_type = be32_to_cpup(p++); - targetid->oti_type = oti_type; - - switch (oti_type) { - case OBJ_TARGET_SCSI_NAME: - case OBJ_TARGET_SCSI_DEVICE_ID: - p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); - } - - return p; -} - -/* - * struct pnfs_osd_net_addr { - * struct nfs4_string r_netid; - * struct nfs4_string r_addr; - * }; - */ -static __be32 * -__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) -{ - p = __read_u8_opaque(p, &netaddr->r_netid); - p = __read_u8_opaque(p, &netaddr->r_addr); - - return p; -} - -/* - * struct pnfs_osd_targetaddr { - * u32 ota_available; - * struct pnfs_osd_net_addr ota_netaddr; - * }; - */ -static __be32 * -__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) -{ - u32 ota_available; - - ota_available = be32_to_cpup(p++); - targetaddr->ota_available = ota_available; - - if (ota_available) - p = __read_net_addr(p, &targetaddr->ota_netaddr); - - - return p; -} - -/* - * struct pnfs_osd_deviceaddr { - * struct pnfs_osd_targetid oda_targetid; - * struct pnfs_osd_targetaddr oda_targetaddr; - * u8 oda_lun[8]; - * struct nfs4_string oda_systemid; - * struct pnfs_osd_object_cred oda_root_obj_cred; - * struct nfs4_string oda_osdname; - * }; - */ - -/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does - * not have an xdr_stream - */ -static __be32 * -__read_opaque_cred(__be32 *p, - struct pnfs_osd_opaque_cred *opaque_cred) -{ - opaque_cred->cred_len = be32_to_cpu(*p++); - opaque_cred->cred = p; - return p + XDR_QUADLEN(opaque_cred->cred_len); -} - -static __be32 * -__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) -{ - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p++); - - p = __read_opaque_cred(p, &comp->oc_cap_key); - p = __read_opaque_cred(p, &comp->oc_cap); - return p; -} - -void pnfs_osd_xdr_decode_deviceaddr( - struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) -{ - p = __read_targetid(p, &deviceaddr->oda_targetid); - - p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); - - p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, - sizeof(deviceaddr->oda_lun)); - - p = __read_u8_opaque(p, &deviceaddr->oda_systemid); - - p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); - - p = __read_u8_opaque(p, &deviceaddr->oda_osdname); - - /* libosd likes this terminated in dbg. It's last, so no problems */ - deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; -} - -/* - * struct pnfs_osd_layoutupdate { - * u32 dsu_valid; - * s64 dsu_delta; - * u32 olu_ioerr_flag; - * }; xdr size 4 + 8 + 4 - */ -int -pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, - struct pnfs_osd_layoutupdate *lou) -{ - __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); - - if (!p) - return -E2BIG; - - *p++ = cpu_to_be32(lou->dsu_valid); - if (lou->dsu_valid) - p = xdr_encode_hyper(p, lou->dsu_delta); - *p++ = cpu_to_be32(lou->olu_ioerr_flag); - return 0; -} - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static inline __be32 * -pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) -{ - p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, - sizeof(object_id->oid_device_id.data)); - p = xdr_encode_hyper(p, object_id->oid_partition_id); - p = xdr_encode_hyper(p, object_id->oid_object_id); - - return p; -} - -/* - * struct pnfs_osd_ioerr { - * struct pnfs_osd_objid oer_component; - * u64 oer_comp_offset; - * u64 oer_comp_length; - * u32 oer_iswrite; - * u32 oer_errno; - * }; // xdr size 32 + 24 bytes - */ -void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) -{ - p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); - p = xdr_encode_hyper(p, ioerr->oer_comp_offset); - p = xdr_encode_hyper(p, ioerr->oer_comp_length); - *p++ = cpu_to_be32(ioerr->oer_iswrite); - *p = cpu_to_be32(ioerr->oer_errno); -} - -__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, 32 + 24); - if (unlikely(!p)) - dprintk("%s: out of xdr space\n", __func__); - - return p; -} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6e629b856a00..ad92b401326c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,19 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) -{ - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); - if (!p->pagevec) - p->npages = 0; - } - return p->pagevec != NULL; -} - struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { @@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put to sleep. + */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; @@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; desc->pg_mirrors = new; @@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *last_page; struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; + struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + gfp_t gfp_flags = GFP_KERNEL; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { - nfs_pgio_error(hdr); - desc->pg_error = -ENOMEM; - return desc->pg_error; + + if (pagecount <= ARRAY_SIZE(pg_array->page_array)) + pg_array->pagevec = pg_array->page_array; + else { + if (hdr->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; + pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); + if (!pg_array->pagevec) { + pg_array->npages = 0; + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); @@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) mirror = &desc->pg_mirrors[midx]; if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete_mirror(desc, midx); + if (index != prev->wb_index + 1) { + nfs_pageio_complete(desc); + break; + } } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd042498ce7c..adc6ec28d4b5 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, static void pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) { + struct pnfs_layout_segment *lseg; lo->plh_return_iomode = 0; lo->plh_return_seq = 0; clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } } static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) @@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_clear_layoutreturn_info(lo); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) @@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) } } } -EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); /* * is l2 fully contained in l1? @@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); + nfs_commit_inode(&nfsi->vfs_inode, 0); pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); @@ -1209,7 +1215,6 @@ out: dprintk("<-- %s status: %d\n", __func__, status); return status; } -EXPORT_SYMBOL_GPL(_pnfs_return_layout); int pnfs_commit_and_return_layout(struct inode *inode) @@ -1991,6 +1996,8 @@ out_forget: spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (!pnfs_layout_is_valid(lo)) + nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } @@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_set_plh_return_info(lo, range.iomode, 0); - /* Block LAYOUTGET */ - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() @@ -2074,11 +2083,23 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +void +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +{ + if (pgio->pg_lseg == NULL || + test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + return; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); + void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; + pnfs_generic_pg_check_layout(pgio); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2109,6 +2130,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { + pnfs_generic_pg_check_layout(pgio); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -2277,8 +2299,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) @@ -2408,10 +2442,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); - if (trypnfs == PNFS_TRY_AGAIN) - pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 590e1e35781f..2d05b756a8d6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type { gfp_t gfp_flags); int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); - void (*encode_layoutreturn) (struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); - void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; @@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ec..d40755a0984b 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + /* + * If the layout segment is invalid, then let + * pnfs_generic_retry_commit() clean up the bucket. + */ + if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) + break; + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); @@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void) get_v3_ds_connect = NULL; } } -EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca8303989..9872cf676a50 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e985..a8421d9dab6a 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index cc341fc7fd44..db7ba542559e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, not no safe to wait + * for anything to be returned to the pool. + * mempool_alloc() cannot handle that particular combination, + * so we need two separate attempts. + */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) + if (p) { memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -547,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req) { nfs_unlock_request(req); nfs_end_page_writeback(req); - nfs_release_request(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); + nfs_release_request(req); +} + +static bool +nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); } /* @@ -557,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -574,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); ret = 0; + /* If there is a fatal error that covers this write, just exit */ + if (nfs_error_is_fatal_on_server(req->wb_context->error)) + goto out_launder; + if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. + * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) { - nfs_write_error_remove_page(req); - goto out; - } + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; } nfs_redirty_request(req); ret = -EAGAIN; @@ -595,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, NFSIOS_WRITEPAGES, 1); out: return ret; +out_launder: + nfs_write_error_remove_page(req); + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -616,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. */ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -626,7 +654,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -639,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -648,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -1367,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1704,51 +1732,15 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; } -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; -} -EXPORT_SYMBOL_GPL(nfs_commit_file); - /* * COMMIT call returned */ @@ -1985,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2002,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; @@ -2107,7 +2099,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/include/linux/fs.h b/include/linux/fs.h index 26488b419965..0ad325ed71e8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -909,6 +909,8 @@ static inline struct file *get_file(struct file *f) #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 140edab64446..05728396a1a1 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -18,6 +18,7 @@ /* Dummy declarations */ struct svc_rqst; +struct rpc_task; /* * This is the set of functions for lockd->nfsd communication @@ -43,6 +44,7 @@ struct nlmclnt_initdata { u32 nfs_version; int noresvport; struct net *net; + const struct nlmclnt_operations *nlmclnt_ops; }; /* @@ -52,8 +54,26 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); -extern int nlmclnt_proc(struct nlm_host *host, int cmd, - struct file_lock *fl); +/* + * NLM client operations provide a means to modify RPC processing of NLM + * requests. Callbacks receive a pointer to data passed into the call to + * nlmclnt_proc(). + */ +struct nlmclnt_operations { + /* Called on successful allocation of nlm_rqst, use for allocation or + * reference counting. */ + void (*nlmclnt_alloc_call)(void *); + + /* Called in rpc_task_prepare for unlock. A return value of true + * indicates the callback has put the task to sleep on a waitqueue + * and NLM should not call rpc_call_start(). */ + bool (*nlmclnt_unlock_prepare)(struct rpc_task*, void *); + + /* Called when the nlm_rqst is freed, callbacks should clean up here */ + void (*nlmclnt_release_call)(void *); +}; + +extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data); extern int lockd_up(struct net *net); extern void lockd_down(struct net *net); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b37dee3acaba..41f7b6a04d69 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -69,6 +69,7 @@ struct nlm_host { char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; /* @@ -142,6 +143,7 @@ struct nlm_rqst { struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; /* diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 287f34161086..bb0eb2c9acca 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,6 +76,7 @@ struct nfs_open_context { #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) +#define NFS_CONTEXT_UNLOCK (3) int error; struct list_head list; @@ -499,24 +500,12 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); +extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(void); +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); -static inline int -nfs_wb_launder_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, true); -} - -static inline int -nfs_wb_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, false); -} - static inline int nfs_have_writebacks(struct inode *inode) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index e1502c55741e..e418a1096662 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -221,6 +221,7 @@ struct nfs_server { u32 mountd_version; unsigned short mountd_port; unsigned short mountd_protocol; + struct rpc_wait_queue uoc_rpcwaitq; }; /* Server capabilities */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 957049f72290..247cc3d3498f 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -64,7 +64,6 @@ struct nfs_pageio_ops { }; struct nfs_rw_ops { - const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, @@ -124,7 +123,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how); + int how, + gfp_t gfp_flags); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, @@ -141,6 +141,7 @@ extern int nfs_page_group_lock(struct nfs_page *, bool); extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* * Lock the page of an asynchronous request diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 348f7c158084..b28c83475ee8 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1383,6 +1383,7 @@ struct nfs42_copy_res { struct nfs42_write_res write_res; bool consecutive; bool synchronous; + struct nfs_commitres commit_res; }; struct nfs42_seek_args { @@ -1427,6 +1428,7 @@ struct nfs_pgio_header { struct list_head pages; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ + fmode_t rw_mode; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1550,6 +1552,7 @@ struct nfs_rpc_ops { const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; const struct file_operations *file_ops; + const struct nlmclnt_operations *nlmclnt_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 52da3ce54bb5..b5cb921775a0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) struct rpc_task *task; task = rpc_new_task(task_setup_data); - if (IS_ERR(task)) - goto out; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); @@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) atomic_inc(&task->tk_count); rpc_execute(task); -out: return task; } EXPORT_SYMBOL_GPL(rpc_run_task); @@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - if (IS_ERR(task)) { - xprt_free_bc_request(req); - goto out; - } task->tk_rqstp = req; /* @@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); -out: dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5db68b371db2..0cc83839c13c 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) if (task == NULL) { task = rpc_alloc_task(); - if (task == NULL) { - rpc_release_calldata(setup_data->callback_ops, - setup_data->callback_data); - return ERR_PTR(-ENOMEM); - } flags = RPC_TASK_DYNAMIC; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 1f7082144e01..e34f4ee7f2b6 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_decode); /** - * xdr_init_decode - Initialize an xdr_stream for decoding data. + * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer from which to decode data * @pages: list of pages to decode into diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index b530a2852ba8..3e63c5e97ebe 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } +EXPORT_SYMBOL_GPL(xprt_force_disconnect); /** * xprt_conditional_disconnect - force a transport to disconnect diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a044be2d6ad7..694e9b13ecf0 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, } sge->length = len; - ib_dma_sync_single_for_device(ia->ri_device, sge->addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); req->rl_send_wr.num_sge++; return true; @@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(device, sge[sge_no].addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, sge[sge_no].length, DMA_TO_DEVICE); /* If there is a Read chunk, the page list is being handled @@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return 0; out_err: - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); - r_xprt->rx_stats.failed_marshal_count++; + if (PTR_ERR(iptr) != -ENOBUFS) { + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; + } return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index c717f5410776..62ecbccd9748 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; -static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; - int xprt_rdma_pad_optimize = 0; +unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; +int xprt_rdma_pad_optimize; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args) new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy); + rc = rpcrdma_ia_open(new_xprt, sap); if (rc) goto out1; @@ -457,19 +457,33 @@ out1: return ERR_PTR(rc); } -/* - * Close a connection, during shutdown or timeout/reconnect +/** + * xprt_rdma_close - Close down RDMA connection + * @xprt: generic transport to be closed + * + * Called during transport shutdown reconnect, or device + * removal. Caller holds the transport's write lock. */ static void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - dprintk("RPC: %s: closing\n", __func__); - if (r_xprt->rx_ep.rep_connected > 0) + dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); + + if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { + xprt_clear_connected(xprt); + rpcrdma_ia_remove(ia); + return; + } + if (ep->rep_connected == -ENODEV) + return; + if (ep->rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(ep, ia); } static void @@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) dprintk("RPC: %s: %u\n", __func__, port); } +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. + */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); + + xprt_force_disconnect(xprt); +} + static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task) * xprt_rdma_send_request - marshal and send an RPC request * @task: RPC task with an RPC message in rq_snd_buf * + * Caller holds the transport's write lock. + * * Return values: * 0: The request has been sent * ENOTCONN: Caller needs to invoke connect logic then call again @@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + if (!xprt_connected(xprt)) + goto drop_connection; + /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, .connect = xprt_rdma_connect, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3b332b395045..3dbce9ac4327 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,7 +53,7 @@ #include #include #include -#include /* try_module_get()/module_put() */ + #include #include "xprt_rdma.h" @@ -69,8 +69,11 @@ /* * internal functions */ +static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq; +static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; - ib_dma_sync_single_for_cpu(rep->rr_device, + ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); @@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) __func__, ep); complete(&ia->ri_done); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: removing device for %pIS:%u\n", + sap, rpc_get_port(sap)); +#endif + set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); + ep->rep_connected = -ENODEV; + xprt_force_disconnect(&xprt->rx_xprt); + wait_for_completion(&ia->ri_remove_done); + + ia->ri_id = NULL; + ia->ri_pd = NULL; + ia->ri_device = NULL; + /* Return 1 to ensure the core destroys the id. */ + return 1; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; ib_query_qp(ia->ri_id->qp, attr, @@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) goto connected; case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; - goto connected; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - connstate = -ENODEV; connected: dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); @@ -329,14 +344,6 @@ connected: return 0; } -static void rpcrdma_destroy_id(struct rdma_cm_id *id) -{ - if (id) { - module_put(id->device->owner); - rdma_destroy_id(id); - } -} - static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, int rc; init_completion(&ia->ri_done); + init_completion(&ia->ri_remove_done); id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, goto out; } - /* FIXME: - * Until xprtrdma supports DEVICE_REMOVAL, the provider must - * be pinned while there are active NFS/RDMA mounts to prevent - * hangs and crashes at umount time. - */ - if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { - dprintk("RPC: %s: Failed to get device module\n", - __func__); - ia->ri_async_rc = -ENODEV; - } rc = ia->ri_async_rc; if (rc) goto out; @@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto put; + goto out; } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { dprintk("RPC: %s: wait() exited: %i\n", __func__, rc); - goto put; + goto out; } rc = ia->ri_async_rc; if (rc) - goto put; + goto out; return id; -put: - module_put(id->device->owner); + out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -413,13 +410,16 @@ out: * Exported functions. */ -/* - * Open and initialize an Interface Adapter. - * o initializes fields of struct rpcrdma_ia, including - * interface and provider attributes and protection zone. +/** + * rpcrdma_ia_open - Open and initialize an Interface Adapter. + * @xprt: controlling transport + * @addr: IP address of remote peer + * + * Returns 0 on success, negative errno if an appropriate + * Interface Adapter could not be found and opened. */ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; @@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); - goto out1; + goto out_err; } ia->ri_device = ia->ri_id->device; @@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out2; + goto out_err; } - switch (memreg) { + switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRMR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; @@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /*FALLTHROUGH*/ default: - pr_err("rpcrdma: Unsupported memory registration mode: %d\n", - memreg); + pr_err("rpcrdma: Device %s does not support memreg mode %d\n", + ia->ri_device->name, xprt_rdma_memreg_strategy); rc = -EINVAL; - goto out3; + goto out_err; } return 0; -out3: - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; -out2: - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; -out1: +out_err: + rpcrdma_ia_close(ia); return rc; } -/* - * Clean up/close an IA. - * o if event handles and PD have been initialized, free them. - * o close the IA +/** + * rpcrdma_ia_remove - Handle device driver unload + * @ia: interface adapter being removed + * + * Divest transport H/W resources associated with this adapter, + * but allow it to be restored later. + */ +void +rpcrdma_ia_remove(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + /* This is similar to rpcrdma_ep_destroy, but: + * - Don't cancel the connect worker. + * - Don't call rpcrdma_ep_disconnect, which waits + * for another conn upcall, which will deadlock. + * - rdma_disconnect is unneeded, the underlying + * connection is already gone. + */ + if (ia->ri_id->qp) { + ib_drain_qp(ia->ri_id->qp); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + ib_free_cq(ep->rep_attr.recv_cq); + ib_free_cq(ep->rep_attr.send_cq); + + /* The ULP is responsible for ensuring all DMA + * mappings and MRs are gone. + */ + list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) + rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); + rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); + rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + } + rpcrdma_destroy_mrs(buf); + + /* Allow waiters to continue */ + complete(&ia->ri_remove_done); +} + +/** + * rpcrdma_ia_close - Clean up/close an IA. + * @ia: interface adapter to close + * */ void rpcrdma_ia_close(struct rpcrdma_ia *ia) @@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; + rdma_destroy_id(ia->ri_id); } + ia->ri_id = NULL; + ia->ri_device = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; } /* @@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.send_cq); } +/* Re-establish a connection after a device removal event. + * Unlike a normal reconnection, a fresh PD and a new set + * of MRs and buffers is needed. + */ +static int +rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + int rc, err; + + pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + + rc = -EHOSTUNREACH; + if (rpcrdma_ia_open(r_xprt, sap)) + goto out1; + + rc = -ENOMEM; + err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + if (err) { + pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); + goto out2; + } + + rc = -ENETUNREACH; + err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (err) { + pr_err("rpcrdma: rdma_create_qp returned %d\n", err); + goto out3; + } + + rpcrdma_create_mrs(r_xprt); + return 0; + +out3: + rpcrdma_ep_destroy(ep, ia); +out2: + rpcrdma_ia_close(ia); +out1: + return rc; +} + +static int +rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + struct rdma_cm_id *id, *old; + int err, rc; + + dprintk("RPC: %s: reconnecting...\n", __func__); + + rpcrdma_ep_disconnect(ep, ia); + + rc = -EHOSTUNREACH; + id = rpcrdma_create_id(r_xprt, ia, sap); + if (IS_ERR(id)) + goto out; + + /* As long as the new ID points to the same device as the + * old ID, we can reuse the transport's existing PD and all + * previously allocated MRs. Also, the same device means + * the transport's previous DMA mappings are still valid. + * + * This is a sanity check only. There should be no way these + * point to two different devices here. + */ + old = id; + rc = -ENETUNREACH; + if (ia->ri_device != id->device) { + pr_err("rpcrdma: can't reconnect on different device!\n"); + goto out_destroy; + } + + err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (err) { + dprintk("RPC: %s: rdma_create_qp returned %d\n", + __func__, err); + goto out_destroy; + } + + /* Atomically replace the transport's ID and QP. */ + rc = 0; + old = ia->ri_id; + ia->ri_id = id; + rdma_destroy_qp(old); + +out_destroy: + rdma_destroy_id(old); +out: + return rc; +} + /* * Connect unconnected endpoint. */ @@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rdma_cm_id *id, *old; - struct sockaddr *sap; unsigned int extras; - int rc = 0; + int rc; - if (ep->rep_connected != 0) { retry: - dprintk("RPC: %s: reconnecting...\n", __func__); - - rpcrdma_ep_disconnect(ep, ia); - - sap = (struct sockaddr *)&r_xprt->rx_data.addr; - id = rpcrdma_create_id(r_xprt, ia, sap); - if (IS_ERR(id)) { - rc = -EHOSTUNREACH; - goto out; - } - /* TEMP TEMP TEMP - fail if new device: - * Deregister/remarshal *all* requests! - * Close and recreate adapter, pd, etc! - * Re-determine all attributes still sane! - * More stuff I haven't thought of! - * Rrrgh! - */ - if (ia->ri_device != id->device) { - printk("RPC: %s: can't reconnect on " - "different device!\n", __func__); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - /* END TEMP */ - rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - - old = ia->ri_id; - ia->ri_id = id; - - rdma_destroy_qp(old); - rpcrdma_destroy_id(old); - } else { + switch (ep->rep_connected) { + case 0: dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - /* do not update ep->rep_connected */ - return -ENETUNREACH; + rc = -ENETUNREACH; + goto out_noupdate; } + break; + case -ENODEV: + rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + if (rc) + goto out_noupdate; + break; + default: + rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + if (rc) + goto out; } ep->rep_connected = 0; @@ -736,6 +845,8 @@ retry: out: if (rc) ep->rep_connected = rc; + +out_noupdate: return rc; } @@ -878,7 +989,6 @@ struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_rep *rep; int rc; @@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out_free; } - rep->rr_device = ia->ri_device; rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); @@ -1037,6 +1146,7 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { cancel_delayed_work_sync(&buf->rb_recovery_worker); + cancel_delayed_work_sync(&buf->rb_refresh_worker); while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) out_nomws: dprintk("RPC: %s: no MWs available\n", __func__); - schedule_delayed_work(&buf->rb_refresh_worker, 0); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_delayed_work(&buf->rb_refresh_worker, 0); /* Allow the reply handler and refresh worker to run */ cond_resched(); @@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { + struct ib_device *device = ia->ri_device; + if (rb->rg_direction == DMA_NONE) return false; - rb->rg_iov.addr = ib_dma_map_single(ia->ri_device, + rb->rg_iov.addr = ib_dma_map_single(device, (void *)rb->rg_base, rdmab_length(rb), rb->rg_direction); - if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb))) + if (ib_dma_mapping_error(device, rdmab_addr(rb))) return false; - rb->rg_device = ia->ri_device; + rb->rg_device = device; rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; return true; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 171a35116de9..1d66acf1a723 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -69,6 +69,7 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct completion ri_done; + struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; @@ -78,10 +79,15 @@ struct rpcrdma_ia { bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; + unsigned long ri_flags; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; +enum { + RPCRDMA_IAF_REMOVING = 0, +}; + /* * RDMA Endpoint -- one per transport instance */ @@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +static inline struct ib_device * +rdmab_device(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device; +} + #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) /* To ensure a transport can always make forward progress, @@ -209,7 +221,6 @@ struct rpcrdma_rep { unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; - struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; struct list_head rr_list; @@ -380,7 +391,6 @@ struct rpcrdma_buffer { spinlock_t rb_mwlock; /* protect rb_mws list */ struct list_head rb_mws; struct list_head rb_all; - char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; @@ -497,10 +507,16 @@ struct rpcrdma_xprt { * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ extern int xprt_rdma_pad_optimize; +/* This setting controls the hunt for a supported memory + * registration strategy. + */ +extern unsigned int xprt_rdma_memreg_strategy; + /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *);