From 68c97153fb7f2877f98aa6c29546381d9cad2fed Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 3 Jan 2012 13:22:46 -0500
Subject: [PATCH 01/17] SUNRPC: Clean up the RPCSEC_GSS service ticket requests

Instead of hacking specific service names into gss_encode_v1_msg, we should
just allow the caller to specify the service name explicitly.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfs/client.c                 |  2 +-
 fs/nfsd/nfs4callback.c          |  2 +-
 include/linux/sunrpc/auth.h     |  3 ++-
 include/linux/sunrpc/auth_gss.h |  2 +-
 net/sunrpc/auth_generic.c       |  6 +++--
 net/sunrpc/auth_gss/auth_gss.c  | 40 +++++++++++++++++++--------------
 6 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 873bf00d51a2..32ea37198e93 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -185,7 +185,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 #endif
-	cred = rpc_lookup_machine_cred();
+	cred = rpc_lookup_machine_cred("*");
 	if (!IS_ERR(cred))
 		clp->cl_machine_cred = cred;
 	nfs_fscache_get_client_cookie(clp);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7748d6a18d97..6f3ebb48b12f 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -718,7 +718,7 @@ int set_callback_cred(void)
 {
 	if (callback_cred)
 		return 0;
-	callback_cred = rpc_lookup_machine_cred();
+	callback_cred = rpc_lookup_machine_cred("nfs");
 	if (!callback_cred)
 		return -ENOMEM;
 	return 0;
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index febc4dbec2ca..7874a8a56638 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -26,6 +26,7 @@ struct auth_cred {
 	uid_t	uid;
 	gid_t	gid;
 	struct group_info *group_info;
+	const char *principal;
 	unsigned char machine_cred : 1;
 };
 
@@ -127,7 +128,7 @@ void			rpc_destroy_generic_auth(void);
 void 			rpc_destroy_authunix(void);
 
 struct rpc_cred *	rpc_lookup_cred(void);
-struct rpc_cred *	rpc_lookup_machine_cred(void);
+struct rpc_cred *	rpc_lookup_machine_cred(const char *service_name);
 int			rpcauth_register(const struct rpc_authops *);
 int			rpcauth_unregister(const struct rpc_authops *);
 struct rpc_auth *	rpcauth_create(rpc_authflavor_t, struct rpc_clnt *);
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 8eee9dbbfe7a..f1cfd4c85cd0 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -82,8 +82,8 @@ struct gss_cred {
 	enum rpc_gss_svc	gc_service;
 	struct gss_cl_ctx __rcu	*gc_ctx;
 	struct gss_upcall_msg	*gc_upcall;
+	const char		*gc_principal;
 	unsigned long		gc_upcall_timestamp;
-	unsigned char		gc_machine_cred : 1;
 };
 
 #endif /* __KERNEL__ */
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index e010a015d996..1426ec3d0a53 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -41,15 +41,17 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred);
 /*
  * Public call interface for looking up machine creds.
  */
-struct rpc_cred *rpc_lookup_machine_cred(void)
+struct rpc_cred *rpc_lookup_machine_cred(const char *service_name)
 {
 	struct auth_cred acred = {
 		.uid = RPC_MACHINE_CRED_USERID,
 		.gid = RPC_MACHINE_CRED_GROUPID,
+		.principal = service_name,
 		.machine_cred = 1,
 	};
 
-	dprintk("RPC:       looking up machine cred\n");
+	dprintk("RPC:       looking up machine cred for service %s\n",
+			service_name);
 	return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0);
 }
 EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index afb56553dfe7..28d72d298735 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -392,7 +392,8 @@ static void gss_encode_v0_msg(struct gss_upcall_msg *gss_msg)
 }
 
 static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
-				struct rpc_clnt *clnt, int machine_cred)
+				struct rpc_clnt *clnt,
+				const char *service_name)
 {
 	struct gss_api_mech *mech = gss_msg->auth->mech;
 	char *p = gss_msg->databuf;
@@ -407,12 +408,8 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 		p += len;
 		gss_msg->msg.len += len;
 	}
-	if (machine_cred) {
-		len = sprintf(p, "service=* ");
-		p += len;
-		gss_msg->msg.len += len;
-	} else if (!strcmp(clnt->cl_program->name, "nfs4_cb")) {
-		len = sprintf(p, "service=nfs ");
+	if (service_name != NULL) {
+		len = sprintf(p, "service=%s ", service_name);
 		p += len;
 		gss_msg->msg.len += len;
 	}
@@ -429,17 +426,18 @@ static void gss_encode_v1_msg(struct gss_upcall_msg *gss_msg,
 }
 
 static void gss_encode_msg(struct gss_upcall_msg *gss_msg,
-				struct rpc_clnt *clnt, int machine_cred)
+				struct rpc_clnt *clnt,
+				const char *service_name)
 {
 	if (pipe_version == 0)
 		gss_encode_v0_msg(gss_msg);
 	else /* pipe_version == 1 */
-		gss_encode_v1_msg(gss_msg, clnt, machine_cred);
+		gss_encode_v1_msg(gss_msg, clnt, service_name);
 }
 
-static inline struct gss_upcall_msg *
-gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt,
-		int machine_cred)
+static struct gss_upcall_msg *
+gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt,
+		uid_t uid, const char *service_name)
 {
 	struct gss_upcall_msg *gss_msg;
 	int vers;
@@ -459,7 +457,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid, struct rpc_clnt *clnt,
 	atomic_set(&gss_msg->count, 1);
 	gss_msg->uid = uid;
 	gss_msg->auth = gss_auth;
-	gss_encode_msg(gss_msg, clnt, machine_cred);
+	gss_encode_msg(gss_msg, clnt, service_name);
 	return gss_msg;
 }
 
@@ -471,7 +469,7 @@ gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cr
 	struct gss_upcall_msg *gss_new, *gss_msg;
 	uid_t uid = cred->cr_uid;
 
-	gss_new = gss_alloc_msg(gss_auth, uid, clnt, gss_cred->gc_machine_cred);
+	gss_new = gss_alloc_msg(gss_auth, clnt, uid, gss_cred->gc_principal);
 	if (IS_ERR(gss_new))
 		return gss_new;
 	gss_msg = gss_add_msg(gss_new);
@@ -995,7 +993,9 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags)
 	 */
 	cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW;
 	cred->gc_service = gss_auth->service;
-	cred->gc_machine_cred = acred->machine_cred;
+	cred->gc_principal = NULL;
+	if (acred->machine_cred)
+		cred->gc_principal = acred->principal;
 	kref_get(&gss_auth->kref);
 	return &cred->gc_base;
 
@@ -1030,7 +1030,12 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 	if (!test_bit(RPCAUTH_CRED_UPTODATE, &rc->cr_flags))
 		return 0;
 out:
-	if (acred->machine_cred != gss_cred->gc_machine_cred)
+	if (acred->principal != NULL) {
+		if (gss_cred->gc_principal == NULL)
+			return 0;
+		return strcmp(acred->principal, gss_cred->gc_principal) == 0;
+	}
+	if (gss_cred->gc_principal != NULL)
 		return 0;
 	return rc->cr_uid == acred->uid;
 }
@@ -1104,7 +1109,8 @@ static int gss_renew_cred(struct rpc_task *task)
 	struct rpc_auth *auth = oldcred->cr_auth;
 	struct auth_cred acred = {
 		.uid = oldcred->cr_uid,
-		.machine_cred = gss_cred->gc_machine_cred,
+		.principal = gss_cred->gc_principal,
+		.machine_cred = (gss_cred->gc_principal != NULL ? 1 : 0),
 	};
 	struct rpc_cred *new;
 

From 43717c7daebf10b43f12e68512484b3095bb1ba5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Mon, 5 Dec 2011 15:40:30 -0500
Subject: [PATCH 02/17] NFS: Retry mounting NFSROOT

Lukas Razik <linux@razik.name> reports that on his SPARC system,
booting with an NFS root file system stopped working after commit
56463e50 "NFS: Use super.c for NFSROOT mount option parsing."

We found that the network switch to which Lukas' client was attached
was delaying access to the LAN after the client's NIC driver reported
that its link was up.  The delay was longer than the timeouts used in
the NFS client during mounting.

NFSROOT worked for Lukas before commit 56463e50 because in those
kernels, the client's first operation was an rpcbind request to
determine which port the NFS server was listening on.  When that
request failed after a long timeout, the client simply selected the
default NFS port (2049).  By that time the switch was allowing access
to the LAN, and the mount succeeded.

Neither of these client behaviors is desirable, so reverting 56463e50
is really not a choice.  Instead, introduce a mechanism that retries
the NFSROOT mount request several times.  This is the same tactic that
normal user space NFS mounts employ to overcome server and network
delays.

Signed-off-by: Lukas Razik <linux@razik.name>
[ cel: match kernel coding style, add proper patch description ]
[ cel: add exponential back-off ]
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Tested-by: Lukas Razik <linux@razik.name>
Cc: stable@kernel.org # > 2.6.38
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 init/do_mounts.c | 35 +++++++++++++++++++++++++++++++----
 1 file changed, 31 insertions(+), 4 deletions(-)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 0f6e1d985a3b..db6e5ee0e1db 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -398,15 +398,42 @@ out:
 }
  
 #ifdef CONFIG_ROOT_NFS
+
+#define NFSROOT_TIMEOUT_MIN	5
+#define NFSROOT_TIMEOUT_MAX	30
+#define NFSROOT_RETRY_MAX	5
+
 static int __init mount_nfs_root(void)
 {
 	char *root_dev, *root_data;
+	unsigned int timeout;
+	int try, err;
 
-	if (nfs_root_data(&root_dev, &root_data) != 0)
+	err = nfs_root_data(&root_dev, &root_data);
+	if (err != 0)
 		return 0;
-	if (do_mount_root(root_dev, "nfs", root_mountflags, root_data) != 0)
-		return 0;
-	return 1;
+
+	/*
+	 * The server or network may not be ready, so try several
+	 * times.  Stop after a few tries in case the client wants
+	 * to fall back to other boot methods.
+	 */
+	timeout = NFSROOT_TIMEOUT_MIN;
+	for (try = 1; ; try++) {
+		err = do_mount_root(root_dev, "nfs",
+					root_mountflags, root_data);
+		if (err == 0)
+			return 1;
+		if (try > NFSROOT_RETRY_MAX)
+			break;
+
+		/* Wait, in case the server refused us immediately */
+		ssleep(timeout);
+		timeout <<= 1;
+		if (timeout > NFSROOT_TIMEOUT_MAX)
+			timeout = NFSROOT_TIMEOUT_MAX;
+	}
+	return 0;
 }
 #endif
 

From 2edb6bc3852c681c0d948245bd55108dc6407604 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Nov 2011 11:46:31 +1100
Subject: [PATCH 03/17] NFS - fix recent breakage to NFS error handling.

From c6d615d2b97fe305cbf123a8751ced859dca1d5e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 16 Nov 2011 09:39:05 +1100
Subject: [PATCH] NFS - fix recent breakage to NFS error handling.

commit 02c24a82187d5a628c68edfe71ae60dc135cd178 made a small and
presumably unintended change to write error handling in NFS.

Previously an error from filemap_write_and_wait_range would only be of
interest if nfs_file_fsync did not return an error.  After this commit,
an error from filemap_write_and_wait_range would mean that (the rest of)
nfs_file_fsync would not even be called.

This means that:
 1/ you are more likely to see EIO than e.g. EDQUOT or ENOSPC.
 2/ NFS_CONTEXT_ERROR_WRITE remains set for longer so more writes are
    synchronous.

This patch restores previous behaviour.

Cc: stable@kernel.org
Cc: Josef Bacik <josef@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 606ef0f20aed..c43a452f7da2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -272,13 +272,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 			datasync);
 
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (ret)
-		return ret;
 	mutex_lock(&inode->i_mutex);
 
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
 	have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 	status = nfs_commit_inode(inode, FLUSH_SYNC);
+	if (status >= 0 && ret < 0)
+		status = ret;
 	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 	if (have_error)
 		ret = xchg(&ctx->error, 0);

From 8a0d551a59ac92d8ff048d6cb29d3a02073e81e8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 20 Dec 2011 06:57:45 -0500
Subject: [PATCH 04/17] nfs: fix regression in handling of context= option in
 NFSv4

Setting the security context of a NFSv4 mount via the context= mount
option is currently broken. The NFSv4 codepath allocates a parsed
options struct, and then parses the mount options to fill it. It
eventually calls nfs4_remote_mount which calls security_init_mnt_opts.
That clobbers the lsm_opts struct that was populated earlier. This bug
also looks like it causes a small memory leak on each v4 mount where
context= is used.

Fix this by moving the initialization of the lsm_opts into
nfs_alloc_parsed_mount_data. Also, add a destructor for
nfs_parsed_mount_data to make it easier to free all of the allocations
hanging off of it, and to ensure that the security_free_mnt_opts is
called whenever security_init_mnt_opts is.

I believe this regression was introduced quite some time ago, probably
by commit c02d7adf.

Cc: stable@vger.kernel.org
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 43 +++++++++++++++++++------------------------
 1 file changed, 19 insertions(+), 24 deletions(-)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 134777406ee3..3ada13c9986a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -909,10 +909,24 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve
 		data->auth_flavor_len	= 1;
 		data->version		= version;
 		data->minorversion	= 0;
+		security_init_mnt_opts(&data->lsm_opts);
 	}
 	return data;
 }
 
+static void nfs_free_parsed_mount_data(struct nfs_parsed_mount_data *data)
+{
+	if (data) {
+		kfree(data->client_address);
+		kfree(data->mount_server.hostname);
+		kfree(data->nfs_server.export_path);
+		kfree(data->nfs_server.hostname);
+		kfree(data->fscache_uniq);
+		security_free_mnt_opts(&data->lsm_opts);
+		kfree(data);
+	}
+}
+
 /*
  * Sanity-check a server address provided by the mount command.
  *
@@ -2220,9 +2234,7 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	data = nfs_alloc_parsed_mount_data(NFS_DEFAULT_VERSION);
 	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
-		goto out_free_fh;
-
-	security_init_mnt_opts(&data->lsm_opts);
+		goto out;
 
 	/* Validate the mount data */
 	error = nfs_validate_mount_data(raw_data, data, mntfh, dev_name);
@@ -2234,8 +2246,6 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 #ifdef CONFIG_NFS_V4
 	if (data->version == 4) {
 		mntroot = nfs4_try_mount(flags, dev_name, data);
-		kfree(data->client_address);
-		kfree(data->nfs_server.export_path);
 		goto out;
 	}
 #endif	/* CONFIG_NFS_V4 */
@@ -2290,13 +2300,8 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	s->s_flags |= MS_ACTIVE;
 
 out:
-	kfree(data->nfs_server.hostname);
-	kfree(data->mount_server.hostname);
-	kfree(data->fscache_uniq);
-	security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
+	nfs_free_parsed_mount_data(data);
 	nfs_free_fhandle(mntfh);
-	kfree(data);
 	return mntroot;
 
 out_err_nosb:
@@ -2623,9 +2628,7 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 
 	mntfh = nfs_alloc_fhandle();
 	if (data == NULL || mntfh == NULL)
-		goto out_free_fh;
-
-	security_init_mnt_opts(&data->lsm_opts);
+		goto out;
 
 	/* Get a volume representation */
 	server = nfs4_create_server(data, mntfh);
@@ -2677,13 +2680,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
 
 	s->s_flags |= MS_ACTIVE;
 
-	security_free_mnt_opts(&data->lsm_opts);
 	nfs_free_fhandle(mntfh);
 	return mntroot;
 
 out:
-	security_free_mnt_opts(&data->lsm_opts);
-out_free_fh:
 	nfs_free_fhandle(mntfh);
 	return ERR_PTR(error);
 
@@ -2838,7 +2838,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
 
 	data = nfs_alloc_parsed_mount_data(4);
 	if (data == NULL)
-		goto out_free_data;
+		goto out;
 
 	/* Validate the mount data */
 	error = nfs4_validate_mount_data(raw_data, data, dev_name);
@@ -2852,12 +2852,7 @@ static struct dentry *nfs4_mount(struct file_system_type *fs_type,
 		error = PTR_ERR(res);
 
 out:
-	kfree(data->client_address);
-	kfree(data->nfs_server.export_path);
-	kfree(data->nfs_server.hostname);
-	kfree(data->fscache_uniq);
-out_free_data:
-	kfree(data);
+	nfs_free_parsed_mount_data(data);
 	dprintk("<-- nfs4_mount() = %d%s\n", error,
 			error != 0 ? " [error]" : "");
 	return res;

From 61f2e5106582d02f30b6807e3f9c07463c572ccb Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Nov 2011 13:58:20 -0500
Subject: [PATCH 05/17] NFSv4.1: fix backchannel slotid off-by-one bug

Cc:stable@kernel.org
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/callback_proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 43926add945b..54cea8ad5a76 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -339,7 +339,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
 	dprintk("%s enter. slotid %d seqid %d\n",
 		__func__, args->csa_slotid, args->csa_sequenceid);
 
-	if (args->csa_slotid > NFS41_BC_MAX_CALLBACKS)
+	if (args->csa_slotid >= NFS41_BC_MAX_CALLBACKS)
 		return htonl(NFS4ERR_BADSLOT);
 
 	slot = tbl->slots + args->csa_slotid;

From aacd5537270a752fe12a9914a207284fc2341c6d Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Nov 2011 13:58:21 -0500
Subject: [PATCH 06/17] NFSv4.1: cleanup init and reset of session slot tables

We are either initializing or resetting a session. Initialize or reset
the session slot tables accordingly.

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 59 ++++++++++++++++++-----------------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d9f4d78c3413..a64aa56e140a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5021,23 +5021,6 @@ out:
 	return ret;
 }
 
-/*
- * Reset the forechannel and backchannel slot tables
- */
-static int nfs4_reset_slot_tables(struct nfs4_session *session)
-{
-	int status;
-
-	status = nfs4_reset_slot_table(&session->fc_slot_table,
-			session->fc_attrs.max_reqs, 1);
-	if (status)
-		return status;
-
-	status = nfs4_reset_slot_table(&session->bc_slot_table,
-			session->bc_attrs.max_reqs, 0);
-	return status;
-}
-
 /* Destroy the slot table */
 static void nfs4_destroy_slot_tables(struct nfs4_session *session)
 {
@@ -5083,29 +5066,35 @@ out:
 }
 
 /*
- * Initialize the forechannel and backchannel tables
+ * Initialize or reset the forechannel and backchannel tables
  */
-static int nfs4_init_slot_tables(struct nfs4_session *session)
+static int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 {
 	struct nfs4_slot_table *tbl;
-	int status = 0;
+	int status;
 
-	tbl = &session->fc_slot_table;
+	dprintk("--> %s\n", __func__);
+	/* Fore channel */
+	tbl = &ses->fc_slot_table;
 	if (tbl->slots == NULL) {
-		status = nfs4_init_slot_table(tbl,
-				session->fc_attrs.max_reqs, 1);
+		status = nfs4_init_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
+		if (status) /* -ENOMEM */
+			return status;
+	} else {
+		status = nfs4_reset_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
 		if (status)
 			return status;
 	}
-
-	tbl = &session->bc_slot_table;
+	/* Back channel */
+	tbl = &ses->bc_slot_table;
 	if (tbl->slots == NULL) {
-		status = nfs4_init_slot_table(tbl,
-				session->bc_attrs.max_reqs, 0);
+		status = nfs4_init_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
 		if (status)
-			nfs4_destroy_slot_tables(session);
-	}
-
+			/* Fore and back channel share a connection so get
+			 * both slot tables or neither */
+			nfs4_destroy_slot_tables(ses);
+	} else
+		status = nfs4_reset_slot_table(tbl, ses->bc_attrs.max_reqs, 0);
 	return status;
 }
 
@@ -5293,13 +5282,9 @@ int nfs4_proc_create_session(struct nfs_client *clp)
 	if (status)
 		goto out;
 
-	/* Init and reset the fore channel */
-	status = nfs4_init_slot_tables(session);
-	dprintk("slot table initialization returned %d\n", status);
-	if (status)
-		goto out;
-	status = nfs4_reset_slot_tables(session);
-	dprintk("slot table reset returned %d\n", status);
+	/* Init or reset the session slot tables */
+	status = nfs4_setup_session_slot_tables(session);
+	dprintk("slot table setup returned %d\n", status);
 	if (status)
 		goto out;
 

From aabd0b40b327d5c6518c8c908819b9bf864ad56a Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Nov 2011 13:58:22 -0500
Subject: [PATCH 07/17] NFSv4.1: change nfs4_free_slot parameters for dynamic
 slots

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index a64aa56e140a..b2104461ed4f 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,9 +363,8 @@ static void renew_lease(const struct nfs_server *server, unsigned long timestamp
  * Must be called while holding tbl->slot_tbl_lock
  */
 static void
-nfs4_free_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *free_slot)
+nfs4_free_slot(struct nfs4_slot_table *tbl, u8 free_slotid)
 {
-	int free_slotid = free_slot - tbl->slots;
 	int slotid = free_slotid;
 
 	BUG_ON(slotid < 0 || slotid >= NFS4_MAX_SLOT_TABLE);
@@ -430,7 +429,7 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res)
 	}
 
 	spin_lock(&tbl->slot_tbl_lock);
-	nfs4_free_slot(tbl, res->sr_slot);
+	nfs4_free_slot(tbl, res->sr_slot - tbl->slots);
 	nfs4_check_drain_fc_complete(res->sr_session);
 	spin_unlock(&tbl->slot_tbl_lock);
 	res->sr_slot = NULL;

From 0b1c8fc43c1f9fcde2d18182988f05eeaaae509b Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Nov 2011 13:58:26 -0500
Subject: [PATCH 08/17] NFSv4.1: cleanup comment and debug printk

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b2104461ed4f..fcc2408d7ab0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -552,13 +552,10 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	spin_lock(&tbl->slot_tbl_lock);
 	if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) &&
 	    !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) {
-		/*
-		 * The state manager will wait until the slot table is empty.
-		 * Schedule the reset thread
-		 */
+		/* The state manager will wait until the slot table is empty */
 		rpc_sleep_on(&tbl->slot_tbl_waitq, task, NULL);
 		spin_unlock(&tbl->slot_tbl_lock);
-		dprintk("%s Schedule Session Reset\n", __func__);
+		dprintk("%s session is draining\n", __func__);
 		return -EAGAIN;
 	}
 

From 3476f114addb7b96912840a234702f660a1f460b Mon Sep 17 00:00:00 2001
From: Chris Metcalf <cmetcalf@tilera.com>
Date: Thu, 11 Aug 2011 13:54:28 -0700
Subject: [PATCH 09/17] nfs: fix a minor do_div portability issue

This change modifies filelayout_get_dense_offset() to use the functions
in math64.h and thus avoid a 32-bit platform compile error trying to
use do_div() on an s64 type.

Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Reviewed-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayout.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c
index a62d36b9a99e..71ec08617e23 100644
--- a/fs/nfs/nfs4filelayout.c
+++ b/fs/nfs/nfs4filelayout.c
@@ -49,13 +49,14 @@ filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
 			    loff_t offset)
 {
 	u32 stripe_width = flseg->stripe_unit * flseg->dsaddr->stripe_count;
-	u64 tmp;
+	u64 stripe_no;
+	u32 rem;
 
 	offset -= flseg->pattern_offset;
-	tmp = offset;
-	do_div(tmp, stripe_width);
+	stripe_no = div_u64(offset, stripe_width);
+	div_u64_rem(offset, flseg->stripe_unit, &rem);
 
-	return tmp * flseg->stripe_unit + do_div(offset, flseg->stripe_unit);
+	return stripe_no * flseg->stripe_unit + rem;
 }
 
 /* This function is used by the layout driver to calculate the

From bf118a342f10dafe44b14451a1392c3254629a1f Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 7 Dec 2011 11:55:27 -0500
Subject: [PATCH 10/17] NFSv4: include bitmap in nfsv4 get acl data

The NFSv4 bitmap size is unbounded: a server can return an arbitrary
sized bitmap in an FATTR4_WORD0_ACL request.  Replace using the
nfs4_fattr_bitmap_maxsz as a guess to the maximum bitmask returned by a server
with the inclusion of the bitmap (xdr length plus bitmasks) and the acl data
xdr length to the (cached) acl page data.

This is a general solution to commit e5012d1f "NFSv4.1: update
nfs4_fattr_bitmap_maxsz" and fixes hitting a BUG_ON in xdr_shrink_bufhead
when getting ACLs.

Fix a bug in decode_getacl that returned -EINVAL on ACLs > page when getxattr
was called with a NULL buffer, preventing ACL > PAGE_SIZE from being retrieved.

Cc: stable@kernel.org
Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c          | 96 ++++++++++++++++++++++----------------
 fs/nfs/nfs4xdr.c           | 31 ++++++++----
 include/linux/nfs_xdr.h    |  5 ++
 include/linux/sunrpc/xdr.h |  2 +
 net/sunrpc/xdr.c           |  3 +-
 5 files changed, 89 insertions(+), 48 deletions(-)

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index fcc2408d7ab0..3b1080118452 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3426,19 +3426,6 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
  */
 #define NFS4ACL_MAXPAGES (XATTR_SIZE_MAX >> PAGE_CACHE_SHIFT)
 
-static void buf_to_pages(const void *buf, size_t buflen,
-		struct page **pages, unsigned int *pgbase)
-{
-	const void *p = buf;
-
-	*pgbase = offset_in_page(buf);
-	p -= *pgbase;
-	while (p < buf + buflen) {
-		*(pages++) = virt_to_page(p);
-		p += PAGE_CACHE_SIZE;
-	}
-}
-
 static int buf_to_pages_noslab(const void *buf, size_t buflen,
 		struct page **pages, unsigned int *pgbase)
 {
@@ -3535,9 +3522,19 @@ out:
 	nfs4_set_cached_acl(inode, acl);
 }
 
+/*
+ * The getxattr API returns the required buffer length when called with a
+ * NULL buf. The NFSv4 acl tool then calls getxattr again after allocating
+ * the required buf.  On a NULL buf, we send a page of data to the server
+ * guessing that the ACL request can be serviced by a page. If so, we cache
+ * up to the page of ACL data, and the 2nd call to getxattr is serviced by
+ * the cache. If not so, we throw away the page, and cache the required
+ * length. The next getxattr call will then produce another round trip to
+ * the server, this time with the input buf of the required size.
+ */
 static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t buflen)
 {
-	struct page *pages[NFS4ACL_MAXPAGES];
+	struct page *pages[NFS4ACL_MAXPAGES] = {NULL, };
 	struct nfs_getaclargs args = {
 		.fh = NFS_FH(inode),
 		.acl_pages = pages,
@@ -3552,41 +3549,60 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
 		.rpc_argp = &args,
 		.rpc_resp = &res,
 	};
-	struct page *localpage = NULL;
-	int ret;
+	int ret = -ENOMEM, npages, i, acl_len = 0;
 
-	if (buflen < PAGE_SIZE) {
-		/* As long as we're doing a round trip to the server anyway,
-		 * let's be prepared for a page of acl data. */
-		localpage = alloc_page(GFP_KERNEL);
-		resp_buf = page_address(localpage);
-		if (localpage == NULL)
-			return -ENOMEM;
-		args.acl_pages[0] = localpage;
-		args.acl_pgbase = 0;
-		args.acl_len = PAGE_SIZE;
-	} else {
-		resp_buf = buf;
-		buf_to_pages(buf, buflen, args.acl_pages, &args.acl_pgbase);
+	npages = (buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	/* As long as we're doing a round trip to the server anyway,
+	 * let's be prepared for a page of acl data. */
+	if (npages == 0)
+		npages = 1;
+
+	for (i = 0; i < npages; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			goto out_free;
 	}
-	ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode), &msg, &args.seq_args, &res.seq_res, 0);
+	if (npages > 1) {
+		/* for decoding across pages */
+		args.acl_scratch = alloc_page(GFP_KERNEL);
+		if (!args.acl_scratch)
+			goto out_free;
+	}
+	args.acl_len = npages * PAGE_SIZE;
+	args.acl_pgbase = 0;
+	/* Let decode_getfacl know not to fail if the ACL data is larger than
+	 * the page we send as a guess */
+	if (buf == NULL)
+		res.acl_flags |= NFS4_ACL_LEN_REQUEST;
+	resp_buf = page_address(pages[0]);
+
+	dprintk("%s  buf %p buflen %ld npages %d args.acl_len %ld\n",
+		__func__, buf, buflen, npages, args.acl_len);
+	ret = nfs4_call_sync(NFS_SERVER(inode)->client, NFS_SERVER(inode),
+			     &msg, &args.seq_args, &res.seq_res, 0);
 	if (ret)
 		goto out_free;
-	if (res.acl_len > args.acl_len)
-		nfs4_write_cached_acl(inode, NULL, res.acl_len);
+
+	acl_len = res.acl_len - res.acl_data_offset;
+	if (acl_len > args.acl_len)
+		nfs4_write_cached_acl(inode, NULL, acl_len);
 	else
-		nfs4_write_cached_acl(inode, resp_buf, res.acl_len);
+		nfs4_write_cached_acl(inode, resp_buf + res.acl_data_offset,
+				      acl_len);
 	if (buf) {
 		ret = -ERANGE;
-		if (res.acl_len > buflen)
+		if (acl_len > buflen)
 			goto out_free;
-		if (localpage)
-			memcpy(buf, resp_buf, res.acl_len);
+		_copy_from_pages(buf, pages, res.acl_data_offset,
+				res.acl_len);
 	}
-	ret = res.acl_len;
+	ret = acl_len;
 out_free:
-	if (localpage)
-		__free_page(localpage);
+	for (i = 0; i < npages; i++)
+		if (pages[i])
+			__free_page(pages[i]);
+	if (args.acl_scratch)
+		__free_page(args.acl_scratch);
 	return ret;
 }
 
@@ -3617,6 +3633,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen)
 		nfs_zap_acl_cache(inode);
 	ret = nfs4_read_cached_acl(inode, buf, buflen);
 	if (ret != -ENOENT)
+		/* -ENOENT is returned if there is no ACL or if there is an ACL
+		 * but no cached acl data, just the acl length */
 		return ret;
 	return nfs4_get_acl_uncached(inode, buf, buflen);
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e6161b213ed1..dcaf69309d8e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2517,11 +2517,13 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_compound_hdr(xdr, req, &hdr);
 	encode_sequence(xdr, &args->seq_args, &hdr);
 	encode_putfh(xdr, args->fh, &hdr);
-	replen = hdr.replen + op_decode_hdr_maxsz + nfs4_fattr_bitmap_maxsz + 1;
+	replen = hdr.replen + op_decode_hdr_maxsz + 1;
 	encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
 
 	xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
 		args->acl_pages, args->acl_pgbase, args->acl_len);
+	xdr_set_scratch_buffer(xdr, page_address(args->acl_scratch), PAGE_SIZE);
+
 	encode_nops(&hdr);
 }
 
@@ -4957,17 +4959,18 @@ decode_restorefh(struct xdr_stream *xdr)
 }
 
 static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
-		size_t *acl_len)
+			 struct nfs_getaclres *res)
 {
-	__be32 *savep;
+	__be32 *savep, *bm_p;
 	uint32_t attrlen,
 		 bitmap[3] = {0};
 	struct kvec *iov = req->rq_rcv_buf.head;
 	int status;
 
-	*acl_len = 0;
+	res->acl_len = 0;
 	if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
 		goto out;
+	bm_p = xdr->p;
 	if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
 		goto out;
 	if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
@@ -4979,18 +4982,30 @@ static int decode_getacl(struct xdr_stream *xdr, struct rpc_rqst *req,
 		size_t hdrlen;
 		u32 recvd;
 
+		/* The bitmap (xdr len + bitmaps) and the attr xdr len words
+		 * are stored with the acl data to handle the problem of
+		 * variable length bitmaps.*/
+		xdr->p = bm_p;
+		res->acl_data_offset = be32_to_cpup(bm_p) + 2;
+		res->acl_data_offset <<= 2;
+
 		/* We ignore &savep and don't do consistency checks on
 		 * the attr length.  Let userspace figure it out.... */
 		hdrlen = (u8 *)xdr->p - (u8 *)iov->iov_base;
+		attrlen += res->acl_data_offset;
 		recvd = req->rq_rcv_buf.len - hdrlen;
 		if (attrlen > recvd) {
-			dprintk("NFS: server cheating in getattr"
-					" acl reply: attrlen %u > recvd %u\n",
+			if (res->acl_flags & NFS4_ACL_LEN_REQUEST) {
+				/* getxattr interface called with a NULL buf */
+				res->acl_len = attrlen;
+				goto out;
+			}
+			dprintk("NFS: acl reply: attrlen %u > recvd %u\n",
 					attrlen, recvd);
 			return -EINVAL;
 		}
 		xdr_read_pages(xdr, attrlen);
-		*acl_len = attrlen;
+		res->acl_len = attrlen;
 	} else
 		status = -EOPNOTSUPP;
 
@@ -6028,7 +6043,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getacl(xdr, rqstp, &res->acl_len);
+	status = decode_getacl(xdr, rqstp, res);
 
 out:
 	return status;
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 2a7c533be5dd..6c898afe6095 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -602,11 +602,16 @@ struct nfs_getaclargs {
 	size_t				acl_len;
 	unsigned int			acl_pgbase;
 	struct page **			acl_pages;
+	struct page *			acl_scratch;
 	struct nfs4_sequence_args 	seq_args;
 };
 
+/* getxattr ACL interface flags */
+#define NFS4_ACL_LEN_REQUEST	0x0001	/* zero length getxattr buffer */
 struct nfs_getaclres {
 	size_t				acl_len;
+	size_t				acl_data_offset;
+	int				acl_flags;
 	struct nfs4_sequence_res	seq_res;
 };
 
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index a20970ef9e4e..af70af333546 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -191,6 +191,8 @@ extern int xdr_decode_array2(struct xdr_buf *buf, unsigned int base,
 			     struct xdr_array2_desc *desc);
 extern int xdr_encode_array2(struct xdr_buf *buf, unsigned int base,
 			     struct xdr_array2_desc *desc);
+extern void _copy_from_pages(char *p, struct page **pages, size_t pgbase,
+			     size_t len);
 
 /*
  * Provide some simple tools for XDR buffer overflow-checking etc.
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 277ebd4bf095..593f4c605305 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -296,7 +296,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
  * Copies data into an arbitrary memory location from an array of pages
  * The copy is assumed to be non-overlapping.
  */
-static void
+void
 _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 {
 	struct page **pgfrom;
@@ -324,6 +324,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
 
 	} while ((len -= copy) != 0);
 }
+EXPORT_SYMBOL_GPL(_copy_from_pages);
 
 /*
  * xdr_shrink_bufhead

From 414adf14cd3b52e411f79d941a15d0fd4af427fc Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 6 Dec 2011 16:13:39 -0500
Subject: [PATCH 11/17] NFS: Clean up nfs4_find_state_owners_locked()

There's no longer a need to check the so_server field in the state
owner, because nowadays the RB tree we search for state owners
contains owners for that only server.

Make nfs4_find_state_owners_locked() use the same tree searching logic
as nfs4_insert_state_owner_locked().

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6a7107ae6b72..6354e4fcc829 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -377,31 +377,22 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 {
 	struct rb_node **p = &server->state_owners.rb_node,
 		       *parent = NULL;
-	struct nfs4_state_owner *sp, *res = NULL;
+	struct nfs4_state_owner *sp;
 
 	while (*p != NULL) {
 		parent = *p;
 		sp = rb_entry(parent, struct nfs4_state_owner, so_server_node);
 
-		if (server < sp->so_server) {
-			p = &parent->rb_left;
-			continue;
-		}
-		if (server > sp->so_server) {
-			p = &parent->rb_right;
-			continue;
-		}
 		if (cred < sp->so_cred)
 			p = &parent->rb_left;
 		else if (cred > sp->so_cred)
 			p = &parent->rb_right;
 		else {
 			atomic_inc(&sp->so_count);
-			res = sp;
-			break;
+			return sp;
 		}
 	}
-	return res;
+	return NULL;
 }
 
 static struct nfs4_state_owner *

From 0aaaf5c424c7ffd6b0c4253251356558b16ef3a2 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Tue, 6 Dec 2011 16:13:48 -0500
Subject: [PATCH 12/17] NFS: Cache state owners after files are closed

Servers have a finite amount of memory to store NFSv4 open and lock
owners.  Moreover, servers may have a difficult time determining when
they can reap their state owner table, thanks to gray areas in the
NFSv4 protocol specification.  Thus clients should be careful to reuse
state owners when possible.

Currently Linux is not too careful.  When a user has closed all her
files on one mount point, the state owner's reference count goes to
zero, and it is released.  The next OPEN allocates a new one.  A
workload that serially opens and closes files can run through a large
number of open owners this way.

When a state owner's reference count goes to zero, slap it onto a free
list for that nfs_server, with an expiry time.  Garbage collect before
looking for a state owner.  This makes state owners for active users
available for re-use.

Now that there can be unused state owners remaining at umount time,
purge the state owner free list when a server is destroyed.  Also be
sure not to reclaim unused state owners during state recovery.

This change has benefits for the client as well.  For some workloads,
this approach drops the number of OPEN_CONFIRM calls from the same as
the number of OPEN calls, down to just one.  This reduces wire traffic
and thus open(2) latency.  Before this patch, untarring a kernel
source tarball shows the OPEN_CONFIRM call counter steadily increasing
through the test.  With the patch, the OPEN_CONFIRM count remains at 1
throughout the entire untar.

As long as the expiry time is kept short, I don't think garbage
collection should be terribly expensive, although it does bounce the
clp->cl_lock around a bit.

[ At some point we should rationalize the use of the nfs_server
->destroy method. ]

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[Trond: Fixed a garbage collection race and a few efficiency issues]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/client.c           |  8 ++++
 fs/nfs/nfs4_fs.h          |  3 ++
 fs/nfs/nfs4state.c        | 89 +++++++++++++++++++++++++++++++++++----
 include/linux/nfs_fs_sb.h |  1 +
 4 files changed, 92 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 32ea37198e93..41bd67f80d31 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -250,6 +250,11 @@ static void pnfs_init_server(struct nfs_server *server)
 	rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC");
 }
 
+static void nfs4_destroy_server(struct nfs_server *server)
+{
+	nfs4_purge_state_owners(server);
+}
+
 #else
 static void nfs4_shutdown_client(struct nfs_client *clp)
 {
@@ -1065,6 +1070,7 @@ static struct nfs_server *nfs_alloc_server(void)
 	INIT_LIST_HEAD(&server->master_link);
 	INIT_LIST_HEAD(&server->delegations);
 	INIT_LIST_HEAD(&server->layouts);
+	INIT_LIST_HEAD(&server->state_owners_lru);
 
 	atomic_set(&server->active, 0);
 
@@ -1538,6 +1544,7 @@ static int nfs4_server_common_setup(struct nfs_server *server,
 
 	nfs_server_insert_lists(server);
 	server->mount_time = jiffies;
+	server->destroy = nfs4_destroy_server;
 out:
 	nfs_free_fattr(fattr);
 	return error;
@@ -1719,6 +1726,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source,
 
 	/* Copy data from the source */
 	server->nfs_client = source->nfs_client;
+	server->destroy = source->destroy;
 	atomic_inc(&server->nfs_client->cl_count);
 	nfs_server_copy_userdata(server, source);
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 693ae22f8731..4d7d0aedc101 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -94,6 +94,8 @@ struct nfs_unique_id {
 struct nfs4_state_owner {
 	struct nfs_unique_id so_owner_id;
 	struct nfs_server    *so_server;
+	struct list_head     so_lru;
+	unsigned long        so_expires;
 	struct rb_node	     so_server_node;
 
 	struct rpc_cred	     *so_cred;	 /* Associated cred */
@@ -319,6 +321,7 @@ static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
 extern void nfs4_put_state_owner(struct nfs4_state_owner *);
+extern void nfs4_purge_state_owners(struct nfs_server *);
 extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
 extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct nfs4_state *, fmode_t);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 6354e4fcc829..a53f33b4ac3a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
 #include <linux/ratelimit.h>
 #include <linux/workqueue.h>
 #include <linux/bitops.h>
+#include <linux/jiffies.h>
 
 #include "nfs4_fs.h"
 #include "callback.h"
@@ -388,6 +389,8 @@ nfs4_find_state_owner_locked(struct nfs_server *server, struct rpc_cred *cred)
 		else if (cred > sp->so_cred)
 			p = &parent->rb_right;
 		else {
+			if (!list_empty(&sp->so_lru))
+				list_del_init(&sp->so_lru);
 			atomic_inc(&sp->so_count);
 			return sp;
 		}
@@ -412,6 +415,8 @@ nfs4_insert_state_owner_locked(struct nfs4_state_owner *new)
 		else if (new->so_cred > sp->so_cred)
 			p = &parent->rb_right;
 		else {
+			if (!list_empty(&sp->so_lru))
+				list_del_init(&sp->so_lru);
 			atomic_inc(&sp->so_count);
 			return sp;
 		}
@@ -453,6 +458,7 @@ nfs4_alloc_state_owner(void)
 	spin_lock_init(&sp->so_sequence.lock);
 	INIT_LIST_HEAD(&sp->so_sequence.list);
 	atomic_set(&sp->so_count, 1);
+	INIT_LIST_HEAD(&sp->so_lru);
 	return sp;
 }
 
@@ -470,6 +476,38 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 	}
 }
 
+static void nfs4_free_state_owner(struct nfs4_state_owner *sp)
+{
+	rpc_destroy_wait_queue(&sp->so_sequence.wait);
+	put_rpccred(sp->so_cred);
+	kfree(sp);
+}
+
+static void nfs4_gc_state_owners(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp, *tmp;
+	unsigned long time_min, time_max;
+	LIST_HEAD(doomed);
+
+	spin_lock(&clp->cl_lock);
+	time_max = jiffies;
+	time_min = (long)time_max - (long)clp->cl_lease_time;
+	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+		/* NB: LRU is sorted so that oldest is at the head */
+		if (time_in_range(sp->so_expires, time_min, time_max))
+			break;
+		list_move(&sp->so_lru, &doomed);
+		nfs4_remove_state_owner_locked(sp);
+	}
+	spin_unlock(&clp->cl_lock);
+
+	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+		list_del(&sp->so_lru);
+		nfs4_free_state_owner(sp);
+	}
+}
+
 /**
  * nfs4_get_state_owner - Look up a state owner given a credential
  * @server: nfs_server to search
@@ -487,10 +525,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
 	sp = nfs4_find_state_owner_locked(server, cred);
 	spin_unlock(&clp->cl_lock);
 	if (sp != NULL)
-		return sp;
+		goto out;
 	new = nfs4_alloc_state_owner();
 	if (new == NULL)
-		return NULL;
+		goto out;
 	new->so_server = server;
 	new->so_cred = cred;
 	spin_lock(&clp->cl_lock);
@@ -502,26 +540,58 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server,
 		rpc_destroy_wait_queue(&new->so_sequence.wait);
 		kfree(new);
 	}
+out:
+	nfs4_gc_state_owners(server);
 	return sp;
 }
 
 /**
  * nfs4_put_state_owner - Release a nfs4_state_owner
  * @sp: state owner data to release
- *
  */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
-	struct nfs_client *clp = sp->so_server->nfs_client;
-	struct rpc_cred *cred = sp->so_cred;
+	struct nfs_server *server = sp->so_server;
+	struct nfs_client *clp = server->nfs_client;
 
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
 		return;
-	nfs4_remove_state_owner_locked(sp);
+
+	if (!RB_EMPTY_NODE(&sp->so_server_node)) {
+		sp->so_expires = jiffies;
+		list_add_tail(&sp->so_lru, &server->state_owners_lru);
+		spin_unlock(&clp->cl_lock);
+	} else {
+		nfs4_remove_state_owner_locked(sp);
+		spin_unlock(&clp->cl_lock);
+		nfs4_free_state_owner(sp);
+	}
+}
+
+/**
+ * nfs4_purge_state_owners - Release all cached state owners
+ * @server: nfs_server with cached state owners to release
+ *
+ * Called at umount time.  Remaining state owners will be on
+ * the LRU with ref count of zero.
+ */
+void nfs4_purge_state_owners(struct nfs_server *server)
+{
+	struct nfs_client *clp = server->nfs_client;
+	struct nfs4_state_owner *sp, *tmp;
+	LIST_HEAD(doomed);
+
+	spin_lock(&clp->cl_lock);
+	list_for_each_entry_safe(sp, tmp, &server->state_owners_lru, so_lru) {
+		list_move(&sp->so_lru, &doomed);
+		nfs4_remove_state_owner_locked(sp);
+	}
 	spin_unlock(&clp->cl_lock);
-	rpc_destroy_wait_queue(&sp->so_sequence.wait);
-	put_rpccred(cred);
-	kfree(sp);
+
+	list_for_each_entry_safe(sp, tmp, &doomed, so_lru) {
+		list_del(&sp->so_lru);
+		nfs4_free_state_owner(sp);
+	}
 }
 
 static struct nfs4_state *
@@ -1393,6 +1463,7 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		nfs4_purge_state_owners(server);
 		spin_lock(&clp->cl_lock);
 		for (pos = rb_first(&server->state_owners);
 		     pos != NULL;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b5479df8378d..ba4d7656ecfd 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -153,6 +153,7 @@ struct nfs_server {
 	struct rb_root		openowner_id;
 	struct rb_root		lockowner_id;
 #endif
+	struct list_head	state_owners_lru;
 	struct list_head	layouts;
 	struct list_head	delegations;
 	void (*destroy)(struct nfs_server *);

From 5c0b4129c07b902b27d3f3ebc087757f534a3abd Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 6 Jan 2012 09:28:12 +0200
Subject: [PATCH 13/17] pnfs-obj: pNFS errors are communicated on
 iodata->pnfs_error

Some time along the way pNFS IO errors were switched to
communicate with a special iodata->pnfs_error member instead
of the regular RPC members. But objlayout was not switched
over.

Fix that!
Without this fix any IO error is hanged, because IO is not
switched to MDS and pages are never cleared or read.

[Applies to 3.2.0. Same bug different patch for 3.1/0 Kernels]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/objlayout/objlayout.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c
index 72074e3a04f9..b3c29039f5b8 100644
--- a/fs/nfs/objlayout/objlayout.c
+++ b/fs/nfs/objlayout/objlayout.c
@@ -254,6 +254,8 @@ objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 	oir->status = rdata->task.tk_status = status;
 	if (status >= 0)
 		rdata->res.count = status;
+	else
+		rdata->pnfs_error = status;
 	objlayout_iodone(oir);
 	/* must not use oir after this point */
 
@@ -334,6 +336,8 @@ objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync)
 	if (status >= 0) {
 		wdata->res.count = status;
 		wdata->verf.committed = oir->committed;
+	} else {
+		wdata->pnfs_error = status;
 	}
 	objlayout_iodone(oir);
 	/* must not use oir after this point */

From fe0fe83585f88346557868a803a479dfaaa0688a Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Fri, 6 Jan 2012 09:31:20 +0200
Subject: [PATCH 14/17] pnfs-obj: Must return layout on IO error

As mandated by the standard. In case of an IO error, a pNFS
objects layout driver must return it's layout. This is because
all device errors are reported to the server as part of the
layout return buffer.

This is implemented the same way PNFS_LAYOUTRET_ON_SETATTR
is done, through a bit flag on the pnfs_layoutdriver_type->flags
member. The flag is set by the layout driver that wants a
layout_return preformed at pnfs_ld_{write,read}_done in case
of an error.
(Though I have not defined a wrapper like pnfs_ld_layoutret_on_setattr
 because this code is never called outside of pnfs.c and pnfs IO
 paths)

Without this patch 3.[0-2] Kernels leak memory and have an annoying
WARN_ON after every IO error utilizing the pnfs-obj driver.

[This patch is for 3.2 Kernel. 3.1/0 Kernels need a different patch]
CC: Stable Tree <stable@kernel.org>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/objlayout/objio_osd.c |  3 ++-
 fs/nfs/pnfs.c                | 12 ++++++++++++
 fs/nfs/pnfs.h                |  1 +
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index c807ab93140e..55d01280a609 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -551,7 +551,8 @@ static const struct nfs_pageio_ops objio_pg_write_ops = {
 static struct pnfs_layoutdriver_type objlayout_type = {
 	.id = LAYOUT_OSD2_OBJECTS,
 	.name = "LAYOUT_OSD2_OBJECTS",
-	.flags                   = PNFS_LAYOUTRET_ON_SETATTR,
+	.flags                   = PNFS_LAYOUTRET_ON_SETATTR |
+				   PNFS_LAYOUTRET_ON_ERROR,
 
 	.alloc_layout_hdr        = objlayout_alloc_layout_hdr,
 	.free_layout_hdr         = objlayout_free_layout_hdr,
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 8e672a2b2d69..f881a6387942 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1178,6 +1178,15 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 		put_lseg(data->lseg);
 		data->lseg = NULL;
 		dprintk("pnfs write error = %d\n", data->pnfs_error);
+		if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+						PNFS_LAYOUTRET_ON_ERROR) {
+			/* Don't lo_commit on error, Server will needs to
+			 * preform a file recovery.
+			 */
+			clear_bit(NFS_INO_LAYOUTCOMMIT,
+				  &NFS_I(data->inode)->flags);
+			pnfs_return_layout(data->inode);
+		}
 	}
 	data->mds_ops->rpc_release(data);
 }
@@ -1267,6 +1276,9 @@ static void pnfs_ld_handle_read_error(struct nfs_read_data *data)
 	put_lseg(data->lseg);
 	data->lseg = NULL;
 	dprintk("pnfs write error = %d\n", data->pnfs_error);
+	if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
+						PNFS_LAYOUTRET_ON_ERROR)
+		pnfs_return_layout(data->inode);
 
 	nfs_pageio_init_read_mds(&pgio, data->inode);
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 1509530cb111..53d593a0a4f2 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -68,6 +68,7 @@ enum {
 enum layoutdriver_policy_flags {
 	/* Should the pNFS client commit and return the layout upon a setattr */
 	PNFS_LAYOUTRET_ON_SETATTR	= 1 << 0,
+	PNFS_LAYOUTRET_ON_ERROR		= 1 << 1,
 };
 
 struct nfs4_deviceid_node;

From e2fecb215b321db0e4a5b2597349a63c07bec42f Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 6 Jan 2012 08:57:46 -0500
Subject: [PATCH 15/17] NFS: Remove pNFS bloat from the generic write path

We have no business doing any this in the standard write release path.
Get rid of it, and put it in the pNFS layer.

Also, while we're at it, get rid of the completely bogus unlock/relock
semantics that were present in nfs_writeback_release_full(). It is
not only unnecessary, but actually dangerous to release the write lock
just in order to take it again in nfs_page_async_flush(). Better just
to open code the pgio operations in a pnfs helper.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/internal.h |  2 ++
 fs/nfs/pnfs.c     | 30 ++++++++++++++++++++++++++++--
 fs/nfs/write.c    | 27 ++-------------------------
 3 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 3f4d95751d52..5ee92538b063 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -307,6 +307,8 @@ extern void nfs_readdata_release(struct nfs_read_data *rdata);
 /* write.c */
 extern int nfs_generic_flush(struct nfs_pageio_descriptor *desc,
 		struct list_head *head);
+extern void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+				  struct inode *inode, int ioflags);
 extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio);
 extern void nfs_writedata_release(struct nfs_write_data *wdata);
 extern void nfs_commit_free(struct nfs_write_data *p);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index f881a6387942..17149a490065 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1166,6 +1166,33 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
 }
 EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
 
+static int pnfs_write_done_resend_to_mds(struct inode *inode, struct list_head *head)
+{
+	struct nfs_pageio_descriptor pgio;
+	LIST_HEAD(failed);
+
+	/* Resend all requests through the MDS */
+	nfs_pageio_init_write_mds(&pgio, inode, FLUSH_STABLE);
+	while (!list_empty(head)) {
+		struct nfs_page *req = nfs_list_entry(head->next);
+
+		nfs_list_remove_request(req);
+		if (!nfs_pageio_add_request(&pgio, req))
+			nfs_list_add_request(req, &failed);
+	}
+	nfs_pageio_complete(&pgio);
+
+	if (!list_empty(&failed)) {
+		/* For some reason our attempt to resend pages. Mark the
+		 * overall send request as having failed, and let
+		 * nfs_writeback_release_full deal with the error.
+		 */
+		list_move(&failed, head);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
  * Called by non rpc-based layout drivers
  */
@@ -1175,8 +1202,6 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 		pnfs_set_layoutcommit(data);
 		data->mds_ops->rpc_call_done(&data->task, data);
 	} else {
-		put_lseg(data->lseg);
-		data->lseg = NULL;
 		dprintk("pnfs write error = %d\n", data->pnfs_error);
 		if (NFS_SERVER(data->inode)->pnfs_curr_ld->flags &
 						PNFS_LAYOUTRET_ON_ERROR) {
@@ -1187,6 +1212,7 @@ void pnfs_ld_write_done(struct nfs_write_data *data)
 				  &NFS_I(data->inode)->flags);
 			pnfs_return_layout(data->inode);
 		}
+		data->task.tk_status = pnfs_write_done_resend_to_mds(data->inode, &data->pages);
 	}
 	data->mds_ops->rpc_release(data);
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 1dda78db6a73..0c3885255f97 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1052,7 +1052,7 @@ static const struct nfs_pageio_ops nfs_pageio_write_ops = {
 	.pg_doio = nfs_generic_pg_writepages,
 };
 
-static void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
+void nfs_pageio_init_write_mds(struct nfs_pageio_descriptor *pgio,
 				  struct inode *inode, int ioflags)
 {
 	nfs_pageio_init(pgio, inode, &nfs_pageio_write_ops,
@@ -1166,13 +1166,7 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata)
 static void nfs_writeback_release_full(void *calldata)
 {
 	struct nfs_write_data	*data = calldata;
-	int ret, status = data->task.tk_status;
-	struct nfs_pageio_descriptor pgio;
-
-	if (data->pnfs_error) {
-		nfs_pageio_init_write_mds(&pgio, data->inode, FLUSH_STABLE);
-		pgio.pg_recoalesce = 1;
-	}
+	int status = data->task.tk_status;
 
 	/* Update attributes as result of writeback. */
 	while (!list_empty(&data->pages)) {
@@ -1188,11 +1182,6 @@ static void nfs_writeback_release_full(void *calldata)
 			req->wb_bytes,
 			(long long)req_offset(req));
 
-		if (data->pnfs_error) {
-			dprintk(", pnfs error = %d\n", data->pnfs_error);
-			goto next;
-		}
-
 		if (status < 0) {
 			nfs_set_pageerror(page);
 			nfs_context_set_write_error(req->wb_context, status);
@@ -1212,19 +1201,7 @@ remove_request:
 	next:
 		nfs_clear_page_tag_locked(req);
 		nfs_end_page_writeback(page);
-		if (data->pnfs_error) {
-			lock_page(page);
-			nfs_pageio_cond_complete(&pgio, page->index);
-			ret = nfs_page_async_flush(&pgio, page, 0);
-			if (ret) {
-				nfs_set_pageerror(page);
-				dprintk("rewrite to MDS error = %d\n", ret);
-			}
-			unlock_page(page);
-		}
 	}
-	if (data->pnfs_error)
-		nfs_pageio_complete(&pgio);
 	nfs_writedata_release(calldata);
 }
 

From 6926afd1925a54a13684ebe05987868890665e2b Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 7 Jan 2012 13:22:46 -0500
Subject: [PATCH 16/17] NFSv4: Save the owner/group name string when doing open

...so that we can do the uid/gid mapping outside the asynchronous RPC
context.
This fixes a bug in the current NFSv4 atomic open code where the client
isn't able to determine what the true uid/gid fields of the file are,
(because the asynchronous nature of the OPEN call denies it the ability
to do an upcall) and so fills them with default values, marking the
inode as needing revalidation.
Unfortunately, in some cases, the VFS will do some additional sanity
checks on the file, and may override the server's decision to allow
the open because it sees the wrong owner/group fields.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/idmap.c            |  83 +++++++++++++++++++++++++++++
 fs/nfs/inode.c            |   2 +
 fs/nfs/nfs4proc.c         |  10 ++++
 fs/nfs/nfs4xdr.c          | 106 +++++++++++++++++---------------------
 include/linux/nfs_idmap.h |   8 +++
 include/linux/nfs_xdr.h   |  17 ++++--
 6 files changed, 162 insertions(+), 64 deletions(-)

diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 47d1c6ff2d8e..2c05f1991e1e 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -38,6 +38,89 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/nfs_idmap.h>
+#include <linux/nfs_fs.h>
+
+/**
+ * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields
+ * @fattr: fully initialised struct nfs_fattr
+ * @owner_name: owner name string cache
+ * @group_name: group name string cache
+ */
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+		struct nfs4_string *owner_name,
+		struct nfs4_string *group_name)
+{
+	fattr->owner_name = owner_name;
+	fattr->group_name = group_name;
+}
+
+static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr)
+{
+	fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME;
+	kfree(fattr->owner_name->data);
+}
+
+static void nfs_fattr_free_group_name(struct nfs_fattr *fattr)
+{
+	fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME;
+	kfree(fattr->group_name->data);
+}
+
+static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *owner = fattr->owner_name;
+	__u32 uid;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME))
+		return false;
+	if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) {
+		fattr->uid = uid;
+		fattr->valid |= NFS_ATTR_FATTR_OWNER;
+	}
+	return true;
+}
+
+static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	struct nfs4_string *group = fattr->group_name;
+	__u32 gid;
+
+	if (!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME))
+		return false;
+	if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) {
+		fattr->gid = gid;
+		fattr->valid |= NFS_ATTR_FATTR_GROUP;
+	}
+	return true;
+}
+
+/**
+ * nfs_fattr_free_names - free up the NFSv4 owner and group strings
+ * @fattr: a fully initialised nfs_fattr structure
+ */
+void nfs_fattr_free_names(struct nfs_fattr *fattr)
+{
+	if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)
+		nfs_fattr_free_owner_name(fattr);
+	if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)
+		nfs_fattr_free_group_name(fattr);
+}
+
+/**
+ * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free
+ * @server: pointer to the filesystem nfs_server structure
+ * @fattr: a fully initialised nfs_fattr structure
+ *
+ * This helper maps the cached NFSv4 owner/group strings in fattr into
+ * their numeric uid/gid equivalents, and then frees the cached strings.
+ */
+void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr)
+{
+	if (nfs_fattr_map_owner_name(server, fattr))
+		nfs_fattr_free_owner_name(fattr);
+	if (nfs_fattr_map_group_name(server, fattr))
+		nfs_fattr_free_group_name(fattr);
+}
 
 static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
 {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 50a15fa8cf98..f59cab12a8ee 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1019,6 +1019,8 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
 	fattr->valid = 0;
 	fattr->time_start = jiffies;
 	fattr->gencount = nfs_inc_attr_generation_counter();
+	fattr->owner_name = NULL;
+	fattr->group_name = NULL;
 }
 
 struct nfs_fattr *nfs_alloc_fattr(void)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3b1080118452..df3d3068242e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -52,6 +52,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/module.h>
+#include <linux/nfs_idmap.h>
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
@@ -760,6 +761,8 @@ struct nfs4_opendata {
 	struct nfs_openres o_res;
 	struct nfs_open_confirmargs c_arg;
 	struct nfs_open_confirmres c_res;
+	struct nfs4_string owner_name;
+	struct nfs4_string group_name;
 	struct nfs_fattr f_attr;
 	struct nfs_fattr dir_attr;
 	struct dentry *dir;
@@ -783,6 +786,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
 	p->o_res.server = p->o_arg.server;
 	nfs_fattr_init(&p->f_attr);
 	nfs_fattr_init(&p->dir_attr);
+	nfs_fattr_init_names(&p->f_attr, &p->owner_name, &p->group_name);
 }
 
 static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
@@ -814,6 +818,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
+	p->o_arg.dir_bitmask = server->cache_consistency_bitmask;
 	p->o_arg.claim = NFS4_OPEN_CLAIM_NULL;
 	if (flags & O_CREAT) {
 		u32 *s;
@@ -850,6 +855,7 @@ static void nfs4_opendata_free(struct kref *kref)
 	dput(p->dir);
 	dput(p->dentry);
 	nfs_sb_deactive(sb);
+	nfs_fattr_free_names(&p->f_attr);
 	kfree(p);
 }
 
@@ -1574,6 +1580,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data)
 	if (status != 0 || !data->rpc_done)
 		return status;
 
+	nfs_fattr_map_and_free_names(NFS_SERVER(dir), &data->f_attr);
+
 	nfs_refresh_inode(dir, o_res->dir_attr);
 
 	if (o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) {
@@ -1606,6 +1614,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data)
 		return status;
 	}
 
+	nfs_fattr_map_and_free_names(server, &data->f_attr);
+
 	if (o_arg->open_flags & O_CREAT) {
 		update_changeattr(dir, &o_res->cinfo);
 		nfs_post_op_update_inode(dir, o_res->dir_attr);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index dcaf69309d8e..95e92e438407 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -2298,7 +2298,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr,
 	encode_getfh(xdr, &hdr);
 	encode_getfattr(xdr, args->bitmask, &hdr);
 	encode_restorefh(xdr, &hdr);
-	encode_getfattr(xdr, args->bitmask, &hdr);
+	encode_getfattr(xdr, args->dir_bitmask, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -3792,7 +3792,8 @@ out_overflow:
 }
 
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
-		const struct nfs_server *server, uint32_t *uid, int may_sleep)
+		const struct nfs_server *server, uint32_t *uid,
+		struct nfs4_string *owner_name)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3809,8 +3810,12 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap,
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
-		if (!may_sleep) {
-			/* do nothing */
+		if (owner_name != NULL) {
+			owner_name->data = kmemdup(p, len, GFP_NOWAIT);
+			if (owner_name->data != NULL) {
+				owner_name->len = len;
+				ret = NFS_ATTR_FATTR_OWNER_NAME;
+			}
 		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_name_to_uid(server, (char *)p, len, uid) == 0)
 				ret = NFS_ATTR_FATTR_OWNER;
@@ -3830,7 +3835,8 @@ out_overflow:
 }
 
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
-		const struct nfs_server *server, uint32_t *gid, int may_sleep)
+		const struct nfs_server *server, uint32_t *gid,
+		struct nfs4_string *group_name)
 {
 	uint32_t len;
 	__be32 *p;
@@ -3847,8 +3853,12 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap,
 		p = xdr_inline_decode(xdr, len);
 		if (unlikely(!p))
 			goto out_overflow;
-		if (!may_sleep) {
-			/* do nothing */
+		if (group_name != NULL) {
+			group_name->data = kmemdup(p, len, GFP_NOWAIT);
+			if (group_name->data != NULL) {
+				group_name->len = len;
+				ret = NFS_ATTR_FATTR_GROUP_NAME;
+			}
 		} else if (len < XDR_MAX_NETOBJ) {
 			if (nfs_map_group_to_gid(server, (char *)p, len, gid) == 0)
 				ret = NFS_ATTR_FATTR_GROUP;
@@ -4285,7 +4295,7 @@ xdr_error:
 
 static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		struct nfs_fattr *fattr, struct nfs_fh *fh,
-		const struct nfs_server *server, int may_sleep)
+		const struct nfs_server *server)
 {
 	int status;
 	umode_t fmode = 0;
@@ -4352,12 +4362,12 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, may_sleep);
+	status = decode_attr_owner(xdr, bitmap, server, &fattr->uid, fattr->owner_name);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
 
-	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, may_sleep);
+	status = decode_attr_group(xdr, bitmap, server, &fattr->gid, fattr->group_name);
 	if (status < 0)
 		goto xdr_error;
 	fattr->valid |= status;
@@ -4398,7 +4408,7 @@ xdr_error:
 }
 
 static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fattr,
-		struct nfs_fh *fh, const struct nfs_server *server, int may_sleep)
+		struct nfs_fh *fh, const struct nfs_server *server)
 {
 	__be32 *savep;
 	uint32_t attrlen,
@@ -4417,7 +4427,7 @@ static int decode_getfattr_generic(struct xdr_stream *xdr, struct nfs_fattr *fat
 	if (status < 0)
 		goto xdr_error;
 
-	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server, may_sleep);
+	status = decode_getfattr_attrs(xdr, bitmap, fattr, fh, server);
 	if (status < 0)
 		goto xdr_error;
 
@@ -4428,9 +4438,9 @@ xdr_error:
 }
 
 static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
-		const struct nfs_server *server, int may_sleep)
+		const struct nfs_server *server)
 {
-	return decode_getfattr_generic(xdr, fattr, NULL, server, may_sleep);
+	return decode_getfattr_generic(xdr, fattr, NULL, server);
 }
 
 /*
@@ -5711,8 +5721,7 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp,
 	status = decode_open_downgrade(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -5738,8 +5747,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_access(xdr, res);
 	if (status != 0)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -5768,8 +5776,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	status = decode_getfattr(xdr, res->fattr, res->server
-			,!RPC_IS_ASYNC(rqstp->rq_task));
+	status = decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -5795,8 +5802,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp,
 		goto out;
 	status = decode_getfh(xdr, res->fh);
 	if (status == 0)
-		status = decode_getfattr(xdr, res->fattr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task));
+		status = decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -5822,8 +5828,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_remove(xdr, &res->cinfo);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->dir_attr, res->server);
 out:
 	return status;
 }
@@ -5856,14 +5861,12 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	/* Current FH is target directory */
-	if (decode_getfattr(xdr, res->new_fattr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+	if (decode_getfattr(xdr, res->new_fattr, res->server))
 		goto out;
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->old_fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->old_fattr, res->server);
 out:
 	return status;
 }
@@ -5899,14 +5902,12 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	 * Note order: OP_LINK leaves the directory as the current
 	 *             filehandle.
 	 */
-	if (decode_getfattr(xdr, res->dir_attr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+	if (decode_getfattr(xdr, res->dir_attr, res->server))
 		goto out;
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -5938,14 +5939,12 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_getfh(xdr, res->fh);
 	if (status)
 		goto out;
-	if (decode_getfattr(xdr, res->fattr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+	if (decode_getfattr(xdr, res->fattr, res->server))
 		goto out;
 	status = decode_restorefh(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->dir_fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->dir_fattr, res->server);
 out:
 	return status;
 }
@@ -5977,8 +5976,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	status = decode_putfh(xdr);
 	if (status)
 		goto out;
-	status = decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	status = decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6076,8 +6074,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	 * 	an ESTALE error. Shouldn't be a problem,
 	 * 	though, since fattr->valid will remain unset.
 	 */
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6108,13 +6105,11 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 		goto out;
 	if (decode_getfh(xdr, &res->fh) != 0)
 		goto out;
-	if (decode_getfattr(xdr, res->f_attr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task)) != 0)
+	if (decode_getfattr(xdr, res->f_attr, res->server) != 0)
 		goto out;
 	if (decode_restorefh(xdr) != 0)
 		goto out;
-	decode_getfattr(xdr, res->dir_attr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->dir_attr, res->server);
 out:
 	return status;
 }
@@ -6162,8 +6157,7 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp,
 	status = decode_open(xdr, res);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->f_attr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->f_attr, res->server);
 out:
 	return status;
 }
@@ -6190,8 +6184,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp,
 	status = decode_setattr(xdr);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6371,8 +6364,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	if (res->fattr)
-		decode_getfattr(xdr, res->fattr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task));
+		decode_getfattr(xdr, res->fattr, res->server);
 	if (!status)
 		status = res->count;
 out:
@@ -6401,8 +6393,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	if (status)
 		goto out;
 	if (res->fattr)
-		decode_getfattr(xdr, res->fattr, res->server,
-				!RPC_IS_ASYNC(rqstp->rq_task));
+		decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6561,8 +6552,7 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp,
 	status = decode_delegreturn(xdr);
 	if (status != 0)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6591,8 +6581,7 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req,
 		goto out;
 	xdr_enter_page(xdr, PAGE_SIZE);
 	status = decode_getfattr(xdr, &res->fs_locations->fattr,
-				 res->fs_locations->server,
-				 !RPC_IS_ASYNC(req->rq_task));
+				 res->fs_locations->server);
 out:
 	return status;
 }
@@ -6841,8 +6830,7 @@ static int nfs4_xdr_dec_layoutcommit(struct rpc_rqst *rqstp,
 	status = decode_layoutcommit(xdr, rqstp, res);
 	if (status)
 		goto out;
-	decode_getfattr(xdr, res->fattr, res->server,
-			!RPC_IS_ASYNC(rqstp->rq_task));
+	decode_getfattr(xdr, res->fattr, res->server);
 out:
 	return status;
 }
@@ -6973,7 +6961,7 @@ int nfs4_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
 		goto out_overflow;
 
 	if (decode_getfattr_attrs(xdr, bitmap, entry->fattr, entry->fh,
-					entry->server, 1) < 0)
+					entry->server) < 0)
 		goto out_overflow;
 	if (entry->fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID)
 		entry->ino = entry->fattr->mounted_on_fileid;
diff --git a/include/linux/nfs_idmap.h b/include/linux/nfs_idmap.h
index ae7d6a380dae..308c18877018 100644
--- a/include/linux/nfs_idmap.h
+++ b/include/linux/nfs_idmap.h
@@ -66,6 +66,8 @@ struct idmap_msg {
 /* Forward declaration to make this header independent of others */
 struct nfs_client;
 struct nfs_server;
+struct nfs_fattr;
+struct nfs4_string;
 
 #ifdef CONFIG_NFS_USE_NEW_IDMAPPER
 
@@ -97,6 +99,12 @@ void nfs_idmap_delete(struct nfs_client *);
 
 #endif /* CONFIG_NFS_USE_NEW_IDMAPPER */
 
+void nfs_fattr_init_names(struct nfs_fattr *fattr,
+		struct nfs4_string *owner_name,
+		struct nfs4_string *group_name);
+void nfs_fattr_free_names(struct nfs_fattr *);
+void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *);
+
 int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, __u32 *);
 int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, __u32 *);
 int nfs_map_uid_to_name(const struct nfs_server *, __u32, char *, size_t);
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6c898afe6095..a764cef06b73 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -18,6 +18,11 @@
 /* Forward declaration for NFS v3 */
 struct nfs4_secinfo_flavors;
 
+struct nfs4_string {
+	unsigned int len;
+	char *data;
+};
+
 struct nfs_fsid {
 	uint64_t		major;
 	uint64_t		minor;
@@ -61,6 +66,8 @@ struct nfs_fattr {
 	struct timespec		pre_ctime;	/* pre_op_attr.ctime	  */
 	unsigned long		time_start;
 	unsigned long		gencount;
+	struct nfs4_string	*owner_name;
+	struct nfs4_string	*group_name;
 };
 
 #define NFS_ATTR_FATTR_TYPE		(1U << 0)
@@ -85,6 +92,8 @@ struct nfs_fattr {
 #define NFS_ATTR_FATTR_V4_REFERRAL	(1U << 19)	/* NFSv4 referral */
 #define NFS_ATTR_FATTR_MOUNTPOINT	(1U << 20)	/* Treat as mountpoint */
 #define NFS_ATTR_FATTR_MOUNTED_ON_FILEID		(1U << 21)
+#define NFS_ATTR_FATTR_OWNER_NAME	(1U << 22)
+#define NFS_ATTR_FATTR_GROUP_NAME	(1U << 23)
 
 #define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
 		| NFS_ATTR_FATTR_MODE \
@@ -324,6 +333,7 @@ struct nfs_openargs {
 	const struct qstr *	name;
 	const struct nfs_server *server;	 /* Needed for ID mapping */
 	const u32 *		bitmask;
+	const u32 *		dir_bitmask;
 	__u32			claim;
 	struct nfs4_sequence_args	seq_args;
 };
@@ -342,6 +352,8 @@ struct nfs_openres {
 	__u32			do_recall;
 	__u64			maxsize;
 	__u32			attrset[NFS4_BITMAP_SIZE];
+	struct nfs4_string	*owner;
+	struct nfs4_string	*group_owner;
 	struct nfs4_sequence_res	seq_res;
 };
 
@@ -778,11 +790,6 @@ struct nfs3_getaclres {
 	struct posix_acl *	acl_default;
 };
 
-struct nfs4_string {
-	unsigned int len;
-	char *data;
-};
-
 #ifdef CONFIG_NFS_V4
 
 typedef u64 clientid4;

From 074b1d12fe2500d7d453902f9266e6674b30d84c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 9 Jan 2012 13:46:26 -0500
Subject: [PATCH 17/17] NFSv4: Change the default setting of the
 nfs4_disable_idmapping parameter

Now that the use of numeric uids/gids is officially sanctioned in
RFC3530bis, it is time to change the default here to 'enabled'.

By doing so, we ensure that NFSv4 copies the behaviour of NFSv3 when we're
using the default AUTH_SYS authentication (i.e. when the client uses the
numeric uids/gids as authentication tokens), so that when new files are
created, they will appear to have the correct user/group.
It also fixes a number of backward compatibility issues when migrating
from NFSv3 to NFSv4 on a platform where the server uses different uid/gid
mappings than the client.

Note also that this setting has been successfully tested against servers
that do not support numeric uids/gids at several Connectathon/Bakeathon
events at this point, and the fall back to using string names/groups has
been shown to work well in all those test cases.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 Documentation/kernel-parameters.txt | 17 +++++++++++------
 fs/nfs/client.c                     |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 81c287fad79d..dfd21cfdf579 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1630,12 +1630,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			The default is to return 64-bit inode numbers.
 
 	nfs.nfs4_disable_idmapping=
-			[NFSv4] When set, this option disables the NFSv4
-			idmapper on the client, but only if the mount
-			is using the 'sec=sys' security flavour. This may
-			make migration from legacy NFSv2/v3 systems easier
-			provided that the server has the appropriate support.
-			The default is to always enable NFSv4 idmapping.
+			[NFSv4] When set to the default of '1', this option
+			ensures that both the RPC level authentication
+			scheme and the NFS level operations agree to use
+			numeric uids/gids if the mount is using the
+			'sec=sys' security flavour. In effect it is
+			disabling idmapping, which can make migration from
+			legacy NFSv2/v3 systems to NFSv4 easier.
+			Servers that do not support this mode of operation
+			will be autodetected by the client, and it will fall
+			back to using the idmapper.
+			To turn off this behaviour, set the value to '0'.
 
 	nmi_debug=	[KNL,AVR32,SH] Specify one or more actions to take
 			when a NMI is triggered.
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 41bd67f80d31..277dfaf2e99a 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -84,7 +84,7 @@ retry:
 /*
  * Turn off NFSv4 uid/gid mapping when using AUTH_SYS
  */
-static int nfs4_disable_idmapping = 0;
+static int nfs4_disable_idmapping = 1;
 
 /*
  * RPC cruft for NFS