From 3e25eb9c4bb649acdddb333d10774b640190f727 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 10:20:35 -0500
Subject: [PATCH 1/9] securityfs: fix object creation races

inode needs to be fully set up before we feed it to d_instantiate().
securityfs_create_file() does *not* do so; it sets ->i_fop and
->i_private only after we'd exposed the inode.  Unfortunately,
that's done fairly deep in call chain, so the amount of churn
is considerable.  Helper functions killed by substituting into
their solitary call sites, dead code removed.  We finally can
bury default_file_ops, now that the final value of ->i_fop is
available (and assigned) at the point where inode is allocated.

Reviewed-by: James Morris <jmorris@namei.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 security/inode.c | 193 +++++++++++++----------------------------------
 1 file changed, 51 insertions(+), 142 deletions(-)

diff --git a/security/inode.c b/security/inode.c
index 90a70a67d835..43ce6e19015f 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -25,100 +25,6 @@
 static struct vfsmount *mount;
 static int mount_count;
 
-/*
- * TODO:
- *   I think I can get rid of these default_file_ops, but not quite sure...
- */
-static ssize_t default_read_file(struct file *file, char __user *buf,
-				 size_t count, loff_t *ppos)
-{
-	return 0;
-}
-
-static ssize_t default_write_file(struct file *file, const char __user *buf,
-				   size_t count, loff_t *ppos)
-{
-	return count;
-}
-
-static int default_open(struct inode *inode, struct file *file)
-{
-	if (inode->i_private)
-		file->private_data = inode->i_private;
-
-	return 0;
-}
-
-static const struct file_operations default_file_ops = {
-	.read =		default_read_file,
-	.write =	default_write_file,
-	.open =		default_open,
-	.llseek =	noop_llseek,
-};
-
-static struct inode *get_inode(struct super_block *sb, umode_t mode, dev_t dev)
-{
-	struct inode *inode = new_inode(sb);
-
-	if (inode) {
-		inode->i_ino = get_next_ino();
-		inode->i_mode = mode;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		switch (mode & S_IFMT) {
-		default:
-			init_special_inode(inode, mode, dev);
-			break;
-		case S_IFREG:
-			inode->i_fop = &default_file_ops;
-			break;
-		case S_IFDIR:
-			inode->i_op = &simple_dir_inode_operations;
-			inode->i_fop = &simple_dir_operations;
-
-			/* directory inodes start off with i_nlink == 2 (for "." entry) */
-			inc_nlink(inode);
-			break;
-		}
-	}
-	return inode;
-}
-
-/* SMP-safe */
-static int mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, dev_t dev)
-{
-	struct inode *inode;
-	int error = -ENOMEM;
-
-	if (dentry->d_inode)
-		return -EEXIST;
-
-	inode = get_inode(dir->i_sb, mode, dev);
-	if (inode) {
-		d_instantiate(dentry, inode);
-		dget(dentry);
-		error = 0;
-	}
-	return error;
-}
-
-static int mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int res;
-
-	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
-	res = mknod(dir, dentry, mode, 0);
-	if (!res)
-		inc_nlink(dir);
-	return res;
-}
-
-static int create(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	mode = (mode & S_IALLUGO) | S_IFREG;
-	return mknod(dir, dentry, mode, 0);
-}
-
 static inline int positive(struct dentry *dentry)
 {
 	return dentry->d_inode && !d_unhashed(dentry);
@@ -145,38 +51,6 @@ static struct file_system_type fs_type = {
 	.kill_sb =	kill_litter_super,
 };
 
-static int create_by_name(const char *name, umode_t mode,
-			  struct dentry *parent,
-			  struct dentry **dentry)
-{
-	int error = 0;
-
-	*dentry = NULL;
-
-	/* If the parent is not specified, we create it in the root.
-	 * We need the root dentry to do this, which is in the super
-	 * block. A pointer to that is in the struct vfsmount that we
-	 * have around.
-	 */
-	if (!parent)
-		parent = mount->mnt_root;
-
-	mutex_lock(&parent->d_inode->i_mutex);
-	*dentry = lookup_one_len(name, parent, strlen(name));
-	if (!IS_ERR(*dentry)) {
-		if (S_ISDIR(mode))
-			error = mkdir(parent->d_inode, *dentry, mode);
-		else
-			error = create(parent->d_inode, *dentry, mode);
-		if (error)
-			dput(*dentry);
-	} else
-		error = PTR_ERR(*dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-
-	return error;
-}
-
 /**
  * securityfs_create_file - create a file in the securityfs filesystem
  *
@@ -209,31 +83,66 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
-	struct dentry *dentry = NULL;
+	struct dentry *dentry;
+	int is_dir = S_ISDIR(mode);
+	struct inode *dir, *inode;
 	int error;
 
+	if (!is_dir) {
+		BUG_ON(!fops);
+		mode = (mode & S_IALLUGO) | S_IFREG;
+	}
+
 	pr_debug("securityfs: creating file '%s'\n",name);
 
 	error = simple_pin_fs(&fs_type, &mount, &mount_count);
-	if (error) {
-		dentry = ERR_PTR(error);
-		goto exit;
-	}
+	if (error)
+		return ERR_PTR(error);
 
-	error = create_by_name(name, mode, parent, &dentry);
-	if (error) {
-		dentry = ERR_PTR(error);
-		simple_release_fs(&mount, &mount_count);
-		goto exit;
-	}
+	if (!parent)
+		parent = mount->mnt_root;
+
+	dir = parent->d_inode;
+
+	mutex_lock(&dir->i_mutex);
+	dentry = lookup_one_len(name, parent, strlen(name));
+	if (IS_ERR(dentry))
+		goto out;
 
 	if (dentry->d_inode) {
-		if (fops)
-			dentry->d_inode->i_fop = fops;
-		if (data)
-			dentry->d_inode->i_private = data;
+		error = -EEXIST;
+		goto out1;
 	}
-exit:
+
+	inode = new_inode(dir->i_sb);
+	if (!inode) {
+		error = -ENOMEM;
+		goto out1;
+	}
+
+	inode->i_ino = get_next_ino();
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_private = data;
+	if (is_dir) {
+		inode->i_op = &simple_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+		inc_nlink(inode);
+		inc_nlink(dir);
+	} else {
+		inode->i_fop = fops;
+	}
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	mutex_unlock(&dir->i_mutex);
+	return dentry;
+
+out1:
+	dput(dentry);
+	dentry = ERR_PTR(error);
+out:
+	mutex_unlock(&dir->i_mutex);
+	simple_release_fs(&mount, &mount_count);
 	return dentry;
 }
 EXPORT_SYMBOL_GPL(securityfs_create_file);

From 0b2c4e39c014219ef73f05ab580c284bf8e6af0a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 10:46:03 -0500
Subject: [PATCH 2/9] coda: deal correctly with allocation failure from
 coda_cnode_makectl()

lookup should fail with ENOMEM, not silently make dentry negative.
Switched to saner calling conventions, while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/cnode.c     | 21 +++++++++------------
 fs/coda/coda_fs_i.h |  2 +-
 fs/coda/dir.c       |  4 ++--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 6475877b0763..8af67c9c47dd 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -156,19 +156,16 @@ struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb)
 }
 
 /* the CONTROL inode is made without asking attributes from Venus */
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb)
+struct inode *coda_cnode_makectl(struct super_block *sb)
 {
-	int error = -ENOMEM;
-
-	*inode = new_inode(sb);
-	if (*inode) {
-		(*inode)->i_ino = CTL_INO;
-		(*inode)->i_op = &coda_ioctl_inode_operations;
-		(*inode)->i_fop = &coda_ioctl_operations;
-		(*inode)->i_mode = 0444;
-		error = 0;
+	struct inode *inode = new_inode(sb);
+	if (inode) {
+		inode->i_ino = CTL_INO;
+		inode->i_op = &coda_ioctl_inode_operations;
+		inode->i_fop = &coda_ioctl_operations;
+		inode->i_mode = 0444;
+		return inode;
 	}
-
-	return error;
+	return ERR_PTR(-ENOMEM);
 }
 
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index e35071b1de0e..1c17446f1afe 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -51,7 +51,7 @@ struct coda_file_info {
 
 int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
 struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
-int coda_cnode_makectl(struct inode **inode, struct super_block *sb);
+struct inode *coda_cnode_makectl(struct super_block *sb);
 struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
 void coda_replace_fid(struct inode *, struct CodaFid *, struct CodaFid *);
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 83d2fd8ec24b..df0f9c1b01d3 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -111,7 +111,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 
 	/* control object, create inode on the fly */
 	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		error = coda_cnode_makectl(&inode, dir->i_sb);
+		inode = coda_cnode_makectl(dir->i_sb);
 		type = CODA_NOCACHE;
 		goto exit;
 	}
@@ -125,7 +125,7 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 		return ERR_PTR(error);
 
 exit:
-	if (inode && (type & CODA_NOCACHE))
+	if (inode && !IS_ERR(inode) && (type & CODA_NOCACHE))
 		coda_flag_inode(inode, C_VATTR | C_PURGE);
 
 	return d_splice_alias(inode, entry);

From f4947fbce208990266920d51837e4e7ba9779db1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 11:11:49 -0500
Subject: [PATCH 3/9] coda: switch coda_cnode_make() to sane API as well, clean
 coda_lookup()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/cnode.c     | 17 +++++++----------
 fs/coda/coda_fs_i.h |  2 +-
 fs/coda/dir.c       | 29 +++++++++++++----------------
 fs/coda/inode.c     | 10 ++++++----
 4 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 8af67c9c47dd..911cf30d057d 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -88,24 +88,21 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid,
    - link the two up if this is needed
    - fill in the attributes
 */
-int coda_cnode_make(struct inode **inode, struct CodaFid *fid, struct super_block *sb)
+struct inode *coda_cnode_make(struct CodaFid *fid, struct super_block *sb)
 {
         struct coda_vattr attr;
+	struct inode *inode;
         int error;
         
 	/* We get inode numbers from Venus -- see venus source */
 	error = venus_getattr(sb, fid, &attr);
-	if ( error ) {
-	    *inode = NULL;
-	    return error;
-	} 
+	if (error)
+		return ERR_PTR(error);
 
-	*inode = coda_iget(sb, fid, &attr);
-	if ( IS_ERR(*inode) ) {
+	inode = coda_iget(sb, fid, &attr);
+	if (IS_ERR(inode))
 		printk("coda_cnode_make: coda_iget failed\n");
-                return PTR_ERR(*inode);
-        }
-	return 0;
+	return inode;
 }
 
 
diff --git a/fs/coda/coda_fs_i.h b/fs/coda/coda_fs_i.h
index 1c17446f1afe..b24fdfd8a3f0 100644
--- a/fs/coda/coda_fs_i.h
+++ b/fs/coda/coda_fs_i.h
@@ -49,7 +49,7 @@ struct coda_file_info {
 #define C_DYING       0x4   /* from venus (which died) */
 #define C_PURGE       0x8
 
-int coda_cnode_make(struct inode **, struct CodaFid *, struct super_block *);
+struct inode *coda_cnode_make(struct CodaFid *, struct super_block *);
 struct inode *coda_iget(struct super_block *sb, struct CodaFid *fid, struct coda_vattr *attr);
 struct inode *coda_cnode_makectl(struct super_block *sb);
 struct inode *coda_fid_to_inode(struct CodaFid *fid, struct super_block *sb);
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index df0f9c1b01d3..177515829062 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -96,12 +96,11 @@ const struct file_operations coda_dir_operations = {
 /* access routines: lookup, readlink, permission */
 static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struct nameidata *nd)
 {
-	struct inode *inode = NULL;
-	struct CodaFid resfid = { { 0, } };
-	int type = 0;
-	int error = 0;
+	struct super_block *sb = dir->i_sb;
 	const char *name = entry->d_name.name;
 	size_t length = entry->d_name.len;
+	struct inode *inode;
+	int type = 0;
 
 	if (length > CODA_MAXNAMLEN) {
 		printk(KERN_ERR "name too long: lookup, %s (%*s)\n",
@@ -111,23 +110,21 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, struc
 
 	/* control object, create inode on the fly */
 	if (coda_isroot(dir) && coda_iscontrol(name, length)) {
-		inode = coda_cnode_makectl(dir->i_sb);
+		inode = coda_cnode_makectl(sb);
 		type = CODA_NOCACHE;
-		goto exit;
+	} else {
+		struct CodaFid fid = { { 0, } };
+		int error = venus_lookup(sb, coda_i2f(dir), name, length,
+				     &type, &fid);
+		inode = !error ? coda_cnode_make(&fid, sb) : ERR_PTR(error);
 	}
 
-	error = venus_lookup(dir->i_sb, coda_i2f(dir), name, length,
-			     &type, &resfid);
-	if (!error)
-		error = coda_cnode_make(&inode, &resfid, dir->i_sb);
-
-	if (error && error != -ENOENT)
-		return ERR_PTR(error);
-
-exit:
-	if (inode && !IS_ERR(inode) && (type & CODA_NOCACHE))
+	if (!IS_ERR(inode) && (type & CODA_NOCACHE))
 		coda_flag_inode(inode, C_VATTR | C_PURGE);
 
+	if (inode == ERR_PTR(-ENOENT))
+		inode = NULL;
+
 	return d_splice_alias(inode, entry);
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 1c08a8cd673a..5e2e1b3f068d 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -204,10 +204,12 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
 	printk("coda_read_super: rootfid is %s\n", coda_f2s(&fid));
 	
 	/* make root inode */
-        error = coda_cnode_make(&root, &fid, sb);
-        if ( error || !root ) {
-	    printk("Failure of coda_cnode_make for root: error %d\n", error);
-	    goto error;
+        root = coda_cnode_make(&fid, sb);
+        if (IS_ERR(root)) {
+		error = PTR_ERR(root);
+		printk("Failure of coda_cnode_make for root: error %d\n", error);
+		root = NULL;
+		goto error;
 	} 
 
 	printk("coda_read_super: rootinode is %ld dev %s\n", 

From eaf5f9073533cde21c7121c136f1c3f072d9cf59 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Tue, 10 Jan 2012 18:22:25 +0100
Subject: [PATCH 4/9] fix shrink_dcache_parent() livelock

Two (or more) concurrent calls of shrink_dcache_parent() on the same dentry may
cause shrink_dcache_parent() to loop forever.

Here's what appears to happen:

1 - CPU0: select_parent(P) finds C and puts it on dispose list, returns 1

2 - CPU1: select_parent(P) locks P->d_lock

3 - CPU0: shrink_dentry_list() locks C->d_lock
   dentry_kill(C) tries to lock P->d_lock but fails, unlocks C->d_lock

4 - CPU1: select_parent(P) locks C->d_lock,
         moves C from dispose list being processed on CPU0 to the new
dispose list, returns 1

5 - CPU0: shrink_dentry_list() finds dispose list empty, returns

6 - Goto 2 with CPU0 and CPU1 switched

Basically select_parent() steals the dentry from shrink_dentry_list() and thinks
it found a new one, causing shrink_dentry_list() to think it's making progress
and loop over and over.

One way to trigger this is to make udev calls stat() on the sysfs file while it
is going away.

Having a file in /lib/udev/rules.d/ with only this one rule seems to the trick:

ATTR{vendor}=="0x8086", ATTR{device}=="0x10ca", ENV{PCI_SLOT_NAME}="%k", ENV{MATCHADDR}="$attr{address}", RUN+="/bin/true"

Then execute the following loop:

while true; do
        echo -bond0 > /sys/class/net/bonding_masters
        echo +bond0 > /sys/class/net/bonding_masters
        echo -bond1 > /sys/class/net/bonding_masters
        echo +bond1 > /sys/class/net/bonding_masters
done

One fix would be to check all callers and prevent concurrent calls to
shrink_dcache_parent().  But I think a better solution is to stop the
stealing behavior.

This patch adds a new dentry flag that is set when the dentry is added to the
dispose list.  The flag is cleared in dentry_lru_del() in case the dentry gets a
new reference just before being pruned.

If the dentry has this flag, select_parent() will skip it and let
shrink_dentry_list() retry pruning it.  With select_parent() skipping those
dentries there will not be the appearance of progress (new dentries found) when
there is none, hence shrink_dcache_parent() will not loop forever.

Set the flag is also set in prune_dcache_sb() for consistency as suggested by
Linus.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 17 ++++++++++++-----
 include/linux/dcache.h |  1 +
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 3c6d3113a255..616fedff011a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -243,6 +243,7 @@ static void dentry_lru_add(struct dentry *dentry)
 static void __dentry_lru_del(struct dentry *dentry)
 {
 	list_del_init(&dentry->d_lru);
+	dentry->d_flags &= ~DCACHE_SHRINK_LIST;
 	dentry->d_sb->s_nr_dentry_unused--;
 	dentry_stat.nr_unused--;
 }
@@ -806,6 +807,7 @@ void prune_dcache_sb(struct super_block *sb, int count)
 			spin_unlock(&dentry->d_lock);
 		} else {
 			list_move_tail(&dentry->d_lru, &tmp);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
 			spin_unlock(&dentry->d_lock);
 			if (!--count)
 				break;
@@ -1097,14 +1099,19 @@ static int select_parent(struct dentry *parent, struct list_head *dispose)
 
 		/*
 		 * move only zero ref count dentries to the dispose list.
+		 *
+		 * Those which are presently on the shrink list, being processed
+		 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
+		 * loop in shrink_dcache_parent() might not make any progress
+		 * and loop forever.
 		 */
-		if (!dentry->d_count) {
-			dentry_lru_move_list(dentry, dispose);
-			found++;
-		} else {
+		if (dentry->d_count) {
 			dentry_lru_del(dentry);
+		} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
+			dentry_lru_move_list(dentry, dispose);
+			dentry->d_flags |= DCACHE_SHRINK_LIST;
+			found++;
 		}
-
 		/*
 		 * We can return to the caller if we have found some (this
 		 * ensures forward progress). We'll be coming back to find
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a47bda5f76db..31f73220e7d7 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -203,6 +203,7 @@ struct dentry_operations {
 
 #define DCACHE_CANT_MOUNT	0x0100
 #define DCACHE_GENOCIDE		0x0200
+#define DCACHE_SHRINK_LIST	0x0400
 
 #define DCACHE_NFSFS_RENAMED	0x1000
      /* this dentry has been "silly renamed" and has to be deleted on the last

From ace8577aeb438025ecf642f5eda3aa551d251951 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Tue, 10 Jan 2012 02:43:59 +0300
Subject: [PATCH 5/9] block_dev: Suppress bdev_cache_init() kmemleak warninig

Kmemleak reports the following warning in bdev_cache_init()
[    0.003738] kmemleak: Object 0xffff880153035200 (size 256):
[    0.003823] kmemleak:   comm "swapper/0", pid 0, jiffies 4294667299
[    0.003909] kmemleak:   min_count = 1
[    0.003988] kmemleak:   count = 0
[    0.004066] kmemleak:   flags = 0x1
[    0.004144] kmemleak:   checksum = 0
[    0.004224] kmemleak:   backtrace:
[    0.004303]      [<ffffffff814755ac>] kmemleak_alloc+0x21/0x3e
[    0.004446]      [<ffffffff811100ba>] kmem_cache_alloc+0xca/0x1dc
[    0.004592]      [<ffffffff811371b1>] alloc_vfsmnt+0x1f/0x198
[    0.004736]      [<ffffffff811375c5>] vfs_kern_mount+0x36/0xd2
[    0.004879]      [<ffffffff8113929a>] kern_mount_data+0x18/0x32
[    0.005025]      [<ffffffff81ab9075>] bdev_cache_init+0x51/0x81
[    0.005169]      [<ffffffff81ab8abf>] vfs_caches_init+0x101/0x10d
[    0.005313]      [<ffffffff81a9bae3>] start_kernel+0x344/0x383
[    0.005456]      [<ffffffff81a9b2a7>] x86_64_start_reservations+0xae/0xb2
[    0.005602]      [<ffffffff81a9b3ad>] x86_64_start_kernel+0x102/0x111
[    0.005747]      [<ffffffffffffffff>] 0xffffffffffffffff
[    0.008653] kmemleak: Trying to color unknown object at 0xffff880153035220 as Grey
[    0.008754] Pid: 0, comm: swapper/0 Not tainted 3.3.0-rc0-dbg-04200-g8180888-dirty #888
[    0.008856] Call Trace:
[    0.008934]  [<ffffffff81118704>] ? find_and_get_object+0x44/0x118
[    0.009023]  [<ffffffff81118fe6>] paint_ptr+0x57/0x8f
[    0.009109]  [<ffffffff81475935>] kmemleak_not_leak+0x23/0x42
[    0.009195]  [<ffffffff81ab9096>] bdev_cache_init+0x72/0x81
[    0.009282]  [<ffffffff81ab8abf>] vfs_caches_init+0x101/0x10d
[    0.009368]  [<ffffffff81a9bae3>] start_kernel+0x344/0x383
[    0.009466]  [<ffffffff81a9b2a7>] x86_64_start_reservations+0xae/0xb2
[    0.009555]  [<ffffffff81a9b140>] ? early_idt_handlers+0x140/0x140
[    0.009643]  [<ffffffff81a9b3ad>] x86_64_start_kernel+0x102/0x111

due to attempt to mark pointer to `struct vfsmount' as a gray object, which
is embedded into `struct mount' returned from alloc_vfsmnt().

Make `bd_mnt' static, avoiding need to tell kmemleak to mark it gray, as
suggested by Al Viro.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 69a5b6fbee2b..afe74dda632b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,7 +25,6 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
-#include <linux/kmemleak.h>
 #include <linux/cleancache.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -521,7 +520,7 @@ static struct super_block *blockdev_superblock __read_mostly;
 void __init bdev_cache_init(void)
 {
 	int err;
-	struct vfsmount *bd_mnt;
+	static struct vfsmount *bd_mnt;
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -533,12 +532,7 @@ void __init bdev_cache_init(void)
 	bd_mnt = kern_mount(&bd_type);
 	if (IS_ERR(bd_mnt))
 		panic("Cannot create bdev pseudo-fs");
-	/*
-	 * This vfsmount structure is only used to obtain the
-	 * blockdev_superblock, so tell kmemleak not to report it.
-	 */
-	kmemleak_not_leak(bd_mnt);
-	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
+	blockdev_superblock = bd_mnt->mnt_sb;   /* For writeback */
 }
 
 /*

From b3f2a92447b8443360ac117a3d7c06689562a70c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 17:48:52 -0500
Subject: [PATCH 6/9] hfsplus: creation of hidden dir on mount can fail

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfsplus/super.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index edf0a801446b..427682ca9e48 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -499,9 +499,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 		if (!sbi->hidden_dir) {
 			mutex_lock(&sbi->vh_mutex);
 			sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
-			hfsplus_create_cat(sbi->hidden_dir->i_ino, root, &str,
-					   sbi->hidden_dir);
+			if (!sbi->hidden_dir) {
+				mutex_unlock(&sbi->vh_mutex);
+				err = -ENOMEM;
+				goto out_put_root;
+			}
+			err = hfsplus_create_cat(sbi->hidden_dir->i_ino, root,
+						 &str, sbi->hidden_dir);
 			mutex_unlock(&sbi->vh_mutex);
+			if (err)
+				goto out_put_hidden_dir;
 
 			hfsplus_mark_inode_dirty(sbi->hidden_dir,
 						 HFSPLUS_I_CAT_DIRTY);

From 4041bcdc7bef06a2fb29c57394c713a74bd13b08 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 22:20:12 -0500
Subject: [PATCH 7/9] autofs4: autofs4_wait() vs. autofs4_catatonic_mode() race

We need to recheck ->catatonic after autofs4_wait() got ->wq_mutex
for good, or we might end up with wq inserted into queue after
autofs4_catatonic_mode() had done its thing.  It will stick there
forever, since there won't be anything to clear its ->name.name.

A bit of a complication: validate_request() drops and regains ->wq_mutex.
It actually ends up the most convenient place to stick the check into...

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/waitq.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e1fbdeef85db..c13273afd546 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -257,6 +257,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 	struct autofs_wait_queue *wq;
 	struct autofs_info *ino;
 
+	if (sbi->catatonic)
+		return -ENOENT;
+
 	/* Wait in progress, continue; */
 	wq = autofs4_find_wait(sbi, qstr);
 	if (wq) {
@@ -289,6 +292,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 			if (mutex_lock_interruptible(&sbi->wq_mutex))
 				return -EINTR;
 
+			if (sbi->catatonic)
+				return -ENOENT;
+
 			wq = autofs4_find_wait(sbi, qstr);
 			if (wq) {
 				*wait = wq;
@@ -389,7 +395,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 
 	ret = validate_request(&wq, sbi, &qstr, dentry, notify);
 	if (ret <= 0) {
-		if (ret == 0)
+		if (ret != -EINTR)
 			mutex_unlock(&sbi->wq_mutex);
 		kfree(qstr.name);
 		return ret;

From 8753333266be67ff3a984ac1f6566d31c260bee4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 22:24:48 -0500
Subject: [PATCH 8/9] autofs4: catatonic_mode vs. notify_daemon race

we need to hold ->wq_mutex while we are forming the packet to send,
lest we have autofs4_catatonic_mode() setting wq->name.name to NULL
just as autofs4_notify_daemon() decides to memcpy() from it...

We do have check for catatonic mode immediately after that (under
->wq_mutex, as it ought to be) and packet won't be actually sent,
but it'll be too late for us if we oops on that memcpy() from NULL...

Fix is obvious - just extend the area covered by ->wq_mutex over
that switch and check whether it's catatonic *before* doing anything
else.

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/waitq.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index c13273afd546..9a0256da5d56 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -110,6 +110,13 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = type;
+	mutex_lock(&sbi->wq_mutex);
+
+	/* Check if we have become catatonic */
+	if (sbi->catatonic) {
+		mutex_unlock(&sbi->wq_mutex);
+		return;
+	}
 	switch (type) {
 	/* Kernel protocol v4 missing and expire packets */
 	case autofs_ptype_missing:
@@ -163,22 +170,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	}
 	default:
 		printk("autofs4_notify_daemon: bad type %d!\n", type);
+		mutex_unlock(&sbi->wq_mutex);
 		return;
 	}
 
-	/* Check if we have become catatonic */
-	mutex_lock(&sbi->wq_mutex);
-	if (!sbi->catatonic) {
-		pipe = sbi->pipe;
-		get_file(pipe);
-	}
+	pipe = sbi->pipe;
+	get_file(pipe);
+
 	mutex_unlock(&sbi->wq_mutex);
 
-	if (pipe) {
-		if (autofs4_write(pipe, &pkt, pktsz))
-			autofs4_catatonic_mode(sbi);
-		fput(pipe);
-	}
+	if (autofs4_write(pipe, &pkt, pktsz))
+		autofs4_catatonic_mode(sbi);
+	fput(pipe);
 }
 
 static int autofs4_getpath(struct autofs_sb_info *sbi,

From d668dc56631da067540b2494d2a1f29ff7b5f15a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 10 Jan 2012 22:35:38 -0500
Subject: [PATCH 9/9] autofs4: deal with autofs4_write/autofs4_write races

Just serialize the actual writing of packets into pipe on
a new mutex, independent from everything else in the locking
hierarchy.  As soon as something has started feeding a piece
of packet into the pipe to daemon, we *want* everything else
about to try the same to wait until we are done.

Acked-by: Ian Kent <raven@themaw.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/autofs_i.h | 1 +
 fs/autofs4/inode.c    | 1 +
 fs/autofs4/waitq.c    | 9 +++++----
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index 5869d4e974a9..d8d8e7ba6a1e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -116,6 +116,7 @@ struct autofs_sb_info {
 	int needs_reghost;
 	struct super_block *sb;
 	struct mutex wq_mutex;
+	struct mutex pipe_mutex;
 	spinlock_t fs_lock;
 	struct autofs_wait_queue *queues; /* Wait queue pointer */
 	spinlock_t lookup_lock;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 2ba44c79d548..e16980b00b8d 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -225,6 +225,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
 	mutex_init(&sbi->wq_mutex);
+	mutex_init(&sbi->pipe_mutex);
 	spin_lock_init(&sbi->fs_lock);
 	sbi->queues = NULL;
 	spin_lock_init(&sbi->lookup_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 9a0256da5d56..9ef5b2914407 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -56,26 +56,27 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi)
 	mutex_unlock(&sbi->wq_mutex);
 }
 
-static int autofs4_write(struct file *file, const void *addr, int bytes)
+static int autofs4_write(struct autofs_sb_info *sbi,
+			 struct file *file, const void *addr, int bytes)
 {
 	unsigned long sigpipe, flags;
 	mm_segment_t fs;
 	const char *data = (const char *)addr;
 	ssize_t wr = 0;
 
-	/** WARNING: this is not safe for writing more than PIPE_BUF bytes! **/
-
 	sigpipe = sigismember(&current->pending.signal, SIGPIPE);
 
 	/* Save pointer to user space and point back to kernel space */
 	fs = get_fs();
 	set_fs(KERNEL_DS);
 
+	mutex_lock(&sbi->pipe_mutex);
 	while (bytes &&
 	       (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) {
 		data += wr;
 		bytes -= wr;
 	}
+	mutex_lock(&sbi->pipe_mutex);
 
 	set_fs(fs);
 
@@ -179,7 +180,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 	mutex_unlock(&sbi->wq_mutex);
 
-	if (autofs4_write(pipe, &pkt, pktsz))
+	if (autofs4_write(sbi, pipe, &pkt, pktsz))
 		autofs4_catatonic_mode(sbi);
 	fput(pipe);
 }