From 3e15dcf77b23b8e9b9b7f3c0d4def8fe9c12c534 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 8 Sep 2023 16:28:59 +0300
Subject: [PATCH 01/26] fs: rename __mnt_{want,drop}_write*() helpers

Before exporting these helpers to modules, make their names more
meaningful.

The names mnt_{get,put)_write_access*() were chosen, because they rhyme
with the inode {get,put)_write_access() helpers, which have a very close
meaning for the inode object.

Suggested-by: Christian Brauner <brauner@kernel.org>
Link: https://lore.kernel.org/r/20230817-anfechtbar-ruhelosigkeit-8c6cca8443fc@brauner/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-2-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/inode.c            |  8 ++++----
 fs/internal.h         | 12 ++++++------
 fs/namespace.c        | 34 +++++++++++++++++-----------------
 fs/open.c             |  2 +-
 include/linux/mount.h |  4 ++--
 kernel/acct.c         |  4 ++--
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 35fd688168c5..7febdc9fd1a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2006,7 +2006,7 @@ void touch_atime(const struct path *path)
 	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	if (__mnt_want_write(mnt) != 0)
+	if (mnt_get_write_access(mnt) != 0)
 		goto skip_update;
 	/*
 	 * File systems can error out when updating inodes if they need to
@@ -2018,7 +2018,7 @@ void touch_atime(const struct path *path)
 	 * of the fs read only, e.g. subvolumes in Btrfs.
 	 */
 	inode_update_time(inode, S_ATIME);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 skip_update:
 	sb_end_write(inode->i_sb);
 }
@@ -2173,9 +2173,9 @@ static int __file_update_time(struct file *file, int sync_mode)
 	struct inode *inode = file_inode(file);
 
 	/* try to update time settings */
-	if (!__mnt_want_write_file(file)) {
+	if (!mnt_get_write_access_file(file)) {
 		ret = inode_update_time(inode, sync_mode);
-		__mnt_drop_write_file(file);
+		mnt_put_write_access_file(file);
 	}
 
 	return ret;
diff --git a/fs/internal.h b/fs/internal.h
index d64ae03998cc..8260c738980c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-extern int __mnt_want_write_file(struct file *);
-extern void __mnt_drop_write_file(struct file *);
+int mnt_get_write_access_file(struct file *file);
+void mnt_put_write_access_file(struct file *file);
 
 extern void dissolve_on_fput(struct vfsmount *);
 extern bool may_mount(void);
@@ -101,7 +101,7 @@ static inline void put_file_access(struct file *file)
 		i_readcount_dec(file->f_inode);
 	} else if (file->f_mode & FMODE_WRITER) {
 		put_write_access(file->f_inode);
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 	}
 }
 
@@ -130,9 +130,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb)
 	 * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
 	 * cleared, it will see s_readonly_remount set.
 	 * For RW->RO transition, the barrier pairs with the barrier in
-	 * __mnt_want_write() before the mnt_is_readonly() check. The barrier
-	 * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already
-	 * cleared, it will see s_readonly_remount set.
+	 * mnt_get_write_access() before the mnt_is_readonly() check.
+	 * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
+	 * already cleared, it will see s_readonly_remount set.
 	 */
 	smp_wmb();
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index e157efc54023..3fe7c0484e6a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt)
  * can determine when writes are able to occur to a filesystem.
  */
 /**
- * __mnt_want_write - get write access to a mount without freeze protection
+ * mnt_get_write_access - get write access to a mount without freeze protection
  * @m: the mount on which to take a write
  *
  * This tells the low-level filesystem that a write is about to be performed to
  * it, and makes sure that writes are allowed (mnt it read-write) before
  * returning success. This operation does not protect against filesystem being
- * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * frozen. When the write operation is finished, mnt_put_write_access() must be
  * called. This is effectively a refcount.
  */
-int __mnt_want_write(struct vfsmount *m)
+int mnt_get_write_access(struct vfsmount *m)
 {
 	struct mount *mnt = real_mount(m);
 	int ret = 0;
@@ -401,7 +401,7 @@ int mnt_want_write(struct vfsmount *m)
 	int ret;
 
 	sb_start_write(m->mnt_sb);
-	ret = __mnt_want_write(m);
+	ret = mnt_get_write_access(m);
 	if (ret)
 		sb_end_write(m->mnt_sb);
 	return ret;
@@ -409,15 +409,15 @@ int mnt_want_write(struct vfsmount *m)
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
 /**
- * __mnt_want_write_file - get write access to a file's mount
+ * mnt_get_write_access_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like __mnt_want_write, but if the file is already open for writing it
+ * This is like mnt_get_write_access, but if @file is already open for write it
  * skips incrementing mnt_writers (since the open file already has a reference)
  * and instead only does the check for emergency r/o remounts.  This must be
- * paired with __mnt_drop_write_file.
+ * paired with mnt_put_write_access_file.
  */
-int __mnt_want_write_file(struct file *file)
+int mnt_get_write_access_file(struct file *file)
 {
 	if (file->f_mode & FMODE_WRITER) {
 		/*
@@ -428,7 +428,7 @@ int __mnt_want_write_file(struct file *file)
 			return -EROFS;
 		return 0;
 	}
-	return __mnt_want_write(file->f_path.mnt);
+	return mnt_get_write_access(file->f_path.mnt);
 }
 
 /**
@@ -445,7 +445,7 @@ int mnt_want_write_file(struct file *file)
 	int ret;
 
 	sb_start_write(file_inode(file)->i_sb);
-	ret = __mnt_want_write_file(file);
+	ret = mnt_get_write_access_file(file);
 	if (ret)
 		sb_end_write(file_inode(file)->i_sb);
 	return ret;
@@ -453,14 +453,14 @@ int mnt_want_write_file(struct file *file)
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
- * __mnt_drop_write - give up write access to a mount
+ * mnt_put_write_access - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
- * __mnt_want_write() call above.
+ * mnt_get_write_access() call above.
  */
-void __mnt_drop_write(struct vfsmount *mnt)
+void mnt_put_write_access(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(real_mount(mnt));
@@ -477,20 +477,20 @@ void __mnt_drop_write(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	sb_end_write(mnt->mnt_sb);
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
-void __mnt_drop_write_file(struct file *file)
+void mnt_put_write_access_file(struct file *file)
 {
 	if (!(file->f_mode & FMODE_WRITER))
-		__mnt_drop_write(file->f_path.mnt);
+		mnt_put_write_access(file->f_path.mnt);
 }
 
 void mnt_drop_write_file(struct file *file)
 {
-	__mnt_drop_write_file(file);
+	mnt_put_write_access_file(file);
 	sb_end_write(file_inode(file)->i_sb);
 }
 EXPORT_SYMBOL(mnt_drop_write_file);
diff --git a/fs/open.c b/fs/open.c
index 98f6601fbac6..a65ce47810cf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -895,7 +895,7 @@ static int do_dentry_open(struct file *f,
 		error = get_write_access(inode);
 		if (unlikely(error))
 			goto cleanup_file;
-		error = __mnt_want_write(f->f_path.mnt);
+		error = mnt_get_write_access(f->f_path.mnt);
 		if (unlikely(error)) {
 			put_write_access(inode);
 			goto cleanup_file;
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 4f40b40306d0..ac3dd2876197 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -92,8 +92,8 @@ extern bool __mnt_is_readonly(struct vfsmount *mnt);
 extern bool mnt_may_suid(struct vfsmount *mnt);
 
 extern struct vfsmount *clone_private_mount(const struct path *path);
-extern int __mnt_want_write(struct vfsmount *);
-extern void __mnt_drop_write(struct vfsmount *);
+int mnt_get_write_access(struct vfsmount *mnt);
+void mnt_put_write_access(struct vfsmount *mnt);
 
 extern struct vfsmount *fc_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
diff --git a/kernel/acct.c b/kernel/acct.c
index 1a9f929fe629..986c8214dabf 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -246,7 +246,7 @@ static int acct_on(struct filename *pathname)
 		filp_close(file, NULL);
 		return PTR_ERR(internal);
 	}
-	err = __mnt_want_write(internal);
+	err = mnt_get_write_access(internal);
 	if (err) {
 		mntput(internal);
 		kfree(acct);
@@ -271,7 +271,7 @@ static int acct_on(struct filename *pathname)
 	old = xchg(&ns->bacct, &acct->pin);
 	mutex_unlock(&acct->lock);
 	pin_kill(old);
-	__mnt_drop_write(mnt);
+	mnt_put_write_access(mnt);
 	mntput(mnt);
 	return 0;
 }

From ddf9e2ff67a910acde1d000e76b7e31267599539 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 8 Sep 2023 16:29:00 +0300
Subject: [PATCH 02/26] fs: export mnt_{get,put}_write_access() to modules

Overlayfs is going to use those to get write access on the upper mount
during entire copy up without taking freeze protection on upper sb for
the entire copy up.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Message-Id: <20230908132900.2983519-3-amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/namespace.c b/fs/namespace.c
index 3fe7c0484e6a..9e0ea73f2ff9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -386,6 +386,7 @@ int mnt_get_write_access(struct vfsmount *m)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(mnt_get_write_access);
 
 /**
  * mnt_want_write - get write access to a mount
@@ -466,6 +467,7 @@ void mnt_put_write_access(struct vfsmount *mnt)
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(mnt_put_write_access);
 
 /**
  * mnt_drop_write - give up write access to a mount

From 84d2b696236c63836011f04d13d3f09ed47fa560 Mon Sep 17 00:00:00 2001
From: Jianyong Wu <jianyong.wu@arm.com>
Date: Thu, 7 Sep 2023 09:10:25 +0000
Subject: [PATCH 03/26] init/mount: print pretty name of root device when
 panics

Given a wrong root device, current log may not give the pretty name
which is useful to locate root cause.

For example, there are 2 blk devs in a VM, /dev/vda which has 2 partitials
/dev/vda1 and /dev/vda2 and /dev/vdb which is blank. /dev/vda2 is the
right root dev. When set "root=/dev/vdb", we get error log:

[    0.635575] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(254,16)

It's not straightforward to find out the root cause as there is lack of
the root devive name therefore hard for people to get those info from the
device number, in the example, (254,16).

It is more comprehensive way to hint the root cause if pretty name is
given here, like:

[    0.559887] Kernel panic - not syncing: VFS: Unable to mount root fs on "/dev/vdb" or unknown-block(254,16)

Signed-off-by: Jianyong Wu <jianyong.wu@arm.com>
Message-Id: <20230907091025.3436878-1-jianyong.wu@arm.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 init/do_mounts.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/do_mounts.c b/init/do_mounts.c
index 5dfd30b13f48..5fdef94f0864 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -244,7 +244,7 @@ retry:
 	for (i = 0, p = fs_names; i < num_fs; i++, p += strlen(p)+1)
 		printk(" %s", p);
 	printk("\n");
-	panic("VFS: Unable to mount root fs on %s", b);
+	panic("VFS: Unable to mount root fs on \"%s\" or %s", pretty_name, b);
 out:
 	put_page(page);
 }

From 5aa8fd9cea2ee0d42c5d92c5eacf0a14bbc4c293 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@kernel.org>
Date: Mon, 11 Sep 2023 20:25:50 -0400
Subject: [PATCH 04/26] fs: add a new SB_I_NOUMASK flag

SB_POSIXACL must be set when a filesystem supports POSIX ACLs, but NFSv4
also sets this flag to prevent the VFS from applying the umask on
newly-created files. NFSv4 doesn't support POSIX ACLs however, which
causes confusion when other subsystems try to test for them.

Add a new SB_I_NOUMASK flag that allows filesystems to opt-in to umask
stripping without advertising support for POSIX ACLs. Set the new flag
on NFSv4 instead of SB_POSIXACL.

Also, move mode_strip_umask to namei.h and convert init_mknod and
init_mkdir to use it.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Message-Id: <20230911-acl-fix-v3-1-b25315333f6c@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/init.c             |  6 ++----
 fs/namei.c            | 19 -------------------
 fs/nfs/super.c        |  2 +-
 include/linux/fs.h    |  3 ++-
 include/linux/namei.h | 24 ++++++++++++++++++++++++
 5 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/fs/init.c b/fs/init.c
index 9684406a8416..e9387b6c4f30 100644
--- a/fs/init.c
+++ b/fs/init.c
@@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev)
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mknod(&path, dentry, mode, dev);
 	if (!error)
 		error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode,
@@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode)
 	dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
-	if (!IS_POSIXACL(path.dentry->d_inode))
-		mode &= ~current_umask();
+	mode = mode_strip_umask(d_inode(path.dentry), mode);
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
 		error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
diff --git a/fs/namei.c b/fs/namei.c
index 567ee547492b..94b27370f468 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3103,25 +3103,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-/**
- * mode_strip_umask - handle vfs umask stripping
- * @dir:	parent directory of the new inode
- * @mode:	mode of the new inode to be created in @dir
- *
- * Umask stripping depends on whether or not the filesystem supports POSIX
- * ACLs. If the filesystem doesn't support it umask stripping is done directly
- * in here. If the filesystem does support POSIX ACLs umask stripping is
- * deferred until the filesystem calls posix_acl_create().
- *
- * Returns: mode
- */
-static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
-{
-	if (!IS_POSIXACL(dir))
-		mode &= ~current_umask();
-	return mode;
-}
-
 /**
  * vfs_prepare_mode - prepare the mode to be used for a new inode
  * @idmap:	idmap of the mount the inode was found from
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 0d6473cb00cb..9b1cfca8112a 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx)
 		sb->s_export_op = &nfs_export_ops;
 		break;
 	case 4:
-		sb->s_flags |= SB_POSIXACL;
+		sb->s_iflags |= SB_I_NOUMASK;
 		sb->s_time_gran = 1;
 		sb->s_time_min = S64_MIN;
 		sb->s_time_max = S64_MAX;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4aeb3fa11927..58dea591a341 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1119,7 +1119,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_NOATIME      BIT(10)	/* Do not update access times. */
 #define SB_NODIRATIME   BIT(11)	/* Do not update directory access times */
 #define SB_SILENT       BIT(15)
-#define SB_POSIXACL     BIT(16)	/* VFS does not apply the umask */
+#define SB_POSIXACL     BIT(16)	/* Supports POSIX ACLs */
 #define SB_INLINECRYPT  BIT(17)	/* Use blk-crypto for encrypted files */
 #define SB_KERNMOUNT    BIT(22)	/* this is a kern_mount call */
 #define SB_I_VERSION    BIT(23)	/* Update inode I_version field */
@@ -1166,6 +1166,7 @@ extern int send_sigurg(struct fown_struct *fown);
 #define SB_I_PERSB_BDI	0x00000200	/* has a per-sb bdi */
 #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
 #define SB_I_RETIRED	0x00000800	/* superblock shouldn't be reused */
+#define SB_I_NOUMASK	0x00001000	/* VFS does not apply umask */
 
 /* Possible states of 'frozen' field */
 enum {
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 1463cbda4888..e3619920f9f0 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -92,6 +92,30 @@ extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
 
+/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir:	parent directory of the new inode
+ * @mode:	mode of the new inode to be created in @dir
+ *
+ * In most filesystems, umask stripping depends on whether or not the
+ * filesystem supports POSIX ACLs. If the filesystem doesn't support it umask
+ * stripping is done directly in here. If the filesystem does support POSIX
+ * ACLs umask stripping is deferred until the filesystem calls
+ * posix_acl_create().
+ *
+ * Some filesystems (like NFSv4) also want to avoid umask stripping by the
+ * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK
+ * to get this effect without declaring that they support POSIX ACLs.
+ *
+ * Returns: mode
+ */
+static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+	if (!IS_POSIXACL(dir) && !(dir->i_sb->s_iflags & SB_I_NOUMASK))
+		mode &= ~current_umask();
+	return mode;
+}
+
 extern int __must_check nd_jump_link(const struct path *path);
 
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)

From 61105aab4edb59bf8177f005eb2923fe5c4deb3c Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:52 +0200
Subject: [PATCH 05/26] pipe: reduce padding in struct pipe_inode_info

This has no effect on 64 bit because there are 10 32-bit integers
surrounding the two bools, but on 32 bit architectures, this reduces
the struct size by 4 bytes by merging the two bools into one word.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-1-max.kellermann@ionos.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pipe_fs_i.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 608a9eb86bff..598a411d7da2 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -62,9 +62,6 @@ struct pipe_inode_info {
 	unsigned int tail;
 	unsigned int max_usage;
 	unsigned int ring_size;
-#ifdef CONFIG_WATCH_QUEUE
-	bool note_loss;
-#endif
 	unsigned int nr_accounted;
 	unsigned int readers;
 	unsigned int writers;
@@ -72,6 +69,9 @@ struct pipe_inode_info {
 	unsigned int r_counter;
 	unsigned int w_counter;
 	bool poll_usage;
+#ifdef CONFIG_WATCH_QUEUE
+	bool note_loss;
+#endif
 	struct page *tmp_page;
 	struct fasync_struct *fasync_readers;
 	struct fasync_struct *fasync_writers;

From b4bd6b4bac8edd61eb8f7b836969d12c0c6af165 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:53 +0200
Subject: [PATCH 06/26] fs/pipe: move check to pipe_has_watch_queue()

This declutters the code by reducing the number of #ifdefs and makes
the watch_queue checks simpler.  This has no runtime effect; the
machine code is identical.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-2-max.kellermann@ionos.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c                 | 12 +++---------
 include/linux/pipe_fs_i.h | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 6c1a9b1db907..6ecaccb48738 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -437,12 +437,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		goto out;
 	}
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue) {
+	if (pipe_has_watch_queue(pipe)) {
 		ret = -EXDEV;
 		goto out;
 	}
-#endif
 
 	/*
 	 * If it wasn't empty we try to merge new data into
@@ -1325,10 +1323,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
 	unsigned int nr_slots, size;
 	long ret = 0;
 
-#ifdef CONFIG_WATCH_QUEUE
-	if (pipe->watch_queue)
+	if (pipe_has_watch_queue(pipe))
 		return -EBUSY;
-#endif
 
 	size = round_pipe_size(arg);
 	nr_slots = size >> PAGE_SHIFT;
@@ -1380,10 +1376,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
 
 	if (file->f_op != &pipefifo_fops || !pipe)
 		return NULL;
-#ifdef CONFIG_WATCH_QUEUE
-	if (for_splice && pipe->watch_queue)
+	if (for_splice && pipe_has_watch_queue(pipe))
 		return NULL;
-#endif
 	return pipe;
 }
 
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 598a411d7da2..8ff23bf5a819 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -124,6 +124,22 @@ struct pipe_buf_operations {
 	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
+/**
+ * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
+ * i.e. it was created with O_NOTIFICATION_PIPE
+ * @pipe: The pipe to check
+ *
+ * Return: true if pipe is a watch queue, false otherwise.
+ */
+static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
+{
+#ifdef CONFIG_WATCH_QUEUE
+	return pipe->watch_queue != NULL;
+#else
+	return false;
+#endif
+}
+
 /**
  * pipe_empty - Return true if the pipe is empty
  * @head: The pipe ring head pointer

From dfaabf916b1ca83cfac856745db2fc9d57d9b13a Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:54 +0200
Subject: [PATCH 07/26] fs/pipe: remove unnecessary spinlock from pipe_write()

This reverts commit 8df441294dd3 ("pipe: Check for ring full inside of
the spinlock in pipe_write()") which was obsoleted by commit
c73be61cede ("pipe: Add general notification queue support") because
now pipe_write() fails early with -EXDEV if there is a watch_queue.

Without a watch_queue, no notifications can be posted to the pipe and
mutex protection is enough, as can be seen in splice_pipe_to_pipe()
which does not use the spinlock either.

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-3-max.kellermann@ionos.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 6ecaccb48738..939def02c18c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -505,16 +505,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 			 * it, either the reader will consume it or it'll still
 			 * be there for the next write.
 			 */
-			spin_lock_irq(&pipe->rd_wait.lock);
-
-			head = pipe->head;
-			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
-				spin_unlock_irq(&pipe->rd_wait.lock);
-				continue;
-			}
-
 			pipe->head = head + 1;
-			spin_unlock_irq(&pipe->rd_wait.lock);
 
 			/* Insert it into the buffer array */
 			buf = &pipe->bufs[head & mask];

From 478dbf12176700f28d836dd03ae93a6888278230 Mon Sep 17 00:00:00 2001
From: Max Kellermann <max.kellermann@ionos.com>
Date: Thu, 21 Sep 2023 09:57:55 +0200
Subject: [PATCH 08/26] fs/pipe: use spinlock in pipe_read() only if there is a
 watch_queue

If there is no watch_queue, holding the pipe mutex is enough to
prevent concurrent writes, and we can avoid the spinlock.

O_NOTIFICATION_QUEUE is an exotic and rarely used feature, and of all
the pipes that exist at any given time, only very few actually have a
watch_queue, therefore it appears worthwile to optimize the common
case.

This patch does not optimize pipe_resize_ring() where the spinlocks
could be avoided as well; that does not seem like a worthwile
optimization because this function is not called often.

Related commits:

- commit 8df441294dd3 ("pipe: Check for ring full inside of the
  spinlock in pipe_write()")
- commit b667b8673443 ("pipe: Advance tail pointer inside of wait
  spinlock in pipe_read()")
- commit 189b0ddc2451 ("pipe: Fix missing lock in pipe_resize_ring()")

Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Message-Id: <20230921075755.1378787-4-max.kellermann@ionos.com>
Reviewed-by: David Howells <dhowells@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/pipe.c | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 939def02c18c..1362e2ec2211 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe)
 	return !pipe_empty(head, tail) || !writers;
 }
 
+static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
+					    struct pipe_buffer *buf,
+					    unsigned int tail)
+{
+	pipe_buf_release(pipe, buf);
+
+	/*
+	 * If the pipe has a watch_queue, we need additional protection
+	 * by the spinlock because notifications get posted with only
+	 * this spinlock, no mutex
+	 */
+	if (pipe_has_watch_queue(pipe)) {
+		spin_lock_irq(&pipe->rd_wait.lock);
+#ifdef CONFIG_WATCH_QUEUE
+		if (buf->flags & PIPE_BUF_FLAG_LOSS)
+			pipe->note_loss = true;
+#endif
+		pipe->tail = ++tail;
+		spin_unlock_irq(&pipe->rd_wait.lock);
+		return tail;
+	}
+
+	/*
+	 * Without a watch_queue, we can simply increment the tail
+	 * without the spinlock - the mutex is enough.
+	 */
+	pipe->tail = ++tail;
+	return tail;
+}
+
 static ssize_t
 pipe_read(struct kiocb *iocb, struct iov_iter *to)
 {
@@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 				buf->len = 0;
 			}
 
-			if (!buf->len) {
-				pipe_buf_release(pipe, buf);
-				spin_lock_irq(&pipe->rd_wait.lock);
-#ifdef CONFIG_WATCH_QUEUE
-				if (buf->flags & PIPE_BUF_FLAG_LOSS)
-					pipe->note_loss = true;
-#endif
-				tail++;
-				pipe->tail = tail;
-				spin_unlock_irq(&pipe->rd_wait.lock);
-			}
+			if (!buf->len)
+				tail = pipe_update_tail(pipe, buf, tail);
 			total_len -= chars;
 			if (!total_len)
 				break;	/* common path: read succeeded */

From 85fadf89e5708d74c4923e9ceeca4b9df0e000bb Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Fri, 22 Sep 2023 10:54:08 -0700
Subject: [PATCH 09/26] watch_queue: Annotate struct watch_filter with
 __counted_by

Prepare for the coming implementation by GCC and Clang of the __counted_by
attribute. Flexible array members annotated with __counted_by can have
their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS
(for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family
functions).

As found with Coccinelle[1], add __counted_by for struct watch_filter.

[1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci

Cc: David Howells <dhowells@redhat.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Siddh Raman Pant <code@siddh.me>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Qian Cai <cai@redhat.com>
Signed-off-by: Kees Cook <keescook@chromium.org>
Tested-by: Siddh Raman Pant <code@siddh.me>
Reviewed-by: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Message-Id: <20230922175407.work.754-kees@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/watch_queue.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h
index 45cd42f55d49..429c7b6afead 100644
--- a/include/linux/watch_queue.h
+++ b/include/linux/watch_queue.h
@@ -32,7 +32,7 @@ struct watch_filter {
 		DECLARE_BITMAP(type_filter, WATCH_TYPE__NR);
 	};
 	u32			nr_filters;	/* Number of filters */
-	struct watch_type_filter filters[];
+	struct watch_type_filter filters[] __counted_by(nr_filters);
 };
 
 struct watch_queue {

From 6036c5f1317526890925576f0efcbc427a32a2ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <lhenriques@suse.de>
Date: Thu, 28 Sep 2023 16:23:41 +0100
Subject: [PATCH 10/26] fs: simplify misleading code to remove ambiguity
 regarding ihold()/iput()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Because 'inode' is being initialised before checking if 'dentry' is negative
it looks like an extra iput() on 'inode' may happen since the ihold() is
done only if the dentry is *not* negative.  In reality this doesn't happen
because d_is_negative() is never true if ->d_inode is NULL.  This patch only
makes the code easier to understand, as I was initially mislead by it.

Signed-off-by: Luís Henriques <lhenriques@suse.de>
Link: https://lore.kernel.org/r/20230928152341.303-1-lhenriques@suse.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 94b27370f468..c36ff6f8195a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4367,11 +4367,9 @@ retry_deleg:
 	if (!IS_ERR(dentry)) {
 
 		/* Why not before? Because we want correct error value */
-		if (last.name[last.len])
+		if (last.name[last.len] || d_is_negative(dentry))
 			goto slashes;
 		inode = dentry->d_inode;
-		if (d_is_negative(dentry))
-			goto slashes;
 		ihold(inode);
 		error = security_path_unlink(&path, dentry);
 		if (error)

From 93faf426e3cc000c95f1a5d3510b77ce99adac52 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Tue, 26 Sep 2023 18:22:28 +0200
Subject: [PATCH 11/26] vfs: shave work on failed file open

Failed opens (mostly ENOENT) legitimately happen a lot, for example here
are stats from stracing kernel build for few seconds (strace -fc make):

  % time     seconds  usecs/call     calls    errors syscall
  ------ ----------- ----------- --------- --------- ------------------
    0.76    0.076233           5     15040      3688 openat

(this is tons of header files tried in different paths)

In the common case of there being nothing to close (only the file object
to free) there is a lot of overhead which can be avoided.

This is most notably delegation of freeing to task_work, which comes
with an enormous cost (see 021a160abf62 ("fs: use __fput_sync in
close(2)" for an example).

Benchmarked with will-it-scale with a custom testcase based on
tests/open1.c, stuffed into tests/openneg.c:
[snip]
        while (1) {
                int fd = open("/tmp/nonexistent", O_RDONLY);
                assert(fd == -1);

                (*iterations)++;
        }
[/snip]

Sapphire Rapids, openneg_processes -t 1 (ops/s):
before:	1950013
after:	2914973 (+49%)

file refcount is checked as a safety belt against buggy consumers with
an atomic cmpxchg. Technically it is not necessary, but it happens to
not be measurable due to several other atomics which immediately follow.
Optmizing them away to make this atomic into a problem is left as an
exercise for the reader.

v2:
- unexport fput_badopen and move to fs/internal.h
- handle the refcount with cmpxchg, adjust commentary accordingly
- tweak the commit message

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20230926162228.68666-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c | 12 ++++++++++++
 fs/internal.h   |  1 +
 fs/namei.c      |  5 ++++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index ee21b3da9d08..e68e97d4f00a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -82,6 +82,18 @@ static inline void file_free(struct file *f)
 	call_rcu(&f->f_rcuhead, file_free_rcu);
 }
 
+void release_empty_file(struct file *f)
+{
+	WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
+	/* Uhm, we better find out who grabs references to an unopened file. */
+	WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1);
+	security_file_free(f);
+	put_cred(f->f_cred);
+	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+		percpu_counter_dec(&nr_files);
+	kmem_cache_free(filp_cachep, f);
+}
+
 /*
  * Return the total number of open files in the system
  */
diff --git a/fs/internal.h b/fs/internal.h
index 8260c738980c..f08d8fe3ae5e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -94,6 +94,7 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
 struct file *alloc_empty_file(int flags, const struct cred *cred);
 struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
 struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
+void release_empty_file(struct file *f);
 
 static inline void put_file_access(struct file *file)
 {
diff --git a/fs/namei.c b/fs/namei.c
index c36ff6f8195a..127c868a8992 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3783,7 +3783,10 @@ static struct file *path_openat(struct nameidata *nd,
 		WARN_ON(1);
 		error = -EINVAL;
 	}
-	fput(file);
+	if (unlikely(file->f_mode & FMODE_OPENED))
+		fput(file);
+	else
+		release_empty_file(file);
 	if (error == -EOPENSTALE) {
 		if (flags & LOOKUP_RCU)
 			error = -ECHILD;

From 0ede61d8589cc2d93aa78230d74ac58b5b8d0244 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 29 Sep 2023 08:45:59 +0200
Subject: [PATCH 12/26] file: convert to SLAB_TYPESAFE_BY_RCU

In recent discussions around some performance improvements in the file
handling area we discussed switching the file cache to rely on
SLAB_TYPESAFE_BY_RCU which allows us to get rid of call_rcu() based
freeing for files completely. This is a pretty sensitive change overall
but it might actually be worth doing.

The main downside is the subtlety. The other one is that we should
really wait for Jann's patch to land that enables KASAN to handle
SLAB_TYPESAFE_BY_RCU UAFs. Currently it doesn't but a patch for this
exists.

With SLAB_TYPESAFE_BY_RCU objects may be freed and reused multiple times
which requires a few changes. So it isn't sufficient anymore to just
acquire a reference to the file in question under rcu using
atomic_long_inc_not_zero() since the file might have already been
recycled and someone else might have bumped the reference.

In other words, callers might see reference count bumps from newer
users. For this reason it is necessary to verify that the pointer is the
same before and after the reference count increment. This pattern can be
seen in get_file_rcu() and __files_get_rcu().

In addition, it isn't possible to access or check fields in struct file
without first aqcuiring a reference on it. Not doing that was always
very dodgy and it was only usable for non-pointer data in struct file.
With SLAB_TYPESAFE_BY_RCU it is necessary that callers first acquire a
reference under rcu or they must hold the files_lock of the fdtable.
Failing to do either one of this is a bug.

Thanks to Jann for pointing out that we need to ensure memory ordering
between reallocations and pointer check by ensuring that all subsequent
loads have a dependency on the second load in get_file_rcu() and
providing a fixup that was folded into this patch.

Cc: Jann Horn <jannh@google.com>
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 Documentation/filesystems/files.rst          |  53 ++++----
 arch/powerpc/platforms/cell/spufs/coredump.c |  11 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c     |   4 +-
 fs/file.c                                    | 125 ++++++++++++++++---
 fs/file_table.c                              |  40 +++---
 fs/gfs2/glock.c                              |  11 +-
 fs/notify/dnotify/dnotify.c                  |   6 +-
 fs/proc/fd.c                                 |  11 +-
 include/linux/fdtable.h                      |  17 +--
 include/linux/fs.h                           |   4 +-
 kernel/bpf/task_iter.c                       |   4 +-
 kernel/fork.c                                |   4 +-
 kernel/kcmp.c                                |   4 +-
 13 files changed, 191 insertions(+), 103 deletions(-)

diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
index bcf84459917f..9e38e4c221ca 100644
--- a/Documentation/filesystems/files.rst
+++ b/Documentation/filesystems/files.rst
@@ -62,7 +62,7 @@ the fdtable structure -
    be held.
 
 4. To look up the file structure given an fd, a reader
-   must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+   must use either lookup_fdget_rcu() or files_lookup_fdget_rcu() APIs. These
    take care of barrier requirements due to lock-free lookup.
 
    An example::
@@ -70,43 +70,22 @@ the fdtable structure -
 	struct file *file;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(fd);
+	file = lookup_fdget_rcu(fd);
+	rcu_read_unlock();
 	if (file) {
 		...
+                fput(file);
 	}
 	....
-	rcu_read_unlock();
 
-5. Handling of the file structures is special. Since the look-up
-   of the fd (fget()/fget_light()) are lock-free, it is possible
-   that look-up may race with the last put() operation on the
-   file structure. This is avoided using atomic_long_inc_not_zero()
-   on ->f_count::
-
-	rcu_read_lock();
-	file = files_lookup_fd_rcu(files, fd);
-	if (file) {
-		if (atomic_long_inc_not_zero(&file->f_count))
-			*fput_needed = 1;
-		else
-		/* Didn't get the reference, someone's freed */
-			file = NULL;
-	}
-	rcu_read_unlock();
-	....
-	return file;
-
-   atomic_long_inc_not_zero() detects if refcounts is already zero or
-   goes to zero during increment. If it does, we fail
-   fget()/fget_light().
-
-6. Since both fdtable and file structures can be looked up
+5. Since both fdtable and file structures can be looked up
    lock-free, they must be installed using rcu_assign_pointer()
    API. If they are looked up lock-free, rcu_dereference()
    must be used. However it is advisable to use files_fdtable()
-   and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+   and lookup_fdget_rcu()/files_lookup_fdget_rcu() which take care of these
+   issues.
 
-7. While updating, the fdtable pointer must be looked up while
+6. While updating, the fdtable pointer must be looked up while
    holding files->file_lock. If ->file_lock is dropped, then
    another thread expand the files thereby creating a new
    fdtable and making the earlier fdtable pointer stale.
@@ -126,3 +105,19 @@ the fdtable structure -
    Since locate_fd() can drop ->file_lock (and reacquire ->file_lock),
    the fdtable pointer (fdt) must be loaded after locate_fd().
 
+On newer kernels rcu based file lookup has been switched to rely on
+SLAB_TYPESAFE_BY_RCU instead of call_rcu(). It isn't sufficient anymore
+to just acquire a reference to the file in question under rcu using
+atomic_long_inc_not_zero() since the file might have already been
+recycled and someone else might have bumped the reference. In other
+words, callers might see reference count bumps from newer users. For
+this is reason it is necessary to verify that the pointer is the same
+before and after the reference count increment. This pattern can be seen
+in get_file_rcu() and __files_get_rcu().
+
+In addition, it isn't possible to access or check fields in struct file
+without first aqcuiring a reference on it under rcu lookup. Not doing
+that was always very dodgy and it was only usable for non-pointer data
+in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers
+either first acquire a reference or they must hold the files_lock of the
+fdtable.
diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
index 1a587618015c..18daafbe2e65 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -66,7 +66,7 @@ static int match_context(const void *v, struct file *file, unsigned fd)
  */
 static struct spu_context *coredump_next_context(int *fd)
 {
-	struct spu_context *ctx;
+	struct spu_context *ctx = NULL;
 	struct file *file;
 	int n = iterate_fd(current->files, *fd, match_context, NULL);
 	if (!n)
@@ -74,10 +74,13 @@ static struct spu_context *coredump_next_context(int *fd)
 	*fd = n - 1;
 
 	rcu_read_lock();
-	file = lookup_fd_rcu(*fd);
-	ctx = SPUFS_I(file_inode(file))->i_ctx;
-	get_spu_context(ctx);
+	file = lookup_fdget_rcu(*fd);
 	rcu_read_unlock();
+	if (file) {
+		ctx = SPUFS_I(file_inode(file))->i_ctx;
+		get_spu_context(ctx);
+		fput(file);
+	}
 
 	return ctx;
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index aa4d842d4c5a..97bf10861cad 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -916,9 +916,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 	struct file *file;
 
 	rcu_read_lock();
-	file = READ_ONCE(i915->gem.mmap_singleton);
-	if (file && !get_file_rcu(file))
-		file = NULL;
+	file = get_file_rcu(&i915->gem.mmap_singleton);
 	rcu_read_unlock();
 	if (file)
 		return file;
diff --git a/fs/file.c b/fs/file.c
index 3e4a4dfa38fc..4eb026631673 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -853,8 +853,79 @@ void do_close_on_exec(struct files_struct *files)
 	spin_unlock(&files->file_lock);
 }
 
+static struct file *__get_file_rcu(struct file __rcu **f)
+{
+	struct file __rcu *file;
+	struct file __rcu *file_reloaded;
+	struct file __rcu *file_reloaded_cmp;
+
+	file = rcu_dereference_raw(*f);
+	if (!file)
+		return NULL;
+
+	if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+		return ERR_PTR(-EAGAIN);
+
+	file_reloaded = rcu_dereference_raw(*f);
+
+	/*
+	 * Ensure that all accesses have a dependency on the load from
+	 * rcu_dereference_raw() above so we get correct ordering
+	 * between reuse/allocation and the pointer check below.
+	 */
+	file_reloaded_cmp = file_reloaded;
+	OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
+
+	/*
+	 * atomic_long_inc_not_zero() above provided a full memory
+	 * barrier when we acquired a reference.
+	 *
+	 * This is paired with the write barrier from assigning to the
+	 * __rcu protected file pointer so that if that pointer still
+	 * matches the current file, we know we have successfully
+	 * acquired a reference to the right file.
+	 *
+	 * If the pointers don't match the file has been reallocated by
+	 * SLAB_TYPESAFE_BY_RCU.
+	 */
+	if (file == file_reloaded_cmp)
+		return file_reloaded;
+
+	fput(file);
+	return ERR_PTR(-EAGAIN);
+}
+
+/**
+ * get_file_rcu - try go get a reference to a file under rcu
+ * @f: the file to get a reference on
+ *
+ * This function tries to get a reference on @f carefully verifying that
+ * @f hasn't been reused.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_rcu(struct file __rcu **f)
+{
+	for (;;) {
+		struct file __rcu *file;
+
+		file = __get_file_rcu(f);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
+			continue;
+
+		return file;
+	}
+}
+EXPORT_SYMBOL_GPL(get_file_rcu);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
-	unsigned int fd, fmode_t mask)
+       unsigned int fd, fmode_t mask)
 {
 	for (;;) {
 		struct file *file;
@@ -865,12 +936,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 			return NULL;
 
 		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
-		file = rcu_dereference_raw(*fdentry);
-		if (unlikely(!file))
-			return NULL;
-
-		if (unlikely(file->f_mode & mask))
-			return NULL;
 
 		/*
 		 * Ok, we have a file pointer. However, because we do
@@ -879,10 +944,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * Such a race can take two forms:
 		 *
-		 *  (a) the file ref already went down to zero,
-		 *      and get_file_rcu() fails. Just try again:
+		 *  (a) the file ref already went down to zero and the
+		 *      file hasn't been reused yet or the file count
+		 *      isn't zero but the file has already been reused.
 		 */
-		if (unlikely(!get_file_rcu(file)))
+		file = __get_file_rcu(fdentry);
+		if (unlikely(!file))
+			return NULL;
+
+		if (unlikely(IS_ERR(file)))
 			continue;
 
 		/*
@@ -893,12 +963,20 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
 		 *
 		 * If so, we need to put our ref and try again.
 		 */
-		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
-		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
+		if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
 			fput(file);
 			continue;
 		}
 
+		/*
+		 * This isn't the file we're looking for or we're not
+		 * allowed to get a reference to it.
+		 */
+		if (unlikely(file->f_mode & mask)) {
+			fput(file);
+			return NULL;
+		}
+
 		/*
 		 * Ok, we have a ref to the file, and checked that it
 		 * still exists.
@@ -948,7 +1026,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
 	return file;
 }
 
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
+struct file *lookup_fdget_rcu(unsigned int fd)
+{
+	return __fget_files_rcu(current->files, fd, 0);
+
+}
+EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
+
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -957,13 +1042,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
 	task_lock(task);
 	files = task->files;
 	if (files)
-		file = files_lookup_fd_rcu(files, fd);
+		file = __fget_files_rcu(files, fd, 0);
 	task_unlock(task);
 
 	return file;
 }
 
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
 {
 	/* Must be called with rcu_read_lock held */
 	struct files_struct *files;
@@ -974,7 +1059,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	files = task->files;
 	if (files) {
 		for (; fd < files_fdtable(files)->max_fds; fd++) {
-			file = files_lookup_fd_rcu(files, fd);
+			file = __fget_files_rcu(files, fd, 0);
 			if (file)
 				break;
 		}
@@ -983,7 +1068,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret
 	*ret_fd = fd;
 	return file;
 }
-EXPORT_SYMBOL(task_lookup_next_fd_rcu);
+EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
 
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1272,12 +1357,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 {
 	if (unlikely(newfd == oldfd)) { /* corner case */
 		struct files_struct *files = current->files;
+		struct file *f;
 		int retval = oldfd;
 
 		rcu_read_lock();
-		if (!files_lookup_fd_rcu(files, oldfd))
+		f = __fget_files_rcu(files, oldfd, 0);
+		if (!f)
 			retval = -EBADF;
 		rcu_read_unlock();
+		if (f)
+			fput(f);
 		return retval;
 	}
 	return ksys_dup3(oldfd, newfd, 0);
diff --git a/fs/file_table.c b/fs/file_table.c
index e68e97d4f00a..a79a80031343 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -65,33 +65,33 @@ static void file_free_rcu(struct rcu_head *head)
 {
 	struct file *f = container_of(head, struct file, f_rcuhead);
 
-	put_cred(f->f_cred);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		kfree(backing_file(f));
-	else
-		kmem_cache_free(filp_cachep, f);
+	kfree(backing_file(f));
 }
 
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		path_put(backing_file_real_path(f));
 	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
 		percpu_counter_dec(&nr_files);
-	call_rcu(&f->f_rcuhead, file_free_rcu);
+	put_cred(f->f_cred);
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		path_put(backing_file_real_path(f));
+		call_rcu(&f->f_rcuhead, file_free_rcu);
+	} else {
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 void release_empty_file(struct file *f)
 {
 	WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
-	/* Uhm, we better find out who grabs references to an unopened file. */
-	WARN_ON_ONCE(atomic_long_cmpxchg(&f->f_count, 1, 0) != 1);
-	security_file_free(f);
-	put_cred(f->f_cred);
-	if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
-		percpu_counter_dec(&nr_files);
-	kmem_cache_free(filp_cachep, f);
+	if (atomic_long_dec_and_test(&f->f_count)) {
+		security_file_free(f);
+		put_cred(f->f_cred);
+		if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
+			percpu_counter_dec(&nr_files);
+		kmem_cache_free(filp_cachep, f);
+	}
 }
 
 /*
@@ -176,7 +176,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 		return error;
 	}
 
-	atomic_long_set(&f->f_count, 1);
 	rwlock_init(&f->f_owner.lock);
 	spin_lock_init(&f->f_lock);
 	mutex_init(&f->f_pos_lock);
@@ -184,6 +183,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
 	f->f_mode = OPEN_FMODE(flags);
 	/* f->f_version: 0 */
 
+	/*
+	 * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
+	 * fget-rcu pattern users need to be able to handle spurious
+	 * refcount bumps we should reinitialize the reused file first.
+	 */
+	atomic_long_set(&f->f_count, 1);
 	return 0;
 }
 
@@ -483,7 +488,8 @@ EXPORT_SYMBOL(__fput_sync);
 void __init files_init(void)
 {
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
+				SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
+				SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	percpu_counter_init(&nr_files, 0, GFP_KERNEL);
 }
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9cbf8d98489a..b4bc873aab7d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2717,16 +2717,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
 	for(;; i->fd++) {
 		struct inode *inode;
 
-		i->file = task_lookup_next_fd_rcu(i->task, &i->fd);
+		i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
 		if (!i->file) {
 			i->fd = 0;
 			break;
 		}
+
 		inode = file_inode(i->file);
-		if (inode->i_sb != i->sb)
-			continue;
-		if (get_file_rcu(i->file))
+		if (inode->i_sb == i->sb)
 			break;
+
+		rcu_read_unlock();
+		fput(i->file);
+		rcu_read_lock();
 	}
 	rcu_read_unlock();
 	return i->file;
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ebdcc25df0f7..869b016014d2 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	struct dnotify_struct *dn;
 	struct inode *inode;
 	fl_owner_t id = current->files;
-	struct file *f;
+	struct file *f = NULL;
 	int destroy = 0, error = 0;
 	__u32 mask;
 
@@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
 	}
 
 	rcu_read_lock();
-	f = lookup_fd_rcu(fd);
+	f = lookup_fdget_rcu(fd);
 	rcu_read_unlock();
 
 	/* if (f != filp) means that we lost a race and another task/thread
@@ -392,6 +392,8 @@ out_err:
 		fsnotify_put_mark(new_fsn_mark);
 	if (dn)
 		kmem_cache_free(dnotify_struct_cache, dn);
+	if (f)
+		fput(f);
 	return error;
 }
 
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6276b3938842..6e72e5ad42bc 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, fd);
-	if (file)
-		*mode = file->f_mode;
+	file = task_lookup_fdget_rcu(task, fd);
 	rcu_read_unlock();
+	if (file) {
+		*mode = file->f_mode;
+		fput(file);
+	}
 	return !!file;
 }
 
@@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 		char name[10 + 1];
 		unsigned int len;
 
-		f = task_lookup_next_fd_rcu(p, &fd);
+		f = task_lookup_next_fdget_rcu(p, &fd);
 		ctx->pos = fd + 2LL;
 		if (!f)
 			break;
 		data.mode = f->f_mode;
 		rcu_read_unlock();
+		fput(f);
 		data.fd = fd;
 
 		len = snprintf(name, sizeof(name), "%u", fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..bc4c3287a65e 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,20 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un
 	return files_lookup_fd_raw(files, fd);
 }
 
-static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
-{
-	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
-			   "suspicious rcu_dereference_check() usage");
-	return files_lookup_fd_raw(files, fd);
-}
-
-static inline struct file *lookup_fd_rcu(unsigned int fd)
-{
-	return files_lookup_fd_rcu(current->files, fd);
-}
-
-struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
-struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+struct file *lookup_fdget_rcu(unsigned int fd);
+struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd);
+struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *fd);
 
 struct task_struct;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 58dea591a341..ceafc40cc25f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1042,7 +1042,9 @@ static inline struct file *get_file(struct file *f)
 	atomic_long_inc(&f->f_count);
 	return f;
 }
-#define get_file_rcu(x) atomic_long_inc_not_zero(&(x)->f_count)
+
+struct file *get_file_rcu(struct file __rcu **f);
+
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
 #define	MAX_NON_LFS	((1UL<<31) - 1)
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index c4ab9d6cdbe9..82ad23b1d257 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -308,11 +308,9 @@ again:
 	rcu_read_lock();
 	for (;; curr_fd++) {
 		struct file *f;
-		f = task_lookup_next_fd_rcu(curr_task, &curr_fd);
+		f = task_lookup_next_fdget_rcu(curr_task, &curr_fd);
 		if (!f)
 			break;
-		if (!get_file_rcu(f))
-			continue;
 
 		/* set info->fd */
 		info->fd = curr_fd;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b6d20dfb9a8..640123767726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1492,9 +1492,7 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 	struct file *exe_file;
 
 	rcu_read_lock();
-	exe_file = rcu_dereference(mm->exe_file);
-	if (exe_file && !get_file_rcu(exe_file))
-		exe_file = NULL;
+	exe_file = get_file_rcu(&mm->exe_file);
 	rcu_read_unlock();
 	return exe_file;
 }
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 5353edfad8e1..b0639f21041f 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -64,8 +64,10 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
 	struct file *file;
 
 	rcu_read_lock();
-	file = task_lookup_fd_rcu(task, idx);
+	file = task_lookup_fdget_rcu(task, idx);
 	rcu_read_unlock();
+	if (file)
+		fput(file);
 
 	return file;
 }

From 50d910d27362d6809a0668f0f1cb5220bc7dc6a0 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 10 Oct 2023 10:23:26 +0200
Subject: [PATCH 13/26] io_uring: use files_lookup_fd_locked()

While valid we don't need to open-code rcu dereferences if we're
acquiring file_lock anyway.

Suggested-by: Al Viro <viro@zeniv.linux.org.uk>
Link: https://lore.kernel.org/r/20231010030615.GO800259@ZenIV
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 io_uring/openclose.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/io_uring/openclose.c b/io_uring/openclose.c
index e3fae26e025d..fb73adb89067 100644
--- a/io_uring/openclose.c
+++ b/io_uring/openclose.c
@@ -220,7 +220,6 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct files_struct *files = current->files;
 	struct io_close *close = io_kiocb_to_cmd(req, struct io_close);
-	struct fdtable *fdt;
 	struct file *file;
 	int ret = -EBADF;
 
@@ -230,13 +229,7 @@ int io_close(struct io_kiocb *req, unsigned int issue_flags)
 	}
 
 	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (close->fd >= fdt->max_fds) {
-		spin_unlock(&files->file_lock);
-		goto err;
-	}
-	file = rcu_dereference_protected(fdt->fd[close->fd],
-			lockdep_is_held(&files->file_lock));
+	file = files_lookup_fd_locked(files, close->fd);
 	if (!file || io_is_uring_fops(file)) {
 		spin_unlock(&files->file_lock);
 		goto err;

From 7116c0af4b8414b2f19fdb366eea213cbd9d91c2 Mon Sep 17 00:00:00 2001
From: Reuben Hawkins <reubenhwk@gmail.com>
Date: Mon, 2 Oct 2023 20:57:04 -0500
Subject: [PATCH 14/26] vfs: fix readahead(2) on block devices

Readahead was factored to call generic_fadvise.  That refactor added an
S_ISREG restriction which broke readahead on block devices.

In addition to S_ISREG, this change checks S_ISBLK to fix block device
readahead.  There is no change in behavior with any file type besides block
devices in this change.

Fixes: 3d8f7615319b ("vfs: implement readahead(2) using POSIX_FADV_WILLNEED")
Signed-off-by: Reuben Hawkins <reubenhwk@gmail.com>
Link: https://lore.kernel.org/r/20231003015704.2415-1-reubenhwk@gmail.com
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 mm/readahead.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index e815c114de21..6925e6959fd3 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -735,7 +735,8 @@ ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
 	 */
 	ret = -EINVAL;
 	if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
-	    !S_ISREG(file_inode(f.file)->i_mode))
+	    (!S_ISREG(file_inode(f.file)->i_mode) &&
+	    !S_ISBLK(file_inode(f.file)->i_mode)))
 		goto out;
 
 	ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);

From 6cf41fcfe099b61b4e3ea7dbe04a906f4c904822 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 5 Oct 2023 19:08:35 +0200
Subject: [PATCH 15/26] backing file: free directly

Backing files as used by overlayfs are never installed into file
descriptor tables and are explicitly documented as such. They aren't
subject to rcu access conditions like regular files are.

Their lifetime is bound to the lifetime of the overlayfs file, i.e.,
they're stashed in ovl_file->private_data and go away otherwise. If
they're set as vma->vm_file - which is their main purpose - then they're
subject to regular refcount rules and vma->vm_file can't be installed
into an fdtable after having been set. All in all I don't see any need
for rcu delay here. So free it directly.

This all hinges on such hybrid beasts to never actually be installed
into fdtables which - as mentioned before - is not allowed. So add an
explicit WARN_ON_ONCE() so we catch any case where someone is suddenly
trying to install one of those things into a file descriptor table so we
can have a nice long chat with them.

Link: https://lore.kernel.org/r/20231005-sakralbau-wappnen-f5c31755ed70@brauner
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file.c       | 3 +++
 fs/file_table.c | 9 +--------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 4eb026631673..1a475d7d636e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file)
 	struct files_struct *files = current->files;
 	struct fdtable *fdt;
 
+	if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
+		return;
+
 	rcu_read_lock_sched();
 
 	if (unlikely(files->resize_in_progress)) {
diff --git a/fs/file_table.c b/fs/file_table.c
index a79a80031343..08fd1dd6d863 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -61,13 +61,6 @@ struct path *backing_file_real_path(struct file *f)
 }
 EXPORT_SYMBOL_GPL(backing_file_real_path);
 
-static void file_free_rcu(struct rcu_head *head)
-{
-	struct file *f = container_of(head, struct file, f_rcuhead);
-
-	kfree(backing_file(f));
-}
-
 static inline void file_free(struct file *f)
 {
 	security_file_free(f);
@@ -76,7 +69,7 @@ static inline void file_free(struct file *f)
 	put_cred(f->f_cred);
 	if (unlikely(f->f_mode & FMODE_BACKING)) {
 		path_put(backing_file_real_path(f));
-		call_rcu(&f->f_rcuhead, file_free_rcu);
+		kfree(backing_file(f));
 	} else {
 		kmem_cache_free(filp_cachep, f);
 	}

From 95e93d17cb113a3b097774874572b69c058acab5 Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 4 Oct 2023 13:19:15 +0200
Subject: [PATCH 16/26] vfs: predict the error in retry_estale as unlikely

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231004111916.728135-2-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/namei.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/namei.h b/include/linux/namei.h
index e3619920f9f0..3100371b5e32 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -136,7 +136,7 @@ static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 static inline bool
 retry_estale(const long error, const unsigned int flags)
 {
-	return error == -ESTALE && !(flags & LOOKUP_REVAL);
+	return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
 }
 
 #endif /* _LINUX_NAMEI_H */

From 6c4d1c99d2ad5e12b897da482ca8305b929cb9ce Mon Sep 17 00:00:00 2001
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Wed, 4 Oct 2023 13:19:16 +0200
Subject: [PATCH 17/26] vfs: stop counting on gcc not messing with
 mnt_expiry_mark if not asked

So happens it already was not doing it, but there is no need to "hope"
as indicated in the comment.

No changes in generated assembly.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Link: https://lore.kernel.org/r/20231004111916.728135-3-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namespace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 9e0ea73f2ff9..6bde71735efa 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1346,9 +1346,9 @@ void mntput(struct vfsmount *mnt)
 {
 	if (mnt) {
 		struct mount *m = real_mount(mnt);
-		/* avoid cacheline pingpong, hope gcc doesn't get "smart" */
+		/* avoid cacheline pingpong */
 		if (unlikely(m->mnt_expiry_mark))
-			m->mnt_expiry_mark = 0;
+			WRITE_ONCE(m->mnt_expiry_mark, 0);
 		mntput_no_expire(m);
 	}
 }

From 83bc1d294130cc471a89ce10770daa281a93fcb0 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:10 +0300
Subject: [PATCH 18/26] fs: get mnt_writers count for an open backing file's
 real path

A writeable mapped backing file can perform writes to the real inode.
Therefore, the real path mount must be kept writable so long as the
writable map exists.

This may not be strictly needed for ovelrayfs private upper mount,
but it is correct to take the mnt_writers count in the vfs helper.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-2-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h | 11 +++++++++--
 fs/open.c     | 31 +++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index f08d8fe3ae5e..4e93a685bdaa 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -96,13 +96,20 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
 struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
 void release_empty_file(struct file *f);
 
+static inline void file_put_write_access(struct file *file)
+{
+	put_write_access(file->f_inode);
+	mnt_put_write_access(file->f_path.mnt);
+	if (unlikely(file->f_mode & FMODE_BACKING))
+		mnt_put_write_access(backing_file_real_path(file)->mnt);
+}
+
 static inline void put_file_access(struct file *file)
 {
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
 		i_readcount_dec(file->f_inode);
 	} else if (file->f_mode & FMODE_WRITER) {
-		put_write_access(file->f_inode);
-		mnt_put_write_access(file->f_path.mnt);
+		file_put_write_access(file);
 	}
 }
 
diff --git a/fs/open.c b/fs/open.c
index a65ce47810cf..fe63e236da22 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 	return ksys_fchown(fd, user, group);
 }
 
+static inline int file_get_write_access(struct file *f)
+{
+	int error;
+
+	error = get_write_access(f->f_inode);
+	if (unlikely(error))
+		return error;
+	error = mnt_get_write_access(f->f_path.mnt);
+	if (unlikely(error))
+		goto cleanup_inode;
+	if (unlikely(f->f_mode & FMODE_BACKING)) {
+		error = mnt_get_write_access(backing_file_real_path(f)->mnt);
+		if (unlikely(error))
+			goto cleanup_mnt;
+	}
+	return 0;
+
+cleanup_mnt:
+	mnt_put_write_access(f->f_path.mnt);
+cleanup_inode:
+	put_write_access(f->f_inode);
+	return error;
+}
+
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
 			  int (*open)(struct inode *, struct file *))
@@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f,
 	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
 		i_readcount_inc(inode);
 	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
-		error = get_write_access(inode);
+		error = file_get_write_access(f);
 		if (unlikely(error))
 			goto cleanup_file;
-		error = mnt_get_write_access(f->f_path.mnt);
-		if (unlikely(error)) {
-			put_write_access(inode);
-			goto cleanup_file;
-		}
 		f->f_mode |= FMODE_WRITER;
 	}
 

From 08582d678fcf11fc86188f0b92239d3d49667d8e Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:11 +0300
Subject: [PATCH 19/26] fs: create helper file_user_path() for user displayed
 mapped file path

Overlayfs uses backing files with "fake" overlayfs f_path and "real"
underlying f_inode, in order to use underlying inode aops for mapped
files and to display the overlayfs path in /proc/<pid>/maps.

In preparation for storing the overlayfs "fake" path instead of the
underlying "real" path in struct backing_file, define a noop helper
file_user_path() that returns f_path for now.

Use the new helper in procfs and kernel logs whenever a path of a
mapped file is displayed to users.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-3-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 arch/arc/kernel/troubleshoot.c |  6 ++++--
 fs/proc/base.c                 |  2 +-
 fs/proc/nommu.c                |  2 +-
 fs/proc/task_mmu.c             |  4 ++--
 fs/proc/task_nommu.c           |  2 +-
 include/linux/fs.h             | 14 ++++++++++++++
 kernel/trace/trace_output.c    |  2 +-
 7 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c
index d5b3ed2c58f5..c380d8c30704 100644
--- a/arch/arc/kernel/troubleshoot.c
+++ b/arch/arc/kernel/troubleshoot.c
@@ -90,10 +90,12 @@ static void show_faulting_vma(unsigned long address)
 	 */
 	if (vma) {
 		char buf[ARC_PATH_MAX];
-		char *nm = "?";
+		char *nm = "anon";
 
 		if (vma->vm_file) {
-			nm = file_path(vma->vm_file, buf, ARC_PATH_MAX-1);
+			/* XXX: can we use %pD below and get rid of buf? */
+			nm = d_path(file_user_path(vma->vm_file), buf,
+				    ARC_PATH_MAX-1);
 			if (IS_ERR(nm))
 				nm = "?";
 		}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ffd54617c354..20695c928ee6 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	rc = -ENOENT;
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = *file_user_path(vma->vm_file);
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index 4d3493579458..c6e7ebc63756 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3dd5be96691b..1593940ca01e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
 		if (anon_name)
 			seq_printf(m, "[anon_shmem:%s]", anon_name->name);
 		else
-			seq_file_path(m, file, "\n");
+			seq_path(m, file_user_path(file), "\n");
 		goto done;
 	}
 
@@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 
 	if (file) {
 		seq_puts(m, " file=");
-		seq_file_path(m, file, "\n\t= ");
+		seq_path(m, file_user_path(file), "\n\t= ");
 	} else if (vma_is_initial_heap(vma)) {
 		seq_puts(m, " heap");
 	} else if (vma_is_initial_stack(vma)) {
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index a8ac0dd8041e..2d5862c13399 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 
 	if (file) {
 		seq_pad(m, ' ');
-		seq_file_path(m, file, "");
+		seq_path(m, file_user_path(file), "");
 	} else if (mm && vma_is_initial_stack(vma)) {
 		seq_pad(m, ' ');
 		seq_puts(m, "[stack]");
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ceafc40cc25f..057a3bbd4d27 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2513,6 +2513,20 @@ static inline const struct path *file_real_path(struct file *f)
 	return &f->f_path;
 }
 
+/*
+ * file_user_path - get the path to display for memory mapped file
+ *
+ * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
+ * stored in ->vm_file is a backing file whose f_inode is on the underlying
+ * filesystem.  When the mapped file path is displayed to user (e.g. via
+ * /proc/<pid>/maps), this helper should be used to get the path to display
+ * to the user, which is the path of the fd that user has requested to map.
+ */
+static inline const struct path *file_user_path(struct file *f)
+{
+	return &f->f_path;
+}
+
 static inline struct file *file_clone_open(struct file *file)
 {
 	return dentry_open(&file->f_path, file->f_flags, file->f_cred);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index db575094c498..d8b302d01083 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -404,7 +404,7 @@ static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 			vmstart = vma->vm_start;
 		}
 		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
+			ret = trace_seq_path(s, file_user_path(file));
 			if (ret)
 				trace_seq_printf(s, "[+0x%lx]",
 						 ip - vmstart);

From def3ae83da02f87005210fa3d448c5dd37ba4105 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Mon, 9 Oct 2023 18:37:12 +0300
Subject: [PATCH 20/26] fs: store real path instead of fake path in backing
 file f_path

A backing file struct stores two path's, one "real" path that is referring
to f_inode and one "fake" path, which should be displayed to users in
/proc/<pid>/maps.

There is a lot more potential code that needs to know the "real" path, then
code that needs to know the "fake" path.

Instead of code having to request the "real" path with file_real_path(),
store the "real" path in f_path and require code that needs to know the
"fake" path request it with file_user_path().
Replace the file_real_path() helper with a simple const accessor f_path().

After this change, file_dentry() is not expected to observe any files
with overlayfs f_path and real f_inode, so the call to ->d_real() should
not be needed.  Leave the ->d_real() call for now and add an assertion
in ovl_d_real() to catch if we made wrong assumptions.

Suggested-by: Miklos Szeredi <miklos@szeredi.hu>
Link: https://lore.kernel.org/r/CAJfpegtt48eXhhjDFA1ojcHPNKj3Go6joryCPtEFAKpocyBsnw@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Link: https://lore.kernel.org/r/20231009153712.1566422-4-amir73il@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/file_table.c          | 12 ++++++------
 fs/internal.h            |  2 +-
 fs/open.c                | 23 +++++++++++------------
 fs/overlayfs/super.c     | 16 ++++++++++++----
 include/linux/fs.h       | 22 ++++------------------
 include/linux/fsnotify.h |  3 +--
 6 files changed, 35 insertions(+), 43 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 08fd1dd6d863..fa92743ba6a9 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly;
 
 static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* Container for backing file with optional real path */
+/* Container for backing file with optional user path */
 struct backing_file {
 	struct file file;
-	struct path real_path;
+	struct path user_path;
 };
 
 static inline struct backing_file *backing_file(struct file *f)
@@ -55,11 +55,11 @@ static inline struct backing_file *backing_file(struct file *f)
 	return container_of(f, struct backing_file, file);
 }
 
-struct path *backing_file_real_path(struct file *f)
+struct path *backing_file_user_path(struct file *f)
 {
-	return &backing_file(f)->real_path;
+	return &backing_file(f)->user_path;
 }
-EXPORT_SYMBOL_GPL(backing_file_real_path);
+EXPORT_SYMBOL_GPL(backing_file_user_path);
 
 static inline void file_free(struct file *f)
 {
@@ -68,7 +68,7 @@ static inline void file_free(struct file *f)
 		percpu_counter_dec(&nr_files);
 	put_cred(f->f_cred);
 	if (unlikely(f->f_mode & FMODE_BACKING)) {
-		path_put(backing_file_real_path(f));
+		path_put(backing_file_user_path(f));
 		kfree(backing_file(f));
 	} else {
 		kmem_cache_free(filp_cachep, f);
diff --git a/fs/internal.h b/fs/internal.h
index 4e93a685bdaa..58e43341aebf 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -101,7 +101,7 @@ static inline void file_put_write_access(struct file *file)
 	put_write_access(file->f_inode);
 	mnt_put_write_access(file->f_path.mnt);
 	if (unlikely(file->f_mode & FMODE_BACKING))
-		mnt_put_write_access(backing_file_real_path(file)->mnt);
+		mnt_put_write_access(backing_file_user_path(file)->mnt);
 }
 
 static inline void put_file_access(struct file *file)
diff --git a/fs/open.c b/fs/open.c
index fe63e236da22..02dc608d40d8 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -881,7 +881,7 @@ static inline int file_get_write_access(struct file *f)
 	if (unlikely(error))
 		goto cleanup_inode;
 	if (unlikely(f->f_mode & FMODE_BACKING)) {
-		error = mnt_get_write_access(backing_file_real_path(f)->mnt);
+		error = mnt_get_write_access(backing_file_user_path(f)->mnt);
 		if (unlikely(error))
 			goto cleanup_mnt;
 	}
@@ -1182,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open);
 
 /**
  * backing_file_open - open a backing file for kernel internal use
- * @path:	path of the file to open
+ * @user_path:	path that the user reuqested to open
  * @flags:	open flags
  * @real_path:	path of the backing file
  * @cred:	credentials for open
  *
  * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @path may be on the stackable filesystem and backing inode on the
- * underlying filesystem. In this case, we want to be able to return
- * the @real_path of the backing inode. This is done by embedding the
- * returned file into a container structure that also stores the path of
- * the backing inode on the underlying filesystem, which can be
- * retrieved using backing_file_real_path().
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem.  In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
  */
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred)
 {
@@ -1206,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags,
 	if (IS_ERR(f))
 		return f;
 
-	f->f_path = *path;
-	path_get(real_path);
-	*backing_file_real_path(f) = *real_path;
+	path_get(user_path);
+	*backing_file_user_path(f) = *user_path;
+	f->f_path = *real_path;
 	error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
 	if (error) {
 		fput(f);
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index def266b5e2a3..9f43f0d303ad 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	struct dentry *real = NULL, *lower;
 	int err;
 
-	/* It's an overlay file */
+	/*
+	 * vfs is only expected to call d_real() with NULL from d_real_inode()
+	 * and with overlay inode from file_dentry() on an overlay file.
+	 *
+	 * TODO: remove @inode argument from d_real() API, remove code in this
+	 * function that deals with non-NULL @inode and remove d_real() call
+	 * from file_dentry().
+	 */
 	if (inode && d_inode(dentry) == inode)
 		return dentry;
+	else if (inode)
+		goto bug;
 
 	if (!d_is_reg(dentry)) {
-		if (!inode || inode == d_inode(dentry))
-			return dentry;
-		goto bug;
+		/* d_real_inode() is only relevant for regular files */
+		return dentry;
 	}
 
 	real = ovl_dentry_upper(dentry);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 057a3bbd4d27..b37dccfb4534 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2492,26 +2492,10 @@ struct file *dentry_open(const struct path *path, int flags,
 			 const struct cred *creds);
 struct file *dentry_create(const struct path *path, int flags, umode_t mode,
 			   const struct cred *cred);
-struct file *backing_file_open(const struct path *path, int flags,
+struct file *backing_file_open(const struct path *user_path, int flags,
 			       const struct path *real_path,
 			       const struct cred *cred);
-struct path *backing_file_real_path(struct file *f);
-
-/*
- * file_real_path - get the path corresponding to f_inode
- *
- * When opening a backing file for a stackable filesystem (e.g.,
- * overlayfs) f_path may be on the stackable filesystem and f_inode on
- * the underlying filesystem.  When the path associated with f_inode is
- * needed, this helper should be used instead of accessing f_path
- * directly.
-*/
-static inline const struct path *file_real_path(struct file *f)
-{
-	if (unlikely(f->f_mode & FMODE_BACKING))
-		return backing_file_real_path(f);
-	return &f->f_path;
-}
+struct path *backing_file_user_path(struct file *f);
 
 /*
  * file_user_path - get the path to display for memory mapped file
@@ -2524,6 +2508,8 @@ static inline const struct path *file_real_path(struct file *f)
  */
 static inline const struct path *file_user_path(struct file *f)
 {
+	if (unlikely(f->f_mode & FMODE_BACKING))
+		return backing_file_user_path(f);
 	return &f->f_path;
 }
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index ed48e4f1e755..bcb6609b54b3 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -96,8 +96,7 @@ static inline int fsnotify_file(struct file *file, __u32 mask)
 	if (file->f_mode & FMODE_NONOTIFY)
 		return 0;
 
-	/* Overlayfs internal files have fake f_path */
-	path = file_real_path(file);
+	path = &file->f_path;
 	return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
 }
 

From e4e8b47a34a432c3f65534d12d5c132b6639da71 Mon Sep 17 00:00:00 2001
From: Max Kellermann <mk@cm4all.com>
Date: Mon, 15 Jan 2018 18:30:46 +0100
Subject: [PATCH 21/26] fs: fix umask on NFS with CONFIG_FS_POSIX_ACL=n

Make IS_POSIXACL() return false if POSIX ACL support is disabled.

Never skip applying the umask in namei.c and never bother to do any
ACL specific checks if the filesystem falsely indicates it has ACLs
enabled when the feature is completely disabled in the kernel.

This fixes a problem where the umask is always ignored in the NFS
client when compiled without CONFIG_FS_POSIX_ACL.  This is a 4 year
old regression caused by commit 013cdf1088d723 which itself was not
completely wrong, but failed to consider all the side effects by
misdesigned VFS code.

Prior to that commit, there were two places where the umask could be
applied, for example when creating a directory:

 1. in the VFS layer in SYSCALL_DEFINE3(mkdirat), but only if
    !IS_POSIXACL()

 2. again (unconditionally) in nfs3_proc_mkdir()

The first one does not apply, because even without
CONFIG_FS_POSIX_ACL, the NFS client sets SB_POSIXACL in
nfs_fill_super().

After that commit, (2.) was replaced by:

 2b. in posix_acl_create(), called by nfs3_proc_mkdir()

There's one branch in posix_acl_create() which applies the umask;
however, without CONFIG_FS_POSIX_ACL, posix_acl_create() is an empty
dummy function which does not apply the umask.

The approach chosen by this patch is to make IS_POSIXACL() always
return false when POSIX ACL support is disabled, so the umask always
gets applied by the VFS layer.  This is consistent with the (regular)
behavior of posix_acl_create(): that function returns early if
IS_POSIXACL() is false, before applying the umask.

Therefore, posix_acl_create() is responsible for applying the umask if
there is ACL support enabled in the file system (SB_POSIXACL), and the
VFS layer is responsible for all other cases (no SB_POSIXACL or no
CONFIG_FS_POSIX_ACL).

Signed-off-by: Max Kellermann <mk@cm4all.com>
Link: https://lore.kernel.org/r/151603744662.29035.4910161264124875658.stgit@rabbit.intern.cm-ag
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/fs.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index b37dccfb4534..cb8bfa1f8ecb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2113,7 +2113,12 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_NOQUOTA(inode)	((inode)->i_flags & S_NOQUOTA)
 #define IS_APPEND(inode)	((inode)->i_flags & S_APPEND)
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
+
+#ifdef CONFIG_FS_POSIX_ACL
 #define IS_POSIXACL(inode)	__IS_FLG(inode, SB_POSIXACL)
+#else
+#define IS_POSIXACL(inode)	0
+#endif
 
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)

From 2bc5e5e8167f2114976f00755b9a0c7f17d6f105 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 12 Oct 2023 17:36:57 +0200
Subject: [PATCH 22/26] ovl: rely on SB_I_NOUMASK

In commit f61b9bb3f838 ("fs: add a new SB_I_NOUMASK flag") we added a
new SB_I_NOUMASK flag that is used by filesystems like NFS to indicate
that umask stripping is never supposed to be done in the vfs independent
of whether or not POSIX ACLs are supported.

Overlayfs falls into the same category as it raises SB_POSIXACL
unconditionally to defer umask application to the upper filesystem.

Now that we have SB_I_NOUMASK use that and make SB_POSIXACL properly
conditional on whether or not the kernel does have support for it. This
will enable use to turn IS_POSIXACL() into nop on kernels that don't
have POSIX ACL support avoding bugs from missed umask stripping.

Link: https://lore.kernel.org/r/20231012-einband-uferpromenade-80541a047a1f@brauner
Acked-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/overlayfs/super.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9f43f0d303ad..361189b676b0 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1489,8 +1489,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
 	sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers :
 		ovl_trusted_xattr_handlers;
 	sb->s_fs_info = ofs;
+#ifdef CONFIG_FS_POSIX_ACL
 	sb->s_flags |= SB_POSIXACL;
+#endif
 	sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE;
+	/*
+	 * Ensure that umask handling is done by the filesystems used
+	 * for the the upper layer instead of overlayfs as that would
+	 * lead to unexpected results.
+	 */
+	sb->s_iflags |= SB_I_NOUMASK;
 
 	err = -ENOMEM;
 	root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);

From e311ba29a552194ece6c1579fe434bdad550bfac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= <u.kleine-koenig@pengutronix.de>
Date: Fri, 13 Oct 2023 15:24:42 +0200
Subject: [PATCH 23/26] chardev: Simplify usage of try_module_get()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

try_module_get(NULL) is true, so there is no need to check owner being
NULL.

Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20231013132441.1406200-2-u.kleine-koenig@pengutronix.de
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/char_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/char_dev.c b/fs/char_dev.c
index 950b6919fb87..6ba032442b39 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p)
 	struct module *owner = p->owner;
 	struct kobject *kobj;
 
-	if (owner && !try_module_get(owner))
+	if (!try_module_get(owner))
 		return NULL;
 	kobj = kobject_get_unless_zero(&p->kobj);
 	if (!kobj)

From 6654408a33e6297d8e1d2773409431d487399b95 Mon Sep 17 00:00:00 2001
From: Jingbo Xu <jefflexu@linux.alibaba.com>
Date: Sat, 14 Oct 2023 20:55:11 +0800
Subject: [PATCH 24/26] writeback, cgroup: switch inodes with dirty timestamps
 to release dying cgwbs

The cgwb cleanup routine will try to release the dying cgwb by switching
the attached inodes.  It fetches the attached inodes from wb->b_attached
list, omitting the fact that inodes only with dirty timestamps reside in
wb->b_dirty_time list, which is the case when lazytime is enabled.  This
causes enormous zombie memory cgroup when lazytime is enabled, as inodes
with dirty timestamps can not be switched to a live cgwb for a long time.

It is reasonable not to switch cgwb for inodes with dirty data, as
otherwise it may break the bandwidth restrictions.  However since the
writeback of inode metadata is not accounted for, let's also switch
inodes with dirty timestamps to avoid zombie memory and block cgroups
when laztytime is enabled.

Fixes: c22d70a162d3 ("writeback, cgroup: release dying cgwbs by switching attached inodes")
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jingbo Xu <jefflexu@linux.alibaba.com>
Link: https://lore.kernel.org/r/20231014125511.102978-1-jefflexu@linux.alibaba.com
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/fs-writeback.c | 41 +++++++++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 969ce991b0b0..f5d98ed7b55a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -613,6 +613,24 @@ out_free:
 	kfree(isw);
 }
 
+static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
+				   struct list_head *list, int *nr)
+{
+	struct inode *inode;
+
+	list_for_each_entry(inode, list, i_io_list) {
+		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
+			continue;
+
+		isw->inodes[*nr] = inode;
+		(*nr)++;
+
+		if (*nr >= WB_MAX_INODES_PER_ISW - 1)
+			return true;
+	}
+	return false;
+}
+
 /**
  * cleanup_offline_cgwb - detach associated inodes
  * @wb: target wb
@@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 {
 	struct cgroup_subsys_state *memcg_css;
 	struct inode_switch_wbs_context *isw;
-	struct inode *inode;
 	int nr;
 	bool restart = false;
 
@@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
 
 	nr = 0;
 	spin_lock(&wb->list_lock);
-	list_for_each_entry(inode, &wb->b_attached, i_io_list) {
-		if (!inode_prepare_wbs_switch(inode, isw->new_wb))
-			continue;
-
-		isw->inodes[nr++] = inode;
-
-		if (nr >= WB_MAX_INODES_PER_ISW - 1) {
-			restart = true;
-			break;
-		}
-	}
+	/*
+	 * In addition to the inodes that have completed writeback, also switch
+	 * cgwbs for those inodes only with dirty timestamps. Otherwise, those
+	 * inodes won't be written back for a long time when lazytime is
+	 * enabled, and thus pinning the dying cgwbs. It won't break the
+	 * bandwidth restrictions, as writeback of inode metadata is not
+	 * accounted for.
+	 */
+	restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr);
+	if (!restart)
+		restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
 	spin_unlock(&wb->list_lock);
 
 	/* no attached inodes? bail out */

From c04d905f6c7c41f137de7e4a9279e5c938eb19ef Mon Sep 17 00:00:00 2001
From: Bernd Schubert <bschubert@ddn.com>
Date: Mon, 23 Oct 2023 20:47:18 +0200
Subject: [PATCH 25/26] vfs: Convert BUG_ON to WARN_ON_ONCE in
 open_last_lookups

The calling code actually handles -ECHILD, so this BUG_ON
can be converted to WARN_ON_ONCE.

Signed-off-by: Bernd Schubert <bschubert@ddn.com>
Link: https://lore.kernel.org/r/20231023184718.11143-1-bschubert@ddn.com
Cc: Christian Brauner <brauner@kernel.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Amir Goldstein <amir73il@gmail.com>
Cc: Dharmendra Singh <dsingh@ddn.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/namei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/namei.c b/fs/namei.c
index 127c868a8992..8571855c281d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3516,7 +3516,8 @@ static const char *open_last_lookups(struct nameidata *nd,
 		if (likely(dentry))
 			goto finish_lookup;
 
-		BUG_ON(nd->flags & LOOKUP_RCU);
+		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
+			return ERR_PTR(-ECHILD);
 	} else {
 		/* create side of things */
 		if (nd->flags & LOOKUP_RCU) {

From 61d4fb0b349ec1b33119913c3b0bd109de30142c Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 25 Oct 2023 12:14:37 +0200
Subject: [PATCH 26/26] file, i915: fix file reference for mmap_singleton()

Today we got a report at [1] for rcu stalls on the i915 testsuite in [2]
due to the conversion of files to SLAB_TYPSSAFE_BY_RCU. Afaict,
get_file_rcu() goes into an infinite loop trying to carefully verify
that i915->gem.mmap_singleton hasn't changed - see the splat below.

So I stared at this code to figure out what it actually does. It seems
that the i915->gem.mmap_singleton pointer itself never had rcu semantics.

The i915->gem.mmap_singleton is replaced in
file->f_op->release::singleton_release():

        static int singleton_release(struct inode *inode, struct file *file)
        {
                struct drm_i915_private *i915 = file->private_data;

                cmpxchg(&i915->gem.mmap_singleton, file, NULL);
                drm_dev_put(&i915->drm);

                return 0;
        }

The cmpxchg() is ordered against a concurrent update of
i915->gem.mmap_singleton from mmap_singleton(). IOW, when
mmap_singleton() fails to get a reference on i915->gem.mmap_singleton:

While mmap_singleton() does

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

it allocates a new file via anon_inode_getfile() and does

        smp_store_mb(i915->gem.mmap_singleton, file);

So, then what happens in the case of this bug is that at some point
fput() is called and drops the file->f_count to zero leaving the pointer
in i915->gem.mmap_singleton in tact.

Now, there might be delays until
file->f_op->release::singleton_release() is called and
i915->gem.mmap_singleton is set to NULL.

Say concurrently another task hits mmap_singleton() and does:

        rcu_read_lock();
        file = get_file_rcu(&i915->gem.mmap_singleton);
        rcu_read_unlock();

When get_file_rcu() fails to get a reference via atomic_inc_not_zero()
it will try the reload from i915->gem.mmap_singleton expecting it to be
NULL, assuming it has comparable semantics as we expect in
__fget_files_rcu().

But it hasn't so it reloads the same pointer again, trying the same
atomic_inc_not_zero() again and doing so until
file->f_op->release::singleton_release() of the old file has been
called.

So, in contrast to __fget_files_rcu() here we want to not retry when
atomic_inc_not_zero() has failed. We only want to retry in case we
managed to get a reference but the pointer did change on reload.

<3> [511.395679] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
<3> [511.395716] rcu:   Tasks blocked on level-1 rcu_node (CPUs 0-9): P6238
<3> [511.395934] rcu:   (detected by 16, t=65002 jiffies, g=123977, q=439 ncpus=20)
<6> [511.395944] task:i915_selftest   state:R  running task     stack:10568 pid:6238  tgid:6238  ppid:1001   flags:0x00004002
<6> [511.395962] Call Trace:
<6> [511.395966]  <TASK>
<6> [511.395974]  ? __schedule+0x3a8/0xd70
<6> [511.395995]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396003]  ? lockdep_hardirqs_on+0xc3/0x140
<6> [511.396013]  ? asm_sysvec_apic_timer_interrupt+0x1a/0x20
<6> [511.396029]  ? get_file_rcu+0x10/0x30
<6> [511.396039]  ? get_file_rcu+0x10/0x30
<6> [511.396046]  ? i915_gem_object_mmap+0xbc/0x450 [i915]
<6> [511.396509]  ? i915_gem_mmap+0x272/0x480 [i915]
<6> [511.396903]  ? mmap_region+0x253/0xb60
<6> [511.396925]  ? do_mmap+0x334/0x5c0
<6> [511.396939]  ? vm_mmap_pgoff+0x9f/0x1c0
<6> [511.396949]  ? rcu_is_watching+0x11/0x50
<6> [511.396962]  ? igt_mmap_offset+0xfc/0x110 [i915]
<6> [511.397376]  ? __igt_mmap+0xb3/0x570 [i915]
<6> [511.397762]  ? igt_mmap+0x11e/0x150 [i915]
<6> [511.398139]  ? __trace_bprintk+0x76/0x90
<6> [511.398156]  ? __i915_subtests+0xbf/0x240 [i915]
<6> [511.398586]  ? __pfx___i915_live_setup+0x10/0x10 [i915]
<6> [511.399001]  ? __pfx___i915_live_teardown+0x10/0x10 [i915]
<6> [511.399433]  ? __run_selftests+0xbc/0x1a0 [i915]
<6> [511.399875]  ? i915_live_selftests+0x4b/0x90 [i915]
<6> [511.400308]  ? i915_pci_probe+0x106/0x200 [i915]
<6> [511.400692]  ? pci_device_probe+0x95/0x120
<6> [511.400704]  ? really_probe+0x164/0x3c0
<6> [511.400715]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400722]  ? __driver_probe_device+0x73/0x160
<6> [511.400731]  ? driver_probe_device+0x19/0xa0
<6> [511.400741]  ? __driver_attach+0xb6/0x180
<6> [511.400749]  ? __pfx___driver_attach+0x10/0x10
<6> [511.400756]  ? bus_for_each_dev+0x77/0xd0
<6> [511.400770]  ? bus_add_driver+0x114/0x210
<6> [511.400781]  ? driver_register+0x5b/0x110
<6> [511.400791]  ? i915_init+0x23/0xc0 [i915]
<6> [511.401153]  ? __pfx_i915_init+0x10/0x10 [i915]
<6> [511.401503]  ? do_one_initcall+0x57/0x270
<6> [511.401515]  ? rcu_is_watching+0x11/0x50
<6> [511.401521]  ? kmalloc_trace+0xa3/0xb0
<6> [511.401532]  ? do_init_module+0x5f/0x210
<6> [511.401544]  ? load_module+0x1d00/0x1f60
<6> [511.401581]  ? init_module_from_file+0x86/0xd0
<6> [511.401590]  ? init_module_from_file+0x86/0xd0
<6> [511.401613]  ? idempotent_init_module+0x17c/0x230
<6> [511.401639]  ? __x64_sys_finit_module+0x56/0xb0
<6> [511.401650]  ? do_syscall_64+0x3c/0x90
<6> [511.401659]  ? entry_SYSCALL_64_after_hwframe+0x6e/0xd8
<6> [511.401684]  </TASK>

Link: [1]: https://lore.kernel.org/intel-gfx/SJ1PR11MB6129CB39EED831784C331BAFB9DEA@SJ1PR11MB6129.namprd11.prod.outlook.com
Link: [2]: https://intel-gfx-ci.01.org/tree/linux-next/next-20231013/bat-dg2-11/igt@i915_selftest@live@mman.html#dmesg-warnings10963
Cc: Jann Horn <jannh@google.com>,
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20231025-formfrage-watscheln-84526cd3bd7d@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 drivers/gpu/drm/i915/gem/i915_gem_mman.c |  4 +---
 fs/file.c                                | 25 ++++++++++++++++++++++++
 include/linux/fs.h                       |  1 +
 3 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 97bf10861cad..72b353737334 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -915,9 +915,7 @@ static struct file *mmap_singleton(struct drm_i915_private *i915)
 {
 	struct file *file;
 
-	rcu_read_lock();
-	file = get_file_rcu(&i915->gem.mmap_singleton);
-	rcu_read_unlock();
+	file = get_file_active(&i915->gem.mmap_singleton);
 	if (file)
 		return file;
 
diff --git a/fs/file.c b/fs/file.c
index 1a475d7d636e..5fb0b146e79e 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -927,6 +927,31 @@ struct file *get_file_rcu(struct file __rcu **f)
 }
 EXPORT_SYMBOL_GPL(get_file_rcu);
 
+/**
+ * get_file_active - try go get a reference to a file
+ * @f: the file to get a reference on
+ *
+ * In contast to get_file_rcu() the pointer itself isn't part of the
+ * reference counting.
+ *
+ * This function should rarely have to be used and only by users who
+ * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
+ *
+ * Return: Returns @f with the reference count increased or NULL.
+ */
+struct file *get_file_active(struct file **f)
+{
+	struct file __rcu *file;
+
+	rcu_read_lock();
+	file = __get_file_rcu(f);
+	rcu_read_unlock();
+	if (IS_ERR(file))
+		file = NULL;
+	return file;
+}
+EXPORT_SYMBOL_GPL(get_file_active);
+
 static inline struct file *__fget_files_rcu(struct files_struct *files,
        unsigned int fd, fmode_t mask)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index cb8bfa1f8ecb..b35815277cc6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1044,6 +1044,7 @@ static inline struct file *get_file(struct file *f)
 }
 
 struct file *get_file_rcu(struct file __rcu **f);
+struct file *get_file_active(struct file **f);
 
 #define file_count(x)	atomic_long_read(&(x)->f_count)