From b639585e71e63008373d3a9fd060b87315fe7ea8 Mon Sep 17 00:00:00 2001
From: Wang Jinchao <wangjinchao@xfusion.com>
Date: Wed, 31 Jan 2024 10:54:41 +0800
Subject: [PATCH 01/23] fork: Using clone_flags for legacy clone check

In the current implementation of clone(), there is a line that
initializes `u64 clone_flags = args->flags` at the top.
This means that there is no longer a need to use args->flags
for the legacy clone check.

Signed-off-by: Wang Jinchao <wangjinchao@xfusion.com>
Link: https://lore.kernel.org/r/202401311054+0800-wangjinchao@xfusion.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 47ff3b35352e..95647c66309f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2875,8 +2875,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
 	 * here has the advantage that we don't need to have a separate helper
 	 * to check for legacy clone().
 	 */
-	if ((args->flags & CLONE_PIDFD) &&
-	    (args->flags & CLONE_PARENT_SETTID) &&
+	if ((clone_flags & CLONE_PIDFD) &&
+	    (clone_flags & CLONE_PARENT_SETTID) &&
 	    (args->pidfd == args->parent_tid))
 		return -EINVAL;
 

From cdefbf2324ceda662e2667aa2f44e8b9de3d780f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 25 Jan 2024 17:17:34 +0100
Subject: [PATCH 02/23] pidfd: cleanup the usage of __pidfd_prepare's flags

- make pidfd_create() static.

- Don't pass O_RDWR | O_CLOEXEC to __pidfd_prepare() in copy_process(),
  __pidfd_prepare() adds these flags unconditionally.

- Kill the flags check in __pidfd_prepare(). sys_pidfd_open() checks the
  flags itself, all other users of pidfd_prepare() pass flags = 0.

  If we need a sanity check for those other in kernel users then
  WARN_ON_ONCE(flags & ~PIDFD_NONBLOCK) makes more sense.

- Don't pass O_RDWR to get_unused_fd_flags(), it ignores everything except
  O_CLOEXEC.

- Don't pass O_CLOEXEC to anon_inode_getfile(), it ignores everything except
  O_ACCMODE | O_NONBLOCK.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240125161734.GA778@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/pid.h | 1 -
 kernel/fork.c       | 9 +++------
 kernel/pid.c        | 2 +-
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 395cacce1179..e6a041cb8bac 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -73,7 +73,6 @@ struct file;
 extern struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
-int pidfd_create(struct pid *pid, unsigned int flags);
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
 
 static inline struct pid *get_pid(struct pid *pid)
diff --git a/kernel/fork.c b/kernel/fork.c
index 95647c66309f..726a92043531 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2130,15 +2130,12 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 	int pidfd;
 	struct file *pidfd_file;
 
-	if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
-		return -EINVAL;
-
-	pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
+	pidfd = get_unused_fd_flags(O_CLOEXEC);
 	if (pidfd < 0)
 		return pidfd;
 
 	pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
-					flags | O_RDWR | O_CLOEXEC);
+					flags | O_RDWR);
 	if (IS_ERR(pidfd_file)) {
 		put_unused_fd(pidfd);
 		return PTR_ERR(pidfd_file);
@@ -2524,7 +2521,7 @@ __latent_entropy struct task_struct *copy_process(
 	 */
 	if (clone_flags & CLONE_PIDFD) {
 		/* Note that no task has been attached to @pid yet. */
-		retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile);
+		retval = __pidfd_prepare(pid, 0, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;
diff --git a/kernel/pid.c b/kernel/pid.c
index b52b10865454..c7a3e359f8f5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -595,7 +595,7 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
  */
-int pidfd_create(struct pid *pid, unsigned int flags)
+static int pidfd_create(struct pid *pid, unsigned int flags)
 {
 	int pidfd;
 	struct file *pidfd_file;

From 21e25205d7f9b6d7d3807546dd12ea93844b7c8e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sat, 27 Jan 2024 14:24:07 +0100
Subject: [PATCH 03/23] pidfd: don't do_notify_pidfd() if !thread_group_empty()

do_notify_pidfd() makes no sense until the whole thread group exits, change
do_notify_parent() to check thread_group_empty().

This avoids the unnecessary do_notify_pidfd() when tsk is not a leader, or
it exits before other threads, or it has a ptraced EXIT_ZOMBIE sub-thread.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240127132407.GA29136@redhat.com
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/signal.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index c9c57d053ce4..9561a3962ca6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2050,9 +2050,11 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
-
-	/* Wake up all pidfd waiters */
-	do_notify_pidfd(tsk);
+	/*
+	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
+	 */
+	if (thread_group_empty(tsk))
+		do_notify_pidfd(tsk);
 
 	if (sig != SIGCHLD) {
 		/*

From 64bef697d33b75fc06c5789b3f8108680271529f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 31 Jan 2024 14:26:02 +0100
Subject: [PATCH 04/23] pidfd: implement PIDFD_THREAD flag for pidfd_open()

With this flag:

	- pidfd_open() doesn't require that the target task must be
	  a thread-group leader

	- pidfd_poll() succeeds when the task exits and becomes a
	  zombie (iow, passes exit_notify()), even if it is a leader
	  and thread-group is not empty.

	  This means that the behaviour of pidfd_poll(PIDFD_THREAD,
	  pid-of-group-leader) is not well defined if it races with
	  exec() from its sub-thread; pidfd_poll() can succeed or not
	  depending on whether pidfd_task_exited() is called before
	  or after exchange_tids().

	  Perhaps we can improve this behaviour later, pidfd_poll()
	  can probably take sig->group_exec_task into account. But
	  this doesn't really differ from the case when the leader
	  exits before other threads (so pidfd_poll() succeeds) and
	  then another thread execs and pidfd_poll() will block again.

thread_group_exited() is no longer used, perhaps it can die.

Co-developed-by: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com
Tested-by: Tycho Andersen <tandersen@netflix.com>
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exec.c                  |  6 +++++-
 include/linux/pid.h        |  3 ++-
 include/uapi/linux/pidfd.h |  3 ++-
 kernel/exit.c              |  7 +++++++
 kernel/fork.c              | 38 +++++++++++++++++++++++++++++++-------
 kernel/pid.c               | 14 +++-----------
 kernel/signal.c            |  6 ++++--
 7 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 8cdd5b2dd09c..b68f61bbcaa8 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1143,7 +1143,11 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
-
+		/*
+		 * leader and tsk exhanged their pids, the old pid dies,
+		 * wake up the PIDFD_THREAD waiters.
+		 */
+		do_notify_pidfd(leader);
 		/*
 		 * We are going to release_task()->ptrace_unlink() silently,
 		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
diff --git a/include/linux/pid.h b/include/linux/pid.h
index e6a041cb8bac..8124d57752b9 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -70,10 +70,11 @@ extern const struct file_operations pidfd_fops;
 
 struct file;
 
-extern struct pid *pidfd_pid(const struct file *file);
+struct pid *pidfd_pid(const struct file *file);
 struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
 struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
+void do_notify_pidfd(struct task_struct *task);
 
 static inline struct pid *get_pid(struct pid *pid)
 {
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 5406fbc13074..2e6461459877 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -7,6 +7,7 @@
 #include <linux/fcntl.h>
 
 /* Flags for pidfd_open().  */
-#define PIDFD_NONBLOCK O_NONBLOCK
+#define PIDFD_NONBLOCK	O_NONBLOCK
+#define PIDFD_THREAD	O_EXCL
 
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 3988a02efaef..c038d10dfb38 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -739,6 +739,13 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
 
 	tsk->exit_state = EXIT_ZOMBIE;
+	/*
+	 * sub-thread or delay_group_leader(), wake up the
+	 * PIDFD_THREAD waiters.
+	 */
+	if (!thread_group_empty(tsk))
+		do_notify_pidfd(tsk);
+
 	if (unlikely(tsk->ptrace)) {
 		int sig = thread_group_leader(tsk) &&
 				thread_group_empty(tsk) &&
diff --git a/kernel/fork.c b/kernel/fork.c
index 726a92043531..1a9b91055916 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -101,6 +101,7 @@
 #include <linux/user_events.h>
 #include <linux/iommu.h>
 #include <linux/rseq.h>
+#include <uapi/linux/pidfd.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -2050,6 +2051,8 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 
 	seq_put_decimal_ll(m, "Pid:\t", nr);
 
+	/* TODO: report PIDFD_THREAD */
+
 #ifdef CONFIG_PID_NS
 	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
 	if (nr > 0) {
@@ -2068,22 +2071,35 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
+static bool pidfd_task_exited(struct pid *pid, bool thread)
+{
+	struct task_struct *task;
+	bool exited;
+
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	exited = !task ||
+		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
+	rcu_read_unlock();
+
+	return exited;
+}
+
 /*
  * Poll support for process exit notification.
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct pid *pid = file->private_data;
+	bool thread = file->f_flags & PIDFD_THREAD;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
-
 	/*
-	 * Inform pollers only when the whole thread group exits.
-	 * If the thread group leader exits before all other threads in the
-	 * group, then poll(2) should block, similar to the wait(2) family.
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
 	 */
-	if (thread_group_exited(pid))
+	if (pidfd_task_exited(pid, thread))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
 
 	return poll_flags;
@@ -2141,6 +2157,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 		return PTR_ERR(pidfd_file);
 	}
 	get_pid(pid); /* held by pidfd_file now */
+	/*
+	 * anon_inode_getfile() ignores everything outside of the
+	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
+	 */
+	pidfd_file->f_flags |= (flags & PIDFD_THREAD);
 	*ret = pidfd_file;
 	return pidfd;
 }
@@ -2154,7 +2175,8 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  * Allocate a new file that stashes @pid and reserve a new pidfd number in the
  * caller's file descriptor table. The pidfd is reserved but not installed yet.
  *
- * The helper verifies that @pid is used as a thread group leader.
+ * The helper verifies that @pid is still in use, without PIDFD_THREAD the
+ * task identified by @pid must be a thread-group leader.
  *
  * If this function returns successfully the caller is responsible to either
  * call fd_install() passing the returned pidfd and pidfd file as arguments in
@@ -2173,7 +2195,9 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
  */
 int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
 {
-	if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+	bool thread = flags & PIDFD_THREAD;
+
+	if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
 		return -EINVAL;
 
 	return __pidfd_prepare(pid, flags, ret);
diff --git a/kernel/pid.c b/kernel/pid.c
index c7a3e359f8f5..e11144466828 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -552,11 +552,6 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
  * Return the task associated with @pidfd. The function takes a reference on
  * the returned task. The caller is responsible for releasing that reference.
  *
- * Currently, the process identified by @pidfd is always a thread-group leader.
- * This restriction currently exists for all aspects of pidfds including pidfd
- * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
- * (only supports thread group leaders).
- *
  * Return: On success, the task_struct associated with the pidfd.
  *	   On error, a negative errno number will be returned.
  */
@@ -615,11 +610,8 @@ static int pidfd_create(struct pid *pid, unsigned int flags)
  * @flags: flags to pass
  *
  * This creates a new pid file descriptor with the O_CLOEXEC flag set for
- * the process identified by @pid. Currently, the process identified by
- * @pid must be a thread-group leader. This restriction currently exists
- * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
- * be used with CLONE_THREAD) and pidfd polling (only supports thread group
- * leaders).
+ * the task identified by @pid. Without PIDFD_THREAD flag the target task
+ * must be a thread-group leader.
  *
  * Return: On success, a cloexec pidfd is returned.
  *         On error, a negative errno number will be returned.
@@ -629,7 +621,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
 	int fd;
 	struct pid *p;
 
-	if (flags & ~PIDFD_NONBLOCK)
+	if (flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD))
 		return -EINVAL;
 
 	if (pid <= 0)
diff --git a/kernel/signal.c b/kernel/signal.c
index 9561a3962ca6..9b40109f0c56 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2019,7 +2019,7 @@ ret:
 	return ret;
 }
 
-static void do_notify_pidfd(struct task_struct *task)
+void do_notify_pidfd(struct task_struct *task)
 {
 	struct pid *pid;
 
@@ -2051,7 +2051,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
 	WARN_ON_ONCE(!tsk->ptrace &&
 	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
 	/*
-	 * tsk is a group leader and has no threads, wake up the pidfd waiters.
+	 * tsk is a group leader and has no threads, wake up the
+	 * non-PIDFD_THREAD waiters.
 	 */
 	if (thread_group_empty(tsk))
 		do_notify_pidfd(tsk);
@@ -3926,6 +3927,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		prepare_kill_siginfo(sig, &kinfo);
 	}
 
+	/* TODO: respect PIDFD_THREAD */
 	ret = kill_pid_info(sig, &kinfo, pid);
 
 err:

From 43f0df54c96fa5abcab9df8649c1e52119bf0238 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 2 Feb 2024 14:12:26 +0100
Subject: [PATCH 05/23] pidfd_poll: report POLLHUP when pid_task() == NULL

Add another wake_up_all(wait_pidfd) into __change_pid() and change
pidfd_poll() to include EPOLLHUP if task == NULL.

This allows to wait until the target process/thread is reaped.

TODO: change do_notify_pidfd() to use the keyed wakeups.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240202131226.GA26018@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 22 +++++++---------------
 kernel/pid.c  |  5 +++++
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 1a9b91055916..aa08193d124f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2071,20 +2071,6 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 }
 #endif
 
-static bool pidfd_task_exited(struct pid *pid, bool thread)
-{
-	struct task_struct *task;
-	bool exited;
-
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
-	exited = !task ||
-		(READ_ONCE(task->exit_state) && (thread || thread_group_empty(task)));
-	rcu_read_unlock();
-
-	return exited;
-}
-
 /*
  * Poll support for process exit notification.
  */
@@ -2092,6 +2078,7 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
 	struct pid *pid = file->private_data;
 	bool thread = file->f_flags & PIDFD_THREAD;
+	struct task_struct *task;
 	__poll_t poll_flags = 0;
 
 	poll_wait(file, &pid->wait_pidfd, pts);
@@ -2099,8 +2086,13 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	 * Depending on PIDFD_THREAD, inform pollers when the thread
 	 * or the whole thread-group exits.
 	 */
-	if (pidfd_task_exited(pid, thread))
+	rcu_read_lock();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+	else if (task->exit_state && (thread || thread_group_empty(task)))
 		poll_flags = EPOLLIN | EPOLLRDNORM;
+	rcu_read_unlock();
 
 	return poll_flags;
 }
diff --git a/kernel/pid.c b/kernel/pid.c
index e11144466828..62461c7c82b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -349,6 +349,11 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
 	hlist_del_rcu(&task->pid_links[type]);
 	*pid_ptr = new;
 
+	if (type == PIDTYPE_PID) {
+		WARN_ON_ONCE(pid_has_task(pid, PIDTYPE_PID));
+		wake_up_all(&pid->wait_pidfd);
+	}
+
 	for (tmp = PIDTYPE_MAX; --tmp >= 0; )
 		if (pid_has_task(pid, tmp))
 			return;

From 90f92b68c9869913753f8bc1d87b7762a5f36873 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 2 Feb 2024 14:12:48 +0100
Subject: [PATCH 06/23] pidfd: kill the no longer needed do_notify_pidfd() in
 de_thread()

Now that __change_pid() does wake_up_all(&pid->wait_pidfd) we can kill
do_notify_pidfd(leader) in de_thread(), it calls release_task(leader)
right after that and this implies detach_pid(leader, PIDTYPE_PID).

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240202131248.GA26022@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/exec.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index b68f61bbcaa8..ca0d53edac99 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1143,11 +1143,6 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
-		/*
-		 * leader and tsk exhanged their pids, the old pid dies,
-		 * wake up the PIDFD_THREAD waiters.
-		 */
-		do_notify_pidfd(leader);
 		/*
 		 * We are going to release_task()->ptrace_unlink() silently,
 		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees

From a1c6d5439fbddd06aad3ddbb7f12df0b98354070 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 2 Feb 2024 14:12:55 +0100
Subject: [PATCH 07/23] pid: kill the obsolete PIDTYPE_PID code in
 transfer_pid()

transfer_pid() must be never called with pid == PIDTYPE_PID,
new_leader->thread_pid should be changed by exchange_tids().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240202131255.GA26025@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/pid.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index 62461c7c82b8..de0bf2f8d18b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -396,8 +396,7 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
 void transfer_pid(struct task_struct *old, struct task_struct *new,
 			   enum pid_type type)
 {
-	if (type == PIDTYPE_PID)
-		new->thread_pid = old->thread_pid;
+	WARN_ON_ONCE(type == PIDTYPE_PID);
 	hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
 }
 

From 9ed52108f6478a6a805c0c15a3f70aabba07247e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 5 Feb 2024 15:13:48 +0100
Subject: [PATCH 08/23] pidfd: change do_notify_pidfd() to use
 __wake_up(poll_to_key(EPOLLIN))

rather than wake_up_all(). This way do_notify_pidfd() won't wakeup the
POLLHUP-only waiters which wait for pid_task() == NULL.

TODO:
    - as Christian pointed out, this asks for the new wake_up_all_poll()
      helper, it can already have other users.

    - we can probably discriminate the PIDFD_THREAD and non-PIDFD_THREAD
      waiters, but this needs more work. See
      https://lore.kernel.org/all/20240205140848.GA15853@redhat.com/

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240205141348.GA16539@redhat.com
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/signal.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index 9b40109f0c56..c3fac06937e2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2021,11 +2021,12 @@ ret:
 
 void do_notify_pidfd(struct task_struct *task)
 {
-	struct pid *pid;
+	struct pid *pid = task_pid(task);
 
 	WARN_ON(task->exit_state == 0);
-	pid = task_pid(task);
-	wake_up_all(&pid->wait_pidfd);
+
+	__wake_up(&pid->wait_pidfd, TASK_NORMAL, 0,
+			poll_to_key(EPOLLIN | EPOLLRDNORM));
 }
 
 /*

From e2e8a142fbd988d658ccb3da1d6f4b26a39de0fd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 5 Feb 2024 18:43:47 +0100
Subject: [PATCH 09/23] pidfd: exit: kill the no longer used
 thread_group_exited()

It was used by pidfd_poll() but now it has no callers.

If it finally finds a modular user we can revert this change, but note
that the comment above this helper and the changelog in 38fd525a4c61
("exit: Factor thread_group_exited out of pidfd_poll") are not accurate,
thread_group_exited() won't return true if all other threads have passed
exit_notify() and are zombies, it returns true only when all other threads
are completely gone. Not to mention that it can only work if the task
identified by @pid is a thread-group leader.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240205174347.GA31461@redhat.com
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/sched/signal.h |  2 --
 kernel/exit.c                | 24 ------------------------
 2 files changed, 26 deletions(-)

diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 4b7664c56208..0a0e23c45406 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -735,8 +735,6 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
-extern bool thread_group_exited(struct pid *pid);
-
 extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
 							unsigned long *flags);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index c038d10dfb38..0e2f5dec91fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1900,30 +1900,6 @@ Efault:
 }
 #endif
 
-/**
- * thread_group_exited - check that a thread group has exited
- * @pid: tgid of thread group to be checked.
- *
- * Test if the thread group represented by tgid has exited (all
- * threads are zombies, dead or completely gone).
- *
- * Return: true if the thread group has exited. false otherwise.
- */
-bool thread_group_exited(struct pid *pid)
-{
-	struct task_struct *task;
-	bool exited;
-
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
-	exited = !task ||
-		(READ_ONCE(task->exit_state) && thread_group_empty(task));
-	rcu_read_unlock();
-
-	return exited;
-}
-EXPORT_SYMBOL(thread_group_exited);
-
 /*
  * This needs to be __function_aligned as GCC implicitly makes any
  * implementation of abort() cold and drops alignment specified by

From 83b290c9e3b5d95891f43a4aeadf6071cbff25d3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 5 Feb 2024 15:55:32 +0100
Subject: [PATCH 10/23] pidfd: clone: allow CLONE_THREAD | CLONE_PIDFD together

copy_process() just needs to pass PIDFD_THREAD to __pidfd_prepare()
if clone_flags & CLONE_THREAD.

We can also add another CLONE_ flag (or perhaps reuse CLONE_DETACHED)
to enforce PIDFD_THREAD without CLONE_THREAD.

Originally-from: Tycho Andersen <tycho@tycho.pizza>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240205145532.GA28823@redhat.com
Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index aa08193d124f..4b6d994505ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2311,9 +2311,8 @@ __latent_entropy struct task_struct *copy_process(
 		/*
 		 * - CLONE_DETACHED is blocked so that we can potentially
 		 *   reuse it later for CLONE_PIDFD.
-		 * - CLONE_THREAD is blocked until someone really needs it.
 		 */
-		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
+		if (clone_flags & CLONE_DETACHED)
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -2536,8 +2535,10 @@ __latent_entropy struct task_struct *copy_process(
 	 * if the fd table isn't shared).
 	 */
 	if (clone_flags & CLONE_PIDFD) {
+		int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+
 		/* Note that no task has been attached to @pid yet. */
-		retval = __pidfd_prepare(pid, 0, &pidfile);
+		retval = __pidfd_prepare(pid, flags, &pidfile);
 		if (retval < 0)
 			goto bad_fork_free_pid;
 		pidfd = retval;

From 0c9bd6bc4bb2ecfe8ce12e9a77ccd762dabe18b4 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tandersen@netflix.com>
Date: Wed, 7 Feb 2024 10:19:29 +0100
Subject: [PATCH 11/23] pidfd: getfd should always report ESRCH if a task is
 exiting

We can get EBADF from pidfd_getfd() if a task is currently exiting,
which might be confusing. Let's check PF_EXITING, and just report ESRCH
if so.

I chose PF_EXITING, because it is set in exit_signals(), which is called
before exit_files(). Since ->exit_status is mostly set after
exit_files() in exit_notify(), using that still leaves a window open for
the race.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Tycho Andersen <tandersen@netflix.com>
Link: https://lore.kernel.org/r/20240206192357.81942-1-tycho@tycho.pizza
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/pid.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/kernel/pid.c b/kernel/pid.c
index de0bf2f8d18b..c1d940fbd314 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -678,7 +678,26 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd)
 
 	up_read(&task->signal->exec_update_lock);
 
-	return file ?: ERR_PTR(-EBADF);
+	if (!file) {
+		/*
+		 * It is possible that the target thread is exiting; it can be
+		 * either:
+		 * 1. before exit_signals(), which gives a real fd
+		 * 2. before exit_files() takes the task_lock() gives a real fd
+		 * 3. after exit_files() releases task_lock(), ->files is NULL;
+		 *    this has PF_EXITING, since it was set in exit_signals(),
+		 *    __pidfd_fget() returns EBADF.
+		 * In case 3 we get EBADF, but that really means ESRCH, since
+		 * the task is currently exiting and has freed its files
+		 * struct, so we fix it up.
+		 */
+		if (task->flags & PF_EXITING)
+			file = ERR_PTR(-ESRCH);
+		else
+			file = ERR_PTR(-EBADF);
+	}
+
+	return file;
 }
 
 static int pidfd_getfd(struct pid *pid, int fd)

From f0ece18e994144b7daa025b68ead97de26a2df1f Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tandersen@netflix.com>
Date: Wed, 7 Feb 2024 10:19:44 +0100
Subject: [PATCH 12/23] selftests: add ESRCH tests for pidfd_getfd()

Ensure that pidfd_getfd() reports -ESRCH if the task is already exiting.

Signed-off-by: Tycho Andersen <tandersen@netflix.com>
Link: https://lore.kernel.org/r/20240206192357.81942-1-tycho@tycho.pizza
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 .../selftests/pidfd/pidfd_getfd_test.c        | 31 ++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/pidfd/pidfd_getfd_test.c b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
index 0930e2411dfb..cd51d547b751 100644
--- a/tools/testing/selftests/pidfd/pidfd_getfd_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_getfd_test.c
@@ -5,6 +5,7 @@
 #include <fcntl.h>
 #include <limits.h>
 #include <linux/types.h>
+#include <poll.h>
 #include <sched.h>
 #include <signal.h>
 #include <stdio.h>
@@ -129,6 +130,7 @@ FIXTURE(child)
 	 * When it is closed, the child will exit.
 	 */
 	int sk;
+	bool ignore_child_result;
 };
 
 FIXTURE_SETUP(child)
@@ -165,10 +167,14 @@ FIXTURE_SETUP(child)
 
 FIXTURE_TEARDOWN(child)
 {
+	int ret;
+
 	EXPECT_EQ(0, close(self->pidfd));
 	EXPECT_EQ(0, close(self->sk));
 
-	EXPECT_EQ(0, wait_for_pid(self->pid));
+	ret = wait_for_pid(self->pid);
+	if (!self->ignore_child_result)
+		EXPECT_EQ(0, ret);
 }
 
 TEST_F(child, disable_ptrace)
@@ -235,6 +241,29 @@ TEST(flags_set)
 	EXPECT_EQ(errno, EINVAL);
 }
 
+TEST_F(child, no_strange_EBADF)
+{
+	struct pollfd fds;
+
+	self->ignore_child_result = true;
+
+	fds.fd = self->pidfd;
+	fds.events = POLLIN;
+
+	ASSERT_EQ(kill(self->pid, SIGKILL), 0);
+	ASSERT_EQ(poll(&fds, 1, 5000), 1);
+
+	/*
+	 * It used to be that pidfd_getfd() could race with the exiting thread
+	 * between exit_files() and release_task(), and get a non-null task
+	 * with a NULL files struct, and you'd get EBADF, which was slightly
+	 * confusing.
+	 */
+	errno = 0;
+	EXPECT_EQ(sys_pidfd_getfd(self->pidfd, self->remote_fd, 0), -1);
+	EXPECT_EQ(errno, ESRCH);
+}
+
 #if __NR_pidfd_getfd == -1
 int main(void)
 {

From c044a9502649a66bf5c5e1a584cb82b2c538ae25 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 9 Feb 2024 14:06:20 +0100
Subject: [PATCH 13/23] signal: fill in si_code in prepare_kill_siginfo()

So that do_tkill() can use this helper too. This also simplifies
the next patch.

TODO: perhaps we can kill prepare_kill_siginfo() and change the
callers to use SEND_SIG_NOINFO,  but this needs some changes in
__send_signal_locked() and TP_STORE_SIGINFO().

Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240209130620.GA8039@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/signal.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index c3fac06937e2..1450689302d9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3793,12 +3793,13 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
 #endif
 #endif
 
-static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
+static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
+				 enum pid_type type)
 {
 	clear_siginfo(info);
 	info->si_signo = sig;
 	info->si_errno = 0;
-	info->si_code = SI_USER;
+	info->si_code = (type == PIDTYPE_PID) ? SI_TKILL : SI_USER;
 	info->si_pid = task_tgid_vnr(current);
 	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
 }
@@ -3812,7 +3813,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct kernel_siginfo info;
 
-	prepare_kill_siginfo(sig, &info);
+	prepare_kill_siginfo(sig, &info, PIDTYPE_TGID);
 
 	return kill_something_info(sig, &info, pid);
 }
@@ -3925,7 +3926,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
 			goto err;
 	} else {
-		prepare_kill_siginfo(sig, &kinfo);
+		prepare_kill_siginfo(sig, &kinfo, PIDTYPE_TGID);
 	}
 
 	/* TODO: respect PIDFD_THREAD */
@@ -3970,12 +3971,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
 	struct kernel_siginfo info;
 
-	clear_siginfo(&info);
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
+	prepare_kill_siginfo(sig, &info, PIDTYPE_PID);
 
 	return do_send_specific(tgid, pid, sig, &info);
 }

From 81b9d8ac0640b285a3c369cd41a85f6c240d3a60 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Fri, 9 Feb 2024 14:06:50 +0100
Subject: [PATCH 14/23] pidfd: change pidfd_send_signal() to respect
 PIDFD_THREAD

Turn kill_pid_info() into kill_pid_info_type(), this allows to pass any
pid_type to group_send_sig_info(), despite its name it should work fine
even if type = PIDTYPE_PID.

Change pidfd_send_signal() to use PIDTYPE_PID or PIDTYPE_TGID depending
on PIDFD_THREAD.

While at it kill another TODO comment in pidfd_show_fdinfo(). As Christian
expains fdinfo reports f_flags, userspace can already detect PIDFD_THREAD.

Reviewed-by: Tycho Andersen <tandersen@netflix.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Link: https://lore.kernel.org/r/20240209130650.GA8048@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 kernel/fork.c   |  2 --
 kernel/signal.c | 39 +++++++++++++++++++++++----------------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 4b6d994505ca..3f22ec90c5c6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2051,8 +2051,6 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 
 	seq_put_decimal_ll(m, "Pid:\t", nr);
 
-	/* TODO: report PIDFD_THREAD */
-
 #ifdef CONFIG_PID_NS
 	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
 	if (nr > 0) {
diff --git a/kernel/signal.c b/kernel/signal.c
index 1450689302d9..8b8169623850 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -47,6 +47,7 @@
 #include <linux/cgroup.h>
 #include <linux/audit.h>
 #include <linux/sysctl.h>
+#include <uapi/linux/pidfd.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
@@ -1436,7 +1437,8 @@ void lockdep_assert_task_sighand_held(struct task_struct *task)
 #endif
 
 /*
- * send signal info to all the members of a group
+ * send signal info to all the members of a thread group or to the
+ * individual thread if type == PIDTYPE_PID.
  */
 int group_send_sig_info(int sig, struct kernel_siginfo *info,
 			struct task_struct *p, enum pid_type type)
@@ -1478,7 +1480,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
 	return ret;
 }
 
-int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
+				struct pid *pid, enum pid_type type)
 {
 	int error = -ESRCH;
 	struct task_struct *p;
@@ -1487,11 +1490,10 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
 		rcu_read_lock();
 		p = pid_task(pid, PIDTYPE_PID);
 		if (p)
-			error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
+			error = group_send_sig_info(sig, info, p, type);
 		rcu_read_unlock();
 		if (likely(!p || error != -ESRCH))
 			return error;
-
 		/*
 		 * The task was unhashed in between, try again.  If it
 		 * is dead, pid_task() will return NULL, if we race with
@@ -1500,6 +1502,11 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
 	}
 }
 
+int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
+{
+	return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
+}
+
 static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
 {
 	int error;
@@ -3873,14 +3880,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
  * @info:   signal info
  * @flags:  future flags
  *
- * The syscall currently only signals via PIDTYPE_PID which covers
- * kill(<positive-pid>, <signal>. It does not signal threads or process
- * groups.
- * In order to extend the syscall to threads and process groups the @flags
- * argument should be used. In essence, the @flags argument will determine
- * what is signaled and not the file descriptor itself. Put in other words,
- * grouping is a property of the flags argument not a property of the file
- * descriptor.
+ * Send the signal to the thread group or to the individual thread depending
+ * on PIDFD_THREAD.
+ * In the future extension to @flags may be used to override the default scope
+ * of @pidfd.
  *
  * Return: 0 on success, negative errno on failure
  */
@@ -3891,6 +3894,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	struct fd f;
 	struct pid *pid;
 	kernel_siginfo_t kinfo;
+	enum pid_type type;
 
 	/* Enforce flags be set to 0 until we add an extension. */
 	if (flags)
@@ -3911,6 +3915,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	if (!access_pidfd_pidns(pid))
 		goto err;
 
+	if (f.file->f_flags & PIDFD_THREAD)
+		type = PIDTYPE_PID;
+	else
+		type = PIDTYPE_TGID;
+
 	if (info) {
 		ret = copy_siginfo_from_user_any(&kinfo, info);
 		if (unlikely(ret))
@@ -3926,12 +3935,10 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
 			goto err;
 	} else {
-		prepare_kill_siginfo(sig, &kinfo, PIDTYPE_TGID);
+		prepare_kill_siginfo(sig, &kinfo, type);
 	}
 
-	/* TODO: respect PIDFD_THREAD */
-	ret = kill_pid_info(sig, &kinfo, pid);
-
+	ret = kill_pid_info_type(sig, &kinfo, pid, type);
 err:
 	fdput(f);
 	return ret;

From e1fb1dc08e73466830612bcf2f9f72180965c9ba Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 9 Feb 2024 15:49:45 +0100
Subject: [PATCH 15/23] pidfd: allow to override signal scope in
 pidfd_send_signal()

Right now we determine the scope of the signal based on the type of
pidfd. There are use-cases where it's useful to override the scope of
the signal. For example in [1]. Add flags to determine the scope of the
signal:

(1) PIDFD_SIGNAL_THREAD: send signal to specific thread reference by @pidfd
(2) PIDFD_SIGNAL_THREAD_GROUP: send signal to thread-group of @pidfd
(2) PIDFD_SIGNAL_PROCESS_GROUP: send signal to process-group of @pidfd

Since we now allow specifying PIDFD_SEND_PROCESS_GROUP for
pidfd_send_signal() to send signals to process groups we need to adjust
the check restricting si_code emulation by userspace to account for
PIDTYPE_PGID.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Link: https://github.com/systemd/systemd/issues/31093 [1]
Link: https://lore.kernel.org/r/20240210-chihuahua-hinzog-3945b6abd44a@brauner
Link: https://lore.kernel.org/r/20240214123655.GB16265@redhat.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/uapi/linux/pidfd.h |  5 +++++
 kernel/signal.c            | 46 ++++++++++++++++++++++++++++++--------
 2 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index 2e6461459877..72ec000a97cd 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -10,4 +10,9 @@
 #define PIDFD_NONBLOCK	O_NONBLOCK
 #define PIDFD_THREAD	O_EXCL
 
+/* Flags for pidfd_send_signal(). */
+#define PIDFD_SIGNAL_THREAD		(1UL << 0)
+#define PIDFD_SIGNAL_THREAD_GROUP	(1UL << 1)
+#define PIDFD_SIGNAL_PROCESS_GROUP	(1UL << 2)
+
 #endif /* _UAPI_LINUX_PIDFD_H */
diff --git a/kernel/signal.c b/kernel/signal.c
index 8b8169623850..bdca529f0f7b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1905,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
 	return send_sig_info(info.si_signo, &info, t);
 }
 
-int kill_pgrp(struct pid *pid, int sig, int priv)
+static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
 {
 	int ret;
-
 	read_lock(&tasklist_lock);
-	ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+	ret = __kill_pgrp_info(sig, info, pgrp);
 	read_unlock(&tasklist_lock);
-
 	return ret;
 }
+
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+	return kill_pgrp_info(sig, __si_special(priv), pid);
+}
 EXPORT_SYMBOL(kill_pgrp);
 
 int kill_pid(struct pid *pid, int sig, int priv)
@@ -3873,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file)
 	return tgid_pidfd_to_pid(file);
 }
 
+#define PIDFD_SEND_SIGNAL_FLAGS                            \
+	(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
+	 PIDFD_SIGNAL_PROCESS_GROUP)
+
 /**
  * sys_pidfd_send_signal - Signal a process through a pidfd
  * @pidfd:  file descriptor of the process
@@ -3897,7 +3904,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	enum pid_type type;
 
 	/* Enforce flags be set to 0 until we add an extension. */
-	if (flags)
+	if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
+		return -EINVAL;
+
+	/* Ensure that only a single signal scope determining flag is set. */
+	if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
 		return -EINVAL;
 
 	f = fdget(pidfd);
@@ -3915,10 +3926,24 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 	if (!access_pidfd_pidns(pid))
 		goto err;
 
-	if (f.file->f_flags & PIDFD_THREAD)
+	switch (flags) {
+	case 0:
+		/* Infer scope from the type of pidfd. */
+		if (f.file->f_flags & PIDFD_THREAD)
+			type = PIDTYPE_PID;
+		else
+			type = PIDTYPE_TGID;
+		break;
+	case PIDFD_SIGNAL_THREAD:
 		type = PIDTYPE_PID;
-	else
+		break;
+	case PIDFD_SIGNAL_THREAD_GROUP:
 		type = PIDTYPE_TGID;
+		break;
+	case PIDFD_SIGNAL_PROCESS_GROUP:
+		type = PIDTYPE_PGID;
+		break;
+	}
 
 	if (info) {
 		ret = copy_siginfo_from_user_any(&kinfo, info);
@@ -3931,14 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
 
 		/* Only allow sending arbitrary signals to yourself. */
 		ret = -EPERM;
-		if ((task_pid(current) != pid) &&
+		if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
 		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
 			goto err;
 	} else {
 		prepare_kill_siginfo(sig, &kinfo, type);
 	}
 
-	ret = kill_pid_info_type(sig, &kinfo, pid, type);
+	if (type == PIDTYPE_PGID)
+		ret = kill_pgrp_info(sig, &kinfo, pid);
+	else
+		ret = kill_pid_info_type(sig, &kinfo, pid, type);
 err:
 	fdput(f);
 	return ret;

From 50f4f2d197e194ec0356962b99ca2b72e9a37bc8 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 12 Feb 2024 16:00:50 +0100
Subject: [PATCH 16/23] pidfd: move struct pidfd_fops

Move the pidfd file operations over to their own file in preparation of
implementing pidfs and to isolate them from other mostly unrelated
functionality in other files.

Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-1-f863f58cfce1@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/Makefile   |   2 +-
 fs/pidfs.c    | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c | 110 ---------------------------------------------
 3 files changed, 123 insertions(+), 111 deletions(-)
 create mode 100644 fs/pidfs.c

diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..125301673985 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..eccb291862a0
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+static int pidfd_release(struct inode *inode, struct file *file)
+{
+	struct pid *pid = file->private_data;
+
+	file->private_data = NULL;
+	put_pid(pid);
+	return 0;
+}
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the  pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ *   namespace (also take care to create new mount namespaces in the
+ *   new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ *   have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct pid *pid = f->private_data;
+	struct pid_namespace *ns;
+	pid_t nr = -1;
+
+	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+		ns = proc_pid_ns(file_inode(m->file)->i_sb);
+		nr = pid_nr_ns(pid, ns);
+	}
+
+	seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
+		int i;
+
+		/* If nr is non-zero it means that 'pid' is valid and that
+		 * ns, i.e. the pid namespace associated with the procfs
+		 * instance, is in the pid namespace hierarchy of pid.
+		 * Start at one below the already printed level.
+		 */
+		for (i = ns->level + 1; i <= pid->level; i++)
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+	}
+#endif
+	seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct pid *pid = file->private_data;
+	bool thread = file->f_flags & PIDFD_THREAD;
+	struct task_struct *task;
+	__poll_t poll_flags = 0;
+
+	poll_wait(file, &pid->wait_pidfd, pts);
+	/*
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
+	 */
+	guard(rcu)();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+	else if (task->exit_state && (thread || thread_group_empty(task)))
+		poll_flags = EPOLLIN | EPOLLRDNORM;
+
+	return poll_flags;
+}
+
+const struct file_operations pidfd_fops = {
+	.release	= pidfd_release,
+	.poll		= pidfd_poll,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= pidfd_show_fdinfo,
+#endif
+};
diff --git a/kernel/fork.c b/kernel/fork.c
index 3f22ec90c5c6..662a61f340ce 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1993,116 +1993,6 @@ struct pid *pidfd_pid(const struct file *file)
 	return ERR_PTR(-EBADF);
 }
 
-static int pidfd_release(struct inode *inode, struct file *file)
-{
-	struct pid *pid = file->private_data;
-
-	file->private_data = NULL;
-	put_pid(pid);
-	return 0;
-}
-
-#ifdef CONFIG_PROC_FS
-/**
- * pidfd_show_fdinfo - print information about a pidfd
- * @m: proc fdinfo file
- * @f: file referencing a pidfd
- *
- * Pid:
- * This function will print the pid that a given pidfd refers to in the
- * pid namespace of the procfs instance.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its pid. This is
- * similar to calling getppid() on a process whose parent is outside of
- * its pid namespace.
- *
- * NSpid:
- * If pid namespaces are supported then this function will also print
- * the pid of a given pidfd refers to for all descendant pid namespaces
- * starting from the current pid namespace of the instance, i.e. the
- * Pid field and the first entry in the NSpid field will be identical.
- * If the pid namespace of the process is not a descendant of the pid
- * namespace of the procfs instance 0 will be shown as its first NSpid
- * entry and no others will be shown.
- * Note that this differs from the Pid and NSpid fields in
- * /proc/<pid>/status where Pid and NSpid are always shown relative to
- * the  pid namespace of the procfs instance. The difference becomes
- * obvious when sending around a pidfd between pid namespaces from a
- * different branch of the tree, i.e. where no ancestral relation is
- * present between the pid namespaces:
- * - create two new pid namespaces ns1 and ns2 in the initial pid
- *   namespace (also take care to create new mount namespaces in the
- *   new pid namespace and mount procfs)
- * - create a process with a pidfd in ns1
- * - send pidfd from ns1 to ns2
- * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
- *   have exactly one entry, which is 0
- */
-static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
-{
-	struct pid *pid = f->private_data;
-	struct pid_namespace *ns;
-	pid_t nr = -1;
-
-	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
-		ns = proc_pid_ns(file_inode(m->file)->i_sb);
-		nr = pid_nr_ns(pid, ns);
-	}
-
-	seq_put_decimal_ll(m, "Pid:\t", nr);
-
-#ifdef CONFIG_PID_NS
-	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
-	if (nr > 0) {
-		int i;
-
-		/* If nr is non-zero it means that 'pid' is valid and that
-		 * ns, i.e. the pid namespace associated with the procfs
-		 * instance, is in the pid namespace hierarchy of pid.
-		 * Start at one below the already printed level.
-		 */
-		for (i = ns->level + 1; i <= pid->level; i++)
-			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
-	}
-#endif
-	seq_putc(m, '\n');
-}
-#endif
-
-/*
- * Poll support for process exit notification.
- */
-static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
-{
-	struct pid *pid = file->private_data;
-	bool thread = file->f_flags & PIDFD_THREAD;
-	struct task_struct *task;
-	__poll_t poll_flags = 0;
-
-	poll_wait(file, &pid->wait_pidfd, pts);
-	/*
-	 * Depending on PIDFD_THREAD, inform pollers when the thread
-	 * or the whole thread-group exits.
-	 */
-	rcu_read_lock();
-	task = pid_task(pid, PIDTYPE_PID);
-	if (!task)
-		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
-	else if (task->exit_state && (thread || thread_group_empty(task)))
-		poll_flags = EPOLLIN | EPOLLRDNORM;
-	rcu_read_unlock();
-
-	return poll_flags;
-}
-
-const struct file_operations pidfd_fops = {
-	.release = pidfd_release,
-	.poll = pidfd_poll,
-#ifdef CONFIG_PROC_FS
-	.show_fdinfo = pidfd_show_fdinfo,
-#endif
-};
-
 /**
  * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd

From cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 12 Feb 2024 16:32:38 +0100
Subject: [PATCH 17/23] pidfd: add pidfs

This moves pidfds from the anonymous inode infrastructure to a tiny
pseudo filesystem. This has been on my todo for quite a while as it will
unblock further work that we weren't able to do simply because of the
very justified limitations of anonymous inodes. Moving pidfds to a tiny
pseudo filesystem allows:

* statx() on pidfds becomes useful for the first time.
* pidfds can be compared simply via statx() and then comparing inode
  numbers.
* pidfds have unique inode numbers for the system lifetime.
* struct pid is now stashed in inode->i_private instead of
  file->private_data. This means it is now possible to introduce
  concepts that operate on a process once all file descriptors have been
  closed. A concrete example is kill-on-last-close.
* file->private_data is freed up for per-file options for pidfds.
* Each struct pid will refer to a different inode but the same struct
  pid will refer to the same inode if it's opened multiple times. In
  contrast to now where each struct pid refers to the same inode. Even
  if we were to move to anon_inode_create_getfile() which creates new
  inodes we'd still be associating the same struct pid with multiple
  different inodes.

The tiny pseudo filesystem is not visible anywhere in userspace exactly
like e.g., pipefs and sockfs. There's no lookup, there's no complex
inode operations, nothing. Dentries and inodes are always deleted when
the last pidfd is closed.

We allocate a new inode for each struct pid and we reuse that inode for
all pidfds. We use iget_locked() to find that inode again based on the
inode number which isn't recycled. We allocate a new dentry for each
pidfd that uses the same inode. That is similar to anonymous inodes
which reuse the same inode for thousands of dentries. For pidfds we're
talking way less than that. There usually won't be a lot of concurrent
openers of the same struct pid. They can probably often be counted on
two hands. I know that systemd does use separate pidfd for the same
struct pid for various complex process tracking issues. So I think with
that things actually become way simpler. Especially because we don't
have to care about lookup. Dentries and inodes continue to be always
deleted.

The code is entirely optional and fairly small. If it's not selected we
fallback to anonymous inodes. Heavily inspired by nsfs which uses a
similar stashing mechanism just for namespaces.

Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/Kconfig                 |   7 ++
 fs/pidfs.c                 | 156 ++++++++++++++++++++++++++++++++++++-
 include/linux/pid.h        |   5 +-
 include/linux/pidfs.h      |   8 ++
 include/uapi/linux/magic.h |   1 +
 init/main.c                |   2 +
 kernel/fork.c              |  13 +---
 kernel/nsproxy.c           |   2 +-
 kernel/pid.c               |  11 +++
 9 files changed, 188 insertions(+), 17 deletions(-)
 create mode 100644 include/linux/pidfs.h

diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..f3dbd84a0e40 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -174,6 +174,13 @@ source "fs/proc/Kconfig"
 source "fs/kernfs/Kconfig"
 source "fs/sysfs/Kconfig"
 
+config FS_PID
+	bool "Pseudo filesystem for process file descriptors"
+	depends on 64BIT
+	default y
+	help
+	  Pidfs implements advanced features for process file descriptors.
+
 config TMPFS
 	bool "Tmpfs virtual memory file system support (former shm fs)"
 	depends on SHMEM
diff --git a/fs/pidfs.c b/fs/pidfs.c
index eccb291862a0..6c3f010074af 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -1,9 +1,11 @@
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/fs.h>
 #include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/pid.h>
+#include <linux/pidfs.h>
 #include <linux/pid_namespace.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
@@ -14,10 +16,12 @@
 
 static int pidfd_release(struct inode *inode, struct file *file)
 {
+#ifndef CONFIG_FS_PID
 	struct pid *pid = file->private_data;
 
 	file->private_data = NULL;
 	put_pid(pid);
+#endif
 	return 0;
 }
 
@@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file)
  */
 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
-	struct pid *pid = f->private_data;
+	struct pid *pid = pidfd_pid(f);
 	struct pid_namespace *ns;
 	pid_t nr = -1;
 
@@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
  */
 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 {
-	struct pid *pid = file->private_data;
+	struct pid *pid = pidfd_pid(file);
 	bool thread = file->f_flags & PIDFD_THREAD;
 	struct task_struct *task;
 	__poll_t poll_flags = 0;
@@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
 	return poll_flags;
 }
 
-const struct file_operations pidfd_fops = {
+static const struct file_operations pidfs_file_operations = {
 	.release	= pidfd_release,
 	.poll		= pidfd_poll,
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= pidfd_show_fdinfo,
 #endif
 };
+
+struct pid *pidfd_pid(const struct file *file)
+{
+	if (file->f_op != &pidfs_file_operations)
+		return ERR_PTR(-EBADF);
+#ifdef CONFIG_FS_PID
+	return file_inode(file)->i_private;
+#else
+	return file->private_data;
+#endif
+}
+
+#ifdef CONFIG_FS_PID
+static struct vfsmount *pidfs_mnt __ro_after_init;
+static struct super_block *pidfs_sb __ro_after_init;
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			 struct iattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+			 struct kstat *stat, u32 request_mask,
+			 unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+	return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+	.getattr = pidfs_getattr,
+	.setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+	struct pid *pid = inode->i_private;
+
+	clear_inode(inode);
+	put_pid(pid);
+}
+
+static const struct super_operations pidfs_sops = {
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= pidfs_evict_inode,
+	.statfs		= simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	return dynamic_dname(buffer, buflen, "pidfd:[%lu]",
+			     d_inode(dentry)->i_ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+	.d_delete	= always_delete_dentry,
+	.d_dname	= pidfs_dname,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = init_pseudo(fc, PID_FS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ops = &pidfs_sops;
+	ctx->dops = &pidfs_dentry_operations;
+	return 0;
+}
+
+static struct file_system_type pidfs_type = {
+	.name			= "pidfs",
+	.init_fs_context	= pidfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+	struct inode *inode;
+	struct file *pidfd_file;
+
+	inode = iget_locked(pidfs_sb, pid->ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	if (inode->i_state & I_NEW) {
+		/*
+		 * Inode numbering for pidfs start at RESERVED_PIDS + 1.
+		 * This avoids collisions with the root inode which is 1
+		 * for pseudo filesystems.
+		 */
+		inode->i_ino = pid->ino;
+		inode->i_mode = S_IFREG | S_IRUGO;
+		inode->i_op = &pidfs_inode_operations;
+		inode->i_fop = &pidfs_file_operations;
+		inode->i_flags |= S_IMMUTABLE;
+		inode->i_private = get_pid(pid);
+		simple_inode_init_ts(inode);
+		unlock_new_inode(inode);
+	}
+
+	pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags,
+				       &pidfs_file_operations);
+	if (IS_ERR(pidfd_file))
+		iput(inode);
+
+	return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+	pidfs_mnt = kern_mount(&pidfs_type);
+	if (IS_ERR(pidfs_mnt))
+		panic("Failed to mount pidfs pseudo filesystem");
+
+	pidfs_sb = pidfs_mnt->mnt_sb;
+}
+
+#else /* !CONFIG_FS_PID */
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+	struct file *pidfd_file;
+
+	pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid,
+					flags | O_RDWR);
+	if (IS_ERR(pidfd_file))
+		return pidfd_file;
+
+	get_pid(pid);
+	return pidfd_file;
+}
+
+void __init pidfs_init(void) { }
+#endif
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 8124d57752b9..956481128e8d 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -55,6 +55,9 @@ struct pid
 	refcount_t count;
 	unsigned int level;
 	spinlock_t lock;
+#ifdef CONFIG_FS_PID
+	unsigned long ino;
+#endif
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct hlist_head inodes;
@@ -66,8 +69,6 @@ struct pid
 
 extern struct pid init_struct_pid;
 
-extern const struct file_operations pidfd_fops;
-
 struct file;
 
 struct pid *pidfd_pid(const struct file *file);
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
new file mode 100644
index 000000000000..75bdf9807802
--- /dev/null
+++ b/include/linux/pidfs.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PID_FS_H
+#define _LINUX_PID_FS_H
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
+void __init pidfs_init(void);
+
+#endif /* _LINUX_PID_FS_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..1b40a968ba91 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@
 #define DMA_BUF_MAGIC		0x444d4142	/* "DMAB" */
 #define DEVMEM_MAGIC		0x454d444d	/* "DMEM" */
 #define SECRETMEM_MAGIC		0x5345434d	/* "SECM" */
+#define PID_FS_MAGIC		0x50494446	/* "PIDF" */
 
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/init/main.c b/init/main.c
index e24b0780fdff..2fbf6a3114d5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -99,6 +99,7 @@
 #include <linux/init_syscalls.h>
 #include <linux/stackdepot.h>
 #include <linux/randomize_kstack.h>
+#include <linux/pidfs.h>
 #include <net/net_namespace.h>
 
 #include <asm/io.h>
@@ -1059,6 +1060,7 @@ void start_kernel(void)
 	seq_file_init();
 	proc_root_init();
 	nsfs_init();
+	pidfs_init();
 	cpuset_init();
 	cgroup_init();
 	taskstats_init_early();
diff --git a/kernel/fork.c b/kernel/fork.c
index 662a61f340ce..2f839c290dcf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -102,6 +102,7 @@
 #include <linux/iommu.h>
 #include <linux/rseq.h>
 #include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p)
 #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
 }
 
-struct pid *pidfd_pid(const struct file *file)
-{
-	if (file->f_op == &pidfd_fops)
-		return file->private_data;
-
-	return ERR_PTR(-EBADF);
-}
-
 /**
  * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
  * @pid:   the struct pid for which to create a pidfd
@@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re
 	if (pidfd < 0)
 		return pidfd;
 
-	pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid,
-					flags | O_RDWR);
+	pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
 	if (IS_ERR(pidfd_file)) {
 		put_unused_fd(pidfd);
 		return PTR_ERR(pidfd_file);
 	}
-	get_pid(pid); /* held by pidfd_file now */
 	/*
 	 * anon_inode_getfile() ignores everything outside of the
 	 * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 15781acaac1c..6ec3deec68c2 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags)
 	if (proc_ns_file(f.file))
 		err = validate_ns(&nsset, ns);
 	else
-		err = validate_nsset(&nsset, f.file->private_data);
+		err = validate_nsset(&nsset, pidfd_pid(f.file));
 	if (!err) {
 		commit_nsset(&nsset);
 		perf_event_namespaces(current);
diff --git a/kernel/pid.c b/kernel/pid.c
index c1d940fbd314..581cc34341fd 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -42,6 +42,7 @@
 #include <linux/sched/signal.h>
 #include <linux/sched/task.h>
 #include <linux/idr.h>
+#include <linux/pidfs.h>
 #include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 
@@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT;
 
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
+#ifdef CONFIG_FS_PID
+/*
+ * Pseudo filesystems start inode numbering after one. We use Reserved
+ * PIDs as a natural offset.
+ */
+static u64 pidfs_ino = RESERVED_PIDS;
+#endif
 
 /*
  * PID-map pages start out as NULL, they get allocated upon
@@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	spin_lock_irq(&pidmap_lock);
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
+#ifdef CONFIG_FS_PID
+	pid->ino = ++pidfs_ino;
+#endif
 	for ( ; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);

From 07fd7c329839cf0b8c7766883d830a1a0d12d1dd Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 18 Feb 2024 14:50:13 +0100
Subject: [PATCH 18/23] libfs: add path_from_stashed()

Add a helper for both nsfs and pidfs to reuse an already stashed dentry
or to add and stash a new dentry.

Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h |  3 ++
 fs/libfs.c    | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..cfddaec6fbf6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -310,3 +310,6 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+		      struct vfsmount *mnt, const struct file_operations *fops,
+		      void *data, struct path *path);
diff --git a/fs/libfs.c b/fs/libfs.c
index eec6031b0155..2a55e87e1439 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1973,3 +1973,97 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
 	return ts;
 }
 EXPORT_SYMBOL(simple_inode_init_ts);
+
+static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+{
+	struct dentry *dentry;
+
+	guard(rcu)();
+	dentry = READ_ONCE(stashed);
+	if (!dentry)
+		return NULL;
+	if (!lockref_get_not_dead(&dentry->d_lockref))
+		return NULL;
+	return dentry;
+}
+
+static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
+				   struct super_block *sb,
+				   const struct file_operations *fops,
+				   void *data)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	dentry = d_alloc_anon(sb);
+	if (!dentry)
+		return ERR_PTR(-ENOMEM);
+
+	inode = new_inode_pseudo(sb);
+	if (!inode) {
+		dput(dentry);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode->i_ino = ino;
+	inode->i_flags |= S_IMMUTABLE;
+	inode->i_mode = S_IFREG | S_IRUGO;
+	inode->i_fop = fops;
+	inode->i_private = data;
+	simple_inode_init_ts(inode);
+
+	/* @data is now owned by the fs */
+	d_instantiate(dentry, inode);
+
+	if (cmpxchg(stashed, NULL, dentry)) {
+		d_delete(dentry); /* make sure ->d_prune() does nothing */
+		dput(dentry);
+		cpu_relax();
+		return ERR_PTR(-EAGAIN);
+	}
+
+	return dentry;
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed:    where to retrieve or stash dentry
+ * @ino:        inode number to use
+ * @mnt:        mnt of the filesystems to use
+ * @fops:       file operations to use
+ * @data:       data to store in inode->i_private
+ * @path:       path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't able the function
+ * will allocate a new dentry and inode. It will then try to update @stashed
+ * with the newly added dentry. If it fails -EAGAIN is returned and the caller
+ * my retry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: If 0 or an error is returned the caller can be sure that @data must
+ *         be cleaned up. If 1 or -EAGAIN is returned @data is owned by the
+ *         filesystem.
+ */
+int path_from_stashed(struct dentry **stashed, unsigned long ino,
+		      struct vfsmount *mnt, const struct file_operations *fops,
+		      void *data, struct path *path)
+{
+	struct dentry *dentry;
+	int ret = 0;
+
+	dentry = get_stashed_dentry(*stashed);
+	if (dentry)
+		goto out_path;
+
+	dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, data);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+	ret = 1;
+
+out_path:
+	path->dentry = dentry;
+	path->mnt = mntget(mnt);
+	return ret;
+}

From 1fa08aece42512be072351f482096d5796edf7ca Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 18 Feb 2024 14:51:23 +0100
Subject: [PATCH 19/23] nsfs: convert to path_from_stashed() helper

Use the newly added path_from_stashed() helper for nsfs.

Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/nsfs.c                 | 73 ++++++++++-----------------------------
 include/linux/ns_common.h |  2 +-
 include/linux/proc_ns.h   |  2 +-
 3 files changed, 20 insertions(+), 57 deletions(-)

diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..39dc2604bec0 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,7 +27,8 @@ static const struct file_operations ns_file_operations = {
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	return dynamic_dname(buffer, buflen, "%s:[%lu]",
 		ns_ops->name, inode->i_ino);
@@ -38,7 +39,7 @@ static void ns_prune_dentry(struct dentry *dentry)
 	struct inode *inode = d_inode(dentry);
 	if (inode) {
 		struct ns_common *ns = inode->i_private;
-		atomic_long_set(&ns->stashed, 0);
+		WRITE_ONCE(ns->stashed, NULL);
 	}
 }
 
@@ -56,54 +57,6 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
-	struct vfsmount *mnt = nsfs_mnt;
-	struct dentry *dentry;
-	struct inode *inode;
-	unsigned long d;
-
-	rcu_read_lock();
-	d = atomic_long_read(&ns->stashed);
-	if (!d)
-		goto slow;
-	dentry = (struct dentry *)d;
-	if (!lockref_get_not_dead(&dentry->d_lockref))
-		goto slow;
-	rcu_read_unlock();
-	ns->ops->put(ns);
-got_it:
-	path->mnt = mntget(mnt);
-	path->dentry = dentry;
-	return 0;
-slow:
-	rcu_read_unlock();
-	inode = new_inode_pseudo(mnt->mnt_sb);
-	if (!inode) {
-		ns->ops->put(ns);
-		return -ENOMEM;
-	}
-	inode->i_ino = ns->inum;
-	simple_inode_init_ts(inode);
-	inode->i_flags |= S_IMMUTABLE;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = &ns_file_operations;
-	inode->i_private = ns;
-
-	dentry = d_make_root(inode);	/* not the normal use, but... */
-	if (!dentry)
-		return -ENOMEM;
-	dentry->d_fsdata = (void *)ns->ops;
-	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
-	if (d) {
-		d_delete(dentry);	/* make sure ->d_prune() does nothing */
-		dput(dentry);
-		cpu_relax();
-		return -EAGAIN;
-	}
-	goto got_it;
-}
-
 int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
@@ -113,10 +66,16 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		struct ns_common *ns = ns_get_cb(private_data);
 		if (!ns)
 			return -ENOENT;
-		ret = __ns_get_path(path, ns);
+		ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt,
+					&ns_file_operations, ns, path);
+		if (ret <= 0 && ret != -EAGAIN)
+			ns->ops->put(ns);
 	} while (ret == -EAGAIN);
 
-	return ret;
+	if (ret < 0)
+		return ret;
+
+	return 0;
 }
 
 struct ns_get_path_task_args {
@@ -163,10 +122,13 @@ int open_related_ns(struct ns_common *ns,
 			return PTR_ERR(relative);
 		}
 
-		err = __ns_get_path(&path, relative);
+		err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
+					&ns_file_operations, relative, &path);
+		if (err <= 0 && err != -EAGAIN)
+			relative->ops->put(relative);
 	} while (err == -EAGAIN);
 
-	if (err) {
+	if (err < 0) {
 		put_unused_fd(fd);
 		return err;
 	}
@@ -249,7 +211,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
 static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	const struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
 	return 0;
diff --git a/include/linux/ns_common.h b/include/linux/ns_common.h
index 0f1d024bd958..7d22ea50b098 100644
--- a/include/linux/ns_common.h
+++ b/include/linux/ns_common.h
@@ -7,7 +7,7 @@
 struct proc_ns_operations;
 
 struct ns_common {
-	atomic_long_t stashed;
+	struct dentry *stashed;
 	const struct proc_ns_operations *ops;
 	unsigned int inum;
 	refcount_t count;
diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 49539bc416ce..5ea470eb4d76 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -66,7 +66,7 @@ static inline void proc_free_inum(unsigned int inum) {}
 
 static inline int ns_alloc_inum(struct ns_common *ns)
 {
-	atomic_long_set(&ns->stashed, 0);
+	WRITE_ONCE(ns->stashed, NULL);
 	return proc_alloc_inum(&ns->inum);
 }
 

From b28ddcc32d8fa3e20745b3a47dff863fe0376d79 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Mon, 19 Feb 2024 16:30:57 +0100
Subject: [PATCH 20/23] pidfs: convert to path_from_stashed() helper

Moving pidfds from the anonymous inode infrastructure to a separate tiny
in-kernel filesystem similar to sockfs, pipefs, and anon_inodefs causes
selinux denials and thus various userspace components that make heavy
use of pidfds to fail as pidfds used anon_inode_getfile() which aren't
subject to any LSM hooks. But dentry_open() is and that would cause
regressions.

The failures that are seen are selinux denials. But the core failure is
dbus-broker. That cascades into other services failing that depend on
dbus-broker. For example, when dbus-broker fails to start polkit and all
the others won't be able to work because they depend on dbus-broker.

The reason for dbus-broker failing is because it doesn't handle failures
for SO_PEERPIDFD correctly. Last kernel release we introduced
SO_PEERPIDFD (and SCM_PIDFD). SO_PEERPIDFD allows dbus-broker and polkit
and others to receive a pidfd for the peer of an AF_UNIX socket. This is
the first time in the history of Linux that we can safely authenticate
clients in a race-free manner.

dbus-broker immediately made use of this but messed up the error
checking. It only allowed EINVAL as a valid failure for SO_PEERPIDFD.
That's obviously problematic not just because of LSM denials but because
of seccomp denials that would prevent SO_PEERPIDFD from working; or any
other new error code from there.

So this is catching a flawed implementation in dbus-broker as well. It
has to fallback to the old pid-based authentication when SO_PEERPIDFD
doesn't work no matter the reasons otherwise it'll always risk such
failures. So overall that LSM denial should not have caused dbus-broker
to fail. It can never assume that a feature released one kernel ago like
SO_PEERPIDFD can be assumed to be available.

So, the next fix separate from the selinux policy update is to try and
fix dbus-broker at [3]. That should make it into Fedora as well. In
addition the selinux reference policy should also be updated. See [4]
for that. If Selinux is in enforcing mode in userspace and it encounters
anything that it doesn't know about it will deny it by default. And the
policy is entirely in userspace including declaring new types for stuff
like nsfs or pidfs to allow it.

For now we continue to raise S_PRIVATE on the inode if it's a pidfs
inode which means things behave exactly like before.

Link: https://bugzilla.redhat.com/show_bug.cgi?id=2265630
Link: https://github.com/fedora-selinux/selinux-policy/pull/2050
Link: https://github.com/bus1/dbus-broker/pull/343 [3]
Link: https://github.com/SELinuxProject/refpolicy/pull/762 [4]
Reported-by: Nathan Chancellor <nathan@kernel.org>
Link: https://lore.kernel.org/r/20240222190334.GA412503@dev-arch.thelio-3990X
Link: https://lore.kernel.org/r/20240218-neufahrzeuge-brauhaus-fb0eb6459771@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h         |  3 ++-
 fs/libfs.c            | 15 +++++++++---
 fs/nsfs.c             |  7 +++---
 fs/pidfs.c            | 57 ++++++++++++++++++++++++++++---------------
 include/linux/pid.h   |  1 +
 include/linux/pidfs.h |  1 +
 kernel/pid.c          |  1 +
 7 files changed, 58 insertions(+), 27 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index cfddaec6fbf6..a34531bcad6e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -312,4 +312,5 @@ struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
 int path_from_stashed(struct dentry **stashed, unsigned long ino,
 		      struct vfsmount *mnt, const struct file_operations *fops,
-		      void *data, struct path *path);
+		      const struct inode_operations *iops, void *data,
+		      struct path *path);
diff --git a/fs/libfs.c b/fs/libfs.c
index 2a55e87e1439..2acba9d53756 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
 #include <linux/fsnotify.h>
 #include <linux/unicode.h>
 #include <linux/fscrypt.h>
+#include <linux/pidfs.h>
 
 #include <linux/uaccess.h>
 
@@ -1990,6 +1991,7 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
 static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
 				   struct super_block *sb,
 				   const struct file_operations *fops,
+				   const struct inode_operations *iops,
 				   void *data)
 {
 	struct dentry *dentry;
@@ -2007,8 +2009,13 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
 
 	inode->i_ino = ino;
 	inode->i_flags |= S_IMMUTABLE;
+	if (is_pidfs_sb(sb))
+		inode->i_flags |= S_PRIVATE;
 	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = fops;
+	if (iops)
+		inode->i_op = iops;
+	if (fops)
+		inode->i_fop = fops;
 	inode->i_private = data;
 	simple_inode_init_ts(inode);
 
@@ -2030,6 +2037,7 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
  * @stashed:    where to retrieve or stash dentry
  * @ino:        inode number to use
  * @mnt:        mnt of the filesystems to use
+ * @iops:       inode operations to use
  * @fops:       file operations to use
  * @data:       data to store in inode->i_private
  * @path:       path to create
@@ -2048,7 +2056,8 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
  */
 int path_from_stashed(struct dentry **stashed, unsigned long ino,
 		      struct vfsmount *mnt, const struct file_operations *fops,
-		      void *data, struct path *path)
+		      const struct inode_operations *iops, void *data,
+		      struct path *path)
 {
 	struct dentry *dentry;
 	int ret = 0;
@@ -2057,7 +2066,7 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
 	if (dentry)
 		goto out_path;
 
-	dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, data);
+	dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, iops, data);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 	ret = 1;
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 39dc2604bec0..e2da645c3d02 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -67,7 +67,7 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		if (!ns)
 			return -ENOENT;
 		ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt,
-					&ns_file_operations, ns, path);
+					&ns_file_operations, NULL, ns, path);
 		if (ret <= 0 && ret != -EAGAIN)
 			ns->ops->put(ns);
 	} while (ret == -EAGAIN);
@@ -122,8 +122,9 @@ int open_related_ns(struct ns_common *ns,
 			return PTR_ERR(relative);
 		}
 
-		err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
-					&ns_file_operations, relative, &path);
+		err = path_from_stashed(&relative->stashed, relative->inum,
+					nsfs_mnt, &ns_file_operations, NULL,
+					relative, &path);
 		if (err <= 0 && err != -EAGAIN)
 			relative->ops->put(relative);
 	} while (err == -EAGAIN);
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 6c3f010074af..cf606f15def5 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -14,6 +14,8 @@
 #include <linux/seq_file.h>
 #include <uapi/linux/pidfd.h>
 
+#include "internal.h"
+
 static int pidfd_release(struct inode *inode, struct file *file)
 {
 #ifndef CONFIG_FS_PID
@@ -186,9 +188,21 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
 			     d_inode(dentry)->i_ino);
 }
 
+static void pidfs_prune_dentry(struct dentry *dentry)
+{
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+	if (inode) {
+		struct pid *pid = inode->i_private;
+		WRITE_ONCE(pid->stashed, NULL);
+	}
+}
+
 static const struct dentry_operations pidfs_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= pidfs_dname,
+	.d_prune	= pidfs_prune_dentry,
 };
 
 static int pidfs_init_fs_context(struct fs_context *fc)
@@ -213,34 +227,28 @@ static struct file_system_type pidfs_type = {
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 {
 
-	struct inode *inode;
 	struct file *pidfd_file;
+	struct path path;
+	int ret;
 
-	inode = iget_locked(pidfs_sb, pid->ino);
-	if (!inode)
-		return ERR_PTR(-ENOMEM);
-
-	if (inode->i_state & I_NEW) {
+	do {
 		/*
 		 * Inode numbering for pidfs start at RESERVED_PIDS + 1.
 		 * This avoids collisions with the root inode which is 1
 		 * for pseudo filesystems.
 		 */
-		inode->i_ino = pid->ino;
-		inode->i_mode = S_IFREG | S_IRUGO;
-		inode->i_op = &pidfs_inode_operations;
-		inode->i_fop = &pidfs_file_operations;
-		inode->i_flags |= S_IMMUTABLE;
-		inode->i_private = get_pid(pid);
-		simple_inode_init_ts(inode);
-		unlock_new_inode(inode);
-	}
-
-	pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags,
-				       &pidfs_file_operations);
-	if (IS_ERR(pidfd_file))
-		iput(inode);
+		ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
+					&pidfs_file_operations,
+					&pidfs_inode_operations, get_pid(pid),
+					&path);
+		if (ret <= 0 && ret != -EAGAIN)
+			put_pid(pid);
+	} while (ret == -EAGAIN);
+	if (ret < 0)
+		return ERR_PTR(ret);
 
+	pidfd_file = dentry_open(&path, flags, current_cred());
+	path_put(&path);
 	return pidfd_file;
 }
 
@@ -253,6 +261,11 @@ void __init pidfs_init(void)
 	pidfs_sb = pidfs_mnt->mnt_sb;
 }
 
+bool is_pidfs_sb(const struct super_block *sb)
+{
+	return sb == pidfs_mnt->mnt_sb;
+}
+
 #else /* !CONFIG_FS_PID */
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
@@ -269,4 +282,8 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 }
 
 void __init pidfs_init(void) { }
+bool is_pidfs_sb(const struct super_block *sb)
+{
+	return false;
+}
 #endif
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 956481128e8d..c79a0efd0258 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -56,6 +56,7 @@ struct pid
 	unsigned int level;
 	spinlock_t lock;
 #ifdef CONFIG_FS_PID
+	struct dentry *stashed;
 	unsigned long ino;
 #endif
 	/* lists of tasks that use this pid */
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 75bdf9807802..40dd325a32a6 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -4,5 +4,6 @@
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
+bool is_pidfs_sb(const struct super_block *sb);
 
 #endif /* _LINUX_PID_FS_H */
diff --git a/kernel/pid.c b/kernel/pid.c
index 581cc34341fd..99a0c5eb24b8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -281,6 +281,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 #ifdef CONFIG_FS_PID
+	pid->stashed = NULL;
 	pid->ino = ++pidfs_ino;
 #endif
 	for ( ; upid >= pid->numbers; --upid) {

From 159a0d9fd50b92cc48e4c82cde79c4cb34c85953 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Sun, 18 Feb 2024 14:52:24 +0100
Subject: [PATCH 21/23] libfs: improve path_from_stashed() helper

In earlier patches we moved both nsfs and pidfs to path_from_stashed().
The helper currently tries to add and stash a new dentry if a reusable
dentry couldn't be found and returns EAGAIN if it lost the race to stash
the dentry. The caller can use EAGAIN to retry.

The helper and the two filesystems be written in a way that makes
returning EAGAIN unnecessary. To do this we need to change the
dentry->d_prune() implementation of nsfs and pidfs to not simply replace
the stashed dentry with NULL but to use a cmpxchg() and only replace
their own dentry.

Then path_from_stashed() can then be changed to not just stash a new
dentry when no dentry is currently stashed but also when an already dead
dentry is stashed. If another task managed to install a dentry in the
meantime it can simply be reused. Pack that into a loop and call it a
day.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/CAHk-=wgtLF5Z5=15-LKAczWm=-tUjHO+Bpf7WjBG+UU3s=fEQw@mail.gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/libfs.c | 63 +++++++++++++++++++++++++++++++++++-------------------
 fs/nsfs.c  | 50 +++++++++++++++++++------------------------
 fs/pidfs.c | 28 ++++++++++--------------
 3 files changed, 74 insertions(+), 67 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index 2acba9d53756..7617e1bc6e5b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1988,11 +1988,11 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
 	return dentry;
 }
 
-static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
-				   struct super_block *sb,
-				   const struct file_operations *fops,
-				   const struct inode_operations *iops,
-				   void *data)
+static struct dentry *prepare_anon_dentry(unsigned long ino,
+					  struct super_block *sb,
+					  const struct file_operations *fops,
+					  const struct inode_operations *iops,
+					  void *data)
 {
 	struct dentry *dentry;
 	struct inode *inode;
@@ -2021,17 +2021,31 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
 
 	/* @data is now owned by the fs */
 	d_instantiate(dentry, inode);
-
-	if (cmpxchg(stashed, NULL, dentry)) {
-		d_delete(dentry); /* make sure ->d_prune() does nothing */
-		dput(dentry);
-		cpu_relax();
-		return ERR_PTR(-EAGAIN);
-	}
-
 	return dentry;
 }
 
+static struct dentry *stash_dentry(struct dentry **stashed,
+				   struct dentry *dentry)
+{
+	guard(rcu)();
+	for (;;) {
+		struct dentry *old;
+
+		/* Assume any old dentry was cleared out. */
+		old = cmpxchg(stashed, NULL, dentry);
+		if (likely(!old))
+			return dentry;
+
+		/* Check if somebody else installed a reusable dentry. */
+		if (lockref_get_not_dead(&old->d_lockref))
+			return old;
+
+		/* There's an old dead dentry there, try to take it over. */
+		if (likely(try_cmpxchg(stashed, &old, dentry)))
+			return dentry;
+	}
+}
+
 /**
  * path_from_stashed - create path from stashed or new dentry
  * @stashed:    where to retrieve or stash dentry
@@ -2044,15 +2058,14 @@ static struct dentry *stash_dentry(struct dentry **stashed, unsigned long ino,
  *
  * The function tries to retrieve a stashed dentry from @stashed. If the dentry
  * is still valid then it will be reused. If the dentry isn't able the function
- * will allocate a new dentry and inode. It will then try to update @stashed
- * with the newly added dentry. If it fails -EAGAIN is returned and the caller
- * my retry.
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
  *
  * Special-purpose helper for nsfs and pidfs.
  *
  * Return: If 0 or an error is returned the caller can be sure that @data must
- *         be cleaned up. If 1 or -EAGAIN is returned @data is owned by the
- *         filesystem.
+ *         be cleaned up. If 1 is returned @data is owned by the filesystem.
  */
 int path_from_stashed(struct dentry **stashed, unsigned long ino,
 		      struct vfsmount *mnt, const struct file_operations *fops,
@@ -2062,17 +2075,23 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
 	struct dentry *dentry;
 	int ret = 0;
 
-	dentry = get_stashed_dentry(*stashed);
-	if (dentry)
+	/* See if dentry can be reused. */
+	path->dentry = get_stashed_dentry(*stashed);
+	if (path->dentry)
 		goto out_path;
 
-	dentry = stash_dentry(stashed, ino, mnt->mnt_sb, fops, iops, data);
+	/* Allocate a new dentry. */
+	dentry = prepare_anon_dentry(ino, mnt->mnt_sb, fops, iops, data);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
+
+	/* Added a new dentry. @data is now owned by the filesystem. */
+	path->dentry = stash_dentry(stashed, dentry);
+	if (path->dentry != dentry)
+		dput(dentry);
 	ret = 1;
 
 out_path:
-	path->dentry = dentry;
 	path->mnt = mntget(mnt);
 	return ret;
 }
diff --git a/fs/nsfs.c b/fs/nsfs.c
index e2da645c3d02..3a36bb62353c 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -36,10 +36,12 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 
 static void ns_prune_dentry(struct dentry *dentry)
 {
-	struct inode *inode = d_inode(dentry);
+	struct inode *inode;
+
+	inode = d_inode(dentry);
 	if (inode) {
 		struct ns_common *ns = inode->i_private;
-		WRITE_ONCE(ns->stashed, NULL);
+		cmpxchg(&ns->stashed, dentry, NULL);
 	}
 }
 
@@ -61,20 +63,17 @@ int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
 	int ret;
+	struct ns_common *ns;
 
-	do {
-		struct ns_common *ns = ns_get_cb(private_data);
-		if (!ns)
-			return -ENOENT;
-		ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt,
-					&ns_file_operations, NULL, ns, path);
-		if (ret <= 0 && ret != -EAGAIN)
-			ns->ops->put(ns);
-	} while (ret == -EAGAIN);
-
+	ns = ns_get_cb(private_data);
+	if (!ns)
+		return -ENOENT;
+	ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt,
+				&ns_file_operations, NULL, ns, path);
+	if (ret <= 0)
+		ns->ops->put(ns);
 	if (ret < 0)
 		return ret;
-
 	return 0;
 }
 
@@ -105,6 +104,7 @@ int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
 	struct path path = {};
+	struct ns_common *relative;
 	struct file *f;
 	int err;
 	int fd;
@@ -113,22 +113,16 @@ int open_related_ns(struct ns_common *ns,
 	if (fd < 0)
 		return fd;
 
-	do {
-		struct ns_common *relative;
-
-		relative = get_ns(ns);
-		if (IS_ERR(relative)) {
-			put_unused_fd(fd);
-			return PTR_ERR(relative);
-		}
-
-		err = path_from_stashed(&relative->stashed, relative->inum,
-					nsfs_mnt, &ns_file_operations, NULL,
-					relative, &path);
-		if (err <= 0 && err != -EAGAIN)
-			relative->ops->put(relative);
-	} while (err == -EAGAIN);
+	relative = get_ns(ns);
+	if (IS_ERR(relative)) {
+		put_unused_fd(fd);
+		return PTR_ERR(relative);
+	}
 
+	err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
+				&ns_file_operations, NULL, relative, &path);
+	if (err <= 0)
+		relative->ops->put(relative);
 	if (err < 0) {
 		put_unused_fd(fd);
 		return err;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index cf606f15def5..5f33c820b7f8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -140,7 +140,6 @@ struct pid *pidfd_pid(const struct file *file)
 
 #ifdef CONFIG_FS_PID
 static struct vfsmount *pidfs_mnt __ro_after_init;
-static struct super_block *pidfs_sb __ro_after_init;
 
 /*
  * The vfs falls back to simple_setattr() if i_op->setattr() isn't
@@ -195,7 +194,7 @@ static void pidfs_prune_dentry(struct dentry *dentry)
 	inode = d_inode(dentry);
 	if (inode) {
 		struct pid *pid = inode->i_private;
-		WRITE_ONCE(pid->stashed, NULL);
+		cmpxchg(&pid->stashed, dentry, NULL);
 	}
 }
 
@@ -231,19 +230,16 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	struct path path;
 	int ret;
 
-	do {
-		/*
-		 * Inode numbering for pidfs start at RESERVED_PIDS + 1.
-		 * This avoids collisions with the root inode which is 1
-		 * for pseudo filesystems.
-		 */
-		ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
-					&pidfs_file_operations,
-					&pidfs_inode_operations, get_pid(pid),
-					&path);
-		if (ret <= 0 && ret != -EAGAIN)
-			put_pid(pid);
-	} while (ret == -EAGAIN);
+	/*
+	* Inode numbering for pidfs start at RESERVED_PIDS + 1.
+	* This avoids collisions with the root inode which is 1
+	* for pseudo filesystems.
+	 */
+	ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
+				&pidfs_file_operations, &pidfs_inode_operations,
+				get_pid(pid), &path);
+	if (ret <= 0)
+		put_pid(pid);
 	if (ret < 0)
 		return ERR_PTR(ret);
 
@@ -257,8 +253,6 @@ void __init pidfs_init(void)
 	pidfs_mnt = kern_mount(&pidfs_type);
 	if (IS_ERR(pidfs_mnt))
 		panic("Failed to mount pidfs pseudo filesystem");
-
-	pidfs_sb = pidfs_mnt->mnt_sb;
 }
 
 bool is_pidfs_sb(const struct super_block *sb)

From 2558e3b23112adb82a558bab616890a790a38bc6 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Wed, 21 Feb 2024 09:59:51 +0100
Subject: [PATCH 22/23] libfs: add stashed_dentry_prune()

Both pidfs and nsfs use a memory location to stash a dentry for reuse by
concurrent openers. Right now two custom
dentry->d_prune::{ns,pidfs}_prune_dentry() methods are needed that do
the same thing. The only thing that differs is that they need to get to
the memory location to store or retrieve the dentry from differently.
Fix that by remember the stashing location for the dentry in
dentry->d_fsdata which allows us to retrieve it in dentry->d_prune. That
in turn makes it possible to add a common helper that pidfs and nsfs can
both use.

Link: https://lore.kernel.org/r/CAHk-=wg8cHY=i3m6RnXQ2Y2W8psicKWQEZq1=94ivUiviM-0OA@mail.gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h |  1 +
 fs/libfs.c    | 29 +++++++++++++++++++++++++++--
 fs/nsfs.c     | 16 ++--------------
 fs/pidfs.c    | 13 +------------
 4 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index a34531bcad6e..b0c843c3fa3c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -314,3 +314,4 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
 		      struct vfsmount *mnt, const struct file_operations *fops,
 		      const struct inode_operations *iops, void *data,
 		      struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index 7617e1bc6e5b..472f21bd0325 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1988,7 +1988,8 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
 	return dentry;
 }
 
-static struct dentry *prepare_anon_dentry(unsigned long ino,
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+					  unsigned long ino,
 					  struct super_block *sb,
 					  const struct file_operations *fops,
 					  const struct inode_operations *iops,
@@ -2019,6 +2020,9 @@ static struct dentry *prepare_anon_dentry(unsigned long ino,
 	inode->i_private = data;
 	simple_inode_init_ts(inode);
 
+	/* Store address of location where dentry's supposed to be stashed. */
+	dentry->d_fsdata = stashed;
+
 	/* @data is now owned by the fs */
 	d_instantiate(dentry, inode);
 	return dentry;
@@ -2081,7 +2085,7 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
 		goto out_path;
 
 	/* Allocate a new dentry. */
-	dentry = prepare_anon_dentry(ino, mnt->mnt_sb, fops, iops, data);
+	dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, fops, iops, data);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
@@ -2092,6 +2096,27 @@ int path_from_stashed(struct dentry **stashed, unsigned long ino,
 	ret = 1;
 
 out_path:
+	WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+	WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
 	path->mnt = mntget(mnt);
 	return ret;
 }
+
+void stashed_dentry_prune(struct dentry *dentry)
+{
+	struct dentry **stashed = dentry->d_fsdata;
+	struct inode *inode = d_inode(dentry);
+
+	if (WARN_ON_ONCE(!stashed))
+		return;
+
+	if (!inode)
+		return;
+
+	/*
+	 * Only replace our own @dentry as someone else might've
+	 * already cleared out @dentry and stashed their own
+	 * dentry in there.
+	 */
+	cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 3a36bb62353c..2ce229af34e9 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -34,22 +34,10 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 		ns_ops->name, inode->i_ino);
 }
 
-static void ns_prune_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-
-	inode = d_inode(dentry);
-	if (inode) {
-		struct ns_common *ns = inode->i_private;
-		cmpxchg(&ns->stashed, dentry, NULL);
-	}
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
-	.d_prune	= ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= ns_dname,
+	.d_prune	= stashed_dentry_prune,
 };
 
 static void nsfs_evict(struct inode *inode)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 5f33c820b7f8..d38b7a184994 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -187,21 +187,10 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
 			     d_inode(dentry)->i_ino);
 }
 
-static void pidfs_prune_dentry(struct dentry *dentry)
-{
-	struct inode *inode;
-
-	inode = d_inode(dentry);
-	if (inode) {
-		struct pid *pid = inode->i_private;
-		cmpxchg(&pid->stashed, dentry, NULL);
-	}
-}
-
 static const struct dentry_operations pidfs_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= pidfs_dname,
-	.d_prune	= pidfs_prune_dentry,
+	.d_prune	= stashed_dentry_prune,
 };
 
 static int pidfs_init_fs_context(struct fs_context *fc)

From e9c5263ce16d96311c118111ac779f004be8b473 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Fri, 1 Mar 2024 10:26:03 +0100
Subject: [PATCH 23/23] libfs: improve path_from_stashed()

Right now we pass a bunch of info that is fs specific which doesn't make
a lot of sense and it bleeds fs sepcific details into the generic
helper. nsfs and pidfs have slightly different needs when initializing
inodes. Add simple operations that are stashed in sb->s_fs_info that
both can implement. This also allows us to get rid of cleaning up
references in the caller. All in all path_from_stashed() becomes way
simpler.

Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 fs/internal.h |  8 +++++---
 fs/libfs.c    | 41 ++++++++++++++++++-----------------------
 fs/nsfs.c     | 33 ++++++++++++++++++++++-----------
 fs/pidfs.c    | 24 +++++++++++++++++++++---
 4 files changed, 66 insertions(+), 40 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index b0c843c3fa3c..7d3edcdf59cc 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -310,8 +310,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+	void (*put_data)(void *data);
+	void (*init_inode)(struct inode *inode, void *data);
+};
 int path_from_stashed(struct dentry **stashed, unsigned long ino,
-		      struct vfsmount *mnt, const struct file_operations *fops,
-		      const struct inode_operations *iops, void *data,
-		      struct path *path);
+		      struct vfsmount *mnt, void *data, struct path *path);
 void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/libfs.c b/fs/libfs.c
index 472f21bd0325..65322e11bcda 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1991,12 +1991,11 @@ static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
 static struct dentry *prepare_anon_dentry(struct dentry **stashed,
 					  unsigned long ino,
 					  struct super_block *sb,
-					  const struct file_operations *fops,
-					  const struct inode_operations *iops,
 					  void *data)
 {
 	struct dentry *dentry;
 	struct inode *inode;
+	const struct stashed_operations *sops = sb->s_fs_info;
 
 	dentry = d_alloc_anon(sb);
 	if (!dentry)
@@ -2010,15 +2009,13 @@ static struct dentry *prepare_anon_dentry(struct dentry **stashed,
 
 	inode->i_ino = ino;
 	inode->i_flags |= S_IMMUTABLE;
-	if (is_pidfs_sb(sb))
-		inode->i_flags |= S_PRIVATE;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	if (iops)
-		inode->i_op = iops;
-	if (fops)
-		inode->i_fop = fops;
-	inode->i_private = data;
+	inode->i_mode = S_IFREG;
 	simple_inode_init_ts(inode);
+	sops->init_inode(inode, data);
+
+	/* Notice when this is changed. */
+	WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+	WARN_ON_ONCE(!IS_IMMUTABLE(inode));
 
 	/* Store address of location where dentry's supposed to be stashed. */
 	dentry->d_fsdata = stashed;
@@ -2055,8 +2052,6 @@ static struct dentry *stash_dentry(struct dentry **stashed,
  * @stashed:    where to retrieve or stash dentry
  * @ino:        inode number to use
  * @mnt:        mnt of the filesystems to use
- * @iops:       inode operations to use
- * @fops:       file operations to use
  * @data:       data to store in inode->i_private
  * @path:       path to create
  *
@@ -2068,38 +2063,38 @@ static struct dentry *stash_dentry(struct dentry **stashed,
  *
  * Special-purpose helper for nsfs and pidfs.
  *
- * Return: If 0 or an error is returned the caller can be sure that @data must
- *         be cleaned up. If 1 is returned @data is owned by the filesystem.
+ * Return: On success zero and on failure a negative error is returned.
  */
 int path_from_stashed(struct dentry **stashed, unsigned long ino,
-		      struct vfsmount *mnt, const struct file_operations *fops,
-		      const struct inode_operations *iops, void *data,
-		      struct path *path)
+		      struct vfsmount *mnt, void *data, struct path *path)
 {
 	struct dentry *dentry;
-	int ret = 0;
+	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
 
 	/* See if dentry can be reused. */
 	path->dentry = get_stashed_dentry(*stashed);
-	if (path->dentry)
+	if (path->dentry) {
+		sops->put_data(data);
 		goto out_path;
+	}
 
 	/* Allocate a new dentry. */
-	dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, fops, iops, data);
-	if (IS_ERR(dentry))
+	dentry = prepare_anon_dentry(stashed, ino, mnt->mnt_sb, data);
+	if (IS_ERR(dentry)) {
+		sops->put_data(data);
 		return PTR_ERR(dentry);
+	}
 
 	/* Added a new dentry. @data is now owned by the filesystem. */
 	path->dentry = stash_dentry(stashed, dentry);
 	if (path->dentry != dentry)
 		dput(dentry);
-	ret = 1;
 
 out_path:
 	WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
 	WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
 	path->mnt = mntget(mnt);
-	return ret;
+	return 0;
 }
 
 void stashed_dentry_prune(struct dentry *dentry)
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 2ce229af34e9..7aaafb5cb9fc 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -50,19 +50,13 @@ static void nsfs_evict(struct inode *inode)
 int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
-	int ret;
 	struct ns_common *ns;
 
 	ns = ns_get_cb(private_data);
 	if (!ns)
 		return -ENOENT;
-	ret = path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt,
-				&ns_file_operations, NULL, ns, path);
-	if (ret <= 0)
-		ns->ops->put(ns);
-	if (ret < 0)
-		return ret;
-	return 0;
+
+	return path_from_stashed(&ns->stashed, ns->inum, nsfs_mnt, ns, path);
 }
 
 struct ns_get_path_task_args {
@@ -108,9 +102,7 @@ int open_related_ns(struct ns_common *ns,
 	}
 
 	err = path_from_stashed(&relative->stashed, relative->inum, nsfs_mnt,
-				&ns_file_operations, NULL, relative, &path);
-	if (err <= 0)
-		relative->ops->put(relative);
+				relative, &path);
 	if (err < 0) {
 		put_unused_fd(fd);
 		return err;
@@ -207,6 +199,24 @@ static const struct super_operations nsfs_ops = {
 	.show_path = nsfs_show_path,
 };
 
+static void nsfs_init_inode(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	inode->i_mode |= S_IRUGO;
+	inode->i_fop = &ns_file_operations;
+}
+
+static void nsfs_put_data(void *data)
+{
+	struct ns_common *ns = data;
+	ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+	.init_inode = nsfs_init_inode,
+	.put_data = nsfs_put_data,
+};
+
 static int nsfs_init_fs_context(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -214,6 +224,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 	ctx->ops = &nsfs_ops;
 	ctx->dops = &ns_dentry_operations;
+	fc->s_fs_info = (void *)&nsfs_stashed_ops;
 	return 0;
 }
 
diff --git a/fs/pidfs.c b/fs/pidfs.c
index d38b7a184994..8fd71a00be9c 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -193,6 +193,26 @@ static const struct dentry_operations pidfs_dentry_operations = {
 	.d_prune	= stashed_dentry_prune,
 };
 
+static void pidfs_init_inode(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	inode->i_flags |= S_PRIVATE;
+	inode->i_mode |= S_IRWXU;
+	inode->i_op = &pidfs_inode_operations;
+	inode->i_fop = &pidfs_file_operations;
+}
+
+static void pidfs_put_data(void *data)
+{
+	struct pid *pid = data;
+	put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+	.init_inode = pidfs_init_inode,
+	.put_data = pidfs_put_data,
+};
+
 static int pidfs_init_fs_context(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx;
@@ -203,6 +223,7 @@ static int pidfs_init_fs_context(struct fs_context *fc)
 
 	ctx->ops = &pidfs_sops;
 	ctx->dops = &pidfs_dentry_operations;
+	fc->s_fs_info = (void *)&pidfs_stashed_ops;
 	return 0;
 }
 
@@ -225,10 +246,7 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 	* for pseudo filesystems.
 	 */
 	ret = path_from_stashed(&pid->stashed, pid->ino, pidfs_mnt,
-				&pidfs_file_operations, &pidfs_inode_operations,
 				get_pid(pid), &path);
-	if (ret <= 0)
-		put_pid(pid);
 	if (ret < 0)
 		return ERR_PTR(ret);