diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075..f3dbd84a0e40 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -174,6 +174,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. + config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/pidfs.c b/fs/pidfs.c index eccb291862a0..6c3f010074af 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include +#include #include #include #include @@ -14,10 +16,12 @@ static int pidfd_release(struct inode *inode, struct file *file) { +#ifndef CONFIG_FS_PID struct pid *pid = file->private_data; file->private_data = NULL; put_pid(pid); +#endif return 0; } @@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid *pid = f->private_data; + struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; @@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct pid *pid = file->private_data; + struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; @@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } -const struct file_operations pidfd_fops = { +static const struct file_operations pidfs_file_operations = { .release = pidfd_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif }; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; +static struct super_block *pidfs_sb __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. + */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations = { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct inode *inode; + struct file *pidfd_file; + + inode = iget_locked(pidfs_sb, pid->ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. + */ + inode->i_ino = pid->ino; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; + inode->i_flags |= S_IMMUTABLE; + inode->i_private = get_pid(pid); + simple_inode_init_ts(inode); + unlock_new_inode(inode); + } + + pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, + &pidfs_file_operations); + if (IS_ERR(pidfd_file)) + iput(inode); + + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_sb = pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +#endif diff --git a/include/linux/pid.h b/include/linux/pid.h index 8124d57752b9..956481128e8d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -55,6 +55,9 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; +#ifdef CONFIG_FS_PID + unsigned long ino; +#endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -66,8 +69,6 @@ struct pid extern struct pid init_struct_pid; -extern const struct file_operations pidfd_fops; - struct file; struct pid *pidfd_pid(const struct file *file); diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h new file mode 100644 index 000000000000..75bdf9807802 --- /dev/null +++ b/include/linux/pidfs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_FS_H +#define _LINUX_PID_FS_H + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); +void __init pidfs_init(void); + +#endif /* _LINUX_PID_FS_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..1b40a968ba91 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/main.c b/init/main.c index e24b0780fdff..2fbf6a3114d5 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1059,6 +1060,7 @@ void start_kernel(void) seq_file_init(); proc_root_init(); nsfs_init(); + pidfs_init(); cpuset_init(); cgroup_init(); taskstats_init_early(); diff --git a/kernel/fork.c b/kernel/fork.c index 662a61f340ce..2f839c290dcf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include @@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return file->private_data; - - return ERR_PTR(-EBADF); -} - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ /* * anon_inode_getfile() ignores everything outside of the * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1c..6ec3deec68c2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index c1d940fbd314..581cc34341fd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr);