linux-stable/include/linux/fs_context.h

238 lines
8.2 KiB
C
Raw Normal View History

/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Filesystem superblock creation and reconfiguration context.
*
* Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*/
#ifndef _LINUX_FS_CONTEXT_H
#define _LINUX_FS_CONTEXT_H
#include <linux/kernel.h>
#include <linux/refcount.h>
#include <linux/errno.h>
#include <linux/security.h>
vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. fsopen() is given the name of the filesystem that will be used: int mfd = fsopen(const char *fsname, unsigned int flags); where flags can be 0 or FSOPEN_CLOEXEC. For example: sfd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD); fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0); fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsinfo(sfd, NULL, ...); // query new superblock attributes mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); sfd = fsopen("afs", -1); fsconfig(fd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); mfd = fsmount(sfd, 0, MS_NODEV); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form: "e <subsys>:<problem>" "e SELinux:Mount on mountpoint not permitted" Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails. read() is still possible to retrieve error information. The fsopen() syscall creates a mount context and hangs it of the fd that it returns. Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues. Note that, for the moment, the caller must have SYS_CAP_ADMIN to use fsopen(). Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:33:31 +00:00
#include <linux/mutex.h>
struct cred;
struct dentry;
struct file_operations;
struct file_system_type;
struct mnt_namespace;
struct net;
struct pid_namespace;
struct super_block;
struct user_namespace;
struct vfsmount;
struct path;
enum fs_context_purpose {
FS_CONTEXT_FOR_MOUNT, /* New superblock for explicit mount */
FS_CONTEXT_FOR_SUBMOUNT, /* New superblock for automatic submount */
FS_CONTEXT_FOR_RECONFIGURE, /* Superblock reconfiguration (remount) */
};
vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. fsopen() is given the name of the filesystem that will be used: int mfd = fsopen(const char *fsname, unsigned int flags); where flags can be 0 or FSOPEN_CLOEXEC. For example: sfd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD); fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0); fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsinfo(sfd, NULL, ...); // query new superblock attributes mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); sfd = fsopen("afs", -1); fsconfig(fd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); mfd = fsmount(sfd, 0, MS_NODEV); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form: "e <subsys>:<problem>" "e SELinux:Mount on mountpoint not permitted" Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails. read() is still possible to retrieve error information. The fsopen() syscall creates a mount context and hangs it of the fd that it returns. Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues. Note that, for the moment, the caller must have SYS_CAP_ADMIN to use fsopen(). Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:33:31 +00:00
/*
* Userspace usage phase for fsopen/fspick.
*/
enum fs_context_phase {
FS_CONTEXT_CREATE_PARAMS, /* Loading params for sb creation */
FS_CONTEXT_CREATING, /* A superblock is being created */
FS_CONTEXT_AWAITING_MOUNT, /* Superblock created, awaiting fsmount() */
FS_CONTEXT_AWAITING_RECONF, /* Awaiting initialisation for reconfiguration */
FS_CONTEXT_RECONF_PARAMS, /* Loading params for reconfiguration */
FS_CONTEXT_RECONFIGURING, /* Reconfiguring the superblock */
FS_CONTEXT_FAILED, /* Failed to correctly transition a context */
};
vfs: Add configuration parser helpers Because the new API passes in key,value parameters, match_token() cannot be used with it. Instead, provide three new helpers to aid with parsing: (1) fs_parse(). This takes a parameter and a simple static description of all the parameters and maps the key name to an ID. It returns 1 on a match, 0 on no match if unknowns should be ignored and some other negative error code on a parse error. The parameter description includes a list of key names to IDs, desired parameter types and a list of enumeration name -> ID mappings. [!] Note that for the moment I've required that the key->ID mapping array is expected to be sorted and unterminated. The size of the array is noted in the fsconfig_parser struct. This allows me to use bsearch(), but I'm not sure any performance gain is worth the hassle of requiring people to keep the array sorted. The parameter type array is sized according to the number of parameter IDs and is indexed directly. The optional enum mapping array is an unterminated, unsorted list and the size goes into the fsconfig_parser struct. The function can do some additional things: (a) If it's not ambiguous and no value is given, the prefix "no" on a key name is permitted to indicate that the parameter should be considered negatory. (b) If the desired type is a single simple integer, it will perform an appropriate conversion and store the result in a union in the parse result. (c) If the desired type is an enumeration, {key ID, name} will be looked up in the enumeration list and the matching value will be stored in the parse result union. (d) Optionally generate an error if the key is unrecognised. This is called something like: enum rdt_param { Opt_cdp, Opt_cdpl2, Opt_mba_mpbs, nr__rdt_params }; const struct fs_parameter_spec rdt_param_specs[nr__rdt_params] = { [Opt_cdp] = { fs_param_is_bool }, [Opt_cdpl2] = { fs_param_is_bool }, [Opt_mba_mpbs] = { fs_param_is_bool }, }; const const char *const rdt_param_keys[nr__rdt_params] = { [Opt_cdp] = "cdp", [Opt_cdpl2] = "cdpl2", [Opt_mba_mpbs] = "mba_mbps", }; const struct fs_parameter_description rdt_parser = { .name = "rdt", .nr_params = nr__rdt_params, .keys = rdt_param_keys, .specs = rdt_param_specs, .no_source = true, }; int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result parse; struct rdt_fs_context *ctx = rdt_fc2context(fc); int ret; ret = fs_parse(fc, &rdt_parser, param, &parse); if (ret < 0) return ret; switch (parse.key) { case Opt_cdp: ctx->enable_cdpl3 = true; return 0; case Opt_cdpl2: ctx->enable_cdpl2 = true; return 0; case Opt_mba_mpbs: ctx->enable_mba_mbps = true; return 0; } return -EINVAL; } (2) fs_lookup_param(). This takes a { dirfd, path, LOOKUP_EMPTY? } or string value and performs an appropriate path lookup to convert it into a path object, which it will then return. If the desired type was a blockdev, the type of the looked up inode will be checked to make sure it is one. This can be used like: enum foo_param { Opt_source, nr__foo_params }; const struct fs_parameter_spec foo_param_specs[nr__foo_params] = { [Opt_source] = { fs_param_is_blockdev }, }; const char *char foo_param_keys[nr__foo_params] = { [Opt_source] = "source", }; const struct constant_table foo_param_alt_keys[] = { { "device", Opt_source }, }; const struct fs_parameter_description foo_parser = { .name = "foo", .nr_params = nr__foo_params, .nr_alt_keys = ARRAY_SIZE(foo_param_alt_keys), .keys = foo_param_keys, .alt_keys = foo_param_alt_keys, .specs = foo_param_specs, }; int foo_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result parse; struct foo_fs_context *ctx = foo_fc2context(fc); int ret; ret = fs_parse(fc, &foo_parser, param, &parse); if (ret < 0) return ret; switch (parse.key) { case Opt_source: return fs_lookup_param(fc, &foo_parser, param, &parse, &ctx->source); default: return -EINVAL; } } (3) lookup_constant(). This takes a table of named constants and looks up the given name within it. The table is expected to be sorted such that bsearch() be used upon it. Possibly I should require the table be terminated and just use a for-loop to scan it instead of using bsearch() to reduce hassle. Tables look something like: static const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, { "no", false }, { "true", true }, { "yes", true }, }; and a lookup is done with something like: b = lookup_constant(bool_names, param->string, -1); Additionally, optional validation routines for the parameter description are provided that can be enabled at compile time. A later patch will invoke these when a filesystem is registered. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:24 +00:00
/*
* Type of parameter value.
*/
enum fs_value_type {
fs_value_is_undefined,
fs_value_is_flag, /* Value not given a value */
fs_value_is_string, /* Value is a string */
fs_value_is_blob, /* Value is a binary blob */
fs_value_is_filename, /* Value is a filename* + dirfd */
fs_value_is_file, /* Value is a file* */
};
/*
* Configuration parameter.
*/
struct fs_parameter {
const char *key; /* Parameter name */
enum fs_value_type type:8; /* The type of value here */
union {
char *string;
void *blob;
struct filename *name;
struct file *file;
};
size_t size;
int dirfd;
};
struct p_log {
const char *prefix;
struct fc_log *log;
};
/*
* Filesystem context for holding the parameters used in the creation or
* reconfiguration of a superblock.
*
* Superblock creation fills in ->root whereas reconfiguration begins with this
* already set.
*
* See Documentation/filesystems/mount_api.rst
*/
struct fs_context {
const struct fs_context_operations *ops;
vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. fsopen() is given the name of the filesystem that will be used: int mfd = fsopen(const char *fsname, unsigned int flags); where flags can be 0 or FSOPEN_CLOEXEC. For example: sfd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD); fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0); fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsinfo(sfd, NULL, ...); // query new superblock attributes mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); sfd = fsopen("afs", -1); fsconfig(fd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); mfd = fsmount(sfd, 0, MS_NODEV); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form: "e <subsys>:<problem>" "e SELinux:Mount on mountpoint not permitted" Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails. read() is still possible to retrieve error information. The fsopen() syscall creates a mount context and hangs it of the fd that it returns. Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues. Note that, for the moment, the caller must have SYS_CAP_ADMIN to use fsopen(). Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:33:31 +00:00
struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
void *sget_key;
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
struct p_log log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
void *security; /* LSM options */
void *s_fs_info; /* Proposed s_fs_info */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
enum fs_context_purpose purpose:8;
vfs: syscall: Add fsopen() to prepare for superblock creation Provide an fsopen() system call that starts the process of preparing to create a superblock that will then be mountable, using an fd as a context handle. fsopen() is given the name of the filesystem that will be used: int mfd = fsopen(const char *fsname, unsigned int flags); where flags can be 0 or FSOPEN_CLOEXEC. For example: sfd = fsopen("ext4", FSOPEN_CLOEXEC); fsconfig(sfd, FSCONFIG_SET_PATH, "source", "/dev/sda1", AT_FDCWD); fsconfig(sfd, FSCONFIG_SET_FLAG, "noatime", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "acl", NULL, 0); fsconfig(sfd, FSCONFIG_SET_FLAG, "user_xattr", NULL, 0); fsconfig(sfd, FSCONFIG_SET_STRING, "sb", "1", 0); fsconfig(sfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); fsinfo(sfd, NULL, ...); // query new superblock attributes mfd = fsmount(sfd, FSMOUNT_CLOEXEC, MS_RELATIME); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); sfd = fsopen("afs", -1); fsconfig(fd, FSCONFIG_SET_STRING, "source", "#grand.central.org:root.cell", 0); fsconfig(fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0); mfd = fsmount(sfd, 0, MS_NODEV); move_mount(mfd, "", sfd, AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH); If an error is reported at any step, an error message may be available to be read() back (ENODATA will be reported if there isn't an error available) in the form: "e <subsys>:<problem>" "e SELinux:Mount on mountpoint not permitted" Once fsmount() has been called, further fsconfig() calls will incur EBUSY, even if the fsmount() fails. read() is still possible to retrieve error information. The fsopen() syscall creates a mount context and hangs it of the fd that it returns. Netlink is not used because it is optional and would make the core VFS dependent on the networking layer and also potentially add network namespace issues. Note that, for the moment, the caller must have SYS_CAP_ADMIN to use fsopen(). Signed-off-by: David Howells <dhowells@redhat.com> cc: linux-api@vger.kernel.org Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:33:31 +00:00
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
fs: add FSCONFIG_CMD_CREATE_EXCL Summary ======= This introduces FSCONFIG_CMD_CREATE_EXCL which will allows userspace to implement something like mount -t ext4 --exclusive /dev/sda /B which fails if a superblock for the requested filesystem does already exist: Before this patch ----------------- $ sudo ./move-mount -f xfs -o source=/dev/sda4 /A Requesting filesystem type xfs Mount options requested: source=/dev/sda4 Attaching mount at /A Moving single attached mount Setting key(source) with val(/dev/sda4) $ sudo ./move-mount -f xfs -o source=/dev/sda4 /B Requesting filesystem type xfs Mount options requested: source=/dev/sda4 Attaching mount at /B Moving single attached mount Setting key(source) with val(/dev/sda4) After this patch with --exclusive as a switch for FSCONFIG_CMD_CREATE_EXCL -------------------------------------------------------------------------- $ sudo ./move-mount -f xfs --exclusive -o source=/dev/sda4 /A Requesting filesystem type xfs Request exclusive superblock creation Mount options requested: source=/dev/sda4 Attaching mount at /A Moving single attached mount Setting key(source) with val(/dev/sda4) $ sudo ./move-mount -f xfs --exclusive -o source=/dev/sda4 /B Requesting filesystem type xfs Request exclusive superblock creation Mount options requested: source=/dev/sda4 Attaching mount at /B Moving single attached mount Setting key(source) with val(/dev/sda4) Device or resource busy | move-mount.c: 300: do_fsconfig: i xfs: reusing existing filesystem not allowed Details ======= As mentioned on the list (cf. [1]-[3]) mount requests like mount -t ext4 /dev/sda /A are ambigous for userspace. Either a new superblock has been created and mounted or an existing superblock has been reused and a bind-mount has been created. This becomes clear in the following example where two processes create the same mount for the same block device: P1 P2 fd_fs = fsopen("ext4"); fd_fs = fsopen("ext4"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "source", "/dev/sda"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "dax", "always"); fsconfig(fd_fs, FSCONFIG_SET_STRING, "resuid", "1000"); // wins and creates superblock fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...) // finds compatible superblock of P1 // spins until P1 sets SB_BORN and grabs a reference fsconfig(fd_fs, FSCONFIG_CMD_CREATE, ...) fd_mnt1 = fsmount(fd_fs); fd_mnt2 = fsmount(fd_fs); move_mount(fd_mnt1, "/A") move_mount(fd_mnt2, "/B") Not just does P2 get a bind-mount but the mount options that P2 requestes are silently ignored. The VFS itself doesn't, can't and shouldn't enforce filesystem specific mount option compatibility. It only enforces incompatibility for read-only <-> read-write transitions: mount -t ext4 /dev/sda /A mount -t ext4 -o ro /dev/sda /B The read-only request will fail with EBUSY as the VFS can't just silently transition a superblock from read-write to read-only or vica versa without risking security issues. To userspace this silent superblock reuse can become a security issue in because there is currently no straightforward way for userspace to know that they did indeed manage to create a new superblock and didn't just reuse an existing one. This adds a new FSCONFIG_CMD_CREATE_EXCL command to fsconfig() that returns EBUSY if an existing superblock would be reused. Userspace that needs to be sure that it did create a new superblock with the requested mount options can request superblock creation using this command. If the command succeeds they can be sure that they did create a new superblock with the requested mount options. This requires the new mount api. With the old mount api it would be necessary to plumb this through every legacy filesystem's file_system_type->mount() method. If they want this feature they are most welcome to switch to the new mount api. Following is an analysis of the effect of FSCONFIG_CMD_CREATE_EXCL on each high-level superblock creation helper: (1) get_tree_nodev() Always allocate new superblock. Hence, FSCONFIG_CMD_CREATE and FSCONFIG_CMD_CREATE_EXCL are equivalent. The binderfs or overlayfs filesystems are examples. (4) get_tree_keyed() Finds an existing superblock based on sb->s_fs_info. Hence, FSCONFIG_CMD_CREATE would reuse an existing superblock whereas FSCONFIG_CMD_CREATE_EXCL would reject it with EBUSY. The mqueue or nfsd filesystems are examples. (2) get_tree_bdev() This effectively works like get_tree_keyed(). The ext4 or xfs filesystems are examples. (3) get_tree_single() Only one superblock of this filesystem type can ever exist. Hence, FSCONFIG_CMD_CREATE would reuse an existing superblock whereas FSCONFIG_CMD_CREATE_EXCL would reject it with EBUSY. The securityfs or configfs filesystems are examples. Note that some single-instance filesystems never destroy the superblock once it has been created during the first mount. For example, if securityfs has been mounted at least onces then the created superblock will never be destroyed again as long as there is still an LSM making use it. Consequently, even if securityfs is unmounted and the superblock seemingly destroyed it really isn't which means that FSCONFIG_CMD_CREATE_EXCL will continue rejecting reusing an existing superblock. This is acceptable thugh since special purpose filesystems such as this shouldn't have a need to use FSCONFIG_CMD_CREATE_EXCL anyway and if they do it's probably to make sure that mount options aren't ignored. Following is an analysis of the effect of FSCONFIG_CMD_CREATE_EXCL on filesystems that make use of the low-level sget_fc() helper directly. They're all effectively variants on get_tree_keyed(), get_tree_bdev(), or get_tree_nodev(): (5) mtd_get_sb() Similar logic to get_tree_keyed(). (6) afs_get_tree() Similar logic to get_tree_keyed(). (7) ceph_get_tree() Similar logic to get_tree_keyed(). Already explicitly allows forcing the allocation of a new superblock via CEPH_OPT_NOSHARE. This turns it into get_tree_nodev(). (8) fuse_get_tree_submount() Similar logic to get_tree_nodev(). (9) fuse_get_tree() Forces reuse of existing FUSE superblock. Forces reuse of existing superblock if passed in file refers to an existing FUSE connection. If FSCONFIG_CMD_CREATE_EXCL is specified together with an fd referring to an existing FUSE connections this would cause the superblock reusal to fail. If reusing is the intent then FSCONFIG_CMD_CREATE_EXCL shouldn't be specified. (10) fuse_get_tree() -> get_tree_nodev() Same logic as in get_tree_nodev(). (11) fuse_get_tree() -> get_tree_bdev() Same logic as in get_tree_bdev(). (12) virtio_fs_get_tree() Same logic as get_tree_keyed(). (13) gfs2_meta_get_tree() Forces reuse of existing gfs2 superblock. Mounting gfs2meta enforces that a gf2s superblock must already exist. If not, it will error out. Consequently, mounting gfs2meta with FSCONFIG_CMD_CREATE_EXCL would always fail. If reusing is the intent then FSCONFIG_CMD_CREATE_EXCL shouldn't be specified. (14) kernfs_get_tree() Similar logic to get_tree_keyed(). (15) nfs_get_tree_common() Similar logic to get_tree_keyed(). Already explicitly allows forcing the allocation of a new superblock via NFS_MOUNT_UNSHARED. This effectively turns it into get_tree_nodev(). Link: [1] https://lore.kernel.org/linux-block/20230704-fasching-wertarbeit-7c6ffb01c83d@brauner Link: [2] https://lore.kernel.org/linux-block/20230705-pumpwerk-vielversprechend-a4b1fd947b65@brauner Link: [3] https://lore.kernel.org/linux-fsdevel/20230725-einnahmen-warnschilder-17779aec0a97@brauner Reviewed-by: Josef Bacik <josef@toxicpanda.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Aleksa Sarai <cyphar@cyphar.com> Message-Id: <20230802-vfs-super-exclusive-v2-4-95dc4e41b870@kernel.org> Signed-off-by: Christian Brauner <brauner@kernel.org>
2023-08-02 11:57:06 +00:00
bool exclusive:1; /* create new superblock, reject existing one */
};
struct fs_context_operations {
void (*free)(struct fs_context *fc);
int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
vfs: Implement a filesystem superblock creation/configuration context [AV - unfuck kern_mount_data(); we want non-NULL ->mnt_ns on long-living mounts] [AV - reordering fs/namespace.c is badly overdue, but let's keep it separate from that series] [AV - drop simple_pin_fs() change] [AV - clean vfs_kern_mount() failure exits up] Implement a filesystem context concept to be used during superblock creation for mount and superblock reconfiguration for remount. The mounting procedure then becomes: (1) Allocate new fs_context context. (2) Configure the context. (3) Create superblock. (4) Query the superblock. (5) Create a mount for the superblock. (6) Destroy the context. Rather than calling fs_type->mount(), an fs_context struct is created and fs_type->init_fs_context() is called to set it up. Pointers exist for the filesystem and LSM to hang their private data off. A set of operations has to be set by ->init_fs_context() to provide freeing, duplication, option parsing, binary data parsing, validation, mounting and superblock filling. Legacy filesystems are supported by the provision of a set of legacy fs_context operations that build up a list of mount options and then invoke fs_type->mount() from within the fs_context ->get_tree() operation. This allows all filesystems to be accessed using fs_context. It should be noted that, whilst this patch adds a lot of lines of code, there is quite a bit of duplication with existing code that can be eliminated should all filesystems be converted over. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:25 +00:00
int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
int (*parse_monolithic)(struct fs_context *fc, void *data);
int (*get_tree)(struct fs_context *fc);
int (*reconfigure)(struct fs_context *fc);
};
/*
* fs_context manipulation functions.
*/
extern struct fs_context *fs_context_for_mount(struct file_system_type *fs_type,
unsigned int sb_flags);
extern struct fs_context *fs_context_for_reconfigure(struct dentry *dentry,
unsigned int sb_flags,
unsigned int sb_flags_mask);
extern struct fs_context *fs_context_for_submount(struct file_system_type *fs_type,
struct dentry *reference);
extern struct fs_context *vfs_dup_fs_context(struct fs_context *fc);
vfs: Implement a filesystem superblock creation/configuration context [AV - unfuck kern_mount_data(); we want non-NULL ->mnt_ns on long-living mounts] [AV - reordering fs/namespace.c is badly overdue, but let's keep it separate from that series] [AV - drop simple_pin_fs() change] [AV - clean vfs_kern_mount() failure exits up] Implement a filesystem context concept to be used during superblock creation for mount and superblock reconfiguration for remount. The mounting procedure then becomes: (1) Allocate new fs_context context. (2) Configure the context. (3) Create superblock. (4) Query the superblock. (5) Create a mount for the superblock. (6) Destroy the context. Rather than calling fs_type->mount(), an fs_context struct is created and fs_type->init_fs_context() is called to set it up. Pointers exist for the filesystem and LSM to hang their private data off. A set of operations has to be set by ->init_fs_context() to provide freeing, duplication, option parsing, binary data parsing, validation, mounting and superblock filling. Legacy filesystems are supported by the provision of a set of legacy fs_context operations that build up a list of mount options and then invoke fs_type->mount() from within the fs_context ->get_tree() operation. This allows all filesystems to be accessed using fs_context. It should be noted that, whilst this patch adds a lot of lines of code, there is quite a bit of duplication with existing code that can be eliminated should all filesystems be converted over. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:25 +00:00
extern int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param);
extern int vfs_parse_fs_string(struct fs_context *fc, const char *key,
const char *value, size_t v_size);
int vfs_parse_monolithic_sep(struct fs_context *fc, void *data,
char *(*sep)(char **));
vfs: Implement a filesystem superblock creation/configuration context [AV - unfuck kern_mount_data(); we want non-NULL ->mnt_ns on long-living mounts] [AV - reordering fs/namespace.c is badly overdue, but let's keep it separate from that series] [AV - drop simple_pin_fs() change] [AV - clean vfs_kern_mount() failure exits up] Implement a filesystem context concept to be used during superblock creation for mount and superblock reconfiguration for remount. The mounting procedure then becomes: (1) Allocate new fs_context context. (2) Configure the context. (3) Create superblock. (4) Query the superblock. (5) Create a mount for the superblock. (6) Destroy the context. Rather than calling fs_type->mount(), an fs_context struct is created and fs_type->init_fs_context() is called to set it up. Pointers exist for the filesystem and LSM to hang their private data off. A set of operations has to be set by ->init_fs_context() to provide freeing, duplication, option parsing, binary data parsing, validation, mounting and superblock filling. Legacy filesystems are supported by the provision of a set of legacy fs_context operations that build up a list of mount options and then invoke fs_type->mount() from within the fs_context ->get_tree() operation. This allows all filesystems to be accessed using fs_context. It should be noted that, whilst this patch adds a lot of lines of code, there is quite a bit of duplication with existing code that can be eliminated should all filesystems be converted over. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2018-11-01 23:07:25 +00:00
extern int generic_parse_monolithic(struct fs_context *fc, void *data);
extern int vfs_get_tree(struct fs_context *fc);
extern void put_fs_context(struct fs_context *fc);
extern int vfs_parse_fs_param_source(struct fs_context *fc,
struct fs_parameter *param);
cgroup1: fix leaked context root causing sporadic NULL deref in LTP Richard reported sporadic (roughly one in 10 or so) null dereferences and other strange behaviour for a set of automated LTP tests. Things like: BUG: kernel NULL pointer dereference, address: 0000000000000008 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 0 PID: 1516 Comm: umount Not tainted 5.10.0-yocto-standard #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-48-gd9c812dda519-prebuilt.qemu.org 04/01/2014 RIP: 0010:kernfs_sop_show_path+0x1b/0x60 ...or these others: RIP: 0010:do_mkdirat+0x6a/0xf0 RIP: 0010:d_alloc_parallel+0x98/0x510 RIP: 0010:do_readlinkat+0x86/0x120 There were other less common instances of some kind of a general scribble but the common theme was mount and cgroup and a dubious dentry triggering the NULL dereference. I was only able to reproduce it under qemu by replicating Richard's setup as closely as possible - I never did get it to happen on bare metal, even while keeping everything else the same. In commit 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") we see this as a part of the overall change: -------------- struct cgroup_subsys *ss; - struct dentry *dentry; [...] - dentry = cgroup_do_mount(&cgroup_fs_type, fc->sb_flags, root, - CGROUP_SUPER_MAGIC, ns); [...] - if (percpu_ref_is_dying(&root->cgrp.self.refcnt)) { - struct super_block *sb = dentry->d_sb; - dput(dentry); + ret = cgroup_do_mount(fc, CGROUP_SUPER_MAGIC, ns); + if (!ret && percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + struct super_block *sb = fc->root->d_sb; + dput(fc->root); deactivate_locked_super(sb); msleep(10); return restart_syscall(); } -------------- In changing from the local "*dentry" variable to using fc->root, we now export/leave that dentry pointer in the file context after doing the dput() in the unlikely "is_dying" case. With LTP doing a crazy amount of back to back mount/unmount [testcases/bin/cgroup_regression_5_1.sh] the unlikely becomes slightly likely and then bad things happen. A fix would be to not leave the stale reference in fc->root as follows: --------------                 dput(fc->root); + fc->root = NULL;                 deactivate_locked_super(sb); -------------- ...but then we are just open-coding a duplicate of fc_drop_locked() so we simply use that instead. Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Tejun Heo <tj@kernel.org> Cc: Zefan Li <lizefan.x@bytedance.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: stable@vger.kernel.org # v5.1+ Reported-by: Richard Purdie <richard.purdie@linuxfoundation.org> Fixes: 71d883c37e8d ("cgroup_do_mount(): massage calling conventions") Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Signed-off-by: Tejun Heo <tj@kernel.org>
2021-06-16 12:51:57 +00:00
extern void fc_drop_locked(struct fs_context *fc);
int reconfigure_single(struct super_block *s,
int flags, void *data);
extern int get_tree_nodev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
extern int get_tree_single(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
extern int get_tree_keyed(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc),
void *key);
int setup_bdev_super(struct super_block *sb, int sb_flags,
struct fs_context *fc);
extern int get_tree_bdev(struct fs_context *fc,
int (*fill_super)(struct super_block *sb,
struct fs_context *fc));
extern const struct file_operations fscontext_fops;
/*
* Mount error, warning and informational message logging. This structure is
* shareable between a mount and a subordinate mount.
*/
struct fc_log {
refcount_t usage;
u8 head; /* Insertion index in buffer[] */
u8 tail; /* Removal index in buffer[] */
u8 need_free; /* Mask of kfree'able items in buffer[] */
struct module *owner; /* Owner module for strings that don't then need freeing */
char *buffer[8];
};
extern __attribute__((format(printf, 4, 5)))
void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...);
#define __logfc(fc, l, fmt, ...) logfc((fc)->log.log, NULL, \
l, fmt, ## __VA_ARGS__)
#define __plog(p, l, fmt, ...) logfc((p)->log, (p)->prefix, \
l, fmt, ## __VA_ARGS__)
/**
* infof - Store supplementary informational message
* @fc: The context in which to log the informational message
* @fmt: The format string
*
* Store the supplementary informational message for the process if the process
* has enabled the facility.
*/
#define infof(fc, fmt, ...) __logfc(fc, 'i', fmt, ## __VA_ARGS__)
#define info_plog(p, fmt, ...) __plog(p, 'i', fmt, ## __VA_ARGS__)
#define infofc(p, fmt, ...) __plog((&(fc)->log), 'i', fmt, ## __VA_ARGS__)
/**
* warnf - Store supplementary warning message
* @fc: The context in which to log the error message
* @fmt: The format string
*
* Store the supplementary warning message for the process if the process has
* enabled the facility.
*/
#define warnf(fc, fmt, ...) __logfc(fc, 'w', fmt, ## __VA_ARGS__)
#define warn_plog(p, fmt, ...) __plog(p, 'w', fmt, ## __VA_ARGS__)
#define warnfc(fc, fmt, ...) __plog((&(fc)->log), 'w', fmt, ## __VA_ARGS__)
/**
* errorf - Store supplementary error message
* @fc: The context in which to log the error message
* @fmt: The format string
*
* Store the supplementary error message for the process if the process has
* enabled the facility.
*/
#define errorf(fc, fmt, ...) __logfc(fc, 'e', fmt, ## __VA_ARGS__)
#define error_plog(p, fmt, ...) __plog(p, 'e', fmt, ## __VA_ARGS__)
#define errorfc(fc, fmt, ...) __plog((&(fc)->log), 'e', fmt, ## __VA_ARGS__)
/**
* invalf - Store supplementary invalid argument error message
* @fc: The context in which to log the error message
* @fmt: The format string
*
* Store the supplementary error message for the process if the process has
* enabled the facility and return -EINVAL.
*/
#define invalf(fc, fmt, ...) (errorf(fc, fmt, ## __VA_ARGS__), -EINVAL)
#define inval_plog(p, fmt, ...) (error_plog(p, fmt, ## __VA_ARGS__), -EINVAL)
#define invalfc(fc, fmt, ...) (errorfc(fc, fmt, ## __VA_ARGS__), -EINVAL)
#endif /* _LINUX_FS_CONTEXT_H */