fuse update for 6.0

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQSQHSd0lITzzeNWNm3h3BK/laaZPAUCYvD4IQAKCRDh3BK/laaZ
 PDHHAP93H+2E9c6biGd5pEaL2ABChRY+wsQURzD+SZ6AL3JaUQD+KHxA4q0MJvws
 L6CWcf2XptUDCLe3P6sgSTvv5Gk1OAM=
 =FoXF
 -----END PGP SIGNATURE-----

Merge tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse

Pull fuse updates from Miklos Szeredi:

 - Fix an issue with reusing the bdi in case of block based filesystems

 - Allow root (in init namespace) to access fuse filesystems in user
   namespaces if expicitly enabled with a module param

 - Misc fixes

* tag 'fuse-update-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
  fuse: retire block-device-based superblock on force unmount
  vfs: function to prevent re-use of block-device-based superblocks
  virtio_fs: Modify format for virtio_fs_direct_access
  virtiofs: delete unused parameter for virtio_fs_cleanup_vqs
  fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other
  fuse: Remove the control interface for virtio-fs
  fuse: ioctl: translate ENOSYS
  fuse: limit nsec
  fuse: avoid unnecessary spinlock bump
  fuse: fix deadlock between atomic O_TRUNC and page invalidation
  fuse: write inode in fuse_release()
This commit is contained in:
Linus Torvalds 2022-08-08 11:10:02 -07:00
commit 2bd5d41e0e
10 changed files with 132 additions and 33 deletions

View file

@ -279,7 +279,7 @@ How are requirements fulfilled?
the filesystem or not.
Note that the *ptrace* check is not strictly necessary to
prevent B/2/i, it is enough to check if mount owner has enough
prevent C/2/i, it is enough to check if mount owner has enough
privilege to send signal to the process accessing the
filesystem, since *SIGSTOP* can be used to get a similar effect.
@ -288,10 +288,29 @@ I think these limitations are unacceptable?
If a sysadmin trusts the users enough, or can ensure through other
measures, that system processes will never enter non-privileged
mounts, it can relax the last limitation with a 'user_allow_other'
config option. If this config option is set, the mounting user can
add the 'allow_other' mount option which disables the check for other
users' processes.
mounts, it can relax the last limitation in several ways:
- With the 'user_allow_other' config option. If this config option is
set, the mounting user can add the 'allow_other' mount option which
disables the check for other users' processes.
User namespaces have an unintuitive interaction with 'allow_other':
an unprivileged user - normally restricted from mounting with
'allow_other' - could do so in a user namespace where they're
privileged. If any process could access such an 'allow_other' mount
this would give the mounting user the ability to manipulate
processes in user namespaces where they're unprivileged. For this
reason 'allow_other' restricts access to users in the same userns
or a descendant.
- With the 'allow_sys_admin_access' module option. If this option is
set, super user's processes have unrestricted access to mounts
irrespective of allow_other setting or user namespace of the
mounting user.
Note that both of these relaxations expose the system to potential
information leak or *DoS* as described in points B and C/2/i-ii in the
preceding section.
Kernel - userspace interface
============================

View file

@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
struct dentry *parent;
char name[32];
if (!fuse_control_sb)
if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
int i;
if (!fuse_control_sb)
if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {

View file

@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd)
WARN_ON(fcd->nr_free_ranges <= 0);
fcd->nr_free_ranges--;
}
__kick_dmap_free_worker(fcd, 0);
spin_unlock(&fcd->lock);
kick_dmap_free_worker(fcd, 0);
return dmap;
}

View file

@ -11,6 +11,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fs_context.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/slab.h>
@ -21,6 +22,11 @@
#include <linux/types.h>
#include <linux/kernel.h>
static bool __read_mostly allow_sys_admin_access;
module_param(allow_sys_admin_access, bool, 0644);
MODULE_PARM_DESC(allow_sys_admin_access,
"Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check");
static void fuse_advise_use_readdirplus(struct inode *dir)
{
struct fuse_inode *fi = get_fuse_inode(dir);
@ -537,6 +543,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
struct fuse_file *ff;
void *security_ctx = NULL;
u32 security_ctxlen;
bool trunc = flags & O_TRUNC;
/* Userspace expects S_IFREG in create mode */
BUG_ON((mode & S_IFMT) != S_IFREG);
@ -561,7 +568,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
inarg.mode = mode;
inarg.umask = current_umask();
if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) &&
if (fm->fc->handle_killpriv_v2 && trunc &&
!(flags & O_EXCL) && !capable(CAP_FSETID)) {
inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID;
}
@ -623,6 +630,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
} else {
file->private_data = ff;
fuse_finish_open(inode, file);
if (fm->fc->atomic_o_trunc && trunc)
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
}
return err;
@ -1224,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc)
{
const struct cred *cred;
if (allow_sys_admin_access && capable(CAP_SYS_ADMIN))
return 1;
if (fc->allow_other)
return current_in_userns(fc->user_ns);

View file

@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file)
fi->attr_version = atomic64_inc_return(&fc->attr_version);
i_size_write(inode, 0);
spin_unlock(&fi->lock);
truncate_pagecache(inode, 0);
file_update_time(file);
fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
} else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) {
invalidate_inode_pages2(inode->i_mapping);
}
if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
fuse_link_write_file(file);
}
@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
if (is_wb_truncate || dax_truncate) {
if (is_wb_truncate || dax_truncate)
inode_lock(inode);
fuse_set_nowrite(inode);
}
if (dax_truncate) {
filemap_invalidate_lock(inode->i_mapping);
err = fuse_dax_break_layouts(inode, 0, 0);
if (err)
goto out;
goto out_inode_unlock;
}
if (is_wb_truncate || dax_truncate)
fuse_set_nowrite(inode);
err = fuse_do_open(fm, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
out:
if (is_wb_truncate || dax_truncate)
fuse_release_nowrite(inode);
if (!err) {
struct fuse_file *ff = file->private_data;
if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
truncate_pagecache(inode, 0);
else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
invalidate_inode_pages2(inode->i_mapping);
}
if (dax_truncate)
filemap_invalidate_unlock(inode->i_mapping);
if (is_wb_truncate | dax_truncate) {
fuse_release_nowrite(inode);
out_inode_unlock:
if (is_wb_truncate || dax_truncate)
inode_unlock(inode);
}
return err;
}
@ -338,6 +342,15 @@ static int fuse_open(struct inode *inode, struct file *file)
static int fuse_release(struct inode *inode, struct file *file)
{
struct fuse_conn *fc = get_fuse_conn(inode);
/*
* Dirty pages might remain despite write_inode_now() call from
* fuse_flush() due to writes racing with the close.
*/
if (fc->writeback_cache)
write_inode_now(inode, 1);
fuse_release_common(file, false);
/* return value is ignored by VFS */

View file

@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
/* Sanitize nsecs */
attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
@ -476,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb)
{
struct fuse_conn *fc = get_fuse_conn_super(sb);
if (!fc->no_force_umount)
fuse_abort_conn(fc);
if (fc->no_force_umount)
return;
fuse_abort_conn(fc);
// Only retire block-device-based superblocks.
if (sb->s_bdev != NULL)
retire_super(sb);
}
static void fuse_send_destroy(struct fuse_mount *fm)

View file

@ -9,6 +9,17 @@
#include <linux/compat.h>
#include <linux/fileattr.h>
static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args)
{
ssize_t ret = fuse_simple_request(fm, args);
/* Translate ENOSYS, which shouldn't be returned from fs */
if (ret == -ENOSYS)
ret = -ENOTTY;
return ret;
}
/*
* CUSE servers compiled on 32bit broke on 64bit kernels because the
* ABI was defined to be 'struct iovec' which is different on 32bit
@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
ap.args.out_pages = true;
ap.args.out_argvar = true;
transferred = fuse_simple_request(fm, &ap.args);
transferred = fuse_send_ioctl(fm, &ap.args);
err = transferred;
if (transferred < 0)
goto out;
@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff,
args.out_args[1].size = inarg.out_size;
args.out_args[1].value = ptr;
err = fuse_simple_request(fm, &args);
err = fuse_send_ioctl(fm, &args);
if (!err) {
if (outarg.result < 0)
err = outarg.result;

View file

@ -741,8 +741,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev,
}
/* Free virtqueues (device must already be reset) */
static void virtio_fs_cleanup_vqs(struct virtio_device *vdev,
struct virtio_fs *fs)
static void virtio_fs_cleanup_vqs(struct virtio_device *vdev)
{
vdev->config->del_vqs(vdev);
}
@ -757,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
{
struct virtio_fs *fs = dax_get_private(dax_dev);
phys_addr_t offset = PFN_PHYS(pgoff);
size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff;
size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff;
if (kaddr)
*kaddr = fs->window_kaddr + offset;
@ -895,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
out_vqs:
virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
virtio_fs_cleanup_vqs(vdev);
kfree(fs->vqs);
out:
@ -927,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev)
virtio_fs_stop_all_queues(fs);
virtio_fs_drain_all_queues_locked(fs);
virtio_reset_device(vdev);
virtio_fs_cleanup_vqs(vdev, fs);
virtio_fs_cleanup_vqs(vdev);
vdev->priv = NULL;
/* Put device reference on virtio_fs object */

View file

@ -422,6 +422,35 @@ bool trylock_super(struct super_block *sb)
return false;
}
/**
* retire_super - prevents superblock from being reused
* @sb: superblock to retire
*
* The function marks superblock to be ignored in superblock test, which
* prevents it from being reused for any new mounts. If the superblock has
* a private bdi, it also unregisters it, but doesn't reduce the refcount
* of the superblock to prevent potential races. The refcount is reduced
* by generic_shutdown_super(). The function can not be called
* concurrently with generic_shutdown_super(). It is safe to call the
* function multiple times, subsequent calls have no effect.
*
* The marker will affect the re-use only for block-device-based
* superblocks. Other superblocks will still get marked if this function
* is used, but that will not affect their reusability.
*/
void retire_super(struct super_block *sb)
{
WARN_ON(!sb->s_bdev);
down_write(&sb->s_umount);
if (sb->s_iflags & SB_I_PERSB_BDI) {
bdi_unregister(sb->s_bdi);
sb->s_iflags &= ~SB_I_PERSB_BDI;
}
sb->s_iflags |= SB_I_RETIRED;
up_write(&sb->s_umount);
}
EXPORT_SYMBOL(retire_super);
/**
* generic_shutdown_super - common helper for ->kill_sb()
* @sb: superblock to kill
@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc)
static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc)
{
return s->s_bdev == fc->sget_key;
return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key;
}
/**
@ -1309,7 +1338,7 @@ EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
{
return (void *)s->s_bdev == data;
return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data;
}
struct dentry *mount_bdev(struct file_system_type *fs_type,

View file

@ -1433,6 +1433,7 @@ extern int send_sigurg(struct fown_struct *fown);
#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */
#define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */
/* Possible states of 'frozen' field */
enum {
@ -2565,6 +2566,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type,
int flags, void *data,
int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);