From df0e91d488276086bc07da2e389986cae0048c37 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 8 Feb 2018 15:17:38 +0100 Subject: [PATCH 01/14] fuse: atomic_o_trunc should truncate pagecache Fuse has an "atomic_o_trunc" mode, where userspace filesystem uses the O_TRUNC flag in the OPEN request to truncate the file atomically with the open. In this mode there's no need to send a SETATTR request to userspace after the open, so fuse_do_setattr() checks this mode and returns. But this misses the important step of truncating the pagecache. Add the missing parts of truncation to the ATTR_OPEN branch. Reported-by: Chad Austin Fixes: 6ff958edbf39 ("fuse: add atomic open+truncate support") Signed-off-by: Miklos Szeredi Cc: --- fs/fuse/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 24967382a7b1..7a980b4462d9 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1629,8 +1629,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, return err; if (attr->ia_valid & ATTR_OPEN) { - if (fc->atomic_o_trunc) + /* This is coming from open(..., ... | O_TRUNC); */ + WARN_ON(!(attr->ia_valid & ATTR_SIZE)); + WARN_ON(attr->ia_size != 0); + if (fc->atomic_o_trunc) { + /* + * No need to send request to userspace, since actual + * truncation has already been done by OPEN. But still + * need to truncate page cache. + */ + i_size_write(inode, 0); + truncate_pagecache(inode, 0); return 0; + } file = NULL; } From 3b7008b226f3de811d4ac34238e9cf670f7c9fe7 Mon Sep 17 00:00:00 2001 From: Szymon Lukasz Date: Thu, 9 Nov 2017 21:23:35 +0100 Subject: [PATCH 02/14] fuse: return -ECONNABORTED on /dev/fuse read after abort Currently the userspace has no way of knowing whether the fuse connection ended because of umount or abort via sysfs. It makes it hard for filesystems to free the mountpoint after abort without worrying about removing some new mount. The patch fixes it by returning different errors when userspace reads from /dev/fuse (-ENODEV for umount and -ECONNABORTED for abort). Add a new capability flag FUSE_ABORT_ERROR. If set and the connection is gone because of sysfs abort, reading from the device will return -ECONNABORTED. Signed-off-by: Szymon Lukasz Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 2 +- fs/fuse/cuse.c | 4 ++-- fs/fuse/dev.c | 12 +++++++----- fs/fuse/fuse_i.h | 8 +++++++- fs/fuse/inode.c | 9 ++++++--- include/uapi/linux/fuse.h | 7 ++++++- 6 files changed, 29 insertions(+), 13 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index b9ea99c5b5b3..78fb7a07c5ca 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -35,7 +35,7 @@ static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf, { struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (fc) { - fuse_abort_conn(fc); + fuse_abort_conn(fc, true); fuse_conn_put(fc); } return count; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e9e97803442a..31d33b69957f 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -406,7 +406,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) err_region: unregister_chrdev_region(devt, 1); err: - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); goto out; } @@ -581,7 +581,7 @@ static ssize_t cuse_class_abort_store(struct device *dev, { struct cuse_conn *cc = dev_get_drvdata(dev); - fuse_abort_conn(&cc->fc); + fuse_abort_conn(&cc->fc, false); return count; } static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5d06384c2cae..78b6293bd04d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1234,9 +1234,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, if (err) goto err_unlock; - err = -ENODEV; - if (!fiq->connected) + if (!fiq->connected) { + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto err_unlock; + } if (!list_empty(&fiq->interrupts)) { req = list_entry(fiq->interrupts.next, struct fuse_req, @@ -1287,7 +1288,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); if (!fpq->connected) { - err = -ENODEV; + err = (fc->aborted && fc->abort_err) ? -ECONNABORTED : -ENODEV; goto out_end; } if (err) { @@ -2076,7 +2077,7 @@ static void end_polls(struct fuse_conn *fc) * is OK, the request will in that case be removed from the list before we touch * it. */ -void fuse_abort_conn(struct fuse_conn *fc) +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort) { struct fuse_iqueue *fiq = &fc->iq; @@ -2089,6 +2090,7 @@ void fuse_abort_conn(struct fuse_conn *fc) fc->connected = 0; fc->blocked = 0; + fc->aborted = is_abort; fuse_set_initialized(fc); list_for_each_entry(fud, &fc->devices, entry) { struct fuse_pqueue *fpq = &fud->pq; @@ -2151,7 +2153,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { WARN_ON(fc->iq.fasync != NULL); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); } fuse_dev_free(fud); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index c4c093bbf456..7d2e7deea64b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -515,6 +515,9 @@ struct fuse_conn { abort and device release */ unsigned connected; + /** Connection aborted via sysfs */ + bool aborted; + /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ @@ -526,6 +529,9 @@ struct fuse_conn { /** Do readpages asynchronously? Only set in INIT */ unsigned async_read:1; + /** Return an unique read error after abort. Only set in INIT */ + unsigned abort_err:1; + /** Do not send separate SETATTR request before open(O_TRUNC) */ unsigned atomic_o_trunc:1; @@ -851,7 +857,7 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ -void fuse_abort_conn(struct fuse_conn *fc); +void fuse_abort_conn(struct fuse_conn *fc, bool is_abort); /** * Invalidate inode attributes diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 624f18bbfd2b..3c9b675d99da 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -371,7 +371,7 @@ void fuse_unlock_inode(struct inode *inode) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + fuse_abort_conn(get_fuse_conn_super(sb), false); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -393,7 +393,7 @@ static void fuse_put_super(struct super_block *sb) fuse_send_destroy(fc); - fuse_abort_conn(fc); + fuse_abort_conn(fc, false); mutex_lock(&fuse_mutex); list_del(&fc->entry); fuse_ctl_remove_conn(fc); @@ -918,6 +918,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->posix_acl = 1; fc->sb->s_xattr = fuse_acl_xattr_handlers; } + if (arg->flags & FUSE_ABORT_ERROR) + fc->abort_err = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -948,7 +950,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | - FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL; + FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | + FUSE_ABORT_ERROR; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 4b5001c57f46..92fa24c24c92 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -113,6 +113,9 @@ * 7.26 * - add FUSE_HANDLE_KILLPRIV * - add FUSE_POSIX_ACL + * + * 7.27 + * - add FUSE_ABORT_ERROR */ #ifndef _LINUX_FUSE_H @@ -148,7 +151,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 26 +#define FUSE_KERNEL_MINOR_VERSION 27 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -245,6 +248,7 @@ struct fuse_file_lock { * FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir * FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc * FUSE_POSIX_ACL: filesystem supports posix acls + * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -267,6 +271,7 @@ struct fuse_file_lock { #define FUSE_PARALLEL_DIROPS (1 << 18) #define FUSE_HANDLE_KILLPRIV (1 << 19) #define FUSE_POSIX_ACL (1 << 20) +#define FUSE_ABORT_ERROR (1 << 21) /** * CUSE INIT request/reply flags From dbf107b2a7f36fa635b40e0b554514f599c75b33 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 20 Feb 2018 20:25:20 -0600 Subject: [PATCH 03/14] fuse: Remove the buggy retranslation of pids in fuse_dev_do_read At the point of fuse_dev_do_read the user space process that initiated the action on the fuse filesystem may no longer exist. The process have been killed or may have fired an asynchronous request and exited. If the initial process has exited, the code "pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)" will either return a pid of 0, or in the unlikely event that the pid has been reallocated it can return practically any pid. Any pid is possible as the pid allocator allocates pid numbers in different pid namespaces independently. The only way to make translation in fuse_dev_do_read reliable is to call get_pid in fuse_req_init_context, and pid_vnr followed by put_pid in fuse_dev_do_read. That reference counting in other contexts has been shown to bounce cache lines between processors and in general be slow. So that is not desirable. The only known user of running the fuse server in a different pid namespace from the filesystem does not care what the pids are in the fuse messages so removing this code should not matter. Getting the translation to a server running outside of the pid namespace of a container can still be achieved by playing setns games at mount time. It is also possible to add an option to pass a pid namespace into the fuse filesystem at mount time. Fixes: 5d6d3a301c4e ("fuse: allow server to run in different pid_ns") Signed-off-by: "Eric W. Biederman" Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 78b6293bd04d..6e005da03aed 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1261,12 +1261,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; - if (task_active_pid_ns(current) != fc->pid_ns) { - rcu_read_lock(); - in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); - rcu_read_unlock(); - } - /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; From c9582eb0ff7d2b560be60eafab29183882cdc82b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 21 Feb 2018 10:52:06 -0600 Subject: [PATCH 04/14] fuse: Fail all requests with invalid uids or gids Upon a cursory examinination the uid and gid of a fuse request are necessary for correct operation. Failing a fuse request where those values are not reliable seems a straight forward and reliable means of ensuring that fuse requests with bad data are not sent or processed. In most cases the vfs will avoid actions it suspects will cause an inode write back of an inode with an invalid uid or gid. But that does not map precisely to what fuse is doing, so test for this and solve this at the fuse level as well. Performing this work in fuse_req_init_context is cheap as the code is already performing the translation here and only needs to check the result of the translation to see if things are not representable in a form the fuse server can handle. [SzM] Don't zero the context for the nofail case, just keep using the munging version (makes sense for debugging and doesn't hurt). Signed-off-by: Eric W. Biederman Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6e005da03aed..4852f8803156 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -112,13 +112,6 @@ static void __fuse_put_request(struct fuse_req *req) refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) -{ - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); -} - void fuse_set_initialized(struct fuse_conn *fc) { /* Make sure stores before this are seen on another CPU */ @@ -163,11 +156,19 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid(&init_user_ns, current_fsuid()); + req->in.h.gid = from_kgid(&init_user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); + if (unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { + fuse_put_request(fc, req); + return ERR_PTR(-EOVERFLOW); + } return req; out: @@ -256,7 +257,10 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(fc, req); + req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); + __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; From 8cb08329b0809453722bc12aa912be34355bcb66 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 21 Feb 2018 11:18:07 -0600 Subject: [PATCH 05/14] fuse: Support fuse filesystems outside of init_user_ns In order to support mounts from namespaces other than init_user_ns, fuse must translate uids and gids to/from the userns of the process servicing requests on /dev/fuse. This patch does that, with a couple of restrictions on the namespace: - The userns for the fuse connection is fixed to the namespace from which /dev/fuse is opened. - The namespace must be the same as s_user_ns. These restrictions simplify the implementation by avoiding the need to pass around userns references and by allowing fuse to rely on the checks in setattr_prepare for ownership changes. Either restriction could be relaxed in the future if needed. For cuse the userns used is the opener of /dev/cuse. Semantically the cuse support does not appear safe for unprivileged users. Practically the permissions on /dev/cuse only make it accessible to the global root user. If something slips through the cracks in a user namespace the only users who will be able to use the cuse device are those users mapped into the user namespace. Translation in the posix acl is updated to use the uuser namespace of the filesystem. Avoiding cases which might bypass this translation is handled in a following change. This change is stronlgy based on a similar change from Seth Forshee and Dongsu Park. Cc: Seth Forshee Cc: Dongsu Park Signed-off-by: Eric W. Biederman Signed-off-by: Miklos Szeredi --- fs/fuse/acl.c | 4 ++-- fs/fuse/cuse.c | 7 ++++++- fs/fuse/dev.c | 8 ++++---- fs/fuse/dir.c | 14 +++++++------- fs/fuse/fuse_i.h | 6 +++++- fs/fuse/inode.c | 31 +++++++++++++++++++------------ 6 files changed, 43 insertions(+), 27 deletions(-) diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index ec85765502f1..5a48cee6d7d3 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -34,7 +34,7 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return ERR_PTR(-ENOMEM); size = fuse_getxattr(inode, name, value, PAGE_SIZE); if (size > 0) - acl = posix_acl_from_xattr(&init_user_ns, value, size); + acl = posix_acl_from_xattr(fc->user_ns, value, size); else if ((size == 0) || (size == -ENODATA) || (size == -EOPNOTSUPP && fc->no_getxattr)) acl = NULL; @@ -81,7 +81,7 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (!value) return -ENOMEM; - ret = posix_acl_to_xattr(&init_user_ns, acl, value, size); + ret = posix_acl_to_xattr(fc->user_ns, acl, value, size); if (ret < 0) { kfree(value); return ret; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 31d33b69957f..8f68181256c0 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -48,6 +48,7 @@ #include #include #include +#include #include "fuse_i.h" @@ -498,7 +499,11 @@ static int cuse_channel_open(struct inode *inode, struct file *file) if (!cc) return -ENOMEM; - fuse_conn_init(&cc->fc); + /* + * Limit the cuse channel to requests that can + * be represented in file->f_cred->user_ns. + */ + fuse_conn_init(&cc->fc, file->f_cred->user_ns); fud = fuse_dev_alloc(&cc->fc); if (!fud) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4852f8803156..686631f12001 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -156,8 +156,8 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - req->in.h.uid = from_kuid(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid(&init_user_ns, current_fsgid()); + req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); @@ -257,8 +257,8 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 7a980b4462d9..cf8a1cd7a62e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -858,8 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(&init_user_ns, attr->uid); - stat->gid = make_kgid(&init_user_ns, attr->gid); + stat->uid = make_kuid(fc->user_ns, attr->uid); + stat->gid = make_kgid(fc->user_ns, attr->gid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1475,17 +1475,17 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_cmtime) +static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, + struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid); + arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid); + arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1657,7 +1657,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_cmtime); + iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7d2e7deea64b..f630951df8dc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -26,6 +26,7 @@ #include #include #include +#include /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -466,6 +467,9 @@ struct fuse_conn { /** The pid namespace for this mount */ struct pid_namespace *pid_ns; + /** The user namespace for this mount */ + struct user_namespace *user_ns; + /** Maximum read size */ unsigned max_read; @@ -876,7 +880,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); /** * Release reference to fuse_conn diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3c9b675d99da..1643043d4fe5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -171,8 +171,8 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_ino = fuse_squash_ino(attr->ino); inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); set_nlink(inode, attr->nlink); - inode->i_uid = make_kuid(&init_user_ns, attr->uid); - inode->i_gid = make_kgid(&init_user_ns, attr->gid); + inode->i_uid = make_kuid(fc->user_ns, attr->uid); + inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; @@ -477,7 +477,8 @@ static int fuse_match_uint(substring_t *s, unsigned int *res) return err; } -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) +static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, + struct user_namespace *user_ns) { char *p; memset(d, 0, sizeof(struct fuse_mount_data)); @@ -513,7 +514,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_USER_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->user_id = make_kuid(current_user_ns(), uv); + d->user_id = make_kuid(user_ns, uv); if (!uid_valid(d->user_id)) return 0; d->user_id_present = 1; @@ -522,7 +523,7 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev) case OPT_GROUP_ID: if (fuse_match_uint(&args[0], &uv)) return 0; - d->group_id = make_kgid(current_user_ns(), uv); + d->group_id = make_kgid(user_ns, uv); if (!gid_valid(d->group_id)) return 0; d->group_id_present = 1; @@ -565,8 +566,8 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id)); + seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) seq_puts(m, ",default_permissions"); if (fc->allow_other) @@ -597,7 +598,7 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -621,6 +622,7 @@ void fuse_conn_init(struct fuse_conn *fc) fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); + fc->user_ns = get_user_ns(user_ns); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -630,6 +632,7 @@ void fuse_conn_put(struct fuse_conn *fc) if (fc->destroy_req) fuse_request_free(fc->destroy_req); put_pid_ns(fc->pid_ns); + put_user_ns(fc->user_ns); fc->release(fc); } } @@ -1064,7 +1067,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev)) + if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) goto err; if (is_bdev) { @@ -1089,8 +1092,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) goto err; - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != &init_user_ns)) + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if (file->f_op != &fuse_dev_operations || + file->f_cred->user_ns != sb->s_user_ns) goto err_fput; fc = kmalloc(sizeof(*fc), GFP_KERNEL); @@ -1098,7 +1105,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!fc) goto err_fput; - fuse_conn_init(fc); + fuse_conn_init(fc, sb->s_user_ns); fc->release = fuse_free_conn; fud = fuse_dev_alloc(fc); From 73f03c2b4b527346778c711c2734dbff3442b139 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Fri, 22 Dec 2017 15:32:33 +0100 Subject: [PATCH 06/14] fuse: Restrict allow_other to the superblock's namespace or a descendant Unprivileged users are normally restricted from mounting with the allow_other option by system policy, but this could be bypassed for a mount done with user namespace root permissions. In such cases allow_other should not allow users outside the userns to access the mount as doing so would give the unprivileged user the ability to manipulate processes it would otherwise be unable to manipulate. Restrict allow_other to apply to users in the same userns used at mount or a descendant of that namespace. Also export current_in_userns() for use by fuse when built as a module. Reviewed-by: Serge Hallyn Signed-off-by: Seth Forshee Signed-off-by: Dongsu Park Signed-off-by: Eric W. Biederman Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- kernel/user_namespace.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index cf8a1cd7a62e..fa4009761a7a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1030,7 +1030,7 @@ int fuse_allow_current_process(struct fuse_conn *fc) const struct cred *cred; if (fc->allow_other) - return 1; + return current_in_userns(fc->user_ns); cred = current_cred(); if (uid_eq(cred->euid, fc->user_id) && diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 246d4d4ce5c7..492c255e6c5a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1235,6 +1235,7 @@ bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } +EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { From ff1b89f389a8e64d0a583ce0b0308696f4ab5860 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 20 Mar 2018 17:11:44 +0100 Subject: [PATCH 07/14] fuse: honor AT_STATX_DONT_SYNC The description of this flag says "Don't sync attributes with the server". In other words: always use the attributes cached in the kernel and don't send network or local messages to refresh the attributes. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index fa4009761a7a..ef883f0bee8f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -924,12 +924,13 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, } static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat) + struct kstat *stat, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; - if (time_before64(fi->i_time, get_jiffies_64())) { + if (!(flags & AT_STATX_DONT_SYNC) && + time_before64(fi->i_time, get_jiffies_64())) { forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { @@ -943,7 +944,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, int fuse_update_attributes(struct inode *inode, struct file *file) { - return fuse_update_get_attr(inode, file, NULL); + return fuse_update_get_attr(inode, file, NULL, 0); } int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, @@ -1794,7 +1795,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_get_attr(inode, NULL, stat); + return fuse_update_get_attr(inode, NULL, stat, flags); } static const struct inode_operations fuse_dir_inode_operations = { From bf5c1898bf9e6d5ce9840743e8eccc74c14d16a4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 20 Mar 2018 17:11:44 +0100 Subject: [PATCH 08/14] fuse: honor AT_STATX_FORCE_SYNC Force a refresh of attributes from the fuse server in this case. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ef883f0bee8f..56231b31f806 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -928,9 +928,16 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; + bool sync; - if (!(flags & AT_STATX_DONT_SYNC) && - time_before64(fi->i_time, get_jiffies_64())) { + if (flags & AT_STATX_FORCE_SYNC) + sync = true; + else if (flags & AT_STATX_DONT_SYNC) + sync = false; + else + sync = time_before64(fi->i_time, get_jiffies_64()); + + if (sync) { forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { From 5ba24197b94d4070d8c2a17fa4944a55cc39ef03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 20 Mar 2018 17:11:45 +0100 Subject: [PATCH 09/14] fuse: add writeback documentation Document various modes of I/O supported by the fuse kernel module. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/fuse-io.txt | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Documentation/filesystems/fuse-io.txt diff --git a/Documentation/filesystems/fuse-io.txt b/Documentation/filesystems/fuse-io.txt new file mode 100644 index 000000000000..07b8f73f100f --- /dev/null +++ b/Documentation/filesystems/fuse-io.txt @@ -0,0 +1,38 @@ +Fuse supports the following I/O modes: + +- direct-io +- cached + + write-through + + writeback-cache + +The direct-io mode can be selected with the FOPEN_DIRECT_IO flag in the +FUSE_OPEN reply. + +In direct-io mode the page cache is completely bypassed for reads and writes. +No read-ahead takes place. Shared mmap is disabled. + +In cached mode reads may be satisfied from the page cache, and data may be +read-ahead by the kernel to fill the cache. The cache is always kept consistent +after any writes to the file. All mmap modes are supported. + +The cached mode has two sub modes controlling how writes are handled. The +write-through mode is the default and is supported on all kernels. The +writeback-cache mode may be selected by the FUSE_WRITEBACK_CACHE flag in the +FUSE_INIT reply. + +In write-through mode each write is immediately sent to userspace as one or more +WRITE requests, as well as updating any cached pages (and caching previously +uncached, but fully written pages). No READ requests are ever sent for writes, +so when an uncached page is partially written, the page is discarded. + +In writeback-cache mode (enabled by the FUSE_WRITEBACK_CACHE flag) writes go to +the cache only, which means that the write(2) syscall can often complete very +fast. Dirty pages are written back implicitly (background writeback or page +reclaim on memory pressure) or explicitly (invoked by close(2), fsync(2) and +when the last ref to the file is being released on munmap(2)). This mode +assumes that all changes to the filesystem go through the FUSE kernel module +(size and atime/ctime/mtime attributes are kept up-to-date by the kernel), so +it's generally not suitable for network filesystems. If a partial page is +written, then the page needs to be first read from userspace. This means, that +even for files opened for O_WRONLY it is possible that READ requests will be +generated by the kernel. From e45b2546e23c2d10f8585063a15c745a7603fac9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 4 May 2018 11:47:28 -0500 Subject: [PATCH 10/14] fuse: Ensure posix acls are translated outside of init_user_ns Ensure the translation happens by failing to read or write posix acls when the filesystem has not indicated it supports posix acls. This ensures that modern cached posix acl support is available and used when dealing with posix acls. This is important because only that path has the code to convernt the uids and gids in posix acls into the user namespace of a fuse filesystem. Signed-off-by: "Eric W. Biederman" Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 + fs/fuse/inode.c | 7 +++++++ fs/fuse/xattr.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f630951df8dc..5256ad333b05 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -985,6 +985,7 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); extern const struct xattr_handler *fuse_xattr_handlers[]; extern const struct xattr_handler *fuse_acl_xattr_handlers[]; +extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1643043d4fe5..22c76cf8c2e3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1100,6 +1100,13 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) file->f_cred->user_ns != sb->s_user_ns) goto err_fput; + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 3caac46b08b0..433717640f78 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -192,6 +192,26 @@ static int fuse_xattr_set(const struct xattr_handler *handler, return fuse_setxattr(inode, name, value, size, flags); } +static bool no_xattr_list(struct dentry *dentry) +{ + return false; +} + +static int no_xattr_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *inode, + const char *name, void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int no_xattr_set(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *nodee, + const char *name, const void *value, + size_t size, int flags) +{ + return -EOPNOTSUPP; +} + static const struct xattr_handler fuse_xattr_handler = { .prefix = "", .get = fuse_xattr_get, @@ -209,3 +229,26 @@ const struct xattr_handler *fuse_acl_xattr_handlers[] = { &fuse_xattr_handler, NULL }; + +static const struct xattr_handler fuse_no_acl_access_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +static const struct xattr_handler fuse_no_acl_default_xattr_handler = { + .name = XATTR_NAME_POSIX_ACL_DEFAULT, + .flags = ACL_TYPE_ACCESS, + .list = no_xattr_list, + .get = no_xattr_get, + .set = no_xattr_set, +}; + +const struct xattr_handler *fuse_no_acl_xattr_handlers[] = { + &fuse_no_acl_access_xattr_handler, + &fuse_no_acl_default_xattr_handler, + &fuse_xattr_handler, + NULL +}; From 4ad769f3c346ec3d458e255548dec26ca5284cf6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 29 May 2018 09:04:46 -0500 Subject: [PATCH 11/14] fuse: Allow fully unprivileged mounts Now that the fuse and the vfs work is complete. Allow the fuse filesystem to be mounted by the root user in a user namespace. Signed-off-by: "Eric W. Biederman" Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 22c76cf8c2e3..48baa26993f3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1222,7 +1222,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, .mount = fuse_mount, .kill_sb = fuse_kill_sb_anon, }; From 8a301eb16d99983a4961f884690ec97b92e7dcfe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 2 Feb 2018 09:54:14 -0800 Subject: [PATCH 12/14] fuse: fix congested state leak on aborted connections If a connection gets aborted while congested, FUSE can leave nr_wb_congested[] stuck until reboot causing wait_iff_congested() to wait spuriously which can lead to severe performance degradation. The leak is caused by gating congestion state clearing with fc->connected test in request_end(). This was added way back in 2009 by 26c3679101db ("fuse: destroy bdi on umount"). While the commit description doesn't explain why the test was added, it most likely was to avoid dereferencing bdi after it got destroyed. Since then, bdi lifetime rules have changed many times and now we're always guaranteed to have access to the bdi while the superblock is alive (fc->sb). Drop fc->connected conditional to avoid leaking congestion states. Signed-off-by: Tejun Heo Reported-by: Joshua Miller Cc: Johannes Weiner Cc: stable@vger.kernel.org # v2.6.29+ Acked-by: Jan Kara Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 686631f12001..e03ca14f40e9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -385,8 +385,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!fc->blocked && waitqueue_active(&fc->blocked_waitq)) wake_up(&fc->blocked_waitq); - if (fc->num_background == fc->congestion_threshold && - fc->connected && fc->sb) { + if (fc->num_background == fc->congestion_threshold && fc->sb) { clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } From 6becdb601bae2a043d7fb9762c4d48699528ea6e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 31 May 2018 12:26:10 +0200 Subject: [PATCH 13/14] fuse: fix control dir setup and teardown syzbot is reporting NULL pointer dereference at fuse_ctl_remove_conn() [1]. Since fc->ctl_ndents is incremented by fuse_ctl_add_conn() when new_inode() failed, fuse_ctl_remove_conn() reaches an inode-less dentry and tries to clear d_inode(dentry)->i_private field. Fix by only adding the dentry to the array after being fully set up. When tearing down the control directory, do d_invalidate() on it to get rid of any mounts that might have been added. [1] https://syzkaller.appspot.com/bug?id=f396d863067238959c91c0b7cfc10b163638cac6 Reported-by: syzbot Fixes: bafa96541b25 ("[PATCH] fuse: add control filesystem") Cc: # v2.6.18 Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 78fb7a07c5ca..0b694655d988 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -211,10 +211,11 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (!dentry) return NULL; - fc->ctl_dentry[fc->ctl_ndents++] = dentry; inode = new_inode(fuse_control_sb); - if (!inode) + if (!inode) { + dput(dentry); return NULL; + } inode->i_ino = get_next_ino(); inode->i_mode = mode; @@ -228,6 +229,9 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, set_nlink(inode, nlink); inode->i_private = fc; d_add(dentry, inode); + + fc->ctl_dentry[fc->ctl_ndents++] = dentry; + return dentry; } @@ -284,7 +288,10 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; d_inode(dentry)->i_private = NULL; - d_drop(dentry); + if (!i) { + /* Get rid of submounts: */ + d_invalidate(dentry); + } dput(dentry); } drop_nlink(d_inode(fuse_control_sb->s_root)); From 543b8f8662fe6d21f19958b666ab0051af9db21a Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 1 May 2018 13:12:14 +0900 Subject: [PATCH 14/14] fuse: don't keep dead fuse_conn at fuse_fill_super(). syzbot is reporting use-after-free at fuse_kill_sb_blk() [1]. Since sb->s_fs_info field is not cleared after fc was released by fuse_conn_put() when initialization failed, fuse_kill_sb_blk() finds already released fc and tries to hold the lock. Fix this by clearing sb->s_fs_info field after calling fuse_conn_put(). [1] https://syzkaller.appspot.com/bug?id=a07a680ed0a9290585ca424546860464dd9658db Signed-off-by: Tetsuo Handa Reported-by: syzbot Fixes: 3b463ae0c626 ("fuse: invalidation reverse calls") Cc: John Muir Cc: Csaba Henk Cc: Anand Avati Cc: # v2.6.31 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 48baa26993f3..061500c72608 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1193,6 +1193,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_dev_free(fud); err_put_conn: fuse_conn_put(fc); + sb->s_fs_info = NULL; err_fput: fput(file); err: