diff --git a/Documentation/ABI/testing/sysfs-fs-virtiofs b/Documentation/ABI/testing/sysfs-fs-virtiofs new file mode 100644 index 000000000000..4839dbce997e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-virtiofs @@ -0,0 +1,11 @@ +What: /sys/fs/virtiofs//tag +Date: Feb 2024 +Contact: virtio-fs@lists.linux.dev +Description: + [RO] The mount "tag" that can be used to mount this filesystem. + +What: /sys/fs/virtiofs//device +Date: Feb 2024 +Contact: virtio-fs@lists.linux.dev +Description: + Symlink to the virtio device that exports this filesystem. diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5..8674dbfbe59d 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,14 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_PASSTHROUGH + bool "FUSE passthrough operations support" + default y + depends on FUSE_FS + select FS_STACK + help + This allows bypassing FUSE server by mapping specific FUSE operations + to be performed directly on a backing file. + + If you want to allow passthrough operations, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d..6e0228c6d0cb 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 284a35006462..97ac994ff78f 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - down_read(&fc->killsb); - spin_lock(&fc->bg_lock); - fc->congestion_threshold = val; - spin_unlock(&fc->bg_lock); - up_read(&fc->killsb); + WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1a8f82f478cb..3ec8bb5e68ff 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1775,6 +1775,61 @@ copy_finish: return err; } +/* + * Resending all processing queue requests. + * + * During a FUSE daemon panics and failover, it is possible for some inflight + * requests to be lost and never returned. As a result, applications awaiting + * replies would become stuck forever. To address this, we can use notification + * to trigger resending of these pending requests to the FUSE daemon, ensuring + * they are properly processed again. + * + * Please note that this strategy is applicable only to idempotent requests or + * if the FUSE daemon takes careful measures to avoid processing duplicated + * non-idempotent requests. + */ +static void fuse_resend(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + struct fuse_req *req, *next; + struct fuse_iqueue *fiq = &fc->iq; + LIST_HEAD(to_queue); + unsigned int i; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], &to_queue); + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + list_for_each_entry_safe(req, next, &to_queue, list) { + __set_bit(FR_PENDING, &req->flags); + /* mark the request as resend request */ + req->in.h.unique |= FUSE_UNIQUE_RESEND; + } + + spin_lock(&fiq->lock); + /* iq and pq requests are both oldest to newest */ + list_splice(&to_queue, &fiq->pending); + fiq->ops->wake_pending_and_unlock(fiq); +} + +static int fuse_notify_resend(struct fuse_conn *fc) +{ + fuse_resend(fc); + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1800,6 +1855,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); + case FUSE_NOTIFY_RESEND: + return fuse_notify_resend(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2251,43 +2309,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; + if (get_user(oldfd, argp)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + + fdput(f); + return res; +} + +static long fuse_dev_ioctl_backing_open(struct file *file, + struct fuse_backing_map __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_backing_map map; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (copy_from_user(&map, argp, sizeof(map))) + return -EFAULT; + + return fuse_backing_open(fud->fc, &map); +} + +static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + int backing_id; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (get_user(backing_id, argp)) + return -EFAULT; + + return fuse_backing_close(fud->fc, backing_id); +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + switch (cmd) { case FUSE_DEV_IOC_CLONE: - if (get_user(oldfd, (__u32 __user *)arg)) - return -EFAULT; + return fuse_dev_ioctl_clone(file, argp); - f = fdget(oldfd); - if (!f.file) - return -EINVAL; + case FUSE_DEV_IOC_BACKING_OPEN: + return fuse_dev_ioctl_backing_open(file, argp); - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); + case FUSE_DEV_IOC_BACKING_CLOSE: + return fuse_dev_ioctl_backing_close(file, argp); - res = -EINVAL; - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fdput(f); - break; default: - res = -ENOTTY; - break; + return -ENOTTY; } - return res; } const struct file_operations fuse_dev_operations = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d19cbf34c634..4a6df591add6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; + if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { + pr_warn_once("root generation should be zero\n"); + outarg->generation = 0; + } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), @@ -615,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; - struct fuse_open_out outopen; + struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; @@ -630,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; @@ -659,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; - args.out_args[1].size = sizeof(outopen); - args.out_args[1].value = &outopen; + /* Store outarg for fuse_finish_open() */ + outopenp = &ff->args->open_outarg; + args.out_args[1].size = sizeof(*outopenp); + args.out_args[1].value = outopenp; err = get_create_ext(&args, dir, entry, mode); if (err) @@ -676,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_invalid_attr(&outentry.attr)) goto out_free_ff; - ff->fh = outopen.fh; + ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; + ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { @@ -692,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) @@ -1210,7 +1218,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file, if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { - make_bad_inode(inode); + fuse_make_bad(inode); return -EIO; } @@ -1485,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission - * modell. + * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this @@ -1630,7 +1638,30 @@ out_err: static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c007b0f0c3a7..a56e7bffd000 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -20,6 +20,7 @@ #include #include #include +#include static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); - mutex_destroy(&ff->readdir.lock); + kfree(ff->args); kfree(ff); } @@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { - struct fuse_open_out outarg; + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); @@ -274,10 +289,13 @@ out_inode_unlock: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; + + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -634,7 +653,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, for (i = 0; i < ap->num_pages; i++) { if (should_dirty) set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + if (ap->args.is_pinned) + unpin_user_page(ap->pages[i]); } } @@ -925,7 +945,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, put_page(page); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1299,13 +1319,93 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return false; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. + */ + if (fuse_io_past_eof(iocb, from) || + fuse_file_uncached_io_start(inode, ff, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_file_uncached_io_end(inode, ff); + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; - ssize_t err; + ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -1327,10 +1427,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; + task_io_account_write(count); + err = file_remove_privs(file); if (err) goto out; @@ -1392,10 +1494,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + struct page **pt_pages; + + pt_pages = &ap->pages[ap->num_pages]; + ret = iov_iter_extract_pages(ii, &pt_pages, + *nbytesp - nbytes, + max_pages - ap->num_pages, + 0, &start); if (ret < 0) break; @@ -1412,6 +1517,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; @@ -1558,51 +1664,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. - */ - if (fuse_direct_write_extending_i_size(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { + task_io_account_write(res); if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { res = fuse_direct_IO(iocb, from); } else { @@ -1611,10 +1683,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -1631,10 +1700,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1649,10 +1721,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_write_iter(iocb, from); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else + return fuse_cache_write_iter(iocb, from); +} + +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); + else + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) @@ -1667,7 +1767,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) __free_page(ap->pages[i]); if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1909,7 +2009,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -1947,26 +2047,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } -static int fuse_writepage_locked(struct page *page) +static int fuse_writepage_locked(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; + struct folio *tmp_folio; int error = -ENOMEM; - set_page_writeback(page); + folio_start_writeback(folio); wpa = fuse_writepage_args_alloc(); if (!wpa) goto err; ap = &wpa->ia.ap; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto err_free; error = -EIO; @@ -1975,21 +2075,21 @@ static int fuse_writepage_locked(struct page *page) goto err_nofile; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0); - copy_highpage(tmp_page, page); + folio_copy(tmp_folio, folio); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = tmp_page; + ap->pages[0] = &tmp_folio->page; ap->descs[0].offset = 0; ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -1997,48 +2097,20 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); + folio_end_writeback(folio); return 0; err_nofile: - __free_page(tmp_page); + folio_put(tmp_folio); err_free: kfree(wpa); err: - mapping_set_error(page->mapping, error); - end_page_writeback(page); + mapping_set_error(folio->mapping, error); + folio_end_writeback(folio); return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; @@ -2307,7 +2379,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: @@ -2401,7 +2473,7 @@ static int fuse_launder_folio(struct folio *folio) /* Serialize with pending writeback for the same page */ fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(&folio->page); + err = fuse_writepage_locked(folio); if (!err) fuse_wait_on_page_writeback(inode, folio->index); } @@ -2462,13 +2534,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED + /* + * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) @@ -2476,7 +2565,19 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) invalidate_inode_pages2(file->f_mapping); - return generic_file_mmap(file, vma); + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + */ + rc = fuse_file_cached_io_start(inode, ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -2580,10 +2681,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->c.flc_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); @@ -3213,8 +3310,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3225,10 +3322,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, @@ -3245,7 +3342,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bcbe34488862..b24084b60864 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -76,6 +76,16 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** Container for data related to mapping to backing file */ +struct fuse_backing { + struct file *file; + struct cred *cred; + + /** refcount */ + refcount_t count; + struct rcu_head rcu; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -111,7 +121,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -123,9 +133,15 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -173,6 +189,10 @@ struct fuse_inode { #endif /** Submount specific lookup tracking */ struct fuse_submount_lookup *submount_lookup; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct fuse_backing *fb; +#endif }; /** FUSE inode state bits */ @@ -187,19 +207,21 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; struct fuse_mount; -struct fuse_release_args; +union fuse_file_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_mount *fm; - /* Argument space reserved for release */ - struct fuse_release_args *release_args; + /* Argument space reserved for open/release */ + union fuse_file_args *args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -221,12 +243,6 @@ struct fuse_file { /* Readdir related */ struct { - /* - * Protects below fields against (crazy) parallel readdir on - * same open file. Uncontended in the normal case. - */ - struct mutex lock; - /* Dir stream position */ loff_t pos; @@ -244,6 +260,15 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct file *passthrough; + const struct cred *cred; +#endif + /** Has flock been performed on this file? */ bool flock:1; }; @@ -283,6 +308,7 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; + bool is_pinned:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); @@ -295,6 +321,19 @@ struct fuse_args_pages { unsigned int num_pages; }; +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +union fuse_file_args { + /* Used during open() */ + struct fuse_open_out open_outarg; + /* Used during release() */ + struct fuse_release_args release_args; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -818,6 +857,12 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /** Passthrough support for read/write IO */ + unsigned int passthrough:1; + + /** Maximum stack depth for passthrough backing files */ + int max_stack_depth; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -867,6 +912,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** IDR for backing files ids */ + struct idr backing_files_map; +#endif }; /* @@ -940,7 +990,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation, static inline void fuse_make_bad(struct inode *inode) { - remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } @@ -1032,14 +1081,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); @@ -1349,11 +1393,82 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); + +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +#ifdef CONFIG_FUSE_PASSTHROUGH +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); +void fuse_backing_put(struct fuse_backing *fb); +#else + +static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + return NULL; +} + +static inline void fuse_backing_put(struct fuse_backing *fb) +{ +} +#endif + +void fuse_backing_files_init(struct fuse_conn *fc); +void fuse_backing_files_free(struct fuse_conn *fc); +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); +int fuse_backing_close(struct fuse_conn *fc, int backing_id); + +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id); +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); + +static inline struct file *fuse_file_passthrough(struct fuse_file *ff) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return ff->passthrough; +#else + return NULL; +#endif +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 516ea2979a90..3a5d88878335 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + return &fi->inode; out_free_forget: @@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode) #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + kmem_cache_free(fuse_inode_cachep, fi); } @@ -469,8 +475,11 @@ retry: } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); - iput(inode); - goto retry; + if (inode != d_inode(sb->s_root)) { + remove_inode_hash(inode); + iput(inode); + goto retry; + } } fi = get_fuse_inode(inode); spin_lock(&fi->lock); @@ -924,6 +933,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; @@ -954,6 +966,8 @@ void fuse_conn_put(struct fuse_conn *fc) WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); call_rcu(&fc->rcu, delayed_release); } } @@ -974,7 +988,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); } struct fuse_inode_handle { @@ -1117,6 +1131,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -1291,6 +1310,26 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1337,7 +1376,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1346,6 +1386,8 @@ void fuse_send_init(struct fuse_mount *fm) #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; @@ -1496,8 +1538,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, + .uid = __kuid_val(fi->inode.i_uid), + .gid = __kgid_val(fi->inode.i_gid), .rdev = fi->inode.i_rdev, .blksize = 1u << fi->inode.i_blkbits, }; @@ -1534,6 +1576,7 @@ static int fuse_fill_super_submount(struct super_block *sb, sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 000000000000..c653ddcf0578 --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +/* + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); +} + +/* + * Start cached io mode. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. + */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* There are no io modes if server does not implement open */ + if (!ff->args) + return 0; + + spin_lock(&fi->lock); + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); + } + + /* + * Check if inode entered passthrough io mode while waiting for parallel + * dio write completion. + */ + if (fuse_inode_backing(fi)) { + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + return -ETXTBSY; + } + + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } + spin_unlock(&fi->lock); + return 0; +} + +static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb; + int err = 0; + + spin_lock(&fi->lock); + /* deny conflicting backing files on same fuse inode */ + oldfb = fuse_inode_backing(fi); + if (oldfb && oldfb != fb) { + err = -EBUSY; + goto unlock; + } + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode != IOM_NONE); + fi->iocachectr--; + ff->iomode = IOM_UNCACHED; + + /* fuse inode holds a single refcount of backing file */ + if (!oldfb) { + oldfb = fuse_inode_backing_set(fi, fb); + WARN_ON_ONCE(oldfb != NULL); + } else { + fuse_backing_put(fb); + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb = NULL; + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fi->iocachectr++; + if (!fi->iocachectr) { + wake_up(&fi->direct_io_waitq); + oldfb = fuse_inode_backing_set(fi, NULL); + } + spin_unlock(&fi->lock); + if (oldfb) + fuse_backing_put(oldfb); +} + +/* + * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. + * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write + * operations go directly to the server, but mmap is done on the backing file. + * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode + * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. + */ +#define FOPEN_PASSTHROUGH_MASK \ + (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ + FOPEN_NOFLUSH) + +static int fuse_file_passthrough_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_backing *fb; + int err; + + /* Check allowed conditions for file open in passthrough mode */ + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || + (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) + return -EINVAL; + + fb = fuse_passthrough_open(file, inode, + ff->args->open_outarg.backing_id); + if (IS_ERR(fb)) + return PTR_ERR(fb); + + /* First passthrough file open denies caching inode io mode */ + err = fuse_file_uncached_io_start(inode, ff, fb); + if (!err) + return 0; + + fuse_passthrough_release(ff, fb); + fuse_backing_put(fb); + + return err; +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->args) + return 0; + + /* + * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode + * which is already open for passthrough. + */ + err = -EINVAL; + if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) + goto fail; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First passthrough file open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if ((ff->open_flags & FOPEN_DIRECT_IO) && + !(ff->open_flags & FOPEN_PASSTHROUGH)) + return 0; + + if (ff->open_flags & FOPEN_PASSTHROUGH) + err = fuse_file_passthrough_open(inode, file); + else + err = fuse_file_cached_io_start(inode, ff); + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + /* + * Last parallel dio close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_end(inode, ff); + break; + case IOM_CACHED: + fuse_file_cached_io_end(inode, ff); + break; + } +} diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 000000000000..1567f0323858 --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include + +static void fuse_file_accessed(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_atime(inode); +} + +static void fuse_file_modified(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + inode_lock(inode); + ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = in, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + return backing_file_splice_read(backing_file, ppos, pipe, len, flags, + &ctx); +} + +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct inode *inode = file_inode(out); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = out, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + inode_lock(inode); + ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__, + backing_file, vma->vm_start, vma->vm_end); + + return backing_file_mmap(backing_file, vma, &ctx); +} + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags) + goto out; + + file = fget(map->fd); + res = -EBADF; + if (!file) + goto out; + + res = -EOPNOTSUPP; + if (!file->f_op->read_iter || !file->f_op->write_iter) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +/* + * Setup passthrough to a backing file. + * + * Returns an fb object with elevated refcount to be stored in fuse inode. + */ +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_backing *fb = NULL; + struct file *backing_file; + int err; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + err = -ENOENT; + if (!fb) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ + backing_file = backing_file_open(&file->f_path, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { + fuse_backing_put(fb); + goto out; + } + + err = 0; + ff->passthrough = backing_file; + ff->cred = get_cred(fb->cred); +out: + pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__, + backing_id, fb, ff->passthrough, err); + + return err ? ERR_PTR(err) : fb; +} + +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__, + fb, ff->passthrough); + + fput(ff->passthrough); + ff->passthrough = NULL; + put_cred(ff->cred); + ff->cred = NULL; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c66a54d6c7d3..0377b6dc24c8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) if (fuse_is_bad(inode)) return -EIO; - mutex_lock(&ff->readdir.lock); - err = UNCACHED; if (ff->open_flags & FOPEN_CACHE_DIR) err = fuse_readdir_cached(file, ctx); if (err == UNCACHED) err = fuse_readdir_uncached(file, ctx); - mutex_unlock(&ff->readdir.lock); - return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index a28466c2da71..322af827a232 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -32,6 +32,9 @@ static DEFINE_MUTEX(virtio_fs_mutex); static LIST_HEAD(virtio_fs_instances); +/* The /sys/fs/virtio_fs/ kset */ +static struct kset *virtio_fs_kset; + enum { VQ_HIPRIO, VQ_REQUEST @@ -56,7 +59,7 @@ struct virtio_fs_vq { /* A virtio-fs device instance */ struct virtio_fs { - struct kref refcount; + struct kobject kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -162,18 +165,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) complete(&fsvq->in_flight_zero); } -static void release_virtio_fs_obj(struct kref *ref) +static ssize_t tag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + return sysfs_emit(buf, fs->tag); +} + +static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); + +static struct attribute *virtio_fs_attrs[] = { + &virtio_fs_tag_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(virtio_fs); + +static void virtio_fs_ktype_release(struct kobject *kobj) +{ + struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); kfree(vfs->vqs); kfree(vfs); } +static const struct kobj_type virtio_fs_ktype = { + .release = virtio_fs_ktype_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = virtio_fs_groups, +}; + /* Make sure virtiofs_mutex is held */ static void virtio_fs_put(struct virtio_fs *fs) { - kref_put(&fs->refcount, release_virtio_fs_obj); + kobject_put(&fs->kobj); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) @@ -244,25 +269,46 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } /* Add a new instance to the list or return -EEXIST if tag name exists*/ -static int virtio_fs_add_instance(struct virtio_fs *fs) +static int virtio_fs_add_instance(struct virtio_device *vdev, + struct virtio_fs *fs) { struct virtio_fs *fs2; - bool duplicate = false; + int ret; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs2, &virtio_fs_instances, list) { - if (strcmp(fs->tag, fs2->tag) == 0) - duplicate = true; + if (strcmp(fs->tag, fs2->tag) == 0) { + mutex_unlock(&virtio_fs_mutex); + return -EEXIST; + } } - if (!duplicate) - list_add_tail(&fs->list, &virtio_fs_instances); + /* Use the virtio_device's index as a unique identifier, there is no + * need to allocate our own identifiers because the virtio_fs instance + * is only visible to userspace as long as the underlying virtio_device + * exists. + */ + fs->kobj.kset = virtio_fs_kset; + ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); + if (ret < 0) { + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); + if (ret < 0) { + kobject_del(&fs->kobj); + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + list_add_tail(&fs->list, &virtio_fs_instances); mutex_unlock(&virtio_fs_mutex); - if (duplicate) - return -EEXIST; + kobject_uevent(&fs->kobj, KOBJ_ADD); + return 0; } @@ -275,7 +321,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag) list_for_each_entry(fs, &virtio_fs_instances, list) { if (strcmp(fs->tag, tag) == 0) { - kref_get(&fs->refcount); + kobject_get(&fs->kobj); goto found; } } @@ -324,6 +370,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) return -ENOMEM; memcpy(fs->tag, tag_buf, len); fs->tag[len] = '\0'; + + /* While the VIRTIO specification allows any character, newlines are + * awkward on mount(8) command-lines and cause problems in the sysfs + * "tag" attr and uevent TAG= properties. Forbid them. + */ + if (strchr(fs->tag, '\n')) { + dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n"); + return -EINVAL; + } + return 0; } @@ -346,7 +402,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) kfree(req); dec_in_flight_req(fsvq); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); } @@ -628,7 +684,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); /* End requests */ @@ -872,7 +928,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) fs = kzalloc(sizeof(*fs), GFP_KERNEL); if (!fs) return -ENOMEM; - kref_init(&fs->refcount); + kobject_init(&fs->kobj, &virtio_fs_ktype); vdev->priv = fs; ret = virtio_fs_read_tag(vdev, fs); @@ -894,7 +950,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) */ virtio_device_ready(vdev); - ret = virtio_fs_add_instance(fs); + ret = virtio_fs_add_instance(vdev, fs); if (ret < 0) goto out_vqs; @@ -903,11 +959,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); - kfree(fs->vqs); out: vdev->priv = NULL; - kfree(fs); + kobject_put(&fs->kobj); return ret; } @@ -931,6 +986,8 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + sysfs_remove_link(&fs->kobj, "device"); + kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); @@ -1517,21 +1574,56 @@ static struct file_system_type virtio_fs_type = { .kill_sb = virtio_kill_sb, }; +static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + add_uevent_var(env, "TAG=%s", fs->tag); + return 0; +} + +static const struct kset_uevent_ops virtio_fs_uevent_ops = { + .uevent = virtio_fs_uevent, +}; + +static int __init virtio_fs_sysfs_init(void) +{ + virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops, + fs_kobj); + if (!virtio_fs_kset) + return -ENOMEM; + return 0; +} + +static void virtio_fs_sysfs_exit(void) +{ + kset_unregister(virtio_fs_kset); + virtio_fs_kset = NULL; +} + static int __init virtio_fs_init(void) { int ret; - ret = register_virtio_driver(&virtio_fs_driver); + ret = virtio_fs_sysfs_init(); if (ret < 0) return ret; + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + goto sysfs_exit; + ret = register_filesystem(&virtio_fs_type); - if (ret < 0) { - unregister_virtio_driver(&virtio_fs_driver); - return ret; - } + if (ret < 0) + goto unregister_virtio_driver; return 0; + +unregister_virtio_driver: + unregister_virtio_driver(&virtio_fs_driver); +sysfs_exit: + virtio_fs_sysfs_exit(); + return ret; } module_init(virtio_fs_init); @@ -1539,6 +1631,7 @@ static void __exit virtio_fs_exit(void) { unregister_filesystem(&virtio_fs_type); unregister_virtio_driver(&virtio_fs_driver); + virtio_fs_sysfs_exit(); } module_exit(virtio_fs_exit); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e7418d15fe39..d08b99d60f6f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -211,6 +211,12 @@ * 7.39 * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures + * + * 7.40 + * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag + * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag + * - add FUSE_NO_EXPORT_SUPPORT init flag + * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag */ #ifndef _LINUX_FUSE_H @@ -246,7 +252,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 39 +#define FUSE_KERNEL_MINOR_VERSION 40 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -353,6 +359,7 @@ struct fuse_file_lock { * FOPEN_STREAM: the file is stream-like (no file position at all) * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode + * FOPEN_PASSTHROUGH: passthrough read/write io for this open file */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) @@ -361,6 +368,7 @@ struct fuse_file_lock { #define FOPEN_STREAM (1 << 4) #define FOPEN_NOFLUSH (1 << 5) #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) +#define FOPEN_PASSTHROUGH (1 << 7) /** * INIT request/reply flags @@ -410,6 +418,9 @@ struct fuse_file_lock { * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. + * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support + * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit + * of the request ID indicates resend requests */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -449,6 +460,9 @@ struct fuse_file_lock { #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) +#define FUSE_PASSTHROUGH (1ULL << 37) +#define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) +#define FUSE_HAS_RESEND (1ULL << 39) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP @@ -635,6 +649,7 @@ enum fuse_notify_code { FUSE_NOTIFY_STORE = 4, FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_CODE_MAX, }; @@ -761,7 +776,7 @@ struct fuse_create_in { struct fuse_open_out { uint64_t fh; uint32_t open_flags; - uint32_t padding; + int32_t backing_id; }; struct fuse_release_in { @@ -877,7 +892,8 @@ struct fuse_init_out { uint16_t max_pages; uint16_t map_alignment; uint32_t flags2; - uint32_t unused[7]; + uint32_t max_stack_depth; + uint32_t unused[6]; }; #define CUSE_INIT_INFO_MAX 4096 @@ -960,6 +976,14 @@ struct fuse_fallocate_in { uint32_t padding; }; +/** + * FUSE request unique ID flag + * + * Indicates whether this is a resend request. The receiver should handle this + * request accordingly. + */ +#define FUSE_UNIQUE_RESEND (1ULL << 63) + struct fuse_in_header { uint32_t len; uint32_t opcode; @@ -1049,9 +1073,18 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_backing_map { + int32_t fd; + uint32_t flags; + uint64_t padding; +}; + /* Device ioctls: */ #define FUSE_DEV_IOC_MAGIC 229 #define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) +#define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ + struct fuse_backing_map) +#define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) struct fuse_lseek_in { uint64_t fh;