From 40488cc16f7ea0d193a4e248f0d809c25cc377db Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 12 Feb 2024 19:11:47 -0500 Subject: [PATCH 01/38] virtiofs: forbid newlines in tags Newlines in virtiofs tags are awkward for users and potential vectors for string injection attacks. Signed-off-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 5f1be1da92ce..d84dacbdce2c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -323,6 +323,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) return -ENOMEM; memcpy(fs->tag, tag_buf, len); fs->tag[len] = '\0'; + + /* While the VIRTIO specification allows any character, newlines are + * awkward on mount(8) command-lines and cause problems in the sysfs + * "tag" attr and uevent TAG= properties. Forbid them. + */ + if (strchr(fs->tag, '\n')) { + dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n"); + return -EINVAL; + } + return 0; } From a8f62f50b4e4ea92a938fca2ec1bd108d7f210e9 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 12 Feb 2024 19:11:48 -0500 Subject: [PATCH 02/38] virtiofs: export filesystem tags through sysfs The virtiofs filesystem is mounted using a "tag" which is exported by the virtiofs device: # mount -t virtiofs /mnt The virtiofs driver knows about all the available tags but these are currently not exported to user space. People have asked for these tags to be exported to user space. Most recently Lennart Poettering has asked for it as he wants to scan the tags and mount virtiofs automatically in certain cases. https://gitlab.com/virtio-fs/virtiofsd/-/issues/128 This patch exports tags at /sys/fs/virtiofs//tag where N is the id of the virtiofs device. The filesystem tag can be obtained by reading this "tag" file. There is also a symlink at /sys/fs/virtiofs//device that points to the virtiofs device that exports this tag. This patch converts the existing struct virtio_fs into a full kobject. It already had a refcount so it's an easy change. The virtio_fs objects can then be exposed in a kset at /sys/fs/virtiofs/. Note that virtio_fs objects may live slightly longer than we wish for them to be exposed to userspace, so kobject_del() is called explicitly when the underlying virtio_device is removed. The virtio_fs object is freed when all references are dropped (e.g. active mounts) but disappears as soon as the virtiofs device is gone. Originally-by: Vivek Goyal Signed-off-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- Documentation/ABI/testing/sysfs-fs-virtiofs | 11 ++ fs/fuse/virtio_fs.c | 112 ++++++++++++++++---- 2 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-fs-virtiofs diff --git a/Documentation/ABI/testing/sysfs-fs-virtiofs b/Documentation/ABI/testing/sysfs-fs-virtiofs new file mode 100644 index 000000000000..4839dbce997e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-virtiofs @@ -0,0 +1,11 @@ +What: /sys/fs/virtiofs//tag +Date: Feb 2024 +Contact: virtio-fs@lists.linux.dev +Description: + [RO] The mount "tag" that can be used to mount this filesystem. + +What: /sys/fs/virtiofs//device +Date: Feb 2024 +Contact: virtio-fs@lists.linux.dev +Description: + Symlink to the virtio device that exports this filesystem. diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index d84dacbdce2c..99b6113bbd13 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -31,6 +31,9 @@ static DEFINE_MUTEX(virtio_fs_mutex); static LIST_HEAD(virtio_fs_instances); +/* The /sys/fs/virtio_fs/ kset */ +static struct kset *virtio_fs_kset; + enum { VQ_HIPRIO, VQ_REQUEST @@ -55,7 +58,7 @@ struct virtio_fs_vq { /* A virtio-fs device instance */ struct virtio_fs { - struct kref refcount; + struct kobject kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -161,18 +164,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) complete(&fsvq->in_flight_zero); } -static void release_virtio_fs_obj(struct kref *ref) +static ssize_t tag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { - struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + return sysfs_emit(buf, fs->tag); +} + +static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); + +static struct attribute *virtio_fs_attrs[] = { + &virtio_fs_tag_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(virtio_fs); + +static void virtio_fs_ktype_release(struct kobject *kobj) +{ + struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); kfree(vfs->vqs); kfree(vfs); } +static const struct kobj_type virtio_fs_ktype = { + .release = virtio_fs_ktype_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = virtio_fs_groups, +}; + /* Make sure virtiofs_mutex is held */ static void virtio_fs_put(struct virtio_fs *fs) { - kref_put(&fs->refcount, release_virtio_fs_obj); + kobject_put(&fs->kobj); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) @@ -243,25 +268,44 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } /* Add a new instance to the list or return -EEXIST if tag name exists*/ -static int virtio_fs_add_instance(struct virtio_fs *fs) +static int virtio_fs_add_instance(struct virtio_device *vdev, + struct virtio_fs *fs) { struct virtio_fs *fs2; - bool duplicate = false; + int ret; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs2, &virtio_fs_instances, list) { - if (strcmp(fs->tag, fs2->tag) == 0) - duplicate = true; + if (strcmp(fs->tag, fs2->tag) == 0) { + mutex_unlock(&virtio_fs_mutex); + return -EEXIST; + } } - if (!duplicate) - list_add_tail(&fs->list, &virtio_fs_instances); + /* Use the virtio_device's index as a unique identifier, there is no + * need to allocate our own identifiers because the virtio_fs instance + * is only visible to userspace as long as the underlying virtio_device + * exists. + */ + fs->kobj.kset = virtio_fs_kset; + ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); + if (ret < 0) { + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); + if (ret < 0) { + kobject_del(&fs->kobj); + mutex_unlock(&virtio_fs_mutex); + return ret; + } + + list_add_tail(&fs->list, &virtio_fs_instances); mutex_unlock(&virtio_fs_mutex); - if (duplicate) - return -EEXIST; return 0; } @@ -274,7 +318,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag) list_for_each_entry(fs, &virtio_fs_instances, list) { if (strcmp(fs->tag, tag) == 0) { - kref_get(&fs->refcount); + kobject_get(&fs->kobj); goto found; } } @@ -875,7 +919,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) fs = kzalloc(sizeof(*fs), GFP_KERNEL); if (!fs) return -ENOMEM; - kref_init(&fs->refcount); + kobject_init(&fs->kobj, &virtio_fs_ktype); vdev->priv = fs; ret = virtio_fs_read_tag(vdev, fs); @@ -897,7 +941,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) */ virtio_device_ready(vdev); - ret = virtio_fs_add_instance(fs); + ret = virtio_fs_add_instance(vdev, fs); if (ret < 0) goto out_vqs; @@ -906,11 +950,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); - kfree(fs->vqs); out: vdev->priv = NULL; - kfree(fs); + kobject_put(&fs->kobj); return ret; } @@ -934,6 +977,8 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + sysfs_remove_link(&fs->kobj, "device"); + kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); @@ -1520,21 +1565,43 @@ static struct file_system_type virtio_fs_type = { .kill_sb = virtio_kill_sb, }; +static int __init virtio_fs_sysfs_init(void) +{ + virtio_fs_kset = kset_create_and_add("virtiofs", NULL, fs_kobj); + if (!virtio_fs_kset) + return -ENOMEM; + return 0; +} + +static void __exit virtio_fs_sysfs_exit(void) +{ + kset_unregister(virtio_fs_kset); + virtio_fs_kset = NULL; +} + static int __init virtio_fs_init(void) { int ret; - ret = register_virtio_driver(&virtio_fs_driver); + ret = virtio_fs_sysfs_init(); if (ret < 0) return ret; + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + goto sysfs_exit; + ret = register_filesystem(&virtio_fs_type); - if (ret < 0) { - unregister_virtio_driver(&virtio_fs_driver); - return ret; - } + if (ret < 0) + goto unregister_virtio_driver; return 0; + +unregister_virtio_driver: + unregister_virtio_driver(&virtio_fs_driver); +sysfs_exit: + virtio_fs_sysfs_exit(); + return ret; } module_init(virtio_fs_init); @@ -1542,6 +1609,7 @@ static void __exit virtio_fs_exit(void) { unregister_filesystem(&virtio_fs_type); unregister_virtio_driver(&virtio_fs_driver); + virtio_fs_sysfs_exit(); } module_exit(virtio_fs_exit); From 9086b2d9e9f3da0b0f939aa1d7ff74e9bf5b54c8 Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Mon, 12 Feb 2024 19:11:49 -0500 Subject: [PATCH 03/38] virtiofs: emit uevents on filesystem events Alyssa Ross requested that virtiofs notifies userspace when filesytems become available. This can be used to detect when a filesystem with a given tag is hotplugged, for example. uevents allow userspace to detect changes without resorting to polling. The tag is included as a uevent property so it's easy for userspace to identify the filesystem in question even when the sysfs directory goes away during removal. Here are example uevents: # udevadm monitor -k -p KERNEL[111.113221] add /fs/virtiofs/2 (virtiofs) ACTION=add DEVPATH=/fs/virtiofs/2 SUBSYSTEM=virtiofs TAG=test KERNEL[165.527167] remove /fs/virtiofs/2 (virtiofs) ACTION=remove DEVPATH=/fs/virtiofs/2 SUBSYSTEM=virtiofs TAG=test Signed-off-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 99b6113bbd13..62a44603740c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -306,6 +306,8 @@ static int virtio_fs_add_instance(struct virtio_device *vdev, mutex_unlock(&virtio_fs_mutex); + kobject_uevent(&fs->kobj, KOBJ_ADD); + return 0; } @@ -1565,9 +1567,22 @@ static struct file_system_type virtio_fs_type = { .kill_sb = virtio_kill_sb, }; +static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + add_uevent_var(env, "TAG=%s", fs->tag); + return 0; +} + +static const struct kset_uevent_ops virtio_fs_uevent_ops = { + .uevent = virtio_fs_uevent, +}; + static int __init virtio_fs_sysfs_init(void) { - virtio_fs_kset = kset_create_and_add("virtiofs", NULL, fs_kobj); + virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops, + fs_kobj); if (!virtio_fs_kset) return -ENOMEM; return 0; From 9511176bbaee0ac60ecc84e7b01cf5972a59ea17 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 12 Dec 2023 14:33:23 +0100 Subject: [PATCH 04/38] fuse: fix VM_MAYSHARE and direct_io_allow_mmap There were multiple issues with direct_io_allow_mmap: - fuse_link_write_file() was missing, resulting in warnings in fuse_write_file_get() and EIO from msync() - "vma->vm_ops = &fuse_file_vm_ops" was not set, but especially fuse_page_mkwrite is needed. The semantics of invalidate_inode_pages2() is so far not clearly defined in fuse_file_mmap. It dates back to commit 3121bfe76311 ("fuse: fix "direct_io" private mmap") Though, as direct_io_allow_mmap is a new feature, that was for MAP_PRIVATE only. As invalidate_inode_pages2() is calling into fuse_launder_folio() and writes out dirty pages, it should be safe to call invalidate_inode_pages2 for MAP_PRIVATE and MAP_SHARED as well. Cc: Hao Xu Cc: stable@vger.kernel.org Fixes: e78662e818f9 ("fuse: add a new fuse init flag to relax restrictions in no cache mode") Signed-off-by: Bernd Schubert Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 148a71b8b4d0..b9cff9b6ca1b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2468,7 +2468,8 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return fuse_dax_mmap(file, vma); if (ff->open_flags & FOPEN_DIRECT_IO) { - /* Can't provide the coherency needed for MAP_SHARED + /* + * Can't provide the coherency needed for MAP_SHARED * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set. */ if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap) @@ -2476,7 +2477,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) invalidate_inode_pages2(file->f_mapping); - return generic_file_mmap(file, vma); + if (!(vma->vm_flags & VM_MAYSHARE)) { + /* MAP_PRIVATE */ + return generic_file_mmap(file, vma); + } } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) From 699cf8246ee4c2c524f18c2e395909d16e7fda1b Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Tue, 22 Aug 2023 21:48:18 +0200 Subject: [PATCH 05/38] fuse: create helper function if DIO write needs exclusive lock This makes the code a bit easier to read and allows to more easily add more conditions when an exclusive lock is needed. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 63 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index b9cff9b6ca1b..66b3c79ed385 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1299,6 +1299,47 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* + * Combination of page access and direct-io is difficult, shared locks + * actually introduce a conflict. + */ + if (get_fuse_conn(inode)->direct_io_allow_mmap) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1558,26 +1599,12 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); + bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from); /* * Take exclusive lock if @@ -1591,10 +1618,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) else { inode_lock_shared(inode); - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. + /* + * Previous check was without any lock and might have raced. */ - if (fuse_direct_write_extending_i_size(iocb, from)) { + if (fuse_io_past_eof(iocb, from)) { inode_unlock_shared(inode); inode_lock(inode); exclusive_lock = true; From 9bbb6717dfd286a2861ca33273f4d7c3e65423b0 Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Sun, 24 Dec 2023 00:05:53 +0100 Subject: [PATCH 06/38] fuse: add fuse_dio_lock/unlock helper functions So far this is just a helper to remove complex locking logic out of fuse_direct_write_iter. Especially needed by the next patch in the series to that adds the fuse inode cache IO mode and adds in even more locking complexity. Signed-off-by: Bernd Schubert Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 61 ++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 66b3c79ed385..0115428c33a2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1340,6 +1340,37 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from return false; } +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * Previous check was without inode lock and might have raced, + * check again. + */ + if (fuse_io_past_eof(iocb, from)) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct inode *inode, bool exclusive) +{ + if (exclusive) { + inode_unlock(inode); + } else { + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1604,30 +1635,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = fuse_dio_wr_exclusive_lock(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* - * Previous check was without any lock and might have raced. - */ - if (fuse_io_past_eof(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { @@ -1638,10 +1648,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(inode, exclusive); return res; } From 0c9d708953d02f74cea05a01cf3e2c8f5a9fbaf4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 13:48:59 +0200 Subject: [PATCH 07/38] fuse: factor out helper fuse_truncate_update_attr() fuse_finish_open() is called from fuse_open_common() and from fuse_create_open(). In the latter case, the O_TRUNC flag is always cleared in finish_open()m before calling into fuse_finish_open(). Move the bits that update attribute cache post O_TRUNC open into a helper and call this helper from fuse_open_common() directly. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0115428c33a2..1b403350a5d5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -205,30 +205,31 @@ void fuse_finish_open(struct inode *inode, struct file *file) else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -251,15 +252,18 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_set_nowrite(inode); err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) + if (!err) { fuse_finish_open(inode, file); + if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { struct fuse_file *ff = file->private_data; - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); From e26ee4efbc79610b20e7abe9d96c87f33dacc1ff Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 15:30:05 +0200 Subject: [PATCH 08/38] fuse: allocate ff->release_args only if release is needed This removed the need to pass isdir argument to fuse_put_file(). Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- fs/fuse/file.c | 69 +++++++++++++++++++++++++++--------------------- fs/fuse/fuse_i.h | 2 +- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d19cbf34c634..b8fc3a6b87fe 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -630,7 +630,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 1b403350a5d5..bdda420f5acf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -56,7 +56,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -65,11 +65,13 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); + if (!ff->release_args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); @@ -105,14 +107,14 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = ff->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -132,15 +134,16 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { + if (open) { struct fuse_open_out outarg; int err; @@ -148,11 +151,13 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->release_args); + ff->release_args = NULL; if (isdir) fc->no_opendir = 1; else @@ -278,7 +283,7 @@ out_inode_unlock: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; @@ -296,6 +301,9 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -305,6 +313,13 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, @@ -314,14 +329,12 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -332,7 +345,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -367,12 +380,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -929,7 +938,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, put_page(page); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1705,7 +1714,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) __free_page(ap->pages[i]); if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); @@ -1947,7 +1956,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -2345,7 +2354,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..daf7036cd692 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1036,7 +1036,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); From 7de64d521bf92396b7da8ae0600188ea5d75a4c9 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:30:30 +0200 Subject: [PATCH 09/38] fuse: break up fuse_open_common() fuse_open_common() has a lot of code relevant only for regular files and O_TRUNC in particular. Copy the little bit of remaining code into fuse_dir_open() and stop using this common helper for directory open. Also split out fuse_dir_finish_open() from fuse_finish_open() before we add inode io modes to fuse_finish_open(). Suggested-by: Miklos Szeredi Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 25 ++++++++++++++++++++++++- fs/fuse/file.c | 9 ++------- fs/fuse/fuse_i.h | 5 ----- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b8fc3a6b87fe..66873be14670 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1630,7 +1630,30 @@ out_err: static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bdda420f5acf..c7f21b439d2b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -227,7 +227,7 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; @@ -256,7 +256,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); + err = fuse_do_open(fm, get_node_id(inode), file, false); if (!err) { fuse_finish_open(inode, file); if (is_truncate) @@ -354,11 +354,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index daf7036cd692..8adbdcfab7e3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1031,11 +1031,6 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); From d2c487f150ae00e3cb9faf57aceacc584e0a130c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 15:38:06 +0200 Subject: [PATCH 10/38] fuse: prepare for failing open response In preparation for inode io modes, a server open response could fail due to conflicting inode io modes. Allow returning an error from fuse_finish_open() and handle the error in the callers. fuse_finish_open() is used as the callback of finish_open(), so that FMODE_OPENED will not be set if fuse_finish_open() fails. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 8 +++++--- fs/fuse/file.c | 15 ++++++++++----- fs/fuse/fuse_i.h | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 66873be14670..1225df59d9b7 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -692,13 +692,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c7f21b439d2b..27f50b4bd7f3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -200,7 +200,7 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); @@ -212,6 +212,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } static void fuse_truncate_update_attr(struct inode *inode, struct file *file) @@ -230,7 +232,9 @@ static void fuse_truncate_update_attr(struct inode *inode, struct file *file) static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; bool is_wb_truncate = is_truncate && fc->writeback_cache; @@ -258,16 +262,17 @@ static int fuse_open(struct inode *inode, struct file *file) err = fuse_do_open(fm, get_node_id(inode), file, false); if (!err) { - fuse_finish_open(inode, file); - if (is_truncate) + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) fuse_truncate_update_attr(inode, file); } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8adbdcfab7e3..8cabce8e4282 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1033,7 +1033,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); From cb098dd24bab8a315aa00bab1ccddb6be872156d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 1 Feb 2024 16:26:15 +0200 Subject: [PATCH 11/38] fuse: introduce inode io modes The fuse inode io mode is determined by the mode of its open files/mmaps and parallel dio opens and expressed in the value of fi->iocachectr: > 0 - caching io: files open in caching mode or mmap on direct_io file < 0 - parallel dio: direct io mode with parallel dio writes enabled == 0 - direct io: no files open in caching mode and no files mmaped Note that iocachectr value of 0 might become positive or negative, while non-parallel dio is getting processed. direct_io mmap uses page cache, so first mmap will mark the file as ff->io_opened and increment fi->iocachectr to enter the caching io mode. If the server opens the file in caching mode while it is already open for parallel dio or vice versa the open fails. This allows executing parallel dio when inode is not in caching mode and no mmaps have been performed on the inode in question. Signed-off-by: Bernd Schubert Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/Makefile | 1 + fs/fuse/file.c | 15 +++++ fs/fuse/fuse_i.h | 17 ++++- fs/fuse/iomode.c | 158 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 fs/fuse/iomode.c diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d..b734cc2a5e65 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 27f50b4bd7f3..9af356fbc578 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -113,6 +113,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) struct fuse_release_args *ra = ff->release_args; struct fuse_args *args = (ra ? &ra->args : NULL); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + if (!args) { /* Do nothing when server does not implement 'open' */ } else if (sync) { @@ -204,6 +207,11 @@ int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); @@ -2509,6 +2517,7 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + int rc; /* DAX mmap is superior to direct_io mmap */ if (FUSE_IS_DAX(file_inode(file))) @@ -2528,6 +2537,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) /* MAP_PRIVATE */ return generic_file_mmap(file, vma); } + + /* First mmap of direct_io file enters caching inode io mode. */ + rc = fuse_file_cached_io_start(file_inode(file), ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -3296,6 +3310,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8cabce8e4282..57ee9afefee2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -111,7 +111,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -123,6 +123,9 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; @@ -187,6 +190,8 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; @@ -244,6 +249,9 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + /** Has flock been performed on this file? */ bool flock:1; }; @@ -1343,8 +1351,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); + +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 000000000000..a1a836b2aacc --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include +#include +#include +#include + +/* + * Start cached io mode, where parallel dio writes are not allowed. + */ +int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + + /* There are no io modes if server does not implement open */ + if (!ff->release_args) + return 0; + + spin_lock(&fi->lock); + if (fi->iocachectr < 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +static int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err = 0; + + spin_lock(&fi->lock); + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + WARN_ON(ff->iomode != IOM_NONE); + fi->iocachectr--; + ff->iomode = IOM_UNCACHED; +unlock: + spin_unlock(&fi->lock); + return err; +} + +static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fi->iocachectr++; + spin_unlock(&fi->lock); +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->release_args) + return 0; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First parallel dio open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if (ff->open_flags & FOPEN_DIRECT_IO) { + if (ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) + err = fuse_file_uncached_io_start(inode, ff); + else + return 0; + } else { + err = fuse_file_cached_io_start(inode, ff); + } + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + /* + * Last parallel dio close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_end(inode, ff); + break; + case IOM_CACHED: + fuse_file_cached_io_end(inode, ff); + break; + } +} From 205c1d8026835746d8597e1aa70c370e014e83fa Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 16:54:37 +0200 Subject: [PATCH 12/38] fuse: allow parallel dio writes with FUSE_DIRECT_IO_ALLOW_MMAP Instead of denying caching mode on parallel dio open, deny caching open only while parallel dio are in-progress and wait for in-progress parallel dio writes before entering inode caching io mode. This allows executing parallel dio when inode is not in caching mode even if shared mmap is allowed, but no mmaps have been performed on the inode in question. An mmap on direct_io file now waits for all in-progress parallel dio writes to complete, so parallel dio writes together with FUSE_DIRECT_IO_ALLOW_MMAP is enabled by this commit. Signed-off-by: Bernd Schubert Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 41 +++++++++++++++++++++++++++++------------ fs/fuse/fuse_i.h | 5 +++++ fs/fuse/iomode.c | 48 ++++++++++++++++++++++++++++++------------------ 3 files changed, 64 insertions(+), 30 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9af356fbc578..0a551a147673 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1335,6 +1335,7 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); /* Server side has to advise that it supports parallel dio writes. */ if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) @@ -1347,12 +1348,9 @@ static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from if (iocb->ki_flags & IOCB_APPEND) return true; - /* - * Combination of page access and direct-io is difficult, shared locks - * actually introduce a conflict. - */ - if (get_fuse_conn(inode)->direct_io_allow_mmap) - return true; + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return false; /* Parallel dio beyond EOF is not supported, at least for now. */ if (fuse_io_past_eof(iocb, from)) @@ -1365,6 +1363,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, bool *exclusive) { struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); if (*exclusive) { @@ -1372,10 +1371,14 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, } else { inode_lock_shared(inode); /* - * Previous check was without inode lock and might have raced, - * check again. + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. */ - if (fuse_io_past_eof(iocb, from)) { + if (fuse_io_past_eof(iocb, from) || + fuse_file_uncached_io_start(inode, ff) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -1383,11 +1386,16 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, } } -static void fuse_dio_unlock(struct inode *inode, bool exclusive) +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) { + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_file *ff = iocb->ki_filp->private_data; + if (exclusive) { inode_unlock(inode); } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_file_uncached_io_end(inode, ff); inode_unlock_shared(inode); } } @@ -1669,7 +1677,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - fuse_dio_unlock(inode, exclusive); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -2523,6 +2531,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED @@ -2538,7 +2550,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } - /* First mmap of direct_io file enters caching inode io mode. */ + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + */ rc = fuse_file_cached_io_start(file_inode(file), ff); if (rc) return rc; @@ -3312,6 +3328,7 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->writectr = 0; fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 57ee9afefee2..2e7af883e4b3 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -129,6 +129,9 @@ struct fuse_inode { /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -1353,6 +1356,8 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, /* iomode.c */ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff); +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); int fuse_file_io_open(struct file *file, struct inode *inode); void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index a1a836b2aacc..ea47c76b9df1 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -13,21 +13,37 @@ #include /* - * Start cached io mode, where parallel dio writes are not allowed. + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0; +} + +/* + * Start cached io mode. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. */ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); - int err = 0; /* There are no io modes if server does not implement open */ if (!ff->release_args) return 0; spin_lock(&fi->lock); - if (fi->iocachectr < 0) { - err = -ETXTBSY; - goto unlock; + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); } WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { @@ -36,9 +52,8 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); fi->iocachectr++; } -unlock: spin_unlock(&fi->lock); - return err; + return 0; } static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) @@ -56,7 +71,7 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -static int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; @@ -74,7 +89,7 @@ unlock: return err; } -static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) +void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -83,6 +98,8 @@ static void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fi->iocachectr++; + if (!fi->iocachectr) + wake_up(&fi->direct_io_waitq); spin_unlock(&fi->lock); } @@ -106,21 +123,16 @@ int fuse_file_io_open(struct file *file, struct inode *inode) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* - * First parallel dio open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ - if (ff->open_flags & FOPEN_DIRECT_IO) { - if (ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) - err = fuse_file_uncached_io_start(inode, ff); - else - return 0; - } else { - err = fuse_file_cached_io_start(inode, ff); - } + if (ff->open_flags & FOPEN_DIRECT_IO) + return 0; + + err = fuse_file_cached_io_start(inode, ff); if (err) goto fail; From aed918310ea2542059eeab6c74defca95c30f77b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 31 Aug 2023 23:07:55 +0300 Subject: [PATCH 13/38] fuse: factor out helper for FUSE_DEV_IOC_CLONE In preparation to adding more fuse dev ioctls. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 59 ++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 26 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1a8f82f478cb..eba68b57bd7c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2251,43 +2251,50 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; + if (get_user(oldfd, argp)) + return -EFAULT; + + f = fdget(oldfd); + if (!f.file) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (f.file->f_op == file->f_op) + fud = fuse_get_dev(f.file); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + + fdput(f); + return res; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + switch (cmd) { case FUSE_DEV_IOC_CLONE: - if (get_user(oldfd, (__u32 __user *)arg)) - return -EFAULT; + return fuse_dev_ioctl_clone(file, argp); - f = fdget(oldfd); - if (!f.file) - return -EINVAL; - - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); - - res = -EINVAL; - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fdput(f); - break; default: - res = -ENOTTY; - break; + return -ENOTTY; } - return res; } const struct file_operations fuse_dev_operations = { From 7dc4e97a4f9a55bae6ed6ab3f96c92921259d59f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 16:57:17 +0200 Subject: [PATCH 14/38] fuse: introduce FUSE_PASSTHROUGH capability FUSE_PASSTHROUGH capability to passthrough FUSE operations to backing files will be made available with kernel config CONFIG_FUSE_PASSTHROUGH. When requesting FUSE_PASSTHROUGH, userspace needs to specify the max_stack_depth that is allowed for FUSE on top of backing files. Introduce the flag FOPEN_PASSTHROUGH and backing_id to fuse_open_out argument that can be used when replying to OPEN request, to setup passthrough of io operations on the fuse inode to a backing file. Introduce a refcounted fuse_backing object that will be used to associate an open backing file with a fuse inode. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/Kconfig | 11 ++++++++++ fs/fuse/Makefile | 1 + fs/fuse/fuse_i.h | 42 +++++++++++++++++++++++++++++++++++++++ fs/fuse/inode.c | 26 ++++++++++++++++++++++++ fs/fuse/passthrough.c | 30 ++++++++++++++++++++++++++++ include/uapi/linux/fuse.h | 14 ++++++++++--- 6 files changed, 121 insertions(+), 3 deletions(-) create mode 100644 fs/fuse/passthrough.c diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5..8674dbfbe59d 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,14 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_PASSTHROUGH + bool "FUSE passthrough operations support" + default y + depends on FUSE_FS + select FS_STACK + help + This allows bypassing FUSE server by mapping specific FUSE operations + to be performed directly on a backing file. + + If you want to allow passthrough operations, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index b734cc2a5e65..6e0228c6d0cb 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,5 +10,6 @@ obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 2e7af883e4b3..19a86acc9dd0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -76,6 +76,15 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** Container for data related to mapping to backing file */ +struct fuse_backing { + struct file *file; + + /** refcount */ + refcount_t count; + struct rcu_head rcu; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -179,6 +188,10 @@ struct fuse_inode { #endif /** Submount specific lookup tracking */ struct fuse_submount_lookup *submount_lookup; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct fuse_backing *fb; +#endif }; /** FUSE inode state bits */ @@ -829,6 +842,12 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /** Passthrough support for read/write IO */ + unsigned int passthrough:1; + + /** Maximum stack depth for passthrough backing files */ + int max_stack_depth; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -1368,4 +1387,27 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); +void fuse_backing_put(struct fuse_backing *fb); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..c771bd3c1336 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + return &fi->inode; out_free_forget: @@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode) #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + kmem_cache_free(fuse_inode_cachep, fi); } @@ -1284,6 +1290,24 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1339,6 +1363,8 @@ void fuse_send_init(struct fuse_mount *fm) #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 000000000000..e8639c0a9ac6 --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + if (fb->file) + fput(fb->file); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index e7418d15fe39..7bb6219cfda0 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -211,6 +211,10 @@ * 7.39 * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures + * + * 7.40 + * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag + * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag */ #ifndef _LINUX_FUSE_H @@ -246,7 +250,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 39 +#define FUSE_KERNEL_MINOR_VERSION 40 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -353,6 +357,7 @@ struct fuse_file_lock { * FOPEN_STREAM: the file is stream-like (no file position at all) * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode + * FOPEN_PASSTHROUGH: passthrough read/write io for this open file */ #define FOPEN_DIRECT_IO (1 << 0) #define FOPEN_KEEP_CACHE (1 << 1) @@ -361,6 +366,7 @@ struct fuse_file_lock { #define FOPEN_STREAM (1 << 4) #define FOPEN_NOFLUSH (1 << 5) #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) +#define FOPEN_PASSTHROUGH (1 << 7) /** * INIT request/reply flags @@ -449,6 +455,7 @@ struct fuse_file_lock { #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) +#define FUSE_PASSTHROUGH (1ULL << 37) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP @@ -761,7 +768,7 @@ struct fuse_create_in { struct fuse_open_out { uint64_t fh; uint32_t open_flags; - uint32_t padding; + int32_t backing_id; }; struct fuse_release_in { @@ -877,7 +884,8 @@ struct fuse_init_out { uint16_t max_pages; uint16_t map_alignment; uint32_t flags2; - uint32_t unused[7]; + uint32_t max_stack_depth; + uint32_t unused[6]; }; #define CUSE_INIT_INFO_MAX 4096 From 44350256ab943d424d70aa60a34f45060b3a36e8 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 11 Sep 2023 17:09:27 +0300 Subject: [PATCH 15/38] fuse: implement ioctls to manage backing files FUSE server calls the FUSE_DEV_IOC_BACKING_OPEN ioctl with a backing file descriptor. If the call succeeds, a backing file identifier is returned. A later change will be using this backing file id in a reply to OPEN request with the flag FOPEN_PASSTHROUGH to setup passthrough of file operations on the open FUSE file to the backing file. The FUSE server should call FUSE_DEV_IOC_BACKING_CLOSE ioctl to close the backing file by its id. This can be done at any time, but if an open reply with FOPEN_PASSTHROUGH flag is still in progress, the open may fail if the backing file is closed before the fuse file was opened. Setting up backing files requires a server with CAP_SYS_ADMIN privileges. For the backing file to be successfully setup, the backing file must implement both read_iter and write_iter file operations. The limitation on the level of filesystem stacking allowed for the backing file is enforced before setting up the backing file. Signed-off-by: Alessio Balsini Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 41 ++++++++++++ fs/fuse/fuse_i.h | 10 +++ fs/fuse/inode.c | 5 ++ fs/fuse/passthrough.c | 136 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fuse.h | 9 +++ 5 files changed, 201 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eba68b57bd7c..5a5fb56deb92 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2283,6 +2283,41 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) return res; } +static long fuse_dev_ioctl_backing_open(struct file *file, + struct fuse_backing_map __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_backing_map map; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (copy_from_user(&map, argp, sizeof(map))) + return -EFAULT; + + return fuse_backing_open(fud->fc, &map); +} + +static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + int backing_id; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (get_user(backing_id, argp)) + return -EFAULT; + + return fuse_backing_close(fud->fc, backing_id); +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2292,6 +2327,12 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_CLONE: return fuse_dev_ioctl_clone(file, argp); + case FUSE_DEV_IOC_BACKING_OPEN: + return fuse_dev_ioctl_backing_open(file, argp); + + case FUSE_DEV_IOC_BACKING_CLOSE: + return fuse_dev_ioctl_backing_close(file, argp); + default: return -ENOTTY; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 19a86acc9dd0..4f1681650f57 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -79,6 +79,7 @@ struct fuse_submount_lookup { /** Container for data related to mapping to backing file */ struct fuse_backing { struct file *file; + struct cred *cred; /** refcount */ refcount_t count; @@ -897,6 +898,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** IDR for backing files ids */ + struct idr backing_files_map; +#endif }; /* @@ -1409,5 +1415,9 @@ static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); void fuse_backing_put(struct fuse_backing *fb); +void fuse_backing_files_init(struct fuse_conn *fc); +void fuse_backing_files_free(struct fuse_conn *fc); +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); +int fuse_backing_close(struct fuse_conn *fc, int backing_id); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c771bd3c1336..0186df5fc9ea 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,6 +930,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; @@ -953,6 +956,8 @@ void fuse_conn_put(struct fuse_conn *fc) WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); fc->release(fc); } } diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index e8639c0a9ac6..7ec92a54c2e0 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -18,8 +18,11 @@ struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) static void fuse_backing_free(struct fuse_backing *fb) { + pr_debug("%s: fb=0x%p\n", __func__, fb); + if (fb->file) fput(fb->file); + put_cred(fb->cred); kfree_rcu(fb, rcu); } @@ -28,3 +31,136 @@ void fuse_backing_put(struct fuse_backing *fb) if (fb && refcount_dec_and_test(&fb->count)) fuse_backing_free(fb); } + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags) + goto out; + + file = fget(map->fd); + res = -EBADF; + if (!file) + goto out; + + res = -EOPNOTSUPP; + if (!file->f_op->read_iter || !file->f_op->write_iter) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 7bb6219cfda0..1162a47b6a42 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -1057,9 +1057,18 @@ struct fuse_notify_retrieve_in { uint64_t dummy4; }; +struct fuse_backing_map { + int32_t fd; + uint32_t flags; + uint64_t padding; +}; + /* Device ioctls: */ #define FUSE_DEV_IOC_MAGIC 229 #define FUSE_DEV_IOC_CLONE _IOR(FUSE_DEV_IOC_MAGIC, 0, uint32_t) +#define FUSE_DEV_IOC_BACKING_OPEN _IOW(FUSE_DEV_IOC_MAGIC, 1, \ + struct fuse_backing_map) +#define FUSE_DEV_IOC_BACKING_CLOSE _IOW(FUSE_DEV_IOC_MAGIC, 2, uint32_t) struct fuse_lseek_in { uint64_t fh; From fc8ff397b2a91590031ae08534de627f957005cb Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 17:03:55 +0200 Subject: [PATCH 16/38] fuse: prepare for opening file in passthrough mode In preparation for opening file in passthrough mode, store the fuse_open_out argument in ff->args to be passed into fuse_file_io_open() with the optional backing_id member. This will be used for setting up passthrough to backing file on open reply with FOPEN_PASSTHROUGH flag and a valid backing_id. Opening a file in passthrough mode may fail for several reasons, such as missing capability, conflicting open flags or inode in caching mode. Return EIO from fuse_file_io_open() in those cases. The combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO is allowed - it mean that read/write operations will go directly to the server, but mmap will be done to the backing file. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 12 +++++++----- fs/fuse/file.c | 34 +++++++++++++++------------------- fs/fuse/fuse_i.h | 19 ++++++++++++++++--- fs/fuse/iomode.c | 44 ++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 78 insertions(+), 31 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1225df59d9b7..25c7c97f774b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -615,7 +615,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; - struct fuse_open_out outopen; + struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; @@ -659,8 +659,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; - args.out_args[1].size = sizeof(outopen); - args.out_args[1].value = &outopen; + /* Store outarg for fuse_finish_open() */ + outopenp = &ff->args->open_outarg; + args.out_args[1].size = sizeof(*outopenp); + args.out_args[1].value = outopenp; err = get_create_ext(&args, dir, entry, mode); if (err) @@ -676,9 +678,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_invalid_attr(&outentry.attr)) goto out_free_ff; - ff->fh = outopen.fh; + ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; + ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 0a551a147673..47c1ecab7b5c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -50,12 +50,6 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -66,9 +60,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) ff->fm = fm; if (release) { - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { kfree(ff); return NULL; } @@ -87,7 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); + kfree(ff->args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -110,7 +103,7 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; struct fuse_args *args = (ra ? &ra->args : NULL); if (ra && ra->inode) @@ -147,20 +140,21 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); if (open) { - struct fuse_open_out outarg; + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { /* No release needed */ - kfree(ff->release_args); - ff->release_args = NULL; + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -299,7 +293,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -317,6 +311,8 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, if (!ra) return; + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -339,7 +335,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, open_flags, opcode, false); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4f1681650f57..40e94ef3d093 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -213,15 +213,15 @@ enum { struct fuse_conn; struct fuse_mount; -struct fuse_release_args; +union fuse_file_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_mount *fm; - /* Argument space reserved for release */ - struct fuse_release_args *release_args; + /* Argument space reserved for open/release */ + union fuse_file_args *args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -320,6 +320,19 @@ struct fuse_args_pages { unsigned int num_pages; }; +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +union fuse_file_args { + /* Used during open() */ + struct fuse_open_out open_outarg; + /* Used during release() */ + struct fuse_release_args release_args; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index ea47c76b9df1..2161bdf91db2 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -31,7 +31,7 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) struct fuse_inode *fi = get_fuse_inode(inode); /* There are no io modes if server does not implement open */ - if (!ff->release_args) + if (!ff->args) return 0; spin_lock(&fi->lock); @@ -103,6 +103,37 @@ void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) spin_unlock(&fi->lock); } +/* + * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. + * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write + * operations go directly to the server, but mmap is done on the backing file. + * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode + * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. + */ +#define FOPEN_PASSTHROUGH_MASK \ + (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ + FOPEN_NOFLUSH) + +static int fuse_file_passthrough_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + /* Check allowed conditions for file open in passthrough mode */ + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || + (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) + return -EINVAL; + + /* TODO: implement backing file open */ + return -EOPNOTSUPP; + + /* First passthrough file open denies caching inode io mode */ + err = fuse_file_uncached_io_start(inode, ff); + + return err; +} + /* Request access to submit new io to inode via open file */ int fuse_file_io_open(struct file *file, struct inode *inode) { @@ -113,7 +144,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode) * io modes are not relevant with DAX and with server that does not * implement open. */ - if (FUSE_IS_DAX(inode) || !ff->release_args) + if (FUSE_IS_DAX(inode) || !ff->args) return 0; /* @@ -123,16 +154,21 @@ int fuse_file_io_open(struct file *file, struct inode *inode) ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; /* + * First passthrough file open denies caching inode io mode. * First caching file open enters caching inode io mode. * * Note that if user opens a file open with O_DIRECT, but server did * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, * so we put the inode in caching mode to prevent parallel dio. */ - if (ff->open_flags & FOPEN_DIRECT_IO) + if ((ff->open_flags & FOPEN_DIRECT_IO) && + !(ff->open_flags & FOPEN_PASSTHROUGH)) return 0; - err = fuse_file_cached_io_start(inode, ff); + if (ff->open_flags & FOPEN_PASSTHROUGH) + err = fuse_file_passthrough_open(inode, file); + else + err = fuse_file_cached_io_start(inode, ff); if (err) goto fail; From 4a90451bbc7f7fde94041fbb9ca96dd915069943 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 17:14:50 +0200 Subject: [PATCH 17/38] fuse: implement open in passthrough mode After getting a backing file id with FUSE_DEV_IOC_BACKING_OPEN ioctl, a FUSE server can reply to an OPEN request with flag FOPEN_PASSTHROUGH and the backing file id. The FUSE server should reuse the same backing file id for all the open replies of the same FUSE inode and open will fail (with -EIO) if a the server attempts to open the same inode with conflicting io modes or to setup passthrough to two different backing files for the same FUSE inode. Using the same backing file id for several different inodes is allowed. Opening a new file with FOPEN_DIRECT_IO for an inode that is already open for passthrough is allowed, but only if the FOPEN_PASSTHROUGH flag and correct backing file id are specified as well. The read/write IO of such files will not use passthrough operations to the backing file, but mmap, which does not support direct_io, will use the backing file insead of using the page cache as it always did. Even though all FUSE passthrough files of the same inode use the same backing file as a backing inode reference, each FUSE file opens a unique instance of a backing_file object to store the FUSE path that was used to open the inode and the open flags of the specific open file. The per-file, backing_file object is released along with the FUSE file. The inode associated fuse_backing object is released when the last FUSE passthrough file of that inode is released AND when the backing file id is closed by the server using the FUSE_DEV_IOC_BACKING_CLOSE ioctl. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 ++++++- fs/fuse/fuse_i.h | 35 ++++++++++++++++++++++++- fs/fuse/iomode.c | 60 ++++++++++++++++++++++++++++++++++++++----- fs/fuse/passthrough.c | 59 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 8 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 47c1ecab7b5c..6fad381f3beb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -295,6 +295,9 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = &ff->args->release_args; + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); + /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { spin_lock(&fi->lock); @@ -1374,7 +1377,7 @@ static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, * have raced, so check it again. */ if (fuse_io_past_eof(iocb, from) || - fuse_file_uncached_io_start(inode, ff) != 0) { + fuse_file_uncached_io_start(inode, ff, NULL) != 0) { inode_unlock_shared(inode); inode_lock(inode); *exclusive = true; @@ -2527,6 +2530,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) if (FUSE_IS_DAX(file_inode(file))) return fuse_dax_mmap(file, vma); + /* TODO: implement mmap to backing file */ + if (fuse_file_passthrough(ff)) + return -ENODEV; + /* * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 40e94ef3d093..8fbf30fe3c3d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -269,6 +269,12 @@ struct fuse_file { /** Does file hold a fi->iocachectr refcount? */ enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct file *passthrough; + const struct cred *cred; +#endif + /** Has flock been performed on this file? */ bool flock:1; }; @@ -1394,7 +1400,7 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, /* iomode.c */ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff); -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff); +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb); void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff); int fuse_file_io_open(struct file *file, struct inode *inode); @@ -1426,11 +1432,38 @@ static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, #endif } +#ifdef CONFIG_FUSE_PASSTHROUGH struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); void fuse_backing_put(struct fuse_backing *fb); +#else + +static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + return NULL; +} + +static inline void fuse_backing_put(struct fuse_backing *fb) +{ +} +#endif + void fuse_backing_files_init(struct fuse_conn *fc); void fuse_backing_files_free(struct fuse_conn *fc); int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); int fuse_backing_close(struct fuse_conn *fc, int backing_id); +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id); +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); + +static inline struct file *fuse_file_passthrough(struct fuse_file *ff) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return ff->passthrough; +#else + return NULL; +#endif +} + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index 2161bdf91db2..c653ddcf0578 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -17,7 +17,7 @@ */ static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) { - return READ_ONCE(fi->iocachectr) < 0; + return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); } /* @@ -45,6 +45,17 @@ int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff) wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); spin_lock(&fi->lock); } + + /* + * Check if inode entered passthrough io mode while waiting for parallel + * dio write completion. + */ + if (fuse_inode_backing(fi)) { + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + return -ETXTBSY; + } + WARN_ON(ff->iomode == IOM_UNCACHED); if (ff->iomode == IOM_NONE) { ff->iomode = IOM_CACHED; @@ -71,12 +82,19 @@ static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff) } /* Start strictly uncached io mode where cache access is not allowed */ -int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) +int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb; int err = 0; spin_lock(&fi->lock); + /* deny conflicting backing files on same fuse inode */ + oldfb = fuse_inode_backing(fi); + if (oldfb && oldfb != fb) { + err = -EBUSY; + goto unlock; + } if (fi->iocachectr > 0) { err = -ETXTBSY; goto unlock; @@ -84,6 +102,14 @@ int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff) WARN_ON(ff->iomode != IOM_NONE); fi->iocachectr--; ff->iomode = IOM_UNCACHED; + + /* fuse inode holds a single refcount of backing file */ + if (!oldfb) { + oldfb = fuse_inode_backing_set(fi, fb); + WARN_ON_ONCE(oldfb != NULL); + } else { + fuse_backing_put(fb); + } unlock: spin_unlock(&fi->lock); return err; @@ -92,15 +118,20 @@ unlock: void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_backing *oldfb = NULL; spin_lock(&fi->lock); WARN_ON(fi->iocachectr >= 0); WARN_ON(ff->iomode != IOM_UNCACHED); ff->iomode = IOM_NONE; fi->iocachectr++; - if (!fi->iocachectr) + if (!fi->iocachectr) { wake_up(&fi->direct_io_waitq); + oldfb = fuse_inode_backing_set(fi, NULL); + } spin_unlock(&fi->lock); + if (oldfb) + fuse_backing_put(oldfb); } /* @@ -118,6 +149,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_backing *fb; int err; /* Check allowed conditions for file open in passthrough mode */ @@ -125,11 +157,18 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; - /* TODO: implement backing file open */ - return -EOPNOTSUPP; + fb = fuse_passthrough_open(file, inode, + ff->args->open_outarg.backing_id); + if (IS_ERR(fb)) + return PTR_ERR(fb); /* First passthrough file open denies caching inode io mode */ - err = fuse_file_uncached_io_start(inode, ff); + err = fuse_file_uncached_io_start(inode, ff, fb); + if (!err) + return 0; + + fuse_passthrough_release(ff, fb); + fuse_backing_put(fb); return err; } @@ -138,6 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) int fuse_file_io_open(struct file *file, struct inode *inode) { struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); int err; /* @@ -147,6 +187,14 @@ int fuse_file_io_open(struct file *file, struct inode *inode) if (FUSE_IS_DAX(inode) || !ff->args) return 0; + /* + * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode + * which is already open for passthrough. + */ + err = -EINVAL; + if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) + goto fail; + /* * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. */ diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 7ec92a54c2e0..dc054d2ab13e 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -8,6 +8,7 @@ #include "fuse_i.h" #include +#include struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) { @@ -164,3 +165,61 @@ out: return err; } + +/* + * Setup passthrough to a backing file. + * + * Returns an fb object with elevated refcount to be stored in fuse inode. + */ +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_backing *fb = NULL; + struct file *backing_file; + int err; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + err = -ENOENT; + if (!fb) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ + backing_file = backing_file_open(&file->f_path, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { + fuse_backing_put(fb); + goto out; + } + + err = 0; + ff->passthrough = backing_file; + ff->cred = get_cred(fb->cred); +out: + pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__, + backing_id, fb, ff->passthrough, err); + + return err ? ERR_PTR(err) : fb; +} + +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__, + fb, ff->passthrough); + + fput(ff->passthrough); + ff->passthrough = NULL; + put_cred(ff->cred); + ff->cred = NULL; +} From 57e1176e6086673d31bf0a0dc58e144c8e65e589 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 22 Nov 2023 18:26:04 +0200 Subject: [PATCH 18/38] fuse: implement read/write passthrough Use the backing file read/write helpers to implement read/write passthrough to a backing file. After read/write, we invalidate a/c/mtime/size attributes. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 18 +++++++---- fs/fuse/fuse_i.h | 3 ++ fs/fuse/passthrough.c | 69 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 6fad381f3beb..903bb71aac6d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1693,10 +1693,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1711,10 +1714,13 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_write_iter(iocb, from); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else + return fuse_cache_write_iter(iocb, from); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8fbf30fe3c3d..6cc7d0752f9f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1466,4 +1466,7 @@ static inline struct file *fuse_file_passthrough(struct fuse_file *ff) #endif } +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index dc054d2ab13e..0e5d316bdad3 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -10,6 +10,75 @@ #include #include +static void fuse_file_accessed(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_atime(inode); +} + +static void fuse_file_modified(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + inode_lock(inode); + ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + inode_unlock(inode); + + return ret; +} + struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) { if (fb && refcount_inc_not_zero(&fb->count)) From 5ca73468612d8e0767614992da8decc7f9f48926 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 13 Oct 2023 13:26:03 +0300 Subject: [PATCH 19/38] fuse: implement splice read/write passthrough This allows passing fstests generic/249 and generic/591. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 29 ++++++++++++++++++++++++++-- fs/fuse/fuse_i.h | 6 ++++++ fs/fuse/passthrough.c | 45 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 903bb71aac6d..3216dcf3ace0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1723,6 +1723,31 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return fuse_cache_write_iter(iocb, from); } +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); + else + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); +} + static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; @@ -3303,8 +3328,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6cc7d0752f9f..53ac73739788 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1468,5 +1468,11 @@ static inline struct file *fuse_file_passthrough(struct fuse_file *ff) ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 0e5d316bdad3..2b119c592f02 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -9,6 +9,7 @@ #include #include +#include static void fuse_file_accessed(struct file *file) { @@ -79,6 +80,50 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, return ret; } +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = in, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + return backing_file_splice_read(backing_file, ppos, pipe, len, flags, + &ctx); +} + +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct inode *inode = file_inode(out); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = out, + .end_write = fuse_file_modified, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + inode_lock(inode); + ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, + &ctx); + inode_unlock(inode); + + return ret; +} + struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) { if (fb && refcount_inc_not_zero(&fb->count)) From fda0b98ef0a6a2e3fe328b869d53002c8c82001b Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 9 Feb 2024 17:16:49 +0200 Subject: [PATCH 20/38] fuse: implement passthrough for mmap An mmap request for a file open in passthrough mode, maps the memory directly to the backing file. An mmap of a file in direct io mode, usually uses cached mmap and puts the inode in caching io mode, which denies new passthrough opens of that inode, because caching io mode is conflicting with passthrough io mode. For the same reason, trying to mmap a direct io file, while there is a passthrough file open on the same inode will fail with -ENODEV. An mmap of a file in direct io mode, also needs to wait for parallel dio writes in-progress to complete. If a passthrough file is opened, while an mmap of another direct io file is waiting for parallel dio writes to complete, the wait is aborted and mmap fails with -ENODEV. A FUSE server that uses passthrough and direct io opens on the same inode that may also be mmaped, is advised to provide a backing fd also for the files that are open in direct io mode (i.e. use the flags combination FOPEN_DIRECT_IO | FOPEN_PASSTHROUGH), so that mmap will always use the backing file, even if read/write do not passthrough. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 13 ++++++++++--- fs/fuse/fuse_i.h | 1 + fs/fuse/passthrough.c | 16 ++++++++++++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3216dcf3ace0..3ce158f70927 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2555,14 +2555,21 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); - /* TODO: implement mmap to backing file */ + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) return -ENODEV; /* @@ -2589,7 +2596,7 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) * Also waits for parallel dio writers to go into serial mode * (exclusive instead of shared lock). */ - rc = fuse_file_cached_io_start(file_inode(file), ff); + rc = fuse_file_cached_io_start(inode, ff); if (rc) return rc; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 53ac73739788..7bd3552b1e80 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1474,5 +1474,6 @@ ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index 2b119c592f02..1567f0323858 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -124,6 +124,22 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, return ret; } +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__, + backing_file, vma->vm_start, vma->vm_end); + + return backing_file_mmap(backing_file, vma, &ctx); +} + struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) { if (fb && refcount_inc_not_zero(&fb->count)) From d30ff89870482d88807393b592d5f0d1d4bc5e2a Mon Sep 17 00:00:00 2001 From: Stefan Hajnoczi Date: Tue, 27 Feb 2024 10:57:56 -0500 Subject: [PATCH 21/38] virtiofs: drop __exit from virtio_fs_sysfs_exit() virtio_fs_sysfs_exit() is called by: - static int __init virtio_fs_init(void) - static void __exit virtio_fs_exit(void) Remove __exit from virtio_fs_sysfs_exit() since virtio_fs_init() is not an __exit function. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402270649.GYjNX0yw-lkp@intel.com/ Signed-off-by: Stefan Hajnoczi Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 62a44603740c..948b49c2460d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1588,7 +1588,7 @@ static int __init virtio_fs_sysfs_init(void) return 0; } -static void __exit virtio_fs_sysfs_exit(void) +static void virtio_fs_sysfs_exit(void) { kset_unregister(virtio_fs_kset); virtio_fs_kset = NULL; From 82e081aebe4d9c26e196c8260005cc4762b57a5d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 22/38] fuse: replace remaining make_bad_inode() with fuse_make_bad() fuse_do_statx() was added with the wrong helper. Fixes: d3045530bdd2 ("fuse: implement statx") Cc: # v6.6 Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 25c7c97f774b..ce6a38c56d54 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1214,7 +1214,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file, if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { - make_bad_inode(inode); + fuse_make_bad(inode); return -EIO; } From 68ca1b49e430f6534d0774a94147a823e3b8b26e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 23/38] fuse: fix root lookup with nonzero generation The root inode has a fixed nodeid and generation (1, 0). Prior to the commit 15db16837a35 ("fuse: fix illegal access to inode with reused nodeid") generation number on lookup was ignored. After this commit lookup with the wrong generation number resulted in the inode being unhashed. This is correct for non-root inodes, but replacing the root inode is wrong and results in weird behavior. Fix by reverting to the old behavior if ignoring the generation for the root inode, but issuing a warning in dmesg. Reported-by: Antonio SJ Musumeci Closes: https://lore.kernel.org/all/CAOQ4uxhek5ytdN8Yz2tNEOg5ea4NkBb4nk0FGPjPk_9nz-VG3g@mail.gmail.com/ Fixes: 15db16837a35 ("fuse: fix illegal access to inode with reused nodeid") Cc: # v5.14 Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index ce6a38c56d54..befb7dfe387a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; + if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { + pr_warn_once("root generation should be zero\n"); + outarg->generation = 0; + } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), From b1fe686a765e6c0d71811d825b5a1585a202b777 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 24/38] fuse: don't unhash root The root inode is assumed to be always hashed. Do not unhash the root inode even if it is marked BAD. Fixes: 5d069dbe8aaf ("fuse: fix bad inode") Cc: # v5.11 Signed-off-by: Miklos Szeredi --- fs/fuse/fuse_i.h | 1 - fs/fuse/inode.c | 7 +++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7bd3552b1e80..4ef6087f0e5c 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -994,7 +994,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation, static inline void fuse_make_bad(struct inode *inode) { - remove_inode_hash(inode); set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0186df5fc9ea..065b84efb663 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -475,8 +475,11 @@ retry: } else if (fuse_stale_inode(inode, generation, attr)) { /* nodeid was reused, any I/O on the old inode should fail */ fuse_make_bad(inode); - iput(inode); - goto retry; + if (inode != d_inode(sb->s_root)) { + remove_inode_hash(inode); + iput(inode); + goto retry; + } } fi = get_fuse_inode(inode); spin_lock(&fi->lock); From 253e52437119719eb87293ef6852e13bb5ad0960 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 28 Feb 2024 16:50:49 +0100 Subject: [PATCH 25/38] fuse: use FUSE_ROOT_ID in fuse_get_root_inode() ...when calling fuse_iget(). Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 065b84efb663..0d1aee941246 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -981,7 +981,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); } struct fuse_inode_handle { From f9c29137392e77319f9974c2cdf27d087f05abee Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 15 Jan 2024 11:09:14 +0800 Subject: [PATCH 26/38] virtio_fs: remove duplicate check if queue is broken virtqueue_enable_cb() will call virtqueue_poll() which will check if queue is broken at beginning, so remove the virtqueue_is_broken() call Signed-off-by: Li RongQing Reviewed-by: Stefan Hajnoczi Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 948b49c2460d..d51fb7ff7ed0 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -401,7 +401,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) kfree(req); dec_in_flight_req(fsvq); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); } @@ -683,7 +683,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); /* End requests */ From e1c420ac9968f40cc266ec648cce12fa55c891db Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Feb 2024 18:29:37 +0000 Subject: [PATCH 27/38] fuse: Remove fuse_writepage The writepage operation is deprecated as it leads to worse performance under high memory pressure due to folios being written out in LRU order rather than sequentially within a file. Use filemap_migrate_folio() to support dirty folio migration instead of writepage. Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ce158f70927..e42aa6b0efb2 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2104,34 +2104,6 @@ err: return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; @@ -3347,10 +3319,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, From e0887e095a803d238bd3e2b280baa4c5e70c650c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 28 Feb 2024 18:29:38 +0000 Subject: [PATCH 28/38] fuse: Convert fuse_writepage_locked to take a folio The one remaining caller of fuse_writepage_locked() already has a folio, so convert this function entirely. Saves a few calls to compound_head() but no attempt is made to support large folios in this patch. Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e42aa6b0efb2..dc45cef2a369 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2040,26 +2040,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } -static int fuse_writepage_locked(struct page *page) +static int fuse_writepage_locked(struct folio *folio) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; + struct folio *tmp_folio; int error = -ENOMEM; - set_page_writeback(page); + folio_start_writeback(folio); wpa = fuse_writepage_args_alloc(); if (!wpa) goto err; ap = &wpa->ia.ap; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto err_free; error = -EIO; @@ -2068,21 +2068,21 @@ static int fuse_writepage_locked(struct page *page) goto err_nofile; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0); - copy_highpage(tmp_page, page); + folio_copy(tmp_folio, folio); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->next = NULL; ap->args.in_pages = true; ap->num_pages = 1; - ap->pages[0] = tmp_page; + ap->pages[0] = &tmp_folio->page; ap->descs[0].offset = 0; ap->descs[0].length = PAGE_SIZE; ap->args.end = fuse_writepage_end; wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -2090,17 +2090,17 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); + folio_end_writeback(folio); return 0; err_nofile: - __free_page(tmp_page); + folio_put(tmp_folio); err_free: kfree(wpa); err: - mapping_set_error(page->mapping, error); - end_page_writeback(page); + mapping_set_error(folio->mapping, error); + folio_end_writeback(folio); return error; } @@ -2466,7 +2466,7 @@ static int fuse_launder_folio(struct folio *folio) /* Serialize with pending writeback for the same page */ fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(&folio->page); + err = fuse_writepage_locked(folio); if (!err) fuse_wait_on_page_writeback(inode, folio->index); } From 2d09ab2203ece3d20aad6b3ba82a574c23da7556 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 5 Jan 2024 16:21:27 +0100 Subject: [PATCH 29/38] fuse: fix typo for fuse_permission comment Found by chance while working on support for idmapped mounts in fuse. Signed-off-by: Alexander Mikhalitsyn Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index befb7dfe387a..4a6df591add6 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1493,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission - * modell. + * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this From 5a4d888e9f9beeb5062fbddf789278de5295e9f8 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 5 Jan 2024 16:21:29 +0100 Subject: [PATCH 30/38] fuse: __kuid_val/__kgid_val helpers in fuse_fill_attr_from_inode() For the sake of consistency, let's use these helpers to extract {u,g}id_t values from k{u,g}id_t ones. There are no functional changes, just to make code cleaner. Signed-off-by: Alexander Mikhalitsyn Reviewed-by: Christian Brauner Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0d1aee941246..4829e667f9ad 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1523,8 +1523,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, + .uid = __kuid_val(fi->inode.i_uid), + .gid = __kgid_val(fi->inode.i_gid), .rdev = fi->inode.i_rdev, .blksize = 1u << fi->inode.i_blkbits, }; From e022f6a1c711ab6d76e9e59dce77e2b25df75076 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Mon, 26 Feb 2024 11:54:35 +0800 Subject: [PATCH 31/38] fuse: add support for explicit export disabling open_by_handle_at(2) can fail with -ESTALE with a valid handle returned by a previous name_to_handle_at(2) for evicted fuse inodes, which is especially common when entry_valid_timeout is 0, e.g. when the fuse daemon is in "cache=none" mode. The time sequence is like: name_to_handle_at(2) # succeed evict fuse inode open_by_handle_at(2) # fail The root cause is that, with 0 entry_valid_timeout, the dput() called in name_to_handle_at(2) will trigger iput -> evict(), which will send FUSE_FORGET to the daemon. The following open_by_handle_at(2) will send a new FUSE_LOOKUP request upon inode cache miss since the previous inode eviction. Then the fuse daemon may fail the FUSE_LOOKUP request with -ENOENT as the cached metadata of the requested inode has already been cleaned up during the previous FUSE_FORGET. The returned -ENOENT is treated as -ESTALE when open_by_handle_at(2) returns. This confuses the application somehow, as open_by_handle_at(2) fails when the previous name_to_handle_at(2) succeeds. The returned errno is also confusing as the requested file is not deleted and already there. It is reasonable to fail name_to_handle_at(2) early in this case, after which the application can fallback to open(2) to access files. Since this issue typically appears when entry_valid_timeout is 0 which is configured by the fuse daemon, the fuse daemon is the right person to explicitly disable the export when required. Also considering FUSE_EXPORT_SUPPORT actually indicates the support for lookups of "." and "..", and there are existing fuse daemons supporting export without FUSE_EXPORT_SUPPORT set, for compatibility, we add a new INIT flag for such purpose. Reviewed-by: Amir Goldstein Signed-off-by: Jingbo Xu Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 11 ++++++++++- include/uapi/linux/fuse.h | 3 +++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4829e667f9ad..ca5fad57bfa4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1124,6 +1124,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -1316,6 +1321,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_stack_depth = arg->max_stack_depth; fm->sb->s_stack_depth = arg->max_stack_depth; } + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1362,7 +1369,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1561,6 +1569,7 @@ static int fuse_fill_super_submount(struct super_block *sb, sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 1162a47b6a42..a86c2cad65ad 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -215,6 +215,7 @@ * 7.40 * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag + * - add FUSE_NO_EXPORT_SUPPORT init flag */ #ifndef _LINUX_FUSE_H @@ -416,6 +417,7 @@ struct fuse_file_lock { * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. + * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -456,6 +458,7 @@ struct fuse_file_lock { #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) #define FUSE_PASSTHROUGH (1ULL << 37) +#define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP From 760eac73f9f69aa28fcb3050b4946c2dcc656d12 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Tue, 9 Jan 2024 17:24:42 +0800 Subject: [PATCH 32/38] fuse: Introduce a new notification type for resend pending requests When a FUSE daemon panics and failover, we aim to minimize the impact on applications by reusing the existing FUSE connection. During this process, another daemon is employed to preserve the FUSE connection's file descriptor. The new started FUSE Daemon will takeover the fd and continue to provide service. However, it is possible for some inflight requests to be lost and never returned. As a result, applications awaiting replies would become stuck forever. To address this, we can resend these pending requests to the new started FUSE daemon. This patch introduces a new notification type "FUSE_NOTIFY_RESEND", which can trigger resending of the pending requests, ensuring they are properly processed again. Signed-off-by: Zhao Chen Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 56 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/fuse.h | 2 ++ 2 files changed, 58 insertions(+) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5a5fb56deb92..3c15a6b7f6f2 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1775,6 +1775,59 @@ copy_finish: return err; } +/* + * Resending all processing queue requests. + * + * During a FUSE daemon panics and failover, it is possible for some inflight + * requests to be lost and never returned. As a result, applications awaiting + * replies would become stuck forever. To address this, we can use notification + * to trigger resending of these pending requests to the FUSE daemon, ensuring + * they are properly processed again. + * + * Please note that this strategy is applicable only to idempotent requests or + * if the FUSE daemon takes careful measures to avoid processing duplicated + * non-idempotent requests. + */ +static void fuse_resend(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + struct fuse_req *req, *next; + struct fuse_iqueue *fiq = &fc->iq; + LIST_HEAD(to_queue); + unsigned int i; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], &to_queue); + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + list_for_each_entry_safe(req, next, &to_queue, list) { + __set_bit(FR_PENDING, &req->flags); + } + + spin_lock(&fiq->lock); + /* iq and pq requests are both oldest to newest */ + list_splice(&to_queue, &fiq->pending); + fiq->ops->wake_pending_and_unlock(fiq); +} + +static int fuse_notify_resend(struct fuse_conn *fc) +{ + fuse_resend(fc); + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1800,6 +1853,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); + case FUSE_NOTIFY_RESEND: + return fuse_notify_resend(fc); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index a86c2cad65ad..659932eb35e1 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -216,6 +216,7 @@ * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag * - add FUSE_NO_EXPORT_SUPPORT init flag + * - add FUSE_NOTIFY_RESEND */ #ifndef _LINUX_FUSE_H @@ -645,6 +646,7 @@ enum fuse_notify_code { FUSE_NOTIFY_STORE = 4, FUSE_NOTIFY_RETRIEVE = 5, FUSE_NOTIFY_DELETE = 6, + FUSE_NOTIFY_RESEND = 7, FUSE_NOTIFY_CODE_MAX, }; From 9e7f5296f475ba5ab887ae3e55b922e17e99752b Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Tue, 9 Jan 2024 17:24:43 +0800 Subject: [PATCH 33/38] fuse: Use the high bit of request ID for indicating resend requests Some FUSE daemons want to know if the received request is a resend request. The high bit of the fuse request ID is utilized for indicating this, enabling the receiver to perform appropriate handling. The init flag "FUSE_HAS_RESEND" is added to indicate this feature. Signed-off-by: Zhao Chen Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 2 ++ fs/fuse/inode.c | 2 +- include/uapi/linux/fuse.h | 13 ++++++++++++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3c15a6b7f6f2..3ec8bb5e68ff 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1814,6 +1814,8 @@ static void fuse_resend(struct fuse_conn *fc) list_for_each_entry_safe(req, next, &to_queue, list) { __set_bit(FR_PENDING, &req->flags); + /* mark the request as resend request */ + req->in.h.unique |= FUSE_UNIQUE_RESEND; } spin_lock(&fiq->lock); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index ca5fad57bfa4..02869edf72f3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1370,7 +1370,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 659932eb35e1..d08b99d60f6f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -216,7 +216,7 @@ * - add max_stack_depth to fuse_init_out, add FUSE_PASSTHROUGH init flag * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag * - add FUSE_NO_EXPORT_SUPPORT init flag - * - add FUSE_NOTIFY_RESEND + * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag */ #ifndef _LINUX_FUSE_H @@ -419,6 +419,8 @@ struct fuse_file_lock { * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. * FUSE_NO_EXPORT_SUPPORT: explicitly disable export support + * FUSE_HAS_RESEND: kernel supports resending pending requests, and the high bit + * of the request ID indicates resend requests */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -460,6 +462,7 @@ struct fuse_file_lock { #define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) #define FUSE_PASSTHROUGH (1ULL << 37) #define FUSE_NO_EXPORT_SUPPORT (1ULL << 38) +#define FUSE_HAS_RESEND (1ULL << 39) /* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ #define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP @@ -973,6 +976,14 @@ struct fuse_fallocate_in { uint32_t padding; }; +/** + * FUSE request unique ID flag + * + * Indicates whether this is a resend request. The receiver should handle this + * request accordingly. + */ +#define FUSE_UNIQUE_RESEND (1ULL << 63) + struct fuse_in_header { uint32_t len; uint32_t opcode; From 2e3f7dd08d70eca86a8cc9b4baf3da77c032d5fc Mon Sep 17 00:00:00 2001 From: Zhou Jifeng Date: Tue, 7 Nov 2023 16:13:50 +0800 Subject: [PATCH 34/38] fuse: Track process write operations in both direct and writethrough modes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to the fact that fuse does not count the write IO of processes in the direct and writethrough write modes, user processes cannot track write_bytes through the “/proc/[pid]/io” path. For example, the system tool iotop cannot count the write operations of the corresponding process. Signed-off-by: Zhou Jifeng Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index dc45cef2a369..761ae81ef41d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -20,6 +20,7 @@ #include #include #include +#include static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -1405,7 +1406,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; - ssize_t err; + ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -1427,10 +1428,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; + task_io_account_write(count); + err = file_remove_privs(file); if (err) goto out; @@ -1668,6 +1671,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { + task_io_account_write(res); if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { res = fuse_direct_IO(iocb, from); } else { From 8a5fb186431326886ccc7b71d40aaf5e53b5d91a Mon Sep 17 00:00:00 2001 From: Jiachen Zhang Date: Thu, 8 Jun 2023 16:46:09 +0800 Subject: [PATCH 35/38] fuse: remove an unnecessary if statement FUSE remote locking code paths never add any locking state to inode->i_flctx, so the locks_remove_posix() function called on file close will return without calling fuse_setlk(). Therefore, as the if statement to be removed in this commit will always be false, remove it for clearness. Signed-off-by: Jiachen Zhang Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 761ae81ef41d..bd62be510a94 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2678,10 +2678,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); From 738adade96b2ec414a44f3b1ed891fec3e0c03dd Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 29 Aug 2023 14:36:33 -0400 Subject: [PATCH 36/38] fuse: Fix missing FOLL_PIN for direct-io Our user space filesystem relies on fuse to provide POSIX interface. In our test, a known string is written into a file and the content is read back later to verify correct data returned. We observed wrong data returned in read buffer in rare cases although correct data are stored in our filesystem. Fuse kernel module calls iov_iter_get_pages2() to get the physical pages of the user-space read buffer passed in read(). The pages are not pinned to avoid page migration. When page migration occurs, the consequence are two-folds. 1) Applications do not receive correct data in read buffer. 2) fuse kernel writes data into a wrong place. Using iov_iter_extract_pages() to pin pages fixes the issue in our test. An auxiliary variable "struct page **pt_pages" is used in the patch to prepare the 2nd parameter for iov_iter_extract_pages() since iov_iter_get_pages2() uses a different type for the 2nd parameter. [SzM] add iov_iter_extract_will_pin(ii) and unpin only if true. Signed-off-by: Lei Huang Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 15 ++++++++++----- fs/fuse/fuse_i.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index bd62be510a94..8b9c5ab0abc0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -655,7 +655,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, for (i = 0; i < ap->num_pages; i++) { if (should_dirty) set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + if (ap->args.is_pinned) + unpin_user_page(ap->pages[i]); } } @@ -1495,10 +1496,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + struct page **pt_pages; + + pt_pages = &ap->pages[ap->num_pages]; + ret = iov_iter_extract_pages(ii, &pt_pages, + *nbytesp - nbytes, + max_pages - ap->num_pages, + 0, &start); if (ret < 0) break; @@ -1515,6 +1519,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4ef6087f0e5c..0f3521449bb4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -314,6 +314,7 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; + bool is_pinned:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); From efc4105a4cf9e300b8e9150147415fa235059293 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 7 Oct 2023 23:39:56 +0800 Subject: [PATCH 37/38] fuse: remove unneeded lock which protecting update of congestion_threshold Commit 670d21c6e17f6 ("fuse: remove reliance on bdi congestion") change how congestion_threshold is used and lock in fuse_conn_congestion_threshold_write is not needed anymore. 1. Access to supe_block is removed along with removing of bdi congestion. Then down_read(&fc->killsb) which protecting access to super_block is no needed. 2. Compare num_background and congestion_threshold without holding bg_lock. Then there is no need to hold bg_lock to update congestion_threshold. Signed-off-by: Kemeng Shi Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 284a35006462..97ac994ff78f 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - down_read(&fc->killsb); - spin_lock(&fc->bg_lock); - fc->congestion_threshold = val; - spin_unlock(&fc->bg_lock); - up_read(&fc->killsb); + WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; From cdf6ac2a03d253f05d3e798f60f23dea1b176b92 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 6 Mar 2024 16:20:58 +0100 Subject: [PATCH 38/38] fuse: get rid of ff->readdir.lock The same protection is provided by file->f_pos_lock. Note, this relies on the fact that file->f_mode has FMODE_ATOMIC_POS. This flag is cleared by stream_open(), which would prevent locking of f_pos_lock. Prior to commit 7de64d521bf9 ("fuse: break up fuse_open_common()") FOPEN_STREAM on a directory would cause stream_open() to be called. After this commit this is not done anymore, so f_pos_lock will always be locked. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 -- fs/fuse/fuse_i.h | 6 ------ fs/fuse/readdir.c | 4 ---- 3 files changed, 12 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8b9c5ab0abc0..1b0b9f2a4fbf 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -69,7 +69,6 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -82,7 +81,6 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) void fuse_file_free(struct fuse_file *ff) { kfree(ff->args); - mutex_destroy(&ff->readdir.lock); kfree(ff); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 0f3521449bb4..3658a70ae414 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -243,12 +243,6 @@ struct fuse_file { /* Readdir related */ struct { - /* - * Protects below fields against (crazy) parallel readdir on - * same open file. Uncontended in the normal case. - */ - struct mutex lock; - /* Dir stream position */ loff_t pos; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index c66a54d6c7d3..0377b6dc24c8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) if (fuse_is_bad(inode)) return -EIO; - mutex_lock(&ff->readdir.lock); - err = UNCACHED; if (ff->open_flags & FOPEN_CACHE_DIR) err = fuse_readdir_cached(file, ctx); if (err == UNCACHED) err = fuse_readdir_uncached(file, ctx); - mutex_unlock(&ff->readdir.lock); - return err; }