From fe12cfc17429a4ddfdf9e71711bd125b8854ed7d Mon Sep 17 00:00:00 2001 From: Jay Date: Tue, 9 Jan 2024 15:29:27 +0800 Subject: [PATCH 01/70] fs: fix a typo in attr.c The word "filesytem" should be "filesystem" Signed-off-by: Jay Link: https://lore.kernel.org/r/20240109072927.29626-1-merqqcury@gmail.com Signed-off-by: Christian Brauner --- fs/attr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/attr.c b/fs/attr.c index 5a13f0c8495f..49d23b5dbab4 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -352,7 +352,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode, EXPORT_SYMBOL(may_setattr); /** - * notify_change - modify attributes of a filesytem object + * notify_change - modify attributes of a filesystem object * @idmap: idmap of the mount the inode was found from * @dentry: object affected * @attr: new attributes From 73f65b8b0325551110dedf8d27ac738bdc331802 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 8 Jan 2024 18:20:40 +0100 Subject: [PATCH 02/70] fs: Wrong function name in comment This comment refers to function mark_buffer_inode_dirty(), but the function is actually called mark_buffer_dirty_inode(), so fix the comment. Signed-off-by: Andreas Gruenbacher Link: https://lore.kernel.org/r/20240108172040.178173-1-agruenba@redhat.com Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5..dcafee512089 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write); * a successful fsync(). For example, ext2 indirect blocks need to be * written back and waited upon before fsync() returns. * - * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(), + * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(), * inode_has_buffers() and invalidate_inode_buffers() are provided for the * management of a list of dependent buffers at ->i_mapping->i_private_list. * From 6c8ac6e24eb04d1eebc442029d7a7528e5461cb8 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 11 Jan 2024 17:22:40 +1100 Subject: [PATCH 03/70] initramfs: remove duplicate built-in __initramfs_start unpacking If initrd_start cpio extraction fails, CONFIG_BLK_DEV_RAM triggers fallback to initrd.image handling via populate_initrd_image(). The populate_initrd_image() call follows successful extraction of any built-in cpio archive at __initramfs_start, but currently performs built-in archive extraction a second time. Prior to commit b2a74d5f9d446 ("initramfs: remove clean_rootfs"), the second built-in initramfs unpack call was used to repopulate entries removed by clean_rootfs(), but it's no longer necessary now the contents of the previous extraction are retained. Signed-off-by: David Disseldorp Link: https://lore.kernel.org/r/20240111062240.9362-1-ddiss@suse.de Reviewed-by: Christoph Hellwig Signed-off-by: Christian Brauner --- init/initramfs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 76deb48c38cb..d3c623dde01a 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -679,8 +679,6 @@ static void __init populate_initrd_image(char *err) struct file *file; loff_t pos = 0; - unpack_to_rootfs(__initramfs_start, __initramfs_size); - printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY | O_CREAT, 0700); From 6b6ec4ca4e33d797f887cb2d7ecf365b652b338c Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 10 Jan 2024 23:47:40 +0800 Subject: [PATCH 04/70] eventfd: add a BUILD_BUG_ON() to ensure consistency between EFD_SEMAPHORE and the uapi introduce a BUILD_BUG_ON to check that the EFD_SEMAPHORE is equal to its definition in the uapi file, just like EFD_CLOEXEC and EFD_NONBLOCK. Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_0BAA2DEAF9208D49987457E6583F9BE79507@qq.com Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Cc: Jan Kara Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/eventfd.c b/fs/eventfd.c index ad8186d47ba7..0252b71099fb 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -383,6 +383,7 @@ static int do_eventfd(unsigned int count, int flags) /* Check the EFD_* constants for consistency. */ BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK); + BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0)); if (flags & ~EFD_FLAGS_SET) return -EINVAL; From de8a3207aed33283a560193095b156d3b73fc4f0 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Tue, 16 Jan 2024 17:11:37 +0800 Subject: [PATCH 05/70] buffer: Use KMEM_CACHE instead of kmem_cache_create() Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240116091137.92375-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/buffer.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index dcafee512089..b55dea034a5d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3121,12 +3121,8 @@ void __init buffer_init(void) unsigned long nrpages; int ret; - bh_cachep = kmem_cache_create("buffer_head", - sizeof(struct buffer_head), 0, - (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), - NULL); - + bh_cachep = KMEM_CACHE(buffer_head, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ From 8b3d838139bcd1e552f1899191f734264ce2a1a5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 16 Jan 2024 15:53:35 +0800 Subject: [PATCH 06/70] fs: improve dump_mapping() robustness We met a kernel crash issue when running stress-ng testing, and the system crashes when printing the dentry name in dump_mapping(). Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 pc : dentry_name+0xd8/0x224 lr : pointer+0x22c/0x370 sp : ffff800025f134c0 ...... Call trace: dentry_name+0xd8/0x224 pointer+0x22c/0x370 vsnprintf+0x1ec/0x730 vscnprintf+0x2c/0x60 vprintk_store+0x70/0x234 vprintk_emit+0xe0/0x24c vprintk_default+0x3c/0x44 vprintk_func+0x84/0x2d0 printk+0x64/0x88 __dump_page+0x52c/0x530 dump_page+0x14/0x20 set_migratetype_isolate+0x110/0x224 start_isolate_page_range+0xc4/0x20c offline_pages+0x124/0x474 memory_block_offline+0x44/0xf4 memory_subsys_offline+0x3c/0x70 device_offline+0xf0/0x120 ...... The root cause is that, one thread is doing page migration, and we will use the target page's ->mapping field to save 'anon_vma' pointer between page unmap and page move, and now the target page is locked and refcount is 1. Currently, there is another stress-ng thread performing memory hotplug, attempting to offline the target page that is being migrated. It discovers that the refcount of this target page is 1, preventing the offline operation, thus proceeding to dump the page. However, page_mapping() of the target page may return an incorrect file mapping to crash the system in dump_mapping(), since the target page->mapping only saves 'anon_vma' pointer without setting PAGE_MAPPING_ANON flag. The page migration issue has been fixed by commit d1adb25df711 ("mm: migrate: fix getting incorrect page mapping during page migration"). In addition, Matthew suggested we should also improve dump_mapping()'s robustness to resilient against the kernel crash [1]. With checking the 'dentry.parent' and 'dentry.d_name.name' used by dentry_name(), I can see dump_mapping() will output the invalid dentry instead of crashing the system when this issue is reproduced again. [12211.189128] page:fffff7de047741c0 refcount:1 mapcount:0 mapping:ffff989117f55ea0 index:0x1 pfn:0x211dd07 [12211.189144] aops:0x0 ino:1 invalid dentry:74786574206e6870 [12211.189148] flags: 0x57ffffc0000001(locked|node=1|zone=2|lastcpupid=0x1fffff) [12211.189150] page_type: 0xffffffff() [12211.189153] raw: 0057ffffc0000001 0000000000000000 dead000000000122 ffff989117f55ea0 [12211.189154] raw: 0000000000000001 0000000000000001 00000001ffffffff 0000000000000000 [12211.189155] page dumped because: unmovable page [1] https://lore.kernel.org/all/ZXxn%2F0oixJxxAnpF@casper.infradead.org/ Suggested-by: Matthew Wilcox Signed-off-by: Baolin Wang Link: https://lore.kernel.org/r/937ab1f87328516821d39be672b6bc18861d9d3e.1705391420.git.baolin.wang@linux.alibaba.com Signed-off-by: Christian Brauner --- fs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index 91048c4c9c9e..6d0d54230363 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -588,7 +588,8 @@ void dump_mapping(const struct address_space *mapping) } dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { + if (get_kernel_nofault(dentry, dentry_ptr) || + !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", a_ops, ino, dentry_ptr); return; From 0f05ee447949cf1b72bd7c415c803ea7f7209403 Mon Sep 17 00:00:00 2001 From: Hu Yadi Date: Fri, 12 Jan 2024 15:40:59 +0800 Subject: [PATCH 07/70] selftests/filesystems:fix build error in overlayfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One build issue comes up due to both mount.h included dev_in_maps.c In file included from dev_in_maps.c:10: /usr/include/sys/mount.h:35:3: error: expected identifier before numeric constant 35 | MS_RDONLY = 1, /* Mount read-only. */ | ^~~~~~~~~ In file included from dev_in_maps.c:13: Remove one of them to solve conflict, another error comes up: dev_in_maps.c:170:6: error: implicit declaration of function ‘mount’ [-Werror=implicit-function-declaration] 170 | if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { | ^~~~~ cc1: all warnings being treated as errors and then , add sys_mount definition to solve it After both above, dev_in_maps.c can be built correctly on my mache(gcc 10.2,glibc-2.32,kernel-5.10) Signed-off-by: Hu Yadi Link: https://lore.kernel.org/r/20240112074059.29673-1-hu.yadi@h3c.com Acked-by: Andrei Vagin Signed-off-by: Christian Brauner --- .../selftests/filesystems/overlayfs/dev_in_maps.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c index e19ab0e85709..759f86e7d263 100644 --- a/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c +++ b/tools/testing/selftests/filesystems/overlayfs/dev_in_maps.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -32,7 +31,11 @@ static int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) { return syscall(__NR_fsmount, fd, flags, attr_flags); } - +static int sys_mount(const char *src, const char *tgt, const char *fst, + unsigned long flags, const void *data) +{ + return syscall(__NR_mount, src, tgt, fst, flags, data); +} static int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd, const char *to_pathname, unsigned int flags) @@ -166,8 +169,7 @@ int main(int argc, char **argv) ksft_test_result_skip("unable to create a new mount namespace\n"); return 1; } - - if (mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { + if (sys_mount(NULL, "/", NULL, MS_SLAVE | MS_REC, NULL) == -1) { pr_perror("mount"); return 1; } From 9e3f1c59367515e7e40675fe83645a131c05039d Mon Sep 17 00:00:00 2001 From: "Hu.Yadi" Date: Thu, 11 Jan 2024 19:32:29 +0800 Subject: [PATCH 08/70] selftests/move_mount_set_group:Make tests build with old libc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace SYS_ with __NR_. Using the __NR_ notation, provided by UAPI, is useful to build tests on systems without the SYS_ definitions. Replace SYS_move_mount with __NR_move_mount Similar changes: commit 87129ef13603 ("selftests/landlock: Make tests build with old libc") Acked-by: Mickaël Salaün Signed-off-by: Hu.Yadi Link: https://lore.kernel.org/r/20240111113229.10820-1-hu.yadi@h3c.com Reviewed-by: Berlin Suggested-by: Jiao Signed-off-by: Christian Brauner --- .../move_mount_set_group/move_mount_set_group_test.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c index 50ed5d475dd1..bcf51d785a37 100644 --- a/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c +++ b/tools/testing/selftests/move_mount_set_group/move_mount_set_group_test.c @@ -218,7 +218,7 @@ static bool move_mount_set_group_supported(void) if (mount(NULL, SET_GROUP_FROM, NULL, MS_SHARED, 0)) return -1; - ret = syscall(SYS_move_mount, AT_FDCWD, SET_GROUP_FROM, + ret = syscall(__NR_move_mount, AT_FDCWD, SET_GROUP_FROM, AT_FDCWD, SET_GROUP_TO, MOVE_MOUNT_SET_GROUP); umount2("/tmp", MNT_DETACH); @@ -363,7 +363,7 @@ TEST_F(move_mount_set_group, complex_sharing_copying) CLONE_VM | CLONE_FILES); ASSERT_GT(pid, 0); ASSERT_EQ(wait_for_pid(pid), 0); - ASSERT_EQ(syscall(SYS_move_mount, ca_from.mntfd, "", + ASSERT_EQ(syscall(__NR_move_mount, ca_from.mntfd, "", ca_to.mntfd, "", MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH), 0); From 73fa7547c70b32cc69685f79be31135797734eb6 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Mon, 31 Aug 2020 11:32:08 -0400 Subject: [PATCH 09/70] vfs: add RWF_NOAPPEND flag for pwritev2 The pwrite function, originally defined by POSIX (thus the "p"), is defined to ignore O_APPEND and write at the offset passed as its argument. However, historically Linux honored O_APPEND if set and ignored the offset. This cannot be changed due to stability policy, but is documented in the man page as a bug. Now that there's a pwritev2 syscall providing a superset of the pwrite functionality that has a flags argument, the conforming behavior can be offered to userspace via a new flag. Since pwritev2 checks flag validity (in kiocb_set_rw_flags) and reports unknown ones with EOPNOTSUPP, callers will not get wrong behavior on old kernels that don't support the new flag; the error is reported and the caller can decide how to handle it. Signed-off-by: Rich Felker Link: https://lore.kernel.org/r/20200831153207.GO3265@brightrain.aerifal.cx Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- include/linux/fs.h | 8 ++++++++ include/uapi/linux/fs.h | 5 ++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..4f7cfda29143 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3335,6 +3335,8 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; + if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) + return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) @@ -3345,6 +3347,12 @@ static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags) if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; + if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { + if (IS_APPEND(file_inode(ki->ki_filp))) + return -EPERM; + ki->ki_flags &= ~IOCB_APPEND; + } + ki->ki_flags |= kiocb_flags; return 0; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 48ad69f7722e..2203d3194b91 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -301,9 +301,12 @@ typedef int __bitwise __kernel_rwf_t; /* per-IO O_APPEND */ #define RWF_APPEND ((__force __kernel_rwf_t)0x00000010) +/* per-IO negation of O_APPEND */ +#define RWF_NOAPPEND ((__force __kernel_rwf_t)0x00000020) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ - RWF_APPEND) + RWF_APPEND | RWF_NOAPPEND) /* Pagemap ioctl */ #define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) From 12f7900c575679af411aaa89340bfe3dc68b46b3 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 19 Jan 2024 04:33:39 +0800 Subject: [PATCH 10/70] writeback: move wb_wakeup_delayed defination to fs-writeback.c The wb_wakeup_delayed is only used in fs-writeback.c. Move it to fs-writeback.c after defination of wb_wakeup and make it static. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240118203339.764093-1-shikemeng@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fs-writeback.c | 25 +++++++++++++++++++++++++ include/linux/backing-dev.h | 1 - mm/backing-dev.c | 25 ------------------------- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3d84fcc471c6..e4f17c53ddfc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb) spin_unlock_irq(&wb->work_lock); } +/* + * This function is used when the first inode for this wb is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). + */ +static void wb_wakeup_delayed(struct bdi_writeback *wb) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + spin_lock_irq(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_irq(&wb->work_lock); +} + static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 1a97277f99b1..8e7af9a03b41 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -38,7 +38,6 @@ struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); -void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1e3447bccdb1..039dc74b505a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -372,31 +372,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -/* - * This function is used when the first inode for this wb is marked dirty. It - * wakes-up the corresponding bdi thread which should then take care of the - * periodic background write-out of dirty inodes. Since the write-out would - * starts only 'dirty_writeback_interval' centisecs from now anyway, we just - * set up a timer which wakes the bdi thread up later. - * - * Note, we wouldn't bother setting up the timer, but this function is on the - * fast-path (used by '__mark_inode_dirty()'), so we save few context switches - * by delaying the wake-up. - * - * We have to be careful not to postpone flush work if it is scheduled for - * earlier. Thus we use queue_delayed_work(). - */ -void wb_wakeup_delayed(struct bdi_writeback *wb) -{ - unsigned long timeout; - - timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_irq(&wb->work_lock); - if (test_bit(WB_registered, &wb->state)) - queue_delayed_work(bdi_wq, &wb->dwork, timeout); - spin_unlock_irq(&wb->work_lock); -} - static void wb_update_bandwidth_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), From 3948abaa4e2be938ccdfc289385a27342fb13d43 Mon Sep 17 00:00:00 2001 From: Nikita Zhandarovich Date: Fri, 19 Jan 2024 07:39:06 -0800 Subject: [PATCH 11/70] do_sys_name_to_handle(): use kzalloc() to fix kernel-infoleak syzbot identified a kernel information leak vulnerability in do_sys_name_to_handle() and issued the following report [1]. [1] "BUG: KMSAN: kernel-infoleak in instrument_copy_to_user include/linux/instrumented.h:114 [inline] BUG: KMSAN: kernel-infoleak in _copy_to_user+0xbc/0x100 lib/usercopy.c:40 instrument_copy_to_user include/linux/instrumented.h:114 [inline] _copy_to_user+0xbc/0x100 lib/usercopy.c:40 copy_to_user include/linux/uaccess.h:191 [inline] do_sys_name_to_handle fs/fhandle.c:73 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x949/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] __kmem_cache_alloc_node+0x5c9/0x970 mm/slub.c:3517 __do_kmalloc_node mm/slab_common.c:1006 [inline] __kmalloc+0x121/0x3c0 mm/slab_common.c:1020 kmalloc include/linux/slab.h:604 [inline] do_sys_name_to_handle fs/fhandle.c:39 [inline] __do_sys_name_to_handle_at fs/fhandle.c:112 [inline] __se_sys_name_to_handle_at+0x441/0xb10 fs/fhandle.c:94 __x64_sys_name_to_handle_at+0xe4/0x140 fs/fhandle.c:94 ... Bytes 18-19 of 20 are uninitialized Memory access of size 20 starts at ffff888128a46380 Data copied to user address 0000000020000240" Per Chuck Lever's suggestion, use kzalloc() instead of kmalloc() to solve the problem. Fixes: 990d6c2d7aee ("vfs: Add name to file handle conversion support") Suggested-by: Chuck Lever III Reported-and-tested-by: Signed-off-by: Nikita Zhandarovich Link: https://lore.kernel.org/r/20240119153906.4367-1-n.zhandarovich@fintech.ru Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/fhandle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fhandle.c b/fs/fhandle.c index 18b3ba8dc8ea..57a12614addf 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path, if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; - handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, + handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; From 9473c4450e9c83d890d435577a3776d925fa748c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 28 Dec 2023 15:15:10 -0500 Subject: [PATCH 12/70] exportfs: fix the fallback implementation of the get_name export operation The fallback implementation for the get_name export operation uses readdir() to try to match the inode number to a filename. That filename is then used together with lookup_one() to produce a dentry. A problem arises when we match the '.' or '..' entries, since that causes lookup_one() to fail. This has sometimes been seen to occur for filesystems that violate POSIX requirements around uniqueness of inode numbers, something that is common for snapshot directories. This patch just ensures that we skip '.' and '..' rather than allowing a match. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Acked-by: Amir Goldstein Link: https://lore.kernel.org/linux-nfs/CAOQ4uxiOZobN76OKB-VBNXWeFKVwLW_eK5QtthGyYzWU9mjb7Q@mail.gmail.com/ Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- fs/exportfs/expfs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 3ae0154c5680..dcf7d86c2ce4 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -244,6 +244,16 @@ struct getdents_callback { int sequence; /* sequence counter */ }; +/* Copied from lookup_one_common() */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return true; + } + return false; +} + /* * A rather strange filldir function to capture * the name matching the specified inode number. @@ -255,7 +265,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len, container_of(ctx, struct getdents_callback, ctx); buf->sequence++; - if (buf->ino == ino && len <= NAME_MAX) { + if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) { memcpy(buf->name, name, len); buf->name[len] = '\0'; buf->found = 1; From 42c3732fa8073717dd7d924472f1c0bc5b452fdc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 30 Dec 2023 19:46:00 -0500 Subject: [PATCH 13/70] fs: Create a generic is_dot_dotdot() utility De-duplicate the same functionality in several places by hoisting the is_dot_dotdot() utility function into linux/fs.h. Suggested-by: Amir Goldstein Reviewed-by: Jeff Layton Reviewed-by: Amir Goldstein Acked-by: Christian Brauner Signed-off-by: Chuck Lever --- fs/crypto/fname.c | 8 +------- fs/ecryptfs/crypto.c | 10 ---------- fs/exportfs/expfs.c | 10 ---------- fs/f2fs/f2fs.h | 11 ----------- fs/namei.c | 6 ++---- include/linux/fs.h | 11 +++++++++++ 6 files changed, 14 insertions(+), 42 deletions(-) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 7b3fc189593a..0ad52fbe51c9 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -74,13 +74,7 @@ struct fscrypt_nokey_name { static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; + return is_dot_dotdot(str->name, str->len); } /** diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 03bd55069d86..2fe0f3af1a08 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1949,16 +1949,6 @@ out: return rc; } -static bool is_dot_dotdot(const char *name, size_t name_size) -{ - if (name_size == 1 && name[0] == '.') - return true; - else if (name_size == 2 && name[0] == '.' && name[1] == '.') - return true; - - return false; -} - /** * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index dcf7d86c2ce4..07ea3d62b298 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -244,16 +244,6 @@ struct getdents_callback { int sequence; /* sequence counter */ }; -/* Copied from lookup_one_common() */ -static inline bool is_dot_dotdot(const char *name, size_t len) -{ - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return true; - } - return false; -} - /* * A rather strange filldir function to capture * the name matching the specified inode number. diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9043cedfa12b..322a3b8a3533 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3368,17 +3368,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } -static inline bool is_dot_dotdot(const u8 *name, size_t len) -{ - if (len == 1 && name[0] == '.') - return true; - - if (len == 2 && name[0] == '.' && name[1] == '.') - return true; - - return false; -} - static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { diff --git a/fs/namei.c b/fs/namei.c index 71c13b2990b4..2386a70667fa 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2667,10 +2667,8 @@ static int lookup_one_common(struct mnt_idmap *idmap, if (!len) return -EACCES; - if (unlikely(name[0] == '.')) { - if (len < 2 || (len == 2 && name[1] == '.')) - return -EACCES; - } + if (is_dot_dotdot(name, len)) + return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..baa64344a308 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2846,6 +2846,17 @@ extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); +/** + * is_dot_dotdot - returns true only if @name is "." or ".." + * @name: file name to check + * @len: length of file name, in bytes + */ +static inline bool is_dot_dotdot(const char *name, size_t len) +{ + return len && unlikely(name[0] == '.') && + (len == 1 || (len == 2 && name[1] == '.')); +} + #include /* needed for stackable file system support */ From bd46543d7f9a8b2c03420499b1885287e96aaf28 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 15 Jan 2024 23:27:00 +0800 Subject: [PATCH 14/70] eventfd: move 'eventfd-count' printing out of spinlock When printing eventfd->count, interrupts will be disabled and a spinlock will be obtained, competing with eventfd_write(). By moving the "eventfd-count" print out of the spinlock and merging multiple seq_printf() into one, it could improve a bit, just like timerfd_show(). Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_B0B3D2BD9861FD009E03AB18A81783322709@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Christoph Hellwig Cc: Dylan Yudaken Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: Cc: Signed-off-by: Christian Brauner --- fs/eventfd.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index 0252b71099fb..fc4d81090763 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c static void eventfd_show_fdinfo(struct seq_file *m, struct file *f) { struct eventfd_ctx *ctx = f->private_data; + __u64 cnt; spin_lock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-count: %16llx\n", - (unsigned long long)ctx->count); + cnt = ctx->count; spin_unlock_irq(&ctx->wqh.lock); - seq_printf(m, "eventfd-id: %d\n", ctx->id); - seq_printf(m, "eventfd-semaphore: %d\n", + + seq_printf(m, + "eventfd-count: %16llx\n" + "eventfd-id: %d\n" + "eventfd-semaphore: %d\n", + cnt, + ctx->id, !!(ctx->flags & EFD_SEMAPHORE)); } #endif From 2263639f96f24a121ec9f037981b81daf5a8d60a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Jan 2024 15:24:46 -0700 Subject: [PATCH 15/70] iov_iter: streamline iovec/bvec alignment iteration Rewrite the alignment checking iterators for iovec and bvec to be easier to read, and also significantly more compact in terms of generated code. This saves 270 bytes of text on x86-64 for me (with clang-18) and 224 bytes on arm64 (with gcc-13). In profiles, also saves a bit of time as well for the same workload: 0.81% -0.18% [kernel.vmlinux] [k] iov_iter_aligned_bvec 0.48% -0.09% [kernel.vmlinux] [k] iov_iter_is_aligned which is a nice side benefit as well. Signed-off-by: Jens Axboe Link: https://lore.kernel.org/r/544b31f7-6d4b-42f5-a544-1420501f081f@kernel.dk Reviewed-by: Keith Busch Signed-off-by: Christian Brauner v2: do the other half of the iterators too, as suggested by Keith. This further saves some text. --- lib/iov_iter.c | 55 +++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index e0aa6b440ca5..15f5040709c3 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -714,12 +714,11 @@ EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { + const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len > size) @@ -729,34 +728,36 @@ static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; + iov++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { - size_t size = i->count; + const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; - unsigned k; + size_t size = i->count; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; + do { + size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; - if ((unsigned long)(i->bvec[k].bv_offset + skip) & addr_mask) + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return true; } @@ -800,13 +801,12 @@ EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { + const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - const struct iovec *iov = iter_iov(i) + k; + do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; @@ -814,30 +814,31 @@ static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) len = size; res |= len; size -= len; - if (!size) - break; } - } + iov++; + skip = 0; + } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { + const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; - unsigned k; - for (k = 0; k < i->nr_segs; k++, skip = 0) { - size_t len = i->bvec[k].bv_len - skip; - res |= (unsigned long)i->bvec[k].bv_offset + skip; + do { + size_t len = bvec->bv_len - skip; + res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; + bvec++; size -= len; - if (!size) - break; - } + skip = 0; + } while (size); + return res; } From d8f899d13d72d285db43dbb9df1acaed22d8c4e7 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:55 +0800 Subject: [PATCH 16/70] fs: make the i_size_read/write helpers be smp_load_acquire/store_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In [Link] Linus mentions that acquire/release makes it clear which _particular_ memory accesses are the ordered ones, and it's unlikely to make any performance difference, so it's much better to pair up the release->acquire ordering than have a "wmb->rmb" ordering. ========================================================= update pagecache folio_mark_uptodate(folio) smp_wmb() set_bit PG_uptodate === ↑↑↑ STLR ↑↑↑ === smp_store_release(&inode->i_size, i_size) folio_test_uptodate(folio) test_bit PG_uptodate smp_rmb() === ↓↓↓ LDAR ↓↓↓ === smp_load_acquire(&inode->i_size) copy_page_to_iter() ========================================================= Calling smp_store_release() in i_size_write() ensures that the data in the page and the PG_uptodate bit are updated before the isize is updated, and calling smp_load_acquire() in i_size_read ensures that it will not read a newer isize than the data in the page. Therefore, this avoids buffered read-write inconsistencies caused by Load-Load reordering. Link: https://lore.kernel.org/r/CAHk-=wifOnmeJq+sn+2s-P46zw0SFEbw9BSCGgp2c5fYPtRPGw@mail.gmail.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-2-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6669147b9e..ebce4763b4bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -907,7 +907,8 @@ static inline loff_t i_size_read(const struct inode *inode) preempt_enable(); return i_size; #else - return inode->i_size; + /* Pairs with smp_store_release() in i_size_write() */ + return smp_load_acquire(&inode->i_size); #endif } @@ -929,7 +930,12 @@ static inline void i_size_write(struct inode *inode, loff_t i_size) inode->i_size = i_size; preempt_enable(); #else - inode->i_size = i_size; + /* + * Pairs with smp_load_acquire() in i_size_read() to ensure + * changes related to inode size (such as page contents) are + * visible before we see the changed inode size. + */ + smp_store_release(&inode->i_size, i_size); #endif } From 4b944f8ef99641d5af287c7d9df91d20ef5d3e88 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:56 +0800 Subject: [PATCH 17/70] Revert "mm/filemap: avoid buffered read/write race to read inconsistent data" This reverts commit e2c27b803bb6 ("mm/filemap: avoid buffered read/write race to read inconsistent data"). After making the i_size_read/write helpers be smp_load_acquire/store_release(), it is already guaranteed that changes to page contents are visible before we see increased inode size, so the extra smp_rmb() in filemap_read() can be removed. Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-3-libaokun1@huawei.com Signed-off-by: Christian Brauner --- mm/filemap.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 750e779c23db..a72dd2eafd5a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2608,15 +2608,6 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, goto put_folios; end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* - * Pairs with a barrier in - * block_write_end()->mark_buffer_dirty() or other page - * dirtying routines like iomap_write_end() to ensure - * changes to page contents are visible before we see - * increased inode size. - */ - smp_rmb(); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: From ad72872eb3ae634d1e4296a384baa81e85cc6acc Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 Jan 2024 22:28:57 +0800 Subject: [PATCH 18/70] asm-generic: remove extra type checking in acquire/release for non-SMP case If CONFIG_SMP is not enabled, the smp_load_acquire/smp_store_release is implemented as READ_ONCE/READ_ONCE and barrier() and type checking. READ_ONCE/READ_ONCE already checks the pointer type, and then checks it more stringently outside, but the non-SMP case simply isn't relevant, so remove the extra compiletime_assert_atomic_type() to avoid compilation errors. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202401230837.TXro0PHi-lkp@intel.com/ Suggested-by: Linus Torvalds Signed-off-by: Baokun Li Link: https://lore.kernel.org/r/20240124142857.4146716-4-libaokun1@huawei.com Signed-off-by: Christian Brauner --- include/asm-generic/barrier.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 961f4d88f9ef..0c0695763bea 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -193,7 +193,6 @@ do { \ #ifndef smp_store_release #define smp_store_release(p, v) \ do { \ - compiletime_assert_atomic_type(*p); \ barrier(); \ WRITE_ONCE(*p, v); \ } while (0) @@ -203,7 +202,6 @@ do { \ #define smp_load_acquire(p) \ ({ \ __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ - compiletime_assert_atomic_type(*p); \ barrier(); \ (typeof(*p))___p1; \ }) From 85f273a6a1e492afa7309fd23fcd8cb870085f56 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Jan 2024 21:01:05 -0500 Subject: [PATCH 19/70] fs/pipe: Convert to lockdep_cmp_fn *_lock_nested() is fundamentally broken; lockdep needs to check lock ordering, but we cannot device a total ordering on an unbounded number of elements with only a few subclasses. the replacement is to define lock ordering with a proper comparison function. fs/pipe.c was already doing everything correctly otherwise, nothing much changes here. Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Signed-off-by: Kent Overstreet Link: https://lore.kernel.org/r/20240127020111.487218-2-kent.overstreet@linux.dev Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/pipe.c | 81 +++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 45 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index f1adbfe743d4..50c8a8596b52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul 2002-05-09 */ -static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) +#define cmp_int(l, r) ((l > r) - (l < r)) + +#ifdef CONFIG_PROVE_LOCKING +static int pipe_lock_cmp_fn(const struct lockdep_map *a, + const struct lockdep_map *b) { - if (pipe->files) - mutex_lock_nested(&pipe->mutex, subclass); + return cmp_int((unsigned long) a, (unsigned long) b); } +#endif void pipe_lock(struct pipe_inode_info *pipe) { - /* - * pipe_lock() nests non-pipe inode locks (for writing to a file) - */ - pipe_lock_nested(pipe, I_MUTEX_PARENT); + if (pipe->files) + mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); @@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe) } EXPORT_SYMBOL(pipe_unlock); -static inline void __pipe_lock(struct pipe_inode_info *pipe) -{ - mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); -} - -static inline void __pipe_unlock(struct pipe_inode_info *pipe) -{ - mutex_unlock(&pipe->mutex); -} - void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); - if (pipe1 < pipe2) { - pipe_lock_nested(pipe1, I_MUTEX_PARENT); - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - } else { - pipe_lock_nested(pipe2, I_MUTEX_PARENT); - pipe_lock_nested(pipe1, I_MUTEX_CHILD); - } + if (pipe1 > pipe2) + swap(pipe1, pipe2); + + pipe_lock(pipe1); + pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, @@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) return 0; ret = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started @@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) ret = -EAGAIN; break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * We only get here if we didn't actually read anything. @@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); @@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) if (unlikely(total_len == 0)) return 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); @@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we @@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; @@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count += pipe->bufs[tail & mask].len; tail++; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } @@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) @@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; @@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on) struct pipe_inode_info *pipe = filp->private_data; int retval = 0; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { @@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return retval; } @@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void) pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); + lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } @@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp) filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); @@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return 0; err_rd: @@ -1230,7 +1221,7 @@ err_wr: goto err; err: - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; @@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) if (!pipe) return -EBADF; - __pipe_lock(pipe); + mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: @@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) break; } - __pipe_unlock(pipe); + mutex_unlock(&pipe->mutex); return ret; } From f123dc86388cb669c3d6322702dc441abc35c31e Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 10 Apr 2023 21:04:50 +0900 Subject: [PATCH 20/70] sysv: don't call sb_bread() with pointers_lock held syzbot is reporting sleep in atomic context in SysV filesystem [1], for sb_bread() is called with rw_spinlock held. A "write_lock(&pointers_lock) => read_lock(&pointers_lock) deadlock" bug and a "sb_bread() with write_lock(&pointers_lock)" bug were introduced by "Replace BKL for chain locking with sysvfs-private rwlock" in Linux 2.5.12. Then, "[PATCH] err1-40: sysvfs locking fix" in Linux 2.6.8 fixed the former bug by moving pointers_lock lock to the callers, but instead introduced a "sb_bread() with read_lock(&pointers_lock)" bug (which made this problem easier to hit). Al Viro suggested that why not to do like get_branch()/get_block()/ find_shared() in Minix filesystem does. And doing like that is almost a revert of "[PATCH] err1-40: sysvfs locking fix" except that get_branch() from with find_shared() is called without write_lock(&pointers_lock). Reported-by: syzbot Link: https://syzkaller.appspot.com/bug?extid=69b40dc5fd40f32c199f Suggested-by: Al Viro Signed-off-by: Tetsuo Handa Link: https://lore.kernel.org/r/0d195f93-a22a-49a2-0020-103534d6f7f6@I-love.SAKURA.ne.jp Signed-off-by: Christian Brauner --- fs/sysv/itree.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 410ab2a44d2f..19bcb51a2203 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh) return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); } -/* - * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock) - */ static Indirect *get_branch(struct inode *inode, int depth, int offsets[], @@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode, bh = sb_bread(sb, block); if (!bh) goto failure; + read_lock(&pointers_lock); if (!verify_chain(chain, p)) goto changed; add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); + read_unlock(&pointers_lock); if (!p->key) goto no_block; } return NULL; changed: + read_unlock(&pointers_lock); brelse(bh); *err = -EAGAIN; goto no_block; @@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b goto out; reread: - read_lock(&pointers_lock); partial = get_branch(inode, depth, offsets, chain, &err); - read_unlock(&pointers_lock); /* Simplest case - block found, no allocation needed */ if (!partial) { @@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode, *top = 0; for (k = depth; k > 1 && !offsets[k-1]; k--) ; + partial = get_branch(inode, k, offsets, chain, &err); write_lock(&pointers_lock); - partial = get_branch(inode, k, offsets, chain, &err); if (!partial) partial = chain + k-1; /* From cc47a057e79613b7af0110837ff082caf8895c9e Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 29 Jan 2024 19:00:23 +0100 Subject: [PATCH 21/70] ntfs3: use file_mnt_idmap helper Let's use file_mnt_idmap() as we do that across the tree. No functional impact. Cc: Christian Brauner Cc: Konstantin Komarov Cc: Cc: Cc: Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20240129180024.219766-1-aleksandr.mikhalitsyn@canonical.com Signed-off-by: Christian Brauner --- fs/ntfs3/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ee3093be5170..144aa80cca43 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -419,7 +419,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, * fnd contains tree's path to insert to. * If fnd is not NULL then dir is locked. */ - inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni, + inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni, mode, 0, NULL, 0, fnd); err = IS_ERR(inode) ? PTR_ERR(inode) : finish_open(file, dentry, ntfs_file_open); From bd8c239c0502e70c92cf9496846bcbec7cd5702f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:37:29 -0800 Subject: [PATCH 22/70] iov_iter: Avoid wrap-around instrumentation in copy_compat_iovec_from_user() The loop counter "i" in copy_compat_iovec_from_user() is an int, but because the nr_segs argument is unsigned long, the signed overflow sanitizer got worried "i" could wrap around. Instead of making "i" an unsigned long (which may enlarge the type size), switch both nr_segs and i to u32. There is no truncation with nr_segs since it is never larger than UIO_MAXIOV anyway. This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: copy_compat_iovec_from_user+0xa9: call to __ubsan_handle_add_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129183729.work.991-kees@kernel.org Signed-off-by: Christian Brauner --- lib/iov_iter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 15f5040709c3..73715d10c812 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1167,11 +1167,12 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, - const struct iovec __user *uvec, unsigned long nr_segs) + const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; - int ret = -EFAULT, i; + int ret = -EFAULT; + u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; From c67ef897fe08effad98f0c7fb9223fa1f771d09e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 29 Jan 2024 10:40:15 -0800 Subject: [PATCH 23/70] select: Avoid wrap-around instrumentation in do_sys_poll() The mix of int, unsigned int, and unsigned long used by struct poll_list::len, todo, len, and j meant that the signed overflow sanitizer got worried it needed to instrument several places where arithmetic happens between these variables. Since all of the variables are always positive and bounded by unsigned int, use a single type in all places. Additionally expand the zero-test into an explicit range check before updating "todo". This keeps sanitizer instrumentation[1] out of a UACCESS path: vmlinux.o: warning: objtool: do_sys_poll+0x285: call to __ubsan_handle_sub_overflow() with UACCESS enabled Link: https://github.com/KSPP/linux/issues/26 [1] Cc: Christian Brauner Cc: Alexander Viro Cc: Jan Kara Cc: Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20240129184014.work.593-kees@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c..11a3b1312abe 100644 --- a/fs/select.c +++ b/fs/select.c @@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; - int len; + unsigned int len; struct pollfd entries[]; }; @@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; - int err = -EFAULT, fdcount, len; + int err = -EFAULT, fdcount; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; - unsigned long todo = nfds; + unsigned int todo = nfds; + unsigned int len; if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; @@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, sizeof(struct pollfd) * walk->len)) goto out_fds; - todo -= walk->len; - if (!todo) + if (walk->len >= todo) break; + todo -= walk->len; len = min(todo, POLLFD_PER_PAGE); walk = walk->next = kmalloc(struct_size(walk, entries, len), @@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; - int j; + unsigned int j; for (j = walk->len; j; fds++, ufds++, j--) unsafe_put_user(fds->revents, &ufds->revents, Efault); From 617fc7775370324d0a2b888625b042221ebdcb62 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Wed, 31 Jan 2024 15:09:41 +0800 Subject: [PATCH 24/70] fs: Use KMEM_CACHE instead of kmem_cache_create commit 0a31bd5f2bbb ("KMEM_CACHE(): simplify slab cache creation") introduces a new macro. Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240131070941.135178-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/backing-file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/backing-file.c b/fs/backing-file.c index a681f38d84d8..740185198db3 100644 --- a/fs/backing-file.c +++ b/fs/backing-file.c @@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap); static int __init backing_aio_init(void) { - backing_aio_cachep = kmem_cache_create("backing_aio", - sizeof(struct backing_aio), - 0, SLAB_HWCACHE_ALIGN, NULL); + backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN); if (!backing_aio_cachep) return -ENOMEM; From ce51bf1790c4972125b19c22dbfa76d7e0136280 Mon Sep 17 00:00:00 2001 From: Kunwu Chan Date: Thu, 1 Feb 2024 17:34:26 +0800 Subject: [PATCH 25/70] mbcache: Simplify the allocation of slab caches Use the new KMEM_CACHE() macro instead of direct kmem_cache_create to simplify the creation of SLAB caches. Signed-off-by: Kunwu Chan Link: https://lore.kernel.org/r/20240201093426.207932-1-chentao@kylinos.cn Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/mbcache.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/mbcache.c b/fs/mbcache.c index 82aa7a35db26..fe2624e17253 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -426,9 +426,8 @@ EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { - mb_entry_cache = kmem_cache_create("mbcache", - sizeof(struct mb_cache_entry), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + mb_entry_cache = KMEM_CACHE(mb_cache_entry, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD); if (!mb_entry_cache) return -ENOMEM; return 0; From 57c6906778f55deba6d3f3af00284e3541bb9950 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Fri, 2 Feb 2024 15:20:42 +0800 Subject: [PATCH 26/70] __fs_parse: Correct a documentation comment Commit 7f5d38141e30 ("new primitive: __fs_parse()") taking p_log instead of fs_context. So, update that comment to refer to p_log instead Signed-off-by: Chen Hanxiao Link: https://lore.kernel.org/r/20240202072042.906-1-chenhx.fnst@fujitsu.com Signed-off-by: Christian Brauner --- fs/fs_parser.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fs_parser.c b/fs/fs_parser.c index edb3712dcfa5..a4d6ca0b8971 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key( } /* - * fs_parse - Parse a filesystem configuration parameter - * @fc: The filesystem context to log errors through. + * __fs_parse - Parse a filesystem configuration parameter + * @log: The filesystem context to log errors through. * @desc: The parameter description to use. * @param: The parameter. * @result: Where to place the result of the parse From d3b1a9a778e1a014c5331d1e8d4863fd999eb0b5 Mon Sep 17 00:00:00 2001 From: JonasZhou Date: Fri, 2 Feb 2024 16:33:04 +0800 Subject: [PATCH 27/70] fs/address_space: move i_mmap_rwsem to mitigate a false sharing with i_mmap. In the struct address_space, there is a 32-byte gap between i_mmap and i_mmap_rwsem. Due to the alignment of struct address_space variables to 8 bytes, in certain situations, i_mmap and i_mmap_rwsem may end up in the same CACHE line. While running Unixbench/execl, we observe high false sharing issues when accessing i_mmap against i_mmap_rwsem. We move i_mmap_rwsem after i_private_list, ensuring a 64-byte gap between i_mmap and i_mmap_rwsem. For Intel Silver machines (2 sockets) using kernel v6.8 rc-2, the score of Unixbench/execl improves by ~3.94%, and the score of Unixbench/shell improves by ~3.26%. Baseline: ------------------------------------------------------------- 162 546 748 11374 21 0xffff92e266af90c0 ------------------------------------------------------------- 46.89% 44.65% 0.00% 0.00% 0x0 1 1 0xffffffff86d5fb96 460 258 271 1069 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 4.21% 4.41% 0.00% 0.00% 0x4 1 1 0xffffffff86d0ed54 473 311 288 95 28 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 0.04% 4.76% 0x8 1 1 0xffffffff86d4bcf1 0 0 0 5 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 6.41% 6.02% 0.00% 0.00% 0x8 1 1 0xffffffff86d4ba85 411 271 339 210 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 0.47% 95.24% 0x10 1 1 0xffffffff86d4bd34 0 0 0 74 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 0.37% 0.13% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4f 328 212 380 7 5 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 5.13% 5.08% 0.00% 0.00% 0x10 1 1 0xffffffff86d4bb4b 416 255 357 197 32 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 1.10% 0.53% 0.00% 0.00% 0x28 1 1 0xffffffff86e06eb8 395 228 351 24 14 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 1.10% 2.14% 57.07% 0.00% 0x38 1 1 0xffffffff878c9225 1364 792 462 7003 32 [k] down_write [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.00% 0.01% 0.00% 0x38 1 1 0xffffffff878c8e75 0 0 252 3 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 0.00% 0.13% 0.00% 0.00% 0x38 1 1 0xffffffff878c8e23 0 596 63 2 2 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 2.38% 2.94% 6.53% 0.00% 0x38 1 1 0xffffffff878c8ccb 1150 818 570 1197 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:109 0 1 30.59% 32.22% 0.00% 0.00% 0x38 1 1 0xffffffff878c8cb4 423 251 380 648 32 [k] rwsem_down_write_slowpath [kernel.vmlinux] atomic64_64.h:15 0 1 1.83% 1.74% 35.88% 0.00% 0x38 1 1 0xffffffff86b4f833 1217 1112 565 4586 32 [k] up_write [kernel.vmlinux] atomic64_64.h:91 0 1 with this change: ------------------------------------------------------------- 360 12 300 57 35 0xffff982cdae76400 ------------------------------------------------------------- 50.00% 59.67% 0.00% 0.00% 0x0 1 1 0xffffffff8215fb86 352 200 191 558 32 [k] __handle_mm_fault [kernel.vmlinux] memory.c:2940 0 1 8.33% 5.00% 0.00% 0.00% 0x4 1 1 0xffffffff8210ed44 370 284 263 42 24 [k] filemap_read [kernel.vmlinux] atomic.h:23 0 1 0.00% 0.00% 5.26% 2.86% 0x8 1 1 0xffffffff8214bce1 0 0 0 4 4 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:204 0 1 33.33% 14.33% 0.00% 0.00% 0x8 1 1 0xffffffff8214ba75 344 186 219 140 32 [k] vma_interval_tree_insert [kernel.vmlinux] interval_tree.c:23 0 1 0.00% 0.00% 94.74% 97.14% 0x10 1 1 0xffffffff8214bd24 0 0 0 88 29 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:339 0 1 8.33% 20.00% 0.00% 0.00% 0x10 1 1 0xffffffff8214bb3b 296 209 226 167 31 [k] vma_interval_tree_remove [kernel.vmlinux] rbtree_augmented.h:338 0 1 0.00% 0.67% 0.00% 0.00% 0x28 1 1 0xffffffff82206f45 0 140 334 4 3 [k] do_dentry_open [kernel.vmlinux] open.c:966 0 1 0.00% 0.33% 0.00% 0.00% 0x38 1 1 0xffffffff8250a6c4 0 286 126 5 5 [k] errseq_sample [kernel.vmlinux] errseq.c:125 0 Signed-off-by: JonasZhou Link: https://lore.kernel.org/r/20240202083304.10995-1-JonasZhou-oc@zhaoxin.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ebce4763b4bb..9efd6220b7c6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -482,10 +482,10 @@ struct address_space { pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; - struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; + struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* From e6f7958042a7b1dc9a4dfc19fca74217bc0c4865 Mon Sep 17 00:00:00 2001 From: Huang Xiaojia Date: Tue, 6 Feb 2024 09:43:53 +0800 Subject: [PATCH 28/70] epoll: Remove ep_scan_ready_list() in comments Since commit 443f1a042233 ("lift the calls of ep_send_events_proc() into the callers"), ep_scan_ready_list() has been removed. But there are still several in comments. All of them should be replaced with other caller functions. Signed-off-by: Huang Xiaojia Link: https://lore.kernel.org/r/20240206014353.4191262-1-huangxiaojia2@huawei.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventpoll.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 3534d36a1474..786e023a48b2 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -206,7 +206,7 @@ struct eventpoll { */ struct epitem *ovflist; - /* wakeup_source used when ep_scan_ready_list is running */ + /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ @@ -1153,7 +1153,7 @@ static inline bool chain_epi_lockless(struct epitem *epi) * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_scan_ready_list(), which stops all list modifications and guarantees + * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called @@ -1751,7 +1751,7 @@ static int ep_send_events(struct eventpoll *ep, * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by - * ep_scan_ready_list() holding "mtx" and the + * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1904,7 +1904,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, __set_current_state(TASK_INTERRUPTIBLE); /* - * Do the final check under the lock. ep_scan_ready_list() + * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is From 3058fca1ed7955c904584a6d86108d664a927177 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:31 +0200 Subject: [PATCH 29/70] fs: make file_dentry() a simple accessor file_dentry() is a relic from the days that overlayfs was using files with a "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. In those days, file_dentry() was needed to get the underlying fs dentry that matches f_inode. Files with "fake" path should not exist nowadays, so make file_dentry() a simple accessor and use an assertion to make sure that file_dentry() was not papering over filesystem bugs. Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-2-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9efd6220b7c6..2e07cbbf92e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1084,9 +1084,20 @@ static inline struct inode *file_inode(const struct file *f) return f->f_inode; } +/* + * file_dentry() is a relic from the days that overlayfs was using files with a + * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. + * In those days, file_dentry() was needed to get the underlying fs dentry that + * matches f_inode. + * Files with "fake" path should not exist nowadays, so use an assertion to make + * sure that file_dentry() was not papering over filesystem bugs. + */ static inline struct dentry *file_dentry(const struct file *file) { - return d_real(file->f_path.dentry, file_inode(file)); + struct dentry *dentry = file->f_path.dentry; + + WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); + return dentry; } struct fasync_struct { From 11b3f8ae7081607a783d60e8098d46b787f79cad Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Fri, 2 Feb 2024 13:01:32 +0200 Subject: [PATCH 30/70] fs: remove the inode argument to ->d_real() method The only remaining user of ->d_real() method is d_real_inode(), which passed NULL inode argument to get the real data dentry. There are no longer any users that call ->d_real() with a non-NULL inode argument for getting a detry from a specific underlying layer. Remove the inode argument of the method and replace it with an integer 'type' argument, to allow callers to request the real metadata dentry instead of the real data dentry. All the current users of d_real_inode() (e.g. uprobe) continue to get the real data inode. Caller that need to get the real metadata inode (e.g. IMA/EVM) can use d_inode(d_real(dentry, D_REAL_METADATA)). Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20240202110132.1584111-3-amir73il@gmail.com Tested-by: Stefan Berger Signed-off-by: Al Viro Signed-off-by: Christian Brauner --- Documentation/filesystems/locking.rst | 2 +- Documentation/filesystems/vfs.rst | 16 ++++----- fs/overlayfs/super.c | 52 ++++++++++++--------------- include/linux/dcache.h | 18 ++++++---- 4 files changed, 41 insertions(+), 47 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index d5bf4b6b7509..e664061ed55d 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -29,7 +29,7 @@ prototypes:: char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); locking rules: diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index eebcc0f9e2bc..6e903a903f8f 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -1264,7 +1264,7 @@ defined: char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); }; ``d_revalidate`` @@ -1419,16 +1419,14 @@ defined: the dentry being transited from. ``d_real`` - overlay/union type filesystems implement this method to return - one of the underlying dentries hidden by the overlay. It is - used in two different modes: + overlay/union type filesystems implement this method to return one + of the underlying dentries of a regular file hidden by the overlay. - Called from file_dentry() it returns the real dentry matching - the inode argument. The real dentry may be from a lower layer - already copied up, but still referenced from the file. This - mode is selected with a non-NULL inode argument. + The 'type' argument takes the values D_REAL_DATA or D_REAL_METADATA + for returning the real underlying dentry that refers to the inode + hosting the file's data or metadata respectively. - With NULL inode the topmost real underlying dentry is returned. + For non-regular files, the 'dentry' argument is returned. Each dentry has a pointer to its parent dentry, as well as a hash list of child dentries. Child dentries are basically like files in a diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 4ab66e3d4cff..df2ad2f60798 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -28,41 +28,38 @@ MODULE_LICENSE("GPL"); struct ovl_dir_cache; -static struct dentry *ovl_d_real(struct dentry *dentry, - const struct inode *inode) +static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type) { - struct dentry *real = NULL, *lower; + struct dentry *upper, *lower; int err; - /* - * vfs is only expected to call d_real() with NULL from d_real_inode() - * and with overlay inode from file_dentry() on an overlay file. - * - * TODO: remove @inode argument from d_real() API, remove code in this - * function that deals with non-NULL @inode and remove d_real() call - * from file_dentry(). - */ - if (inode && d_inode(dentry) == inode) - return dentry; - else if (inode) + switch (type) { + case D_REAL_DATA: + case D_REAL_METADATA: + break; + default: goto bug; + } if (!d_is_reg(dentry)) { /* d_real_inode() is only relevant for regular files */ return dentry; } - real = ovl_dentry_upper(dentry); - if (real && (inode == d_inode(real))) - return real; + upper = ovl_dentry_upper(dentry); + if (upper && (type == D_REAL_METADATA || + ovl_has_upperdata(d_inode(dentry)))) + return upper; - if (real && !inode && ovl_has_upperdata(d_inode(dentry))) - return real; + if (type == D_REAL_METADATA) { + lower = ovl_dentry_lower(dentry); + goto real_lower; + } /* - * Best effort lazy lookup of lowerdata for !inode case to return + * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return * the real lowerdata dentry. The only current caller of d_real() with - * NULL inode is d_real_inode() from trace_uprobe and this caller is + * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is * likely going to be followed reading from the file, before placing * uprobes on offset within the file, so lowerdata should be available * when setting the uprobe. @@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry, lower = ovl_dentry_lowerdata(dentry); if (!lower) goto bug; - real = lower; - /* Handle recursion */ - real = d_real(real, inode); +real_lower: + /* Handle recursion into stacked lower fs */ + return d_real(lower, type); - if (!inode || inode == d_inode(real)) - return real; bug: - WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n", - __func__, dentry, inode ? inode->i_sb->s_id : "NULL", - inode ? inode->i_ino : 0, real, - real && d_inode(real) ? d_inode(real)->i_ino : 0); + WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type); return dentry; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1666c387861f..d616a745a34c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -125,6 +125,11 @@ enum dentry_d_lock_class DENTRY_D_LOCK_NESTED }; +enum d_real_type { + D_REAL_DATA, + D_REAL_METADATA, +}; + struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); @@ -139,7 +144,7 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); - struct dentry *(*d_real)(struct dentry *, const struct inode *); + struct dentry *(*d_real)(struct dentry *, enum d_real_type type); } ____cacheline_aligned; /* @@ -546,24 +551,23 @@ static inline struct inode *d_backing_inode(const struct dentry *upper) /** * d_real - Return the real dentry * @dentry: the dentry to query - * @inode: inode to select the dentry from multiple layers (can be NULL) + * @type: the type of real dentry (data or metadata) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ -static inline struct dentry *d_real(struct dentry *dentry, - const struct inode *inode) +static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) - return dentry->d_op->d_real(dentry, inode); + return dentry->d_op->d_real(dentry, type); else return dentry; } /** - * d_real_inode - Return the real inode + * d_real_inode - Return the real inode hosting the data * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. @@ -572,7 +576,7 @@ static inline struct dentry *d_real(struct dentry *dentry, static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ - return d_backing_inode(d_real((struct dentry *) dentry, NULL)); + return d_inode(d_real((struct dentry *) dentry, D_REAL_DATA)); } struct name_snapshot { From d31563b5f9bb601a805c4a1b491edf69ada79688 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Wed, 7 Feb 2024 00:35:18 +0800 Subject: [PATCH 31/70] eventfd: strictly check the count parameter of eventfd_write to avoid inputting illegal strings Since eventfd's document has clearly stated: A write(2) call adds the 8-byte integer value supplied in its buffer to the counter. However, in the current implementation, the following code snippet did not cause an error: char str[16] = "hello world"; uint64_t value; ssize_t size; int fd; fd = eventfd(0, 0); size = write(fd, &str, strlen(str)); printf("eventfd: test writing a string, size=%ld\n", size); size = read(fd, &value, sizeof(value)); printf("eventfd: test reading as uint64, size=%ld, valus=0x%lX\n", size, value); close(fd); And its output is: eventfd: test writing a string, size=8 eventfd: test reading as uint64, size=8, valus=0x6F77206F6C6C6568 By checking whether count is equal to sizeof(ucnt), such errors could be detected. It also follows the requirements of the manual. Signed-off-by: Wen Yang Link: https://lore.kernel.org/r/tencent_10AAA44731FFFA493F9F5501521F07DD4D0A@qq.com Cc: Alexander Viro Cc: Jens Axboe Cc: Christian Brauner Cc: Jan Kara Cc: David Woodhouse Cc: Matthew Wilcox Cc: Eric Biggers Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/eventfd.c b/fs/eventfd.c index fc4d81090763..9afdb722fa92 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c ssize_t res; __u64 ucnt; - if (count < sizeof(ucnt)) + if (count != sizeof(ucnt)) return -EINVAL; if (copy_from_user(&ucnt, buf, sizeof(ucnt))) return -EFAULT; From dacfd001eaf252a81537aece602ae2fc16e6ebd5 Mon Sep 17 00:00:00 2001 From: Taylor Jackson Date: Thu, 8 Feb 2024 03:02:54 +0000 Subject: [PATCH 32/70] fs/mnt_idmapping.c: Return -EINVAL when no map is written MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, it is possible to create an idmapped mount using a user namespace without any mappings. However, this yields an idmapped mount that doesn't actually map the ids. With the following change, it will no longer be possible to create an idmapped mount when using a user namespace with no mappings, and will instead return EINVAL, an “invalid argument” error code. Reviewed-by: Christian Brauner Signed-off-by: Taylor Jackson Link: https://lore.kernel.org/r/20240208-mnt-idmap-inval-v2-1-58ef26d194e0@me.com Signed-off-by: Christian Brauner --- fs/mnt_idmapping.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 64c5205e2b5e..3c60f1eaca61 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from, * anything at all. */ if (nr_extents == 0) - return 0; + return -EINVAL; /* * Here we know that nr_extents is greater than zero which means From ef560389ca50e33bd0c4531236c187107fa81e1f Mon Sep 17 00:00:00 2001 From: Vincenzo Mezzela Date: Thu, 8 Feb 2024 17:20:32 +0100 Subject: [PATCH 33/70] docs: filesystems: fix typo in docs This patch resolves a spelling error in the filesystem documentation. It is submitted as part of my application to the "Linux Kernel Bug Fixing Spring Unpaid 2024" mentorship program of the Linux Kernel Foundation. Signed-off-by: Vincenzo Mezzela Link: https://lore.kernel.org/r/20240208162032.109184-1-vincenzo.mezzela@gmail.com Signed-off-by: Christian Brauner --- Documentation/filesystems/files.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index 9e38e4c221ca..eb770f891b27 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -116,7 +116,7 @@ before and after the reference count increment. This pattern can be seen in get_file_rcu() and __files_get_rcu(). In addition, it isn't possible to access or check fields in struct file -without first aqcuiring a reference on it under rcu lookup. Not doing +without first acquiring a reference on it under rcu lookup. Not doing that was always very dodgy and it was only usable for non-pointer data in struct file. With SLAB_TYPESAFE_BY_RCU it is necessary that callers either first acquire a reference or they must hold the files_lock of the From d0089603fa7a22a940f808a7cbc49402fe02281e Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Fri, 9 Feb 2024 15:52:19 +0300 Subject: [PATCH 34/70] fs: prefer kfree_rcu() in fasync_remove_entry() In 'fasync_remove_entry()', prefer 'kfree_rcu()' over 'call_rcu()' with dummy 'fasync_free_rcu()' callback. This is mostly intended in attempt to fix weird https://syzkaller.appspot.com/bug?id=6a64ad907e361e49e92d1c4c114128a1bda2ed7f, where kmemleak may consider 'fa' as unreferenced during RCU grace period. See https://lore.kernel.org/stable/20230930174657.800551-1-joel@joelfernandes.org as well. Comments are highly appreciated. Ever since ae65a5211d90 ("mm/slab: document kfree() as allowed for kmem_cache_alloc() objects") kfree() can be used for both kmalloc() and kmem_cache_alloc() so this is no safe. Do not backport this to stable, please. Link ae65a5211d90 ("mm/slab: document kfree() as > allowed for kmem_cache_alloc() objects") Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20240209125220.330383-1-dmantipov@yandex.ru Signed-off-by: Christian Brauner --- fs/fcntl.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index c80a6acad742..c3e342eb74af 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -846,12 +846,6 @@ int send_sigurg(struct fown_struct *fown) static DEFINE_SPINLOCK(fasync_lock); static struct kmem_cache *fasync_cache __ro_after_init; -static void fasync_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(fasync_cache, - container_of(head, struct fasync_struct, fa_rcu)); -} - /* * Remove a fasync entry. If successfully removed, return * positive and clear the FASYNC flag. If no entry exists, @@ -877,7 +871,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) write_unlock_irq(&fa->fa_lock); *fp = fa->fa_next; - call_rcu(&fa->fa_rcu, fasync_free_rcu); + kfree_rcu(fa, fa_rcu); filp->f_flags &= ~FASYNC; result = 1; break; From cf12445daec01aaa2d27bb34bd7c796a53442c42 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 9 Feb 2024 21:06:06 -0800 Subject: [PATCH 35/70] fs/hfsplus: use better @opf description Use a more descriptive explanation of the @opf function parameter, more in line with . Fixes: 02105f18a26c ("fs/hfsplus: wrapper.c: fix kernel-doc warnings") Suggested-by: Bart Van Assche Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20240210050606.9182-1-rdunlap@infradead.org Reviewed-by: Bart Van Assche Cc: Alexander Viro Cc: Christian Brauner Cc: Jens Axboe Signed-off-by: Christian Brauner --- fs/hfsplus/wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index b0cb70400996..ce9346099c72 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -30,7 +30,7 @@ struct hfsplus_wd { * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes * @buf: buffer for I/O * @data: output pointer for location of requested data - * @opf: request op flags + * @opf: I/O operation type and flags * * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads From ddb9fd7a544088ed70eccbb9f85e9cc9952131c1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Feb 2024 21:23:34 +0100 Subject: [PATCH 36/70] fs/select: rework stack allocation hack for clang A while ago, we changed the way that select() and poll() preallocate a temporary buffer just under the size of the static warning limit of 1024 bytes, as clang was frequently going slightly above that limit. The warnings have recently returned and I took another look. As it turns out, clang is not actually inherently worse at reserving stack space, it just happens to inline do_select() into core_sys_select(), while gcc never inlines it. Annotate do_select() to never be inlined and in turn remove the special case for the allocation size. This should give the same behavior for both clang and gcc all the time and once more avoids those warnings. Fixes: ad312f95d41c ("fs/select: avoid clang stack usage warning") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20240216202352.2492798-1-arnd@kernel.org Reviewed-by: Kees Cook Reviewed-by: Andi Kleen Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/select.c | 2 +- include/linux/poll.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/select.c b/fs/select.c index 11a3b1312abe..9515c3fa1a03 100644 --- a/fs/select.c +++ b/fs/select.c @@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; diff --git a/include/linux/poll.h b/include/linux/poll.h index a9e0e1c2d1f2..d1ea4f3714a8 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -14,11 +14,7 @@ /* ~832 bytes of stack space used max in sys_select/sys_poll before allocating additional memory. */ -#ifdef __clang__ -#define MAX_STACK_ALLOC 768 -#else #define MAX_STACK_ALLOC 832 -#endif #define FRONTEND_STACK_ALLOC 256 #define SELECT_STACK_ALLOC FRONTEND_STACK_ALLOC #define POLL_STACK_ALLOC FRONTEND_STACK_ALLOC From 39a6c668e4e78c3bc262c24d9aabd0a49027f948 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 19 Feb 2024 18:33:18 -0600 Subject: [PATCH 37/70] efs: convert efs to use the new mount api Convert the efs filesystem to use the new mount API. Signed-off-by: Bill O'Donnell Link: https://lore.kernel.org/r/20240220003318.166143-1-bodonnel@redhat.com Signed-off-by: Christian Brauner --- fs/efs/super.c | 114 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/fs/efs/super.c b/fs/efs/super.c index f17fdac76b2e..c837ac89b384 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -14,19 +14,14 @@ #include #include #include - +#include +#include #include "efs.h" #include #include static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); -static int efs_fill_super(struct super_block *s, void *d, int silent); - -static struct dentry *efs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super); -} +static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { @@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s) kfree(sbi); } -static struct file_system_type efs_fs_type = { - .owner = THIS_MODULE, - .name = "efs", - .mount = efs_mount, - .kill_sb = efs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("efs"); - static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, @@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = { {0, NULL} }; +enum { + Opt_explicit_open, +}; + +static const struct fs_parameter_spec efs_param_spec[] = { + fsparam_flag ("explicit-open", Opt_explicit_open), + {} +}; + +/* + * File system definition and registration. + */ +static struct file_system_type efs_fs_type = { + .owner = THIS_MODULE, + .name = "efs", + .kill_sb = efs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = efs_init_fs_context, + .parameters = efs_param_spec, +}; +MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; @@ -108,18 +115,10 @@ static void destroy_inodecache(void) kmem_cache_destroy(efs_inode_cachep); } -static int efs_remount(struct super_block *sb, int *flags, char *data) -{ - sync_filesystem(sb); - *flags |= SB_RDONLY; - return 0; -} - static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, - .remount_fs = efs_remount, }; static const struct export_operations efs_export_ops = { @@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { return 0; } -static int efs_fill_super(struct super_block *s, void *d, int silent) +static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; - sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); + sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; - + s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } - + /* read the vh (volume header) block */ bh = sb_bread(s, 0); @@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) pr_err("cannot read superblock\n"); return -EIO; } - + if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", @@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) return 0; } +static void efs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static int efs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, efs_fill_super); +} + +static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + int token; + struct fs_parse_result result; + + token = fs_parse(fc, efs_param_spec, param, &result); + if (token < 0) + return token; + return 0; +} + +static int efs_reconfigure(struct fs_context *fc) +{ + sync_filesystem(fc->root->d_sb); + + return 0; +} + +struct efs_context { + unsigned long s_mount_opts; +}; + +static const struct fs_context_operations efs_context_opts = { + .parse_param = efs_parse_param, + .get_tree = efs_get_tree, + .reconfigure = efs_reconfigure, + .free = efs_free_fc, +}; + +/* + * Set up the filesystem mount context. + */ +static int efs_init_fs_context(struct fs_context *fc) +{ + struct efs_context *ctx; + + ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fc->fs_private = ctx; + fc->ops = &efs_context_opts; + + return 0; +} + static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); From bae8bc46987ed8b9e8d00d0a87ac698a85d15904 Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 20 Feb 2024 14:20:30 +0800 Subject: [PATCH 38/70] =?UTF-8?q?libfs:=20Remove=20unnecessary=20=E2=80=98?= =?UTF-8?q?0=E2=80=99=20values=20from=20ret?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ret is assigned first, so it does not need to initialize the assignment. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20240220062030.114203-1-zeming@nfschina.com Signed-off-by: Christian Brauner --- fs/libfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..6fb8244b259e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1752,7 +1752,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; - int ret = 0; + int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; From 3f6d810665dfde0d33785420618ceb03fba0619d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:40 -0500 Subject: [PATCH 39/70] libfs: Re-arrange locking in offset_iterate_dir() Liam and Matthew say that once the RCU read lock is released, xa_state is not safe to re-use for the next xas_find() call. But the RCU read lock must be released on each loop iteration so that dput(), which might_sleep(), can be called safely. Thus we are forced to walk the offset tree with fresh state for each directory entry. xa_find() can do this for us, though it might be a little less efficient than maintaining xa_state locally. We believe that in the current code base, inode->i_rwsem provides protection for the xa_state maintained in offset_iterate_dir(). However, there is no guarantee that will continue to be the case in the future. Since offset_iterate_dir() doesn't build xa_state locally any more, there's no longer a strong need for offset_find_next(). Clean up by rolling these two helpers together. Suggested-by: Liam R. Howlett Message-ID: <170785993027.11135.8830043889278631735.stgit@91.116.238.104.host.secureserver.net> Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820142021.6328.15047865406275957018.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/libfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..752e24c669d9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -402,12 +402,13 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, offset, U32_MAX); } -static struct dentry *offset_find_next(struct xa_state *xas) +static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { struct dentry *child, *found = NULL; + XA_STATE(xas, &octx->xa, offset); rcu_read_lock(); - child = xas_next_entry(xas, U32_MAX); + child = xas_next_entry(&xas, U32_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -430,12 +431,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) { - struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); - XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *dentry; while (true) { - dentry = offset_find_next(&xas); + dentry = offset_find_next(octx, ctx->pos); if (!dentry) return ERR_PTR(-ENOENT); @@ -444,8 +444,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) break; } + ctx->pos = dentry2offset(dentry) + 1; dput(dentry); - ctx->pos = xas.xa_index + 1; } return NULL; } From 7beea725a8ca412c6190090ce7c3a13b169592a1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:47 -0500 Subject: [PATCH 40/70] libfs: Define a minimum directory offset This value is used in several places, so make it a symbolic constant. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820142741.6328.12428356024575347885.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- fs/libfs.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 752e24c669d9..f0045db739df 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -240,6 +240,11 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +/* 0 is '.', 1 is '..', so always start with offset 2 or more */ +enum { + DIR_OFFSET_MIN = 2, +}; + static void offset_set(struct dentry *dentry, u32 offset) { dentry->d_fsdata = (void *)((uintptr_t)(offset)); @@ -261,9 +266,7 @@ void simple_offset_init(struct offset_ctx *octx) { xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); - - /* 0 is '.', 1 is '..', so always start with offset 2 */ - octx->next_offset = 2; + octx->next_offset = DIR_OFFSET_MIN; } /** @@ -276,7 +279,7 @@ void simple_offset_init(struct offset_ctx *octx) */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); u32 offset; int ret; @@ -481,7 +484,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx) return 0; /* In this case, ->private_data is protected by f_pos_lock */ - if (ctx->pos == 2) + if (ctx->pos == DIR_OFFSET_MIN) file->private_data = NULL; else if (file->private_data == ERR_PTR(-ENOENT)) return 0; From ecba88a3b32d733d41e27973e25b2bc580f64281 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:23:54 -0500 Subject: [PATCH 41/70] libfs: Add simple_offset_empty() For simple filesystems that use directory offset mapping, rely strictly on the directory offset map to tell when a directory has no children. After this patch is applied, the emptiness test holds only the RCU read lock when the directory being tested has no children. In addition, this adds another layer of confirmation that simple_offset_add/remove() are working as expected. Reviewed-by: Jan Kara Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820143463.6328.7872919188371286951.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- fs/libfs.c | 32 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/shmem.c | 4 ++-- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index f0045db739df..f7f92a49a418 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -313,6 +313,38 @@ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) offset_set(dentry, 0); } +/** + * simple_offset_empty - Check if a dentry can be unlinked + * @dentry: dentry to be tested + * + * Returns 0 if @dentry is a non-empty directory; otherwise returns 1. + */ +int simple_offset_empty(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct offset_ctx *octx; + struct dentry *child; + unsigned long index; + int ret = 1; + + if (!inode || !S_ISDIR(inode->i_mode)) + return ret; + + index = DIR_OFFSET_MIN; + octx = inode->i_op->get_offset_ctx(inode); + xa_for_each(&octx->xa, index, child) { + spin_lock(&child->d_lock); + if (simple_positive(child)) { + spin_unlock(&child->d_lock); + ret = 0; + break; + } + spin_unlock(&child->d_lock); + } + + return ret; +} + /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved diff --git a/include/linux/fs.h b/include/linux/fs.h index ed5966a70495..03d141809a2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3267,6 +3267,7 @@ struct offset_ctx { void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); +int simple_offset_empty(struct dentry *dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, diff --git a/mm/shmem.c b/mm/shmem.c index d7c84ff62186..6fed524343cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3374,7 +3374,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) static int shmem_rmdir(struct inode *dir, struct dentry *dentry) { - if (!simple_empty(dentry)) + if (!simple_offset_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); @@ -3431,7 +3431,7 @@ static int shmem_rename2(struct mnt_idmap *idmap, return simple_offset_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - if (!simple_empty(new_dentry)) + if (!simple_offset_empty(new_dentry)) return -ENOTEMPTY; if (flags & RENAME_WHITEOUT) { From 9b6713cc75229f25552c643083cbdbfb771e5bca Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:01 -0500 Subject: [PATCH 42/70] maple_tree: Add mtree_alloc_cyclic() I need a cyclic allocator for the simple_offset implementation in fs/libfs.c. Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820144179.6328.12838600511394432325.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- include/linux/maple_tree.h | 7 +++ lib/maple_tree.c | 93 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index b3d63123b945..a53ad4dabd7e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -171,6 +171,7 @@ enum maple_type { #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 +#define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 @@ -319,6 +320,9 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); +int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); @@ -499,6 +503,9 @@ void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); +int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6f241bb38799..af0970288727 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4290,6 +4290,56 @@ exists: } +/** + * mas_alloc_cyclic() - Internal call to find somewhere to store an entry + * @mas: The maple state. + * @startp: Pointer to ID. + * @range_lo: Lower bound of range to search. + * @range_hi: Upper bound of range to search. + * @entry: The entry to store. + * @next: Pointer to next ID to allocate. + * @gfp: The GFP_FLAGS to use for allocations. + * + * Return: 0 if the allocation succeeded without wrapping, 1 if the + * allocation succeeded after wrapping, or -EBUSY if there are no + * free entries. + */ +int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp) +{ + unsigned long min = range_lo; + int ret = 0; + + range_lo = max(min, *next); + ret = mas_empty_area(mas, range_lo, range_hi, 1); + if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { + mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; + ret = 1; + } + if (ret < 0 && range_lo > min) { + ret = mas_empty_area(mas, min, range_hi, 1); + if (ret == 0) + ret = 1; + } + if (ret < 0) + return ret; + + do { + mas_insert(mas, entry); + } while (mas_nomem(mas, gfp)); + if (mas_is_err(mas)) + return xa_err(mas->node); + + *startp = mas->index; + *next = *startp + 1; + if (*next == 0) + mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + + return ret; +} +EXPORT_SYMBOL(mas_alloc_cyclic); + static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: @@ -6443,6 +6493,49 @@ unlock: } EXPORT_SYMBOL(mtree_alloc_range); +/** + * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. + * @mt: The maple tree. + * @startp: Pointer to ID. + * @range_lo: Lower bound of range to search. + * @range_hi: Upper bound of range to search. + * @entry: The entry to store. + * @next: Pointer to next ID to allocate. + * @gfp: The GFP_FLAGS to use for allocations. + * + * Finds an empty entry in @mt after @next, stores the new index into + * the @id pointer, stores the entry at that index, then updates @next. + * + * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. + * + * Context: Any context. Takes and releases the mt.lock. May sleep if + * the @gfp flags permit. + * + * Return: 0 if the allocation succeeded without wrapping, 1 if the + * allocation succeeded after wrapping, -ENOMEM if memory could not be + * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no + * free entries. + */ +int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, + void *entry, unsigned long range_lo, unsigned long range_hi, + unsigned long *next, gfp_t gfp) +{ + int ret; + + MA_STATE(mas, mt, 0, 0); + + if (!mt_is_alloc(mt)) + return -EINVAL; + if (WARN_ON_ONCE(mt_is_reserved(entry))) + return -EINVAL; + mtree_lock(mt); + ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, + next, gfp); + mtree_unlock(mt); + return ret; +} +EXPORT_SYMBOL(mtree_alloc_cyclic); + int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) From f92e1a829d64dd66fa173c6934f03817d9e68d43 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Feb 2024 15:24:08 -0500 Subject: [PATCH 43/70] test_maple_tree: testing the cyclic allocation This tests the interactions of the cyclic allocations, the maple state index and last, and overflow. Signed-off-by: Liam R. Howlett Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820144894.6328.13052830860966450674.stgit@91.116.238.104.host.secureserver.net Signed-off-by: Christian Brauner --- lib/test_maple_tree.c | 44 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 29185ac5c727..399380db449c 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -3599,6 +3599,45 @@ static noinline void __init check_state_handling(struct maple_tree *mt) mas_unlock(&mas); } +static noinline void __init alloc_cyclic_testing(struct maple_tree *mt) +{ + unsigned long location; + unsigned long next; + int ret = 0; + MA_STATE(mas, mt, 0, 0); + + next = 0; + mtree_lock(mt); + for (int i = 0; i < 100; i++) { + mas_alloc_cyclic(&mas, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MAS_BUG_ON(&mas, i != location - 2); + MAS_BUG_ON(&mas, mas.index != location); + MAS_BUG_ON(&mas, mas.last != location); + MAS_BUG_ON(&mas, i != next - 3); + } + + mtree_unlock(mt); + mtree_destroy(mt); + next = 0; + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + for (int i = 0; i < 100; i++) { + mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, i != location - 2); + MT_BUG_ON(mt, i != next - 3); + MT_BUG_ON(mt, mtree_load(mt, location) != mt); + } + + mtree_destroy(mt); + /* Overflow test */ + next = ULONG_MAX - 1; + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 0); + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 0); + ret = mtree_alloc_cyclic(mt, &location, mt, 2, ULONG_MAX, &next, GFP_KERNEL); + MT_BUG_ON(mt, ret != 1); +} + static DEFINE_MTREE(tree); static int __init maple_tree_seed(void) { @@ -3880,6 +3919,11 @@ static int __init maple_tree_seed(void) check_state_handling(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + alloc_cyclic_testing(&tree); + mtree_destroy(&tree); + + #if defined(BENCH) skip: #endif From 0e4a862174f2a8d1653a8a9cf0815020e1d3af24 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 17 Feb 2024 15:24:16 -0500 Subject: [PATCH 44/70] libfs: Convert simple directory offsets to use a Maple Tree Test robot reports: > kernel test robot noticed a -19.0% regression of aim9.disk_src.ops_per_sec on: > > commit: a2e459555c5f9da3e619b7e47a63f98574dc75f1 ("shmem: stable directory offsets") > https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master Feng Tang further clarifies that: > ... the new simple_offset_add() > called by shmem_mknod() brings extra cost related with slab, > specifically the 'radix_tree_node', which cause the regression. Willy's analysis is that, over time, the test workload causes xa_alloc_cyclic() to fragment the underlying SLAB cache. This patch replaces the offset_ctx's xarray with a Maple Tree in the hope that Maple Tree's dense node mode will handle this scenario more scalably. In addition, we can widen the simple directory offset maximum to signed long (as loff_t is also signed). Suggested-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309081306.3ecb3734-oliver.sang@intel.com Signed-off-by: Chuck Lever Link: https://lore.kernel.org/r/170820145616.6328.12620992971699079156.stgit@91.116.238.104.host.secureserver.net Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/libfs.c | 47 +++++++++++++++++++++++----------------------- include/linux/fs.h | 5 +++-- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index f7f92a49a418..d3d31197c8e4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -245,17 +245,17 @@ enum { DIR_OFFSET_MIN = 2, }; -static void offset_set(struct dentry *dentry, u32 offset) +static void offset_set(struct dentry *dentry, long offset) { - dentry->d_fsdata = (void *)((uintptr_t)(offset)); + dentry->d_fsdata = (void *)offset; } -static u32 dentry2offset(struct dentry *dentry) +static long dentry2offset(struct dentry *dentry) { - return (u32)((uintptr_t)(dentry->d_fsdata)); + return (long)dentry->d_fsdata; } -static struct lock_class_key simple_offset_xa_lock; +static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx @@ -264,8 +264,8 @@ static struct lock_class_key simple_offset_xa_lock; */ void simple_offset_init(struct offset_ctx *octx) { - xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); - lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); + lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } @@ -274,20 +274,19 @@ void simple_offset_init(struct offset_ctx *octx) * @octx: directory offset ctx to be updated * @dentry: new dentry being added * - * Returns zero on success. @so_ctx and the dentry offset are updated. + * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { - static const struct xa_limit limit = XA_LIMIT(DIR_OFFSET_MIN, U32_MAX); - u32 offset; + unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; - ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, - &octx->next_offset, GFP_KERNEL); + ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, + LONG_MAX, &octx->next_offset, GFP_KERNEL); if (ret < 0) return ret; @@ -303,13 +302,13 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { - u32 offset; + long offset; offset = dentry2offset(dentry); if (offset == 0) return; - xa_erase(&octx->xa, offset); + mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } @@ -332,7 +331,7 @@ int simple_offset_empty(struct dentry *dentry) index = DIR_OFFSET_MIN; octx = inode->i_op->get_offset_ctx(inode); - xa_for_each(&octx->xa, index, child) { + mt_for_each(&octx->mt, child, index, LONG_MAX) { spin_lock(&child->d_lock); if (simple_positive(child)) { spin_unlock(&child->d_lock); @@ -362,8 +361,8 @@ int simple_offset_rename_exchange(struct inode *old_dir, { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); - u32 old_index = dentry2offset(old_dentry); - u32 new_index = dentry2offset(new_dentry); + long old_index = dentry2offset(old_dentry); + long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); @@ -389,9 +388,9 @@ int simple_offset_rename_exchange(struct inode *old_dir, out_restore: offset_set(old_dentry, old_index); - xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL); offset_set(new_dentry, new_index); - xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL); return ret; } @@ -404,7 +403,7 @@ out_restore: */ void simple_offset_destroy(struct offset_ctx *octx) { - xa_destroy(&octx->xa); + mtree_destroy(&octx->mt); } /** @@ -434,16 +433,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) /* In this case, ->private_data is protected by f_pos_lock */ file->private_data = NULL; - return vfs_setpos(file, offset, U32_MAX); + return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset) { + MA_STATE(mas, &octx->mt, offset, offset); struct dentry *child, *found = NULL; - XA_STATE(xas, &octx->xa, offset); rcu_read_lock(); - child = xas_next_entry(&xas, U32_MAX); + child = mas_find(&mas, LONG_MAX); if (!child) goto out; spin_lock(&child->d_lock); @@ -457,8 +456,8 @@ out: static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { - u32 offset = dentry2offset(dentry); struct inode *inode = d_inode(dentry); + long offset = dentry2offset(dentry); return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 03d141809a2c..55144c12ee0f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -3260,8 +3261,8 @@ extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { - struct xarray xa; - u32 next_offset; + struct maple_tree mt; + unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); From 0611a640e60a0473883328746f3c53934cd4dc3c Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 21 Feb 2024 14:22:05 +0300 Subject: [PATCH 45/70] eventpoll: prefer kfree_rcu() in __ep_remove() In '__ep_remove()', prefer 'kfree_rcu()' over 'call_rcu()' with dummy 'epi_rcu_free()' callback. This follows commit d0089603fa7a ("fs: prefer kfree_rcu() in fasync_remove_entry()") and should not be backported to stable as well. Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20240221112205.48389-2-dmantipov@yandex.ru Signed-off-by: Christian Brauner --- fs/eventpoll.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 786e023a48b2..39ac6fdf8bca 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -678,12 +678,6 @@ static void ep_done_scan(struct eventpoll *ep, write_unlock_irq(&ep->lock); } -static void epi_rcu_free(struct rcu_head *head) -{ - struct epitem *epi = container_of(head, struct epitem, rcu); - kmem_cache_free(epi_cache, epi); -} - static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); @@ -767,7 +761,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ - call_rcu(&epi->rcu, epi_rcu_free); + kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); From 5916f439f2eb81eef98703b9a8f1dc91e3aa7129 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Feb 2024 09:40:03 -0600 Subject: [PATCH 46/70] Convert coda to use the new mount API Convert the coda filesystem to the new internal mount API as the old one will be obsoleted and removed. This allows greater flexibility in communication of mount parameters between userspace, the VFS and the filesystem. See Documentation/filesystems/mount_api.rst for more information. Note this is slightly tricky as coda currently only has a binary mount data interface. This is handled through the parse_monolithic hook. Also add a more conventional interface with a parameter named "fd" that takes an fd that refers to a coda psdev, thereby specifying the index to use. Signed-off-by: David Howells Co-developed-by: Eric Sandeen Signed-off-by: Eric Sandeen [sandeen: forward port to current upstream mount API interfaces] Link: https://lore.kernel.org/r/97650eeb-94c7-4041-b58c-90e81e76b699@redhat.com Tested-by: Jan Harkes Reviewed-by: Ian Kent cc: coda@cs.cmu.edu Signed-off-by: Christian Brauner --- fs/coda/inode.c | 149 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 101 insertions(+), 48 deletions(-) diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 0c7c2528791e..a50356c541f6 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include @@ -87,10 +89,10 @@ void coda_destroy_inodecache(void) kmem_cache_destroy(coda_inode_cachep); } -static int coda_remount(struct super_block *sb, int *flags, char *data) +static int coda_reconfigure(struct fs_context *fc) { - sync_filesystem(sb); - *flags |= SB_NOATIME; + sync_filesystem(fc->root->d_sb); + fc->sb_flags |= SB_NOATIME; return 0; } @@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations = .evict_inode = coda_evict_inode, .put_super = coda_put_super, .statfs = coda_statfs, - .remount_fs = coda_remount, }; -static int get_device_index(struct coda_mount_data *data) +struct coda_fs_context { + int idx; +}; + +enum { + Opt_fd, +}; + +static const struct fs_parameter_spec coda_param_specs[] = { + fsparam_fd ("fd", Opt_fd), + {} +}; + +static int coda_parse_fd(struct fs_context *fc, int fd) { + struct coda_fs_context *ctx = fc->fs_private; struct fd f; struct inode *inode; int idx; - if (data == NULL) { - pr_warn("%s: Bad mount data\n", __func__); - return -1; - } - - if (data->version != CODA_MOUNT_VERSION) { - pr_warn("%s: Bad mount version\n", __func__); - return -1; - } - - f = fdget(data->fd); + f = fdget(fd); if (!f.file) - goto Ebadf; + return -EBADF; inode = file_inode(f.file); if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) { fdput(f); - goto Ebadf; + return invalf(fc, "code: Not coda psdev"); } idx = iminor(inode); fdput(f); - if (idx < 0 || idx >= MAX_CODADEVS) { - pr_warn("%s: Bad minor number\n", __func__); - return -1; - } - - return idx; -Ebadf: - pr_warn("%s: Bad file\n", __func__); - return -1; + if (idx < 0 || idx >= MAX_CODADEVS) + return invalf(fc, "coda: Bad minor number"); + ctx->idx = idx; + return 0; } -static int coda_fill_super(struct super_block *sb, void *data, int silent) +static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param) { + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, coda_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_fd: + return coda_parse_fd(fc, result.uint_32); + } + + return 0; +} + +/* + * Parse coda's binary mount data form. We ignore any errors and go with index + * 0 if we get one for backward compatibility. + */ +static int coda_parse_monolithic(struct fs_context *fc, void *_data) +{ + struct coda_mount_data *data = _data; + + if (!data) + return invalf(fc, "coda: Bad mount data"); + + if (data->version != CODA_MOUNT_VERSION) + return invalf(fc, "coda: Bad mount version"); + + coda_parse_fd(fc, data->fd); + return 0; +} + +static int coda_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct coda_fs_context *ctx = fc->fs_private; struct inode *root = NULL; struct venus_comm *vc; struct CodaFid fid; int error; - int idx; - if (task_active_pid_ns(current) != &init_pid_ns) - return -EINVAL; + infof(fc, "coda: device index: %i\n", ctx->idx); - idx = get_device_index((struct coda_mount_data *) data); - - /* Ignore errors in data, for backward compatibility */ - if(idx == -1) - idx = 0; - - pr_info("%s: device index: %i\n", __func__, idx); - - vc = &coda_comms[idx]; + vc = &coda_comms[ctx->idx]; mutex_lock(&vc->vc_mutex); if (!vc->vc_inuse) { - pr_warn("%s: No pseudo device\n", __func__); + errorf(fc, "coda: No pseudo device"); error = -EINVAL; goto unlock_out; } if (vc->vc_sb) { - pr_warn("%s: Device already mounted\n", __func__); + errorf(fc, "coda: Device already mounted"); error = -EBUSY; goto unlock_out; } @@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* init_coda: used by filesystems.c to register coda */ - -static struct dentry *coda_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int coda_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, coda_fill_super); + if (task_active_pid_ns(current) != &init_pid_ns) + return -EINVAL; + + return get_tree_nodev(fc, coda_fill_super); +} + +static void coda_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations coda_context_ops = { + .free = coda_free_fc, + .parse_param = coda_parse_param, + .parse_monolithic = coda_parse_monolithic, + .get_tree = coda_get_tree, + .reconfigure = coda_reconfigure, +}; + +static int coda_init_fs_context(struct fs_context *fc) +{ + struct coda_fs_context *ctx; + + ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + fc->fs_private = ctx; + fc->ops = &coda_context_ops; + return 0; } struct file_system_type coda_fs_type = { .owner = THIS_MODULE, .name = "coda", - .mount = coda_mount, + .init_fs_context = coda_init_fs_context, + .parameters = coda_param_specs, .kill_sb = kill_anon_super, .fs_flags = FS_BINARY_MOUNTDATA, }; From c997d683d952d8c927922ea0ab4d33a01c8efc2c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:53:15 +0000 Subject: [PATCH 47/70] vfs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135315.830477-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/buffer.c | 2 +- fs/dcache.c | 2 +- fs/inode.c | 2 +- fs/mbcache.c | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index b55dea034a5d..9a54077de87d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3122,7 +3122,7 @@ void __init buffer_init(void) int ret; bh_cachep = KMEM_CACHE(buffer_head, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC); /* * Limit the bh occupancy to 10% of ZONE_NORMAL */ diff --git a/fs/dcache.c b/fs/dcache.c index b813528fb147..be9e10155c8d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3136,7 +3136,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ diff --git a/fs/inode.c b/fs/inode.c index 6d0d54230363..d2e8e3884b36 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2286,7 +2286,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ diff --git a/fs/mbcache.c b/fs/mbcache.c index fe2624e17253..e60a840999aa 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -426,8 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy); static int __init mbcache_init(void) { - mb_entry_cache = KMEM_CACHE(mb_cache_entry, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD); + mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT); if (!mb_entry_cache) return -ENOMEM; return 0; From cbb93fe5e6d00695e1344fa697dab3974b216717 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:51:58 +0000 Subject: [PATCH 48/70] sysv: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135158.830266-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/sysv/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 5a915b2e68f5..76bc2d5e75a9 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -336,7 +336,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; From 8145e082a8dfcfdc0a725b0f76eca54337b5b26e Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:51:43 +0000 Subject: [PATCH 49/70] romfs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135143.830142-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/romfs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 545ad44f96b8..85641c3f5782 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -630,8 +630,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | - SLAB_ACCOUNT, romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT, + romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); From 87a83c8c677e762a39d914e67450cc85d8fc1f9a Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:51:26 +0000 Subject: [PATCH 50/70] reiserfs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135126.830110-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/reiserfs/super.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 67b5510beded..2cc469d481a2 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -670,7 +670,6 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD| SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) From c8841fc4d51f603a44c00448ad17404eb829c741 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:51:04 +0000 Subject: [PATCH 51/70] qnx6: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135104.830045-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/qnx6/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index a286c545717f..405913f4faff 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -615,7 +615,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; From c762b979c7c97dc7a4010f943a9c0b646a56ed36 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:50:48 +0000 Subject: [PATCH 52/70] proc: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135048.829987-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/proc/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index b33e490e3fd9..b9c5cb63dd50 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,7 @@ void __init proc_init_kmemcache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_ACCOUNT| SLAB_PANIC), init_once); pde_opener_cache = From 45f29e0eb5b969c59a70525942080905dc5b7964 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:50:28 +0000 Subject: [PATCH 53/70] openpromfs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224135028.829910-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/openpromfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c4b65a6d41cc..4a0779e3ef79 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -446,7 +446,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD | SLAB_ACCOUNT), + SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; From 195b3678bea35fde0619f17b73847c30e66d79d0 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:49:35 +0000 Subject: [PATCH 54/70] minix: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224134935.829715-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/minix/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 73f37f298087..7cbd2b9f4d11 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -87,7 +87,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT), + SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; From 2a95fd3cc23d06bf2a86f30c3ca983b170dcc665 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:49:25 +0000 Subject: [PATCH 55/70] jfs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224134925.829677-1-chengming.zhou@linux.dev Acked-by: Dave Kleikamp Signed-off-by: Christian Brauner --- fs/jfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8d8e556bd610..73f09a762b79 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -932,7 +932,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, offsetof(struct jfs_inode_info, i_inline_all), sizeof_field(struct jfs_inode_info, i_inline_all), init_once); From 7ded1e365cc9de6093a71d3a493c1874aca6e77c Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Sat, 24 Feb 2024 13:47:42 +0000 Subject: [PATCH 56/70] efs: remove SLAB_MEM_SPREAD flag usage The SLAB_MEM_SPREAD flag used to be implemented in SLAB, which was removed as of v6.8-rc1 (see [1]), so it became a dead flag since the commit 16a1d968358a ("mm/slab: remove mm/slab.c and slab_def.h"). And the series[1] went on to mark it obsolete explicitly to avoid confusion for users. Here we can just remove all its users, which has no any functional change. Signed-off-by: Chengming Zhou Link: https://lore.kernel.org/all/20240223-slab-cleanup-flags-v2-1-02f1753e8303@suse.cz [1] Link: https://lore.kernel.org/r/20240224134742.829325-1-chengming.zhou@linux.dev Signed-off-by: Christian Brauner --- fs/efs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/efs/super.c b/fs/efs/super.c index c837ac89b384..e4421c10caeb 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -98,8 +98,8 @@ static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| - SLAB_ACCOUNT, init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; From 0906fbb2f7ad6ae4b1bee8777bf83171a12c10b0 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 24 Jan 2024 15:13:40 -0300 Subject: [PATCH 57/70] libfs: Attempt exact-match comparison first during casefolded lookup Casefolded comparisons are (obviously) way more costly than a simple memcmp. Try the case-sensitive comparison first, falling-back to the case-insensitive lookup only when needed. This allows any exact-match lookup to complete without having to walk the utf8 trie. Note that, for strict mode, generic_ci_d_compare used to reject an invalid UTF-8 string, which would now be considered valid if it exact-matches the disk-name. But, if that is the case, the filesystem is corrupt. More than that, it really doesn't matter in practice, because the name-under-lookup will have already been rejected by generic_ci_d_hash and we won't even get here. The memcmp is safe under RCU because we are operating on str/len instead of dentry->d_name directly, and the caller guarantees their consistency between each other in __d_lookup_rcu_op_compare. Link: https://lore.kernel.org/r/87ttn2sip7.fsf_-_@mailhost.krisman.be Suggested-by: Linus Torvalds Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index eec6031b0155..306a0510b7dc 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1704,16 +1704,28 @@ bool is_empty_dir_inode(struct inode *inode) static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *dir = READ_ONCE(parent->d_inode); - const struct super_block *sb = dentry->d_sb; - const struct unicode_map *um = sb->s_encoding; - struct qstr qstr = QSTR_INIT(str, len); + const struct dentry *parent; + const struct inode *dir; char strbuf[DNAME_INLINE_LEN]; - int ret; + struct qstr qstr; + /* + * Attempt a case-sensitive match first. It is cheaper and + * should cover most lookups, including all the sane + * applications that expect a case-sensitive filesystem. + * + * This comparison is safe under RCU because the caller + * guarantees the consistency between str and len. See + * __d_lookup_rcu_op_compare() for details. + */ + if (len == name->len && !memcmp(str, name->name, len)) + return 0; + + parent = READ_ONCE(dentry->d_parent); + dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) - goto fallback; + return 1; + /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry @@ -1724,20 +1736,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, if (len <= DNAME_INLINE_LEN - 1) { memcpy(strbuf, str, len); strbuf[len] = 0; - qstr.name = strbuf; + str = strbuf; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } - ret = utf8_strncasecmp(um, name, &qstr); - if (ret >= 0) - return ret; + qstr.len = len; + qstr.name = str; - if (sb_has_strict_encoding(sb)) - return -EINVAL; -fallback: - if (len != name->len) - return 1; - return !!memcmp(str, name->name, len); + return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } /** From 2824083db76cb9d4b7910607b367e93b02912865 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:03 -0500 Subject: [PATCH 58/70] ovl: Always reject mounting over case-insensitive directories overlayfs relies on the filesystem setting DCACHE_OP_HASH or DCACHE_OP_COMPARE to reject mounting over case-insensitive directories. Since commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), we set ->d_op through a hook in ->d_lookup, which means the root dentry won't have them, causing the mount to accidentally succeed. In v6.7-rc7, the following sequence will succeed to mount, but any dentry other than the root dentry will be a "weird" dentry to ovl and fail with EREMOTE. mkfs.ext4 -O casefold lower.img mount -O loop lower.img lower mount -t overlay -o lowerdir=lower,upperdir=upper,workdir=work ovl /mnt Mounting on a subdirectory fails, as expected, because DCACHE_OP_HASH and DCACHE_OP_COMPARE are properly set by ->lookup. Fix by explicitly rejecting superblocks that allow case-insensitive dentries. Yes, this will be solved when we move d_op configuration back to ->s_d_op. Yet, we better have an explicit fix to avoid messing up again. While there, re-sort the entries to have more descriptive error messages first. Fixes: bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops") Acked-by: Amir Goldstein Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-2-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/overlayfs/params.c | 14 +++++++++++--- include/linux/fs.h | 9 +++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 112b4b12f825..36dcc530ac28 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, { struct ovl_fs_context *ctx = fc->fs_private; - if (ovl_dentry_weird(path->dentry)) - return invalfc(fc, "filesystem on %s not supported", name); - if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); + /* + * Root dentries of case-insensitive capable filesystems might + * not have the dentry operations set, but still be incompatible + * with overlayfs. Check explicitly to prevent post-mount + * failures. + */ + if (sb_has_encoding(path->mnt->mnt_sb)) + return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name); + + if (ovl_dentry_weird(path->dentry)) + return invalfc(fc, "filesystem on %s not supported", name); /* * Check whether upper path is read-only here to report failures diff --git a/include/linux/fs.h b/include/linux/fs.h index e6ba0cc6f2ee..a0eb8b5759a6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3282,6 +3282,15 @@ extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +static inline bool sb_has_encoding(const struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + return !!sb->s_encoding; +#else + return false; +#endif +} + int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); From 8b6bb995d3819218498bdbee4465bffff1497a31 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:04 -0500 Subject: [PATCH 59/70] fscrypt: Factor out a helper to configure the lookup dentry Both fscrypt_prepare_lookup_partial and fscrypt_prepare_lookup will set DCACHE_NOKEY_NAME for dentries when the key is not available. Extract out a helper to set this flag in a single place, in preparation to also add the optimization that will disable ->d_revalidate if possible. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-3-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/crypto/hooks.c | 15 +++++---------- include/linux/fscrypt.h | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 52504dd478d3..104771c3d3f6 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, if (err && err != -ENOENT) return err; - if (fname->is_nokey_name) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } + fscrypt_prepare_dentry(dentry, fname->is_nokey_name); + return err; } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); @@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry) { int err = fscrypt_get_encryption_info(dir, true); + bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir)); + + fscrypt_prepare_dentry(dentry, is_nokey_name); - if (!err && !fscrypt_has_encryption_key(dir)) { - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NOKEY_NAME; - spin_unlock(&dentry->d_lock); - } return err; } EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial); diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 12f9e455d569..c76f859cf019 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -261,6 +261,16 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) return dentry->d_flags & DCACHE_NOKEY_NAME; } +static inline void fscrypt_prepare_dentry(struct dentry *dentry, + bool is_nokey_name) +{ + if (is_nokey_name) { + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_NOKEY_NAME; + spin_unlock(&dentry->d_lock); + } +} + /* crypto.c */ void fscrypt_enqueue_decrypt_work(struct work_struct *); @@ -425,6 +435,11 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) return false; } +static inline void fscrypt_prepare_dentry(struct dentry *dentry, + bool is_nokey_name) +{ +} + /* crypto.c */ static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work) { From e86e6638d1171c2201ffff16d2c6a6fd975f8383 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:05 -0500 Subject: [PATCH 60/70] fscrypt: Drop d_revalidate for valid dentries during lookup Unencrypted and encrypted-dentries where the key is available don't need to be revalidated by fscrypt, since they don't go stale from under VFS and the key cannot be removed for the encrypted case without evicting the dentry. Disable their d_revalidate hook on the first lookup, to avoid repeated revalidation later. This is done in preparation to always configuring d_op through sb->s_d_op. The only part detail is that, since the filesystem might have other features that require revalidation, we only apply this optimization if the d_revalidate handler is fscrypt_d_revalidate itself. Finally, we need to clean the dentry->flags even for unencrypted dentries, so the ->d_lock might be acquired even for them. In order to avoid doing it for filesystems that don't care about fscrypt at all, we peek ->d_flags without the lock at first, and only acquire it if we actually need to write the flag. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-4-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fscrypt.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index c76f859cf019..78af02b35bd9 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -264,10 +264,29 @@ static inline bool fscrypt_is_nokey_name(const struct dentry *dentry) static inline void fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name) { + /* + * This code tries to only take ->d_lock when necessary to write + * to ->d_flags. We shouldn't be peeking on d_flags for + * DCACHE_OP_REVALIDATE unlocked, but in the unlikely case + * there is a race, the worst it can happen is that we fail to + * unset DCACHE_OP_REVALIDATE and pay the cost of an extra + * d_revalidate. + */ if (is_nokey_name) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_NOKEY_NAME; spin_unlock(&dentry->d_lock); + } else if (dentry->d_flags & DCACHE_OP_REVALIDATE && + dentry->d_op->d_revalidate == fscrypt_d_revalidate) { + /* + * Unencrypted dentries and encrypted dentries where the + * key is available are always valid from fscrypt + * perspective. Avoid the cost of calling + * fscrypt_d_revalidate unnecessarily. + */ + spin_lock(&dentry->d_lock); + dentry->d_flags &= ~DCACHE_OP_REVALIDATE; + spin_unlock(&dentry->d_lock); } } @@ -997,6 +1016,9 @@ static inline int fscrypt_prepare_lookup(struct inode *dir, fname->usr_fname = &dentry->d_name; fname->disk_name.name = (unsigned char *)dentry->d_name.name; fname->disk_name.len = dentry->d_name.len; + + fscrypt_prepare_dentry(dentry, false); + return 0; } From e9b10713e82c98a0a909ac55bd485b7d7ff91b85 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:06 -0500 Subject: [PATCH 61/70] fscrypt: Drop d_revalidate once the key is added When a key is added, existing directory dentries in the DCACHE_NOKEY_NAME form are moved by the VFS to the plaintext version. But, since they have the DCACHE_OP_REVALIDATE flag set, revalidation will be done at each lookup only to return immediately, since plaintext dentries can't go stale until eviction. This patch optimizes this case, by dropping the flag once the nokey_name dentry becomes plain-text. Note that non-directory dentries are not moved this way, so they won't be affected. Of course, this can only be done if fscrypt is the only thing requiring revalidation for a dentry. For this reason, we only disable d_revalidate if the .d_revalidate hook is fscrypt_d_revalidate itself. It is safe to do it here because when moving the dentry to the plain-text version, we are holding the d_lock. We might race with a concurrent RCU lookup but this is harmless because, at worst, we will get an extra d_revalidate on the keyed dentry, which will still find the dentry to be valid. Finally, now that we do more than just clear the DCACHE_NOKEY_NAME in fscrypt_handle_d_move, skip it entirely for plaintext dentries, to avoid extra costs. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-5-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- include/linux/fscrypt.h | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h index 78af02b35bd9..772f822dc6b8 100644 --- a/include/linux/fscrypt.h +++ b/include/linux/fscrypt.h @@ -192,6 +192,8 @@ struct fscrypt_operations { unsigned int *num_devs); }; +int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); + static inline struct fscrypt_inode_info * fscrypt_get_inode_info(const struct inode *inode) { @@ -221,15 +223,29 @@ static inline bool fscrypt_needs_contents_encryption(const struct inode *inode) } /* - * When d_splice_alias() moves a directory's no-key alias to its plaintext alias - * as a result of the encryption key being added, DCACHE_NOKEY_NAME must be - * cleared. Note that we don't have to support arbitrary moves of this flag - * because fscrypt doesn't allow no-key names to be the source or target of a - * rename(). + * When d_splice_alias() moves a directory's no-key alias to its + * plaintext alias as a result of the encryption key being added, + * DCACHE_NOKEY_NAME must be cleared and there might be an opportunity + * to disable d_revalidate. Note that we don't have to support the + * inverse operation because fscrypt doesn't allow no-key names to be + * the source or target of a rename(). */ static inline void fscrypt_handle_d_move(struct dentry *dentry) { - dentry->d_flags &= ~DCACHE_NOKEY_NAME; + /* + * VFS calls fscrypt_handle_d_move even for non-fscrypt + * filesystems. + */ + if (dentry->d_flags & DCACHE_NOKEY_NAME) { + dentry->d_flags &= ~DCACHE_NOKEY_NAME; + + /* + * Other filesystem features might be handling dentry + * revalidation, in which case it cannot be disabled. + */ + if (dentry->d_op->d_revalidate == fscrypt_d_revalidate) + dentry->d_flags &= ~DCACHE_OP_REVALIDATE; + } } /** @@ -397,7 +413,6 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len); u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name); -int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags); /* bio.c */ bool fscrypt_decrypt_bio(struct bio *bio); From e6ca2883d987a31051b39c18b16c39e7ce3a2cc0 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:07 -0500 Subject: [PATCH 62/70] libfs: Merge encrypted_ci_dentry_ops and ci_dentry_ops In preparation to get case-insensitive dentry operations from sb->s_d_op again, use the same structure with and without fscrypt. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-6-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 42 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 32 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index 306a0510b7dc..3993453c9787 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1772,6 +1772,9 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, +#ifdef CONFIG_FS_ENCRYPTION + .d_revalidate = fscrypt_d_revalidate, +#endif }; #endif @@ -1781,14 +1784,6 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) -static const struct dentry_operations generic_encrypted_ci_dentry_ops = { - .d_hash = generic_ci_d_hash, - .d_compare = generic_ci_d_compare, - .d_revalidate = fscrypt_d_revalidate, -}; -#endif - /** * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry * @dentry: dentry to set ops on @@ -1805,38 +1800,21 @@ static const struct dentry_operations generic_encrypted_ci_dentry_ops = { * Encryption works differently in that the only dentry operation it needs is * d_revalidate, which it only needs on dentries that have the no-key name flag. * The no-key flag can't be set "later", so we don't have to worry about that. - * - * Finally, to maximize compatibility with overlayfs (which isn't compatible - * with certain dentry operations) and to avoid taking an unnecessary - * performance hit, we use custom dentry_operations for each possible - * combination rather than always installing all operations. */ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) { -#ifdef CONFIG_FS_ENCRYPTION - bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME; -#endif #if IS_ENABLED(CONFIG_UNICODE) - bool needs_ci_ops = dentry->d_sb->s_encoding; -#endif -#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE) - if (needs_encrypt_ops && needs_ci_ops) { - d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops); - return; - } -#endif -#ifdef CONFIG_FS_ENCRYPTION - if (needs_encrypt_ops) { - d_set_d_op(dentry, &generic_encrypted_dentry_ops); - return; - } -#endif -#if IS_ENABLED(CONFIG_UNICODE) - if (needs_ci_ops) { + if (dentry->d_sb->s_encoding) { d_set_d_op(dentry, &generic_ci_dentry_ops); return; } #endif +#ifdef CONFIG_FS_ENCRYPTION + if (dentry->d_flags & DCACHE_NOKEY_NAME) { + d_set_d_op(dentry, &generic_encrypted_dentry_ops); + return; + } +#endif } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); From 70dfe3f0d239c2e8abc6a7bea24411031f85b652 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:08 -0500 Subject: [PATCH 63/70] libfs: Add helper to choose dentry operations at mount-time In preparation to drop the similar helper that sets d_op at lookup time, add a version to set the right d_op filesystem-wide, through sb->s_d_op. The operations structures are shared across filesystems supporting fscrypt and/or casefolding, therefore we can keep it in common libfs code. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-7-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 28 ++++++++++++++++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 29 insertions(+) diff --git a/fs/libfs.c b/fs/libfs.c index 3993453c9787..c9d85f525ae8 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1818,6 +1818,34 @@ void generic_set_encrypted_ci_d_ops(struct dentry *dentry) } EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); +/** + * generic_set_sb_d_ops - helper for choosing the set of + * filesystem-wide dentry operations for the enabled features + * @sb: superblock to be configured + * + * Filesystems supporting casefolding and/or fscrypt can call this + * helper at mount-time to configure sb->s_d_op to best set of dentry + * operations required for the enabled features. The helper must be + * called after these have been configured, but before the root dentry + * is created. + */ +void generic_set_sb_d_ops(struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + if (sb->s_encoding) { + sb->s_d_op = &generic_ci_dentry_ops; + return; + } +#endif +#ifdef CONFIG_FS_ENCRYPTION + if (sb->s_cop) { + sb->s_d_op = &generic_encrypted_dentry_ops; + return; + } +#endif +} +EXPORT_SYMBOL(generic_set_sb_d_ops); + /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated diff --git a/include/linux/fs.h b/include/linux/fs.h index a0eb8b5759a6..383c5145465f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3281,6 +3281,7 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); +extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) { From 04aa5f4eba49d6493801d0f1d515ed237459b166 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:09 -0500 Subject: [PATCH 64/70] ext4: Configure dentry operations at dentry-creation time This was already the case for case-insensitive before commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), but it was changed to set at lookup-time to facilitate the integration with fscrypt. But it's a problem because dentries that don't get created through ->lookup() won't have any visibility of the operations. Since fscrypt now also supports configuring dentry operations at creation-time, do it for any encrypted and/or casefold volume, simplifying the implementation across these features. Acked-by: Theodore Ts'o Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-8-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/ext4/namei.c | 1 - fs/ext4/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 05b647e6bc19..5e4f65c14dfb 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, struct buffer_head *bh; err = ext4_fname_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return NULL; if (err) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dcba0f85dfe2..215b4614eb15 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5484,6 +5484,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount4; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { ext4_msg(sb, KERN_ERR, "get root dentry failed"); From be2760a703e6554c2b5784e2fec804284cdcbe4d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:10 -0500 Subject: [PATCH 65/70] f2fs: Configure dentry operations at dentry-creation time This was already the case for case-insensitive before commit bb9cd9106b22 ("fscrypt: Have filesystems handle their d_ops"), but it was changed to set at lookup-time to facilitate the integration with fscrypt. But it's a problem because dentries that don't get created through ->lookup() won't have any visibility of the operations. Since fscrypt now also supports configuring dentry operations at creation-time, do it for any encrypted and/or casefold volume, simplifying the implementation across these features. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-9-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/f2fs/namei.c | 1 - fs/f2fs/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b3bb815fc6aa..f7f63a567d86 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } err = f2fs_prepare_lookup(dir, dentry, &fname); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) goto out_splice; if (err) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d00d21a8b53a..313024f5c90c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4660,6 +4660,7 @@ try_onemore: goto free_node_inode; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); /* allocate root dentry */ if (!sb->s_root) { err = -ENOMEM; From bc401c2900c128d3d69482769f1300502a9f0598 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:11 -0500 Subject: [PATCH 66/70] ubifs: Configure dentry operations at dentry-creation time fscrypt now supports configuring dentry operations at dentry-creation time through the preset sb->s_d_op, instead of at lookup time. Enable this in ubifs, since the lookup-time mechanism is going away. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-10-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/ubifs/dir.c | 1 - fs/ubifs/super.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 3b13c648d490..51b9a10a9851 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); err = fscrypt_prepare_lookup(dir, dentry, &nm); - generic_set_encrypted_ci_d_ops(dentry); if (err == -ENOENT) return d_splice_alias(NULL, dentry); if (err) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 09e270d6ed02..304646b03e99 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2239,6 +2239,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } + generic_set_sb_d_ops(sb); sb->s_root = d_make_root(root); if (!sb->s_root) { err = -ENOMEM; From 101c3fad29d7a0a90ff063b1aad586a0211911ec Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 21 Feb 2024 12:14:12 -0500 Subject: [PATCH 67/70] libfs: Drop generic_set_encrypted_ci_d_ops No filesystems depend on it anymore, and it is generally a bad idea. Since all dentries should have the same set of dentry operations in case-insensitive capable filesystems, it should be propagated through ->s_d_op. Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20240221171412.10710-11-krisman@suse.de Signed-off-by: Gabriel Krisman Bertazi --- fs/libfs.c | 34 ---------------------------------- include/linux/fs.h | 1 - 2 files changed, 35 deletions(-) diff --git a/fs/libfs.c b/fs/libfs.c index c9d85f525ae8..c297953db948 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1784,40 +1784,6 @@ static const struct dentry_operations generic_encrypted_dentry_ops = { }; #endif -/** - * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry - * @dentry: dentry to set ops on - * - * Casefolded directories need d_hash and d_compare set, so that the dentries - * contained in them are handled case-insensitively. Note that these operations - * are needed on the parent directory rather than on the dentries in it, and - * while the casefolding flag can be toggled on and off on an empty directory, - * dentry_operations can't be changed later. As a result, if the filesystem has - * casefolding support enabled at all, we have to give all dentries the - * casefolding operations even if their inode doesn't have the casefolding flag - * currently (and thus the casefolding ops would be no-ops for now). - * - * Encryption works differently in that the only dentry operation it needs is - * d_revalidate, which it only needs on dentries that have the no-key name flag. - * The no-key flag can't be set "later", so we don't have to worry about that. - */ -void generic_set_encrypted_ci_d_ops(struct dentry *dentry) -{ -#if IS_ENABLED(CONFIG_UNICODE) - if (dentry->d_sb->s_encoding) { - d_set_d_op(dentry, &generic_ci_dentry_ops); - return; - } -#endif -#ifdef CONFIG_FS_ENCRYPTION - if (dentry->d_flags & DCACHE_NOKEY_NAME) { - d_set_d_op(dentry, &generic_encrypted_dentry_ops); - return; - } -#endif -} -EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops); - /** * generic_set_sb_d_ops - helper for choosing the set of * filesystem-wide dentry operations for the enabled features diff --git a/include/linux/fs.h b/include/linux/fs.h index 383c5145465f..ff1338109b54 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3280,7 +3280,6 @@ extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -extern void generic_set_encrypted_ci_d_ops(struct dentry *dentry); extern void generic_set_sb_d_ops(struct super_block *sb); static inline bool sb_has_encoding(const struct super_block *sb) From 24a8b7bfb96120b32c222ef108bb7a713d1e385a Mon Sep 17 00:00:00 2001 From: Nguyen Dinh Phi Date: Thu, 29 Feb 2024 01:30:31 +0800 Subject: [PATCH 68/70] fs: use inode_set_ctime_to_ts to set inode ctime to current time The function inode_set_ctime_current simply retrieves the current time and assigns it to the field __i_ctime without any alterations. Therefore, it is possible to set ctime to now directly using inode_set_ctime_to_ts Signed-off-by: Nguyen Dinh Phi Link: https://lore.kernel.org/r/20240228173031.3208743-1-phind.uet@gmail.com Signed-off-by: Christian Brauner --- fs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/inode.c b/fs/inode.c index d2e8e3884b36..5c8a5250d0ac 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2510,7 +2510,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode) { struct timespec64 now = current_time(inode); - inode_set_ctime(inode, now.tv_sec, now.tv_nsec); + inode_set_ctime_to_ts(inode, now); return now; } EXPORT_SYMBOL(inode_set_ctime_current); From 6b91bfa1651d841f0066bae2b1322cac29fdfb61 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Thu, 29 Feb 2024 10:15:38 -0600 Subject: [PATCH 69/70] qnx4: convert qnx4 to use the new mount api Convert the qnx4 filesystem to use the new mount API. Tested mount, umount, and remount using a qnx4 boot image. Signed-off-by: Bill O'Donnell Link: https://lore.kernel.org/r/20240229161649.800957-1-bodonnel@redhat.com Acked-by: Anders Larsen Signed-off-by: Christian Brauner --- fs/qnx4/inode.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6eb9bb369b57..7b5711f76709 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "qnx4.h" #define QNX4_VERSION 4 @@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops; static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_free_inode(struct inode *inode); -static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); +static int qnx4_get_tree(struct fs_context *fc); static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .free_inode = qnx4_free_inode, .statfs = qnx4_statfs, - .remount_fs = qnx4_remount, }; -static int qnx4_remount(struct super_block *sb, int *flags, char *data) +static int qnx4_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; struct qnx4_sb_info *qs; sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= SB_RDONLY; + fc->sb_flags |= SB_RDONLY; return 0; } +static const struct fs_context_operations qnx4_context_opts = { + .get_tree = qnx4_get_tree, + .reconfigure = qnx4_reconfigure, +}; + static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) { unsigned long phys; @@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb, return "bitmap file not found."; } -static int qnx4_fill_super(struct super_block *s, void *data, int silent) +static int qnx4_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh; struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; + int silent = fc->sb_flags & SB_SILENT; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) @@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); brelse(bh); if (errmsg != NULL) { - if (!silent) + if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); return -EINVAL; } @@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) return 0; } +static int qnx4_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, qnx4_fill_super); +} + +static int qnx4_init_fs_context(struct fs_context *fc) +{ + fc->ops = &qnx4_context_opts; + + return 0; +} + static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); @@ -376,18 +395,12 @@ static void destroy_inodecache(void) kmem_cache_destroy(qnx4_inode_cachep); } -static struct dentry *qnx4_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); -} - static struct file_system_type qnx4_fs_type = { - .owner = THIS_MODULE, - .name = "qnx4", - .mount = qnx4_mount, - .kill_sb = qnx4_kill_sb, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "qnx4", + .kill_sb = qnx4_kill_sb, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = qnx4_init_fs_context, }; MODULE_ALIAS_FS("qnx4"); From 91e78a1eb6b1c0d1a8b434d46b869941e3b3e9e2 Mon Sep 17 00:00:00 2001 From: Giuseppe Scrivano Date: Thu, 29 Feb 2024 16:24:05 +0100 Subject: [PATCH 70/70] hugetlbfs: support idmapped mounts pass down the idmapped mount information to the different helper functions. Differently, hugetlb_file_setup() will continue to not have any mapping since it is only used from contexts where idmapped mounts are not used. Signed-off-by: Giuseppe Scrivano Link: https://lore.kernel.org/r/20240229152405.105031-1-gscrivan@redhat.com Signed-off-by: Christian Brauner --- fs/hugetlbfs/inode.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ea5b8e57d904..af82364c84d8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -922,7 +922,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); + error = setattr_prepare(idmap, dentry, attr); if (error) return error; @@ -939,7 +939,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap, hugetlb_vmtruncate(inode, newsize); } - setattr_copy(&nop_mnt_idmap, inode, attr); + setattr_copy(idmap, inode, attr); mark_inode_dirty(inode); return 0; } @@ -974,6 +974,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, + struct mnt_idmap *idmap, struct inode *dir, umode_t mode, dev_t dev) { @@ -995,7 +996,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); + inode_init_owner(idmap, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -1039,7 +1040,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1051,7 +1052,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, + int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); @@ -1062,7 +1063,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0); } static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, @@ -1071,7 +1072,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, { struct inode *inode; - inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1083,10 +1084,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { + const umode_t mode = S_IFLNK|S_IRWXUGO; struct inode *inode; int error = -ENOSPC; - inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0); + inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0); if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); @@ -1553,6 +1555,7 @@ static struct file_system_type hugetlbfs_fs_type = { .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, .kill_sb = kill_litter_super, + .fs_flags = FS_ALLOW_IDMAP, }; static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1606,7 +1609,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, } file = ERR_PTR(-ENOSPC); - inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0); + /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts. */ + inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL, + S_IFREG | S_IRWXUGO, 0); if (!inode) goto out; if (creat_flags == HUGETLB_SHMFS_INODE)