From c2a09f3d782de952f09a3962d03b939e7fa7ffa4 Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 29 Feb 2024 11:40:13 +0530 Subject: [PATCH 01/40] ext4: Fixes len calculation in mpage_journal_page_buffers Truncate operation can race with writeback, in which inode->i_size can get truncated and therefore size - folio_pos() can be negative. This fixes the len calculation. However this path doesn't get easily triggered even with data journaling. Cc: stable@kernel.org # v6.5 Fixes: 80be8c5cc925 ("ext4: Make mpage_journal_page_buffers use folio") Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/cff4953b5c9306aba71e944ab176a5d396b9a1b7.1709182250.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 537803250ca9..bab9223d94ac 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2334,7 +2334,7 @@ static int mpage_journal_page_buffers(handle_t *handle, if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) - len = size - folio_pos(folio); + len = size & (len - 1); return ext4_journal_folio_buffers(handle, folio, len); } From 53c17fe55a06cbb405b94d96759611d725d2c47a Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Thu, 29 Feb 2024 11:40:14 +0530 Subject: [PATCH 02/40] ext4: Remove PAGE_MASK dependency on mpage_submit_folio This patch simply removes the PAGE_MASK dependency since mpage_submit_folio() is already converted to work with folio. Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/d6eadb090334ea49ceef4e643b371fabfcea328f.1709182251.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bab9223d94ac..e8b0773e5d2d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1865,7 +1865,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) - len = size & ~PAGE_MASK; + len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; From a0c7cce824a54dbb83bb722df19f1ddcfa5f8d25 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Feb 2024 19:54:12 +0530 Subject: [PATCH 03/40] ext4: set FMODE_CAN_ODIRECT instead of a dummy direct_IO method Since commit a2ad63daa88b ("VFS: add FMODE_CAN_ODIRECT file flag") file systems can just set the FMODE_CAN_ODIRECT flag at open time instead of wiring up a dummy direct_IO method to indicate support for direct I/O.
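For reference, the open-time pattern described above looks roughly like this (a minimal sketch, not code from this series; the "myfs" names are placeholders):

	static int myfs_open(struct inode *inode, struct file *filp)
	{
		/* Advertise O_DIRECT support; direct I/O is then served by
		 * the ->read_iter/->write_iter paths, so no dummy
		 * ->direct_IO method is needed. */
		filp->f_mode |= FMODE_CAN_ODIRECT;
		return generic_file_open(inode, filp);
	}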
Signed-off-by: Christoph Hellwig [RH: Rebased to upstream] Signed-off-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/e5797bb597219a49043e53e4e90aa494b97dc328.1709215665.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 54d6ff22585c..965febab1d04 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -886,7 +886,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) } filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | - FMODE_DIO_PARALLEL_WRITE; + FMODE_DIO_PARALLEL_WRITE | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e8b0773e5d2d..989e28c24a8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3530,7 +3530,6 @@ static const struct address_space_operations ext4_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3547,7 +3546,6 @@ static const struct address_space_operations ext4_journalled_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3564,7 +3562,6 @@ static const struct address_space_operations ext4_da_aops = { .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, - .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_folio = generic_error_remove_folio, @@ -3573,7 +3570,6 @@ static const struct address_space_operations ext4_da_aops = { static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, - .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, From ea7d09ad7c280122a322f408672ab8d75c1a0e30 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 17 Mar 2024 16:36:39 +0100 Subject: [PATCH 04/40] ext4: remove unneeded if checks before kfree kfree already checks if its argument is NULL. This fixes two Coccinelle/coccicheck warnings reported by ifnullfree.cocci. 
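The pattern being removed, sketched for reference; kfree(NULL) is defined to be a no-op, so the guard only costs an extra branch:

	if (ptr)
		kfree(ptr);	/* before: redundant NULL check */

	kfree(ptr);		/* after: equivalent behavior */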
Signed-off-by: Thorsten Blum Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20240317153638.2136-2-thorsten.blum@toblux.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 044135796f2b..28cbe0f1996f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2078,8 +2078,7 @@ static int unnote_qf_name(struct fs_context *fc, int qtype) { struct ext4_fs_context *ctx = fc->fs_private; - if (ctx->s_qf_names[qtype]) - kfree(ctx->s_qf_names[qtype]); + kfree(ctx->s_qf_names[qtype]); ctx->s_qf_names[qtype] = NULL; ctx->qname_spec |= 1 << qtype; @@ -2484,8 +2483,7 @@ static int parse_options(struct fs_context *fc, char *options) param.size = v_len; ret = ext4_parse_param(fc, &param); - if (param.string) - kfree(param.string); + kfree(param.string); if (ret < 0) return ret; } From 35a1f12f0ca857fee1d7a04ef52cbd5f1f84de13 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 7 Mar 2024 12:53:20 +0100 Subject: [PATCH 05/40] ext4: avoid excessive credit estimate in ext4_tmpfile() A user with minimum journal size (1024 blocks these days) complained about the following error triggered by generic/697 test in ext4_tmpfile(): run fstests generic/697 at 2024-02-28 05:34:46 JBD2: vfstest wants too many credits credits:260 rsv_credits:0 max:256 EXT4-fs error (device loop0) in __ext4_new_inode:1083: error 28 Indeed the credit estimate in ext4_tmpfile() is huge. EXT4_MAXQUOTAS_INIT_BLOCKS() is 219, then 10 credits from ext4_tmpfile() itself and then ext4_xattr_credits_for_new_inode() adds more credits needed for security attributes and ACLs. Now the EXT4_MAXQUOTAS_INIT_BLOCKS() is in fact unnecessary because we've already initialized quotas with dquot_initialize() shortly before and so EXT4_MAXQUOTAS_TRANS_BLOCKS() is enough (which boils down to 3 credits). Fixes: af51a2ac36d1 ("ext4: ->tmpfile() support") Signed-off-by: Jan Kara Tested-by: Luis Henriques Tested-by: Disha Goel Link: https://lore.kernel.org/r/20240307115320.28949-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5e4f65c14dfb..a630b27a4cc6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2897,7 +2897,7 @@ static int ext4_tmpfile(struct mnt_idmap *idmap, struct inode *dir, inode = ext4_new_inode_start_handle(idmap, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, - EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + + EXT4_MAXQUOTAS_TRANS_BLOCKS(dir->i_sb) + 4 + EXT4_XATTR_TRANS_BLOCKS); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); From fb092d407262eb4278f3d1ca24da54396a038c62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 14 Mar 2024 23:53:02 -0400 Subject: [PATCH 06/40] ext4: add support for FS_IOC_GETFSSYSFSPATH The new sysfs path ioctl lets us get the /sys/fs/ path for a given filesystem in a fs agnostic way, potentially nudging us towards standardizing some of our reporting.
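For context, a minimal userspace sketch of the new ioctl (assuming uapi headers new enough to provide FS_IOC_GETFSSYSFSPATH and struct fs_sysfs_path; error handling trimmed):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(int argc, char **argv)
	{
		struct fs_sysfs_path p;
		int fd;

		if (argc < 2 || (fd = open(argv[1], O_RDONLY)) < 0)
			return 1;
		if (ioctl(fd, FS_IOC_GETFSSYSFSPATH, &p) < 0)
			return 1;
		/* for ext4 this prints something like /sys/fs/ext4/sda1 */
		printf("/sys/fs/%.*s\n", p.len, (const char *)p.name);
		close(fd);
		return 0;
	}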
Signed-off-by: Kent Overstreet Cc: Theodore Ts'o Cc: Andreas Dilger Cc: linux-ext4@vger.kernel.org Link: https://lore.kernel.org/r/20240315035308.3563511-4-kent.overstreet@linux.dev Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 28cbe0f1996f..9bde300b1551 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5340,6 +5340,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid)); + super_set_sysfs_name_bdev(sb); INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); From c77194965dd0dcc26f9c1671d2e74e4eb1248af5 Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Fri, 15 Mar 2024 15:29:56 +0100 Subject: [PATCH 07/40] Revert "ext4: apply umask if ACL support is disabled" This reverts commit 484fd6c1de13b336806a967908a927cc0356e312. The commit caused a regression because the umask was now also applied to symlinks, and the change is unnecessary because the umask/O_TMPFILE bug has already been fixed elsewhere. Fixes: https://lore.kernel.org/lkml/28DSITL9912E1.2LSZUVTGTO52Q@mforney.org/ Signed-off-by: Max Kellermann Reviewed-by: Christian Brauner Tested-by: Michael Forney Link: https://lore.kernel.org/r/20240315142956.2420360-1-max.kellermann@ionos.com Signed-off-by: Theodore Ts'o --- fs/ext4/acl.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index ef4c19e5f570..0c5a79c3b5d4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -68,11 +68,6 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); static inline int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { - /* usually, the umask is applied by posix_acl_create(), but if ext4 ACL support is disabled at compile time, we need to do it here, because posix_acl_create() will never be called */ - inode->i_mode &= ~current_umask(); - return 0; } #endif /* CONFIG_EXT4_FS_POSIX_ACL */ From 9e8e819f8f272c4e5dcd0bd6c7450e36481ed139 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:17 +0800 Subject: [PATCH 08/40] ext4: avoid overflow when setting values via sysfs When setting values of type unsigned int through sysfs, we use kstrtoul() to parse it and then truncate part of it as the final set value. When the set value is greater than UINT_MAX, the stored value will not match what we see because of the truncation. As follows: $ echo 4294967296 > /sys/fs/ext4/sda/mb_max_linear_groups $ cat /sys/fs/ext4/sda/mb_max_linear_groups 0 So we use kstrtouint() to parse the attr_pointer_ui type to avoid the inconsistency described above. In addition, a check is added to avoid setting s_resv_clusters to less than 0.
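The truncation is easy to see in isolation (kernel-side sketch, not from the patch itself):

	unsigned long val;
	unsigned int t;
	int ret;

	ret = kstrtoul("4294967296", 0, &val);	/* ok on 64-bit: val = 2^32 */
	t = val;				/* silently truncated to 0 */
	ret = kstrtouint("4294967296", 0, &t);	/* -ERANGE: rejected instead */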
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 6d332dff79dd..ca820620b974 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -104,7 +104,7 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi, int ret; ret = kstrtoull(skip_spaces(buf), 0, &val); - if (ret || val >= clusters) + if (ret || val >= clusters || (s64)val < 0) return -EINVAL; atomic64_set(&sbi->s_resv_clusters, val); @@ -451,7 +451,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); void *ptr = calc_ptr(a, sbi); - unsigned long t; + unsigned int t; + unsigned long lt; int ret; switch (a->attr_id) { @@ -460,7 +461,7 @@ static ssize_t ext4_attr_store(struct kobject *kobj, case attr_pointer_ui: if (!ptr) return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; if (a->attr_ptr == ptr_ext4_super_block_offset) @@ -471,10 +472,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj, case attr_pointer_ul: if (!ptr) return 0; - ret = kstrtoul(skip_spaces(buf), 0, &t); + ret = kstrtoul(skip_spaces(buf), 0, &lt); if (ret) return ret; - *((unsigned long *) ptr) = t; + *((unsigned long *) ptr) = lt; return len; case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); From f536808adcc37a546bf9cc819c349bd55a28159b Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:18 +0800 Subject: [PATCH 09/40] ext4: refactor out ext4_generic_attr_store() Refactor out the function ext4_generic_attr_store() to handle the setting of values of various common types, with no functional changes.
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ca820620b974..2b1c529b7fdf 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -443,24 +443,20 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return 0; } -static ssize_t ext4_attr_store(struct kobject *kobj, - struct attribute *attr, - const char *buf, size_t len) +static ssize_t ext4_generic_attr_store(struct ext4_attr *a, + struct ext4_sb_info *sbi, + const char *buf, size_t len) { - struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, - s_kobj); - struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); + int ret; unsigned int t; unsigned long lt; - int ret; + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; switch (a->attr_id) { - case attr_reserved_clusters: - return reserved_clusters_store(sbi, buf, len); case attr_pointer_ui: - if (!ptr) - return 0; ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) return ret; @@ -470,19 +466,33 @@ static ssize_t ext4_attr_store(struct kobject *kobj, *((unsigned int *) ptr) = t; return len; case attr_pointer_ul: - if (!ptr) - return 0; ret = kstrtoul(skip_spaces(buf), 0, &lt); if (ret) return ret; *((unsigned long *) ptr) = lt; return len; + } + return 0; +} + +static ssize_t ext4_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, + s_kobj); + struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); + + switch (a->attr_id) { + case attr_reserved_clusters: + return reserved_clusters_store(sbi, buf, len); case attr_inode_readahead: return inode_readahead_blks_store(sbi, buf, len); case attr_trigger_test_error: return trigger_test_error(sbi, buf, len); + default: + return ext4_generic_attr_store(a, sbi, buf, len); } - } From 57341fe3179c7694c92dcf99e7f836cee4c800dd Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:19 +0800 Subject: [PATCH 10/40] ext4: refactor out ext4_generic_attr_show() Refactor out the function ext4_generic_attr_show() to handle the reading of values of various common types, with no functional changes.
Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 74 +++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 42 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 2b1c529b7fdf..7f455b5f22c0 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -366,13 +366,42 @@ static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi) #define print_tstamp(buf, es, tstamp) \ __print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi) +static ssize_t ext4_generic_attr_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + void *ptr = calc_ptr(a, sbi); + + if (!ptr) + return 0; + + switch (a->attr_id) { + case attr_inode_readahead: + case attr_pointer_ui: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); + return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr)); + case attr_pointer_ul: + return sysfs_emit(buf, "%lu\n", *((unsigned long *) ptr)); + case attr_pointer_u8: + return sysfs_emit(buf, "%u\n", *((unsigned char *) ptr)); + case attr_pointer_u64: + if (a->attr_ptr == ptr_ext4_super_block_offset) + return sysfs_emit(buf, "%llu\n", le64_to_cpup(ptr)); + return sysfs_emit(buf, "%llu\n", *((unsigned long long *) ptr)); + case attr_pointer_string: + return sysfs_emit(buf, "%.*s\n", a->attr_size, (char *) ptr); + case attr_pointer_atomic: + return sysfs_emit(buf, "%d\n", atomic_read((atomic_t *) ptr)); + } + return 0; +} + static ssize_t ext4_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info, s_kobj); struct ext4_attr *a = container_of(attr, struct ext4_attr, attr); - void *ptr = calc_ptr(a, sbi); switch (a->attr_id) { case attr_delayed_allocation_blocks: @@ -391,45 +420,6 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return sysfs_emit(buf, "%llu\n", (unsigned long long) percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit)); - case attr_inode_readahead: - case attr_pointer_ui: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%u\n", - le32_to_cpup(ptr)); - else - return sysfs_emit(buf, "%u\n", - *((unsigned int *) ptr)); - case attr_pointer_ul: - if (!ptr) - return 0; - return sysfs_emit(buf, "%lu\n", - *((unsigned long *) ptr)); - case attr_pointer_u8: - if (!ptr) - return 0; - return sysfs_emit(buf, "%u\n", - *((unsigned char *) ptr)); - case attr_pointer_u64: - if (!ptr) - return 0; - if (a->attr_ptr == ptr_ext4_super_block_offset) - return sysfs_emit(buf, "%llu\n", - le64_to_cpup(ptr)); - else - return sysfs_emit(buf, "%llu\n", - *((unsigned long long *) ptr)); - case attr_pointer_string: - if (!ptr) - return 0; - return sysfs_emit(buf, "%.*s\n", a->attr_size, - (char *) ptr); - case attr_pointer_atomic: - if (!ptr) - return 0; - return sysfs_emit(buf, "%d\n", - atomic_read((atomic_t *) ptr)); case attr_feature: return sysfs_emit(buf, "supported\n"); case attr_first_error_time: @@ -438,9 +428,9 @@ static ssize_t ext4_attr_show(struct kobject *kobj, return print_tstamp(buf, sbi->s_es, s_last_error_time); case attr_journal_task: return journal_task_show(sbi, buf); + default: + return ext4_generic_attr_show(a, sbi, buf); } - - return 0; } static ssize_t ext4_generic_attr_store(struct ext4_attr *a, From 13df4d44a3aaabe61cd01d277b6ee23ead2a5206 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:20 
+0800 Subject: [PATCH 11/40] ext4: fix slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists() We can trigger a slab-out-of-bounds with the following commands: mkfs.ext4 -F /dev/$disk 10G mount /dev/$disk /tmp/test echo 2147483647 > /sys/fs/ext4/$disk/mb_group_prealloc echo test > /tmp/test/file && sync ================================================================== BUG: KASAN: slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4] Read of size 8 at addr ffff888121b9d0f0 by task kworker/u2:0/11 CPU: 0 PID: 11 Comm: kworker/u2:0 Tainted: GL 6.7.0-next-20240118 #521 Call Trace: dump_stack_lvl+0x2c/0x50 kasan_report+0xb6/0xf0 ext4_mb_find_good_group_avg_frag_lists+0x8a/0x200 [ext4] ext4_mb_regular_allocator+0x19e9/0x2370 [ext4] ext4_mb_new_blocks+0x88a/0x1370 [ext4] ext4_ext_map_blocks+0x14f7/0x2390 [ext4] ext4_map_blocks+0x569/0xea0 [ext4] ext4_do_writepages+0x10f6/0x1bc0 [ext4] [...] ================================================================== The flow of issue triggering is as follows: // Set s_mb_group_prealloc to 2147483647 via sysfs ext4_mb_new_blocks ext4_mb_normalize_request ext4_mb_normalize_group_request ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc ext4_mb_regular_allocator ext4_mb_choose_next_group ext4_mb_choose_next_group_best_avail mb_avg_fragment_size_order order = fls(len) - 2 = 29 ext4_mb_find_good_group_avg_frag_lists frag_list = &sbi->s_mb_avg_fragment_size[order] if (list_empty(frag_list)) // Trigger SOOB! At 4k block size, the length of the s_mb_avg_fragment_size list is 14, but an oversized s_mb_group_prealloc is set, causing slab-out-of-bounds to be triggered by an attempt to access an element at index 29. Add a new attr_id attr_clusters_in_group with values in the range [0, sbi->s_clusters_per_group] and declare mb_group_prealloc as that type to fix the issue. In addition avoid returning an order from mb_avg_fragment_size_order() greater than MB_NUM_ORDERS(sb) and reduce some useless loops. Fixes: 7e170922f06b ("ext4: Add allocation criteria 1.5 (CR1_5)") CC: stable@vger.kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Reviewed-by: Ojaswin Mujoo Link: https://lore.kernel.org/r/20240319113325.3110393-5-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ++++ fs/ext4/sysfs.c | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 12b3f196010b..dbf04f91516c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -831,6 +831,8 @@ static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) return 0; if (order == MB_NUM_ORDERS(sb)) order--; + if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) + order = MB_NUM_ORDERS(sb) - 1; return order; } @@ -1008,6 +1010,8 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context * goal length. 
*/ order = fls(ac->ac_g_ex.fe_len) - 1; + if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) + order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 7f455b5f22c0..ddd71673176c 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -29,6 +29,7 @@ typedef enum { attr_trigger_test_error, attr_first_error_time, attr_last_error_time, + attr_clusters_in_group, attr_feature, attr_pointer_ui, attr_pointer_ul, @@ -207,13 +208,14 @@ EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); +EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, + ext4_sb_info, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); -EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); @@ -376,6 +378,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, switch (a->attr_id) { case attr_inode_readahead: + case attr_clusters_in_group: case attr_pointer_ui: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); @@ -455,6 +458,14 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a, else *((unsigned int *) ptr) = t; return len; + case attr_clusters_in_group: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > sbi->s_clusters_per_group) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ul: ret = kstrtoul(skip_spaces(buf), 0, &lt); if (ret) From b7b2a5799b8fafe95fcd5455c32ba2c643c86f99 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:21 +0800 Subject: [PATCH 12/40] ext4: add new attr pointer attr_mb_order The s_mb_best_avail_max_trim_order is of type unsigned int, and has a range of values well beyond the normal use of the mb_order. Although the mballoc code is careful enough that large numbers don't matter there, this can mislead the sysadmin into thinking that it's normal to set such values. Hence add a new attr_id attr_mb_order with values in the range [0, 64] to avoid storing garbage values and make us more resilient to surprises in the future.
Suggested-by: Jan Kara Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-6-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ddd71673176c..8e0473169458 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -30,6 +30,7 @@ typedef enum { attr_first_error_time, attr_last_error_time, attr_clusters_in_group, + attr_mb_order, attr_feature, attr_pointer_ui, attr_pointer_ul, @@ -210,6 +211,8 @@ EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead, ext4_sb_info, s_inode_readahead_blks); EXT4_ATTR_OFFSET(mb_group_prealloc, 0644, clusters_in_group, ext4_sb_info, s_mb_group_prealloc); +EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order, + ext4_sb_info, s_mb_best_avail_max_trim_order); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); @@ -225,7 +228,6 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -379,6 +381,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, switch (a->attr_id) { case attr_inode_readahead: case attr_clusters_in_group: + case attr_mb_order: case attr_pointer_ui: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); @@ -458,6 +461,14 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a, else *((unsigned int *) ptr) = t; return len; + case attr_mb_order: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if (t > 64) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_clusters_in_group: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) From 63bfe841053f8dda09c9d059d543486d9dc16104 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:22 +0800 Subject: [PATCH 13/40] ext4: add positive int attr pointer to avoid sysfs variables overflow The following variables controlled by the sysfs interface are of type int and are normally used in the range [0, INT_MAX], but are declared as attr_pointer_ui, and thus may be set to values that exceed INT_MAX and result in overflows to get negative values. err_ratelimit_burst msg_ratelimit_burst warning_ratelimit_burst err_ratelimit_interval_ms msg_ratelimit_interval_ms warning_ratelimit_interval_ms Therefore, we add attr_pointer_pi (aka positive int attr pointer) with a value range of 0-INT_MAX to avoid overflow. 
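A sketch of the failure mode this guards against: the ratelimit interval and burst fields are plain ints in struct ratelimit_state, so an unsigned parse can smuggle in a negative value (illustrative values only):

	unsigned int t = 3000000000U;	/* accepted by kstrtouint() */
	int interval = t;		/* becomes -1294967296 */

	if ((int)t < 0)			/* the attr_pointer_pi check */
		return -EINVAL;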
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-7-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/sysfs.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 8e0473169458..ddb54608ca2e 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -32,6 +32,7 @@ typedef enum { attr_clusters_in_group, attr_mb_order, attr_feature, + attr_pointer_pi, attr_pointer_ui, attr_pointer_ul, attr_pointer_u64, @@ -180,6 +181,9 @@ static struct ext4_attr ext4_attr_##_name = { \ #define EXT4_RO_ATTR_ES_STRING(_name,_elname,_size) \ EXT4_ATTR_STRING(_name, 0444, _size, ext4_super_block, _elname) +#define EXT4_RW_ATTR_SBI_PI(_name,_elname) \ + EXT4_ATTR_OFFSET(_name, 0644, pointer_pi, ext4_sb_info, _elname) + #define EXT4_RW_ATTR_SBI_UI(_name,_elname) \ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname) @@ -222,12 +226,12 @@ EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); -EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(err_ratelimit_burst, s_err_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); +EXT4_RW_ATTR_SBI_PI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); #ifdef CONFIG_EXT4_DEBUG EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail); #endif @@ -382,6 +386,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a, case attr_inode_readahead: case attr_clusters_in_group: case attr_mb_order: + case attr_pointer_pi: case attr_pointer_ui: if (a->attr_ptr == ptr_ext4_super_block_offset) return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr)); @@ -452,6 +457,14 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a, return 0; switch (a->attr_id) { + case attr_pointer_pi: + ret = kstrtouint(skip_spaces(buf), 0, &t); + if (ret) + return ret; + if ((int)t < 0) + return -EINVAL; + *((unsigned int *) ptr) = t; + return len; case attr_pointer_ui: ret = kstrtouint(skip_spaces(buf), 0, &t); if (ret) From 9a9f3a9842927e4af7ca10c19c94dad83bebd713 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:23 +0800 Subject: [PATCH 14/40] ext4: set type of ac_groups_linear_remaining to __u32 to avoid overflow Now ac_groups_linear_remaining is of type __u16 and s_mb_max_linear_groups is of type unsigned int, so an overflow occurs when setting a value above 65535 through the mb_max_linear_groups sysfs interface. Therefore, the type of ac_groups_linear_remaining is set to __u32 to avoid overflow. 
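The truncation, in isolation (sketch with illustrative values):

	unsigned int max_linear = 70000;	/* s_mb_max_linear_groups, via sysfs */
	__u16 before = max_linear;		/* 70000 % 65536 = 4464 groups */
	__u32 after = max_linear;		/* keeps the requested 70000 */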
Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") CC: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-8-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 56938532b4ce..7bfc5fb5a128 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -193,8 +193,8 @@ struct ext4_allocation_context { ext4_grpblk_t ac_orig_goal_len; __u32 ac_flags; /* allocation hints */ + __u32 ac_groups_linear_remaining; __u16 ac_groups_scanned; - __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_cX_found[EXT4_MB_NUM_CRS]; __u16 ac_tail; From 261341a932d9244cbcd372a3659428c8723e5a49 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:24 +0800 Subject: [PATCH 15/40] ext4: set the type of max_zeroout to unsigned int to avoid overflow The max_zeroout is of type int and the s_extent_max_zeroout_kb is of type uint, and the s_extent_max_zeroout_kb can be freely modified via the sysfs interface. When the block size is 1024, max_zeroout may overflow, so declare it as unsigned int to avoid overflow. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-9-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e57054bdc5fd..e067f2dd0335 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3402,9 +3402,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct ext4_extent *ex, *abut_ex; ext4_lblk_t ee_block, eof_block; unsigned int ee_len, depth, map_len = map->m_len; - int allocated = 0, max_zeroout = 0; int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; + int allocated = 0; + unsigned int max_zeroout = 0; ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map_len); From e19089dff547c9e1f09712acc3536d7b0aa9ce3d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 19 Mar 2024 19:33:25 +0800 Subject: [PATCH 16/40] ext4: clean up s_mb_rb_lock to fix build warnings with C=1 Running sparse (make C=1) on mballoc.c we get the following warning: fs/ext4/mballoc.c:3194:13: warning: context imbalance in 'ext4_mb_seq_structs_summary_start' - wrong count at exit This is because __acquires(&EXT4_SB(sb)->s_mb_rb_lock) was called in ext4_mb_seq_structs_summary_start(), but s_mb_rb_lock was removed in commit 83e80a6e3543 ("ext4: use buckets for cr 1 block scan instead of rbtree"), so remove the __acquires to silence the warning. 
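For reference, the annotation pair that sparse checks looks like this (generic sketch, not from this series):

	static void demo_start(spinlock_t *lock) __acquires(lock)
	{
		spin_lock(lock);
	}

	static void demo_stop(spinlock_t *lock) __releases(lock)
	{
		spin_unlock(lock);
	}

Since ext4_mb_seq_structs_summary_start() no longer takes any lock, the stale __acquires() made sparse expect an acquire that never happens, hence the "wrong count at exit" imbalance.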
Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240319113325.3110393-10-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbf04f91516c..c65fac9b8c72 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3190,7 +3190,6 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) -__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; From 744a56389f7398f286231e062c2e63f0de01bcc6 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Thu, 21 Mar 2024 01:03:10 +0000 Subject: [PATCH 17/40] ext4: replace deprecated strncpy with alternatives strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. in file.c: s_last_mounted is marked as __nonstring meaning it does not need to be NUL-terminated. Let's instead use strtomem_pad() to copy bytes from the string source to the byte array destination -- while also ensuring to pad with zeroes. in ioctl.c: We can drop the memset and size argument in favor of using the new 2-argument version of strscpy_pad() -- which was introduced with Commit e6584c3964f2f ("string: Allow 2-argument strscpy()"). This guarantees NUL-termination and NUL-padding on the destination buffer -- which seems to be a requirement judging from this comment: | static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) | { | char label[EXT4_LABEL_MAX + 1]; | | /* | * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because | * FSLABEL_MAX must include terminating null byte, while s_volume_name | * does not have to. 
| */ in super.c: s_first_error_func is marked as __nonstring meaning we can take the same approach as in file.c; just use strtomem_pad() Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html [2] Link: https://github.com/KSPP/linux/issues/90 Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/20240321-strncpy-fs-ext4-file-c-v1-1-36a6a09fef0c@google.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 3 +-- fs/ext4/ioctl.c | 3 +-- fs/ext4/super.c | 7 +++---- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 965febab1d04..77529c655f95 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -844,8 +844,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strncpy(sbi->s_es->s_last_mounted, cp, - sizeof(sbi->s_es->s_last_mounted)); + strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7160a71044c8..dab7acd49709 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1150,9 +1150,8 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label */ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); - memset(label, 0, sizeof(label)); lock_buffer(sbi->s_sbh); - strncpy(label, sbi->s_es->s_volume_name, EXT4_LABEL_MAX); + strscpy_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9bde300b1551..f9b2b18374f3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6129,8 +6129,8 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_first_error_time, &es->s_first_error_time_hi, sbi->s_first_error_time); - strncpy(es->s_first_error_func, sbi->s_first_error_func, - sizeof(es->s_first_error_func)); + strtomem_pad(es->s_first_error_func, + sbi->s_first_error_func, 0); es->s_first_error_line = cpu_to_le32(sbi->s_first_error_line); es->s_first_error_ino = @@ -6143,8 +6143,7 @@ static void ext4_update_super(struct super_block *sb) __ext4_update_tstamp(&es->s_last_error_time, &es->s_last_error_time_hi, sbi->s_last_error_time); - strncpy(es->s_last_error_func, sbi->s_last_error_func, - sizeof(es->s_last_error_func)); + strtomem_pad(es->s_last_error_func, sbi->s_last_error_func, 0); es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); es->s_last_error_block = cpu_to_le64(sbi->s_last_error_block); From 4f3e6db3c3719952cfef89340290e0b7b03f7cbc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Mar 2024 17:26:49 +0100 Subject: [PATCH 18/40] Revert "ext4: drop duplicate ea_inode handling in ext4_xattr_block_set()" This reverts commit 7f48212678e91a057259b3e281701f7feb1ee397. We will need the special cleanup handling once we move allocation of EA inode outside of the buffer lock in the following patch. 
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240321162657.27420-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index b67a176bfcf9..146690c10c73 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -2158,6 +2158,17 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; + if (ea_inode) { + /* Drop the extra ref on ea_inode. */ + error = ext4_xattr_inode_dec_ref(handle, + ea_inode); + if (error) + ext4_warning_inode(ea_inode, + "dec ref error=%d", + error); + iput(ea_inode); + ea_inode = NULL; + } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, From 0a46ef234756dca04623b7591e8ebb3440622f0b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Mar 2024 17:26:50 +0100 Subject: [PATCH 19/40] ext4: do not create EA inode under buffer lock ext4_xattr_set_entry() creates new EA inodes while holding buffer lock on the external xattr block. This is problematic as it nests all the allocation locking (which acquires locks on other buffers) under the buffer lock. This can even deadlock when the filesystem is corrupted and e.g. quota file is setup to contain xattr block as data block. Move the allocation of EA inode out of ext4_xattr_set_entry() into the callers. Reported-by: syzbot+a43d4f48b8397d0e41a9@syzkaller.appspotmail.com Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20240321162657.27420-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/xattr.c | 113 +++++++++++++++++++++++------------------------- 1 file changed, 53 insertions(+), 60 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 146690c10c73..04f90df8dbae 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1619,6 +1619,7 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, + struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; @@ -1626,7 +1627,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; - struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; @@ -1711,38 +1711,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, old_ea_inode = NULL; goto out; } - } - if (i->value && in_inode) { - WARN_ON_ONCE(!i->value_len); - new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode, - i->value, i->value_len); - if (IS_ERR(new_ea_inode)) { - ret = PTR_ERR(new_ea_inode); - new_ea_inode = NULL; - goto out; - } - } - - if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); - if (ret) { - /* Release newly required ref count on new_ea_inode. 
*/ - if (new_ea_inode) { - int err; - - err = ext4_xattr_inode_dec_ref(handle, - new_ea_inode); - if (err) - ext4_warning_inode(new_ea_inode, - "dec ref new_ea_inode err=%d", - err); - ext4_xattr_inode_free_quota(inode, new_ea_inode, - i->value_len); - } + if (ret) goto out; - } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); @@ -1866,7 +1839,6 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ret = 0; out: iput(old_ea_inode); - iput(new_ea_inode); return ret; } @@ -1929,9 +1901,21 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, size_t old_ea_inode_quota = 0; unsigned int ea_ino; - #define header(x) ((struct ext4_xattr_header *)(x)) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) { + error = PTR_ERR(ea_inode); + ea_inode = NULL; + goto cleanup; + } + } + if (s->base) { int offset = (char *)s->here - bs->bh->b_data; @@ -1940,6 +1924,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, EXT4_JTR_NONE); if (error) goto cleanup; + lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { @@ -1966,7 +1951,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, - true /* is_block */); + ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) @@ -2034,29 +2019,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, s->end = s->base + sb->s_blocksize; } - error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; - if (i->value && s->here->e_value_inum) { - /* - * A ref count on ea_inode has been taken as part of the call to - * ext4_xattr_set_entry() above. We would like to drop this - * extra ref but we have to wait until the xattr block is - * initialized and has its own ref count on the ea_inode. - */ - ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, - le32_to_cpu(s->here->e_hash), - &ea_inode); - if (error) { - ea_inode = NULL; - goto cleanup; - } - } - inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), @@ -2209,17 +2178,16 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, cleanup: if (ea_inode) { - int error2; + if (error) { + int error2; - error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); - if (error2) - ext4_warning_inode(ea_inode, "dec ref error=%d", - error2); - - /* If there was an error, revert the quota charge. 
*/ - if (error) + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); + } iput(ea_inode); } if (ce) @@ -2277,14 +2245,38 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; + struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; - error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); - if (error) + /* If we need EA inode, prepare it before locking the buffer */ + if (i->value && i->in_inode) { + WARN_ON_ONCE(!i->value_len); + + ea_inode = ext4_xattr_inode_lookup_create(handle, inode, + i->value, i->value_len); + if (IS_ERR(ea_inode)) + return PTR_ERR(ea_inode); + } + error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, + false /* is_block */); + if (error) { + if (ea_inode) { + int error2; + + error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); + if (error2) + ext4_warning_inode(ea_inode, "dec ref error=%d", + error2); + + ext4_xattr_inode_free_quota(inode, ea_inode, + i_size_read(ea_inode)); + iput(ea_inode); + } return error; + } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); @@ -2293,6 +2285,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } + iput(ea_inode); return 0; } From a11adf7be9d8baefe798eab49c356ab8e3924f0e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sat, 23 Mar 2024 00:55:18 +0800 Subject: [PATCH 20/40] ext4: implement filesystem specific alloc_inode in unit test We expect inode with ext4_inode_info type as following: mbt_kunit_init mbt_mb_init ext4_mb_init ext4_mb_init_backend sbi->s_buddy_cache = new_inode(sb); EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; Implement alloc_inode to allocate inode with ext4_inode_info type to avoid out-of-bounds write.
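For context, EXT4_I() is a container_of() downcast, so it is only valid when the inode really is embedded in an ext4_inode_info (essentially the in-tree definition from fs/ext4/ext4.h):

	static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
	{
		return container_of(inode, struct ext4_inode_info, vfs_inode);
	}

With the default alloc_inode, new_inode() returns a bare struct inode, and writing i_disksize through EXT4_I() lands outside that allocation.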
Signed-off-by: Kemeng Shi Reported-by: Guenter Roeck Tested-by: Guenter Roeck Link: https://lore.kernel.org/r/20240322165518.8147-1-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc-test.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 044ca5238f41..49aabcfe6b46 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -30,7 +30,31 @@ struct mbt_ext4_super_block { #define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx) #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) +static struct inode *mbt_alloc_inode(struct super_block *sb) +{ + struct ext4_inode_info *ei; + + ei = kmalloc(sizeof(struct ext4_inode_info), GFP_KERNEL); + if (!ei) + return NULL; + + INIT_LIST_HEAD(&ei->i_orphan); + init_rwsem(&ei->xattr_sem); + init_rwsem(&ei->i_data_sem); + inode_init_once(&ei->vfs_inode); + ext4_fc_init_inode(&ei->vfs_inode); + + return &ei->vfs_inode; +} + +static void mbt_free_inode(struct inode *inode) +{ + kfree(EXT4_I(inode)); +} + static const struct super_operations mbt_sops = { + .alloc_inode = mbt_alloc_inode, + .free_inode = mbt_free_inode, }; static void mbt_kill_sb(struct super_block *sb) From 9c97c34a998a01bc0cf970b1685456bc2ea16b64 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 24 Apr 2024 14:19:00 +0800 Subject: [PATCH 21/40] ext4: keep "prefetch_grp" and "nr" consistent Keep "prefetch_grp" and "nr" consistent to avoid calling ext4_mb_prefetch_fini with non-prefetched groups. When we step into next criteria, "prefetch_grp" is set to prefetch start of new criteria while "nr" is the number of groups prefetched in previous criteria. If previous criteria and next criteria are both inexpensive (< CR_GOAL_LEN_SLOW) and prefetch_ios reaches sbi->s_mb_prefetch_limit in previous criteria, "prefetch_grp" and "nr" will be inconsistent and may introduce unexpected cost to do ext4_mb_init_group for non-prefetched groups. Reset "nr" to 0 when we reset "prefetch_grp" to goal group to keep them consistent.
Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240424061904.987525-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c65fac9b8c72..33e6a3ebdb55 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2860,6 +2860,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; + nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { From d0b88624f81f272626c767f4b715f4b690fcbed2 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 24 Apr 2024 14:19:01 +0800 Subject: [PATCH 22/40] ext4: add test_mb_mark_used_cost to estimate cost of mb_mark_used Add test_mb_mark_used_cost to estimate cost of mb_mark_used Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20240424061904.987525-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc-test.c | 52 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 49aabcfe6b46..bb2a223b207c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -883,6 +883,56 @@ static void test_mb_free_blocks(struct kunit *test) ext4_mb_unload_buddy(&e4b); } +#define COUNT_FOR_ESTIMATE 100000 +static void test_mb_mark_used_cost(struct kunit *test) +{ + struct ext4_buddy e4b; + struct super_block *sb = (struct super_block *)test->priv; + struct ext4_free_extent ex; + int ret; + struct test_range ranges[TEST_RANGE_COUNT]; + int i, j; + unsigned long start, end, all = 0; + + /* buddy cache assumes that each page contains at least one block */ + if (sb->s_blocksize > PAGE_SIZE) + kunit_skip(test, "blocksize exceeds pagesize"); + + ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); + KUNIT_ASSERT_EQ(test, ret, 0); + + ex.fe_group = TEST_GOAL_GROUP; + for (j = 0; j < COUNT_FOR_ESTIMATE; j++) { + mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); + start = jiffies; + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ex.fe_start = ranges[i].start; + ex.fe_len = ranges[i].len; + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_mark_used(&e4b, &ex); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + end = jiffies; + all += (end - start); + + for (i = 0; i < TEST_RANGE_COUNT; i++) { + if (ranges[i].len == 0) + continue; + + ext4_lock_group(sb, TEST_GOAL_GROUP); + mb_free_blocks(NULL, &e4b, ranges[i].start, + ranges[i].len); + ext4_unlock_group(sb, TEST_GOAL_GROUP); + } + } + + kunit_info(test, "costed jiffies %lu\n", all); + ext4_mb_unload_buddy(&e4b); +} + static const struct mbt_ext4_block_layout mbt_test_layouts[] = { { .blocksize_bits = 10, @@ -925,6 +975,8 @@ static struct kunit_case mbt_test_cases[] = { KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params), KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params), KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params), + KUNIT_CASE_PARAM_ATTR(test_mb_mark_used_cost, mbt_layouts_gen_params, + { .speed = KUNIT_SPEED_SLOW }), {} }; From d1a3924e43a35860ed7edaeec7f901a1ade2ac37 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 24 Apr 2024 14:19:02 +0800 Subject: [PATCH 23/40] ext4: call ext4_mb_mark_free_simple to free continuous bits in found chunk In mb_mark_used, we will find free chunk and mark 
it inuse. For chunk in mid of passed range, we could simply mark whole chunk inuse. For chunk at end of range, we may need to mark a continuous bits at end of part of chunk inuse and keep rest part of chunk free. To only mark a part of chunk inuse, we firstly mark whole chunk inuse and then mark a continuous range at end of chunk free. Function mb_mark_used does several times of "mb_find_buddy; mb_clear_bit; ..." to mark a continuous range free which can be done by simply calling ext4_mb_mark_free_simple which free continuous bits in a more effective way. Just call ext4_mb_mark_free_simple in mb_mark_used to use existing and effective code to free continuous blocks in chunk at end of passed range. Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240424061904.987525-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 33e6a3ebdb55..9a2b0bd0dcbc 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2044,13 +2044,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) int ord; int mlen = 0; int max = 0; - int cur; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; - bool split = false; + int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); @@ -2075,16 +2074,12 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) /* let's maintain buddy itself */ while (len) { - if (!split) - ord = mb_find_order_for_block(e4b, start); + ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! 
*/ mlen = 1 << ord; - if (!split) - buddy = mb_find_buddy(e4b, ord, &max); - else - split = false; + buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; @@ -2098,20 +2093,29 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) if (ret == 0) ret = len | (ord << 16); - /* we have to split large buddy */ BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; - ord--; - cur = (start >> ord) & ~1U; - buddy = mb_find_buddy(e4b, ord, &max); - mb_clear_bit(cur, buddy); - mb_clear_bit(cur + 1, buddy); - e4b->bd_info->bb_counters[ord]++; - e4b->bd_info->bb_counters[ord]++; - split = true; + ord_start = (start >> ord) << ord; + ord_end = ord_start + (1 << ord); + /* first chunk */ + if (start > ord_start) + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + ord_start, start - ord_start, + e4b->bd_info); + + /* last chunk */ + if (start + len < ord_end) { + ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, + start + len, + ord_end - (start + len), + e4b->bd_info); + break; + } + len = start + len - ord_end; + start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); From 2caffb6a277bb0f2a482a2eb824d012d5f45f4d0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 24 Apr 2024 14:19:03 +0800 Subject: [PATCH 24/40] ext4: use correct criteria name instead stale integer number in comment Use correct criteria name instead stale integer number in comment Signed-off-by: Kemeng Shi Reviewed-by: Ojaswin Mujoo Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240424061904.987525-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 9 ++++++--- fs/ext4/mballoc.c | 16 +++++++++------- fs/ext4/mballoc.h | 4 ++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8d126654019e..983dad8c07ec 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -213,11 +213,14 @@ enum criteria { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 -/* Large fragment size list lookup succeeded at least once for cr = 0 */ +/* Large fragment size list lookup succeeded at least once for + * CR_POWER2_ALIGNED */ #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_GOAL_LEN_FAST */ #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 -/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */ +/* Avg fragment size rb tree lookup succeeded at least once for + * CR_BEST_AVAIL_LEN */ #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 struct ext4_allocation_request { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9a2b0bd0dcbc..65ce2dd0cefb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1135,8 +1135,9 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* - * TODO: For CR=2, we can arrange groups in an rb tree sorted by - * bb_free. But until that happens, we should never come here. + * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an + * rb tree sorted by bb_free. But until that happens, we should + * never come here. 
 		 */
 		WARN_ON(1);
 	}
@@ -2683,7 +2684,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 	int ret;
 
 	/*
-	 * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+	 * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
 	 * search to find large good chunks almost for free. If buddy
 	 * data is not ready, then this optimization makes no sense. But
 	 * we never skip the first block group in a flex_bg, since this
@@ -3448,10 +3449,11 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	}
 	if (sbi->s_mb_prefetch > ext4_get_groups_count(sb))
 		sbi->s_mb_prefetch = ext4_get_groups_count(sb);
-	/* now many real IOs to prefetch within a single allocation at cr=0
-	 * given cr=0 is an CPU-related optimization we shouldn't try to
-	 * load too many groups, at some point we should start to use what
-	 * we've got in memory.
+	/*
+	 * how many real IOs to prefetch within a single allocation at
+	 * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is a CPU-related
+	 * optimization we shouldn't try to load too many groups, at some point
+	 * we should start to use what we've got in memory.
 	 * with an average random access time 5ms, it'd take a second to get
 	 * 200 groups (* N with flex_bg), so let's make this limit 4
 	 */
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 7bfc5fb5a128..7ee13d8eecd7 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -187,8 +187,8 @@ struct ext4_allocation_context {
 	struct ext4_free_extent ac_f_ex;
 
 	/*
-	 * goal len can change in CR1.5, so save the original len. This is
-	 * used while adjusting the PA window and for accounting.
+	 * goal len can change in CR_BEST_AVAIL_LEN, so save the original len.
+	 * This is used while adjusting the PA window and for accounting.
 	 */
 	ext4_grpblk_t ac_orig_goal_len;

From da5704eef7037a5bc84a56519729d93d10a0e0a0 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Wed, 24 Apr 2024 14:19:04 +0800
Subject: [PATCH 25/40] ext4: open code repeated check in next_linear_group

Open code the repeated check in next_linear_group().

Signed-off-by: Kemeng Shi
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20240424061904.987525-6-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 65ce2dd0cefb..e89c1bfb7ce3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1080,23 +1080,11 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
 }
 
 /*
- * Return next linear group for allocation. If linear traversal should not be
- * performed, this function just returns the same group
+ * Return next linear group for allocation.
  */
 static ext4_group_t
-next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group,
-		  ext4_group_t ngroups)
+next_linear_group(ext4_group_t group, ext4_group_t ngroups)
 {
-	if (!should_optimize_scan(ac))
-		goto inc_and_return;
-
-	if (ac->ac_groups_linear_remaining) {
-		ac->ac_groups_linear_remaining--;
-		goto inc_and_return;
-	}
-
-	return group;
-inc_and_return:
 	/*
 	 * Artificially restricted ngroups for non-extent
 	 * files makes group > ngroups possible on first loop.
@@ -1122,8 +1110,19 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, { *new_cr = ac->ac_criteria; - if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) { - *group = next_linear_group(ac, *group, ngroups); + if (!should_optimize_scan(ac)) { + *group = next_linear_group(*group, ngroups); + return; + } + + /* + * Optimized scanning can return non adjacent groups which can cause + * seek overhead for rotational disks. So try few linear groups before + * trying optimized scan. + */ + if (ac->ac_groups_linear_remaining) { + *group = next_linear_group(*group, ngroups); + ac->ac_groups_linear_remaining--; return; } From 99b150d84e4939735cfce245e32e3d29312c68ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 18:28:54 +0100 Subject: [PATCH 26/40] ext4: convert bd_bitmap_page to bd_bitmap_folio There is no need to make this a multi-page folio, so leave all the infrastructure around it in pages. But since we're locking it, playing with its refcount and checking whether it's uptodate, it needs to move to the folio API. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240416172900.244637-2-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 98 ++++++++++++++++++++++++----------------------- fs/ext4/mballoc.h | 2 +- 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e89c1bfb7ce3..174c029a79f6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1452,9 +1452,10 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, int block, pnum, poff; int blocks_per_page; struct page *page; + struct folio *folio; e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* @@ -1465,12 +1466,13 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ @@ -1488,9 +1490,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) { - unlock_page(e4b->bd_bitmap_page); - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) { + folio_unlock(e4b->bd_bitmap_folio); + folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_page) { unlock_page(e4b->bd_buddy_page); @@ -1510,6 +1512,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct page *page; + struct folio *folio; int ret = 0; might_sleep(); @@ -1536,11 +1539,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - page = e4b.bd_bitmap_page; - ret = ext4_mb_init_cache(page, NULL, gfp); + folio = e4b.bd_bitmap_folio; + ret = ext4_mb_init_cache(&folio->page, NULL, gfp); if (ret) goto err; - if 
(!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1582,6 +1585,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int pnum; int poff; struct page *page; + struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1600,7 +1604,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_page = NULL; - e4b->bd_bitmap_page = NULL; + e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1621,53 +1625,53 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - /* we could use find_or_create_page(), but it locks page - * what we'd like to avoid in fast path ... */ - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) + /* Avoid locking the folio in the fast path ... */ + folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) /* - * drop the page reference and try - * to get the page with lock. If we + * drop the folio reference and try + * to get the folio with lock. If we * are not uptodate that implies - * somebody just created the page but - * is yet to initialize the same. So + * somebody just created the folio but + * is yet to initialize it. So * wait for it to initialize. */ - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: bitmap's paging->mapping != inode->i_mapping\n")) { + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, NULL, gfp); + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(&folio->page, NULL, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } - mb_cmp_bitmaps(e4b, page_address(page) + + mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Pages marked accessed already */ - e4b->bd_bitmap_page = page; - e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); + e4b->bd_bitmap_folio = folio; + e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; @@ -1715,8 +1719,8 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, err: if (page) put_page(page); - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; @@ -1731,8 +1735,8 @@ static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { - if (e4b->bd_bitmap_page) - put_page(e4b->bd_bitmap_page); + if (e4b->bd_bitmap_folio) + folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_page) 
put_page(e4b->bd_buddy_page); } @@ -2157,7 +2161,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = e4b->bd_bitmap_page; + ac->ac_bitmap_page = &e4b->bd_bitmap_folio->page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); @@ -3894,7 +3898,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, * balance refcounts from ext4_mb_free_metadata() */ put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); @@ -6316,7 +6320,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); + BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_page == NULL); new_node = &new_entry->efd_node; @@ -6329,7 +6333,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * on-disk bitmap and lose not-yet-available * blocks */ get_page(e4b->bd_buddy_page); - get_page(e4b->bd_bitmap_page); + folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 7ee13d8eecd7..f8b7e9d2e463 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -217,7 +217,7 @@ struct ext4_allocation_context { struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; - struct page *bd_bitmap_page; + struct folio *bd_bitmap_folio; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; From 5eea586b47f05b5f5518cf8f9dd9283a01a8066d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 18:28:55 +0100 Subject: [PATCH 27/40] ext4: convert bd_buddy_page to bd_buddy_folio There is no need to make this a multi-page folio, so leave all the infrastructure around it in pages. But since we're locking it, playing with its refcount and checking whether it's uptodate, it needs to move to the folio API. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240416172900.244637-3-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 91 +++++++++++++++++++++++------------------------ fs/ext4/mballoc.h | 2 +- 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 174c029a79f6..1a76e5741d36 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1443,7 +1443,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap - * are on the same page e4b->bd_buddy_page is NULL and return value is 0. + * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. 
*/ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) @@ -1451,10 +1451,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; - struct page *page; struct folio *folio; - e4b->bd_buddy_page = NULL; + e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; @@ -1480,11 +1479,12 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, } /* blocks_per_page == 1, hence we need another page for the buddy */ - page = find_or_create_page(inode->i_mapping, block + 1, gfp); - if (!page) - return -ENOMEM; - BUG_ON(page->mapping != inode->i_mapping); - e4b->bd_buddy_page = page; + folio = __filemap_get_folio(inode->i_mapping, block + 1, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); + BUG_ON(folio->mapping != inode->i_mapping); + e4b->bd_buddy_folio = folio; return 0; } @@ -1494,9 +1494,9 @@ static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } - if (e4b->bd_buddy_page) { - unlock_page(e4b->bd_buddy_page); - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) { + folio_unlock(e4b->bd_buddy_folio); + folio_put(e4b->bd_buddy_folio); } } @@ -1511,7 +1511,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct ext4_group_info *this_grp; struct ext4_buddy e4b; - struct page *page; struct folio *folio; int ret = 0; @@ -1548,7 +1547,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } - if (e4b.bd_buddy_page == NULL) { + if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force @@ -1558,11 +1557,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) goto err; } /* init buddy cache */ - page = e4b.bd_buddy_page; - ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp); + folio = e4b.bd_buddy_folio; + ret = ext4_mb_init_cache(&folio->page, e4b.bd_bitmap, gfp); if (ret) goto err; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } @@ -1584,7 +1583,6 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, int block; int pnum; int poff; - struct page *page; struct folio *folio; int ret; struct ext4_group_info *grp; @@ -1603,7 +1601,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; - e4b->bd_buddy_page = NULL; + e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -1669,7 +1667,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } - /* Pages marked accessed already */ + /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); @@ -1677,48 +1675,49 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); - if (page == NULL || !PageUptodate(page)) { - if (page) - put_page(page); - page = find_or_create_page(inode->i_mapping, pnum, gfp); - if (page) { - if (WARN_RATELIMIT(page->mapping != inode->i_mapping, - "ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) { + folio = 
__filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) { + if (!IS_ERR(folio)) + folio_put(folio); + folio = __filemap_get_folio(inode->i_mapping, pnum, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (!IS_ERR(folio)) { + if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, + "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ - unlock_page(page); + folio_unlock(folio); ret = -EINVAL; goto err; } - if (!PageUptodate(page)) { - ret = ext4_mb_init_cache(page, e4b->bd_bitmap, + if (!folio_test_uptodate(folio)) { + ret = ext4_mb_init_cache(&folio->page, e4b->bd_bitmap, gfp); if (ret) { - unlock_page(page); + folio_unlock(folio); goto err; } } - unlock_page(page); + folio_unlock(folio); } } - if (page == NULL) { - ret = -ENOMEM; + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto err; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } - /* Pages marked accessed already */ - e4b->bd_buddy_page = page; - e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); + /* Folios marked accessed already */ + e4b->bd_buddy_folio = folio; + e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: - if (page) - put_page(page); + if (folio) + folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); @@ -1737,8 +1736,8 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); - if (e4b->bd_buddy_page) - put_page(e4b->bd_buddy_page); + if (e4b->bd_buddy_folio) + folio_put(e4b->bd_buddy_folio); } @@ -2163,7 +2162,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, */ ac->ac_bitmap_page = &e4b->bd_bitmap_folio->page; get_page(ac->ac_bitmap_page); - ac->ac_buddy_page = e4b->bd_buddy_page; + ac->ac_buddy_page = &e4b->bd_buddy_folio->page; get_page(ac->ac_buddy_page); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { @@ -3897,7 +3896,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); + folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); @@ -6321,7 +6320,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; @@ -6332,7 +6331,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ - get_page(e4b->bd_buddy_page); + folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index f8b7e9d2e463..268c838ddc2d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -215,7 +215,7 @@ struct ext4_allocation_context { #define AC_STATUS_BREAK 3 struct ext4_buddy { - struct page *bd_buddy_page; + struct folio *bd_buddy_folio; void *bd_buddy; struct folio *bd_bitmap_folio; void *bd_bitmap; From e1622a0d558200b0ccdfbf69ee29b9ac1f5c2442 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 18:28:56 +0100 Subject: [PATCH 28/40] ext4: convert ext4_mb_init_cache() to take a folio All callers now 
have a folio, so convert this function from operating on a page to
operating on a folio. The folio is assumed to be a single page.

Signed-off-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240416172900.244637-4-willy@infradead.org
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1a76e5741d36..71950abb457e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1274,7 +1274,7 @@ static void mb_regenerate_buddy(struct ext4_buddy *e4b)
  * for this page; do not hold this lock when calling this routine!
  */
 
-static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
+static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp)
 {
 	ext4_group_t ngroups;
 	unsigned int blocksize;
@@ -1292,13 +1292,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 	char *bitmap;
 	struct ext4_group_info *grinfo;
 
-	inode = page->mapping->host;
+	inode = folio->mapping->host;
 	sb = inode->i_sb;
 	ngroups = ext4_get_groups_count(sb);
 	blocksize = i_blocksize(inode);
 	blocks_per_page = PAGE_SIZE / blocksize;
 
-	mb_debug(sb, "init page %lu\n", page->index);
+	mb_debug(sb, "init folio %lu\n", folio->index);
 
 	groups_per_page = blocks_per_page >> 1;
 	if (groups_per_page == 0)
@@ -1313,9 +1313,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 	} else
 		bh = &bhs;
 
-	first_group = page->index * blocks_per_page / 2;
+	first_group = folio->index * blocks_per_page / 2;
 
-	/* read all groups the page covers into the cache */
+	/* read all groups the folio covers into the cache */
 	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 		if (group >= ngroups)
 			break;
@@ -1326,10 +1326,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 		/*
 		 * If page is uptodate then we came here after online resize
 		 * which added some new uninitialized group info structs, so
-		 * we must skip all initialized uptodate buddies on the page,
+		 * we must skip all initialized uptodate buddies on the folio,
 		 * which may be currently in use by an allocating task.
*/ - if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { + if (folio_test_uptodate(folio) && + !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } @@ -1353,7 +1354,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) err = err2; } - first_block = page->index * blocks_per_page; + first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) @@ -1374,7 +1375,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * above * */ - data = page_address(page) + (i * blocksize); + data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* @@ -1389,8 +1390,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(sb, "put buddy for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -1408,8 +1409,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", - group, page->index, i * blocksize); + mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", + group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ @@ -1427,7 +1428,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) incore = data; } } - SetPageUptodate(page); + folio_mark_uptodate(folio); out: if (bh) { @@ -1539,7 +1540,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) } folio = e4b.bd_bitmap_folio; - ret = ext4_mb_init_cache(&folio->page, NULL, gfp); + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { @@ -1558,7 +1559,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) } /* init buddy cache */ folio = e4b.bd_buddy_folio; - ret = ext4_mb_init_cache(&folio->page, e4b.bd_bitmap, gfp); + ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { @@ -1647,7 +1648,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } if (!folio_test_uptodate(folio)) { - ret = ext4_mb_init_cache(&folio->page, NULL, gfp); + ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; @@ -1690,7 +1691,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, goto err; } if (!folio_test_uptodate(folio)) { - ret = ext4_mb_init_cache(&folio->page, e4b->bd_bitmap, + ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); From ccedf35b5daa429c0e731eac6fb32b0208302c6b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 18:28:57 +0100 Subject: [PATCH 29/40] ext4: convert ac_bitmap_page to ac_bitmap_folio This just carries around the bd_bitmap_folio so should also be a folio. 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20240416172900.244637-5-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 8 ++++---- fs/ext4/mballoc.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 71950abb457e..218480b20058 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2161,8 +2161,8 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ - ac->ac_bitmap_page = &e4b->bd_bitmap_folio->page; - get_page(ac->ac_bitmap_page); + ac->ac_bitmap_folio = e4b->bd_bitmap_folio; + folio_get(ac->ac_bitmap_folio); ac->ac_buddy_page = &e4b->bd_buddy_folio->page; get_page(ac->ac_buddy_page); /* store last allocated for subsequent stream allocation */ @@ -6002,8 +6002,8 @@ static void ext4_mb_release_context(struct ext4_allocation_context *ac) ext4_mb_put_pa(ac, ac->ac_sb, pa); } - if (ac->ac_bitmap_page) - put_page(ac->ac_bitmap_page); + if (ac->ac_bitmap_folio) + folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_page) put_page(ac->ac_buddy_page); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 268c838ddc2d..469e5516fb77 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -204,7 +204,7 @@ struct ext4_allocation_context { __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; + struct folio *ac_bitmap_folio; struct page *ac_buddy_page; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; From c84f1510fba9fb181e6a1aa7b5fcfc67381b39c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 16 Apr 2024 18:28:58 +0100 Subject: [PATCH 30/40] ext4: convert ac_buddy_page to ac_buddy_folio This just carries around the bd_buddy_folio so should also be a folio. 
Signed-off-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240416172900.244637-6-willy@infradead.org
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 8 ++++----
 fs/ext4/mballoc.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 218480b20058..ac0ee747f4b9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2163,8 +2163,8 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
 	 */
 	ac->ac_bitmap_folio = e4b->bd_bitmap_folio;
 	folio_get(ac->ac_bitmap_folio);
-	ac->ac_buddy_page = &e4b->bd_buddy_folio->page;
-	get_page(ac->ac_buddy_page);
+	ac->ac_buddy_folio = e4b->bd_buddy_folio;
+	folio_get(ac->ac_buddy_folio);
 	/* store last allocated for subsequent stream allocation */
 	if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
 		spin_lock(&sbi->s_md_lock);
@@ -6004,8 +6004,8 @@ static void ext4_mb_release_context(struct ext4_allocation_context *ac)
 	}
 	if (ac->ac_bitmap_folio)
 		folio_put(ac->ac_bitmap_folio);
-	if (ac->ac_buddy_page)
-		put_page(ac->ac_buddy_page);
+	if (ac->ac_buddy_folio)
+		folio_put(ac->ac_buddy_folio);
 	if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
 		mutex_unlock(&ac->ac_lg->lg_mutex);
 	ext4_mb_collect_stats(ac);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 469e5516fb77..d8553f1498d3 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -205,7 +205,7 @@ struct ext4_allocation_context {
 					 * N > 0, the field stores N, otherwise 0 */
 	__u8 ac_op;			/* operation, for history only */
 	struct folio *ac_bitmap_folio;
-	struct page *ac_buddy_page;
+	struct folio *ac_buddy_folio;
 	struct ext4_prealloc_space *ac_pa;
 	struct ext4_locality_group *ac_lg;
 };

From 3f4830abd236d0428e50451e1ecb62e14c365e9b Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 17 Apr 2024 21:10:40 +0300
Subject: [PATCH 31/40] ext4: fix potential uninitialized variable

Smatch complains that "err" can be uninitialized in the caller:

  fs/ext4/indirect.c:349 ext4_alloc_branch()
  error: uninitialized symbol 'err'.

Set the error to zero on the success path.

Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Signed-off-by: Dan Carpenter
Link: https://lore.kernel.org/r/363a4673-0fb8-4adf-b4fb-90a499077276@moroto.mountain
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ac0ee747f4b9..648989c125f2 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -6126,6 +6126,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
 
 	ext4_mb_mark_bb(sb, block, 1, true);
 	ar->len = 1;
+	*errp = 0;
 	return block;
 }

From df0b5afc62f3368d657a8fe4a8d393ac481474c2 Mon Sep 17 00:00:00 2001
From: Zhang Yi
Date: Fri, 19 Apr 2024 10:30:05 +0800
Subject: [PATCH 32/40] ext4: remove the redundant folio_wait_stable()

__filemap_get_folio() with the FGP_WRITEBEGIN parameter already waits
for a stable folio, so remove the redundant folio_wait_stable() in
ext4_da_write_begin(); it was left over from commit cc883236b792
("ext4: drop unnecessary journal handle in delalloc write"), which
removed the retry-getting-page logic.
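
For reference, a minimal sketch of why the wait is redundant. The flag
composition below is quoted from include/linux/pagemap.h around the
time of this series (recent trees express it as an fgf_t value; check
your tree for the exact definition):

	/* FGP_WRITEBEGIN already implies waiting for a stable folio: */
	#define FGP_WRITEBEGIN	(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

Because FGP_STABLE is set, __filemap_get_folio() itself calls
folio_wait_stable() before returning, so a second call in
ext4_da_write_begin() only repeats work already done under the folio
lock.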
Fixes: cc883236b792 ("ext4: drop unnecessary journal handle in delalloc write")
Signed-off-by: Zhang Yi
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20240419023005.2719050-1-yi.zhang@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/inode.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 989e28c24a8c..4bae9ccf5fe0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2887,9 +2887,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	if (IS_ERR(folio))
 		return PTR_ERR(folio);
 
-	/* In case writeback began while the folio was unlocked */
-	folio_wait_stable(folio);
-
 #ifdef CONFIG_FS_ENCRYPTION
 	ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep);
 #else

From 8b57de1c5edde3faf8a4f6a440b7ec16bb3c81d4 Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 10 Apr 2024 12:28:03 +0100
Subject: [PATCH 33/40] jbd2: remove redundant assignment to variable err

The variable err is being assigned a value that is never read; it is
re-assigned inside the following while loop and again after the loop.
The assignment is redundant and can be removed.

Cleans up clang scan build warning:
fs/jbd2/commit.c:574:2: warning: Value stored to 'err' is never read
[deadcode.DeadStores]

Signed-off-by: Colin Ian King
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20240410112803.232993-1-colin.i.king@gmail.com
Signed-off-by: Theodore Ts'o
---
 fs/jbd2/commit.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5e122586e06e..78a9d08ae9f8 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -571,7 +571,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	J_ASSERT(commit_transaction->t_nr_buffers <=
 		 atomic_read(&commit_transaction->t_outstanding_credits));
 
-	err = 0;
 	bufs = 0;
 	descriptor = NULL;
 	while (commit_transaction->t_buffers) {

From 0c0b4a49d3e7f49690a6827a41faeffad5df7e21 Mon Sep 17 00:00:00 2001
From: Baokun Li
Date: Sat, 4 May 2024 15:55:25 +0800
Subject: [PATCH 34/40] ext4: fix mb_cache_entry's e_refcnt leak in
 ext4_xattr_block_cache_find()

Syzbot reports a warning as follows:

============================================
WARNING: CPU: 0 PID: 5075 at fs/mbcache.c:419 mb_cache_destroy+0x224/0x290
Modules linked in:
CPU: 0 PID: 5075 Comm: syz-executor199 Not tainted 6.9.0-rc6-gb947cc5bf6d7
RIP: 0010:mb_cache_destroy+0x224/0x290 fs/mbcache.c:419
Call Trace:
 ext4_put_super+0x6d4/0xcd0 fs/ext4/super.c:1375
 generic_shutdown_super+0x136/0x2d0 fs/super.c:641
 kill_block_super+0x44/0x90 fs/super.c:1675
 ext4_kill_sb+0x68/0xa0 fs/ext4/super.c:7327
[...]
============================================

This happens because when ext4_xattr_block_cache_find() finds an entry
and ext4_sb_bread() then returns -ENOMEM, the ce's e_refcnt, which was
already taken in __entry_find(), is never dropped, eventually
triggering the above warning in mb_cache_destroy() due to the
reference count leak.

So call mb_cache_entry_put() on the -ENOMEM error branch as a quick fix.
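
To illustrate the discipline the fix enforces, here is a minimal sketch
of the error branch (simplified from the hunk below, not the complete
function): every path that stops walking the cache entries must drop
the reference the lookup took on 'ce'.

	struct buffer_head *bh;

	bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
	if (IS_ERR(bh)) {
		if (PTR_ERR(bh) == -ENOMEM) {
			/* pair the reference taken by __entry_find() */
			mb_cache_entry_put(ea_block_cache, ce);
			return NULL;
		}
		/* other errors: report and keep searching */
	}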
Reported-by: syzbot+dd43bd0f7474512edc47@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=dd43bd0f7474512edc47
Fixes: fb265c9cb49e ("ext4: add ext4_sb_bread() to disambiguate ENOMEM cases")
Cc: stable@kernel.org
Signed-off-by: Baokun Li
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20240504075526.2254349-2-libaokun@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/xattr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04f90df8dbae..3e9b088800b1 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -3117,8 +3117,10 @@ ext4_xattr_block_cache_find(struct inode *inode,
 
 		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
 		if (IS_ERR(bh)) {
-			if (PTR_ERR(bh) == -ENOMEM)
+			if (PTR_ERR(bh) == -ENOMEM) {
+				mb_cache_entry_put(ea_block_cache, ce);
 				return NULL;
+			}
 			bh = NULL;
 			EXT4_ERROR_INODE(inode, "block %lu read error",
 					 (unsigned long)ce->e_value);

From dc1c4663bc493f323d6b2f9dd55c044ea920dacf Mon Sep 17 00:00:00 2001
From: Baokun Li
Date: Sat, 4 May 2024 15:55:26 +0800
Subject: [PATCH 35/40] ext4: propagate errors from ext4_sb_bread() in
 ext4_xattr_block_cache_find()

In ext4_xattr_block_cache_find(), when ext4_sb_bread() returns an
error, we will either continue to find the next ea block or return
NULL to try to insert a new ea block. But whether ext4_sb_bread()
returns -EIO or -ENOMEM, the next operation is most likely to fail
with the same error. So propagate the error returned by
ext4_sb_bread() so that ext4_xattr_block_set() fails early, avoiding
pointless operations.

Signed-off-by: Baokun Li
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20240504075526.2254349-3-libaokun@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/xattr.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 3e9b088800b1..6460879b9fcb 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2028,8 +2028,13 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 inserted:
 	if (!IS_LAST_ENTRY(s->first)) {
-		new_bh = ext4_xattr_block_cache_find(inode, header(s->base),
-						     &ce);
+		new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce);
+		if (IS_ERR(new_bh)) {
+			error = PTR_ERR(new_bh);
+			new_bh = NULL;
+			goto cleanup;
+		}
+
 		if (new_bh) {
 			/* We found an identical block in the cache. */
 			if (new_bh == bs->bh)
@@ -3094,8 +3099,8 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
 *
 * Find an identical extended attribute block.
 *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
+ * Returns a pointer to the block found, or NULL if such a block was not
+ * found, or an error pointer if an error occurred while reading the ea block.
 */
static struct buffer_head *
ext4_xattr_block_cache_find(struct inode *inode,
@@ -3117,13 +3122,11 @@ ext4_xattr_block_cache_find(struct inode *inode,
 
 		bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO);
 		if (IS_ERR(bh)) {
-			if (PTR_ERR(bh) == -ENOMEM) {
-				mb_cache_entry_put(ea_block_cache, ce);
-				return NULL;
-			}
-			bh = NULL;
-			EXT4_ERROR_INODE(inode, "block %lu read error",
-					 (unsigned long)ce->e_value);
+			if (PTR_ERR(bh) != -ENOMEM)
+				EXT4_ERROR_INODE(inode, "block %lu read error",
+						 (unsigned long)ce->e_value);
+			mb_cache_entry_put(ea_block_cache, ce);
+			return bh;
 		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
 			*pce = ce;
 			return bh;

From ea4fd933ab4310822e244af28d22ff63785dea0e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Sat, 20 Apr 2024 03:50:05 +0100
Subject: [PATCH 36/40] ext4: remove calls to set/clear the folio error flag

Nobody checks this flag on ext4 folios, so stop setting and clearing it.

Cc: Theodore Ts'o
Cc: Andreas Dilger
Cc: linux-ext4@vger.kernel.org
Signed-off-by: Matthew Wilcox (Oracle)
Link: https://lore.kernel.org/r/20240420025029.2166544-11-willy@infradead.org
Signed-off-by: Theodore Ts'o
---
 fs/ext4/move_extent.c | 4 +---
 fs/ext4/page-io.c     | 3 ---
 fs/ext4/readpage.c    | 1 -
 3 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 7cd4afa4de1d..204f53b23622 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -199,10 +199,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to)
 			continue;
 		if (!buffer_mapped(bh)) {
 			err = ext4_get_block(inode, block, bh, 0);
-			if (err) {
-				folio_set_error(folio);
+			if (err)
 				return err;
-			}
 			if (!buffer_mapped(bh)) {
 				folio_zero_range(folio, block_start, blocksize);
 				set_buffer_uptodate(bh);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 312bc6813357..ad5543866d21 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -117,7 +117,6 @@ static void ext4_finish_bio(struct bio *bio)
 		if (bio->bi_status) {
 			int err = blk_status_to_errno(bio->bi_status);
-			folio_set_error(folio);
 			mapping_set_error(folio->mapping, err);
 		}
 		bh = head = folio_buffers(folio);
@@ -441,8 +440,6 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 	BUG_ON(!folio_test_locked(folio));
 	BUG_ON(folio_test_writeback(folio));
-	folio_clear_error(folio);
-
 	/*
 	 * Comments copied from block_write_full_folio:
 	 *
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 21e8f0aebb3c..8494492582ab 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -289,7 +289,6 @@ int ext4_mpage_readpages(struct inode *inode,
 
 		if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
 		set_error_page:
-			folio_set_error(folio);
 			folio_zero_segment(folio, 0,
 					   folio_size(folio));
 			folio_unlock(folio);

From b4b4fda34e535756f9e774fb2d09c4537b7dfd1c Mon Sep 17 00:00:00 2001
From: Baokun Li
Date: Tue, 2 Jan 2024 21:37:30 +0800
Subject: [PATCH 37/40] ext4: fix uninitialized ratelimit_state->lock access in
 __ext4_fill_super()

In the following concurrency scenario we will access the uninitialized
rs->lock:

ext4_fill_super
  ext4_register_sysfs
  // sysfs registered
				msg_ratelimit_interval_ms
				// Other processes modify rs->interval to
				// non-zero via msg_ratelimit_interval_ms
  ext4_orphan_cleanup
    ext4_msg(sb, KERN_INFO, "Errors on filesystem, "
      __ext4_msg
        ___ratelimit(&(EXT4_SB(sb)->s_msg_ratelimit_state)
          if (!rs->interval)  // do nothing if interval is 0
            return 1;
          raw_spin_trylock_irqsave(&rs->lock, flags)
            raw_spin_trylock(lock)
              _raw_spin_trylock
                __raw_spin_trylock
spin_acquire(&lock->dep_map, 0, 1, _RET_IP_) lock_acquire __lock_acquire register_lock_class assign_lock_key dump_stack(); ratelimit_state_init(&sbi->s_msg_ratelimit_state, 5 * HZ, 10); raw_spin_lock_init(&rs->lock); // init rs->lock here and get the following dump_stack: ========================================================= INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 12 PID: 753 Comm: mount Tainted: G E 6.7.0-rc6-next-20231222 #504 [...] Call Trace: dump_stack_lvl+0xc5/0x170 dump_stack+0x18/0x30 register_lock_class+0x740/0x7c0 __lock_acquire+0x69/0x13a0 lock_acquire+0x120/0x450 _raw_spin_trylock+0x98/0xd0 ___ratelimit+0xf6/0x220 __ext4_msg+0x7f/0x160 [ext4] ext4_orphan_cleanup+0x665/0x740 [ext4] __ext4_fill_super+0x21ea/0x2b10 [ext4] ext4_fill_super+0x14d/0x360 [ext4] [...] ========================================================= Normally interval is 0 until s_msg_ratelimit_state is initialized, so ___ratelimit() does nothing. But registering sysfs precedes initializing rs->lock, so it is possible to change rs->interval to a non-zero value via the msg_ratelimit_interval_ms interface of sysfs while rs->lock is uninitialized, and then a call to ext4_msg triggers the problem by accessing an uninitialized rs->lock. Therefore register sysfs after all initializations are complete to avoid such problems. Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20240102133730.1098120-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f9b2b18374f3..f9a4a4e89dac 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5550,19 +5550,15 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (err) goto failed_mount6; - err = ext4_register_sysfs(sb); - if (err) - goto failed_mount7; - err = ext4_init_orphan_info(sb); if (err) - goto failed_mount8; + goto failed_mount7; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount9; + goto failed_mount8; } #endif /* CONFIG_QUOTA */ @@ -5588,7 +5584,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount10; + goto failed_mount9; } if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) @@ -5605,15 +5601,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) atomic_set(&sbi->s_warning_count, 0); atomic_set(&sbi->s_msg_count, 0); + /* Register sysfs after all initializations are complete. 
*/ + err = ext4_register_sysfs(sb); + if (err) + goto failed_mount9; + return 0; -failed_mount10: +failed_mount9: ext4_quotas_off(sb, EXT4_MAXQUOTAS); -failed_mount9: __maybe_unused +failed_mount8: __maybe_unused ext4_release_orphan_info(sb); -failed_mount8: - ext4_unregister_sysfs(sb); - kobject_put(&sbi->s_kobj); failed_mount7: ext4_unregister_li_request(sb); failed_mount6: From 078760d950016f5982751f5512e69f26ad8feb31 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sun, 7 Apr 2024 14:53:54 +0800 Subject: [PATCH 38/40] jbd2: use shrink_type type instead of bool type for __jbd2_journal_clean_checkpoint_list() "enum shrink_type" can clearly express the meaning of the parameter of __jbd2_journal_clean_checkpoint_list(), and there is no need to use the bool type. Signed-off-by: Ye Bin Reviewed-by: Jan Kara Reviewed-by: Zhang Yi Link: https://lore.kernel.org/r/20240407065355.1528580-2-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 16 +++++++++------- fs/jbd2/commit.c | 2 +- include/linux/jbd2.h | 4 +++- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1c97e64c4784..80c0ab98bc63 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -337,8 +337,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ -enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; - /* * journal_shrink_one_cp_list * @@ -472,21 +470,25 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. - * If 'destroy' is set, release all buffers unconditionally. + * If 'type' is SHRINK_DESTROY, release all buffers unconditionally. If 'type' + * is SHRINK_BUSY_STOP, will stop release buffers if encounters a busy buffer. + * To avoid wasting CPU cycles scanning the buffer list in some cases, don't + * pass SHRINK_BUSY_SKIP 'type' for this function. * * Called with j_list_lock held. */ -void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal, + enum shrink_type type) { transaction_t *transaction, *last_transaction, *next_transaction; - enum shrink_type type; bool released; + WARN_ON_ONCE(type == SHRINK_BUSY_SKIP); + transaction = journal->j_checkpoint_transactions; if (!transaction) return; - type = destroy ? 
SHRINK_DESTROY : SHRINK_BUSY_STOP;
 	last_transaction = transaction->t_cpprev;
 	next_transaction = transaction;
 	do {
@@ -527,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal)
 			spin_unlock(&journal->j_list_lock);
 			break;
 		}
-		__jbd2_journal_clean_checkpoint_list(journal, true);
+		__jbd2_journal_clean_checkpoint_list(journal, SHRINK_DESTROY);
 		spin_unlock(&journal->j_list_lock);
 		cond_resched();
 	}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 78a9d08ae9f8..b341c396311b 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	 * frees some memory
 	 */
 	spin_lock(&journal->j_list_lock);
-	__jbd2_journal_clean_checkpoint_list(journal, false);
+	__jbd2_journal_clean_checkpoint_list(journal, SHRINK_BUSY_STOP);
 	spin_unlock(&journal->j_list_lock);
 
 	jbd2_debug(3, "JBD2: commit phase 1\n");
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 971f3e826e15..58a961999d70 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1434,7 +1434,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 extern void jbd2_journal_commit_transaction(journal_t *);
 
 /* Checkpoint list management */
-void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
+enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP};
+
+void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum shrink_type type);
 unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
 int __jbd2_journal_remove_checkpoint(struct journal_head *);
 int jbd2_journal_try_remove_checkpoint(struct journal_head *jh);

From 26770a717cac57041d9414725e3e01dd19b08dd2 Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Sun, 7 Apr 2024 14:53:55 +0800
Subject: [PATCH 39/40] jbd2: add prefix 'jbd2' for 'shrink_type'

As 'shrink_type' is exported, the module prefix 'jbd2' is added to
distinguish it from memory reclamation.

Signed-off-by: Ye Bin
Reviewed-by: Jan Kara
Reviewed-by: Zhang Yi
Link: https://lore.kernel.org/r/20240407065355.1528580-3-yebin10@huawei.com
Signed-off-by: Theodore Ts'o
---
 fs/jbd2/checkpoint.c | 22 +++++++++++-----------
 fs/jbd2/commit.c     |  2 +-
 include/linux/jbd2.h |  4 ++--
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 80c0ab98bc63..951f78634adf 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -348,7 +348,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 * Called with j_list_lock held.
*/ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, - enum shrink_type type, + enum jbd2_shrink_type type, bool *released) { struct journal_head *last_jh; @@ -365,12 +365,12 @@ static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, jh = next_jh; next_jh = jh->b_cpnext; - if (type == SHRINK_DESTROY) { + if (type == JBD2_SHRINK_DESTROY) { ret = __jbd2_journal_remove_checkpoint(jh); } else { ret = jbd2_journal_try_remove_checkpoint(jh); if (ret < 0) { - if (type == SHRINK_BUSY_SKIP) + if (type == JBD2_SHRINK_BUSY_SKIP) continue; break; } @@ -437,7 +437,7 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, tid = transaction->t_tid; freed = journal_shrink_one_cp_list(transaction->t_checkpoint_list, - SHRINK_BUSY_SKIP, &released); + JBD2_SHRINK_BUSY_SKIP, &released); nr_freed += freed; (*nr_to_scan) -= min(*nr_to_scan, freed); if (*nr_to_scan == 0) @@ -470,20 +470,20 @@ unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. - * If 'type' is SHRINK_DESTROY, release all buffers unconditionally. If 'type' - * is SHRINK_BUSY_STOP, will stop release buffers if encounters a busy buffer. - * To avoid wasting CPU cycles scanning the buffer list in some cases, don't - * pass SHRINK_BUSY_SKIP 'type' for this function. + * If 'type' is JBD2_SHRINK_DESTROY, release all buffers unconditionally. If + * 'type' is JBD2_SHRINK_BUSY_STOP, will stop release buffers if encounters a + * busy buffer. To avoid wasting CPU cycles scanning the buffer list in some + * cases, don't pass JBD2_SHRINK_BUSY_SKIP 'type' for this function. * * Called with j_list_lock held. */ void __jbd2_journal_clean_checkpoint_list(journal_t *journal, - enum shrink_type type) + enum jbd2_shrink_type type) { transaction_t *transaction, *last_transaction, *next_transaction; bool released; - WARN_ON_ONCE(type == SHRINK_BUSY_SKIP); + WARN_ON_ONCE(type == JBD2_SHRINK_BUSY_SKIP); transaction = journal->j_checkpoint_transactions; if (!transaction) @@ -529,7 +529,7 @@ void jbd2_journal_destroy_checkpoint(journal_t *journal) spin_unlock(&journal->j_list_lock); break; } - __jbd2_journal_clean_checkpoint_list(journal, SHRINK_DESTROY); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_DESTROY); spin_unlock(&journal->j_list_lock); cond_resched(); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index b341c396311b..75ea4e9a5cab 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -501,7 +501,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * frees some memory */ spin_lock(&journal->j_list_lock); - __jbd2_journal_clean_checkpoint_list(journal, SHRINK_BUSY_STOP); + __jbd2_journal_clean_checkpoint_list(journal, JBD2_SHRINK_BUSY_STOP); spin_unlock(&journal->j_list_lock); jbd2_debug(3, "JBD2: commit phase 1\n"); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 58a961999d70..7479f64c0939 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1434,9 +1434,9 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -enum shrink_type {SHRINK_DESTROY, SHRINK_BUSY_STOP, SHRINK_BUSY_SKIP}; +enum jbd2_shrink_type {JBD2_SHRINK_DESTROY, JBD2_SHRINK_BUSY_STOP, JBD2_SHRINK_BUSY_SKIP}; -void __jbd2_journal_clean_checkpoint_list(journal_t *journal, enum shrink_type type); +void 
__jbd2_journal_clean_checkpoint_list(journal_t *journal, enum jbd2_shrink_type type); unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); int __jbd2_journal_remove_checkpoint(struct journal_head *); int jbd2_journal_try_remove_checkpoint(struct journal_head *jh); From c6a6c9694aadc4c3ab8d89bdd44aed3eab1e43c6 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 May 2024 18:22:53 +0300 Subject: [PATCH 40/40] ext4: fix error pointer dereference in ext4_mb_load_buddy_gfp() This code calls folio_put() on an error pointer which will lead to a crash. Check for both error pointers and NULL pointers before calling folio_put(). Fixes: 5eea586b47f0 ("ext4: convert bd_buddy_page to bd_buddy_folio") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/eaafa1d9-a61c-4af4-9f97-d3ad72c60200@moroto.mountain Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 648989c125f2..9dda9cd68ab2 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1717,7 +1717,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, return 0; err: - if (folio) + if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio);
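
A closing note on the convention behind this last fix. Below is a
minimal sketch of the pattern (illustrative only, not taken from ext4;
'mapping', 'index', 'gfp' and 'ret' are assumed to be in scope):

	struct folio *folio = NULL;

	folio = __filemap_get_folio(mapping, index,
				    FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
	if (IS_ERR(folio)) {
		ret = PTR_ERR(folio);
		goto err;	/* 'folio' now holds an ERR_PTR(), not NULL */
	}
	...
err:
	if (!IS_ERR_OR_NULL(folio))	/* a bare NULL check would crash */
		folio_put(folio);

__filemap_get_folio() reports failure with an ERR_PTR(), never NULL,
so an error path that may see either a valid folio or a failed lookup
must test with IS_ERR_OR_NULL() (or reset the local to NULL on failure)
before calling folio_put().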