From 2a162ce93232eb78124601996744f8eafec845ab Mon Sep 17 00:00:00 2001 From: Davide Italiano Date: Mon, 6 Apr 2015 22:09:15 -0700 Subject: [PATCH 1/9] Btrfs: Improve FL_KEEP_SIZE handling in fallocate - We call inode_size_ok() only if FL_KEEP_SIZE isn't specified. - As an optimisation we can skip the call if (off + len) isn't greater than the current size of the file. This operation is called under the lock so the less work we do, the better. - If we call inode_size_ok() pass to it the correct value rather than a more conservative estimation. Signed-off-by: Davide Italiano Reviewed-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 15a09cb156ce..7af1abda68d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2682,9 +2682,12 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; inode_lock(inode); - ret = inode_newsize_ok(inode, alloc_end); - if (ret) - goto out; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out; + } /* * TODO: Move these two operations after we have checked From 264813acb1c756aebc337b16b832604a0c9aadaf Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 21 Mar 2016 14:59:53 -0700 Subject: [PATCH 2/9] Btrfs: fix invalid reference in replace_path Dan Carpenter's static checker has found this error, it's introduced by commit 64c043de466d ("Btrfs: fix up read_tree_block to return proper error") It's really supposed to 'break' the loop on error like others. Cc: Dan Carpenter Reported-by: Dan Carpenter Signed-off-by: Liu Bo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2bd0011450df..5c806f0d443d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1850,6 +1850,7 @@ again: eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (IS_ERR(eb)) { ret = PTR_ERR(eb); + break; } else if (!extent_buffer_uptodate(eb)) { ret = -EIO; free_extent_buffer(eb); From 0305bc279317a6ba261a663cc32721d78e6544cf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 23 Mar 2016 11:38:17 +0800 Subject: [PATCH 3/9] btrfs: Output more info for enospc_debug mount option As one user in mail list report reproducible balance ENOSPC error, it's better to add more debug info for enospc_debug mount option. Reported-by: Marc Haber Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 53e12977bfd0..85074845c562 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -9386,15 +9386,23 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) u64 dev_min = 1; u64 dev_nr = 0; u64 target; + int debug; int index; int full = 0; int ret = 0; + debug = btrfs_test_opt(root, ENOSPC_DEBUG); + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ - if (!block_group) + if (!block_group) { + if (debug) + btrfs_warn(root->fs_info, + "can't find block group for bytenr %llu", + bytenr); return -1; + } min_free = btrfs_block_group_used(&block_group->item); @@ -9448,8 +9456,13 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * this is just a balance, so if we were marked as full * we know there is no space for a new chunk */ - if (full) + if (full) { + if (debug) + btrfs_warn(root->fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; + } index = get_block_group_index(block_group); } @@ -9496,6 +9509,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) ret = -1; } } + if (debug && ret == -1) + btrfs_warn(root->fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); mutex_unlock(&root->fs_info->chunk_mutex); btrfs_end_transaction(trans, root); out: From 918c2ee103cf9956f1c61d3f848dbb49fd2d104a Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 30 Mar 2016 17:57:48 -0700 Subject: [PATCH 4/9] btrfs: handle non-fatal errors in btrfs_qgroup_inherit() create_pending_snapshot() will go readonly on _any_ error return from btrfs_qgroup_inherit(). If qgroups are enabled, a user can crash their fs by just making a snapshot and asking it to inherit from an invalid qgroup. For example: $ btrfs sub snap -i 1/10 /btrfs/ /btrfs/foo Will cause a transaction abort. Fix this by only throwing errors in btrfs_qgroup_inherit() when we know going readonly is acceptable. The following xfstests test case reproduces this bug: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" here=`pwd` tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # remove previous $seqres.full before test rm -f $seqres.full # real QA test starts here _supported_fs btrfs _supported_os Linux _require_scratch rm -f $seqres.full _scratch_mkfs _scratch_mount _run_btrfs_util_prog quota enable $SCRATCH_MNT # The qgroup '1/10' does not exist and should be silently ignored _run_btrfs_util_prog subvolume snapshot -i 1/10 $SCRATCH_MNT $SCRATCH_MNT/snap1 _scratch_unmount echo "Silence is golden" status=0 exit Signed-off-by: Mark Fasheh Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 54 ++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 5279fdae7142..7173360eea7a 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1842,8 +1842,10 @@ out: } /* - * copy the acounting information between qgroups. This is necessary when a - * snapshot or a subvolume is created + * Copy the acounting information between qgroups. This is necessary + * when a snapshot or a subvolume is created. Throwing an error will + * cause a transaction abort so we take extra care here to only error + * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, @@ -1873,15 +1875,15 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 2 * inherit->num_excl_copies; for (i = 0; i < nums; ++i) { srcgroup = find_qgroup_rb(fs_info, *i_qgroups); - if (!srcgroup) { - ret = -EINVAL; - goto out; - } - if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { - ret = -EINVAL; - goto out; - } + /* + * Zero out invalid groups so we can ignore + * them later. + */ + if (!srcgroup || + ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) + *i_qgroups = 0ULL; + ++i_qgroups; } } @@ -1916,17 +1918,19 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, */ if (inherit) { i_qgroups = (u64 *)(inherit + 1); - for (i = 0; i < inherit->num_qgroups; ++i) { + for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { + if (*i_qgroups == 0) + continue; ret = add_qgroup_relation_item(trans, quota_root, objectid, *i_qgroups); - if (ret) + if (ret && ret != -EEXIST) goto out; ret = add_qgroup_relation_item(trans, quota_root, *i_qgroups, objectid); - if (ret) + if (ret && ret != -EEXIST) goto out; - ++i_qgroups; } + ret = 0; } @@ -1987,17 +1991,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { - ret = add_relation_rb(quota_root->fs_info, objectid, - *i_qgroups); - if (ret) - goto unlock; + if (*i_qgroups) { + ret = add_relation_rb(quota_root->fs_info, objectid, + *i_qgroups); + if (ret) + goto unlock; + } ++i_qgroups; } - for (i = 0; i < inherit->num_ref_copies; ++i) { + for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2008,12 +2017,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->rfer = src->rfer - level_size; dst->rfer_cmpr = src->rfer_cmpr - level_size; - i_qgroups += 2; } - for (i = 0; i < inherit->num_excl_copies; ++i) { + for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { struct btrfs_qgroup *src; struct btrfs_qgroup *dst; + if (!i_qgroups[0] || !i_qgroups[1]) + continue; + src = find_qgroup_rb(fs_info, i_qgroups[0]); dst = find_qgroup_rb(fs_info, i_qgroups[1]); @@ -2024,7 +2035,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dst->excl = src->excl + level_size; dst->excl_cmpr = src->excl_cmpr + level_size; - i_qgroups += 2; } unlock: From 8f282f71eaee7ac979cdbe525f76daa0722798a8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Wed, 30 Mar 2016 16:01:12 +0200 Subject: [PATCH 5/9] btrfs: fallback to vmalloc in btrfs_compare_tree The allocation of node could fail if the memory is too fragmented for a given node size, practically observed with 64k. http://article.gmane.org/gmane.comp.file-systems.btrfs/54689 Reported-and-tested-by: Jean-Denis Girard Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 77592931ab4f..ec7928a27aaa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -5361,10 +5362,13 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL); + tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!tmp_buf) { - ret = -ENOMEM; - goto out; + tmp_buf = vmalloc(left_root->nodesize); + if (!tmp_buf) { + ret = -ENOMEM; + goto out; + } } left_path->search_commit_root = 1; @@ -5565,7 +5569,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, out: btrfs_free_path(left_path); btrfs_free_path(right_path); - kfree(tmp_buf); + kvfree(tmp_buf); return ret; } From c79b4713304f812d3d6c95826fc3e5fc2c0b0c14 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 25 Mar 2016 10:02:41 -0400 Subject: [PATCH 6/9] Btrfs: don't use src fd for printk The fd we pass in may not be on a btrfs file system, so don't try to do BTRFS_I() on it. Thanks, Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 053e677839fe..21423dd15da4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1654,7 +1654,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file, src_inode = file_inode(src.file); if (src_inode->i_sb != file_inode(file)->i_sb) { - btrfs_info(BTRFS_I(src_inode)->root->fs_info, + btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; } else if (!inode_owner_or_capable(src_inode)) { From 0f5dcf8de974db5970d48d12a202997baa2846a1 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 29 Mar 2016 17:19:55 -0700 Subject: [PATCH 7/9] btrfs: Add qgroup tracing This patch adds tracepoints to the qgroup code on both the reporting side (insert_dirty_extents) and the accounting side. Taken together it allows us to see what qgroup operations have happened, and what their result was. Signed-off-by: Mark Fasheh Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 9 ++++ include/trace/events/btrfs.h | 89 +++++++++++++++++++++++++++++++++++- 2 files changed, 97 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7173360eea7a..9e119552ed32 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1463,6 +1463,7 @@ struct btrfs_qgroup_extent_record u64 bytenr = record->bytenr; assert_spin_locked(&delayed_refs->lock); + trace_btrfs_qgroup_insert_dirty_extent(record); while (*p) { parent_node = *p; @@ -1594,6 +1595,9 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + trace_qgroup_update_counters(qg->qgroupid, cur_old_count, + cur_new_count); + /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; @@ -1683,6 +1687,9 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, goto out_free; BUG_ON(!fs_info->quota_root); + trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots, + nr_new_roots); + qgroups = ulist_alloc(GFP_NOFS); if (!qgroups) { ret = -ENOMEM; @@ -1752,6 +1759,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, record = rb_entry(node, struct btrfs_qgroup_extent_record, node); + trace_btrfs_qgroup_account_extents(record); + if (!ret) { /* * Use (u64)-1 as time_seq to do special search, which diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index d866f21efbbf..467a4d205ff6 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -23,7 +23,7 @@ struct map_lookup; struct extent_buffer; struct btrfs_work; struct __btrfs_workqueue; -struct btrfs_qgroup_operation; +struct btrfs_qgroup_extent_record; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -1231,6 +1231,93 @@ DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref, TP_ARGS(ref_root, reserved) ); + +DECLARE_EVENT_CLASS(btrfs_qgroup_extent, + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + ), + + TP_fast_assign( + __entry->bytenr = rec->bytenr, + __entry->num_bytes = rec->num_bytes; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu", + (unsigned long long)__entry->bytenr, + (unsigned long long)__entry->num_bytes) +); + +DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_account_extents, + + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec) +); + +DEFINE_EVENT(btrfs_qgroup_extent, btrfs_qgroup_insert_dirty_extent, + + TP_PROTO(struct btrfs_qgroup_extent_record *rec), + + TP_ARGS(rec) +); + +TRACE_EVENT(btrfs_qgroup_account_extent, + + TP_PROTO(u64 bytenr, u64 num_bytes, u64 nr_old_roots, u64 nr_new_roots), + + TP_ARGS(bytenr, num_bytes, nr_old_roots, nr_new_roots), + + TP_STRUCT__entry( + __field( u64, bytenr ) + __field( u64, num_bytes ) + __field( u64, nr_old_roots ) + __field( u64, nr_new_roots ) + ), + + TP_fast_assign( + __entry->bytenr = bytenr; + __entry->num_bytes = num_bytes; + __entry->nr_old_roots = nr_old_roots; + __entry->nr_new_roots = nr_new_roots; + ), + + TP_printk("bytenr = %llu, num_bytes = %llu, nr_old_roots = %llu, " + "nr_new_roots = %llu", + __entry->bytenr, + __entry->num_bytes, + __entry->nr_old_roots, + __entry->nr_new_roots) +); + +TRACE_EVENT(qgroup_update_counters, + + TP_PROTO(u64 qgid, u64 cur_old_count, u64 cur_new_count), + + TP_ARGS(qgid, cur_old_count, cur_new_count), + + TP_STRUCT__entry( + __field( u64, qgid ) + __field( u64, cur_old_count ) + __field( u64, cur_new_count ) + ), + + TP_fast_assign( + __entry->qgid = qgid; + __entry->cur_old_count = cur_old_count; + __entry->cur_new_count = cur_new_count; + ), + + TP_printk("qgid = %llu, cur_old_count = %llu, cur_new_count = %llu", + __entry->qgid, + __entry->cur_old_count, + __entry->cur_new_count) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ From 7ccefb98ce3e5c4493cd213cd03714b7149cf0cb Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 29 Mar 2016 14:17:48 -0700 Subject: [PATCH 8/9] btrfs: Reset IO error counters before start of device replacing If device replace entry was found on disk at mounting and its num_write_errors stats counter has non-NULL value, then replace operation will never be finished and -EIO error will be reported by btrfs_scrub_dev() because this counter is never reset. # mount -o degraded /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ # btrfs replace status /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ Started on 25.Mar 07:28:00, canceled on 25.Mar 07:28:01 at 0.0%, 40 write errs, 0 uncorr. read errs # btrfs replace start -B 4 /dev/sdg /media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/ ERROR: ioctl(DEV_REPLACE_START) failed on "/media/a4fb5c0a-21c5-4fe7-8d0e-fdd87d5f71ee/": Input/output error, no error Reset num_write_errors and num_uncorrectable_read_errors counters in the dev_replace structure before start of replacing. Signed-off-by: Yauhen Kharuzhy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index a1d6652e0c47..26bcb487f958 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -394,6 +394,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root, dev_replace->cursor_right = 0; dev_replace->is_valid = 1; dev_replace->item_needs_writeback = 1; + atomic64_set(&dev_replace->num_write_errors, 0); + atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR; btrfs_dev_replace_unlock(dev_replace, 1); From 56f23fdbb600e6087db7b009775b95ce07cc3195 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 30 Mar 2016 23:37:21 +0100 Subject: [PATCH 9/9] Btrfs: fix file/data loss caused by fsync after rename and new inode If we rename an inode A (be it a file or a directory), create a new inode B with the old name of inode A and under the same parent directory, fsync inode B and then power fail, at log tree replay time we end up removing inode A completely. If inode A is a directory then all its files are gone too. Example scenarios where this happens: This is reproducible with the following steps, taken from a couple of test cases written for fstests which are going to be submitted upstream soon: # Scenario 1 mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt mkdir -p /mnt/a/x echo "hello" > /mnt/a/x/foo echo "world" > /mnt/a/x/bar sync mv /mnt/a/x /mnt/a/y mkdir /mnt/a/x xfs_io -c fsync /mnt/a/x The next time the fs is mounted, log tree replay happens and the directory "y" does not exist nor do the files "foo" and "bar" exist anywhere (neither in "y" nor in "x", nor the root nor anywhere). # Scenario 2 mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt mkdir /mnt/a echo "hello" > /mnt/a/foo sync mv /mnt/a/foo /mnt/a/bar echo "world" > /mnt/a/foo xfs_io -c fsync /mnt/a/foo The next time the fs is mounted, log tree replay happens and the file "bar" does not exists anymore. A file with the name "foo" exists and it matches the second file we created. Another related problem that does not involve file/data loss is when a new inode is created with the name of a deleted snapshot and we fsync it: mkfs.btrfs -f /dev/sdc mount /dev/sdc /mnt mkdir /mnt/testdir btrfs subvolume snapshot /mnt /mnt/testdir/snap btrfs subvolume delete /mnt/testdir/snap rmdir /mnt/testdir mkdir /mnt/testdir xfs_io -c fsync /mnt/testdir # or fsync some file inside /mnt/testdir The next time the fs is mounted the log replay procedure fails because it attempts to delete the snapshot entry (which has dir item key type of BTRFS_ROOT_ITEM_KEY) as if it were a regular (non-root) entry, resulting in the following error that causes mount to fail: [52174.510532] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257 [52174.512570] ------------[ cut here ]------------ [52174.513278] WARNING: CPU: 12 PID: 28024 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x178/0x351 [btrfs]() [52174.514681] BTRFS: Transaction aborted (error -2) [52174.515630] Modules linked in: btrfs dm_flakey dm_mod overlay crc32c_generic ppdev xor raid6_pq acpi_cpufreq parport_pc tpm_tis sg parport tpm evdev i2c_piix4 proc [52174.521568] CPU: 12 PID: 28024 Comm: mount Tainted: G W 4.5.0-rc6-btrfs-next-27+ #1 [52174.522805] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [52174.524053] 0000000000000000 ffff8801df2a7710 ffffffff81264e93 ffff8801df2a7758 [52174.524053] 0000000000000009 ffff8801df2a7748 ffffffff81051618 ffffffffa03591cd [52174.524053] 00000000fffffffe ffff88015e6e5000 ffff88016dbc3c88 ffff88016dbc3c88 [52174.524053] Call Trace: [52174.524053] [] dump_stack+0x67/0x90 [52174.524053] [] warn_slowpath_common+0x99/0xb2 [52174.524053] [] ? __btrfs_unlink_inode+0x178/0x351 [btrfs] [52174.524053] [] warn_slowpath_fmt+0x48/0x50 [52174.524053] [] __btrfs_unlink_inode+0x178/0x351 [btrfs] [52174.524053] [] ? iput+0xb0/0x284 [52174.524053] [] btrfs_unlink_inode+0x1c/0x3d [btrfs] [52174.524053] [] check_item_in_log+0x1fe/0x29b [btrfs] [52174.524053] [] replay_dir_deletes+0x167/0x1cf [btrfs] [52174.524053] [] fixup_inode_link_count+0x289/0x2aa [btrfs] [52174.524053] [] fixup_inode_link_counts+0xcb/0x105 [btrfs] [52174.524053] [] btrfs_recover_log_trees+0x258/0x32c [btrfs] [52174.524053] [] ? replay_one_extent+0x511/0x511 [btrfs] [52174.524053] [] open_ctree+0x1dd4/0x21b9 [btrfs] [52174.524053] [] btrfs_mount+0x97e/0xaed [btrfs] [52174.524053] [] ? trace_hardirqs_on+0xd/0xf [52174.524053] [] mount_fs+0x67/0x131 [52174.524053] [] vfs_kern_mount+0x6c/0xde [52174.524053] [] btrfs_mount+0x1ac/0xaed [btrfs] [52174.524053] [] ? trace_hardirqs_on+0xd/0xf [52174.524053] [] ? lockdep_init_map+0xb9/0x1b3 [52174.524053] [] mount_fs+0x67/0x131 [52174.524053] [] vfs_kern_mount+0x6c/0xde [52174.524053] [] do_mount+0x8a6/0x9e8 [52174.524053] [] ? strndup_user+0x3f/0x59 [52174.524053] [] SyS_mount+0x77/0x9f [52174.524053] [] entry_SYSCALL_64_fastpath+0x12/0x6b [52174.561288] ---[ end trace 6b53049efb1a3ea6 ]--- Fix this by forcing a transaction commit when such cases happen. This means we check in the commit root of the subvolume tree if there was any other inode with the same reference when the inode we are fsync'ing is a new inode (created in the current transaction). Test cases for fstests, covering all the scenarios given above, were submitted upstream for fstests: * fstests: generic test for fsync after renaming directory https://patchwork.kernel.org/patch/8694281/ * fstests: generic test for fsync after renaming file https://patchwork.kernel.org/patch/8694301/ * fstests: add btrfs test for fsync after snapshot deletion https://patchwork.kernel.org/patch/8670671/ Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/tree-log.c | 137 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 24d03c751149..517d0ccb351e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4415,6 +4415,127 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, return ret; } +/* + * When we are logging a new inode X, check if it doesn't have a reference that + * matches the reference from some other inode Y created in a past transaction + * and that was renamed in the current transaction. If we don't do this, then at + * log replay time we can lose inode Y (and all its files if it's a directory): + * + * mkdir /mnt/x + * echo "hello world" > /mnt/x/foobar + * sync + * mv /mnt/x /mnt/y + * mkdir /mnt/x # or touch /mnt/x + * xfs_io -c fsync /mnt/x + * + * mount fs, trigger log replay + * + * After the log replay procedure, we would lose the first directory and all its + * files (file foobar). + * For the case where inode Y is not a directory we simply end up losing it: + * + * echo "123" > /mnt/foo + * sync + * mv /mnt/foo /mnt/bar + * echo "abc" > /mnt/foo + * xfs_io -c fsync /mnt/foo + * + * + * We also need this for cases where a snapshot entry is replaced by some other + * entry (file or directory) otherwise we end up with an unreplayable log due to + * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as + * if it were a regular entry: + * + * mkdir /mnt/x + * btrfs subvolume snapshot /mnt /mnt/x/snap + * btrfs subvolume delete /mnt/x/snap + * rmdir /mnt/x + * mkdir /mnt/x + * fsync /mnt/x or fsync some new file inside it + * + * + * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in + * the same transaction. + */ +static int btrfs_check_ref_name_override(struct extent_buffer *eb, + const int slot, + const struct btrfs_key *key, + struct inode *inode) +{ + int ret; + struct btrfs_path *search_path; + char *name = NULL; + u32 name_len = 0; + u32 item_size = btrfs_item_size_nr(eb, slot); + u32 cur_offset = 0; + unsigned long ptr = btrfs_item_ptr_offset(eb, slot); + + search_path = btrfs_alloc_path(); + if (!search_path) + return -ENOMEM; + search_path->search_commit_root = 1; + search_path->skip_locking = 1; + + while (cur_offset < item_size) { + u64 parent; + u32 this_name_len; + u32 this_len; + unsigned long name_ptr; + struct btrfs_dir_item *di; + + if (key->type == BTRFS_INODE_REF_KEY) { + struct btrfs_inode_ref *iref; + + iref = (struct btrfs_inode_ref *)(ptr + cur_offset); + parent = key->offset; + this_name_len = btrfs_inode_ref_name_len(eb, iref); + name_ptr = (unsigned long)(iref + 1); + this_len = sizeof(*iref) + this_name_len; + } else { + struct btrfs_inode_extref *extref; + + extref = (struct btrfs_inode_extref *)(ptr + + cur_offset); + parent = btrfs_inode_extref_parent(eb, extref); + this_name_len = btrfs_inode_extref_name_len(eb, extref); + name_ptr = (unsigned long)&extref->name; + this_len = sizeof(*extref) + this_name_len; + } + + if (this_name_len > name_len) { + char *new_name; + + new_name = krealloc(name, this_name_len, GFP_NOFS); + if (!new_name) { + ret = -ENOMEM; + goto out; + } + name_len = this_name_len; + name = new_name; + } + + read_extent_buffer(eb, name, name_ptr, this_name_len); + di = btrfs_lookup_dir_item(NULL, BTRFS_I(inode)->root, + search_path, parent, + name, this_name_len, 0); + if (di && !IS_ERR(di)) { + ret = 1; + goto out; + } else if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + btrfs_release_path(search_path); + + cur_offset += this_len; + } + ret = 0; +out: + btrfs_free_path(search_path); + kfree(name); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4602,6 +4723,22 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + if ((min_key.type == BTRFS_INODE_REF_KEY || + min_key.type == BTRFS_INODE_EXTREF_KEY) && + BTRFS_I(inode)->generation == trans->transid) { + ret = btrfs_check_ref_name_override(path->nodes[0], + path->slots[0], + &min_key, inode); + if (ret < 0) { + err = ret; + goto out_unlock; + } else if (ret > 0) { + err = 1; + btrfs_set_log_full_commit(root->fs_info, trans); + goto out_unlock; + } + } + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ if (min_key.type == BTRFS_XATTR_ITEM_KEY) { if (ins_nr == 0)