2018-04-03 17:23:33 +00:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-11-05 16:33:06 +00:00
|
|
|
/*
|
|
|
|
* Copyright (C) STRATO AG 2012. All rights reserved.
|
|
|
|
*/
|
2018-04-03 17:23:33 +00:00
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
#include <linux/sched.h>
|
|
|
|
#include <linux/bio.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/kthread.h>
|
|
|
|
#include <linux/math64.h>
|
2019-08-21 16:48:25 +00:00
|
|
|
#include "misc.h"
|
2012-11-05 16:33:06 +00:00
|
|
|
#include "ctree.h"
|
|
|
|
#include "extent_map.h"
|
|
|
|
#include "disk-io.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
#include "print-tree.h"
|
|
|
|
#include "volumes.h"
|
|
|
|
#include "async-thread.h"
|
|
|
|
#include "dev-replace.h"
|
2014-06-03 03:36:02 +00:00
|
|
|
#include "sysfs.h"
|
2020-11-10 11:26:07 +00:00
|
|
|
#include "zoned.h"
|
2021-02-04 10:22:11 +00:00
|
|
|
#include "block-group.h"
|
2022-10-19 14:50:51 +00:00
|
|
|
#include "fs.h"
|
2022-10-19 14:51:00 +00:00
|
|
|
#include "accessors.h"
|
2022-10-26 19:08:35 +00:00
|
|
|
#include "scrub.h"
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2020-01-23 07:44:50 +00:00
|
|
|
/*
|
|
|
|
* Device replace overview
|
|
|
|
*
|
|
|
|
* [Objective]
|
|
|
|
* To copy all extents (both new and on-disk) from source device to target
|
|
|
|
* device, while still keeping the filesystem read-write.
|
|
|
|
*
|
|
|
|
* [Method]
|
|
|
|
* There are two main methods involved:
|
|
|
|
*
|
|
|
|
* - Write duplication
|
|
|
|
*
|
|
|
|
* All new writes will be written to both target and source devices, so even
|
2021-05-21 15:42:23 +00:00
|
|
|
* if replace gets canceled, the source device still contains up-to-date data.
|
2020-01-23 07:44:50 +00:00
|
|
|
*
|
2023-05-31 04:17:37 +00:00
|
|
|
* Location: handle_ops_on_dev_replace() from btrfs_map_block()
|
2020-01-23 07:44:50 +00:00
|
|
|
* Start: btrfs_dev_replace_start()
|
|
|
|
* End: btrfs_dev_replace_finishing()
|
|
|
|
* Content: Latest data/metadata
|
|
|
|
*
|
|
|
|
* - Copy existing extents
|
|
|
|
*
|
|
|
|
* This happens by re-using scrub facility, as scrub also iterates through
|
|
|
|
* existing extents from commit root.
|
|
|
|
*
|
|
|
|
* Location: scrub_write_block_to_dev_replace() from
|
|
|
|
* scrub_block_complete()
|
|
|
|
* Content: Data/meta from commit root.
|
|
|
|
*
|
|
|
|
* Due to the content difference, we need to avoid nocow write when dev-replace
|
|
|
|
* is happening. This is done by marking the block group read-only and waiting
|
|
|
|
* for NOCOW writes.
|
|
|
|
*
|
|
|
|
* After replace is done, the finishing part is done by swapping the target and
|
|
|
|
* source devices.
|
|
|
|
*
|
|
|
|
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
|
|
|
|
* btrfs_dev_replace_finishing()
|
|
|
|
*/
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
|
|
|
|
int scrub_ret);
|
|
|
|
static int btrfs_dev_replace_kthread(void *data);
|
|
|
|
|
|
|
|
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2021-10-05 20:12:42 +00:00
|
|
|
struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID };
|
2012-11-05 16:33:06 +00:00
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
int item_size;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
u64 src_devid;
|
|
|
|
|
2021-03-11 16:23:16 +00:00
|
|
|
if (!dev_root)
|
|
|
|
return 0;
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
|
|
|
|
if (ret) {
|
|
|
|
no_valid_dev_replace_entry_found:
|
2020-10-29 22:53:56 +00:00
|
|
|
/*
|
|
|
|
* We don't have a replace item or it's corrupted. If there is
|
|
|
|
* a replace target, fail the mount.
|
|
|
|
*/
|
2021-10-05 20:12:42 +00:00
|
|
|
if (btrfs_find_device(fs_info->fs_devices, &args)) {
|
2020-10-29 22:53:56 +00:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"found replace target device without a valid replace item");
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-11-05 16:33:06 +00:00
|
|
|
ret = 0;
|
|
|
|
dev_replace->replace_state =
|
2019-08-08 04:32:44 +00:00
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
|
|
|
|
dev_replace->time_started = 0;
|
|
|
|
dev_replace->time_stopped = 0;
|
|
|
|
atomic64_set(&dev_replace->num_write_errors, 0);
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
|
|
|
|
dev_replace->cursor_left = 0;
|
|
|
|
dev_replace->committed_cursor_left = 0;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = 0;
|
|
|
|
dev_replace->cursor_right = 0;
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->is_valid = 0;
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot = path->slots[0];
|
|
|
|
eb = path->nodes[0];
|
2021-10-21 18:58:35 +00:00
|
|
|
item_size = btrfs_item_size(eb, slot);
|
2012-11-05 16:33:06 +00:00
|
|
|
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
|
|
|
|
|
|
|
|
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
|
2013-12-20 16:37:06 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"dev_replace entry found has unexpected size, ignore entry");
|
2012-11-05 16:33:06 +00:00
|
|
|
goto no_valid_dev_replace_entry_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_devid = btrfs_dev_replace_src_devid(eb, ptr);
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
|
|
|
|
dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
|
|
|
|
dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
|
|
|
|
dev_replace->time_stopped =
|
|
|
|
btrfs_dev_replace_time_stopped(eb, ptr);
|
|
|
|
atomic64_set(&dev_replace->num_write_errors,
|
|
|
|
btrfs_dev_replace_num_write_errors(eb, ptr));
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors,
|
|
|
|
btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
|
|
|
|
dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
|
|
|
|
dev_replace->committed_cursor_left = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
|
|
|
|
dev_replace->is_valid = 1;
|
|
|
|
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
2020-10-29 22:53:56 +00:00
|
|
|
/*
|
|
|
|
* We don't have an active replace item but if there is a
|
|
|
|
* replace target, fail the mount.
|
|
|
|
*/
|
2021-10-05 20:12:42 +00:00
|
|
|
if (btrfs_find_device(fs_info->fs_devices, &args)) {
|
2020-10-29 22:53:56 +00:00
|
|
|
btrfs_err(fs_info,
|
2022-08-12 10:32:19 +00:00
|
|
|
"replace without active item, run 'device scan --forget' on the target device");
|
2020-10-29 22:53:56 +00:00
|
|
|
ret = -EUCLEAN;
|
|
|
|
} else {
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
}
|
2012-11-05 16:33:06 +00:00
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2021-10-05 20:12:42 +00:00
|
|
|
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args);
|
|
|
|
args.devid = src_devid;
|
|
|
|
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
/*
|
|
|
|
* allow 'btrfs dev replace_cancel' if src/tgt device is
|
|
|
|
* missing
|
|
|
|
*/
|
|
|
|
if (!dev_replace->srcdev &&
|
2016-06-22 22:54:23 +00:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-05 16:33:06 +00:00
|
|
|
ret = -EIO;
|
2013-12-20 16:37:06 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
|
|
|
src_devid);
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev &&
|
2016-06-22 22:54:23 +00:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-05 16:33:06 +00:00
|
|
|
ret = -EIO;
|
2013-12-20 16:37:06 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
2013-08-20 11:20:08 +00:00
|
|
|
BTRFS_DEV_REPLACE_DEVID);
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
if (dev_replace->tgtdev) {
|
|
|
|
if (dev_replace->srcdev) {
|
|
|
|
dev_replace->tgtdev->total_bytes =
|
|
|
|
dev_replace->srcdev->total_bytes;
|
|
|
|
dev_replace->tgtdev->disk_total_bytes =
|
|
|
|
dev_replace->srcdev->disk_total_bytes;
|
2014-09-03 13:35:33 +00:00
|
|
|
dev_replace->tgtdev->commit_total_bytes =
|
|
|
|
dev_replace->srcdev->commit_total_bytes;
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->tgtdev->bytes_used =
|
|
|
|
dev_replace->srcdev->bytes_used;
|
2014-09-03 13:35:34 +00:00
|
|
|
dev_replace->tgtdev->commit_bytes_used =
|
|
|
|
dev_replace->srcdev->commit_bytes_used;
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
2017-12-04 04:54:55 +00:00
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2018-02-12 15:36:25 +00:00
|
|
|
|
|
|
|
WARN_ON(fs_info->fs_devices->rw_devices == 0);
|
|
|
|
dev_replace->tgtdev->io_width = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->io_align = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->sector_size = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->fs_info = fs_info;
|
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2015-08-19 05:55:00 +00:00
|
|
|
btrfs_free_path(path);
|
2012-11-05 16:33:06 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-03-20 15:09:48 +00:00
|
|
|
/*
|
|
|
|
* Initialize a new device for device replace target from a given source dev
|
|
|
|
* and path.
|
|
|
|
*
|
|
|
|
* Return 0 and new device in @device_out, otherwise return < 0
|
|
|
|
*/
|
|
|
|
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
|
|
|
|
const char *device_path,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device **device_out)
|
|
|
|
{
|
2022-01-17 15:50:39 +00:00
|
|
|
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
|
2018-03-20 15:09:48 +00:00
|
|
|
struct btrfs_device *device;
|
2023-09-27 09:34:26 +00:00
|
|
|
struct bdev_handle *bdev_handle;
|
2018-03-20 15:09:48 +00:00
|
|
|
struct block_device *bdev;
|
|
|
|
u64 devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
*device_out = NULL;
|
btrfs: fix replace of seed device
If you replace a seed device in a sprouted fs, it appears to have
successfully replaced the seed device, but if you look closely, it
didn't. Here is an example.
$ mkfs.btrfs /dev/sda
$ btrfstune -S1 /dev/sda
$ mount /dev/sda /btrfs
$ btrfs device add /dev/sdb /btrfs
$ umount /btrfs
$ btrfs device scan --forget
$ mount -o device=/dev/sda /dev/sdb /btrfs
$ btrfs replace start -f /dev/sda /dev/sdc /btrfs
$ echo $?
0
BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc started
BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc finished
$ btrfs fi show
Label: none uuid: ab2c88b7-be81-4a7e-9849-c3666e7f9f4f
Total devices 2 FS bytes used 256.00KiB
devid 1 size 3.00GiB used 520.00MiB path /dev/sdc
devid 2 size 3.00GiB used 896.00MiB path /dev/sdb
Label: none uuid: 10bd3202-0415-43af-96a8-d5409f310a7e
Total devices 1 FS bytes used 128.00KiB
devid 1 size 3.00GiB used 536.00MiB path /dev/sda
So as per the replace start command and kernel log replace was successful.
Now let's try to clean mount.
$ umount /btrfs
$ btrfs device scan --forget
$ mount -o device=/dev/sdc /dev/sdb /btrfs
mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error.
[ 636.157517] BTRFS error (device sdc): failed to read chunk tree: -2
[ 636.180177] BTRFS error (device sdc): open_ctree failed
That's because per dev items it is still looking for the original seed
device.
$ btrfs inspect-internal dump-tree -d /dev/sdb
item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98
devid 1 total_bytes 3221225472 bytes_used 545259520
io_align 4096 io_width 4096 sector_size 4096 type 0
generation 6 start_offset 0 dev_group 0
seek_speed 0 bandwidth 0
uuid 59368f50-9af2-4b17-91da-8a783cc418d4 <--- seed uuid
fsid 10bd3202-0415-43af-96a8-d5409f310a7e <--- seed fsid
item 1 key (DEV_ITEMS DEV_ITEM 2) itemoff 16087 itemsize 98
devid 2 total_bytes 3221225472 bytes_used 939524096
io_align 4096 io_width 4096 sector_size 4096 type 0
generation 0 start_offset 0 dev_group 0
seek_speed 0 bandwidth 0
uuid 56a0a6bc-4630-4998-8daf-3c3030c4256a <- sprout uuid
fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f <- sprout fsid
But the replaced target has the following uuid+fsid in its superblock
which doesn't match with the expected uuid+fsid in its devitem.
$ btrfs in dump-super /dev/sdc | egrep '^generation|dev_item.uuid|dev_item.fsid|devid'
generation 20
dev_item.uuid 59368f50-9af2-4b17-91da-8a783cc418d4
dev_item.fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f [match]
dev_item.devid 1
So if you provide the original seed device the mount shall be
successful. Which so long happening in the test case btrfs/163.
$ btrfs device scan --forget
$ mount -o device=/dev/sda /dev/sdb /btrfs
Fix in this patch:
If a seed is not sprouted then there is no replacement of it, because of
its read-only filesystem with a read-only device. Similarly, in the case
of a sprouted filesystem, the seed device is still read only. So, mark
it as you can't replace a seed device, you can only add a new device and
then delete the seed device. If replace is attempted then returns
-EINVAL.
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-04 17:34:22 +00:00
|
|
|
if (srcdev->fs_devices->seeding) {
|
2018-03-20 15:09:48 +00:00
|
|
|
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
2023-09-27 09:34:26 +00:00
|
|
|
bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
|
|
|
|
fs_info->bdev_holder, NULL);
|
|
|
|
if (IS_ERR(bdev_handle)) {
|
2018-03-20 15:09:48 +00:00
|
|
|
btrfs_err(fs_info, "target device %s is invalid!", device_path);
|
2023-09-27 09:34:26 +00:00
|
|
|
return PTR_ERR(bdev_handle);
|
2018-03-20 15:09:48 +00:00
|
|
|
}
|
2023-09-27 09:34:26 +00:00
|
|
|
bdev = bdev_handle->bdev;
|
2018-03-20 15:09:48 +00:00
|
|
|
|
2020-11-10 11:26:08 +00:00
|
|
|
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"dev-replace: zoned type of target device mismatch with filesystem");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2019-05-14 10:54:38 +00:00
|
|
|
sync_blockdev(bdev);
|
2018-03-20 15:09:48 +00:00
|
|
|
|
2022-01-17 15:50:39 +00:00
|
|
|
list_for_each_entry(device, &fs_devices->devices, dev_list) {
|
2018-03-20 15:09:48 +00:00
|
|
|
if (device->bdev == bdev) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is in the filesystem!");
|
|
|
|
ret = -EEXIST;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2021-10-18 10:11:12 +00:00
|
|
|
if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) {
|
2018-03-20 15:09:48 +00:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is smaller than source device!");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2022-11-07 15:07:17 +00:00
|
|
|
device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
|
2018-03-20 15:09:48 +00:00
|
|
|
if (IS_ERR(device)) {
|
|
|
|
ret = PTR_ERR(device);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2022-01-12 05:06:01 +00:00
|
|
|
ret = lookup_bdev(device_path, &device->devt);
|
|
|
|
if (ret)
|
|
|
|
goto error;
|
2018-03-20 15:09:48 +00:00
|
|
|
|
|
|
|
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
|
|
|
|
device->generation = 0;
|
|
|
|
device->io_width = fs_info->sectorsize;
|
|
|
|
device->io_align = fs_info->sectorsize;
|
|
|
|
device->sector_size = fs_info->sectorsize;
|
|
|
|
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
|
|
|
|
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
|
|
|
|
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
|
|
|
|
device->commit_total_bytes = srcdev->commit_total_bytes;
|
|
|
|
device->commit_bytes_used = device->bytes_used;
|
|
|
|
device->fs_info = fs_info;
|
|
|
|
device->bdev = bdev;
|
2023-09-27 09:34:26 +00:00
|
|
|
device->bdev_handle = bdev_handle;
|
2018-03-20 15:09:48 +00:00
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
|
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
|
|
|
|
device->dev_stats_valid = 1;
|
|
|
|
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
|
2022-01-17 15:50:39 +00:00
|
|
|
device->fs_devices = fs_devices;
|
2019-05-14 10:54:39 +00:00
|
|
|
|
2021-11-11 05:14:38 +00:00
|
|
|
ret = btrfs_get_dev_zone_info(device, false);
|
2020-11-10 11:26:07 +00:00
|
|
|
if (ret)
|
|
|
|
goto error;
|
|
|
|
|
2022-01-17 15:50:39 +00:00
|
|
|
mutex_lock(&fs_devices->device_list_mutex);
|
|
|
|
list_add(&device->dev_list, &fs_devices->devices);
|
|
|
|
fs_devices->num_devices++;
|
|
|
|
fs_devices->open_devices++;
|
|
|
|
mutex_unlock(&fs_devices->device_list_mutex);
|
2018-03-20 15:09:48 +00:00
|
|
|
|
|
|
|
*device_out = device;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error:
|
2023-09-27 09:34:26 +00:00
|
|
|
bdev_release(bdev_handle);
|
2018-03-20 15:09:48 +00:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
/*
|
|
|
|
* called from commit_transaction. Writes changed device replace state to
|
|
|
|
* disk.
|
|
|
|
*/
|
2019-03-20 15:51:44 +00:00
|
|
|
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
|
2012-11-05 16:33:06 +00:00
|
|
|
{
|
2019-03-20 15:51:44 +00:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-11-05 16:33:06 +00:00
|
|
|
int ret;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
if (!dev_replace->is_valid ||
|
|
|
|
!dev_replace->item_needs_writeback) {
|
2018-09-07 14:11:23 +00:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 14:11:23 +00:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0) {
|
2016-09-20 14:05:00 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"error %d while searching for dev_replace item!",
|
|
|
|
ret);
|
2012-11-05 16:33:06 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0 &&
|
2021-10-21 18:58:35 +00:00
|
|
|
btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
|
2012-11-05 16:33:06 +00:00
|
|
|
/*
|
|
|
|
* need to delete old one and insert a new one.
|
|
|
|
* Since no attempt is made to recover any old state, if the
|
|
|
|
* dev_replace state is 'running', the data on the target
|
|
|
|
* drive is lost.
|
|
|
|
* It would be possible to recover the state: just make sure
|
|
|
|
* that the beginning of the item is never changed and always
|
|
|
|
* contains all the essential information. Then read this
|
|
|
|
* minimal set of information and use it as a base for the
|
|
|
|
* new state.
|
|
|
|
*/
|
|
|
|
ret = btrfs_del_item(trans, dev_root, path);
|
|
|
|
if (ret != 0) {
|
2016-09-20 14:05:00 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"delete too small dev_replace item failed %d!",
|
|
|
|
ret);
|
2012-11-05 16:33:06 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 1) {
|
|
|
|
/* need to insert a new item */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_insert_empty_item(trans, dev_root, path,
|
|
|
|
&key, sizeof(*ptr));
|
|
|
|
if (ret < 0) {
|
2016-09-20 14:05:00 +00:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"insert dev_replace item failed %d!", ret);
|
2012-11-05 16:33:06 +00:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = path->nodes[0];
|
|
|
|
ptr = btrfs_item_ptr(eb, path->slots[0],
|
|
|
|
struct btrfs_dev_replace_item);
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
if (dev_replace->srcdev)
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr,
|
|
|
|
dev_replace->srcdev->devid);
|
|
|
|
else
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
|
|
|
|
btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode);
|
|
|
|
btrfs_set_dev_replace_replace_state(eb, ptr,
|
|
|
|
dev_replace->replace_state);
|
|
|
|
btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
|
|
|
|
btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
|
|
|
|
btrfs_set_dev_replace_num_write_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_write_errors));
|
|
|
|
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors));
|
|
|
|
dev_replace->cursor_left_last_write_of_item =
|
|
|
|
dev_replace->cursor_left;
|
|
|
|
btrfs_set_dev_replace_cursor_left(eb, ptr,
|
|
|
|
dev_replace->cursor_left_last_write_of_item);
|
|
|
|
btrfs_set_dev_replace_cursor_right(eb, ptr,
|
|
|
|
dev_replace->cursor_right);
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2023-09-12 12:04:29 +00:00
|
|
|
btrfs_mark_buffer_dirty(trans, eb);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-02-04 10:22:11 +00:00
|
|
|
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_device *src_dev)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_root *root = fs_info->dev_root;
|
|
|
|
struct btrfs_dev_extent *dev_extent = NULL;
|
|
|
|
struct btrfs_block_group *cache;
|
|
|
|
struct btrfs_trans_handle *trans;
|
2022-03-09 13:50:40 +00:00
|
|
|
int iter_ret = 0;
|
2021-02-04 10:22:11 +00:00
|
|
|
int ret = 0;
|
|
|
|
u64 chunk_offset;
|
|
|
|
|
|
|
|
/* Do not use "to_copy" on non zoned filesystem for now */
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
/* Ensure we don't have pending new block group */
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
while (fs_info->running_transaction &&
|
|
|
|
!list_empty(&fs_info->running_transaction->dev_update_list)) {
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
trans = btrfs_attach_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
if (ret == -ENOENT) {
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
if (ret)
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->reada = READA_FORWARD;
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
path->skip_locking = 1;
|
|
|
|
|
|
|
|
key.objectid = src_dev->devid;
|
|
|
|
key.type = BTRFS_DEV_EXTENT_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
2022-03-09 13:50:40 +00:00
|
|
|
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
|
2021-02-04 10:22:11 +00:00
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
|
|
|
|
if (found_key.objectid != src_dev->devid)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_DEV_EXTENT_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.offset < key.offset)
|
|
|
|
break;
|
|
|
|
|
2022-03-09 13:50:40 +00:00
|
|
|
dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
|
2021-02-04 10:22:11 +00:00
|
|
|
|
|
|
|
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
|
|
|
|
|
|
|
|
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
|
|
|
|
if (!cache)
|
2022-03-09 13:50:40 +00:00
|
|
|
continue;
|
2021-02-04 10:22:11 +00:00
|
|
|
|
2022-07-15 19:45:24 +00:00
|
|
|
set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
|
2021-02-04 10:22:11 +00:00
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
}
|
2022-03-09 13:50:40 +00:00
|
|
|
if (iter_ret < 0)
|
|
|
|
ret = iter_ret;
|
2021-02-04 10:22:11 +00:00
|
|
|
|
|
|
|
btrfs_free_path(path);
|
|
|
|
unlock:
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_block_group *cache,
|
|
|
|
u64 physical)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
struct btrfs_chunk_map *map;
|
2021-02-04 10:22:11 +00:00
|
|
|
u64 chunk_offset = cache->start;
|
|
|
|
int num_extents, cur_extent;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Do not use "to_copy" on non zoned filesystem for now */
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
2022-07-15 19:45:24 +00:00
|
|
|
if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
|
2021-02-04 10:22:11 +00:00
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
|
|
|
|
ASSERT(!IS_ERR(map));
|
2021-02-04 10:22:11 +00:00
|
|
|
|
2022-06-21 16:40:48 +00:00
|
|
|
num_extents = 0;
|
|
|
|
cur_extent = 0;
|
2021-02-04 10:22:11 +00:00
|
|
|
for (i = 0; i < map->num_stripes; i++) {
|
|
|
|
/* We have more device extent to copy */
|
|
|
|
if (srcdev != map->stripes[i].dev)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
num_extents++;
|
|
|
|
if (physical == map->stripes[i].physical)
|
|
|
|
cur_extent = i;
|
|
|
|
}
|
|
|
|
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
btrfs_free_chunk_map(map);
|
2021-02-04 10:22:11 +00:00
|
|
|
|
|
|
|
if (num_extents > 1 && cur_extent < num_extents - 1) {
|
|
|
|
/*
|
|
|
|
* Has more stripes on this device. Keep this block group
|
|
|
|
* readonly until we finish all the stripes.
|
|
|
|
*/
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Last stripe on this device */
|
2022-07-15 19:45:24 +00:00
|
|
|
clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
|
2021-02-04 10:22:11 +00:00
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-11-11 14:22:16 +00:00
|
|
|
/*
 * Start a device replace operation and run the copy to completion.
 *
 * @fs_info:      filesystem the replace runs on
 * @tgtdev_name:  path of the target device
 * @srcdevid:     devid of the source device, used together with @srcdev_name
 *                to look the source device up
 * @srcdev_name:  path of the source device
 * @read_src:     stored as dev_replace->cont_reading_from_srcdev_mode
 *
 * Returns a negative errno on setup failure,
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED if a replace is already
 * started or suspended, BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS if
 * finishing reported -EINPROGRESS, or otherwise whatever
 * btrfs_dev_replace_finishing() returns.
 *
 * Once the target device has been set up, every failure path must go through
 * the 'leave' label so that the target device is destroyed again.
 */
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
		int read_src)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;

	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
						  srcdev_name);
	if (IS_ERR(src_device))
		return PTR_ERR(src_device);

	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
		btrfs_warn_in_rcu(fs_info,
	  "cannot replace device %s (devid %llu) due to active swapfile",
			btrfs_dev_name(src_device), src_device->devid);
		return -ETXTBSY;
	}

	/*
	 * Here we commit the transaction to make sure commit_total_bytes
	 * of all the devices are updated.
	 */
	trans = btrfs_attach_transaction(root);
	if (!IS_ERR(trans)) {
		ret = btrfs_commit_transaction(trans);
		if (ret)
			return ret;
	} else if (PTR_ERR(trans) != -ENOENT) {
		return PTR_ERR(trans);
	}

	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
					    src_device, &tgt_device);
	if (ret)
		return ret;

	/*
	 * The target device exists from here on, so a failure must tear it
	 * down instead of returning directly, otherwise it is leaked.
	 */
	ret = mark_block_group_to_copy(fs_info, src_device);
	if (ret)
		goto leave;

	down_write(&dev_replace->rwsem);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		ASSERT(0);
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	dev_replace->cont_reading_from_srcdev_mode = read_src;
	dev_replace->srcdev = src_device;
	dev_replace->tgtdev = tgt_device;

	btrfs_info_in_rcu(fs_info,
		      "dev_replace from %s (devid %llu) to %s started",
		      btrfs_dev_name(src_device),
		      src_device->devid,
		      btrfs_dev_name(tgt_device));

	/*
	 * from now on, the writes to the srcdev are all duplicated to
	 * go to the tgtdev as well (refer to btrfs_map_block()).
	 */
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
	dev_replace->time_started = ktime_get_real_seconds();
	dev_replace->cursor_left = 0;
	dev_replace->committed_cursor_left = 0;
	dev_replace->cursor_left_last_write_of_item = 0;
	dev_replace->cursor_right = 0;
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
	atomic64_set(&dev_replace->num_write_errors, 0);
	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
	up_write(&dev_replace->rwsem);

	/* A sysfs failure is only logged; the replace still proceeds. */
	ret = btrfs_sysfs_add_device(tgt_device);
	if (ret)
		btrfs_err(fs_info, "kobj add dev failed %d", ret);

	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/*
	 * Commit dev_replace state and reserve 1 item for it.
	 * This is crucial to ensure we won't miss copying extents for new block
	 * groups that are allocated after we started the device replace, and
	 * must be done after setting up the device replace state.
	 */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		/* Roll the in-memory replace state back before bailing out. */
		down_write(&dev_replace->rwsem);
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	ret = btrfs_commit_transaction(trans);
	WARN_ON(ret);

	/* the disk copy procedure reuses the scrub code */
	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
			      btrfs_device_get_total_bytes(src_device),
			      &dev_replace->scrub_progress, 0, 1);

	ret = btrfs_dev_replace_finishing(fs_info, ret);
	if (ret == -EINPROGRESS)
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

	return ret;

leave:
	btrfs_destroy_dev_replace_tgtdev(tgt_device);
	return ret;
}
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
|
2016-03-24 10:48:14 +00:00
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
switch (args->start.cont_reading_from_srcdev_mode) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
|
|
|
|
args->start.tgtdev_name[0] == '\0')
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-06-22 22:54:24 +00:00
|
|
|
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
|
2016-03-24 10:48:14 +00:00
|
|
|
args->start.srcdevid,
|
|
|
|
args->start.srcdev_name,
|
|
|
|
args->start.cont_reading_from_srcdev_mode);
|
|
|
|
args->result = ret;
|
|
|
|
/* don't warn if EINPROGRESS, someone else might be running scrub */
|
2018-11-11 14:22:24 +00:00
|
|
|
if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
|
|
|
|
ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
|
|
|
|
return 0;
|
2016-03-24 10:48:14 +00:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
/*
|
2016-05-20 01:18:45 +00:00
|
|
|
* blocked until all in-flight bios operations are finished.
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-04 23:04:49 +00:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
|
|
|
|
&fs_info->dev_replace.bio_counter));
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we have removed target device, it is safe to allow new bios request.
|
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-04 23:04:49 +00:00
|
|
|
wake_up(&fs_info->dev_replace.replace_wait);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
}
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 14:30:16 +00:00
|
|
|
/*
|
|
|
|
* When finishing the device replace, before swapping the source device with the
|
|
|
|
* target device we must update the chunk allocation state in the target device,
|
|
|
|
* as it is empty because replace works by directly copying the chunks and not
|
|
|
|
* through the normal chunk allocation path.
|
|
|
|
*/
|
|
|
|
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
u64 start = 0;
|
|
|
|
u64 found_start;
|
|
|
|
u64 found_end;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
|
|
|
|
|
2023-06-30 15:03:49 +00:00
|
|
|
while (find_first_extent_bit(&srcdev->alloc_state, start,
|
|
|
|
&found_start, &found_end,
|
|
|
|
CHUNK_ALLOCATED, &cached_state)) {
|
2023-05-24 23:04:32 +00:00
|
|
|
ret = set_extent_bit(&tgtdev->alloc_state, found_start,
|
2023-05-24 23:04:39 +00:00
|
|
|
found_end, CHUNK_ALLOCATED, NULL);
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 14:30:16 +00:00
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
start = found_end + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_extent_state(cached_state);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-09-04 17:34:36 +00:00
|
|
|
static void btrfs_dev_replace_update_device_in_mapping_tree(
|
|
|
|
struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
|
|
|
u64 start = 0;
|
|
|
|
int i;
|
|
|
|
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
write_lock(&fs_info->mapping_tree_lock);
|
2020-09-04 17:34:36 +00:00
|
|
|
do {
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
struct btrfs_chunk_map *map;
|
|
|
|
|
|
|
|
map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
|
|
|
|
if (!map)
|
2020-09-04 17:34:36 +00:00
|
|
|
break;
|
|
|
|
for (i = 0; i < map->num_stripes; i++)
|
|
|
|
if (srcdev == map->stripes[i].dev)
|
|
|
|
map->stripes[i].dev = tgtdev;
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
start = map->start + map->chunk_len;
|
|
|
|
btrfs_free_chunk_map(map);
|
2020-09-04 17:34:36 +00:00
|
|
|
} while (start);
|
btrfs: use a dedicated data structure for chunk maps
Currently we abuse the extent_map structure for two purposes:
1) To actually represent extents for inodes;
2) To represent chunk mappings.
This is odd and has several disadvantages:
1) To create a chunk map, we need to do two memory allocations: one for
an extent_map structure and another one for a map_lookup structure, so
more potential for an allocation failure and more complicated code to
manage and link two structures;
2) For a chunk map we actually only use 3 fields (24 bytes) of the
respective extent map structure: the 'start' field to have the logical
start address of the chunk, the 'len' field to have the chunk's size,
and the 'orig_block_len' field to contain the chunk's stripe size.
Besides wasting a memory, it's also odd and not intuitive at all to
have the stripe size in a field named 'orig_block_len'.
We are also using 'block_len' of the extent_map structure to contain
the chunk size, so we have 2 fields for the same value, 'len' and
'block_len', which is pointless;
3) When an extent map is associated to a chunk mapping, we set the bit
EXTENT_FLAG_FS_MAPPING on its flags and then make its member named
'map_lookup' point to the associated map_lookup structure. This means
that for an extent map associated to an inode extent, we are not using
this 'map_lookup' pointer, so wasting 8 bytes (on a 64 bits platform);
4) Extent maps associated to a chunk mapping are never merged or split so
it's pointless to use the existing extent map infrastructure.
So add a dedicated data structure named 'btrfs_chunk_map' to represent
chunk mappings, this is basically the existing map_lookup structure with
some extra fields:
1) 'start' to contain the chunk logical address;
2) 'chunk_len' to contain the chunk's length;
3) 'stripe_size' for the stripe size;
4) 'rb_node' for insertion into a rb tree;
5) 'refs' for reference counting.
This way we do a single memory allocation for chunk mappings and we don't
waste memory for them with unused/unnecessary fields from an extent_map.
We also save 8 bytes from the extent_map structure by removing the
'map_lookup' pointer, so the size of struct extent_map is reduced from
144 bytes down to 136 bytes, and we can now have 30 extents map per 4K
page instead of 28.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2023-11-21 13:38:38 +00:00
|
|
|
write_unlock(&fs_info->mapping_tree_lock);
|
2020-09-04 17:34:36 +00:00
|
|
|
}
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
|
|
|
|
int scrub_ret)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
2022-03-14 02:09:29 +00:00
|
|
|
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
|
2012-11-05 16:33:06 +00:00
|
|
|
struct btrfs_device *tgt_device;
|
|
|
|
struct btrfs_device *src_device;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
|
|
|
u8 uuid_tmp[BTRFS_UUID_SIZE];
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/* don't allow cancel or unmount to disturb the finishing procedure */
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
/* was the operation canceled, or is it finished? */
|
|
|
|
if (dev_replace->replace_state !=
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
|
2018-09-07 14:11:23 +00:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* flush all outstanding I/O and inode extent mappings before the
|
|
|
|
* copy operation is declared as being finished
|
|
|
|
*/
|
2021-01-11 10:58:11 +00:00
|
|
|
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
|
2013-01-22 10:49:33 +00:00
|
|
|
if (ret) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return ret;
|
|
|
|
}
|
2017-06-23 16:48:21 +00:00
|
|
|
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2019-05-17 07:44:25 +00:00
|
|
|
/*
|
|
|
|
* We have to use this loop approach because at this point src_device
|
|
|
|
* has to be available for transaction commit to complete, yet new
|
|
|
|
* chunks shouldn't be allocated on the device.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
|
|
|
|
|
|
|
/* Prevent write_all_supers() during the finishing procedure */
|
2022-03-14 02:09:29 +00:00
|
|
|
mutex_lock(&fs_devices->device_list_mutex);
|
2019-05-17 07:44:25 +00:00
|
|
|
/* Prevent new chunks being allocated on the source device */
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
if (!list_empty(&src_device->post_commit_list)) {
|
2022-03-14 02:09:29 +00:00
|
|
|
mutex_unlock(&fs_devices->device_list_mutex);
|
2019-05-17 07:44:25 +00:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->replace_state =
|
|
|
|
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
|
|
|
|
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-06-12 11:48:25 +00:00
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 14:30:16 +00:00
|
|
|
/*
|
|
|
|
* Update allocation state in the new device and replace the old device
|
|
|
|
* with the new one in the mapping tree.
|
|
|
|
*/
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very easy
to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
if (!scrub_ret) {
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 14:30:16 +00:00
|
|
|
scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
|
|
|
|
if (scrub_ret)
|
|
|
|
goto error;
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace testing, we hit a null pointer dereference (it was very easy
to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
|
|
|
|
src_device,
|
|
|
|
tgt_device);
|
|
|
|
} else {
|
2018-11-20 11:56:16 +00:00
|
|
|
if (scrub_ret != -ECANCELED)
|
|
|
|
btrfs_err_in_rcu(fs_info,
|
2016-06-22 22:54:23 +00:00
|
|
|
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
|
2017-11-28 02:43:10 +00:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-22 22:54:23 +00:00
|
|
|
src_device->devid,
|
2022-11-13 01:32:07 +00:00
|
|
|
btrfs_dev_name(tgt_device), scrub_ret);
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 14:30:16 +00:00
|
|
|
error:
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2016-06-22 22:54:23 +00:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2022-03-14 02:09:29 +00:00
|
|
|
mutex_unlock(&fs_devices->device_list_mutex);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection failure.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 01:33:21 +00:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
2012-11-05 16:33:06 +00:00
|
|
|
if (tgt_device)
|
2018-07-20 16:37:51 +00:00
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection failure.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 01:33:21 +00:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
2012-11-05 16:33:06 +00:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2014-10-13 04:42:12 +00:00
|
|
|
return scrub_ret;
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
|
2016-06-22 22:54:23 +00:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s finished",
|
2017-11-28 02:43:10 +00:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-22 22:54:23 +00:00
|
|
|
src_device->devid,
|
2022-11-13 01:32:07 +00:00
|
|
|
btrfs_dev_name(tgt_device));
|
2017-12-04 04:54:55 +00:00
|
|
|
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
|
2012-11-05 16:33:06 +00:00
|
|
|
tgt_device->devid = src_device->devid;
|
|
|
|
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
|
|
|
|
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
|
|
|
|
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
|
2014-09-03 13:35:38 +00:00
|
|
|
btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
|
|
|
|
btrfs_device_set_disk_total_bytes(tgt_device,
|
|
|
|
src_device->disk_total_bytes);
|
|
|
|
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
|
2014-09-03 13:35:34 +00:00
|
|
|
tgt_device->commit_bytes_used = src_device->bytes_used;
|
2016-05-03 09:44:43 +00:00
|
|
|
|
2018-07-20 16:37:50 +00:00
|
|
|
btrfs_assign_next_active_device(src_device, tgt_device);
|
2016-05-03 09:44:43 +00:00
|
|
|
|
2022-03-14 02:09:29 +00:00
|
|
|
list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
|
|
|
|
fs_devices->rw_devices++;
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
|
|
|
|
2018-07-20 16:37:48 +00:00
|
|
|
btrfs_rm_dev_replace_remove_srcdev(src_device);
|
2013-10-02 17:41:01 +00:00
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
|
|
|
|
2018-07-31 07:20:21 +00:00
|
|
|
/*
|
|
|
|
* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
|
|
|
|
* update on-disk dev stats value during commit transaction
|
|
|
|
*/
|
|
|
|
atomic_inc(&tgt_device->dev_stats_ccnt);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
/*
|
|
|
|
* this is again a consistent state where no dev_replace procedure
|
|
|
|
* is running, the target device is part of the filesystem, the
|
|
|
|
* source device is not part of the filesystem anymore and its 1st
|
|
|
|
* superblock is scratched out so that it is no longer marked to
|
|
|
|
* belong to this filesystem.
|
|
|
|
*/
|
2016-06-22 22:54:23 +00:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
2022-03-14 02:09:29 +00:00
|
|
|
mutex_unlock(&fs_devices->device_list_mutex);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2014-10-30 08:52:31 +00:00
|
|
|
/* replace the sysfs entry */
|
2020-09-04 17:34:27 +00:00
|
|
|
btrfs_sysfs_remove_device(src_device);
|
2020-01-06 11:38:31 +00:00
|
|
|
btrfs_sysfs_update_devid(tgt_device);
|
2020-08-20 15:18:26 +00:00
|
|
|
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
|
|
|
|
btrfs_scratch_superblocks(fs_info, src_device->bdev,
|
|
|
|
src_device->name->str);
|
2014-10-30 08:52:31 +00:00
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
/* write back the superblocks */
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (!IS_ERR(trans))
|
2016-09-10 01:39:03 +00:00
|
|
|
btrfs_commit_transaction(trans);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks
When closing and freeing the source device we could end up doing our
final blkdev_put() on the bdev, which will grab the bd_mutex. As such
we want to be holding as few locks as possible, so move this call
outside of the dev_replace->lock_finishing_cancel_unmount lock. Since
we're modifying the fs_devices we need to make sure we're holding the
uuid_mutex here, so take that as well.
There's a report from syzbot probably hitting one of the cases where
the bd_mutex and device_list_mutex are taken in the wrong order, however
it's not with device replace, like this patch fixes. As there's no
reproducer available so far, we can't verify the fix.
https://lore.kernel.org/lkml/000000000000fc04d105afcf86d7@google.com/
dashboard link: https://syzkaller.appspot.com/bug?extid=84a0634dc5d21d488419
WARNING: possible circular locking dependency detected
5.9.0-rc5-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor.0/6878 is trying to acquire lock:
ffff88804c17d780 (&bdev->bd_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x520 fs/block_dev.c:1804
but task is already holding lock:
ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #4 (&fs_devs->device_list_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
btrfs_finish_chunk_alloc+0x281/0xf90 fs/btrfs/volumes.c:5255
btrfs_create_pending_block_groups+0x2f3/0x700 fs/btrfs/block-group.c:2109
__btrfs_end_transaction+0xf5/0x690 fs/btrfs/transaction.c:916
find_free_extent_update_loop fs/btrfs/extent-tree.c:3807 [inline]
find_free_extent+0x23b7/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #3 (sb_internal#2){.+.+}-{0:0}:
percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
__sb_start_write+0x234/0x470 fs/super.c:1672
sb_start_intwrite include/linux/fs.h:1690 [inline]
start_transaction+0xbe7/0x1170 fs/btrfs/transaction.c:624
find_free_extent_update_loop fs/btrfs/extent-tree.c:3789 [inline]
find_free_extent+0x25e1/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #2 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}:
__flush_work+0x60e/0xac0 kernel/workqueue.c:3041
wb_shutdown+0x180/0x220 mm/backing-dev.c:355
bdi_unregister+0x174/0x590 mm/backing-dev.c:872
del_gendisk+0x820/0xa10 block/genhd.c:933
loop_remove drivers/block/loop.c:2192 [inline]
loop_control_ioctl drivers/block/loop.c:2291 [inline]
loop_control_ioctl+0x3b1/0x480 drivers/block/loop.c:2257
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #1 (loop_ctl_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
lo_open+0x19/0xd0 drivers/block/loop.c:1893
__blkdev_get+0x759/0x1aa0 fs/block_dev.c:1507
blkdev_get fs/block_dev.c:1639 [inline]
blkdev_open+0x227/0x300 fs/block_dev.c:1753
do_dentry_open+0x4b9/0x11b0 fs/open.c:817
do_open fs/namei.c:3251 [inline]
path_openat+0x1b9a/0x2730 fs/namei.c:3368
do_filp_open+0x17e/0x3c0 fs/namei.c:3395
do_sys_openat2+0x16d/0x420 fs/open.c:1168
do_sys_open fs/open.c:1184 [inline]
__do_sys_open fs/open.c:1192 [inline]
__se_sys_open fs/open.c:1188 [inline]
__x64_sys_open+0x119/0x1c0 fs/open.c:1188
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #0 (&bdev->bd_mutex){+.+.}-{3:3}:
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
other info that might help us debug this:
Chain exists of:
&bdev->bd_mutex --> sb_internal#2 --> &fs_devs->device_list_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_devs->device_list_mutex);
lock(sb_internal#2);
lock(&fs_devs->device_list_mutex);
lock(&bdev->bd_mutex);
*** DEADLOCK ***
3 locks held by syz-executor.0/6878:
#0: ffff88809070c0e0 (&type->s_umount_key#70){++++}-{3:3}, at: deactivate_super+0xa5/0xd0 fs/super.c:365
#1: ffffffff8a5b37a8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_close_devices+0x23/0x1f0 fs/btrfs/volumes.c:1178
#2: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
stack backtrace:
CPU: 0 PID: 6878 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x198/0x1fd lib/dump_stack.c:118
check_noncircular+0x324/0x3e0 kernel/locking/lockdep.c:1827
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x460027
RSP: 002b:00007fff59216328 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 0000000000076035 RCX: 0000000000460027
RDX: 0000000000403188 RSI: 0000000000000002 RDI: 00007fff592163d0
RBP: 0000000000000333 R08: 0000000000000000 R09: 000000000000000b
R10: 0000000000000005 R11: 0000000000000246 R12: 00007fff59217460
R13: 0000000002df2a60 R14: 0000000000000000 R15: 00007fff59217460
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add syzbot reference ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-20 15:18:27 +00:00
|
|
|
btrfs_rm_dev_replace_free_srcdev(src_device);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-06-14 14:24:56 +00:00
|
|
|
/*
|
|
|
|
* Read progress of device replace status according to the state and last
|
|
|
|
* stored position. The value format is the same as for
|
|
|
|
* btrfs_dev_replace::progress_1000
|
|
|
|
*/
|
|
|
|
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 ret = 0;
|
|
|
|
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
ret = 1000;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
ret = div64_u64(dev_replace->cursor_left,
|
|
|
|
div_u64(btrfs_device_get_total_bytes(
|
|
|
|
dev_replace->srcdev), 1000));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
/* even if !dev_replace_is_valid, the values are good enough for
|
|
|
|
* the replace_status ioctl */
|
|
|
|
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
args->status.replace_state = dev_replace->replace_state;
|
|
|
|
args->status.time_started = dev_replace->time_started;
|
|
|
|
args->status.time_stopped = dev_replace->time_stopped;
|
|
|
|
args->status.num_write_errors =
|
|
|
|
atomic64_read(&dev_replace->num_write_errors);
|
|
|
|
args->status.num_uncorrectable_read_errors =
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
|
2017-06-14 14:24:56 +00:00
|
|
|
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
|
2018-09-07 14:11:23 +00:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
|
2018-02-12 15:33:31 +00:00
|
|
|
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
|
2012-11-05 16:33:06 +00:00
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct btrfs_device *tgt_device = NULL;
|
2018-02-13 03:53:43 +00:00
|
|
|
struct btrfs_device *src_device = NULL;
|
2012-11-05 16:33:06 +00:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
2018-02-12 15:33:31 +00:00
|
|
|
int result;
|
2012-11-05 16:33:06 +00:00
|
|
|
int ret;
|
|
|
|
|
2017-07-17 07:45:34 +00:00
|
|
|
if (sb_rdonly(fs_info->sb))
|
2013-10-10 17:40:21 +00:00
|
|
|
return -EROFS;
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 05:50:26 +00:00
|
|
|
break;
|
2012-11-05 16:33:06 +00:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
2018-11-14 05:50:26 +00:00
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-11 14:22:20 +00:00
|
|
|
ret = btrfs_scrub_cancel(fs_info);
|
|
|
|
if (ret < 0) {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
|
|
|
} else {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
/*
|
|
|
|
* btrfs_dev_replace_finishing() will handle the
|
|
|
|
* cleanup part
|
|
|
|
*/
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
}
|
2018-11-14 05:50:26 +00:00
|
|
|
break;
|
2012-11-05 16:33:06 +00:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2018-11-14 05:50:26 +00:00
|
|
|
/*
|
|
|
|
* Scrub doing the replace isn't running so we need to do the
|
|
|
|
* cleanup step of btrfs_dev_replace_finishing() here
|
|
|
|
*/
|
2012-11-05 16:33:06 +00:00
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
2018-02-13 03:53:43 +00:00
|
|
|
src_device = dev_replace->srcdev;
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-11-14 05:50:26 +00:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
|
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
|
|
|
dev_replace->item_needs_writeback = 1;
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-02-13 03:53:43 +00:00
|
|
|
|
2018-11-11 14:22:21 +00:00
|
|
|
/* Scrub for replace must not be running in suspended state */
|
2022-08-12 10:32:18 +00:00
|
|
|
btrfs_scrub_cancel(fs_info);
|
2018-11-14 05:50:26 +00:00
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
2018-02-13 03:53:43 +00:00
|
|
|
|
2018-11-14 05:50:26 +00:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"suspended dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
|
|
|
|
if (tgt_device)
|
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
|
|
|
break;
|
|
|
|
default:
|
2019-02-11 18:32:10 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 05:50:26 +00:00
|
|
|
result = -EINVAL;
|
|
|
|
}
|
2012-11-05 16:33:06 +00:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-06-12 11:48:25 +00:00
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-05 16:33:06 +00:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
2013-12-20 16:37:06 +00:00
|
|
|
btrfs_info(fs_info, "suspending dev_replace for unmount");
|
2012-11-05 16:33:06 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* resume dev_replace procedure that was interrupted by unmount */
|
|
|
|
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct task_struct *task;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
return 0;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
|
2013-12-20 16:37:06 +00:00
|
|
|
btrfs_info(fs_info,
|
2016-09-20 14:05:00 +00:00
|
|
|
"cannot continue dev_replace, tgtdev is missing");
|
|
|
|
btrfs_info(fs_info,
|
|
|
|
"you may cancel the operation after 'mount -o degraded'");
|
2018-11-11 14:22:17 +00:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2018-03-20 18:51:04 +00:00
|
|
|
/*
|
|
|
|
* This could collide with a paused balance, but the exclusive op logic
|
|
|
|
* should never allow both to start and pause. We don't want to allow
|
|
|
|
* dev-replace to start anyway.
|
|
|
|
*/
|
2020-08-25 15:02:32 +00:00
|
|
|
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
|
2018-09-07 14:11:23 +00:00
|
|
|
down_write(&dev_replace->rwsem);
|
2018-11-11 14:22:18 +00:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 14:11:23 +00:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-03-20 18:51:04 +00:00
|
|
|
btrfs_info(fs_info,
|
|
|
|
"cannot resume dev-replace, other exclusive operation running");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
|
2013-07-15 01:50:32 +00:00
|
|
|
return PTR_ERR_OR_ZERO(task);
|
2012-11-05 16:33:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_dev_replace_kthread(void *data)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = data;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 progress;
|
2018-03-20 14:35:50 +00:00
|
|
|
int ret;
|
2012-11-05 16:33:06 +00:00
|
|
|
|
2017-06-14 14:28:42 +00:00
|
|
|
progress = btrfs_dev_replace_progress(fs_info);
|
|
|
|
progress = div_u64(progress, 10);
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
2017-11-28 02:43:10 +00:00
|
|
|
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
|
|
|
|
btrfs_dev_name(dev_replace->srcdev),
|
2017-06-14 14:28:42 +00:00
|
|
|
dev_replace->srcdev->devid,
|
2017-11-28 02:43:10 +00:00
|
|
|
btrfs_dev_name(dev_replace->tgtdev),
|
2017-06-14 14:28:42 +00:00
|
|
|
(unsigned int)progress);
|
|
|
|
|
2012-11-05 16:33:06 +00:00
|
|
|
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
|
|
|
|
dev_replace->committed_cursor_left,
|
2014-09-03 13:35:38 +00:00
|
|
|
btrfs_device_get_total_bytes(dev_replace->srcdev),
|
2012-11-05 16:33:06 +00:00
|
|
|
&dev_replace->scrub_progress, 0, 1);
|
|
|
|
ret = btrfs_dev_replace_finishing(fs_info, ret);
|
2018-11-20 11:56:15 +00:00
|
|
|
WARN_ON(ret && ret != -ECANCELED);
|
2018-03-20 14:35:50 +00:00
|
|
|
|
2020-08-25 15:02:32 +00:00
|
|
|
btrfs_exclop_finish(fs_info);
|
2012-11-05 16:33:06 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-10-01 17:57:39 +00:00
|
|
|
/*
 * Tell whether a device replace is ongoing.
 *
 * Return: 0 if the dev_replace structure is not valid, or if the replace
 * never started, finished or was canceled; 1 otherwise, i.e. for the
 * STARTED and SUSPENDED states.
 */
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
	if (!dev_replace->is_valid)
		return 0;

	if (dev_replace->replace_state ==
	    BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED)
		return 0;
	if (dev_replace->replace_state ==
	    BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED)
		return 0;
	if (dev_replace->replace_state ==
	    BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED)
		return 0;

	/*
	 * STARTED or SUSPENDED.
	 *
	 * Return true even if tgtdev is missing. This can happen if the
	 * dev_replace procedure is suspended by an umount and then the
	 * tgtdev is missing (or "btrfs dev scan" was not called) and the
	 * filesystem is remounted in degraded state. This does not stop the
	 * dev_replace procedure. It needs to be canceled manually if the
	 * cancellation is wanted.
	 */
	return 1;
}
|
|
|
|
|
2014-11-25 08:39:28 +00:00
|
|
|
/*
 * Subtract @amount from the dev-replace bio counter and wake up anyone
 * sleeping on the replace wait queue.
 *
 * The counter is incremented before bios are submitted and decremented when
 * they end, so that the finishing procedure of a device replace can make
 * sure all in-flight bios are done before the source device is removed.
 */
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	percpu_counter_sub(&dev_replace->bio_counter, amount);
	cond_wake_up_nomb(&dev_replace->replace_wait);
}
|
|
|
|
|
|
|
|
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2015-01-20 07:11:37 +00:00
|
|
|
while (1) {
|
2018-04-04 23:04:49 +00:00
|
|
|
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
|
2015-01-20 07:11:37 +00:00
|
|
|
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state)))
|
|
|
|
break;
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
btrfs_bio_counter_dec(fs_info);
|
2018-04-04 23:04:49 +00:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait,
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 08:46:55 +00:00
|
|
|
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state));
|
|
|
|
}
|
|
|
|
}
|