btrfs: add support for inserting raid stripe extents

Add support for inserting stripe extents into the raid stripe tree on
completion of every write that needs an extra logical-to-physical
translation when using RAID.

Inserting the stripe extents happens after the data I/O has completed;
this is done to:

  a) support zone-append and
  b) rule out the possibility of a RAID-write-hole.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
This commit is contained in:
Johannes Thumshirn 2023-09-14 09:06:58 -07:00 committed by David Sterba
parent 515020900d
commit 02c372e1f0
10 changed files with 168 additions and 9 deletions

View File

@ -33,7 +33,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \
lru_cache.o
lru_cache.o raid-stripe-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o

View File

@ -14,6 +14,7 @@
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"
#include "raid-stripe-tree.h"
static struct bio_set btrfs_bioset;
static struct bio_set btrfs_clone_bioset;
@ -415,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio)
else
bio->bi_status = BLK_STS_OK;
if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status)
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
btrfs_orig_bbio_end_io(bbio);
btrfs_put_bioc(bioc);
}
@ -426,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio)
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
} else if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
}
/* Pass on control to the original bio this one was cloned from */
@ -487,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
bioc->size = bio->bi_iter.bi_size;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
@ -496,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc,
if (!bioc) {
/* Single mirror read/write fast path. */
btrfs_bio(bio)->mirror_num = mirror_num;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT;
if (bio_op(bio) != REQ_OP_READ)
btrfs_bio(bio)->orig_physical = smap->physical;
@ -688,6 +697,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
bio->bi_opf |= REQ_OP_ZONE_APPEND;
}
if (is_data_bbio(bbio) && bioc &&
btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) {
/*
* No locking for the list update, as we only add to
* the list in the I/O submission path, and list
* iteration only happens in the completion path, which
* can't happen until after the last submission.
*/
btrfs_get_bioc(bioc);
list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list);
}
/*
* Csum items for reloc roots have already been cloned at this
* point, so they are handled as part of the no-checksum case.

View File

@ -42,6 +42,7 @@
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
#undef SCRAMBLE_DELAYED_REFS

View File

@ -71,6 +71,7 @@
#include "super.h"
#include "orphan.h"
#include "backref.h"
#include "raid-stripe-tree.h"
struct btrfs_iget_args {
u64 ino;
@ -3091,6 +3092,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
if (ret)
goto out;
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
@ -3224,7 +3229,8 @@ out:
int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
{
if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
list_empty(&ordered->bioc_list))
btrfs_finish_ordered_zoned(ordered);
return btrfs_finish_one_ordered(ordered);
}

View File

@ -191,6 +191,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent(
INIT_LIST_HEAD(&entry->log_list);
INIT_LIST_HEAD(&entry->root_extent_list);
INIT_LIST_HEAD(&entry->work_list);
INIT_LIST_HEAD(&entry->bioc_list);
init_completion(&entry->completion);
/*

View File

@ -151,6 +151,8 @@ struct btrfs_ordered_extent {
struct completion completion;
struct btrfs_work flush_work;
struct list_head work_list;
struct list_head bioc_list;
};
static inline void

View File

@ -0,0 +1,87 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2023 Western Digital Corporation or its affiliates.
*/
#include <linux/btrfs_tree.h>
#include "ctree.h"
#include "fs.h"
#include "accessors.h"
#include "transaction.h"
#include "disk-io.h"
#include "raid-stripe-tree.h"
#include "volumes.h"
#include "misc.h"
#include "print-tree.h"
/*
 * Insert one stripe extent item describing @bioc into the raid stripe tree.
 *
 * The item is keyed by (bioc->logical, BTRFS_RAID_STRIPE_KEY, bioc->size).
 * Its payload holds the RAID encoding derived from bioc->map_type followed by
 * btrfs_bg_type_to_factor(bioc->map_type) strides, each recording the devid
 * and the physical address of the matching entry in bioc->stripes[].
 *
 * @trans: transaction handle passed in by the caller. It is aborted on
 *         failure but never ended here, so every error path leaves the
 *         handle in the same state for the caller to release.
 * @bioc:  I/O context of the write to record.
 *
 * Return: 0 on success, -ENOMEM if the temporary item buffer cannot be
 * allocated, otherwise the error returned by btrfs_insert_item().
 */
static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
					struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key stripe_key;
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
	u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type);
	struct btrfs_stripe_extent *stripe_extent;
	const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
	int ret;

	stripe_extent = kzalloc(item_size, GFP_NOFS);
	if (!stripe_extent) {
		/*
		 * Only abort: the handle was handed in by the caller, which
		 * also has to cope with the insert failure below where the
		 * transaction is aborted but not ended.
		 */
		btrfs_abort_transaction(trans, -ENOMEM);
		return -ENOMEM;
	}

	btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding);
	/* Only devid and physical address are recorded per stride. */
	for (int i = 0; i < num_stripes; i++) {
		u64 devid = bioc->stripes[i].dev->devid;
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];

		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
	}

	stripe_key.objectid = bioc->logical;
	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
	stripe_key.offset = bioc->size;

	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
				item_size);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	/* The stack copy is no longer needed once btrfs_insert_item() returns. */
	kfree(stripe_extent);

	return ret;
}
/*
 * Insert a stripe extent for every I/O context queued on an ordered extent.
 *
 * Does nothing and returns 0 when the filesystem does not have the
 * RAID_STRIPE_TREE incompat feature. Otherwise each btrfs_io_context linked
 * on ordered_extent->bioc_list is written to the raid stripe tree, and once
 * all of them succeeded the list is drained and one reference per entry is
 * dropped with btrfs_put_bioc().
 *
 * @trans:          transaction handle used for the insertions
 * @ordered_extent: ordered extent whose bioc_list is processed
 *
 * Return: 0 on success (including an empty bioc_list), or the first error
 * returned by btrfs_insert_one_raid_extent().
 *
 * NOTE(review): on error the function returns early and the entries stay on
 * bioc_list with their references held; confirm that they are released
 * elsewhere.
 */
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
			     struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_io_context *bioc;
	int ret;

	if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
		return 0;

	list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
		ret = btrfs_insert_one_raid_extent(trans, bioc);
		if (ret)
			return ret;
	}

	while (!list_empty(&ordered_extent->bioc_list)) {
		bioc = list_first_entry(&ordered_extent->bioc_list,
					typeof(*bioc), rst_ordered_entry);
		list_del(&bioc->rst_ordered_entry);
		btrfs_put_bioc(bioc);
	}

	/*
	 * Every insertion succeeded, or the list was empty and the loop body
	 * never ran; in the latter case 'ret' was never assigned, so return
	 * an explicit 0 instead of reading it.
	 */
	return 0;
}

View File

@ -0,0 +1,35 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2023 Western Digital Corporation or its affiliates.
*/
#ifndef BTRFS_RAID_STRIPE_TREE_H
#define BTRFS_RAID_STRIPE_TREE_H
struct btrfs_io_context;
struct btrfs_io_stripe;
struct btrfs_ordered_extent;
struct btrfs_trans_handle;
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered_extent);
/*
 * Tell whether a write into a block group with flags @map_type needs an entry
 * in the raid stripe tree.
 *
 * The answer is true only when all of the following hold:
 *  - the filesystem has the RAID_STRIPE_TREE incompat feature,
 *  - the type bits of @map_type are exactly BTRFS_BLOCK_GROUP_DATA,
 *  - at least one BTRFS_BLOCK_GROUP_RAID1_MASK bit is set in the profile bits.
 */
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
						 u64 map_type)
{
	const bool data_only = (map_type & BTRFS_BLOCK_GROUP_TYPE_MASK) ==
			       BTRFS_BLOCK_GROUP_DATA;
	const bool raid1_profile = (map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK &
				    BTRFS_BLOCK_GROUP_RAID1_MASK) != 0;

	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE))
		return false;

	return data_only && raid1_profile;
}
#endif

View File

@ -5906,6 +5906,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
}
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
u64 logical,
u16 total_stripes)
{
struct btrfs_io_context *bioc;
@ -5925,6 +5926,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
bioc->fs_info = fs_info;
bioc->replace_stripe_src = -1;
bioc->full_stripe_logical = (u64)-1;
bioc->logical = logical;
return bioc;
}
@ -6451,7 +6453,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
goto out;
}
bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes);
bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
if (!bioc) {
ret = -ENOMEM;
goto out;

View File

@ -390,12 +390,11 @@ struct btrfs_fs_devices {
struct btrfs_io_stripe {
struct btrfs_device *dev;
union {
/* Block mapping */
u64 physical;
/* For the endio handler */
struct btrfs_io_context *bioc;
};
/* Block mapping. */
u64 physical;
u64 length;
/* For the endio handler. */
struct btrfs_io_context *bioc;
};
struct btrfs_discard_stripe {
@ -428,6 +427,11 @@ struct btrfs_io_context {
atomic_t error;
u16 max_errors;
u64 logical;
u64 size;
/* Raid stripe tree ordered entry. */
struct list_head rst_ordered_entry;
/*
* The total number of stripes, including the extra duplicated
* stripe for replace.