Merge tag 'md-6.9-20240301' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.9/block
Pull MD updates from Song:

"The major changes are:

 1. Refactor raid1 read_balance, by Yu Kuai and Paul Luse.
 2. Clean up and fix for md_ioctl, by Li Nan.
 3. Other small fixes, by Gui-Dong Han and Heming Zhao."

* tag 'md-6.9-20240301' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (22 commits)
  md/raid1: factor out helpers to choose the best rdev from read_balance()
  md/raid1: factor out the code to manage sequential IO
  md/raid1: factor out choose_bb_rdev() from read_balance()
  md/raid1: factor out choose_slow_rdev() from read_balance()
  md/raid1: factor out read_first_rdev() from read_balance()
  md/raid1-10: factor out a new helper raid1_should_read_first()
  md/raid1-10: add a helper raid1_check_read_range()
  md/raid1: fix choose next idle in read_balance()
  md/raid1: record nonrot rdevs while adding/removing rdevs to conf
  md/raid1: factor out helpers to add rdev to conf
  md: add a new helper rdev_has_badblock()
  md/raid5: fix atomicity violation in raid5_cache_count
  md/md-bitmap: fix incorrect usage for sb_index
  md: check mddev->pers before calling md_set_readonly()
  md: clean up openers check in do_md_stop() and md_set_readonly()
  md: sync blockdev before stopping raid or setting readonly
  md: factor out a helper to sync mddev
  md: Don't clear MD_CLOSING when the raid is about to stop
  md: return directly before setting did_set_md_closing
  md: clean up invalid BUG_ON in md_ioctl
  ...
commit 86b1e613eb
drivers/md/md-bitmap.c

@@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
     sector_t doff;

     bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
-    if (pg_index == store->file_pages - 1) {
+    /* we compare length (page numbers), not page offset. */
+    if ((pg_index - store->sb_index) == store->file_pages - 1) {
         unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);

         if (last_page_size == 0)
@@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
     struct page *page = store->filemap[pg_index];

     if (mddev_is_clustered(bitmap->mddev)) {
-        pg_index += bitmap->cluster_slot *
-            DIV_ROUND_UP(store->bytes, PAGE_SIZE);
+        /* go to node bitmap area starting point */
+        pg_index += store->sb_index;
     }

     if (store->file)
@@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
     unsigned long index = file_page_index(store, chunk);
     unsigned long node_offset = 0;

+    index += store->sb_index;
     if (mddev_is_clustered(bitmap->mddev))
         node_offset = bitmap->cluster_slot * store->file_pages;

@@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
     unsigned long index = file_page_index(store, chunk);
     unsigned long node_offset = 0;

+    index += store->sb_index;
     if (mddev_is_clustered(bitmap->mddev))
         node_offset = bitmap->cluster_slot * store->file_pages;
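The sb_index fix above rests on one invariant: store->file_pages counts pages of the bitmap file starting at the superblock page store->sb_index, while pg_index in __write_sb_page() is an absolute page index, so the last-page test must be done on the relative index. A minimal standalone sketch of the arithmetic, with made-up names (bitmap_storage_model and is_last_page are not kernel identifiers):

#include <stdbool.h>

/* Only the arithmetic mirrors the fix; the types are illustrative. */
struct bitmap_storage_model {
    unsigned long sb_index;    /* page holding the bitmap superblock */
    unsigned long file_pages;  /* pages in use, counted from sb_index on */
};

static bool is_last_page(const struct bitmap_storage_model *store,
                         unsigned long pg_index)
{
    /* compare a length (page count), not an absolute page offset */
    return (pg_index - store->sb_index) == store->file_pages - 1;
}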
drivers/md/md.c (183 lines changed)

@@ -529,6 +529,24 @@ void mddev_resume(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_resume);

+/* sync bdev before setting device to readonly or stopping raid*/
+static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
+{
+    mutex_lock(&mddev->open_mutex);
+    if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
+        mutex_unlock(&mddev->open_mutex);
+        return -EBUSY;
+    }
+    if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
+        mutex_unlock(&mddev->open_mutex);
+        return -EBUSY;
+    }
+    mutex_unlock(&mddev->open_mutex);
+
+    sync_blockdev(mddev->gendisk->part0);
+    return 0;
+}
+
 /*
  * Generic flush handling for md
  */
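The helper above folds several call sites into one pattern: take open_mutex, verify no opener beyond the caller itself (opener_num is 0 for sysfs writes, which do not open the mddev, and 1 for ioctls arriving through an open block device), claim MD_CLOSING atomically, then flush dirty pages. A userspace sketch of the same shape, with hypothetical names (dev_model and set_closing_and_sync are not kernel identifiers):

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct dev_model {
    pthread_mutex_t open_mutex;
    atomic_int openers;
    atomic_bool closing;
};

/* Returns 0 on success, -EBUSY if another opener exists or a close is
 * already in flight; mirrors the helper in the diff above. */
static int set_closing_and_sync(struct dev_model *d, int opener_num)
{
    pthread_mutex_lock(&d->open_mutex);
    if (atomic_load(&d->openers) > opener_num ||
        atomic_exchange(&d->closing, true)) {
        pthread_mutex_unlock(&d->open_mutex);
        return -EBUSY;
    }
    pthread_mutex_unlock(&d->open_mutex);
    /* a flush of cached pages would go here, like sync_blockdev() */
    return 0;
}

Note the error path never clears a flag the caller did not set; only a successful call transfers ownership of the closing flag to the caller, which keeps retries safe.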
@@ -4464,8 +4482,8 @@ array_state_show(struct mddev *mddev, char *page)
     return sprintf(page, "%s\n", array_states[st]);
 }

-static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
-static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
+static int do_md_stop(struct mddev *mddev, int ro);
+static int md_set_readonly(struct mddev *mddev);
 static int restart_array(struct mddev *mddev);

 static ssize_t
@@ -4482,6 +4500,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
     case broken:        /* cannot be set */
     case bad_word:
         return -EINVAL;
+    case clear:
+    case readonly:
+    case inactive:
+    case read_auto:
+        if (!mddev->pers || !md_is_rdwr(mddev))
+            break;
+        /* write sysfs will not open mddev and opener should be 0 */
+        err = mddev_set_closing_and_sync_blockdev(mddev, 0);
+        if (err)
+            return err;
+        break;
     default:
         break;
     }
@@ -4515,14 +4544,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
     case inactive:
         /* stop an active array, return 0 otherwise */
         if (mddev->pers)
-            err = do_md_stop(mddev, 2, NULL);
+            err = do_md_stop(mddev, 2);
         break;
     case clear:
-        err = do_md_stop(mddev, 0, NULL);
+        err = do_md_stop(mddev, 0);
         break;
     case readonly:
         if (mddev->pers)
-            err = md_set_readonly(mddev, NULL);
+            err = md_set_readonly(mddev);
         else {
             mddev->ro = MD_RDONLY;
             set_disk_ro(mddev->gendisk, 1);
@@ -4532,7 +4561,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
     case read_auto:
         if (mddev->pers) {
             if (md_is_rdwr(mddev))
-                err = md_set_readonly(mddev, NULL);
+                err = md_set_readonly(mddev);
             else if (mddev->ro == MD_RDONLY)
                 err = restart_array(mddev);
             if (err == 0) {
@@ -4581,6 +4610,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
         sysfs_notify_dirent_safe(mddev->sysfs_state);
     }
     mddev_unlock(mddev);
+
+    if (st == readonly || st == read_auto || st == inactive ||
+        (err && st == clear))
+        clear_bit(MD_CLOSING, &mddev->flags);
+
     return err ?: len;
 }
 static struct md_sysfs_entry md_array_state =
@@ -6265,7 +6299,15 @@ static void md_clean(struct mddev *mddev)
     mddev->persistent = 0;
     mddev->level = LEVEL_NONE;
     mddev->clevel[0] = 0;
-    mddev->flags = 0;
+    /*
+     * Don't clear MD_CLOSING, or mddev can be opened again.
+     * 'hold_active != 0' means mddev is still in the creation
+     * process and will be used later.
+     */
+    if (mddev->hold_active)
+        mddev->flags = 0;
+    else
+        mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
     mddev->sb_flags = 0;
     mddev->ro = MD_RDWR;
     mddev->metadata_type[0] = 0;
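In the else branch, BIT_ULL_MASK(MD_CLOSING) expands to 1ULL shifted by MD_CLOSING's bit number, so the AND clears every other flag while leaving MD_CLOSING's current value untouched. A hedged numeric illustration (the bit number 7 is made up; the real MD_CLOSING value differs):

#include <assert.h>

#define MD_CLOSING_BIT 7

static void md_clean_flags_demo(void)
{
    unsigned long long flags = 0xffULL;     /* many flags set, incl. bit 7 */

    flags &= 1ULL << MD_CLOSING_BIT;        /* what BIT_ULL_MASK() expands to */
    assert(flags == 0x80);                  /* only MD_CLOSING survives */
}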
@@ -6378,7 +6420,8 @@ void md_stop(struct mddev *mddev)

 EXPORT_SYMBOL_GPL(md_stop);

-static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
+/* ensure 'mddev->pers' exist before calling md_set_readonly() */
+static int md_set_readonly(struct mddev *mddev)
 {
     int err = 0;
     int did_freeze = 0;
@@ -6396,34 +6439,29 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
            !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
     mddev_lock_nointr(mddev);

-    mutex_lock(&mddev->open_mutex);
-    if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
-        test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+    if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
         pr_warn("md: %s still in use.\n",mdname(mddev));
         err = -EBUSY;
         goto out;
     }

-    if (mddev->pers) {
-        __md_stop_writes(mddev);
+    __md_stop_writes(mddev);

-        if (mddev->ro == MD_RDONLY) {
-            err = -ENXIO;
-            goto out;
-        }
+    if (mddev->ro == MD_RDONLY) {
+        err = -ENXIO;
+        goto out;
+    }

-        mddev->ro = MD_RDONLY;
-        set_disk_ro(mddev->gendisk, 1);
-    }
+    mddev->ro = MD_RDONLY;
+    set_disk_ro(mddev->gendisk, 1);

 out:
-    if ((mddev->pers && !err) || did_freeze) {
+    if (!err || did_freeze) {
         clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
         sysfs_notify_dirent_safe(mddev->sysfs_state);
     }

-    mutex_unlock(&mddev->open_mutex);
     return err;
 }
@@ -6431,8 +6469,7 @@ out:
  *   0 - completely stop and dis-assemble array
  *   2 - stop but do not disassemble array
  */
-static int do_md_stop(struct mddev *mddev, int mode,
-                      struct block_device *bdev)
+static int do_md_stop(struct mddev *mddev, int mode)
 {
     struct gendisk *disk = mddev->gendisk;
     struct md_rdev *rdev;
@@ -6445,12 +6482,9 @@ static int do_md_stop(struct mddev *mddev, int mode,

     stop_sync_thread(mddev, true, false);

-    mutex_lock(&mddev->open_mutex);
-    if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
-        mddev->sysfs_active ||
+    if (mddev->sysfs_active ||
         test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
         pr_warn("md: %s still in use.\n",mdname(mddev));
-        mutex_unlock(&mddev->open_mutex);
         if (did_freeze) {
             clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
             set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -6472,13 +6506,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
             sysfs_unlink_rdev(mddev, rdev);

         set_capacity_and_notify(disk, 0);
-        mutex_unlock(&mddev->open_mutex);
         mddev->changed = 1;

         if (!md_is_rdwr(mddev))
             mddev->ro = MD_RDWR;
-    } else
-        mutex_unlock(&mddev->open_mutex);
+    }
     /*
      * Free resources if final stop
      */
@@ -6524,7 +6556,7 @@ static void autorun_array(struct mddev *mddev)
     err = do_md_run(mddev);
     if (err) {
         pr_warn("md: do_md_run() returned %d\n", err);
-        do_md_stop(mddev, 0, NULL);
+        do_md_stop(mddev, 0);
     }
 }
@@ -7522,16 +7554,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
     return 0;
 }

-static inline bool md_ioctl_valid(unsigned int cmd)
+static inline int md_ioctl_valid(unsigned int cmd)
 {
     switch (cmd) {
-    case ADD_NEW_DISK:
     case GET_ARRAY_INFO:
-    case GET_BITMAP_FILE:
     case GET_DISK_INFO:
+    case RAID_VERSION:
+        return 0;
+    case ADD_NEW_DISK:
+    case GET_BITMAP_FILE:
     case HOT_ADD_DISK:
     case HOT_REMOVE_DISK:
-    case RAID_VERSION:
     case RESTART_ARRAY_RW:
     case RUN_ARRAY:
     case SET_ARRAY_INFO:
@@ -7540,9 +7573,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
     case STOP_ARRAY:
     case STOP_ARRAY_RO:
     case CLUSTERED_DISK_NACK:
-        return true;
+        if (!capable(CAP_SYS_ADMIN))
+            return -EACCES;
+        return 0;
     default:
-        return false;
+        return -ENOTTY;
     }
 }

@@ -7600,31 +7635,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
     int err = 0;
     void __user *argp = (void __user *)arg;
     struct mddev *mddev = NULL;
-    bool did_set_md_closing = false;

-    if (!md_ioctl_valid(cmd))
-        return -ENOTTY;
-
-    switch (cmd) {
-    case RAID_VERSION:
-    case GET_ARRAY_INFO:
-    case GET_DISK_INFO:
-        break;
-    default:
-        if (!capable(CAP_SYS_ADMIN))
-            return -EACCES;
-    }
+    err = md_ioctl_valid(cmd);
+    if (err)
+        return err;

     /*
      * Commands dealing with the RAID driver but not any
      * particular array:
      */
-    switch (cmd) {
-    case RAID_VERSION:
-        err = get_version(argp);
-        goto out;
-    default:;
-    }
+    if (cmd == RAID_VERSION)
+        return get_version(argp);

     /*
      * Commands creating/starting a new array:
@@ -7632,35 +7653,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,

     mddev = bdev->bd_disk->private_data;

-    if (!mddev) {
-        BUG();
-        goto out;
-    }
-
     /* Some actions do not requires the mutex */
     switch (cmd) {
     case GET_ARRAY_INFO:
         if (!mddev->raid_disks && !mddev->external)
-            err = -ENODEV;
-        else
-            err = get_array_info(mddev, argp);
-        goto out;
+            return -ENODEV;
+        return get_array_info(mddev, argp);

     case GET_DISK_INFO:
         if (!mddev->raid_disks && !mddev->external)
-            err = -ENODEV;
-        else
-            err = get_disk_info(mddev, argp);
-        goto out;
+            return -ENODEV;
+        return get_disk_info(mddev, argp);

     case SET_DISK_FAULTY:
-        err = set_disk_faulty(mddev, new_decode_dev(arg));
-        goto out;
+        return set_disk_faulty(mddev, new_decode_dev(arg));

     case GET_BITMAP_FILE:
-        err = get_bitmap_file(mddev, argp);
-        goto out;
-
+        return get_bitmap_file(mddev, argp);
     }

     if (cmd == HOT_REMOVE_DISK)
@@ -7673,20 +7682,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
         /* Need to flush page cache, and ensure no-one else opens
          * and writes
          */
-        mutex_lock(&mddev->open_mutex);
-        if (mddev->pers && atomic_read(&mddev->openers) > 1) {
-            mutex_unlock(&mddev->open_mutex);
-            err = -EBUSY;
-            goto out;
-        }
-        if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
-            mutex_unlock(&mddev->open_mutex);
-            err = -EBUSY;
-            goto out;
-        }
-        did_set_md_closing = true;
-        mutex_unlock(&mddev->open_mutex);
-        sync_blockdev(bdev);
+        err = mddev_set_closing_and_sync_blockdev(mddev, 1);
+        if (err)
+            return err;
     }

     if (!md_is_rdwr(mddev))
@@ -7727,11 +7725,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
         goto unlock;

     case STOP_ARRAY:
-        err = do_md_stop(mddev, 0, bdev);
+        err = do_md_stop(mddev, 0);
         goto unlock;

     case STOP_ARRAY_RO:
-        err = md_set_readonly(mddev, bdev);
+        if (mddev->pers)
+            err = md_set_readonly(mddev);
         goto unlock;

     case HOT_REMOVE_DISK:
@@ -7826,7 +7825,7 @@ unlock:
         mddev_unlock(mddev);

 out:
-    if(did_set_md_closing)
+    if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
         clear_bit(MD_CLOSING, &mddev->flags);
     return err;
 }
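The net effect on md_ioctl() is that validity, permission, and applicability checks collapse into one early call whose -errno is simply propagated. The general shape of the idiom, as a hedged standalone sketch (cmd_valid and the command values are hypothetical, not the md ioctl numbers):

#include <errno.h>
#include <stdbool.h>

/* A validator that folds the permission check in and returns 0 or a
 * negative errno, so the caller can just propagate the result. */
static int cmd_valid(unsigned int cmd, bool admin)
{
    switch (cmd) {
    case 0x01:                  /* read-only queries: anyone */
    case 0x02:
        return 0;
    case 0x10:                  /* state-changing commands */
    case 0x11:
        return admin ? 0 : -EACCES;
    default:
        return -ENOTTY;         /* not our ioctl at all */
    }
}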
drivers/md/md.h

@@ -207,6 +207,7 @@ enum flag_bits {
                  * check if there is collision between raid1
                  * serial bios.
                  */
+    Nonrot,          /* non-rotational device (SSD) */
 };

 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -222,6 +223,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
     }
     return 0;
 }
+
+static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
+                                    int sectors)
+{
+    sector_t first_bad;
+    int bad_sectors;
+
+    return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
+}
+
 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
                               int is_new);
 extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
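rdev_has_badblock() exists for the many callers that only care whether the range overlaps a bad block, not where; it absorbs the two out-parameters and keeps is_badblock()'s tri-state return (0 for no overlap, positive for an overlap, and, as the raid10 call site comparing against "< 0" later in this series suggests, negative for an unacknowledged bad block). A hedged sketch of the two consumption patterns; range_is_clean() and range_has_unacked_bad() are illustrative wrappers, not kernel code:

static inline bool range_is_clean(struct md_rdev *rdev, sector_t s, int sectors)
{
    /* 0 means no bad block overlaps the range */
    return rdev_has_badblock(rdev, s, sectors) == 0;
}

static inline bool range_has_unacked_bad(struct md_rdev *rdev, sector_t s,
                                         int sectors)
{
    /* negative means an unacknowledged bad block overlaps */
    return rdev_has_badblock(rdev, s, sectors) < 0;
}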
drivers/md/raid1-10.c

@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)

     return false;
 }
+
+/**
+ * raid1_check_read_range() - check a given read range for bad blocks,
+ * available read length is returned;
+ * @rdev: the rdev to read;
+ * @this_sector: read position;
+ * @len: read length;
+ *
+ * helper function for read_balance()
+ *
+ * 1) If there are no bad blocks in the range, @len is returned;
+ * 2) If the range are all bad blocks, 0 is returned;
+ * 3) If there are partial bad blocks:
+ *  - If the bad block range starts after @this_sector, the length of first
+ *  good region is returned;
+ *  - If the bad block range starts before @this_sector, 0 is returned and
+ *  the @len is updated to the offset into the region before we get to the
+ *  good blocks;
+ */
+static inline int raid1_check_read_range(struct md_rdev *rdev,
+                                         sector_t this_sector, int *len)
+{
+    sector_t first_bad;
+    int bad_sectors;
+
+    /* no bad block overlap */
+    if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
+        return *len;
+
+    /*
+     * bad block range starts offset into our range so we can return the
+     * number of sectors before the bad blocks start.
+     */
+    if (first_bad > this_sector)
+        return first_bad - this_sector;
+
+    /* read range is fully consumed by bad blocks. */
+    if (this_sector + *len <= first_bad + bad_sectors)
+        return 0;
+
+    /*
+     * final case, bad block range starts before or at the start of our
+     * range but does not cover our entire range so we still return 0 but
+     * update the length with the number of sectors before we get to the
+     * good ones.
+     */
+    *len = first_bad + bad_sectors - this_sector;
+    return 0;
+}
+
+/*
+ * Check if read should choose the first rdev.
+ *
+ * Balance on the whole device if no resync is going on (recovery is ok) or
+ * below the resync window. Otherwise, take the first readable disk.
+ */
+static inline bool raid1_should_read_first(struct mddev *mddev,
+                                           sector_t this_sector, int len)
+{
+    if ((mddev->recovery_cp < this_sector + len))
+        return true;
+
+    if (mddev_is_clustered(mddev) &&
+        md_cluster_ops->area_resyncing(mddev, READ, this_sector,
+                                       this_sector + len))
+        return true;
+
+    return false;
+}
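A worked check of raid1_check_read_range()'s three documented cases, as a userspace model (the bad-block table is reduced to a single [first_bad, first_bad + bad_sectors) range for the example; the numbers are made up):

#include <assert.h>

/* bad_sectors == 0 stands in for "no bad blocks", where the kernel would
 * instead query the rdev's badblocks table via is_badblock(). */
static int check_range(long long first_bad, int bad_sectors,
                       long long this_sector, int *len)
{
    if (bad_sectors == 0)                       /* case 1: no overlap */
        return *len;
    if (first_bad > this_sector)                /* case 3a: good prefix */
        return first_bad - this_sector;
    if (this_sector + *len <= first_bad + bad_sectors)
        return 0;                               /* case 2: fully bad */
    *len = first_bad + bad_sectors - this_sector;   /* case 3b */
    return 0;
}

int main(void)
{
    int len = 64;
    assert(check_range(0, 0, 100, &len) == 64);     /* clean range */
    assert(check_range(120, 30, 100, &len) == 20);  /* good prefix 100..119 */
    len = 64;
    assert(check_range(90, 50, 100, &len) == 0 && len == 40);
    /* bad range [90, 140) swallows the head; caller must cover the first
     * 40 sectors from another rdev before good blocks resume at 140. */
    return 0;
}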
drivers/md/raid1.c

@@ -498,9 +498,6 @@ static void raid1_end_write_request(struct bio *bio)
          * to user-side. So if something waits for IO, then it
          * will wait for the 'master' bio.
          */
-        sector_t first_bad;
-        int bad_sectors;
-
         r1_bio->bios[mirror] = NULL;
         to_put = bio;
         /*
@@ -516,8 +513,8 @@ static void raid1_end_write_request(struct bio *bio)
             set_bit(R1BIO_Uptodate, &r1_bio->state);

         /* Maybe we can clear some bad blocks. */
-        if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-                        &first_bad, &bad_sectors) && !discard_error) {
+        if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+            !discard_error) {
             r1_bio->bios[mirror] = IO_MADE_GOOD;
             set_bit(R1BIO_MadeGood, &r1_bio->state);
         }
@@ -582,211 +579,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
     return len;
 }

-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last know head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
-{
-    const sector_t this_sector = r1_bio->sector;
-    int sectors;
-    int best_good_sectors;
-    int best_disk, best_dist_disk, best_pending_disk;
-    int has_nonrot_disk;
-    int disk;
-    sector_t best_dist;
-    unsigned int min_pending;
-    struct md_rdev *rdev;
-    int choose_first;
-    int choose_next_idle;
-
-    /*
-     * Check if we can balance. We can balance on the whole
-     * device if no resync is going on, or below the resync window.
-     * We take the first readable disk when above the resync window.
-     */
- retry:
-    sectors = r1_bio->sectors;
-    best_disk = -1;
-    best_dist_disk = -1;
-    best_dist = MaxSector;
-    best_pending_disk = -1;
-    min_pending = UINT_MAX;
-    best_good_sectors = 0;
-    has_nonrot_disk = 0;
-    choose_next_idle = 0;
-    clear_bit(R1BIO_FailFast, &r1_bio->state);
-
-    if ((conf->mddev->recovery_cp < this_sector + sectors) ||
-        (mddev_is_clustered(conf->mddev) &&
-         md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
-                this_sector + sectors)))
-        choose_first = 1;
-    else
-        choose_first = 0;
-
-    for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
-        sector_t dist;
-        sector_t first_bad;
-        int bad_sectors;
-        unsigned int pending;
-        bool nonrot;
-
-        rdev = conf->mirrors[disk].rdev;
-        if (r1_bio->bios[disk] == IO_BLOCKED
-            || rdev == NULL
-            || test_bit(Faulty, &rdev->flags))
-            continue;
-        if (!test_bit(In_sync, &rdev->flags) &&
-            rdev->recovery_offset < this_sector + sectors)
-            continue;
-        if (test_bit(WriteMostly, &rdev->flags)) {
-            /* Don't balance among write-mostly, just
-             * use the first as a last resort */
-            if (best_dist_disk < 0) {
-                if (is_badblock(rdev, this_sector, sectors,
-                                &first_bad, &bad_sectors)) {
-                    if (first_bad <= this_sector)
-                        /* Cannot use this */
-                        continue;
-                    best_good_sectors = first_bad - this_sector;
-                } else
-                    best_good_sectors = sectors;
-                best_dist_disk = disk;
-                best_pending_disk = disk;
-            }
-            continue;
-        }
-        /* This is a reasonable device to use.  It might
-         * even be best.
-         */
-        if (is_badblock(rdev, this_sector, sectors,
-                        &first_bad, &bad_sectors)) {
-            if (best_dist < MaxSector)
-                /* already have a better device */
-                continue;
-            if (first_bad <= this_sector) {
-                /* cannot read here. If this is the 'primary'
-                 * device, then we must not read beyond
-                 * bad_sectors from another device..
-                 */
-                bad_sectors -= (this_sector - first_bad);
-                if (choose_first && sectors > bad_sectors)
-                    sectors = bad_sectors;
-                if (best_good_sectors > sectors)
-                    best_good_sectors = sectors;
-
-            } else {
-                sector_t good_sectors = first_bad - this_sector;
-                if (good_sectors > best_good_sectors) {
-                    best_good_sectors = good_sectors;
-                    best_disk = disk;
-                }
-                if (choose_first)
-                    break;
-            }
-            continue;
-        } else {
-            if ((sectors > best_good_sectors) && (best_disk >= 0))
-                best_disk = -1;
-            best_good_sectors = sectors;
-        }
-
-        if (best_disk >= 0)
-            /* At least two disks to choose from so failfast is OK */
-            set_bit(R1BIO_FailFast, &r1_bio->state);
-
-        nonrot = bdev_nonrot(rdev->bdev);
-        has_nonrot_disk |= nonrot;
-        pending = atomic_read(&rdev->nr_pending);
-        dist = abs(this_sector - conf->mirrors[disk].head_position);
-        if (choose_first) {
-            best_disk = disk;
-            break;
-        }
-        /* Don't change to another disk for sequential reads */
-        if (conf->mirrors[disk].next_seq_sect == this_sector
-            || dist == 0) {
-            int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
-            struct raid1_info *mirror = &conf->mirrors[disk];
-
-            best_disk = disk;
-            /*
-             * If buffered sequential IO size exceeds optimal
-             * iosize, check if there is idle disk. If yes, choose
-             * the idle disk. read_balance could already choose an
-             * idle disk before noticing it's a sequential IO in
-             * this disk. This doesn't matter because this disk
-             * will idle, next time it will be utilized after the
-             * first disk has IO size exceeds optimal iosize. In
-             * this way, iosize of the first disk will be optimal
-             * iosize at least. iosize of the second disk might be
-             * small, but not a big deal since when the second disk
-             * starts IO, the first disk is likely still busy.
-             */
-            if (nonrot && opt_iosize > 0 &&
-                mirror->seq_start != MaxSector &&
-                mirror->next_seq_sect > opt_iosize &&
-                mirror->next_seq_sect - opt_iosize >=
-                mirror->seq_start) {
-                choose_next_idle = 1;
-                continue;
-            }
-            break;
-        }
-
-        if (choose_next_idle)
-            continue;
-
-        if (min_pending > pending) {
-            min_pending = pending;
-            best_pending_disk = disk;
-        }
-
-        if (dist < best_dist) {
-            best_dist = dist;
-            best_dist_disk = disk;
-        }
-    }
-
-    /*
-     * If all disks are rotational, choose the closest disk. If any disk is
-     * non-rotational, choose the disk with less pending request even the
-     * disk is rotational, which might/might not be optimal for raids with
-     * mixed ratation/non-rotational disks depending on workload.
-     */
-    if (best_disk == -1) {
-        if (has_nonrot_disk || min_pending == 0)
-            best_disk = best_pending_disk;
-        else
-            best_disk = best_dist_disk;
-    }
-
-    if (best_disk >= 0) {
-        rdev = conf->mirrors[best_disk].rdev;
-        if (!rdev)
-            goto retry;
-        atomic_inc(&rdev->nr_pending);
-        sectors = best_good_sectors;
-
-        if (conf->mirrors[best_disk].next_seq_sect != this_sector)
-            conf->mirrors[best_disk].seq_start = this_sector;
-
-        conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
-    }
-    *max_sectors = sectors;
-
-    return best_disk;
+static void update_read_sectors(struct r1conf *conf, int disk,
+                                sector_t this_sector, int len)
+{
+    struct raid1_info *info = &conf->mirrors[disk];
+
+    atomic_inc(&info->rdev->nr_pending);
+    if (info->next_seq_sect != this_sector)
+        info->seq_start = this_sector;
+    info->next_seq_sect = this_sector + len;
+}
+
+static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                             int *max_sectors)
+{
+    sector_t this_sector = r1_bio->sector;
+    int len = r1_bio->sectors;
+    int disk;
+
+    for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+        struct md_rdev *rdev;
+        int read_len;
+
+        if (r1_bio->bios[disk] == IO_BLOCKED)
+            continue;
+
+        rdev = conf->mirrors[disk].rdev;
+        if (!rdev || test_bit(Faulty, &rdev->flags))
+            continue;
+
+        /* choose the first disk even if it has some bad blocks. */
+        read_len = raid1_check_read_range(rdev, this_sector, &len);
+        if (read_len > 0) {
+            update_read_sectors(conf, disk, this_sector, read_len);
+            *max_sectors = read_len;
+            return disk;
+        }
+    }
+
+    return -1;
+}
+
+static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                          int *max_sectors)
+{
+    sector_t this_sector = r1_bio->sector;
+    int best_disk = -1;
+    int best_len = 0;
+    int disk;
+
+    for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+        struct md_rdev *rdev;
+        int len;
+        int read_len;
+
+        if (r1_bio->bios[disk] == IO_BLOCKED)
+            continue;
+
+        rdev = conf->mirrors[disk].rdev;
+        if (!rdev || test_bit(Faulty, &rdev->flags) ||
+            test_bit(WriteMostly, &rdev->flags))
+            continue;
+
+        /* keep track of the disk with the most readable sectors. */
+        len = r1_bio->sectors;
+        read_len = raid1_check_read_range(rdev, this_sector, &len);
+        if (read_len > best_len) {
+            best_disk = disk;
+            best_len = read_len;
+        }
+    }
+
+    if (best_disk != -1) {
+        *max_sectors = best_len;
+        update_read_sectors(conf, best_disk, this_sector, best_len);
+    }
+
+    return best_disk;
+}
+
+static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+                            int *max_sectors)
+{
+    sector_t this_sector = r1_bio->sector;
+    int bb_disk = -1;
+    int bb_read_len = 0;
+    int disk;
+
+    for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+        struct md_rdev *rdev;
+        int len;
+        int read_len;
+
+        if (r1_bio->bios[disk] == IO_BLOCKED)
+            continue;
+
+        rdev = conf->mirrors[disk].rdev;
+        if (!rdev || test_bit(Faulty, &rdev->flags) ||
+            !test_bit(WriteMostly, &rdev->flags))
+            continue;
+
+        /* there are no bad blocks, we can use this disk */
+        len = r1_bio->sectors;
+        read_len = raid1_check_read_range(rdev, this_sector, &len);
+        if (read_len == r1_bio->sectors) {
+            update_read_sectors(conf, disk, this_sector, read_len);
+            return disk;
+        }
+
+        /*
+         * there are partial bad blocks, choose the rdev with largest
+         * read length.
+         */
+        if (read_len > bb_read_len) {
+            bb_disk = disk;
+            bb_read_len = read_len;
+        }
+    }
+
+    if (bb_disk != -1) {
+        *max_sectors = bb_read_len;
+        update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
+    }
+
+    return bb_disk;
+}
+
+static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
+{
+    /* TODO: address issues with this check and concurrency. */
+    return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
+           conf->mirrors[disk].head_position == r1_bio->sector;
+}
+
+/*
+ * If buffered sequential IO size exceeds optimal iosize, check if there is idle
+ * disk. If yes, choose the idle disk.
+ */
+static bool should_choose_next(struct r1conf *conf, int disk)
+{
+    struct raid1_info *mirror = &conf->mirrors[disk];
+    int opt_iosize;
+
+    if (!test_bit(Nonrot, &mirror->rdev->flags))
+        return false;
+
+    opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
+    return opt_iosize > 0 && mirror->seq_start != MaxSector &&
+           mirror->next_seq_sect > opt_iosize &&
+           mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
+}
+
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+    if (!rdev || test_bit(Faulty, &rdev->flags))
+        return false;
+
+    /* still in recovery */
+    if (!test_bit(In_sync, &rdev->flags) &&
+        rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+        return false;
+
+    /* don't read from slow disk unless have to */
+    if (test_bit(WriteMostly, &rdev->flags))
+        return false;
+
+    /* don't split IO for bad blocks unless have to */
+    if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+        return false;
+
+    return true;
+}
+
+struct read_balance_ctl {
+    sector_t closest_dist;
+    int closest_dist_disk;
+    int min_pending;
+    int min_pending_disk;
+    int sequential_disk;
+    int readable_disks;
+};
+
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+    int disk;
+    struct read_balance_ctl ctl = {
+        .closest_dist_disk  = -1,
+        .closest_dist       = MaxSector,
+        .min_pending_disk   = -1,
+        .min_pending        = UINT_MAX,
+        .sequential_disk    = -1,
+    };
+
+    for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+        struct md_rdev *rdev;
+        sector_t dist;
+        unsigned int pending;
+
+        if (r1_bio->bios[disk] == IO_BLOCKED)
+            continue;
+
+        rdev = conf->mirrors[disk].rdev;
+        if (!rdev_readable(rdev, r1_bio))
+            continue;
+
+        /* At least two disks to choose from so failfast is OK */
+        if (ctl.readable_disks++ == 1)
+            set_bit(R1BIO_FailFast, &r1_bio->state);
+
+        pending = atomic_read(&rdev->nr_pending);
+        dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
+        /* Don't change to another disk for sequential reads */
+        if (is_sequential(conf, disk, r1_bio)) {
+            if (!should_choose_next(conf, disk))
+                return disk;
+
+            /*
+             * Add 'pending' to avoid choosing this disk if
+             * there is other idle disk.
+             */
+            pending++;
+            /*
+             * If there is no other idle disk, this disk
+             * will be chosen.
+             */
+            ctl.sequential_disk = disk;
+        }
+
+        if (ctl.min_pending > pending) {
+            ctl.min_pending = pending;
+            ctl.min_pending_disk = disk;
+        }
+
+        if (ctl.closest_dist > dist) {
+            ctl.closest_dist = dist;
+            ctl.closest_dist_disk = disk;
+        }
+    }
+
+    /*
+     * sequential IO size exceeds optimal iosize, however, there is no other
+     * idle disk, so choose the sequential disk.
+     */
+    if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+        return ctl.sequential_disk;
+
+    /*
+     * If all disks are rotational, choose the closest disk. If any disk is
+     * non-rotational, choose the disk with less pending request even the
+     * disk is rotational, which might/might not be optimal for raids with
+     * mixed ratation/non-rotational disks depending on workload.
+     */
+    if (ctl.min_pending_disk != -1 &&
+        (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+        return ctl.min_pending_disk;
+    else
+        return ctl.closest_dist_disk;
+}
+
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ *    has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop through all disks and skipping slow
+ *    disks and disks with bad blocks for now. Only pay attention to key disk
+ *    choice.
+ *
+ * 3) If we've made it this far, now look for disks with bad blocks and choose
+ *    the one with most number of sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk even
+ *    if it is write mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+                        int *max_sectors)
+{
+    int disk;
+
+    clear_bit(R1BIO_FailFast, &r1_bio->state);
+
+    if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+                                r1_bio->sectors))
+        return choose_first_rdev(conf, r1_bio, max_sectors);
+
+    disk = choose_best_rdev(conf, r1_bio);
+    if (disk >= 0) {
+        *max_sectors = r1_bio->sectors;
+        update_read_sectors(conf, disk, r1_bio->sector,
+                            r1_bio->sectors);
+        return disk;
+    }
+
+    /*
+     * If we are here it means we didn't find a perfectly good disk so
+     * now spend a bit more time trying to find one with the most good
+     * sectors.
+     */
+    disk = choose_bb_rdev(conf, r1_bio, max_sectors);
+    if (disk >= 0)
+        return disk;
+
+    return choose_slow_rdev(conf, r1_bio, max_sectors);
 }

 static void wake_up_barrier(struct r1conf *conf)
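The "choose next idle" heuristic that the refactor isolates in should_choose_next() is easiest to see with numbers. A userspace model of just that predicate, with asserts over made-up values (should_switch is an illustrative name, and the kernel derives nonrot and opt_iosize from the rdev rather than taking them as parameters):

#include <assert.h>
#include <stdbool.h>

static bool should_switch(long long seq_start, long long next_seq_sect,
                          int opt_iosize, bool nonrot)
{
    return nonrot && opt_iosize > 0 &&
           next_seq_sect > opt_iosize &&
           next_seq_sect - opt_iosize >= seq_start;
}

int main(void)
{
    /* 512 KiB optimal IO = 1024 sectors; a buffered sequential run from
     * sector 0 that has advanced to 1536 already exceeds it, so the next
     * read may be handed to an idle mirror. */
    assert(should_switch(0, 1536, 1024, true));
    /* a run of only 512 sectors has not, so stay on this mirror */
    assert(!should_switch(0, 512, 1024, true));
    return 0;
}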
@@ -1760,6 +1858,52 @@ static int raid1_spare_active(struct mddev *mddev)
     return count;
 }

+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
+                           bool replacement)
+{
+    struct raid1_info *info = conf->mirrors + disk;
+
+    if (replacement)
+        info += conf->raid_disks;
+
+    if (info->rdev)
+        return false;
+
+    if (bdev_nonrot(rdev->bdev)) {
+        set_bit(Nonrot, &rdev->flags);
+        WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+    }
+
+    rdev->raid_disk = disk;
+    info->head_position = 0;
+    info->seq_start = MaxSector;
+    WRITE_ONCE(info->rdev, rdev);
+
+    return true;
+}
+
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+    struct raid1_info *info = conf->mirrors + disk;
+    struct md_rdev *rdev = info->rdev;
+
+    if (!rdev || test_bit(In_sync, &rdev->flags) ||
+        atomic_read(&rdev->nr_pending))
+        return false;
+
+    /* Only remove non-faulty devices if recovery is not possible. */
+    if (!test_bit(Faulty, &rdev->flags) &&
+        rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+        rdev->mddev->degraded < conf->raid_disks)
+        return false;
+
+    if (test_and_clear_bit(Nonrot, &rdev->flags))
+        WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
+    WRITE_ONCE(info->rdev, NULL);
+    return true;
+}
+
 static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
     struct r1conf *conf = mddev->private;
@@ -1795,15 +1939,13 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
             disk_stack_limits(mddev->gendisk, rdev->bdev,
                               rdev->data_offset << 9);

-            p->head_position = 0;
-            rdev->raid_disk = mirror;
+            raid1_add_conf(conf, rdev, mirror, false);
             err = 0;
             /* As all devices are equivalent, we don't need a full recovery
              * if this was recently any drive of the array
              */
             if (rdev->saved_raid_disk < 0)
                 conf->fullsync = 1;
-            WRITE_ONCE(p->rdev, rdev);
             break;
         }
         if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1813,13 +1955,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)

     if (err && repl_slot >= 0) {
         /* Add this device as a replacement */
-        p = conf->mirrors + repl_slot;
         clear_bit(In_sync, &rdev->flags);
         set_bit(Replacement, &rdev->flags);
-        rdev->raid_disk = repl_slot;
+        raid1_add_conf(conf, rdev, repl_slot, true);
         err = 0;
         conf->fullsync = 1;
-        WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
     }

     print_conf(conf);
@@ -1836,27 +1976,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
     if (unlikely(number >= conf->raid_disks))
         goto abort;

-    if (rdev != p->rdev)
-        p = conf->mirrors + conf->raid_disks + number;
+    if (rdev != p->rdev) {
+        number += conf->raid_disks;
+        p = conf->mirrors + number;
+    }

     print_conf(conf);
     if (rdev == p->rdev) {
-        if (test_bit(In_sync, &rdev->flags) ||
-            atomic_read(&rdev->nr_pending)) {
-            err = -EBUSY;
-            goto abort;
-        }
-        /* Only remove non-faulty devices if recovery
-         * is not possible.
-         */
-        if (!test_bit(Faulty, &rdev->flags) &&
-            mddev->recovery_disabled != conf->recovery_disabled &&
-            mddev->degraded < conf->raid_disks) {
+        if (!raid1_remove_conf(conf, number)) {
             err = -EBUSY;
             goto abort;
         }
-        WRITE_ONCE(p->rdev, NULL);
-        if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+        if (number < conf->raid_disks &&
+            conf->mirrors[conf->raid_disks + number].rdev) {
             /* We just removed a device that is being replaced.
              * Move down the replacement.  We drain all IO before
              * doing this to avoid confusion.
@@ -1944,8 +2077,6 @@ static void end_sync_write(struct bio *bio)
     struct r1bio *r1_bio = get_resync_r1bio(bio);
     struct mddev *mddev = r1_bio->mddev;
     struct r1conf *conf = mddev->private;
-    sector_t first_bad;
-    int bad_sectors;
     struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;

     if (!uptodate) {
@@ -1955,14 +2086,11 @@ static void end_sync_write(struct bio *bio)
             set_bit(MD_RECOVERY_NEEDED, &
                 mddev->recovery);
         set_bit(R1BIO_WriteError, &r1_bio->state);
-    } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
-                           &first_bad, &bad_sectors) &&
-               !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
-                            r1_bio->sector,
-                            r1_bio->sectors,
-                            &first_bad, &bad_sectors)
-        )
+    } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+               !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+                                  r1_bio->sector, r1_bio->sectors)) {
         set_bit(R1BIO_MadeGood, &r1_bio->state);
+    }

     put_sync_write_buf(r1_bio, uptodate);
 }
@@ -2279,16 +2407,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
             s = PAGE_SIZE >> 9;

         do {
-            sector_t first_bad;
-            int bad_sectors;
-
             rdev = conf->mirrors[d].rdev;
             if (rdev &&
                 (test_bit(In_sync, &rdev->flags) ||
                  (!test_bit(Faulty, &rdev->flags) &&
                   rdev->recovery_offset >= sect + s)) &&
-                is_badblock(rdev, sect, s,
-                            &first_bad, &bad_sectors) == 0) {
+                rdev_has_badblock(rdev, sect, s) == 0) {
                 atomic_inc(&rdev->nr_pending);
                 if (sync_page_io(rdev, sect, s<<9,
                          conf->tmppage, REQ_OP_READ, false))
@@ -3006,23 +3130,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)

     err = -EINVAL;
     spin_lock_init(&conf->device_lock);
+    conf->raid_disks = mddev->raid_disks;
     rdev_for_each(rdev, mddev) {
         int disk_idx = rdev->raid_disk;
-        if (disk_idx >= mddev->raid_disks
-            || disk_idx < 0)
+
+        if (disk_idx >= conf->raid_disks || disk_idx < 0)
             continue;
-        if (test_bit(Replacement, &rdev->flags))
-            disk = conf->mirrors + mddev->raid_disks + disk_idx;
-        else
-            disk = conf->mirrors + disk_idx;

-        if (disk->rdev)
+        if (!raid1_add_conf(conf, rdev, disk_idx,
+                            test_bit(Replacement, &rdev->flags)))
             goto abort;
-        disk->rdev = rdev;
-        disk->head_position = 0;
-        disk->seq_start = MaxSector;
     }
-    conf->raid_disks = mddev->raid_disks;
     conf->mddev = mddev;
     INIT_LIST_HEAD(&conf->retry_list);
     INIT_LIST_HEAD(&conf->bio_end_io_list);
drivers/md/raid1.h

@@ -71,6 +71,7 @@ struct r1conf {
                      * allow for replacements.
                      */
     int            raid_disks;
+    int            nonrot_disks;

     spinlock_t        device_lock;
drivers/md/raid10.c

@@ -518,11 +518,7 @@ static void raid10_end_write_request(struct bio *bio)
          * The 'master' represents the composite IO operation to
          * user-side. So if something waits for IO, then it will
          * wait for the 'master' bio.
          */
-        sector_t first_bad;
-        int bad_sectors;
-
         /*
-         *
          * Do not set R10BIO_Uptodate if the current device is
          * rebuilding or Faulty. This is because we cannot use
          * such device for properly reading the data back (we could
@@ -535,10 +531,9 @@ static void raid10_end_write_request(struct bio *bio)
             set_bit(R10BIO_Uptodate, &r10_bio->state);

         /* Maybe we can clear some bad blocks. */
-        if (is_badblock(rdev,
-                        r10_bio->devs[slot].addr,
-                        r10_bio->sectors,
-                        &first_bad, &bad_sectors) && !discard_error) {
+        if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+                              r10_bio->sectors) &&
+            !discard_error) {
             bio_put(bio);
             if (repl)
                 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@@ -753,17 +748,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
     best_good_sectors = 0;
     do_balance = 1;
     clear_bit(R10BIO_FailFast, &r10_bio->state);
-    /*
-     * Check if we can balance. We can balance on the whole
-     * device if no resync is going on (recovery is ok), or below
-     * the resync window. We take the first readable disk when
-     * above the resync window.
-     */
-    if ((conf->mddev->recovery_cp < MaxSector
-         && (this_sector + sectors >= conf->next_resync)) ||
-        (mddev_is_clustered(conf->mddev) &&
-         md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
-                                        this_sector + sectors)))
+
+    if (raid1_should_read_first(conf->mddev, this_sector, sectors))
         do_balance = 0;

     for (slot = 0; slot < conf->copies ; slot++) {
@@ -1330,10 +1316,7 @@ retry_wait:
     }

     if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
-        sector_t first_bad;
         sector_t dev_sector = r10_bio->devs[i].addr;
-        int bad_sectors;
-        int is_bad;

         /*
          * Discard request doesn't care the write result
@@ -1342,9 +1325,8 @@ retry_wait:
         if (!r10_bio->sectors)
             continue;

-        is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
-                             &first_bad, &bad_sectors);
-        if (is_bad < 0) {
+        if (rdev_has_badblock(rdev, dev_sector,
+                              r10_bio->sectors) < 0) {
             /*
              * Mustn't write here until the bad block
              * is acknowledged
@@ -2290,8 +2272,6 @@ static void end_sync_write(struct bio *bio)
     struct mddev *mddev = r10_bio->mddev;
     struct r10conf *conf = mddev->private;
     int d;
-    sector_t first_bad;
-    int bad_sectors;
     int slot;
     int repl;
     struct md_rdev *rdev = NULL;
@@ -2312,11 +2292,10 @@ static void end_sync_write(struct bio *bio)
                 &rdev->mddev->recovery);
             set_bit(R10BIO_WriteError, &r10_bio->state);
         }
-    } else if (is_badblock(rdev,
-                           r10_bio->devs[slot].addr,
-                           r10_bio->sectors,
-                           &first_bad, &bad_sectors))
+    } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+                                 r10_bio->sectors)) {
         set_bit(R10BIO_MadeGood, &r10_bio->state);
+    }

     rdev_dec_pending(rdev, mddev);

@@ -2597,11 +2576,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
                             int sectors, struct page *page, enum req_op op)
 {
-    sector_t first_bad;
-    int bad_sectors;
-
-    if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
-        && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
+    if (rdev_has_badblock(rdev, sector, sectors) &&
+        (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
         return -1;
     if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
         /* success */
@@ -2658,16 +2634,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
             s = PAGE_SIZE >> 9;

         do {
-            sector_t first_bad;
-            int bad_sectors;
-
             d = r10_bio->devs[sl].devnum;
             rdev = conf->mirrors[d].rdev;
             if (rdev &&
                 test_bit(In_sync, &rdev->flags) &&
                 !test_bit(Faulty, &rdev->flags) &&
-                is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
-                            &first_bad, &bad_sectors) == 0) {
+                rdev_has_badblock(rdev,
+                                  r10_bio->devs[sl].addr + sect,
+                                  s) == 0) {
                 atomic_inc(&rdev->nr_pending);
                 success = sync_page_io(rdev,
                                        r10_bio->devs[sl].addr +
drivers/md/raid5.c

@@ -1210,10 +1210,8 @@ again:
          */
         while (op_is_write(op) && rdev &&
                test_bit(WriteErrorSeen, &rdev->flags)) {
-            sector_t first_bad;
-            int bad_sectors;
-            int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                                  &first_bad, &bad_sectors);
+            int bad = rdev_has_badblock(rdev, sh->sector,
+                                        RAID5_STRIPE_SECTORS(conf));
+
             if (!bad)
                 break;

@@ -2412,7 +2410,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
     atomic_inc(&conf->active_stripes);

     raid5_release_stripe(sh);
-    conf->max_nr_stripes++;
+    WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
     return 1;
 }

@@ -2707,7 +2705,7 @@ static int drop_one_stripe(struct r5conf *conf)
     shrink_buffers(sh);
     free_stripe(conf->slab_cache, sh);
     atomic_dec(&conf->active_stripes);
-    conf->max_nr_stripes--;
+    WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
     return 1;
 }

@@ -2855,8 +2853,6 @@ static void raid5_end_write_request(struct bio *bi)
     struct r5conf *conf = sh->raid_conf;
     int disks = sh->disks, i;
     struct md_rdev *rdev;
-    sector_t first_bad;
-    int bad_sectors;
     int replacement = 0;

     for (i = 0 ; i < disks; i++) {
@@ -2888,9 +2884,8 @@ static void raid5_end_write_request(struct bio *bi)
     if (replacement) {
         if (bi->bi_status)
             md_error(conf->mddev, rdev);
-        else if (is_badblock(rdev, sh->sector,
-                             RAID5_STRIPE_SECTORS(conf),
-                             &first_bad, &bad_sectors))
+        else if (rdev_has_badblock(rdev, sh->sector,
+                                   RAID5_STRIPE_SECTORS(conf)))
             set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
     } else {
         if (bi->bi_status) {
@@ -2900,9 +2895,8 @@ static void raid5_end_write_request(struct bio *bi)
             if (!test_and_set_bit(WantReplacement, &rdev->flags))
                 set_bit(MD_RECOVERY_NEEDED,
                     &rdev->mddev->recovery);
-        } else if (is_badblock(rdev, sh->sector,
-                               RAID5_STRIPE_SECTORS(conf),
-                               &first_bad, &bad_sectors)) {
+        } else if (rdev_has_badblock(rdev, sh->sector,
+                                     RAID5_STRIPE_SECTORS(conf))) {
             set_bit(R5_MadeGood, &sh->dev[i].flags);
             if (test_bit(R5_ReadError, &sh->dev[i].flags))
                 /* That was a successful write so make
@@ -4674,8 +4668,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
     /* Now to look around and see what can be done */
     for (i=disks; i--; ) {
         struct md_rdev *rdev;
-        sector_t first_bad;
-        int bad_sectors;
         int is_bad = 0;

         dev = &sh->dev[i];
@@ -4719,8 +4711,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
         rdev = conf->disks[i].replacement;
         if (rdev && !test_bit(Faulty, &rdev->flags) &&
             rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
-            !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                         &first_bad, &bad_sectors))
+            !rdev_has_badblock(rdev, sh->sector,
+                               RAID5_STRIPE_SECTORS(conf)))
             set_bit(R5_ReadRepl, &dev->flags);
         else {
             if (rdev && !test_bit(Faulty, &rdev->flags))
@@ -4733,8 +4725,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
         if (rdev && test_bit(Faulty, &rdev->flags))
             rdev = NULL;
         if (rdev) {
-            is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
-                                 &first_bad, &bad_sectors);
+            is_bad = rdev_has_badblock(rdev, sh->sector,
+                                       RAID5_STRIPE_SECTORS(conf));
             if (s->blocked_rdev == NULL
                 && (test_bit(Blocked, &rdev->flags)
                     || is_bad < 0)) {
@@ -5463,8 +5455,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
     struct r5conf *conf = mddev->private;
     struct bio *align_bio;
     struct md_rdev *rdev;
-    sector_t sector, end_sector, first_bad;
-    int bad_sectors, dd_idx;
+    sector_t sector, end_sector;
+    int dd_idx;
     bool did_inc;

     if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5493,8 +5485,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)

     atomic_inc(&rdev->nr_pending);

-    if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
-                    &bad_sectors)) {
+    if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
         rdev_dec_pending(rdev, mddev);
         return 0;
     }
@@ -6820,7 +6811,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
     if (size <= 16 || size > 32768)
         return -EINVAL;

-    conf->min_nr_stripes = size;
+    WRITE_ONCE(conf->min_nr_stripes, size);
     mutex_lock(&conf->cache_size_mutex);
     while (size < conf->max_nr_stripes &&
            drop_one_stripe(conf))
@@ -6832,7 +6823,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
     mutex_lock(&conf->cache_size_mutex);
     while (size > conf->max_nr_stripes)
         if (!grow_one_stripe(conf, GFP_KERNEL)) {
-            conf->min_nr_stripes = conf->max_nr_stripes;
+            WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
             result = -ENOMEM;
             break;
         }
@@ -7388,11 +7379,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
                        struct shrink_control *sc)
 {
     struct r5conf *conf = shrink->private_data;
+    int max_stripes = READ_ONCE(conf->max_nr_stripes);
+    int min_stripes = READ_ONCE(conf->min_nr_stripes);

-    if (conf->max_nr_stripes < conf->min_nr_stripes)
+    if (max_stripes < min_stripes)
         /* unlikely, but not impossible */
         return 0;
-    return conf->max_nr_stripes - conf->min_nr_stripes;
+    return max_stripes - min_stripes;
 }

 static struct r5conf *setup_conf(struct mddev *mddev)
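The raid5_cache_count() fix addresses an atomicity violation: the shrinker callback runs without conf->cache_size_mutex, so re-reading max_nr_stripes and min_nr_stripes directly could observe one field from before a concurrent resize and the other from after it, yielding a bogus (even huge unsigned) count. Snapshotting both once with READ_ONCE, paired with WRITE_ONCE at the update sites, closes that window. A userspace model of the pattern using C11 atomics as the analogue of READ_ONCE/WRITE_ONCE (cache_model and cache_count are illustrative names):

#include <stdatomic.h>

struct cache_model {
    _Atomic int max_nr;     /* grown/shrunk under a mutex elsewhere */
    _Atomic int min_nr;     /* updated from a sysfs-like control path */
};

/* Shrinker-style count: snapshot each value exactly once, then compute.
 * Reading the fields twice, as the pre-fix code effectively allowed the
 * compiler to do, can mix values from either side of an update. */
static long cache_count(struct cache_model *c)
{
    int max = atomic_load_explicit(&c->max_nr, memory_order_relaxed);
    int min = atomic_load_explicit(&c->min_nr, memory_order_relaxed);

    if (max < min)
        return 0;   /* unlikely, but not impossible */
    return max - min;
}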