From 1bbe254e4336c0944dd4fb6f0b8c9665b81de50f Mon Sep 17 00:00:00 2001 From: Denis Plotnikov Date: Mon, 25 Sep 2023 15:59:40 +0300 Subject: [PATCH 1/2] md-cluster: check for timeout while a new disk adding Adding a new disk may end in a timeout, in which case the new disk is not added. Return an error in that case. Found by Linux Verification Center (linuxtesting.org) with SVACE Signed-off-by: Denis Plotnikov Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20230925125940.1542506-1-den-plotnikov@yandex-team.ru --- drivers/md/md-cluster.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 1e26eb223349..8e36a0feec09 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -501,7 +501,7 @@ static void process_suspend_info(struct mddev *mddev, mddev->pers->quiesce(mddev, 0); } -static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) +static int process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) { char disk_uuid[64]; struct md_cluster_info *cinfo = mddev->cluster_info; @@ -509,6 +509,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) char raid_slot[16]; char *envp[] = {event_name, disk_uuid, raid_slot, NULL}; int len; + int res = 0; len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); @@ -517,9 +518,14 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp); - wait_for_completion_timeout(&cinfo->newdisk_completion, - NEW_DEV_TIMEOUT); + if (!wait_for_completion_timeout(&cinfo->newdisk_completion, + NEW_DEV_TIMEOUT)) { + pr_err("md-cluster(%s:%d): timeout on a new disk adding\n", + __func__, __LINE__); + res = -1; + }
clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); + return res; } @@ -594,7 +600,8 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) le64_to_cpu(msg->high)); break; case NEWDISK: - process_add_new_disk(mddev, msg); + if (process_add_new_disk(mddev, msg)) + ret = -1; break; case REMOVE: process_remove_disk(mddev, msg); From 78b7b13f07a3ca16c03aa8bf63f51d6780e8e9e1 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 16 Oct 2023 18:02:40 +0800 Subject: [PATCH 2/2] md: cleanup pers->prepare_suspend() pers->prepare_suspend() is not used anymore and can be removed. Reverts the following three commits: - commit 431e61257d63 ("md: export md_is_rdwr() and is_md_suspended()") - commit 3e00777d5157 ("md: add a new api prepare_suspend() in md_personality") - commit 868bba54a3bc ("md/raid5: fix a deadlock in the case that reshape is interrupted") Signed-off-by: Yu Kuai Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20231016100240.540474-1-yukuai1@huaweicloud.com --- drivers/md/md.c | 17 ++++++++++++++++- drivers/md/md.h | 18 ------------------ drivers/md/raid5.c | 44 +------------------------------------------- 3 files changed, 17 insertions(+), 62 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8ee079c4dc1e..09686d8db983 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -91,6 +91,18 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array.
We divide the read error @@ -333,6 +345,10 @@ EXPORT_SYMBOL_GPL(md_new_event); static LIST_HEAD(all_mddevs); static DEFINE_SPINLOCK(all_mddevs_lock); +static bool is_md_suspended(struct mddev *mddev) +{ + return percpu_ref_is_dying(&mddev->active_io); +} /* Rather than calling directly into the personality make_request function, * IO requests come here first so that we can check if the device is * being suspended pending a reconfiguration. @@ -9144,7 +9160,6 @@ void md_do_sync(struct md_thread *thread) spin_unlock(&mddev->lock); wake_up(&resync_wait); - wake_up(&mddev->sb_wait); md_wakeup_thread(mddev->thread); return; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 55d01d431418..20f3f96cf4c1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -565,23 +565,6 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static inline bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - -static inline bool is_md_suspended(struct mddev *mddev) -{ - return percpu_ref_is_dying(&mddev->active_io); -} - static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -641,7 +624,6 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); - void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. 
* 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d6de084a85e5..4207e945e8c8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5953,19 +5953,6 @@ static int add_all_stripe_bios(struct r5conf *conf, return ret; } -static bool reshape_inprogress(struct mddev *mddev) -{ - return test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && - !test_bit(MD_RECOVERY_DONE, &mddev->recovery) && - !test_bit(MD_RECOVERY_INTR, &mddev->recovery); -} - -static bool reshape_disabled(struct mddev *mddev) -{ - return is_md_suspended(mddev) || !md_is_rdwr(mddev); -} - static enum stripe_result make_stripe_request(struct mddev *mddev, struct r5conf *conf, struct stripe_request_ctx *ctx, sector_t logical_sector, struct bio *bi) @@ -5997,8 +5984,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - ret = STRIPE_SCHEDULE_AND_RETRY; - goto out; + return STRIPE_SCHEDULE_AND_RETRY; } } spin_unlock_irq(&conf->device_lock); @@ -6077,15 +6063,6 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); -out: - if (ret == STRIPE_SCHEDULE_AND_RETRY && !reshape_inprogress(mddev) && - reshape_disabled(mddev)) { - bi->bi_status = BLK_STS_IOERR; - ret = STRIPE_FAIL; - pr_err("md/raid456:%s: io failed across reshape position while reshape can't make progress.\n", - mdname(mddev)); - } - return ret; } @@ -9027,22 +9004,6 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } -static void raid5_prepare_suspend(struct mddev *mddev) -{ - struct r5conf *conf = mddev->private; - - wait_event(mddev->sb_wait, !reshape_inprogress(mddev) || - percpu_ref_is_zero(&mddev->active_io)); - if (percpu_ref_is_zero(&mddev->active_io)) - return; - - /* - * Reshape is not in 
progress, and array is suspended, io that is - * waiting for reshpape can never be done. - */ - wake_up(&conf->wait_for_overlap); -} - static struct md_personality raid6_personality = { .name = "raid6", @@ -9063,7 +9024,6 @@ static struct md_personality raid6_personality = .check_reshape = raid6_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9088,7 +9048,6 @@ static struct md_personality raid5_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, @@ -9114,7 +9073,6 @@ static struct md_personality raid4_personality = .check_reshape = raid5_check_reshape, .start_reshape = raid5_start_reshape, .finish_reshape = raid5_finish_reshape, - .prepare_suspend = raid5_prepare_suspend, .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy,