From fb73b357fb985cc652a72a41541d25915c7f9635 Mon Sep 17 00:00:00 2001 From: Mariusz Tkaczyk Date: Tue, 4 Sep 2018 15:08:30 +0200 Subject: [PATCH 01/17] raid5: block failing device if raid will be failed Currently there is an inconsistency in how member drives are failed for arrays with different RAID levels. For RAID456 it is possible to fail all of the devices, while for other RAID levels the kernel blocks removing a member drive if the operation would result in the array's FAIL state (EBUSY is returned). For example, removing the last drive from RAID1 is not possible. This kind of blocker was never implemented for raid456 and we cannot see a reason why. We tested the following patch and did not observe any regression, so do you have any comments on or reasons for the current approach, or can we send a proper patch for this? Signed-off-by: Mariusz Tkaczyk Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e4e98f47865d..4990f0319f6c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2681,6 +2681,18 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) pr_debug("raid456: error called\n"); spin_lock_irqsave(&conf->device_lock, flags); + + if (test_bit(In_sync, &rdev->flags) && + mddev->degraded == conf->max_degraded) { + /* + * Don't allow to achieve failed state + * Don't try to recover this device + */ + conf->recovery_disabled = mddev->recovery_disabled; + spin_unlock_irqrestore(&conf->device_lock, flags); + return; + } + set_bit(Faulty, &rdev->flags); clear_bit(In_sync, &rdev->flags); mddev->degraded = raid5_calc_degraded(conf); From ee37d7314a32ab6809eacc3389bad0406c69a81f Mon Sep 17 00:00:00 2001 From: Alex Wu Date: Fri, 21 Sep 2018 16:05:03 +0800 Subject: [PATCH 02/17] md/raid10: Fix raid10 replace hang when new added disk faulty [Symptom] The resync thread hangs when a newly added disk becomes faulty during replacement. [Root Cause] In raid10_sync_request(), we expect to issue a bio with callback end_sync_read(), and a bio with callback end_sync_write(). In the normal situation, we add the resyncing sectors to mddev->recovery_active when raid10_sync_request() returns, and subtract the resynced sectors from mddev->recovery_active when end_sync_write() calls end_sync_request(). If the newly added disk, which is replacing the old disk, is set faulty, there is a race condition: 1. In the first rcu protected section, the resync thread does not detect that mreplace is set faulty and passes the condition. 2. In the second rcu protected section, mreplace is set faulty. 3. However, the resync thread prepares the read object first, and only then checks the write condition. 4. It finds that mreplace is set faulty and does not prepare the write object. This causes us to add resync sectors but never subtract them. [How to Reproduce] This issue can be easily reproduced by the following steps: mdadm -C /dev/md0 --assume-clean -l 10 -n 4 /dev/sd[abcd] mdadm /dev/md0 -a /dev/sde mdadm /dev/md0 --replace /dev/sdd sleep 1 mdadm /dev/md0 -f /dev/sde [How to Fix] This issue can be fixed by using local variables to record the results of the test conditions. Once the conditions are satisfied, we can be sure that we need to issue a bio for read and a bio for write. The earlier commit 24afd80d99f8 ("md/raid10: handle recovery of replacement devices.") also checked whether bio is NULL, but left a comment saying that it is a pointless test. So we remove this dummy check.
Reported-by: Alex Chen Reviewed-by: Allen Peng Reviewed-by: BingJing Chang Signed-off-by: Alex Wu Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d6f7978b4449..749848b2c477 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3079,6 +3079,8 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, sector_t sect; int must_sync; int any_working; + int need_recover = 0; + int need_replace = 0; struct raid10_info *mirror = &conf->mirrors[i]; struct md_rdev *mrdev, *mreplace; @@ -3086,11 +3088,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, mrdev = rcu_dereference(mirror->rdev); mreplace = rcu_dereference(mirror->replacement); - if ((mrdev == NULL || - test_bit(Faulty, &mrdev->flags) || - test_bit(In_sync, &mrdev->flags)) && - (mreplace == NULL || - test_bit(Faulty, &mreplace->flags))) { + if (mrdev != NULL && + !test_bit(Faulty, &mrdev->flags) && + !test_bit(In_sync, &mrdev->flags)) + need_recover = 1; + if (mreplace != NULL && + !test_bit(Faulty, &mreplace->flags)) + need_replace = 1; + + if (!need_recover && !need_replace) { rcu_read_unlock(); continue; } @@ -3213,7 +3219,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, r10_bio->devs[1].devnum = i; r10_bio->devs[1].addr = to_addr; - if (!test_bit(In_sync, &mrdev->flags)) { + if (need_recover) { bio = r10_bio->devs[1].bio; bio->bi_next = biolist; biolist = bio; @@ -3230,16 +3236,11 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, bio = r10_bio->devs[1].repl_bio; if (bio) bio->bi_end_io = NULL; - /* Note: if mreplace != NULL, then bio + /* Note: if need_replace, then bio * cannot be NULL as r10buf_pool_alloc will * have allocated it. - * So the second test here is pointless. - * But it keeps semantic-checkers happy, and - * this comment keeps human reviewers - * happy. */ - if (mreplace == NULL || bio == NULL || - test_bit(Faulty, &mreplace->flags)) + if (!need_replace) break; bio->bi_next = biolist; biolist = bio; From d595567dc4f0c1d90685ec1e2e296e2cad2643ac Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 1 Oct 2018 18:36:36 -0700 Subject: [PATCH 03/17] MD: fix invalid stored role for a disk If we change the number of array's device after device is removed from array, then add the device back to array, we can see that device is added as active role instead of spare which we expected. Please see the below link for details: https://marc.info/?l=linux-raid&m=153736982015076&w=2 This is caused by that we prefer to use device's previous role which is recorded by saved_raid_disk, but we should respect the new number of conf->raid_disks since it could be changed after device is removed. 
Reported-by: Gioh Kim Tested-by: Gioh Kim Acked-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 63ceabb4e020..a25ebf81b266 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1774,6 +1774,10 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; + if (role >= mddev->raid_disks) { + rdev->saved_raid_disk = -1; + rdev->raid_disk = -1; + } break; } if (sb->devflags & WriteMostly1) From 059421e041eb461fb2b3e81c9adaec18ef03ca3c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 3 Oct 2018 15:04:41 +1000 Subject: [PATCH 04/17] md: allow metadata updates while suspending an array - fix Commit 35bfc52187f6 ("md: allow metadata update while suspending.") added support for allowing md_check_recovery() to still perform metadata updates while the array is entering the 'suspended' state. This is needed to allow the processes of entering the state to complete. Unfortunately, the patch doesn't really work. The test for "mddev->suspended" at the start of md_check_recovery() means that the function doesn't try to do anything at all while entering suspend. This patch moves the code of updating the metadata while suspending to *before* the test on mddev->suspended. Reported-by: Jeff Mahoney Fixes: 35bfc52187f6 ("md: allow metadata update while suspending.") Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index a25ebf81b266..62bf96daa157 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8794,6 +8794,18 @@ static void md_start_sync(struct work_struct *ws) */ void md_check_recovery(struct mddev *mddev) { + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); + } + if (mddev->suspended) return; @@ -8953,16 +8965,6 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); - } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { - /* Write superblock - thread that called mddev_suspend() - * holds reconfig_mutex for us. - */ - set_bit(MD_UPDATING_SB, &mddev->flags); - smp_mb__after_atomic(); - if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) - md_update_sb(mddev, 0); - clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); - wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); From 116d99adf59314924adae0cb691e4d289b5f2407 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 8 Oct 2018 22:16:09 +0100 Subject: [PATCH 05/17] md: remove redundant code that is no longer reachable And earlier commit removed the error label to two statements that are now never reachable. Since this code is now dead code, remove it. 
Detected by CoverityScan, CID#1462409 ("Structurally dead code") Fixes: d5d885fd514f ("md: introduce new personality funciton start()") Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/raid5-cache.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index e6e925add700..ec3a5ef7fee0 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -3151,8 +3151,6 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); return 0; - rcu_assign_pointer(conf->log, NULL); - md_unregister_thread(&log->reclaim_thread); reclaim_thread: mempool_exit(&log->meta_pool); out_mempool: From f8f83d8ffeb47041ff0937ecac6d10bcb388cd9f Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Mon, 8 Oct 2018 17:24:03 +0200 Subject: [PATCH 06/17] md/bitmap: use mddev_suspend/resume instead of ->quiesce() After 9e1cc0a54556 ("md: use mddev_suspend/resume instead of ->quiesce()") We still have similar left in bitmap functions. Replace quiesce() with mddev_suspend/resume. Also move md_bitmap_create out of mddev_suspend. and move mddev_resume after md_bitmap_destroy. as we did in set_bitmap_file. Signed-off-by: Jack Wang Reviewed-by: Gioh Kim Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 2fc8c113977f..1cd4f991792c 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2288,9 +2288,9 @@ location_store(struct mddev *mddev, const char *buf, size_t len) goto out; } if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); md_bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } mddev->bitmap_info.offset = 0; if (mddev->bitmap_info.file) { @@ -2327,8 +2327,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) mddev->bitmap_info.offset = offset; if (mddev->pers) { struct bitmap *bitmap; - mddev->pers->quiesce(mddev, 1); bitmap = md_bitmap_create(mddev, -1); + mddev_suspend(mddev); if (IS_ERR(bitmap)) rv = PTR_ERR(bitmap); else { @@ -2337,11 +2337,12 @@ location_store(struct mddev *mddev, const char *buf, size_t len) if (rv) mddev->bitmap_info.offset = 0; } - mddev->pers->quiesce(mddev, 0); if (rv) { md_bitmap_destroy(mddev); + mddev_resume(mddev); goto out; } + mddev_resume(mddev); } } } From 9e753ba9b9b405e3902d9f08aec5f2ea58a0c317 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Sun, 14 Oct 2018 17:05:07 -0700 Subject: [PATCH 07/17] MD: fix invalid stored role for a disk - try2 Commit d595567dc4f0 (MD: fix invalid stored role for a disk) broke linear hotadd. Let's only fix the role for disks in raid1/10. Based on Guoqing's original patch. 
Reported-by: kernel test robot Cc: Gioh Kim Cc: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ---- drivers/md/raid1.c | 1 + drivers/md/raid10.c | 1 + 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 62bf96daa157..4c0f3e0331d5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1774,10 +1774,6 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) } else set_bit(In_sync, &rdev->flags); rdev->raid_disk = role; - if (role >= mddev->raid_disks) { - rdev->saved_raid_disk = -1; - rdev->raid_disk = -1; - } break; } if (sb->devflags & WriteMostly1) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4e990246225e..1d54109071cc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1734,6 +1734,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) */ if (rdev->saved_raid_disk >= 0 && rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) first = last = rdev->saved_raid_disk; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 749848b2c477..a52f43b151b5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1808,6 +1808,7 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) first = last = rdev->raid_disk; if (rdev->saved_raid_disk >= first && + rdev->saved_raid_disk < conf->geo.raid_disks && conf->mirrors[rdev->saved_raid_disk].rdev == NULL) mirror = rdev->saved_raid_disk; else From afd75628608337cf427a1f9ca0e46698a74f25d8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:41 +0800 Subject: [PATCH 08/17] md-cluster/raid10: resize all the bitmaps before start reshape To support adding a disk under grow mode, we need to resize all the bitmaps of each node before the reshape, so that we can ensure all nodes have the same view of the bitmap of the clustered raid. So after the master node has resized the bitmap, it broadcasts a message to the other slave nodes, and it checks whether the size of each bitmap is the same by comparing the pages. We can only continue the reshaping after all nodes have updated their bitmap to the same size (by checking the pages); otherwise the bitmap size is reverted to the previous value. The resize_bitmaps interface and the BITMAP_RESIZE message are introduced in md-cluster.c for this purpose.
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 81 +++++++++++++++++++++++++++++++++++++++++ drivers/md/md-cluster.h | 1 + drivers/md/raid10.c | 41 +++++++++++++++++++-- 3 files changed, 120 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 0b2af6e74fc3..e223fae80da3 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -105,6 +105,7 @@ enum msg_type { RE_ADD, BITMAP_NEEDS_SYNC, CHANGE_CAPACITY, + BITMAP_RESIZE, }; struct cluster_msg { @@ -612,6 +613,11 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case BITMAP_NEEDS_SYNC: __recover_slot(mddev, le32_to_cpu(msg->slot)); break; + case BITMAP_RESIZE: + if (le64_to_cpu(msg->high) != mddev->pers->size(mddev, 0, 0)) + ret = md_bitmap_resize(mddev->bitmap, + le64_to_cpu(msg->high), 0, 0); + break; default: ret = -1; pr_warn("%s:%d Received unknown message from %d\n", @@ -1102,6 +1108,80 @@ static void metadata_update_cancel(struct mddev *mddev) unlock_comm(cinfo); } +static int update_bitmap_size(struct mddev *mddev, sector_t size) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int ret; + + cmsg.type = cpu_to_le32(BITMAP_RESIZE); + cmsg.high = cpu_to_le64(size); + ret = sendmsg(cinfo, &cmsg, 0); + if (ret) + pr_err("%s:%d: failed to send BITMAP_RESIZE message (%d)\n", + __func__, __LINE__, ret); + return ret; +} + +static int resize_bitmaps(struct mddev *mddev, sector_t newsize, sector_t oldsize) +{ + struct bitmap_counts *counts; + char str[64]; + struct dlm_lock_resource *bm_lockres; + struct bitmap *bitmap = mddev->bitmap; + unsigned long my_pages = bitmap->counts.pages; + int i, rv; + + /* + * We need to ensure all the nodes can grow to a larger + * bitmap size before make the reshaping. + */ + rv = update_bitmap_size(mddev, newsize); + if (rv) + return rv; + + for (i = 0; i < mddev->bitmap_info.nodes; i++) { + if (i == md_cluster_ops->slot_number(mddev)) + continue; + + bitmap = get_bitmap_from_slot(mddev, i); + if (IS_ERR(bitmap)) { + pr_err("can't get bitmap from slot %d\n", i); + goto out; + } + counts = &bitmap->counts; + + /* + * If we can hold the bitmap lock of one node then + * the slot is not occupied, update the pages. 
+ */ + snprintf(str, 64, "bitmap%04d", i); + bm_lockres = lockres_init(mddev, str, NULL, 1); + if (!bm_lockres) { + pr_err("Cannot initialize %s lock\n", str); + goto out; + } + bm_lockres->flags |= DLM_LKF_NOQUEUE; + rv = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + if (!rv) + counts->pages = my_pages; + lockres_free(bm_lockres); + + if (my_pages != counts->pages) + /* + * Let's revert the bitmap size if one node + * can't resize bitmap + */ + goto out; + } + + return 0; +out: + md_bitmap_free(bitmap); + update_bitmap_size(mddev, oldsize); + return -1; +} + /* * return 0 if all the bitmaps have the same sync_size */ @@ -1492,6 +1572,7 @@ static struct md_cluster_operations cluster_ops = { .remove_disk = remove_disk, .load_bitmaps = load_bitmaps, .gather_bitmaps = gather_bitmaps, + .resize_bitmaps = resize_bitmaps, .lock_all_bitmaps = lock_all_bitmaps, .unlock_all_bitmaps = unlock_all_bitmaps, .update_size = update_size, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index c0240708f443..9bd753a6a94e 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -26,6 +26,7 @@ struct md_cluster_operations { int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); void (*load_bitmaps)(struct mddev *mddev, int total_slots); int (*gather_bitmaps)(struct md_rdev *rdev); + int (*resize_bitmaps)(struct mddev *mddev, sector_t newsize, sector_t oldsize); int (*lock_all_bitmaps)(struct mddev *mddev); void (*unlock_all_bitmaps)(struct mddev *mddev); void (*update_size)(struct mddev *mddev, sector_t old_dev_sectors); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a52f43b151b5..72e52921c545 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "md.h" #include "raid10.h" @@ -4288,12 +4289,46 @@ static int raid10_start_reshape(struct mddev *mddev) spin_unlock_irq(&conf->device_lock); if (mddev->delta_disks && mddev->bitmap) { - ret = md_bitmap_resize(mddev->bitmap, - raid10_size(mddev, 0, conf->geo.raid_disks), - 0, 0); + struct mdp_superblock_1 *sb = NULL; + sector_t oldsize, newsize; + + oldsize = raid10_size(mddev, 0, 0); + newsize = raid10_size(mddev, 0, conf->geo.raid_disks); + + if (!mddev_is_clustered(mddev)) { + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); + if (ret) + goto abort; + else + goto out; + } + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk > -1 && + !test_bit(Faulty, &rdev->flags)) + sb = page_address(rdev->sb_page); + } + + /* + * some node is already performing reshape, and no need to + * call md_bitmap_resize again since it should be called when + * receiving BITMAP_RESIZE msg + */ + if ((sb && (le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) || (oldsize == newsize)) + goto out; + + ret = md_bitmap_resize(mddev->bitmap, newsize, 0, 0); if (ret) goto abort; + + ret = md_cluster_ops->resize_bitmaps(mddev, newsize, oldsize); + if (ret) { + md_bitmap_resize(mddev->bitmap, oldsize, 0, 0); + goto abort; + } } +out: if (mddev->delta_disks > 0) { rdev_for_each(rdev, mddev) if (rdev->raid_disk < 0 && From 7564beda19b3646d781934d04fc382b738053e6f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:42 +0800 Subject: [PATCH 09/17] md-cluster/raid10: support add disk under grow mode For clustered raid10 scenario, we need to let all the nodes know about that a new disk is added to the array, and the reshape caused by add new member just need to be happened in one node, but other nodes should know about the change. 
A reshape means reading data from a region that is already in use by the array and writing it to an unused region, and it is obviously bad if one node is reading data from an address while another node is writing to the same address. Since we have already implemented suspending writes in the resyncing area, we can simply broadcast the address being read to the other nodes to avoid the trouble. The master node calls reshape_request and then updates the sb during the reshape period; to avoid the above trouble, we call resync_info_update to send a RESYNC message in reshape_request. From the slave node's view, it then receives two types of messages: 1. RESYNCING message: the slave node adds the address (which the master node is reading data from) to the suspend list. 2. METADATA_UPDATED message: once the slave nodes know that reshaping has started on the master node, it is time to update the reshape position and call start_reshape to follow the master node's steps. After the reshape is done, only the reshape position needs to be updated, so the majority of the reshaping work happens on the master node. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 24 ++++++++++++++++++++++++ drivers/md/md.h | 1 + drivers/md/raid10.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4c0f3e0331d5..e07096c4ff20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9230,6 +9230,30 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) if (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) update_raid_disks(mddev, le32_to_cpu(sb->raid_disks)); + /* + * Since mddev->delta_disks has already updated in update_raid_disks, + * so it is time to check reshape. + */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* + * reshape is happening in the remote node, we need to + * update reshape_position and call start_reshape. + */ + mddev->reshape_position = sb->reshape_position; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + if (mddev->pers->start_reshape) + mddev->pers->start_reshape(mddev); + } else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + mddev->reshape_position != MaxSector && + !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { + /* reshape is just done in another node. */ + mddev->reshape_position = MaxSector; + if (mddev->pers->update_reshape_pos) + mddev->pers->update_reshape_pos(mddev); + } + /* Finally set the event to be up to date */ mddev->events = le64_to_cpu(sb->events); } diff --git a/drivers/md/md.h b/drivers/md/md.h index 8afd6bfdbfb9..c52afb52c776 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -557,6 +557,7 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); + void (*update_reshape_pos) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 72e52921c545..1edd58a3098b 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4605,6 +4605,32 @@ read_more: r10_bio->master_bio = read_bio; r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; + /* + * Broadcast RESYNC message to other nodes, so all nodes would not + * write to the region to avoid conflict.
+ */ + if (mddev_is_clustered(mddev) && conf->cluster_sync_high <= sector_nr) { + struct mdp_superblock_1 *sb = NULL; + int sb_reshape_pos = 0; + + conf->cluster_sync_low = sector_nr; + conf->cluster_sync_high = sector_nr + CLUSTER_RESYNC_WINDOW_SECTORS; + sb = page_address(rdev->sb_page); + if (sb) { + sb_reshape_pos = le64_to_cpu(sb->reshape_position); + /* + * Set cluster_sync_low again if next address for array + * reshape is less than cluster_sync_low. Since we can't + * update cluster_sync_low until it has finished reshape. + */ + if (sb_reshape_pos < conf->cluster_sync_low) + conf->cluster_sync_low = sb_reshape_pos; + } + + md_cluster_ops->resync_info_update(mddev, conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* Now find the locations in the new layout */ __raid10_find_phys(&conf->geo, r10_bio); @@ -4756,6 +4782,13 @@ static void end_reshape(struct r10conf *conf) conf->fullsync = 0; } +static void raid10_update_reshape_pos(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + + conf->reshape_progress = mddev->reshape_position; +} + static int handle_reshape_read_error(struct mddev *mddev, struct r10bio *r10_bio) { @@ -4924,6 +4957,7 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .update_reshape_pos = raid10_update_reshape_pos, .congested = raid10_congested, }; From 5ebaf80bc8d5826edcc2d1cea26a7d5a4b8f01dd Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:43 +0800 Subject: [PATCH 10/17] md-cluster: introduce resync_info_get interface for sanity check Since the resync region from suspend_info means one node is reshaping this area, so the position of reshape_progress should be included in the area. 
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 14 ++++++++++++++ drivers/md/md-cluster.h | 1 + drivers/md/raid10.c | 8 +++++++- 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index e223fae80da3..ca4a35f8bad8 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1323,6 +1323,19 @@ static int resync_start(struct mddev *mddev) return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); } +static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct suspend_info *s; + + spin_lock_irq(&cinfo->suspend_lock); + list_for_each_entry(s, &cinfo->suspend_list, list) { + *lo = s->lo; + *hi = s->hi; + } + spin_unlock_irq(&cinfo->suspend_lock); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -1562,6 +1575,7 @@ static struct md_cluster_operations cluster_ops = { .resync_start = resync_start, .resync_finish = resync_finish, .resync_info_update = resync_info_update, + .resync_info_get = resync_info_get, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 9bd753a6a94e..a78e3021775d 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -14,6 +14,7 @@ struct md_cluster_operations { int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); + void (*resync_info_get)(struct mddev *mddev, sector_t *lo, sector_t *hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); void (*metadata_update_cancel)(struct mddev *mddev); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1edd58a3098b..b98e746e7fc4 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4785,8 +4785,14 @@ static void end_reshape(struct r10conf *conf) static void raid10_update_reshape_pos(struct mddev *mddev) { struct r10conf *conf = mddev->private; + sector_t lo, hi; - conf->reshape_progress = mddev->reshape_position; + md_cluster_ops->resync_info_get(mddev, &lo, &hi); + if (((mddev->reshape_position <= hi) && (mddev->reshape_position >= lo)) + || mddev->reshape_position == MaxSector) + conf->reshape_progress = mddev->reshape_position; + else + WARN_ON_ONCE(1); } static int handle_reshape_read_error(struct mddev *mddev, From aefb2e5fc2be590e6bef8985f3d175c3d38b0b77 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:44 +0800 Subject: [PATCH 11/17] md-cluster/raid10: call update_size in md_reap_sync_thread We need to change the capacity in all nodes after one node finishs reshape. And as we did before, we can't change the capacity directly in md_do_sync, instead, the capacity should be only changed in update_size or received CHANGE_CAPACITY msg. So master node calls update_size after completes reshape in md_reap_sync_thread, but we need to skip ops->update_size if MD_CLOSING is set since reshaping could not be finish. 
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e07096c4ff20..e28f5db0a882 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8623,8 +8623,10 @@ void md_do_sync(struct md_thread *thread) mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!mddev_is_clustered(mddev)) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } spin_lock(&mddev->lock); @@ -8968,6 +8970,8 @@ EXPORT_SYMBOL(md_check_recovery); void md_reap_sync_thread(struct mddev *mddev) { struct md_rdev *rdev; + sector_t old_dev_sectors = mddev->dev_sectors; + bool is_reshaped = false; /* resync has finished, collect result */ md_unregister_thread(&mddev->sync_thread); @@ -8982,8 +8986,11 @@ void md_reap_sync_thread(struct mddev *mddev) } } if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && - mddev->pers->finish_reshape) + mddev->pers->finish_reshape) { mddev->pers->finish_reshape(mddev); + if (mddev_is_clustered(mddev)) + is_reshaped = true; + } /* If array is no-longer degraded, then any saved_raid_disk * information must be scrapped. @@ -9004,6 +9011,14 @@ void md_reap_sync_thread(struct mddev *mddev) clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + /* + * We call md_cluster_ops->update_size here because sync_size could + * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared, + * so it is time to update size across cluster. + */ + if (mddev_is_clustered(mddev) && is_reshaped + && !test_bit(MD_CLOSING, &mddev->flags)) + md_cluster_ops->update_size(mddev, old_dev_sectors); wake_up(&resync_wait); /* flag recovery needed just to double check */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); From ca1e98e04a8d6cefbe8ad138df806434de6de7f3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:45 +0800 Subject: [PATCH 12/17] md-cluster/raid10: don't call remove_and_add_spares during reshaping stage remove_and_add_spares is not needed if reshape is happening in another node, because raid10_add_disk called inside raid10_start_reshape would handle the role changes of disk. Plus, remove_and_add_spares can't deal with the role change due to reshape. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e28f5db0a882..fa2bbdec9955 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -9218,8 +9218,12 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) } if (role != rdev2->raid_disk) { - /* got activated */ - if (rdev2->raid_disk == -1 && role != 0xffff) { + /* + * got activated except reshape is happening. 
+ */ + if (rdev2->raid_disk == -1 && role != 0xffff && + !(le32_to_cpu(sb->feature_map) & + MD_FEATURE_RESHAPE_ACTIVE)) { rdev2->saved_raid_disk = role; ret = remove_and_add_spares(mddev, rdev2); pr_info("Activated spare: %s\n", From cbce6863b6d04f6eac6b144bcfaaf11a189d6533 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:46 +0800 Subject: [PATCH 13/17] md-cluster/bitmap: don't call md_bitmap_sync_with_cluster during reshaping stage When reshape is happening in one node, other nodes could receive lots of RESYNCING messages, so md_bitmap_sync_with_cluster is called. Since the resyncing window is typically small in these RESYNCING messages, so WARN is always triggered, so we should not call the func when reshape is happening. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index ca4a35f8bad8..792e0218a42f 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -457,12 +457,13 @@ static void remove_suspend_info(struct mddev *mddev, int slot) mddev->pers->quiesce(mddev, 0); } - static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; struct suspend_info *s; + struct mdp_superblock_1 *sb = NULL; + struct md_rdev *rdev; if (!hi) { /* @@ -476,6 +477,12 @@ static void process_suspend_info(struct mddev *mddev, return; } + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + sb = page_address(rdev->sb_page); + break; + } + /* * The bitmaps are not same for different nodes * if RESYNCING is happening in one node, then @@ -488,12 +495,18 @@ static void process_suspend_info(struct mddev *mddev, * sync_low/hi is used to record the region which * arrived in the previous RESYNCING message, * - * Call bitmap_sync_with_cluster to clear - * NEEDED_MASK and set RESYNC_MASK since - * resync thread is running in another node, - * so we don't need to do the resync again - * with the same section */ - md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, cinfo->sync_hi, lo, hi); + * Call md_bitmap_sync_with_cluster to clear NEEDED_MASK + * and set RESYNC_MASK since resync thread is running + * in another node, so we don't need to do the resync + * again with the same section. + * + * Skip md_bitmap_sync_with_cluster in case reshape + * happening, because reshaping region is small and + * we don't want to trigger lots of WARN. + */ + if (sb && !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) + md_bitmap_sync_with_cluster(mddev, cinfo->sync_low, + cinfo->sync_hi, lo, hi); cinfo->sync_low = lo; cinfo->sync_hi = hi; From cb9ee154317b084a96a7c2b771d1688f39fc714c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:47 +0800 Subject: [PATCH 14/17] md-cluster: send BITMAP_NEEDS_SYNC message if reshaping is interrupted We need to continue the reshaping if it was interrupted in original node. So original node should call resync_bitmap in case reshaping is aborted. Then BITMAP_NEEDS_SYNC message is broadcasted to other nodes, node which continues the reshaping should restart reshape from mddev->reshape_position instead of from the first beginning. 
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 19 ++++++++++++++++--- drivers/md/md.c | 12 ++++++++++-- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 792e0218a42f..f1b870766deb 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -333,6 +333,12 @@ static void recover_bitmaps(struct md_thread *thread) } spin_unlock_irq(&cinfo->suspend_lock); + /* Kick off a reshape if needed */ + if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) && + test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + mddev->reshape_position != MaxSector) + md_wakeup_thread(mddev->sync_thread); + if (hi > 0) { if (lo < mddev->recovery_cp) mddev->recovery_cp = lo; @@ -1020,10 +1026,17 @@ static int leave(struct mddev *mddev) if (!cinfo) return 0; - /* BITMAP_NEEDS_SYNC message should be sent when node + /* + * BITMAP_NEEDS_SYNC message should be sent when node * is leaving the cluster with dirty bitmap, also we - * can only deliver it when dlm connection is available */ - if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + * can only deliver it when dlm connection is available. + * + * Also, we should send BITMAP_NEEDS_SYNC message in + * case reshaping is interrupted. + */ + if ((cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) || + (mddev->reshape_position != MaxSector && + test_bit(MD_CLOSING, &mddev->flags))) resync_bitmap(mddev); set_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state); diff --git a/drivers/md/md.c b/drivers/md/md.c index fa2bbdec9955..390f410e51dd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8370,9 +8370,17 @@ void md_do_sync(struct md_thread *thread) else if (!mddev->bitmap) j = mddev->recovery_cp; - } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { max_sectors = mddev->resync_max_sectors; + /* + * If the original node aborts reshaping then we continue the + * reshaping, so set j again to avoid restart reshape from the + * first beginning + */ + if (mddev_is_clustered(mddev) && + mddev->reshape_position != MaxSector) + j = mddev->reshape_position; + } else { /* recovery follows the physical size of devices */ max_sectors = mddev->dev_sectors; j = MaxSector; From ea89238c0a7b346bc90552e0244e87edf3311920 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 18 Oct 2018 16:37:48 +0800 Subject: [PATCH 15/17] md-cluster: remove suspend_info Previously we allowed multiple nodes to resync a device, but we have since changed it so that only one node can do resync at a time; however, suspend_info is still used. Now, let's remove the structure and use suspend_lo/hi to record the range.
Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 103 +++++++++++++--------------------------- 1 file changed, 32 insertions(+), 71 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index f1b870766deb..8dff19d5502e 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -33,13 +33,6 @@ struct dlm_lock_resource { int mode; }; -struct suspend_info { - int slot; - sector_t lo; - sector_t hi; - struct list_head list; -}; - struct resync_info { __le64 lo; __le64 hi; @@ -80,7 +73,13 @@ struct md_cluster_info { struct dlm_lock_resource **other_bitmap_lockres; struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; + spinlock_t suspend_lock; + /* record the region which write should be suspended */ + sector_t suspend_lo; + sector_t suspend_hi; + int suspend_from; /* the slot which broadcast suspend_lo/hi */ + struct md_thread *recovery_thread; unsigned long recovery_map; /* communication loc resources */ @@ -271,25 +270,22 @@ static void add_resync_info(struct dlm_lock_resource *lockres, ri->hi = cpu_to_le64(hi); } -static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) +static int read_resync_info(struct mddev *mddev, + struct dlm_lock_resource *lockres) { struct resync_info ri; - struct suspend_info *s = NULL; - sector_t hi = 0; + struct md_cluster_info *cinfo = mddev->cluster_info; + int ret = 0; dlm_lock_sync(lockres, DLM_LOCK_CR); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); - hi = le64_to_cpu(ri.hi); - if (hi > 0) { - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - goto out; - s->hi = hi; - s->lo = le64_to_cpu(ri.lo); + if (le64_to_cpu(ri.hi) > 0) { + cinfo->suspend_hi = le64_to_cpu(ri.hi); + cinfo->suspend_lo = le64_to_cpu(ri.lo); + ret = 1; } dlm_unlock_sync(lockres); -out: - return s; + return ret; } static void recover_bitmaps(struct md_thread *thread) @@ -299,7 +295,6 @@ static void recover_bitmaps(struct md_thread *thread) struct dlm_lock_resource *bm_lockres; char str[64]; int slot, ret; - struct suspend_info *s, *tmp; sector_t lo, hi; while (cinfo->recovery_map) { @@ -326,11 +321,9 @@ static void recover_bitmaps(struct md_thread *thread) /* Clear suspend_area associated with the bitmap */ spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - } + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; + cinfo->suspend_from = -1; spin_unlock_irq(&cinfo->suspend_lock); /* Kick off a reshape if needed */ @@ -441,24 +434,13 @@ static void ack_bast(void *arg, int mode) } } -static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) -{ - struct suspend_info *s, *tmp; - - list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) - if (slot == s->slot) { - list_del(&s->list); - kfree(s); - break; - } -} - static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - __remove_suspend_info(cinfo, slot); + cinfo->suspend_hi = 0; + cinfo->suspend_lo = 0; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } @@ -467,7 +449,6 @@ static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct suspend_info *s; struct mdp_superblock_1 *sb = NULL; struct 
md_rdev *rdev; @@ -516,17 +497,11 @@ static void process_suspend_info(struct mddev *mddev, cinfo->sync_low = lo; cinfo->sync_hi = hi; - s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); - if (!s) - return; - s->slot = slot; - s->lo = lo; - s->hi = hi; mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); - /* Remove existing entry (if exists) before adding */ - __remove_suspend_info(cinfo, slot); - list_add(&s->list, &cinfo->suspend_list); + cinfo->suspend_from = slot; + cinfo->suspend_lo = lo; + cinfo->suspend_hi = hi; spin_unlock_irq(&cinfo->suspend_lock); mddev->pers->quiesce(mddev, 0); } @@ -825,7 +800,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) struct md_cluster_info *cinfo = mddev->cluster_info; int i, ret = 0; struct dlm_lock_resource *bm_lockres; - struct suspend_info *s; char str[64]; sector_t lo, hi; @@ -844,16 +818,13 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) bm_lockres->flags |= DLM_LKF_NOQUEUE; ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); if (ret == -EAGAIN) { - s = read_resync_info(mddev, bm_lockres); - if (s) { + if (read_resync_info(mddev, bm_lockres)) { pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", __func__, __LINE__, - (unsigned long long) s->lo, - (unsigned long long) s->hi, i); - spin_lock_irq(&cinfo->suspend_lock); - s->slot = i; - list_add(&s->list, &cinfo->suspend_list); - spin_unlock_irq(&cinfo->suspend_lock); + (unsigned long long) cinfo->suspend_lo, + (unsigned long long) cinfo->suspend_hi, + i); + cinfo->suspend_from = i; } ret = 0; lockres_free(bm_lockres); @@ -1352,13 +1323,10 @@ static int resync_start(struct mddev *mddev) static void resync_info_get(struct mddev *mddev, sector_t *lo, sector_t *hi) { struct md_cluster_info *cinfo = mddev->cluster_info; - struct suspend_info *s; spin_lock_irq(&cinfo->suspend_lock); - list_for_each_entry(s, &cinfo->suspend_list, list) { - *lo = s->lo; - *hi = s->hi; - } + *lo = cinfo->suspend_lo; + *hi = cinfo->suspend_hi; spin_unlock_irq(&cinfo->suspend_lock); } @@ -1414,21 +1382,14 @@ static int area_resyncing(struct mddev *mddev, int direction, { struct md_cluster_info *cinfo = mddev->cluster_info; int ret = 0; - struct suspend_info *s; if ((direction == READ) && test_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state)) return 1; spin_lock_irq(&cinfo->suspend_lock); - if (list_empty(&cinfo->suspend_list)) - goto out; - list_for_each_entry(s, &cinfo->suspend_list, list) - if (hi > s->lo && lo < s->hi) { - ret = 1; - break; - } -out: + if (hi > cinfo->suspend_lo && lo < cinfo->suspend_hi) + ret = 1; spin_unlock_irq(&cinfo->suspend_lock); return ret; } From 6aaa58c994277647f8b05ffef3b9b225a2d08f36 Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Fri, 19 Oct 2018 16:21:31 +0200 Subject: [PATCH 16/17] md: fix memleak for mempool I noticed kmemleak report memory leak when run create/stop md in a loop, backtrace: [<000000001ca975e7>] mempool_create_node+0x86/0xd0 [<0000000095576bcd>] md_run+0x1057/0x1410 [md_mod] [<000000007b45c5fc>] do_md_run+0x15/0x130 [md_mod] [<000000001ede9ec0>] md_ioctl+0x1f49/0x25d0 [md_mod] [<000000004142cacf>] blkdev_ioctl+0x680/0xd00 The root cause is we alloc mddev->flush_pool and mddev->flush_bio_pool in md_run, but from do_md_stop will not call into md_stop but __md_stop, move the mempool_destroy to __md_stop fixes the problem for me. 
The bug was introduced in 5a409b4f56d5, the fixes should go to 4.18+ Fixes: 5a409b4f56d5 ("MD: fix lock contention for flush bios") Signed-off-by: Jack Wang Reviewed-by: Xiao Ni Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 390f410e51dd..1fa7f1b1d98a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5904,14 +5904,6 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); -} - -void md_stop(struct mddev *mddev) -{ - /* stop the array and free an attached data structures. - * This is called from dm-raid - */ - __md_stop(mddev); if (mddev->flush_bio_pool) { mempool_destroy(mddev->flush_bio_pool); mddev->flush_bio_pool = NULL; @@ -5920,6 +5912,14 @@ void md_stop(struct mddev *mddev) mempool_destroy(mddev->flush_pool); mddev->flush_pool = NULL; } +} + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); } From af9b926de9c5986ab009e64917de87c9758bab10 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Sat, 20 Oct 2018 08:09:25 +0800 Subject: [PATCH 17/17] MD: Memory leak when flush bio size is zero flush_pool is leaked when flush bio size is zero Fixes: 5a409b4f56d5 ("MD: fix lock contention for flush bios") Signed-off-by: David Jeffery Signed-off-by: Xiao Ni Signed-off-by: Shaohua Li --- drivers/md/md.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 1fa7f1b1d98a..fc488cb30a94 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -452,10 +452,11 @@ static void md_end_flush(struct bio *fbio) rdev_dec_pending(rdev, mddev); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); } @@ -509,10 +510,11 @@ void md_flush_request(struct mddev *mddev, struct bio *bio) rcu_read_unlock(); if (atomic_dec_and_test(&fi->flush_pending)) { - if (bio->bi_iter.bi_size == 0) + if (bio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(bio); - else { + mempool_free(fi, mddev->flush_pool); + } else { INIT_WORK(&fi->flush_work, submit_flushes); queue_work(md_wq, &fi->flush_work); }
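For reference, a minimal reproduction sketch for the mempool leak addressed by the last two patches, assuming kmemleak is enabled in the kernel and that /dev/loop0 and /dev/loop1 are spare loop devices (the device names and iteration count below are illustrative only, not taken from the original report):
# create and stop an md array repeatedly, then ask kmemleak to scan
for i in $(seq 1 50); do
	mdadm --create /dev/md0 --run --level=1 --raid-devices=2 /dev/loop0 /dev/loop1
	mdadm --stop /dev/md0
done
echo scan > /sys/kernel/debug/kmemleak
cat /sys/kernel/debug/kmemleak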