From 9ed38ff53090856769b99e9d2f19740fb11e6956 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 14 Aug 2015 12:19:40 -0500 Subject: [PATCH 01/18] md-cluster: complete all write requests before adding suspend_info process_suspend_info - which handles the RESYNCING request - must not reply until all writes which were initiated before the request arrived have completed. As a by-product, all process_* functions now take mddev as their first argument, making it uniform. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 11e3bc9d2a4b..4a965f22be20 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -374,9 +374,10 @@ static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) } -static void process_suspend_info(struct md_cluster_info *cinfo, +static void process_suspend_info(struct mddev *mddev, int slot, sector_t lo, sector_t hi) { + struct md_cluster_info *cinfo = mddev->cluster_info; struct suspend_info *s; if (!hi) { @@ -389,6 +390,8 @@ static void process_suspend_info(struct md_cluster_info *cinfo, s->slot = slot; s->lo = lo; s->hi = hi; + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); @@ -457,7 +460,7 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) case RESYNCING: pr_info("%s: %d Received message: RESYNCING from %d\n", __func__, __LINE__, msg->slot); - process_suspend_info(mddev->cluster_info, msg->slot, + process_suspend_info(mddev, msg->slot, msg->low, msg->high); break; case NEWDISK: From 3c462c880b52aae2cfbbb8db8b401eef118cc128 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 07:35:54 +1000 Subject: [PATCH 02/18] md: Increment version for clustered bitmaps Add BITMAP_MAJOR_CLUSTERED as 5, in order to prevent older kernels from assembling a clustered device. In order to maximize compatibility, the major version is set to BITMAP_MAJOR_CLUSTERED *only* if the bitmap is clustered. Added MD_FEATURE_CLUSTERED in order to return an error for older kernels, which would otherwise assemble MD even if the bitmap is corrupted. Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 10 ++++------ drivers/md/bitmap.h | 2 ++ drivers/md/md.c | 3 +++ include/uapi/linux/raid/md_p.h | 2 ++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 48b5890c28e3..e9d3ee703e6d 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -613,12 +613,10 @@ static int bitmap_read_sb(struct bitmap *bitmap) daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; write_behind = le32_to_cpu(sb->write_behind); sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* XXX: This is a hack to ensure that we don't use clustering - * in case: - * - dm-raid is in use and - * - the nodes written in bitmap_sb is erroneous.
+ /* Setup nodes/clustername only if bitmap version is + * cluster-compatible */ - if (!bitmap->mddev->sync_super) { + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { nodes = le32_to_cpu(sb->nodes); strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); @@ -628,7 +626,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) reason = "bad magic"; else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) reason = "unrecognized superblock version"; else if (chunksize < 512) reason = "bitmap chunksize too small"; diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index f1f4dd01090d..8731fa06855f 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -9,8 +9,10 @@ #define BITMAP_MAJOR_LO 3 /* version 4 insists the bitmap is in little-endian order * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices */ #define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 #define BITMAP_MAJOR_HOSTENDIAN 3 /* diff --git a/drivers/md/md.c b/drivers/md/md.c index c702de18207a..1e1bdd86f40c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1735,6 +1735,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 2ae6131e69a5..867ee874fa80 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -302,6 +302,7 @@ struct mdp_superblock_1 { #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening * is guided by bitmap. */ +#define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -310,6 +311,7 @@ struct mdp_superblock_1 { |MD_FEATURE_RESHAPE_BACKWARDS \ |MD_FEATURE_NEW_OFFSET \ |MD_FEATURE_RECOVERY_BITMAP \ + |MD_FEATURE_CLUSTERED \ ) #endif From c40f341f1e7fd4eddcfc5881d94cfa8669071ee6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 19 Aug 2015 08:14:42 +1000 Subject: [PATCH 03/18] md-cluster: Use a small window for resync Suspending the entire device for resync could take too long. Resync in small chunks. The cluster's resync window (32M) is maintained in r1conf as cluster_sync_low and cluster_sync_high, and processed in raid1's sync_request(). If the current resync is outside the cluster resync window: 1. Set cluster_sync_low to curr_resync_completed. 2. Check if the sync will fit in the new window; if not, issue a wait_barrier() and set cluster_sync_low to sector_nr. 3. Set cluster_sync_high to cluster_sync_low + resync_window. 4. Send a message to all nodes so they may add it to their suspension list. bitmap_cond_end_sync is modified to allow forcing a sync in order to get curr_resync_completed up to date with the sector passed.
Signed-off-by: Goldwyn Rodrigues Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 4 ++-- drivers/md/bitmap.h | 2 +- drivers/md/md-cluster.c | 41 +++++------------------------------------ drivers/md/md-cluster.h | 4 +--- drivers/md/md.c | 8 -------- drivers/md/raid1.c | 26 +++++++++++++++++++++++++- drivers/md/raid1.h | 7 +++++++ drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- 9 files changed, 43 insertions(+), 53 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index e9d3ee703e6d..4f22e919787a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1570,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap) } EXPORT_SYMBOL(bitmap_close_sync); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) { sector_t s = 0; sector_t blocks; @@ -1581,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) bitmap->last_end_sync = jiffies; return; } - if (time_before(jiffies, (bitmap->last_end_sync + if (!force && time_before(jiffies, (bitmap->last_end_sync + bitmap->mddev->bitmap_info.daemon_sleep))) return; wait_event(bitmap->mddev->recovery_wait, diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h index 8731fa06855f..7d5c3a610ca5 100644 --- a/drivers/md/bitmap.h +++ b/drivers/md/bitmap.h @@ -257,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); void bitmap_unplug(struct bitmap *bitmap); void bitmap_daemon_work(struct mddev *mddev); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 4a965f22be20..b94a2e68ef43 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -802,15 +802,6 @@ static int slot_number(struct mddev *mddev) return cinfo->slot_number - 1; } -static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) -{ - struct md_cluster_info *cinfo = mddev->cluster_info; - - add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); - /* Re-acquire the lock to refresh LVB */ - dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); -} - static int metadata_update_start(struct mddev *mddev) { return lock_comm(mddev->cluster_info); @@ -836,45 +827,25 @@ static int metadata_update_cancel(struct mddev *mddev) return dlm_unlock_sync(cinfo->token_lockres); } -static int resync_send(struct mddev *mddev, enum msg_type type, - sector_t lo, sector_t hi) +static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; int slot = cinfo->slot_number - 1; + add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); + /* Re-acquire the lock to refresh LVB */ + dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, (unsigned long long)lo, (unsigned long long)hi); - resync_info_update(mddev, lo, hi); - cmsg.type = cpu_to_le32(type); + cmsg.type = cpu_to_le32(RESYNCING); cmsg.slot = cpu_to_le32(slot); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); return sendmsg(cinfo, &cmsg); } -static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) -{ - pr_info("%s:%d\n", 
__func__, __LINE__); - return resync_send(mddev, RESYNCING, lo, hi); -} - -static void resync_finish(struct mddev *mddev) -{ - struct md_cluster_info *cinfo = mddev->cluster_info; - struct cluster_msg cmsg; - int slot = cinfo->slot_number - 1; - - pr_info("%s:%d\n", __func__, __LINE__); - resync_send(mddev, RESYNCING, 0, 0); - if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { - cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); - cmsg.slot = cpu_to_le32(slot); - sendmsg(cinfo, &cmsg); - } -} - static int area_resyncing(struct mddev *mddev, int direction, sector_t lo, sector_t hi) { @@ -997,8 +968,6 @@ static struct md_cluster_operations cluster_ops = { .leave = leave, .slot_number = slot_number, .resync_info_update = resync_info_update, - .resync_start = resync_start, - .resync_finish = resync_finish, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index 00defe2badbc..f5bdc0c86eaa 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -12,9 +12,7 @@ struct md_cluster_operations { int (*join)(struct mddev *mddev, int nodes); int (*leave)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev); - void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); - int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi); - void (*resync_finish)(struct mddev *mddev); + int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 1e1bdd86f40c..9798a9921a38 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7805,9 +7805,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7871,8 +7868,6 @@ void md_do_sync(struct md_thread *thread) j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7979,9 +7974,6 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); spin_lock(&mddev->lock); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 049df6c4a8cc..1dd13bb52940 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) @@ -2488,6 +2490,13 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* Send zeros to mark end of 
resync */ + md_cluster_ops->resync_info_update(mddev, 0, 0); + } return 0; } @@ -2508,7 +2517,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp return sync_blocks; } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS + */ + + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); raise_barrier(conf, sector_nr); @@ -2699,6 +2713,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index c52d7139c5d7..61c39b390cd8 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -111,6 +111,13 @@ struct r1conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* Keep track of cluster resync window to send to other + * nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; + }; /* diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7c99a4037715..5f30b7526c1f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3137,7 +3137,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 49bb8d3ff9be..5b79770c4f08 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5613,7 +5613,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ } - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); sh = get_active_stripe(conf, sector_nr, 0, 1, 0); if (sh == NULL) { From 099954119d53a24573d22b70a6ea22a0a279e689 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 1 Oct 2015 00:09:18 +0800 Subject: [PATCH 04/18] md-cluster: send BITMAP_NEEDS_SYNC when node is leaving cluster Previously, the BITMAP_NEEDS_SYNC message was sent when the resync aborted, but it could abort for different reasons, and not all of the reasons require another node to take over the resync ownership. It is better to send the BITMAP_NEEDS_SYNC message only when the node is leaving the cluster with a dirty bitmap. We also need to ensure the dlm connection is ok.
Signed-off-by: Guoqing Jiang Signed-off-by: NeilBrown --- drivers/md/md-cluster.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index b94a2e68ef43..51e8552be73a 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -774,12 +774,32 @@ static int join(struct mddev *mddev, int nodes) return ret; } +static void resync_bitmap(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + struct cluster_msg cmsg = {0}; + int err; + + cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC); + err = sendmsg(cinfo, &cmsg); + if (err) + pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n", + __func__, __LINE__, err); +} + static int leave(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; if (!cinfo) return 0; + + /* BITMAP_NEEDS_SYNC message should be sent when node + * is leaving the cluster with dirty bitmap, also we + * can only deliver it when dlm connection is available */ + if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector) + resync_bitmap(mddev); + md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recv_thread); lockres_free(cinfo->message_lockres); From b8ca846e45197a2de829def27254f833a998723e Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 9 Oct 2015 11:27:01 -0500 Subject: [PATCH 05/18] md-cluster: Wake up suspended process When the suspended_area is deleted, the suspended processes must be woken up in order to complete their I/O. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 51e8552be73a..58eadc06a1b6 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -366,11 +366,13 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) } } -static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) +static void remove_suspend_info(struct mddev *mddev, int slot) { + struct md_cluster_info *cinfo = mddev->cluster_info; spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } @@ -381,7 +383,7 @@ static void process_suspend_info(struct mddev *mddev, struct suspend_info *s; if (!hi) { - remove_suspend_info(cinfo, slot); + remove_suspend_info(mddev, slot); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -397,6 +399,7 @@ static void process_suspend_info(struct mddev *mddev, __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); + mddev->pers->quiesce(mddev, 2); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) From 2910ff17d154baa5eb50e362a91104e831eb2bb6 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 28 Sep 2015 10:27:26 -0500 Subject: [PATCH 06/18] md: remove_and_add_spares() to activate specific rdev remove_and_add_spares() checks for all devices to activate spare. Change it to activate a specific device if a non-null rdev argument is passed. remove_and_add_spares() can be used to activate spares in slot_store() as well. 
For hot_remove_disk(), check if rdev->raid_disk == -1 before calling remove_and_add_spares() Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9798a9921a38..e21a2feed826 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2691,15 +2691,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -6004,12 +5998,16 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (mddev_is_clustered(mddev)) md_cluster_ops->metadata_update_start(mddev); + if (rdev->raid_disk < 0) + goto kick_rdev; + clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); if (rdev->raid_disk >= 0) goto busy; +kick_rdev: if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); @@ -6024,6 +6022,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) busy: if (mddev_is_clustered(mddev)) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -8018,10 +8017,12 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) From 70bcecdb1534a7dcd82503b705c27a048d568c9d Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 21 Aug 2015 10:33:39 -0500 Subject: [PATCH 07/18] md-cluster: Improve md_reload_sb to be less error prone md_reload_sb is too simplistic and it explicitly needs to determine the changes made by the writing node. However, there are multiple areas where a simple reload could fail. Instead, read the superblock of one of the "good" rdevs and update the necessary information: - read the superblock into a newly allocated page, by temporarily swapping out rdev->sb_page and calling ->load_super. - if that fails return - if it succeeds, call check_sb_changes 1. iterates over list of active devices and checks the matching dev_roles[] value. If that is 'faulty', the device must be marked as faulty - call md_error to mark the device as faulty. Make sure not to set CHANGE_DEVS and wakeup mddev->thread or else it would initiate a resync process, which is the responsibility of the "primary" node. - clear the Blocked bit - Call remove_and_add_spares() to hot remove the device. If the device is 'spare': - call remove_and_add_spares() to get the number of spares added in this operation. - Reduce mddev->degraded to mark the array as not degraded. 2. reset recovery_cp - read the rest of the rdevs to update recovery_offset. 
If recovery_offset is equal to MaxSector, call spare_active() to set it In_sync This required that recovery_offset be initialized to MaxSector, as opposed to zero so as to communicate the end of sync for a rdev. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 27 +++++---- drivers/md/md.c | 123 +++++++++++++++++++++++++++++++++++----- drivers/md/md.h | 2 +- drivers/md/raid1.c | 9 +++ 4 files changed, 134 insertions(+), 27 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 58eadc06a1b6..2eb3a5019a63 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -427,8 +427,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) { struct md_cluster_info *cinfo = mddev->cluster_info; - - md_reload_sb(mddev); + md_reload_sb(mddev, le32_to_cpu(msg->raid_slot)); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); } @@ -834,11 +833,23 @@ static int metadata_update_finish(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; - int ret; + struct md_rdev *rdev; + int ret = 0; memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(METADATA_UPDATED); - ret = __sendmsg(cinfo, &cmsg); + cmsg.raid_slot = -1; + /* Pick up a good active device number to send. + */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) { + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); + break; + } + if (cmsg.raid_slot >= 0) + ret = __sendmsg(cinfo, &cmsg); + else + pr_warn("md-cluster: No good device id found to send\n"); unlock_comm(cinfo); return ret; } @@ -922,15 +933,9 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) static int add_new_disk_finish(struct mddev *mddev) { - struct cluster_msg cmsg; - struct md_cluster_info *cinfo = mddev->cluster_info; - int ret; /* Write sb and inform others */ md_update_sb(mddev, 1); - cmsg.type = METADATA_UPDATED; - ret = __sendmsg(cinfo, &cmsg); - unlock_comm(cinfo); - return ret; + return metadata_update_finish(mddev); } static int new_disk_ack(struct mddev *mddev, bool ack) diff --git a/drivers/md/md.c b/drivers/md/md.c index e21a2feed826..12cc28ab9a41 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8924,25 +8924,118 @@ static int __init md_init(void) return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); - } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + 
bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } + /* recovery_cp changed */ + if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; + } + + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); diff --git a/drivers/md/md.h b/drivers/md/md.h index ab339571e57f..2ea00356bb23 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -658,7 +658,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); -extern void md_reload_sb(struct mddev *mddev); +extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); extern void md_kick_rdev_from_array(struct md_rdev * rdev); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1dd13bb52940..b54fefc85b66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1592,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. 
+ */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; + for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { From 2aa82191ac36cd2f2a41aa25697db30ed7c619ef Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 28 Sep 2015 19:21:35 -0500 Subject: [PATCH 08/18] md-cluster: Perform a lazy update In a clustered environment, a change such as marking a device faulty, can be recorded by any of the nodes. This is communicated to all the nodes and re-recording such a change is unnecessary, and quite often pretty disruptive. With this patch, just before the update, we detect for the changes and if the changes are already in superblock, we abort the update after clearing all the flags Signed-off-by: Goldwyn Rodrigues --- drivers/md/md.c | 101 +++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 44 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 12cc28ab9a41..5f0967803dc7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2199,6 +2199,46 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? 
*/ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; @@ -2211,6 +2251,18 @@ void md_update_sb(struct mddev *mddev, int force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { @@ -2359,6 +2411,9 @@ void md_update_sb(struct mddev *mddev, int force_change) clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev)) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2496,13 +2551,9 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (mddev_is_clustered(mddev)) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (mddev->pers) md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; } } else if (cmd_match(buf, "writemostly")) { @@ -4063,12 +4114,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5306,8 +5353,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5326,8 +5371,6 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -6015,9 +6058,6 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: if (mddev_is_clustered(mddev)) @@ -6073,14 +6113,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk 
failures @@ -6090,9 +6128,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ @@ -6102,9 +6137,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6422,8 +6454,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6481,12 +6511,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7599,11 +7625,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -8182,13 +8204,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8286,8 +8303,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8300,8 +8315,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); From c186b128cda5a246da25f474e4689cb2bfacfcac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Wed, 30 Sep 2015 13:20:35 -0500 Subject: [PATCH 09/18] md-cluster: Perform resync/recovery under a DLM lock Resync or recovery must be performed by only one node at a time. A DLM lock resource, resync_lockres, provides the mutual exclusion so that only one node performs the recovery/resync at a time. If a node is unable to get the resync_lockres, because recovery is being performed by another node, it sets MD_RECOVERY_NEEDED so as to schedule recovery in the future. Remove the debug message in resync_info_update() used during development.
Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 29 ++++++++++++++++++++++++++--- drivers/md/md-cluster.h | 2 ++ drivers/md/md.c | 29 +++++++++++++++++++++++++---- drivers/md/raid1.c | 2 -- 4 files changed, 53 insertions(+), 9 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 2eb3a5019a63..e1ce9c9a0473 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -55,6 +55,7 @@ struct md_cluster_info { struct completion completion; struct mutex sb_mutex; struct dlm_lock_resource *bitmap_lockres; + struct dlm_lock_resource *resync_lockres; struct list_head suspend_list; spinlock_t suspend_lock; struct md_thread *recovery_thread; @@ -384,6 +385,8 @@ static void process_suspend_info(struct mddev *mddev, if (!hi) { remove_suspend_info(mddev, slot); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); return; } s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); @@ -758,6 +761,10 @@ static int join(struct mddev *mddev, int nodes) goto err; } + cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0); + if (!cinfo->resync_lockres) + goto err; + ret = gather_all_resync_info(mddev, nodes); if (ret) goto err; @@ -768,6 +775,7 @@ static int join(struct mddev *mddev, int nodes) lockres_free(cinfo->token_lockres); lockres_free(cinfo->ack_lockres); lockres_free(cinfo->no_new_dev_lockres); + lockres_free(cinfo->resync_lockres); lockres_free(cinfo->bitmap_lockres); if (cinfo->lockspace) dlm_release_lockspace(cinfo->lockspace, 2); @@ -861,6 +869,13 @@ static int metadata_update_cancel(struct mddev *mddev) return dlm_unlock_sync(cinfo->token_lockres); } +static int resync_start(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE; + return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); +} + static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) { struct md_cluster_info *cinfo = mddev->cluster_info; @@ -870,16 +885,22 @@ static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); /* Re-acquire the lock to refresh LVB */ dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); - pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, - (unsigned long long)lo, - (unsigned long long)hi); cmsg.type = cpu_to_le32(RESYNCING); cmsg.slot = cpu_to_le32(slot); cmsg.low = cpu_to_le64(lo); cmsg.high = cpu_to_le64(hi); + return sendmsg(cinfo, &cmsg); } +static int resync_finish(struct mddev *mddev) +{ + struct md_cluster_info *cinfo = mddev->cluster_info; + cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE; + dlm_unlock_sync(cinfo->resync_lockres); + return resync_info_update(mddev, 0, 0); +} + static int area_resyncing(struct mddev *mddev, int direction, sector_t lo, sector_t hi) { @@ -995,6 +1016,8 @@ static struct md_cluster_operations cluster_ops = { .join = join, .leave = leave, .slot_number = slot_number, + .resync_start = resync_start, + .resync_finish = resync_finish, .resync_info_update = resync_info_update, .metadata_update_start = metadata_update_start, .metadata_update_finish = metadata_update_finish, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index f5bdc0c86eaa..c94172673599 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -16,6 +16,8 @@ struct md_cluster_operations { int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_cancel)(struct mddev 
*mddev); + int (*resync_start)(struct mddev *mddev); + int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); int (*add_new_disk_finish)(struct mddev *mddev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 5f0967803dc7..61e897def04f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7657,6 +7657,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7959,7 +7960,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7997,6 +8002,11 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8078,14 +8088,25 @@ static int remove_and_add_spares(struct mddev *mddev, static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index b54fefc85b66..a2d813c9eabd 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2503,8 +2503,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp if (mddev_is_clustered(mddev)) { conf->cluster_sync_low = 0; conf->cluster_sync_high = 0; - /* Send zeros to mark end of resync */ - md_cluster_ops->resync_info_update(mddev, 0, 0); } return 0; } From dbb64f8635f5d68192108b88759a34633a4bd558 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 1 Oct 2015 13:20:27 -0500 Subject: [PATCH 10/18] md-cluster: Fix adding of new disk with new reload code Adding the disk worked incorrectly with the new reload code. Fix it: - No operation should be performed on rdev marked as Candidate - After a metadata update operation, kick disk if role is 0xfffe else clear Candidate bit and continue with the regular change check. - Saving the mode of the lock resource to check if token lock is already locked, because it can be called twice while adding a disk. However, unlock_comm() must be called only once. 
- add_new_disk() is called by the node initiating the --add operation. If it needs to be canceled, call add_new_disk_cancel(). The operation is completed by md_update_sb() which will write and unlock the communication. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 35 +++++++++++++++++++-------- drivers/md/md-cluster.h | 6 ++--- drivers/md/md.c | 52 ++++++++++++++++++++++++----------------- 3 files changed, 58 insertions(+), 35 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index e1ce9c9a0473..28494e9f8d02 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -28,6 +28,7 @@ struct dlm_lock_resource { struct completion completion; /* completion for synchronized locking */ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ + int mode; }; struct suspend_info { @@ -107,6 +108,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) if (ret) return ret; wait_for_completion(&res->completion); + if (res->lksb.sb_status == 0) + res->mode = mode; return res->lksb.sb_status; } @@ -128,6 +131,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, init_completion(&res->completion); res->ls = cinfo->lockspace; res->mddev = mddev; + res->mode = DLM_LOCK_IV; namelen = strlen(name); res->name = kzalloc(namelen + 1, GFP_KERNEL); if (!res->name) { @@ -536,11 +540,17 @@ static void recv_daemon(struct md_thread *thread) /* lock_comm() * Takes the lock on the TOKEN lock resource so no other * node can communicate while the operation is underway. + * If called again, and the TOKEN lock is alread in EX mode + * return success. However, care must be taken that unlock_comm() + * is called only once. */ static int lock_comm(struct md_cluster_info *cinfo) { int error; + if (cinfo->token_lockres->mode == DLM_LOCK_EX) + return 0; + error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); if (error) pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", @@ -550,6 +560,7 @@ static int lock_comm(struct md_cluster_info *cinfo) static void unlock_comm(struct md_cluster_info *cinfo) { + WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX); dlm_unlock_sync(cinfo->token_lockres); } @@ -862,11 +873,10 @@ static int metadata_update_finish(struct mddev *mddev) return ret; } -static int metadata_update_cancel(struct mddev *mddev) +static void metadata_update_cancel(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; - - return dlm_unlock_sync(cinfo->token_lockres); + unlock_comm(cinfo); } static int resync_start(struct mddev *mddev) @@ -925,7 +935,11 @@ static int area_resyncing(struct mddev *mddev, int direction, return ret; } -static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) +/* add_new_disk() - initiates a disk add + * However, if this fails before writing md_update_sb(), + * add_new_disk_cancel() must be called to release token lock + */ +static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) { struct md_cluster_info *cinfo = mddev->cluster_info; struct cluster_msg cmsg; @@ -947,16 +961,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) /* Some node does not "see" the device */ if (ret == -EAGAIN) ret = -ENOENT; + if (ret) + unlock_comm(cinfo); else dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); return ret; } -static int add_new_disk_finish(struct mddev *mddev) +static void add_new_disk_cancel(struct mddev *mddev) { - /* Write sb and inform others */ - 
md_update_sb(mddev, 1); - return metadata_update_finish(mddev); + struct md_cluster_info *cinfo = mddev->cluster_info; + unlock_comm(cinfo); } static int new_disk_ack(struct mddev *mddev, bool ack) @@ -1023,8 +1038,8 @@ static struct md_cluster_operations cluster_ops = { .metadata_update_finish = metadata_update_finish, .metadata_update_cancel = metadata_update_cancel, .area_resyncing = area_resyncing, - .add_new_disk_start = add_new_disk_start, - .add_new_disk_finish = add_new_disk_finish, + .add_new_disk = add_new_disk, + .add_new_disk_cancel = add_new_disk_cancel, .new_disk_ack = new_disk_ack, .remove_disk = remove_disk, .gather_bitmaps = gather_bitmaps, diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h index c94172673599..e75ea2613184 100644 --- a/drivers/md/md-cluster.h +++ b/drivers/md/md-cluster.h @@ -15,12 +15,12 @@ struct md_cluster_operations { int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev); - int (*metadata_update_cancel)(struct mddev *mddev); + void (*metadata_update_cancel)(struct mddev *mddev); int (*resync_start)(struct mddev *mddev); int (*resync_finish)(struct mddev *mddev); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); - int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); - int (*add_new_disk_finish)(struct mddev *mddev); + int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev); + void (*add_new_disk_cancel)(struct mddev *mddev); int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev); diff --git a/drivers/md/md.c b/drivers/md/md.c index 61e897def04f..8a6f67f55d3d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3246,14 +3246,6 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; @@ -5950,19 +5942,12 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5971,13 +5956,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - 
if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -8055,6 +8050,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev_for_each(rdev, mddev) { if (this && this != rdev) continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -8972,6 +8969,17 @@ static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) /* Check if the roles changed */ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + if (role != rdev2->raid_disk) { /* got activated */ if (rdev2->raid_disk == -1 && role != 0xffff) { From d216711bed1a0fb6527858f32cd89ff4a6553a97 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 12 Oct 2015 10:42:34 -0500 Subject: [PATCH 11/18] md-cluster: Do not printk() every received message The receive daemon prints kernel messages for every network message received. This would fill the kernel message log with unnecessary messages. Remove the pr_info() messages. Signed-off-by: Goldwyn Rodrigues --- drivers/md/md-cluster.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 28494e9f8d02..85d0a1fef807 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -363,8 +363,6 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) if (slot == s->slot) { - pr_info("%s:%d Deleting suspend_info: %d\n", - __func__, __LINE__, slot); list_del(&s->list); kfree(s); break; @@ -462,34 +460,22 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) { switch (msg->type) { case METADATA_UPDATED: - pr_info("%s: %d Received message: METADATA_UPDATE from %d\n", - __func__, __LINE__, msg->slot); process_metadata_update(mddev, msg); break; case RESYNCING: - pr_info("%s: %d Received message: RESYNCING from %d\n", - __func__, __LINE__, msg->slot); process_suspend_info(mddev, msg->slot, msg->low, msg->high); break; case NEWDISK: - pr_info("%s: %d Received message: NEWDISK from %d\n", - __func__, __LINE__, msg->slot); process_add_new_disk(mddev, msg); break; case REMOVE: - pr_info("%s: %d Received REMOVE from %d\n", - __func__, __LINE__, msg->slot); process_remove_disk(mddev, msg); break; case RE_ADD: - pr_info("%s: %d Received RE_ADD from %d\n", - __func__, __LINE__, msg->slot); process_readd_disk(mddev, msg); break; case BITMAP_NEEDS_SYNC: - pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n", - __func__, __LINE__, msg->slot); __recover_slot(mddev, msg->slot); break; default: From faeff83fa478b4dca9877d6b10a25ad252891f14 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 12 Oct 2015 17:21:21 +0800 Subject: [PATCH 12/18] md-cluster: make other members of cluster_msg is handled by little endian funcs Signed-off-by: Guoqing Jiang --- drivers/md/md-cluster.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 85d0a1fef807..3c1818516070 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -418,7 +418,7 @@ static void process_add_new_disk(struct 
mddev *mddev, struct cluster_msg *cmsg) len = snprintf(disk_uuid, 64, "DEVICE_UUID="); sprintf(disk_uuid + len, "%pU", cmsg->uuid); - snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); + snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot)); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); init_completion(&cinfo->newdisk_completion); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); @@ -438,22 +438,26 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev) md_kick_rdev_from_array(rdev); else - pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); + struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, + le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else - pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); + pr_warn("%s: %d Could not find disk(%d) which is faulty", + __func__, __LINE__, le32_to_cpu(msg->raid_slot)); } static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) @@ -936,7 +940,7 @@ static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev) memset(&cmsg, 0, sizeof(cmsg)); cmsg.type = cpu_to_le32(NEWDISK); memcpy(cmsg.uuid, uuid, 16); - cmsg.raid_slot = rdev->desc_nr; + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); lock_comm(cinfo); ret = __sendmsg(cinfo, &cmsg); if (ret) @@ -979,8 +983,8 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) { struct cluster_msg cmsg; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = REMOVE; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(REMOVE); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); return __sendmsg(cinfo, &cmsg); } @@ -992,8 +996,8 @@ static int gather_bitmaps(struct md_rdev *rdev) struct mddev *mddev = rdev->mddev; struct md_cluster_info *cinfo = mddev->cluster_info; - cmsg.type = RE_ADD; - cmsg.raid_slot = rdev->desc_nr; + cmsg.type = cpu_to_le32(RE_ADD); + cmsg.raid_slot = cpu_to_le32(rdev->desc_nr); err = sendmsg(cinfo, &cmsg); if (err) goto out; From 487cf9142c434530443b7bb8c545e9d0f30391b1 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 12 Oct 2015 17:21:22 +0800 Subject: [PATCH 13/18] md-cluster: remove unnecessary setting for slot Since slot will be set within _sendmsg, we can remove the redundant code in resync_info_update. 
 drivers/md/md-cluster.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index ba80df923605..8bddd7882292 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -462,6 +462,9 @@ static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
 
 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
+	if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
+		"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
+		return;
 	switch (msg->type) {
 	case METADATA_UPDATED:
 		process_metadata_update(mddev, msg);

From aee177ac5a4225ac4eeed0aa5edf2338c2d713a4 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 12 Oct 2015 17:21:24 +0800
Subject: [PATCH 15/18] md-cluster: zero cmsg before it is sent

Signed-off-by: Guoqing Jiang
---
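Note (illustrative, not part of the patch text): the message is built on
the stack and some senders only fill in a few members, so anything left
unset would otherwise go out on the wire as stale stack data. The
designated initializer zeroes the whole structure first; remove_disk(),
for example, ends up doing:

	struct cluster_msg cmsg = {0};	/* every member starts at zero */

	cmsg.type = cpu_to_le32(REMOVE);
	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
	/* members that are never assigned here (slot, low, high, ...)
	 * are sent as zero rather than as leftover stack contents */
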
 drivers/md/md-cluster.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 8bddd7882292..c7b8027cebdd 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -882,7 +882,7 @@ static int resync_start(struct mddev *mddev)
 static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-	struct cluster_msg cmsg;
+	struct cluster_msg cmsg = {0};
 
 	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
 	/* Re-acquire the lock to refresh LVB */
@@ -982,7 +982,7 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
 
 static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
-	struct cluster_msg cmsg;
+	struct cluster_msg cmsg = {0};
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 	cmsg.type = cpu_to_le32(REMOVE);
 	cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
@@ -993,7 +993,7 @@ static int gather_bitmaps(struct md_rdev *rdev)
 {
 	int sn, err;
 	sector_t lo, hi;
-	struct cluster_msg cmsg;
+	struct cluster_msg cmsg = {0};
 	struct mddev *mddev = rdev->mddev;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 

From 86b572770e7964f006d438c4e05008914e9db79b Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 12 Oct 2015 17:21:25 +0800
Subject: [PATCH 16/18] md-cluster: Add 'SUSE' as author for md-cluster.c

Signed-off-by: Guoqing Jiang
---
 drivers/md/md-cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index c7b8027cebdd..35ac2e8cb7f1 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -1051,5 +1051,6 @@ static void cluster_exit(void)
 
 module_init(cluster_init);
 module_exit(cluster_exit);
+MODULE_AUTHOR("SUSE");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Clustering support for MD");

From a9720903d1415317e18f439917f760ec592f3e3b Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 12 Oct 2015 17:21:27 +0800
Subject: [PATCH 17/18] md-cluster: only call kick_rdev_from_array after
 removing the disk successfully

For clustered raid, we should not kick a disk from the array if it
could not be removed from the array successfully.

Signed-off-by: Guoqing Jiang
Signed-off-by: Goldwyn Rodrigues
---
 drivers/md/md.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8a6f67f55d3d..d39a72aec316 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2548,13 +2548,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = -EBUSY;
 		else {
 			struct mddev *mddev = rdev->mddev;
-			if (mddev_is_clustered(mddev))
-				md_cluster_ops->remove_disk(mddev, rdev);
-			md_kick_rdev_from_array(rdev);
-			if (mddev->pers)
-				md_update_sb(mddev, 1);
-			md_new_event(mddev);
 			err = 0;
+			if (mddev_is_clustered(mddev))
+				err = md_cluster_ops->remove_disk(mddev, rdev);
+
+			if (err == 0) {
+				md_kick_rdev_from_array(rdev);
+				if (mddev->pers)
+					md_update_sb(mddev, 1);
+				md_new_event(mddev);
+			}
 		}
 	} else if (cmd_match(buf, "writemostly")) {
 		set_bit(WriteMostly, &rdev->flags);

From 23b63f9fa82eed128b5c585cbfe10ced82d73e91 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Mon, 12 Oct 2015 17:21:30 +0800
Subject: [PATCH 18/18] md: check the return value for metadata_update_start

We shouldn't call the related md_cluster_ops functions if
metadata_update_start() returned failure.

Signed-off-by: Guoqing Jiang
---
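Note (illustrative, not part of the patch): the shape of the fix is to
record whether metadata_update_start() succeeded and to gate every
follow-up cluster call on that result, along these lines:

	int ret = -1;

	if (mddev_is_clustered(mddev))
		ret = md_cluster_ops->metadata_update_start(mddev);

	/* ... write out the superblocks ... */

	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);
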
 drivers/md/md.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d39a72aec316..a71b36f0acb0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2245,6 +2245,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	int sync_req;
 	int nospares = 0;
 	int any_badblocks_changed = 0;
+	int ret = -1;
 
 	if (mddev->ro) {
 		if (force_change)
@@ -2255,10 +2256,11 @@ void md_update_sb(struct mddev *mddev, int force_change)
 	if (mddev_is_clustered(mddev)) {
 		if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
 			force_change = 1;
-		md_cluster_ops->metadata_update_start(mddev);
+		ret = md_cluster_ops->metadata_update_start(mddev);
 		/* Has someone else has updated the sb */
 		if (!does_sb_need_changing(mddev)) {
-			md_cluster_ops->metadata_update_cancel(mddev);
+			if (ret == 0)
+				md_cluster_ops->metadata_update_cancel(mddev);
 			clear_bit(MD_CHANGE_PENDING, &mddev->flags);
 			return;
 		}
@@ -2412,7 +2414,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
 		wake_up(&rdev->blocked_wait);
 	}
 
-	if (mddev_is_clustered(mddev))
+	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->metadata_update_finish(mddev);
 }
 EXPORT_SYMBOL(md_update_sb);
@@ -6031,13 +6033,14 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 {
 	char b[BDEVNAME_SIZE];
 	struct md_rdev *rdev;
+	int ret = -1;
 
 	rdev = find_rdev(mddev, dev);
 	if (!rdev)
 		return -ENXIO;
 
 	if (mddev_is_clustered(mddev))
-		md_cluster_ops->metadata_update_start(mddev);
+		ret = md_cluster_ops->metadata_update_start(mddev);
 
 	if (rdev->raid_disk < 0)
 		goto kick_rdev;
@@ -6049,7 +6052,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 		goto busy;
 
 kick_rdev:
-	if (mddev_is_clustered(mddev))
+	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->remove_disk(mddev, rdev);
 
 	md_kick_rdev_from_array(rdev);
@@ -6058,7 +6061,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 
 	return 0;
 busy:
-	if (mddev_is_clustered(mddev))
+	if (mddev_is_clustered(mddev) && ret == 0)
 		md_cluster_ops->metadata_update_cancel(mddev);
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",