From 7c8f4247986bb5c5fb1d5b1fad35461989fe8310 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 23 Nov 2011 10:18:52 +1100
Subject: [PATCH 1/7] md/lock: ensure updates to page_attrs are properly
 locked.

Page attributes are set using __set_bit rather than set_bit as
it normally called under a spinlock so the extra atomicity is not
needed.

However there are two places where we might set or clear page
attributes without holding the spinlock.
So add the spinlock in those cases.

This might be the cause of occasional reports that bits a aren't
getting clear properly - theory is that BITMAP_PAGE_PENDING gets lost
when BITMAP_PAGE_NEEDWRITE is set or cleared.  This is an
inconvenience, not a threat to data safety.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 7878712721bf..b6907118283a 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1106,10 +1106,12 @@ void bitmap_write_all(struct bitmap *bitmap)
 	 */
 	int i;
 
+	spin_lock_irq(&bitmap->lock);
 	for (i = 0; i < bitmap->file_pages; i++)
 		set_page_attr(bitmap, bitmap->filemap[i],
 			      BITMAP_PAGE_NEEDWRITE);
 	bitmap->allclean = 0;
+	spin_unlock_irq(&bitmap->lock);
 }
 
 static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
@@ -1605,7 +1607,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
 	for (chunk = s; chunk <= e; chunk++) {
 		sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap);
 		bitmap_set_memory_bits(bitmap, sec, 1);
+		spin_lock_irq(&bitmap->lock);
 		bitmap_file_set_bit(bitmap, sec);
+		spin_unlock_irq(&bitmap->lock);
 		if (sec < bitmap->mddev->recovery_cp)
 			/* We are asserting that the array is dirty,
 			 * so move the recovery_cp address back so

From 1d23f178d56ae1349b4fc5108ac8f4f8cdc92afc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Dec 2011 15:49:12 +1100
Subject: [PATCH 2/7] md: refine interpretation of "hold_active ==
 UNTIL_IOCTL".

We like md devices to disappear when they really are not needed.
However it is not possible to tell from the current state whether it
is needed or not.  We can only tell from recent history of changes.

In particular immediately after we create an md device it looks very
similar to immediately after we have finished with it.

So we always preserve a newly created md device until something
significant happens.  This state is stored in 'hold_active'.

The normal case is to keep it until an ioctl happens, as that will
normally either activate it, or explicitly de-activate it.  If it
doesn't then it was probably created by mistake and it is now time to
get rid of it.

We can also modify an array via sysfs (instead of via ioctl) and we
currently treat any change via sysfs like an ioctl as a sign that if
it now isn't more active, it should be destroyed.
However this is not appropriate as changes made via sysfs are more
gradual so we should look for a more definitive change.

So this patch only clears 'hold_active' from UNTIL_IOCTL to clear when
the array_state is changed via sysfs.  Other changes via sysfs
are ignored.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84acfe7d10e4..93b0da133507 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3788,6 +3788,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	else {
+		if (mddev->hold_active == UNTIL_IOCTL)
+			mddev->hold_active = 0;
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 		return len;
 	}
@@ -4508,8 +4510,6 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
 	rv = mddev_lock(mddev);
-	if (mddev->hold_active == UNTIL_IOCTL)
-		mddev->hold_active = 0;
 	if (!rv) {
 		rv = entry->store(mddev, page, length);
 		mddev_unlock(mddev);

From af8a24347f966ab9fae6b0f127d1fbc9d6932d3a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Dec 2011 15:49:46 +1100
Subject: [PATCH 3/7] md: take a reference to mddev during sysfs access.

When we are accessing an mddev via sysfs we know that the
mddev cannot disappear because it has an embedded kobj which
is refcounted by sysfs.
And we also take the mddev_lock.
However this is not enough.

The final mddev_put could have been called and the
mddev_delayed_delete is waiting for sysfs to let go so it can destroy
the kobj and mddev.
In this state there are a lot of changes that should not be attempted.

To to guard against this we:
 - initialise mddev->all_mddevs in on last put so the state can be
   easily detected.
 - in md_attr_show and md_attr_store, check ->all_mddevs under
   all_mddevs_lock and mddev_get the mddev if it still appears to
   be active.

This means that if we get to sysfs as the mddev is being deleted we
will get -EBUSY.

rdev_attr_store and rdev_attr_show are similar but already have
sufficient protection.  They check that rdev->mddev still points to
mddev after taking mddev_lock.  As this is cleared  before delayed
removal which can only be requested under the mddev_lock, this
ensure the rdev and mddev are still alive.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 93b0da133507..f97c3b2f2f89 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -570,7 +570,7 @@ static void mddev_put(struct mddev *mddev)
 	    mddev->ctime == 0 && !mddev->hold_active) {
 		/* Array is not configured at all, and not held active,
 		 * so destroy it */
-		list_del(&mddev->all_mddevs);
+		list_del_init(&mddev->all_mddevs);
 		bs = mddev->bio_set;
 		mddev->bio_set = NULL;
 		if (mddev->gendisk) {
@@ -4489,11 +4489,20 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
 
 	if (!entry->show)
 		return -EIO;
+	spin_lock(&all_mddevs_lock);
+	if (list_empty(&mddev->all_mddevs)) {
+		spin_unlock(&all_mddevs_lock);
+		return -EBUSY;
+	}
+	mddev_get(mddev);
+	spin_unlock(&all_mddevs_lock);
+
 	rv = mddev_lock(mddev);
 	if (!rv) {
 		rv = entry->show(mddev, page);
 		mddev_unlock(mddev);
 	}
+	mddev_put(mddev);
 	return rv;
 }
 
@@ -4509,11 +4518,19 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
 		return -EIO;
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
+	spin_lock(&all_mddevs_lock);
+	if (list_empty(&mddev->all_mddevs)) {
+		spin_unlock(&all_mddevs_lock);
+		return -EBUSY;
+	}
+	mddev_get(mddev);
+	spin_unlock(&all_mddevs_lock);
 	rv = mddev_lock(mddev);
 	if (!rv) {
 		rv = entry->store(mddev, page, length);
 		mddev_unlock(mddev);
 	}
+	mddev_put(mddev);
 	return rv;
 }
 

From 52c64152a935e63d9ff73ce823730c9a23dedbff Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Dec 2011 16:22:48 +1100
Subject: [PATCH 4/7] md: bad blocks shouldn't cause a Blocked status on a
 Faulty device.

Once a device is marked Faulty the badblocks - whether acknowledged or
not - become irrelevant.  So they shouldn't cause the device to be
marked as Blocked.

Without this patch, a process might write "-blocked" to clear the
Blocked status, but while that will correctly fail the device, it
won't remove the apparent 'blocked' status.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f97c3b2f2f89..d730e8f513a7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2546,7 +2546,8 @@ state_show(struct md_rdev *rdev, char *page)
 		sep = ",";
 	}
 	if (test_bit(Blocked, &rdev->flags) ||
-	    rdev->badblocks.unacked_exist) {
+	    (rdev->badblocks.unacked_exist
+	     && !test_bit(Faulty, &rdev->flags))) {
 		len += sprintf(page+len, "%sblocked", sep);
 		sep = ",";
 	}

From 8bd2f0a05b361e07d48bb34398593f5f523946b3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Dec 2011 16:26:08 +1100
Subject: [PATCH 5/7] md: ensure new badblocks are handled promptly.

When we mark blocks as bad we need them to be acknowledged by the
metadata handler promptly.

For an in-kernel metadata handler that was already being done.  But
for an external metadata handler we need to alert it of the change by
sending a notification through the sysfs file.  This adds that
notification.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d730e8f513a7..ee981737edfc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7858,6 +7858,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 				  s + rdev->data_offset, sectors, acknowledged);
 	if (rv) {
 		/* Make sure they get written out promptly */
+		sysfs_notify_dirent_safe(rdev->sysfs_state);
 		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
 		md_wakeup_thread(rdev->mddev->thread);
 	}

From 9283d8c5af4cdcb809e655acdf4be368afec8b58 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 8 Dec 2011 16:27:57 +1100
Subject: [PATCH 6/7] md/raid5: never wait for bad-block acks on failed device.

Once a device is failed we really want to completely ignore it.
It should go away soon anyway.

In particular the presence of bad blocks on it should not cause us to
block as we won't be trying to write there anyway.

So as soon as we can check if a device is Faulty, do so and pretend
that it is already gone if it is Faulty.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 297e26092178..b0dec01028f4 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3036,6 +3036,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 		if (dev->written)
 			s->written++;
 		rdev = rcu_dereference(conf->disks[i].rdev);
+		if (rdev && test_bit(Faulty, &rdev->flags))
+			rdev = NULL;
 		if (rdev) {
 			is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
 					     &first_bad, &bad_sectors);
@@ -3063,7 +3065,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			}
 		} else if (test_bit(In_sync, &rdev->flags))
 			set_bit(R5_Insync, &dev->flags);
-		else if (!test_bit(Faulty, &rdev->flags)) {
+		else {
 			/* in sync if before recovery_offset */
 			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 				set_bit(R5_Insync, &dev->flags);

From 5d8c71f9e5fbdd95650be00294d238e27a363b5c Mon Sep 17 00:00:00 2001
From: Adam Kwolek <adam.kwolek@intel.com>
Date: Fri, 9 Dec 2011 14:26:11 +1100
Subject: [PATCH 7/7] md: raid5 crash during degradation

NULL pointer access causes crash in raid5 module.

Signed-off-by: Adam Kwolek <adam.kwolek@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b0dec01028f4..31670f8d6b65 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3070,7 +3070,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
 				set_bit(R5_Insync, &dev->flags);
 		}
-		if (test_bit(R5_WriteError, &dev->flags)) {
+		if (rdev && test_bit(R5_WriteError, &dev->flags)) {
 			clear_bit(R5_Insync, &dev->flags);
 			if (!test_bit(Faulty, &rdev->flags)) {
 				s->handle_bad_blocks = 1;
@@ -3078,7 +3078,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			} else
 				clear_bit(R5_WriteError, &dev->flags);
 		}
-		if (test_bit(R5_MadeGood, &dev->flags)) {
+		if (rdev && test_bit(R5_MadeGood, &dev->flags)) {
 			if (!test_bit(Faulty, &rdev->flags)) {
 				s->handle_bad_blocks = 1;
 				atomic_inc(&rdev->nr_pending);