From 7c8f4247986bb5c5fb1d5b1fad35461989fe8310 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 23 Nov 2011 10:18:52 +1100 Subject: [PATCH 1/7] md/lock: ensure updates to page_attrs are properly locked. Page attributes are set using __set_bit rather than set_bit as it normally called under a spinlock so the extra atomicity is not needed. However there are two places where we might set or clear page attributes without holding the spinlock. So add the spinlock in those cases. This might be the cause of occasional reports that bits a aren't getting clear properly - theory is that BITMAP_PAGE_PENDING gets lost when BITMAP_PAGE_NEEDWRITE is set or cleared. This is an inconvenience, not a threat to data safety. Signed-off-by: NeilBrown --- drivers/md/bitmap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 7878712721bf..b6907118283a 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1106,10 +1106,12 @@ void bitmap_write_all(struct bitmap *bitmap) */ int i; + spin_lock_irq(&bitmap->lock); for (i = 0; i < bitmap->file_pages; i++) set_page_attr(bitmap, bitmap->filemap[i], BITMAP_PAGE_NEEDWRITE); bitmap->allclean = 0; + spin_unlock_irq(&bitmap->lock); } static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) @@ -1605,7 +1607,9 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) for (chunk = s; chunk <= e; chunk++) { sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); bitmap_set_memory_bits(bitmap, sec, 1); + spin_lock_irq(&bitmap->lock); bitmap_file_set_bit(bitmap, sec); + spin_unlock_irq(&bitmap->lock); if (sec < bitmap->mddev->recovery_cp) /* We are asserting that the array is dirty, * so move the recovery_cp address back so From 1d23f178d56ae1349b4fc5108ac8f4f8cdc92afc Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Dec 2011 15:49:12 +1100 Subject: [PATCH 2/7] md: refine interpretation of "hold_active == UNTIL_IOCTL". We like md devices to disappear when they really are not needed. However it is not possible to tell from the current state whether it is needed or not. We can only tell from recent history of changes. In particular immediately after we create an md device it looks very similar to immediately after we have finished with it. So we always preserve a newly created md device until something significant happens. This state is stored in 'hold_active'. The normal case is to keep it until an ioctl happens, as that will normally either activate it, or explicitly de-activate it. If it doesn't then it was probably created by mistake and it is now time to get rid of it. We can also modify an array via sysfs (instead of via ioctl) and we currently treat any change via sysfs like an ioctl as a sign that if it now isn't more active, it should be destroyed. However this is not appropriate as changes made via sysfs are more gradual so we should look for a more definitive change. So this patch only clears 'hold_active' from UNTIL_IOCTL to clear when the array_state is changed via sysfs. Other changes via sysfs are ignored. Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 84acfe7d10e4..93b0da133507 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3788,6 +3788,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; else { + if (mddev->hold_active == UNTIL_IOCTL) + mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); return len; } @@ -4508,8 +4510,6 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, if (!capable(CAP_SYS_ADMIN)) return -EACCES; rv = mddev_lock(mddev); - if (mddev->hold_active == UNTIL_IOCTL) - mddev->hold_active = 0; if (!rv) { rv = entry->store(mddev, page, length); mddev_unlock(mddev); From af8a24347f966ab9fae6b0f127d1fbc9d6932d3a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Dec 2011 15:49:46 +1100 Subject: [PATCH 3/7] md: take a reference to mddev during sysfs access. When we are accessing an mddev via sysfs we know that the mddev cannot disappear because it has an embedded kobj which is refcounted by sysfs. And we also take the mddev_lock. However this is not enough. The final mddev_put could have been called and the mddev_delayed_delete is waiting for sysfs to let go so it can destroy the kobj and mddev. In this state there are a lot of changes that should not be attempted. To to guard against this we: - initialise mddev->all_mddevs in on last put so the state can be easily detected. - in md_attr_show and md_attr_store, check ->all_mddevs under all_mddevs_lock and mddev_get the mddev if it still appears to be active. This means that if we get to sysfs as the mddev is being deleted we will get -EBUSY. rdev_attr_store and rdev_attr_show are similar but already have sufficient protection. They check that rdev->mddev still points to mddev after taking mddev_lock. As this is cleared before delayed removal which can only be requested under the mddev_lock, this ensure the rdev and mddev are still alive. Signed-off-by: NeilBrown --- drivers/md/md.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 93b0da133507..f97c3b2f2f89 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -570,7 +570,7 @@ static void mddev_put(struct mddev *mddev) mddev->ctime == 0 && !mddev->hold_active) { /* Array is not configured at all, and not held active, * so destroy it */ - list_del(&mddev->all_mddevs); + list_del_init(&mddev->all_mddevs); bs = mddev->bio_set; mddev->bio_set = NULL; if (mddev->gendisk) { @@ -4489,11 +4489,20 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) if (!entry->show) return -EIO; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); + rv = mddev_lock(mddev); if (!rv) { rv = entry->show(mddev, page); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } @@ -4509,11 +4518,19 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, return -EIO; if (!capable(CAP_SYS_ADMIN)) return -EACCES; + spin_lock(&all_mddevs_lock); + if (list_empty(&mddev->all_mddevs)) { + spin_unlock(&all_mddevs_lock); + return -EBUSY; + } + mddev_get(mddev); + spin_unlock(&all_mddevs_lock); rv = mddev_lock(mddev); if (!rv) { rv = entry->store(mddev, page, length); mddev_unlock(mddev); } + mddev_put(mddev); return rv; } From 52c64152a935e63d9ff73ce823730c9a23dedbff Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Dec 2011 16:22:48 +1100 Subject: [PATCH 4/7] md: bad blocks shouldn't cause a Blocked status on a Faulty device. Once a device is marked Faulty the badblocks - whether acknowledged or not - become irrelevant. So they shouldn't cause the device to be marked as Blocked. Without this patch, a process might write "-blocked" to clear the Blocked status, but while that will correctly fail the device, it won't remove the apparent 'blocked' status. Signed-off-by: NeilBrown --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index f97c3b2f2f89..d730e8f513a7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2546,7 +2546,8 @@ state_show(struct md_rdev *rdev, char *page) sep = ","; } if (test_bit(Blocked, &rdev->flags) || - rdev->badblocks.unacked_exist) { + (rdev->badblocks.unacked_exist + && !test_bit(Faulty, &rdev->flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } From 8bd2f0a05b361e07d48bb34398593f5f523946b3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Dec 2011 16:26:08 +1100 Subject: [PATCH 5/7] md: ensure new badblocks are handled promptly. When we mark blocks as bad we need them to be acknowledged by the metadata handler promptly. For an in-kernel metadata handler that was already being done. But for an external metadata handler we need to alert it of the change by sending a notification through the sysfs file. This adds that notification. Signed-off-by: NeilBrown --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index d730e8f513a7..ee981737edfc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7858,6 +7858,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, s + rdev->data_offset, sectors, acknowledged); if (rv) { /* Make sure they get written out promptly */ + sysfs_notify_dirent_safe(rdev->sysfs_state); set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); md_wakeup_thread(rdev->mddev->thread); } From 9283d8c5af4cdcb809e655acdf4be368afec8b58 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 8 Dec 2011 16:27:57 +1100 Subject: [PATCH 6/7] md/raid5: never wait for bad-block acks on failed device. Once a device is failed we really want to completely ignore it. It should go away soon anyway. In particular the presence of bad blocks on it should not cause us to block as we won't be trying to write there anyway. So as soon as we can check if a device is Faulty, do so and pretend that it is already gone if it is Faulty. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 297e26092178..b0dec01028f4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3036,6 +3036,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (dev->written) s->written++; rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; if (rdev) { is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, &first_bad, &bad_sectors); @@ -3063,7 +3065,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (!test_bit(Faulty, &rdev->flags)) { + else { /* in sync if before recovery_offset */ if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) set_bit(R5_Insync, &dev->flags); From 5d8c71f9e5fbdd95650be00294d238e27a363b5c Mon Sep 17 00:00:00 2001 From: Adam Kwolek Date: Fri, 9 Dec 2011 14:26:11 +1100 Subject: [PATCH 7/7] md: raid5 crash during degradation NULL pointer access causes crash in raid5 module. Signed-off-by: Adam Kwolek Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b0dec01028f4..31670f8d6b65 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3070,7 +3070,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) set_bit(R5_Insync, &dev->flags); } - if (test_bit(R5_WriteError, &dev->flags)) { + if (rdev && test_bit(R5_WriteError, &dev->flags)) { clear_bit(R5_Insync, &dev->flags); if (!test_bit(Faulty, &rdev->flags)) { s->handle_bad_blocks = 1; @@ -3078,7 +3078,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } else clear_bit(R5_WriteError, &dev->flags); } - if (test_bit(R5_MadeGood, &dev->flags)) { + if (rdev && test_bit(R5_MadeGood, &dev->flags)) { if (!test_bit(Faulty, &rdev->flags)) { s->handle_bad_blocks = 1; atomic_inc(&rdev->nr_pending);