From ab05613a0646dcc11049692d54bae76ca9ffa910 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Tue, 6 Nov 2012 17:13:44 +0800 Subject: [PATCH 1/8] md: Reassigned the parameters if read_seqretry returned true in func md_is_badblock. This bug was introduced by commit(v3.0-rc7-126-g2230dfe). So fix is suitable for 3.0.y thru 3.6.y. Cc: stable@vger.kernel.org Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9ab768acfb62..14db6abb2c42 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7936,9 +7936,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { int hi; - int lo = 0; + int lo; u64 *p = bb->page; - int rv = 0; + int rv; sector_t target = s + sectors; unsigned seq; @@ -7953,7 +7953,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, retry: seq = read_seqbegin(&bb->lock); - + lo = 0; + rv = 0; hi = bb->count; /* Binary search between lo and hi for 'target' From 35f9ac2dcec8f79d7059ce174fd7b7ee3290d620 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Thu, 8 Nov 2012 08:56:27 +0800 Subject: [PATCH 2/8] md: Avoid write invalid address if read_seqretry returned true. If read_seqretry returned true and bbp was changed, it will write invalid address which can cause some serious problem. This bug was introduced by commit v3.0-rc7-130-g2699b67. So fix is suitable for 3.0.y thru 3.6.y. Reported-by: zhuwenfeng@kedacom.com Tested-by: zhuwenfeng@kedacom.com Cc: stable@vger.kernel.org Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 14db6abb2c42..4c7d880a60a4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1817,10 +1817,10 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { - u64 internal_bb = *p++; + u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); - *bbp++ = cpu_to_le64(store_bb); + bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) From 5eff3c439d3478ba9e8ba5f8c0aaf6e6fadb6e58 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Nov 2012 10:47:48 +1100 Subject: [PATCH 3/8] md: make sure everything is freed when dm-raid stops an array. md_stop() would stop an array, but not free various attached data structures. For internal arrays, these are freed later in do_md_stop() or mddev_put(), but they don't apply for dm-raid arrays. So get md_stop() to free them, and only all it from dm-raid. For internal arrays we now call __md_stop. Reported-by: majianpeng Signed-off-by: NeilBrown --- drivers/md/md.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 4c7d880a60a4..61200717687b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5294,7 +5294,7 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); -void md_stop(struct mddev *mddev) +static void __md_stop(struct mddev *mddev) { mddev->ready = 0; mddev->pers->stop(mddev); @@ -5304,6 +5304,18 @@ void md_stop(struct mddev *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) @@ -5364,7 +5376,7 @@ static int do_md_stop(struct mddev * mddev, int mode, set_disk_ro(disk, 0); __md_stop_writes(mddev); - md_stop(mddev); + __md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; From 4ac6875eeb97a49bad7bc8d56b5ec935904fc6e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 19 Nov 2012 13:11:26 +1100 Subject: [PATCH 4/8] md/raid5: round discard alignment up to power of 2. blkdev_issue_discard currently assumes that the granularity is a power of 2. So in raid5, round the chosen number up to avoid embarrassment. Cc: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid5.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c5439dce0295..baea94f0670a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5529,6 +5529,10 @@ static int run(struct mddev *mddev) * discard data disk but write parity disk */ stripe = stripe * PAGE_SIZE; + /* Round up to power of 2, as discard handling + * currently assumes that */ + while ((stripe-1) & stripe) + stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; /* From ef5b7c69b7a1b8b8744a6168b6ff02900f81b6ca Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Nov 2012 09:13:36 +1100 Subject: [PATCH 5/8] md/raid5: move resolving of reconstruct_state earlier in stripe_handle. The chunk of code in stripe_handle which responds to a *_result value in reconstruct_state is really the completion of some processing that happened outside of handle_stripe (possibly asynchronously) and so should be one of the first things done in handle_stripe(). After the next patch it will be important that it happens before handle_stripe_clean_event(), as that will clear some dev->flags bit that this code tests. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 68 +++++++++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index baea94f0670a..0fb988556eea 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3490,40 +3490,6 @@ static void handle_stripe(struct stripe_head *sh) handle_failed_sync(conf, sh, &s); } - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && (test_bit(R5_UPTODATE, &pdev->flags) || - test_bit(R5_Discard, &pdev->flags))))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && (test_bit(R5_UPTODATE, &qdev->flags) || - test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) - || s.replacing - || s.expanding) - handle_stripe_fill(sh, &s, disks); - /* Now we check to see if any write operations have recently * completed */ @@ -3561,6 +3527,40 @@ static void handle_stripe(struct stripe_head *sh) s.dec_preread_active = 1; } + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. + */ + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. From ca64cae96037de16e4af92678814f5d4bf0c1c65 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 21 Nov 2012 16:33:40 +1100 Subject: [PATCH 6/8] md/raid5: Make sure we clear R5_Discard when discard is finished. commit 9e44476851e91c86c98eb92b9bc27fb801f89072 MD: raid5 avoid unnecessary zero page for trim change raid5 to clear R5_Discard when the complete request is handled rather than when submitting the per-device discard request. However it did not clear R5_Discard for the parity device. This means that if the stripe_head was reused before it expired from the cache, the setting would be wrong and a hang would result. Also if the R5_Uptodate bit happens to be set, R5_Discard again won't be cleared. But R5_Uptodate really should be clear at this point. So make sure R5_Discard is cleared in all cases, and clear R5_Uptodate when a 'discard' completes. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0fb988556eea..a4502686e7a8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2774,10 +2774,12 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_and_clear_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); + if (test_and_clear_bit(R5_Discard, &dev->flags)) + clear_bit(R5_UPTODATE, &dev->flags); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < @@ -2795,7 +2797,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, !test_bit(STRIPE_DEGRADED, &sh->state), 0); } - } + } else if (test_bit(R5_Discard, &sh->dev[i].flags)) + clear_bit(R5_Discard, &sh->dev[i].flags); if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) From e7c0c3fa29280d62aa5e11101a674bb3064bd791 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Nov 2012 14:42:49 +1100 Subject: [PATCH 7/8] md/raid10: close race that lose writes lost when replacement completes. When a replacement operation completes there is a small window when the original device is marked 'faulty' and the replacement still looks like a replacement. The faulty should be removed and the replacement moved in place very quickly, bit it isn't instant. So the code write out to the array must handle the possibility that the only working device for some slot in the replacement - but it doesn't. If the primary device is faulty it just gives up. This can lead to corruption. So make the code more robust: if either the primary or the replacement is present and working, write to them. Only when neither are present do we give up. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 129 +++++++++++++++++++++++--------------------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d1295aff4173..ad032518fdc0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1334,18 +1334,21 @@ static void make_request(struct mddev *mddev, struct bio * bio) blocked_rdev = rrdev; break; } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags) || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; int bad_sectors; @@ -1387,8 +1390,10 @@ static void make_request(struct mddev *mddev, struct bio * bio) max_sectors = good_sectors; } } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } if (rrdev) { r10_bio->devs[i].repl_bio = bio; atomic_inc(&rrdev->nr_pending); @@ -1444,69 +1449,71 @@ static void make_request(struct mddev *mddev, struct bio * bio) for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; + mbio->bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - mbio->bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - conf->mirrors[d].rdev)); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; + atomic_inc(&r10_bio->remaining); - atomic_inc(&r10_bio->remaining); + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; + + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - - if (!r10_bio->devs[i].repl_bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, - conf->mirrors[d].replacement)); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until From 884162df2aadd7414bef4935e1a54976fd4e3988 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 22 Nov 2012 15:12:09 +1100 Subject: [PATCH 8/8] md/raid10: decrement correct pending counter when writing to replacement. When a write to a replacement device completes, we carefully and correctly found the rdev that the write actually went to and the blithely called rdev_dec_pending on the primary rdev, even if this write was to the replacement. This means that any writes to an array while a replacement was ongoing would cause the nr_pending count for the primary device to go negative, so it could never be removed. This bug has been present since replacement was introduced in 3.3, so it is suitable for any -stable kernel since then. Reported-by: "George Spelvin" Cc: stable@vger.kernel.org Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ad032518fdc0..0d5d0ff2c0f7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error) */ one_write_done(r10_bio); if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } /*