From 7dc341175af5e7361af81499ea192c7b274b6357 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 4 Dec 2012 15:21:52 +0100 Subject: [PATCH 1/8] xen-blkback: implement safe iterator for the list of persistent grants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Change foreach_grant iterator to a safe version, that allows freeing the element while iterating. Also move the free code in free_persistent_gnts to prevent freeing the element before the rb_next call. Reported-by: Dan Carpenter Signed-off-by: Roger Pau Monné Cc: Konrad Rzeszutek Wilk Cc: xen-devel@lists.xen.org Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/blkback.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 74374fb762aa..5ac841ff6cc7 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -161,10 +161,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, static void make_response(struct xen_blkif *blkif, u64 id, unsigned short op, int st); -#define foreach_grant(pos, rbtree, node) \ - for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node); \ +#define foreach_grant_safe(pos, n, rbtree, node) \ + for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \ + (n) = rb_next(&(pos)->node); \ &(pos)->node != NULL; \ - (pos) = container_of(rb_next(&(pos)->node), typeof(*(pos)), node)) + (pos) = container_of(n, typeof(*(pos)), node), \ + (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL) static void add_persistent_gnt(struct rb_root *root, @@ -217,10 +219,11 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; + struct rb_node *n; int ret = 0; int segs_to_unmap = 0; - foreach_grant(persistent_gnt, root, node) { + foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == BLKBACK_INVALID_HANDLE); gnttab_set_unmap_op(&unmap[segs_to_unmap], @@ -230,9 +233,6 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) persistent_gnt->handle); pages[segs_to_unmap] = persistent_gnt->page; - rb_erase(&persistent_gnt->node, root); - kfree(persistent_gnt); - num--; if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST || !rb_next(&persistent_gnt->node)) { @@ -241,6 +241,10 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num) BUG_ON(ret); segs_to_unmap = 0; } + + rb_erase(&persistent_gnt->node, root); + kfree(persistent_gnt); + num--; } BUG_ON(num != 0); } From ebb351cf78ee6bf3262e6b4b6906f456c05e6d5e Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 4 Dec 2012 15:21:53 +0100 Subject: [PATCH 2/8] llist/xen-blkfront: implement safe version of llist_for_each_entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a safe version of llist_for_each_entry, and use it in blkif_free. Previously grants where freed while iterating the list, which lead to dereferences when trying to fetch the next item. Reported-by: Dan Carpenter Signed-off-by: Roger Pau Monné Acked-by: Andrew Morton [v2: Move the llist_for_each_entry_safe in llist.h] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 3 ++- include/linux/llist.h | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 96e9b00db081..cfdb033c7107 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -792,6 +792,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) { struct llist_node *all_gnts; struct grant *persistent_gnt; + struct llist_node *n; /* Prevent new requests being issued until we fix things up. */ spin_lock_irq(&info->io_lock); @@ -804,7 +805,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) /* Remove all persistent grants */ if (info->persistent_gnts_c) { all_gnts = llist_del_all(&info->persistent_gnts); - llist_for_each_entry(persistent_gnt, all_gnts, node) { + llist_for_each_entry_safe(persistent_gnt, n, all_gnts, node) { gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(pfn_to_page(persistent_gnt->pfn)); kfree(persistent_gnt); diff --git a/include/linux/llist.h b/include/linux/llist.h index a5199f6d0e82..d0ab98f73d38 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -124,6 +124,31 @@ static inline void init_llist_head(struct llist_head *list) &(pos)->member != NULL; \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) +/** + * llist_for_each_entry_safe - iterate safely against remove over some entries + * of lock-less list of given type. + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as a temporary storage. + * @node: the fist entry of deleted list entries. + * @member: the name of the llist_node with the struct. + * + * In general, some entries of the lock-less list can be traversed + * safely only after being removed from list, so start with an entry + * instead of list head. This variant allows removal of entries + * as we iterate. + * + * If being used on entries deleted from lock-less list directly, the + * traverse order is from the newest to the oldest added entry. If + * you want to traverse from the oldest to the newest, you must + * reverse the order by yourself before traversing. + */ +#define llist_for_each_entry_safe(pos, n, node, member) \ + for ((pos) = llist_entry((node), typeof(*(pos)), member), \ + (n) = (pos)->member.next; \ + &(pos)->member != NULL; \ + (pos) = llist_entry(n, typeof(*(pos)), member), \ + (n) = (&(pos)->member != NULL) ? (pos)->member.next : NULL) + /** * llist_empty - tests whether a lock-less list is empty * @head: the list to test From d62f691858252333ff1ddb920e6774d9020734aa Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Fri, 7 Dec 2012 19:00:31 +0100 Subject: [PATCH 3/8] xen-blkfront: handle bvecs with partial data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently blkfront fails to handle cases in blkif_completion like the following: 1st loop in rq_for_each_segment * bv_offset: 3584 * bv_len: 512 * offset += bv_len * i: 0 2nd loop: * bv_offset: 0 * bv_len: 512 * i: 0 In the second loop i should be 1, since we assume we only wanted to read a part of the previous page. This patches fixes this cases where only a part of the shared page is read, and blkif_completion assumes that if the bv_offset of a bvec is less than the previous bv_offset plus the bv_size we have to switch to the next shared page. Reported-by: Konrad Rzeszutek Wilk Signed-off-by: Roger Pau Monné Cc: linux-kernel@vger.kernel.org Cc: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index cfdb033c7107..11043c18ac5a 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -836,7 +836,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, struct blkif_response *bret) { - int i; + int i = 0; struct bio_vec *bvec; struct req_iterator iter; unsigned long flags; @@ -853,7 +853,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ rq_for_each_segment(bvec, s->request, iter) { BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE); - i = offset >> PAGE_SHIFT; + if (bvec->bv_offset < offset) + i++; BUG_ON(i >= s->req.u.rw.nr_segments); shared_data = kmap_atomic( pfn_to_page(s->grants_used[i]->pfn)); @@ -862,7 +863,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, bvec->bv_len); bvec_kunmap_irq(bvec_data, &flags); kunmap_atomic(shared_data); - offset += bvec->bv_len; + offset = bvec->bv_offset + bvec->bv_len; } } /* Add the persistent grant into the list of free grants */ From aea24a8bbc81c8b404e4e293fb37aefaf388d35d Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Tue, 18 Dec 2012 12:27:18 -0800 Subject: [PATCH 4/8] block: remove deadlock in disk_clear_events In disk_clear_events, do not put work on system_nrt_freezable_wq. Instead, put it on system_nrt_wq. There is a race between probing a usb and suspending the device. Since probing a usb calls disk_clear_events, which puts work on a frozen workqueue, probing cannot finish after the workqueue is frozen. However, suspending cannot finish until the usb probe is finished, so we get a deadlock, causing the system to reboot. The way to reproduce this bug is to wake up from suspend with a usb storage device plugged in, or plugging in a usb storage device right before suspend. The window of time is on the order of time it takes to probe the usb device. As long as the workqueues are frozen before the call to add_disk within sd_probe_async finishes, there will be a deadlock (which calls blkdev_get, sd_open, check_disk_change, then disk_clear_events). This is not difficult to reproduce after figuring out the timings. [akpm@linux-foundation.org: fix up comment] Signed-off-by: Derek Basehore Reviewed-by: Mandeep Singh Baines Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/genhd.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 9a289d7c84bb..33a3b38aa4b2 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1565,7 +1565,14 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) /* uncondtionally schedule event check and wait for it to finish */ disk_block_events(disk); - queue_delayed_work(system_freezable_wq, &ev->dwork, 0); + /* + * We need to put the work on system_nrt_wq here since there is a + * deadlock that happens while probing a usb device while suspending. If + * we put work on a freezable workqueue here, a usb probe will wait here + * until the workqueue is unfrozen during suspend. Since suspend waits + * on all probes to complete, we have a deadlock + */ + queue_delayed_work(system_nrt_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); From 12c2bdb232168511c8dd54d6626549391a228918 Mon Sep 17 00:00:00 2001 From: Derek Basehore Date: Tue, 18 Dec 2012 12:27:20 -0800 Subject: [PATCH 5/8] block: prevent race/cleanup Remove a race condition which causes a warning in disk_clear_events. This is a race between disk_clear_events() and disk_flush_events(). ev->clearing will be altered by disk_flush_events() even though we are blocking event checking through disk_flush_events(). If this happens after ev->clearing was cleared for disk_clear_events(), this can cause the WARN_ON_ONCE() in that function to be triggered. This change also has disk_clear_events() not go through a workqueue. Since we have to wait for the work to complete, we should just call the function directly. Also, since this work cannot be put on a freezable workqueue, it will have to contend with increased demand, so calling the function directly avoids this. [akpm@linux-foundation.org: fix spello in comment] Signed-off-by: Derek Basehore Cc: Mandeep Singh Baines Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- block/genhd.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 33a3b38aa4b2..3993ebf4135f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -35,6 +35,8 @@ static DEFINE_IDR(ext_devt_idr); static struct device_type disk_type; +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); @@ -1549,6 +1551,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; + unsigned int clearing = mask; if (!ev) { /* for drivers still using the old ->media_changed method */ @@ -1558,41 +1561,53 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) return 0; } - /* tell the workfn about the events being cleared */ + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ spin_lock_irq(&ev->lock); - ev->clearing |= mask; + clearing |= ev->clearing; + ev->clearing = 0; spin_unlock_irq(&ev->lock); - /* uncondtionally schedule event check and wait for it to finish */ - disk_block_events(disk); + disk_check_events(ev, &clearing); /* - * We need to put the work on system_nrt_wq here since there is a - * deadlock that happens while probing a usb device while suspending. If - * we put work on a freezable workqueue here, a usb probe will wait here - * until the workqueue is unfrozen during suspend. Since suspend waits - * on all probes to complete, we have a deadlock + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. */ - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); - flush_delayed_work(&ev->dwork); - __disk_unblock_events(disk, false); + __disk_unblock_events(disk, ev->clearing ? true : false); /* then, fetch and clear pending events */ spin_lock_irq(&ev->lock); - WARN_ON_ONCE(ev->clearing & mask); /* cleared by workfn */ pending = ev->pending & mask; ev->pending &= ~mask; spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); return pending; } +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ static void disk_events_workfn(struct work_struct *work) { struct delayed_work *dwork = to_delayed_work(work); struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ struct gendisk *disk = ev->disk; char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = ev->clearing; + unsigned int clearing = *clearing_ptr; unsigned int events; unsigned long intv; int nr_events = 0, i; @@ -1605,7 +1620,7 @@ static void disk_events_workfn(struct work_struct *work) events &= ~ev->pending; ev->pending |= events; - ev->clearing &= ~clearing; + *clearing_ptr &= ~clearing; intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) From 47cd4b3c7e80cc8d5ac82930c5af835870b5aba1 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Fri, 11 Jan 2013 18:46:04 +0530 Subject: [PATCH 6/8] mtip32xx: fix for driver hang after a command timeout If an I/O command times out when a PIO command is active, MTIP_PF_EH_ACTIVE_BIT is not cleared. This results in I/O hang in the driver. Fix is to clear this bit. Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 9694dd99bbbc..b93282019632 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -626,12 +626,13 @@ static void mtip_timeout_function(unsigned long int data) } } - if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + if (cmdto_cnt) { print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); - - mtip_restart_port(port); + if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + mtip_restart_port(port); + wake_up_interruptible(&port->svc_wait); + } clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); } if (port->ic_pause_timer) { From 58c49df378cde4ff64172483d593a1a5689c13a8 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Fri, 11 Jan 2013 18:47:12 +0530 Subject: [PATCH 7/8] mtip32xx: fix for crash when the device surprise removed during rebuild When rebuild is in progress, disk->queue is yet to be created. Surprise removing the device will call remove()-> del_gendisk(). del_gendisk() expect disk->queue be not NULL. Fix is to call put_disk() when disk_queue is NULL. Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b93282019632..3fd100990453 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3888,7 +3888,12 @@ static int mtip_block_remove(struct driver_data *dd) * Delete our gendisk structure. This also removes the device * from /dev */ - del_gendisk(dd->disk); + if (dd->disk) { + if (dd->disk->queue) + del_gendisk(dd->disk); + else + put_disk(dd->disk); + } spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); @@ -3922,7 +3927,13 @@ static int mtip_block_shutdown(struct driver_data *dd) "Shutting down %s ...\n", dd->disk->disk_name); /* Delete our gendisk structure, and cleanup the blk queue. */ - del_gendisk(dd->disk); + if (dd->disk) { + if (dd->disk->queue) + del_gendisk(dd->disk); + else + put_disk(dd->disk); + } + spin_lock(&rssd_index_lock); ida_remove(&rssd_index_ida, dd->index); From 2681f7f6ce6c7416eb619d0fb19422bcc68bd9e1 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 21 Jan 2013 15:43:41 +0100 Subject: [PATCH 8/8] drbd: fix potential protocol error and resulting disconnect/reconnect When we notice a disk failure on the receiving side, we stop sending it new incoming writes. Depending on exact timing of various events, the same transfer log epoch could end up containing both replicated (before we noticed the failure) and local-only requests (after we noticed the failure). The sanity checks in tl_release(), called when receiving a P_BARRIER_ACK, check that the ack'ed transfer log epoch matches the expected epoch, and the number of contained writes matches the number of ack'ed writes. In this case, they counted both replicated and local-only writes, but the peer only acknowledges those it has seen. We get a mismatch, resulting in a protocol error and disconnect/reconnect cycle. Messages logged are "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n" A similar issue can also be triggered when starting a resync while having a healthy replication link, by invalidating one side, forcing a full sync, or attaching to a diskless node. Fix this by closing the current epoch if the state changes in a way that would cause the replication intent of the next write. Epochs now contain either only non-replicated, or only replicated writes. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_req.h | 1 + drivers/block/drbd/drbd_state.c | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index f58a4a4b4dfb..2b8303ad63c9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -168,7 +168,7 @@ static void wake_all_senders(struct drbd_tconn *tconn) { } /* must hold resource->req_lock */ -static void start_new_tl_epoch(struct drbd_tconn *tconn) +void start_new_tl_epoch(struct drbd_tconn *tconn) { /* no point closing an epoch, if it is empty, anyways. */ if (tconn->current_tle_writes == 0) diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 016de6b8bb57..c08d22964d06 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -267,6 +267,7 @@ struct bio_and_error { int error; }; +extern void start_new_tl_epoch(struct drbd_tconn *tconn); extern void drbd_req_destroy(struct kref *kref); extern void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m); diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 53bf6182bac4..0fe220cfb9e9 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -931,6 +931,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + bool did_remote, should_do_remote; os = drbd_read_state(mdev); @@ -981,11 +982,17 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, (os.disk != D_DISKLESS && ns.disk == D_DISKLESS)) atomic_inc(&mdev->local_cnt); + did_remote = drbd_should_do_remote(mdev->state); mdev->state.i = ns.i; + should_do_remote = drbd_should_do_remote(mdev->state); mdev->tconn->susp = ns.susp; mdev->tconn->susp_nod = ns.susp_nod; mdev->tconn->susp_fen = ns.susp_fen; + /* put replicated vs not-replicated requests in seperate epochs */ + if (did_remote != should_do_remote) + start_new_tl_epoch(mdev->tconn); + if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING) drbd_print_uuids(mdev, "attached to UUIDs");