diff --git a/drivers/gpu/drm/msm/msm_fence.c b/drivers/gpu/drm/msm/msm_fence.c
index cd59a5918038..b92a9091a1e2 100644
--- a/drivers/gpu/drm/msm/msm_fence.c
+++ b/drivers/gpu/drm/msm/msm_fence.c
@@ -11,7 +11,8 @@
 
 
 struct msm_fence_context *
-msm_fence_context_alloc(struct drm_device *dev, const char *name)
+msm_fence_context_alloc(struct drm_device *dev, volatile uint32_t *fenceptr,
+		const char *name)
 {
 	struct msm_fence_context *fctx;
 
@@ -22,6 +23,7 @@ msm_fence_context_alloc(struct drm_device *dev, const char *name)
 	fctx->dev = dev;
 	strncpy(fctx->name, name, sizeof(fctx->name));
 	fctx->context = dma_fence_context_alloc(1);
+	fctx->fenceptr = fenceptr;
 	init_waitqueue_head(&fctx->event);
 	spin_lock_init(&fctx->spinlock);
 
@@ -35,7 +37,12 @@ void msm_fence_context_free(struct msm_fence_context *fctx)
 
 static inline bool fence_completed(struct msm_fence_context *fctx, uint32_t fence)
 {
-	return (int32_t)(fctx->completed_fence - fence) >= 0;
+	/*
+	 * Note: Check completed_fence first, as fenceptr is in a write-combine
+	 * mapping, so it will be more expensive to read.
+	 */
+	return (int32_t)(fctx->completed_fence - fence) >= 0 ||
+		(int32_t)(*fctx->fenceptr - fence) >= 0;
 }
 
 /* legacy path for WAIT_FENCE ioctl: */
diff --git a/drivers/gpu/drm/msm/msm_fence.h b/drivers/gpu/drm/msm/msm_fence.h
index 2d9af66dcca5..6ab97062ff1a 100644
--- a/drivers/gpu/drm/msm/msm_fence.h
+++ b/drivers/gpu/drm/msm/msm_fence.h
@@ -9,19 +9,52 @@
 
 #include "msm_drv.h"
 
+/**
+ * struct msm_fence_context - fence context for gpu
+ *
+ * Each ringbuffer has a single fence context, with the GPU writing an
+ * incrementing fence seqno at the end of each submit
+ */
 struct msm_fence_context {
 	struct drm_device *dev;
+	/** name: human readable name for fence timeline */
 	char name[32];
+	/** context: see dma_fence_context_alloc() */
 	unsigned context;
-	/* last_fence == completed_fence --> no pending work */
-	uint32_t last_fence;          /* last assigned fence */
-	uint32_t completed_fence;     /* last completed fence */
+
+	/**
+	 * last_fence:
+	 *
+	 * Last assigned fence, incremented each time a fence is created
+	 * on this fence context.  If last_fence == completed_fence,
+	 * there is no remaining pending work
+	 */
+	uint32_t last_fence;
+
+	/**
+	 * completed_fence:
+	 *
+	 * The last completed fence, updated from the CPU after interrupt
+	 * from GPU
+	 */
+	uint32_t completed_fence;
+
+	/**
+	 * fenceptr:
+	 *
+	 * The address that the GPU directly writes with completed fence
+	 * seqno.  This can be ahead of completed_fence.  We can peek at
+	 * this to see if a fence has already signaled but the CPU hasn't
+	 * gotten around to handling the irq and updating completed_fence
+	 */
+	volatile uint32_t *fenceptr;
+
 	wait_queue_head_t event;
 	spinlock_t spinlock;
 };
 
 struct msm_fence_context * msm_fence_context_alloc(struct drm_device *dev,
-		const char *name);
+		volatile uint32_t *fenceptr, const char *name);
 void msm_fence_context_free(struct msm_fence_context *fctx);
 
 int msm_wait_fence(struct msm_fence_context *fctx, uint32_t fence,
diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.c b/drivers/gpu/drm/msm/msm_ringbuffer.c
index 4d2a2a4abef8..7e92d9532454 100644
--- a/drivers/gpu/drm/msm/msm_ringbuffer.c
+++ b/drivers/gpu/drm/msm/msm_ringbuffer.c
@@ -51,7 +51,7 @@ struct msm_ringbuffer *msm_ringbuffer_new(struct msm_gpu *gpu, int id,
 
 	snprintf(name, sizeof(name), "gpu-ring-%d", ring->id);
 
-	ring->fctx = msm_fence_context_alloc(gpu->dev, name);
+	ring->fctx = msm_fence_context_alloc(gpu->dev, &ring->memptrs->fence, name);
 
 	return ring;