Merge branch 'msm-next' of git://people.freedesktop.org/~robclark/linux into drm-next

On the userspace side, all the basics are working, and most of glmark2
is working.  I've been working through deqp, and I've got a couple more
things to fix (but we've gone from 70% to 80+% pass in the last day, and
the deqp run currently in progress should pick up another 5-10%).  I
expect to push the mesa patches today or tomorrow.

There are a couple more a5xx related patches to take the gpu out of
secure mode (for the devices that come up in secure mode, like the hw
I have), but those depend on an scm patch that would come in through
another tree.  If that can land in the next day or two, there might
be a second late pull request for drm/msm.

In addition to the new-shiny, there have also been a lot of overlay/
plane-related fixes for issues found using drm-hwc2 (in the process of
testing/debugging the atomic/kms fence patches).  That resulted in
rework to assign hwpipes to kms planes dynamically (as part of global
atomic state) and to handle SMP (fifo) block allocation atomically as
part of the ->atomic_check() step.  All of those patches should also
help out atomic weston (when those patches eventually land).
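
The scheme boils down to computing a complete new hwpipe/SMP assignment
as part of ->atomic_check() and only applying it on commit, so an
over-committed configuration is rejected before the hardware is touched.
A minimal userspace toy of that allocate-in-check pattern (not the
actual mdp5 code; all names here are made up):

#include <stdbool.h>
#include <stdio.h>

#define NUM_PLANES  3
#define NUM_HWPIPES 4

/* Proposed state: a complete plane -> hwpipe assignment, built against a
 * fresh pool.  The current assignment is never touched during the check,
 * so a failed check just means the proposed state is rejected. */
struct toy_state {
    int  plane_to_pipe[NUM_PLANES];     /* -1 = plane disabled / no pipe */
    bool pipe_used[NUM_HWPIPES];
};

static bool toy_check(struct toy_state *ns, const bool enabled[NUM_PLANES])
{
    for (int p = 0; p < NUM_PLANES; p++) {
        int pipe;

        ns->plane_to_pipe[p] = -1;
        if (!enabled[p])
            continue;

        for (pipe = 0; pipe < NUM_HWPIPES; pipe++) {
            if (!ns->pipe_used[pipe]) {
                ns->pipe_used[pipe] = true;
                ns->plane_to_pipe[p] = pipe;
                break;
            }
        }
        if (pipe == NUM_HWPIPES)
            return false;    /* over-committed: reject the whole state */
    }
    return true;    /* caller may now "commit" ns wholesale */
}

int main(void)
{
    bool enabled[NUM_PLANES] = { true, false, true };
    struct toy_state next = { { 0 }, { false } };

    if (toy_check(&next, enabled))
        for (int p = 0; p < NUM_PLANES; p++)
            printf("plane %d -> hwpipe %d\n", p, next.plane_to_pipe[p]);
    return 0;
}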

* 'msm-next' of git://people.freedesktop.org/~robclark/linux: (36 commits)
  drm/msm: gpu: Add support for the GPMU
  drm/msm: gpu: Add A5XX target support
  drm/msm: Disable interrupts during init
  drm/msm: Remove 'src_clk' from adreno configuration
  drm/msm: gpu: Add OUT_TYPE4 and OUT_TYPE7
  drm/msm: Add adreno_gpu_write64()
  drm/msm: gpu Add new gpu register read/write functions
  drm/msm: gpu: Return error on hw_init failure
  drm/msm: gpu: Cut down the list of "generic" registers to the ones we use
  drm/msm: update generated headers
  drm/msm/adreno: move scratch register dumping to per-gen code
  drm/msm/rd: support for 64b iova
  drm/msm: convert iova to 64b
  drm/msm: set dma_mask properly
  drm/msm: Remove bad calls to of_node_put()
  drm/msm/mdp5: move LM bounds check into plane->atomic_check()
  drm/msm/mdp5: dump smp state on errors too
  drm/msm/mdp5: add debugfs to show smp block status
  drm/msm/mdp5: handle SMP block allocations "atomically"
  drm/msm/mdp5: dynamically assign hw pipes to planes
  ...
Dave Airlie 2016-12-01 09:25:58 +10:00
commit f559013436
55 changed files with 6779 additions and 963 deletions


@ -6,6 +6,8 @@ msm-y := \
adreno/adreno_gpu.o \
adreno/a3xx_gpu.o \
adreno/a4xx_gpu.o \
adreno/a5xx_gpu.o \
adreno/a5xx_power.o \
hdmi/hdmi.o \
hdmi/hdmi_audio.o \
hdmi/hdmi_bridge.o \
@ -37,6 +39,7 @@ msm-y := \
mdp/mdp5/mdp5_irq.o \
mdp/mdp5/mdp5_mdss.o \
mdp/mdp5/mdp5_kms.o \
mdp/mdp5/mdp5_pipe.o \
mdp/mdp5/mdp5_plane.o \
mdp/mdp5/mdp5_smp.o \
msm_atomic.o \
@ -48,6 +51,7 @@ msm-y := \
msm_gem_prime.o \
msm_gem_shrinker.o \
msm_gem_submit.o \
msm_gem_vma.o \
msm_gpu.o \
msm_iommu.o \
msm_perf.o \


@ -8,16 +8,17 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 431 bytes, from 2016-04-26 17:56:44)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109916 bytes, from 2016-02-20 18:44:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32907 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 12025 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 22544 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83840 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110765 bytes, from 2016-11-26 23:01:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml ( 90321 bytes, from 2016-11-28 16:50:05)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2015 by the following authors:
Copyright (C) 2013-2016 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)
- Ilia Mirkin <imirkin@alum.mit.edu> (imirkin)
@ -206,12 +207,12 @@ enum a2xx_rb_copy_sample_select {
};
enum a2xx_rb_blend_opcode {
BLEND_DST_PLUS_SRC = 0,
BLEND_SRC_MINUS_DST = 1,
BLEND_MIN_DST_SRC = 2,
BLEND_MAX_DST_SRC = 3,
BLEND_DST_MINUS_SRC = 4,
BLEND_DST_PLUS_SRC_BIAS = 5,
BLEND2_DST_PLUS_SRC = 0,
BLEND2_SRC_MINUS_DST = 1,
BLEND2_MIN_DST_SRC = 2,
BLEND2_MAX_DST_SRC = 3,
BLEND2_DST_MINUS_SRC = 4,
BLEND2_DST_PLUS_SRC_BIAS = 5,
};
enum adreno_mmu_clnt_beh {


@ -8,13 +8,14 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 431 bytes, from 2016-04-26 17:56:44)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109916 bytes, from 2016-02-20 18:44:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32907 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 12025 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 22544 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83840 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110765 bytes, from 2016-11-26 23:01:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml ( 90321 bytes, from 2016-11-28 16:50:05)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2016 by the following authors:
@ -129,10 +130,14 @@ enum a3xx_tex_fmt {
TFMT_Z16_UNORM = 9,
TFMT_X8Z24_UNORM = 10,
TFMT_Z32_FLOAT = 11,
TFMT_NV12_UV_TILED = 17,
TFMT_NV12_Y_TILED = 19,
TFMT_NV12_UV = 21,
TFMT_NV12_Y = 23,
TFMT_UV_64X32 = 16,
TFMT_VU_64X32 = 17,
TFMT_Y_64X32 = 18,
TFMT_NV12_64X32 = 19,
TFMT_UV_LINEAR = 20,
TFMT_VU_LINEAR = 21,
TFMT_Y_LINEAR = 22,
TFMT_NV12_LINEAR = 23,
TFMT_I420_Y = 24,
TFMT_I420_U = 26,
TFMT_I420_V = 27,
@ -525,14 +530,6 @@ enum a3xx_uche_perfcounter_select {
UCHE_UCHEPERF_ACTIVE_CYCLES = 20,
};
enum a3xx_rb_blend_opcode {
BLEND_DST_PLUS_SRC = 0,
BLEND_SRC_MINUS_DST = 1,
BLEND_DST_MINUS_SRC = 2,
BLEND_MIN_DST_SRC = 3,
BLEND_MAX_DST_SRC = 4,
};
enum a3xx_intp_mode {
SMOOTH = 0,
FLAT = 1,
@ -1393,13 +1390,14 @@ static inline uint32_t A3XX_RB_COPY_CONTROL_MODE(enum adreno_rb_copy_control_mod
{
return ((val) << A3XX_RB_COPY_CONTROL_MODE__SHIFT) & A3XX_RB_COPY_CONTROL_MODE__MASK;
}
#define A3XX_RB_COPY_CONTROL_MSAA_SRGB_DOWNSAMPLE 0x00000080
#define A3XX_RB_COPY_CONTROL_FASTCLEAR__MASK 0x00000f00
#define A3XX_RB_COPY_CONTROL_FASTCLEAR__SHIFT 8
static inline uint32_t A3XX_RB_COPY_CONTROL_FASTCLEAR(uint32_t val)
{
return ((val) << A3XX_RB_COPY_CONTROL_FASTCLEAR__SHIFT) & A3XX_RB_COPY_CONTROL_FASTCLEAR__MASK;
}
#define A3XX_RB_COPY_CONTROL_UNK12 0x00001000
#define A3XX_RB_COPY_CONTROL_DEPTH32_RESOLVE 0x00001000
#define A3XX_RB_COPY_CONTROL_GMEM_BASE__MASK 0xffffc000
#define A3XX_RB_COPY_CONTROL_GMEM_BASE__SHIFT 14
static inline uint32_t A3XX_RB_COPY_CONTROL_GMEM_BASE(uint32_t val)
@ -1472,7 +1470,7 @@ static inline uint32_t A3XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
{
return ((val) << A3XX_RB_DEPTH_CONTROL_ZFUNC__SHIFT) & A3XX_RB_DEPTH_CONTROL_ZFUNC__MASK;
}
#define A3XX_RB_DEPTH_CONTROL_BF_ENABLE 0x00000080
#define A3XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE 0x00000080
#define A3XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE 0x80000000
#define REG_A3XX_RB_DEPTH_CLEAR 0x00002101


@ -41,7 +41,7 @@ extern bool hang_debug;
static void a3xx_dump(struct msm_gpu *gpu);
static void a3xx_me_init(struct msm_gpu *gpu)
static bool a3xx_me_init(struct msm_gpu *gpu)
{
struct msm_ringbuffer *ring = gpu->rb;
@ -65,7 +65,7 @@ static void a3xx_me_init(struct msm_gpu *gpu)
OUT_RING(ring, 0x00000000);
gpu->funcs->flush(gpu);
gpu->funcs->idle(gpu);
return gpu->funcs->idle(gpu);
}
static int a3xx_hw_init(struct msm_gpu *gpu)
@ -294,15 +294,20 @@ static int a3xx_hw_init(struct msm_gpu *gpu)
/* clear ME_HALT to start micro engine */
gpu_write(gpu, REG_AXXX_CP_ME_CNTL, 0);
a3xx_me_init(gpu);
return 0;
return a3xx_me_init(gpu) ? 0 : -EINVAL;
}
static void a3xx_recover(struct msm_gpu *gpu)
{
int i;
adreno_dump_info(gpu);
for (i = 0; i < 8; i++) {
printk("CP_SCRATCH_REG%d: %u\n", i,
gpu_read(gpu, REG_AXXX_CP_SCRATCH_REG0 + i));
}
/* dump registers before resetting gpu, if enabled: */
if (hang_debug)
a3xx_dump(gpu);
@ -330,17 +335,22 @@ static void a3xx_destroy(struct msm_gpu *gpu)
kfree(a3xx_gpu);
}
static void a3xx_idle(struct msm_gpu *gpu)
static bool a3xx_idle(struct msm_gpu *gpu)
{
/* wait for ringbuffer to drain: */
adreno_idle(gpu);
if (!adreno_idle(gpu))
return false;
/* then wait for GPU to finish: */
if (spin_until(!(gpu_read(gpu, REG_A3XX_RBBM_STATUS) &
A3XX_RBBM_STATUS_GPU_BUSY)))
A3XX_RBBM_STATUS_GPU_BUSY))) {
DRM_ERROR("%s: timeout waiting for GPU to idle!\n", gpu->name);
/* TODO maybe we need to reset GPU here to recover from hang? */
/* TODO maybe we need to reset GPU here to recover from hang? */
return false;
}
return true;
}
static irqreturn_t a3xx_irq(struct msm_gpu *gpu)
@ -419,91 +429,13 @@ static void a3xx_dump(struct msm_gpu *gpu)
}
/* Register offset defines for A3XX */
static const unsigned int a3xx_register_offsets[REG_ADRENO_REGISTER_MAX] = {
REG_ADRENO_DEFINE(REG_ADRENO_CP_DEBUG, REG_AXXX_CP_DEBUG),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_WADDR, REG_AXXX_CP_ME_RAM_WADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_DATA, REG_AXXX_CP_ME_RAM_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PFP_UCODE_DATA,
REG_A3XX_CP_PFP_UCODE_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PFP_UCODE_ADDR,
REG_A3XX_CP_PFP_UCODE_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_WFI_PEND_CTR, REG_A3XX_CP_WFI_PEND_CTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_BASE, REG_AXXX_CP_RB_BASE),
REG_ADRENO_SKIP(REG_ADRENO_CP_RB_BASE_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR_ADDR, REG_AXXX_CP_RB_RPTR_ADDR),
REG_ADRENO_SKIP(REG_ADRENO_CP_RB_RPTR_ADDR_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR, REG_AXXX_CP_RB_RPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_WPTR, REG_AXXX_CP_RB_WPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PROTECT_CTRL, REG_A3XX_CP_PROTECT_CTRL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_CNTL, REG_AXXX_CP_ME_CNTL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_CNTL, REG_AXXX_CP_RB_CNTL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB1_BASE, REG_AXXX_CP_IB1_BASE),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB1_BUFSZ, REG_AXXX_CP_IB1_BUFSZ),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB2_BASE, REG_AXXX_CP_IB2_BASE),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB2_BUFSZ, REG_AXXX_CP_IB2_BUFSZ),
REG_ADRENO_DEFINE(REG_ADRENO_CP_TIMESTAMP, REG_AXXX_CP_SCRATCH_REG0),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_RADDR, REG_AXXX_CP_ME_RAM_RADDR),
REG_ADRENO_DEFINE(REG_ADRENO_SCRATCH_ADDR, REG_AXXX_SCRATCH_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_SCRATCH_UMSK, REG_AXXX_SCRATCH_UMSK),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ROQ_ADDR, REG_A3XX_CP_ROQ_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ROQ_DATA, REG_A3XX_CP_ROQ_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_ADDR, REG_A3XX_CP_MERCIU_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_DATA, REG_A3XX_CP_MERCIU_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_DATA2, REG_A3XX_CP_MERCIU_DATA2),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MEQ_ADDR, REG_A3XX_CP_MEQ_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MEQ_DATA, REG_A3XX_CP_MEQ_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_HW_FAULT, REG_A3XX_CP_HW_FAULT),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PROTECT_STATUS,
REG_A3XX_CP_PROTECT_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_STATUS, REG_A3XX_RBBM_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_CTL,
REG_A3XX_RBBM_PERFCTR_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_CMD0,
REG_A3XX_RBBM_PERFCTR_LOAD_CMD0),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_CMD1,
REG_A3XX_RBBM_PERFCTR_LOAD_CMD1),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_PWR_1_LO,
REG_A3XX_RBBM_PERFCTR_PWR_1_LO),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_0_MASK, REG_A3XX_RBBM_INT_0_MASK),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_0_STATUS,
REG_A3XX_RBBM_INT_0_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_ERROR_STATUS,
REG_A3XX_RBBM_AHB_ERROR_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_CMD, REG_A3XX_RBBM_AHB_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_CLEAR_CMD,
REG_A3XX_RBBM_INT_CLEAR_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_CLOCK_CTL, REG_A3XX_RBBM_CLOCK_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_VPC_DEBUG_RAM_SEL,
REG_A3XX_VPC_VPC_DEBUG_RAM_SEL),
REG_ADRENO_DEFINE(REG_ADRENO_VPC_DEBUG_RAM_READ,
REG_A3XX_VPC_VPC_DEBUG_RAM_READ),
REG_ADRENO_DEFINE(REG_ADRENO_VSC_SIZE_ADDRESS,
REG_A3XX_VSC_SIZE_ADDRESS),
REG_ADRENO_DEFINE(REG_ADRENO_VFD_CONTROL_0, REG_A3XX_VFD_CONTROL_0),
REG_ADRENO_DEFINE(REG_ADRENO_VFD_INDEX_MAX, REG_A3XX_VFD_INDEX_MAX),
REG_ADRENO_DEFINE(REG_ADRENO_SP_VS_PVT_MEM_ADDR_REG,
REG_A3XX_SP_VS_PVT_MEM_ADDR_REG),
REG_ADRENO_DEFINE(REG_ADRENO_SP_FS_PVT_MEM_ADDR_REG,
REG_A3XX_SP_FS_PVT_MEM_ADDR_REG),
REG_ADRENO_DEFINE(REG_ADRENO_SP_VS_OBJ_START_REG,
REG_A3XX_SP_VS_OBJ_START_REG),
REG_ADRENO_DEFINE(REG_ADRENO_SP_FS_OBJ_START_REG,
REG_A3XX_SP_FS_OBJ_START_REG),
REG_ADRENO_DEFINE(REG_ADRENO_PA_SC_AA_CONFIG, REG_A3XX_PA_SC_AA_CONFIG),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PM_OVERRIDE2,
REG_A3XX_RBBM_PM_OVERRIDE2),
REG_ADRENO_DEFINE(REG_ADRENO_SCRATCH_REG2, REG_AXXX_CP_SCRATCH_REG2),
REG_ADRENO_DEFINE(REG_ADRENO_SQ_GPR_MANAGEMENT,
REG_A3XX_SQ_GPR_MANAGEMENT),
REG_ADRENO_DEFINE(REG_ADRENO_SQ_INST_STORE_MANAGMENT,
REG_A3XX_SQ_INST_STORE_MANAGMENT),
REG_ADRENO_DEFINE(REG_ADRENO_TP0_CHICKEN, REG_A3XX_TP0_CHICKEN),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_RBBM_CTL, REG_A3XX_RBBM_RBBM_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_SW_RESET_CMD,
REG_A3XX_RBBM_SW_RESET_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_UCHE_INVALIDATE0,
REG_A3XX_UCHE_CACHE_INVALIDATE0_REG),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_LO,
REG_A3XX_RBBM_PERFCTR_LOAD_VALUE_LO),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_HI,
REG_A3XX_RBBM_PERFCTR_LOAD_VALUE_HI),
};
static const struct adreno_gpu_funcs funcs = {
@ -583,7 +515,7 @@ struct msm_gpu *a3xx_gpu_init(struct drm_device *dev)
#endif
}
if (!gpu->mmu) {
if (!gpu->aspace) {
/* TODO we think it is possible to configure the GPU to
* restrict access to VRAM carveout. But the required
* registers are unknown. For now just bail out and


@ -8,13 +8,14 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 431 bytes, from 2016-04-26 17:56:44)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109916 bytes, from 2016-02-20 18:44:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32907 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 12025 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 22544 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83840 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110765 bytes, from 2016-11-26 23:01:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml ( 90321 bytes, from 2016-11-28 16:50:05)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2016 by the following authors:
@ -46,6 +47,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
enum a4xx_color_fmt {
RB4_A8_UNORM = 1,
RB4_R8_UNORM = 2,
RB4_R8_SNORM = 3,
RB4_R8_UINT = 4,
RB4_R8_SINT = 5,
RB4_R4G4B4A4_UNORM = 8,
RB4_R5G5B5A1_UNORM = 10,
RB4_R5G6B5_UNORM = 14,
@ -89,17 +93,10 @@ enum a4xx_color_fmt {
enum a4xx_tile_mode {
TILE4_LINEAR = 0,
TILE4_2 = 2,
TILE4_3 = 3,
};
enum a4xx_rb_blend_opcode {
BLEND_DST_PLUS_SRC = 0,
BLEND_SRC_MINUS_DST = 1,
BLEND_DST_MINUS_SRC = 2,
BLEND_MIN_DST_SRC = 3,
BLEND_MAX_DST_SRC = 4,
};
enum a4xx_vtx_fmt {
VFMT4_32_FLOAT = 1,
VFMT4_32_32_FLOAT = 2,
@ -940,6 +937,7 @@ static inline uint32_t A4XX_RB_MODE_CONTROL_HEIGHT(uint32_t val)
{
return ((val >> 5) << A4XX_RB_MODE_CONTROL_HEIGHT__SHIFT) & A4XX_RB_MODE_CONTROL_HEIGHT__MASK;
}
#define A4XX_RB_MODE_CONTROL_ENABLE_GMEM 0x00010000
#define REG_A4XX_RB_RENDER_CONTROL 0x000020a1
#define A4XX_RB_RENDER_CONTROL_BINNING_PASS 0x00000001
@ -1043,7 +1041,7 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_RGB_SRC_FACTOR(enum adreno_rb_b
}
#define A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE__MASK 0x000000e0
#define A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE__SHIFT 5
static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(enum a4xx_rb_blend_opcode val)
static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE(enum a3xx_rb_blend_opcode val)
{
return ((val) << A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_RGB_BLEND_OPCODE__MASK;
}
@ -1061,7 +1059,7 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_SRC_FACTOR(enum adreno_rb
}
#define A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE__MASK 0x00e00000
#define A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE__SHIFT 21
static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(enum a4xx_rb_blend_opcode val)
static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE(enum a3xx_rb_blend_opcode val)
{
return ((val) << A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE__SHIFT) & A4XX_RB_MRT_BLEND_CONTROL_ALPHA_BLEND_OPCODE__MASK;
}
@ -1073,12 +1071,18 @@ static inline uint32_t A4XX_RB_MRT_BLEND_CONTROL_ALPHA_DEST_FACTOR(enum adreno_r
}
#define REG_A4XX_RB_BLEND_RED 0x000020f0
#define A4XX_RB_BLEND_RED_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_RED_UINT__MASK 0x000000ff
#define A4XX_RB_BLEND_RED_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_RED_UINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_RED_UINT__SHIFT) & A4XX_RB_BLEND_RED_UINT__MASK;
}
#define A4XX_RB_BLEND_RED_SINT__MASK 0x0000ff00
#define A4XX_RB_BLEND_RED_SINT__SHIFT 8
static inline uint32_t A4XX_RB_BLEND_RED_SINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_RED_SINT__SHIFT) & A4XX_RB_BLEND_RED_SINT__MASK;
}
#define A4XX_RB_BLEND_RED_FLOAT__MASK 0xffff0000
#define A4XX_RB_BLEND_RED_FLOAT__SHIFT 16
static inline uint32_t A4XX_RB_BLEND_RED_FLOAT(float val)
@ -1095,12 +1099,18 @@ static inline uint32_t A4XX_RB_BLEND_RED_F32(float val)
}
#define REG_A4XX_RB_BLEND_GREEN 0x000020f2
#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_GREEN_UINT__MASK 0x000000ff
#define A4XX_RB_BLEND_GREEN_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_GREEN_UINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_GREEN_UINT__SHIFT) & A4XX_RB_BLEND_GREEN_UINT__MASK;
}
#define A4XX_RB_BLEND_GREEN_SINT__MASK 0x0000ff00
#define A4XX_RB_BLEND_GREEN_SINT__SHIFT 8
static inline uint32_t A4XX_RB_BLEND_GREEN_SINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_GREEN_SINT__SHIFT) & A4XX_RB_BLEND_GREEN_SINT__MASK;
}
#define A4XX_RB_BLEND_GREEN_FLOAT__MASK 0xffff0000
#define A4XX_RB_BLEND_GREEN_FLOAT__SHIFT 16
static inline uint32_t A4XX_RB_BLEND_GREEN_FLOAT(float val)
@ -1117,12 +1127,18 @@ static inline uint32_t A4XX_RB_BLEND_GREEN_F32(float val)
}
#define REG_A4XX_RB_BLEND_BLUE 0x000020f4
#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_BLUE_UINT__MASK 0x000000ff
#define A4XX_RB_BLEND_BLUE_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_BLUE_UINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_BLUE_UINT__SHIFT) & A4XX_RB_BLEND_BLUE_UINT__MASK;
}
#define A4XX_RB_BLEND_BLUE_SINT__MASK 0x0000ff00
#define A4XX_RB_BLEND_BLUE_SINT__SHIFT 8
static inline uint32_t A4XX_RB_BLEND_BLUE_SINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_BLUE_SINT__SHIFT) & A4XX_RB_BLEND_BLUE_SINT__MASK;
}
#define A4XX_RB_BLEND_BLUE_FLOAT__MASK 0xffff0000
#define A4XX_RB_BLEND_BLUE_FLOAT__SHIFT 16
static inline uint32_t A4XX_RB_BLEND_BLUE_FLOAT(float val)
@ -1139,12 +1155,18 @@ static inline uint32_t A4XX_RB_BLEND_BLUE_F32(float val)
}
#define REG_A4XX_RB_BLEND_ALPHA 0x000020f6
#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x0000ffff
#define A4XX_RB_BLEND_ALPHA_UINT__MASK 0x000000ff
#define A4XX_RB_BLEND_ALPHA_UINT__SHIFT 0
static inline uint32_t A4XX_RB_BLEND_ALPHA_UINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_ALPHA_UINT__SHIFT) & A4XX_RB_BLEND_ALPHA_UINT__MASK;
}
#define A4XX_RB_BLEND_ALPHA_SINT__MASK 0x0000ff00
#define A4XX_RB_BLEND_ALPHA_SINT__SHIFT 8
static inline uint32_t A4XX_RB_BLEND_ALPHA_SINT(uint32_t val)
{
return ((val) << A4XX_RB_BLEND_ALPHA_SINT__SHIFT) & A4XX_RB_BLEND_ALPHA_SINT__MASK;
}
#define A4XX_RB_BLEND_ALPHA_FLOAT__MASK 0xffff0000
#define A4XX_RB_BLEND_ALPHA_FLOAT__SHIFT 16
static inline uint32_t A4XX_RB_BLEND_ALPHA_FLOAT(float val)
@ -1348,7 +1370,7 @@ static inline uint32_t A4XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
{
return ((val) << A4XX_RB_DEPTH_CONTROL_ZFUNC__SHIFT) & A4XX_RB_DEPTH_CONTROL_ZFUNC__MASK;
}
#define A4XX_RB_DEPTH_CONTROL_BF_ENABLE 0x00000080
#define A4XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE 0x00000080
#define A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE 0x00010000
#define A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS 0x00020000
#define A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE 0x80000000
@ -2177,11 +2199,23 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_RB_MARB_CCU_L1_REG(uint32_t i0)
#define REG_A4XX_CP_DRAW_STATE_ADDR 0x00000232
#define REG_A4XX_CP_PROTECT_REG_0 0x00000240
static inline uint32_t REG_A4XX_CP_PROTECT(uint32_t i0) { return 0x00000240 + 0x1*i0; }
static inline uint32_t REG_A4XX_CP_PROTECT_REG(uint32_t i0) { return 0x00000240 + 0x1*i0; }
#define A4XX_CP_PROTECT_REG_BASE_ADDR__MASK 0x0001ffff
#define A4XX_CP_PROTECT_REG_BASE_ADDR__SHIFT 0
static inline uint32_t A4XX_CP_PROTECT_REG_BASE_ADDR(uint32_t val)
{
return ((val) << A4XX_CP_PROTECT_REG_BASE_ADDR__SHIFT) & A4XX_CP_PROTECT_REG_BASE_ADDR__MASK;
}
#define A4XX_CP_PROTECT_REG_MASK_LEN__MASK 0x1f000000
#define A4XX_CP_PROTECT_REG_MASK_LEN__SHIFT 24
static inline uint32_t A4XX_CP_PROTECT_REG_MASK_LEN(uint32_t val)
{
return ((val) << A4XX_CP_PROTECT_REG_MASK_LEN__SHIFT) & A4XX_CP_PROTECT_REG_MASK_LEN__MASK;
}
#define A4XX_CP_PROTECT_REG_TRAP_WRITE 0x20000000
#define A4XX_CP_PROTECT_REG_TRAP_READ 0x40000000
#define REG_A4XX_CP_PROTECT_CTRL 0x00000250
@ -2272,7 +2306,7 @@ static inline uint32_t A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
{
return ((val) << A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A4XX_SP_VS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
}
#define A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0003fc00
#define A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0000fc00
#define A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT 10
static inline uint32_t A4XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
{
@ -2420,7 +2454,7 @@ static inline uint32_t A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(uint32_t val)
{
return ((val) << A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__SHIFT) & A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT__MASK;
}
#define A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0003fc00
#define A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__MASK 0x0000fc00
#define A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT__SHIFT 10
static inline uint32_t A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(uint32_t val)
{
@ -3117,6 +3151,8 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
#define REG_A4XX_GRAS_CL_CLIP_CNTL 0x00002000
#define A4XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE 0x00008000
#define A4XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE 0x00010000
#define A4XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE 0x00020000
#define A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z 0x00400000
#define REG_A4XX_GRAS_CLEAR_CNTL 0x00002003
@ -3253,6 +3289,7 @@ static inline uint32_t A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH(float val)
return ((((int32_t)(val * 4.0))) << A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__SHIFT) & A4XX_GRAS_SU_MODE_CONTROL_LINEHALFWIDTH__MASK;
}
#define A4XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET 0x00000800
#define A4XX_GRAS_SU_MODE_CONTROL_MSAA_ENABLE 0x00002000
#define A4XX_GRAS_SU_MODE_CONTROL_RENDERING_PASS 0x00100000
#define REG_A4XX_GRAS_SC_CONTROL 0x0000207b
@ -3670,6 +3707,8 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
#define REG_A4XX_PC_BINNING_COMMAND 0x00000d00
#define A4XX_PC_BINNING_COMMAND_BINNING_ENABLE 0x00000001
#define REG_A4XX_PC_TESSFACTOR_ADDR 0x00000d08
#define REG_A4XX_PC_DRAWCALL_SETUP_OVERRIDE 0x00000d0c
#define REG_A4XX_PC_PERFCTR_PC_SEL_0 0x00000d10
@ -3690,6 +3729,20 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val)
#define REG_A4XX_PC_BIN_BASE 0x000021c0
#define REG_A4XX_PC_VSTREAM_CONTROL 0x000021c2
#define A4XX_PC_VSTREAM_CONTROL_SIZE__MASK 0x003f0000
#define A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT 16
static inline uint32_t A4XX_PC_VSTREAM_CONTROL_SIZE(uint32_t val)
{
return ((val) << A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT) & A4XX_PC_VSTREAM_CONTROL_SIZE__MASK;
}
#define A4XX_PC_VSTREAM_CONTROL_N__MASK 0x07c00000
#define A4XX_PC_VSTREAM_CONTROL_N__SHIFT 22
static inline uint32_t A4XX_PC_VSTREAM_CONTROL_N(uint32_t val)
{
return ((val) << A4XX_PC_VSTREAM_CONTROL_N__SHIFT) & A4XX_PC_VSTREAM_CONTROL_N__MASK;
}
#define REG_A4XX_PC_PRIM_VTX_CNTL 0x000021c4
#define A4XX_PC_PRIM_VTX_CNTL_VAROUT__MASK 0x0000000f
#define A4XX_PC_PRIM_VTX_CNTL_VAROUT__SHIFT 0
@ -3752,12 +3805,8 @@ static inline uint32_t A4XX_PC_HS_PARAM_SPACING(enum a4xx_tess_spacing val)
{
return ((val) << A4XX_PC_HS_PARAM_SPACING__SHIFT) & A4XX_PC_HS_PARAM_SPACING__MASK;
}
#define A4XX_PC_HS_PARAM_PRIMTYPE__MASK 0x01800000
#define A4XX_PC_HS_PARAM_PRIMTYPE__SHIFT 23
static inline uint32_t A4XX_PC_HS_PARAM_PRIMTYPE(enum adreno_pa_su_sc_draw val)
{
return ((val) << A4XX_PC_HS_PARAM_PRIMTYPE__SHIFT) & A4XX_PC_HS_PARAM_PRIMTYPE__MASK;
}
#define A4XX_PC_HS_PARAM_CW 0x00800000
#define A4XX_PC_HS_PARAM_CONNECTED 0x01000000
#define REG_A4XX_VBIF_VERSION 0x00003000


@ -113,7 +113,7 @@ static void a4xx_enable_hwcg(struct msm_gpu *gpu)
}
static void a4xx_me_init(struct msm_gpu *gpu)
static bool a4xx_me_init(struct msm_gpu *gpu)
{
struct msm_ringbuffer *ring = gpu->rb;
@ -137,7 +137,7 @@ static void a4xx_me_init(struct msm_gpu *gpu)
OUT_RING(ring, 0x00000000);
gpu->funcs->flush(gpu);
gpu->funcs->idle(gpu);
return gpu->funcs->idle(gpu);
}
static int a4xx_hw_init(struct msm_gpu *gpu)
@ -292,15 +292,20 @@ static int a4xx_hw_init(struct msm_gpu *gpu)
/* clear ME_HALT to start micro engine */
gpu_write(gpu, REG_A4XX_CP_ME_CNTL, 0);
a4xx_me_init(gpu);
return 0;
return a4xx_me_init(gpu) ? 0 : -EINVAL;
}
static void a4xx_recover(struct msm_gpu *gpu)
{
int i;
adreno_dump_info(gpu);
for (i = 0; i < 8; i++) {
printk("CP_SCRATCH_REG%d: %u\n", i,
gpu_read(gpu, REG_AXXX_CP_SCRATCH_REG0 + i));
}
/* dump registers before resetting gpu, if enabled: */
if (hang_debug)
a4xx_dump(gpu);
@ -328,17 +333,21 @@ static void a4xx_destroy(struct msm_gpu *gpu)
kfree(a4xx_gpu);
}
static void a4xx_idle(struct msm_gpu *gpu)
static bool a4xx_idle(struct msm_gpu *gpu)
{
/* wait for ringbuffer to drain: */
adreno_idle(gpu);
if (!adreno_idle(gpu))
return false;
/* then wait for GPU to finish: */
if (spin_until(!(gpu_read(gpu, REG_A4XX_RBBM_STATUS) &
A4XX_RBBM_STATUS_GPU_BUSY)))
A4XX_RBBM_STATUS_GPU_BUSY))) {
DRM_ERROR("%s: timeout waiting for GPU to idle!\n", gpu->name);
/* TODO maybe we need to reset GPU here to recover from hang? */
return false;
}
/* TODO maybe we need to reset GPU here to recover from hang? */
return true;
}
static irqreturn_t a4xx_irq(struct msm_gpu *gpu)
@ -460,87 +469,13 @@ static void a4xx_show(struct msm_gpu *gpu, struct seq_file *m)
/* Register offset defines for A4XX, in order of enum adreno_regs */
static const unsigned int a4xx_register_offsets[REG_ADRENO_REGISTER_MAX] = {
REG_ADRENO_DEFINE(REG_ADRENO_CP_DEBUG, REG_A4XX_CP_DEBUG),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_WADDR, REG_A4XX_CP_ME_RAM_WADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_DATA, REG_A4XX_CP_ME_RAM_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PFP_UCODE_DATA,
REG_A4XX_CP_PFP_UCODE_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PFP_UCODE_ADDR,
REG_A4XX_CP_PFP_UCODE_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_WFI_PEND_CTR, REG_A4XX_CP_WFI_PEND_CTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_BASE, REG_A4XX_CP_RB_BASE),
REG_ADRENO_SKIP(REG_ADRENO_CP_RB_BASE_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR_ADDR, REG_A4XX_CP_RB_RPTR_ADDR),
REG_ADRENO_SKIP(REG_ADRENO_CP_RB_RPTR_ADDR_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR, REG_A4XX_CP_RB_RPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_WPTR, REG_A4XX_CP_RB_WPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PROTECT_CTRL, REG_A4XX_CP_PROTECT_CTRL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_CNTL, REG_A4XX_CP_ME_CNTL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_CNTL, REG_A4XX_CP_RB_CNTL),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB1_BASE, REG_A4XX_CP_IB1_BASE),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB1_BUFSZ, REG_A4XX_CP_IB1_BUFSZ),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB2_BASE, REG_A4XX_CP_IB2_BASE),
REG_ADRENO_DEFINE(REG_ADRENO_CP_IB2_BUFSZ, REG_A4XX_CP_IB2_BUFSZ),
REG_ADRENO_DEFINE(REG_ADRENO_CP_TIMESTAMP, REG_AXXX_CP_SCRATCH_REG0),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ME_RAM_RADDR, REG_A4XX_CP_ME_RAM_RADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ROQ_ADDR, REG_A4XX_CP_ROQ_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_ROQ_DATA, REG_A4XX_CP_ROQ_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_ADDR, REG_A4XX_CP_MERCIU_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_DATA, REG_A4XX_CP_MERCIU_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MERCIU_DATA2, REG_A4XX_CP_MERCIU_DATA2),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MEQ_ADDR, REG_A4XX_CP_MEQ_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_MEQ_DATA, REG_A4XX_CP_MEQ_DATA),
REG_ADRENO_DEFINE(REG_ADRENO_CP_HW_FAULT, REG_A4XX_CP_HW_FAULT),
REG_ADRENO_DEFINE(REG_ADRENO_CP_PROTECT_STATUS,
REG_A4XX_CP_PROTECT_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_SCRATCH_ADDR, REG_A4XX_CP_SCRATCH_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_SCRATCH_UMSK, REG_A4XX_CP_SCRATCH_UMASK),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_STATUS, REG_A4XX_RBBM_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_CTL,
REG_A4XX_RBBM_PERFCTR_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_CMD0,
REG_A4XX_RBBM_PERFCTR_LOAD_CMD0),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_CMD1,
REG_A4XX_RBBM_PERFCTR_LOAD_CMD1),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_CMD2,
REG_A4XX_RBBM_PERFCTR_LOAD_CMD2),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_PWR_1_LO,
REG_A4XX_RBBM_PERFCTR_PWR_1_LO),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_0_MASK, REG_A4XX_RBBM_INT_0_MASK),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_0_STATUS,
REG_A4XX_RBBM_INT_0_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_ERROR_STATUS,
REG_A4XX_RBBM_AHB_ERROR_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_CMD, REG_A4XX_RBBM_AHB_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_CLOCK_CTL, REG_A4XX_RBBM_CLOCK_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_ME_SPLIT_STATUS,
REG_A4XX_RBBM_AHB_ME_SPLIT_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_AHB_PFP_SPLIT_STATUS,
REG_A4XX_RBBM_AHB_PFP_SPLIT_STATUS),
REG_ADRENO_DEFINE(REG_ADRENO_VPC_DEBUG_RAM_SEL,
REG_A4XX_VPC_DEBUG_RAM_SEL),
REG_ADRENO_DEFINE(REG_ADRENO_VPC_DEBUG_RAM_READ,
REG_A4XX_VPC_DEBUG_RAM_READ),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_INT_CLEAR_CMD,
REG_A4XX_RBBM_INT_CLEAR_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_VSC_SIZE_ADDRESS,
REG_A4XX_VSC_SIZE_ADDRESS),
REG_ADRENO_DEFINE(REG_ADRENO_VFD_CONTROL_0, REG_A4XX_VFD_CONTROL_0),
REG_ADRENO_DEFINE(REG_ADRENO_SP_VS_PVT_MEM_ADDR_REG,
REG_A4XX_SP_VS_PVT_MEM_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_SP_FS_PVT_MEM_ADDR_REG,
REG_A4XX_SP_FS_PVT_MEM_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_SP_VS_OBJ_START_REG,
REG_A4XX_SP_VS_OBJ_START),
REG_ADRENO_DEFINE(REG_ADRENO_SP_FS_OBJ_START_REG,
REG_A4XX_SP_FS_OBJ_START),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_RBBM_CTL, REG_A4XX_RBBM_RBBM_CTL),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_SW_RESET_CMD,
REG_A4XX_RBBM_SW_RESET_CMD),
REG_ADRENO_DEFINE(REG_ADRENO_UCHE_INVALIDATE0,
REG_A4XX_UCHE_INVALIDATE0),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_LO,
REG_A4XX_RBBM_PERFCTR_LOAD_VALUE_LO),
REG_ADRENO_DEFINE(REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_HI,
REG_A4XX_RBBM_PERFCTR_LOAD_VALUE_HI),
};
static void a4xx_dump(struct msm_gpu *gpu)
@ -587,16 +522,8 @@ static int a4xx_pm_suspend(struct msm_gpu *gpu) {
static int a4xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value)
{
uint32_t hi, lo, tmp;
tmp = gpu_read(gpu, REG_A4XX_RBBM_PERFCTR_CP_0_HI);
do {
hi = tmp;
lo = gpu_read(gpu, REG_A4XX_RBBM_PERFCTR_CP_0_LO);
tmp = gpu_read(gpu, REG_A4XX_RBBM_PERFCTR_CP_0_HI);
} while (tmp != hi);
*value = (((uint64_t)hi) << 32) | lo;
*value = gpu_read64(gpu, REG_A4XX_RBBM_PERFCTR_CP_0_LO,
REG_A4XX_RBBM_PERFCTR_CP_0_HI);
return 0;
}
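
The open-coded loop dropped above is the usual tear-free read of a
64-bit counter exposed as two 32-bit registers: read HI, read LO, read
HI again, and retry if HI changed in between.  The new gpu_read64()
helper presumably wraps the same pattern.  A self-contained sketch, with
a fake counter standing in for the MMIO reads:

#include <stdint.h>
#include <stdio.h>

/* Fake 64-bit hardware counter, exposed as two 32-bit "registers".
 * In the driver these reads would be gpu_read() MMIO accesses. */
static uint64_t fake_counter = 0x00000001ffffffffULL;
enum { REG_CTR_LO, REG_CTR_HI };

static uint32_t reg_read(int reg)
{
    return reg == REG_CTR_LO ? (uint32_t)fake_counter
                             : (uint32_t)(fake_counter >> 32);
}

/* Tear-free read of a LO/HI counter pair: if the high word changes while
 * the low word is being read, the two HI reads disagree, so retry. */
static uint64_t read_counter64(int lo_reg, int hi_reg)
{
    uint32_t hi, lo, tmp;

    tmp = reg_read(hi_reg);
    do {
        hi = tmp;
        lo = reg_read(lo_reg);
        tmp = reg_read(hi_reg);
    } while (tmp != hi);

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    printf("counter = 0x%llx\n",
           (unsigned long long)read_counter64(REG_CTR_LO, REG_CTR_HI));
    return 0;
}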
@ -672,7 +599,7 @@ struct msm_gpu *a4xx_gpu_init(struct drm_device *dev)
#endif
}
if (!gpu->mmu) {
if (!gpu->aspace) {
/* TODO we think it is possible to configure the GPU to
* restrict access to VRAM carveout. But the required
* registers are unknown. For now just bail out and

File diff suppressed because it is too large


@ -0,0 +1,888 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include "msm_gem.h"
#include "a5xx_gpu.h"
extern bool hang_debug;
static void a5xx_dump(struct msm_gpu *gpu);
static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct msm_drm_private *priv = gpu->dev->dev_private;
struct msm_ringbuffer *ring = gpu->rb;
unsigned int i, ibs = 0;
for (i = 0; i < submit->nr_cmds; i++) {
switch (submit->cmd[i].type) {
case MSM_SUBMIT_CMD_IB_TARGET_BUF:
break;
case MSM_SUBMIT_CMD_CTX_RESTORE_BUF:
if (priv->lastctx == ctx)
break;
case MSM_SUBMIT_CMD_BUF:
OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3);
OUT_RING(ring, lower_32_bits(submit->cmd[i].iova));
OUT_RING(ring, upper_32_bits(submit->cmd[i].iova));
OUT_RING(ring, submit->cmd[i].size);
ibs++;
break;
}
}
OUT_PKT4(ring, REG_A5XX_CP_SCRATCH_REG(2), 1);
OUT_RING(ring, submit->fence->seqno);
OUT_PKT7(ring, CP_EVENT_WRITE, 4);
OUT_RING(ring, CACHE_FLUSH_TS | (1 << 31));
OUT_RING(ring, lower_32_bits(rbmemptr(adreno_gpu, fence)));
OUT_RING(ring, upper_32_bits(rbmemptr(adreno_gpu, fence)));
OUT_RING(ring, submit->fence->seqno);
gpu->funcs->flush(gpu);
}
struct a5xx_hwcg {
u32 offset;
u32 value;
};
static const struct a5xx_hwcg a530_hwcg[] = {
{REG_A5XX_RBBM_CLOCK_CNTL_SP0, 0x02222222},
{REG_A5XX_RBBM_CLOCK_CNTL_SP1, 0x02222222},
{REG_A5XX_RBBM_CLOCK_CNTL_SP2, 0x02222222},
{REG_A5XX_RBBM_CLOCK_CNTL_SP3, 0x02222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_SP0, 0x02222220},
{REG_A5XX_RBBM_CLOCK_CNTL2_SP1, 0x02222220},
{REG_A5XX_RBBM_CLOCK_CNTL2_SP2, 0x02222220},
{REG_A5XX_RBBM_CLOCK_CNTL2_SP3, 0x02222220},
{REG_A5XX_RBBM_CLOCK_HYST_SP0, 0x0000F3CF},
{REG_A5XX_RBBM_CLOCK_HYST_SP1, 0x0000F3CF},
{REG_A5XX_RBBM_CLOCK_HYST_SP2, 0x0000F3CF},
{REG_A5XX_RBBM_CLOCK_HYST_SP3, 0x0000F3CF},
{REG_A5XX_RBBM_CLOCK_DELAY_SP0, 0x00000080},
{REG_A5XX_RBBM_CLOCK_DELAY_SP1, 0x00000080},
{REG_A5XX_RBBM_CLOCK_DELAY_SP2, 0x00000080},
{REG_A5XX_RBBM_CLOCK_DELAY_SP3, 0x00000080},
{REG_A5XX_RBBM_CLOCK_CNTL_TP0, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_TP1, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_TP2, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_TP3, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_TP0, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_TP1, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_TP2, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_TP3, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL3_TP0, 0x00002222},
{REG_A5XX_RBBM_CLOCK_CNTL3_TP1, 0x00002222},
{REG_A5XX_RBBM_CLOCK_CNTL3_TP2, 0x00002222},
{REG_A5XX_RBBM_CLOCK_CNTL3_TP3, 0x00002222},
{REG_A5XX_RBBM_CLOCK_HYST_TP0, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST_TP1, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST_TP2, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST_TP3, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST2_TP0, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST2_TP1, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST2_TP2, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST2_TP3, 0x77777777},
{REG_A5XX_RBBM_CLOCK_HYST3_TP0, 0x00007777},
{REG_A5XX_RBBM_CLOCK_HYST3_TP1, 0x00007777},
{REG_A5XX_RBBM_CLOCK_HYST3_TP2, 0x00007777},
{REG_A5XX_RBBM_CLOCK_HYST3_TP3, 0x00007777},
{REG_A5XX_RBBM_CLOCK_DELAY_TP0, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY_TP1, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY_TP2, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY_TP3, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY2_TP0, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY2_TP1, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY2_TP2, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY2_TP3, 0x11111111},
{REG_A5XX_RBBM_CLOCK_DELAY3_TP0, 0x00001111},
{REG_A5XX_RBBM_CLOCK_DELAY3_TP1, 0x00001111},
{REG_A5XX_RBBM_CLOCK_DELAY3_TP2, 0x00001111},
{REG_A5XX_RBBM_CLOCK_DELAY3_TP3, 0x00001111},
{REG_A5XX_RBBM_CLOCK_CNTL_UCHE, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_UCHE, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL3_UCHE, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL4_UCHE, 0x00222222},
{REG_A5XX_RBBM_CLOCK_HYST_UCHE, 0x00444444},
{REG_A5XX_RBBM_CLOCK_DELAY_UCHE, 0x00000002},
{REG_A5XX_RBBM_CLOCK_CNTL_RB0, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_RB1, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_RB2, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL_RB3, 0x22222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_RB0, 0x00222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_RB1, 0x00222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_RB2, 0x00222222},
{REG_A5XX_RBBM_CLOCK_CNTL2_RB3, 0x00222222},
{REG_A5XX_RBBM_CLOCK_CNTL_CCU0, 0x00022220},
{REG_A5XX_RBBM_CLOCK_CNTL_CCU1, 0x00022220},
{REG_A5XX_RBBM_CLOCK_CNTL_CCU2, 0x00022220},
{REG_A5XX_RBBM_CLOCK_CNTL_CCU3, 0x00022220},
{REG_A5XX_RBBM_CLOCK_CNTL_RAC, 0x05522222},
{REG_A5XX_RBBM_CLOCK_CNTL2_RAC, 0x00505555},
{REG_A5XX_RBBM_CLOCK_HYST_RB_CCU0, 0x04040404},
{REG_A5XX_RBBM_CLOCK_HYST_RB_CCU1, 0x04040404},
{REG_A5XX_RBBM_CLOCK_HYST_RB_CCU2, 0x04040404},
{REG_A5XX_RBBM_CLOCK_HYST_RB_CCU3, 0x04040404},
{REG_A5XX_RBBM_CLOCK_HYST_RAC, 0x07444044},
{REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_0, 0x00000002},
{REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_1, 0x00000002},
{REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_2, 0x00000002},
{REG_A5XX_RBBM_CLOCK_DELAY_RB_CCU_L1_3, 0x00000002},
{REG_A5XX_RBBM_CLOCK_DELAY_RAC, 0x00010011},
{REG_A5XX_RBBM_CLOCK_CNTL_TSE_RAS_RBBM, 0x04222222},
{REG_A5XX_RBBM_CLOCK_MODE_GPC, 0x02222222},
{REG_A5XX_RBBM_CLOCK_MODE_VFD, 0x00002222},
{REG_A5XX_RBBM_CLOCK_HYST_TSE_RAS_RBBM, 0x00000000},
{REG_A5XX_RBBM_CLOCK_HYST_GPC, 0x04104004},
{REG_A5XX_RBBM_CLOCK_HYST_VFD, 0x00000000},
{REG_A5XX_RBBM_CLOCK_DELAY_HLSQ, 0x00000000},
{REG_A5XX_RBBM_CLOCK_DELAY_TSE_RAS_RBBM, 0x00004000},
{REG_A5XX_RBBM_CLOCK_DELAY_GPC, 0x00000200},
{REG_A5XX_RBBM_CLOCK_DELAY_VFD, 0x00002222}
};
static const struct {
int (*test)(struct adreno_gpu *gpu);
const struct a5xx_hwcg *regs;
unsigned int count;
} a5xx_hwcg_regs[] = {
{ adreno_is_a530, a530_hwcg, ARRAY_SIZE(a530_hwcg), },
};
static void _a5xx_enable_hwcg(struct msm_gpu *gpu,
const struct a5xx_hwcg *regs, unsigned int count)
{
unsigned int i;
for (i = 0; i < count; i++)
gpu_write(gpu, regs[i].offset, regs[i].value);
gpu_write(gpu, REG_A5XX_RBBM_CLOCK_CNTL, 0xAAA8AA00);
gpu_write(gpu, REG_A5XX_RBBM_ISDB_CNT, 0x182);
}
static void a5xx_enable_hwcg(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
unsigned int i;
for (i = 0; i < ARRAY_SIZE(a5xx_hwcg_regs); i++) {
if (a5xx_hwcg_regs[i].test(adreno_gpu)) {
_a5xx_enable_hwcg(gpu, a5xx_hwcg_regs[i].regs,
a5xx_hwcg_regs[i].count);
return;
}
}
}
static int a5xx_me_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct msm_ringbuffer *ring = gpu->rb;
OUT_PKT7(ring, CP_ME_INIT, 8);
OUT_RING(ring, 0x0000002F);
/* Enable multiple hardware contexts */
OUT_RING(ring, 0x00000003);
/* Enable error detection */
OUT_RING(ring, 0x20000000);
/* Don't enable header dump */
OUT_RING(ring, 0x00000000);
OUT_RING(ring, 0x00000000);
/* Specify workarounds for various microcode issues */
if (adreno_is_a530(adreno_gpu)) {
/* Workaround for token end syncs
* Force a WFI after every direct-render 3D mode draw and every
* 2D mode 3 draw
*/
OUT_RING(ring, 0x0000000B);
} else {
/* No workarounds enabled */
OUT_RING(ring, 0x00000000);
}
OUT_RING(ring, 0x00000000);
OUT_RING(ring, 0x00000000);
gpu->funcs->flush(gpu);
return gpu->funcs->idle(gpu) ? 0 : -EINVAL;
}
static struct drm_gem_object *a5xx_ucode_load_bo(struct msm_gpu *gpu,
const struct firmware *fw, u64 *iova)
{
struct drm_device *drm = gpu->dev;
struct drm_gem_object *bo;
void *ptr;
mutex_lock(&drm->struct_mutex);
bo = msm_gem_new(drm, fw->size - 4, MSM_BO_UNCACHED);
mutex_unlock(&drm->struct_mutex);
if (IS_ERR(bo))
return bo;
ptr = msm_gem_get_vaddr(bo);
if (!ptr) {
drm_gem_object_unreference_unlocked(bo);
return ERR_PTR(-ENOMEM);
}
if (iova) {
int ret = msm_gem_get_iova(bo, gpu->id, iova);
if (ret) {
drm_gem_object_unreference_unlocked(bo);
return ERR_PTR(ret);
}
}
memcpy(ptr, &fw->data[4], fw->size - 4);
msm_gem_put_vaddr(bo);
return bo;
}
static int a5xx_ucode_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
int ret;
if (!a5xx_gpu->pm4_bo) {
a5xx_gpu->pm4_bo = a5xx_ucode_load_bo(gpu, adreno_gpu->pm4,
&a5xx_gpu->pm4_iova);
if (IS_ERR(a5xx_gpu->pm4_bo)) {
ret = PTR_ERR(a5xx_gpu->pm4_bo);
a5xx_gpu->pm4_bo = NULL;
dev_err(gpu->dev->dev, "could not allocate PM4: %d\n",
ret);
return ret;
}
}
if (!a5xx_gpu->pfp_bo) {
a5xx_gpu->pfp_bo = a5xx_ucode_load_bo(gpu, adreno_gpu->pfp,
&a5xx_gpu->pfp_iova);
if (IS_ERR(a5xx_gpu->pfp_bo)) {
ret = PTR_ERR(a5xx_gpu->pfp_bo);
a5xx_gpu->pfp_bo = NULL;
dev_err(gpu->dev->dev, "could not allocate PFP: %d\n",
ret);
return ret;
}
}
gpu_write64(gpu, REG_A5XX_CP_ME_INSTR_BASE_LO,
REG_A5XX_CP_ME_INSTR_BASE_HI, a5xx_gpu->pm4_iova);
gpu_write64(gpu, REG_A5XX_CP_PFP_INSTR_BASE_LO,
REG_A5XX_CP_PFP_INSTR_BASE_HI, a5xx_gpu->pfp_iova);
return 0;
}
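
gpu_write64() above programs a 64-bit iova through a LO/HI register
pair.  The adreno_gpu_write64() implementation itself is not shown in
this diff, but such a helper presumably reduces to the sketch below
(the register numbers in main() are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* Stand-in for gpu_write(); in the driver this is an MMIO write. */
static void reg_write(unsigned int reg, uint32_t val)
{
    printf("reg 0x%04x <- 0x%08x\n", reg, val);
}

/* Split a 64-bit value across a pair of 32-bit LO/HI registers --
 * the assumed shape of an adreno_gpu_write64()-style helper. */
static void reg_write64(unsigned int lo_reg, unsigned int hi_reg, uint64_t val)
{
    reg_write(lo_reg, (uint32_t)(val & 0xffffffff));    /* lower_32_bits() */
    reg_write(hi_reg, (uint32_t)(val >> 32));           /* upper_32_bits() */
}

int main(void)
{
    reg_write64(0x0838, 0x0839, 0x0000000123450000ULL);
    return 0;
}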
#define A5XX_INT_MASK (A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR | \
A5XX_RBBM_INT_0_MASK_RBBM_TRANSFER_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ME_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_PFP_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ETS_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW | \
A5XX_RBBM_INT_0_MASK_CP_HW_ERROR | \
A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS | \
A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS | \
A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP)
static int a5xx_hw_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
int ret;
gpu_write(gpu, REG_A5XX_VBIF_ROUND_ROBIN_QOS_ARB, 0x00000003);
/* Make all blocks contribute to the GPU BUSY perf counter */
gpu_write(gpu, REG_A5XX_RBBM_PERFCTR_GPU_BUSY_MASKED, 0xFFFFFFFF);
/* Enable RBBM error reporting bits */
gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL0, 0x00000001);
if (adreno_gpu->quirks & ADRENO_QUIRK_FAULT_DETECT_MASK) {
/*
* Mask out the activity signals from RB1-3 to avoid false
* positives
*/
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL11,
0xF0000000);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL12,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL13,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL14,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL15,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL16,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL17,
0xFFFFFFFF);
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_MASK_CNTL18,
0xFFFFFFFF);
}
/* Enable fault detection */
gpu_write(gpu, REG_A5XX_RBBM_INTERFACE_HANG_INT_CNTL,
(1 << 30) | 0xFFFF);
/* Turn on performance counters */
gpu_write(gpu, REG_A5XX_RBBM_PERFCTR_CNTL, 0x01);
/* Increase VFD cache access so LRZ and other data gets evicted less */
gpu_write(gpu, REG_A5XX_UCHE_CACHE_WAYS, 0x02);
/* Disable L2 bypass in the UCHE */
gpu_write(gpu, REG_A5XX_UCHE_TRAP_BASE_LO, 0xFFFF0000);
gpu_write(gpu, REG_A5XX_UCHE_TRAP_BASE_HI, 0x0001FFFF);
gpu_write(gpu, REG_A5XX_UCHE_WRITE_THRU_BASE_LO, 0xFFFF0000);
gpu_write(gpu, REG_A5XX_UCHE_WRITE_THRU_BASE_HI, 0x0001FFFF);
/* Set the GMEM VA range (0 to gpu->gmem) */
gpu_write(gpu, REG_A5XX_UCHE_GMEM_RANGE_MIN_LO, 0x00100000);
gpu_write(gpu, REG_A5XX_UCHE_GMEM_RANGE_MIN_HI, 0x00000000);
gpu_write(gpu, REG_A5XX_UCHE_GMEM_RANGE_MAX_LO,
0x00100000 + adreno_gpu->gmem - 1);
gpu_write(gpu, REG_A5XX_UCHE_GMEM_RANGE_MAX_HI, 0x00000000);
gpu_write(gpu, REG_A5XX_CP_MEQ_THRESHOLDS, 0x40);
gpu_write(gpu, REG_A5XX_CP_MERCIU_SIZE, 0x40);
gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_2, 0x80000060);
gpu_write(gpu, REG_A5XX_CP_ROQ_THRESHOLDS_1, 0x40201B16);
gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, (0x400 << 11 | 0x300 << 22));
if (adreno_gpu->quirks & ADRENO_QUIRK_TWO_PASS_USE_WFI)
gpu_rmw(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0, (1 << 8));
gpu_write(gpu, REG_A5XX_PC_DBG_ECO_CNTL, 0xc0200100);
/* Enable USE_RETENTION_FLOPS */
gpu_write(gpu, REG_A5XX_CP_CHICKEN_DBG, 0x02000000);
/* Enable ME/PFP split notification */
gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL1, 0xA6FFFFFF);
/* Enable HWCG */
a5xx_enable_hwcg(gpu);
gpu_write(gpu, REG_A5XX_RBBM_AHB_CNTL2, 0x0000003F);
/* Set the highest bank bit */
gpu_write(gpu, REG_A5XX_TPL1_MODE_CNTL, 2 << 7);
gpu_write(gpu, REG_A5XX_RB_MODE_CNTL, 2 << 1);
/* Protect registers from the CP */
gpu_write(gpu, REG_A5XX_CP_PROTECT_CNTL, 0x00000007);
/* RBBM */
gpu_write(gpu, REG_A5XX_CP_PROTECT(0), ADRENO_PROTECT_RW(0x04, 4));
gpu_write(gpu, REG_A5XX_CP_PROTECT(1), ADRENO_PROTECT_RW(0x08, 8));
gpu_write(gpu, REG_A5XX_CP_PROTECT(2), ADRENO_PROTECT_RW(0x10, 16));
gpu_write(gpu, REG_A5XX_CP_PROTECT(3), ADRENO_PROTECT_RW(0x20, 32));
gpu_write(gpu, REG_A5XX_CP_PROTECT(4), ADRENO_PROTECT_RW(0x40, 64));
gpu_write(gpu, REG_A5XX_CP_PROTECT(5), ADRENO_PROTECT_RW(0x80, 64));
/* Content protect */
gpu_write(gpu, REG_A5XX_CP_PROTECT(6),
ADRENO_PROTECT_RW(REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_LO,
16));
gpu_write(gpu, REG_A5XX_CP_PROTECT(7),
ADRENO_PROTECT_RW(REG_A5XX_RBBM_SECVID_TRUST_CNTL, 2));
/* CP */
gpu_write(gpu, REG_A5XX_CP_PROTECT(8), ADRENO_PROTECT_RW(0x800, 64));
gpu_write(gpu, REG_A5XX_CP_PROTECT(9), ADRENO_PROTECT_RW(0x840, 8));
gpu_write(gpu, REG_A5XX_CP_PROTECT(10), ADRENO_PROTECT_RW(0x880, 32));
gpu_write(gpu, REG_A5XX_CP_PROTECT(11), ADRENO_PROTECT_RW(0xAA0, 1));
/* RB */
gpu_write(gpu, REG_A5XX_CP_PROTECT(12), ADRENO_PROTECT_RW(0xCC0, 1));
gpu_write(gpu, REG_A5XX_CP_PROTECT(13), ADRENO_PROTECT_RW(0xCF0, 2));
/* VPC */
gpu_write(gpu, REG_A5XX_CP_PROTECT(14), ADRENO_PROTECT_RW(0xE68, 8));
gpu_write(gpu, REG_A5XX_CP_PROTECT(15), ADRENO_PROTECT_RW(0xE70, 4));
/* UCHE */
gpu_write(gpu, REG_A5XX_CP_PROTECT(16), ADRENO_PROTECT_RW(0xE80, 16));
if (adreno_is_a530(adreno_gpu))
gpu_write(gpu, REG_A5XX_CP_PROTECT(17),
ADRENO_PROTECT_RW(0x10000, 0x8000));
gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_CNTL, 0);
/*
* Disable the trusted memory range - we don't actually support secure
* memory rendering at this point in time and we don't want to block off
* part of the virtual memory space.
*/
gpu_write64(gpu, REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_LO,
REG_A5XX_RBBM_SECVID_TSB_TRUSTED_BASE_HI, 0x00000000);
gpu_write(gpu, REG_A5XX_RBBM_SECVID_TSB_TRUSTED_SIZE, 0x00000000);
/* Load the GPMU firmware before starting the HW init */
a5xx_gpmu_ucode_init(gpu);
ret = adreno_hw_init(gpu);
if (ret)
return ret;
ret = a5xx_ucode_init(gpu);
if (ret)
return ret;
/* Disable the interrupts through the initial bringup stage */
gpu_write(gpu, REG_A5XX_RBBM_INT_0_MASK, A5XX_INT_MASK);
/* Clear ME_HALT to start the micro engine */
gpu_write(gpu, REG_A5XX_CP_PFP_ME_CNTL, 0);
ret = a5xx_me_init(gpu);
if (ret)
return ret;
ret = a5xx_power_init(gpu);
if (ret)
return ret;
/*
* Send a pipeline event stat to get misbehaving counters to start
* ticking correctly
*/
if (adreno_is_a530(adreno_gpu)) {
OUT_PKT7(gpu->rb, CP_EVENT_WRITE, 1);
OUT_RING(gpu->rb, 0x0F);
gpu->funcs->flush(gpu);
if (!gpu->funcs->idle(gpu))
return -EINVAL;
}
/* Put the GPU into unsecure mode */
gpu_write(gpu, REG_A5XX_RBBM_SECVID_TRUST_CNTL, 0x0);
return 0;
}
static void a5xx_recover(struct msm_gpu *gpu)
{
int i;
adreno_dump_info(gpu);
for (i = 0; i < 8; i++) {
printk("CP_SCRATCH_REG%d: %u\n", i,
gpu_read(gpu, REG_A5XX_CP_SCRATCH_REG(i)));
}
if (hang_debug)
a5xx_dump(gpu);
gpu_write(gpu, REG_A5XX_RBBM_SW_RESET_CMD, 1);
gpu_read(gpu, REG_A5XX_RBBM_SW_RESET_CMD);
gpu_write(gpu, REG_A5XX_RBBM_SW_RESET_CMD, 0);
adreno_recover(gpu);
}
static void a5xx_destroy(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
DBG("%s", gpu->name);
if (a5xx_gpu->pm4_bo) {
if (a5xx_gpu->pm4_iova)
msm_gem_put_iova(a5xx_gpu->pm4_bo, gpu->id);
drm_gem_object_unreference_unlocked(a5xx_gpu->pm4_bo);
}
if (a5xx_gpu->pfp_bo) {
if (a5xx_gpu->pfp_iova)
msm_gem_put_iova(a5xx_gpu->pfp_bo, gpu->id);
drm_gem_object_unreference_unlocked(a5xx_gpu->pfp_bo);
}
if (a5xx_gpu->gpmu_bo) {
if (a5xx_gpu->gpmu_iova)
msm_gem_put_iova(a5xx_gpu->gpmu_bo, gpu->id);
drm_gem_object_unreference_unlocked(a5xx_gpu->gpmu_bo);
}
adreno_gpu_cleanup(adreno_gpu);
kfree(a5xx_gpu);
}
static inline bool _a5xx_check_idle(struct msm_gpu *gpu)
{
if (gpu_read(gpu, REG_A5XX_RBBM_STATUS) & ~A5XX_RBBM_STATUS_HI_BUSY)
return false;
/*
* Nearly every abnormality ends up pausing the GPU and triggering a
* fault so we can safely just watch for this one interrupt to fire
*/
return !(gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS) &
A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT);
}
static bool a5xx_idle(struct msm_gpu *gpu)
{
/* wait for CP to drain ringbuffer: */
if (!adreno_idle(gpu))
return false;
if (spin_until(_a5xx_check_idle(gpu))) {
DRM_ERROR("%s: %ps: timeout waiting for GPU to idle: status %8.8X irq %8.8X\n",
gpu->name, __builtin_return_address(0),
gpu_read(gpu, REG_A5XX_RBBM_STATUS),
gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS));
return false;
}
return true;
}
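
spin_until() in these idle paths is a bounded poll: keep re-checking a
condition until it holds or a time budget runs out, and report the
timeout to the caller.  A userspace sketch of that contract (the real
macro's timeout mechanics differ; the names here are made up):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Poll a condition callback until it returns true or the budget of
 * `tries` polls (spaced `delay_us` apart) is exhausted.  Returns true
 * on success, false on timeout -- the same contract the idle helpers
 * above rely on. */
static bool poll_until(bool (*cond)(void *), void *arg,
                       int tries, unsigned int delay_us)
{
    while (tries--) {
        if (cond(arg))
            return true;
        usleep(delay_us);
    }
    return false;
}

static bool fake_idle(void *arg)
{
    int *countdown = arg;
    return --(*countdown) <= 0;    /* pretend the GPU drains after a few polls */
}

int main(void)
{
    int countdown = 3;

    if (!poll_until(fake_idle, &countdown, 10, 1000))
        fprintf(stderr, "timeout waiting for idle\n");
    return 0;
}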
static void a5xx_cp_err_irq(struct msm_gpu *gpu)
{
u32 status = gpu_read(gpu, REG_A5XX_CP_INTERRUPT_STATUS);
if (status & A5XX_CP_INT_CP_OPCODE_ERROR) {
u32 val;
gpu_write(gpu, REG_A5XX_CP_PFP_STAT_ADDR, 0);
/*
* REG_A5XX_CP_PFP_STAT_DATA is indexed, and we want index 1 so
* read it twice
*/
gpu_read(gpu, REG_A5XX_CP_PFP_STAT_DATA);
val = gpu_read(gpu, REG_A5XX_CP_PFP_STAT_DATA);
dev_err_ratelimited(gpu->dev->dev, "CP | opcode error | possible opcode=0x%8.8X\n",
val);
}
if (status & A5XX_CP_INT_CP_HW_FAULT_ERROR)
dev_err_ratelimited(gpu->dev->dev, "CP | HW fault | status=0x%8.8X\n",
gpu_read(gpu, REG_A5XX_CP_HW_FAULT));
if (status & A5XX_CP_INT_CP_DMA_ERROR)
dev_err_ratelimited(gpu->dev->dev, "CP | DMA error\n");
if (status & A5XX_CP_INT_CP_REGISTER_PROTECTION_ERROR) {
u32 val = gpu_read(gpu, REG_A5XX_CP_PROTECT_STATUS);
dev_err_ratelimited(gpu->dev->dev,
"CP | protected mode error | %s | addr=0x%8.8X | status=0x%8.8X\n",
val & (1 << 24) ? "WRITE" : "READ",
(val & 0xFFFFF) >> 2, val);
}
if (status & A5XX_CP_INT_CP_AHB_ERROR) {
u32 status = gpu_read(gpu, REG_A5XX_CP_AHB_FAULT);
const char *access[16] = { "reserved", "reserved",
"timestamp lo", "timestamp hi", "pfp read", "pfp write",
"", "", "me read", "me write", "", "", "crashdump read",
"crashdump write" };
dev_err_ratelimited(gpu->dev->dev,
"CP | AHB error | addr=%X access=%s error=%d | status=0x%8.8X\n",
status & 0xFFFFF, access[(status >> 24) & 0xF],
(status & (1 << 31)), status);
}
}
static void a5xx_rbbm_err_irq(struct msm_gpu *gpu)
{
u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
if (status & A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR) {
u32 val = gpu_read(gpu, REG_A5XX_RBBM_AHB_ERROR_STATUS);
dev_err_ratelimited(gpu->dev->dev,
"RBBM | AHB bus error | %s | addr=0x%X | ports=0x%X:0x%X\n",
val & (1 << 28) ? "WRITE" : "READ",
(val & 0xFFFFF) >> 2, (val >> 20) & 0x3,
(val >> 24) & 0xF);
/* Clear the error */
gpu_write(gpu, REG_A5XX_RBBM_AHB_CMD, (1 << 4));
}
if (status & A5XX_RBBM_INT_0_MASK_RBBM_TRANSFER_TIMEOUT)
dev_err_ratelimited(gpu->dev->dev, "RBBM | AHB transfer timeout\n");
if (status & A5XX_RBBM_INT_0_MASK_RBBM_ME_MS_TIMEOUT)
dev_err_ratelimited(gpu->dev->dev, "RBBM | ME master split | status=0x%X\n",
gpu_read(gpu, REG_A5XX_RBBM_AHB_ME_SPLIT_STATUS));
if (status & A5XX_RBBM_INT_0_MASK_RBBM_PFP_MS_TIMEOUT)
dev_err_ratelimited(gpu->dev->dev, "RBBM | PFP master split | status=0x%X\n",
gpu_read(gpu, REG_A5XX_RBBM_AHB_PFP_SPLIT_STATUS));
if (status & A5XX_RBBM_INT_0_MASK_RBBM_ETS_MS_TIMEOUT)
dev_err_ratelimited(gpu->dev->dev, "RBBM | ETS master split | status=0x%X\n",
gpu_read(gpu, REG_A5XX_RBBM_AHB_ETS_SPLIT_STATUS));
if (status & A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW)
dev_err_ratelimited(gpu->dev->dev, "RBBM | ATB ASYNC overflow\n");
if (status & A5XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW)
dev_err_ratelimited(gpu->dev->dev, "RBBM | ATB bus overflow\n");
}
static void a5xx_uche_err_irq(struct msm_gpu *gpu)
{
uint64_t addr = (uint64_t) gpu_read(gpu, REG_A5XX_UCHE_TRAP_LOG_HI);
addr |= gpu_read(gpu, REG_A5XX_UCHE_TRAP_LOG_LO);
dev_err_ratelimited(gpu->dev->dev, "UCHE | Out of bounds access | addr=0x%llX\n",
addr);
}
static void a5xx_gpmu_err_irq(struct msm_gpu *gpu)
{
dev_err_ratelimited(gpu->dev->dev, "GPMU | voltage droop\n");
}
#define RBBM_ERROR_MASK \
(A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR | \
A5XX_RBBM_INT_0_MASK_RBBM_TRANSFER_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ME_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_PFP_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ETS_MS_TIMEOUT | \
A5XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNC_OVERFLOW)
static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
{
u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS);
gpu_write(gpu, REG_A5XX_RBBM_INT_CLEAR_CMD, status);
if (status & RBBM_ERROR_MASK)
a5xx_rbbm_err_irq(gpu);
if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR)
a5xx_cp_err_irq(gpu);
if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS)
a5xx_uche_err_irq(gpu);
if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP)
a5xx_gpmu_err_irq(gpu);
if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS)
msm_gpu_retire(gpu);
return IRQ_HANDLED;
}
static const u32 a5xx_register_offsets[REG_ADRENO_REGISTER_MAX] = {
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_BASE, REG_A5XX_CP_RB_BASE),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_BASE_HI, REG_A5XX_CP_RB_BASE_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR_ADDR, REG_A5XX_CP_RB_RPTR_ADDR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR_ADDR_HI,
REG_A5XX_CP_RB_RPTR_ADDR_HI),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_RPTR, REG_A5XX_CP_RB_RPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_WPTR, REG_A5XX_CP_RB_WPTR),
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_CNTL, REG_A5XX_CP_RB_CNTL),
};
static const u32 a5xx_registers[] = {
0x0000, 0x0002, 0x0004, 0x0020, 0x0022, 0x0026, 0x0029, 0x002B,
0x002E, 0x0035, 0x0038, 0x0042, 0x0044, 0x0044, 0x0047, 0x0095,
0x0097, 0x00BB, 0x03A0, 0x0464, 0x0469, 0x046F, 0x04D2, 0x04D3,
0x04E0, 0x0533, 0x0540, 0x0555, 0xF400, 0xF400, 0xF800, 0xF807,
0x0800, 0x081A, 0x081F, 0x0841, 0x0860, 0x0860, 0x0880, 0x08A0,
0x0B00, 0x0B12, 0x0B15, 0x0B28, 0x0B78, 0x0B7F, 0x0BB0, 0x0BBD,
0x0BC0, 0x0BC6, 0x0BD0, 0x0C53, 0x0C60, 0x0C61, 0x0C80, 0x0C82,
0x0C84, 0x0C85, 0x0C90, 0x0C98, 0x0CA0, 0x0CA0, 0x0CB0, 0x0CB2,
0x2180, 0x2185, 0x2580, 0x2585, 0x0CC1, 0x0CC1, 0x0CC4, 0x0CC7,
0x0CCC, 0x0CCC, 0x0CD0, 0x0CD8, 0x0CE0, 0x0CE5, 0x0CE8, 0x0CE8,
0x0CEC, 0x0CF1, 0x0CFB, 0x0D0E, 0x2100, 0x211E, 0x2140, 0x2145,
0x2500, 0x251E, 0x2540, 0x2545, 0x0D10, 0x0D17, 0x0D20, 0x0D23,
0x0D30, 0x0D30, 0x20C0, 0x20C0, 0x24C0, 0x24C0, 0x0E40, 0x0E43,
0x0E4A, 0x0E4A, 0x0E50, 0x0E57, 0x0E60, 0x0E7C, 0x0E80, 0x0E8E,
0x0E90, 0x0E96, 0x0EA0, 0x0EA8, 0x0EB0, 0x0EB2, 0xE140, 0xE147,
0xE150, 0xE187, 0xE1A0, 0xE1A9, 0xE1B0, 0xE1B6, 0xE1C0, 0xE1C7,
0xE1D0, 0xE1D1, 0xE200, 0xE201, 0xE210, 0xE21C, 0xE240, 0xE268,
0xE000, 0xE006, 0xE010, 0xE09A, 0xE0A0, 0xE0A4, 0xE0AA, 0xE0EB,
0xE100, 0xE105, 0xE380, 0xE38F, 0xE3B0, 0xE3B0, 0xE400, 0xE405,
0xE408, 0xE4E9, 0xE4F0, 0xE4F0, 0xE280, 0xE280, 0xE282, 0xE2A3,
0xE2A5, 0xE2C2, 0xE940, 0xE947, 0xE950, 0xE987, 0xE9A0, 0xE9A9,
0xE9B0, 0xE9B6, 0xE9C0, 0xE9C7, 0xE9D0, 0xE9D1, 0xEA00, 0xEA01,
0xEA10, 0xEA1C, 0xEA40, 0xEA68, 0xE800, 0xE806, 0xE810, 0xE89A,
0xE8A0, 0xE8A4, 0xE8AA, 0xE8EB, 0xE900, 0xE905, 0xEB80, 0xEB8F,
0xEBB0, 0xEBB0, 0xEC00, 0xEC05, 0xEC08, 0xECE9, 0xECF0, 0xECF0,
0xEA80, 0xEA80, 0xEA82, 0xEAA3, 0xEAA5, 0xEAC2, 0xA800, 0xA8FF,
0xAC60, 0xAC60, 0xB000, 0xB97F, 0xB9A0, 0xB9BF,
~0
};
static void a5xx_dump(struct msm_gpu *gpu)
{
dev_info(gpu->dev->dev, "status: %08x\n",
gpu_read(gpu, REG_A5XX_RBBM_STATUS));
adreno_dump(gpu);
}
static int a5xx_pm_resume(struct msm_gpu *gpu)
{
int ret;
/* Turn on the core power */
ret = msm_gpu_pm_resume(gpu);
if (ret)
return ret;
/* Turn on the RBCCU domain first to limit the chances of voltage droop */
gpu_write(gpu, REG_A5XX_GPMU_RBCCU_POWER_CNTL, 0x778000);
/* Wait 3 usecs before polling */
udelay(3);
ret = spin_usecs(gpu, 20, REG_A5XX_GPMU_RBCCU_PWR_CLK_STATUS,
(1 << 20), (1 << 20));
if (ret) {
DRM_ERROR("%s: timeout waiting for RBCCU GDSC enable: %X\n",
gpu->name,
gpu_read(gpu, REG_A5XX_GPMU_RBCCU_PWR_CLK_STATUS));
return ret;
}
/* Turn on the SP domain */
gpu_write(gpu, REG_A5XX_GPMU_SP_POWER_CNTL, 0x778000);
ret = spin_usecs(gpu, 20, REG_A5XX_GPMU_SP_PWR_CLK_STATUS,
(1 << 20), (1 << 20));
if (ret)
DRM_ERROR("%s: timeout waiting for SP GDSC enable\n",
gpu->name);
return ret;
}
static int a5xx_pm_suspend(struct msm_gpu *gpu)
{
/* Clear the VBIF pipe before shutting down */
gpu_write(gpu, REG_A5XX_VBIF_XIN_HALT_CTRL0, 0xF);
spin_until((gpu_read(gpu, REG_A5XX_VBIF_XIN_HALT_CTRL1) & 0xF) == 0xF);
gpu_write(gpu, REG_A5XX_VBIF_XIN_HALT_CTRL0, 0);
/*
* Reset the VBIF before power collapse to avoid issue with FIFO
* entries
*/
gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x003C0000);
gpu_write(gpu, REG_A5XX_RBBM_BLOCK_SW_RESET_CMD, 0x00000000);
return msm_gpu_pm_suspend(gpu);
}
static int a5xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value)
{
*value = gpu_read64(gpu, REG_A5XX_RBBM_PERFCTR_CP_0_LO,
REG_A5XX_RBBM_PERFCTR_CP_0_HI);
return 0;
}
#ifdef CONFIG_DEBUG_FS
static void a5xx_show(struct msm_gpu *gpu, struct seq_file *m)
{
gpu->funcs->pm_resume(gpu);
seq_printf(m, "status: %08x\n",
gpu_read(gpu, REG_A5XX_RBBM_STATUS));
gpu->funcs->pm_suspend(gpu);
adreno_show(gpu, m);
}
#endif
static const struct adreno_gpu_funcs funcs = {
.base = {
.get_param = adreno_get_param,
.hw_init = a5xx_hw_init,
.pm_suspend = a5xx_pm_suspend,
.pm_resume = a5xx_pm_resume,
.recover = a5xx_recover,
.last_fence = adreno_last_fence,
.submit = a5xx_submit,
.flush = adreno_flush,
.idle = a5xx_idle,
.irq = a5xx_irq,
.destroy = a5xx_destroy,
.show = a5xx_show,
},
.get_timestamp = a5xx_get_timestamp,
};
struct msm_gpu *a5xx_gpu_init(struct drm_device *dev)
{
struct msm_drm_private *priv = dev->dev_private;
struct platform_device *pdev = priv->gpu_pdev;
struct a5xx_gpu *a5xx_gpu = NULL;
struct adreno_gpu *adreno_gpu;
struct msm_gpu *gpu;
int ret;
if (!pdev) {
dev_err(dev->dev, "No A5XX device is defined\n");
return ERR_PTR(-ENXIO);
}
a5xx_gpu = kzalloc(sizeof(*a5xx_gpu), GFP_KERNEL);
if (!a5xx_gpu)
return ERR_PTR(-ENOMEM);
adreno_gpu = &a5xx_gpu->base;
gpu = &adreno_gpu->base;
a5xx_gpu->pdev = pdev;
adreno_gpu->registers = a5xx_registers;
adreno_gpu->reg_offsets = a5xx_register_offsets;
a5xx_gpu->lm_leakage = 0x4E001A;
ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs);
if (ret) {
a5xx_destroy(&(a5xx_gpu->base.base));
return ERR_PTR(ret);
}
return gpu;
}

View file

@ -0,0 +1,60 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#ifndef __A5XX_GPU_H__
#define __A5XX_GPU_H__
#include "adreno_gpu.h"
/* Bringing over the hack from the previous targets */
#undef ROP_COPY
#undef ROP_XOR
#include "a5xx.xml.h"
struct a5xx_gpu {
struct adreno_gpu base;
struct platform_device *pdev;
struct drm_gem_object *pm4_bo;
uint64_t pm4_iova;
struct drm_gem_object *pfp_bo;
uint64_t pfp_iova;
struct drm_gem_object *gpmu_bo;
uint64_t gpmu_iova;
uint32_t gpmu_dwords;
uint32_t lm_leakage;
};
#define to_a5xx_gpu(x) container_of(x, struct a5xx_gpu, base)
int a5xx_power_init(struct msm_gpu *gpu);
void a5xx_gpmu_ucode_init(struct msm_gpu *gpu);
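/*
 * Poll 'reg' (masked with 'mask') roughly once per microsecond for up to
 * 'usecs' microseconds: returns 0 as soon as the masked value equals
 * 'value', or -ETIMEDOUT if it never does.
 */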
static inline int spin_usecs(struct msm_gpu *gpu, uint32_t usecs,
uint32_t reg, uint32_t mask, uint32_t value)
{
while (usecs--) {
udelay(1);
if ((gpu_read(gpu, reg) & mask) == value)
return 0;
cpu_relax();
}
return -ETIMEDOUT;
}
#endif /* __A5XX_GPU_H__ */

View file

@ -0,0 +1,344 @@
/* Copyright (c) 2016 The Linux Foundation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 and
* only version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
*/
#include <linux/pm_opp.h>
#include "a5xx_gpu.h"
/*
* The GPMU data block is a block of shared registers that can be used to
* communicate back and forth. These "registers" are a convention shared with the GPMU
* firmware and are not bound to any specific hardware design
*/
#define AGC_INIT_BASE REG_A5XX_GPMU_DATA_RAM_BASE
#define AGC_INIT_MSG_MAGIC (AGC_INIT_BASE + 5)
#define AGC_MSG_BASE (AGC_INIT_BASE + 7)
#define AGC_MSG_STATE (AGC_MSG_BASE + 0)
#define AGC_MSG_COMMAND (AGC_MSG_BASE + 1)
#define AGC_MSG_PAYLOAD_SIZE (AGC_MSG_BASE + 3)
#define AGC_MSG_PAYLOAD(_o) ((AGC_MSG_BASE + 5) + (_o))
#define AGC_POWER_CONFIG_PRODUCTION_ID 1
#define AGC_INIT_MSG_VALUE 0xBABEFACE
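/*
 * Message layout implied by the offsets above and by a5xx_lm_setup() below
 * (inferred from this driver, not a documented GPMU interface):
 *   AGC_MSG_STATE        - set to 1 while a message is staged
 *   AGC_MSG_COMMAND      - command ID (AGC_POWER_CONFIG_PRODUCTION_ID)
 *   AGC_MSG_PAYLOAD(n)   - payload dwords
 *   AGC_MSG_PAYLOAD_SIZE - payload size in bytes
 *   AGC_INIT_MSG_MAGIC   - written last with AGC_INIT_MSG_VALUE to hand the
 *                          message over to the GPMU
 */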
static struct {
uint32_t reg;
uint32_t value;
} a5xx_sequence_regs[] = {
{ 0xB9A1, 0x00010303 },
{ 0xB9A2, 0x13000000 },
{ 0xB9A3, 0x00460020 },
{ 0xB9A4, 0x10000000 },
{ 0xB9A5, 0x040A1707 },
{ 0xB9A6, 0x00010000 },
{ 0xB9A7, 0x0E000904 },
{ 0xB9A8, 0x10000000 },
{ 0xB9A9, 0x01165000 },
{ 0xB9AA, 0x000E0002 },
{ 0xB9AB, 0x03884141 },
{ 0xB9AC, 0x10000840 },
{ 0xB9AD, 0x572A5000 },
{ 0xB9AE, 0x00000003 },
{ 0xB9AF, 0x00000000 },
{ 0xB9B0, 0x10000000 },
{ 0xB828, 0x6C204010 },
{ 0xB829, 0x6C204011 },
{ 0xB82A, 0x6C204012 },
{ 0xB82B, 0x6C204013 },
{ 0xB82C, 0x6C204014 },
{ 0xB90F, 0x00000004 },
{ 0xB910, 0x00000002 },
{ 0xB911, 0x00000002 },
{ 0xB912, 0x00000002 },
{ 0xB913, 0x00000002 },
{ 0xB92F, 0x00000004 },
{ 0xB930, 0x00000005 },
{ 0xB931, 0x00000005 },
{ 0xB932, 0x00000005 },
{ 0xB933, 0x00000005 },
{ 0xB96F, 0x00000001 },
{ 0xB970, 0x00000003 },
{ 0xB94F, 0x00000004 },
{ 0xB950, 0x0000000B },
{ 0xB951, 0x0000000B },
{ 0xB952, 0x0000000B },
{ 0xB953, 0x0000000B },
{ 0xB907, 0x00000019 },
{ 0xB927, 0x00000019 },
{ 0xB947, 0x00000019 },
{ 0xB967, 0x00000019 },
{ 0xB987, 0x00000019 },
{ 0xB906, 0x00220001 },
{ 0xB926, 0x00220001 },
{ 0xB946, 0x00220001 },
{ 0xB966, 0x00220001 },
{ 0xB986, 0x00300000 },
{ 0xAC40, 0x0340FF41 },
{ 0xAC41, 0x03BEFED0 },
{ 0xAC42, 0x00331FED },
{ 0xAC43, 0x021FFDD3 },
{ 0xAC44, 0x5555AAAA },
{ 0xAC45, 0x5555AAAA },
{ 0xB9BA, 0x00000008 },
};
/*
* Get the actual voltage value for the operating point at the specified
* frequency
*/
static inline uint32_t _get_mvolts(struct msm_gpu *gpu, uint32_t freq)
{
struct drm_device *dev = gpu->dev;
struct msm_drm_private *priv = dev->dev_private;
struct platform_device *pdev = priv->gpu_pdev;
struct dev_pm_opp *opp;
opp = dev_pm_opp_find_freq_exact(&pdev->dev, freq, true);
return (!IS_ERR(opp)) ? dev_pm_opp_get_voltage(opp) / 1000 : 0;
}
/* Setup thermal limit management */
static void a5xx_lm_setup(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
unsigned int i;
/* Write the block of sequence registers */
for (i = 0; i < ARRAY_SIZE(a5xx_sequence_regs); i++)
gpu_write(gpu, a5xx_sequence_regs[i].reg,
a5xx_sequence_regs[i].value);
/* Hard code the A530 GPU thermal sensor ID for the GPMU */
gpu_write(gpu, REG_A5XX_GPMU_TEMP_SENSOR_ID, 0x60007);
gpu_write(gpu, REG_A5XX_GPMU_DELTA_TEMP_THRESHOLD, 0x01);
gpu_write(gpu, REG_A5XX_GPMU_TEMP_SENSOR_CONFIG, 0x01);
/* Until we get clock scaling, 0 is always the active power level */
gpu_write(gpu, REG_A5XX_GPMU_GPMU_VOLTAGE, 0x80000000 | 0);
gpu_write(gpu, REG_A5XX_GPMU_BASE_LEAKAGE, a5xx_gpu->lm_leakage);
/* The threshold is fixed at 6000 for A530 */
gpu_write(gpu, REG_A5XX_GPMU_GPMU_PWR_THRESHOLD, 0x80000000 | 6000);
gpu_write(gpu, REG_A5XX_GPMU_BEC_ENABLE, 0x10001FFF);
gpu_write(gpu, REG_A5XX_GDPM_CONFIG1, 0x00201FF1);
/* Write the voltage table */
gpu_write(gpu, REG_A5XX_GPMU_BEC_ENABLE, 0x10001FFF);
gpu_write(gpu, REG_A5XX_GDPM_CONFIG1, 0x201FF1);
gpu_write(gpu, AGC_MSG_STATE, 1);
gpu_write(gpu, AGC_MSG_COMMAND, AGC_POWER_CONFIG_PRODUCTION_ID);
/* Write the max power - hard coded to 5448 for A530 */
gpu_write(gpu, AGC_MSG_PAYLOAD(0), 5448);
gpu_write(gpu, AGC_MSG_PAYLOAD(1), 1);
/*
* For now just write the one voltage level - we will do more when we
* can do scaling
*/
gpu_write(gpu, AGC_MSG_PAYLOAD(2), _get_mvolts(gpu, gpu->fast_rate));
gpu_write(gpu, AGC_MSG_PAYLOAD(3), gpu->fast_rate / 1000000);
gpu_write(gpu, AGC_MSG_PAYLOAD_SIZE, 4 * sizeof(uint32_t));
gpu_write(gpu, AGC_INIT_MSG_MAGIC, AGC_INIT_MSG_VALUE);
}
/* Enable SP/TP power collapse */
static void a5xx_pc_init(struct msm_gpu *gpu)
{
gpu_write(gpu, REG_A5XX_GPMU_PWR_COL_INTER_FRAME_CTRL, 0x7F);
gpu_write(gpu, REG_A5XX_GPMU_PWR_COL_BINNING_CTRL, 0);
gpu_write(gpu, REG_A5XX_GPMU_PWR_COL_INTER_FRAME_HYST, 0xA0080);
gpu_write(gpu, REG_A5XX_GPMU_PWR_COL_STAGGER_DELAY, 0x600040);
}
/* Enable the GPMU microcontroller */
static int a5xx_gpmu_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
struct msm_ringbuffer *ring = gpu->rb;
if (!a5xx_gpu->gpmu_dwords)
return 0;
/* Turn off protected mode for this operation */
OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
OUT_RING(ring, 0);
/* Kick off the IB to load the GPMU microcode */
OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3);
OUT_RING(ring, lower_32_bits(a5xx_gpu->gpmu_iova));
OUT_RING(ring, upper_32_bits(a5xx_gpu->gpmu_iova));
OUT_RING(ring, a5xx_gpu->gpmu_dwords);
/* Turn back on protected mode */
OUT_PKT7(ring, CP_SET_PROTECTED_MODE, 1);
OUT_RING(ring, 1);
gpu->funcs->flush(gpu);
if (!gpu->funcs->idle(gpu)) {
DRM_ERROR("%s: Unable to load GPMU firmware. GPMU will not be active\n",
gpu->name);
return -EINVAL;
}
gpu_write(gpu, REG_A5XX_GPMU_WFI_CONFIG, 0x4014);
/* Kick off the GPMU */
gpu_write(gpu, REG_A5XX_GPMU_CM3_SYSRESET, 0x0);
/*
* Wait for the GPMU to respond. It isn't fatal if it doesn't; we just
* won't have advanced power collapse.
*/
if (spin_usecs(gpu, 25, REG_A5XX_GPMU_GENERAL_0, 0xFFFFFFFF,
0xBABEFACE))
DRM_ERROR("%s: GPMU firmware initialization timed out\n",
gpu->name);
return 0;
}
/* Enable limits management */
static void a5xx_lm_enable(struct msm_gpu *gpu)
{
gpu_write(gpu, REG_A5XX_GDPM_INT_MASK, 0x0);
gpu_write(gpu, REG_A5XX_GDPM_INT_EN, 0x0A);
gpu_write(gpu, REG_A5XX_GPMU_GPMU_VOLTAGE_INTR_EN_MASK, 0x01);
gpu_write(gpu, REG_A5XX_GPMU_TEMP_THRESHOLD_INTR_EN_MASK, 0x50000);
gpu_write(gpu, REG_A5XX_GPMU_THROTTLE_UNMASK_FORCE_CTRL, 0x30000);
gpu_write(gpu, REG_A5XX_GPMU_CLOCK_THROTTLE_CTRL, 0x011);
}
int a5xx_power_init(struct msm_gpu *gpu)
{
int ret;
/* Set up the limits management */
a5xx_lm_setup(gpu);
/* Set up SP/TP power collapse */
a5xx_pc_init(gpu);
/* Start the GPMU */
ret = a5xx_gpmu_init(gpu);
if (ret)
return ret;
/* Start the limits management */
a5xx_lm_enable(gpu);
return 0;
}
void a5xx_gpmu_ucode_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
struct drm_device *drm = gpu->dev;
const struct firmware *fw;
uint32_t dwords = 0, offset = 0, bosize;
unsigned int *data, *ptr, *cmds;
unsigned int cmds_size;
if (a5xx_gpu->gpmu_bo)
return;
/* Get the firmware */
if (request_firmware(&fw, adreno_gpu->info->gpmufw, drm->dev)) {
DRM_ERROR("%s: Could not get GPMU firmware. GPMU will not be active\n",
gpu->name);
return;
}
data = (unsigned int *) fw->data;
/*
* The first dword is the size of the remaining data in dwords. Use it
* as a checksum of sorts and make sure it matches the actual size of
* the firmware that we read
*/
if (fw->size < 8 || (data[0] < 2) || (data[0] >= (fw->size >> 2)))
goto out;
/* The second dword is an ID - look for 2 (GPMU_FIRMWARE_ID) */
if (data[1] != 2)
goto out;
cmds = data + data[2] + 3;
cmds_size = data[0] - data[2] - 2;
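/*
 * Firmware layout assumed by the parsing above (inferred from this code,
 * not from a published format): data[0] is the number of remaining dwords,
 * data[1] is the firmware ID (2 for GPMU), and data[2] is the size of an
 * intervening header block, so the command stream starts at dword
 * data[2] + 3 and runs to the end of the image.
 */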
/*
* A single type4 opcode can only have so many values attached, so add
* enough opcodes to load all of the commands
*/
bosize = (cmds_size + (cmds_size / TYPE4_MAX_PAYLOAD) + 1) << 2;
mutex_lock(&drm->struct_mutex);
a5xx_gpu->gpmu_bo = msm_gem_new(drm, bosize, MSM_BO_UNCACHED);
mutex_unlock(&drm->struct_mutex);
if (IS_ERR(a5xx_gpu->gpmu_bo))
goto err;
if (msm_gem_get_iova(a5xx_gpu->gpmu_bo, gpu->id, &a5xx_gpu->gpmu_iova))
goto err;
ptr = msm_gem_get_vaddr(a5xx_gpu->gpmu_bo);
if (!ptr)
goto err;
while (cmds_size > 0) {
int i;
uint32_t _size = cmds_size > TYPE4_MAX_PAYLOAD ?
TYPE4_MAX_PAYLOAD : cmds_size;
ptr[dwords++] = PKT4(REG_A5XX_GPMU_INST_RAM_BASE + offset,
_size);
for (i = 0; i < _size; i++)
ptr[dwords++] = *cmds++;
offset += _size;
cmds_size -= _size;
}
msm_gem_put_vaddr(a5xx_gpu->gpmu_bo);
a5xx_gpu->gpmu_dwords = dwords;
goto out;
err:
if (a5xx_gpu->gpmu_iova)
msm_gem_put_iova(a5xx_gpu->gpmu_bo, gpu->id);
if (a5xx_gpu->gpmu_bo)
drm_gem_object_unreference_unlocked(a5xx_gpu->gpmu_bo);
a5xx_gpu->gpmu_bo = NULL;
a5xx_gpu->gpmu_iova = 0;
a5xx_gpu->gpmu_dwords = 0;
out:
/* No need to keep that firmware lying around anymore */
release_firmware(fw);
}

View file

@ -8,13 +8,14 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 431 bytes, from 2016-04-26 17:56:44)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109916 bytes, from 2016-02-20 18:44:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32907 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 12025 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 22544 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83840 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110765 bytes, from 2016-11-26 23:01:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml ( 90321 bytes, from 2016-11-28 16:50:05)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2016 by the following authors:
@ -172,6 +173,14 @@ enum a3xx_color_swap {
XYZW = 3,
};
enum a3xx_rb_blend_opcode {
BLEND_DST_PLUS_SRC = 0,
BLEND_SRC_MINUS_DST = 1,
BLEND_DST_MINUS_SRC = 2,
BLEND_MIN_DST_SRC = 3,
BLEND_MAX_DST_SRC = 4,
};
#define REG_AXXX_CP_RB_BASE 0x000001c0
#define REG_AXXX_CP_RB_CNTL 0x000001c1

View file

@ -74,6 +74,15 @@ static const struct adreno_info gpulist[] = {
.pfpfw = "a420_pfp.fw",
.gmem = (SZ_1M + SZ_512K),
.init = a4xx_gpu_init,
}, {
.rev = ADRENO_REV(5, 3, 0, ANY_ID),
.revn = 530,
.name = "A530",
.pm4fw = "a530_pm4.fw",
.pfpfw = "a530_pfp.fw",
.gmem = SZ_1M,
.init = a5xx_gpu_init,
.gpmufw = "a530v3_gpmu.fw2",
},
};
@ -83,6 +92,8 @@ MODULE_FIRMWARE("a330_pm4.fw");
MODULE_FIRMWARE("a330_pfp.fw");
MODULE_FIRMWARE("a420_pm4.fw");
MODULE_FIRMWARE("a420_pfp.fw");
MODULE_FIRMWARE("a530_fm4.fw");
MODULE_FIRMWARE("a530_pfp.fw");
static inline bool _rev_match(uint8_t entry, uint8_t id)
{
@ -145,12 +156,16 @@ struct msm_gpu *adreno_load_gpu(struct drm_device *dev)
mutex_lock(&dev->struct_mutex);
gpu->funcs->pm_resume(gpu);
mutex_unlock(&dev->struct_mutex);
disable_irq(gpu->irq);
ret = gpu->funcs->hw_init(gpu);
if (ret) {
dev_err(dev->dev, "gpu hw init failed: %d\n", ret);
gpu->funcs->destroy(gpu);
gpu = NULL;
} else {
enable_irq(gpu->irq);
/* give inactive pm a chance to kick in: */
msm_gpu_retire(gpu);
}
@ -166,12 +181,20 @@ static void set_gpu_pdev(struct drm_device *dev,
priv->gpu_pdev = pdev;
}
static const struct {
const char *str;
uint32_t flag;
} quirks[] = {
{ "qcom,gpu-quirk-two-pass-use-wfi", ADRENO_QUIRK_TWO_PASS_USE_WFI },
{ "qcom,gpu-quirk-fault-detect-mask", ADRENO_QUIRK_FAULT_DETECT_MASK },
};
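/*
 * Illustrative only: a hypothetical device tree fragment that would set both
 * quirks on the GPU node (only the property names above are defined here):
 *
 *	&gpu {
 *		qcom,gpu-quirk-two-pass-use-wfi;
 *		qcom,gpu-quirk-fault-detect-mask;
 *	};
 */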
static int adreno_bind(struct device *dev, struct device *master, void *data)
{
static struct adreno_platform_config config = {};
struct device_node *child, *node = dev->of_node;
u32 val;
int ret;
int ret, i;
ret = of_property_read_u32(node, "qcom,chipid", &val);
if (ret) {
@ -205,6 +228,10 @@ static int adreno_bind(struct device *dev, struct device *master, void *data)
return -ENXIO;
}
for (i = 0; i < ARRAY_SIZE(quirks); i++)
if (of_property_read_bool(node, quirks[i].str))
config.quirks |= quirks[i].flag;
dev->platform_data = &config;
set_gpu_pdev(dev_get_drvdata(master), to_platform_device(dev));
return 0;

View file

@ -22,7 +22,7 @@
#include "msm_mmu.h"
#define RB_SIZE SZ_32K
#define RB_BLKSIZE 16
#define RB_BLKSIZE 32
int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
{
@ -54,9 +54,6 @@ int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value)
}
}
#define rbmemptr(adreno_gpu, member) \
((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member))
int adreno_hw_init(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@ -79,11 +76,14 @@ int adreno_hw_init(struct msm_gpu *gpu)
(adreno_is_a430(adreno_gpu) ? AXXX_CP_RB_CNTL_NO_UPDATE : 0));
/* Setup ringbuffer address: */
adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_BASE, gpu->rb_iova);
adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_BASE,
REG_ADRENO_CP_RB_BASE_HI, gpu->rb_iova);
if (!adreno_is_a430(adreno_gpu))
adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_RPTR_ADDR,
rbmemptr(adreno_gpu, rptr));
if (!adreno_is_a430(adreno_gpu)) {
adreno_gpu_write64(adreno_gpu, REG_ADRENO_CP_RB_RPTR_ADDR,
REG_ADRENO_CP_RB_RPTR_ADDR_HI,
rbmemptr(adreno_gpu, rptr));
}
return 0;
}
@ -126,11 +126,14 @@ void adreno_recover(struct msm_gpu *gpu)
adreno_gpu->memptrs->wptr = 0;
gpu->funcs->pm_resume(gpu);
disable_irq(gpu->irq);
ret = gpu->funcs->hw_init(gpu);
if (ret) {
dev_err(dev->dev, "gpu hw init failed: %d\n", ret);
/* hmm, oh well? */
}
enable_irq(gpu->irq);
}
void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
@ -218,19 +221,18 @@ void adreno_flush(struct msm_gpu *gpu)
adreno_gpu_write(adreno_gpu, REG_ADRENO_CP_RB_WPTR, wptr);
}
void adreno_idle(struct msm_gpu *gpu)
bool adreno_idle(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
uint32_t wptr = get_wptr(gpu->rb);
int ret;
/* wait for CP to drain ringbuffer: */
ret = spin_until(get_rptr(adreno_gpu) == wptr);
if (ret)
DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name);
if (!spin_until(get_rptr(adreno_gpu) == wptr))
return true;
/* TODO maybe we need to reset GPU here to recover from hang? */
DRM_ERROR("%s: timeout waiting to drain ringbuffer!\n", gpu->name);
return false;
}
#ifdef CONFIG_DEBUG_FS
@ -278,7 +280,6 @@ void adreno_show(struct msm_gpu *gpu, struct seq_file *m)
void adreno_dump_info(struct msm_gpu *gpu)
{
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
int i;
printk("revision: %d (%d.%d.%d.%d)\n",
adreno_gpu->info->revn, adreno_gpu->rev.core,
@ -290,11 +291,6 @@ void adreno_dump_info(struct msm_gpu *gpu)
printk("rptr: %d\n", get_rptr(adreno_gpu));
printk("wptr: %d\n", adreno_gpu->memptrs->wptr);
printk("rb wptr: %d\n", get_wptr(gpu->rb));
for (i = 0; i < 8; i++) {
printk("CP_SCRATCH_REG%d: %u\n", i,
gpu_read(gpu, REG_AXXX_CP_SCRATCH_REG0 + i));
}
}
/* would be nice to not have to duplicate the _show() stuff with printk(): */
@ -350,6 +346,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
adreno_gpu->gmem = adreno_gpu->info->gmem;
adreno_gpu->revn = adreno_gpu->info->revn;
adreno_gpu->rev = config->rev;
adreno_gpu->quirks = config->quirks;
gpu->fast_rate = config->fast_rate;
gpu->slow_rate = config->slow_rate;
@ -381,7 +378,7 @@ int adreno_gpu_init(struct drm_device *drm, struct platform_device *pdev,
return ret;
}
mmu = gpu->mmu;
mmu = gpu->aspace->mmu;
if (mmu) {
ret = mmu->funcs->attach(mmu, iommu_ports,
ARRAY_SIZE(iommu_ports));

View file

@ -28,6 +28,9 @@
#include "adreno_pm4.xml.h"
#define REG_ADRENO_DEFINE(_offset, _reg) [_offset] = (_reg) + 1
#define REG_SKIP ~0
#define REG_ADRENO_SKIP(_offset) [_offset] = REG_SKIP
/**
* adreno_regs: List of registers that are used across all
* 3D devices. Each device type has a different offset value for the same
@ -35,73 +38,21 @@
* and are indexed by the enumeration values defined in this enum
*/
enum adreno_regs {
REG_ADRENO_CP_DEBUG,
REG_ADRENO_CP_ME_RAM_WADDR,
REG_ADRENO_CP_ME_RAM_DATA,
REG_ADRENO_CP_PFP_UCODE_DATA,
REG_ADRENO_CP_PFP_UCODE_ADDR,
REG_ADRENO_CP_WFI_PEND_CTR,
REG_ADRENO_CP_RB_BASE,
REG_ADRENO_CP_RB_BASE_HI,
REG_ADRENO_CP_RB_RPTR_ADDR,
REG_ADRENO_CP_RB_RPTR_ADDR_HI,
REG_ADRENO_CP_RB_RPTR,
REG_ADRENO_CP_RB_WPTR,
REG_ADRENO_CP_PROTECT_CTRL,
REG_ADRENO_CP_ME_CNTL,
REG_ADRENO_CP_RB_CNTL,
REG_ADRENO_CP_IB1_BASE,
REG_ADRENO_CP_IB1_BUFSZ,
REG_ADRENO_CP_IB2_BASE,
REG_ADRENO_CP_IB2_BUFSZ,
REG_ADRENO_CP_TIMESTAMP,
REG_ADRENO_CP_ME_RAM_RADDR,
REG_ADRENO_CP_ROQ_ADDR,
REG_ADRENO_CP_ROQ_DATA,
REG_ADRENO_CP_MERCIU_ADDR,
REG_ADRENO_CP_MERCIU_DATA,
REG_ADRENO_CP_MERCIU_DATA2,
REG_ADRENO_CP_MEQ_ADDR,
REG_ADRENO_CP_MEQ_DATA,
REG_ADRENO_CP_HW_FAULT,
REG_ADRENO_CP_PROTECT_STATUS,
REG_ADRENO_SCRATCH_ADDR,
REG_ADRENO_SCRATCH_UMSK,
REG_ADRENO_SCRATCH_REG2,
REG_ADRENO_RBBM_STATUS,
REG_ADRENO_RBBM_PERFCTR_CTL,
REG_ADRENO_RBBM_PERFCTR_LOAD_CMD0,
REG_ADRENO_RBBM_PERFCTR_LOAD_CMD1,
REG_ADRENO_RBBM_PERFCTR_LOAD_CMD2,
REG_ADRENO_RBBM_PERFCTR_PWR_1_LO,
REG_ADRENO_RBBM_INT_0_MASK,
REG_ADRENO_RBBM_INT_0_STATUS,
REG_ADRENO_RBBM_AHB_ERROR_STATUS,
REG_ADRENO_RBBM_PM_OVERRIDE2,
REG_ADRENO_RBBM_AHB_CMD,
REG_ADRENO_RBBM_INT_CLEAR_CMD,
REG_ADRENO_RBBM_SW_RESET_CMD,
REG_ADRENO_RBBM_CLOCK_CTL,
REG_ADRENO_RBBM_AHB_ME_SPLIT_STATUS,
REG_ADRENO_RBBM_AHB_PFP_SPLIT_STATUS,
REG_ADRENO_VPC_DEBUG_RAM_SEL,
REG_ADRENO_VPC_DEBUG_RAM_READ,
REG_ADRENO_VSC_SIZE_ADDRESS,
REG_ADRENO_VFD_CONTROL_0,
REG_ADRENO_VFD_INDEX_MAX,
REG_ADRENO_SP_VS_PVT_MEM_ADDR_REG,
REG_ADRENO_SP_FS_PVT_MEM_ADDR_REG,
REG_ADRENO_SP_VS_OBJ_START_REG,
REG_ADRENO_SP_FS_OBJ_START_REG,
REG_ADRENO_PA_SC_AA_CONFIG,
REG_ADRENO_SQ_GPR_MANAGEMENT,
REG_ADRENO_SQ_INST_STORE_MANAGMENT,
REG_ADRENO_TP0_CHICKEN,
REG_ADRENO_RBBM_RBBM_CTL,
REG_ADRENO_UCHE_INVALIDATE0,
REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_LO,
REG_ADRENO_RBBM_PERFCTR_LOAD_VALUE_HI,
REG_ADRENO_REGISTER_MAX,
};
enum adreno_quirks {
ADRENO_QUIRK_TWO_PASS_USE_WFI = 1,
ADRENO_QUIRK_FAULT_DETECT_MASK = 2,
};
struct adreno_rev {
uint8_t core;
uint8_t major;
@ -122,12 +73,16 @@ struct adreno_info {
uint32_t revn;
const char *name;
const char *pm4fw, *pfpfw;
const char *gpmufw;
uint32_t gmem;
struct msm_gpu *(*init)(struct drm_device *dev);
};
const struct adreno_info *adreno_info(struct adreno_rev rev);
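/* GPU address (iova) of a member of the rbmemptrs block shared with the CP */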
#define rbmemptr(adreno_gpu, member) \
((adreno_gpu)->memptrs_iova + offsetof(struct adreno_rbmemptrs, member))
struct adreno_rbmemptrs {
volatile uint32_t rptr;
volatile uint32_t wptr;
@ -153,7 +108,7 @@ struct adreno_gpu {
// different for z180..
struct adreno_rbmemptrs *memptrs;
struct drm_gem_object *memptrs_bo;
uint32_t memptrs_iova;
uint64_t memptrs_iova;
/*
* Register offsets are different between some GPUs.
@ -161,6 +116,8 @@ struct adreno_gpu {
* code (a3xx_gpu.c) and stored in this common location.
*/
const unsigned int *reg_offsets;
uint32_t quirks;
};
#define to_adreno_gpu(x) container_of(x, struct adreno_gpu, base)
@ -171,6 +128,7 @@ struct adreno_platform_config {
#ifdef DOWNSTREAM_CONFIG_MSM_BUS_SCALING
struct msm_bus_scale_pdata *bus_scale_table;
#endif
uint32_t quirks;
};
#define ADRENO_IDLE_TIMEOUT msecs_to_jiffies(1000)
@ -234,6 +192,11 @@ static inline int adreno_is_a430(struct adreno_gpu *gpu)
return gpu->revn == 430;
}
static inline int adreno_is_a530(struct adreno_gpu *gpu)
{
return gpu->revn == 530;
}
int adreno_get_param(struct msm_gpu *gpu, uint32_t param, uint64_t *value);
int adreno_hw_init(struct msm_gpu *gpu);
uint32_t adreno_last_fence(struct msm_gpu *gpu);
@ -241,7 +204,7 @@ void adreno_recover(struct msm_gpu *gpu);
void adreno_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx);
void adreno_flush(struct msm_gpu *gpu);
void adreno_idle(struct msm_gpu *gpu);
bool adreno_idle(struct msm_gpu *gpu);
#ifdef CONFIG_DEBUG_FS
void adreno_show(struct msm_gpu *gpu, struct seq_file *m);
#endif
@ -278,8 +241,38 @@ OUT_PKT3(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
OUT_RING(ring, CP_TYPE3_PKT | ((cnt-1) << 16) | ((opcode & 0xFF) << 8));
}
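/*
 * PM4_PARITY() below folds 'val' to the XOR of its eight nibbles and looks
 * that nibble up in the 0x9669 bitmask, returning 1 when 'val' has an even
 * number of set bits; PKT4/OUT_PKT7 pack the result into the packet header
 * as a parity check on the count/register/opcode fields.
 */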
static inline u32 PM4_PARITY(u32 val)
{
return (0x9669 >> (0xF & (val ^
(val >> 4) ^ (val >> 8) ^ (val >> 12) ^
(val >> 16) ^ ((val) >> 20) ^ (val >> 24) ^
(val >> 28)))) & 1;
}
/* Maximum number of values that can be executed for one opcode */
#define TYPE4_MAX_PAYLOAD 127
#define PKT4(_reg, _cnt) \
(CP_TYPE4_PKT | ((_cnt) << 0) | (PM4_PARITY((_cnt)) << 7) | \
(((_reg) & 0x3FFFF) << 8) | (PM4_PARITY((_reg)) << 27))
static inline void
OUT_PKT4(struct msm_ringbuffer *ring, uint16_t regindx, uint16_t cnt)
{
adreno_wait_ring(ring->gpu, cnt + 1);
OUT_RING(ring, PKT4(regindx, cnt));
}
static inline void
OUT_PKT7(struct msm_ringbuffer *ring, uint8_t opcode, uint16_t cnt)
{
adreno_wait_ring(ring->gpu, cnt + 1);
OUT_RING(ring, CP_TYPE7_PKT | (cnt << 0) | (PM4_PARITY(cnt) << 15) |
((opcode & 0x7F) << 16) | (PM4_PARITY(opcode) << 23));
}
/*
* adreno_checkreg_off() - Checks the validity of a register enum
* adreno_reg_check() - Checks the validity of a register enum
* @gpu: Pointer to struct adreno_gpu
* @offset_name: The register enum that is checked
*/
@ -290,6 +283,16 @@ static inline bool adreno_reg_check(struct adreno_gpu *gpu,
!gpu->reg_offsets[offset_name]) {
BUG();
}
/*
* REG_SKIP is a special value that tells us that the register in
* question isn't implemented on the target but shouldn't trigger a BUG(). This
* is used to cleanly implement adreno_gpu_write64() and
* adreno_gpu_read64() in a generic fashion
*/
if (gpu->reg_offsets[offset_name] == REG_SKIP)
return false;
return true;
}
@ -313,5 +316,37 @@ static inline void adreno_gpu_write(struct adreno_gpu *gpu,
struct msm_gpu *a3xx_gpu_init(struct drm_device *dev);
struct msm_gpu *a4xx_gpu_init(struct drm_device *dev);
struct msm_gpu *a5xx_gpu_init(struct drm_device *dev);
static inline void adreno_gpu_write64(struct adreno_gpu *gpu,
enum adreno_regs lo, enum adreno_regs hi, u64 data)
{
adreno_gpu_write(gpu, lo, lower_32_bits(data));
adreno_gpu_write(gpu, hi, upper_32_bits(data));
}
/*
* Given a register and a count, return a value to program into
* REG_CP_PROTECT_REG(n) - this will block both reads and writes for _len
* registers starting at _reg.
*
* The register base needs to be a multiple of the length. If it is not, the
* hardware will quietly mask off the bits for you and shift the size. For
* example, if you intend the protection to start at 0x07 for a length of 4
* (0x07-0x0A) the hardware will actually protect (0x04-0x07) which might
* expose registers you intended to protect!
*/
#define ADRENO_PROTECT_RW(_reg, _len) \
((1 << 30) | (1 << 29) | \
((ilog2((_len)) & 0x1F) << 24) | (((_reg) << 2) & 0xFFFFF))
/*
* Same as above, but allow reads over the range. For areas of mixed use (such
* as performance counters) this allows us to protect a much larger range with a
* single register
*/
#define ADRENO_PROTECT_RDONLY(_reg, _len) \
((1 << 29) | \
((ilog2((_len)) & 0x1F) << 24) | (((_reg) << 2) & 0xFFFFF))
#endif /* __ADRENO_GPU_H__ */

View file

@ -8,13 +8,14 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 398 bytes, from 2015-09-24 17:25:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno.xml ( 431 bytes, from 2016-04-26 17:56:44)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109916 bytes, from 2016-02-20 18:44:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32907 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 12025 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 22544 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83840 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110765 bytes, from 2016-11-26 23:01:48)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/a5xx.xml ( 90321 bytes, from 2016-11-28 16:50:05)
- /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00)
Copyright (C) 2013-2016 by the following authors:
@ -58,6 +59,7 @@ enum vgt_event_type {
RST_PIX_CNT = 13,
RST_VTX_CNT = 14,
TILE_FLUSH = 15,
STAT_EVENT = 16,
CACHE_FLUSH_AND_INV_TS_EVENT = 20,
ZPASS_DONE = 21,
CACHE_FLUSH_AND_INV_EVENT = 22,
@ -65,6 +67,10 @@ enum vgt_event_type {
PERFCOUNTER_STOP = 24,
VS_FETCH_DONE = 27,
FACENESS_FLUSH = 28,
UNK_1C = 28,
UNK_1D = 29,
BLIT = 30,
UNK_26 = 38,
};
enum pc_di_primtype {
@ -82,7 +88,6 @@ enum pc_di_primtype {
DI_PT_LINESTRIP_ADJ = 11,
DI_PT_TRI_ADJ = 12,
DI_PT_TRISTRIP_ADJ = 13,
DI_PT_PATCHES = 34,
};
enum pc_di_src_sel {
@ -110,11 +115,15 @@ enum adreno_pm4_packet_type {
CP_TYPE1_PKT = 0x40000000,
CP_TYPE2_PKT = 0x80000000,
CP_TYPE3_PKT = 0xc0000000,
CP_TYPE4_PKT = 0x40000000,
CP_TYPE7_PKT = 0x70000000,
};
enum adreno_pm4_type3_packets {
CP_ME_INIT = 72,
CP_NOP = 16,
CP_PREEMPT_ENABLE = 28,
CP_PREEMPT_TOKEN = 30,
CP_INDIRECT_BUFFER = 63,
CP_INDIRECT_BUFFER_PFD = 55,
CP_WAIT_FOR_IDLE = 38,
@ -163,6 +172,7 @@ enum adreno_pm4_type3_packets {
CP_TEST_TWO_MEMS = 113,
CP_REG_WR_NO_CTXT = 120,
CP_RECORD_PFP_TIMESTAMP = 17,
CP_SET_SECURE_MODE = 102,
CP_WAIT_FOR_ME = 19,
CP_SET_DRAW_STATE = 67,
CP_DRAW_INDX_OFFSET = 56,
@ -178,6 +188,22 @@ enum adreno_pm4_type3_packets {
CP_WAIT_MEM_WRITES = 18,
CP_COND_REG_EXEC = 71,
CP_MEM_TO_REG = 66,
CP_EXEC_CS = 51,
CP_PERFCOUNTER_ACTION = 80,
CP_SMMU_TABLE_UPDATE = 83,
CP_CONTEXT_REG_BUNCH = 92,
CP_YIELD_ENABLE = 28,
CP_SKIP_IB2_ENABLE_GLOBAL = 29,
CP_SKIP_IB2_ENABLE_LOCAL = 35,
CP_SET_SUBDRAW_SIZE = 53,
CP_SET_VISIBILITY_OVERRIDE = 100,
CP_PREEMPT_ENABLE_GLOBAL = 105,
CP_PREEMPT_ENABLE_LOCAL = 106,
CP_CONTEXT_SWITCH_YIELD = 107,
CP_SET_RENDER_MODE = 108,
CP_COMPUTE_CHECKPOINT = 110,
CP_MEM_TO_MEM = 115,
CP_BLIT = 44,
IN_IB_PREFETCH_END = 23,
IN_SUBBLK_PREFETCH = 31,
IN_INSTR_PREFETCH = 32,
@ -196,6 +222,7 @@ enum adreno_state_block {
SB_VERT_SHADER = 4,
SB_GEOM_SHADER = 5,
SB_FRAG_SHADER = 6,
SB_COMPUTE_SHADER = 7,
};
enum adreno_state_type {
@ -218,6 +245,17 @@ enum a4xx_index_size {
INDEX4_SIZE_32_BIT = 2,
};
enum render_mode_cmd {
BYPASS = 1,
GMEM = 3,
BLIT2D = 5,
};
enum cp_blit_cmd {
BLIT_OP_FILL = 0,
BLIT_OP_BLIT = 1,
};
#define REG_CP_LOAD_STATE_0 0x00000000
#define CP_LOAD_STATE_0_DST_OFF__MASK 0x0000ffff
#define CP_LOAD_STATE_0_DST_OFF__SHIFT 0
@ -258,6 +296,14 @@ static inline uint32_t CP_LOAD_STATE_1_EXT_SRC_ADDR(uint32_t val)
return ((val >> 2) << CP_LOAD_STATE_1_EXT_SRC_ADDR__SHIFT) & CP_LOAD_STATE_1_EXT_SRC_ADDR__MASK;
}
#define REG_CP_LOAD_STATE_2 0x00000002
#define CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__MASK 0xffffffff
#define CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__SHIFT 0
static inline uint32_t CP_LOAD_STATE_2_EXT_SRC_ADDR_HI(uint32_t val)
{
return ((val) << CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__SHIFT) & CP_LOAD_STATE_2_EXT_SRC_ADDR_HI__MASK;
}
#define REG_CP_DRAW_INDX_0 0x00000000
#define CP_DRAW_INDX_0_VIZ_QUERY__MASK 0xffffffff
#define CP_DRAW_INDX_0_VIZ_QUERY__SHIFT 0
@ -389,7 +435,12 @@ static inline uint32_t CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(enum pc_di_src_sel va
{
return ((val) << CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__SHIFT) & CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_TESSELLATE 0x00000100
#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK 0x00000300
#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT 8
static inline uint32_t CP_DRAW_INDX_OFFSET_0_VIS_CULL(enum pc_di_vis_cull_mode val)
{
return ((val) << CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT) & CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK;
}
#define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__MASK 0x00000c00
#define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__SHIFT 10
static inline uint32_t CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(enum a4xx_index_size val)
@ -437,30 +488,40 @@ static inline uint32_t CP_DRAW_INDX_OFFSET_5_INDX_SIZE(uint32_t val)
return ((val) << CP_DRAW_INDX_OFFSET_5_INDX_SIZE__SHIFT) & CP_DRAW_INDX_OFFSET_5_INDX_SIZE__MASK;
}
#define REG_CP_SET_DRAW_STATE_0 0x00000000
#define CP_SET_DRAW_STATE_0_COUNT__MASK 0x0000ffff
#define CP_SET_DRAW_STATE_0_COUNT__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE_0_COUNT(uint32_t val)
static inline uint32_t REG_CP_SET_DRAW_STATE_(uint32_t i0) { return 0x00000000 + 0x3*i0; }
static inline uint32_t REG_CP_SET_DRAW_STATE__0(uint32_t i0) { return 0x00000000 + 0x3*i0; }
#define CP_SET_DRAW_STATE__0_COUNT__MASK 0x0000ffff
#define CP_SET_DRAW_STATE__0_COUNT__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE__0_COUNT(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_0_COUNT__SHIFT) & CP_SET_DRAW_STATE_0_COUNT__MASK;
return ((val) << CP_SET_DRAW_STATE__0_COUNT__SHIFT) & CP_SET_DRAW_STATE__0_COUNT__MASK;
}
#define CP_SET_DRAW_STATE_0_DIRTY 0x00010000
#define CP_SET_DRAW_STATE_0_DISABLE 0x00020000
#define CP_SET_DRAW_STATE_0_DISABLE_ALL_GROUPS 0x00040000
#define CP_SET_DRAW_STATE_0_LOAD_IMMED 0x00080000
#define CP_SET_DRAW_STATE_0_GROUP_ID__MASK 0x1f000000
#define CP_SET_DRAW_STATE_0_GROUP_ID__SHIFT 24
static inline uint32_t CP_SET_DRAW_STATE_0_GROUP_ID(uint32_t val)
#define CP_SET_DRAW_STATE__0_DIRTY 0x00010000
#define CP_SET_DRAW_STATE__0_DISABLE 0x00020000
#define CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS 0x00040000
#define CP_SET_DRAW_STATE__0_LOAD_IMMED 0x00080000
#define CP_SET_DRAW_STATE__0_GROUP_ID__MASK 0x1f000000
#define CP_SET_DRAW_STATE__0_GROUP_ID__SHIFT 24
static inline uint32_t CP_SET_DRAW_STATE__0_GROUP_ID(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_0_GROUP_ID__SHIFT) & CP_SET_DRAW_STATE_0_GROUP_ID__MASK;
return ((val) << CP_SET_DRAW_STATE__0_GROUP_ID__SHIFT) & CP_SET_DRAW_STATE__0_GROUP_ID__MASK;
}
#define REG_CP_SET_DRAW_STATE_1 0x00000001
#define CP_SET_DRAW_STATE_1_ADDR__MASK 0xffffffff
#define CP_SET_DRAW_STATE_1_ADDR__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE_1_ADDR(uint32_t val)
static inline uint32_t REG_CP_SET_DRAW_STATE__1(uint32_t i0) { return 0x00000001 + 0x3*i0; }
#define CP_SET_DRAW_STATE__1_ADDR_LO__MASK 0xffffffff
#define CP_SET_DRAW_STATE__1_ADDR_LO__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE__1_ADDR_LO(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE_1_ADDR__SHIFT) & CP_SET_DRAW_STATE_1_ADDR__MASK;
return ((val) << CP_SET_DRAW_STATE__1_ADDR_LO__SHIFT) & CP_SET_DRAW_STATE__1_ADDR_LO__MASK;
}
static inline uint32_t REG_CP_SET_DRAW_STATE__2(uint32_t i0) { return 0x00000002 + 0x3*i0; }
#define CP_SET_DRAW_STATE__2_ADDR_HI__MASK 0xffffffff
#define CP_SET_DRAW_STATE__2_ADDR_HI__SHIFT 0
static inline uint32_t CP_SET_DRAW_STATE__2_ADDR_HI(uint32_t val)
{
return ((val) << CP_SET_DRAW_STATE__2_ADDR_HI__SHIFT) & CP_SET_DRAW_STATE__2_ADDR_HI__MASK;
}
#define REG_CP_SET_BIN_0 0x00000000
@ -533,5 +594,192 @@ static inline uint32_t CP_REG_TO_MEM_1_DEST(uint32_t val)
return ((val) << CP_REG_TO_MEM_1_DEST__SHIFT) & CP_REG_TO_MEM_1_DEST__MASK;
}
#define REG_CP_DISPATCH_COMPUTE_0 0x00000000
#define REG_CP_DISPATCH_COMPUTE_1 0x00000001
#define CP_DISPATCH_COMPUTE_1_X__MASK 0xffffffff
#define CP_DISPATCH_COMPUTE_1_X__SHIFT 0
static inline uint32_t CP_DISPATCH_COMPUTE_1_X(uint32_t val)
{
return ((val) << CP_DISPATCH_COMPUTE_1_X__SHIFT) & CP_DISPATCH_COMPUTE_1_X__MASK;
}
#define REG_CP_DISPATCH_COMPUTE_2 0x00000002
#define CP_DISPATCH_COMPUTE_2_Y__MASK 0xffffffff
#define CP_DISPATCH_COMPUTE_2_Y__SHIFT 0
static inline uint32_t CP_DISPATCH_COMPUTE_2_Y(uint32_t val)
{
return ((val) << CP_DISPATCH_COMPUTE_2_Y__SHIFT) & CP_DISPATCH_COMPUTE_2_Y__MASK;
}
#define REG_CP_DISPATCH_COMPUTE_3 0x00000003
#define CP_DISPATCH_COMPUTE_3_Z__MASK 0xffffffff
#define CP_DISPATCH_COMPUTE_3_Z__SHIFT 0
static inline uint32_t CP_DISPATCH_COMPUTE_3_Z(uint32_t val)
{
return ((val) << CP_DISPATCH_COMPUTE_3_Z__SHIFT) & CP_DISPATCH_COMPUTE_3_Z__MASK;
}
#define REG_CP_SET_RENDER_MODE_0 0x00000000
#define CP_SET_RENDER_MODE_0_MODE__MASK 0x000001ff
#define CP_SET_RENDER_MODE_0_MODE__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_0_MODE(enum render_mode_cmd val)
{
return ((val) << CP_SET_RENDER_MODE_0_MODE__SHIFT) & CP_SET_RENDER_MODE_0_MODE__MASK;
}
#define REG_CP_SET_RENDER_MODE_1 0x00000001
#define CP_SET_RENDER_MODE_1_ADDR_0_LO__MASK 0xffffffff
#define CP_SET_RENDER_MODE_1_ADDR_0_LO__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_1_ADDR_0_LO(uint32_t val)
{
return ((val) << CP_SET_RENDER_MODE_1_ADDR_0_LO__SHIFT) & CP_SET_RENDER_MODE_1_ADDR_0_LO__MASK;
}
#define REG_CP_SET_RENDER_MODE_2 0x00000002
#define CP_SET_RENDER_MODE_2_ADDR_0_HI__MASK 0xffffffff
#define CP_SET_RENDER_MODE_2_ADDR_0_HI__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_2_ADDR_0_HI(uint32_t val)
{
return ((val) << CP_SET_RENDER_MODE_2_ADDR_0_HI__SHIFT) & CP_SET_RENDER_MODE_2_ADDR_0_HI__MASK;
}
#define REG_CP_SET_RENDER_MODE_3 0x00000003
#define CP_SET_RENDER_MODE_3_GMEM_ENABLE 0x00000010
#define REG_CP_SET_RENDER_MODE_4 0x00000004
#define REG_CP_SET_RENDER_MODE_5 0x00000005
#define CP_SET_RENDER_MODE_5_ADDR_1_LEN__MASK 0xffffffff
#define CP_SET_RENDER_MODE_5_ADDR_1_LEN__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_5_ADDR_1_LEN(uint32_t val)
{
return ((val) << CP_SET_RENDER_MODE_5_ADDR_1_LEN__SHIFT) & CP_SET_RENDER_MODE_5_ADDR_1_LEN__MASK;
}
#define REG_CP_SET_RENDER_MODE_6 0x00000006
#define CP_SET_RENDER_MODE_6_ADDR_1_LO__MASK 0xffffffff
#define CP_SET_RENDER_MODE_6_ADDR_1_LO__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_6_ADDR_1_LO(uint32_t val)
{
return ((val) << CP_SET_RENDER_MODE_6_ADDR_1_LO__SHIFT) & CP_SET_RENDER_MODE_6_ADDR_1_LO__MASK;
}
#define REG_CP_SET_RENDER_MODE_7 0x00000007
#define CP_SET_RENDER_MODE_7_ADDR_1_HI__MASK 0xffffffff
#define CP_SET_RENDER_MODE_7_ADDR_1_HI__SHIFT 0
static inline uint32_t CP_SET_RENDER_MODE_7_ADDR_1_HI(uint32_t val)
{
return ((val) << CP_SET_RENDER_MODE_7_ADDR_1_HI__SHIFT) & CP_SET_RENDER_MODE_7_ADDR_1_HI__MASK;
}
#define REG_CP_PERFCOUNTER_ACTION_0 0x00000000
#define REG_CP_PERFCOUNTER_ACTION_1 0x00000001
#define CP_PERFCOUNTER_ACTION_1_ADDR_0_LO__MASK 0xffffffff
#define CP_PERFCOUNTER_ACTION_1_ADDR_0_LO__SHIFT 0
static inline uint32_t CP_PERFCOUNTER_ACTION_1_ADDR_0_LO(uint32_t val)
{
return ((val) << CP_PERFCOUNTER_ACTION_1_ADDR_0_LO__SHIFT) & CP_PERFCOUNTER_ACTION_1_ADDR_0_LO__MASK;
}
#define REG_CP_PERFCOUNTER_ACTION_2 0x00000002
#define CP_PERFCOUNTER_ACTION_2_ADDR_0_HI__MASK 0xffffffff
#define CP_PERFCOUNTER_ACTION_2_ADDR_0_HI__SHIFT 0
static inline uint32_t CP_PERFCOUNTER_ACTION_2_ADDR_0_HI(uint32_t val)
{
return ((val) << CP_PERFCOUNTER_ACTION_2_ADDR_0_HI__SHIFT) & CP_PERFCOUNTER_ACTION_2_ADDR_0_HI__MASK;
}
#define REG_CP_EVENT_WRITE_0 0x00000000
#define CP_EVENT_WRITE_0_EVENT__MASK 0x000000ff
#define CP_EVENT_WRITE_0_EVENT__SHIFT 0
static inline uint32_t CP_EVENT_WRITE_0_EVENT(enum vgt_event_type val)
{
return ((val) << CP_EVENT_WRITE_0_EVENT__SHIFT) & CP_EVENT_WRITE_0_EVENT__MASK;
}
#define REG_CP_EVENT_WRITE_1 0x00000001
#define CP_EVENT_WRITE_1_ADDR_0_LO__MASK 0xffffffff
#define CP_EVENT_WRITE_1_ADDR_0_LO__SHIFT 0
static inline uint32_t CP_EVENT_WRITE_1_ADDR_0_LO(uint32_t val)
{
return ((val) << CP_EVENT_WRITE_1_ADDR_0_LO__SHIFT) & CP_EVENT_WRITE_1_ADDR_0_LO__MASK;
}
#define REG_CP_EVENT_WRITE_2 0x00000002
#define CP_EVENT_WRITE_2_ADDR_0_HI__MASK 0xffffffff
#define CP_EVENT_WRITE_2_ADDR_0_HI__SHIFT 0
static inline uint32_t CP_EVENT_WRITE_2_ADDR_0_HI(uint32_t val)
{
return ((val) << CP_EVENT_WRITE_2_ADDR_0_HI__SHIFT) & CP_EVENT_WRITE_2_ADDR_0_HI__MASK;
}
#define REG_CP_EVENT_WRITE_3 0x00000003
#define REG_CP_BLIT_0 0x00000000
#define CP_BLIT_0_OP__MASK 0x0000000f
#define CP_BLIT_0_OP__SHIFT 0
static inline uint32_t CP_BLIT_0_OP(enum cp_blit_cmd val)
{
return ((val) << CP_BLIT_0_OP__SHIFT) & CP_BLIT_0_OP__MASK;
}
#define REG_CP_BLIT_1 0x00000001
#define CP_BLIT_1_SRC_X1__MASK 0x0000ffff
#define CP_BLIT_1_SRC_X1__SHIFT 0
static inline uint32_t CP_BLIT_1_SRC_X1(uint32_t val)
{
return ((val) << CP_BLIT_1_SRC_X1__SHIFT) & CP_BLIT_1_SRC_X1__MASK;
}
#define CP_BLIT_1_SRC_Y1__MASK 0xffff0000
#define CP_BLIT_1_SRC_Y1__SHIFT 16
static inline uint32_t CP_BLIT_1_SRC_Y1(uint32_t val)
{
return ((val) << CP_BLIT_1_SRC_Y1__SHIFT) & CP_BLIT_1_SRC_Y1__MASK;
}
#define REG_CP_BLIT_2 0x00000002
#define CP_BLIT_2_SRC_X2__MASK 0x0000ffff
#define CP_BLIT_2_SRC_X2__SHIFT 0
static inline uint32_t CP_BLIT_2_SRC_X2(uint32_t val)
{
return ((val) << CP_BLIT_2_SRC_X2__SHIFT) & CP_BLIT_2_SRC_X2__MASK;
}
#define CP_BLIT_2_SRC_Y2__MASK 0xffff0000
#define CP_BLIT_2_SRC_Y2__SHIFT 16
static inline uint32_t CP_BLIT_2_SRC_Y2(uint32_t val)
{
return ((val) << CP_BLIT_2_SRC_Y2__SHIFT) & CP_BLIT_2_SRC_Y2__MASK;
}
#define REG_CP_BLIT_3 0x00000003
#define CP_BLIT_3_DST_X1__MASK 0x0000ffff
#define CP_BLIT_3_DST_X1__SHIFT 0
static inline uint32_t CP_BLIT_3_DST_X1(uint32_t val)
{
return ((val) << CP_BLIT_3_DST_X1__SHIFT) & CP_BLIT_3_DST_X1__MASK;
}
#define CP_BLIT_3_DST_Y1__MASK 0xffff0000
#define CP_BLIT_3_DST_Y1__SHIFT 16
static inline uint32_t CP_BLIT_3_DST_Y1(uint32_t val)
{
return ((val) << CP_BLIT_3_DST_Y1__SHIFT) & CP_BLIT_3_DST_Y1__MASK;
}
#define REG_CP_BLIT_4 0x00000004
#define CP_BLIT_4_DST_X2__MASK 0x0000ffff
#define CP_BLIT_4_DST_X2__SHIFT 0
static inline uint32_t CP_BLIT_4_DST_X2(uint32_t val)
{
return ((val) << CP_BLIT_4_DST_X2__SHIFT) & CP_BLIT_4_DST_X2__MASK;
}
#define CP_BLIT_4_DST_Y2__MASK 0xffff0000
#define CP_BLIT_4_DST_Y2__SHIFT 16
static inline uint32_t CP_BLIT_4_DST_Y2(uint32_t val)
{
return ((val) << CP_BLIT_4_DST_Y2__SHIFT) & CP_BLIT_4_DST_Y2__MASK;
}
#endif /* ADRENO_PM4_XML */

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -982,7 +982,7 @@ static int dsi_tx_buf_alloc(struct msm_dsi_host *msm_host, int size)
struct drm_device *dev = msm_host->dev;
const struct msm_dsi_cfg_handler *cfg_hnd = msm_host->cfg_hnd;
int ret;
u32 iova;
uint64_t iova;
if (cfg_hnd->major == MSM_DSI_VER_MAJOR_6G) {
mutex_lock(&dev->struct_mutex);
@ -1147,7 +1147,7 @@ static int dsi_cmd_dma_tx(struct msm_dsi_host *msm_host, int len)
{
const struct msm_dsi_cfg_handler *cfg_hnd = msm_host->cfg_hnd;
int ret;
u32 dma_base;
uint64_t dma_base;
bool triggered;
if (cfg_hnd->major == MSM_DSI_VER_MAJOR_6G) {

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -373,7 +373,7 @@ static void update_cursor(struct drm_crtc *crtc)
if (mdp4_crtc->cursor.stale) {
struct drm_gem_object *next_bo = mdp4_crtc->cursor.next_bo;
struct drm_gem_object *prev_bo = mdp4_crtc->cursor.scanout_bo;
uint32_t iova = mdp4_crtc->cursor.next_iova;
uint64_t iova = mdp4_crtc->cursor.next_iova;
if (next_bo) {
/* take a obj ref + iova ref when we start scanning out: */
@ -418,7 +418,7 @@ static int mdp4_crtc_cursor_set(struct drm_crtc *crtc,
struct drm_device *dev = crtc->dev;
struct drm_gem_object *cursor_bo, *old_bo;
unsigned long flags;
uint32_t iova;
uint64_t iova;
int ret;
if ((width > CURSOR_WIDTH) || (height > CURSOR_HEIGHT)) {

View file

@ -17,6 +17,7 @@
#include "msm_drv.h"
#include "msm_gem.h"
#include "msm_mmu.h"
#include "mdp4_kms.h"
@ -159,17 +160,18 @@ static void mdp4_destroy(struct msm_kms *kms)
{
struct mdp4_kms *mdp4_kms = to_mdp4_kms(to_mdp_kms(kms));
struct device *dev = mdp4_kms->dev->dev;
struct msm_mmu *mmu = mdp4_kms->mmu;
if (mmu) {
mmu->funcs->detach(mmu, iommu_ports, ARRAY_SIZE(iommu_ports));
mmu->funcs->destroy(mmu);
}
struct msm_gem_address_space *aspace = mdp4_kms->aspace;
if (mdp4_kms->blank_cursor_iova)
msm_gem_put_iova(mdp4_kms->blank_cursor_bo, mdp4_kms->id);
drm_gem_object_unreference_unlocked(mdp4_kms->blank_cursor_bo);
if (aspace) {
aspace->mmu->funcs->detach(aspace->mmu,
iommu_ports, ARRAY_SIZE(iommu_ports));
msm_gem_address_space_destroy(aspace);
}
if (mdp4_kms->rpm_enabled)
pm_runtime_disable(dev);
@ -440,7 +442,7 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev)
struct mdp4_platform_config *config = mdp4_get_config(pdev);
struct mdp4_kms *mdp4_kms;
struct msm_kms *kms = NULL;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
int irq, ret;
mdp4_kms = kzalloc(sizeof(*mdp4_kms), GFP_KERNEL);
@ -531,24 +533,26 @@ struct msm_kms *mdp4_kms_init(struct drm_device *dev)
mdelay(16);
if (config->iommu) {
mmu = msm_iommu_new(&pdev->dev, config->iommu);
if (IS_ERR(mmu)) {
ret = PTR_ERR(mmu);
aspace = msm_gem_address_space_create(&pdev->dev,
config->iommu, "mdp4");
if (IS_ERR(aspace)) {
ret = PTR_ERR(aspace);
goto fail;
}
ret = mmu->funcs->attach(mmu, iommu_ports,
mdp4_kms->aspace = aspace;
ret = aspace->mmu->funcs->attach(aspace->mmu, iommu_ports,
ARRAY_SIZE(iommu_ports));
if (ret)
goto fail;
mdp4_kms->mmu = mmu;
} else {
dev_info(dev->dev, "no iommu, fallback to phys "
"contig buffers for scanout\n");
mmu = NULL;
aspace = NULL;
}
mdp4_kms->id = msm_register_mmu(dev, mmu);
mdp4_kms->id = msm_register_address_space(dev, aspace);
if (mdp4_kms->id < 0) {
ret = mdp4_kms->id;
dev_err(dev->dev, "failed to register mdp4 iommu: %d\n", ret);
@ -598,6 +602,10 @@ static struct mdp4_platform_config *mdp4_get_config(struct platform_device *dev)
/* TODO: Chips that aren't apq8064 have a 200 Mhz max_clk */
config.max_clk = 266667000;
config.iommu = iommu_domain_alloc(&platform_bus_type);
if (config.iommu) {
config.iommu->geometry.aperture_start = 0x1000;
config.iommu->geometry.aperture_end = 0xffffffff;
}
return &config;
}

View file

@ -43,7 +43,7 @@ struct mdp4_kms {
struct clk *pclk;
struct clk *lut_clk;
struct clk *axi_clk;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
struct mdp_irq error_handler;
@ -51,7 +51,7 @@ struct mdp4_kms {
/* empty/blank cursor bo to use when cursor is "disabled" */
struct drm_gem_object *blank_cursor_bo;
uint32_t blank_cursor_iova;
uint64_t blank_cursor_iova;
};
#define to_mdp4_kms(x) container_of(x, struct mdp4_kms, base)

View file

@ -8,9 +8,17 @@ This file was generated by the rules-ng-ng headergen tool in this git repository
git clone https://github.com/freedreno/envytools.git
The rules-ng-ng source files this header was generated from are:
- /local/mnt/workspace/source_trees/envytools/rnndb/../rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-05-10 05:06:30)
- /local/mnt/workspace/source_trees/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-05-09 06:32:54)
- /local/mnt/workspace/source_trees/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2016-01-07 08:45:55)
- /home/robclark/src/freedreno/envytools/rnndb/msm.xml ( 676 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/hdmi/qfprom.xml ( 600 bytes, from 2015-05-20 20:03:07)
- /home/robclark/src/freedreno/envytools/rnndb/hdmi/hdmi.xml ( 41472 bytes, from 2016-01-22 18:18:18)
- /home/robclark/src/freedreno/envytools/rnndb/edp/edp.xml ( 10416 bytes, from 2015-05-20 20:03:14)
Copyright (C) 2013-2016 by the following authors:
- Rob Clark <robdclark@gmail.com> (robclark)

View file

@ -550,6 +550,10 @@ static struct mdp5_cfg_platform *mdp5_get_config(struct platform_device *dev)
static struct mdp5_cfg_platform config = {};
config.iommu = iommu_domain_alloc(&platform_bus_type);
if (config.iommu) {
config.iommu->geometry.aperture_start = 0x1000;
config.iommu->geometry.aperture_end = 0xffffffff;
}
return &config;
}

View file

@ -27,11 +27,8 @@
#define CURSOR_WIDTH 64
#define CURSOR_HEIGHT 64
#define SSPP_MAX (SSPP_RGB3 + 1) /* TODO: Add SSPP_MAX in mdp5.xml.h */
struct mdp5_crtc {
struct drm_crtc base;
char name[8];
int id;
bool enabled;
@ -102,7 +99,7 @@ static u32 crtc_flush(struct drm_crtc *crtc, u32 flush_mask)
{
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
DBG("%s: flush=%08x", mdp5_crtc->name, flush_mask);
DBG("%s: flush=%08x", crtc->name, flush_mask);
return mdp5_ctl_commit(mdp5_crtc->ctl, flush_mask);
}
@ -136,7 +133,6 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
struct drm_device *dev = crtc->dev;
struct drm_pending_vblank_event *event;
struct drm_plane *plane;
unsigned long flags;
spin_lock_irqsave(&dev->event_lock, flags);
@ -148,16 +144,12 @@ static void complete_flip(struct drm_crtc *crtc, struct drm_file *file)
*/
if (!file || (event->base.file_priv == file)) {
mdp5_crtc->event = NULL;
DBG("%s: send event: %p", mdp5_crtc->name, event);
DBG("%s: send event: %p", crtc->name, event);
drm_crtc_send_vblank_event(crtc, event);
}
}
spin_unlock_irqrestore(&dev->event_lock, flags);
drm_atomic_crtc_for_each_plane(plane, crtc) {
mdp5_plane_complete_flip(plane);
}
if (mdp5_crtc->ctl && !crtc->state->enable) {
/* set STAGE_UNUSED for all layers */
mdp5_ctl_blend(mdp5_crtc->ctl, NULL, 0, 0);
@ -295,7 +287,7 @@ static void mdp5_crtc_mode_set_nofb(struct drm_crtc *crtc)
mode = &crtc->state->adjusted_mode;
DBG("%s: set mode: %d:\"%s\" %d %d %d %d %d %d %d %d %d %d 0x%x 0x%x",
mdp5_crtc->name, mode->base.id, mode->name,
crtc->name, mode->base.id, mode->name,
mode->vrefresh, mode->clock,
mode->hdisplay, mode->hsync_start,
mode->hsync_end, mode->htotal,
@ -315,7 +307,7 @@ static void mdp5_crtc_disable(struct drm_crtc *crtc)
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
struct mdp5_kms *mdp5_kms = get_kms(crtc);
DBG("%s", mdp5_crtc->name);
DBG("%s", crtc->name);
if (WARN_ON(!mdp5_crtc->enabled))
return;
@ -334,7 +326,7 @@ static void mdp5_crtc_enable(struct drm_crtc *crtc)
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
struct mdp5_kms *mdp5_kms = get_kms(crtc);
DBG("%s", mdp5_crtc->name);
DBG("%s", crtc->name);
if (WARN_ON(mdp5_crtc->enabled))
return;
@ -372,7 +364,6 @@ static bool is_fullscreen(struct drm_crtc_state *cstate,
static int mdp5_crtc_atomic_check(struct drm_crtc *crtc,
struct drm_crtc_state *state)
{
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
struct mdp5_kms *mdp5_kms = get_kms(crtc);
struct drm_plane *plane;
struct drm_device *dev = crtc->dev;
@ -381,7 +372,7 @@ static int mdp5_crtc_atomic_check(struct drm_crtc *crtc,
const struct drm_plane_state *pstate;
int cnt = 0, base = 0, i;
DBG("%s: check", mdp5_crtc->name);
DBG("%s: check", crtc->name);
drm_atomic_crtc_state_for_each_plane_state(plane, pstate, state) {
pstates[cnt].plane = plane;
@ -405,14 +396,14 @@ static int mdp5_crtc_atomic_check(struct drm_crtc *crtc,
hw_cfg = mdp5_cfg_get_hw_config(mdp5_kms->cfg);
if ((cnt + base) >= hw_cfg->lm.nb_stages) {
dev_err(dev->dev, "too many planes!\n");
dev_err(dev->dev, "too many planes! cnt=%d, base=%d\n", cnt, base);
return -EINVAL;
}
for (i = 0; i < cnt; i++) {
pstates[i].state->stage = STAGE_BASE + i + base;
DBG("%s: assign pipe %s on stage=%d", mdp5_crtc->name,
pipe2name(mdp5_plane_pipe(pstates[i].plane)),
DBG("%s: assign pipe %s on stage=%d", crtc->name,
pstates[i].plane->name,
pstates[i].state->stage);
}
@ -422,8 +413,7 @@ static int mdp5_crtc_atomic_check(struct drm_crtc *crtc,
static void mdp5_crtc_atomic_begin(struct drm_crtc *crtc,
struct drm_crtc_state *old_crtc_state)
{
struct mdp5_crtc *mdp5_crtc = to_mdp5_crtc(crtc);
DBG("%s: begin", mdp5_crtc->name);
DBG("%s: begin", crtc->name);
}
static void mdp5_crtc_atomic_flush(struct drm_crtc *crtc,
@ -433,7 +423,7 @@ static void mdp5_crtc_atomic_flush(struct drm_crtc *crtc,
struct drm_device *dev = crtc->dev;
unsigned long flags;
DBG("%s: event: %p", mdp5_crtc->name, crtc->state->event);
DBG("%s: event: %p", crtc->name, crtc->state->event);
WARN_ON(mdp5_crtc->event);
@ -499,7 +489,8 @@ static int mdp5_crtc_cursor_set(struct drm_crtc *crtc,
struct drm_device *dev = crtc->dev;
struct mdp5_kms *mdp5_kms = get_kms(crtc);
struct drm_gem_object *cursor_bo, *old_bo = NULL;
uint32_t blendcfg, cursor_addr, stride;
uint32_t blendcfg, stride;
uint64_t cursor_addr;
int ret, lm;
enum mdp5_cursor_alpha cur_alpha = CURSOR_ALPHA_PER_PIXEL;
uint32_t flush_mask = mdp_ctl_flush_mask_cursor(0);
@ -653,7 +644,7 @@ static void mdp5_crtc_err_irq(struct mdp_irq *irq, uint32_t irqstatus)
{
struct mdp5_crtc *mdp5_crtc = container_of(irq, struct mdp5_crtc, err);
DBG("%s: error: %08x", mdp5_crtc->name, irqstatus);
DBG("%s: error: %08x", mdp5_crtc->base.name, irqstatus);
}
static void mdp5_crtc_pp_done_irq(struct mdp_irq *irq, uint32_t irqstatus)
@ -775,9 +766,6 @@ struct drm_crtc *mdp5_crtc_init(struct drm_device *dev,
mdp5_crtc->vblank.irq = mdp5_crtc_vblank_irq;
mdp5_crtc->err.irq = mdp5_crtc_err_irq;
snprintf(mdp5_crtc->name, sizeof(mdp5_crtc->name), "%s:%d",
pipe2name(mdp5_plane_pipe(plane)), id);
drm_crtc_init_with_planes(dev, crtc, plane, NULL, &mdp5_crtc_funcs,
NULL);

View file

@ -41,6 +41,8 @@ static void mdp5_irq_error_handler(struct mdp_irq *irq, uint32_t irqstatus)
if (dumpstate && __ratelimit(&rs)) {
struct drm_printer p = drm_info_printer(mdp5_kms->dev->dev);
drm_state_dump(mdp5_kms->dev, &p);
if (mdp5_kms->smp)
mdp5_smp_dump(mdp5_kms->smp, &p);
}
}

View file

@ -19,6 +19,7 @@
#include <linux/of_irq.h>
#include "msm_drv.h"
#include "msm_gem.h"
#include "msm_mmu.h"
#include "mdp5_kms.h"
@ -71,10 +72,49 @@ static int mdp5_hw_init(struct msm_kms *kms)
return 0;
}
struct mdp5_state *mdp5_get_state(struct drm_atomic_state *s)
{
struct msm_drm_private *priv = s->dev->dev_private;
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(priv->kms));
struct msm_kms_state *state = to_kms_state(s);
struct mdp5_state *new_state;
int ret;
if (state->state)
return state->state;
ret = drm_modeset_lock(&mdp5_kms->state_lock, s->acquire_ctx);
if (ret)
return ERR_PTR(ret);
new_state = kmalloc(sizeof(*mdp5_kms->state), GFP_KERNEL);
if (!new_state)
return ERR_PTR(-ENOMEM);
/* Copy state: */
new_state->hwpipe = mdp5_kms->state->hwpipe;
if (mdp5_kms->smp)
new_state->smp = mdp5_kms->state->smp;
state->state = new_state;
return new_state;
}
static void mdp5_swap_state(struct msm_kms *kms, struct drm_atomic_state *state)
{
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
swap(to_kms_state(state)->state, mdp5_kms->state);
}
static void mdp5_prepare_commit(struct msm_kms *kms, struct drm_atomic_state *state)
{
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
mdp5_enable(mdp5_kms);
if (mdp5_kms->smp)
mdp5_smp_prepare_commit(mdp5_kms->smp, &mdp5_kms->state->smp);
}
static void mdp5_complete_commit(struct msm_kms *kms, struct drm_atomic_state *state)
@ -87,6 +127,9 @@ static void mdp5_complete_commit(struct msm_kms *kms, struct drm_atomic_state *s
for_each_plane_in_state(state, plane, plane_state, i)
mdp5_plane_complete_commit(plane, plane_state);
if (mdp5_kms->smp)
mdp5_smp_complete_commit(mdp5_kms->smp, &mdp5_kms->state->smp);
mdp5_disable(mdp5_kms);
}
@ -117,14 +160,66 @@ static int mdp5_set_split_display(struct msm_kms *kms,
static void mdp5_kms_destroy(struct msm_kms *kms)
{
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(kms));
struct msm_mmu *mmu = mdp5_kms->mmu;
struct msm_gem_address_space *aspace = mdp5_kms->aspace;
int i;
if (mmu) {
mmu->funcs->detach(mmu, iommu_ports, ARRAY_SIZE(iommu_ports));
mmu->funcs->destroy(mmu);
for (i = 0; i < mdp5_kms->num_hwpipes; i++)
mdp5_pipe_destroy(mdp5_kms->hwpipes[i]);
if (aspace) {
aspace->mmu->funcs->detach(aspace->mmu,
iommu_ports, ARRAY_SIZE(iommu_ports));
msm_gem_address_space_destroy(aspace);
}
}
#ifdef CONFIG_DEBUG_FS
static int smp_show(struct seq_file *m, void *arg)
{
struct drm_info_node *node = (struct drm_info_node *) m->private;
struct drm_device *dev = node->minor->dev;
struct msm_drm_private *priv = dev->dev_private;
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(priv->kms));
struct drm_printer p = drm_seq_file_printer(m);
if (!mdp5_kms->smp) {
drm_printf(&p, "no SMP pool\n");
return 0;
}
mdp5_smp_dump(mdp5_kms->smp, &p);
return 0;
}
static struct drm_info_list mdp5_debugfs_list[] = {
{"smp", smp_show },
};
static int mdp5_kms_debugfs_init(struct msm_kms *kms, struct drm_minor *minor)
{
struct drm_device *dev = minor->dev;
int ret;
ret = drm_debugfs_create_files(mdp5_debugfs_list,
ARRAY_SIZE(mdp5_debugfs_list),
minor->debugfs_root, minor);
if (ret) {
dev_err(dev->dev, "could not install mdp5_debugfs_list\n");
return ret;
}
return 0;
}
static void mdp5_kms_debugfs_cleanup(struct msm_kms *kms, struct drm_minor *minor)
{
drm_debugfs_remove_files(mdp5_debugfs_list,
ARRAY_SIZE(mdp5_debugfs_list), minor);
}
#endif
static const struct mdp_kms_funcs kms_funcs = {
.base = {
.hw_init = mdp5_hw_init,
@ -134,6 +229,7 @@ static const struct mdp_kms_funcs kms_funcs = {
.irq = mdp5_irq,
.enable_vblank = mdp5_enable_vblank,
.disable_vblank = mdp5_disable_vblank,
.swap_state = mdp5_swap_state,
.prepare_commit = mdp5_prepare_commit,
.complete_commit = mdp5_complete_commit,
.wait_for_crtc_commit_done = mdp5_wait_for_crtc_commit_done,
@ -141,6 +237,10 @@ static const struct mdp_kms_funcs kms_funcs = {
.round_pixclk = mdp5_round_pixclk,
.set_split_display = mdp5_set_split_display,
.destroy = mdp5_kms_destroy,
#ifdef CONFIG_DEBUG_FS
.debugfs_init = mdp5_kms_debugfs_init,
.debugfs_cleanup = mdp5_kms_debugfs_cleanup,
#endif
},
.set_irqmask = mdp5_set_irqmask,
};
@ -321,15 +421,6 @@ static int modeset_init_intf(struct mdp5_kms *mdp5_kms, int intf_num)
static int modeset_init(struct mdp5_kms *mdp5_kms)
{
static const enum mdp5_pipe crtcs[] = {
SSPP_RGB0, SSPP_RGB1, SSPP_RGB2, SSPP_RGB3,
};
static const enum mdp5_pipe vig_planes[] = {
SSPP_VIG0, SSPP_VIG1, SSPP_VIG2, SSPP_VIG3,
};
static const enum mdp5_pipe dma_planes[] = {
SSPP_DMA0, SSPP_DMA1,
};
struct drm_device *dev = mdp5_kms->dev;
struct msm_drm_private *priv = dev->dev_private;
const struct mdp5_cfg_hw *hw_cfg;
@ -337,58 +428,35 @@ static int modeset_init(struct mdp5_kms *mdp5_kms)
hw_cfg = mdp5_cfg_get_hw_config(mdp5_kms->cfg);
/* construct CRTCs and their private planes: */
for (i = 0; i < hw_cfg->pipe_rgb.count; i++) {
/* Construct planes equaling the number of hw pipes, and CRTCs
* for the N layer-mixers (LM). The first N planes become primary
* planes for the CRTCs, with the remainder as overlay planes:
*/
for (i = 0; i < mdp5_kms->num_hwpipes; i++) {
bool primary = i < mdp5_cfg->lm.count;
struct drm_plane *plane;
struct drm_crtc *crtc;
plane = mdp5_plane_init(dev, crtcs[i], true,
hw_cfg->pipe_rgb.base[i], hw_cfg->pipe_rgb.caps);
plane = mdp5_plane_init(dev, primary);
if (IS_ERR(plane)) {
ret = PTR_ERR(plane);
dev_err(dev->dev, "failed to construct plane for %s (%d)\n",
pipe2name(crtcs[i]), ret);
dev_err(dev->dev, "failed to construct plane %d (%d)\n", i, ret);
goto fail;
}
priv->planes[priv->num_planes++] = plane;
if (!primary)
continue;
crtc = mdp5_crtc_init(dev, plane, i);
if (IS_ERR(crtc)) {
ret = PTR_ERR(crtc);
dev_err(dev->dev, "failed to construct crtc for %s (%d)\n",
pipe2name(crtcs[i]), ret);
dev_err(dev->dev, "failed to construct crtc %d (%d)\n", i, ret);
goto fail;
}
priv->crtcs[priv->num_crtcs++] = crtc;
}
/* Construct video planes: */
for (i = 0; i < hw_cfg->pipe_vig.count; i++) {
struct drm_plane *plane;
plane = mdp5_plane_init(dev, vig_planes[i], false,
hw_cfg->pipe_vig.base[i], hw_cfg->pipe_vig.caps);
if (IS_ERR(plane)) {
ret = PTR_ERR(plane);
dev_err(dev->dev, "failed to construct %s plane: %d\n",
pipe2name(vig_planes[i]), ret);
goto fail;
}
}
/* DMA planes */
for (i = 0; i < hw_cfg->pipe_dma.count; i++) {
struct drm_plane *plane;
plane = mdp5_plane_init(dev, dma_planes[i], false,
hw_cfg->pipe_dma.base[i], hw_cfg->pipe_dma.caps);
if (IS_ERR(plane)) {
ret = PTR_ERR(plane);
dev_err(dev->dev, "failed to construct %s plane: %d\n",
pipe2name(dma_planes[i]), ret);
goto fail;
}
}
/* Construct encoders and modeset initialize connector devices
* for each external display interface.
*/
@ -564,7 +632,7 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
struct mdp5_kms *mdp5_kms;
struct mdp5_cfg *config;
struct msm_kms *kms;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
int irq, i, ret;
/* priv->kms would have been populated by the MDP5 driver */
@ -606,30 +674,29 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
mdelay(16);
if (config->platform.iommu) {
mmu = msm_iommu_new(&pdev->dev, config->platform.iommu);
if (IS_ERR(mmu)) {
ret = PTR_ERR(mmu);
dev_err(&pdev->dev, "failed to init iommu: %d\n", ret);
iommu_domain_free(config->platform.iommu);
aspace = msm_gem_address_space_create(&pdev->dev,
config->platform.iommu, "mdp5");
if (IS_ERR(aspace)) {
ret = PTR_ERR(aspace);
goto fail;
}
ret = mmu->funcs->attach(mmu, iommu_ports,
mdp5_kms->aspace = aspace;
ret = aspace->mmu->funcs->attach(aspace->mmu, iommu_ports,
ARRAY_SIZE(iommu_ports));
if (ret) {
dev_err(&pdev->dev, "failed to attach iommu: %d\n",
ret);
mmu->funcs->destroy(mmu);
goto fail;
}
} else {
dev_info(&pdev->dev,
"no iommu, fallback to phys contig buffers for scanout\n");
mmu = NULL;
aspace = NULL;
}
mdp5_kms->mmu = mmu;
mdp5_kms->id = msm_register_mmu(dev, mmu);
mdp5_kms->id = msm_register_address_space(dev, aspace);
if (mdp5_kms->id < 0) {
ret = mdp5_kms->id;
dev_err(&pdev->dev, "failed to register mdp5 iommu: %d\n", ret);
@ -644,8 +711,8 @@ struct msm_kms *mdp5_kms_init(struct drm_device *dev)
dev->mode_config.min_width = 0;
dev->mode_config.min_height = 0;
dev->mode_config.max_width = config->hw->lm.max_width;
dev->mode_config.max_height = config->hw->lm.max_height;
dev->mode_config.max_width = 0xffff;
dev->mode_config.max_height = 0xffff;
dev->driver->get_vblank_timestamp = mdp5_get_vblank_timestamp;
dev->driver->get_scanout_position = mdp5_get_scanoutpos;
@ -673,6 +740,69 @@ static void mdp5_destroy(struct platform_device *pdev)
if (mdp5_kms->rpm_enabled)
pm_runtime_disable(&pdev->dev);
kfree(mdp5_kms->state);
}
static int construct_pipes(struct mdp5_kms *mdp5_kms, int cnt,
const enum mdp5_pipe *pipes, const uint32_t *offsets,
uint32_t caps)
{
struct drm_device *dev = mdp5_kms->dev;
int i, ret;
for (i = 0; i < cnt; i++) {
struct mdp5_hw_pipe *hwpipe;
hwpipe = mdp5_pipe_init(pipes[i], offsets[i], caps);
if (IS_ERR(hwpipe)) {
ret = PTR_ERR(hwpipe);
dev_err(dev->dev, "failed to construct pipe for %s (%d)\n",
pipe2name(pipes[i]), ret);
return ret;
}
hwpipe->idx = mdp5_kms->num_hwpipes;
mdp5_kms->hwpipes[mdp5_kms->num_hwpipes++] = hwpipe;
}
return 0;
}
static int hwpipe_init(struct mdp5_kms *mdp5_kms)
{
static const enum mdp5_pipe rgb_planes[] = {
SSPP_RGB0, SSPP_RGB1, SSPP_RGB2, SSPP_RGB3,
};
static const enum mdp5_pipe vig_planes[] = {
SSPP_VIG0, SSPP_VIG1, SSPP_VIG2, SSPP_VIG3,
};
static const enum mdp5_pipe dma_planes[] = {
SSPP_DMA0, SSPP_DMA1,
};
const struct mdp5_cfg_hw *hw_cfg;
int ret;
hw_cfg = mdp5_cfg_get_hw_config(mdp5_kms->cfg);
/* Construct RGB pipes: */
ret = construct_pipes(mdp5_kms, hw_cfg->pipe_rgb.count, rgb_planes,
hw_cfg->pipe_rgb.base, hw_cfg->pipe_rgb.caps);
if (ret)
return ret;
/* Construct video (VIG) pipes: */
ret = construct_pipes(mdp5_kms, hw_cfg->pipe_vig.count, vig_planes,
hw_cfg->pipe_vig.base, hw_cfg->pipe_vig.caps);
if (ret)
return ret;
/* Construct DMA pipes: */
ret = construct_pipes(mdp5_kms, hw_cfg->pipe_dma.count, dma_planes,
hw_cfg->pipe_dma.base, hw_cfg->pipe_dma.caps);
if (ret)
return ret;
return 0;
}
static int mdp5_init(struct platform_device *pdev, struct drm_device *dev)
@ -696,6 +826,13 @@ static int mdp5_init(struct platform_device *pdev, struct drm_device *dev)
mdp5_kms->dev = dev;
mdp5_kms->pdev = pdev;
drm_modeset_lock_init(&mdp5_kms->state_lock);
mdp5_kms->state = kzalloc(sizeof(*mdp5_kms->state), GFP_KERNEL);
if (!mdp5_kms->state) {
ret = -ENOMEM;
goto fail;
}
mdp5_kms->mmio = msm_ioremap(pdev, "mdp_phys", "MDP5");
if (IS_ERR(mdp5_kms->mmio)) {
ret = PTR_ERR(mdp5_kms->mmio);
@ -749,7 +886,7 @@ static int mdp5_init(struct platform_device *pdev, struct drm_device *dev)
* this section initializes the SMP:
*/
if (mdp5_kms->caps & MDP_CAP_SMP) {
mdp5_kms->smp = mdp5_smp_init(mdp5_kms->dev, &config->hw->smp);
mdp5_kms->smp = mdp5_smp_init(mdp5_kms, &config->hw->smp);
if (IS_ERR(mdp5_kms->smp)) {
ret = PTR_ERR(mdp5_kms->smp);
mdp5_kms->smp = NULL;
@ -764,6 +901,10 @@ static int mdp5_init(struct platform_device *pdev, struct drm_device *dev)
goto fail;
}
ret = hwpipe_init(mdp5_kms);
if (ret)
goto fail;
/* set uninit-ed kms */
priv->kms = &mdp5_kms->base.base;

View file

@ -24,8 +24,11 @@
#include "mdp5_cfg.h" /* must be included before mdp5.xml.h */
#include "mdp5.xml.h"
#include "mdp5_ctl.h"
#include "mdp5_pipe.h"
#include "mdp5_smp.h"
struct mdp5_state;
struct mdp5_kms {
struct mdp_kms base;
@ -33,13 +36,21 @@ struct mdp5_kms {
struct platform_device *pdev;
unsigned num_hwpipes;
struct mdp5_hw_pipe *hwpipes[SSPP_MAX];
struct mdp5_cfg_handler *cfg;
uint32_t caps; /* MDP capabilities (MDP_CAP_XXX bits) */
/**
* Global atomic state. Do not access directly, use mdp5_get_state()
*/
struct mdp5_state *state;
struct drm_modeset_lock state_lock;
/* mapper-id used to request GEM buffer mapped for scanout: */
int id;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
struct mdp5_smp *smp;
struct mdp5_ctl_manager *ctlm;
@ -65,9 +76,27 @@ struct mdp5_kms {
};
#define to_mdp5_kms(x) container_of(x, struct mdp5_kms, base)
/* Global atomic state for tracking resources that are shared across
* multiple kms objects (planes/crtcs/etc).
*
* For atomic updates which require modifying global state, the state is
* first duplicated (see mdp5_get_state()) and the duplicate is swapped
* in at commit time (see mdp5_swap_state()).
*/
struct mdp5_state {
struct mdp5_hw_pipe_state hwpipe;
struct mdp5_smp_state smp;
};
struct mdp5_state *__must_check
mdp5_get_state(struct drm_atomic_state *s);
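The comment above describes the clone-and-swap discipline for shared state: an atomic update never touches the live mdp5_state directly, it works on a duplicate obtained from mdp5_get_state(), and the duplicate only becomes the live state when the commit goes through. A stripped-down user-space model of that pattern (toy types and hypothetical helpers dup_state()/commit_state(), not the driver's actual API) looks roughly like this:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for the driver-global state that an atomic update duplicates. */
struct global_state {
	int hwpipe_owner[4];		/* which plane owns each pipe, -1 = free */
};

static struct global_state *current_state;

/* Like mdp5_get_state(): hand the update a private copy it may modify;
 * the copy is simply thrown away if a check fails. */
static struct global_state *dup_state(void)
{
	struct global_state *copy = malloc(sizeof(*copy));

	if (copy)
		memcpy(copy, current_state, sizeof(*copy));
	return copy;
}

/* Like swap_state() at commit time: the private copy becomes the live
 * state (the kernel defers freeing the old copy to atomic_state_free()). */
static void commit_state(struct global_state *candidate)
{
	struct global_state *old = current_state;

	current_state = candidate;
	free(old);
}

int main(void)
{
	current_state = malloc(sizeof(*current_state));
	if (!current_state)
		return 1;
	memset(current_state->hwpipe_owner, -1, sizeof(current_state->hwpipe_owner));

	struct global_state *next = dup_state();
	if (!next)
		return 1;
	next->hwpipe_owner[0] = 7;		/* plane 7 grabs pipe 0 in this update */
	commit_state(next);

	printf("pipe 0 owner: %d\n", current_state->hwpipe_owner[0]);
	free(current_state);
	return 0;
}
```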
/* Atomic plane state. Subclasses the base drm_plane_state in order to
* track assigned hwpipe and hw specific state.
*/
struct mdp5_plane_state {
struct drm_plane_state base;
struct mdp5_hw_pipe *hwpipe;
/* aligned with property */
uint8_t premultiplied;
uint8_t zpos;
@ -76,11 +105,6 @@ struct mdp5_plane_state {
/* assigned by crtc blender */
enum mdp_mixer_stage_id stage;
/* some additional transactional status to help us know in the
* apply path whether we need to update SMP allocation, and
* whether current update is still pending:
*/
bool mode_changed : 1;
bool pending : 1;
};
#define to_mdp5_plane_state(x) \
@ -208,13 +232,10 @@ int mdp5_irq_domain_init(struct mdp5_kms *mdp5_kms);
void mdp5_irq_domain_fini(struct mdp5_kms *mdp5_kms);
uint32_t mdp5_plane_get_flush(struct drm_plane *plane);
void mdp5_plane_complete_flip(struct drm_plane *plane);
void mdp5_plane_complete_commit(struct drm_plane *plane,
struct drm_plane_state *state);
enum mdp5_pipe mdp5_plane_pipe(struct drm_plane *plane);
struct drm_plane *mdp5_plane_init(struct drm_device *dev,
enum mdp5_pipe pipe, bool private_plane,
uint32_t reg_offset, uint32_t caps);
struct drm_plane *mdp5_plane_init(struct drm_device *dev, bool primary);
uint32_t mdp5_crtc_vblank(struct drm_crtc *crtc);

View file

@ -0,0 +1,133 @@
/*
* Copyright (C) 2016 Red Hat
* Author: Rob Clark <robdclark@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "mdp5_kms.h"
struct mdp5_hw_pipe *mdp5_pipe_assign(struct drm_atomic_state *s,
struct drm_plane *plane, uint32_t caps, uint32_t blkcfg)
{
struct msm_drm_private *priv = s->dev->dev_private;
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(priv->kms));
struct mdp5_state *state;
struct mdp5_hw_pipe_state *old_state, *new_state;
struct mdp5_hw_pipe *hwpipe = NULL;
int i;
state = mdp5_get_state(s);
if (IS_ERR(state))
return ERR_CAST(state);
/* grab old_state after mdp5_get_state(), since now we hold lock: */
old_state = &mdp5_kms->state->hwpipe;
new_state = &state->hwpipe;
for (i = 0; i < mdp5_kms->num_hwpipes; i++) {
struct mdp5_hw_pipe *cur = mdp5_kms->hwpipes[i];
/* skip if already in-use.. check both new and old state,
* since we cannot immediately re-use a pipe that is
* released in the current update in some cases:
* (1) mdp5 can have SMP (non-double-buffered)
* (2) hw pipe previously assigned to different CRTC
* (vblanks might not be aligned)
*/
if (new_state->hwpipe_to_plane[cur->idx] ||
old_state->hwpipe_to_plane[cur->idx])
continue;
/* skip if doesn't support some required caps: */
if (caps & ~cur->caps)
continue;
/* possible candidate, take the one with the
* fewest unneeded caps bits set:
*/
if (!hwpipe || (hweight_long(cur->caps & ~caps) <
hweight_long(hwpipe->caps & ~caps)))
hwpipe = cur;
}
if (!hwpipe)
return ERR_PTR(-ENOMEM);
if (mdp5_kms->smp) {
int ret;
DBG("%s: alloc SMP blocks", hwpipe->name);
ret = mdp5_smp_assign(mdp5_kms->smp, &state->smp,
hwpipe->pipe, blkcfg);
if (ret)
return ERR_PTR(-ENOMEM);
hwpipe->blkcfg = blkcfg;
}
DBG("%s: assign to plane %s for caps %x",
hwpipe->name, plane->name, caps);
new_state->hwpipe_to_plane[hwpipe->idx] = plane;
return hwpipe;
}
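As the comments in mdp5_pipe_assign() spell out, the search skips pipes that are busy in either the old or the new state, rejects pipes missing a required capability, and among the remaining candidates prefers the one that wastes the fewest capability bits. A minimal user-space sketch of that selection policy (toy CAP_* flags and a hypothetical pick_pipe() helper, not the kernel structures) might look like:

```c
#include <stdint.h>
#include <stdio.h>

/* Toy capability bits standing in for the MDP_PIPE_CAP_* flags. */
#define CAP_SCALE (1 << 0)
#define CAP_CSC   (1 << 1)
#define CAP_HFLIP (1 << 2)
#define CAP_VFLIP (1 << 3)

struct pipe {
	const char *name;
	uint32_t caps;
	int in_use;
};

/* Pick a free pipe that has all requested caps, preferring the one
 * with the fewest *extra* caps set (same idea as hweight_long() on
 * cur->caps & ~caps in mdp5_pipe_assign()). */
static struct pipe *pick_pipe(struct pipe *pipes, int n, uint32_t caps)
{
	struct pipe *best = NULL;

	for (int i = 0; i < n; i++) {
		struct pipe *cur = &pipes[i];

		if (cur->in_use)
			continue;
		if (caps & ~cur->caps)	/* missing a required cap */
			continue;
		if (!best || __builtin_popcount(cur->caps & ~caps) <
			     __builtin_popcount(best->caps & ~caps))
			best = cur;
	}
	return best;
}

int main(void)
{
	struct pipe pipes[] = {
		{ "RGB0", CAP_HFLIP | CAP_VFLIP, 0 },
		{ "VIG0", CAP_SCALE | CAP_CSC | CAP_HFLIP | CAP_VFLIP, 0 },
	};
	/* A plain flipped RGB plane should land on RGB0, not waste VIG0. */
	struct pipe *p = pick_pipe(pipes, 2, CAP_HFLIP);

	printf("assigned %s\n", p ? p->name : "(none)");
	return 0;
}
```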
void mdp5_pipe_release(struct drm_atomic_state *s, struct mdp5_hw_pipe *hwpipe)
{
struct msm_drm_private *priv = s->dev->dev_private;
struct mdp5_kms *mdp5_kms = to_mdp5_kms(to_mdp_kms(priv->kms));
struct mdp5_state *state = mdp5_get_state(s);
struct mdp5_hw_pipe_state *new_state = &state->hwpipe;
if (!hwpipe)
return;
if (WARN_ON(!new_state->hwpipe_to_plane[hwpipe->idx]))
return;
DBG("%s: release from plane %s", hwpipe->name,
new_state->hwpipe_to_plane[hwpipe->idx]->name);
if (mdp5_kms->smp) {
DBG("%s: free SMP blocks", hwpipe->name);
mdp5_smp_release(mdp5_kms->smp, &state->smp, hwpipe->pipe);
}
new_state->hwpipe_to_plane[hwpipe->idx] = NULL;
}
void mdp5_pipe_destroy(struct mdp5_hw_pipe *hwpipe)
{
kfree(hwpipe);
}
struct mdp5_hw_pipe *mdp5_pipe_init(enum mdp5_pipe pipe,
uint32_t reg_offset, uint32_t caps)
{
struct mdp5_hw_pipe *hwpipe;
hwpipe = kzalloc(sizeof(*hwpipe), GFP_KERNEL);
if (!hwpipe)
return ERR_PTR(-ENOMEM);
hwpipe->name = pipe2name(pipe);
hwpipe->pipe = pipe;
hwpipe->reg_offset = reg_offset;
hwpipe->caps = caps;
hwpipe->flush_mask = mdp_ctl_flush_mask_pipe(pipe);
spin_lock_init(&hwpipe->pipe_lock);
return hwpipe;
}

View file

@ -0,0 +1,56 @@
/*
* Copyright (C) 2016 Red Hat
* Author: Rob Clark <robdclark@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __MDP5_PIPE_H__
#define __MDP5_PIPE_H__
#define SSPP_MAX (SSPP_RGB3 + 1) /* TODO: Add SSPP_MAX in mdp5.xml.h */
/* represents a hw pipe, which is dynamically assigned to a plane */
struct mdp5_hw_pipe {
int idx;
const char *name;
enum mdp5_pipe pipe;
spinlock_t pipe_lock; /* protect REG_MDP5_PIPE_* registers */
uint32_t reg_offset;
uint32_t caps;
uint32_t flush_mask; /* used to commit pipe registers */
/* number of smp blocks per plane, ie:
* nblks_y | (nblks_u << 8) | (nblks_v << 16)
*/
uint32_t blkcfg;
};
/* global atomic state of assignment between pipes and planes: */
struct mdp5_hw_pipe_state {
struct drm_plane *hwpipe_to_plane[SSPP_MAX];
};
struct mdp5_hw_pipe *__must_check
mdp5_pipe_assign(struct drm_atomic_state *s, struct drm_plane *plane,
uint32_t caps, uint32_t blkcfg);
void mdp5_pipe_release(struct drm_atomic_state *s, struct mdp5_hw_pipe *hwpipe);
struct mdp5_hw_pipe *mdp5_pipe_init(enum mdp5_pipe pipe,
uint32_t reg_offset, uint32_t caps);
void mdp5_pipe_destroy(struct mdp5_hw_pipe *hwpipe);
#endif /* __MDP5_PIPE_H__ */
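The blkcfg field packs the per-plane SMP block counts one byte per plane, as the comment in the struct notes (nblks_y | (nblks_u << 8) | (nblks_v << 16)); mdp5_smp_assign() later unpacks it by masking the low byte and shifting right once per client. A small sketch of that packing and unpacking (hypothetical pack_blkcfg() helper, not driver code):

```c
#include <stdint.h>
#include <stdio.h>

/* Pack per-plane SMP block counts the way blkcfg encodes them:
 * one byte per plane (Y, then U, then V). */
static uint32_t pack_blkcfg(unsigned y, unsigned u, unsigned v)
{
	return (y & 0xff) | ((u & 0xff) << 8) | ((v & 0xff) << 16);
}

int main(void)
{
	uint32_t blkcfg = pack_blkcfg(4, 2, 2);

	/* Unpack the same way mdp5_smp_assign() walks the clients:
	 * low byte first, shifting right by 8 for each plane. */
	for (int i = 0; i < 3; i++) {
		printf("plane %d: %u blocks\n", i, blkcfg & 0xff);
		blkcfg >>= 8;
	}
	return 0;
}
```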

View file

@ -21,15 +21,6 @@
struct mdp5_plane {
struct drm_plane base;
const char *name;
enum mdp5_pipe pipe;
spinlock_t pipe_lock; /* protect REG_MDP5_PIPE_* registers */
uint32_t reg_offset;
uint32_t caps;
uint32_t flush_mask; /* used to commit pipe registers */
uint32_t nformats;
uint32_t formats[32];
@ -70,12 +61,6 @@ static void mdp5_plane_destroy(struct drm_plane *plane)
static void mdp5_plane_install_rotation_property(struct drm_device *dev,
struct drm_plane *plane)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
if (!(mdp5_plane->caps & MDP_PIPE_CAP_HFLIP) &&
!(mdp5_plane->caps & MDP_PIPE_CAP_VFLIP))
return;
drm_plane_create_rotation_property(plane,
DRM_ROTATE_0,
DRM_ROTATE_0 |
@ -188,11 +173,12 @@ mdp5_plane_atomic_print_state(struct drm_printer *p,
{
struct mdp5_plane_state *pstate = to_mdp5_plane_state(state);
drm_printf(p, "\thwpipe=%s\n", pstate->hwpipe ?
pstate->hwpipe->name : "(null)");
drm_printf(p, "\tpremultiplied=%u\n", pstate->premultiplied);
drm_printf(p, "\tzpos=%u\n", pstate->zpos);
drm_printf(p, "\talpha=%u\n", pstate->alpha);
drm_printf(p, "\tstage=%s\n", stage2name(pstate->stage));
drm_printf(p, "\tmode_changed=%u\n", pstate->mode_changed);
drm_printf(p, "\tpending=%u\n", pstate->pending);
}
@ -234,7 +220,6 @@ mdp5_plane_duplicate_state(struct drm_plane *plane)
if (mdp5_state && mdp5_state->base.fb)
drm_framebuffer_reference(mdp5_state->base.fb);
mdp5_state->mode_changed = false;
mdp5_state->pending = false;
return &mdp5_state->base;
@ -243,10 +228,12 @@ mdp5_plane_duplicate_state(struct drm_plane *plane)
static void mdp5_plane_destroy_state(struct drm_plane *plane,
struct drm_plane_state *state)
{
struct mdp5_plane_state *pstate = to_mdp5_plane_state(state);
if (state->fb)
drm_framebuffer_unreference(state->fb);
kfree(to_mdp5_plane_state(state));
kfree(pstate);
}
static const struct drm_plane_funcs mdp5_plane_funcs = {
@ -265,101 +252,115 @@ static const struct drm_plane_funcs mdp5_plane_funcs = {
static int mdp5_plane_prepare_fb(struct drm_plane *plane,
struct drm_plane_state *new_state)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct mdp5_kms *mdp5_kms = get_kms(plane);
struct drm_framebuffer *fb = new_state->fb;
if (!new_state->fb)
return 0;
DBG("%s: prepare: FB[%u]", mdp5_plane->name, fb->base.id);
DBG("%s: prepare: FB[%u]", plane->name, fb->base.id);
return msm_framebuffer_prepare(fb, mdp5_kms->id);
}
static void mdp5_plane_cleanup_fb(struct drm_plane *plane,
struct drm_plane_state *old_state)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct mdp5_kms *mdp5_kms = get_kms(plane);
struct drm_framebuffer *fb = old_state->fb;
if (!fb)
return;
DBG("%s: cleanup: FB[%u]", mdp5_plane->name, fb->base.id);
DBG("%s: cleanup: FB[%u]", plane->name, fb->base.id);
msm_framebuffer_cleanup(fb, mdp5_kms->id);
}
static int mdp5_plane_atomic_check(struct drm_plane *plane,
struct drm_plane_state *state)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct mdp5_plane_state *mdp5_state = to_mdp5_plane_state(state);
struct drm_plane_state *old_state = plane->state;
const struct mdp_format *format;
bool vflip, hflip;
struct mdp5_cfg *config = mdp5_cfg_get_config(get_kms(plane)->cfg);
bool new_hwpipe = false;
uint32_t max_width, max_height;
uint32_t caps = 0;
DBG("%s: check (%d -> %d)", mdp5_plane->name,
DBG("%s: check (%d -> %d)", plane->name,
plane_enabled(old_state), plane_enabled(state));
/* We don't allow faster-than-vblank updates.. if we did add this
* some day, we would need to disallow in cases where hwpipe
* changes
*/
if (WARN_ON(to_mdp5_plane_state(old_state)->pending))
return -EBUSY;
max_width = config->hw->lm.max_width << 16;
max_height = config->hw->lm.max_height << 16;
/* Make sure source dimensions are within bounds. */
if ((state->src_w > max_width) || (state->src_h > max_height)) {
struct drm_rect src = drm_plane_state_src(state);
DBG("Invalid source size "DRM_RECT_FP_FMT,
DRM_RECT_FP_ARG(&src));
return -ERANGE;
}
if (plane_enabled(state)) {
unsigned int rotation;
const struct mdp_format *format;
struct mdp5_kms *mdp5_kms = get_kms(plane);
uint32_t blkcfg = 0;
format = to_mdp_format(msm_framebuffer_format(state->fb));
if (MDP_FORMAT_IS_YUV(format) &&
!pipe_supports_yuv(mdp5_plane->caps)) {
DBG("Pipe doesn't support YUV\n");
if (MDP_FORMAT_IS_YUV(format))
caps |= MDP_PIPE_CAP_SCALE | MDP_PIPE_CAP_CSC;
return -EINVAL;
}
if (!(mdp5_plane->caps & MDP_PIPE_CAP_SCALE) &&
(((state->src_w >> 16) != state->crtc_w) ||
((state->src_h >> 16) != state->crtc_h))) {
DBG("Pipe doesn't support scaling (%dx%d -> %dx%d)\n",
state->src_w >> 16, state->src_h >> 16,
state->crtc_w, state->crtc_h);
return -EINVAL;
}
if (((state->src_w >> 16) != state->crtc_w) ||
((state->src_h >> 16) != state->crtc_h))
caps |= MDP_PIPE_CAP_SCALE;
rotation = drm_rotation_simplify(state->rotation,
DRM_ROTATE_0 |
DRM_REFLECT_X |
DRM_REFLECT_Y);
hflip = !!(rotation & DRM_REFLECT_X);
vflip = !!(rotation & DRM_REFLECT_Y);
if ((vflip && !(mdp5_plane->caps & MDP_PIPE_CAP_VFLIP)) ||
(hflip && !(mdp5_plane->caps & MDP_PIPE_CAP_HFLIP))) {
DBG("Pipe doesn't support flip\n");
if (rotation & DRM_REFLECT_X)
caps |= MDP_PIPE_CAP_HFLIP;
return -EINVAL;
}
}
if (rotation & DRM_REFLECT_Y)
caps |= MDP_PIPE_CAP_VFLIP;
if (plane_enabled(state) && plane_enabled(old_state)) {
/* we cannot change SMP block configuration during scanout: */
bool full_modeset = false;
if (state->fb->pixel_format != old_state->fb->pixel_format) {
DBG("%s: pixel_format change!", mdp5_plane->name);
full_modeset = true;
/* (re)allocate hw pipe if we don't have one or caps-mismatch: */
if (!mdp5_state->hwpipe || (caps & ~mdp5_state->hwpipe->caps))
new_hwpipe = true;
if (mdp5_kms->smp) {
const struct mdp_format *format =
to_mdp_format(msm_framebuffer_format(state->fb));
blkcfg = mdp5_smp_calculate(mdp5_kms->smp, format,
state->src_w >> 16, false);
if (mdp5_state->hwpipe && (mdp5_state->hwpipe->blkcfg != blkcfg))
new_hwpipe = true;
}
if (state->src_w != old_state->src_w) {
DBG("%s: src_w change!", mdp5_plane->name);
full_modeset = true;
/* (re)assign hwpipe if needed, otherwise keep old one: */
if (new_hwpipe) {
/* TODO maybe we want to re-assign hwpipe sometimes
* in cases when we no-longer need some caps to make
* it available for other planes?
*/
struct mdp5_hw_pipe *old_hwpipe = mdp5_state->hwpipe;
mdp5_state->hwpipe = mdp5_pipe_assign(state->state,
plane, caps, blkcfg);
if (IS_ERR(mdp5_state->hwpipe)) {
DBG("%s: failed to assign hwpipe!", plane->name);
return PTR_ERR(mdp5_state->hwpipe);
}
mdp5_pipe_release(state->state, old_hwpipe);
}
if (to_mdp5_plane_state(old_state)->pending) {
DBG("%s: still pending!", mdp5_plane->name);
full_modeset = true;
}
if (full_modeset) {
struct drm_crtc_state *crtc_state =
drm_atomic_get_crtc_state(state->state, state->crtc);
crtc_state->mode_changed = true;
to_mdp5_plane_state(state)->mode_changed = true;
}
} else {
to_mdp5_plane_state(state)->mode_changed = true;
}
return 0;
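The reworked atomic_check boils down to two decisions: derive the capability bits this plane state actually needs (YUV implies scaling plus CSC, a source/dest size mismatch implies scaling, reflections imply the flip caps), then request a new hw pipe only if the currently assigned one lacks a needed cap or its SMP block configuration would change. A simplified user-space model of that decision (toy flags and helper names, not the driver's code):

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CAP_SCALE (1 << 0)
#define CAP_CSC   (1 << 1)
#define CAP_HFLIP (1 << 2)
#define CAP_VFLIP (1 << 3)

struct plane_req {
	bool yuv, scaled, hflip, vflip;
};

/* Derive the caps a plane needs for this update, mirroring the checks
 * in mdp5_plane_atomic_check(). */
static uint32_t required_caps(const struct plane_req *r)
{
	uint32_t caps = 0;

	if (r->yuv)
		caps |= CAP_SCALE | CAP_CSC;	/* YUV needs scaler + CSC */
	if (r->scaled)
		caps |= CAP_SCALE;
	if (r->hflip)
		caps |= CAP_HFLIP;
	if (r->vflip)
		caps |= CAP_VFLIP;
	return caps;
}

/* Ask for a different pipe only when the current one is missing a
 * required cap or its SMP block configuration would change. */
static bool need_new_pipe(uint32_t have_caps, uint32_t want_caps,
			  uint32_t old_blkcfg, uint32_t new_blkcfg)
{
	return (want_caps & ~have_caps) || (old_blkcfg != new_blkcfg);
}

int main(void)
{
	struct plane_req req = { .yuv = true };
	uint32_t want = required_caps(&req);

	/* An RGB-only pipe cannot take a YUV plane, so this prints 1. */
	printf("needs new pipe: %d\n",
	       need_new_pipe(CAP_HFLIP | CAP_VFLIP, want, 4, 4));
	return 0;
}
```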
@ -368,16 +369,16 @@ static int mdp5_plane_atomic_check(struct drm_plane *plane,
static void mdp5_plane_atomic_update(struct drm_plane *plane,
struct drm_plane_state *old_state)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct drm_plane_state *state = plane->state;
struct mdp5_plane_state *mdp5_state = to_mdp5_plane_state(state);
DBG("%s: update", mdp5_plane->name);
DBG("%s: update", plane->name);
if (!plane_enabled(state)) {
to_mdp5_plane_state(state)->pending = true;
} else if (to_mdp5_plane_state(state)->mode_changed) {
mdp5_state->pending = true;
if (plane_enabled(state)) {
int ret;
to_mdp5_plane_state(state)->pending = true;
ret = mdp5_plane_mode_set(plane,
state->crtc, state->fb,
state->crtc_x, state->crtc_y,
@ -386,11 +387,6 @@ static void mdp5_plane_atomic_update(struct drm_plane *plane,
state->src_w, state->src_h);
/* atomic_check should have ensured that this doesn't fail */
WARN_ON(ret < 0);
} else {
unsigned long flags;
spin_lock_irqsave(&mdp5_plane->pipe_lock, flags);
set_scanout_locked(plane, state->fb);
spin_unlock_irqrestore(&mdp5_plane->pipe_lock, flags);
}
}
@ -404,9 +400,9 @@ static const struct drm_plane_helper_funcs mdp5_plane_helper_funcs = {
static void set_scanout_locked(struct drm_plane *plane,
struct drm_framebuffer *fb)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct mdp5_kms *mdp5_kms = get_kms(plane);
enum mdp5_pipe pipe = mdp5_plane->pipe;
struct mdp5_hw_pipe *hwpipe = to_mdp5_plane_state(plane->state)->hwpipe;
enum mdp5_pipe pipe = hwpipe->pipe;
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SRC_STRIDE_A(pipe),
MDP5_PIPE_SRC_STRIDE_A_P0(fb->pitches[0]) |
@ -686,14 +682,14 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
uint32_t src_x, uint32_t src_y,
uint32_t src_w, uint32_t src_h)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct drm_plane_state *pstate = plane->state;
struct mdp5_hw_pipe *hwpipe = to_mdp5_plane_state(pstate)->hwpipe;
struct mdp5_kms *mdp5_kms = get_kms(plane);
enum mdp5_pipe pipe = mdp5_plane->pipe;
enum mdp5_pipe pipe = hwpipe->pipe;
const struct mdp_format *format;
uint32_t nplanes, config = 0;
uint32_t phasex_step[COMP_MAX] = {0,}, phasey_step[COMP_MAX] = {0,};
bool pe = mdp5_plane->caps & MDP_PIPE_CAP_SW_PIX_EXT;
bool pe = hwpipe->caps & MDP_PIPE_CAP_SW_PIX_EXT;
int pe_left[COMP_MAX], pe_right[COMP_MAX];
int pe_top[COMP_MAX], pe_bottom[COMP_MAX];
uint32_t hdecm = 0, vdecm = 0;
@ -718,27 +714,10 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
src_w = src_w >> 16;
src_h = src_h >> 16;
DBG("%s: FB[%u] %u,%u,%u,%u -> CRTC[%u] %d,%d,%u,%u", mdp5_plane->name,
DBG("%s: FB[%u] %u,%u,%u,%u -> CRTC[%u] %d,%d,%u,%u", plane->name,
fb->base.id, src_x, src_y, src_w, src_h,
crtc->base.id, crtc_x, crtc_y, crtc_w, crtc_h);
/* Request some memory from the SMP: */
if (mdp5_kms->smp) {
ret = mdp5_smp_request(mdp5_kms->smp,
mdp5_plane->pipe, format, src_w, false);
if (ret)
return ret;
}
/*
* Currently we update the hw for allocations/requests immediately,
* but once atomic modeset/pageflip is in place, the allocation
* would move into atomic->check_plane_state(), while updating the
* hw would remain here:
*/
if (mdp5_kms->smp)
mdp5_smp_configure(mdp5_kms->smp, pipe);
ret = calc_scalex_steps(plane, pix_format, src_w, crtc_w, phasex_step);
if (ret)
return ret;
@ -747,7 +726,7 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
if (ret)
return ret;
if (mdp5_plane->caps & MDP_PIPE_CAP_SW_PIX_EXT) {
if (hwpipe->caps & MDP_PIPE_CAP_SW_PIX_EXT) {
calc_pixel_ext(format, src_w, crtc_w, phasex_step,
pe_left, pe_right, true);
calc_pixel_ext(format, src_h, crtc_h, phasey_step,
@ -768,11 +747,11 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
hflip = !!(rotation & DRM_REFLECT_X);
vflip = !!(rotation & DRM_REFLECT_Y);
spin_lock_irqsave(&mdp5_plane->pipe_lock, flags);
spin_lock_irqsave(&hwpipe->pipe_lock, flags);
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SRC_IMG_SIZE(pipe),
MDP5_PIPE_SRC_IMG_SIZE_WIDTH(fb->width) |
MDP5_PIPE_SRC_IMG_SIZE_HEIGHT(fb->height));
MDP5_PIPE_SRC_IMG_SIZE_WIDTH(min(fb->width, src_w)) |
MDP5_PIPE_SRC_IMG_SIZE_HEIGHT(min(fb->height, src_h)));
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SRC_SIZE(pipe),
MDP5_PIPE_SRC_SIZE_WIDTH(src_w) |
@ -817,12 +796,12 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
/* not using secure mode: */
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SRC_ADDR_SW_STATUS(pipe), 0);
if (mdp5_plane->caps & MDP_PIPE_CAP_SW_PIX_EXT)
if (hwpipe->caps & MDP_PIPE_CAP_SW_PIX_EXT)
mdp5_write_pixel_ext(mdp5_kms, pipe, format,
src_w, pe_left, pe_right,
src_h, pe_top, pe_bottom);
if (mdp5_plane->caps & MDP_PIPE_CAP_SCALE) {
if (hwpipe->caps & MDP_PIPE_CAP_SCALE) {
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SCALE_PHASE_STEP_X(pipe),
phasex_step[COMP_0]);
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SCALE_PHASE_STEP_Y(pipe),
@ -837,7 +816,7 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
mdp5_write(mdp5_kms, REG_MDP5_PIPE_SCALE_CONFIG(pipe), config);
}
if (mdp5_plane->caps & MDP_PIPE_CAP_CSC) {
if (hwpipe->caps & MDP_PIPE_CAP_CSC) {
if (MDP_FORMAT_IS_YUV(format))
csc_enable(mdp5_kms, pipe,
mdp_get_default_csc_cfg(CSC_YUV2RGB));
@ -847,56 +826,42 @@ static int mdp5_plane_mode_set(struct drm_plane *plane,
set_scanout_locked(plane, fb);
spin_unlock_irqrestore(&mdp5_plane->pipe_lock, flags);
spin_unlock_irqrestore(&hwpipe->pipe_lock, flags);
return ret;
}
void mdp5_plane_complete_flip(struct drm_plane *plane)
{
struct mdp5_kms *mdp5_kms = get_kms(plane);
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
enum mdp5_pipe pipe = mdp5_plane->pipe;
DBG("%s: complete flip", mdp5_plane->name);
if (mdp5_kms->smp)
mdp5_smp_commit(mdp5_kms->smp, pipe);
to_mdp5_plane_state(plane->state)->pending = false;
}
enum mdp5_pipe mdp5_plane_pipe(struct drm_plane *plane)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
return mdp5_plane->pipe;
struct mdp5_plane_state *pstate = to_mdp5_plane_state(plane->state);
if (WARN_ON(!pstate->hwpipe))
return 0;
return pstate->hwpipe->pipe;
}
uint32_t mdp5_plane_get_flush(struct drm_plane *plane)
{
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
struct mdp5_plane_state *pstate = to_mdp5_plane_state(plane->state);
return mdp5_plane->flush_mask;
if (WARN_ON(!pstate->hwpipe))
return 0;
return pstate->hwpipe->flush_mask;
}
/* called after vsync in thread context */
void mdp5_plane_complete_commit(struct drm_plane *plane,
struct drm_plane_state *state)
{
struct mdp5_kms *mdp5_kms = get_kms(plane);
struct mdp5_plane *mdp5_plane = to_mdp5_plane(plane);
enum mdp5_pipe pipe = mdp5_plane->pipe;
struct mdp5_plane_state *pstate = to_mdp5_plane_state(plane->state);
if (!plane_enabled(plane->state) && mdp5_kms->smp) {
DBG("%s: free SMP", mdp5_plane->name);
mdp5_smp_release(mdp5_kms->smp, pipe);
}
pstate->pending = false;
}
/* initialize plane */
struct drm_plane *mdp5_plane_init(struct drm_device *dev,
enum mdp5_pipe pipe, bool private_plane, uint32_t reg_offset,
uint32_t caps)
struct drm_plane *mdp5_plane_init(struct drm_device *dev, bool primary)
{
struct drm_plane *plane = NULL;
struct mdp5_plane *mdp5_plane;
@ -911,22 +876,13 @@ struct drm_plane *mdp5_plane_init(struct drm_device *dev,
plane = &mdp5_plane->base;
mdp5_plane->pipe = pipe;
mdp5_plane->name = pipe2name(pipe);
mdp5_plane->caps = caps;
mdp5_plane->nformats = mdp_get_formats(mdp5_plane->formats,
ARRAY_SIZE(mdp5_plane->formats),
!pipe_supports_yuv(mdp5_plane->caps));
ARRAY_SIZE(mdp5_plane->formats), false);
mdp5_plane->flush_mask = mdp_ctl_flush_mask_pipe(pipe);
mdp5_plane->reg_offset = reg_offset;
spin_lock_init(&mdp5_plane->pipe_lock);
type = private_plane ? DRM_PLANE_TYPE_PRIMARY : DRM_PLANE_TYPE_OVERLAY;
type = primary ? DRM_PLANE_TYPE_PRIMARY : DRM_PLANE_TYPE_OVERLAY;
ret = drm_universal_plane_init(dev, plane, 0xff, &mdp5_plane_funcs,
mdp5_plane->formats, mdp5_plane->nformats,
type, "%s", mdp5_plane->name);
type, NULL);
if (ret)
goto fail;

View file

@ -21,72 +21,6 @@
#include "mdp5_smp.h"
/* SMP - Shared Memory Pool
*
* These are shared between all the clients, where each plane in a
* scanout buffer is a SMP client. Ie. scanout of 3 plane I420 on
* pipe VIG0 => 3 clients: VIG0_Y, VIG0_CB, VIG0_CR.
*
* Based on the size of the attached scanout buffer, a certain # of
* blocks must be allocated to that client out of the shared pool.
*
* In some hw, some blocks are statically allocated for certain pipes
* and CANNOT be re-allocated (eg: MMB0 and MMB1 both tied to RGB0).
*
* For each block that can be dynamically allocated, it can be either
* free:
* The block is free.
*
* pending:
* The block is allocated to some client and not free.
*
* configured:
* The block is allocated to some client, and assigned to that
* client in MDP5_SMP_ALLOC registers.
*
* inuse:
* The block is being actively used by a client.
*
* The updates happen in the following steps:
*
* 1) mdp5_smp_request():
* When plane scanout is setup, calculate required number of
* blocks needed per client, and request. Blocks neither inuse nor
* configured nor pending by any other client are added to client's
* pending set.
* For shrinking, blocks in pending but not in configured can be freed
* directly, but those already in configured will be freed later by
* mdp5_smp_commit.
*
* 2) mdp5_smp_configure():
* As hw is programmed, before FLUSH, MDP5_SMP_ALLOC registers
* are configured for the union(pending, inuse)
* Current pending is copied to configured.
* It is assumed that mdp5_smp_request and mdp5_smp_configure do not run
* concurrently for the same pipe.
*
* 3) mdp5_smp_commit():
* After next vblank, copy configured -> inuse. Optionally update
* MDP5_SMP_ALLOC registers if there are newly unused blocks
*
* 4) mdp5_smp_release():
* Must be called after the pipe is disabled and no longer uses any SMB
*
* On the next vblank after changes have been committed to hw, the
* client's pending blocks become its in-use blocks (and no-longer
* in-use blocks become available to other clients).
*
* btw, hurray for confusing overloaded acronyms! :-/
*
* NOTE: for atomic modeset/pageflip NONBLOCK operations, step #1
* should happen at (or before)? atomic->check(). And we'd need
* an API to discard previous requests if update is aborted or
* (test-only).
*
* TODO would perhaps be nice to have debugfs to dump out kernel
* inuse and pending state of all clients..
*/
struct mdp5_smp {
struct drm_device *dev;
@ -94,16 +28,8 @@ struct mdp5_smp {
int blk_cnt;
int blk_size;
spinlock_t state_lock;
mdp5_smp_state_t state; /* to track smp allocation amongst pipes: */
struct mdp5_client_smp_state client_state[MAX_CLIENTS];
};
static void update_smp_state(struct mdp5_smp *smp,
u32 cid, mdp5_smp_state_t *assigned);
static inline
struct mdp5_kms *get_kms(struct mdp5_smp *smp)
{
@ -134,57 +60,38 @@ static inline u32 pipe2client(enum mdp5_pipe pipe, int plane)
return mdp5_cfg->smp.clients[pipe] + plane;
}
/* step #1: update # of blocks pending for the client: */
/* allocate blocks for the specified request: */
static int smp_request_block(struct mdp5_smp *smp,
struct mdp5_smp_state *state,
u32 cid, int nblks)
{
struct mdp5_kms *mdp5_kms = get_kms(smp);
struct mdp5_client_smp_state *ps = &smp->client_state[cid];
int i, ret, avail, cur_nblks, cnt = smp->blk_cnt;
void *cs = state->client_state[cid];
int i, avail, cnt = smp->blk_cnt;
uint8_t reserved;
unsigned long flags;
/* we shouldn't be requesting blocks for an in-use client: */
WARN_ON(bitmap_weight(cs, cnt) > 0);
reserved = smp->reserved[cid];
spin_lock_irqsave(&smp->state_lock, flags);
if (reserved) {
nblks = max(0, nblks - reserved);
DBG("%d MMBs allocated (%d reserved)", nblks, reserved);
}
avail = cnt - bitmap_weight(smp->state, cnt);
avail = cnt - bitmap_weight(state->state, cnt);
if (nblks > avail) {
dev_err(mdp5_kms->dev->dev, "out of blks (req=%d > avail=%d)\n",
dev_err(smp->dev->dev, "out of blks (req=%d > avail=%d)\n",
nblks, avail);
ret = -ENOSPC;
goto fail;
return -ENOSPC;
}
cur_nblks = bitmap_weight(ps->pending, cnt);
if (nblks > cur_nblks) {
/* grow the existing pending reservation: */
for (i = cur_nblks; i < nblks; i++) {
int blk = find_first_zero_bit(smp->state, cnt);
set_bit(blk, ps->pending);
set_bit(blk, smp->state);
}
} else {
/* shrink the existing pending reservation: */
for (i = cur_nblks; i > nblks; i--) {
int blk = find_first_bit(ps->pending, cnt);
clear_bit(blk, ps->pending);
/* clear in global smp_state if not in configured
* otherwise until _commit()
*/
if (!test_bit(blk, ps->configured))
clear_bit(blk, smp->state);
}
for (i = 0; i < nblks; i++) {
int blk = find_first_zero_bit(state->state, cnt);
set_bit(blk, cs);
set_bit(blk, state->state);
}
fail:
spin_unlock_irqrestore(&smp->state_lock, flags);
return 0;
}
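With the transactional tracking gone, smp_request_block() now simply grabs nblks free blocks, marking each one in both the global pool bitmap and the requesting client's bitmap. A toy user-space equivalent using a 32-bit word in place of the kernel's bitmap helpers (first_zero_bit() stands in for find_first_zero_bit()):

```c
#include <stdint.h>
#include <stdio.h>

#define BLK_CNT 16

/* Find the first clear bit in a small bitmap, -1 if none are free. */
static int first_zero_bit(uint32_t map)
{
	for (int i = 0; i < BLK_CNT; i++)
		if (!(map & (1u << i)))
			return i;
	return -1;
}

/* Grab nblks free blocks: mark each one in the global pool map and in
 * the client's own map, mirroring smp_request_block(). */
static int request_blocks(uint32_t *pool, uint32_t *client, int nblks)
{
	for (int i = 0; i < nblks; i++) {
		int blk = first_zero_bit(*pool);

		if (blk < 0)
			return -1;	/* out of blocks */
		*pool   |= 1u << blk;
		*client |= 1u << blk;
	}
	return 0;
}

int main(void)
{
	uint32_t pool = 0x3;	/* blocks 0 and 1 statically reserved */
	uint32_t vig0_y = 0;

	if (request_blocks(&pool, &vig0_y, 3) == 0)
		printf("VIG0_Y owns %#x, pool now %#x\n", vig0_y, pool);
	return 0;
}
```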
@ -209,14 +116,15 @@ static void set_fifo_thresholds(struct mdp5_smp *smp,
* decimated width. Ie. SMP buffering sits downstream of decimation (which
* presumably happens during the dma from scanout buffer).
*/
int mdp5_smp_request(struct mdp5_smp *smp, enum mdp5_pipe pipe,
const struct mdp_format *format, u32 width, bool hdecim)
uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
const struct mdp_format *format,
u32 width, bool hdecim)
{
struct mdp5_kms *mdp5_kms = get_kms(smp);
struct drm_device *dev = mdp5_kms->dev;
int rev = mdp5_cfg_get_hw_rev(mdp5_kms->cfg);
int i, hsub, nplanes, nlines, nblks, ret;
int i, hsub, nplanes, nlines;
u32 fmt = format->base.pixel_format;
uint32_t blkcfg = 0;
nplanes = drm_format_num_planes(fmt);
hsub = drm_format_horz_chroma_subsampling(fmt);
@ -239,7 +147,7 @@ int mdp5_smp_request(struct mdp5_smp *smp, enum mdp5_pipe pipe,
hsub = 1;
}
for (i = 0, nblks = 0; i < nplanes; i++) {
for (i = 0; i < nplanes; i++) {
int n, fetch_stride, cpp;
cpp = drm_format_plane_cpp(fmt, i);
@ -251,60 +159,72 @@ int mdp5_smp_request(struct mdp5_smp *smp, enum mdp5_pipe pipe,
if (rev == 0)
n = roundup_pow_of_two(n);
blkcfg |= (n << (8 * i));
}
return blkcfg;
}
int mdp5_smp_assign(struct mdp5_smp *smp, struct mdp5_smp_state *state,
enum mdp5_pipe pipe, uint32_t blkcfg)
{
struct mdp5_kms *mdp5_kms = get_kms(smp);
struct drm_device *dev = mdp5_kms->dev;
int i, ret;
for (i = 0; i < pipe2nclients(pipe); i++) {
u32 cid = pipe2client(pipe, i);
int n = blkcfg & 0xff;
if (!n)
continue;
DBG("%s[%d]: request %d SMP blocks", pipe2name(pipe), i, n);
ret = smp_request_block(smp, pipe2client(pipe, i), n);
ret = smp_request_block(smp, state, cid, n);
if (ret) {
dev_err(dev->dev, "Cannot allocate %d SMP blocks: %d\n",
n, ret);
return ret;
}
nblks += n;
blkcfg >>= 8;
}
set_fifo_thresholds(smp, pipe, nblks);
state->assigned |= (1 << pipe);
return 0;
}
/* Release SMP blocks for all clients of the pipe */
void mdp5_smp_release(struct mdp5_smp *smp, enum mdp5_pipe pipe)
void mdp5_smp_release(struct mdp5_smp *smp, struct mdp5_smp_state *state,
enum mdp5_pipe pipe)
{
int i;
unsigned long flags;
int cnt = smp->blk_cnt;
for (i = 0; i < pipe2nclients(pipe); i++) {
mdp5_smp_state_t assigned;
u32 cid = pipe2client(pipe, i);
struct mdp5_client_smp_state *ps = &smp->client_state[cid];
void *cs = state->client_state[cid];
spin_lock_irqsave(&smp->state_lock, flags);
/* update global state: */
bitmap_andnot(state->state, state->state, cs, cnt);
/* clear hw assignment */
bitmap_or(assigned, ps->inuse, ps->configured, cnt);
update_smp_state(smp, CID_UNUSED, &assigned);
/* free to global pool */
bitmap_andnot(smp->state, smp->state, ps->pending, cnt);
bitmap_andnot(smp->state, smp->state, assigned, cnt);
/* clear client's info */
bitmap_zero(ps->pending, cnt);
bitmap_zero(ps->configured, cnt);
bitmap_zero(ps->inuse, cnt);
spin_unlock_irqrestore(&smp->state_lock, flags);
/* clear client's state */
bitmap_zero(cs, cnt);
}
set_fifo_thresholds(smp, pipe, 0);
state->released |= (1 << pipe);
}
static void update_smp_state(struct mdp5_smp *smp,
/* NOTE: SMP_ALLOC_* regs are *not* double buffered, so release has to
* happen after scanout completes.
*/
static unsigned update_smp_state(struct mdp5_smp *smp,
u32 cid, mdp5_smp_state_t *assigned)
{
struct mdp5_kms *mdp5_kms = get_kms(smp);
int cnt = smp->blk_cnt;
unsigned nblks = 0;
u32 blk, val;
for_each_set_bit(blk, *assigned, cnt) {
@ -330,62 +250,88 @@ static void update_smp_state(struct mdp5_smp *smp,
mdp5_write(mdp5_kms, REG_MDP5_SMP_ALLOC_W_REG(idx), val);
mdp5_write(mdp5_kms, REG_MDP5_SMP_ALLOC_R_REG(idx), val);
nblks++;
}
return nblks;
}
/* step #2: configure hw for union(pending, inuse): */
void mdp5_smp_configure(struct mdp5_smp *smp, enum mdp5_pipe pipe)
void mdp5_smp_prepare_commit(struct mdp5_smp *smp, struct mdp5_smp_state *state)
{
int cnt = smp->blk_cnt;
mdp5_smp_state_t assigned;
int i;
enum mdp5_pipe pipe;
for (i = 0; i < pipe2nclients(pipe); i++) {
u32 cid = pipe2client(pipe, i);
struct mdp5_client_smp_state *ps = &smp->client_state[cid];
for_each_set_bit(pipe, &state->assigned, sizeof(state->assigned) * 8) {
unsigned i, nblks = 0;
/*
* if vblank has not happened since last smp_configure
* skip the configure for now
*/
if (!bitmap_equal(ps->inuse, ps->configured, cnt))
continue;
for (i = 0; i < pipe2nclients(pipe); i++) {
u32 cid = pipe2client(pipe, i);
void *cs = state->client_state[cid];
bitmap_copy(ps->configured, ps->pending, cnt);
bitmap_or(assigned, ps->inuse, ps->configured, cnt);
update_smp_state(smp, cid, &assigned);
}
}
nblks += update_smp_state(smp, cid, cs);
/* step #3: after vblank, copy configured -> inuse: */
void mdp5_smp_commit(struct mdp5_smp *smp, enum mdp5_pipe pipe)
{
int cnt = smp->blk_cnt;
mdp5_smp_state_t released;
int i;
for (i = 0; i < pipe2nclients(pipe); i++) {
u32 cid = pipe2client(pipe, i);
struct mdp5_client_smp_state *ps = &smp->client_state[cid];
/*
* Figure out if there are any blocks we where previously
* using, which can be released and made available to other
* clients:
*/
if (bitmap_andnot(released, ps->inuse, ps->configured, cnt)) {
unsigned long flags;
spin_lock_irqsave(&smp->state_lock, flags);
/* clear released blocks: */
bitmap_andnot(smp->state, smp->state, released, cnt);
spin_unlock_irqrestore(&smp->state_lock, flags);
update_smp_state(smp, CID_UNUSED, &released);
DBG("assign %s:%u, %u blks",
pipe2name(pipe), i, nblks);
}
bitmap_copy(ps->inuse, ps->configured, cnt);
set_fifo_thresholds(smp, pipe, nblks);
}
state->assigned = 0;
}
void mdp5_smp_complete_commit(struct mdp5_smp *smp, struct mdp5_smp_state *state)
{
enum mdp5_pipe pipe;
for_each_set_bit(pipe, &state->released, sizeof(state->released) * 8) {
DBG("release %s", pipe2name(pipe));
set_fifo_thresholds(smp, pipe, 0);
}
state->released = 0;
}
void mdp5_smp_dump(struct mdp5_smp *smp, struct drm_printer *p)
{
struct mdp5_kms *mdp5_kms = get_kms(smp);
struct mdp5_hw_pipe_state *hwpstate;
struct mdp5_smp_state *state;
int total = 0, i, j;
drm_printf(p, "name\tinuse\tplane\n");
drm_printf(p, "----\t-----\t-----\n");
if (drm_can_sleep())
drm_modeset_lock(&mdp5_kms->state_lock, NULL);
/* grab these *after* we hold the state_lock */
hwpstate = &mdp5_kms->state->hwpipe;
state = &mdp5_kms->state->smp;
for (i = 0; i < mdp5_kms->num_hwpipes; i++) {
struct mdp5_hw_pipe *hwpipe = mdp5_kms->hwpipes[i];
struct drm_plane *plane = hwpstate->hwpipe_to_plane[hwpipe->idx];
enum mdp5_pipe pipe = hwpipe->pipe;
for (j = 0; j < pipe2nclients(pipe); j++) {
u32 cid = pipe2client(pipe, j);
void *cs = state->client_state[cid];
int inuse = bitmap_weight(cs, smp->blk_cnt);
drm_printf(p, "%s:%d\t%d\t%s\n",
pipe2name(pipe), j, inuse,
plane ? plane->name : NULL);
total += inuse;
}
}
drm_printf(p, "TOTAL:\t%d\t(of %d)\n", total, smp->blk_cnt);
drm_printf(p, "AVAIL:\t%d\n", smp->blk_cnt -
bitmap_weight(state->state, smp->blk_cnt));
if (drm_can_sleep())
drm_modeset_unlock(&mdp5_kms->state_lock);
}
void mdp5_smp_destroy(struct mdp5_smp *smp)
@ -393,8 +339,9 @@ void mdp5_smp_destroy(struct mdp5_smp *smp)
kfree(smp);
}
struct mdp5_smp *mdp5_smp_init(struct drm_device *dev, const struct mdp5_smp_block *cfg)
struct mdp5_smp *mdp5_smp_init(struct mdp5_kms *mdp5_kms, const struct mdp5_smp_block *cfg)
{
struct mdp5_smp_state *state = &mdp5_kms->state->smp;
struct mdp5_smp *smp = NULL;
int ret;
@ -404,14 +351,13 @@ struct mdp5_smp *mdp5_smp_init(struct drm_device *dev, const struct mdp5_smp_blo
goto fail;
}
smp->dev = dev;
smp->dev = mdp5_kms->dev;
smp->blk_cnt = cfg->mmb_count;
smp->blk_size = cfg->mmb_size;
/* statically tied MMBs cannot be re-allocated: */
bitmap_copy(smp->state, cfg->reserved_state, smp->blk_cnt);
bitmap_copy(state->state, cfg->reserved_state, smp->blk_cnt);
memcpy(smp->reserved, cfg->reserved, sizeof(smp->reserved));
spin_lock_init(&smp->state_lock);
return smp;
fail:

View file

@ -19,12 +19,53 @@
#ifndef __MDP5_SMP_H__
#define __MDP5_SMP_H__
#include <drm/drm_print.h>
#include "msm_drv.h"
struct mdp5_client_smp_state {
mdp5_smp_state_t inuse;
mdp5_smp_state_t configured;
mdp5_smp_state_t pending;
/*
* SMP - Shared Memory Pool:
*
* SMP blocks are shared between all the clients, where each plane in
* a scanout buffer is a SMP client. Ie. scanout of 3 plane I420 on
* pipe VIG0 => 3 clients: VIG0_Y, VIG0_CB, VIG0_CR.
*
* Based on the size of the attached scanout buffer, a certain # of
* blocks must be allocated to that client out of the shared pool.
*
* In some hw, some blocks are statically allocated for certain pipes
* and CANNOT be re-allocated (eg: MMB0 and MMB1 both tied to RGB0).
*
*
* Atomic SMP State:
*
* On atomic updates that modify SMP configuration, the state is cloned
* (copied) and modified. For test-only, or in cases where atomic
* update fails (or if we hit ww_mutex deadlock/backoff condition) the
* new state is simply thrown away.
*
* Because the SMP registers are not double buffered, updates are a
* two step process:
*
* 1) in _prepare_commit() we configure things (via read-modify-write)
* for the newly assigned pipes, so we don't take away blocks
* assigned to pipes that are still scanning out
* 2) in _complete_commit(), after vblank/etc, we clear things for the
* released clients, since at that point old pipes are no longer
* scanning out.
*/
struct mdp5_smp_state {
/* global state of what blocks are in use: */
mdp5_smp_state_t state;
/* per client state of what blocks they are using: */
mdp5_smp_state_t client_state[MAX_CLIENTS];
/* assigned pipes (hw updated at _prepare_commit()): */
unsigned long assigned;
/* released pipes (hw updated at _complete_commit()): */
unsigned long released;
};
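To make the two-step update described above concrete, here is a toy user-space model: block ownership lives in plain bitmasks, prepare_commit() only adds the newly assigned blocks (so pipes still scanning out keep theirs), and complete_commit() strips the released blocks once the vblank has passed. The names and types are invented for illustration; the real code works on mdp5_smp_state bitmaps and the SMP_ALLOC_* registers.

```c
#include <stdint.h>
#include <stdio.h>

/* Toy model of the two-step SMP update: the "hardware" allocation word
 * is only touched at the two commit points described above. */
struct smp_state {
	uint32_t hw_alloc;	/* stands in for the SMP_ALLOC_* registers */
	uint32_t assigned;	/* blocks picked up in this update */
	uint32_t released;	/* blocks given up in this update */
};

/* step 1: before the flush, add the newly assigned blocks, but keep
 * everything the old scanout still owns. */
static void prepare_commit(struct smp_state *s)
{
	s->hw_alloc |= s->assigned;
	s->assigned = 0;
}

/* step 2: after vblank the old frame is done scanning out, so the
 * released blocks can finally be taken away. */
static void complete_commit(struct smp_state *s)
{
	s->hw_alloc &= ~s->released;
	s->released = 0;
}

int main(void)
{
	struct smp_state s = {
		.hw_alloc = 0x0f, .assigned = 0x30, .released = 0x03,
	};

	prepare_commit(&s);	/* old and new blocks both programmed */
	printf("after prepare:  %#x\n", s.hw_alloc);	/* 0x3f */
	complete_commit(&s);	/* old blocks dropped after vblank */
	printf("after complete: %#x\n", s.hw_alloc);	/* 0x3c */
	return 0;
}
```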
struct mdp5_kms;
@ -36,13 +77,22 @@ struct mdp5_smp;
* which is then used to call the other mdp5_smp_*(handler, ...) functions.
*/
struct mdp5_smp *mdp5_smp_init(struct drm_device *dev, const struct mdp5_smp_block *cfg);
struct mdp5_smp *mdp5_smp_init(struct mdp5_kms *mdp5_kms,
const struct mdp5_smp_block *cfg);
void mdp5_smp_destroy(struct mdp5_smp *smp);
int mdp5_smp_request(struct mdp5_smp *smp, enum mdp5_pipe pipe,
const struct mdp_format *format, u32 width, bool hdecim);
void mdp5_smp_configure(struct mdp5_smp *smp, enum mdp5_pipe pipe);
void mdp5_smp_commit(struct mdp5_smp *smp, enum mdp5_pipe pipe);
void mdp5_smp_release(struct mdp5_smp *smp, enum mdp5_pipe pipe);
void mdp5_smp_dump(struct mdp5_smp *smp, struct drm_printer *p);
uint32_t mdp5_smp_calculate(struct mdp5_smp *smp,
const struct mdp_format *format,
u32 width, bool hdecim);
int mdp5_smp_assign(struct mdp5_smp *smp, struct mdp5_smp_state *state,
enum mdp5_pipe pipe, uint32_t blkcfg);
void mdp5_smp_release(struct mdp5_smp *smp, struct mdp5_smp_state *state,
enum mdp5_pipe pipe);
void mdp5_smp_prepare_commit(struct mdp5_smp *smp, struct mdp5_smp_state *state);
void mdp5_smp_complete_commit(struct mdp5_smp *smp, struct mdp5_smp_state *state);
#endif /* __MDP5_SMP_H__ */

View file

@ -12,7 +12,7 @@ The rules-ng-ng source files this header was generated from are:
- /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp4.xml ( 20915 bytes, from 2015-05-20 20:03:14)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp_common.xml ( 2849 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 37194 bytes, from 2015-09-18 12:07:28)
- /home/robclark/src/freedreno/envytools/rnndb/mdp/mdp5.xml ( 36965 bytes, from 2016-11-26 23:01:08)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/dsi.xml ( 27887 bytes, from 2015-10-22 16:34:52)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/sfpb.xml ( 602 bytes, from 2015-10-22 16:35:02)
- /home/robclark/src/freedreno/envytools/rnndb/dsi/mmss_cc.xml ( 1686 bytes, from 2015-05-20 20:03:14)

View file

@ -241,6 +241,10 @@ int msm_atomic_commit(struct drm_device *dev,
drm_atomic_helper_swap_state(state, true);
/* swap driver private state while still holding state_lock */
if (to_kms_state(state)->state)
priv->kms->funcs->swap_state(priv->kms, state);
/*
* Everything below can be run asynchronously without the need to grab
* any modeset locks at all under one condition: It must be guaranteed
@ -271,3 +275,30 @@ int msm_atomic_commit(struct drm_device *dev,
drm_atomic_helper_cleanup_planes(dev, state);
return ret;
}
struct drm_atomic_state *msm_atomic_state_alloc(struct drm_device *dev)
{
struct msm_kms_state *state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state || drm_atomic_state_init(dev, &state->base) < 0) {
kfree(state);
return NULL;
}
return &state->base;
}
void msm_atomic_state_clear(struct drm_atomic_state *s)
{
struct msm_kms_state *state = to_kms_state(s);
drm_atomic_state_default_clear(&state->base);
kfree(state->state);
state->state = NULL;
}
void msm_atomic_state_free(struct drm_atomic_state *state)
{
kfree(to_kms_state(state)->state);
drm_atomic_state_default_release(state);
kfree(state);
}

View file

@ -18,6 +18,7 @@
#ifdef CONFIG_DEBUG_FS
#include "msm_drv.h"
#include "msm_gpu.h"
#include "msm_kms.h"
#include "msm_debugfs.h"
static int msm_gpu_show(struct drm_device *dev, struct seq_file *m)
@ -142,6 +143,7 @@ int msm_debugfs_late_init(struct drm_device *dev)
int msm_debugfs_init(struct drm_minor *minor)
{
struct drm_device *dev = minor->dev;
struct msm_drm_private *priv = dev->dev_private;
int ret;
ret = drm_debugfs_create_files(msm_debugfs_list,
@ -153,15 +155,25 @@ int msm_debugfs_init(struct drm_minor *minor)
return ret;
}
return 0;
if (priv->kms->funcs->debugfs_init)
ret = priv->kms->funcs->debugfs_init(priv->kms, minor);
return ret;
}
void msm_debugfs_cleanup(struct drm_minor *minor)
{
struct drm_device *dev = minor->dev;
struct msm_drm_private *priv = dev->dev_private;
drm_debugfs_remove_files(msm_debugfs_list,
ARRAY_SIZE(msm_debugfs_list), minor);
if (!minor->dev->dev_private)
if (!priv)
return;
if (priv->kms->funcs->debugfs_cleanup)
priv->kms->funcs->debugfs_cleanup(priv->kms, minor);
msm_rd_debugfs_cleanup(minor);
msm_perf_debugfs_cleanup(minor);
}

View file

@ -46,17 +46,21 @@ static const struct drm_mode_config_funcs mode_config_funcs = {
.output_poll_changed = msm_fb_output_poll_changed,
.atomic_check = msm_atomic_check,
.atomic_commit = msm_atomic_commit,
.atomic_state_alloc = msm_atomic_state_alloc,
.atomic_state_clear = msm_atomic_state_clear,
.atomic_state_free = msm_atomic_state_free,
};
int msm_register_mmu(struct drm_device *dev, struct msm_mmu *mmu)
int msm_register_address_space(struct drm_device *dev,
struct msm_gem_address_space *aspace)
{
struct msm_drm_private *priv = dev->dev_private;
int idx = priv->num_mmus++;
int idx = priv->num_aspaces++;
if (WARN_ON(idx >= ARRAY_SIZE(priv->mmus)))
if (WARN_ON(idx >= ARRAY_SIZE(priv->aspace)))
return -EINVAL;
priv->mmus[idx] = mmu;
priv->aspace[idx] = aspace;
return idx;
}
@ -907,10 +911,8 @@ static int add_components_mdp(struct device *mdp_dev,
* remote-endpoint isn't a component that we need to add
*/
if (of_device_is_compatible(np, "qcom,mdp4") &&
ep.port == 0) {
of_node_put(ep_node);
ep.port == 0)
continue;
}
/*
* It's okay if some of the ports don't have a remote endpoint
@ -918,15 +920,12 @@ static int add_components_mdp(struct device *mdp_dev,
* any external interface.
*/
intf = of_graph_get_remote_port_parent(ep_node);
if (!intf) {
of_node_put(ep_node);
if (!intf)
continue;
}
drm_of_component_match_add(master_dev, matchptr, compare_of,
intf);
of_node_put(intf);
of_node_put(ep_node);
}
return 0;
@ -1039,7 +1038,13 @@ static int msm_pdev_probe(struct platform_device *pdev)
if (ret)
return ret;
pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
/* on all devices that I am aware of, iommu's which can map
* any address the cpu can see are used:
*/
ret = dma_set_mask_and_coherent(&pdev->dev, ~0);
if (ret)
return ret;
return component_master_add_with_match(&pdev->dev, &msm_drm_ops, match);
}

View file

@ -52,6 +52,8 @@ struct msm_perf_state;
struct msm_gem_submit;
struct msm_fence_context;
struct msm_fence_cb;
struct msm_gem_address_space;
struct msm_gem_vma;
#define NUM_DOMAINS 2 /* one for KMS, then one per gpu core (?) */
@ -121,12 +123,16 @@ struct msm_drm_private {
uint32_t pending_crtcs;
wait_queue_head_t pending_crtcs_event;
/* registered MMUs: */
unsigned int num_mmus;
struct msm_mmu *mmus[NUM_DOMAINS];
/* Registered address spaces.. currently this is fixed per # of
* iommu's. Ie. one for display block and one for gpu block.
* Eventually, to do per-process gpu pagetables, we'll want one
* of these per-process.
*/
unsigned int num_aspaces;
struct msm_gem_address_space *aspace[NUM_DOMAINS];
unsigned int num_planes;
struct drm_plane *planes[8];
struct drm_plane *planes[16];
unsigned int num_crtcs;
struct drm_crtc *crtcs[8];
@ -173,8 +179,22 @@ int msm_atomic_check(struct drm_device *dev,
struct drm_atomic_state *state);
int msm_atomic_commit(struct drm_device *dev,
struct drm_atomic_state *state, bool nonblock);
struct drm_atomic_state *msm_atomic_state_alloc(struct drm_device *dev);
void msm_atomic_state_clear(struct drm_atomic_state *state);
void msm_atomic_state_free(struct drm_atomic_state *state);
int msm_register_mmu(struct drm_device *dev, struct msm_mmu *mmu);
int msm_register_address_space(struct drm_device *dev,
struct msm_gem_address_space *aspace);
void msm_gem_unmap_vma(struct msm_gem_address_space *aspace,
struct msm_gem_vma *vma, struct sg_table *sgt);
int msm_gem_map_vma(struct msm_gem_address_space *aspace,
struct msm_gem_vma *vma, struct sg_table *sgt, int npages);
void msm_gem_address_space_destroy(struct msm_gem_address_space *aspace);
struct msm_gem_address_space *
msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain,
const char *name);
void msm_gem_submit_free(struct msm_gem_submit *submit);
int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
@ -189,9 +209,9 @@ int msm_gem_mmap(struct file *filp, struct vm_area_struct *vma);
int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
uint64_t msm_gem_mmap_offset(struct drm_gem_object *obj);
int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
uint32_t *iova);
int msm_gem_get_iova(struct drm_gem_object *obj, int id, uint32_t *iova);
uint32_t msm_gem_iova(struct drm_gem_object *obj, int id);
uint64_t *iova);
int msm_gem_get_iova(struct drm_gem_object *obj, int id, uint64_t *iova);
uint64_t msm_gem_iova(struct drm_gem_object *obj, int id);
struct page **msm_gem_get_pages(struct drm_gem_object *obj);
void msm_gem_put_pages(struct drm_gem_object *obj);
void msm_gem_put_iova(struct drm_gem_object *obj, int id);
@ -303,8 +323,8 @@ void __iomem *msm_ioremap(struct platform_device *pdev, const char *name,
void msm_writel(u32 data, void __iomem *addr);
u32 msm_readl(const void __iomem *addr);
#define DBG(fmt, ...) DRM_DEBUG(fmt"\n", ##__VA_ARGS__)
#define VERB(fmt, ...) if (0) DRM_DEBUG(fmt"\n", ##__VA_ARGS__)
#define DBG(fmt, ...) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__)
#define VERB(fmt, ...) if (0) DRM_DEBUG_DRIVER(fmt"\n", ##__VA_ARGS__)
static inline int align_pitch(int width, int bpp)
{

View file

@ -88,11 +88,11 @@ int msm_framebuffer_prepare(struct drm_framebuffer *fb, int id)
{
struct msm_framebuffer *msm_fb = to_msm_framebuffer(fb);
int ret, i, n = drm_format_num_planes(fb->pixel_format);
uint32_t iova;
uint64_t iova;
for (i = 0; i < n; i++) {
ret = msm_gem_get_iova(msm_fb->planes[i], id, &iova);
DBG("FB[%u]: iova[%d]: %08x (%d)", fb->base.id, i, iova, ret);
DBG("FB[%u]: iova[%d]: %08llx (%d)", fb->base.id, i, iova, ret);
if (ret)
return ret;
}

View file

@ -76,7 +76,7 @@ static int msm_fbdev_create(struct drm_fb_helper *helper,
struct drm_framebuffer *fb = NULL;
struct fb_info *fbi = NULL;
struct drm_mode_fb_cmd2 mode_cmd = {0};
uint32_t paddr;
uint64_t paddr;
int ret, size;
DBG("create fbdev: %dx%d@%d (%dx%d)", sizes->surface_width,

View file

@ -296,12 +296,8 @@ put_iova(struct drm_gem_object *obj)
WARN_ON(!mutex_is_locked(&dev->struct_mutex));
for (id = 0; id < ARRAY_SIZE(msm_obj->domain); id++) {
struct msm_mmu *mmu = priv->mmus[id];
if (mmu && msm_obj->domain[id].iova) {
uint32_t offset = msm_obj->domain[id].iova;
mmu->funcs->unmap(mmu, offset, msm_obj->sgt, obj->size);
msm_obj->domain[id].iova = 0;
}
msm_gem_unmap_vma(priv->aspace[id],
&msm_obj->domain[id], msm_obj->sgt);
}
}
@ -313,7 +309,7 @@ put_iova(struct drm_gem_object *obj)
* the refcnt counter needs to be atomic_t.
*/
int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
uint32_t *iova)
uint64_t *iova)
{
struct msm_gem_object *msm_obj = to_msm_bo(obj);
int ret = 0;
@ -326,16 +322,8 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
return PTR_ERR(pages);
if (iommu_present(&platform_bus_type)) {
struct msm_mmu *mmu = priv->mmus[id];
uint32_t offset;
if (WARN_ON(!mmu))
return -EINVAL;
offset = (uint32_t)mmap_offset(obj);
ret = mmu->funcs->map(mmu, offset, msm_obj->sgt,
obj->size, IOMMU_READ | IOMMU_WRITE);
msm_obj->domain[id].iova = offset;
ret = msm_gem_map_vma(priv->aspace[id], &msm_obj->domain[id],
msm_obj->sgt, obj->size >> PAGE_SHIFT);
} else {
msm_obj->domain[id].iova = physaddr(obj);
}
@ -348,7 +336,7 @@ int msm_gem_get_iova_locked(struct drm_gem_object *obj, int id,
}
/* get iova, taking a reference. Should have a matching put */
int msm_gem_get_iova(struct drm_gem_object *obj, int id, uint32_t *iova)
int msm_gem_get_iova(struct drm_gem_object *obj, int id, uint64_t *iova)
{
struct msm_gem_object *msm_obj = to_msm_bo(obj);
int ret;
@ -370,7 +358,7 @@ int msm_gem_get_iova(struct drm_gem_object *obj, int id, uint32_t *iova)
/* get iova without taking a reference, used in places where you have
* already done a 'msm_gem_get_iova()'.
*/
uint32_t msm_gem_iova(struct drm_gem_object *obj, int id)
uint64_t msm_gem_iova(struct drm_gem_object *obj, int id)
{
struct msm_gem_object *msm_obj = to_msm_bo(obj);
WARN_ON(!msm_obj->domain[id].iova);
@ -631,9 +619,11 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
struct msm_gem_object *msm_obj = to_msm_bo(obj);
struct reservation_object *robj = msm_obj->resv;
struct reservation_object_list *fobj;
struct msm_drm_private *priv = obj->dev->dev_private;
struct dma_fence *fence;
uint64_t off = drm_vma_node_start(&obj->vma_node);
const char *madv;
unsigned id;
WARN_ON(!mutex_is_locked(&obj->dev->struct_mutex));
@ -650,10 +640,15 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m)
break;
}
seq_printf(m, "%08x: %c %2d (%2d) %08llx %p %zu%s\n",
seq_printf(m, "%08x: %c %2d (%2d) %08llx %p\t",
msm_obj->flags, is_active(msm_obj) ? 'A' : 'I',
obj->name, obj->refcount.refcount.counter,
off, msm_obj->vaddr, obj->size, madv);
off, msm_obj->vaddr);
for (id = 0; id < priv->num_aspaces; id++)
seq_printf(m, " %08llx", msm_obj->domain[id].iova);
seq_printf(m, " %zu%s\n", obj->size, madv);
rcu_read_lock();
fobj = rcu_dereference(robj->fence);
@ -761,7 +756,6 @@ static int msm_gem_new_impl(struct drm_device *dev,
{
struct msm_drm_private *priv = dev->dev_private;
struct msm_gem_object *msm_obj;
unsigned sz;
bool use_vram = false;
switch (flags & MSM_BO_CACHE_MASK) {
@ -783,16 +777,12 @@ static int msm_gem_new_impl(struct drm_device *dev,
if (WARN_ON(use_vram && !priv->vram.size))
return -EINVAL;
sz = sizeof(*msm_obj);
if (use_vram)
sz += sizeof(struct drm_mm_node);
msm_obj = kzalloc(sz, GFP_KERNEL);
msm_obj = kzalloc(sizeof(*msm_obj), GFP_KERNEL);
if (!msm_obj)
return -ENOMEM;
if (use_vram)
msm_obj->vram_node = (void *)&msm_obj[1];
msm_obj->vram_node = &msm_obj->domain[0].node;
msm_obj->flags = flags;
msm_obj->madv = MSM_MADV_WILLNEED;

View file

@ -24,6 +24,20 @@
/* Additional internal-use only BO flags: */
#define MSM_BO_STOLEN 0x10000000 /* try to use stolen/splash memory */
struct msm_gem_address_space {
const char *name;
/* NOTE: mm managed at the page level, size is in # of pages
* and position mm_node->start is in # of pages:
*/
struct drm_mm mm;
struct msm_mmu *mmu;
};
struct msm_gem_vma {
struct drm_mm_node node;
uint64_t iova;
};
struct msm_gem_object {
struct drm_gem_object base;
@ -61,10 +75,7 @@ struct msm_gem_object {
struct sg_table *sgt;
void *vaddr;
struct {
// XXX
uint32_t iova;
} domain[NUM_DOMAINS];
struct msm_gem_vma domain[NUM_DOMAINS];
/* normally (resv == &_resv) except for imported bo's */
struct reservation_object *resv;
@ -112,13 +123,13 @@ struct msm_gem_submit {
struct {
uint32_t type;
uint32_t size; /* in dwords */
uint32_t iova;
uint64_t iova;
uint32_t idx; /* cmdstream buffer idx in bos[] */
} *cmd; /* array of size nr_cmds */
struct {
uint32_t flags;
struct msm_gem_object *obj;
uint32_t iova;
uint64_t iova;
} bos[0];
};

View file

@ -241,7 +241,7 @@ static int submit_pin_objects(struct msm_gem_submit *submit)
for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
uint32_t iova;
uint64_t iova;
/* if locking succeeded, pin bo: */
ret = msm_gem_get_iova_locked(&msm_obj->base,
@ -266,7 +266,7 @@ static int submit_pin_objects(struct msm_gem_submit *submit)
}
static int submit_bo(struct msm_gem_submit *submit, uint32_t idx,
struct msm_gem_object **obj, uint32_t *iova, bool *valid)
struct msm_gem_object **obj, uint64_t *iova, bool *valid)
{
if (idx >= submit->nr_bos) {
DRM_ERROR("invalid buffer index: %u (out of %u)\n",
@ -312,7 +312,8 @@ static int submit_reloc(struct msm_gem_submit *submit, struct msm_gem_object *ob
struct drm_msm_gem_submit_reloc submit_reloc;
void __user *userptr =
u64_to_user_ptr(relocs + (i * sizeof(submit_reloc)));
uint32_t iova, off;
uint32_t off;
uint64_t iova;
bool valid;
ret = copy_from_user(&submit_reloc, userptr, sizeof(submit_reloc));
@ -461,7 +462,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data,
void __user *userptr =
u64_to_user_ptr(args->cmds + (i * sizeof(submit_cmd)));
struct msm_gem_object *msm_obj;
uint32_t iova;
uint64_t iova;
ret = copy_from_user(&submit_cmd, userptr, sizeof(submit_cmd));
if (ret) {

View file

@ -0,0 +1,90 @@
/*
* Copyright (C) 2016 Red Hat
* Author: Rob Clark <robdclark@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "msm_drv.h"
#include "msm_gem.h"
#include "msm_mmu.h"
void
msm_gem_unmap_vma(struct msm_gem_address_space *aspace,
struct msm_gem_vma *vma, struct sg_table *sgt)
{
if (!vma->iova)
return;
if (aspace->mmu) {
unsigned size = vma->node.size << PAGE_SHIFT;
aspace->mmu->funcs->unmap(aspace->mmu, vma->iova, sgt, size);
}
drm_mm_remove_node(&vma->node);
vma->iova = 0;
}
int
msm_gem_map_vma(struct msm_gem_address_space *aspace,
struct msm_gem_vma *vma, struct sg_table *sgt, int npages)
{
int ret;
if (WARN_ON(drm_mm_node_allocated(&vma->node)))
return 0;
ret = drm_mm_insert_node(&aspace->mm, &vma->node, npages,
0, DRM_MM_SEARCH_DEFAULT);
if (ret)
return ret;
vma->iova = vma->node.start << PAGE_SHIFT;
if (aspace->mmu) {
unsigned size = npages << PAGE_SHIFT;
ret = aspace->mmu->funcs->map(aspace->mmu, vma->iova, sgt,
size, IOMMU_READ | IOMMU_WRITE);
}
return ret;
}
void
msm_gem_address_space_destroy(struct msm_gem_address_space *aspace)
{
drm_mm_takedown(&aspace->mm);
if (aspace->mmu)
aspace->mmu->funcs->destroy(aspace->mmu);
kfree(aspace);
}
struct msm_gem_address_space *
msm_gem_address_space_create(struct device *dev, struct iommu_domain *domain,
const char *name)
{
struct msm_gem_address_space *aspace;
aspace = kzalloc(sizeof(*aspace), GFP_KERNEL);
if (!aspace)
return ERR_PTR(-ENOMEM);
aspace->name = name;
aspace->mmu = msm_iommu_new(dev, domain);
drm_mm_init(&aspace->mm, (domain->geometry.aperture_start >> PAGE_SHIFT),
(domain->geometry.aperture_end >> PAGE_SHIFT) - 1);
return aspace;
}
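A lifecycle illustration (not taken from the driver) of the address-space API added above; dev/domain/sgt/npages are assumed to be supplied by the caller, as they are in msm_gpu_init() and msm_gem.c.

/* Illustration only: create an aspace, map and unmap one vma, tear down. */
static int example_aspace_lifecycle(struct device *dev,
		struct iommu_domain *domain, struct sg_table *sgt, int npages)
{
	struct msm_gem_address_space *aspace;
	struct msm_gem_vma vma = {};
	int ret;

	aspace = msm_gem_address_space_create(dev, domain, "example");
	if (IS_ERR(aspace))
		return PTR_ERR(aspace);

	ret = msm_gem_map_vma(aspace, &vma, sgt, npages);  /* alloc iova + map */
	if (!ret) {
		/* ... hand vma.iova to the hw ... */
		msm_gem_unmap_vma(aspace, &vma, sgt);       /* unmap + free iova */
	}

	msm_gem_address_space_destroy(aspace);
	return ret;
}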

View file

@ -91,21 +91,20 @@ static int disable_pwrrail(struct msm_gpu *gpu)
static int enable_clk(struct msm_gpu *gpu)
{
struct clk *rate_clk = NULL;
int i;
/* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) {
if (gpu->grp_clks[i]) {
if (gpu->grp_clks[0] && gpu->fast_rate)
clk_set_rate(gpu->grp_clks[0], gpu->fast_rate);
/* Set the RBBM timer rate to 19.2 MHz */
if (gpu->grp_clks[2])
clk_set_rate(gpu->grp_clks[2], 19200000);
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i >= 0; i--)
if (gpu->grp_clks[i])
clk_prepare(gpu->grp_clks[i]);
rate_clk = gpu->grp_clks[i];
}
}
if (rate_clk && gpu->fast_rate)
clk_set_rate(rate_clk, gpu->fast_rate);
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--)
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i >= 0; i--)
if (gpu->grp_clks[i])
clk_enable(gpu->grp_clks[i]);
@ -114,24 +113,22 @@ static int enable_clk(struct msm_gpu *gpu)
static int disable_clk(struct msm_gpu *gpu)
{
struct clk *rate_clk = NULL;
int i;
/* NOTE: kgsl_pwrctrl_clk() ignores grp_clks[0].. */
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--) {
if (gpu->grp_clks[i]) {
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i >= 0; i--)
if (gpu->grp_clks[i])
clk_disable(gpu->grp_clks[i]);
rate_clk = gpu->grp_clks[i];
}
}
if (rate_clk && gpu->slow_rate)
clk_set_rate(rate_clk, gpu->slow_rate);
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i > 0; i--)
for (i = ARRAY_SIZE(gpu->grp_clks) - 1; i >= 0; i--)
if (gpu->grp_clks[i])
clk_unprepare(gpu->grp_clks[i]);
if (gpu->grp_clks[0] && gpu->slow_rate)
clk_set_rate(gpu->grp_clks[0], gpu->slow_rate);
if (gpu->grp_clks[2])
clk_set_rate(gpu->grp_clks[2], 0);
return 0;
}
@ -528,7 +525,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit,
for (i = 0; i < submit->nr_bos; i++) {
struct msm_gem_object *msm_obj = submit->bos[i].obj;
uint32_t iova;
uint64_t iova;
/* can't happen yet.. but when we add 2d support we'll have
* to deal w/ cross-ring synchronization:
@ -563,8 +560,8 @@ static irqreturn_t irq_handler(int irq, void *data)
}
static const char *clk_names[] = {
"src_clk", "core_clk", "iface_clk", "mem_clk", "mem_iface_clk",
"alt_mem_iface_clk",
"core_clk", "iface_clk", "rbbmtimer_clk", "mem_clk",
"mem_iface_clk", "alt_mem_iface_clk",
};
int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
@ -656,12 +653,17 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
*/
iommu = iommu_domain_alloc(&platform_bus_type);
if (iommu) {
/* TODO 32b vs 64b address space.. */
iommu->geometry.aperture_start = SZ_16M;
iommu->geometry.aperture_end = 0xffffffff;
dev_info(drm->dev, "%s: using IOMMU\n", name);
gpu->mmu = msm_iommu_new(&pdev->dev, iommu);
if (IS_ERR(gpu->mmu)) {
ret = PTR_ERR(gpu->mmu);
gpu->aspace = msm_gem_address_space_create(&pdev->dev,
iommu, "gpu");
if (IS_ERR(gpu->aspace)) {
ret = PTR_ERR(gpu->aspace);
dev_err(drm->dev, "failed to init iommu: %d\n", ret);
gpu->mmu = NULL;
gpu->aspace = NULL;
iommu_domain_free(iommu);
goto fail;
}
@ -669,7 +671,7 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
} else {
dev_info(drm->dev, "%s: no IOMMU, fallback to VRAM carveout!\n", name);
}
gpu->id = msm_register_mmu(drm, gpu->mmu);
gpu->id = msm_register_address_space(drm, gpu->aspace);
/* Create ringbuffer: */
@ -705,8 +707,8 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
msm_ringbuffer_destroy(gpu->rb);
}
if (gpu->mmu)
gpu->mmu->funcs->destroy(gpu->mmu);
if (gpu->aspace)
msm_gem_address_space_destroy(gpu->aspace);
if (gpu->fctx)
msm_fence_context_free(gpu->fctx);

View file

@ -50,7 +50,7 @@ struct msm_gpu_funcs {
void (*submit)(struct msm_gpu *gpu, struct msm_gem_submit *submit,
struct msm_file_private *ctx);
void (*flush)(struct msm_gpu *gpu);
void (*idle)(struct msm_gpu *gpu);
bool (*idle)(struct msm_gpu *gpu);
irqreturn_t (*irq)(struct msm_gpu *irq);
uint32_t (*last_fence)(struct msm_gpu *gpu);
void (*recover)(struct msm_gpu *gpu);
@ -80,7 +80,7 @@ struct msm_gpu {
/* ringbuffer: */
struct msm_ringbuffer *rb;
uint32_t rb_iova;
uint64_t rb_iova;
/* list of GEM active objects: */
struct list_head active_list;
@ -98,7 +98,7 @@ struct msm_gpu {
void __iomem *mmio;
int irq;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
int id;
/* Power Control: */
@ -154,6 +154,45 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
return msm_readl(gpu->mmio + (reg << 2));
}
static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
{
uint32_t val = gpu_read(gpu, reg);
val &= ~mask;
gpu_write(gpu, reg, val | or);
}
static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
{
u64 val;
/*
* Why not a readq here? Two reasons: 1) many of the LO registers are
* not quad word aligned and 2) the GPU hardware designers have a bit
* of a history of putting registers where they fit, especially in
* spins. The longer a GPU family goes the higher the chance that
* we'll get burned. We could do a series of validity checks if we
* wanted to, but really, is a readq() that much better? Nah.
*/
/*
* For some lo/hi registers (like perfcounters), the hi value is latched
* when the lo is read, so make sure to read the lo first to trigger
* that
*/
val = (u64) msm_readl(gpu->mmio + (lo << 2));
val |= ((u64) msm_readl(gpu->mmio + (hi << 2)) << 32);
return val;
}
static inline void gpu_write64(struct msm_gpu *gpu, u32 lo, u32 hi, u64 val)
{
/* Why not a writeq here? Read the screed above */
msm_writel(lower_32_bits(val), gpu->mmio + (lo << 2));
msm_writel(upper_32_bits(val), gpu->mmio + (hi << 2));
}
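For illustration only, how a caller might pair the 64-bit accessors above; REG_EXAMPLE_* are placeholder names, not real register defines.

/* Placeholder register names -- illustration only. */
static inline u64 example_read_counter(struct msm_gpu *gpu)
{
	/* lo is read first, so counters that latch hi on the lo read snapshot correctly */
	return gpu_read64(gpu, REG_EXAMPLE_CTR_LO, REG_EXAMPLE_CTR_HI);
}

static inline void example_set_base(struct msm_gpu *gpu, u64 iova)
{
	gpu_write64(gpu, REG_EXAMPLE_BASE_LO, REG_EXAMPLE_BASE_HI, iova);
}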
int msm_gpu_pm_suspend(struct msm_gpu *gpu);
int msm_gpu_pm_resume(struct msm_gpu *gpu);

View file

@ -45,13 +45,13 @@ static void msm_iommu_detach(struct msm_mmu *mmu, const char * const *names,
iommu_detach_device(iommu->domain, mmu->dev);
}
static int msm_iommu_map(struct msm_mmu *mmu, uint32_t iova,
static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
struct sg_table *sgt, unsigned len, int prot)
{
struct msm_iommu *iommu = to_msm_iommu(mmu);
struct iommu_domain *domain = iommu->domain;
struct scatterlist *sg;
unsigned int da = iova;
unsigned long da = iova;
unsigned int i, j;
int ret;
@ -62,7 +62,7 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint32_t iova,
dma_addr_t pa = sg_phys(sg) - sg->offset;
size_t bytes = sg->length + sg->offset;
VERB("map[%d]: %08x %08lx(%zx)", i, da, (unsigned long)pa, bytes);
VERB("map[%d]: %08lx %08lx(%zx)", i, da, (unsigned long)pa, bytes);
ret = iommu_map(domain, da, pa, bytes, prot);
if (ret)
@ -84,13 +84,13 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint32_t iova,
return ret;
}
static int msm_iommu_unmap(struct msm_mmu *mmu, uint32_t iova,
static int msm_iommu_unmap(struct msm_mmu *mmu, uint64_t iova,
struct sg_table *sgt, unsigned len)
{
struct msm_iommu *iommu = to_msm_iommu(mmu);
struct iommu_domain *domain = iommu->domain;
struct scatterlist *sg;
unsigned int da = iova;
unsigned long da = iova;
int i;
for_each_sg(sgt->sgl, sg, sgt->nents, i) {
@ -101,7 +101,7 @@ static int msm_iommu_unmap(struct msm_mmu *mmu, uint32_t iova,
if (unmapped < bytes)
return unmapped;
VERB("unmap[%d]: %08x(%zx)", i, da, bytes);
VERB("unmap[%d]: %08lx(%zx)", i, da, bytes);
BUG_ON(!PAGE_ALIGNED(bytes));

View file

@ -40,6 +40,8 @@ struct msm_kms_funcs {
irqreturn_t (*irq)(struct msm_kms *kms);
int (*enable_vblank)(struct msm_kms *kms, struct drm_crtc *crtc);
void (*disable_vblank)(struct msm_kms *kms, struct drm_crtc *crtc);
/* swap global atomic state: */
void (*swap_state)(struct msm_kms *kms, struct drm_atomic_state *state);
/* modeset, bracketing atomic_commit(): */
void (*prepare_commit)(struct msm_kms *kms, struct drm_atomic_state *state);
void (*complete_commit)(struct msm_kms *kms, struct drm_atomic_state *state);
@ -56,6 +58,11 @@ struct msm_kms_funcs {
bool is_cmd_mode);
/* cleanup: */
void (*destroy)(struct msm_kms *kms);
#ifdef CONFIG_DEBUG_FS
/* debugfs: */
int (*debugfs_init)(struct msm_kms *kms, struct drm_minor *minor);
void (*debugfs_cleanup)(struct msm_kms *kms, struct drm_minor *minor);
#endif
};
struct msm_kms {
@ -65,6 +72,18 @@ struct msm_kms {
int irq;
};
/**
* Subclass of drm_atomic_state, to allow kms backend to have driver
* private global state. The kms backend can do whatever it wants
* with the ->state ptr. On ->atomic_state_clear() the ->state ptr
* is kfree'd and set back to NULL.
*/
struct msm_kms_state {
struct drm_atomic_state base;
void *state;
};
#define to_kms_state(x) container_of(x, struct msm_kms_state, base)
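A hypothetical backend sketch of how a kms driver might use the private ->state ptr together with the ->swap_state() hook; "my_kms"/"my_global_state" are placeholders, not code from this series. The driver keeps its currently committed global state behind the kms struct and swaps it with the duplicated copy in ->swap_state(), which msm_atomic_commit() calls while state_lock is held; the old copy then rides along in the drm_atomic_state and is freed by msm_atomic_state_free().

struct my_global_state { /* e.g. hwpipe and SMP assignments */ };

struct my_kms {
	struct msm_kms base;
	struct my_global_state *state;	/* currently committed state */
};

static void my_swap_state(struct msm_kms *kms, struct drm_atomic_state *s)
{
	struct my_kms *my_kms = container_of(kms, struct my_kms, base);

	/* the duplicated copy in to_kms_state(s)->state becomes current;
	 * the previous state is handed back to be kfree'd with the
	 * drm_atomic_state:
	 */
	swap(to_kms_state(s)->state, my_kms->state);
}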
static inline void msm_kms_init(struct msm_kms *kms,
const struct msm_kms_funcs *funcs)
{

View file

@ -23,9 +23,9 @@
struct msm_mmu_funcs {
int (*attach)(struct msm_mmu *mmu, const char * const *names, int cnt);
void (*detach)(struct msm_mmu *mmu, const char * const *names, int cnt);
int (*map)(struct msm_mmu *mmu, uint32_t iova, struct sg_table *sgt,
int (*map)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt,
unsigned len, int prot);
int (*unmap)(struct msm_mmu *mmu, uint32_t iova, struct sg_table *sgt,
int (*unmap)(struct msm_mmu *mmu, uint64_t iova, struct sg_table *sgt,
unsigned len);
void (*destroy)(struct msm_mmu *mmu);
};

View file

@ -289,7 +289,7 @@ void msm_rd_debugfs_cleanup(struct drm_minor *minor)
static void snapshot_buf(struct msm_rd_state *rd,
struct msm_gem_submit *submit, int idx,
uint32_t iova, uint32_t size)
uint64_t iova, uint32_t size)
{
struct msm_gem_object *obj = submit->bos[idx].obj;
const char *buf;
@ -306,7 +306,7 @@ static void snapshot_buf(struct msm_rd_state *rd,
}
rd_write_section(rd, RD_GPUADDR,
(uint32_t[2]){ iova, size }, 8);
(uint32_t[3]){ iova, size, iova >> 32 }, 12);
rd_write_section(rd, RD_BUFFER_CONTENTS, buf, size);
msm_gem_put_vaddr_locked(&obj->base);

View file

@ -2,17 +2,24 @@
* Copyright (C) 2013 Red Hat
* Author: Rob Clark <robdclark@gmail.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* You should have received a copy of the GNU General Public License along with
* this program. If not, see <http://www.gnu.org/licenses/>.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef __MSM_DRM_H__