};
 
 /*
  * Convenience macro for adding 32-bit registers. Any arguments after the
  * register are placed after .addr as further initializers of the same entry.
  */
-#define REG32(address, ...)                             \
-       { .addr = address, __VA_ARGS__ }
+#define REG32(_reg, ...) \
+       { .addr = (_reg), __VA_ARGS__ }
 
 /*
  * Convenience macro for adding 64-bit registers.
  * access commands only allow 32-bit accesses. Hence, we have to include
  * entries for both halves of the 64-bit registers.
  */
-#define REG64(addr)                                     \
-       REG32(addr), REG32(addr + sizeof(u32))
+#define REG64(_reg) \
+       { .addr = _reg }, \
+       { .addr = _reg ## _UDW }
+
+#define REG64_IDX(_reg, idx) \
+       { .addr = _reg(idx) }, \
+       { .addr = _reg ## _UDW(idx) }
 
 static const struct drm_i915_reg_descriptor gen7_render_regs[] = {
        REG64(GPGPU_THREADS_DISPATCHED),
        REG32(GEN7_GPGPU_DISPATCHDIMX),
        REG32(GEN7_GPGPU_DISPATCHDIMY),
        REG32(GEN7_GPGPU_DISPATCHDIMZ),
-       REG64(GEN7_SO_NUM_PRIMS_WRITTEN(0)),
-       REG64(GEN7_SO_NUM_PRIMS_WRITTEN(1)),
-       REG64(GEN7_SO_NUM_PRIMS_WRITTEN(2)),
-       REG64(GEN7_SO_NUM_PRIMS_WRITTEN(3)),
-       REG64(GEN7_SO_PRIM_STORAGE_NEEDED(0)),
-       REG64(GEN7_SO_PRIM_STORAGE_NEEDED(1)),
-       REG64(GEN7_SO_PRIM_STORAGE_NEEDED(2)),
-       REG64(GEN7_SO_PRIM_STORAGE_NEEDED(3)),
+       REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 0),
+       REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 1),
+       REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 2),
+       REG64_IDX(GEN7_SO_NUM_PRIMS_WRITTEN, 3),
+       REG64_IDX(GEN7_SO_PRIM_STORAGE_NEEDED, 0),
+       REG64_IDX(GEN7_SO_PRIM_STORAGE_NEEDED, 1),
+       REG64_IDX(GEN7_SO_PRIM_STORAGE_NEEDED, 2),
+       REG64_IDX(GEN7_SO_PRIM_STORAGE_NEEDED, 3),
        REG32(GEN7_SO_WRITE_OFFSET(0)),
        REG32(GEN7_SO_WRITE_OFFSET(1)),
        REG32(GEN7_SO_WRITE_OFFSET(2)),
 
 #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
 
+/* Each *_UDW define is the base register offset plus 4 (the next 32-bit word). */
 #define MI_PREDICATE_SRC0      (0x2400)
+#define MI_PREDICATE_SRC0_UDW  (MI_PREDICATE_SRC0 + 4)
 #define MI_PREDICATE_SRC1      (0x2408)
+#define MI_PREDICATE_SRC1_UDW  (MI_PREDICATE_SRC1 + 4)
 
 #define MI_PREDICATE_RESULT_2  (0x2214)
 #define  LOWER_SLICE_ENABLED   (1<<0)
 #define BCS_SWCTRL 0x22200
 
+/* Each *_UDW define is the base register offset plus 4 (the next 32-bit word). */
 #define GPGPU_THREADS_DISPATCHED        0x2290
+#define GPGPU_THREADS_DISPATCHED_UDW    (GPGPU_THREADS_DISPATCHED + 4)
 #define HS_INVOCATION_COUNT             0x2300
+#define HS_INVOCATION_COUNT_UDW         (HS_INVOCATION_COUNT + 4)
 #define DS_INVOCATION_COUNT             0x2308
+#define DS_INVOCATION_COUNT_UDW         (DS_INVOCATION_COUNT + 4)
 #define IA_VERTICES_COUNT               0x2310
+#define IA_VERTICES_COUNT_UDW           (IA_VERTICES_COUNT + 4)
 #define IA_PRIMITIVES_COUNT             0x2318
+#define IA_PRIMITIVES_COUNT_UDW         (IA_PRIMITIVES_COUNT + 4)
 #define VS_INVOCATION_COUNT             0x2320
+#define VS_INVOCATION_COUNT_UDW         (VS_INVOCATION_COUNT + 4)
 #define GS_INVOCATION_COUNT             0x2328
+#define GS_INVOCATION_COUNT_UDW         (GS_INVOCATION_COUNT + 4)
 #define GS_PRIMITIVES_COUNT             0x2330
+#define GS_PRIMITIVES_COUNT_UDW         (GS_PRIMITIVES_COUNT + 4)
 #define CL_INVOCATION_COUNT             0x2338
+#define CL_INVOCATION_COUNT_UDW         (CL_INVOCATION_COUNT + 4)
 #define CL_PRIMITIVES_COUNT             0x2340
+#define CL_PRIMITIVES_COUNT_UDW         (CL_PRIMITIVES_COUNT + 4)
 #define PS_INVOCATION_COUNT             0x2348
+#define PS_INVOCATION_COUNT_UDW         (PS_INVOCATION_COUNT + 4)
 #define PS_DEPTH_COUNT                  0x2350
+#define PS_DEPTH_COUNT_UDW              (PS_DEPTH_COUNT + 4)
 
 /* There are the 4 64-bit counter registers, one for each stream output */
-#define GEN7_SO_NUM_PRIMS_WRITTEN(n) (0x5200 + (n) * 8)
+#define GEN7_SO_NUM_PRIMS_WRITTEN(n)           (0x5200 + (n) * 8)
+#define GEN7_SO_NUM_PRIMS_WRITTEN_UDW(n)       (0x5200 + (n) * 8 + 4)
 
-#define GEN7_SO_PRIM_STORAGE_NEEDED(n)  (0x5240 + (n) * 8)
+#define GEN7_SO_PRIM_STORAGE_NEEDED(n)         (0x5240 + (n) * 8)
+#define GEN7_SO_PRIM_STORAGE_NEEDED_UDW(n)     (0x5240 + (n) * 8 + 4)
 
 #define GEN7_3DPRIM_END_OFFSET          0x2420
 #define GEN7_3DPRIM_START_VERTEX        0x2430