1 // SPDX-License-Identifier: MIT
3 * Copyright © 2020 Intel Corporation
6 #include "gen6_engine_cs.h"
7 #include "intel_engine.h"
8 #include "intel_gpu_commands.h"
10 #include "intel_gt_irq.h"
11 #include "intel_gt_pm_irq.h"
12 #include "intel_ring.h"
14 #define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32))
17 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
18 * implementing two workarounds on gen6. From section 1.4.7.1
19 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
21 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
22 * produced by non-pipelined state commands), software needs to first
23 * send a PIPE_CONTROL with no bits set except Post-Sync Operation != 0.
26 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
27 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
29 * And the workaround for these two requires this workaround first:
31 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
32 * BEFORE the pipe-control with a post-sync op and no write-cache flushes.
35 * And this last workaround is tricky because of the requirements on
36 * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
39 * "1 of the following must also be set:
40 * - Render Target Cache Flush Enable ([12] of DW1)
41 * - Depth Cache Flush Enable ([0] of DW1)
42 * - Stall at Pixel Scoreboard ([1] of DW1)
43 * - Depth Stall ([13] of DW1)
44 * - Post-Sync Operation ([15:14] of DW1)
45 * - Notify Enable ([8] of DW1)"
47 * The cache flushes require the workaround flush that triggered this
48 * one, so we can't use it. Depth stall would trigger the same.
49 * Post-sync nonzero is what triggered this second workaround, so we
50 * can't use that one either. Notify enable is IRQs, which aren't
51 * really our business. That leaves only stall at scoreboard.
/*
 * Emits two PIPE_CONTROL packets into the ring of @rq.
 *
 * Visible steps:
 *  1. Reserve 6 dwords and emit a PIPE_CONTROL with CS_STALL and
 *     STALL_AT_SCOREBOARD, addressed at the RENDER_FLUSH scratch offset
 *     in the global GTT, with both data dwords zero.
 *  2. Reserve 6 more dwords and emit a PIPE_CONTROL with QW_WRITE at the
 *     same scratch address (presumably the non-zero post-sync operation
 *     the function name refers to).
 *
 * NOTE(review): several lines of this function are not visible in this
 * view (return type, braces, local declarations, the checks on the
 * intel_ring_begin() result, the remaining dwords of each packet and the
 * return statement). Confirm against the complete file before editing.
 */
54 gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
57 intel_gt_scratch_offset(rq->engine->gt,
58 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
/* First packet: CS stall paired with stall-at-scoreboard. */
61 cs = intel_ring_begin(rq, 6);
65 *cs++ = GFX_OP_PIPE_CONTROL(5);
66 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
67 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
68 *cs++ = 0; /* low dword */
69 *cs++ = 0; /* high dword */
71 intel_ring_advance(rq, cs);
/* Second packet: QW_WRITE to the same scratch address. */
73 cs = intel_ring_begin(rq, 6);
77 *cs++ = GFX_OP_PIPE_CONTROL(5);
78 *cs++ = PIPE_CONTROL_QW_WRITE;
79 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
83 intel_ring_advance(rq, cs);
/*
 * Emits a PIPE_CONTROL flush and/or invalidate for @rq, selected by @mode.
 *
 * Visible behaviour:
 *  - gen6_emit_post_sync_nonzero_flush() is called first and its result is
 *    stored in ret.
 *  - EMIT_FLUSH adds the render target and depth cache flush bits plus
 *    CS_STALL to flags.
 *  - EMIT_INVALIDATE adds the TLB, instruction, texture, VF, constant and
 *    state cache invalidate bits plus QW_WRITE and CS_STALL.
 *  - 4 dwords are reserved and a PIPE_CONTROL(4) is emitted with the
 *    RENDER_FLUSH scratch address in the global GTT.
 *
 * NOTE(review): this view omits lines (braces, declarations, the handling
 * of ret and of the intel_ring_begin() result, the dword that carries
 * flags, and the return statement). Confirm against the complete file.
 */
88 int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
91 intel_gt_scratch_offset(rq->engine->gt,
92 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
96 /* Force SNB workarounds for PIPE_CONTROL flushes */
97 ret = gen6_emit_post_sync_nonzero_flush(rq);
102 * Just flush everything. Experiments have shown that reducing the
103 * number of bits based on the write domains has little performance
104 * impact. And when rearranging requests, the order of flushes is
107 if (mode & EMIT_FLUSH) {
108 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
109 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
111 * Ensure that any following seqno writes only happen
112 * when the render cache is indeed flushed.
114 flags |= PIPE_CONTROL_CS_STALL;
116 if (mode & EMIT_INVALIDATE) {
117 flags |= PIPE_CONTROL_TLB_INVALIDATE;
118 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
119 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
120 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
121 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
122 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
124 * TLB invalidate requires a post-sync write.
126 flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
129 cs = intel_ring_begin(rq, 4);
133 *cs++ = GFX_OP_PIPE_CONTROL(4);
135 *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
137 intel_ring_advance(rq, cs);
/*
 * Writes the gen6 render breadcrumb for @rq into the buffer at @cs.
 *
 * Visible sequence:
 *  1. A PIPE_CONTROL(4) with CS_STALL and STALL_AT_SCOREBOARD.
 *  2. A PIPE_CONTROL(4) with QW_WRITE addressed at the DEFAULT scratch
 *     offset in the global GTT.
 *  3. A PIPE_CONTROL(4) that flushes the render target, depth and DC
 *     caches with QW_WRITE and CS_STALL, addressed at
 *     i915_request_active_seqno(rq) in the global GTT and carrying
 *     rq->fence.seqno as data.
 *  4. MI_USER_INTERRUPT.
 * Afterwards rq->tail is set from the final cs position and validated
 * with assert_ring_tail_valid().
 *
 * NOTE(review): braces, some emitted dwords and the return statement are
 * not visible in this view; the function is declared to return u32 *, but
 * what it returns cannot be confirmed from here.
 */
142 u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
144 /* First we do the gen6_emit_post_sync_nonzero_flush w/a */
145 *cs++ = GFX_OP_PIPE_CONTROL(4);
146 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
150 *cs++ = GFX_OP_PIPE_CONTROL(4);
151 *cs++ = PIPE_CONTROL_QW_WRITE;
152 *cs++ = intel_gt_scratch_offset(rq->engine->gt,
153 INTEL_GT_SCRATCH_FIELD_DEFAULT) |
154 PIPE_CONTROL_GLOBAL_GTT;
157 /* Finally we can flush and with it emit the breadcrumb */
158 *cs++ = GFX_OP_PIPE_CONTROL(4);
159 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
160 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
161 PIPE_CONTROL_DC_FLUSH_ENABLE |
162 PIPE_CONTROL_QW_WRITE |
163 PIPE_CONTROL_CS_STALL);
164 *cs++ = i915_request_active_seqno(rq) |
165 PIPE_CONTROL_GLOBAL_GTT;
166 *cs++ = rq->fence.seqno;
168 *cs++ = MI_USER_INTERRUPT;
/* Record where this request ends in the ring and sanity check it. */
171 rq->tail = intel_ring_offset(rq, cs);
172 assert_ring_tail_valid(rq->ring, rq->tail);
/*
 * Emits a flush command into the ring of @rq (presumably MI_FLUSH_DW,
 * going by the function name and the MI_FLUSH_DW_* bits used).
 *
 * Visible behaviour: 4 dwords are reserved, the command word cmd always
 * gets MI_FLUSH_DW_STORE_INDEX and MI_FLUSH_DW_OP_STOREDW ORed in, and the
 * address dword is HWS_SCRATCH_ADDR with MI_FLUSH_DW_USE_GTT.
 *
 * NOTE(review): this view omits lines (braces, declarations, the initial
 * value of cmd, how @flags is folded in, the check on intel_ring_begin(),
 * the other emitted dwords and the return statement). Confirm against the
 * complete file before relying on the description of @flags.
 */
177 static int mi_flush_dw(struct i915_request *rq, u32 flags)
181 cs = intel_ring_begin(rq, 4);
188 * We always require a command barrier so that subsequent
189 * commands, such as breadcrumb interrupts, are strictly ordered
190 * wrt the contents of the write cache being flushed to memory
191 * (and thus being coherent from the CPU).
193 cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
196 * Bspec vol 1c.3 - blitter engine command streamer:
197 * "If ENABLED, all TLBs will be invalidated once the flush
198 * operation is complete. This bit is only valid when the
199 * Post-Sync Operation field is a value of 1h or 3h."
204 *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
208 intel_ring_advance(rq, cs);
/*
 * Forwards to mi_flush_dw(), passing @invflags only when @mode contains
 * EMIT_INVALIDATE and 0 otherwise. Returns the mi_flush_dw() result.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
213 static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
215 return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
/*
 * Flush entry point that calls gen6_flush_dw() with MI_INVALIDATE_TLB as
 * the invalidate flags and returns its result.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
218 int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
220 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
/*
 * Flush entry point that calls gen6_flush_dw() with both MI_INVALIDATE_TLB
 * and MI_INVALIDATE_BSD as the invalidate flags and returns its result.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
223 int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
225 return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
/*
 * Emits a batch buffer start into the ring of @rq.
 *
 * Visible behaviour: security starts as MI_BATCH_NON_SECURE_I965 and is
 * adjusted when @dispatch_flags contains I915_DISPATCH_SECURE; 2 dwords
 * are reserved and filled by __gen6_emit_bb_start(cs, offset, security).
 *
 * NOTE(review): this view omits lines (the middle of the parameter list
 * that declares offset, braces, declarations, the statement guarded by
 * the I915_DISPATCH_SECURE test, the check on intel_ring_begin() and the
 * return statement). Confirm against the complete file.
 */
228 int gen6_emit_bb_start(struct i915_request *rq,
230 unsigned int dispatch_flags)
235 security = MI_BATCH_NON_SECURE_I965;
236 if (dispatch_flags & I915_DISPATCH_SECURE)
239 cs = intel_ring_begin(rq, 2);
243 cs = __gen6_emit_bb_start(cs, offset, security);
244 intel_ring_advance(rq, cs);
/*
 * Emits a batch buffer start into the ring of @rq, using the HSW batch
 * flags.
 *
 * Visible behaviour: security starts as MI_BATCH_PPGTT_HSW combined with
 * MI_BATCH_NON_SECURE_HSW and is adjusted when @dispatch_flags contains
 * I915_DISPATCH_SECURE; 2 dwords are reserved and filled by
 * __gen6_emit_bb_start(cs, offset, security).
 *
 * NOTE(review): this view omits lines (return type, the middle of the
 * parameter list that declares offset, braces, declarations, the
 * statement guarded by the I915_DISPATCH_SECURE test, the check on
 * intel_ring_begin() and the return statement). Confirm against the
 * complete file.
 */
250 hsw_emit_bb_start(struct i915_request *rq,
252 unsigned int dispatch_flags)
257 security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
258 if (dispatch_flags & I915_DISPATCH_SECURE)
261 cs = intel_ring_begin(rq, 2);
265 cs = __gen6_emit_bb_start(cs, offset, security);
266 intel_ring_advance(rq, cs);
/*
 * Emits a single PIPE_CONTROL(4) with CS_STALL and STALL_AT_SCOREBOARD
 * into the ring of @rq, after reserving 4 dwords.
 *
 * NOTE(review): braces, the declaration of cs, the check on
 * intel_ring_begin(), the remaining two dwords and the return statement
 * are not visible in this view.
 */
271 static int gen7_stall_cs(struct i915_request *rq)
275 cs = intel_ring_begin(rq, 4);
279 *cs++ = GFX_OP_PIPE_CONTROL(4);
280 *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
283 intel_ring_advance(rq, cs);
/*
 * Emits a gen7 PIPE_CONTROL flush and/or invalidate for @rq, selected by
 * @mode.
 *
 * Visible behaviour:
 *  - flags always gets CS_STALL, QW_WRITE and GLOBAL_GTT_IVB.
 *  - EMIT_FLUSH adds the render target, depth and DC flush bits plus
 *    FLUSH_ENABLE.
 *  - EMIT_INVALIDATE adds the TLB, instruction, texture, VF, constant and
 *    state cache invalidate bits plus MEDIA_STATE_CLEAR. The comment in
 *    that branch calls for a CS-stall PIPE_CONTROL beforehand; the call
 *    that issues it (presumably gen7_stall_cs) is not visible here.
 *  - 4 dwords are reserved and a PIPE_CONTROL(4) is emitted with the
 *    RENDER_FLUSH scratch address.
 *
 * NOTE(review): this view omits lines (braces, declarations, the check on
 * intel_ring_begin(), the dword that carries flags and the return
 * statement). Confirm against the complete file.
 */
288 int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
291 intel_gt_scratch_offset(rq->engine->gt,
292 INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
296 * Ensure that any following seqno writes only happen when the render
297 * cache is indeed flushed.
299 * Workaround: 4th PIPE_CONTROL command (except the ones with only
300 * read-cache invalidate bits set) must have the CS_STALL bit set. We
301 * don't try to be clever and just set it unconditionally.
303 flags |= PIPE_CONTROL_CS_STALL;
306 * CS_STALL suggests at least a post-sync write.
308 flags |= PIPE_CONTROL_QW_WRITE;
309 flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;
312 * Just flush everything. Experiments have shown that reducing the
313 * number of bits based on the write domains has little performance
316 if (mode & EMIT_FLUSH) {
317 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
318 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
319 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
320 flags |= PIPE_CONTROL_FLUSH_ENABLE;
322 if (mode & EMIT_INVALIDATE) {
323 flags |= PIPE_CONTROL_TLB_INVALIDATE;
324 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
325 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
326 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
327 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
328 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
329 flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;
332 * Workaround: we must issue a pipe_control with CS-stall bit
333 * set before a pipe_control command that has the state cache
334 * invalidate bit set.
339 cs = intel_ring_begin(rq, 4);
343 *cs++ = GFX_OP_PIPE_CONTROL(4);
345 *cs++ = scratch_addr;
347 intel_ring_advance(rq, cs);
/*
 * Writes the gen7 render breadcrumb for @rq into the buffer at @cs.
 *
 * Visible sequence: one PIPE_CONTROL(4) that flushes the render target,
 * depth and DC caches with FLUSH_ENABLE, QW_WRITE, GLOBAL_GTT_IVB and
 * CS_STALL, addressed at i915_request_active_seqno(rq) and carrying
 * rq->fence.seqno as data; then MI_USER_INTERRUPT. Afterwards rq->tail is
 * set from the final cs position and validated with
 * assert_ring_tail_valid().
 *
 * NOTE(review): braces, any further emitted dwords and the return
 * statement are not visible in this view.
 */
352 u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
354 *cs++ = GFX_OP_PIPE_CONTROL(4);
355 *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
356 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
357 PIPE_CONTROL_DC_FLUSH_ENABLE |
358 PIPE_CONTROL_FLUSH_ENABLE |
359 PIPE_CONTROL_QW_WRITE |
360 PIPE_CONTROL_GLOBAL_GTT_IVB |
361 PIPE_CONTROL_CS_STALL);
362 *cs++ = i915_request_active_seqno(rq);
363 *cs++ = rq->fence.seqno;
365 *cs++ = MI_USER_INTERRUPT;
/* Record where this request ends in the ring and sanity check it. */
368 rq->tail = intel_ring_offset(rq, cs);
369 assert_ring_tail_valid(rq->ring, rq->tail);
/*
 * Writes the gen6 breadcrumb for the non-render engines into @cs.
 *
 * Visible sequence: two GEM_BUG_ON checks that the active timeline's
 * hwsp_ggtt is the engine status page vma and that rq->hwsp_seqno sits at
 * I915_GEM_HWS_SEQNO_ADDR within its page; an MI_FLUSH_DW with
 * OP_STOREDW and STORE_INDEX that stores rq->fence.seqno at
 * I915_GEM_HWS_SEQNO_ADDR (USE_GTT); then MI_USER_INTERRUPT. Afterwards
 * rq->tail is set from the final cs position and validated.
 *
 * NOTE(review): braces and the return statement are not visible in this
 * view.
 */
374 u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
376 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
377 GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);
379 *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
380 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
381 *cs++ = rq->fence.seqno;
383 *cs++ = MI_USER_INTERRUPT;
385 rq->tail = intel_ring_offset(rq, cs);
386 assert_ring_tail_valid(rq->ring, rq->tail);
/*
 * GEN7_XCS_WA is the number of extra seqno stores emitted by
 * gen7_emit_breadcrumb_xcs() below.
 *
 * gen7_emit_breadcrumb_xcs() writes the gen7 breadcrumb for the
 * non-render engines into @cs.
 *
 * Visible sequence: the same two GEM_BUG_ON status page checks as the
 * gen6 variant; an MI_FLUSH_DW with MI_INVALIDATE_TLB, OP_STOREDW and
 * STORE_INDEX that stores rq->fence.seqno at I915_GEM_HWS_SEQNO_ADDR
 * (USE_GTT); then GEN7_XCS_WA repetitions of MI_STORE_DWORD_INDEX writing
 * rq->fence.seqno to I915_GEM_HWS_SEQNO_ADDR; then MI_USER_INTERRUPT.
 * Afterwards rq->tail is set from the final cs position and validated.
 *
 * NOTE(review): braces, the declaration of i, whatever is emitted between
 * the loop and the interrupt, and the return statement are not visible in
 * this view. The reason for the repeated stores is not stated in the
 * visible lines.
 */
391 #define GEN7_XCS_WA 32
392 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
396 GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
397 GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);
399 *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
400 MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
401 *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
402 *cs++ = rq->fence.seqno;
/* Repeat the seqno store GEN7_XCS_WA times. */
404 for (i = 0; i < GEN7_XCS_WA; i++) {
405 *cs++ = MI_STORE_DWORD_INDEX;
406 *cs++ = I915_GEM_HWS_SEQNO_ADDR;
407 *cs++ = rq->fence.seqno;
414 *cs++ = MI_USER_INTERRUPT;
417 rq->tail = intel_ring_offset(rq, cs);
418 assert_ring_tail_valid(rq->ring, rq->tail);
/*
 * Enables the interrupt of @engine.
 *
 * Writes the complement of (irq_enable_mask | irq_keep_mask) to RING_IMR,
 * issues a posting read of RING_IMR so the write has landed, and then
 * calls gen5_gt_enable_irq() with the engine irq_enable_mask.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
424 void gen6_irq_enable(struct intel_engine_cs *engine)
426 ENGINE_WRITE(engine, RING_IMR,
427 ~(engine->irq_enable_mask | engine->irq_keep_mask));
429 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
430 ENGINE_POSTING_READ(engine, RING_IMR);
432 gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
/*
 * Disables the interrupt of @engine.
 *
 * Writes the complement of irq_keep_mask to RING_IMR and then calls
 * gen5_gt_disable_irq() with the engine irq_enable_mask.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
435 void gen6_irq_disable(struct intel_engine_cs *engine)
437 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
438 gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
/*
 * Enables the interrupt of @engine through the GT PM interrupt helpers.
 *
 * Writes the complement of irq_enable_mask to RING_IMR, issues a posting
 * read of RING_IMR so the write has landed, and then calls
 * gen6_gt_pm_unmask_irq() with the engine irq_enable_mask.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
441 void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
443 ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);
445 /* Flush/delay to ensure the RING_IMR is active before the GT IMR */
446 ENGINE_POSTING_READ(engine, RING_IMR);
448 gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
/*
 * Disables the interrupt of @engine through the GT PM interrupt helpers.
 *
 * Writes all ones to RING_IMR and then calls gen6_gt_pm_mask_irq() with
 * the engine irq_enable_mask.
 * NOTE(review): the enclosing braces are not visible in this view.
 */
451 void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
453 ENGINE_WRITE(engine, RING_IMR, ~0);
454 gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);