// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs;

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0; /* low dword */
	*cs++ = 0; /* high dword */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(5);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	*cs++ = 0;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

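/*
 * Emit the full gen6 render-ring PIPE_CONTROL flush: the two-part
 * post-sync-nonzero workaround above is applied first, then a single
 * PIPE_CONTROL carries whatever flush/invalidate bits @mode
 * (EMIT_FLUSH and/or EMIT_INVALIDATE) asks for.
 */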
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/* Force SNB workarounds for PIPE_CONTROL flushes */
	ret = gen6_emit_post_sync_nonzero_flush(rq);
	if (ret)
		return ret;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact. And when rearranging requests, the order of flushes is
	 * unknown.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		/*
		 * Ensure that any following seqno writes only happen
		 * when the render cache is indeed flushed.
		 */
		flags |= PIPE_CONTROL_CS_STALL;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		/*
		 * TLB invalidate requires a post-sync write.
		 */
		flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

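/*
 * Emit the render-ring breadcrumb: replay the post-sync-nonzero
 * workaround, then flush and write the fence seqno, finishing with a
 * user interrupt to signal completion.
 */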
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	/* First we do the gen6_emit_post_sync_nonzero_flush w/a */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_QW_WRITE;
	*cs++ = intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = 0;

	/* Finally we can flush and with it emit the breadcrumb */
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq) |
		PIPE_CONTROL_GLOBAL_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

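/*
 * Emit a MI_FLUSH_DW on the non-render rings, posting a dword into
 * the scratch slot of the HWSP so that the flush (and any requested
 * invalidation) is ordered before subsequent commands.
 */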
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	/*
	 * Bspec vol 1c.3 - blitter engine command streamer:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	cmd |= flags;

	*cs++ = cmd;
	*cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = 0;
	*cs++ = MI_NOOP;

	intel_ring_advance(rq, cs);

	return 0;
}

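/* Apply the invalidation flags only when EMIT_INVALIDATE is requested. */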
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);
}

int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);
}

int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD);
}

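/*
 * Start a batch buffer from the ring. Note the inverted sense of the
 * security bit: MI_BATCH_NON_SECURE_I965 is set for ordinary
 * (unprivileged) batches and cleared for I915_DISPATCH_SECURE ones.
 */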
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

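/* As above, but using the Haswell PPGTT and non-secure batch bits. */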
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs = __gen6_emit_bb_start(cs, offset, security);
	intel_ring_advance(rq, cs);

	return 0;
}

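/*
 * Emit a CS-stall PIPE_CONTROL paired with stall-at-scoreboard; used
 * below as the required precursor to a state-cache invalidate.
 */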
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	*cs++ = 0;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

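/* Full gen7 render-ring PIPE_CONTROL flush/invalidate. */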
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 */
		gen7_stall_cs(rq);
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

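/*
 * Gen7 render-ring breadcrumb: a single flushing PIPE_CONTROL writes
 * the fence seqno, then a user interrupt signals completion.
 */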
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
		 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
		 PIPE_CONTROL_DC_FLUSH_ENABLE |
		 PIPE_CONTROL_FLUSH_ENABLE |
		 PIPE_CONTROL_QW_WRITE |
		 PIPE_CONTROL_GLOBAL_GTT_IVB |
		 PIPE_CONTROL_CS_STALL);
	*cs++ = i915_request_active_seqno(rq);
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

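/*
 * Non-render (xcs) breadcrumb: MI_FLUSH_DW with a post-sync store of
 * the fence seqno into the HWSP, followed by a user interrupt.
 */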
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

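/*
 * Gen7 xcs breadcrumb: as gen6, but the seqno store is repeated
 * GEN7_XCS_WA (32) times via MI_STORE_DWORD_INDEX and chased with an
 * extra MI_FLUSH_DW, apparently to make sure the seqno write is
 * visible before the user interrupt fires.
 */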
#define GEN7_XCS_WA 32
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	int i;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	*cs++ = rq->fence.seqno;

	for (i = 0; i < GEN7_XCS_WA; i++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_FLUSH_DW;
	*cs++ = 0;
	*cs++ = 0;

	*cs++ = MI_USER_INTERRUPT;
	*cs++ = MI_NOOP;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

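/*
 * Engine interrupt control: unmask the per-engine RING_IMR before the
 * GT-level mask; the VECS variants below route through the PM
 * interrupt mask instead.
 */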
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}