drivers/gpu/drm/i915/gt/intel_lrc.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18
19 static void set_offsets(u32 *regs,
20                         const u8 *data,
21                         const struct intel_engine_cs *engine,
22                         bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29         (((x) >> 2) & 0x7f)
30 #define END 0
31 {
32         const u32 base = engine->mmio_base;
33
34         while (*data) {
35                 u8 count, flags;
36
37                 if (*data & BIT(7)) { /* skip */
38                         count = *data++ & ~BIT(7);
39                         regs += count;
40                         continue;
41                 }
42
43                 count = *data & 0x3f;
44                 flags = *data >> 6;
45                 data++;
46
47                 *regs = MI_LOAD_REGISTER_IMM(count);
48                 if (flags & POSTED)
49                         *regs |= MI_LRI_FORCE_POSTED;
50                 if (GRAPHICS_VER(engine->i915) >= 11)
51                         *regs |= MI_LRI_LRM_CS_MMIO;
52                 regs++;
53
54                 GEM_BUG_ON(!count);
55                 do {
56                         u32 offset = 0;
57                         u8 v;
58
59                         do {
60                                 v = *data++;
61                                 offset <<= 7;
62                                 offset |= v & ~BIT(7);
63                         } while (v & BIT(7));
64
65                         regs[0] = base + (offset << 2);
66                         regs += 2;
67                 } while (--count);
68         }
69
70         if (close) {
71                 /* Close the batch; used mainly by live_lrc_layout() */
72                 *regs = MI_BATCH_BUFFER_END;
73                 if (GRAPHICS_VER(engine->i915) >= 11)
74                         *regs |= BIT(0);
75         }
76 }
77
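/*
 * The tables below describe the register state saved in each engine's
 * context image, using the byte encoding consumed by set_offsets() above:
 *
 *  - NOP(x): a byte with BIT(7) set; the low seven bits give the number of
 *    dwords to skip in the context image.
 *  - LRI(count, flags): a header byte, (flags) << 6 | count, expanded into
 *    an MI_LOAD_REGISTER_IMM(count) header (POSTED adds
 *    MI_LRI_FORCE_POSTED).
 *  - REG(x)/REG16(x): the register offset in dwords (x >> 2), encoded seven
 *    bits per byte, most significant group first, with BIT(7) marking a
 *    continuation byte; set_offsets() writes mmio_base + (offset << 2) into
 *    the offset dword of each (offset, value) pair and skips the value.
 *  - END: a zero byte terminating the table.
 *
 * Worked example: REG16(0x3a8) emits the bytes 0x81, 0x6a; decoding gives
 * offset = (0x01 << 7) | 0x6a = 0xea, so regs[0] = base + (0xea << 2) =
 * base + 0x3a8.
 */
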
78 static const u8 gen8_xcs_offsets[] = {
79         NOP(1),
80         LRI(11, 0),
81         REG16(0x244),
82         REG(0x034),
83         REG(0x030),
84         REG(0x038),
85         REG(0x03c),
86         REG(0x168),
87         REG(0x140),
88         REG(0x110),
89         REG(0x11c),
90         REG(0x114),
91         REG(0x118),
92
93         NOP(9),
94         LRI(9, 0),
95         REG16(0x3a8),
96         REG16(0x28c),
97         REG16(0x288),
98         REG16(0x284),
99         REG16(0x280),
100         REG16(0x27c),
101         REG16(0x278),
102         REG16(0x274),
103         REG16(0x270),
104
105         NOP(13),
106         LRI(2, 0),
107         REG16(0x200),
108         REG(0x028),
109
110         END
111 };
112
113 static const u8 gen9_xcs_offsets[] = {
114         NOP(1),
115         LRI(14, POSTED),
116         REG16(0x244),
117         REG(0x034),
118         REG(0x030),
119         REG(0x038),
120         REG(0x03c),
121         REG(0x168),
122         REG(0x140),
123         REG(0x110),
124         REG(0x11c),
125         REG(0x114),
126         REG(0x118),
127         REG(0x1c0),
128         REG(0x1c4),
129         REG(0x1c8),
130
131         NOP(3),
132         LRI(9, POSTED),
133         REG16(0x3a8),
134         REG16(0x28c),
135         REG16(0x288),
136         REG16(0x284),
137         REG16(0x280),
138         REG16(0x27c),
139         REG16(0x278),
140         REG16(0x274),
141         REG16(0x270),
142
143         NOP(13),
144         LRI(1, POSTED),
145         REG16(0x200),
146
147         NOP(13),
148         LRI(44, POSTED),
149         REG(0x028),
150         REG(0x09c),
151         REG(0x0c0),
152         REG(0x178),
153         REG(0x17c),
154         REG16(0x358),
155         REG(0x170),
156         REG(0x150),
157         REG(0x154),
158         REG(0x158),
159         REG16(0x41c),
160         REG16(0x600),
161         REG16(0x604),
162         REG16(0x608),
163         REG16(0x60c),
164         REG16(0x610),
165         REG16(0x614),
166         REG16(0x618),
167         REG16(0x61c),
168         REG16(0x620),
169         REG16(0x624),
170         REG16(0x628),
171         REG16(0x62c),
172         REG16(0x630),
173         REG16(0x634),
174         REG16(0x638),
175         REG16(0x63c),
176         REG16(0x640),
177         REG16(0x644),
178         REG16(0x648),
179         REG16(0x64c),
180         REG16(0x650),
181         REG16(0x654),
182         REG16(0x658),
183         REG16(0x65c),
184         REG16(0x660),
185         REG16(0x664),
186         REG16(0x668),
187         REG16(0x66c),
188         REG16(0x670),
189         REG16(0x674),
190         REG16(0x678),
191         REG16(0x67c),
192         REG(0x068),
193
194         END
195 };
196
197 static const u8 gen12_xcs_offsets[] = {
198         NOP(1),
199         LRI(13, POSTED),
200         REG16(0x244),
201         REG(0x034),
202         REG(0x030),
203         REG(0x038),
204         REG(0x03c),
205         REG(0x168),
206         REG(0x140),
207         REG(0x110),
208         REG(0x1c0),
209         REG(0x1c4),
210         REG(0x1c8),
211         REG(0x180),
212         REG16(0x2b4),
213
214         NOP(5),
215         LRI(9, POSTED),
216         REG16(0x3a8),
217         REG16(0x28c),
218         REG16(0x288),
219         REG16(0x284),
220         REG16(0x280),
221         REG16(0x27c),
222         REG16(0x278),
223         REG16(0x274),
224         REG16(0x270),
225
226         END
227 };
228
229 static const u8 gen8_rcs_offsets[] = {
230         NOP(1),
231         LRI(14, POSTED),
232         REG16(0x244),
233         REG(0x034),
234         REG(0x030),
235         REG(0x038),
236         REG(0x03c),
237         REG(0x168),
238         REG(0x140),
239         REG(0x110),
240         REG(0x11c),
241         REG(0x114),
242         REG(0x118),
243         REG(0x1c0),
244         REG(0x1c4),
245         REG(0x1c8),
246
247         NOP(3),
248         LRI(9, POSTED),
249         REG16(0x3a8),
250         REG16(0x28c),
251         REG16(0x288),
252         REG16(0x284),
253         REG16(0x280),
254         REG16(0x27c),
255         REG16(0x278),
256         REG16(0x274),
257         REG16(0x270),
258
259         NOP(13),
260         LRI(1, 0),
261         REG(0x0c8),
262
263         END
264 };
265
266 static const u8 gen9_rcs_offsets[] = {
267         NOP(1),
268         LRI(14, POSTED),
269         REG16(0x244),
270         REG(0x34),
271         REG(0x30),
272         REG(0x38),
273         REG(0x3c),
274         REG(0x168),
275         REG(0x140),
276         REG(0x110),
277         REG(0x11c),
278         REG(0x114),
279         REG(0x118),
280         REG(0x1c0),
281         REG(0x1c4),
282         REG(0x1c8),
283
284         NOP(3),
285         LRI(9, POSTED),
286         REG16(0x3a8),
287         REG16(0x28c),
288         REG16(0x288),
289         REG16(0x284),
290         REG16(0x280),
291         REG16(0x27c),
292         REG16(0x278),
293         REG16(0x274),
294         REG16(0x270),
295
296         NOP(13),
297         LRI(1, 0),
298         REG(0xc8),
299
300         NOP(13),
301         LRI(44, POSTED),
302         REG(0x28),
303         REG(0x9c),
304         REG(0xc0),
305         REG(0x178),
306         REG(0x17c),
307         REG16(0x358),
308         REG(0x170),
309         REG(0x150),
310         REG(0x154),
311         REG(0x158),
312         REG16(0x41c),
313         REG16(0x600),
314         REG16(0x604),
315         REG16(0x608),
316         REG16(0x60c),
317         REG16(0x610),
318         REG16(0x614),
319         REG16(0x618),
320         REG16(0x61c),
321         REG16(0x620),
322         REG16(0x624),
323         REG16(0x628),
324         REG16(0x62c),
325         REG16(0x630),
326         REG16(0x634),
327         REG16(0x638),
328         REG16(0x63c),
329         REG16(0x640),
330         REG16(0x644),
331         REG16(0x648),
332         REG16(0x64c),
333         REG16(0x650),
334         REG16(0x654),
335         REG16(0x658),
336         REG16(0x65c),
337         REG16(0x660),
338         REG16(0x664),
339         REG16(0x668),
340         REG16(0x66c),
341         REG16(0x670),
342         REG16(0x674),
343         REG16(0x678),
344         REG16(0x67c),
345         REG(0x68),
346
347         END
348 };
349
350 static const u8 gen11_rcs_offsets[] = {
351         NOP(1),
352         LRI(15, POSTED),
353         REG16(0x244),
354         REG(0x034),
355         REG(0x030),
356         REG(0x038),
357         REG(0x03c),
358         REG(0x168),
359         REG(0x140),
360         REG(0x110),
361         REG(0x11c),
362         REG(0x114),
363         REG(0x118),
364         REG(0x1c0),
365         REG(0x1c4),
366         REG(0x1c8),
367         REG(0x180),
368
369         NOP(1),
370         LRI(9, POSTED),
371         REG16(0x3a8),
372         REG16(0x28c),
373         REG16(0x288),
374         REG16(0x284),
375         REG16(0x280),
376         REG16(0x27c),
377         REG16(0x278),
378         REG16(0x274),
379         REG16(0x270),
380
381         LRI(1, POSTED),
382         REG(0x1b0),
383
384         NOP(10),
385         LRI(1, 0),
386         REG(0x0c8),
387
388         END
389 };
390
391 static const u8 gen12_rcs_offsets[] = {
392         NOP(1),
393         LRI(13, POSTED),
394         REG16(0x244),
395         REG(0x034),
396         REG(0x030),
397         REG(0x038),
398         REG(0x03c),
399         REG(0x168),
400         REG(0x140),
401         REG(0x110),
402         REG(0x1c0),
403         REG(0x1c4),
404         REG(0x1c8),
405         REG(0x180),
406         REG16(0x2b4),
407
408         NOP(5),
409         LRI(9, POSTED),
410         REG16(0x3a8),
411         REG16(0x28c),
412         REG16(0x288),
413         REG16(0x284),
414         REG16(0x280),
415         REG16(0x27c),
416         REG16(0x278),
417         REG16(0x274),
418         REG16(0x270),
419
420         LRI(3, POSTED),
421         REG(0x1b0),
422         REG16(0x5a8),
423         REG16(0x5ac),
424
425         NOP(6),
426         LRI(1, 0),
427         REG(0x0c8),
428         NOP(3 + 9 + 1),
429
430         LRI(51, POSTED),
431         REG16(0x588),
432         REG16(0x588),
433         REG16(0x588),
434         REG16(0x588),
435         REG16(0x588),
436         REG16(0x588),
437         REG(0x028),
438         REG(0x09c),
439         REG(0x0c0),
440         REG(0x178),
441         REG(0x17c),
442         REG16(0x358),
443         REG(0x170),
444         REG(0x150),
445         REG(0x154),
446         REG(0x158),
447         REG16(0x41c),
448         REG16(0x600),
449         REG16(0x604),
450         REG16(0x608),
451         REG16(0x60c),
452         REG16(0x610),
453         REG16(0x614),
454         REG16(0x618),
455         REG16(0x61c),
456         REG16(0x620),
457         REG16(0x624),
458         REG16(0x628),
459         REG16(0x62c),
460         REG16(0x630),
461         REG16(0x634),
462         REG16(0x638),
463         REG16(0x63c),
464         REG16(0x640),
465         REG16(0x644),
466         REG16(0x648),
467         REG16(0x64c),
468         REG16(0x650),
469         REG16(0x654),
470         REG16(0x658),
471         REG16(0x65c),
472         REG16(0x660),
473         REG16(0x664),
474         REG16(0x668),
475         REG16(0x66c),
476         REG16(0x670),
477         REG16(0x674),
478         REG16(0x678),
479         REG16(0x67c),
480         REG(0x068),
481         REG(0x084),
482         NOP(1),
483
484         END
485 };
486
487 static const u8 xehp_rcs_offsets[] = {
488         NOP(1),
489         LRI(13, POSTED),
490         REG16(0x244),
491         REG(0x034),
492         REG(0x030),
493         REG(0x038),
494         REG(0x03c),
495         REG(0x168),
496         REG(0x140),
497         REG(0x110),
498         REG(0x1c0),
499         REG(0x1c4),
500         REG(0x1c8),
501         REG(0x180),
502         REG16(0x2b4),
503
504         NOP(5),
505         LRI(9, POSTED),
506         REG16(0x3a8),
507         REG16(0x28c),
508         REG16(0x288),
509         REG16(0x284),
510         REG16(0x280),
511         REG16(0x27c),
512         REG16(0x278),
513         REG16(0x274),
514         REG16(0x270),
515
516         LRI(3, POSTED),
517         REG(0x1b0),
518         REG16(0x5a8),
519         REG16(0x5ac),
520
521         NOP(6),
522         LRI(1, 0),
523         REG(0x0c8),
524
525         END
526 };
527
528 #undef END
529 #undef REG16
530 #undef REG
531 #undef LRI
532 #undef NOP
533
534 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
535 {
536         /*
537          * The gen12+ lists only have the registers we program in the basic
538          * default state. We rely on the context image using relative
539          * addressing to automatically fix up the register state between the
540          * physical engines for a virtual engine.
541          */
542         GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
543                    !intel_engine_has_relative_mmio(engine));
544
545         if (engine->class == RENDER_CLASS) {
546                 if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
547                         return xehp_rcs_offsets;
548                 else if (GRAPHICS_VER(engine->i915) >= 12)
549                         return gen12_rcs_offsets;
550                 else if (GRAPHICS_VER(engine->i915) >= 11)
551                         return gen11_rcs_offsets;
552                 else if (GRAPHICS_VER(engine->i915) >= 9)
553                         return gen9_rcs_offsets;
554                 else
555                         return gen8_rcs_offsets;
556         } else {
557                 if (GRAPHICS_VER(engine->i915) >= 12)
558                         return gen12_xcs_offsets;
559                 else if (GRAPHICS_VER(engine->i915) >= 9)
560                         return gen9_xcs_offsets;
561                 else
562                         return gen8_xcs_offsets;
563         }
564 }
565
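/*
 * The lrc_ring_*() helpers below return the dword index, within the
 * register state of the context image, of the MI_LRI (offset, value) pair
 * for a given register: the register offset lives at regs[x] and the saved
 * value at regs[x + 1], which is what callers such as __reset_stop_ring()
 * and lrc_setup_indirect_ctx() actually poke. A return value of -1 means
 * the register is not part of the context image for this engine/platform.
 */
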
566 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
567 {
568         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
569                 return 0x70;
570         else if (GRAPHICS_VER(engine->i915) >= 12)
571                 return 0x60;
572         else if (GRAPHICS_VER(engine->i915) >= 9)
573                 return 0x54;
574         else if (engine->class == RENDER_CLASS)
575                 return 0x58;
576         else
577                 return -1;
578 }
579
580 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
581 {
582         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
583                 return 0x84;
584         else if (GRAPHICS_VER(engine->i915) >= 12)
585                 return 0x74;
586         else if (GRAPHICS_VER(engine->i915) >= 9)
587                 return 0x68;
588         else if (engine->class == RENDER_CLASS)
589                 return 0xd8;
590         else
591                 return -1;
592 }
593
594 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
595 {
596         if (GRAPHICS_VER(engine->i915) >= 12)
597                 return 0x12;
598         else if (GRAPHICS_VER(engine->i915) >= 9 || engine->class == RENDER_CLASS)
599                 return 0x18;
600         else
601                 return -1;
602 }
603
604 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
605 {
606         int x;
607
608         x = lrc_ring_wa_bb_per_ctx(engine);
609         if (x < 0)
610                 return x;
611
612         return x + 2;
613 }
614
615 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
616 {
617         int x;
618
619         x = lrc_ring_indirect_ptr(engine);
620         if (x < 0)
621                 return x;
622
623         return x + 2;
624 }
625
626 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
627 {
628
629         if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 50))
630                 /*
631                  * Note that the CSFE context has a dummy slot for CMD_BUF_CCTL
632                  * simply to match the RCS context image layout.
633                  */
634                 return 0xc6;
635         else if (engine->class != RENDER_CLASS)
636                 return -1;
637         else if (GRAPHICS_VER(engine->i915) >= 12)
638                 return 0xb6;
639         else if (GRAPHICS_VER(engine->i915) >= 11)
640                 return 0xaa;
641         else
642                 return -1;
643 }
644
645 static u32
646 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
647 {
648         switch (GRAPHICS_VER(engine->i915)) {
649         default:
650                 MISSING_CASE(GRAPHICS_VER(engine->i915));
651                 fallthrough;
652         case 12:
653                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
654         case 11:
655                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
656         case 9:
657                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
658         case 8:
659                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
660         }
661 }
662
663 static void
664 lrc_setup_indirect_ctx(u32 *regs,
665                        const struct intel_engine_cs *engine,
666                        u32 ctx_bb_ggtt_addr,
667                        u32 size)
668 {
669         GEM_BUG_ON(!size);
670         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
671         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
672         regs[lrc_ring_indirect_ptr(engine) + 1] =
673                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
674
675         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
676         regs[lrc_ring_indirect_offset(engine) + 1] =
677                 lrc_ring_indirect_offset_default(engine) << 6;
678 }
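
/*
 * The INDIRECT_CTX pointer packs the batch size into the low bits of the
 * cacheline-aligned GGTT address: with CACHELINE_BYTES == 64, a 192-byte
 * indirect context batch at GGTT offset 0x10000 is programmed as
 * 0x10000 | 3, i.e. three cachelines. The companion INDIRECT_CTX_OFFSET
 * value is the per-gen default placed into the register's offset field
 * starting at bit 6, hence the << 6 above.
 */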
679
680 static void init_common_regs(u32 * const regs,
681                              const struct intel_context *ce,
682                              const struct intel_engine_cs *engine,
683                              bool inhibit)
684 {
685         u32 ctl;
686
687         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
688         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
689         if (inhibit)
690                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
691         if (GRAPHICS_VER(engine->i915) < 11)
692                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
693                                            CTX_CTRL_RS_CTX_ENABLE);
694         regs[CTX_CONTEXT_CONTROL] = ctl;
695
696         regs[CTX_TIMESTAMP] = ce->runtime.last;
697 }
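
/*
 * RING_CONTEXT_CONTROL is a masked register: the upper 16 bits of the
 * written value select which of the lower 16 bits take effect.
 * _MASKED_BIT_ENABLE(x) expands to (x) << 16 | (x) and
 * _MASKED_BIT_DISABLE(x) to (x) << 16, e.g. 0x00040004 and 0x00040000 for
 * a bit value of 0x4. That is why OR-ing in the bare
 * CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT bit on top of the
 * _MASKED_BIT_DISABLE() above turns the write into an enable when inhibit
 * is set.
 */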
698
699 static void init_wa_bb_regs(u32 * const regs,
700                             const struct intel_engine_cs *engine)
701 {
702         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
703
704         if (wa_ctx->per_ctx.size) {
705                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
706
707                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
708                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
709                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
710         }
711
712         if (wa_ctx->indirect_ctx.size) {
713                 lrc_setup_indirect_ctx(regs, engine,
714                                        i915_ggtt_offset(wa_ctx->vma) +
715                                        wa_ctx->indirect_ctx.offset,
716                                        wa_ctx->indirect_ctx.size);
717         }
718 }
719
720 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
721 {
722         if (i915_vm_is_4lvl(&ppgtt->vm)) {
723                 /* 64b PPGTT (48-bit canonical)
724                  * PDP0_DESCRIPTOR contains the base address of the PML4 and
725                  * the other PDP descriptors are ignored.
726                  */
727                 ASSIGN_CTX_PML4(ppgtt, regs);
728         } else {
729                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
730                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
731                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
732                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
733         }
734 }
735
736 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
737 {
738         if (i915_is_ggtt(vm))
739                 return i915_vm_to_ggtt(vm)->alias;
740         else
741                 return i915_vm_to_ppgtt(vm);
742 }
743
744 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
745 {
746         int x;
747
748         x = lrc_ring_mi_mode(engine);
749         if (x != -1) {
750                 regs[x + 1] &= ~STOP_RING;
751                 regs[x + 1] |= STOP_RING << 16;
752         }
753 }
754
755 static void __lrc_init_regs(u32 *regs,
756                             const struct intel_context *ce,
757                             const struct intel_engine_cs *engine,
758                             bool inhibit)
759 {
760         /*
761          * A context is actually a big batch buffer with several
762          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
763          * values we are setting here are only for the first context restore:
764          * on a subsequent save, the GPU will recreate this batchbuffer with new
765          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
766          * we are not initializing here).
767          *
768          * Must be kept consistent with virtual_update_register_offsets().
769          */
770
771         if (inhibit)
772                 memset(regs, 0, PAGE_SIZE);
773
774         set_offsets(regs, reg_offsets(engine), engine, inhibit);
775
776         init_common_regs(regs, ce, engine, inhibit);
777         init_ppgtt_regs(regs, vm_alias(ce->vm));
778
779         init_wa_bb_regs(regs, engine);
780
781         __reset_stop_ring(regs, engine);
782 }
783
784 void lrc_init_regs(const struct intel_context *ce,
785                    const struct intel_engine_cs *engine,
786                    bool inhibit)
787 {
788         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
789 }
790
791 void lrc_reset_regs(const struct intel_context *ce,
792                     const struct intel_engine_cs *engine)
793 {
794         __reset_stop_ring(ce->lrc_reg_state, engine);
795 }
796
797 static void
798 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
799 {
800         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
801                 return;
802
803         vaddr += engine->context_size;
804
805         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
806 }
807
808 static void
809 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
810 {
811         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
812                 return;
813
814         vaddr += engine->context_size;
815
816         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
817                 drm_err_once(&engine->i915->drm,
818                              "%s context redzone overwritten!\n",
819                              engine->name);
820 }
821
822 void lrc_init_state(struct intel_context *ce,
823                     struct intel_engine_cs *engine,
824                     void *state)
825 {
826         bool inhibit = true;
827
828         set_redzone(state, engine);
829
830         if (engine->default_state) {
831                 shmem_read(engine->default_state, 0,
832                            state, engine->context_size);
833                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
834                 inhibit = false;
835         }
836
837         /* Clear the ppHWSP (inc. per-context counters) */
838         memset(state, 0, PAGE_SIZE);
839
840         /*
841          * The second page of the context object contains some registers which
842          * must be set up prior to the first execution.
843          */
844         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
845 }
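
/*
 * Layout of the state object initialised above: page 0 is the per-process
 * HWSP (ppHWSP), which also provides the scratch slots used by the
 * workaround batches (LRC_PPHWSP_SCRATCH_ADDR); the register state starts
 * at LRC_STATE_OFFSET right after it. __lrc_alloc_state() below sizes the
 * object as the engine's context_size, plus a redzone page in
 * CONFIG_DRM_I915_DEBUG_GEM builds, plus, on Gen12, one extra page
 * (recorded in ce->wa_bb_page) for the per-context indirect workaround
 * batch.
 */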
846
847 static struct i915_vma *
848 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
849 {
850         struct drm_i915_gem_object *obj;
851         struct i915_vma *vma;
852         u32 context_size;
853
854         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
855
856         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
857                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
858
859         if (GRAPHICS_VER(engine->i915) == 12) {
860                 ce->wa_bb_page = context_size / PAGE_SIZE;
861                 context_size += PAGE_SIZE;
862         }
863
864         obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
865         if (IS_ERR(obj))
866                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
867         if (IS_ERR(obj))
868                 return ERR_CAST(obj);
869
870         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
871         if (IS_ERR(vma)) {
872                 i915_gem_object_put(obj);
873                 return vma;
874         }
875
876         return vma;
877 }
878
879 static struct intel_timeline *
880 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
881 {
882         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
883
884         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
885 }
886
887 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
888 {
889         struct intel_ring *ring;
890         struct i915_vma *vma;
891         int err;
892
893         GEM_BUG_ON(ce->state);
894
895         vma = __lrc_alloc_state(ce, engine);
896         if (IS_ERR(vma))
897                 return PTR_ERR(vma);
898
899         ring = intel_engine_create_ring(engine, ce->ring_size);
900         if (IS_ERR(ring)) {
901                 err = PTR_ERR(ring);
902                 goto err_vma;
903         }
904
905         if (!page_mask_bits(ce->timeline)) {
906                 struct intel_timeline *tl;
907
908                 /*
909                  * Use the static global HWSP for the kernel context, and
910                  * a dynamically allocated cacheline for everyone else.
911                  */
912                 if (unlikely(ce->timeline))
913                         tl = pinned_timeline(ce, engine);
914                 else
915                         tl = intel_timeline_create(engine->gt);
916                 if (IS_ERR(tl)) {
917                         err = PTR_ERR(tl);
918                         goto err_ring;
919                 }
920
921                 ce->timeline = tl;
922         }
923
924         ce->ring = ring;
925         ce->state = vma;
926
927         return 0;
928
929 err_ring:
930         intel_ring_put(ring);
931 err_vma:
932         i915_vma_put(vma);
933         return err;
934 }
935
936 void lrc_reset(struct intel_context *ce)
937 {
938         GEM_BUG_ON(!intel_context_is_pinned(ce));
939
940         intel_ring_reset(ce->ring, ce->ring->emit);
941
942         /* Scrub away the garbage */
943         lrc_init_regs(ce, ce->engine, true);
944         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
945 }
946
947 int
948 lrc_pre_pin(struct intel_context *ce,
949             struct intel_engine_cs *engine,
950             struct i915_gem_ww_ctx *ww,
951             void **vaddr)
952 {
953         GEM_BUG_ON(!ce->state);
954         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
955
956         *vaddr = i915_gem_object_pin_map(ce->state->obj,
957                                          i915_coherent_map_type(ce->engine->i915,
958                                                                 ce->state->obj,
959                                                                 false) |
960                                          I915_MAP_OVERRIDE);
961
962         return PTR_ERR_OR_ZERO(*vaddr);
963 }
964
965 int
966 lrc_pin(struct intel_context *ce,
967         struct intel_engine_cs *engine,
968         void *vaddr)
969 {
970         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
971
972         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
973                 lrc_init_state(ce, engine, vaddr);
974
975         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
976         return 0;
977 }
978
979 void lrc_unpin(struct intel_context *ce)
980 {
981         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
982                       ce->engine);
983 }
984
985 void lrc_post_unpin(struct intel_context *ce)
986 {
987         i915_gem_object_unpin_map(ce->state->obj);
988 }
989
990 void lrc_fini(struct intel_context *ce)
991 {
992         if (!ce->state)
993                 return;
994
995         intel_ring_put(fetch_and_zero(&ce->ring));
996         i915_vma_put(fetch_and_zero(&ce->state));
997 }
998
999 void lrc_destroy(struct kref *kref)
1000 {
1001         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
1002
1003         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
1004         GEM_BUG_ON(intel_context_is_pinned(ce));
1005
1006         lrc_fini(ce);
1007
1008         intel_context_fini(ce);
1009         intel_context_free(ce);
1010 }
1011
1012 static u32 *
1013 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
1014 {
1015         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1016                 MI_SRM_LRM_GLOBAL_GTT |
1017                 MI_LRI_LRM_CS_MMIO;
1018         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1019         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1020                 CTX_TIMESTAMP * sizeof(u32);
1021         *cs++ = 0;
1022
1023         *cs++ = MI_LOAD_REGISTER_REG |
1024                 MI_LRR_SOURCE_CS_MMIO |
1025                 MI_LRI_LRM_CS_MMIO;
1026         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1027         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1028
1029         *cs++ = MI_LOAD_REGISTER_REG |
1030                 MI_LRR_SOURCE_CS_MMIO |
1031                 MI_LRI_LRM_CS_MMIO;
1032         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1033         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
1034
1035         return cs;
1036 }
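
/*
 * The sequence above restores the ring's CTX_TIMESTAMP register from the
 * value saved in the context image: MI_LOAD_REGISTER_MEM pulls the saved
 * CTX_TIMESTAMP dword out of the state page (via the GGTT) into CS GPR0,
 * and two MI_LOAD_REGISTER_REG commands then copy GPR0 into
 * RING_CTX_TIMESTAMP. The *_CS_MMIO flags make the register offsets be
 * interpreted relative to the engine's mmio base, which is why the base-0
 * forms of the register macros are used here.
 */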
1037
1038 static u32 *
1039 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
1040 {
1041         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
1042
1043         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1044                 MI_SRM_LRM_GLOBAL_GTT |
1045                 MI_LRI_LRM_CS_MMIO;
1046         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1047         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1048                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
1049         *cs++ = 0;
1050
1051         return cs;
1052 }
1053
1054 static u32 *
1055 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1056 {
1057         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1058
1059         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1060                 MI_SRM_LRM_GLOBAL_GTT |
1061                 MI_LRI_LRM_CS_MMIO;
1062         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1063         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1064                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1065         *cs++ = 0;
1066
1067         *cs++ = MI_LOAD_REGISTER_REG |
1068                 MI_LRR_SOURCE_CS_MMIO |
1069                 MI_LRI_LRM_CS_MMIO;
1070         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1071         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1072
1073         return cs;
1074 }
1075
1076 static u32 *
1077 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1078 {
1079         cs = gen12_emit_timestamp_wa(ce, cs);
1080         cs = gen12_emit_cmd_buf_wa(ce, cs);
1081         cs = gen12_emit_restore_scratch(ce, cs);
1082
1083         return cs;
1084 }
1085
1086 static u32 *
1087 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1088 {
1089         cs = gen12_emit_timestamp_wa(ce, cs);
1090         cs = gen12_emit_restore_scratch(ce, cs);
1091
1092         return cs;
1093 }
1094
1095 static u32 context_wa_bb_offset(const struct intel_context *ce)
1096 {
1097         return PAGE_SIZE * ce->wa_bb_page;
1098 }
1099
1100 static u32 *context_indirect_bb(const struct intel_context *ce)
1101 {
1102         void *ptr;
1103
1104         GEM_BUG_ON(!ce->wa_bb_page);
1105
1106         ptr = ce->lrc_reg_state;
1107         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1108         ptr += context_wa_bb_offset(ce);
1109
1110         return ptr;
1111 }
1112
1113 static void
1114 setup_indirect_ctx_bb(const struct intel_context *ce,
1115                       const struct intel_engine_cs *engine,
1116                       u32 *(*emit)(const struct intel_context *, u32 *))
1117 {
1118         u32 * const start = context_indirect_bb(ce);
1119         u32 *cs;
1120
1121         cs = emit(ce, start);
1122         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1123         while ((unsigned long)cs % CACHELINE_BYTES)
1124                 *cs++ = MI_NOOP;
1125
1126         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1127                                i915_ggtt_offset(ce->state) +
1128                                context_wa_bb_offset(ce),
1129                                (cs - start) * sizeof(*cs));
1130 }
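
/*
 * The batch emitted by setup_indirect_ctx_bb() lives in the wa_bb page of
 * the context image itself (see __lrc_alloc_state()), is padded with
 * MI_NOOPs up to a cacheline boundary to match the size granularity of
 * INDIRECT_CTX, and is then advertised in the context's register state via
 * lrc_setup_indirect_ctx(). It is mutually exclusive with the global
 * engine->wa_ctx indirect batch, as asserted in lrc_update_regs().
 */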
1131
1132 /*
1133  * The context descriptor encodes various attributes of a context,
1134  * including its GTT address and some flags. Because it's fairly
1135  * expensive to calculate, we'll just do it once and cache the result,
1136  * which remains valid until the context is unpinned.
1137  *
1138  * This is what a descriptor looks like, from LSB to MSB::
1139  *
1140  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1141  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1142  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1143  *      bits 53-54:    mbz, reserved for use by hardware
1144  *      bits 55-63:    group ID, currently unused and set to 0
1145  *
1146  * Starting from Gen11, the upper dword of the descriptor has a new format:
1147  *
1148  *      bits 32-36:    reserved
1149  *      bits 37-47:    SW context ID
1150  *      bits 48-53:    engine instance
1151  *      bit 54:        mbz, reserved for use by hardware
1152  *      bits 55-60:    SW counter
1153  *      bits 61-63:    engine class
1154  *
1155  * On Xe_HP, the upper dword of the descriptor has a new format:
1156  *
1157  *      bits 32-37:    virtual function number
1158  *      bit 38:        mbz, reserved for use by hardware
1159  *      bits 39-54:    SW context ID
1160  *      bits 55-57:    reserved
1161  *      bits 58-63:    SW counter
1162  *
1163  * engine info, SW context ID and SW counter need to form a unique number
1164  * (Context ID) per lrc.
1165  */
1166 static u32 lrc_descriptor(const struct intel_context *ce)
1167 {
1168         u32 desc;
1169
1170         desc = INTEL_LEGACY_32B_CONTEXT;
1171         if (i915_vm_is_4lvl(ce->vm))
1172                 desc = INTEL_LEGACY_64B_CONTEXT;
1173         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1174
1175         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1176         if (GRAPHICS_VER(ce->vm->i915) == 8)
1177                 desc |= GEN8_CTX_L3LLC_COHERENT;
1178
1179         return i915_ggtt_offset(ce->state) | desc;
1180 }
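
/*
 * Because the state object is page aligned in the GGTT, the low twelve
 * bits of i915_ggtt_offset(ce->state) are zero and can simply be OR'ed
 * with the GEN8_CTX_* flags described above (valid, privilege, addressing
 * mode, and L3LLC coherency on Gen8) to form the lower dword of the
 * descriptor. lrc_update_regs() additionally ORs in CTX_DESC_FORCE_RESTORE
 * when (re)pinning a context.
 */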
1181
1182 u32 lrc_update_regs(const struct intel_context *ce,
1183                     const struct intel_engine_cs *engine,
1184                     u32 head)
1185 {
1186         struct intel_ring *ring = ce->ring;
1187         u32 *regs = ce->lrc_reg_state;
1188
1189         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1190         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1191
1192         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1193         regs[CTX_RING_HEAD] = head;
1194         regs[CTX_RING_TAIL] = ring->tail;
1195         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1196
1197         /* RPCS */
1198         if (engine->class == RENDER_CLASS) {
1199                 regs[CTX_R_PWR_CLK_STATE] =
1200                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1201
1202                 i915_oa_init_reg_state(ce, engine);
1203         }
1204
1205         if (ce->wa_bb_page) {
1206                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1207
1208                 fn = gen12_emit_indirect_ctx_xcs;
1209                 if (ce->engine->class == RENDER_CLASS)
1210                         fn = gen12_emit_indirect_ctx_rcs;
1211
1212                 /* Mutually exclusive wrt the global indirect bb */
1213                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1214                 setup_indirect_ctx_bb(ce, engine, fn);
1215         }
1216
1217         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1218 }
1219
1220 void lrc_update_offsets(struct intel_context *ce,
1221                         struct intel_engine_cs *engine)
1222 {
1223         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1224 }
1225
1226 void lrc_check_regs(const struct intel_context *ce,
1227                     const struct intel_engine_cs *engine,
1228                     const char *when)
1229 {
1230         const struct intel_ring *ring = ce->ring;
1231         u32 *regs = ce->lrc_reg_state;
1232         bool valid = true;
1233         int x;
1234
1235         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1236                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1237                        engine->name,
1238                        regs[CTX_RING_START],
1239                        i915_ggtt_offset(ring->vma));
1240                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1241                 valid = false;
1242         }
1243
1244         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1245             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1246                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1247                        engine->name,
1248                        regs[CTX_RING_CTL],
1249                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1250                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1251                 valid = false;
1252         }
1253
1254         x = lrc_ring_mi_mode(engine);
1255         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1256                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1257                        engine->name, regs[x + 1]);
1258                 regs[x + 1] &= ~STOP_RING;
1259                 regs[x + 1] |= STOP_RING << 16;
1260                 valid = false;
1261         }
1262
1263         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1264 }
1265
1266 /*
1267  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1268  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1269  * but there is a slight complication as this is applied in a WA batch where the
1270  * values are only initialized once, so we cannot take the register value at the
1271  * beginning and reuse it further; hence we save its value to memory, upload a
1272  * constant value with bit21 set and then restore it with the saved value.
1273  * To simplify the WA, a constant value is formed by using the default value
1274  * of this register. This shouldn't be a problem because we are only modifying
1275  * it for a short period and this batch is non-preemptible. We could of course
1276  * use additional instructions that read the actual value of the register
1277  * at that time and set our bit of interest, but that makes the WA more complicated.
1278  *
1279  * This WA is also required for Gen9 so extracting as a function avoids
1280  * code duplication.
1281  */
1282 static u32 *
1283 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1284 {
1285         /* NB no one else is allowed to scribble over scratch + 256! */
1286         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1287         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1288         *batch++ = intel_gt_scratch_offset(engine->gt,
1289                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1290         *batch++ = 0;
1291
1292         *batch++ = MI_LOAD_REGISTER_IMM(1);
1293         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1294         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1295
1296         batch = gen8_emit_pipe_control(batch,
1297                                        PIPE_CONTROL_CS_STALL |
1298                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1299                                        0);
1300
1301         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1302         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1303         *batch++ = intel_gt_scratch_offset(engine->gt,
1304                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1305         *batch++ = 0;
1306
1307         return batch;
1308 }
1309
1310 /*
1311  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1312  * initialized at the beginning and shared across all contexts, but this field
1313  * helps us to have multiple batches at different offsets and select them based
1314  * on some criteria. At the moment this batch always starts at the beginning of
1315  * the page and at this point we don't have multiple wa_ctx batch buffers.
1316  *
1317  * The number of WAs applied is not known at the beginning; we use this field
1318  * to return the number of DWORDS written.
1319  *
1320  * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END,
1321  * so it adds NOOPs as padding to make it cacheline aligned.
1322  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them
1323  * together make a complete batch buffer.
1324  */
1325 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1326 {
1327         /* WaDisableCtxRestoreArbitration:bdw,chv */
1328         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1329
1330         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1331         if (IS_BROADWELL(engine->i915))
1332                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1333
1334         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1335         /* Actual scratch location is at a 128-byte offset */
1336         batch = gen8_emit_pipe_control(batch,
1337                                        PIPE_CONTROL_FLUSH_L3 |
1338                                        PIPE_CONTROL_STORE_DATA_INDEX |
1339                                        PIPE_CONTROL_CS_STALL |
1340                                        PIPE_CONTROL_QW_WRITE,
1341                                        LRC_PPHWSP_SCRATCH_ADDR);
1342
1343         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1344
1345         /* Pad to end of cacheline */
1346         while ((unsigned long)batch % CACHELINE_BYTES)
1347                 *batch++ = MI_NOOP;
1348
1349         /*
1350          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1351          * execution depends on the length specified in terms of cache lines
1352          * in the register CTX_RCS_INDIRECT_CTX
1353          */
1354
1355         return batch;
1356 }
1357
1358 struct lri {
1359         i915_reg_t reg;
1360         u32 value;
1361 };
1362
1363 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1364 {
1365         GEM_BUG_ON(!count || count > 63);
1366
1367         *batch++ = MI_LOAD_REGISTER_IMM(count);
1368         do {
1369                 *batch++ = i915_mmio_reg_offset(lri->reg);
1370                 *batch++ = lri->value;
1371         } while (lri++, --count);
1372         *batch++ = MI_NOOP;
1373
1374         return batch;
1375 }
1376
1377 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1378 {
1379         static const struct lri lri[] = {
1380                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1381                 {
1382                         COMMON_SLICE_CHICKEN2,
1383                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1384                                        0),
1385                 },
1386
1387                 /* BSpec: 11391 */
1388                 {
1389                         FF_SLICE_CHICKEN,
1390                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1391                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1392                 },
1393
1394                 /* BSpec: 11299 */
1395                 {
1396                         _3D_CHICKEN3,
1397                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1398                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1399                 }
1400         };
1401
1402         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1403
1404         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1405         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1406
1407         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1408         batch = gen8_emit_pipe_control(batch,
1409                                        PIPE_CONTROL_FLUSH_L3 |
1410                                        PIPE_CONTROL_STORE_DATA_INDEX |
1411                                        PIPE_CONTROL_CS_STALL |
1412                                        PIPE_CONTROL_QW_WRITE,
1413                                        LRC_PPHWSP_SCRATCH_ADDR);
1414
1415         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1416
1417         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1418         if (HAS_POOLED_EU(engine->i915)) {
1419                 /*
1420                  * EU pool configuration is set up along with the golden context
1421                  * during context initialization. This value depends on the
1422                  * device type (2x6 or 3x6) and needs to be updated based
1423                  * on which subslice is disabled, especially for 2x6
1424                  * devices; however, it is safe to load the default
1425                  * configuration of a 3x6 device instead of masking off the
1426                  * corresponding bits, because the HW ignores bits of a disabled
1427                  * subslice and drops down to the appropriate config. Please
1428                  * see render_state_setup() in i915_gem_render_state.c for the
1429                  * possible configurations; to avoid duplication they are
1430                  * not shown here again.
1431                  */
1432                 *batch++ = GEN9_MEDIA_POOL_STATE;
1433                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1434                 *batch++ = 0x00777000;
1435                 *batch++ = 0;
1436                 *batch++ = 0;
1437                 *batch++ = 0;
1438         }
1439
1440         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1441
1442         /* Pad to end of cacheline */
1443         while ((unsigned long)batch % CACHELINE_BYTES)
1444                 *batch++ = MI_NOOP;
1445
1446         return batch;
1447 }
1448
1449 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1450
1451 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1452 {
1453         struct drm_i915_gem_object *obj;
1454         struct i915_vma *vma;
1455         int err;
1456
1457         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1458         if (IS_ERR(obj))
1459                 return PTR_ERR(obj);
1460
1461         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1462         if (IS_ERR(vma)) {
1463                 err = PTR_ERR(vma);
1464                 goto err;
1465         }
1466
1467         engine->wa_ctx.vma = vma;
1468         return 0;
1469
1470 err:
1471         i915_gem_object_put(obj);
1472         return err;
1473 }
1474
1475 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1476 {
1477         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1478 }
1479
1480 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1481
1482 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1483 {
1484         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1485         struct i915_wa_ctx_bb *wa_bb[] = {
1486                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1487         };
1488         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1489         struct i915_gem_ww_ctx ww;
1490         void *batch, *batch_ptr;
1491         unsigned int i;
1492         int err;
1493
1494         if (engine->class != RENDER_CLASS)
1495                 return;
1496
1497         switch (GRAPHICS_VER(engine->i915)) {
1498         case 12:
1499         case 11:
1500                 return;
1501         case 9:
1502                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1503                 wa_bb_fn[1] = NULL;
1504                 break;
1505         case 8:
1506                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1507                 wa_bb_fn[1] = NULL;
1508                 break;
1509         default:
1510                 MISSING_CASE(GRAPHICS_VER(engine->i915));
1511                 return;
1512         }
1513
1514         err = lrc_create_wa_ctx(engine);
1515         if (err) {
1516                 /*
1517                  * We continue even if we fail to initialize the WA batch
1518                  * because we only expect rare glitches, nothing critical
1519                  * enough to prevent us from using the GPU
1520                  */
1521                 drm_err(&engine->i915->drm,
1522                         "Ignoring context switch w/a allocation error:%d\n",
1523                         err);
1524                 return;
1525         }
1526
1527         if (!engine->wa_ctx.vma)
1528                 return;
1529
1530         i915_gem_ww_ctx_init(&ww, true);
1531 retry:
1532         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1533         if (!err)
1534                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1535         if (err)
1536                 goto err;
1537
1538         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1539         if (IS_ERR(batch)) {
1540                 err = PTR_ERR(batch);
1541                 goto err_unpin;
1542         }
1543
1544         /*
1545          * Emit the two workaround batch buffers, recording the offset from the
1546          * start of the workaround batch buffer object for each and their
1547          * respective sizes.
1548          */
1549         batch_ptr = batch;
1550         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1551                 wa_bb[i]->offset = batch_ptr - batch;
1552                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1553                                                   CACHELINE_BYTES))) {
1554                         err = -EINVAL;
1555                         break;
1556                 }
1557                 if (wa_bb_fn[i])
1558                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1559                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1560         }
1561         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1562
1563         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1564         __i915_gem_object_release_map(wa_ctx->vma->obj);
1565
1566         /* Verify that we can handle failure to set up the wa_ctx */
1567         if (!err)
1568                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1569
1570 err_unpin:
1571         if (err)
1572                 i915_vma_unpin(wa_ctx->vma);
1573 err:
1574         if (err == -EDEADLK) {
1575                 err = i915_gem_ww_ctx_backoff(&ww);
1576                 if (!err)
1577                         goto retry;
1578         }
1579         i915_gem_ww_ctx_fini(&ww);
1580
1581         if (err) {
1582                 i915_vma_put(engine->wa_ctx.vma);
1583
1584                 /* Clear all flags to prevent further use */
1585                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1586         }
1587 }
1588
1589 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1590 {
1591 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1592         ce->runtime.num_underflow++;
1593         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1594 #endif
1595 }
1596
1597 void lrc_update_runtime(struct intel_context *ce)
1598 {
1599         u32 old;
1600         s32 dt;
1601
1602         if (intel_context_is_barrier(ce))
1603                 return;
1604
1605         old = ce->runtime.last;
1606         ce->runtime.last = lrc_get_runtime(ce);
1607         dt = ce->runtime.last - old;
1608
1609         if (unlikely(dt < 0)) {
1610                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1611                          old, ce->runtime.last, dt);
1612                 st_update_runtime_underflow(ce, dt);
1613                 return;
1614         }
1615
1616         ewma_runtime_add(&ce->runtime.avg, dt);
1617         ce->runtime.total += dt;
1618 }
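
/*
 * lrc_update_runtime() accumulates the busy time that the hardware saves
 * into CTX_TIMESTAMP (in command streamer timestamp ticks) on each context
 * save. The delta is computed in u32 arithmetic and interpreted as s32, so
 * a counter that appears to run backwards shows up as a negative dt and is
 * reported (and, with CONFIG_DRM_I915_SELFTEST, counted) instead of being
 * folded into the average and total.
 */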
1619
1620 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1621 #include "selftest_lrc.c"
1622 #endif