e86897cde9846695b7d52cc2b32a88547125a2cc
[linux-2.6-microblaze.git] drivers/gpu/drm/i915/gt/intel_lrc.c
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2014 Intel Corporation
4  */
5
6 #include "gem/i915_gem_lmem.h"
7
8 #include "gen8_engine_cs.h"
9 #include "i915_drv.h"
10 #include "i915_perf.h"
11 #include "intel_engine.h"
12 #include "intel_gpu_commands.h"
13 #include "intel_gt.h"
14 #include "intel_lrc.h"
15 #include "intel_lrc_reg.h"
16 #include "intel_ring.h"
17 #include "shmem_utils.h"
18
19 static void set_offsets(u32 *regs,
20                         const u8 *data,
21                         const struct intel_engine_cs *engine,
22                         bool close)
23 #define NOP(x) (BIT(7) | (x))
24 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
25 #define POSTED BIT(0)
26 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
27 #define REG16(x) \
28         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
29         (((x) >> 2) & 0x7f)
30 #define END 0
31 {
32         const u32 base = engine->mmio_base;
33
34         while (*data) {
35                 u8 count, flags;
36
37                 if (*data & BIT(7)) { /* skip */
38                         count = *data++ & ~BIT(7);
39                         regs += count;
40                         continue;
41                 }
42
43                 count = *data & 0x3f;
44                 flags = *data >> 6;
45                 data++;
46
47                 *regs = MI_LOAD_REGISTER_IMM(count);
48                 if (flags & POSTED)
49                         *regs |= MI_LRI_FORCE_POSTED;
50                 if (INTEL_GEN(engine->i915) >= 11)
51                         *regs |= MI_LRI_LRM_CS_MMIO;
52                 regs++;
53
54                 GEM_BUG_ON(!count);
55                 do {
56                         u32 offset = 0;
57                         u8 v;
58
59                         do {
60                                 v = *data++;
61                                 offset <<= 7;
62                                 offset |= v & ~BIT(7);
63                         } while (v & BIT(7));
64
65                         regs[0] = base + (offset << 2);
66                         regs += 2;
67                 } while (--count);
68         }
69
70         if (close) {
71                 /* Close the batch; used mainly by live_lrc_layout() */
72                 *regs = MI_BATCH_BUFFER_END;
73                 if (INTEL_GEN(engine->i915) >= 10)
74                         *regs |= BIT(0);
75         }
76 }
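/*
 * Worked example (illustrative only, derived from the macros above): the
 * first entries of gen8_xcs_offsets below decode as follows. NOP(1) is 0x81,
 * so the bit7 "skip" path advances regs by one dword. LRI(11, 0) is 0x0b and
 * emits MI_LOAD_REGISTER_IMM(11). REG16(0x244) encodes as the two bytes
 * 0x81, 0x11: the inner loop accumulates offset = (0x01 << 7) | 0x11 = 0x91,
 * so regs[0] = base + (0x91 << 2) = base + 0x244. REG(0x034) is the single
 * byte 0x0d, giving regs[0] = base + 0x34. Only the register offsets are
 * written here; the paired value dwords are left for the default state and
 * later initialisation.
 */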
77
78 static const u8 gen8_xcs_offsets[] = {
79         NOP(1),
80         LRI(11, 0),
81         REG16(0x244),
82         REG(0x034),
83         REG(0x030),
84         REG(0x038),
85         REG(0x03c),
86         REG(0x168),
87         REG(0x140),
88         REG(0x110),
89         REG(0x11c),
90         REG(0x114),
91         REG(0x118),
92
93         NOP(9),
94         LRI(9, 0),
95         REG16(0x3a8),
96         REG16(0x28c),
97         REG16(0x288),
98         REG16(0x284),
99         REG16(0x280),
100         REG16(0x27c),
101         REG16(0x278),
102         REG16(0x274),
103         REG16(0x270),
104
105         NOP(13),
106         LRI(2, 0),
107         REG16(0x200),
108         REG(0x028),
109
110         END
111 };
112
113 static const u8 gen9_xcs_offsets[] = {
114         NOP(1),
115         LRI(14, POSTED),
116         REG16(0x244),
117         REG(0x034),
118         REG(0x030),
119         REG(0x038),
120         REG(0x03c),
121         REG(0x168),
122         REG(0x140),
123         REG(0x110),
124         REG(0x11c),
125         REG(0x114),
126         REG(0x118),
127         REG(0x1c0),
128         REG(0x1c4),
129         REG(0x1c8),
130
131         NOP(3),
132         LRI(9, POSTED),
133         REG16(0x3a8),
134         REG16(0x28c),
135         REG16(0x288),
136         REG16(0x284),
137         REG16(0x280),
138         REG16(0x27c),
139         REG16(0x278),
140         REG16(0x274),
141         REG16(0x270),
142
143         NOP(13),
144         LRI(1, POSTED),
145         REG16(0x200),
146
147         NOP(13),
148         LRI(44, POSTED),
149         REG(0x028),
150         REG(0x09c),
151         REG(0x0c0),
152         REG(0x178),
153         REG(0x17c),
154         REG16(0x358),
155         REG(0x170),
156         REG(0x150),
157         REG(0x154),
158         REG(0x158),
159         REG16(0x41c),
160         REG16(0x600),
161         REG16(0x604),
162         REG16(0x608),
163         REG16(0x60c),
164         REG16(0x610),
165         REG16(0x614),
166         REG16(0x618),
167         REG16(0x61c),
168         REG16(0x620),
169         REG16(0x624),
170         REG16(0x628),
171         REG16(0x62c),
172         REG16(0x630),
173         REG16(0x634),
174         REG16(0x638),
175         REG16(0x63c),
176         REG16(0x640),
177         REG16(0x644),
178         REG16(0x648),
179         REG16(0x64c),
180         REG16(0x650),
181         REG16(0x654),
182         REG16(0x658),
183         REG16(0x65c),
184         REG16(0x660),
185         REG16(0x664),
186         REG16(0x668),
187         REG16(0x66c),
188         REG16(0x670),
189         REG16(0x674),
190         REG16(0x678),
191         REG16(0x67c),
192         REG(0x068),
193
194         END
195 };
196
197 static const u8 gen12_xcs_offsets[] = {
198         NOP(1),
199         LRI(13, POSTED),
200         REG16(0x244),
201         REG(0x034),
202         REG(0x030),
203         REG(0x038),
204         REG(0x03c),
205         REG(0x168),
206         REG(0x140),
207         REG(0x110),
208         REG(0x1c0),
209         REG(0x1c4),
210         REG(0x1c8),
211         REG(0x180),
212         REG16(0x2b4),
213
214         NOP(5),
215         LRI(9, POSTED),
216         REG16(0x3a8),
217         REG16(0x28c),
218         REG16(0x288),
219         REG16(0x284),
220         REG16(0x280),
221         REG16(0x27c),
222         REG16(0x278),
223         REG16(0x274),
224         REG16(0x270),
225
226         END
227 };
228
229 static const u8 gen8_rcs_offsets[] = {
230         NOP(1),
231         LRI(14, POSTED),
232         REG16(0x244),
233         REG(0x034),
234         REG(0x030),
235         REG(0x038),
236         REG(0x03c),
237         REG(0x168),
238         REG(0x140),
239         REG(0x110),
240         REG(0x11c),
241         REG(0x114),
242         REG(0x118),
243         REG(0x1c0),
244         REG(0x1c4),
245         REG(0x1c8),
246
247         NOP(3),
248         LRI(9, POSTED),
249         REG16(0x3a8),
250         REG16(0x28c),
251         REG16(0x288),
252         REG16(0x284),
253         REG16(0x280),
254         REG16(0x27c),
255         REG16(0x278),
256         REG16(0x274),
257         REG16(0x270),
258
259         NOP(13),
260         LRI(1, 0),
261         REG(0x0c8),
262
263         END
264 };
265
266 static const u8 gen9_rcs_offsets[] = {
267         NOP(1),
268         LRI(14, POSTED),
269         REG16(0x244),
270         REG(0x34),
271         REG(0x30),
272         REG(0x38),
273         REG(0x3c),
274         REG(0x168),
275         REG(0x140),
276         REG(0x110),
277         REG(0x11c),
278         REG(0x114),
279         REG(0x118),
280         REG(0x1c0),
281         REG(0x1c4),
282         REG(0x1c8),
283
284         NOP(3),
285         LRI(9, POSTED),
286         REG16(0x3a8),
287         REG16(0x28c),
288         REG16(0x288),
289         REG16(0x284),
290         REG16(0x280),
291         REG16(0x27c),
292         REG16(0x278),
293         REG16(0x274),
294         REG16(0x270),
295
296         NOP(13),
297         LRI(1, 0),
298         REG(0xc8),
299
300         NOP(13),
301         LRI(44, POSTED),
302         REG(0x28),
303         REG(0x9c),
304         REG(0xc0),
305         REG(0x178),
306         REG(0x17c),
307         REG16(0x358),
308         REG(0x170),
309         REG(0x150),
310         REG(0x154),
311         REG(0x158),
312         REG16(0x41c),
313         REG16(0x600),
314         REG16(0x604),
315         REG16(0x608),
316         REG16(0x60c),
317         REG16(0x610),
318         REG16(0x614),
319         REG16(0x618),
320         REG16(0x61c),
321         REG16(0x620),
322         REG16(0x624),
323         REG16(0x628),
324         REG16(0x62c),
325         REG16(0x630),
326         REG16(0x634),
327         REG16(0x638),
328         REG16(0x63c),
329         REG16(0x640),
330         REG16(0x644),
331         REG16(0x648),
332         REG16(0x64c),
333         REG16(0x650),
334         REG16(0x654),
335         REG16(0x658),
336         REG16(0x65c),
337         REG16(0x660),
338         REG16(0x664),
339         REG16(0x668),
340         REG16(0x66c),
341         REG16(0x670),
342         REG16(0x674),
343         REG16(0x678),
344         REG16(0x67c),
345         REG(0x68),
346
347         END
348 };
349
350 static const u8 gen11_rcs_offsets[] = {
351         NOP(1),
352         LRI(15, POSTED),
353         REG16(0x244),
354         REG(0x034),
355         REG(0x030),
356         REG(0x038),
357         REG(0x03c),
358         REG(0x168),
359         REG(0x140),
360         REG(0x110),
361         REG(0x11c),
362         REG(0x114),
363         REG(0x118),
364         REG(0x1c0),
365         REG(0x1c4),
366         REG(0x1c8),
367         REG(0x180),
368
369         NOP(1),
370         LRI(9, POSTED),
371         REG16(0x3a8),
372         REG16(0x28c),
373         REG16(0x288),
374         REG16(0x284),
375         REG16(0x280),
376         REG16(0x27c),
377         REG16(0x278),
378         REG16(0x274),
379         REG16(0x270),
380
381         LRI(1, POSTED),
382         REG(0x1b0),
383
384         NOP(10),
385         LRI(1, 0),
386         REG(0x0c8),
387
388         END
389 };
390
391 static const u8 gen12_rcs_offsets[] = {
392         NOP(1),
393         LRI(13, POSTED),
394         REG16(0x244),
395         REG(0x034),
396         REG(0x030),
397         REG(0x038),
398         REG(0x03c),
399         REG(0x168),
400         REG(0x140),
401         REG(0x110),
402         REG(0x1c0),
403         REG(0x1c4),
404         REG(0x1c8),
405         REG(0x180),
406         REG16(0x2b4),
407
408         NOP(5),
409         LRI(9, POSTED),
410         REG16(0x3a8),
411         REG16(0x28c),
412         REG16(0x288),
413         REG16(0x284),
414         REG16(0x280),
415         REG16(0x27c),
416         REG16(0x278),
417         REG16(0x274),
418         REG16(0x270),
419
420         LRI(3, POSTED),
421         REG(0x1b0),
422         REG16(0x5a8),
423         REG16(0x5ac),
424
425         NOP(6),
426         LRI(1, 0),
427         REG(0x0c8),
428         NOP(3 + 9 + 1),
429
430         LRI(51, POSTED),
431         REG16(0x588),
432         REG16(0x588),
433         REG16(0x588),
434         REG16(0x588),
435         REG16(0x588),
436         REG16(0x588),
437         REG(0x028),
438         REG(0x09c),
439         REG(0x0c0),
440         REG(0x178),
441         REG(0x17c),
442         REG16(0x358),
443         REG(0x170),
444         REG(0x150),
445         REG(0x154),
446         REG(0x158),
447         REG16(0x41c),
448         REG16(0x600),
449         REG16(0x604),
450         REG16(0x608),
451         REG16(0x60c),
452         REG16(0x610),
453         REG16(0x614),
454         REG16(0x618),
455         REG16(0x61c),
456         REG16(0x620),
457         REG16(0x624),
458         REG16(0x628),
459         REG16(0x62c),
460         REG16(0x630),
461         REG16(0x634),
462         REG16(0x638),
463         REG16(0x63c),
464         REG16(0x640),
465         REG16(0x644),
466         REG16(0x648),
467         REG16(0x64c),
468         REG16(0x650),
469         REG16(0x654),
470         REG16(0x658),
471         REG16(0x65c),
472         REG16(0x660),
473         REG16(0x664),
474         REG16(0x668),
475         REG16(0x66c),
476         REG16(0x670),
477         REG16(0x674),
478         REG16(0x678),
479         REG16(0x67c),
480         REG(0x068),
481         REG(0x084),
482         NOP(1),
483
484         END
485 };
486
487 #undef END
488 #undef REG16
489 #undef REG
490 #undef LRI
491 #undef NOP
492
493 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
494 {
495         /*
496          * The gen12+ lists only have the registers we program in the basic
497          * default state. We rely on the context image using relative
498          * addressing to automatically fix up the register state between the
499          * physical engines of a virtual engine.
500          */
501         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
502                    !intel_engine_has_relative_mmio(engine));
503
504         if (engine->class == RENDER_CLASS) {
505                 if (INTEL_GEN(engine->i915) >= 12)
506                         return gen12_rcs_offsets;
507                 else if (INTEL_GEN(engine->i915) >= 11)
508                         return gen11_rcs_offsets;
509                 else if (INTEL_GEN(engine->i915) >= 9)
510                         return gen9_rcs_offsets;
511                 else
512                         return gen8_rcs_offsets;
513         } else {
514                 if (INTEL_GEN(engine->i915) >= 12)
515                         return gen12_xcs_offsets;
516                 else if (INTEL_GEN(engine->i915) >= 9)
517                         return gen9_xcs_offsets;
518                 else
519                         return gen8_xcs_offsets;
520         }
521 }
522
523 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
524 {
525         if (INTEL_GEN(engine->i915) >= 12)
526                 return 0x60;
527         else if (INTEL_GEN(engine->i915) >= 9)
528                 return 0x54;
529         else if (engine->class == RENDER_CLASS)
530                 return 0x58;
531         else
532                 return -1;
533 }
534
535 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
536 {
537         if (INTEL_GEN(engine->i915) >= 12)
538                 return 0x74;
539         else if (INTEL_GEN(engine->i915) >= 9)
540                 return 0x68;
541         else if (engine->class == RENDER_CLASS)
542                 return 0xd8;
543         else
544                 return -1;
545 }
546
547 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
548 {
549         if (INTEL_GEN(engine->i915) >= 12)
550                 return 0x12;
551         else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
552                 return 0x18;
553         else
554                 return -1;
555 }
556
557 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
558 {
559         int x;
560
561         x = lrc_ring_wa_bb_per_ctx(engine);
562         if (x < 0)
563                 return x;
564
565         return x + 2;
566 }
567
568 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
569 {
570         int x;
571
572         x = lrc_ring_indirect_ptr(engine);
573         if (x < 0)
574                 return x;
575
576         return x + 2;
577 }
578
579 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
580 {
581         if (engine->class != RENDER_CLASS)
582                 return -1;
583
584         if (INTEL_GEN(engine->i915) >= 12)
585                 return 0xb6;
586         else if (INTEL_GEN(engine->i915) >= 11)
587                 return 0xaa;
588         else
589                 return -1;
590 }
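/*
 * Note: each lrc_ring_*() helper above returns the dword index of a
 * register's MI_LOAD_REGISTER_IMM slot within the context image (fixed
 * positions in the layout described by the offset tables earlier in this
 * file); the register's value lives at index + 1, which is why the callers
 * below write regs[x + 1].
 */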
591
592 static u32
593 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
594 {
595         switch (INTEL_GEN(engine->i915)) {
596         default:
597                 MISSING_CASE(INTEL_GEN(engine->i915));
598                 fallthrough;
599         case 12:
600                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
601         case 11:
602                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
603         case 10:
604                 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
605         case 9:
606                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
607         case 8:
608                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
609         }
610 }
611
612 static void
613 lrc_setup_indirect_ctx(u32 *regs,
614                        const struct intel_engine_cs *engine,
615                        u32 ctx_bb_ggtt_addr,
616                        u32 size)
617 {
618         GEM_BUG_ON(!size);
619         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
620         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
621         regs[lrc_ring_indirect_ptr(engine) + 1] =
622                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
623
624         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
625         regs[lrc_ring_indirect_offset(engine) + 1] =
626                 lrc_ring_indirect_offset_default(engine) << 6;
627 }
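/*
 * For illustration: the INDIRECT_CTX dword packs the GGTT address of the
 * batch together with its length in cachelines in the low bits (hence the
 * CACHELINE_BYTES alignment check above, and the expectation that the
 * address itself is cacheline aligned, which the callers enforce), while the
 * INDIRECT_CTX_OFFSET dword takes the per-gen default offset shifted into
 * place by 6 bits.
 */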
628
629 static void init_common_regs(u32 * const regs,
630                              const struct intel_context *ce,
631                              const struct intel_engine_cs *engine,
632                              bool inhibit)
633 {
634         u32 ctl;
635
636         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
637         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
638         if (inhibit)
639                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
640         if (INTEL_GEN(engine->i915) < 11)
641                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
642                                            CTX_CTRL_RS_CTX_ENABLE);
643         regs[CTX_CONTEXT_CONTROL] = ctl;
644
645         regs[CTX_TIMESTAMP] = ce->runtime.last;
646 }
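/*
 * For illustration, assuming the usual i915 masked-register helpers:
 * CTX_CONTEXT_CONTROL is a masked register, i.e. the upper 16 bits of the
 * written value select which of the lower 16 bits take effect.
 * _MASKED_BIT_ENABLE(b) expands to ((b) << 16 | (b)) and
 * _MASKED_BIT_DISABLE(b) to ((b) << 16), so the ctl value built above both
 * unmasks and sets/clears the chosen bits in a single write.
 */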
647
648 static void init_wa_bb_regs(u32 * const regs,
649                             const struct intel_engine_cs *engine)
650 {
651         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
652
653         if (wa_ctx->per_ctx.size) {
654                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
655
656                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
657                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
658                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
659         }
660
661         if (wa_ctx->indirect_ctx.size) {
662                 lrc_setup_indirect_ctx(regs, engine,
663                                        i915_ggtt_offset(wa_ctx->vma) +
664                                        wa_ctx->indirect_ctx.offset,
665                                        wa_ctx->indirect_ctx.size);
666         }
667 }
668
669 static void init_ppgtt_regs(u32 *regs, const struct i915_ppgtt *ppgtt)
670 {
671         if (i915_vm_is_4lvl(&ppgtt->vm)) {
672                 /* 64b PPGTT (48bit canonical)
673                  * PDP0_DESCRIPTOR contains the base address of the PML4 and
674                  * the other PDP descriptors are ignored.
675                  */
676                 ASSIGN_CTX_PML4(ppgtt, regs);
677         } else {
678                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
679                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
680                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
681                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
682         }
683 }
684
685 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
686 {
687         if (i915_is_ggtt(vm))
688                 return i915_vm_to_ggtt(vm)->alias;
689         else
690                 return i915_vm_to_ppgtt(vm);
691 }
692
693 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
694 {
695         int x;
696
697         x = lrc_ring_mi_mode(engine);
698         if (x != -1) {
699                 regs[x + 1] &= ~STOP_RING;
700                 regs[x + 1] |= STOP_RING << 16;
701         }
702 }
703
704 static void __lrc_init_regs(u32 *regs,
705                             const struct intel_context *ce,
706                             const struct intel_engine_cs *engine,
707                             bool inhibit)
708 {
709         /*
710          * A context is actually a big batch buffer with several
711          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
712          * values we are setting here are only for the first context restore:
713          * on a subsequent save, the GPU will recreate this batch buffer with new
714          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
715          * we are not initializing here).
716          *
717          * Must keep consistent with virtual_update_register_offsets().
718          */
719
720         if (inhibit)
721                 memset(regs, 0, PAGE_SIZE);
722
723         set_offsets(regs, reg_offsets(engine), engine, inhibit);
724
725         init_common_regs(regs, ce, engine, inhibit);
726         init_ppgtt_regs(regs, vm_alias(ce->vm));
727
728         init_wa_bb_regs(regs, engine);
729
730         __reset_stop_ring(regs, engine);
731 }
732
733 void lrc_init_regs(const struct intel_context *ce,
734                    const struct intel_engine_cs *engine,
735                    bool inhibit)
736 {
737         __lrc_init_regs(ce->lrc_reg_state, ce, engine, inhibit);
738 }
739
740 void lrc_reset_regs(const struct intel_context *ce,
741                     const struct intel_engine_cs *engine)
742 {
743         __reset_stop_ring(ce->lrc_reg_state, engine);
744 }
745
746 static void
747 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
748 {
749         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
750                 return;
751
752         vaddr += engine->context_size;
753
754         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
755 }
756
757 static void
758 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
759 {
760         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
761                 return;
762
763         vaddr += engine->context_size;
764
765         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
766                 drm_err_once(&engine->i915->drm,
767                              "%s context redzone overwritten!\n",
768                              engine->name);
769 }
770
771 void lrc_init_state(struct intel_context *ce,
772                     struct intel_engine_cs *engine,
773                     void *state)
774 {
775         bool inhibit = true;
776
777         set_redzone(state, engine);
778
779         if (engine->default_state) {
780                 shmem_read(engine->default_state, 0,
781                            state, engine->context_size);
782                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
783                 inhibit = false;
784         }
785
786         /* Clear the ppHWSP (inc. per-context counters) */
787         memset(state, 0, PAGE_SIZE);
788
789         /*
790          * The second page of the context object contains some registers which
791          * must be set up prior to the first execution.
792          */
793         __lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
794 }
795
796 static struct i915_vma *
797 __lrc_alloc_state(struct intel_context *ce, struct intel_engine_cs *engine)
798 {
799         struct drm_i915_gem_object *obj;
800         struct i915_vma *vma;
801         u32 context_size;
802
803         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
804
805         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
806                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
807
808         if (INTEL_GEN(engine->i915) == 12) {
809                 ce->wa_bb_page = context_size / PAGE_SIZE;
810                 context_size += PAGE_SIZE;
811         }
812
813         obj = i915_gem_object_create_lmem(engine->i915, context_size, 0);
814         if (IS_ERR(obj))
815                 obj = i915_gem_object_create_shmem(engine->i915, context_size);
816         if (IS_ERR(obj))
817                 return ERR_CAST(obj);
818
819         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
820         if (IS_ERR(vma)) {
821                 i915_gem_object_put(obj);
822                 return vma;
823         }
824
825         return vma;
826 }
827
828 static struct intel_timeline *
829 pinned_timeline(struct intel_context *ce, struct intel_engine_cs *engine)
830 {
831         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
832
833         return intel_timeline_create_from_engine(engine, page_unmask_bits(tl));
834 }
835
836 int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
837 {
838         struct intel_ring *ring;
839         struct i915_vma *vma;
840         int err;
841
842         GEM_BUG_ON(ce->state);
843
844         vma = __lrc_alloc_state(ce, engine);
845         if (IS_ERR(vma))
846                 return PTR_ERR(vma);
847
848         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
849         if (IS_ERR(ring)) {
850                 err = PTR_ERR(ring);
851                 goto err_vma;
852         }
853
854         if (!page_mask_bits(ce->timeline)) {
855                 struct intel_timeline *tl;
856
857                 /*
858                  * Use the static global HWSP for the kernel context, and
859                  * a dynamically allocated cacheline for everyone else.
860                  */
861                 if (unlikely(ce->timeline))
862                         tl = pinned_timeline(ce, engine);
863                 else
864                         tl = intel_timeline_create(engine->gt);
865                 if (IS_ERR(tl)) {
866                         err = PTR_ERR(tl);
867                         goto err_ring;
868                 }
869
870                 ce->timeline = tl;
871         }
872
873         ce->ring = ring;
874         ce->state = vma;
875
876         return 0;
877
878 err_ring:
879         intel_ring_put(ring);
880 err_vma:
881         i915_vma_put(vma);
882         return err;
883 }
884
885 void lrc_reset(struct intel_context *ce)
886 {
887         GEM_BUG_ON(!intel_context_is_pinned(ce));
888
889         intel_ring_reset(ce->ring, ce->ring->emit);
890
891         /* Scrub away the garbage */
892         lrc_init_regs(ce, ce->engine, true);
893         ce->lrc.lrca = lrc_update_regs(ce, ce->engine, ce->ring->tail);
894 }
895
896 int
897 lrc_pre_pin(struct intel_context *ce,
898             struct intel_engine_cs *engine,
899             struct i915_gem_ww_ctx *ww,
900             void **vaddr)
901 {
902         GEM_BUG_ON(!ce->state);
903         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
904
905         *vaddr = i915_gem_object_pin_map(ce->state->obj,
906                                          i915_coherent_map_type(ce->engine->i915) |
907                                          I915_MAP_OVERRIDE);
908
909         return PTR_ERR_OR_ZERO(*vaddr);
910 }
911
912 int
913 lrc_pin(struct intel_context *ce,
914         struct intel_engine_cs *engine,
915         void *vaddr)
916 {
917         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
918
919         if (!__test_and_set_bit(CONTEXT_INIT_BIT, &ce->flags))
920                 lrc_init_state(ce, engine, vaddr);
921
922         ce->lrc.lrca = lrc_update_regs(ce, engine, ce->ring->tail);
923         return 0;
924 }
925
926 void lrc_unpin(struct intel_context *ce)
927 {
928         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
929                       ce->engine);
930 }
931
932 void lrc_post_unpin(struct intel_context *ce)
933 {
934         i915_gem_object_unpin_map(ce->state->obj);
935 }
936
937 void lrc_fini(struct intel_context *ce)
938 {
939         if (!ce->state)
940                 return;
941
942         intel_ring_put(fetch_and_zero(&ce->ring));
943         i915_vma_put(fetch_and_zero(&ce->state));
944 }
945
946 void lrc_destroy(struct kref *kref)
947 {
948         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
949
950         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
951         GEM_BUG_ON(intel_context_is_pinned(ce));
952
953         lrc_fini(ce);
954
955         intel_context_fini(ce);
956         intel_context_free(ce);
957 }
958
959 static u32 *
960 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
961 {
962         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
963                 MI_SRM_LRM_GLOBAL_GTT |
964                 MI_LRI_LRM_CS_MMIO;
965         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
966         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
967                 CTX_TIMESTAMP * sizeof(u32);
968         *cs++ = 0;
969
970         *cs++ = MI_LOAD_REGISTER_REG |
971                 MI_LRR_SOURCE_CS_MMIO |
972                 MI_LRI_LRM_CS_MMIO;
973         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
974         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
975
976         *cs++ = MI_LOAD_REGISTER_REG |
977                 MI_LRR_SOURCE_CS_MMIO |
978                 MI_LRI_LRM_CS_MMIO;
979         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
980         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
981
982         return cs;
983 }
984
985 static u32 *
986 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
987 {
988         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
989
990         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
991                 MI_SRM_LRM_GLOBAL_GTT |
992                 MI_LRI_LRM_CS_MMIO;
993         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
994         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
995                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
996         *cs++ = 0;
997
998         return cs;
999 }
1000
1001 static u32 *
1002 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
1003 {
1004         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
1005
1006         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
1007                 MI_SRM_LRM_GLOBAL_GTT |
1008                 MI_LRI_LRM_CS_MMIO;
1009         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1010         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
1011                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
1012         *cs++ = 0;
1013
1014         *cs++ = MI_LOAD_REGISTER_REG |
1015                 MI_LRR_SOURCE_CS_MMIO |
1016                 MI_LRI_LRM_CS_MMIO;
1017         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
1018         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
1019
1020         return cs;
1021 }
1022
1023 static u32 *
1024 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
1025 {
1026         cs = gen12_emit_timestamp_wa(ce, cs);
1027         cs = gen12_emit_cmd_buf_wa(ce, cs);
1028         cs = gen12_emit_restore_scratch(ce, cs);
1029
1030         return cs;
1031 }
1032
1033 static u32 *
1034 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
1035 {
1036         cs = gen12_emit_timestamp_wa(ce, cs);
1037         cs = gen12_emit_restore_scratch(ce, cs);
1038
1039         return cs;
1040 }
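/*
 * The three emitters above share one pattern: MI_LOAD_REGISTER_MEM loads a
 * value previously saved in the context image into CS GPR0, and
 * MI_LOAD_REGISTER_REG then copies GPR0 into the destination register
 * (CTX_TIMESTAMP or CMD_BUF_CCTL); gen12_emit_restore_scratch() finally
 * reloads GPR0 itself from its saved slot. In other words, the per-context
 * indirect batch re-applies these registers from the saved image when the
 * context is restored.
 */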
1041
1042 static u32 context_wa_bb_offset(const struct intel_context *ce)
1043 {
1044         return PAGE_SIZE * ce->wa_bb_page;
1045 }
1046
1047 static u32 *context_indirect_bb(const struct intel_context *ce)
1048 {
1049         void *ptr;
1050
1051         GEM_BUG_ON(!ce->wa_bb_page);
1052
1053         ptr = ce->lrc_reg_state;
1054         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
1055         ptr += context_wa_bb_offset(ce);
1056
1057         return ptr;
1058 }
1059
1060 static void
1061 setup_indirect_ctx_bb(const struct intel_context *ce,
1062                       const struct intel_engine_cs *engine,
1063                       u32 *(*emit)(const struct intel_context *, u32 *))
1064 {
1065         u32 * const start = context_indirect_bb(ce);
1066         u32 *cs;
1067
1068         cs = emit(ce, start);
1069         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
1070         while ((unsigned long)cs % CACHELINE_BYTES)
1071                 *cs++ = MI_NOOP;
1072
1073         lrc_setup_indirect_ctx(ce->lrc_reg_state, engine,
1074                                i915_ggtt_offset(ce->state) +
1075                                context_wa_bb_offset(ce),
1076                                (cs - start) * sizeof(*cs));
1077 }
1078
1079 /*
1080  * The context descriptor encodes various attributes of a context,
1081  * including its GTT address and some flags. Because it's fairly
1082  * expensive to calculate, we'll just do it once and cache the result,
1083  * which remains valid until the context is unpinned.
1084  *
1085  * This is what a descriptor looks like, from LSB to MSB::
1086  *
1087  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
1088  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
1089  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
1090  *      bits 53-54:    mbz, reserved for use by hardware
1091  *      bits 55-63:    group ID, currently unused and set to 0
1092  *
1093  * Starting from Gen11, the upper dword of the descriptor has a new format:
1094  *
1095  *      bits 32-36:    reserved
1096  *      bits 37-47:    SW context ID
1097  *      bits 48-53:    engine instance
1098  *      bit 54:        mbz, reserved for use by hardware
1099  *      bits 55-60:    SW counter
1100  *      bits 61-63:    engine class
1101  *
1102  * engine info, SW context ID and SW counter need to form a unique number
1103  * (Context ID) per lrc.
1104  */
1105 static u32 lrc_descriptor(const struct intel_context *ce)
1106 {
1107         u32 desc;
1108
1109         desc = INTEL_LEGACY_32B_CONTEXT;
1110         if (i915_vm_is_4lvl(ce->vm))
1111                 desc = INTEL_LEGACY_64B_CONTEXT;
1112         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
1113
1114         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
1115         if (IS_GEN(ce->vm->i915, 8))
1116                 desc |= GEN8_CTX_L3LLC_COHERENT;
1117
1118         return i915_ggtt_offset(ce->state) | desc;
1119 }
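/*
 * For illustration: ce->state is (at least) page aligned in the GGTT, so its
 * offset has bits 0-11 clear and the flag bits (valid, privilege, addressing
 * mode) can simply be OR'd into the low dword; bits 12-31 then carry the
 * LRCA exactly as in the layout described above. The upper dword (SW context
 * ID, engine class/instance, etc.) is expected to be filled in by the
 * submission backend rather than here.
 */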
1120
1121 u32 lrc_update_regs(const struct intel_context *ce,
1122                     const struct intel_engine_cs *engine,
1123                     u32 head)
1124 {
1125         struct intel_ring *ring = ce->ring;
1126         u32 *regs = ce->lrc_reg_state;
1127
1128         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
1129         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
1130
1131         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1132         regs[CTX_RING_HEAD] = head;
1133         regs[CTX_RING_TAIL] = ring->tail;
1134         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1135
1136         /* RPCS */
1137         if (engine->class == RENDER_CLASS) {
1138                 regs[CTX_R_PWR_CLK_STATE] =
1139                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
1140
1141                 i915_oa_init_reg_state(ce, engine);
1142         }
1143
1144         if (ce->wa_bb_page) {
1145                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
1146
1147                 fn = gen12_emit_indirect_ctx_xcs;
1148                 if (ce->engine->class == RENDER_CLASS)
1149                         fn = gen12_emit_indirect_ctx_rcs;
1150
1151                 /* Mutually exclusive with the global indirect bb */
1152                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
1153                 setup_indirect_ctx_bb(ce, engine, fn);
1154         }
1155
1156         return lrc_descriptor(ce) | CTX_DESC_FORCE_RESTORE;
1157 }
1158
1159 void lrc_update_offsets(struct intel_context *ce,
1160                         struct intel_engine_cs *engine)
1161 {
1162         set_offsets(ce->lrc_reg_state, reg_offsets(engine), engine, false);
1163 }
1164
1165 void lrc_check_regs(const struct intel_context *ce,
1166                     const struct intel_engine_cs *engine,
1167                     const char *when)
1168 {
1169         const struct intel_ring *ring = ce->ring;
1170         u32 *regs = ce->lrc_reg_state;
1171         bool valid = true;
1172         int x;
1173
1174         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1175                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1176                        engine->name,
1177                        regs[CTX_RING_START],
1178                        i915_ggtt_offset(ring->vma));
1179                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1180                 valid = false;
1181         }
1182
1183         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1184             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1185                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1186                        engine->name,
1187                        regs[CTX_RING_CTL],
1188                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1189                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1190                 valid = false;
1191         }
1192
1193         x = lrc_ring_mi_mode(engine);
1194         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1195                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1196                        engine->name, regs[x + 1]);
1197                 regs[x + 1] &= ~STOP_RING;
1198                 regs[x + 1] |= STOP_RING << 16;
1199                 valid = false;
1200         }
1201
1202         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1203 }
1204
1205 /*
1206  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
1207  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
1208  * but there is a slight complication as this is applied in the WA batch, where
1209  * the values are only initialized once, so we cannot take the register value at
1210  * the beginning and reuse it further; hence we save its value to memory, upload
1211  * a constant value with bit 21 set and then restore it with the saved value.
1212  * To simplify the WA, a constant value is formed by using the default value
1213  * of this register. This shouldn't be a problem because we are only modifying
1214  * it for a short period and this batch is non-preemptible. We could of course
1215  * use additional instructions that read the actual value of the register
1216  * at that time and set our bit of interest, but that makes the WA complicated.
1217  *
1218  * This WA is also required for Gen9, so extracting it as a function avoids
1219  * code duplication.
1220  */
1221 static u32 *
1222 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
1223 {
1224         /* NB no one else is allowed to scribble over scratch + 256! */
1225         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1226         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1227         *batch++ = intel_gt_scratch_offset(engine->gt,
1228                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1229         *batch++ = 0;
1230
1231         *batch++ = MI_LOAD_REGISTER_IMM(1);
1232         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1233         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
1234
1235         batch = gen8_emit_pipe_control(batch,
1236                                        PIPE_CONTROL_CS_STALL |
1237                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
1238                                        0);
1239
1240         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
1241         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
1242         *batch++ = intel_gt_scratch_offset(engine->gt,
1243                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
1244         *batch++ = 0;
1245
1246         return batch;
1247 }
1248
1249 /*
1250  * Typically we only have one indirect_ctx and per_ctx batch buffer which are
1251  * initialized at the beginning and shared across all contexts, but this field
1252  * helps us to have multiple batches at different offsets and select them based
1253  * on some criterion. At the moment each batch always starts at the beginning of
1254  * the page and we don't have multiple wa_ctx batch buffers.
1255  *
1256  * The number of WAs applied is not known at the beginning; we use this field
1257  * to return the number of DWORDs written.
1258  *
1259  * Note that this batch does not contain MI_BATCH_BUFFER_END,
1260  * so NOOPs are added as padding to make it cacheline aligned.
1261  * MI_BATCH_BUFFER_END will be added to the per-ctx batch and together
1262  * they make a complete batch buffer.
1263  */
1264 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1265 {
1266         /* WaDisableCtxRestoreArbitration:bdw,chv */
1267         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1268
1269         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
1270         if (IS_BROADWELL(engine->i915))
1271                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1272
1273         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
1274         /* Actual scratch location is at 128 bytes offset */
1275         batch = gen8_emit_pipe_control(batch,
1276                                        PIPE_CONTROL_FLUSH_L3 |
1277                                        PIPE_CONTROL_STORE_DATA_INDEX |
1278                                        PIPE_CONTROL_CS_STALL |
1279                                        PIPE_CONTROL_QW_WRITE,
1280                                        LRC_PPHWSP_SCRATCH_ADDR);
1281
1282         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1283
1284         /* Pad to end of cacheline */
1285         while ((unsigned long)batch % CACHELINE_BYTES)
1286                 *batch++ = MI_NOOP;
1287
1288         /*
1289          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
1290          * execution depends on the length specified in terms of cache lines
1291          * in the register CTX_RCS_INDIRECT_CTX
1292          */
1293
1294         return batch;
1295 }
1296
1297 struct lri {
1298         i915_reg_t reg;
1299         u32 value;
1300 };
1301
1302 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
1303 {
1304         GEM_BUG_ON(!count || count > 63);
1305
1306         *batch++ = MI_LOAD_REGISTER_IMM(count);
1307         do {
1308                 *batch++ = i915_mmio_reg_offset(lri->reg);
1309                 *batch++ = lri->value;
1310         } while (lri++, --count);
1311         *batch++ = MI_NOOP;
1312
1313         return batch;
1314 }
1315
1316 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1317 {
1318         static const struct lri lri[] = {
1319                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
1320                 {
1321                         COMMON_SLICE_CHICKEN2,
1322                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
1323                                        0),
1324                 },
1325
1326                 /* BSpec: 11391 */
1327                 {
1328                         FF_SLICE_CHICKEN,
1329                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
1330                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
1331                 },
1332
1333                 /* BSpec: 11299 */
1334                 {
1335                         _3D_CHICKEN3,
1336                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
1337                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
1338                 }
1339         };
1340
1341         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
1342
1343         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
1344         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
1345
1346         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
1347         batch = gen8_emit_pipe_control(batch,
1348                                        PIPE_CONTROL_FLUSH_L3 |
1349                                        PIPE_CONTROL_STORE_DATA_INDEX |
1350                                        PIPE_CONTROL_CS_STALL |
1351                                        PIPE_CONTROL_QW_WRITE,
1352                                        LRC_PPHWSP_SCRATCH_ADDR);
1353
1354         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
1355
1356         /* WaMediaPoolStateCmdInWABB:bxt,glk */
1357         if (HAS_POOLED_EU(engine->i915)) {
1358                 /*
1359                  * EU pool configuration is set up along with the golden
1360                  * context during context initialization. This value depends
1361                  * on device type (2x6 or 3x6) and needs to be updated based
1362                  * on which subslice is disabled, especially for 2x6 devices.
1363                  * However, it is safe to load the default configuration of a
1364                  * 3x6 device instead of masking off the corresponding bits,
1365                  * because the HW ignores bits of a disabled subslice and
1366                  * drops down to the appropriate config. Please see
1367                  * render_state_setup() in i915_gem_render_state.c for the
1368                  * possible configurations; to avoid duplication they are
1369                  * not shown here again.
1370                  */
1371                 *batch++ = GEN9_MEDIA_POOL_STATE;
1372                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
1373                 *batch++ = 0x00777000;
1374                 *batch++ = 0;
1375                 *batch++ = 0;
1376                 *batch++ = 0;
1377         }
1378
1379         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
1380
1381         /* Pad to end of cacheline */
1382         while ((unsigned long)batch % CACHELINE_BYTES)
1383                 *batch++ = MI_NOOP;
1384
1385         return batch;
1386 }
1387
1388 static u32 *
1389 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
1390 {
1391         int i;
1392
1393         /*
1394          * WaPipeControlBefore3DStateSamplePattern: cnl
1395          *
1396          * Ensure the engine is idle prior to programming a
1397          * 3DSTATE_SAMPLE_PATTERN during a context restore.
1398          */
1399         batch = gen8_emit_pipe_control(batch,
1400                                        PIPE_CONTROL_CS_STALL,
1401                                        0);
1402         /*
1403          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
1404          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
1405          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
1406          * confusing. Since gen8_emit_pipe_control() already advances the
1407          * batch by 6 dwords, we advance the other 10 here, completing a
1408          * cacheline. It's not clear if the workaround requires this padding
1409          * before other commands, or if it's just the regular padding we would
1410          * already have for the workaround bb, so leave it here for now.
1411          */
1412         for (i = 0; i < 10; i++)
1413                 *batch++ = MI_NOOP;
1414
1415         /* Pad to end of cacheline */
1416         while ((unsigned long)batch % CACHELINE_BYTES)
1417                 *batch++ = MI_NOOP;
1418
1419         return batch;
1420 }
1421
1422 #define CTX_WA_BB_SIZE (PAGE_SIZE)
1423
1424 static int lrc_create_wa_ctx(struct intel_engine_cs *engine)
1425 {
1426         struct drm_i915_gem_object *obj;
1427         struct i915_vma *vma;
1428         int err;
1429
1430         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_SIZE);
1431         if (IS_ERR(obj))
1432                 return PTR_ERR(obj);
1433
1434         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
1435         if (IS_ERR(vma)) {
1436                 err = PTR_ERR(vma);
1437                 goto err;
1438         }
1439
1440         engine->wa_ctx.vma = vma;
1441         return 0;
1442
1443 err:
1444         i915_gem_object_put(obj);
1445         return err;
1446 }
1447
1448 void lrc_fini_wa_ctx(struct intel_engine_cs *engine)
1449 {
1450         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
1451 }
1452
1453 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
1454
1455 void lrc_init_wa_ctx(struct intel_engine_cs *engine)
1456 {
1457         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
1458         struct i915_wa_ctx_bb *wa_bb[] = {
1459                 &wa_ctx->indirect_ctx, &wa_ctx->per_ctx
1460         };
1461         wa_bb_func_t wa_bb_fn[ARRAY_SIZE(wa_bb)];
1462         struct i915_gem_ww_ctx ww;
1463         void *batch, *batch_ptr;
1464         unsigned int i;
1465         int err;
1466
1467         if (engine->class != RENDER_CLASS)
1468                 return;
1469
1470         switch (INTEL_GEN(engine->i915)) {
1471         case 12:
1472         case 11:
1473                 return;
1474         case 10:
1475                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
1476                 wa_bb_fn[1] = NULL;
1477                 break;
1478         case 9:
1479                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
1480                 wa_bb_fn[1] = NULL;
1481                 break;
1482         case 8:
1483                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
1484                 wa_bb_fn[1] = NULL;
1485                 break;
1486         default:
1487                 MISSING_CASE(INTEL_GEN(engine->i915));
1488                 return;
1489         }
1490
1491         err = lrc_create_wa_ctx(engine);
1492         if (err) {
1493                 /*
1494                  * We continue even if we fail to initialize the WA batch
1495                  * because we only expect rare glitches, nothing critical
1496                  * enough to prevent us from using the GPU.
1497                  */
1498                 drm_err(&engine->i915->drm,
1499                         "Ignoring context switch w/a allocation error:%d\n",
1500                         err);
1501                 return;
1502         }
1503
1504         if (!engine->wa_ctx.vma)
1505                 return;
1506
1507         i915_gem_ww_ctx_init(&ww, true);
1508 retry:
1509         err = i915_gem_object_lock(wa_ctx->vma->obj, &ww);
1510         if (!err)
1511                 err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH);
1512         if (err)
1513                 goto err;
1514
1515         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
1516         if (IS_ERR(batch)) {
1517                 err = PTR_ERR(batch);
1518                 goto err_unpin;
1519         }
1520
1521         /*
1522          * Emit the two workaround batch buffers, recording the offset from the
1523          * start of the workaround batch buffer object for each and their
1524          * respective sizes.
1525          */
1526         batch_ptr = batch;
1527         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
1528                 wa_bb[i]->offset = batch_ptr - batch;
1529                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
1530                                                   CACHELINE_BYTES))) {
1531                         err = -EINVAL;
1532                         break;
1533                 }
1534                 if (wa_bb_fn[i])
1535                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
1536                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
1537         }
1538         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
1539
1540         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
1541         __i915_gem_object_release_map(wa_ctx->vma->obj);
1542
1543         /* Verify that we can handle failure to set up the wa_ctx */
1544         if (!err)
1545                 err = i915_inject_probe_error(engine->i915, -ENODEV);
1546
1547 err_unpin:
1548         if (err)
1549                 i915_vma_unpin(wa_ctx->vma);
1550 err:
1551         if (err == -EDEADLK) {
1552                 err = i915_gem_ww_ctx_backoff(&ww);
1553                 if (!err)
1554                         goto retry;
1555         }
1556         i915_gem_ww_ctx_fini(&ww);
1557
1558         if (err) {
1559                 i915_vma_put(engine->wa_ctx.vma);
1560
1561                 /* Clear all flags to prevent further use */
1562                 memset(wa_ctx, 0, sizeof(*wa_ctx));
1563         }
1564 }
1565
1566 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1567 {
1568 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1569         ce->runtime.num_underflow++;
1570         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1571 #endif
1572 }
1573
1574 void lrc_update_runtime(struct intel_context *ce)
1575 {
1576         u32 old;
1577         s32 dt;
1578
1579         if (intel_context_is_barrier(ce))
1580                 return;
1581
1582         old = ce->runtime.last;
1583         ce->runtime.last = lrc_get_runtime(ce);
1584         dt = ce->runtime.last - old;
1585
1586         if (unlikely(dt < 0)) {
1587                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1588                          old, ce->runtime.last, dt);
1589                 st_update_runtime_underflow(ce, dt);
1590                 return;
1591         }
1592
1593         ewma_runtime_add(&ce->runtime.avg, dt);
1594         ce->runtime.total += dt;
1595 }
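/*
 * For illustration: the saved context timestamp is a 32-bit value, so the
 * unsigned subtraction above handles wraparound naturally, e.g.
 * old = 0xfffffff0 and last = 0x00000010 give dt = 0x20. Only an apparent
 * step backwards (say old = 100, last = 90, i.e. dt = -10 once interpreted
 * as s32) is treated as an underflow and excluded from the totals.
 */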
1596
1597 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1598 #include "selftest_lrc.c"
1599 #endif