drivers/gpu/drm/i915/gt/intel_lrc.c
1 /*
2  * Copyright © 2014 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  * Authors:
24  *    Ben Widawsky <ben@bwidawsk.net>
25  *    Michel Thierry <michel.thierry@intel.com>
26  *    Thomas Daniel <thomas.daniel@intel.com>
27  *    Oscar Mateo <oscar.mateo@intel.com>
28  *
29  */
30
31 /**
32  * DOC: Logical Rings, Logical Ring Contexts and Execlists
33  *
34  * Motivation:
35  * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36  * These expanded contexts enable a number of new abilities, especially
37  * "Execlists" (also implemented in this file).
38  *
39  * One of the main differences with the legacy HW contexts is that logical
40  * ring contexts incorporate many more things into the context's state, like
41  * PDPs or ringbuffer control registers:
42  *
43  * The reason why PDPs are included in the context is straightforward: as
44  * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45  * contained there means you don't need to do a ppgtt->switch_mm yourself;
46  * instead, the GPU will do it for you on the context switch.
47  *
48  * But what about the ringbuffer control registers (head, tail, etc.)?
49  * Shouldn't we just need one set of those per engine command streamer? This is
50  * where the name "Logical Rings" starts to make sense: by virtualizing the
51  * rings, the engine cs shifts to a new "ring buffer" with every context
52  * switch. When you want to submit a workload to the GPU you: A) choose your
53  * context, B) find its appropriate virtualized ring, C) write commands to it
54  * and then, finally, D) tell the GPU to switch to that context.
55  *
56  * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57  * to a context is via a context execution list, ergo "Execlists".
58  *
59  * LRC implementation:
60  * Regarding the creation of contexts, we have:
61  *
62  * - One global default context.
63  * - One local default context for each opened fd.
64  * - One local extra context for each context create ioctl call.
65  *
66  * Now that ringbuffers belong per-context (and not per-engine, like before)
67  * and that contexts are uniquely tied to a given engine (and not reusable,
68  * like before) we need:
69  *
70  * - One ringbuffer per-engine inside each context.
71  * - One backing object per-engine inside each context.
72  *
73  * The global default context starts its life with these new objects fully
74  * allocated and populated. The local default context for each opened fd is
75  * more complex, because we don't know at creation time which engine is going
76  * to use them. To handle this, we have implemented a deferred creation of LR
77  * contexts:
78  *
79  * The local context starts its life as a hollow or blank holder, that only
80  * gets populated for a given engine once we receive an execbuffer. If later
81  * on we receive another execbuffer ioctl for the same context but a different
82  * engine, we allocate/populate a new ringbuffer and context backing object and
83  * so on.
84  *
85  * Finally, regarding local contexts created using the ioctl call: as they are
86  * only allowed with the render ring, we can allocate & populate them right
87  * away (no need to defer anything, at least for now).
88  *
89  * Execlists implementation:
90  * Execlists are the new method by which, on gen8+ hardware, workloads are
91  * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92  * This method works as follows:
93  *
94  * When a request is committed, its commands (the BB start and any leading or
95  * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96  * for the appropriate context. The tail pointer in the hardware context is not
97  * updated at this time, but instead, kept by the driver in the ringbuffer
98  * structure. A structure representing this request is added to a request queue
99  * for the appropriate engine: this structure contains a copy of the context's
100  * tail after the request was written to the ring buffer and a pointer to the
101  * context itself.
102  *
103  * If the engine's request queue was empty before the request was added, the
104  * queue is processed immediately. Otherwise the queue will be processed during
105  * a context switch interrupt. In any case, elements on the queue will get sent
106  * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107  * globally unique 20-bit submission ID.
108  *
109  * When execution of a request completes, the GPU updates the context status
110  * buffer with a context complete event and generates a context switch interrupt.
111  * During the interrupt handling, the driver examines the events in the buffer:
112  * for each context complete event, if the announced ID matches that on the head
113  * of the request queue, then that request is retired and removed from the queue.
114  *
115  * After processing, if any requests were retired and the queue is not empty
116  * then a new execution list can be submitted. The two requests at the front of
117  * the queue are next to be submitted but since a context may not occur twice in
118  * an execution list, if subsequent requests have the same ID as the first then
119  * the two requests must be combined. This is done simply by discarding requests
120  * at the head of the queue until either only one request is left (in which case
121  * we use a NULL second context) or the first two requests have unique IDs.
122  *
123  * By always executing the first two requests in the queue the driver ensures
124  * that the GPU is kept as busy as possible. In the case where a single context
125  * completes but a second context is still executing, the request for this second
126  * context will be at the head of the queue when we remove the first one. This
127  * request will then be resubmitted along with a new request for a different context,
128  * which will cause the hardware to continue executing the second request and queue
129  * the new request (the GPU detects the condition of a context getting preempted
130  * with the same context and optimizes the context switch flow by not doing
131  * preemption, but just sampling the new tail pointer).
132  *
133  */
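/*
 * Illustrative sketch only (not the driver's actual dequeue code): assuming
 * a request queue already sorted by priority, the pairing rule described
 * above reduces to submitting the head request plus the next queued request
 * that belongs to a *different* context (or NULL if there is none). The
 * helper names below are hypothetical:
 *
 *	elsp[0] = first_request(queue);
 *	elsp[1] = NULL;
 *	for_each_subsequent_request(rq, queue) {
 *		if (rq->context != elsp[0]->context) {
 *			elsp[1] = rq;
 *			break;
 *		}
 *	}
 *	write_elsp(elsp);
 *
 * Requests skipped because they share elsp[0]'s context still execute, as
 * only the last RING_TAIL written for that context matters.
 */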
134 #include <linux/interrupt.h>
135
136 #include "i915_drv.h"
137 #include "i915_perf.h"
138 #include "i915_trace.h"
139 #include "i915_vgpu.h"
140 #include "intel_breadcrumbs.h"
141 #include "intel_context.h"
142 #include "intel_engine_pm.h"
143 #include "intel_gt.h"
144 #include "intel_gt_pm.h"
145 #include "intel_gt_requests.h"
146 #include "intel_lrc_reg.h"
147 #include "intel_mocs.h"
148 #include "intel_reset.h"
149 #include "intel_ring.h"
150 #include "intel_workarounds.h"
151 #include "shmem_utils.h"
152
153 #define RING_EXECLIST_QFULL             (1 << 0x2)
154 #define RING_EXECLIST1_VALID            (1 << 0x3)
155 #define RING_EXECLIST0_VALID            (1 << 0x4)
156 #define RING_EXECLIST_ACTIVE_STATUS     (3 << 0xE)
157 #define RING_EXECLIST1_ACTIVE           (1 << 0x11)
158 #define RING_EXECLIST0_ACTIVE           (1 << 0x12)
159
160 #define GEN8_CTX_STATUS_IDLE_ACTIVE     (1 << 0)
161 #define GEN8_CTX_STATUS_PREEMPTED       (1 << 1)
162 #define GEN8_CTX_STATUS_ELEMENT_SWITCH  (1 << 2)
163 #define GEN8_CTX_STATUS_ACTIVE_IDLE     (1 << 3)
164 #define GEN8_CTX_STATUS_COMPLETE        (1 << 4)
165 #define GEN8_CTX_STATUS_LITE_RESTORE    (1 << 15)
166
167 #define GEN8_CTX_STATUS_COMPLETED_MASK \
168          (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
169
170 #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
171
172 #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE  (0x1) /* lower csb dword */
173 #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
174 #define GEN12_CSB_SW_CTX_ID_MASK                GENMASK(25, 15)
175 #define GEN12_IDLE_CTX_ID               0x7FF
176 #define GEN12_CSB_CTX_VALID(csb_dw) \
177         (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
178
179 /* Typical size of the average request (2 pipecontrols and a MI_BB) */
180 #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
181
182 struct virtual_engine {
183         struct intel_engine_cs base;
184         struct intel_context context;
185         struct rcu_work rcu;
186
187         /*
188          * We allow only a single request through the virtual engine at a time
189          * (each request in the timeline waits for the completion fence of
190          * the previous before being submitted). By restricting ourselves to
191          * only submitting a single request, each request is placed on to a
192          * physical engine to maximise load spreading (by virtue of the late greedy
193          * scheduling -- each real engine takes the next available request
194          * upon idling).
195          */
196         struct i915_request *request;
197
198         /*
199          * We keep a rbtree of available virtual engines inside each physical
200          * engine, sorted by priority. Here we preallocate the nodes we need
201          * for the virtual engine, indexed by physical_engine->id.
202          */
203         struct ve_node {
204                 struct rb_node rb;
205                 int prio;
206         } nodes[I915_NUM_ENGINES];
207
208         /*
209          * Keep track of bonded pairs -- restrictions upon our selection
210          * of physical engines any particular request may be submitted to.
211          * If we receive a submit-fence from a master engine, we will only
212          * use one of sibling_mask physical engines.
213          */
214         struct ve_bond {
215                 const struct intel_engine_cs *master;
216                 intel_engine_mask_t sibling_mask;
217         } *bonds;
218         unsigned int num_bonds;
219
220         /* And finally, which physical engines this virtual engine maps onto. */
221         unsigned int num_siblings;
222         struct intel_engine_cs *siblings[];
223 };
224
225 static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
226 {
227         GEM_BUG_ON(!intel_engine_is_virtual(engine));
228         return container_of(engine, struct virtual_engine, base);
229 }
230
231 static int __execlists_context_alloc(struct intel_context *ce,
232                                      struct intel_engine_cs *engine);
233
234 static void execlists_init_reg_state(u32 *reg_state,
235                                      const struct intel_context *ce,
236                                      const struct intel_engine_cs *engine,
237                                      const struct intel_ring *ring,
238                                      bool close);
239 static void
240 __execlists_update_reg_state(const struct intel_context *ce,
241                              const struct intel_engine_cs *engine,
242                              u32 head);
243
244 static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
245 {
246         if (INTEL_GEN(engine->i915) >= 12)
247                 return 0x60;
248         else if (INTEL_GEN(engine->i915) >= 9)
249                 return 0x54;
250         else if (engine->class == RENDER_CLASS)
251                 return 0x58;
252         else
253                 return -1;
254 }
255
256 static int lrc_ring_gpr0(const struct intel_engine_cs *engine)
257 {
258         if (INTEL_GEN(engine->i915) >= 12)
259                 return 0x74;
260         else if (INTEL_GEN(engine->i915) >= 9)
261                 return 0x68;
262         else if (engine->class == RENDER_CLASS)
263                 return 0xd8;
264         else
265                 return -1;
266 }
267
268 static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine)
269 {
270         if (INTEL_GEN(engine->i915) >= 12)
271                 return 0x12;
272         else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS)
273                 return 0x18;
274         else
275                 return -1;
276 }
277
278 static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine)
279 {
280         int x;
281
282         x = lrc_ring_wa_bb_per_ctx(engine);
283         if (x < 0)
284                 return x;
285
286         return x + 2;
287 }
288
289 static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine)
290 {
291         int x;
292
293         x = lrc_ring_indirect_ptr(engine);
294         if (x < 0)
295                 return x;
296
297         return x + 2;
298 }
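/*
 * Worked example of the layout implied above: each saved register occupies
 * an (offset, value) dword pair in the context image, hence the "+ 2" steps.
 * On Gen12, lrc_ring_wa_bb_per_ctx() returns 0x12, so the INDIRECT_CTX
 * pointer sits at dword 0x14 and INDIRECT_CTX_OFFSET at dword 0x16; on
 * Gen9/render it is 0x18 -> 0x1a -> 0x1c.
 */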
299
300 static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine)
301 {
302         if (engine->class != RENDER_CLASS)
303                 return -1;
304
305         if (INTEL_GEN(engine->i915) >= 12)
306                 return 0xb6;
307         else if (INTEL_GEN(engine->i915) >= 11)
308                 return 0xaa;
309         else
310                 return -1;
311 }
312
313 static u32
314 lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine)
315 {
316         switch (INTEL_GEN(engine->i915)) {
317         default:
318                 MISSING_CASE(INTEL_GEN(engine->i915));
319                 fallthrough;
320         case 12:
321                 return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
322         case 11:
323                 return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
324         case 10:
325                 return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
326         case 9:
327                 return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
328         case 8:
329                 return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
330         }
331 }
332
333 static void
334 lrc_ring_setup_indirect_ctx(u32 *regs,
335                             const struct intel_engine_cs *engine,
336                             u32 ctx_bb_ggtt_addr,
337                             u32 size)
338 {
339         GEM_BUG_ON(!size);
340         GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES));
341         GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1);
342         regs[lrc_ring_indirect_ptr(engine) + 1] =
343                 ctx_bb_ggtt_addr | (size / CACHELINE_BYTES);
344
345         GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1);
346         regs[lrc_ring_indirect_offset(engine) + 1] =
347                 lrc_ring_indirect_offset_default(engine) << 6;
348 }
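/*
 * For example (illustrative numbers only): a 128 byte (2 cacheline) context
 * batch at GGTT address 0x1000 results in
 *
 *	regs[lrc_ring_indirect_ptr(engine) + 1] = 0x1000 | 2;
 *
 * i.e. the size in cachelines is packed into the low bits of the pointer,
 * which is why the size must be a non-zero multiple of CACHELINE_BYTES.
 */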
349
350 static u32 intel_context_get_runtime(const struct intel_context *ce)
351 {
352         /*
353          * We can use either ppHWSP[16] which is recorded before the context
354          * switch (and so excludes the cost of context switches) or use the
355          * value from the context image itself, which is saved/restored earlier
356          * and so includes the cost of the save.
357          */
358         return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
359 }
360
361 static void mark_eio(struct i915_request *rq)
362 {
363         if (i915_request_completed(rq))
364                 return;
365
366         GEM_BUG_ON(i915_request_signaled(rq));
367
368         i915_request_set_error_once(rq, -EIO);
369         i915_request_mark_complete(rq);
370 }
371
372 static struct i915_request *
373 active_request(const struct intel_timeline * const tl, struct i915_request *rq)
374 {
375         struct i915_request *active = rq;
376
377         rcu_read_lock();
378         list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
379                 if (i915_request_completed(rq))
380                         break;
381
382                 active = rq;
383         }
384         rcu_read_unlock();
385
386         return active;
387 }
388
389 static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
390 {
391         return (i915_ggtt_offset(engine->status_page.vma) +
392                 I915_GEM_HWS_PREEMPT_ADDR);
393 }
394
395 static inline void
396 ring_set_paused(const struct intel_engine_cs *engine, int state)
397 {
398         /*
399          * We inspect HWS_PREEMPT with a semaphore inside
400          * engine->emit_fini_breadcrumb. If the dword is true,
401          * the ring is paused as the semaphore will busywait
402          * until the dword is false.
403          */
404         engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
405         if (state)
406                 wmb();
407 }
408
409 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
410 {
411         return rb_entry(rb, struct i915_priolist, node);
412 }
413
414 static inline int rq_prio(const struct i915_request *rq)
415 {
416         return READ_ONCE(rq->sched.attr.priority);
417 }
418
419 static int effective_prio(const struct i915_request *rq)
420 {
421         int prio = rq_prio(rq);
422
423         /*
424          * If this request is special and must not be interrupted at any
425          * cost, so be it. Note we are only checking the most recent request
426          * in the context and so may be masking an earlier vip request. It
427          * is hoped that under the conditions where nopreempt is used, this
428          * will not matter (i.e. all requests to that context will be
429          * nopreempt for as long as desired).
430          */
431         if (i915_request_has_nopreempt(rq))
432                 prio = I915_PRIORITY_UNPREEMPTABLE;
433
434         return prio;
435 }
436
437 static int queue_prio(const struct intel_engine_execlists *execlists)
438 {
439         struct i915_priolist *p;
440         struct rb_node *rb;
441
442         rb = rb_first_cached(&execlists->queue);
443         if (!rb)
444                 return INT_MIN;
445
446         /*
447          * As the priolist[] are inverted, with the highest priority in [0],
448          * we have to flip the index value to recover the priority.
449          */
450         p = to_priolist(rb);
451         if (!I915_USER_PRIORITY_SHIFT)
452                 return p->priority;
453
454         return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
455 }
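/*
 * Numeric illustration of the packing above, assuming (purely for the
 * example) I915_USER_PRIORITY_SHIFT == 2: a priolist with p->priority == 0
 * and only sublevel [0] in use (ffs(p->used) == 1) yields
 * ((0 + 1) << 2) - 1 == 3, i.e. user priority 0 promoted to the highest of
 * its internal sublevels, consistent with the inverted priolist[] indexing.
 */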
456
457 static inline bool need_preempt(const struct intel_engine_cs *engine,
458                                 const struct i915_request *rq,
459                                 struct rb_node *rb)
460 {
461         int last_prio;
462
463         if (!intel_engine_has_semaphores(engine))
464                 return false;
465
466         /*
467          * Check if the current priority hint merits a preemption attempt.
468          *
469          * We record the highest value priority we saw during rescheduling
470          * prior to this dequeue, therefore we know that if it is strictly
471          * less than the current tail of ELSP[0], we do not need to force
472          * a preempt-to-idle cycle.
473          *
474          * However, the priority hint is a mere hint that we may need to
475          * preempt. If that hint is stale or we may be trying to preempt
476          * ourselves, ignore the request.
477          *
478          * More naturally we would write
479          *      prio >= max(0, last);
480          * except that we wish to prevent triggering preemption at the same
481          * priority level: the task that is running should remain running
482          * to preserve FIFO ordering of dependencies.
483          */
484         last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
485         if (engine->execlists.queue_priority_hint <= last_prio)
486                 return false;
487
488         /*
489          * Check against the first request in ELSP[1], it will, thanks to the
490          * power of PI, be the highest priority of that context.
491          */
492         if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
493             rq_prio(list_next_entry(rq, sched.link)) > last_prio)
494                 return true;
495
496         if (rb) {
497                 struct virtual_engine *ve =
498                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
499                 bool preempt = false;
500
501                 if (engine == ve->siblings[0]) { /* only preempt one sibling */
502                         struct i915_request *next;
503
504                         rcu_read_lock();
505                         next = READ_ONCE(ve->request);
506                         if (next)
507                                 preempt = rq_prio(next) > last_prio;
508                         rcu_read_unlock();
509                 }
510
511                 if (preempt)
512                         return preempt;
513         }
514
515         /*
516          * If the inflight context did not trigger the preemption, then maybe
517          * it was the set of queued requests? Pick the highest priority in
518          * the queue (the first active priolist) and see if it deserves to be
519          * running instead of ELSP[0].
520          *
521          * The highest priority request in the queue cannot be either
522          * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
523          * context, its priority would not exceed ELSP[0] aka last_prio.
524          */
525         return queue_prio(&engine->execlists) > last_prio;
526 }
527
528 __maybe_unused static inline bool
529 assert_priority_queue(const struct i915_request *prev,
530                       const struct i915_request *next)
531 {
532         /*
533          * Without preemption, the prev may refer to the still active element
534          * which we refuse to let go.
535          *
536          * Even with preemption, there are times when we think it is better not
537          * to preempt and leave an ostensibly lower priority request in flight.
538          */
539         if (i915_request_is_active(prev))
540                 return true;
541
542         return rq_prio(prev) >= rq_prio(next);
543 }
544
545 /*
546  * The context descriptor encodes various attributes of a context,
547  * including its GTT address and some flags. Because it's fairly
548  * expensive to calculate, we'll just do it once and cache the result,
549  * which remains valid until the context is unpinned.
550  *
551  * This is what a descriptor looks like, from LSB to MSB::
552  *
553  *      bits  0-11:    flags, GEN8_CTX_* (cached in ctx->desc_template)
554  *      bits 12-31:    LRCA, GTT address of (the HWSP of) this context
555  *      bits 32-52:    ctx ID, a globally unique tag (highest bit used by GuC)
556  *      bits 53-54:    mbz, reserved for use by hardware
557  *      bits 55-63:    group ID, currently unused and set to 0
558  *
559  * Starting from Gen11, the upper dword of the descriptor has a new format:
560  *
561  *      bits 32-36:    reserved
562  *      bits 37-47:    SW context ID
563  *      bits 48-53:    engine instance
564  *      bit 54:        mbz, reserved for use by hardware
565  *      bits 55-60:    SW counter
566  *      bits 61-63:    engine class
567  *
568  * engine info, SW context ID and SW counter need to form a unique number
569  * (Context ID) per lrc.
570  */
571 static u32
572 lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
573 {
574         u32 desc;
575
576         desc = INTEL_LEGACY_32B_CONTEXT;
577         if (i915_vm_is_4lvl(ce->vm))
578                 desc = INTEL_LEGACY_64B_CONTEXT;
579         desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
580
581         desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
582         if (IS_GEN(engine->i915, 8))
583                 desc |= GEN8_CTX_L3LLC_COHERENT;
584
585         return i915_ggtt_offset(ce->state) | desc;
586 }
587
588 static inline unsigned int dword_in_page(void *addr)
589 {
590         return offset_in_page(addr) / sizeof(u32);
591 }
592
593 static void set_offsets(u32 *regs,
594                         const u8 *data,
595                         const struct intel_engine_cs *engine,
596                         bool clear)
597 #define NOP(x) (BIT(7) | (x))
598 #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
599 #define POSTED BIT(0)
600 #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
601 #define REG16(x) \
602         (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
603         (((x) >> 2) & 0x7f)
604 #define END(total_state_size) 0, (total_state_size)
605 {
606         const u32 base = engine->mmio_base;
607
608         while (*data) {
609                 u8 count, flags;
610
611                 if (*data & BIT(7)) { /* skip */
612                         count = *data++ & ~BIT(7);
613                         if (clear)
614                                 memset32(regs, MI_NOOP, count);
615                         regs += count;
616                         continue;
617                 }
618
619                 count = *data & 0x3f;
620                 flags = *data >> 6;
621                 data++;
622
623                 *regs = MI_LOAD_REGISTER_IMM(count);
624                 if (flags & POSTED)
625                         *regs |= MI_LRI_FORCE_POSTED;
626                 if (INTEL_GEN(engine->i915) >= 11)
627                         *regs |= MI_LRI_LRM_CS_MMIO;
628                 regs++;
629
630                 GEM_BUG_ON(!count);
631                 do {
632                         u32 offset = 0;
633                         u8 v;
634
635                         do {
636                                 v = *data++;
637                                 offset <<= 7;
638                                 offset |= v & ~BIT(7);
639                         } while (v & BIT(7));
640
641                         regs[0] = base + (offset << 2);
642                         if (clear)
643                                 regs[1] = 0;
644                         regs += 2;
645                 } while (--count);
646         }
647
648         if (clear) {
649                 u8 count = *++data;
650
651                 /* Clear past the tail for HW access */
652                 GEM_BUG_ON(dword_in_page(regs) > count);
653                 memset32(regs, MI_NOOP, count - dword_in_page(regs));
654
655                 /* Close the batch; used mainly by live_lrc_layout() */
656                 *regs = MI_BATCH_BUFFER_END;
657                 if (INTEL_GEN(engine->i915) >= 10)
658                         *regs |= BIT(0);
659         }
660 }
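/*
 * Worked example of the encoding consumed above: REG16(0x244) expands to
 * the two bytes { 0x81, 0x11 }. The decoder sees BIT(7) set in 0x81, so it
 * keeps (0x81 & 0x7f) == 0x01, shifts it left by 7 and ORs in 0x11 to get
 * 0x91; the register offset is then 0x91 << 2 == 0x244 from mmio_base.
 * Single byte entries such as REG(0x034) == 0x0d decode directly to
 * 0x0d << 2 == 0x34.
 */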
661
662 static const u8 gen8_xcs_offsets[] = {
663         NOP(1),
664         LRI(11, 0),
665         REG16(0x244),
666         REG(0x034),
667         REG(0x030),
668         REG(0x038),
669         REG(0x03c),
670         REG(0x168),
671         REG(0x140),
672         REG(0x110),
673         REG(0x11c),
674         REG(0x114),
675         REG(0x118),
676
677         NOP(9),
678         LRI(9, 0),
679         REG16(0x3a8),
680         REG16(0x28c),
681         REG16(0x288),
682         REG16(0x284),
683         REG16(0x280),
684         REG16(0x27c),
685         REG16(0x278),
686         REG16(0x274),
687         REG16(0x270),
688
689         NOP(13),
690         LRI(2, 0),
691         REG16(0x200),
692         REG(0x028),
693
694         END(80)
695 };
696
697 static const u8 gen9_xcs_offsets[] = {
698         NOP(1),
699         LRI(14, POSTED),
700         REG16(0x244),
701         REG(0x034),
702         REG(0x030),
703         REG(0x038),
704         REG(0x03c),
705         REG(0x168),
706         REG(0x140),
707         REG(0x110),
708         REG(0x11c),
709         REG(0x114),
710         REG(0x118),
711         REG(0x1c0),
712         REG(0x1c4),
713         REG(0x1c8),
714
715         NOP(3),
716         LRI(9, POSTED),
717         REG16(0x3a8),
718         REG16(0x28c),
719         REG16(0x288),
720         REG16(0x284),
721         REG16(0x280),
722         REG16(0x27c),
723         REG16(0x278),
724         REG16(0x274),
725         REG16(0x270),
726
727         NOP(13),
728         LRI(1, POSTED),
729         REG16(0x200),
730
731         NOP(13),
732         LRI(44, POSTED),
733         REG(0x028),
734         REG(0x09c),
735         REG(0x0c0),
736         REG(0x178),
737         REG(0x17c),
738         REG16(0x358),
739         REG(0x170),
740         REG(0x150),
741         REG(0x154),
742         REG(0x158),
743         REG16(0x41c),
744         REG16(0x600),
745         REG16(0x604),
746         REG16(0x608),
747         REG16(0x60c),
748         REG16(0x610),
749         REG16(0x614),
750         REG16(0x618),
751         REG16(0x61c),
752         REG16(0x620),
753         REG16(0x624),
754         REG16(0x628),
755         REG16(0x62c),
756         REG16(0x630),
757         REG16(0x634),
758         REG16(0x638),
759         REG16(0x63c),
760         REG16(0x640),
761         REG16(0x644),
762         REG16(0x648),
763         REG16(0x64c),
764         REG16(0x650),
765         REG16(0x654),
766         REG16(0x658),
767         REG16(0x65c),
768         REG16(0x660),
769         REG16(0x664),
770         REG16(0x668),
771         REG16(0x66c),
772         REG16(0x670),
773         REG16(0x674),
774         REG16(0x678),
775         REG16(0x67c),
776         REG(0x068),
777
778         END(176)
779 };
780
781 static const u8 gen12_xcs_offsets[] = {
782         NOP(1),
783         LRI(13, POSTED),
784         REG16(0x244),
785         REG(0x034),
786         REG(0x030),
787         REG(0x038),
788         REG(0x03c),
789         REG(0x168),
790         REG(0x140),
791         REG(0x110),
792         REG(0x1c0),
793         REG(0x1c4),
794         REG(0x1c8),
795         REG(0x180),
796         REG16(0x2b4),
797
798         NOP(5),
799         LRI(9, POSTED),
800         REG16(0x3a8),
801         REG16(0x28c),
802         REG16(0x288),
803         REG16(0x284),
804         REG16(0x280),
805         REG16(0x27c),
806         REG16(0x278),
807         REG16(0x274),
808         REG16(0x270),
809
810         END(80)
811 };
812
813 static const u8 gen8_rcs_offsets[] = {
814         NOP(1),
815         LRI(14, POSTED),
816         REG16(0x244),
817         REG(0x034),
818         REG(0x030),
819         REG(0x038),
820         REG(0x03c),
821         REG(0x168),
822         REG(0x140),
823         REG(0x110),
824         REG(0x11c),
825         REG(0x114),
826         REG(0x118),
827         REG(0x1c0),
828         REG(0x1c4),
829         REG(0x1c8),
830
831         NOP(3),
832         LRI(9, POSTED),
833         REG16(0x3a8),
834         REG16(0x28c),
835         REG16(0x288),
836         REG16(0x284),
837         REG16(0x280),
838         REG16(0x27c),
839         REG16(0x278),
840         REG16(0x274),
841         REG16(0x270),
842
843         NOP(13),
844         LRI(1, 0),
845         REG(0x0c8),
846
847         END(80)
848 };
849
850 static const u8 gen9_rcs_offsets[] = {
851         NOP(1),
852         LRI(14, POSTED),
853         REG16(0x244),
854         REG(0x34),
855         REG(0x30),
856         REG(0x38),
857         REG(0x3c),
858         REG(0x168),
859         REG(0x140),
860         REG(0x110),
861         REG(0x11c),
862         REG(0x114),
863         REG(0x118),
864         REG(0x1c0),
865         REG(0x1c4),
866         REG(0x1c8),
867
868         NOP(3),
869         LRI(9, POSTED),
870         REG16(0x3a8),
871         REG16(0x28c),
872         REG16(0x288),
873         REG16(0x284),
874         REG16(0x280),
875         REG16(0x27c),
876         REG16(0x278),
877         REG16(0x274),
878         REG16(0x270),
879
880         NOP(13),
881         LRI(1, 0),
882         REG(0xc8),
883
884         NOP(13),
885         LRI(44, POSTED),
886         REG(0x28),
887         REG(0x9c),
888         REG(0xc0),
889         REG(0x178),
890         REG(0x17c),
891         REG16(0x358),
892         REG(0x170),
893         REG(0x150),
894         REG(0x154),
895         REG(0x158),
896         REG16(0x41c),
897         REG16(0x600),
898         REG16(0x604),
899         REG16(0x608),
900         REG16(0x60c),
901         REG16(0x610),
902         REG16(0x614),
903         REG16(0x618),
904         REG16(0x61c),
905         REG16(0x620),
906         REG16(0x624),
907         REG16(0x628),
908         REG16(0x62c),
909         REG16(0x630),
910         REG16(0x634),
911         REG16(0x638),
912         REG16(0x63c),
913         REG16(0x640),
914         REG16(0x644),
915         REG16(0x648),
916         REG16(0x64c),
917         REG16(0x650),
918         REG16(0x654),
919         REG16(0x658),
920         REG16(0x65c),
921         REG16(0x660),
922         REG16(0x664),
923         REG16(0x668),
924         REG16(0x66c),
925         REG16(0x670),
926         REG16(0x674),
927         REG16(0x678),
928         REG16(0x67c),
929         REG(0x68),
930
931         END(176)
932 };
933
934 static const u8 gen11_rcs_offsets[] = {
935         NOP(1),
936         LRI(15, POSTED),
937         REG16(0x244),
938         REG(0x034),
939         REG(0x030),
940         REG(0x038),
941         REG(0x03c),
942         REG(0x168),
943         REG(0x140),
944         REG(0x110),
945         REG(0x11c),
946         REG(0x114),
947         REG(0x118),
948         REG(0x1c0),
949         REG(0x1c4),
950         REG(0x1c8),
951         REG(0x180),
952
953         NOP(1),
954         LRI(9, POSTED),
955         REG16(0x3a8),
956         REG16(0x28c),
957         REG16(0x288),
958         REG16(0x284),
959         REG16(0x280),
960         REG16(0x27c),
961         REG16(0x278),
962         REG16(0x274),
963         REG16(0x270),
964
965         LRI(1, POSTED),
966         REG(0x1b0),
967
968         NOP(10),
969         LRI(1, 0),
970         REG(0x0c8),
971
972         END(80)
973 };
974
975 static const u8 gen12_rcs_offsets[] = {
976         NOP(1),
977         LRI(13, POSTED),
978         REG16(0x244),
979         REG(0x034),
980         REG(0x030),
981         REG(0x038),
982         REG(0x03c),
983         REG(0x168),
984         REG(0x140),
985         REG(0x110),
986         REG(0x1c0),
987         REG(0x1c4),
988         REG(0x1c8),
989         REG(0x180),
990         REG16(0x2b4),
991
992         NOP(5),
993         LRI(9, POSTED),
994         REG16(0x3a8),
995         REG16(0x28c),
996         REG16(0x288),
997         REG16(0x284),
998         REG16(0x280),
999         REG16(0x27c),
1000         REG16(0x278),
1001         REG16(0x274),
1002         REG16(0x270),
1003
1004         LRI(3, POSTED),
1005         REG(0x1b0),
1006         REG16(0x5a8),
1007         REG16(0x5ac),
1008
1009         NOP(6),
1010         LRI(1, 0),
1011         REG(0x0c8),
1012         NOP(3 + 9 + 1),
1013
1014         LRI(51, POSTED),
1015         REG16(0x588),
1016         REG16(0x588),
1017         REG16(0x588),
1018         REG16(0x588),
1019         REG16(0x588),
1020         REG16(0x588),
1021         REG(0x028),
1022         REG(0x09c),
1023         REG(0x0c0),
1024         REG(0x178),
1025         REG(0x17c),
1026         REG16(0x358),
1027         REG(0x170),
1028         REG(0x150),
1029         REG(0x154),
1030         REG(0x158),
1031         REG16(0x41c),
1032         REG16(0x600),
1033         REG16(0x604),
1034         REG16(0x608),
1035         REG16(0x60c),
1036         REG16(0x610),
1037         REG16(0x614),
1038         REG16(0x618),
1039         REG16(0x61c),
1040         REG16(0x620),
1041         REG16(0x624),
1042         REG16(0x628),
1043         REG16(0x62c),
1044         REG16(0x630),
1045         REG16(0x634),
1046         REG16(0x638),
1047         REG16(0x63c),
1048         REG16(0x640),
1049         REG16(0x644),
1050         REG16(0x648),
1051         REG16(0x64c),
1052         REG16(0x650),
1053         REG16(0x654),
1054         REG16(0x658),
1055         REG16(0x65c),
1056         REG16(0x660),
1057         REG16(0x664),
1058         REG16(0x668),
1059         REG16(0x66c),
1060         REG16(0x670),
1061         REG16(0x674),
1062         REG16(0x678),
1063         REG16(0x67c),
1064         REG(0x068),
1065         REG(0x084),
1066         NOP(1),
1067
1068         END(192)
1069 };
1070
1071 #undef END
1072 #undef REG16
1073 #undef REG
1074 #undef LRI
1075 #undef NOP
1076
1077 static const u8 *reg_offsets(const struct intel_engine_cs *engine)
1078 {
1079         /*
1080          * The gen12+ lists only have the registers we program in the basic
1081          * default state. We rely on the context image using relative
1082          * addressing to automatically fix up the register state between the
1083          * physical engines for the virtual engine.
1084          */
1085         GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
1086                    !intel_engine_has_relative_mmio(engine));
1087
1088         if (engine->class == RENDER_CLASS) {
1089                 if (INTEL_GEN(engine->i915) >= 12)
1090                         return gen12_rcs_offsets;
1091                 else if (INTEL_GEN(engine->i915) >= 11)
1092                         return gen11_rcs_offsets;
1093                 else if (INTEL_GEN(engine->i915) >= 9)
1094                         return gen9_rcs_offsets;
1095                 else
1096                         return gen8_rcs_offsets;
1097         } else {
1098                 if (INTEL_GEN(engine->i915) >= 12)
1099                         return gen12_xcs_offsets;
1100                 else if (INTEL_GEN(engine->i915) >= 9)
1101                         return gen9_xcs_offsets;
1102                 else
1103                         return gen8_xcs_offsets;
1104         }
1105 }
1106
1107 static struct i915_request *
1108 __unwind_incomplete_requests(struct intel_engine_cs *engine)
1109 {
1110         struct i915_request *rq, *rn, *active = NULL;
1111         struct list_head *pl;
1112         int prio = I915_PRIORITY_INVALID;
1113
1114         lockdep_assert_held(&engine->active.lock);
1115
1116         list_for_each_entry_safe_reverse(rq, rn,
1117                                          &engine->active.requests,
1118                                          sched.link) {
1119                 if (i915_request_completed(rq))
1120                         continue; /* XXX */
1121
1122                 __i915_request_unsubmit(rq);
1123
1124                 /*
1125                  * Push the request back into the queue for later resubmission.
1126                  * If this request is not native to this physical engine (i.e.
1127                  * it came from a virtual source), push it back onto the virtual
1128                  * engine so that it can be moved across onto another physical
1129                  * engine as load dictates.
1130                  */
1131                 if (likely(rq->execution_mask == engine->mask)) {
1132                         GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
1133                         if (rq_prio(rq) != prio) {
1134                                 prio = rq_prio(rq);
1135                                 pl = i915_sched_lookup_priolist(engine, prio);
1136                         }
1137                         GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
1138
1139                         list_move(&rq->sched.link, pl);
1140                         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
1141
1142                         /* Check in case we rollback so far we wrap [size/2] */
1143                         if (intel_ring_direction(rq->ring,
1144                                                  rq->tail,
1145                                                  rq->ring->tail + 8) > 0)
1146                                 rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1147
1148                         active = rq;
1149                 } else {
1150                         struct intel_engine_cs *owner = rq->context->engine;
1151
1152                         WRITE_ONCE(rq->engine, owner);
1153                         owner->submit_request(rq);
1154                         active = NULL;
1155                 }
1156         }
1157
1158         return active;
1159 }
1160
1161 struct i915_request *
1162 execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1163 {
1164         struct intel_engine_cs *engine =
1165                 container_of(execlists, typeof(*engine), execlists);
1166
1167         return __unwind_incomplete_requests(engine);
1168 }
1169
1170 static inline void
1171 execlists_context_status_change(struct i915_request *rq, unsigned long status)
1172 {
1173         /*
1174          * Only used when GVT-g is enabled now. When GVT-g is disabled,
1175          * the compiler should eliminate this function as dead code.
1176          */
1177         if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1178                 return;
1179
1180         atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1181                                    status, rq);
1182 }
1183
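/*
 * Engine busy-time accounting: stats.active counts the contexts currently
 * on the HW. The atomic_add_unless() fast paths avoid stats.lock while the
 * engine stays busy; only the 0 <-> 1 transitions take the seqlock so that
 * stats.start and stats.total are updated consistently for readers.
 */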
1184 static void intel_engine_context_in(struct intel_engine_cs *engine)
1185 {
1186         unsigned long flags;
1187
1188         if (atomic_add_unless(&engine->stats.active, 1, 0))
1189                 return;
1190
1191         write_seqlock_irqsave(&engine->stats.lock, flags);
1192         if (!atomic_add_unless(&engine->stats.active, 1, 0)) {
1193                 engine->stats.start = ktime_get();
1194                 atomic_inc(&engine->stats.active);
1195         }
1196         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1197 }
1198
1199 static void intel_engine_context_out(struct intel_engine_cs *engine)
1200 {
1201         unsigned long flags;
1202
1203         GEM_BUG_ON(!atomic_read(&engine->stats.active));
1204
1205         if (atomic_add_unless(&engine->stats.active, -1, 1))
1206                 return;
1207
1208         write_seqlock_irqsave(&engine->stats.lock, flags);
1209         if (atomic_dec_and_test(&engine->stats.active)) {
1210                 engine->stats.total =
1211                         ktime_add(engine->stats.total,
1212                                   ktime_sub(ktime_get(), engine->stats.start));
1213         }
1214         write_sequnlock_irqrestore(&engine->stats.lock, flags);
1215 }
1216
1217 static void
1218 execlists_check_context(const struct intel_context *ce,
1219                         const struct intel_engine_cs *engine,
1220                         const char *when)
1221 {
1222         const struct intel_ring *ring = ce->ring;
1223         u32 *regs = ce->lrc_reg_state;
1224         bool valid = true;
1225         int x;
1226
1227         if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1228                 pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1229                        engine->name,
1230                        regs[CTX_RING_START],
1231                        i915_ggtt_offset(ring->vma));
1232                 regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1233                 valid = false;
1234         }
1235
1236         if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1237             (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1238                 pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1239                        engine->name,
1240                        regs[CTX_RING_CTL],
1241                        (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1242                 regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1243                 valid = false;
1244         }
1245
1246         x = lrc_ring_mi_mode(engine);
1247         if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1248                 pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1249                        engine->name, regs[x + 1]);
1250                 regs[x + 1] &= ~STOP_RING;
1251                 regs[x + 1] |= STOP_RING << 16;
1252                 valid = false;
1253         }
1254
1255         WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
1256 }
1257
1258 static void restore_default_state(struct intel_context *ce,
1259                                   struct intel_engine_cs *engine)
1260 {
1261         u32 *regs;
1262
1263         regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE);
1264         execlists_init_reg_state(regs, ce, engine, ce->ring, true);
1265
1266         ce->runtime.last = intel_context_get_runtime(ce);
1267 }
1268
1269 static void reset_active(struct i915_request *rq,
1270                          struct intel_engine_cs *engine)
1271 {
1272         struct intel_context * const ce = rq->context;
1273         u32 head;
1274
1275         /*
1276          * The executing context has been cancelled. We want to prevent
1277          * further execution along this context and propagate the error on
1278          * to anything depending on its results.
1279          *
1280          * In __i915_request_submit(), we apply the -EIO and remove the
1281          * requests' payloads for any banned requests. But first, we must
1282          * rewind the context back to the start of the incomplete request so
1283          * that we do not jump back into the middle of the batch.
1284          *
1285          * We preserve the breadcrumbs and semaphores of the incomplete
1286          * requests so that inter-timeline dependencies (i.e. other timelines)
1287          * remain correctly ordered. And we defer to __i915_request_submit()
1288          * so that all asynchronous waits are correctly handled.
1289          */
1290         ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1291                      rq->fence.context, rq->fence.seqno);
1292
1293         /* On resubmission of the active request, payload will be scrubbed */
1294         if (i915_request_completed(rq))
1295                 head = rq->tail;
1296         else
1297                 head = active_request(ce->timeline, rq)->head;
1298         head = intel_ring_wrap(ce->ring, head);
1299
1300         /* Scrub the context image to prevent replaying the previous batch */
1301         restore_default_state(ce, engine);
1302         __execlists_update_reg_state(ce, engine, head);
1303
1304         /* We've switched away, so this should be a no-op, but intent matters */
1305         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
1306 }
1307
1308 static void st_update_runtime_underflow(struct intel_context *ce, s32 dt)
1309 {
1310 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
1311         ce->runtime.num_underflow += dt < 0;
1312         ce->runtime.max_underflow = max_t(u32, ce->runtime.max_underflow, -dt);
1313 #endif
1314 }
1315
1316 static void intel_context_update_runtime(struct intel_context *ce)
1317 {
1318         u32 old;
1319         s32 dt;
1320
1321         if (intel_context_is_barrier(ce))
1322                 return;
1323
1324         old = ce->runtime.last;
1325         ce->runtime.last = intel_context_get_runtime(ce);
1326         dt = ce->runtime.last - old;
1327
1328         if (unlikely(dt <= 0)) {
1329                 CE_TRACE(ce, "runtime underflow: last=%u, new=%u, delta=%d\n",
1330                          old, ce->runtime.last, dt);
1331                 st_update_runtime_underflow(ce, dt);
1332                 return;
1333         }
1334
1335         ewma_runtime_add(&ce->runtime.avg, dt);
1336         ce->runtime.total += dt;
1337 }
1338
1339 static inline struct intel_engine_cs *
1340 __execlists_schedule_in(struct i915_request *rq)
1341 {
1342         struct intel_engine_cs * const engine = rq->engine;
1343         struct intel_context * const ce = rq->context;
1344
1345         intel_context_get(ce);
1346
1347         if (unlikely(intel_context_is_banned(ce)))
1348                 reset_active(rq, engine);
1349
1350         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1351                 execlists_check_context(ce, engine, "before");
1352
1353         if (ce->tag) {
1354                 /* Use a fixed tag for OA and friends */
1355                 GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
1356                 ce->lrc.ccid = ce->tag;
1357         } else {
1358                 /* We don't need a strict matching tag, just different values */
1359                 unsigned int tag = ffs(READ_ONCE(engine->context_tag));
1360
1361                 GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG);
1362                 clear_bit(tag - 1, &engine->context_tag);
1363                 ce->lrc.ccid = tag << (GEN11_SW_CTX_ID_SHIFT - 32);
1364
1365                 BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
1366         }
1367
1368         ce->lrc.ccid |= engine->execlists.ccid;
1369
1370         __intel_gt_pm_get(engine->gt);
1371         if (engine->fw_domain && !atomic_fetch_inc(&engine->fw_active))
1372                 intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
1373         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1374         intel_engine_context_in(engine);
1375
1376         return engine;
1377 }
1378
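/*
 * Note: ce->inflight packs the engine on which the context is running
 * together with a small count of extra submissions in the low pointer bits
 * (see ptr_unmask_bits/ptr_inc/ptr_dec). The first schedule_in installs the
 * engine pointer, later back-to-back submissions only bump the count, and
 * the final schedule_out drops it back to NULL.
 */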
1379 static inline struct i915_request *
1380 execlists_schedule_in(struct i915_request *rq, int idx)
1381 {
1382         struct intel_context * const ce = rq->context;
1383         struct intel_engine_cs *old;
1384
1385         GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1386         trace_i915_request_in(rq, idx);
1387
1388         old = READ_ONCE(ce->inflight);
1389         do {
1390                 if (!old) {
1391                         WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1392                         break;
1393                 }
1394         } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1395
1396         GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1397         return i915_request_get(rq);
1398 }
1399
1400 static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1401 {
1402         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1403         struct i915_request *next = READ_ONCE(ve->request);
1404
1405         if (next == rq || (next && next->execution_mask & ~rq->execution_mask))
1406                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
1407 }
1408
1409 static inline void
1410 __execlists_schedule_out(struct i915_request *rq,
1411                          struct intel_engine_cs * const engine,
1412                          unsigned int ccid)
1413 {
1414         struct intel_context * const ce = rq->context;
1415
1416         /*
1417          * NB process_csb() is not under the engine->active.lock and hence
1418          * schedule_out can race with schedule_in, meaning that we should
1419          * refrain from doing non-trivial work here.
1420          */
1421
1422         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1423                 execlists_check_context(ce, engine, "after");
1424
1425         /*
1426          * If we have just completed this context, the engine may now be
1427          * idle and we want to re-enter powersaving.
1428          */
1429         if (list_is_last_rcu(&rq->link, &ce->timeline->requests) &&
1430             i915_request_completed(rq))
1431                 intel_engine_add_retire(engine, ce->timeline);
1432
1433         ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
1434         ccid &= GEN12_MAX_CONTEXT_HW_ID;
1435         if (ccid < BITS_PER_LONG) {
1436                 GEM_BUG_ON(ccid == 0);
1437                 GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
1438                 set_bit(ccid - 1, &engine->context_tag);
1439         }
1440
1441         intel_context_update_runtime(ce);
1442         intel_engine_context_out(engine);
1443         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1444         if (engine->fw_domain && !atomic_dec_return(&engine->fw_active))
1445                 intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
1446         intel_gt_pm_put_async(engine->gt);
1447
1448         /*
1449          * If this is part of a virtual engine, its next request may
1450          * have been blocked waiting for access to the active context.
1451          * We have to kick all the siblings again in case we need to
1452          * switch (e.g. the next request is not runnable on this
1453          * engine). Hopefully, we will already have submitted the next
1454          * request before the tasklet runs and do not need to rebuild
1455          * each virtual tree and kick everyone again.
1456          */
1457         if (ce->engine != engine)
1458                 kick_siblings(rq, ce);
1459
1460         intel_context_put(ce);
1461 }
1462
1463 static inline void
1464 execlists_schedule_out(struct i915_request *rq)
1465 {
1466         struct intel_context * const ce = rq->context;
1467         struct intel_engine_cs *cur, *old;
1468         u32 ccid;
1469
1470         trace_i915_request_out(rq);
1471
1472         ccid = rq->context->lrc.ccid;
1473         old = READ_ONCE(ce->inflight);
1474         do
1475                 cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1476         while (!try_cmpxchg(&ce->inflight, &old, cur));
1477         if (!cur)
1478                 __execlists_schedule_out(rq, old, ccid);
1479
1480         i915_request_put(rq);
1481 }
1482
1483 static u64 execlists_update_context(struct i915_request *rq)
1484 {
1485         struct intel_context *ce = rq->context;
1486         u64 desc = ce->lrc.desc;
1487         u32 tail, prev;
1488
1489         /*
1490          * WaIdleLiteRestore:bdw,skl
1491          *
1492          * We should never submit the context with the same RING_TAIL twice
1493          * just in case we submit an empty ring, which confuses the HW.
1494          *
1495          * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1496          * the normal request to be able to always advance the RING_TAIL on
1497          * subsequent resubmissions (for lite restore). Should that fail us,
1498          * and we try and submit the same tail again, force the context
1499          * reload.
1500          *
1501          * If we need to return to a preempted context, we need to skip the
1502          * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1503          * HW has a tendency to ignore us rewinding the TAIL to the end of
1504          * an earlier request.
1505          */
1506         GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
1507         prev = rq->ring->tail;
1508         tail = intel_ring_set_tail(rq->ring, rq->tail);
1509         if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1510                 desc |= CTX_DESC_FORCE_RESTORE;
1511         ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1512         rq->tail = rq->wa_tail;
1513
1514         /*
1515          * Make sure the context image is complete before we submit it to HW.
1516          *
1517          * Ostensibly, writes (including the WCB) should be flushed prior to
1518          * an uncached write such as our mmio register access. However, the empirical
1519          * evidence (esp. on Braswell) suggests that the WC write into memory
1520          * may not be visible to the HW prior to the completion of the UC
1521          * register write and that we may begin execution from the context
1522          * before its image is complete, leading to invalid PD chasing.
1523          */
1524         wmb();
1525
1526         ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
1527         return desc;
1528 }
1529
1530 static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1531 {
1532         if (execlists->ctrl_reg) {
1533                 writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1534                 writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1535         } else {
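                /*
                 * Legacy ELSP (pre-Gen11): a single register written twice
                 * per element, upper dword first, so the descriptor is only
                 * complete once the lower dword lands.
                 */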
1536                 writel(upper_32_bits(desc), execlists->submit_reg);
1537                 writel(lower_32_bits(desc), execlists->submit_reg);
1538         }
1539 }
1540
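/*
 * Debug helper: format a one-line summary of the request occupying an ELSP
 * port (ccid, fence id, started/completed marker and priority) for
 * ENGINE_TRACE consumption.
 */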
1541 static __maybe_unused char *
1542 dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
1543 {
1544         if (!rq)
1545                 return "";
1546
1547         snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
1548                  prefix,
1549                  rq->context->lrc.ccid,
1550                  rq->fence.context, rq->fence.seqno,
1551                  i915_request_completed(rq) ? "!" :
1552                  i915_request_started(rq) ? "*" :
1553                  "",
1554                  rq_prio(rq));
1555
1556         return buf;
1557 }
1558
1559 static __maybe_unused void
1560 trace_ports(const struct intel_engine_execlists *execlists,
1561             const char *msg,
1562             struct i915_request * const *ports)
1563 {
1564         const struct intel_engine_cs *engine =
1565                 container_of(execlists, typeof(*engine), execlists);
1566         char __maybe_unused p0[40], p1[40];
1567
1568         if (!ports[0])
1569                 return;
1570
1571         ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
1572                      dump_port(p0, sizeof(p0), "", ports[0]),
1573                      dump_port(p1, sizeof(p1), ", ", ports[1]));
1574 }
1575
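/*
 * The submission tasklet is disabled (and flushed) around engine resets,
 * so a disabled tasklet serves as a proxy for "a reset is currently in
 * progress".
 */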
1576 static inline bool
1577 reset_in_progress(const struct intel_engine_execlists *execlists)
1578 {
1579         return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
1580 }
1581
1582 static __maybe_unused bool
1583 assert_pending_valid(const struct intel_engine_execlists *execlists,
1584                      const char *msg)
1585 {
1586         struct intel_engine_cs *engine =
1587                 container_of(execlists, typeof(*engine), execlists);
1588         struct i915_request * const *port, *rq;
1589         struct intel_context *ce = NULL;
1590         bool sentinel = false;
1591         u32 ccid = -1;
1592
1593         trace_ports(execlists, msg, execlists->pending);
1594
1595         /* We may be messing around with the lists during reset, lalala */
1596         if (reset_in_progress(execlists))
1597                 return true;
1598
1599         if (!execlists->pending[0]) {
1600                 GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
1601                               engine->name);
1602                 return false;
1603         }
1604
1605         if (execlists->pending[execlists_num_ports(execlists)]) {
1606                 GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
1607                               engine->name, execlists_num_ports(execlists));
1608                 return false;
1609         }
1610
1611         for (port = execlists->pending; (rq = *port); port++) {
1612                 unsigned long flags;
1613                 bool ok = true;
1614
1615                 GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1616                 GEM_BUG_ON(!i915_request_is_active(rq));
1617
1618                 if (ce == rq->context) {
1619                         GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
1620                                       engine->name,
1621                                       ce->timeline->fence_context,
1622                                       port - execlists->pending);
1623                         return false;
1624                 }
1625                 ce = rq->context;
1626
1627                 if (ccid == ce->lrc.ccid) {
1628                         GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
1629                                       engine->name,
1630                                       ccid, ce->timeline->fence_context,
1631                                       port - execlists->pending);
1632                         return false;
1633                 }
1634                 ccid = ce->lrc.ccid;
1635
1636                 /*
1637                  * Sentinels are supposed to be the last request so they flush
1638                  * the current execution off the HW. Check that they are the only
1639                  * request in the pending submission.
1640                  */
1641                 if (sentinel) {
1642                         GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
1643                                       engine->name,
1644                                       ce->timeline->fence_context,
1645                                       port - execlists->pending);
1646                         return false;
1647                 }
1648                 sentinel = i915_request_has_sentinel(rq);
1649
1650                 /* Hold tightly onto the lock to prevent concurrent retires! */
1651                 if (!spin_trylock_irqsave(&rq->lock, flags))
1652                         continue;
1653
1654                 if (i915_request_completed(rq))
1655                         goto unlock;
1656
1657                 if (i915_active_is_idle(&ce->active) &&
1658                     !intel_context_is_barrier(ce)) {
1659                         GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
1660                                       engine->name,
1661                                       ce->timeline->fence_context,
1662                                       port - execlists->pending);
1663                         ok = false;
1664                         goto unlock;
1665                 }
1666
1667                 if (!i915_vma_is_pinned(ce->state)) {
1668                         GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
1669                                       engine->name,
1670                                       ce->timeline->fence_context,
1671                                       port - execlists->pending);
1672                         ok = false;
1673                         goto unlock;
1674                 }
1675
1676                 if (!i915_vma_is_pinned(ce->ring->vma)) {
1677                         GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
1678                                       engine->name,
1679                                       ce->timeline->fence_context,
1680                                       port - execlists->pending);
1681                         ok = false;
1682                         goto unlock;
1683                 }
1684
1685 unlock:
1686                 spin_unlock_irqrestore(&rq->lock, flags);
1687                 if (!ok)
1688                         return false;
1689         }
1690
1691         return ce;
1692 }
1693
1694 static void execlists_submit_ports(struct intel_engine_cs *engine)
1695 {
1696         struct intel_engine_execlists *execlists = &engine->execlists;
1697         unsigned int n;
1698
1699         GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1700
1701         /*
1702          * We can skip acquiring intel_runtime_pm_get() here as it was taken
1703          * on our behalf by the request (see i915_gem_mark_busy()) and it will
1704          * not be relinquished until the device is idle (see
1705          * i915_gem_idle_work_handler()). As a precaution, we make sure
1706          * that all ELSP are drained i.e. we have processed the CSB,
1707          * before allowing ourselves to idle and calling intel_runtime_pm_put().
1708          */
1709         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1710
1711         /*
1712          * ELSQ note: the submit queue is not cleared after being submitted
1713          * to the HW so we need to make sure we always clean it up. This is
1714          * currently ensured by the fact that we always write the same number
1715          * of elsq entries; keep this in mind before changing the loop below.
1716          */
1717         for (n = execlists_num_ports(execlists); n--; ) {
1718                 struct i915_request *rq = execlists->pending[n];
1719
1720                 write_desc(execlists,
1721                            rq ? execlists_update_context(rq) : 0,
1722                            n);
1723         }
1724
1725         /* we need to manually load the submit queue */
1726         if (execlists->ctrl_reg)
1727                 writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1728 }
1729
1730 static bool ctx_single_port_submission(const struct intel_context *ce)
1731 {
1732         return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1733                 intel_context_force_single_submission(ce));
1734 }
1735
1736 static bool can_merge_ctx(const struct intel_context *prev,
1737                           const struct intel_context *next)
1738 {
1739         if (prev != next)
1740                 return false;
1741
1742         if (ctx_single_port_submission(prev))
1743                 return false;
1744
1745         return true;
1746 }
1747
1748 static unsigned long i915_request_flags(const struct i915_request *rq)
1749 {
1750         return READ_ONCE(rq->fence.flags);
1751 }
1752
1753 static bool can_merge_rq(const struct i915_request *prev,
1754                          const struct i915_request *next)
1755 {
1756         GEM_BUG_ON(prev == next);
1757         GEM_BUG_ON(!assert_priority_queue(prev, next));
1758
1759         /*
1760          * We do not submit known completed requests. Therefore if the next
1761          * request is already completed, we can pretend to merge it in
1762          * with the previous context (and we will skip updating the ELSP
1763          * and tracking). Thus hopefully keeping the ELSP full with active
1764          * contexts, despite the best efforts of preempt-to-busy to confuse
1765          * us.
1766          */
1767         if (i915_request_completed(next))
1768                 return true;
1769
1770         if (unlikely((i915_request_flags(prev) ^ i915_request_flags(next)) &
1771                      (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1772                       BIT(I915_FENCE_FLAG_SENTINEL))))
1773                 return false;
1774
1775         if (!can_merge_ctx(prev->context, next->context))
1776                 return false;
1777
1778         GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
1779         return true;
1780 }
1781
1782 static void virtual_update_register_offsets(u32 *regs,
1783                                             struct intel_engine_cs *engine)
1784 {
1785         set_offsets(regs, reg_offsets(engine), engine, false);
1786 }
1787
1788 static bool virtual_matches(const struct virtual_engine *ve,
1789                             const struct i915_request *rq,
1790                             const struct intel_engine_cs *engine)
1791 {
1792         const struct intel_engine_cs *inflight;
1793
1794         if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1795                 return false;
1796
1797         /*
1798          * We track when the HW has completed saving the context image
1799          * (i.e. when we have seen the final CS event switching out of
1800          * the context) and must not overwrite the context image before
1801          * then. This restricts us to only using the active engine
1802          * while the previous virtualized request is inflight (so
1803          * we reuse the register offsets). This is a very small
1804          * hysteresis on the greedy selection algorithm.
1805          */
1806         inflight = intel_context_inflight(&ve->context);
1807         if (inflight && inflight != engine)
1808                 return false;
1809
1810         return true;
1811 }
1812
1813 static void virtual_xfer_context(struct virtual_engine *ve,
1814                                  struct intel_engine_cs *engine)
1815 {
1816         unsigned int n;
1817
1818         if (likely(engine == ve->siblings[0]))
1819                 return;
1820
1821         GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1822         if (!intel_engine_has_relative_mmio(engine))
1823                 virtual_update_register_offsets(ve->context.lrc_reg_state,
1824                                                 engine);
1825
1826         /*
1827          * Move the bound engine to the top of the list for
1828          * future execution. We then kick this tasklet first
1829          * before checking others, so that we preferentially
1830          * reuse this set of bound registers.
1831          */
1832         for (n = 1; n < ve->num_siblings; n++) {
1833                 if (ve->siblings[n] == engine) {
1834                         swap(ve->siblings[n], ve->siblings[0]);
1835                         break;
1836                 }
1837         }
1838 }
1839
1840 #define for_each_waiter(p__, rq__) \
1841         list_for_each_entry_lockless(p__, \
1842                                      &(rq__)->sched.waiters_list, \
1843                                      wait_link)
1844
1845 #define for_each_signaler(p__, rq__) \
1846         list_for_each_entry_rcu(p__, \
1847                                 &(rq__)->sched.signalers_list, \
1848                                 signal_link)
1849
1850 static void defer_request(struct i915_request *rq, struct list_head * const pl)
1851 {
1852         LIST_HEAD(list);
1853
1854         /*
1855          * We want to move the interrupted request to the back of
1856          * the round-robin list (i.e. its priority level), but in
1857          * doing so we must also move every request that was in
1858          * flight and waiting on the interrupted request, so that
1859          * those waiters run after it again.
1860          */
1861         do {
1862                 struct i915_dependency *p;
1863
1864                 GEM_BUG_ON(i915_request_is_active(rq));
1865                 list_move_tail(&rq->sched.link, pl);
1866
1867                 for_each_waiter(p, rq) {
1868                         struct i915_request *w =
1869                                 container_of(p->waiter, typeof(*w), sched);
1870
1871                         if (p->flags & I915_DEPENDENCY_WEAK)
1872                                 continue;
1873
1874                         /* Leave semaphores spinning on the other engines */
1875                         if (w->engine != rq->engine)
1876                                 continue;
1877
1878                         /* No waiter should start before its signaler */
1879                         GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
1880                                    i915_request_started(w) &&
1881                                    !i915_request_completed(rq));
1882
1883                         GEM_BUG_ON(i915_request_is_active(w));
1884                         if (!i915_request_is_ready(w))
1885                                 continue;
1886
1887                         if (rq_prio(w) < rq_prio(rq))
1888                                 continue;
1889
1890                         GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1891                         list_move_tail(&w->sched.link, &list);
1892                 }
1893
1894                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1895         } while (rq);
1896 }
1897
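/*
 * Unwind the currently executing (preempted-to-busy) request from the HW
 * and, together with its on-engine waiters, push it to the back of its
 * priority level so that other contexts of equal priority get a turn
 * (see defer_request() above).
 */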
1898 static void defer_active(struct intel_engine_cs *engine)
1899 {
1900         struct i915_request *rq;
1901
1902         rq = __unwind_incomplete_requests(engine);
1903         if (!rq)
1904                 return;
1905
1906         defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1907 }
1908
1909 static bool
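/*
 * Decide whether the active request should be subject to timeslicing:
 * only if there is other work waiting for the HW (the queue, a matching
 * virtual engine request, or another request already on this engine) of
 * priority at least equal to the active request's effective priority.
 */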
1910 need_timeslice(const struct intel_engine_cs *engine,
1911                const struct i915_request *rq,
1912                const struct rb_node *rb)
1913 {
1914         int hint;
1915
1916         if (!intel_engine_has_timeslices(engine))
1917                 return false;
1918
1919         hint = engine->execlists.queue_priority_hint;
1920
1921         if (rb) {
1922                 const struct virtual_engine *ve =
1923                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1924                 const struct intel_engine_cs *inflight =
1925                         intel_context_inflight(&ve->context);
1926
1927                 if (!inflight || inflight == engine) {
1928                         struct i915_request *next;
1929
1930                         rcu_read_lock();
1931                         next = READ_ONCE(ve->request);
1932                         if (next)
1933                                 hint = max(hint, rq_prio(next));
1934                         rcu_read_unlock();
1935                 }
1936         }
1937
1938         if (!list_is_last(&rq->sched.link, &engine->active.requests))
1939                 hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1940
1941         GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE);
1942         return hint >= effective_prio(rq);
1943 }
1944
1945 static bool
1946 timeslice_yield(const struct intel_engine_execlists *el,
1947                 const struct i915_request *rq)
1948 {
1949         /*
1950          * Once bitten, forever smitten!
1951          *
1952          * If the active context ever busy-waited on a semaphore,
1953          * it will be treated as a hog until the end of its timeslice (i.e.
1954          * until it is scheduled out and replaced by a new submission,
1955          * possibly even its own lite-restore). The HW only sends an interrupt
1956          * on the first miss, and we do not know if that semaphore has been
1957          * signaled, or even if it is now stuck on another semaphore. Play
1958          * safe, yield if it might be stuck -- it will be given a fresh
1959          * timeslice in the near future.
1960          */
1961         return rq->context->lrc.ccid == READ_ONCE(el->yield);
1962 }
1963
1964 static bool
1965 timeslice_expired(const struct intel_engine_execlists *el,
1966                   const struct i915_request *rq)
1967 {
1968         return timer_expired(&el->timer) || timeslice_yield(el, rq);
1969 }
1970
1971 static int
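/*
 * Priority of whatever is expected to run after @rq: the next request on
 * the engine's active list if there is one, otherwise the hint for the
 * best of the not-yet-submitted queue.
 */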
1972 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1973 {
1974         if (list_is_last(&rq->sched.link, &engine->active.requests))
1975                 return engine->execlists.queue_priority_hint;
1976
1977         return rq_prio(list_next_entry(rq, sched.link));
1978 }
1979
1980 static inline unsigned long
1981 timeslice(const struct intel_engine_cs *engine)
1982 {
1983         return READ_ONCE(engine->props.timeslice_duration_ms);
1984 }
1985
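/*
 * Interval for the active context's timeslice: 0 (no timer) if nothing is
 * executing, or if the recorded switch_priority_hint says the work behind
 * it is of lower priority than the active request; otherwise the
 * per-engine timeslice duration.
 */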
1986 static unsigned long active_timeslice(const struct intel_engine_cs *engine)
1987 {
1988         const struct intel_engine_execlists *execlists = &engine->execlists;
1989         const struct i915_request *rq = *execlists->active;
1990
1991         if (!rq || i915_request_completed(rq))
1992                 return 0;
1993
1994         if (READ_ONCE(execlists->switch_priority_hint) < effective_prio(rq))
1995                 return 0;
1996
1997         return timeslice(engine);
1998 }
1999
2000 static void set_timeslice(struct intel_engine_cs *engine)
2001 {
2002         unsigned long duration;
2003
2004         if (!intel_engine_has_timeslices(engine))
2005                 return;
2006
2007         duration = active_timeslice(engine);
2008         ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration);
2009
2010         set_timer_ms(&engine->execlists.timer, duration);
2011 }
2012
2013 static void start_timeslice(struct intel_engine_cs *engine, int prio)
2014 {
2015         struct intel_engine_execlists *execlists = &engine->execlists;
2016         unsigned long duration;
2017
2018         if (!intel_engine_has_timeslices(engine))
2019                 return;
2020
2021         WRITE_ONCE(execlists->switch_priority_hint, prio);
2022         if (prio == INT_MIN)
2023                 return;
2024
2025         if (timer_pending(&execlists->timer))
2026                 return;
2027
2028         duration = timeslice(engine);
2029         ENGINE_TRACE(engine,
2030                      "start timeslicing, prio:%d, interval:%lu",
2031                      prio, duration);
2032
2033         set_timer_ms(&execlists->timer, duration);
2034 }
2035
2036 static void record_preemption(struct intel_engine_execlists *execlists)
2037 {
2038         (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
2039 }
2040
2041 static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
2042                                             const struct i915_request *rq)
2043 {
2044         if (!rq)
2045                 return 0;
2046
2047         /* Force a fast reset for terminated contexts (ignoring sysfs!) */
2048         if (unlikely(intel_context_is_banned(rq->context)))
2049                 return 1;
2050
2051         return READ_ONCE(engine->props.preempt_timeout_ms);
2052 }
2053
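/*
 * (Re)arm the preemption timeout alongside a new ELSP submission; if the
 * outgoing context has not been switched out by the time the timer fires,
 * the expiry handler is expected to escalate to an engine reset.
 */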
2054 static void set_preempt_timeout(struct intel_engine_cs *engine,
2055                                 const struct i915_request *rq)
2056 {
2057         if (!intel_engine_has_preempt_reset(engine))
2058                 return;
2059
2060         set_timer_ms(&engine->execlists.preempt,
2061                      active_preempt_timeout(engine, rq));
2062 }
2063
2064 static inline void clear_ports(struct i915_request **ports, int count)
2065 {
2066         memset_p((void **)ports, NULL, count);
2067 }
2068
2069 static inline void
2070 copy_ports(struct i915_request **dst, struct i915_request **src, int count)
2071 {
2072         /* A memcpy_p() would be very useful here! */
2073         while (count--)
2074                 WRITE_ONCE(*dst++, *src++); /* avoid write tearing */
2075 }
2076
2077 static void execlists_dequeue(struct intel_engine_cs *engine)
2078 {
2079         struct intel_engine_execlists * const execlists = &engine->execlists;
2080         struct i915_request **port = execlists->pending;
2081         struct i915_request ** const last_port = port + execlists->port_mask;
2082         struct i915_request * const *active;
2083         struct i915_request *last;
2084         struct rb_node *rb;
2085         bool submit = false;
2086
2087         /*
2088          * Hardware submission is through 2 ports. Conceptually each port
2089          * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
2090          * static for a context, and unique to each, so we only execute
2091          * requests belonging to a single context from each ring. RING_HEAD
2092          * is maintained by the CS in the context image, it marks the place
2093          * where it got up to last time, and through RING_TAIL we tell the CS
2094          * where we want to execute up to this time.
2095          *
2096          * In this list the requests are in order of execution. Consecutive
2097          * requests from the same context are adjacent in the ringbuffer. We
2098          * can combine these requests into a single RING_TAIL update:
2099          *
2100          *              RING_HEAD...req1...req2
2101          *                                    ^- RING_TAIL
2102          * since to execute req2 the CS must first execute req1.
2103          *
2104          * Our goal then is to point each port at the end of a consecutive
2105          * sequence of requests, as that is the optimal submission (fewest
2106          * wake ups and context switches).
2107          */
2108
2109         for (rb = rb_first_cached(&execlists->virtual); rb; ) {
2110                 struct virtual_engine *ve =
2111                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2112                 struct i915_request *rq = READ_ONCE(ve->request);
2113
2114                 if (!rq) { /* lazily cleanup after another engine handled rq */
2115                         rb_erase_cached(rb, &execlists->virtual);
2116                         RB_CLEAR_NODE(rb);
2117                         rb = rb_first_cached(&execlists->virtual);
2118                         continue;
2119                 }
2120
2121                 if (!virtual_matches(ve, rq, engine)) {
2122                         rb = rb_next(rb);
2123                         continue;
2124                 }
2125
2126                 break;
2127         }
2128
2129         /*
2130          * If the queue is higher priority than the last
2131          * request in the currently active context, submit afresh.
2132          * We will resubmit again afterwards in case we need to split
2133          * the active context to interject the preemption request,
2134          * i.e. we will retrigger preemption following the ack in case
2135          * of trouble.
2136          */
2137         active = READ_ONCE(execlists->active);
2138
2139         /*
2140          * In theory we can skip over completed contexts that have not
2141          * yet been processed by events (as those events are in flight):
2142          *
2143          * while ((last = *active) && i915_request_completed(last))
2144          *      active++;
2145          *
2146          * However, the GPU cannot handle this as it will ultimately
2147          * find itself trying to jump back into a context it has just
2148          * completed and barf.
2149          */
2150
2151         if ((last = *active)) {
2152                 if (need_preempt(engine, last, rb)) {
2153                         if (i915_request_completed(last)) {
2154                                 tasklet_hi_schedule(&execlists->tasklet);
2155                                 return;
2156                         }
2157
2158                         ENGINE_TRACE(engine,
2159                                      "preempting last=%llx:%lld, prio=%d, hint=%d\n",
2160                                      last->fence.context,
2161                                      last->fence.seqno,
2162                                      last->sched.attr.priority,
2163                                      execlists->queue_priority_hint);
2164                         record_preemption(execlists);
2165
2166                         /*
2167                          * Don't let the RING_HEAD advance past the breadcrumb
2168                          * as we unwind (and until we resubmit) so that we do
2169                          * not accidentally tell it to go backwards.
2170                          */
2171                         ring_set_paused(engine, 1);
2172
2173                         /*
2174                          * Note that we have not stopped the GPU at this point,
2175                          * so we are unwinding the incomplete requests as they
2176                          * remain inflight and so by the time we do complete
2177                          * the preemption, some of the unwound requests may
2178                          * complete!
2179                          */
2180                         __unwind_incomplete_requests(engine);
2181
2182                         last = NULL;
2183                 } else if (need_timeslice(engine, last, rb) &&
2184                            timeslice_expired(execlists, last)) {
2185                         if (i915_request_completed(last)) {
2186                                 tasklet_hi_schedule(&execlists->tasklet);
2187                                 return;
2188                         }
2189
2190                         ENGINE_TRACE(engine,
2191                                      "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
2192                                      last->fence.context,
2193                                      last->fence.seqno,
2194                                      last->sched.attr.priority,
2195                                      execlists->queue_priority_hint,
2196                                      yesno(timeslice_yield(execlists, last)));
2197
2198                         ring_set_paused(engine, 1);
2199                         defer_active(engine);
2200
2201                         /*
2202                          * Unlike for preemption, if we rewind and continue
2203                          * executing the same context as previously active,
2204                          * the order of execution will remain the same and
2205                          * the tail will only advance. We do not need to
2206                          * force a full context restore, as a lite-restore
2207                          * is sufficient to resample the monotonic TAIL.
2208                          *
2209                          * If we switch to any other context, similarly we
2210                          * will not rewind TAIL of current context, and
2211                          * normal save/restore will preserve state and allow
2212                          * us to later continue executing the same request.
2213                          */
2214                         last = NULL;
2215                 } else {
2216                         /*
2217                          * Otherwise if we already have a request pending
2218                          * for execution after the current one, we can
2219                          * just wait until the next CS event before
2220                          * queuing more. In either case we will force a
2221                          * lite-restore preemption event, but if we wait
2222                          * we hopefully coalesce several updates into a single
2223                          * submission.
2224                          */
2225                         if (!list_is_last(&last->sched.link,
2226                                           &engine->active.requests)) {
2227                                 /*
2228                                  * Even if ELSP[1] is occupied and not worthy
2229                                  * of timeslices, our queue might be.
2230                                  */
2231                                 start_timeslice(engine, queue_prio(execlists));
2232                                 return;
2233                         }
2234                 }
2235         }
2236
2237         while (rb) { /* XXX virtual is always taking precedence */
2238                 struct virtual_engine *ve =
2239                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
2240                 struct i915_request *rq;
2241
2242                 spin_lock(&ve->base.active.lock);
2243
2244                 rq = ve->request;
2245                 if (unlikely(!rq)) { /* lost the race to a sibling */
2246                         spin_unlock(&ve->base.active.lock);
2247                         rb_erase_cached(rb, &execlists->virtual);
2248                         RB_CLEAR_NODE(rb);
2249                         rb = rb_first_cached(&execlists->virtual);
2250                         continue;
2251                 }
2252
2253                 GEM_BUG_ON(rq != ve->request);
2254                 GEM_BUG_ON(rq->engine != &ve->base);
2255                 GEM_BUG_ON(rq->context != &ve->context);
2256
2257                 if (rq_prio(rq) >= queue_prio(execlists)) {
2258                         if (!virtual_matches(ve, rq, engine)) {
2259                                 spin_unlock(&ve->base.active.lock);
2260                                 rb = rb_next(rb);
2261                                 continue;
2262                         }
2263
2264                         if (last && !can_merge_rq(last, rq)) {
2265                                 spin_unlock(&ve->base.active.lock);
2266                                 start_timeslice(engine, rq_prio(rq));
2267                                 return; /* leave this for another sibling */
2268                         }
2269
2270                         ENGINE_TRACE(engine,
2271                                      "virtual rq=%llx:%lld%s, new engine? %s\n",
2272                                      rq->fence.context,
2273                                      rq->fence.seqno,
2274                                      i915_request_completed(rq) ? "!" :
2275                                      i915_request_started(rq) ? "*" :
2276                                      "",
2277                                      yesno(engine != ve->siblings[0]));
2278
2279                         WRITE_ONCE(ve->request, NULL);
2280                         WRITE_ONCE(ve->base.execlists.queue_priority_hint,
2281                                    INT_MIN);
2282                         rb_erase_cached(rb, &execlists->virtual);
2283                         RB_CLEAR_NODE(rb);
2284
2285                         GEM_BUG_ON(!(rq->execution_mask & engine->mask));
2286                         WRITE_ONCE(rq->engine, engine);
2287
2288                         if (__i915_request_submit(rq)) {
2289                                 /*
2290                                  * Only after we confirm that we will submit
2291                                  * this request (i.e. it has not already
2292                                  * completed), do we want to update the context.
2293                                  *
2294                                  * This serves two purposes. It avoids
2295                                  * unnecessary work if we are resubmitting an
2296                                  * already completed request after timeslicing.
2297                                  * But more importantly, it prevents us altering
2298                                  * ve->siblings[] on an idle context, where
2299                                  * we may be using ve->siblings[] in
2300                                  * virtual_context_enter / virtual_context_exit.
2301                                  */
2302                                 virtual_xfer_context(ve, engine);
2303                                 GEM_BUG_ON(ve->siblings[0] != engine);
2304
2305                                 submit = true;
2306                                 last = rq;
2307                         }
2308                         i915_request_put(rq);
2309
2310                         /*
2311                          * Hmm, we have a bunch of virtual engine requests,
2312                          * but the first one was already completed (thanks
2313                          * preempt-to-busy!). Keep looking at the veng queue
2314                          * until we have no more relevant requests (i.e.
2315                          * the normal submit queue has higher priority).
2316                          */
2317                         if (!submit) {
2318                                 spin_unlock(&ve->base.active.lock);
2319                                 rb = rb_first_cached(&execlists->virtual);
2320                                 continue;
2321                         }
2322                 }
2323
2324                 spin_unlock(&ve->base.active.lock);
2325                 break;
2326         }
2327
2328         while ((rb = rb_first_cached(&execlists->queue))) {
2329                 struct i915_priolist *p = to_priolist(rb);
2330                 struct i915_request *rq, *rn;
2331                 int i;
2332
2333                 priolist_for_each_request_consume(rq, rn, p, i) {
2334                         bool merge = true;
2335
2336                         /*
2337                          * Can we combine this request with the current port?
2338                          * It has to be the same context/ringbuffer and not
2339                          * have any exceptions (e.g. GVT saying never to
2340                          * combine contexts).
2341                          *
2342                          * If we can combine the requests, we can execute both
2343                          * by updating the RING_TAIL to point to the end of the
2344                          * second request, and so we never need to tell the
2345                          * hardware about the first.
2346                          */
2347                         if (last && !can_merge_rq(last, rq)) {
2348                                 /*
2349                                  * If we are on the second port and cannot
2350                                  * combine this request with the last, then we
2351                                  * are done.
2352                                  */
2353                                 if (port == last_port)
2354                                         goto done;
2355
2356                                 /*
2357                                  * We must not populate both ELSP[] with the
2358                                  * same LRCA, i.e. we must submit 2 different
2359                                  * contexts if we submit 2 ELSP.
2360                                  */
2361                                 if (last->context == rq->context)
2362                                         goto done;
2363
2364                                 if (i915_request_has_sentinel(last))
2365                                         goto done;
2366
2367                                 /*
2368                                  * If GVT overrides us we only ever submit
2369                                  * port[0], leaving port[1] empty. Note that we
2370                                  * also have to be careful that we don't queue
2371                                  * the same context (even though a different
2372                                  * request) to the second port.
2373                                  */
2374                                 if (ctx_single_port_submission(last->context) ||
2375                                     ctx_single_port_submission(rq->context))
2376                                         goto done;
2377
2378                                 merge = false;
2379                         }
2380
2381                         if (__i915_request_submit(rq)) {
2382                                 if (!merge) {
2383                                         *port = execlists_schedule_in(last, port - execlists->pending);
2384                                         port++;
2385                                         last = NULL;
2386                                 }
2387
2388                                 GEM_BUG_ON(last &&
2389                                            !can_merge_ctx(last->context,
2390                                                           rq->context));
2391                                 GEM_BUG_ON(last &&
2392                                            i915_seqno_passed(last->fence.seqno,
2393                                                              rq->fence.seqno));
2394
2395                                 submit = true;
2396                                 last = rq;
2397                         }
2398                 }
2399
2400                 rb_erase_cached(&p->node, &execlists->queue);
2401                 i915_priolist_free(p);
2402         }
2403
2404 done:
2405         /*
2406          * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2407          *
2408          * We choose the priority hint such that if we add a request of greater
2409          * priority than this, we kick the submission tasklet to decide on
2410          * the right order of submitting the requests to hardware. We must
2411          * also be prepared to reorder requests as they are in-flight on the
2412          * HW. We derive the priority hint then as the first "hole" in
2413          * the HW submission ports and if there are no available slots,
2414          * the priority of the lowest executing request, i.e. last.
2415          *
2416          * When we do receive a higher priority request ready to run from the
2417          * user, see queue_request(), the priority hint is bumped to that
2418          * request, triggering preemption on the next dequeue (or subsequent
2419          * interrupt for secondary ports).
2420          */
2421         execlists->queue_priority_hint = queue_prio(execlists);
2422
2423         if (submit) {
2424                 *port = execlists_schedule_in(last, port - execlists->pending);
2425                 execlists->switch_priority_hint =
2426                         switch_prio(engine, *execlists->pending);
2427
2428                 /*
2429                  * Skip if we ended up with exactly the same set of requests,
2430                  * e.g. trying to timeslice a pair of ordered contexts
2431                  */
2432                 if (!memcmp(active, execlists->pending,
2433                             (port - execlists->pending + 1) * sizeof(*port))) {
2434                         do
2435                                 execlists_schedule_out(fetch_and_zero(port));
2436                         while (port-- != execlists->pending);
2437
2438                         goto skip_submit;
2439                 }
2440                 clear_ports(port + 1, last_port - port);
2441
2442                 WRITE_ONCE(execlists->yield, -1);
2443                 set_preempt_timeout(engine, *active);
2444                 execlists_submit_ports(engine);
2445         } else {
2446                 start_timeslice(engine, execlists->queue_priority_hint);
2447 skip_submit:
2448                 ring_set_paused(engine, 0);
2449         }
2450 }
2451
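/*
 * Release every request still referenced by the pending and inflight
 * ports (e.g. across a reset), leaving execlists->active pointing at an
 * empty inflight[] array for execlists_active() readers.
 */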
2452 static void
2453 cancel_port_requests(struct intel_engine_execlists * const execlists)
2454 {
2455         struct i915_request * const *port;
2456
2457         for (port = execlists->pending; *port; port++)
2458                 execlists_schedule_out(*port);
2459         clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2460
2461         /* Mark the end of active before we overwrite *active */
2462         for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2463                 execlists_schedule_out(*port);
2464         clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2465
2466         smp_wmb(); /* complete the seqlock for execlists_active() */
2467         WRITE_ONCE(execlists->active, execlists->inflight);
2468 }
2469
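/*
 * Evict the cachelines backing the CSB so that the next read after a GPU
 * update snoops fresh data rather than a stale cached copy (see the Gen11
 * note at the end of process_csb()).
 */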
2470 static inline void
2471 invalidate_csb_entries(const u64 *first, const u64 *last)
2472 {
2473         clflush((void *)first);
2474         clflush((void *)last);
2475 }
2476
2477 /*
2478  * Starting with Gen12, the status has a new format:
2479  *
2480  *     bit  0:     switched to new queue
2481  *     bit  1:     reserved
2482  *     bit  2:     semaphore wait mode (poll or signal), only valid when
2483  *                 switch detail is set to "wait on semaphore"
2484  *     bits 3-5:   engine class
2485  *     bits 6-11:  engine instance
2486  *     bits 12-14: reserved
2487  *     bits 15-25: sw context id of the lrc the GT switched to
2488  *     bits 26-31: sw counter of the lrc the GT switched to
2489  *     bits 32-35: context switch detail
2490  *                  - 0: ctx complete
2491  *                  - 1: wait on sync flip
2492  *                  - 2: wait on vblank
2493  *                  - 3: wait on scanline
2494  *                  - 4: wait on semaphore
2495  *                  - 5: context preempted (not on SEMAPHORE_WAIT or
2496  *                       WAIT_FOR_EVENT)
2497  *     bit  36:    reserved
2498  *     bits 37-43: wait detail (for switch detail 1 to 4)
2499  *     bits 44-46: reserved
2500  *     bits 47-57: sw context id of the lrc the GT switched away from
2501  *     bits 58-63: sw counter of the lrc the GT switched away from
2502  */
2503 static inline bool gen12_csb_parse(const u64 csb)
2504 {
2505         bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_32_bits(csb));
2506         bool new_queue =
2507                 lower_32_bits(csb) & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2508
2509         /*
2510          * The context switch detail is not guaranteed to be 5 when a preemption
2511          * occurs, so we can't just check for that. The check below works for
2512          * all the cases we care about, including preemptions of WAIT
2513          * instructions and lite-restore. Preempt-to-idle via the CTRL register
2514          * would require some extra handling, but we don't support that.
2515          */
2516         if (!ctx_away_valid || new_queue) {
2517                 GEM_BUG_ON(!GEN12_CSB_CTX_VALID(lower_32_bits(csb)));
2518                 return true;
2519         }
2520
2521         /*
2522          * switch detail = 5 is covered by the case above and we do not expect a
2523          * context switch on an unsuccessful wait instruction since we always
2524          * use polling mode.
2525          */
2526         GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_32_bits(csb)));
2527         return false;
2528 }
2529
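/*
 * Pre-Gen12 CSB entries are simpler: promotion of the pending ELSP to
 * active is indicated by either an idle-to-active transition or an
 * explicit preemption bit in the status dword.
 */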
2530 static inline bool gen8_csb_parse(const u64 csb)
2531 {
2532         return csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2533 }
2534
2535 static noinline u64
2536 wa_csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2537 {
2538         u64 entry;
2539
2540         /*
2541          * Reading from the HWSP has one particular advantage: we can detect
2542          * a stale entry. Since the write into HWSP is broken, we have no reason
2543          * to trust the HW at all; the mmio entry may equally be unordered. So
2544          * we prefer the path that is self-checking and, as a last resort,
2545          * return the mmio value.
2546          *
2547          * tgl,dg1:HSDES#22011327657
2548          */
2549         preempt_disable();
2550         if (wait_for_atomic_us((entry = READ_ONCE(*csb)) != -1, 10)) {
2551                 int idx = csb - engine->execlists.csb_status;
2552                 int status;
2553
2554                 status = GEN8_EXECLISTS_STATUS_BUF;
2555                 if (idx >= 6) {
2556                         status = GEN11_EXECLISTS_STATUS_BUF2;
2557                         idx -= 6;
2558                 }
2559                 status += sizeof(u64) * idx;
2560
2561                 entry = intel_uncore_read64(engine->uncore,
2562                                             _MMIO(engine->mmio_base + status));
2563         }
2564         preempt_enable();
2565
2566         return entry;
2567 }
2568
2569 static inline u64
2570 csb_read(const struct intel_engine_cs *engine, u64 * const csb)
2571 {
2572         u64 entry = READ_ONCE(*csb);
2573
2574         /*
2575          * Unfortunately, the GPU does not always serialise its write
2576          * of the CSB entries before its write of the CSB pointer, at least
2577          * from the perspective of the CPU, using what is known as a Global
2578          * Observation Point. We may read a new CSB tail pointer, but then
2579          * read the stale CSB entries, causing us to misinterpret the
2580          * context-switch events, and eventually declare the GPU hung.
2581          *
2582          * icl:HSDES#1806554093
2583          * tgl:HSDES#22011248461
2584          */
2585         if (unlikely(entry == -1))
2586                 entry = wa_csb_read(engine, csb);
2587
2588         /* Consume this entry so that we can spot its future reuse. */
2589         WRITE_ONCE(*csb, -1);
2590
2591         /* ELSP is an implicit wmb() before the GPU wraps and overwrites csb */
2592         return entry;
2593 }
2594
2595 static void process_csb(struct intel_engine_cs *engine)
2596 {
2597         struct intel_engine_execlists * const execlists = &engine->execlists;
2598         u64 * const buf = execlists->csb_status;
2599         const u8 num_entries = execlists->csb_size;
2600         u8 head, tail;
2601
2602         /*
2603          * As we modify our execlists state tracking we require exclusive
2604          * access. Either we are inside the tasklet, or the tasklet is disabled
2605          * and we assume that is only inside the reset paths and so serialised.
2606          */
2607         GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2608                    !reset_in_progress(execlists));
2609         GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2610
2611         /*
2612          * Note that csb_write, csb_status may be either in HWSP or mmio.
2613          * When reading from the csb_write mmio register, we have to be
2614          * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2615          * the low 4 bits. As it happens we know the next 4 bits are always
2616          * zero and so we can simply mask off the low u8 of the register
2617          * and treat it identically to reading from the HWSP (without having
2618          * to use explicit shifting and masking, and probably bifurcating
2619          * the code to handle the legacy mmio read).
2620          */
2621         head = execlists->csb_head;
2622         tail = READ_ONCE(*execlists->csb_write);
2623         if (unlikely(head == tail))
2624                 return;
2625
2626         /*
2627          * We will consume all events from HW, or at least pretend to.
2628          *
2629          * The sequence of events from the HW is deterministic, and derived
2630          * from our writes to the ELSP, with a smidgen of variability for
2631          * the arrival of the asynchronous requests wrt the inflight
2632          * execution. If the HW sends an event that does not correspond with
2633          * the one we are expecting, we have to abandon all hope as we lose
2634          * all tracking of what the engine is actually executing. We will
2635          * only detect we are out of sequence with the HW when we get an
2636          * 'impossible' event because we have already drained our own
2637          * preemption/promotion queue. If this occurs, we know that we likely
2638          * lost track of execution earlier and must unwind and restart; the
2639          * simplest way to do so is to stop processing the event queue and
2640          * force an engine reset.
2641          */
2642         execlists->csb_head = tail;
2643         ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2644
2645         /*
2646          * Hopefully paired with a wmb() in HW!
2647          *
2648          * We must complete the read of the write pointer before any reads
2649          * from the CSB, so that we do not see stale values. Without an rmb
2650          * (lfence) the CPU may speculatively perform the CSB[] reads *before*
2651          * we perform the READ_ONCE(*csb_write).
2652          */
2653         rmb();
2654         do {
2655                 bool promote;
2656                 u64 csb;
2657
2658                 if (++head == num_entries)
2659                         head = 0;
2660
2661                 /*
2662                  * We are flying near dragons again.
2663                  *
2664                  * We hold a reference to the request in execlist_port[]
2665                  * but no more than that. We are operating in softirq
2666                  * context and so cannot hold any mutex or sleep. That
2667                  * prevents us stopping the requests we are processing
2668                  * in port[] from being retired simultaneously (the
2669                  * breadcrumb will be complete before we see the
2670                  * context-switch). As we only hold the reference to the
2671                  * request, any pointer chasing underneath the request
2672                  * is subject to a potential use-after-free. Thus we
2673                  * store all of the bookkeeping within port[] as
2674                  * required, and avoid using unguarded pointers beneath
2675                  * request itself. The same applies to the atomic
2676                  * status notifier.
2677                  */
2678
2679                 csb = csb_read(engine, buf + head);
2680                 ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2681                              head, upper_32_bits(csb), lower_32_bits(csb));
2682
2683                 if (INTEL_GEN(engine->i915) >= 12)
2684                         promote = gen12_csb_parse(csb);
2685                 else
2686                         promote = gen8_csb_parse(csb);
2687                 if (promote) {
2688                         struct i915_request * const *old = execlists->active;
2689
2690                         if (GEM_WARN_ON(!*execlists->pending)) {
2691                                 execlists->error_interrupt |= ERROR_CSB;
2692                                 break;
2693                         }
2694
2695                         ring_set_paused(engine, 0);
2696
2697                         /* Point active to the new ELSP; prevent overwriting */
2698                         WRITE_ONCE(execlists->active, execlists->pending);
2699                         smp_wmb(); /* notify execlists_active() */
2700
2701                         /* cancel old inflight, prepare for switch */
2702                         trace_ports(execlists, "preempted", old);
2703                         while (*old)
2704                                 execlists_schedule_out(*old++);
2705
2706                         /* switch pending to inflight */
2707                         GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2708                         copy_ports(execlists->inflight,
2709                                    execlists->pending,
2710                                    execlists_num_ports(execlists));
2711                         smp_wmb(); /* complete the seqlock */
2712                         WRITE_ONCE(execlists->active, execlists->inflight);
2713
2714                         /* XXX Magic delay for tgl */
2715                         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
2716
2717                         WRITE_ONCE(execlists->pending[0], NULL);
2718                 } else {
2719                         if (GEM_WARN_ON(!*execlists->active)) {
2720                                 execlists->error_interrupt |= ERROR_CSB;
2721                                 break;
2722                         }
2723
2724                         /* port0 completed, advanced to port1 */
2725                         trace_ports(execlists, "completed", execlists->active);
2726
2727                         /*
2728                          * We rely on the hardware being strongly
2729                          * ordered, that the breadcrumb write is
2730                          * coherent (visible from the CPU) before the
2731                          * user interrupt is processed. One might assume
2732                          * that the breadcrumb write lands before the
2733                          * user interrupt, the user interrupt before the
2734                          * CS event for the context switch, and so the
2735                          * breadcrumb would be visible before the CS event itself...
2736                          */
2737                         if (GEM_SHOW_DEBUG() &&
2738                             !i915_request_completed(*execlists->active)) {
2739                                 struct i915_request *rq = *execlists->active;
2740                                 const u32 *regs __maybe_unused =
2741                                         rq->context->lrc_reg_state;
2742
2743                                 ENGINE_TRACE(engine,
2744                                              "context completed before request!\n");
2745                                 ENGINE_TRACE(engine,
2746                                              "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n",
2747                                              ENGINE_READ(engine, RING_START),
2748                                              ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR,
2749                                              ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR,
2750                                              ENGINE_READ(engine, RING_CTL),
2751                                              ENGINE_READ(engine, RING_MI_MODE));
2752                                 ENGINE_TRACE(engine,
2753                                              "rq:{start:%08x, head:%04x, tail:%04x, seqno:%llx:%d, hwsp:%d}, ",
2754                                              i915_ggtt_offset(rq->ring->vma),
2755                                              rq->head, rq->tail,
2756                                              rq->fence.context,
2757                                              lower_32_bits(rq->fence.seqno),
2758                                              hwsp_seqno(rq));
2759                                 ENGINE_TRACE(engine,
2760                                              "ctx:{start:%08x, head:%04x, tail:%04x}, ",
2761                                              regs[CTX_RING_START],
2762                                              regs[CTX_RING_HEAD],
2763                                              regs[CTX_RING_TAIL]);
2764                         }
2765
2766                         execlists_schedule_out(*execlists->active++);
2767
2768                         GEM_BUG_ON(execlists->active - execlists->inflight >
2769                                    execlists_num_ports(execlists));
2770                 }
2771         } while (head != tail);
2772
2773         set_timeslice(engine);
2774
2775         /*
2776          * Gen11 has proven to fail to order the global observation point
2777          * between the entry and tail updates, and thus we can see a stale
2778          * entry in the context status buffer.
2779          *
2780          * Forcibly evict the entries before the next GPU CSB update, to
2781          * increase the odds that we get fresh entries even on non-working
2782          * hardware. The cost of doing so mostly comes out in the wash, as
2783          * the hardware, working or not, will need to do the invalidation
2784          * beforehand.
2785          */
2786         invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2787 }
2788
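/*
 * Pick the next requests to submit to the ELSP, but only if there is no
 * earlier submission still awaiting acknowledgement from the HW
 * (execlists->pending[] is empty). Caller must hold engine->active.lock.
 */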
2789 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2790 {
2791         lockdep_assert_held(&engine->active.lock);
2792         if (!READ_ONCE(engine->execlists.pending[0])) {
2793                 rcu_read_lock(); /* protect peeking at execlists->active */
2794                 execlists_dequeue(engine);
2795                 rcu_read_unlock();
2796         }
2797 }
2798
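/*
 * Move @rq (unsubmitting it first if it is already active on the HW)
 * onto engine->active.hold, along with all of its ready waiters on this
 * engine, so that none of them are considered for execution until they
 * are released again by __execlists_unhold().
 */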
2799 static void __execlists_hold(struct i915_request *rq)
2800 {
2801         LIST_HEAD(list);
2802
2803         do {
2804                 struct i915_dependency *p;
2805
2806                 if (i915_request_is_active(rq))
2807                         __i915_request_unsubmit(rq);
2808
2809                 clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2810                 list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2811                 i915_request_set_hold(rq);
2812                 RQ_TRACE(rq, "on hold\n");
2813
2814                 for_each_waiter(p, rq) {
2815                         struct i915_request *w =
2816                                 container_of(p->waiter, typeof(*w), sched);
2817
2818                         /* Leave semaphores spinning on the other engines */
2819                         if (w->engine != rq->engine)
2820                                 continue;
2821
2822                         if (!i915_request_is_ready(w))
2823                                 continue;
2824
2825                         if (i915_request_completed(w))
2826                                 continue;
2827
2828                         if (i915_request_on_hold(w))
2829                                 continue;
2830
2831                         list_move_tail(&w->sched.link, &list);
2832                 }
2833
2834                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2835         } while (rq);
2836 }
2837
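/*
 * Suspend the given request (and everything that depends on it) from
 * execution on this engine. Used by execlists_capture() so that the hung
 * request cannot be resubmitted and completed while the error capture
 * worker is still looking at it. Returns true if the request was newly
 * placed on hold.
 */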
2838 static bool execlists_hold(struct intel_engine_cs *engine,
2839                            struct i915_request *rq)
2840 {
2841         if (i915_request_on_hold(rq))
2842                 return false;
2843
2844         spin_lock_irq(&engine->active.lock);
2845
2846         if (i915_request_completed(rq)) { /* too late! */
2847                 rq = NULL;
2848                 goto unlock;
2849         }
2850
2851         if (rq->engine != engine) { /* preempted virtual engine */
2852                 struct virtual_engine *ve = to_virtual_engine(rq->engine);
2853
2854                 /*
2855                  * intel_context_inflight() is only protected by virtue
2856                  * of process_csb() being called only by the tasklet (or
2857                  * directly from inside reset while the tasklet is suspended).
2858                  * Assert that neither of those are allowed to run while we
2859                  * poke at the request queues.
2860                  */
2861                 GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2862
2863                 /*
2864                  * An unsubmitted request along a virtual engine will
2865                  * remain on the active (this) engine until we are able
2866                  * to process the context switch away (and so mark the
2867                  * context as no longer in flight). That cannot have happened
2868                  * yet, otherwise we would not be hanging!
2869                  */
2870                 spin_lock(&ve->base.active.lock);
2871                 GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2872                 GEM_BUG_ON(ve->request != rq);
2873                 ve->request = NULL;
2874                 spin_unlock(&ve->base.active.lock);
2875                 i915_request_put(rq);
2876
2877                 rq->engine = engine;
2878         }
2879
2880         /*
2881          * Transfer this request onto the hold queue to prevent it
2882          * being resubmitted to HW (and potentially completed) before we have
2883          * released it. Since we may have already submitted following
2884          * requests, we need to remove those as well.
2885          */
2886         GEM_BUG_ON(i915_request_on_hold(rq));
2887         GEM_BUG_ON(rq->engine != engine);
2888         __execlists_hold(rq);
2889         GEM_BUG_ON(list_empty(&engine->active.hold));
2890
2891 unlock:
2892         spin_unlock_irq(&engine->active.lock);
2893         return rq;
2894 }
2895
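/*
 * Determine whether a not-yet-queued request must also be held back:
 * if any of its signalers on the same engine is currently on hold, then
 * executing @rq now would let it overtake its ancestor.
 */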
2896 static bool hold_request(const struct i915_request *rq)
2897 {
2898         struct i915_dependency *p;
2899         bool result = false;
2900
2901         /*
2902          * If one of our ancestors is on hold, we must also be on hold,
2903          * otherwise we will bypass it and execute before it.
2904          */
2905         rcu_read_lock();
2906         for_each_signaler(p, rq) {
2907                 const struct i915_request *s =
2908                         container_of(p->signaler, typeof(*s), sched);
2909
2910                 if (s->engine != rq->engine)
2911                         continue;
2912
2913                 result = i915_request_on_hold(s);
2914                 if (result)
2915                         break;
2916         }
2917         rcu_read_unlock();
2918
2919         return result;
2920 }
2921
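/*
 * Take @rq off engine->active.hold and back onto the priority queue,
 * together with any of its waiters on this engine that were held and
 * have no other held ancestors; error status is propagated as we go.
 */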
2922 static void __execlists_unhold(struct i915_request *rq)
2923 {
2924         LIST_HEAD(list);
2925
2926         do {
2927                 struct i915_dependency *p;
2928
2929                 RQ_TRACE(rq, "hold release\n");
2930
2931                 GEM_BUG_ON(!i915_request_on_hold(rq));
2932                 GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2933
2934                 i915_request_clear_hold(rq);
2935                 list_move_tail(&rq->sched.link,
2936                                i915_sched_lookup_priolist(rq->engine,
2937                                                           rq_prio(rq)));
2938                 set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2939
2940                 /* Also release any children on this engine that are ready */
2941                 for_each_waiter(p, rq) {
2942                         struct i915_request *w =
2943                                 container_of(p->waiter, typeof(*w), sched);
2944
2945                         /* Propagate any change in error status */
2946                         if (rq->fence.error)
2947                                 i915_request_set_error_once(w, rq->fence.error);
2948
2949                         if (w->engine != rq->engine)
2950                                 continue;
2951
2952                         if (!i915_request_on_hold(w))
2953                                 continue;
2954
2955                         /* Check that no other parents are also on hold */
2956                         if (hold_request(w))
2957                                 continue;
2958
2959                         list_move_tail(&w->sched.link, &list);
2960                 }
2961
2962                 rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2963         } while (rq);
2964 }
2965
2966 static void execlists_unhold(struct intel_engine_cs *engine,
2967                              struct i915_request *rq)
2968 {
2969         spin_lock_irq(&engine->active.lock);
2970
2971         /*
2972          * Move this request back to the priority queue, and all of its
2973          * children and grandchildren that were suspended along with it.
2974          */
2975         __execlists_unhold(rq);
2976
2977         if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2978                 engine->execlists.queue_priority_hint = rq_prio(rq);
2979                 tasklet_hi_schedule(&engine->execlists.tasklet);
2980         }
2981
2982         spin_unlock_irq(&engine->active.lock);
2983 }
2984
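/*
 * State handed from execlists_capture() to execlists_capture_work(): the
 * hung request we have put on hold and the preallocated coredump that the
 * worker will fill in and publish.
 */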
2985 struct execlists_capture {
2986         struct work_struct work;
2987         struct i915_request *rq;
2988         struct i915_gpu_coredump *error;
2989 };
2990
2991 static void execlists_capture_work(struct work_struct *work)
2992 {
2993         struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2994         const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2995         struct intel_engine_cs *engine = cap->rq->engine;
2996         struct intel_gt_coredump *gt = cap->error->gt;
2997         struct intel_engine_capture_vma *vma;
2998
2999         /* Compress all the objects attached to the request, slow! */
3000         vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
3001         if (vma) {
3002                 struct i915_vma_compress *compress =
3003                         i915_vma_capture_prepare(gt);
3004
3005                 intel_engine_coredump_add_vma(gt->engine, vma, compress);
3006                 i915_vma_capture_finish(gt, compress);
3007         }
3008
3009         gt->simulated = gt->engine->simulated;
3010         cap->error->simulated = gt->simulated;
3011
3012         /* Publish the error state, and announce it to the world */
3013         i915_error_state_store(cap->error);
3014         i915_gpu_coredump_put(cap->error);
3015
3016         /* Return this request and all that depend upon it for signaling */
3017         execlists_unhold(engine, cap->rq);
3018         i915_request_put(cap->rq);
3019
3020         kfree(cap);
3021 }
3022
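/*
 * Preallocate the coredump skeleton for the hung engine. We are called
 * from inside the softirq, just before the forced preemption reset, so
 * only atomic (and quiet) allocations can be used here.
 */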
3023 static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
3024 {
3025         const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
3026         struct execlists_capture *cap;
3027
3028         cap = kmalloc(sizeof(*cap), gfp);
3029         if (!cap)
3030                 return NULL;
3031
3032         cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
3033         if (!cap->error)
3034                 goto err_cap;
3035
3036         cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
3037         if (!cap->error->gt)
3038                 goto err_gpu;
3039
3040         cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
3041         if (!cap->error->gt->engine)
3042                 goto err_gt;
3043
3044         cap->error->gt->engine->hung = true;
3045
3046         return cap;
3047
3048 err_gt:
3049         kfree(cap->error->gt);
3050 err_gpu:
3051         kfree(cap->error);
3052 err_cap:
3053         kfree(cap);
3054         return NULL;
3055 }
3056
3057 static struct i915_request *
3058 active_context(struct intel_engine_cs *engine, u32 ccid)
3059 {
3060         const struct intel_engine_execlists * const el = &engine->execlists;
3061         struct i915_request * const *port, *rq;
3062
3063         /*
3064          * Use the most recent result from process_csb(), but just in case
3065          * we trigger an error (via interrupt) before the first CS event has
3066          * been written, peek at the next submission.
3067          */
3068
3069         for (port = el->active; (rq = *port); port++) {
3070                 if (rq->context->lrc.ccid == ccid) {
3071                         ENGINE_TRACE(engine,
3072                                      "ccid found at active:%zd\n",
3073                                      port - el->active);
3074                         return rq;
3075                 }
3076         }
3077
3078         for (port = el->pending; (rq = *port); port++) {
3079                 if (rq->context->lrc.ccid == ccid) {
3080                         ENGINE_TRACE(engine,
3081                                      "ccid found at pending:%zd\n",
3082                                      port - el->pending);
3083                         return rq;
3084                 }
3085         }
3086
3087         ENGINE_TRACE(engine, "ccid:%x not found\n", ccid);
3088         return NULL;
3089 }
3090
3091 static u32 active_ccid(struct intel_engine_cs *engine)
3092 {
3093         return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI);
3094 }
3095
3096 static void execlists_capture(struct intel_engine_cs *engine)
3097 {
3098         struct execlists_capture *cap;
3099
3100         if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
3101                 return;
3102
3103         /*
3104          * We need to _quickly_ capture the engine state before we reset.
3105          * We are inside an atomic section (softirq) here and we are delaying
3106          * the forced preemption event.
3107          */
3108         cap = capture_regs(engine);
3109         if (!cap)
3110                 return;
3111
3112         spin_lock_irq(&engine->active.lock);
3113         cap->rq = active_context(engine, active_ccid(engine));
3114         if (cap->rq) {
3115                 cap->rq = active_request(cap->rq->context->timeline, cap->rq);
3116                 cap->rq = i915_request_get_rcu(cap->rq);
3117         }
3118         spin_unlock_irq(&engine->active.lock);
3119         if (!cap->rq)
3120                 goto err_free;
3121
3122         /*
3123          * Remove the request from the execlists queue, and take ownership
3124          * of the request. We pass it to our worker who will _slowly_ compress
3125          * all the pages the _user_ requested for debugging their batch, after
3126          * which we return it to the queue for signaling.
3127          *
3128          * By removing them from the execlists queue, we also prevent the
3129          * requests from being processed by __unwind_incomplete_requests()
3130          * during the intel_engine_reset(), and so they will *not* be replayed
3131          * afterwards.
3132          *
3133          * Note that because we have not yet reset the engine at this point,
3134          * it is possible that the request we have identified as being
3135          * guilty did in fact complete, and we will then hit an arbitration
3136          * point allowing the outstanding preemption to succeed. The likelihood
3137          * of that is very low (as capturing of the engine registers should be
3138          * fast enough to run inside an irq-off atomic section!), so we will
3139          * simply hold that request accountable for being non-preemptible
3140          * long enough to force the reset.
3141          */
3142         if (!execlists_hold(engine, cap->rq))
3143                 goto err_rq;
3144
3145         INIT_WORK(&cap->work, execlists_capture_work);
3146         schedule_work(&cap->work);
3147         return;
3148
3149 err_rq:
3150         i915_request_put(cap->rq);
3151 err_free:
3152         i915_gpu_coredump_put(cap->error);
3153         kfree(cap);
3154 }
3155
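/*
 * Perform an engine reset in response to an error interrupt or a
 * preemption timeout: freeze the current request, capture its state,
 * then reset the engine. The per-engine bit in gt->reset.flags
 * serialises this against any other reset already in progress.
 */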
3156 static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
3157 {
3158         const unsigned int bit = I915_RESET_ENGINE + engine->id;
3159         unsigned long *lock = &engine->gt->reset.flags;
3160
3161         if (!intel_has_reset_engine(engine->gt))
3162                 return;
3163
3164         if (test_and_set_bit(bit, lock))
3165                 return;
3166
3167         ENGINE_TRACE(engine, "reset for %s\n", msg);
3168
3169         /* Mark this tasklet as disabled to avoid waiting for it to complete */
3170         tasklet_disable_nosync(&engine->execlists.tasklet);
3171
3172         ring_set_paused(engine, 1); /* Freeze the current request in place */
3173         execlists_capture(engine);
3174         intel_engine_reset(engine, msg);
3175
3176         tasklet_enable(&engine->execlists.tasklet);
3177         clear_and_wake_up_bit(bit, lock);
3178 }
3179
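/*
 * Has the preemption timer expired while a submission is still awaiting
 * acknowledgement from the HW? Always false if no preempt timeout is
 * configured (CONFIG_DRM_I915_PREEMPT_TIMEOUT is 0).
 */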
3180 static bool preempt_timeout(const struct intel_engine_cs *const engine)
3181 {
3182         const struct timer_list *t = &engine->execlists.preempt;
3183
3184         if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
3185                 return false;
3186
3187         if (!timer_expired(t))
3188                 return false;
3189
3190         return READ_ONCE(engine->execlists.pending[0]);
3191 }
3192
3193 /*
3194  * Check the unread Context Status Buffers and manage the submission of new
3195  * contexts to the ELSP accordingly.
3196  */
3197 static void execlists_submission_tasklet(unsigned long data)
3198 {
3199         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
3200         bool timeout = preempt_timeout(engine);
3201
3202         process_csb(engine);
3203
3204         if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3205                 const char *msg;
3206
3207                 /* Generate the error message in priority order wrt the user! */
3208                 if (engine->execlists.error_interrupt & GENMASK(15, 0))
3209                         msg = "CS error"; /* thrown by a user payload */
3210                 else if (engine->execlists.error_interrupt & ERROR_CSB)
3211                         msg = "invalid CSB event";
3212                 else
3213                         msg = "internal error";
3214
3215                 engine->execlists.error_interrupt = 0;
3216                 execlists_reset(engine, msg);
3217         }
3218
3219         if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
3220                 unsigned long flags;
3221
3222                 spin_lock_irqsave(&engine->active.lock, flags);
3223                 __execlists_submission_tasklet(engine);
3224                 spin_unlock_irqrestore(&engine->active.lock, flags);
3225
3226                 /* Recheck after serialising with direct-submission */
3227                 if (unlikely(timeout && preempt_timeout(engine))) {
3228                         cancel_timer(&engine->execlists.preempt);
3229                         execlists_reset(engine, "preemption timeout");
3230                 }
3231         }
3232 }
3233
3234 static void __execlists_kick(struct intel_engine_execlists *execlists)
3235 {
3236         /* Kick the tasklet for some interrupt coalescing and reset handling */
3237         tasklet_hi_schedule(&execlists->tasklet);
3238 }
3239
3240 #define execlists_kick(t, member) \
3241         __execlists_kick(container_of(t, struct intel_engine_execlists, member))
3242
3243 static void execlists_timeslice(struct timer_list *timer)
3244 {
3245         execlists_kick(timer, timer);
3246 }
3247
3248 static void execlists_preempt(struct timer_list *timer)
3249 {
3250         execlists_kick(timer, preempt);
3251 }
3252
3253 static void queue_request(struct intel_engine_cs *engine,
3254                           struct i915_request *rq)
3255 {
3256         GEM_BUG_ON(!list_empty(&rq->sched.link));
3257         list_add_tail(&rq->sched.link,
3258                       i915_sched_lookup_priolist(engine, rq_prio(rq)));
3259         set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
3260 }
3261
3262 static void __submit_queue_imm(struct intel_engine_cs *engine)
3263 {
3264         struct intel_engine_execlists * const execlists = &engine->execlists;
3265
3266         if (reset_in_progress(execlists))
3267                 return; /* defer until we restart the engine following reset */
3268
3269         __execlists_submission_tasklet(engine);
3270 }
3271
3272 static void submit_queue(struct intel_engine_cs *engine,
3273                          const struct i915_request *rq)
3274 {
3275         struct intel_engine_execlists *execlists = &engine->execlists;
3276
3277         if (rq_prio(rq) <= execlists->queue_priority_hint)
3278                 return;
3279
3280         execlists->queue_priority_hint = rq_prio(rq);
3281         __submit_queue_imm(engine);
3282 }
3283
3284 static bool ancestor_on_hold(const struct intel_engine_cs *engine,
3285                              const struct i915_request *rq)
3286 {
3287         GEM_BUG_ON(i915_request_on_hold(rq));
3288         return !list_empty(&engine->active.hold) && hold_request(rq);
3289 }
3290
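/*
 * Opportunistically drain any outstanding CSB events in the hope of
 * clearing execlists->pending[] before we try to submit. Only done if we
 * can grab the tasklet lock without contention and no reset is in
 * progress.
 */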
3291 static void flush_csb(struct intel_engine_cs *engine)
3292 {
3293         struct intel_engine_execlists *el = &engine->execlists;
3294
3295         if (READ_ONCE(el->pending[0]) && tasklet_trylock(&el->tasklet)) {
3296                 if (!reset_in_progress(el))
3297                         process_csb(engine);
3298                 tasklet_unlock(&el->tasklet);
3299         }
3300 }
3301
3302 static void execlists_submit_request(struct i915_request *request)
3303 {
3304         struct intel_engine_cs *engine = request->engine;
3305         unsigned long flags;
3306
3307         /* Hopefully we clear execlists->pending[] to let us through */
3308         flush_csb(engine);
3309
3310         /* Will be called from irq-context when using foreign fences. */
3311         spin_lock_irqsave(&engine->active.lock, flags);
3312
3313         if (unlikely(ancestor_on_hold(engine, request))) {
3314                 RQ_TRACE(request, "ancestor on hold\n");
3315                 list_add_tail(&request->sched.link, &engine->active.hold);
3316                 i915_request_set_hold(request);
3317         } else {
3318                 queue_request(engine, request);
3319
3320                 GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
3321                 GEM_BUG_ON(list_empty(&request->sched.link));
3322
3323                 submit_queue(engine, request);
3324         }
3325
3326         spin_unlock_irqrestore(&engine->active.lock, flags);
3327 }
3328
3329 static void __execlists_context_fini(struct intel_context *ce)
3330 {
3331         intel_ring_put(ce->ring);
3332         i915_vma_put(ce->state);
3333 }
3334
3335 static void execlists_context_destroy(struct kref *kref)
3336 {
3337         struct intel_context *ce = container_of(kref, typeof(*ce), ref);
3338
3339         GEM_BUG_ON(!i915_active_is_idle(&ce->active));
3340         GEM_BUG_ON(intel_context_is_pinned(ce));
3341
3342         if (ce->state)
3343                 __execlists_context_fini(ce);
3344
3345         intel_context_fini(ce);
3346         intel_context_free(ce);
3347 }
3348
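/*
 * Under CONFIG_DRM_I915_DEBUG_GEM, poison the page following the context
 * image so that check_redzone() can detect anything writing past the end
 * of the context state.
 */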
3349 static void
3350 set_redzone(void *vaddr, const struct intel_engine_cs *engine)
3351 {
3352         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3353                 return;
3354
3355         vaddr += engine->context_size;
3356
3357         memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
3358 }
3359
3360 static void
3361 check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
3362 {
3363         if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
3364                 return;
3365
3366         vaddr += engine->context_size;
3367
3368         if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
3369                 drm_err_once(&engine->i915->drm,
3370                              "%s context redzone overwritten!\n",
3371                              engine->name);
3372 }
3373
3374 static void execlists_context_unpin(struct intel_context *ce)
3375 {
3376         check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET,
3377                       ce->engine);
3378 }
3379
3380 static void execlists_context_post_unpin(struct intel_context *ce)
3381 {
3382         i915_gem_object_unpin_map(ce->state->obj);
3383 }
3384
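/*
 * Gen12 workaround: reload CTX_TIMESTAMP from the context image into
 * CS GPR0 and copy it into the RING_CTX_TIMESTAMP register (the
 * register-to-register copy is issued twice).
 */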
3385 static u32 *
3386 gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs)
3387 {
3388         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3389                 MI_SRM_LRM_GLOBAL_GTT |
3390                 MI_LRI_LRM_CS_MMIO;
3391         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3392         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3393                 CTX_TIMESTAMP * sizeof(u32);
3394         *cs++ = 0;
3395
3396         *cs++ = MI_LOAD_REGISTER_REG |
3397                 MI_LRR_SOURCE_CS_MMIO |
3398                 MI_LRI_LRM_CS_MMIO;
3399         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3400         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3401
3402         *cs++ = MI_LOAD_REGISTER_REG |
3403                 MI_LRR_SOURCE_CS_MMIO |
3404                 MI_LRI_LRM_CS_MMIO;
3405         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3406         *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0));
3407
3408         return cs;
3409 }
3410
3411 static u32 *
3412 gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs)
3413 {
3414         GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1);
3415
3416         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3417                 MI_SRM_LRM_GLOBAL_GTT |
3418                 MI_LRI_LRM_CS_MMIO;
3419         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3420         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3421                 (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32);
3422         *cs++ = 0;
3423
3424         return cs;
3425 }
3426
3427 static u32 *
3428 gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs)
3429 {
3430         GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == -1);
3431
3432         *cs++ = MI_LOAD_REGISTER_MEM_GEN8 |
3433                 MI_SRM_LRM_GLOBAL_GTT |
3434                 MI_LRI_LRM_CS_MMIO;
3435         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3436         *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET +
3437                 (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32);
3438         *cs++ = 0;
3439
3440         *cs++ = MI_LOAD_REGISTER_REG |
3441                 MI_LRR_SOURCE_CS_MMIO |
3442                 MI_LRI_LRM_CS_MMIO;
3443         *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0));
3444         *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0));
3445
3446         return cs;
3447 }
3448
3449 static u32 *
3450 gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs)
3451 {
3452         cs = gen12_emit_timestamp_wa(ce, cs);
3453         cs = gen12_emit_cmd_buf_wa(ce, cs);
3454         cs = gen12_emit_restore_scratch(ce, cs);
3455
3456         return cs;
3457 }
3458
3459 static u32 *
3460 gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs)
3461 {
3462         cs = gen12_emit_timestamp_wa(ce, cs);
3463         cs = gen12_emit_restore_scratch(ce, cs);
3464
3465         return cs;
3466 }
3467
3468 static inline u32 context_wa_bb_offset(const struct intel_context *ce)
3469 {
3470         return PAGE_SIZE * ce->wa_bb_page;
3471 }
3472
3473 static u32 *context_indirect_bb(const struct intel_context *ce)
3474 {
3475         void *ptr;
3476
3477         GEM_BUG_ON(!ce->wa_bb_page);
3478
3479         ptr = ce->lrc_reg_state;
3480         ptr -= LRC_STATE_OFFSET; /* back to start of context image */
3481         ptr += context_wa_bb_offset(ce);
3482
3483         return ptr;
3484 }
3485
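/*
 * Write the per-context indirect context batch using the supplied emit()
 * callback, pad it to a cacheline boundary with MI_NOOPs, and point the
 * INDIRECT_CTX registers in the context image at the resulting buffer.
 */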
3486 static void
3487 setup_indirect_ctx_bb(const struct intel_context *ce,
3488                       const struct intel_engine_cs *engine,
3489                       u32 *(*emit)(const struct intel_context *, u32 *))
3490 {
3491         u32 * const start = context_indirect_bb(ce);
3492         u32 *cs;
3493
3494         cs = emit(ce, start);
3495         GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs));
3496         while ((unsigned long)cs % CACHELINE_BYTES)
3497                 *cs++ = MI_NOOP;
3498
3499         lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine,
3500                                     i915_ggtt_offset(ce->state) +
3501                                     context_wa_bb_offset(ce),
3502                                     (cs - start) * sizeof(*cs));
3503 }
3504
3505 static void
3506 __execlists_update_reg_state(const struct intel_context *ce,
3507                              const struct intel_engine_cs *engine,
3508                              u32 head)
3509 {
3510         struct intel_ring *ring = ce->ring;
3511         u32 *regs = ce->lrc_reg_state;
3512
3513         GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
3514         GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
3515
3516         regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
3517         regs[CTX_RING_HEAD] = head;
3518         regs[CTX_RING_TAIL] = ring->tail;
3519         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
3520
3521         /* RPCS */
3522         if (engine->class == RENDER_CLASS) {
3523                 regs[CTX_R_PWR_CLK_STATE] =
3524                         intel_sseu_make_rpcs(engine->gt, &ce->sseu);
3525
3526                 i915_oa_init_reg_state(ce, engine);
3527         }
3528
3529         if (ce->wa_bb_page) {
3530                 u32 *(*fn)(const struct intel_context *ce, u32 *cs);
3531
3532                 fn = gen12_emit_indirect_ctx_xcs;
3533                 if (ce->engine->class == RENDER_CLASS)
3534                         fn = gen12_emit_indirect_ctx_rcs;
3535
3536                 /* Mutually exclusive wrt the global indirect bb */
3537                 GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size);
3538                 setup_indirect_ctx_bb(ce, engine, fn);
3539         }
3540 }
3541
3542 static int
3543 execlists_context_pre_pin(struct intel_context *ce,
3544                           struct i915_gem_ww_ctx *ww, void **vaddr)
3545 {
3546         GEM_BUG_ON(!ce->state);
3547         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3548
3549         *vaddr = i915_gem_object_pin_map(ce->state->obj,
3550                                         i915_coherent_map_type(ce->engine->i915) |
3551                                         I915_MAP_OVERRIDE);
3552
3553         return PTR_ERR_OR_ZERO(*vaddr);
3554 }
3555
3556 static int
3557 __execlists_context_pin(struct intel_context *ce,
3558                         struct intel_engine_cs *engine,
3559                         void *vaddr)
3560 {
3561         ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
3562         ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET;
3563         __execlists_update_reg_state(ce, engine, ce->ring->tail);
3564
3565         return 0;
3566 }
3567
3568 static int execlists_context_pin(struct intel_context *ce, void *vaddr)
3569 {
3570         return __execlists_context_pin(ce, ce->engine, vaddr);
3571 }
3572
3573 static int execlists_context_alloc(struct intel_context *ce)
3574 {
3575         return __execlists_context_alloc(ce, ce->engine);
3576 }
3577
3578 static void execlists_context_reset(struct intel_context *ce)
3579 {
3580         CE_TRACE(ce, "reset\n");
3581         GEM_BUG_ON(!intel_context_is_pinned(ce));
3582
3583         intel_ring_reset(ce->ring, ce->ring->emit);
3584
3585         /* Scrub away the garbage */
3586         execlists_init_reg_state(ce->lrc_reg_state,
3587                                  ce, ce->engine, ce->ring, true);
3588         __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
3589
3590         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE;
3591 }
3592
3593 static const struct intel_context_ops execlists_context_ops = {
3594         .alloc = execlists_context_alloc,
3595
3596         .pre_pin = execlists_context_pre_pin,
3597         .pin = execlists_context_pin,
3598         .unpin = execlists_context_unpin,
3599         .post_unpin = execlists_context_post_unpin,
3600
3601         .enter = intel_context_enter_engine,
3602         .exit = intel_context_exit_engine,
3603
3604         .reset = execlists_context_reset,
3605         .destroy = execlists_context_destroy,
3606 };
3607
3608 static u32 hwsp_offset(const struct i915_request *rq)
3609 {
3610         const struct intel_timeline_cacheline *cl;
3611
3612         /* Before the request is executed, the timeline/cacheline is fixed */
3613
3614         cl = rcu_dereference_protected(rq->hwsp_cacheline, 1);
3615         if (cl)
3616                 return cl->ggtt_offset;
3617
3618         return rcu_dereference_protected(rq->timeline, 1)->hwsp_offset;
3619 }
3620
3621 static int gen8_emit_init_breadcrumb(struct i915_request *rq)
3622 {
3623         u32 *cs;
3624
3625         GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq));
3626         if (!i915_request_timeline(rq)->has_initial_breadcrumb)
3627                 return 0;
3628
3629         cs = intel_ring_begin(rq, 6);
3630         if (IS_ERR(cs))
3631                 return PTR_ERR(cs);
3632
3633         /*
3634          * Check if we have been preempted before we even get started.
3635          *
3636          * After this point i915_request_started() reports true, even if
3637          * we get preempted and so are no longer running.
3638          */
3639         *cs++ = MI_ARB_CHECK;
3640         *cs++ = MI_NOOP;
3641
3642         *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
3643         *cs++ = hwsp_offset(rq);
3644         *cs++ = 0;
3645         *cs++ = rq->fence.seqno - 1;
3646
3647         intel_ring_advance(rq, cs);
3648
3649         /* Record the updated position of the request's payload */
3650         rq->infix = intel_ring_offset(rq, cs);
3651
3652         __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags);
3653
3654         return 0;
3655 }
3656
3657 static int emit_pdps(struct i915_request *rq)
3658 {
3659         const struct intel_engine_cs * const engine = rq->engine;
3660         struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm);
3661         int err, i;
3662         u32 *cs;
3663
3664         GEM_BUG_ON(intel_vgpu_active(rq->engine->i915));
3665
3666         /*
3667          * Beware ye of the dragons, this sequence is magic!
3668          *
3669          * Small changes to this sequence can cause anything from
3670          * GPU hangs to forcewake errors and machine lockups!
3671          */
3672
3673         /* Flush any residual operations from the context load */
3674         err = engine->emit_flush(rq, EMIT_FLUSH);
3675         if (err)
3676                 return err;
3677
3678         /* Magic required to prevent forcewake errors! */
3679         err = engine->emit_flush(rq, EMIT_INVALIDATE);
3680         if (err)
3681                 return err;
3682
3683         cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2);
3684         if (IS_ERR(cs))
3685                 return PTR_ERR(cs);
3686
3687         /* Ensure the LRI have landed before we invalidate & continue */
3688         *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
3689         for (i = GEN8_3LVL_PDPES; i--; ) {
3690                 const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
3691                 u32 base = engine->mmio_base;
3692
3693                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i));
3694                 *cs++ = upper_32_bits(pd_daddr);
3695                 *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i));
3696                 *cs++ = lower_32_bits(pd_daddr);
3697         }
3698         *cs++ = MI_NOOP;
3699
3700         intel_ring_advance(rq, cs);
3701
3702         return 0;
3703 }
3704
3705 static int execlists_request_alloc(struct i915_request *request)
3706 {
3707         int ret;
3708
3709         GEM_BUG_ON(!intel_context_is_pinned(request->context));
3710
3711         /*
3712          * Flush enough space to reduce the likelihood of waiting after
3713          * we start building the request - in which case we will just
3714          * have to repeat work.
3715          */
3716         request->reserved_space += EXECLISTS_REQUEST_SIZE;
3717
3718         /*
3719          * Note that after this point, we have committed to using
3720          * this request as it is being used to both track the
3721          * state of engine initialisation and liveness of the
3722          * golden renderstate above. Think twice before you try
3723          * to cancel/unwind this request now.
3724          */
3725
3726         if (!i915_vm_is_4lvl(request->context->vm)) {
3727                 ret = emit_pdps(request);
3728                 if (ret)
3729                         return ret;
3730         }
3731
3732         /* Unconditionally invalidate GPU caches and TLBs. */
3733         ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3734         if (ret)
3735                 return ret;
3736
3737         request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3738         return 0;
3739 }
3740
3741 /*
3742  * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3743  * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3744  * but there is a slight complication: this is applied in a WA batch where the
3745  * values are only initialized once, so we cannot read the register value at the
3746  * beginning and reuse it later; hence we save its value to memory, upload a
3747  * constant value with bit21 set and then restore it with the saved value.
3748  * To simplify the WA, the constant value is formed from the default value
3749  * of this register. This shouldn't be a problem because we are only modifying
3750  * it for a short period and this batch is non-preemptible. We could of course
3751  * use additional instructions that read the actual value of the register
3752  * at that time and set our bit of interest, but that makes the WA complicated.
3753  *
3754  * This WA is also required for Gen9 so extracting as a function avoids
3755  * code duplication.
3756  */
3757 static u32 *
3758 gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3759 {
3760         /* NB no one else is allowed to scribble over scratch + 256! */
3761         *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3762         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3763         *batch++ = intel_gt_scratch_offset(engine->gt,
3764                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3765         *batch++ = 0;
3766
3767         *batch++ = MI_LOAD_REGISTER_IMM(1);
3768         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3769         *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3770
3771         batch = gen8_emit_pipe_control(batch,
3772                                        PIPE_CONTROL_CS_STALL |
3773                                        PIPE_CONTROL_DC_FLUSH_ENABLE,
3774                                        0);
3775
3776         *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3777         *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3778         *batch++ = intel_gt_scratch_offset(engine->gt,
3779                                            INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3780         *batch++ = 0;
3781
3782         return batch;
3783 }
3784
3785 /*
3786  * Typically we only have one indirect_ctx and one per_ctx batch buffer, which
3787  * are initialized at the beginning and shared across all contexts, but this
3788  * field lets us keep multiple batches at different offsets and select between
3789  * them. At the moment the batch always starts at the beginning of the page
3790  * and we don't yet have multiple wa_ctx batch buffers.
3791  *
3792  * The number of WAs applied is not known at the beginning; we use this field
3793  * to return the number of DWORDS written.
3794  *
3795  * Note that this batch does not contain MI_BATCH_BUFFER_END,
3796  * so NOOPs are added as padding to make it cacheline aligned.
3797  * MI_BATCH_BUFFER_END will be added to the per_ctx batch, and the two together
3798  * make a complete batch buffer.
3799  */
3800 static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3801 {
3802         /* WaDisableCtxRestoreArbitration:bdw,chv */
3803         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3804
3805         /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3806         if (IS_BROADWELL(engine->i915))
3807                 batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3808
3809         /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3810         /* Actual scratch location is at 128 bytes offset */
3811         batch = gen8_emit_pipe_control(batch,
3812                                        PIPE_CONTROL_FLUSH_L3 |
3813                                        PIPE_CONTROL_STORE_DATA_INDEX |
3814                                        PIPE_CONTROL_CS_STALL |
3815                                        PIPE_CONTROL_QW_WRITE,
3816                                        LRC_PPHWSP_SCRATCH_ADDR);
3817
3818         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3819
3820         /* Pad to end of cacheline */
3821         while ((unsigned long)batch % CACHELINE_BYTES)
3822                 *batch++ = MI_NOOP;
3823
3824         /*
3825          * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3826          * execution depends on the length specified in terms of cache lines
3827          * in the register CTX_RCS_INDIRECT_CTX
3828          */
3829
3830         return batch;
3831 }
3832
3833 struct lri {
3834         i915_reg_t reg;
3835         u32 value;
3836 };
3837
3838 static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3839 {
3840         GEM_BUG_ON(!count || count > 63);
3841
3842         *batch++ = MI_LOAD_REGISTER_IMM(count);
3843         do {
3844                 *batch++ = i915_mmio_reg_offset(lri->reg);
3845                 *batch++ = lri->value;
3846         } while (lri++, --count);
3847         *batch++ = MI_NOOP;
3848
3849         return batch;
3850 }
3851
3852 static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3853 {
3854         static const struct lri lri[] = {
3855                 /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3856                 {
3857                         COMMON_SLICE_CHICKEN2,
3858                         __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3859                                        0),
3860                 },
3861
3862                 /* BSpec: 11391 */
3863                 {
3864                         FF_SLICE_CHICKEN,
3865                         __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3866                                        FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3867                 },
3868
3869                 /* BSpec: 11299 */
3870                 {
3871                         _3D_CHICKEN3,
3872                         __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3873                                        _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3874                 }
3875         };
3876
3877         *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3878
3879         /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3880         batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3881
3882         /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3883         batch = gen8_emit_pipe_control(batch,
3884                                        PIPE_CONTROL_FLUSH_L3 |
3885                                        PIPE_CONTROL_STORE_DATA_INDEX |
3886                                        PIPE_CONTROL_CS_STALL |
3887                                        PIPE_CONTROL_QW_WRITE,
3888                                        LRC_PPHWSP_SCRATCH_ADDR);
3889
3890         batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3891
3892         /* WaMediaPoolStateCmdInWABB:bxt,glk */
3893         if (HAS_POOLED_EU(engine->i915)) {
3894                 /*
3895                  * EU pool configuration is set up along with the golden
3896                  * context during context initialization. This value depends
3897                  * on the device type (2x6 or 3x6) and needs to be updated
3898                  * based on which subslice is disabled, especially for 2x6
3899                  * devices; however, it is safe to load the default
3900                  * configuration of a 3x6 device instead of masking off the
3901                  * corresponding bits, because the HW ignores the bits of a
3902                  * disabled subslice and drops down to the appropriate config.
3903                  * See render_state_setup() in i915_gem_render_state.c for the
3904                  * possible configurations; to avoid duplication they are
3905                  * not shown here again.
3906                  */
3907                 *batch++ = GEN9_MEDIA_POOL_STATE;
3908                 *batch++ = GEN9_MEDIA_POOL_ENABLE;
3909                 *batch++ = 0x00777000;
3910                 *batch++ = 0;
3911                 *batch++ = 0;
3912                 *batch++ = 0;
3913         }
3914
3915         *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3916
3917         /* Pad to end of cacheline */
3918         while ((unsigned long)batch % CACHELINE_BYTES)
3919                 *batch++ = MI_NOOP;
3920
3921         return batch;
3922 }
3923
3924 static u32 *
3925 gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3926 {
3927         int i;
3928
3929         /*
3930          * WaPipeControlBefore3DStateSamplePattern: cnl
3931          *
3932          * Ensure the engine is idle prior to programming a
3933          * 3DSTATE_SAMPLE_PATTERN during a context restore.
3934          */
3935         batch = gen8_emit_pipe_control(batch,
3936                                        PIPE_CONTROL_CS_STALL,
3937                                        0);
3938         /*
3939          * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3940          * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3941          * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3942          * confusing. Since gen8_emit_pipe_control() already advances the
3943          * batch by 6 dwords, we advance the other 10 here, completing a
3944          * cacheline. It's not clear if the workaround requires this padding
3945          * before other commands, or if it's just the regular padding we would
3946          * already have for the workaround bb, so leave it here for now.
3947          */
3948         for (i = 0; i < 10; i++)
3949                 *batch++ = MI_NOOP;
3950
3951         /* Pad to end of cacheline */
3952         while ((unsigned long)batch % CACHELINE_BYTES)
3953                 *batch++ = MI_NOOP;
3954
3955         return batch;
3956 }
3957
3958 #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3959
3960 static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3961 {
3962         struct drm_i915_gem_object *obj;
3963         struct i915_vma *vma;
3964         int err;
3965
3966         obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3967         if (IS_ERR(obj))
3968                 return PTR_ERR(obj);
3969
3970         vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3971         if (IS_ERR(vma)) {
3972                 err = PTR_ERR(vma);
3973                 goto err;
3974         }
3975
3976         err = i915_ggtt_pin(vma, NULL, 0, PIN_HIGH);
3977         if (err)
3978                 goto err;
3979
3980         engine->wa_ctx.vma = vma;
3981         return 0;
3982
3983 err:
3984         i915_gem_object_put(obj);
3985         return err;
3986 }
3987
3988 static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3989 {
3990         i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3991
3992         /* Called on error unwind, clear all flags to prevent further use */
3993         memset(&engine->wa_ctx, 0, sizeof(engine->wa_ctx));
3994 }
3995
3996 typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3997
3998 static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3999 {
4000         struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
4001         struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
4002                                             &wa_ctx->per_ctx };
4003         wa_bb_func_t wa_bb_fn[2];
4004         void *batch, *batch_ptr;
4005         unsigned int i;
4006         int ret;
4007
4008         if (engine->class != RENDER_CLASS)
4009                 return 0;
4010
4011         switch (INTEL_GEN(engine->i915)) {
4012         case 12:
4013         case 11:
4014                 return 0;
4015         case 10:
4016                 wa_bb_fn[0] = gen10_init_indirectctx_bb;
4017                 wa_bb_fn[1] = NULL;
4018                 break;
4019         case 9:
4020                 wa_bb_fn[0] = gen9_init_indirectctx_bb;
4021                 wa_bb_fn[1] = NULL;
4022                 break;
4023         case 8:
4024                 wa_bb_fn[0] = gen8_init_indirectctx_bb;
4025                 wa_bb_fn[1] = NULL;
4026                 break;
4027         default:
4028                 MISSING_CASE(INTEL_GEN(engine->i915));
4029                 return 0;
4030         }
4031
4032         ret = lrc_setup_wa_ctx(engine);
4033         if (ret) {
4034                 drm_dbg(&engine->i915->drm,
4035                         "Failed to setup context WA page: %d\n", ret);
4036                 return ret;
4037         }
4038
4039         batch = i915_gem_object_pin_map(wa_ctx->vma->obj, I915_MAP_WB);
4040
4041         /*
4042          * Emit the two workaround batch buffers, recording the offset from the
4043          * start of the workaround batch buffer object for each and their
4044          * respective sizes.
4045          */
4046         batch_ptr = batch;
4047         for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
4048                 wa_bb[i]->offset = batch_ptr - batch;
4049                 if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
4050                                                   CACHELINE_BYTES))) {
4051                         ret = -EINVAL;
4052                         break;
4053                 }
4054                 if (wa_bb_fn[i])
4055                         batch_ptr = wa_bb_fn[i](engine, batch_ptr);
4056                 wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
4057         }
4058         GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
4059
4060         __i915_gem_object_flush_map(wa_ctx->vma->obj, 0, batch_ptr - batch);
4061         __i915_gem_object_release_map(wa_ctx->vma->obj);
4062         if (ret)
4063                 lrc_destroy_wa_ctx(engine);
4064
4065         return ret;
4066 }
4067
4068 static void reset_csb_pointers(struct intel_engine_cs *engine)
4069 {
4070         struct intel_engine_execlists * const execlists = &engine->execlists;
4071         const unsigned int reset_value = execlists->csb_size - 1;
4072
4073         ring_set_paused(engine, 0);
4074
4075         /*
4076          * Sometimes Icelake forgets to reset its pointers on a GPU reset.
4077          * Bludgeon them with a mmio update to be sure.
4078          */
4079         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4080                      0xffff << 16 | reset_value << 8 | reset_value);
4081         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4082
4083         /*
4084          * After a reset, the HW starts writing into CSB entry [0]. We
4085          * therefore have to set our HEAD pointer back one entry so that
4086          * the *first* entry we check is entry 0. To complicate this further,
4087          * as we don't wait for the first interrupt after reset, we have to
4088          * fake the HW write to point back to the last entry so that our
4089          * inline comparison of our cached head position against the last HW
4090          * write works even before the first interrupt.
4091          */
4092         execlists->csb_head = reset_value;
4093         WRITE_ONCE(*execlists->csb_write, reset_value);
4094         wmb(); /* Make sure this is visible to HW (paranoia?) */
4095
4096         /* Check that the GPU does indeed update the CSB entries! */
4097         memset(execlists->csb_status, -1, (reset_value + 1) * sizeof(u64));
4098         invalidate_csb_entries(&execlists->csb_status[0],
4099                                &execlists->csb_status[reset_value]);
4100
4101         /* Once more for luck and our trusty paranoia */
4102         ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
4103                      0xffff << 16 | reset_value << 8 | reset_value);
4104         ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
4105
4106         GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value);
4107 }
4108
4109 static void execlists_sanitize(struct intel_engine_cs *engine)
4110 {
4111         GEM_BUG_ON(execlists_active(&engine->execlists));
4112
4113         /*
4114          * Poison residual state on resume, in case the suspend didn't!
4115          *
4116          * We have to assume that across suspend/resume (or other loss
4117          * of control) that the contents of our pinned buffers has been
4118          * lost, replaced by garbage. Since this doesn't always happen,
4119          * let's poison such state so that we more quickly spot when
4120          * we falsely assume it has been preserved.
4121          */
4122         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4123                 memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE);
4124
4125         reset_csb_pointers(engine);
4126
4127         /*
4128          * The kernel_context HWSP is stored in the status_page. As above,
4129          * that may be lost on resume/initialisation, and so we need to
4130          * reset the value in the HWSP.
4131          */
4132         intel_timeline_reset_seqno(engine->kernel_context->timeline);
4133
4134         /* And scrub the dirty cachelines for the HWSP */
4135         clflush_cache_range(engine->status_page.addr, PAGE_SIZE);
4136 }
4137
4138 static void enable_error_interrupt(struct intel_engine_cs *engine)
4139 {
4140         u32 status;
4141
4142         engine->execlists.error_interrupt = 0;
4143         ENGINE_WRITE(engine, RING_EMR, ~0u);
4144         ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
4145
4146         status = ENGINE_READ(engine, RING_ESR);
4147         if (unlikely(status)) {
4148                 drm_err(&engine->i915->drm,
4149                         "engine '%s' resumed still in error: %08x\n",
4150                         engine->name, status);
4151                 __intel_gt_reset(engine->gt, engine->mask);
4152         }
4153
4154         /*
4155          * On current gen8+, we have 2 signals to play with
4156          *
4157          * - I915_ERROR_INSTRUCTION (bit 0)
4158          *
4159          *    Generate an error if the command parser encounters an invalid
4160          *    instruction
4161          *
4162          *    This is a fatal error.
4163          *
4164          * - CP_PRIV (bit 2)
4165          *
4166          *    Generate an error on privilege violation (where the CP replaces
4167          *    the instruction with a no-op). This also fires for writes into
4168          *    read-only scratch pages.
4169          *
4170          *    This is a non-fatal error, parsing continues.
4171          *
4172          * * there are a few others defined for odd HW that we do not use
4173          *
4174          * Since CP_PRIV fires for cases where we have chosen to ignore the
4175          * error (as the HW is validating and suppressing the mistakes), we
4176          * only unmask the instruction error bit.
4177          */
4178         ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
4179 }
4180
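/*
 * Put the engine into execlists submission mode: select the run list
 * (or disable legacy ring mode on Gen11+), clear STOP_RING, point
 * RING_HWS_PGA at our status page and unmask the error interrupts we
 * care about.
 */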
4181 static void enable_execlists(struct intel_engine_cs *engine)
4182 {
4183         u32 mode;
4184
4185         assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
4186
4187         intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
4188
4189         if (INTEL_GEN(engine->i915) >= 11)
4190                 mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
4191         else
4192                 mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
4193         ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
4194
4195         ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
4196
4197         ENGINE_WRITE_FW(engine,
4198                         RING_HWS_PGA,
4199                         i915_ggtt_offset(engine->status_page.vma));
4200         ENGINE_POSTING_READ(engine, RING_HWS_PGA);
4201
4202         enable_error_interrupt(engine);
4203
4204         engine->context_tag = GENMASK(BITS_PER_LONG - 2, 0);
4205 }
4206
4207 static bool unexpected_starting_state(struct intel_engine_cs *engine)
4208 {
4209         bool unexpected = false;
4210
4211         if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
4212                 drm_dbg(&engine->i915->drm,
4213                         "STOP_RING still set in RING_MI_MODE\n");
4214                 unexpected = true;
4215         }
4216
4217         return unexpected;
4218 }
4219
4220 static int execlists_resume(struct intel_engine_cs *engine)
4221 {
4222         intel_mocs_init_engine(engine);
4223
4224         intel_breadcrumbs_reset(engine->breadcrumbs);
4225
4226         if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
4227                 struct drm_printer p = drm_debug_printer(__func__);
4228
4229                 intel_engine_dump(engine, &p, NULL);
4230         }
4231
4232         enable_execlists(engine);
4233
4234         return 0;
4235 }
4236
4237 static void execlists_reset_prepare(struct intel_engine_cs *engine)
4238 {
4239         struct intel_engine_execlists * const execlists = &engine->execlists;
4240         unsigned long flags;
4241
4242         ENGINE_TRACE(engine, "depth<-%d\n",
4243                      atomic_read(&execlists->tasklet.count));
4244
4245         /*
4246          * Prevent request submission to the hardware until we have
4247          * completed the reset in i915_gem_reset_finish(). If a request
4248          * is completed by one engine, it may then queue a request
4249          * to a second via its execlists->tasklet *just* as we are
4250          * calling engine->resume() and also writing the ELSP.
4251          * Turning off the execlists->tasklet until the reset is over
4252          * prevents the race.
4253          */
4254         __tasklet_disable_sync_once(&execlists->tasklet);
4255         GEM_BUG_ON(!reset_in_progress(execlists));
4256
4257         /* And flush any current direct submission. */
4258         spin_lock_irqsave(&engine->active.lock, flags);
4259         spin_unlock_irqrestore(&engine->active.lock, flags);
4260
4261         /*
4262          * We stop the engines; otherwise we might get a failed reset and a
4263          * dead gpu (on elk). Also, a gpu as modern as kbl can suffer a
4264          * system hang if a batchbuffer is in progress when
4265          * the reset is issued, regardless of the READY_TO_RESET ack.
4266          * Thus assume it is best to stop engines on all gens
4267          * where we have a gpu reset.
4268          *
4269          * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
4270          *
4271          * FIXME: Wa for more modern gens needs to be validated
4272          */
4273         ring_set_paused(engine, 1);
4274         intel_engine_stop_cs(engine);
4275
4276         engine->execlists.reset_ccid = active_ccid(engine);
4277 }
4278
4279 static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
4280 {
4281         int x;
4282
4283         x = lrc_ring_mi_mode(engine);
4284         if (x != -1) {
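                     /*
                      * RING_MI_MODE is a masked register: the upper 16 bits are
                      * the write mask for the lower 16. Set the STOP_RING mask
                      * bit while leaving the value bit clear so that STOP_RING
                      * is cleared when this context image is restored.
                      */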
4285                 regs[x + 1] &= ~STOP_RING;
4286                 regs[x + 1] |= STOP_RING << 16;
4287         }
4288 }
4289
4290 static void __execlists_reset_reg_state(const struct intel_context *ce,
4291                                         const struct intel_engine_cs *engine)
4292 {
4293         u32 *regs = ce->lrc_reg_state;
4294
4295         __reset_stop_ring(regs, engine);
4296 }
4297
4298 static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
4299 {
4300         struct intel_engine_execlists * const execlists = &engine->execlists;
4301         struct intel_context *ce;
4302         struct i915_request *rq;
4303         u32 head;
4304
4305         mb(); /* paranoia: read the CSB pointers from after the reset */
4306         clflush(execlists->csb_write);
4307         mb();
4308
4309         process_csb(engine); /* drain preemption events */
4310
4311         /* Following the reset, we need to reload the CSB read/write pointers */
4312         reset_csb_pointers(engine);
4313
4314         /*
4315          * Save the currently executing context, even if we completed
4316          * its request, it was still running at the time of the
4317          * reset and will have been clobbered.
4318          */
4319         rq = active_context(engine, engine->execlists.reset_ccid);
4320         if (!rq)
4321                 goto unwind;
4322
4323         ce = rq->context;
4324         GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
4325
4326         if (i915_request_completed(rq)) {
4327                 /* Idle context; tidy up the ring so we can restart afresh */
4328                 head = intel_ring_wrap(ce->ring, rq->tail);
4329                 goto out_replay;
4330         }
4331
4332         /* We still have requests in-flight; the engine should be active */
4333         GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
4334
4335         /* Context has requests still in-flight; it should not be idle! */
4336         GEM_BUG_ON(i915_active_is_idle(&ce->active));
4337
4338         rq = active_request(ce->timeline, rq);
4339         head = intel_ring_wrap(ce->ring, rq->head);
4340         GEM_BUG_ON(head == ce->ring->tail);
4341
4342         /*
4343          * If this request hasn't started yet, e.g. it is waiting on a
4344          * semaphore, we need to avoid skipping the request or else we
4345          * break the signaling chain. However, if the context is corrupt
4346          * the request will not restart and we will be stuck with a wedged
4347          * device. It is quite often the case that if we issue a reset
4348          * while the GPU is loading the context image, the context
4349          * image becomes corrupt.
4350          *
4351          * Otherwise, if we have not started yet, the request should replay
4352          * perfectly and we do not need to flag the result as being erroneous.
4353          */
4354         if (!i915_request_started(rq))
4355                 goto out_replay;
4356
4357         /*
4358          * If the request was innocent, we leave the request in the ELSP
4359          * and will try to replay it on restarting. The context image may
4360          * have been corrupted by the reset, in which case we may have
4361          * to service a new GPU hang, but more likely we can continue on
4362          * without impact.
4363          *
4364          * If the request was guilty, we presume the context is corrupt
4365          * and have to at least restore the RING register in the context
4366          * image back to the expected values to skip over the guilty request.
4367          */
4368         __i915_request_reset(rq, stalled);
4369
4370         /*
4371          * We want a simple context + ring to execute the breadcrumb update.
4372          * We cannot rely on the context being intact across the GPU hang,
4373          * so clear it and rebuild just what we need for the breadcrumb.
4374          * All pending requests for this context will be zapped, and any
4375          * future request will be after userspace has had the opportunity
4376          * to recreate its own state.
4377          */
4378 out_replay:
4379         ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
4380                      head, ce->ring->tail);
4381         __execlists_reset_reg_state(ce, engine);
4382         __execlists_update_reg_state(ce, engine, head);
4383         ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
4384
4385 unwind:
4386         /* Push back any incomplete requests for replay after the reset. */
4387         cancel_port_requests(execlists);
4388         __unwind_incomplete_requests(engine);
4389 }
4390
4391 static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
4392 {
4393         unsigned long flags;
4394
4395         ENGINE_TRACE(engine, "\n");
4396
4397         spin_lock_irqsave(&engine->active.lock, flags);
4398
4399         __execlists_reset(engine, stalled);
4400
4401         spin_unlock_irqrestore(&engine->active.lock, flags);
4402 }
4403
4404 static void nop_submission_tasklet(unsigned long data)
4405 {
4406         struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
4407
4408         /* The driver is wedged; don't process any more events. */
4409         WRITE_ONCE(engine->execlists.queue_priority_hint, INT_MIN);
4410 }
4411
4412 static void execlists_reset_cancel(struct intel_engine_cs *engine)
4413 {
4414         struct intel_engine_execlists * const execlists = &engine->execlists;
4415         struct i915_request *rq, *rn;
4416         struct rb_node *rb;
4417         unsigned long flags;
4418
4419         ENGINE_TRACE(engine, "\n");
4420
4421         /*
4422          * Before we call engine->cancel_requests(), we should have exclusive
4423          * access to the submission state. This is arranged for us by the
4424          * caller disabling the interrupt generation, the tasklet and other
4425          * threads that may then access the same state, giving us a free hand
4426          * to reset state. However, we still need to let lockdep be aware that
4427          * we know this state may be accessed in hardirq context, so we
4428          * disable the irq around this manipulation and we want to keep
4429          * the spinlock focused on its duties and not accidentally conflate
4430          * coverage to the submission's irq state. (Similarly, although we
4431          * shouldn't need to disable irq around the manipulation of the
4432          * submission's irq state, we also wish to remind ourselves that
4433          * it is irq state.)
4434          */
4435         spin_lock_irqsave(&engine->active.lock, flags);
4436
4437         __execlists_reset(engine, true);
4438
4439         /* Mark all executing requests as skipped. */
4440         list_for_each_entry(rq, &engine->active.requests, sched.link)
4441                 mark_eio(rq);
4442         intel_engine_signal_breadcrumbs(engine);
4443
4444         /* Flush the queued requests to the timeline list (for retiring). */
4445         while ((rb = rb_first_cached(&execlists->queue))) {
4446                 struct i915_priolist *p = to_priolist(rb);
4447                 int i;
4448
4449                 priolist_for_each_request_consume(rq, rn, p, i) {
4450                         mark_eio(rq);
4451                         __i915_request_submit(rq);
4452                 }
4453
4454                 rb_erase_cached(&p->node, &execlists->queue);
4455                 i915_priolist_free(p);
4456         }
4457
4458         /* On-hold requests will be flushed to timeline upon their release */
4459         list_for_each_entry(rq, &engine->active.hold, sched.link)
4460                 mark_eio(rq);
4461
4462         /* Cancel all attached virtual engines */
4463         while ((rb = rb_first_cached(&execlists->virtual))) {
4464                 struct virtual_engine *ve =
4465                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
4466
4467                 rb_erase_cached(rb, &execlists->virtual);
4468                 RB_CLEAR_NODE(rb);
4469
4470                 spin_lock(&ve->base.active.lock);
4471                 rq = fetch_and_zero(&ve->request);
4472                 if (rq) {
4473                         mark_eio(rq);
4474
4475                         rq->engine = engine;
4476                         __i915_request_submit(rq);
4477                         i915_request_put(rq);
4478
4479                         ve->base.execlists.queue_priority_hint = INT_MIN;
4480                 }
4481                 spin_unlock(&ve->base.active.lock);
4482         }
4483
4484         /* Remaining _unready_ requests will be nop'ed when submitted */
4485
4486         execlists->queue_priority_hint = INT_MIN;
4487         execlists->queue = RB_ROOT_CACHED;
4488
4489         GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
4490         execlists->tasklet.func = nop_submission_tasklet;
4491
4492         spin_unlock_irqrestore(&engine->active.lock, flags);
4493 }
4494
4495 static void execlists_reset_finish(struct intel_engine_cs *engine)
4496 {
4497         struct intel_engine_execlists * const execlists = &engine->execlists;
4498
4499         /*
4500          * After a GPU reset, we may have requests to replay. Do so now while
4501          * we still have the forcewake to be sure that the GPU is not allowed
4502          * to sleep before we restart and reload a context.
4503          */
4504         GEM_BUG_ON(!reset_in_progress(execlists));
4505         if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
4506                 execlists->tasklet.func(execlists->tasklet.data);
4507
4508         if (__tasklet_enable(&execlists->tasklet))
4509                 /* And kick in case we missed a new request submission. */
4510                 tasklet_hi_schedule(&execlists->tasklet);
4511         ENGINE_TRACE(engine, "depth->%d\n",
4512                      atomic_read(&execlists->tasklet.count));
4513 }
4514
4515 static int gen8_emit_bb_start_noarb(struct i915_request *rq,
4516                                     u64 offset, u32 len,
4517                                     const unsigned int flags)
4518 {
4519         u32 *cs;
4520
4521         cs = intel_ring_begin(rq, 4);
4522         if (IS_ERR(cs))
4523                 return PTR_ERR(cs);
4524
4525         /*
4526          * WaDisableCtxRestoreArbitration:bdw,chv
4527          *
4528          * We don't need to perform MI_ARB_ENABLE as often as we do (in
4529          * particular on all the gens that do not need the w/a at all!); if
4530          * we took care to make sure that on every switch into this context
4531          * (both ordinary and for preemption) arbitration was enabled,
4532          * we would be fine.  However, for gen8 there is another w/a that
4533          * requires us to not preempt inside GPGPU execution, so we keep
4534          * arbitration disabled for gen8 batches. Arbitration will be
4535          * re-enabled before we close the request
4536          * (engine->emit_fini_breadcrumb).
4537          */
4538         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4539
4540         /* FIXME(BDW+): Address space and security selectors. */
4541         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4542                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4543         *cs++ = lower_32_bits(offset);
4544         *cs++ = upper_32_bits(offset);
4545
4546         intel_ring_advance(rq, cs);
4547
4548         return 0;
4549 }
4550
4551 static int gen8_emit_bb_start(struct i915_request *rq,
4552                               u64 offset, u32 len,
4553                               const unsigned int flags)
4554 {
4555         u32 *cs;
4556
4557         cs = intel_ring_begin(rq, 6);
4558         if (IS_ERR(cs))
4559                 return PTR_ERR(cs);
4560
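             /*
              * Allow arbitration (preemption) while the batch itself runs; it
              * is disabled again after BB_START and only re-enabled in the
              * request's fini breadcrumb (gen8_emit_fini_breadcrumb_tail), so
              * preemption can only occur at the points we choose.
              */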
4561         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4562
4563         *cs++ = MI_BATCH_BUFFER_START_GEN8 |
4564                 (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
4565         *cs++ = lower_32_bits(offset);
4566         *cs++ = upper_32_bits(offset);
4567
4568         *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
4569         *cs++ = MI_NOOP;
4570
4571         intel_ring_advance(rq, cs);
4572
4573         return 0;
4574 }
4575
4576 static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
4577 {
4578         ENGINE_WRITE(engine, RING_IMR,
4579                      ~(engine->irq_enable_mask | engine->irq_keep_mask));
4580         ENGINE_POSTING_READ(engine, RING_IMR);
4581 }
4582
4583 static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
4584 {
4585         ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
4586 }
4587
4588 static int gen8_emit_flush(struct i915_request *request, u32 mode)
4589 {
4590         u32 cmd, *cs;
4591
4592         cs = intel_ring_begin(request, 4);
4593         if (IS_ERR(cs))
4594                 return PTR_ERR(cs);
4595
4596         cmd = MI_FLUSH_DW + 1;
4597
4598         /* We always require a command barrier so that subsequent
4599          * commands, such as breadcrumb interrupts, are strictly ordered
4600          * wrt the contents of the write cache being flushed to memory
4601          * (and thus being coherent from the CPU).
4602          */
4603         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4604
4605         if (mode & EMIT_INVALIDATE) {
4606                 cmd |= MI_INVALIDATE_TLB;
4607                 if (request->engine->class == VIDEO_DECODE_CLASS)
4608                         cmd |= MI_INVALIDATE_BSD;
4609         }
4610
4611         *cs++ = cmd;
4612         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4613         *cs++ = 0; /* upper addr */
4614         *cs++ = 0; /* value */
4615         intel_ring_advance(request, cs);
4616
4617         return 0;
4618 }
4619
4620 static int gen8_emit_flush_render(struct i915_request *request,
4621                                   u32 mode)
4622 {
4623         bool vf_flush_wa = false, dc_flush_wa = false;
4624         u32 *cs, flags = 0;
4625         int len;
4626
4627         flags |= PIPE_CONTROL_CS_STALL;
4628
4629         if (mode & EMIT_FLUSH) {
4630                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4631                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4632                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4633                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4634         }
4635
4636         if (mode & EMIT_INVALIDATE) {
4637                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4638                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4639                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4640                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4641                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4642                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4643                 flags |= PIPE_CONTROL_QW_WRITE;
4644                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4645
4646                 /*
4647                  * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
4648                  * pipe control.
4649                  */
4650                 if (IS_GEN(request->engine->i915, 9))
4651                         vf_flush_wa = true;
4652
4653                 /* WaForGAMHang:kbl */
4654                 if (IS_KBL_GT_REVID(request->engine->i915, 0, KBL_REVID_B0))
4655                         dc_flush_wa = true;
4656         }
4657
4658         len = 6;
4659
4660         if (vf_flush_wa)
4661                 len += 6;
4662
4663         if (dc_flush_wa)
4664                 len += 12;
4665
4666         cs = intel_ring_begin(request, len);
4667         if (IS_ERR(cs))
4668                 return PTR_ERR(cs);
4669
4670         if (vf_flush_wa)
4671                 cs = gen8_emit_pipe_control(cs, 0, 0);
4672
4673         if (dc_flush_wa)
4674                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
4675                                             0);
4676
4677         cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4678
4679         if (dc_flush_wa)
4680                 cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
4681
4682         intel_ring_advance(request, cs);
4683
4684         return 0;
4685 }
4686
4687 static int gen11_emit_flush_render(struct i915_request *request,
4688                                    u32 mode)
4689 {
4690         if (mode & EMIT_FLUSH) {
4691                 u32 *cs;
4692                 u32 flags = 0;
4693
4694                 flags |= PIPE_CONTROL_CS_STALL;
4695
4696                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4697                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4698                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4699                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4700                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4701                 flags |= PIPE_CONTROL_QW_WRITE;
4702                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4703
4704                 cs = intel_ring_begin(request, 6);
4705                 if (IS_ERR(cs))
4706                         return PTR_ERR(cs);
4707
4708                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4709                 intel_ring_advance(request, cs);
4710         }
4711
4712         if (mode & EMIT_INVALIDATE) {
4713                 u32 *cs;
4714                 u32 flags = 0;
4715
4716                 flags |= PIPE_CONTROL_CS_STALL;
4717
4718                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4719                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4720                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4721                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4722                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4723                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4724                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4725                 flags |= PIPE_CONTROL_QW_WRITE;
4726                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4727
4728                 cs = intel_ring_begin(request, 6);
4729                 if (IS_ERR(cs))
4730                         return PTR_ERR(cs);
4731
4732                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4733                 intel_ring_advance(request, cs);
4734         }
4735
4736         return 0;
4737 }
4738
4739 static u32 preparser_disable(bool state)
4740 {
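             /*
              * On gen12, MI_ARB_CHECK doubles as the pre-parser control:
              * bit 0 carries the disable state and bit 8 is the mask bit
              * that makes the update of bit 0 take effect.
              */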
4741         return MI_ARB_CHECK | 1 << 8 | state;
4742 }
4743
4744 static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
4745 {
4746         static const i915_reg_t vd[] = {
4747                 GEN12_VD0_AUX_NV,
4748                 GEN12_VD1_AUX_NV,
4749                 GEN12_VD2_AUX_NV,
4750                 GEN12_VD3_AUX_NV,
4751         };
4752
4753         static const i915_reg_t ve[] = {
4754                 GEN12_VE0_AUX_NV,
4755                 GEN12_VE1_AUX_NV,
4756         };
4757
4758         if (engine->class == VIDEO_DECODE_CLASS)
4759                 return vd[engine->instance];
4760
4761         if (engine->class == VIDEO_ENHANCEMENT_CLASS)
4762                 return ve[engine->instance];
4763
4764         GEM_BUG_ON("unknown aux_inv_reg\n");
4765
4766         return INVALID_MMIO_REG;
4767 }
4768
4769 static u32 *
4770 gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
4771 {
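             /* Single LRI writing AUX_INV to the engine's AUX invalidation register. */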
4772         *cs++ = MI_LOAD_REGISTER_IMM(1);
4773         *cs++ = i915_mmio_reg_offset(inv_reg);
4774         *cs++ = AUX_INV;
4775         *cs++ = MI_NOOP;
4776
4777         return cs;
4778 }
4779
4780 static int gen12_emit_flush_render(struct i915_request *request,
4781                                    u32 mode)
4782 {
4783         if (mode & EMIT_FLUSH) {
4784                 u32 flags = 0;
4785                 u32 *cs;
4786
4787                 flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
4788                 flags |= PIPE_CONTROL_FLUSH_L3;
4789                 flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
4790                 flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
4791                 /* Wa_1409600907:tgl */
4792                 flags |= PIPE_CONTROL_DEPTH_STALL;
4793                 flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
4794                 flags |= PIPE_CONTROL_FLUSH_ENABLE;
4795
4796                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4797                 flags |= PIPE_CONTROL_QW_WRITE;
4798
4799                 flags |= PIPE_CONTROL_CS_STALL;
4800
4801                 cs = intel_ring_begin(request, 6);
4802                 if (IS_ERR(cs))
4803                         return PTR_ERR(cs);
4804
4805                 cs = gen12_emit_pipe_control(cs,
4806                                              PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
4807                                              flags, LRC_PPHWSP_SCRATCH_ADDR);
4808                 intel_ring_advance(request, cs);
4809         }
4810
4811         if (mode & EMIT_INVALIDATE) {
4812                 u32 flags = 0;
4813                 u32 *cs;
4814
4815                 flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
4816                 flags |= PIPE_CONTROL_TLB_INVALIDATE;
4817                 flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
4818                 flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
4819                 flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
4820                 flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
4821                 flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
4822
4823                 flags |= PIPE_CONTROL_STORE_DATA_INDEX;
4824                 flags |= PIPE_CONTROL_QW_WRITE;
4825
4826                 flags |= PIPE_CONTROL_CS_STALL;
4827
4828                 cs = intel_ring_begin(request, 8 + 4);
4829                 if (IS_ERR(cs))
4830                         return PTR_ERR(cs);
4831
4832                 /*
4833                  * Prevent the pre-parser from skipping past the TLB
4834                  * invalidate and loading a stale page for the batch
4835                  * buffer / request payload.
4836                  */
4837                 *cs++ = preparser_disable(true);
4838
4839                 cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4840
4841                 /* hsdes: 1809175790 */
4842                 cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs);
4843
4844                 *cs++ = preparser_disable(false);
4845                 intel_ring_advance(request, cs);
4846         }
4847
4848         return 0;
4849 }
4850
4851 static int gen12_emit_flush(struct i915_request *request, u32 mode)
4852 {
4853         intel_engine_mask_t aux_inv = 0;
4854         u32 cmd, *cs;
4855
4856         cmd = 4;
4857         if (mode & EMIT_INVALIDATE)
4858                 cmd += 2;
4859         if (mode & EMIT_INVALIDATE)
4860                 aux_inv = request->engine->mask & ~BIT(BCS0);
4861         if (aux_inv)
4862                 cmd += 2 * hweight8(aux_inv) + 2;
4863
4864         cs = intel_ring_begin(request, cmd);
4865         if (IS_ERR(cs))
4866                 return PTR_ERR(cs);
4867
4868         if (mode & EMIT_INVALIDATE)
4869                 *cs++ = preparser_disable(true);
4870
4871         cmd = MI_FLUSH_DW + 1;
4872
4873         /* We always require a command barrier so that subsequent
4874          * commands, such as breadcrumb interrupts, are strictly ordered
4875          * wrt the contents of the write cache being flushed to memory
4876          * (and thus being coherent from the CPU).
4877          */
4878         cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
4879
4880         if (mode & EMIT_INVALIDATE) {
4881                 cmd |= MI_INVALIDATE_TLB;
4882                 if (request->engine->class == VIDEO_DECODE_CLASS)
4883                         cmd |= MI_INVALIDATE_BSD;
4884         }
4885
4886         *cs++ = cmd;
4887         *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
4888         *cs++ = 0; /* upper addr */
4889         *cs++ = 0; /* value */
4890
4891         if (aux_inv) { /* hsdes: 1809175790 */
4892                 struct intel_engine_cs *engine;
4893                 unsigned int tmp;
4894
4895                 *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv));
4896                 for_each_engine_masked(engine, request->engine->gt,
4897                                        aux_inv, tmp) {
4898                         *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
4899                         *cs++ = AUX_INV;
4900                 }
4901                 *cs++ = MI_NOOP;
4902         }
4903
4904         if (mode & EMIT_INVALIDATE)
4905                 *cs++ = preparser_disable(false);
4906
4907         intel_ring_advance(request, cs);
4908
4909         return 0;
4910 }
4911
4912 static void assert_request_valid(struct i915_request *rq)
4913 {
4914         struct intel_ring *ring __maybe_unused = rq->ring;
4915
4916         /* Can we unwind this request without appearing to go forwards? */
4917         GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
4918 }
4919
4920 /*
4921  * Reserve space for 2 NOOPs at the end of each request to be
4922  * used as a workaround for not being allowed to do lite
4923  * restore with HEAD==TAIL (WaIdleLiteRestore).
4924  */
4925 static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4926 {
4927         /* Ensure there's always at least one preemption point per-request. */
4928         *cs++ = MI_ARB_CHECK;
4929         *cs++ = MI_NOOP;
4930         request->wa_tail = intel_ring_offset(request, cs);
4931
4932         /* Check that entire request is less than half the ring */
4933         assert_request_valid(request);
4934
4935         return cs;
4936 }
4937
4938 static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4939 {
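             /*
              * Busywait on the preemption semaphore in the HWSP:
              * ring_set_paused() writes a non-zero value there to hold the
              * ring at the end of the request during preempt-to-busy, and
              * clears it again to release it.
              */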
4940         *cs++ = MI_SEMAPHORE_WAIT |
4941                 MI_SEMAPHORE_GLOBAL_GTT |
4942                 MI_SEMAPHORE_POLL |
4943                 MI_SEMAPHORE_SAD_EQ_SDD;
4944         *cs++ = 0;
4945         *cs++ = intel_hws_preempt_address(request->engine);
4946         *cs++ = 0;
4947
4948         return cs;
4949 }
4950
4951 static __always_inline u32*
4952 gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
4953 {
4954         *cs++ = MI_USER_INTERRUPT;
4955
4956         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4957         if (intel_engine_has_semaphores(request->engine))
4958                 cs = emit_preempt_busywait(request, cs);
4959
4960         request->tail = intel_ring_offset(request, cs);
4961         assert_ring_tail_valid(request->ring, request->tail);
4962
4963         return gen8_emit_wa_tail(request, cs);
4964 }
4965
4966 static u32 *emit_xcs_breadcrumb(struct i915_request *rq, u32 *cs)
4967 {
4968         return gen8_emit_ggtt_write(cs, rq->fence.seqno, hwsp_offset(rq), 0);
4969 }
4970
4971 static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
4972 {
4973         return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs));
4974 }
4975
4976 static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4977 {
4978         cs = gen8_emit_pipe_control(cs,
4979                                     PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4980                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4981                                     PIPE_CONTROL_DC_FLUSH_ENABLE,
4982                                     0);
4983
4984         /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4985         cs = gen8_emit_ggtt_write_rcs(cs,
4986                                       request->fence.seqno,
4987                                       hwsp_offset(request),
4988                                       PIPE_CONTROL_FLUSH_ENABLE |
4989                                       PIPE_CONTROL_CS_STALL);
4990
4991         return gen8_emit_fini_breadcrumb_tail(request, cs);
4992 }
4993
4994 static u32 *
4995 gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4996 {
4997         cs = gen8_emit_ggtt_write_rcs(cs,
4998                                       request->fence.seqno,
4999                                       hwsp_offset(request),
5000                                       PIPE_CONTROL_CS_STALL |
5001                                       PIPE_CONTROL_TILE_CACHE_FLUSH |
5002                                       PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5003                                       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5004                                       PIPE_CONTROL_DC_FLUSH_ENABLE |
5005                                       PIPE_CONTROL_FLUSH_ENABLE);
5006
5007         return gen8_emit_fini_breadcrumb_tail(request, cs);
5008 }
5009
5010 /*
5011  * Note that the CS instruction pre-parser will not stall on the breadcrumb
5012  * flush and will continue pre-fetching the instructions after it before the
5013  * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
5014  * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
5015  * of the next request before the memory has been flushed, we're guaranteed that
5016  * we won't access the batch itself too early.
5017  * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
5018  * so, if the current request is modifying an instruction in the next request on
5019  * the same intel_context, we might pre-fetch and then execute the pre-update
5020  * instruction. To avoid this, the users of self-modifying code should either
5021  * disable the parser around the code emitting the memory writes, via a new flag
5022  * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
5023  * the in-kernel use-cases we've opted to use a separate context, see
5024  * reloc_gpu() as an example.
5025  * All the above applies only to the instructions themselves. Non-inline data
5026  * used by the instructions is not pre-fetched.
5027  */
5028
5029 static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
5030 {
5031         *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
5032                 MI_SEMAPHORE_GLOBAL_GTT |
5033                 MI_SEMAPHORE_POLL |
5034                 MI_SEMAPHORE_SAD_EQ_SDD;
5035         *cs++ = 0;
5036         *cs++ = intel_hws_preempt_address(request->engine);
5037         *cs++ = 0;
5038         *cs++ = 0;
5039         *cs++ = MI_NOOP;
5040
5041         return cs;
5042 }
5043
5044 static __always_inline u32*
5045 gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs)
5046 {
5047         *cs++ = MI_USER_INTERRUPT;
5048
5049         *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
5050         if (intel_engine_has_semaphores(request->engine))
5051                 cs = gen12_emit_preempt_busywait(request, cs);
5052
5053         request->tail = intel_ring_offset(request, cs);
5054         assert_ring_tail_valid(request->ring, request->tail);
5055
5056         return gen8_emit_wa_tail(request, cs);
5057 }
5058
5059 static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs)
5060 {
5061         /* XXX Stalling flush before seqno write; post-sync not */
5062         cs = emit_xcs_breadcrumb(rq, __gen8_emit_flush_dw(cs, 0, 0, 0));
5063         return gen12_emit_fini_breadcrumb_tail(rq, cs);
5064 }
5065
5066 static u32 *
5067 gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
5068 {
5069         cs = gen12_emit_ggtt_write_rcs(cs,
5070                                        request->fence.seqno,
5071                                        hwsp_offset(request),
5072                                        PIPE_CONTROL0_HDC_PIPELINE_FLUSH,
5073                                        PIPE_CONTROL_CS_STALL |
5074                                        PIPE_CONTROL_TILE_CACHE_FLUSH |
5075                                        PIPE_CONTROL_FLUSH_L3 |
5076                                        PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
5077                                        PIPE_CONTROL_DEPTH_CACHE_FLUSH |
5078                                        /* Wa_1409600907:tgl */
5079                                        PIPE_CONTROL_DEPTH_STALL |
5080                                        PIPE_CONTROL_DC_FLUSH_ENABLE |
5081                                        PIPE_CONTROL_FLUSH_ENABLE);
5082
5083         return gen12_emit_fini_breadcrumb_tail(request, cs);
5084 }
5085
5086 static void execlists_park(struct intel_engine_cs *engine)
5087 {
5088         cancel_timer(&engine->execlists.timer);
5089         cancel_timer(&engine->execlists.preempt);
5090 }
5091
5092 void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
5093 {
5094         engine->submit_request = execlists_submit_request;
5095         engine->schedule = i915_schedule;
5096         engine->execlists.tasklet.func = execlists_submission_tasklet;
5097
5098         engine->reset.prepare = execlists_reset_prepare;
5099         engine->reset.rewind = execlists_reset_rewind;
5100         engine->reset.cancel = execlists_reset_cancel;
5101         engine->reset.finish = execlists_reset_finish;
5102
5103         engine->park = execlists_park;
5104         engine->unpark = NULL;
5105
5106         engine->flags |= I915_ENGINE_SUPPORTS_STATS;
5107         if (!intel_vgpu_active(engine->i915)) {
5108                 engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
5109                 if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) {
5110                         engine->flags |= I915_ENGINE_HAS_PREEMPTION;
5111                         if (IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
5112                                 engine->flags |= I915_ENGINE_HAS_TIMESLICES;
5113                 }
5114         }
5115
5116         if (INTEL_GEN(engine->i915) >= 12)
5117                 engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
5118
5119         if (intel_engine_has_preemption(engine))
5120                 engine->emit_bb_start = gen8_emit_bb_start;
5121         else
5122                 engine->emit_bb_start = gen8_emit_bb_start_noarb;
5123 }
5124
5125 static void execlists_shutdown(struct intel_engine_cs *engine)
5126 {
5127         /* Synchronise with residual timers and any softirq they raise */
5128         del_timer_sync(&engine->execlists.timer);
5129         del_timer_sync(&engine->execlists.preempt);
5130         tasklet_kill(&engine->execlists.tasklet);
5131 }
5132
5133 static void execlists_release(struct intel_engine_cs *engine)
5134 {
5135         engine->sanitize = NULL; /* no longer in control, nothing to sanitize */
5136
5137         execlists_shutdown(engine);
5138
5139         intel_engine_cleanup_common(engine);
5140         lrc_destroy_wa_ctx(engine);
5141 }
5142
5143 static void
5144 logical_ring_default_vfuncs(struct intel_engine_cs *engine)
5145 {
5146         /* Default vfuncs which can be overridden by each engine. */
5147
5148         engine->resume = execlists_resume;
5149
5150         engine->cops = &execlists_context_ops;
5151         engine->request_alloc = execlists_request_alloc;
5152
5153         engine->emit_flush = gen8_emit_flush;
5154         engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
5155         engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
5156         if (INTEL_GEN(engine->i915) >= 12) {
5157                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
5158                 engine->emit_flush = gen12_emit_flush;
5159         }
5160         engine->set_default_submission = intel_execlists_set_default_submission;
5161
5162         if (INTEL_GEN(engine->i915) < 11) {
5163                 engine->irq_enable = gen8_logical_ring_enable_irq;
5164                 engine->irq_disable = gen8_logical_ring_disable_irq;
5165         } else {
5166                 /*
5167                  * TODO: On Gen11 interrupt masks need to be clear
5168                  * to allow C6 entry. Keep interrupts enabled
5169                  * and take the hit of generating extra interrupts
5170                  * until a more refined solution exists.
5171                  */
5172         }
5173 }
5174
5175 static inline void
5176 logical_ring_default_irqs(struct intel_engine_cs *engine)
5177 {
5178         unsigned int shift = 0;
5179
5180         if (INTEL_GEN(engine->i915) < 11) {
5181                 const u8 irq_shifts[] = {
5182                         [RCS0]  = GEN8_RCS_IRQ_SHIFT,
5183                         [BCS0]  = GEN8_BCS_IRQ_SHIFT,
5184                         [VCS0]  = GEN8_VCS0_IRQ_SHIFT,
5185                         [VCS1]  = GEN8_VCS1_IRQ_SHIFT,
5186                         [VECS0] = GEN8_VECS_IRQ_SHIFT,
5187                 };
5188
5189                 shift = irq_shifts[engine->id];
5190         }
5191
5192         engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
5193         engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
5194         engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
5195         engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
5196 }
5197
5198 static void rcs_submission_override(struct intel_engine_cs *engine)
5199 {
5200         switch (INTEL_GEN(engine->i915)) {
5201         case 12:
5202                 engine->emit_flush = gen12_emit_flush_render;
5203                 engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
5204                 break;
5205         case 11:
5206                 engine->emit_flush = gen11_emit_flush_render;
5207                 engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
5208                 break;
5209         default:
5210                 engine->emit_flush = gen8_emit_flush_render;
5211                 engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
5212                 break;
5213         }
5214 }
5215
5216 int intel_execlists_submission_setup(struct intel_engine_cs *engine)
5217 {
5218         struct intel_engine_execlists * const execlists = &engine->execlists;
5219         struct drm_i915_private *i915 = engine->i915;
5220         struct intel_uncore *uncore = engine->uncore;
5221         u32 base = engine->mmio_base;
5222
5223         tasklet_init(&engine->execlists.tasklet,
5224                      execlists_submission_tasklet, (unsigned long)engine);
5225         timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
5226         timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
5227
5228         logical_ring_default_vfuncs(engine);
5229         logical_ring_default_irqs(engine);
5230
5231         if (engine->class == RENDER_CLASS)
5232                 rcs_submission_override(engine);
5233
5234         if (intel_init_workaround_bb(engine))
5235                 /*
5236                  * We continue even if we fail to initialize the WA batch
5237                  * because we only expect rare glitches, nothing critical
5238                  * enough to prevent us from using the GPU.
5239                  */
5240                 drm_err(&i915->drm, "WA batch buffer initialization failed\n");
5241
5242         if (HAS_LOGICAL_RING_ELSQ(i915)) {
5243                 execlists->submit_reg = uncore->regs +
5244                         i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
5245                 execlists->ctrl_reg = uncore->regs +
5246                         i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
5247         } else {
5248                 execlists->submit_reg = uncore->regs +
5249                         i915_mmio_reg_offset(RING_ELSP(base));
5250         }
5251
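             /* Both the CSB entries and the HW write pointer live in the HWSP. */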
5252         execlists->csb_status =
5253                 (u64 *)&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
5254
5255         execlists->csb_write =
5256                 &engine->status_page.addr[intel_hws_csb_write_index(i915)];
5257
5258         if (INTEL_GEN(i915) < 11)
5259                 execlists->csb_size = GEN8_CSB_ENTRIES;
5260         else
5261                 execlists->csb_size = GEN11_CSB_ENTRIES;
5262
5263         if (INTEL_GEN(engine->i915) >= 11) {
5264                 execlists->ccid |= engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
5265                 execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32);
5266         }
5267
5268         /* Finally, take ownership and responsibility for cleanup! */
5269         engine->sanitize = execlists_sanitize;
5270         engine->release = execlists_release;
5271
5272         return 0;
5273 }
5274
5275 static void init_common_reg_state(u32 * const regs,
5276                                   const struct intel_engine_cs *engine,
5277                                   const struct intel_ring *ring,
5278                                   bool inhibit)
5279 {
5280         u32 ctl;
5281
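             /*
              * CTX_CONTEXT_CONTROL is a masked register; the
              * _MASKED_BIT_ENABLE/DISABLE macros set both the value and the
              * corresponding write-mask bits.
              */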
5282         ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
5283         ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
5284         if (inhibit)
5285                 ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
5286         if (INTEL_GEN(engine->i915) < 11)
5287                 ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
5288                                            CTX_CTRL_RS_CTX_ENABLE);
5289         regs[CTX_CONTEXT_CONTROL] = ctl;
5290
5291         regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
5292         regs[CTX_TIMESTAMP] = 0;
5293 }
5294
5295 static void init_wa_bb_reg_state(u32 * const regs,
5296                                  const struct intel_engine_cs *engine)
5297 {
5298         const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
5299
5300         if (wa_ctx->per_ctx.size) {
5301                 const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
5302
5303                 GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1);
5304                 regs[lrc_ring_wa_bb_per_ctx(engine) + 1] =
5305                         (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
5306         }
5307
5308         if (wa_ctx->indirect_ctx.size) {
5309                 lrc_ring_setup_indirect_ctx(regs, engine,
5310                                             i915_ggtt_offset(wa_ctx->vma) +
5311                                             wa_ctx->indirect_ctx.offset,
5312                                             wa_ctx->indirect_ctx.size);
5313         }
5314 }
5315
5316 static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
5317 {
5318         if (i915_vm_is_4lvl(&ppgtt->vm)) {
5319                 /* 64b PPGTT (48bit canonical):
5320                  * PDP0_DESCRIPTOR contains the base address of the PML4;
5321                  * the other PDP descriptors are ignored.
5322                  */
5323                 ASSIGN_CTX_PML4(ppgtt, regs);
5324         } else {
5325                 ASSIGN_CTX_PDP(ppgtt, regs, 3);
5326                 ASSIGN_CTX_PDP(ppgtt, regs, 2);
5327                 ASSIGN_CTX_PDP(ppgtt, regs, 1);
5328                 ASSIGN_CTX_PDP(ppgtt, regs, 0);
5329         }
5330 }
5331
5332 static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
5333 {
5334         if (i915_is_ggtt(vm))
5335                 return i915_vm_to_ggtt(vm)->alias;
5336         else
5337                 return i915_vm_to_ppgtt(vm);
5338 }
5339
5340 static void execlists_init_reg_state(u32 *regs,
5341                                      const struct intel_context *ce,
5342                                      const struct intel_engine_cs *engine,
5343                                      const struct intel_ring *ring,
5344                                      bool inhibit)
5345 {
5346         /*
5347          * A context is actually a big batch buffer with several
5348          * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
5349          * values we are setting here are only for the first context restore:
5350          * on a subsequent save, the GPU will recreate this batchbuffer with new
5351          * values (including all the missing MI_LOAD_REGISTER_IMM commands that
5352          * we are not initializing here).
5353          *
5354          * Must keep consistent with virtual_update_register_offsets().
5355          */
5356         set_offsets(regs, reg_offsets(engine), engine, inhibit);
5357
5358         init_common_reg_state(regs, engine, ring, inhibit);
5359         init_ppgtt_reg_state(regs, vm_alias(ce->vm));
5360
5361         init_wa_bb_reg_state(regs, engine);
5362
5363         __reset_stop_ring(regs, engine);
5364 }
5365
5366 static int
5367 populate_lr_context(struct intel_context *ce,
5368                     struct drm_i915_gem_object *ctx_obj,
5369                     struct intel_engine_cs *engine,
5370                     struct intel_ring *ring)
5371 {
5372         bool inhibit = true;
5373         void *vaddr;
5374
5375         vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
5376         if (IS_ERR(vaddr)) {
5377                 drm_dbg(&engine->i915->drm, "Could not map object pages!\n");
5378                 return PTR_ERR(vaddr);
5379         }
5380
5381         set_redzone(vaddr, engine);
5382
5383         if (engine->default_state) {
5384                 shmem_read(engine->default_state, 0,
5385                            vaddr, engine->context_size);
5386                 __set_bit(CONTEXT_VALID_BIT, &ce->flags);
5387                 inhibit = false;
5388         }
5389
5390         /* Clear the ppHWSP (inc. per-context counters) */
5391         memset(vaddr, 0, PAGE_SIZE);
5392
5393         /*
5394          * The second page of the context object contains some registers which
5395          * must be set up prior to the first execution.
5396          */
5397         execlists_init_reg_state(vaddr + LRC_STATE_OFFSET,
5398                                  ce, engine, ring, inhibit);
5399
5400         __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
5401         i915_gem_object_unpin_map(ctx_obj);
5402         return 0;
5403 }
5404
5405 static struct intel_timeline *pinned_timeline(struct intel_context *ce)
5406 {
5407         struct intel_timeline *tl = fetch_and_zero(&ce->timeline);
5408
5409         return intel_timeline_create_from_engine(ce->engine,
5410                                                  page_unmask_bits(tl));
5411 }
5412
5413 static int __execlists_context_alloc(struct intel_context *ce,
5414                                      struct intel_engine_cs *engine)
5415 {
5416         struct drm_i915_gem_object *ctx_obj;
5417         struct intel_ring *ring;
5418         struct i915_vma *vma;
5419         u32 context_size;
5420         int ret;
5421
5422         GEM_BUG_ON(ce->state);
5423         context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
5424
5425         if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
5426                 context_size += I915_GTT_PAGE_SIZE; /* for redzone */
5427
5428         if (INTEL_GEN(engine->i915) == 12) {
5429                 ce->wa_bb_page = context_size / PAGE_SIZE;
5430                 context_size += PAGE_SIZE;
5431         }
5432
5433         ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
5434         if (IS_ERR(ctx_obj))
5435                 return PTR_ERR(ctx_obj);
5436
5437         vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
5438         if (IS_ERR(vma)) {
5439                 ret = PTR_ERR(vma);
5440                 goto error_deref_obj;
5441         }
5442
5443         if (!page_mask_bits(ce->timeline)) {
5444                 struct intel_timeline *tl;
5445
5446                 /*
5447                  * Use the static global HWSP for the kernel context, and
5448                  * a dynamically allocated cacheline for everyone else.
5449                  */
5450                 if (unlikely(ce->timeline))
5451                         tl = pinned_timeline(ce);
5452                 else
5453                         tl = intel_timeline_create(engine->gt);
5454                 if (IS_ERR(tl)) {
5455                         ret = PTR_ERR(tl);
5456                         goto error_deref_obj;
5457                 }
5458
5459                 ce->timeline = tl;
5460         }
5461
5462         ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
5463         if (IS_ERR(ring)) {
5464                 ret = PTR_ERR(ring);
5465                 goto error_deref_obj;
5466         }
5467
5468         ret = populate_lr_context(ce, ctx_obj, engine, ring);
5469         if (ret) {
5470                 drm_dbg(&engine->i915->drm,
5471                         "Failed to populate LRC: %d\n", ret);
5472                 goto error_ring_free;
5473         }
5474
5475         ce->ring = ring;
5476         ce->state = vma;
5477
5478         return 0;
5479
5480 error_ring_free:
5481         intel_ring_put(ring);
5482 error_deref_obj:
5483         i915_gem_object_put(ctx_obj);
5484         return ret;
5485 }
5486
5487 static struct list_head *virtual_queue(struct virtual_engine *ve)
5488 {
5489         return &ve->base.execlists.default_priolist.requests[0];
5490 }
5491
5492 static void rcu_virtual_context_destroy(struct work_struct *wrk)
5493 {
5494         struct virtual_engine *ve =
5495                 container_of(wrk, typeof(*ve), rcu.work);
5496         unsigned int n;
5497
5498         GEM_BUG_ON(ve->context.inflight);
5499
5500         /* Preempt-to-busy may leave a stale request behind. */
5501         if (unlikely(ve->request)) {
5502                 struct i915_request *old;
5503
5504                 spin_lock_irq(&ve->base.active.lock);
5505
5506                 old = fetch_and_zero(&ve->request);
5507                 if (old) {
5508                         GEM_BUG_ON(!i915_request_completed(old));
5509                         __i915_request_submit(old);
5510                         i915_request_put(old);
5511                 }
5512
5513                 spin_unlock_irq(&ve->base.active.lock);
5514         }
5515
5516         /*
5517          * Flush the tasklet in case it is still running on another core.
5518          *
5519          * This needs to be done before we remove ourselves from the siblings'
5520          * rbtrees because, if it is running in parallel, it may reinsert
5521          * the rb_node into a sibling.
5522          */
5523         tasklet_kill(&ve->base.execlists.tasklet);
5524
5525         /* Decouple ourselves from the siblings, no more access allowed. */
5526         for (n = 0; n < ve->num_siblings; n++) {
5527                 struct intel_engine_cs *sibling = ve->siblings[n];
5528                 struct rb_node *node = &ve->nodes[sibling->id].rb;
5529
5530                 if (RB_EMPTY_NODE(node))
5531                         continue;
5532
5533                 spin_lock_irq(&sibling->active.lock);
5534
5535                 /* Detachment is lazily performed in the execlists tasklet */
5536                 if (!RB_EMPTY_NODE(node))
5537                         rb_erase_cached(node, &sibling->execlists.virtual);
5538
5539                 spin_unlock_irq(&sibling->active.lock);
5540         }
5541         GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
5542         GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5543
5544         if (ve->context.state)
5545                 __execlists_context_fini(&ve->context);
5546         intel_context_fini(&ve->context);
5547
5548         intel_breadcrumbs_free(ve->base.breadcrumbs);
5549         intel_engine_free_request_pool(&ve->base);
5550
5551         kfree(ve->bonds);
5552         kfree(ve);
5553 }
5554
5555 static void virtual_context_destroy(struct kref *kref)
5556 {
5557         struct virtual_engine *ve =
5558                 container_of(kref, typeof(*ve), context.ref);
5559
5560         GEM_BUG_ON(!list_empty(&ve->context.signals));
5561
5562         /*
5563          * When destroying the virtual engine, we have to be aware that
5564          * it may still be in use from a hardirq/softirq context causing
5565          * the resubmission of a completed request (background completion
5566          * due to preempt-to-busy). Before we can free the engine, we need
5567          * to flush the submission code and tasklets that are still potentially
5568          * accessing the engine. Flushing the tasklets requires process context,
5569          * and since we can guard the resubmit onto the engine with an RCU read
5570          * lock, we can delegate the free of the engine to an RCU worker.
5571          */
5572         INIT_RCU_WORK(&ve->rcu, rcu_virtual_context_destroy);
5573         queue_rcu_work(system_wq, &ve->rcu);
5574 }
5575
5576 static void virtual_engine_initial_hint(struct virtual_engine *ve)
5577 {
5578         int swp;
5579
5580         /*
5581          * Pick a random sibling on starting to help spread the load around.
5582          *
5583          * New contexts are typically created with exactly the same order
5584          * of siblings, and often started in batches. Due to the way we iterate
5585          * the array of siblings when submitting requests, sibling[0] is
5586          * prioritised for dequeuing. If we make sure that sibling[0] is fairly
5587          * randomised across the system, we also help spread the load by the
5588          * first engine we inspect being different each time.
5589          *
5590          * NB This does not force us to execute on this engine, it will just
5591          * typically be the first we inspect for submission.
5592          */
5593         swp = prandom_u32_max(ve->num_siblings);
5594         if (swp)
5595                 swap(ve->siblings[swp], ve->siblings[0]);
5596 }
5597
5598 static int virtual_context_alloc(struct intel_context *ce)
5599 {
5600         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5601
5602         return __execlists_context_alloc(ce, ve->siblings[0]);
5603 }
5604
5605 static int virtual_context_pin(struct intel_context *ce, void *vaddr)
5606 {
5607         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5608
5609         /* Note: we must use a real engine class for setting up reg state */
5610         return __execlists_context_pin(ce, ve->siblings[0], vaddr);
5611 }
5612
5613 static void virtual_context_enter(struct intel_context *ce)
5614 {
5615         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5616         unsigned int n;
5617
5618         for (n = 0; n < ve->num_siblings; n++)
5619                 intel_engine_pm_get(ve->siblings[n]);
5620
5621         intel_timeline_enter(ce->timeline);
5622 }
5623
5624 static void virtual_context_exit(struct intel_context *ce)
5625 {
5626         struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
5627         unsigned int n;
5628
5629         intel_timeline_exit(ce->timeline);
5630
5631         for (n = 0; n < ve->num_siblings; n++)
5632                 intel_engine_pm_put(ve->siblings[n]);
5633 }
5634
5635 static const struct intel_context_ops virtual_context_ops = {
5636         .alloc = virtual_context_alloc,
5637
5638         .pre_pin = execlists_context_pre_pin,
5639         .pin = virtual_context_pin,
5640         .unpin = execlists_context_unpin,
5641         .post_unpin = execlists_context_post_unpin,
5642
5643         .enter = virtual_context_enter,
5644         .exit = virtual_context_exit,
5645
5646         .destroy = virtual_context_destroy,
5647 };
5648
5649 static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
5650 {
5651         struct i915_request *rq;
5652         intel_engine_mask_t mask;
5653
5654         rq = READ_ONCE(ve->request);
5655         if (!rq)
5656                 return 0;
5657
5658         /* The rq is ready for submission; rq->execution_mask is now stable. */
5659         mask = rq->execution_mask;
5660         if (unlikely(!mask)) {
5661                 /* Invalid selection; flag the error and fall back to the first sibling */
5662                 i915_request_set_error_once(rq, -ENODEV);
5663                 mask = ve->siblings[0]->mask;
5664         }
5665
5666         ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
5667                      rq->fence.context, rq->fence.seqno,
5668                      mask, ve->base.execlists.queue_priority_hint);
5669
5670         return mask;
5671 }
5672
5673 static void virtual_submission_tasklet(unsigned long data)
5674 {
5675         struct virtual_engine * const ve = (struct virtual_engine *)data;
5676         const int prio = READ_ONCE(ve->base.execlists.queue_priority_hint);
5677         intel_engine_mask_t mask;
5678         unsigned int n;
5679
5680         rcu_read_lock();
5681         mask = virtual_submission_mask(ve);
5682         rcu_read_unlock();
5683         if (unlikely(!mask))
5684                 return;
5685
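	/*
	 * Offer the pending request to every eligible sibling by inserting
	 * our node into each sibling's tree of waiting virtual engines.
	 */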
5686         local_irq_disable();
5687         for (n = 0; n < ve->num_siblings; n++) {
5688                 struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]);
5689                 struct ve_node * const node = &ve->nodes[sibling->id];
5690                 struct rb_node **parent, *rb;
5691                 bool first;
5692
5693                 if (!READ_ONCE(ve->request))
5694                         break; /* already handled by a sibling's tasklet */
5695
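		/*
		 * If this sibling is excluded by the request's execution
		 * mask, make sure our node is removed from its tree.
		 */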
5696                 if (unlikely(!(mask & sibling->mask))) {
5697                         if (!RB_EMPTY_NODE(&node->rb)) {
5698                                 spin_lock(&sibling->active.lock);
5699                                 rb_erase_cached(&node->rb,
5700                                                 &sibling->execlists.virtual);
5701                                 RB_CLEAR_NODE(&node->rb);
5702                                 spin_unlock(&sibling->active.lock);
5703                         }
5704                         continue;
5705                 }
5706
5707                 spin_lock(&sibling->active.lock);
5708
5709                 if (!RB_EMPTY_NODE(&node->rb)) {
5710                         /*
5711                          * Cheat and avoid rebalancing the tree if we can
5712                          * reuse this node in situ.
5713                          */
5714                         first = rb_first_cached(&sibling->execlists.virtual) ==
5715                                 &node->rb;
5716                         if (prio == node->prio || (prio > node->prio && first))
5717                                 goto submit_engine;
5718
5719                         rb_erase_cached(&node->rb, &sibling->execlists.virtual);
5720                 }
5721
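		/*
		 * Walk the sibling's tree, which is ordered by descending
		 * priority, and note whether we end up as the leftmost
		 * (highest priority) node.
		 */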
5722                 rb = NULL;
5723                 first = true;
5724                 parent = &sibling->execlists.virtual.rb_root.rb_node;
5725                 while (*parent) {
5726                         struct ve_node *other;
5727
5728                         rb = *parent;
5729                         other = rb_entry(rb, typeof(*other), rb);
5730                         if (prio > other->prio) {
5731                                 parent = &rb->rb_left;
5732                         } else {
5733                                 parent = &rb->rb_right;
5734                                 first = false;
5735                         }
5736                 }
5737
5738                 rb_link_node(&node->rb, rb, parent);
5739                 rb_insert_color_cached(&node->rb,
5740                                        &sibling->execlists.virtual,
5741                                        first);
5742
5743 submit_engine:
5744                 GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
5745                 node->prio = prio;
5746                 if (first && prio > sibling->execlists.queue_priority_hint)
5747                         tasklet_hi_schedule(&sibling->execlists.tasklet);
5748
5749                 spin_unlock(&sibling->active.lock);
5750         }
5751         local_irq_enable();
5752 }
5753
5754 static void virtual_submit_request(struct i915_request *rq)
5755 {
5756         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5757         struct i915_request *old;
5758         unsigned long flags;
5759
5760         ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
5761                      rq->fence.context,
5762                      rq->fence.seqno);
5763
5764         GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
5765
5766         spin_lock_irqsave(&ve->base.active.lock, flags);
5767
5768         old = ve->request;
5769         if (old) { /* background completion event from preempt-to-busy */
5770                 GEM_BUG_ON(!i915_request_completed(old));
5771                 __i915_request_submit(old);
5772                 i915_request_put(old);
5773         }
5774
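	/*
	 * If the request has already completed, just flush it through for
	 * retirement. Otherwise stash it as the single pending request and
	 * kick the tasklet to pick a sibling for it.
	 */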
5775         if (i915_request_completed(rq)) {
5776                 __i915_request_submit(rq);
5777
5778                 ve->base.execlists.queue_priority_hint = INT_MIN;
5779                 ve->request = NULL;
5780         } else {
5781                 ve->base.execlists.queue_priority_hint = rq_prio(rq);
5782                 ve->request = i915_request_get(rq);
5783
5784                 GEM_BUG_ON(!list_empty(virtual_queue(ve)));
5785                 list_move_tail(&rq->sched.link, virtual_queue(ve));
5786
5787                 tasklet_hi_schedule(&ve->base.execlists.tasklet);
5788         }
5789
5790         spin_unlock_irqrestore(&ve->base.active.lock, flags);
5791 }
5792
5793 static struct ve_bond *
5794 virtual_find_bond(struct virtual_engine *ve,
5795                   const struct intel_engine_cs *master)
5796 {
5797         int i;
5798
5799         for (i = 0; i < ve->num_bonds; i++) {
5800                 if (ve->bonds[i].master == master)
5801                         return &ve->bonds[i];
5802         }
5803
5804         return NULL;
5805 }
5806
5807 static void
5808 virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
5809 {
5810         struct virtual_engine *ve = to_virtual_engine(rq->engine);
5811         intel_engine_mask_t allowed, exec;
5812         struct ve_bond *bond;
5813
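	/* Start from every engine other than the one the master ran on */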
5814         allowed = ~to_request(signal)->engine->mask;
5815
5816         bond = virtual_find_bond(ve, to_request(signal)->engine);
5817         if (bond)
5818                 allowed &= bond->sibling_mask;
5819
5820         /* Restrict the bonded request to run on only the available engines */
5821         exec = READ_ONCE(rq->execution_mask);
5822         while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
5823                 ;
5824
5825         /* Prevent the master from being re-run on the bonded engines */
5826         to_request(signal)->execution_mask &= ~allowed;
5827 }
5828
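/**
 * intel_execlists_create_virtual - create a context on a load-balancing
 * virtual engine
 * @siblings: array of physical engines to balance the context across
 * @count: number of entries in @siblings
 *
 * Returns the new intel_context, or an ERR_PTR() on failure. With a single
 * sibling this reduces to an ordinary context on that engine.
 */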
5829 struct intel_context *
5830 intel_execlists_create_virtual(struct intel_engine_cs **siblings,
5831                                unsigned int count)
5832 {
5833         struct virtual_engine *ve;
5834         unsigned int n;
5835         int err;
5836
5837         if (count == 0)
5838                 return ERR_PTR(-EINVAL);
5839
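	/* A single sibling needs no balancing; use an ordinary context */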
5840         if (count == 1)
5841                 return intel_context_create(siblings[0]);
5842
5843         ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
5844         if (!ve)
5845                 return ERR_PTR(-ENOMEM);
5846
5847         ve->base.i915 = siblings[0]->i915;
5848         ve->base.gt = siblings[0]->gt;
5849         ve->base.uncore = siblings[0]->uncore;
5850         ve->base.id = -1;
5851
5852         ve->base.class = OTHER_CLASS;
5853         ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
5854         ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5855         ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
5856
5857         /*
5858          * The decision on whether to submit a request using semaphores
5859          * depends on the saturated state of the engine. We only compute
5860          * this during HW submission of the request, and we need this
5861          * state to be globally applied to all requests being submitted
5862          * to this engine. Virtual engines encompass more than one physical
5863          * engine and so we cannot accurately tell in advance if one of those
5864          * engines is already saturated and so cannot afford to use a semaphore
5865          * and be pessimized in priority for doing so -- if we are the only
5866          * context using semaphores after all other clients have stopped, we
5867          * will be starved on the saturated system. Such a global switch for
5868          * semaphores is less than ideal, but alas is the current compromise.
5869          */
5870         ve->base.saturated = ALL_ENGINES;
5871
5872         snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
5873
5874         intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
5875         intel_engine_init_execlists(&ve->base);
5876
5877         ve->base.cops = &virtual_context_ops;
5878         ve->base.request_alloc = execlists_request_alloc;
5879
5880         ve->base.schedule = i915_schedule;
5881         ve->base.submit_request = virtual_submit_request;
5882         ve->base.bond_execute = virtual_bond_execute;
5883
5884         INIT_LIST_HEAD(virtual_queue(ve));
5885         ve->base.execlists.queue_priority_hint = INT_MIN;
5886         tasklet_init(&ve->base.execlists.tasklet,
5887                      virtual_submission_tasklet,
5888                      (unsigned long)ve);
5889
5890         intel_context_init(&ve->context, &ve->base);
5891
5892         ve->base.breadcrumbs = intel_breadcrumbs_create(NULL);
5893         if (!ve->base.breadcrumbs) {
5894                 err = -ENOMEM;
5895                 goto err_put;
5896         }
5897
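	/*
	 * Validate each sibling (no duplicates, execlists submission only)
	 * and adopt the emission vfuncs shared by the engine class.
	 */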
5898         for (n = 0; n < count; n++) {
5899                 struct intel_engine_cs *sibling = siblings[n];
5900
5901                 GEM_BUG_ON(!is_power_of_2(sibling->mask));
5902                 if (sibling->mask & ve->base.mask) {
5903                         DRM_DEBUG("duplicate %s entry in load balancer\n",
5904                                   sibling->name);
5905                         err = -EINVAL;
5906                         goto err_put;
5907                 }
5908
5909                 /*
5910                  * The virtual engine implementation is tightly coupled to
5911                  * the execlists backend -- we push requests directly
5912                  * into a tree inside each physical engine. We could support
5913                  * layering if we handle cloning of the requests and
5914                  * submitting a copy into each backend.
5915                  */
5916                 if (sibling->execlists.tasklet.func !=
5917                     execlists_submission_tasklet) {
5918                         err = -ENODEV;
5919                         goto err_put;
5920                 }
5921
5922                 GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
5923                 RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
5924
5925                 ve->siblings[ve->num_siblings++] = sibling;
5926                 ve->base.mask |= sibling->mask;
5927
5928                 /*
5929                  * All physical engines must have compatible emission
5930                  * functions (as we build the instructions during request
5931                  * construction and do not alter them before submission
5932                  * on the physical engine). We use the engine class as a guide
5933                  * here, although that could be refined.
5934                  */
5935                 if (ve->base.class != OTHER_CLASS) {
5936                         if (ve->base.class != sibling->class) {
5937                                 DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
5938                                           sibling->class, ve->base.class);
5939                                 err = -EINVAL;
5940                                 goto err_put;
5941                         }
5942                         continue;
5943                 }
5944
5945                 ve->base.class = sibling->class;
5946                 ve->base.uabi_class = sibling->uabi_class;
5947                 snprintf(ve->base.name, sizeof(ve->base.name),
5948                          "v%dx%d", ve->base.class, count);
5949                 ve->base.context_size = sibling->context_size;
5950
5951                 ve->base.emit_bb_start = sibling->emit_bb_start;
5952                 ve->base.emit_flush = sibling->emit_flush;
5953                 ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5954                 ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5955                 ve->base.emit_fini_breadcrumb_dw =
5956                         sibling->emit_fini_breadcrumb_dw;
5957
5958                 ve->base.flags = sibling->flags;
5959         }
5960
5961         ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5962
5963         virtual_engine_initial_hint(ve);
5964         return &ve->context;
5965
5966 err_put:
5967         intel_context_put(&ve->context);
5968         return ERR_PTR(err);
5969 }
5970
5971 struct intel_context *
5972 intel_execlists_clone_virtual(struct intel_engine_cs *src)
5973 {
5974         struct virtual_engine *se = to_virtual_engine(src);
5975         struct intel_context *dst;
5976
5977         dst = intel_execlists_create_virtual(se->siblings,
5978                                              se->num_siblings);
5979         if (IS_ERR(dst))
5980                 return dst;
5981
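	/* Copy the bonding map so the clone is independent of the source */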
5982         if (se->num_bonds) {
5983                 struct virtual_engine *de = to_virtual_engine(dst->engine);
5984
5985                 de->bonds = kmemdup(se->bonds,
5986                                     sizeof(*se->bonds) * se->num_bonds,
5987                                     GFP_KERNEL);
5988                 if (!de->bonds) {
5989                         intel_context_put(dst);
5990                         return ERR_PTR(-ENOMEM);
5991                 }
5992
5993                 de->num_bonds = se->num_bonds;
5994         }
5995
5996         return dst;
5997 }
5998
5999 int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
6000                                      const struct intel_engine_cs *master,
6001                                      const struct intel_engine_cs *sibling)
6002 {
6003         struct virtual_engine *ve = to_virtual_engine(engine);
6004         struct ve_bond *bond;
6005         int n;
6006
6007         /* Sanity check the sibling is part of the virtual engine */
6008         for (n = 0; n < ve->num_siblings; n++)
6009                 if (sibling == ve->siblings[n])
6010                         break;
6011         if (n == ve->num_siblings)
6012                 return -EINVAL;
6013
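	/* Extend an existing bond for this master, or append a new one */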
6014         bond = virtual_find_bond(ve, master);
6015         if (bond) {
6016                 bond->sibling_mask |= sibling->mask;
6017                 return 0;
6018         }
6019
6020         bond = krealloc(ve->bonds,
6021                         sizeof(*bond) * (ve->num_bonds + 1),
6022                         GFP_KERNEL);
6023         if (!bond)
6024                 return -ENOMEM;
6025
6026         bond[ve->num_bonds].master = master;
6027         bond[ve->num_bonds].sibling_mask = sibling->mask;
6028
6029         ve->bonds = bond;
6030         ve->num_bonds++;
6031
6032         return 0;
6033 }
6034
6035 void intel_execlists_show_requests(struct intel_engine_cs *engine,
6036                                    struct drm_printer *m,
6037                                    void (*show_request)(struct drm_printer *m,
6038                                                         struct i915_request *rq,
6039                                                         const char *prefix),
6040                                    unsigned int max)
6041 {
6042         const struct intel_engine_execlists *execlists = &engine->execlists;
6043         struct i915_request *rq, *last;
6044         unsigned long flags;
6045         unsigned int count;
6046         struct rb_node *rb;
6047
6048         spin_lock_irqsave(&engine->active.lock, flags);
6049
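	/*
	 * Print up to 'max' requests from each queue: those on the engine's
	 * active list, those waiting in the priority lists, and those still
	 * held on virtual engines.
	 */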
6050         last = NULL;
6051         count = 0;
6052         list_for_each_entry(rq, &engine->active.requests, sched.link) {
6053                 if (count++ < max - 1)
6054                         show_request(m, rq, "\t\tE ");
6055                 else
6056                         last = rq;
6057         }
6058         if (last) {
6059                 if (count > max) {
6060                         drm_printf(m,
6061                                    "\t\t...skipping %d executing requests...\n",
6062                                    count - max);
6063                 }
6064                 show_request(m, last, "\t\tE ");
6065         }
6066
6067         if (execlists->switch_priority_hint != INT_MIN)
6068                 drm_printf(m, "\t\tSwitch priority hint: %d\n",
6069                            READ_ONCE(execlists->switch_priority_hint));
6070         if (execlists->queue_priority_hint != INT_MIN)
6071                 drm_printf(m, "\t\tQueue priority hint: %d\n",
6072                            READ_ONCE(execlists->queue_priority_hint));
6073
6074         last = NULL;
6075         count = 0;
6076         for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
6077                 struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
6078                 int i;
6079
6080                 priolist_for_each_request(rq, p, i) {
6081                         if (count++ < max - 1)
6082                                 show_request(m, rq, "\t\tQ ");
6083                         else
6084                                 last = rq;
6085                 }
6086         }
6087         if (last) {
6088                 if (count > max) {
6089                         drm_printf(m,
6090                                    "\t\t...skipping %d queued requests...\n",
6091                                    count - max);
6092                 }
6093                 show_request(m, last, "\t\tQ ");
6094         }
6095
6096         last = NULL;
6097         count = 0;
6098         for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
6099                 struct virtual_engine *ve =
6100                         rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
6101                 struct i915_request *rq = READ_ONCE(ve->request);
6102
6103                 if (rq) {
6104                         if (count++ < max - 1)
6105                                 show_request(m, rq, "\t\tV ");
6106                         else
6107                                 last = rq;
6108                 }
6109         }
6110         if (last) {
6111                 if (count > max) {
6112                         drm_printf(m,
6113                                    "\t\t...skipping %d virtual requests...\n",
6114                                    count - max);
6115                 }
6116                 show_request(m, last, "\t\tV ");
6117         }
6118
6119         spin_unlock_irqrestore(&engine->active.lock, flags);
6120 }
6121
6122 void intel_lr_context_reset(struct intel_engine_cs *engine,
6123                             struct intel_context *ce,
6124                             u32 head,
6125                             bool scrub)
6126 {
6127         GEM_BUG_ON(!intel_context_is_pinned(ce));
6128
6129         /*
6130          * We want a simple context + ring to execute the breadcrumb update.
6131          * We cannot rely on the context being intact across the GPU hang,
6132          * so clear it and rebuild just what we need for the breadcrumb.
6133          * All pending requests for this context will be zapped, and any
6134          * future request will only be submitted after userspace has had
6135          * the opportunity to recreate its own state.
6136          */
6137         if (scrub)
6138                 restore_default_state(ce, engine);
6139
6140         /* Rerun the request; its payload has been neutered (if guilty). */
6141         __execlists_update_reg_state(ce, engine, head);
6142 }
6143
6144 bool
6145 intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
6146 {
6147         return engine->set_default_submission ==
6148                intel_execlists_set_default_submission;
6149 }
6150
6151 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
6152 #include "selftest_lrc.c"
6153 #endif